diff --git a/.gitignore b/.gitignore index c246a56cf15a4..6be36bf8c243e 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ paddle/fluid/API_DEV.spec paddle/fluid/API_PR.spec paddle/fluid/op_use_default_grad_maker_DEV.spec paddle/fluid/op_use_default_grad_maker_PR.spec +paddle/pten/api/*/api* *.DS_Store *.vs diff --git a/CMakeLists.txt b/CMakeLists.txt index 334a6cfcd0ee1..03f8522ad5446 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,6 +46,7 @@ option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) +option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) # NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON # to develop some acl related functionality on x86 option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) @@ -215,6 +216,7 @@ option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE} option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF) option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) option(WITH_CINN "Compile PaddlePaddle with CINN" OFF) +option(WITH_INFRT "Compile PaddlePaddle with INFRT" OFF) option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON) option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index f15db6e094c17..a77f9f72ca6ad 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -97,6 +97,11 @@ if(WITH_XPU) add_definitions(-DPADDLE_WITH_XPU) endif() +if(WITH_IPU) + message(STATUS "Compile with IPU!") + add_definitions(-DPADDLE_WITH_IPU) +endif() + if(WITH_GPU) add_definitions(-DPADDLE_WITH_CUDA) add_definitions(-DEIGEN_USE_GPU) diff --git a/cmake/external/cinn.cmake b/cmake/external/cinn.cmake index 581a5f93768d0..41b90345c8c5f 100644 --- a/cmake/external/cinn.cmake +++ b/cmake/external/cinn.cmake @@ -26,8 +26,7 @@ add_definitions(-w) ###################################### include(ExternalProject) set(CINN_PREFIX_DIR ${THIRD_PARTY_PATH}/CINN) -# TODO(zhhsplendid): Modify git tag after we have release tag -set(CINN_GIT_TAG develop) +set(CINN_GIT_TAG release/v0.1) set(CINN_OPTIONAL_ARGS -DPY_VERSION=${PY_VERSION} -DWITH_CUDA=${WITH_GPU} -DWITH_CUDNN=${WITH_GPU} diff --git a/cmake/external/cryptopp.cmake b/cmake/external/cryptopp.cmake index 913fbfed316d8..27a013c1763a7 100644 --- a/cmake/external/cryptopp.cmake +++ b/cmake/external/cryptopp.cmake @@ -22,9 +22,9 @@ SET(CRYPTOPP_TAG CRYPTOPP_8_2_0) IF(WIN32) SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/cryptopp-static.lib" CACHE FILEPATH "cryptopp library." FORCE) - # There is a compilation parameter 'FI\"winapifamily.h\"' can't be used correctly + # There is a compilation parameter "/FI\"winapifamily.h\"" or "/FIwinapifamily.h" can't be used correctly # with Ninja on Windows. The only difference between the patch file and original - # file is that the compilation parameters are changed to 'FIwinapifamily.h'. This + # file is that the compilation parameters are changed to '/nologo'. This # patch command can be removed when upgrading to a higher version. 
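# (The copy_if_different command below swaps in the patched CMakeLists.txt whenever
# the Ninja generator is used.)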
if("${CMAKE_GENERATOR}" STREQUAL "Ninja") set(CRYPTOPP_PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different "${PADDLE_SOURCE_DIR}/patches/cryptopp/CMakeLists.txt" "/") diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake new file mode 100644 index 0000000000000..8fd4a0741eaba --- /dev/null +++ b/cmake/external/llvm.cmake @@ -0,0 +1,110 @@ +include(FetchContent) + +set(LLVM_DOWNLOAD_URL https://paddle-inference-dist.bj.bcebos.com/CINN/llvm11.tar.gz) +set(LLVM_MD5 39d32b6be466781dddf5869318dcba53) + +set(FETCHCONTENT_BASE_DIR ${THIRD_PARTY_PATH}/llvm) +set(FETCHCONTENT_QUIET OFF) +FetchContent_Declare(external_llvm + URL ${LLVM_DOWNLOAD_URL} + URL_MD5 ${LLVM_MD5} + PREFIX ${THIRD_PARTY_PATH}/llvm + SOURCE_DIR ${THIRD_PARTY_PATH}/install/llvm +) +if (NOT LLVM_PATH) + FetchContent_GetProperties(external_llvm) + if (NOT external_llvm_POPULATED) + FetchContent_Populate(external_llvm) + endif() + set(LLVM_PATH ${THIRD_PARTY_PATH}/install/llvm) + set(LLVM_DIR ${THIRD_PARTY_PATH}/install/llvm/lib/cmake/llvm) + set(MLIR_DIR ${THIRD_PARTY_PATH}/install/llvm/lib/cmake/mlir) +else () + set(LLVM_DIR ${LLVM_PATH}/lib/cmake/llvm) + set(MLIR_DIR ${LLVM_PATH}/lib/cmake/mlir) +endif() + +if (${CMAKE_CXX_COMPILER} STREQUAL "clang++") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++ -lc++abi") +endif() + +message(STATUS "set LLVM_DIR: ${LLVM_DIR}") +message(STATUS "set MLIR_DIR: ${MLIR_DIR}") +find_package(LLVM REQUIRED CONFIG HINTS ${LLVM_DIR}) +find_package(MLIR REQUIRED CONFIG HINTS ${MLIR_DIR}) +find_package(ZLIB REQUIRED) + +list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}") +include(AddLLVM) + +include_directories(${LLVM_INCLUDE_DIRS}) +list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}") +list(APPEND CMAKE_MODULE_PATH "${MLIR_CMAKE_DIR}") +include(AddLLVM) +include(TableGen) +include(AddMLIR) + +message(STATUS "Found MLIR: ${MLIR_DIR}") +message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") +message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") + +# To build with MLIR, the LLVM is build from source code using the following flags: + +#[==[ +cmake -G Ninja ../llvm \ + -DLLVM_ENABLE_PROJECTS="mlir;clang" \ + -DLLVM_BUILD_EXAMPLES=OFF \ + -DLLVM_TARGETS_TO_BUILD="X86" \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DLLVM_ENABLE_ZLIB=OFF \ + -DLLVM_ENABLE_RTTI=ON \ +#]==] +# The matched llvm-project version is f9dc2b7079350d0fed3bb3775f496b90483c9e42 (currently a temporary commit) + +add_definitions(${LLVM_DEFINITIONS}) + +llvm_map_components_to_libnames(llvm_libs Support Core irreader + X86 executionengine orcjit mcjit all codegen) + +message(STATUS "LLVM libs: ${llvm_libs}") + +get_property(mlir_libs GLOBAL PROPERTY MLIR_ALL_LIBS) +message(STATUS "MLIR libs: ${mlir_libs}") +add_definitions(${LLVM_DEFINITIONS}) + + +# The minimum needed libraries for MLIR IR parse and transform. 
+set(MLIR_IR_LIBS MLIRAnalysis MLIRStandardOps MLIRPass MLIRParser MLIRDialect MLIRIR MLIROptLib) + + +# tb_base is the name of a xxx.td file (without the .td suffix) +function(mlir_tablegen_on td_base) + set(options) + set(oneValueArgs DIALECT) + cmake_parse_arguments(mlir_tablegen_on "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(LLVM_TARGET_DEFINITIONS ${td_base}.td) + mlir_tablegen(${td_base}.hpp.inc -gen-op-decls) + mlir_tablegen(${td_base}.cpp.inc -gen-op-defs) + if (mlir_tablegen_on_DIALECT) + mlir_tablegen(${td_base}_dialect.hpp.inc --gen-dialect-decls -dialect=${mlir_tablegen_on_DIALECT}) + endif() + add_public_tablegen_target(${td_base}_IncGen) + add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) +endfunction() + +function(mlir_add_rewriter td_base) + set(LLVM_TARGET_DEFINITIONS ${td_base}.td) + mlir_tablegen(${td_base}.hpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass") + add_public_tablegen_target(${td_base}_IncGen) + add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) +endfunction() + +# Execute the mlir script with infrt-exec program. +# @name: name of the test +# @script: path to the mlir script file +function (infrt_exec_check name script) + add_test(NAME ${name} + COMMAND sh -c "${CMAKE_BINARY_DIR}/infrt/host_context/infrt-exec -i ${CMAKE_CURRENT_SOURCE_DIR}/${script}| ${LLVM_PATH}/bin/FileCheck ${CMAKE_CURRENT_SOURCE_DIR}/${script}") +endfunction() diff --git a/cmake/external/poplar.cmake b/cmake/external/poplar.cmake new file mode 100644 index 0000000000000..7947a54f8b5f1 --- /dev/null +++ b/cmake/external/poplar.cmake @@ -0,0 +1,61 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
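# Locate the Poplar and PopART installs inside the SDK pointed to by the
# POPLAR_SDK_DIR environment variable, make their headers and libraries available
# to the build, and fail early with a clear error if either one is missing.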
+if(WITH_IPU) + set(POPLAR_DIR CACHE PATH "Path to a Poplar install") + set(POPART_DIR CACHE PATH "Path to a Popart install") + set(POPLAR_SDK_DIR CACHE PATH "Path to an extracted SDK archive or to a Poplar & Popart install directory (Will populate POPLAR_DIR and POPART_DIR)") + + if(DEFINED ENV{POPLAR_SDK_DIR}) + set(POPLAR_SDK_DIR $ENV{POPLAR_SDK_DIR}) + execute_process(COMMAND find ${POPLAR_SDK_DIR}/ -maxdepth 1 -type d -name "popart*" + OUTPUT_VARIABLE POPART_DIR OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND find ${POPLAR_SDK_DIR}/ -maxdepth 1 -type d -name "poplar-*" -o -name "poplar" + OUTPUT_VARIABLE POPLAR_DIR OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT IS_DIRECTORY "${POPLAR_DIR}") + message(FATAL_ERROR "Couldn't find a \"poplar\" or \"poplar-*\" folder in '${POPLAR_SDK_DIR}'") + endif() + if(NOT IS_DIRECTORY "${POPART_DIR}") + message(FATAL_ERROR "Couldn't find a \"popart*\" folder in '${POPLAR_SDK_DIR}'") + endif() + else() + message(FATAL_ERROR "You must provide a path to a Poplar install using export POPLAR_SDK_DIR=/path/to/poplar_sdk") + endif() + + message("POPLAR_DIR is ${POPLAR_DIR}") + message("POPART_DIR is ${POPART_DIR}") + + if(EXISTS ${POPLAR_DIR}) + list(APPEND CMAKE_PREFIX_PATH ${POPLAR_DIR}) + set(ENABLE_POPLAR_CMD "source ${POPLAR_DIR}/enable.sh") + find_package(poplar REQUIRED) + include_directories("${POPLAR_DIR}/include") + link_directories("${POPLAR_DIR}/lib") + endif() + if(NOT poplar_FOUND) + message(FATAL_ERROR "You must provide a path to a Poplar install using -DPOPLAR_DIR=/path/to/popart/build/install") + endif() + + if(EXISTS ${POPART_DIR}) + list(APPEND CMAKE_PREFIX_PATH ${POPART_DIR}) + set(ENABLE_POPART_CMD "source ${POPART_DIR}/enable.sh") + find_package(popart REQUIRED COMPONENTS popart-only) + include_directories("${POPART_DIR}/include") + link_directories("${POPART_DIR}/lib") + endif() + if(NOT popart_FOUND) + message(FATAL_ERROR "You must provide a path to a Popart build using -DPOPART_DIR=/path/to/popart/build") + endif() + add_definitions(-DONNX_NAMESPACE=onnx) + add_custom_target(extern_poplar DEPENDS poplar popart-only) +endif() diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 86acd1a001250..2a028b8dc7e7f 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -204,6 +204,9 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) SET(PROTOBUF_TAG v3.8.0) + elseif(WITH_IPU) + SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) + SET(PROTOBUF_TAG d750fbf648256c7c631f51ffdbf67d7c18b0114e) else() SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546) @@ -243,6 +246,8 @@ ENDFUNCTION() if(WITH_ASCEND OR WITH_ASCEND_CL) SET(PROTOBUF_VERSION 3.8.0) +elseif(WITH_IPU) + SET(PROTOBUF_VERSION 3.6.1) else() SET(PROTOBUF_VERSION 3.1.0) endif() diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 7f828fd66e2aa..d89ecd27c0954 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -34,8 +34,13 @@ ELSE () SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") ENDIF() -SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") -SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211129") +if(NOT DEFINED XPU_BASE_URL) + SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") + SET(XPU_BASE_URL 
"${XPU_BASE_URL_WITHOUT_DATE}/20211129") +else() + SET(XPU_BASE_URL "${XPU_BASE_URL}") +endif() + SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 7afff25664bbb..7495ee32bab95 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -151,6 +151,13 @@ set(COMMON_FLAGS ${fsanitize} ) +if(WITH_IPU) + set(COMMON_FLAGS ${COMMON_FLAGS} + -Wno-sign-compare # Warnings in Popart + -Wno-non-virtual-dtor # Warnings in Popart + ) +endif() + if(NOT APPLE) if((${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0) OR (WITH_ROCM)) set(COMMON_FLAGS diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index f2efc974073e5..71e1856147449 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -391,4 +391,14 @@ if (WIN32) list(APPEND third_party_deps extern_dirent) endif (WIN32) +if (WITH_INFRT) + include(external/llvm) + list(APPEND third_party_deps external_llvm) +endif() + +if (WITH_IPU) + include(external/poplar) + list(APPEND third_party_deps extern_poplar) +endif() + add_custom_target(third_party ALL DEPENDS ${third_party_deps}) diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index b3a1b2e8c9587..4b88689b9b6df 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -2,4 +2,5 @@ add_subdirectory(scripts) add_subdirectory(testing) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory") add_subdirectory(pten) +add_subdirectory(infrt) add_subdirectory(fluid) diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt index 641110802f1fd..51f1d936bd70a 100644 --- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt @@ -11,14 +11,15 @@ else() endif() cc_library(fleet_executor SRCS fleet_executor.cc carrier.cc task_node.cc runtime_graph.cc - interceptor.cc compute_interceptor.cc interceptor_message_service.cc message_bus.cc - DEPS proto_desc fleet_executor_desc_proto interceptor_message_proto collective_helper - ${BRPC_DEPS}) + interceptor.cc compute_interceptor.cc amplifier_interceptor.cc interceptor_message_service.cc message_bus.cc + DEPS proto_desc fleet_executor_desc_proto interceptor_message_proto collective_helper op_registry + executor_gc_helper ${BRPC_DEPS}) if(WITH_DISTRIBUTE) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(compute_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(amplifier_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(message_bus.h PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(message_bus.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(fleet_executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) diff --git a/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc new file mode 100644 index 0000000000000..72c689732b5b7 --- /dev/null +++ 
b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc @@ -0,0 +1,60 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h" + +#include "paddle/fluid/distributed/fleet_executor/task_node.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace distributed { + +AmplifierInterceptor::AmplifierInterceptor(int64_t interceptor_id, + TaskNode* node) + : ComputeInterceptor(interceptor_id, node) { + run_per_steps_ = node->run_per_steps(); + run_at_offset_ = node->run_at_offset(); + reply_up_per_steps_ = node->reply_up_per_steps(); + send_down_per_steps_ = node->send_down_per_steps(); +} + +void AmplifierInterceptor::RunOps() { + // run_per_steps_, run_at_offset_ + // 4, 0 --> run at step 0, 4, 8, 12 + // 4, 3 --> run at step 3, 7, 11, 15 + if ((step_ % run_per_steps_) == run_at_offset_) { + ComputeInterceptor::RunOps(); + } +} + +void AmplifierInterceptor::SendDataReadyToDownStream() { + // run multi times, send ready one times to downstream, that is + // input multi times, output one times + if (step_ % send_down_per_steps_ == 0) { + ComputeInterceptor::SendDataReadyToDownStream(); + } +} + +void AmplifierInterceptor::ReplyCompletedToUpStream() { + // run multi times, reply one times to upstream, that is + // input one times, output multi times + if (step_ % reply_up_per_steps_ == 0) { + ComputeInterceptor::ReplyCompletedToUpStream(); + } +} + +REGISTER_INTERCEPTOR(Amplifier, AmplifierInterceptor); + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h new file mode 100644 index 0000000000000..776aa8d3e88db --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h @@ -0,0 +1,43 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
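// AmplifierInterceptor (declared below) runs its ops only on steps where
// step % run_per_steps_ == run_at_offset_, and throttles the DATA_IS_READY /
// DATE_IS_USELESS handshakes via send_down_per_steps_ and reply_up_per_steps_,
// so a 1:m or m:1 task such as LRSched or Optimize can sit next to tasks that
// run once per micro batch.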
+ +#pragma once + +#include + +#include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h" + +namespace paddle { +namespace distributed { + +class AmplifierInterceptor : public ComputeInterceptor { + public: + AmplifierInterceptor(int64_t interceptor_id, TaskNode* node); + + private: + void RunOps() override; + void SendDataReadyToDownStream() override; + void ReplyCompletedToUpStream() override; + + int64_t run_per_steps_{1}; + int64_t run_at_offset_{0}; + + // one input produces multi times output + int64_t reply_up_per_steps_{1}; + // one output need multi times input + int64_t send_down_per_steps_{1}; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 108a21b92fdfd..009df6438e270 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -16,22 +16,25 @@ #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/interceptor_message_service.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" +#include "paddle/fluid/distributed/fleet_executor/runtime_graph.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" +#include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/scope.h" namespace paddle { namespace distributed { USE_INTERCEPTOR(Compute); +USE_INTERCEPTOR(Amplifier); -void Carrier::Init( - const std::unordered_map& interceptor_id_to_node, - framework::Scope* root_scope, framework::Scope* minibatch_scope, - const std::vector& microbatch_scopes, - const platform::Place& place) { +void Carrier::Init(std::shared_ptr runtime_graph, + framework::Scope* root_scope, + framework::Scope* minibatch_scope, + const std::vector& microbatch_scopes, + const platform::Place& place) { PADDLE_ENFORCE_EQ(is_init_, false, platform::errors::AlreadyExists( "Carrier is already init.")); - interceptor_id_to_node_ = interceptor_id_to_node; + runtime_graph_ = runtime_graph; minibatch_scope_ = minibatch_scope; microbatch_scopes_ = microbatch_scopes; place_ = place; @@ -41,15 +44,34 @@ void Carrier::Init( is_init_ = true; } -Carrier::~Carrier() { +void Carrier::Release() { // NOTE(wangxi): must join before `Derived Interceptor` destruct, // otherwise Derived object will be destructed before thread complete. + + // Sending STOP msg to the source interceptor + MessageBus& msg_bus = MessageBus::Instance(); + PADDLE_ENFORCE_EQ(msg_bus.IsInit(), true, + platform::errors::PreconditionNotMet( + "Message bus has not been initialized.")); + for (int64_t id : source_interceptor_ids_) { + VLOG(3) << "Carrier Release is sending stop to source interceptor " << id + << "."; + InterceptorMessage stop_msg; + // source node STOP is send by carrier, so set src_id=-1 + stop_msg.set_src_id(-1); + stop_msg.set_dst_id(id); + stop_msg.set_message_type(STOP); + msg_bus.Send(stop_msg); + } + // TODO(wangxi): Maybe need a better to use thread. 
for (auto& interceptor : interceptor_idx_to_interceptor_) { interceptor.second->Join(); } } +Carrier::~Carrier() { VLOG(3) << "Carrier's destructor."; } + bool Carrier::EnqueueInterceptorMessage( const InterceptorMessage& interceptor_message) { // enqueue message to interceptor @@ -92,19 +114,22 @@ Interceptor* Carrier::GetInterceptor(int64_t interceptor_id) { } void Carrier::Start() { - // TODO(fleet_executor dev): this start is a faked one, need replace - for (const auto& pair : interceptor_idx_to_interceptor_) { - VLOG(3) << "Fake run is sending start to interceptor " << pair.first << "."; - InterceptorMessage tmp_msg; - tmp_msg.set_src_id(pair.first); - tmp_msg.set_dst_id(pair.first); - tmp_msg.set_message_type(DATA_IS_READY); - MessageBus& message_bus_instance = MessageBus::Instance(); - PADDLE_ENFORCE_EQ(message_bus_instance.IsInit(), true, - platform::errors::PreconditionNotMet( - "Message bus has not been initialized.")); - message_bus_instance.Send(tmp_msg); + MessageBus& msg_bus = MessageBus::Instance(); + PADDLE_ENFORCE_EQ(msg_bus.IsInit(), true, + platform::errors::PreconditionNotMet( + "Message bus has not been initialized.")); + + for (int64_t id : source_interceptor_ids_) { + VLOG(3) << "Carrier Start is sending start to source interceptor " << id + << "."; + InterceptorMessage start_msg; + // source node data_is_ready is send by carrier, so set src_id=-1 + start_msg.set_src_id(-1); + start_msg.set_dst_id(id); + start_msg.set_message_type(DATA_IS_READY); + msg_bus.Send(start_msg); } + std::unique_lock lock(running_mutex_); cond_var_.wait(lock); dev_ctx_->Wait(); @@ -136,6 +161,17 @@ void Carrier::SetCreatingFlag(bool flag) { creating_interceptors_ = flag; creating_flag_mutex_.unlock(); if (!flag) { + for (auto& pair : interceptor_idx_to_interceptor_) { + // update the source interceptor id + if (std::find(source_interceptor_ids_.begin(), + source_interceptor_ids_.end(), + pair.first) == source_interceptor_ids_.end()) { + auto task = pair.second->GetTaskNode(); + if (task != nullptr && task->upstream().empty()) { + source_interceptor_ids_.emplace_back(pair.first); + } + } + } // finish create interceptors outside, handle tmp messsages HandleTmpMessages(); } @@ -156,32 +192,70 @@ void Carrier::HandleTmpMessages() { message_tmp_.clear(); } +static std::shared_ptr GetGC( + const platform::Place& place) { + int64_t max_memory_size = framework::GetEagerDeletionThreshold(); + std::shared_ptr gc; + if (max_memory_size >= 0) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (platform::is_gpu_place(place)) { + if (framework::IsFastEagerDeletionModeEnabled()) { + gc.reset(new framework::UnsafeFastGPUGarbageCollector( + BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size)); + } + } +#endif + } // max_memory_size >= 0 + + return gc; +} + void Carrier::CreateInterceptors() { + if (runtime_graph_->intercepter_id_to_node().empty()) return; + + auto gc = GetGC(place_); + // create each Interceptor - if (!interceptor_id_to_node_.empty()) { - // no auto init since there is no config - for (const auto& item : interceptor_id_to_node_) { - int64_t interceptor_id = item.first; - TaskNode* task_node = item.second; - - // TODO(wangxi): use node_type to select different Interceptor - auto interceptor = - std::make_unique(interceptor_id, task_node); - interceptor->SetPlace(place_); - interceptor->SetMiniBatchScope(minibatch_scope_); - interceptor->SetMicroBatchScope(microbatch_scopes_); - interceptor->SetRootScope(root_scope_); - SetInterceptor(interceptor_id, 
std::move(interceptor)); - VLOG(3) << "Create Interceptor with interceptor id: " << interceptor_id - << "."; + // no auto init since there is no config + for (const auto& item : runtime_graph_->intercepter_id_to_node()) { + int64_t interceptor_id = item.first; + TaskNode* task_node = item.second; + + PADDLE_ENFORCE_LT( + task_node->run_at_offset(), task_node->run_per_steps(), + platform::errors::InvalidArgument( + "Interceptor's run_at_offset must < run_per_steps, must now " + "run_at_offset=%ld run_per_steps=%ld", + task_node->run_at_offset(), task_node->run_per_steps())); + + std::unique_ptr interceptor; + if (task_node->type().empty()) { + // TODO(wangxi): delete this in future + interceptor.reset(new Interceptor(interceptor_id, task_node)); + } else { + interceptor = InterceptorFactory::Create(task_node->type(), + interceptor_id, task_node); + } + interceptor->SetPlace(place_); + interceptor->SetMiniBatchScope(minibatch_scope_); + interceptor->SetMicroBatchScope(microbatch_scopes_); + interceptor->SetRootScope(root_scope_); + interceptor->SetGC(gc); + + SetInterceptor(interceptor_id, std::move(interceptor)); + VLOG(3) << "Create Interceptor with interceptor id: " << interceptor_id + << " with type: " << task_node->type() << "."; + + if (task_node->upstream().empty()) { + source_interceptor_ids_.emplace_back(interceptor_id); } - // The carrier will be always waiting for outside initializer - // since there is no interceptor has been created during auto init - creating_flag_mutex_.lock(); - creating_interceptors_ = false; - creating_flag_mutex_.unlock(); - HandleTmpMessages(); } + // The carrier will be always waiting for outside initializer + // since there is no interceptor has been created during auto init + creating_flag_mutex_.lock(); + creating_interceptors_ = false; + creating_flag_mutex_.unlock(); + HandleTmpMessages(); } } // namespace distributed diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index c4c6a41846474..0c54201c94034 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -38,6 +39,7 @@ namespace distributed { class TaskNode; class InterceptorMessageServiceImpl; +class RuntimeGraph; // A singleton MessageBus class Carrier final { @@ -47,13 +49,13 @@ class Carrier final { return carrier; } - void Init( - const std::unordered_map& interceptor_id_to_node, - framework::Scope* root_scope, framework::Scope* minibatch_scope, - const std::vector& microbatch_scopes, - const platform::Place& place); + void Init(std::shared_ptr runtime_graph, + framework::Scope* root_scope, framework::Scope* minibatch_scope, + const std::vector& microbatch_scopes, + const platform::Place& place); ~Carrier(); + void Release(); // Enqueue a message to corresponding interceptor id bool EnqueueInterceptorMessage(const InterceptorMessage& interceptor_message); @@ -83,13 +85,12 @@ class Carrier final { void HandleTmpMessages(); - // interceptor logic id to the Nodes info - std::unordered_map interceptor_id_to_node_; - // interceptor logic id to actually interceptor std::unordered_map> interceptor_idx_to_interceptor_; + std::vector source_interceptor_ids_; + std::vector message_tmp_{}; std::mutex tmp_message_mutex_; bool creating_interceptors_{true}; @@ -102,7 +103,8 @@ class Carrier final { framework::Scope* root_scope_; framework::Scope* minibatch_scope_; paddle::platform::Place place_; - 
paddle::platform::DeviceContext* dev_ctx_ = nullptr; + paddle::platform::DeviceContext* dev_ctx_{nullptr}; + std::shared_ptr runtime_graph_; }; } // namespace distributed diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index 3008c83069942..35905125a0a43 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" +#include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/operator.h" namespace paddle { @@ -27,19 +28,15 @@ ComputeInterceptor::ComputeInterceptor(int64_t interceptor_id, TaskNode* node) } void ComputeInterceptor::PrepareDeps() { - auto& upstream = GetTaskNode()->upstream(); - auto& downstream = GetTaskNode()->downstream(); + auto& upstream = node_->upstream(); + auto& downstream = node_->downstream(); - // TODO(wangxi): get from task node - int64_t in_buff_size = std::numeric_limits::max(); - int64_t out_buff_size = 2; - - for (auto up_id : upstream) { - in_readys_.emplace(up_id, std::make_pair(in_buff_size, 0)); - in_stops_.emplace(up_id, false); + for (auto up : upstream) { + in_readys_.emplace(up.first, std::make_pair(up.second, 0)); + in_stops_.emplace(up.first, false); } - for (auto down_id : downstream) { - out_buffs_.emplace(down_id, std::make_pair(out_buff_size, 0)); + for (auto down : downstream) { + out_buffs_.emplace(down.first, std::make_pair(down.second, 0)); } // source compute node, should we add a new SourceInterceptor? @@ -50,18 +47,28 @@ void ComputeInterceptor::PrepareDeps() { "Source ComputeInterceptor must run at least one " "times, but now max_run_times=%ld", node_->max_run_times())); + in_readys_.emplace(-1, + std::make_pair(std::numeric_limits::max(), 0)); } + + // If there is no downstream or every downstream is in different rank, + // then this interceptor is the last one for current rank. + // This can be get during init, can be cached for later use. 
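// Run() later uses is_last_ to stop the carrier once the last interceptor of
// this rank has completed a full round of max_run_times micro-batch steps.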
+ is_last_ = downstream.empty(); } void ComputeInterceptor::IncreaseReady(int64_t up_id) { - // source node has no upstream, data_is_ready is send by carrier or others - if (is_source_ && up_id == -1) return; - auto it = in_readys_.find(up_id); PADDLE_ENFORCE_NE(it, in_readys_.end(), platform::errors::NotFound( "Cannot find upstream=%lld in in_readys.", up_id)); + // source node has no upstream, data_is_ready is send by carrier or others + if (is_source_ && up_id == -1) { + it->second.second += GetTaskNode()->max_run_times(); + return; + } + auto max_ready_size = it->second.first; auto ready_size = it->second.second; ready_size += 1; @@ -92,7 +99,11 @@ bool ComputeInterceptor::IsInputReady() { for (auto& ins : in_readys_) { auto ready_size = ins.second.second; // not ready, return false - if (ready_size == 0) return false; + if (ready_size == 0) { + VLOG(3) << "Interceptor " << GetInterceptorId() + << "'s upstreams aren't all ready."; + return false; + } } return true; } @@ -102,17 +113,15 @@ bool ComputeInterceptor::CanWriteOutput() { auto max_buffer_size = outs.second.first; auto used_size = outs.second.second; // full, return false - if (used_size == max_buffer_size) return false; + if (used_size == max_buffer_size) { + VLOG(3) << "Interceptor " << GetInterceptorId() + << "'s out buffer is full."; + return false; + } } return true; } -// only source node need reset -bool ComputeInterceptor::ShouldReset() { - if (is_source_ && step_ == node_->max_run_times()) return true; - return false; -} - void ComputeInterceptor::SendDataReadyToDownStream() { for (auto& outs : out_buffs_) { auto down_id = outs.first; @@ -129,7 +138,9 @@ void ComputeInterceptor::SendDataReadyToDownStream() { InterceptorMessage ready_msg; ready_msg.set_message_type(DATA_IS_READY); - VLOG(3) << "ComputeInterceptor Send data_is_ready msg to " << down_id; + VLOG(3) << "ComputeInterceptor " << interceptor_id_ + << " Send data_is_ready msg to " << down_id + << " for step: " << step_; Send(down_id, ready_msg); } } @@ -146,40 +157,47 @@ void ComputeInterceptor::ReplyCompletedToUpStream() { ready_size)); ins.second.second = ready_size; + VLOG(3) << "ComputeInterceptor " << interceptor_id_ + << " Reply data_is_useless msg to " << up_id + << " for step: " << step_; + if (up_id == -1) return; + InterceptorMessage reply_msg; reply_msg.set_message_type(DATE_IS_USELESS); - VLOG(3) << "ComputeInterceptor Reply data_is_useless msg to " << up_id; Send(up_id, reply_msg); } } -void ComputeInterceptor::Run() { - // If there is no limit, source interceptor can be executed - // an unlimited number of times. 
- // Now source node can only run - if (ShouldReset()) { - for (auto& out_buff : out_buffs_) { - // buffer is using - if (out_buff.second.second != 0) return; +void ComputeInterceptor::RunOps() { + VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops for the " + << step_ + 1 << " time."; + for (auto op : node_->ops()) { + op->Run(*microbatch_scopes_[step_ % node_->max_run_times()], place_); + if (gc_) { + framework::DeleteUnusedTensors( + *microbatch_scopes_[step_ % node_->max_run_times()], op, + node_->unused_vars(), gc_.get()); } - step_ = 0; // reset - return; } +} - while (IsInputReady() && CanWriteOutput() && !ShouldReset()) { +void ComputeInterceptor::Run() { + while (IsInputReady() && CanWriteOutput()) { VLOG(3) << "id=" << GetInterceptorId() << " ComputeInterceptor running"; - // step_ %= node_->max_run_times(); - for (auto op : node_->ops()) { - auto* scope = microbatch_scopes_[step_ % node_->max_slot_nums()]; - op->Run(*scope, place_); - } + RunOps(); ++step_; // send to downstream and increase buff used SendDataReadyToDownStream(); // reply to upstream and decrease ready data ReplyCompletedToUpStream(); + // Try to stop Carrier + if (is_last_ && (step_ % node_->max_run_times() == 0)) { + VLOG(3) << "Interceptor " << GetInterceptorId() + << " is stopping carrier."; + StopCarrier(); + } } } @@ -221,11 +239,6 @@ void ComputeInterceptor::TryStop() { Send(down_id, stop); } stop_ = true; - - if (out_buffs_.size() == 0) { - // TODO(fleet executor dev) need a better place to notify - StopCarrier(); - } } void ComputeInterceptor::Compute(const InterceptorMessage& msg) { diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.h b/paddle/fluid/distributed/fleet_executor/compute_interceptor.h index 97e6da2f00eae..fb82ce76c7bdb 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.h +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.h @@ -25,16 +25,20 @@ class ComputeInterceptor : public Interceptor { public: ComputeInterceptor(int64_t interceptor_id, TaskNode* node); + protected: + virtual void RunOps(); + virtual void SendDataReadyToDownStream(); + virtual void ReplyCompletedToUpStream(); + + int64_t step_{0}; + + private: void PrepareDeps(); void IncreaseReady(int64_t up_id); void DecreaseBuff(int64_t down_id); bool IsInputReady(); bool CanWriteOutput(); - bool ShouldReset(); - - void SendDataReadyToDownStream(); - void ReplyCompletedToUpStream(); void Run(); void Compute(const InterceptorMessage& msg); @@ -42,9 +46,8 @@ class ComputeInterceptor : public Interceptor { void ReceivedStop(int64_t up_id); void TryStop(); - private: bool is_source_{false}; - int64_t step_{0}; + bool is_last_{false}; // upstream_id-->(max_ready_size, ready_size) std::map> in_readys_{}; diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc index ec60ec5fd5901..3a823674d842c 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc @@ -31,14 +31,12 @@ FleetExecutor::FleetExecutor(const std::string& exe_desc_str) { "Error occurs while parsing string to proto")); } -FleetExecutor::~FleetExecutor() { - // Destroy Executor -} +FleetExecutor::~FleetExecutor() { root_scope_->DropKids(); } void FleetExecutor::Init(const framework::ProgramDesc& program_desc, framework::Scope* scope, const platform::Place& place) { - runtime_graph_ = std::make_unique(program_desc, exe_desc_); + runtime_graph_ = 
std::make_shared(program_desc, exe_desc_); root_scope_ = scope; place_ = place; PADDLE_ENFORCE_NOT_NULL(root_scope_, platform::errors::InvalidArgument( @@ -58,8 +56,8 @@ void FleetExecutor::Init(const framework::ProgramDesc& program_desc, void FleetExecutor::InitCarrier() { Carrier& carrier_instance = Carrier::Instance(); if (!carrier_instance.IsInit()) { - carrier_instance.Init(runtime_graph_->intercepter_id_to_node(), root_scope_, - minibatch_scope_, microbatch_scopes_, place_); + carrier_instance.Init(runtime_graph_, root_scope_, minibatch_scope_, + microbatch_scopes_, place_); } } @@ -111,10 +109,17 @@ void FleetExecutor::Run() { message_bus_instance.IsInit(), true, platform::errors::Unavailable("MessageBus has not been init yet.")); carrier_instance.Start(); + for (auto* micro_scop : microbatch_scopes_) { + // By default, we should delete all kid scopes after run executor because + // some operators may create local scope when running, such as while_op. + // But when while_op also create a local executor to run it's sub block, + // the sub scopes it created should not be dropped immediately, because + // while_grad_op will use some variables created during while_op run, so + // we need to keep the kids and wait for the outer executor to drop them. + micro_scop->DropKids(); + } } -void FleetExecutor::Release() { root_scope_->DropKids(); } - void FleetExecutor::CopyParameters(int microbatch_id, const framework::ProgramDesc& program) { auto& global_block = program.Block(0); diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.h b/paddle/fluid/distributed/fleet_executor/fleet_executor.h index 7be18772e9ec9..ac857fb6c38a2 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.h +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.h @@ -39,7 +39,6 @@ class FleetExecutor final { void Init(const framework::ProgramDesc& program_desc, framework::Scope* scope, const platform::Place& place); void Run(); - void Release(); private: DISABLE_COPY_AND_ASSIGN(FleetExecutor); @@ -47,7 +46,7 @@ class FleetExecutor final { void InitCarrier(); void CopyParameters(int microbatch_id, const framework::ProgramDesc& program); FleetExecutorDesc exe_desc_; - std::unique_ptr runtime_graph_; + std::shared_ptr runtime_graph_; framework::Scope* root_scope_; framework::Scope* minibatch_scope_; platform::Place place_; diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto b/paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto index 1b12f1239dcbd..6890c311ec003 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto @@ -21,12 +21,11 @@ message RankInfo { } message FleetExecutorDesc { - optional string grain = 1 [ default = "coarse" ]; - optional int64 cur_rank = 2 [ default = 0 ]; // Rank id of current processor - repeated RankInfo cluster_info = 3; - optional int32 dp_degree = 4 [ default = 1 ]; - optional int32 mp_degree = 5 [ default = 1 ]; - optional int32 pp_degree = 6 [ default = 1 ]; - optional int64 num_micro_batches = 7 [ default = 1 ]; - optional int64 num_slots = 8 [ default = 1 ]; + optional int64 cur_rank = 1 [ default = 0 ]; // Rank id of current processor + repeated RankInfo cluster_info = 2; + optional int32 dp_degree = 3 [ default = 1 ]; + optional int32 mp_degree = 4 [ default = 1 ]; + optional int32 pp_degree = 5 [ default = 1 ]; + optional int64 num_micro_batches = 6 [ default = 1 ]; + optional int64 num_slots = 7 [ default = 1 ]; } 
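The step gating added in this patch is easiest to see with a small worked example. The sketch below is standalone and not part of the patch (the helper name ShouldRun and the printed table are illustrative only); it applies the same predicate as AmplifierInterceptor::RunOps() with the values RuntimeGraph assigns to the LRSched and Optimize nodes: run_per_steps = num_micro_batches for both, run_at_offset = 0 for LRSched and num_micro_batches - 1 for Optimize.

// standalone sketch, illustrative only, not part of this patch
#include <cstdint>
#include <iostream>

// Same gating predicate as AmplifierInterceptor::RunOps().
bool ShouldRun(int64_t step, int64_t run_per_steps, int64_t run_at_offset) {
  return (step % run_per_steps) == run_at_offset;
}

int main() {
  const int64_t m = 4;  // num_micro_batches
  for (int64_t step = 0; step < 2 * m; ++step) {
    std::cout << "step " << step
              << "  lr_sched=" << ShouldRun(step, m, 0)      // fires at 0, 4, ...
              << "  optimize=" << ShouldRun(step, m, m - 1)  // fires at 3, 7, ...
              << "\n";
  }
  return 0;
}

With num_micro_batches = 4 this prints LRSched firing at steps 0 and 4 and Optimize at steps 3 and 7, matching the "4, 0 --> run at step 0, 4, 8, 12" comment in amplifier_interceptor.cc.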
diff --git a/paddle/fluid/distributed/fleet_executor/interceptor.cc b/paddle/fluid/distributed/fleet_executor/interceptor.cc index 40429502825c9..dd7b89c4b8119 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/interceptor.cc @@ -40,22 +40,9 @@ void Interceptor::Join() { void Interceptor::RegisterMsgHandle(MsgHandle handle) { handle_ = handle; } void Interceptor::Handle(const InterceptorMessage& msg) { - if (handle_) { - handle_(msg); - } else { - VLOG(3) << "Interceptor is using default message handler. This handler is " - "only used for test purpose. Check whether you init interceptor " - "in the proper way."; - if (msg.message_type() == DATA_IS_READY) { - VLOG(3) << "Fake handler is sending stop message to it self."; - InterceptorMessage msg; - msg.set_message_type(STOP); - Send(interceptor_id_, msg); - } else if (msg.message_type() == STOP) { - stop_ = true; - StopCarrier(); - } - } + PADDLE_ENFORCE_NOT_NULL(handle_, platform::errors::PreconditionNotMet( + "Message handle is not registered.")); + handle_(msg); } void Interceptor::StopCarrier() { diff --git a/paddle/fluid/distributed/fleet_executor/interceptor.h b/paddle/fluid/distributed/fleet_executor/interceptor.h index ef1ffb1a53b3f..b0c1e46f03138 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor.h +++ b/paddle/fluid/distributed/fleet_executor/interceptor.h @@ -31,6 +31,7 @@ namespace paddle { namespace framework { class Scope; +class GarbageCollector; } namespace distributed { @@ -73,6 +74,9 @@ class Interceptor { void SetMicroBatchScope(const std::vector& scopes) { microbatch_scopes_ = scopes; } + void SetGC(const std::shared_ptr& gc) { + gc_ = gc; + } TaskNode* GetTaskNode() const { return node_; } @@ -94,6 +98,7 @@ class Interceptor { framework::Scope* root_scope_{nullptr}; framework::Scope* minibatch_scope_{nullptr}; std::vector microbatch_scopes_{}; + std::shared_ptr gc_{nullptr}; private: // pool the local mailbox, parse the Message diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc index 2071477372c9e..f087de69fa96b 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.cc +++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc @@ -14,6 +14,7 @@ #include #include +#include #include #include "paddle/fluid/distributed/fleet_executor/carrier.h" @@ -56,6 +57,10 @@ void MessageBus::Init( bool MessageBus::IsInit() const { return is_init_; } MessageBus::~MessageBus() { + // NOTE: fleet_executor inits carrier before message bus, + // therefore the message bus's destructor will be called first + Carrier& carrier = Carrier::Instance(); + carrier.Release(); VLOG(3) << "Message bus releases resource."; #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ !defined(PADDLE_WITH_ASCEND_CL) @@ -86,6 +91,8 @@ bool MessageBus::Send(const InterceptorMessage& interceptor_message) { << retry_time << " times retries."; return true; } + VLOG(3) << "Message bus sends failed, retry after 1 seconds."; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); } VLOG(3) << "Message bus sends inter rank fail after 10 times retries."; return false; @@ -117,16 +124,40 @@ void MessageBus::ListenPort() { brpc::ServerOptions options; options.idle_timeout_sec = -1; int retry_times = 0; - int interval = 1000; + int interval = 100; while (server_.Start(ip_for_brpc, &options) != 0) { ++retry_times; LOG(INFO) << "Message bus is retring for starting brpc for " << retry_times 
<< " times. And will retry after " << interval / 1000 << " seconds."; std::this_thread::sleep_for(std::chrono::milliseconds(interval)); - interval += 2000; + interval += 500; } LOG(INFO) << "Message bus's listen port thread starts successful."; + + std::set visit; + InterceptorMessage tmp_msg; + tmp_msg.set_ctrl_message(true); + for (auto pair : interceptor_id_to_rank_) { + if (rank_to_addr_.at(pair.second) == addr_) { + tmp_msg.set_src_id(pair.first); + } + } + for (auto pair : interceptor_id_to_rank_) { + int64_t rank = pair.second; + if (rank_to_addr_.at(rank) == addr_) { + continue; + } + tmp_msg.set_dst_id(pair.first); + if (visit.find(rank) == visit.end()) { + VLOG(3) << "Message bus is testing connection for rank: " << rank << "."; + visit.insert(rank); + while (!Send(tmp_msg)) { + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + VLOG(3) << "Message bus has connected to rank: " << rank << "."; + } + } #else LOG(WARNING) << "Fleet executor's ListenPort() is a fake function when Paddle is " @@ -136,6 +167,9 @@ void MessageBus::ListenPort() { } bool MessageBus::IsSameRank(int64_t src_id, int64_t dst_id) { + // -1 is sent by carrier to source interceptor + if (src_id == -1) src_id = dst_id; + // check whether the dst is the same rank or different rank with src const auto& src_rank = interceptor_id_to_rank_.find(src_id); const auto& dst_rank = interceptor_id_to_rank_.find(dst_id); diff --git a/paddle/fluid/distributed/fleet_executor/runtime_graph.cc b/paddle/fluid/distributed/fleet_executor/runtime_graph.cc index 3a76bd43f9d55..32f9e36e53037 100644 --- a/paddle/fluid/distributed/fleet_executor/runtime_graph.cc +++ b/paddle/fluid/distributed/fleet_executor/runtime_graph.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/distributed/fleet_executor/runtime_graph.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" +#include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" @@ -100,7 +101,9 @@ std::vector RuntimeGraph::functionality_order = { RuntimeGraph::RuntimeGraph(const ProgramDesc& program, const FleetExecutorDesc& exe_desc) : exe_desc_(exe_desc) { - if (exe_desc.grain() == "coarse") { + if (exe_desc.pp_degree() == 1) { + OriginProgramCompile(program); + } else { SplitProgramBasedFunctionality(program); AssignTaskToIntercepter(); FakeDependence(); @@ -108,10 +111,32 @@ RuntimeGraph::RuntimeGraph(const ProgramDesc& program, } } +void RuntimeGraph::OriginProgramCompile(const ProgramDesc& program) { + int64_t cur_rank = exe_desc_.cur_rank(); + int64_t max_run_times = exe_desc_.num_micro_batches(); + int64_t max_slot_nums = exe_desc_.num_slots(); + + auto task_node = std::make_unique(program, cur_rank, max_run_times, + max_slot_nums); + // TODO(wangxi): add skip vars + auto unused_vars = + framework::GetUnusedVars(program.Block(0), task_node->unique_ops(), {}); + task_node->SetType("Compute"); + task_node->SetUnusedVars(unused_vars); + + task_nodes_.emplace_back(std::move(task_node)); + int64_t task_id = task_nodes_[0]->task_id(); + intercepter_id_to_rank_.insert({task_id, cur_rank}); + intercepter_id_to_node_.insert({task_id, task_nodes_[0].get()}); +} + void RuntimeGraph::SplitProgramBasedFunctionality(const ProgramDesc& program) { for (const auto& op_desc : program.Block(0).AllOps()) { ops_.emplace_back(OpRegistry::CreateOp(*op_desc)); } + // TODO(wangxi): how to gc pipeline backward send + auto unused_vars = 
framework::GetUnusedVars(program.Block(0), ops_, {}); + std::unordered_map> role_to_ops; for (const auto& op : ops_) { int32_t op_role = op->Attr("op_role"); @@ -135,33 +160,44 @@ void RuntimeGraph::SplitProgramBasedFunctionality(const ProgramDesc& program) { } role_to_ops.at(new_op_role_id).emplace_back(op.get()); } + int64_t cur_rank = exe_desc_.cur_rank(); DistCoordSys coord_sys(exe_desc_.dp_degree(), exe_desc_.pp_degree(), exe_desc_.mp_degree()); const auto& coord = coord_sys.RankToCoord(cur_rank); int pipeline_stage = coord.pp_idx; int64_t num_pipeline_stages = exe_desc_.pp_degree(); + // TODO(fleet_executor dev): start up steps should be a config `num_slots` int64_t start_up_steps = num_pipeline_stages - pipeline_stage; int64_t num_micro_batches = exe_desc_.num_micro_batches(); int64_t task_id = cur_rank * functionality_order.size(); for (std::size_t i = 0; i < functionality_order.size(); ++i) { + VLOG(3) << "Runtime graph is creating task node for: " << task_id << "."; OpRole role = functionality_order[i]; int32_t role_id = static_cast(role); int64_t max_run_times = num_micro_batches; int64_t max_slot_nums = start_up_steps; - if (IsLRSched(role_id) || IsOptimize(role_id)) { - max_run_times = 1; - max_slot_nums = 1; + // NOTE: use short path, each interceptor should run for max_run_times + std::vector task_ops{}; + if (role_to_ops.find(role_id) != role_to_ops.end()) { + task_ops = role_to_ops.at(role_id); } - if (role_to_ops.find(role_id) == role_to_ops.end()) { - task_nodes_.emplace_back(TaskNode::CreateEmptyTaskNode( - role_id, cur_rank, task_id, max_run_times, max_slot_nums)); + std::unique_ptr task_node = std::make_unique( + role_id, task_ops, cur_rank, task_id, max_run_times, max_slot_nums); + if (IsLRSched(role_id) || IsOptimize(role_id)) { + task_node->SetType("Amplifier"); + if (IsLRSched(role_id)) { + task_node->SetRunPerSteps(max_run_times); + } else { + task_node->SetRunAtOffset(max_run_times - 1); + task_node->SetRunPerSteps(max_run_times); + } } else { - task_nodes_.emplace_back( - TaskNode::CreateTaskNode(role_id, role_to_ops.at(role_id), cur_rank, - task_id, max_run_times, max_slot_nums)); + task_node->SetType("Compute"); } + task_node->SetUnusedVars(unused_vars); + task_nodes_.emplace_back(std::move(task_node)); ++task_id; } } @@ -176,42 +212,77 @@ void RuntimeGraph::FakeDependence() { downstream_coord.pp_idx += 1; int64_t pp_upstream = coord_sys.CoordToRank(upstream_coord); int64_t pp_downstream = coord_sys.CoordToRank(downstream_coord); + bool is_first_stage = (pp_upstream == -1); + bool is_last_stage = (pp_downstream == -1); + int32_t num_of_functionality = functionality_order.size(); - // lr -> forward -> backward -> optimize - // | | - // lr -> forward -> backward -> optimize + // lr(1:m) -> forward -> backward -> (m:1)optimize + // ↑ ↓ + // lr(1:m) -> forward -> backward -> (m:1)optimize + // ↑ ↓ + // lr(1:m) -> forward -> backward -> (m:1)optimize for (std::size_t i = 0; i < task_nodes_.size(); ++i) { - if (i != 0) { - task_nodes_[i]->AddUpstreamTask(cur_rank * num_of_functionality + i - 1); + auto& node = task_nodes_[i]; + bool is_forward = IsForward(node->role()); + bool is_backward = IsBackward(node->role()); + + int64_t cur_id = cur_rank * num_of_functionality + i; + int64_t prev_id = cur_id - 1; + int64_t next_id = cur_id + 1; + + int64_t upstream_id = pp_upstream * num_of_functionality + i; + int64_t downstream_id = pp_downstream * num_of_functionality + i; + + // 1F1B, last stage pp_buff_size should be 1, while first stage + // pp_buff_size should be 
pp_degree + int64_t pp_buff_size = exe_desc_.pp_degree() - coord.pp_idx; + + std::vector> ups; + std::vector> downs; + + if (i != 0) { // not lr + int64_t buff_size = is_backward ? pp_buff_size : 2; + ups.emplace_back(prev_id, buff_size); } - if (i != task_nodes_.size() - 1) { - task_nodes_[i]->AddDownstreamTask(cur_rank * num_of_functionality + i + - 1); + if (i != task_nodes_.size() - 1) { // not optimize + int64_t buff_size = is_forward ? pp_buff_size : 2; + downs.emplace_back(next_id, buff_size); } - if (IsForward(task_nodes_[i]->role())) { - if (pp_upstream != -1) { - task_nodes_[i]->AddUpstreamTask(pp_upstream * num_of_functionality + i); + + if (is_forward) { + if (!is_first_stage) { + ups.emplace_back(upstream_id, 2); } - if (pp_downstream != -1) { - task_nodes_[i]->AddDownstreamTask(pp_downstream * num_of_functionality + - i); + if (!is_last_stage) { + downs.emplace_back(downstream_id, 2); } - } else if (IsBackward(task_nodes_[i]->role())) { - if (pp_downstream != -1) { - task_nodes_[i]->AddUpstreamTask(pp_downstream * num_of_functionality + - i); + } else if (is_backward) { + if (!is_last_stage) { + ups.emplace_back(downstream_id, 2); } - if (pp_upstream != -1) { - task_nodes_[i]->AddDownstreamTask(pp_upstream * num_of_functionality + - i); + if (!is_first_stage) { + downs.emplace_back(upstream_id, 2); } } + + for (auto up : ups) { + VLOG(3) << "Task(" << cur_id << ") AddUpstream Task(" << up.first + << ") with buff_size=" << up.second; + node->AddUpstreamTask(up.first, up.second); + } + for (auto down : downs) { + VLOG(3) << "Task(" << cur_id << ") AddDownstream Task(" << down.first + << ") with buff_size=" << down.second; + node->AddDownstreamTask(down.first, down.second); + } } } void RuntimeGraph::AssignTaskToIntercepter() { for (const auto& task : task_nodes_) { int64_t intercepter_id = task->task_id(); + VLOG(3) << "Runtime graph is assigning task to interceptor: " + << intercepter_id << " with type: " << task->type() << "."; if (intercepter_id_to_node_.find(intercepter_id) != intercepter_id_to_node_.end()) { PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/distributed/fleet_executor/runtime_graph.h b/paddle/fluid/distributed/fleet_executor/runtime_graph.h index b19456962d631..26b758767c07f 100644 --- a/paddle/fluid/distributed/fleet_executor/runtime_graph.h +++ b/paddle/fluid/distributed/fleet_executor/runtime_graph.h @@ -52,6 +52,7 @@ class RuntimeGraph final { void FakeDependence(); void AssignTaskToIntercepter(); void FakeRuntimeInfo(); + void OriginProgramCompile(const ProgramDesc& program); // LRSched, Forward, Backward, Optimize static std::vector functionality_order; std::vector> task_nodes_; diff --git a/paddle/fluid/distributed/fleet_executor/task_node.cc b/paddle/fluid/distributed/fleet_executor/task_node.cc index 07fd091b04d97..e92ab09d481e8 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.cc +++ b/paddle/fluid/distributed/fleet_executor/task_node.cc @@ -13,6 +13,7 @@ // limitations under the License. 
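// With this change TaskNode builds its operators from the given ProgramDesc
// (ops_vec_ owns them, ops_ keeps raw pointers) and records a per-edge buffer
// size in the upstream_/downstream_ maps; see the header changes below.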
#include "paddle/fluid/distributed/fleet_executor/task_node.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" namespace paddle { @@ -30,6 +31,12 @@ TaskNode::TaskNode(const framework::ProgramDesc& program, int64_t rank, // Should be serially invoked, not thread-safe static int64_t task_node_cnt = 0; task_id_ = task_node_cnt++; + for (const auto& op_desc : program.Block(0).AllOps()) { + ops_vec_.emplace_back(framework::OpRegistry::CreateOp(*op_desc)); + } + for (const auto& op : ops_vec_) { + ops_.emplace_back(op.get()); + } } TaskNode::TaskNode(int32_t role, const std::vector& ops, @@ -50,30 +57,14 @@ TaskNode::TaskNode(int32_t role, int64_t rank, int64_t task_id, max_run_times_(max_run_times), max_slot_nums_(max_slot_nums) {} -std::unique_ptr TaskNode::CreateEmptyTaskNode(int32_t role, - int64_t rank, - int64_t task_id, - int64_t max_run_times, - int64_t max_slot_nums) { - return std::make_unique(role, rank, task_id, max_run_times, - max_slot_nums); -} - -std::unique_ptr TaskNode::CreateTaskNode( - int32_t role, const std::vector& ops, int64_t rank, - int64_t task_id, int64_t max_run_times, int64_t max_slot_nums) { - return std::make_unique(role, ops, rank, task_id, max_run_times, - max_slot_nums); +bool TaskNode::AddUpstreamTask(int64_t task_id, int64_t buff_size) { + const auto& ret = upstream_.emplace(task_id, buff_size); + return ret.second; } -bool TaskNode::AddUpstreamTask(int64_t task_id) { - const auto& ret = upstream_.insert(task_id); - return *ret.first == task_id; -} - -bool TaskNode::AddDownstreamTask(int64_t task_id) { - const auto& ret = downstream_.insert(task_id); - return *ret.first == task_id; +bool TaskNode::AddDownstreamTask(int64_t task_id, int64_t buff_size) { + const auto& ret = downstream_.emplace(task_id, buff_size); + return ret.second; } std::string TaskNode::DebugString() const { @@ -85,5 +76,34 @@ std::string TaskNode::DebugString() const { os << "\n"; return os.str(); } + +void TaskNode::SetRunPerSteps(int64_t value) { + PADDLE_ENFORCE_GE(value, 1, + platform::errors::InvalidArgument( + "run_per_steps must >= 1, but received %ld", value)); + run_per_steps_ = value; +} + +void TaskNode::SetRunAtOffset(int64_t value) { + PADDLE_ENFORCE_GE(value, 0, + platform::errors::InvalidArgument( + "run_at_offset must >= 0, but received %ld", value)); + run_at_offset_ = value; +} + +void TaskNode::SetReplyUpPerSteps(int64_t value) { + PADDLE_ENFORCE_GE( + value, 1, platform::errors::InvalidArgument( + "reply_up_per_steps must >= 1, but received %ld", value)); + reply_up_per_steps_ = value; +} + +void TaskNode::SetSendDownPerSteps(int64_t value) { + PADDLE_ENFORCE_GE( + value, 1, platform::errors::InvalidArgument( + "send_down_per_steps must >= 1, but received %ld", value)); + send_down_per_steps_ = value; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/task_node.h b/paddle/fluid/distributed/fleet_executor/task_node.h index 8f4f9d80c42a5..37105bdd230ab 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.h +++ b/paddle/fluid/distributed/fleet_executor/task_node.h @@ -44,38 +44,69 @@ class TaskNode final { int32_t role() const { return role_; } int64_t max_run_times() const { return max_run_times_; } int64_t max_slot_nums() const { return max_slot_nums_; } - const std::unordered_set& upstream() const { return upstream_; } - const std::unordered_set& downstream() const { return downstream_; } + int64_t run_per_steps() const { return run_per_steps_; } + int64_t 
run_at_offset() const { return run_at_offset_; } + int64_t reply_up_per_steps() const { return reply_up_per_steps_; } + int64_t send_down_per_steps() const { return send_down_per_steps_; } + const std::unordered_map& upstream() const { + return upstream_; + } + const std::unordered_map& downstream() const { + return downstream_; + } const std::string& type() const { return type_; } const paddle::framework::ProgramDesc& program() const { return program_; } const std::vector& ops() const { return ops_; } + const std::vector>& unique_ops() const { + return ops_vec_; + } + const std::unordered_map>& + unused_vars() const { + return unused_vars_; + } - bool AddUpstreamTask(int64_t task_id); - bool AddDownstreamTask(int64_t task_id); - std::string DebugString() const; + void SetRunPerSteps(int64_t value); + void SetRunAtOffset(int64_t value); + void SetReplyUpPerSteps(int64_t value); + void SetSendDownPerSteps(int64_t value); + void SetType(const std::string& type) { type_ = type; } + void SetUnusedVars( + const std::unordered_map>& + unused_vars) { + unused_vars_ = unused_vars; + } - static std::unique_ptr CreateEmptyTaskNode(int32_t role, - int64_t rank, - int64_t task_id, - int64_t max_run_times, - int64_t max_slot_nums); - static std::unique_ptr CreateTaskNode( - int32_t role, const std::vector& ops, int64_t rank, - int64_t task_id, int64_t max_run_times, int64_t max_slot_nums); + // upstream need buffs? + bool AddUpstreamTask(int64_t task_id, int64_t buff_size = 1); + bool AddDownstreamTask(int64_t task_id, int64_t buff_size = 1); + std::string DebugString() const; private: DISABLE_COPY_AND_ASSIGN(TaskNode); TaskNode() = default; + // ops_ will be removed in the future std::vector ops_; - std::unordered_set upstream_; - std::unordered_set downstream_; + // task_id-->buff_size + std::unordered_map upstream_; + std::unordered_map downstream_; framework::ProgramDesc program_; + std::vector> ops_vec_; + std::unordered_map> + unused_vars_; + int32_t role_; int64_t rank_; int64_t task_id_; int64_t max_run_times_; int64_t max_slot_nums_; + int64_t run_per_steps_{1}; + int64_t run_at_offset_{0}; + // one input produces multi times output + int64_t reply_up_per_steps_{1}; + // one output need multi times input + int64_t send_down_per_steps_{1}; + std::string type_; }; diff --git a/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt index b0f00d7058476..d4587b90c87f3 100644 --- a/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt @@ -4,6 +4,12 @@ cc_test(interceptor_ping_pong_test SRCS interceptor_ping_pong_test.cc DEPS fleet set_source_files_properties(compute_interceptor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(compute_interceptor_test SRCS compute_interceptor_test.cc DEPS fleet_executor ${BRPC_DEPS}) +set_source_files_properties(interceptor_pipeline_short_path_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(interceptor_pipeline_short_path_test SRCS interceptor_pipeline_short_path_test.cc DEPS fleet_executor ${BRPC_DEPS}) + +set_source_files_properties(interceptor_pipeline_long_path_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(interceptor_pipeline_long_path_test SRCS interceptor_pipeline_long_path_test.cc DEPS fleet_executor ${BRPC_DEPS}) + set_source_files_properties(compute_interceptor_run_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) 
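Note on the task_node.h changes above: upstream_ and downstream_ are now task_id -> buff_size maps, and AddUpstreamTask/AddDownstreamTask report through emplace() whether a new edge was actually recorded. The toy, self-contained sketch below is an assumed simplification, not the real TaskNode class; it only demonstrates that return-value behaviour. The default buff_size of 1 matches the new declarations, while the pipeline tests later in this patch pass explicit values such as 2 or 3.

#include <cstdint>
#include <iostream>
#include <unordered_map>

struct ToyTaskNode {  // hypothetical stand-in for TaskNode
  std::unordered_map<int64_t, int64_t> upstream_;  // task_id -> buff_size
  bool AddUpstreamTask(int64_t task_id, int64_t buff_size = 1) {
    // emplace() leaves an existing entry untouched and returns false,
    // so a duplicate edge (even with a different buff_size) is rejected.
    return upstream_.emplace(task_id, buff_size).second;
  }
};

int main() {
  ToyTaskNode node;
  std::cout << node.AddUpstreamTask(7, 3) << "\n";  // prints 1: edge inserted
  std::cout << node.AddUpstreamTask(7, 5) << "\n";  // prints 0: already present
  return 0;
}
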
cc_test(compute_interceptor_run_op_test SRCS compute_interceptor_run_op_test.cc DEPS fleet_executor ${BRPC_DEPS} op_registry fill_constant_op elementwise_add_op scope device_context) diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc index 2d9776738f831..c5348db83e029 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc @@ -61,15 +61,15 @@ TEST(ComputeInterceptor, Compute) { std::vector scopes = {scope, scope}; platform::Place place = platform::CPUPlace(); + Carrier& carrier = Carrier::Instance(); + MessageBus& msg_bus = MessageBus::Instance(); msg_bus.Init({{0, 0}, {1, 0}}, {{0, "127.0.0.0:0"}}, "127.0.0.0:0"); - Carrier& carrier = Carrier::Instance(); - // FIXME: don't delete, otherwise interceptor will use undefined node TaskNode* node_a = - new TaskNode(0, ops, 0, 0, 2, 2); // role, ops, rank, task_id - TaskNode* node_b = new TaskNode(0, 0, 1, 0, 0); + new TaskNode(0, ops, 0, 0, 2, 0); // role, ops, rank, task_id + TaskNode* node_b = new TaskNode(0, 0, 1, 2, 0); // a->b node_a->AddDownstreamTask(1); @@ -90,13 +90,6 @@ TEST(ComputeInterceptor, Compute) { msg.set_src_id(-1); msg.set_dst_id(0); carrier.EnqueueInterceptorMessage(msg); - - // stop - InterceptorMessage stop; - stop.set_message_type(STOP); - stop.set_src_id(-1); - stop.set_dst_id(0); - carrier.EnqueueInterceptorMessage(stop); } } // namespace distributed diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc index 3cfd3073c8cb9..44dc0c9bc9b0c 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc @@ -35,35 +35,29 @@ class StartInterceptor : public Interceptor { void NOP(const InterceptorMessage& msg) { if (msg.message_type() == STOP) { stop_ = true; + InterceptorMessage stop; + stop.set_message_type(STOP); + Send(1, stop); // stop 1, compute return; } std::cout << GetInterceptorId() << " recv msg from " << msg.src_id() << std::endl; - ++count_; - if (count_ == 3) { - InterceptorMessage stop; - stop.set_message_type(STOP); - Send(msg.dst_id(), stop); // stop 0, this - Send(msg.src_id(), stop); // stop 1, compute - } } - int count_{0}; }; TEST(ComputeInterceptor, Compute) { + Carrier& carrier = Carrier::Instance(); MessageBus& msg_bus = MessageBus::Instance(); msg_bus.Init({{0, 0}, {1, 0}, {2, 0}}, {{0, "127.0.0.0:0"}}, "127.0.0.0:0"); - Carrier& carrier = Carrier::Instance(); - // NOTE: don't delete, otherwise interceptor will use undefined node - TaskNode* node_a = new TaskNode(0, 0, 0, 0, 0); // role, rank, task_id - TaskNode* node_b = new TaskNode(0, 0, 1, 0, 0); - TaskNode* node_c = new TaskNode(0, 0, 2, 0, 0); + TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0); // role, rank, task_id + TaskNode* node_b = new TaskNode(0, 0, 1, 3, 0); + TaskNode* node_c = new TaskNode(0, 0, 2, 3, 0); // a->b->c - node_a->AddDownstreamTask(1); - node_b->AddUpstreamTask(0); + node_a->AddDownstreamTask(1, 3); + node_b->AddUpstreamTask(0, 3); node_b->AddDownstreamTask(2); node_c->AddUpstreamTask(1); diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc new file 
mode 100644 index 0000000000000..b3fdb0b7adff0 --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc @@ -0,0 +1,94 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/distributed/fleet_executor/carrier.h" +#include "paddle/fluid/distributed/fleet_executor/interceptor.h" +#include "paddle/fluid/distributed/fleet_executor/message_bus.h" +#include "paddle/fluid/distributed/fleet_executor/task_node.h" + +namespace paddle { +namespace distributed { + +void LinkNodes(const std::vector& nodes) { + size_t size = nodes.size(); + if (size <= 1) return; + + { // i = 0 + TaskNode* now = nodes[0]; + TaskNode* next = nodes[1]; + now->AddDownstreamTask(next->task_id()); + } + { // i = size - 1 + TaskNode* prev = nodes[size - 2]; + TaskNode* now = nodes[size - 1]; + now->AddUpstreamTask(prev->task_id()); + } + + for (size_t i = 1; i < size - 1; ++i) { + TaskNode* prev = nodes[i - 1]; + TaskNode* now = nodes[i]; + TaskNode* next = nodes[i + 1]; + + now->AddUpstreamTask(prev->task_id()); + now->AddDownstreamTask(next->task_id()); + } +} + +TEST(AmplifierInterceptor, Amplifier) { + Carrier& carrier = Carrier::Instance(); + MessageBus& msg_bus = MessageBus::Instance(); + msg_bus.Init({{0, 0}, {1, 0}, {2, 0}, {3, 0}, {4, 0}, {5, 0}}, + {{0, "127.0.0.0:0"}}, "127.0.0.0:0"); + + int64_t micro_steps = 3; + + // NOTE: don't delete, otherwise interceptor will use undefined node + TaskNode* node_a = new TaskNode(0, 0, 0, 1, 0); // role, rank, task_id + TaskNode* node_b = new TaskNode(0, 0, 1, 1, 0); + TaskNode* node_c = new TaskNode(0, 0, 2, 1, 0); + TaskNode* node_d = new TaskNode(0, 0, 3, 1, 0); + TaskNode* node_e = new TaskNode(0, 0, 4, 1, 0); + TaskNode* node_f = new TaskNode(0, 0, 5, 1, 0); + + // a->b->c->d->e->f + LinkNodes({node_a, node_b, node_c, node_d, node_e, node_f}); + + // LR->b(1:3)->F->B->e(3:1)->U + node_b->SetReplyUpPerSteps(micro_steps); + node_e->SetSendDownPerSteps(micro_steps); + + carrier.SetInterceptor(0, InterceptorFactory::Create("Compute", 0, node_a)); + carrier.SetInterceptor(1, InterceptorFactory::Create("Amplifier", 1, node_b)); + carrier.SetInterceptor(2, InterceptorFactory::Create("Compute", 2, node_c)); + carrier.SetInterceptor(3, InterceptorFactory::Create("Compute", 3, node_d)); + carrier.SetInterceptor(4, InterceptorFactory::Create("Amplifier", 4, node_e)); + carrier.SetInterceptor(5, InterceptorFactory::Create("Compute", 5, node_f)); + + carrier.SetCreatingFlag(false); + + // start + InterceptorMessage msg; + msg.set_message_type(DATA_IS_READY); + msg.set_src_id(-1); + msg.set_dst_id(0); + carrier.EnqueueInterceptorMessage(msg); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc new file mode 100644 index 
0000000000000..936a970c05f7c --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc @@ -0,0 +1,109 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/distributed/fleet_executor/carrier.h" +#include "paddle/fluid/distributed/fleet_executor/interceptor.h" +#include "paddle/fluid/distributed/fleet_executor/message_bus.h" +#include "paddle/fluid/distributed/fleet_executor/task_node.h" + +namespace paddle { +namespace distributed { + +int64_t GetBuffSize( + const std::map, int64_t> buffs, + TaskNode* from, TaskNode* to) { + if (buffs.find({from, to}) != buffs.end()) { + return buffs.at({from, to}); + } + if (buffs.find({to, from}) != buffs.end()) { + return buffs.at({to, from}); + } + return 2; // set default 2 +} + +void LinkNodes(const std::vector& nodes, + const std::map, int64_t> buffs) { + size_t size = nodes.size(); + if (size <= 1) return; + + { // i = 0 + TaskNode* now = nodes[0]; + TaskNode* next = nodes[1]; + auto buff_size = GetBuffSize(buffs, now, next); + now->AddDownstreamTask(next->task_id(), buff_size); + } + { // i = size - 1 + TaskNode* prev = nodes[size - 2]; + TaskNode* now = nodes[size - 1]; + auto buff_size = GetBuffSize(buffs, prev, now); + now->AddUpstreamTask(prev->task_id(), buff_size); + } + + for (size_t i = 1; i < size - 1; ++i) { + TaskNode* prev = nodes[i - 1]; + TaskNode* now = nodes[i]; + TaskNode* next = nodes[i + 1]; + + auto buff_size = GetBuffSize(buffs, prev, now); + now->AddUpstreamTask(prev->task_id(), buff_size); + + buff_size = GetBuffSize(buffs, now, next); + now->AddDownstreamTask(next->task_id(), buff_size); + } +} + +TEST(AmplifierInterceptor, Amplifier) { + Carrier& carrier = Carrier::Instance(); + MessageBus& msg_bus = MessageBus::Instance(); + msg_bus.Init({{0, 0}, {1, 0}, {2, 0}, {3, 0}}, {{0, ""}}, ""); + + int64_t micro_steps = 6; + + // NOTE: don't delete, otherwise interceptor will use undefined node + TaskNode* node_a = + new TaskNode(0, 0, 0, micro_steps, 0); // role, rank, task_id + TaskNode* node_b = new TaskNode(0, 0, 1, 3, 0); + TaskNode* node_c = new TaskNode(0, 0, 2, 3, 0); + TaskNode* node_d = new TaskNode(0, 0, 3, micro_steps, 0); + + // a->b->c->d + // LR->F->B->U + LinkNodes({node_a, node_b, node_c, node_d}, {{{node_b, node_c}, 1}}); + + node_a->SetRunPerSteps(micro_steps); + node_d->SetRunPerSteps(micro_steps); + node_d->SetRunAtOffset(micro_steps - 1); + + carrier.SetInterceptor(0, InterceptorFactory::Create("Amplifier", 0, node_a)); + carrier.SetInterceptor(1, InterceptorFactory::Create("Compute", 1, node_b)); + carrier.SetInterceptor(2, InterceptorFactory::Create("Compute", 2, node_c)); + carrier.SetInterceptor(3, InterceptorFactory::Create("Amplifier", 3, node_d)); + + carrier.SetCreatingFlag(false); + + // start + InterceptorMessage msg; + msg.set_message_type(DATA_IS_READY); + msg.set_src_id(-1); + msg.set_dst_id(0); + 
carrier.EnqueueInterceptorMessage(msg); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/service/graph_brpc_client.cc index c5ad4b0099479..a9682d6a6efcc 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.cc +++ b/paddle/fluid/distributed/service/graph_brpc_client.cc @@ -514,6 +514,42 @@ std::future GraphBrpcClient::random_sample_nodes( return fut; } +std::future GraphBrpcClient::load_graph_split_config( + uint32_t table_id, std::string path) { + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + server_size, [&, server_size = this->server_size ](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + size_t fail_num = 0; + for (size_t request_idx = 0; request_idx < server_size; ++request_idx) { + if (closure->check_response(request_idx, + PS_GRAPH_LOAD_GRAPH_SPLIT_CONFIG) != 0) { + ++fail_num; + break; + } + } + ret = fail_num == 0 ? 0 : -1; + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + for (size_t i = 0; i < server_size; i++) { + int server_index = i; + closure->request(server_index) + ->set_cmd_id(PS_GRAPH_LOAD_GRAPH_SPLIT_CONFIG); + closure->request(server_index)->set_table_id(table_id); + closure->request(server_index)->set_client_id(_client_id); + closure->request(server_index)->add_params(path); + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(server_index)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(server_index), + closure->request(server_index), + closure->response(server_index), closure); + } + return fut; +} std::future GraphBrpcClient::use_neighbors_sample_cache( uint32_t table_id, size_t total_size_limit, size_t ttl) { DownpourBrpcClosure *closure = new DownpourBrpcClosure( diff --git a/paddle/fluid/distributed/service/graph_brpc_client.h b/paddle/fluid/distributed/service/graph_brpc_client.h index e3d2ff1d32d72..2e5d5b6ee93cb 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.h +++ b/paddle/fluid/distributed/service/graph_brpc_client.h @@ -93,6 +93,8 @@ class GraphBrpcClient : public BrpcPsClient { virtual std::future use_neighbors_sample_cache(uint32_t table_id, size_t size_limit, size_t ttl); + virtual std::future load_graph_split_config(uint32_t table_id, + std::string path); virtual std::future remove_graph_node( uint32_t table_id, std::vector& node_id_list); virtual int32_t initialize(); diff --git a/paddle/fluid/distributed/service/graph_brpc_server.cc b/paddle/fluid/distributed/service/graph_brpc_server.cc index 094ecbbd402c0..c1348e4804e2b 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/service/graph_brpc_server.cc @@ -204,6 +204,8 @@ int32_t GraphBrpcService::initialize() { &GraphBrpcService::sample_neighbors_across_multi_servers; _service_handler_map[PS_GRAPH_USE_NEIGHBORS_SAMPLE_CACHE] = &GraphBrpcService::use_neighbors_sample_cache; + _service_handler_map[PS_GRAPH_LOAD_GRAPH_SPLIT_CONFIG] = + &GraphBrpcService::load_graph_split_config; // shard初始化,server启动后才可从env获取到server_list的shard信息 initialize_shard_info(); @@ -658,5 +660,20 @@ int32_t GraphBrpcService::use_neighbors_sample_cache( ((GraphTable *)table)->make_neighbor_sample_cache(size_limit, ttl); return 0; } + +int32_t GraphBrpcService::load_graph_split_config( + Table *table, const PsRequestMessage &request, 
PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 1) { + set_response_code(response, -1, + "load_graph_split_configrequest requires at least 1 " + "argument1[file_path]"); + return 0; + } + ((GraphTable *)table)->load_graph_split_config(request.params(0)); + return 0; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_brpc_server.h b/paddle/fluid/distributed/service/graph_brpc_server.h index d1a6aa63604f3..ecd78d28ca812 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.h +++ b/paddle/fluid/distributed/service/graph_brpc_server.h @@ -126,6 +126,10 @@ class GraphBrpcService : public PsBaseService { PsResponseMessage &response, brpc::Controller *cntl); + int32_t load_graph_split_config(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl); + private: bool _is_initialize_shard_info; std::mutex _initialize_shard_mutex; diff --git a/paddle/fluid/distributed/service/heter_server.h b/paddle/fluid/distributed/service/heter_server.h index 66141622f8cdc..5f062755c9242 100644 --- a/paddle/fluid/distributed/service/heter_server.h +++ b/paddle/fluid/distributed/service/heter_server.h @@ -336,7 +336,7 @@ class HeterServer { bool IsExit() { return service_.IsExit(); } - HeterServer() { this->ready_ = 0; } + HeterServer() : service_(), ready_(0) {} void RegisterServiceHandler(std::string message_name, HeterServiceHandler func); @@ -391,7 +391,7 @@ class HeterServer { DISABLE_COPY_AND_ASSIGN(HeterServer); std::mutex mutex_ready_; - int ready_ = 0; + int ready_; }; } // end namespace distributed diff --git a/paddle/fluid/distributed/service/sendrecv.proto b/paddle/fluid/distributed/service/sendrecv.proto index 8ee9b3590721a..6dfaff1ffa1df 100644 --- a/paddle/fluid/distributed/service/sendrecv.proto +++ b/paddle/fluid/distributed/service/sendrecv.proto @@ -58,6 +58,7 @@ enum PsCmdID { PS_GRAPH_SET_NODE_FEAT = 37; PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER = 38; PS_GRAPH_USE_NEIGHBORS_SAMPLE_CACHE = 39; + PS_GRAPH_LOAD_GRAPH_SPLIT_CONFIG = 40; } message PsRequestMessage { diff --git a/paddle/fluid/distributed/service/service.cc b/paddle/fluid/distributed/service/service.cc index 29941e36ea051..698ceb1578f47 100644 --- a/paddle/fluid/distributed/service/service.cc +++ b/paddle/fluid/distributed/service/service.cc @@ -51,7 +51,6 @@ void PSCore::init_gflag(const std::string& gflags) { std::vector flags = paddle::string::split_string(gflags); if (flags.size() < 1) { flags.push_back("-max_body_size=314217728"); - flags.push_back("-bthread_concurrency=200"); flags.push_back("-socket_max_unwritten_bytes=2048000000"); flags.push_back("-max_connection_pool_size=1950"); } diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc index b690d71eab84d..042a4dee62bda 100644 --- a/paddle/fluid/distributed/table/common_graph_table.cc +++ b/paddle/fluid/distributed/table/common_graph_table.cc @@ -56,7 +56,7 @@ int32_t GraphTable::add_graph_node(std::vector &id_list, tasks.push_back(_shards_task_pool[i]->enqueue([&batch, i, this]() -> int { for (auto &p : batch[i]) { size_t index = p.first % this->shard_num - this->shard_start; - this->shards[index].add_graph_node(p.first)->build_edges(p.second); + this->shards[index]->add_graph_node(p.first)->build_edges(p.second); } return 0; })); @@ -79,7 +79,7 @@ int32_t GraphTable::remove_graph_node(std::vector &id_list) { 
tasks.push_back(_shards_task_pool[i]->enqueue([&batch, i, this]() -> int { for (auto &p : batch[i]) { size_t index = p % this->shard_num - this->shard_start; - this->shards[index].delete_node(p); + this->shards[index]->delete_node(p); } return 0; })); @@ -97,6 +97,7 @@ void GraphShard::clear() { } GraphShard::~GraphShard() { clear(); } + void GraphShard::delete_node(uint64_t id) { auto iter = node_location.find(id); if (iter == node_location.end()) return; @@ -117,6 +118,14 @@ GraphNode *GraphShard::add_graph_node(uint64_t id) { return (GraphNode *)bucket[node_location[id]]; } +GraphNode *GraphShard::add_graph_node(Node *node) { + auto id = node->get_id(); + if (node_location.find(id) == node_location.end()) { + node_location[id] = bucket.size(); + bucket.push_back(node); + } + return (GraphNode *)bucket[node_location[id]]; +} FeatureNode *GraphShard::add_feature_node(uint64_t id) { if (node_location.find(id) == node_location.end()) { node_location[id] = bucket.size(); @@ -134,6 +143,33 @@ Node *GraphShard::find_node(uint64_t id) { return iter == node_location.end() ? nullptr : bucket[iter->second]; } +GraphTable::~GraphTable() { + for (auto p : shards) { + delete p; + } + for (auto p : extra_shards) { + delete p; + } + shards.clear(); + extra_shards.clear(); +} + +int32_t GraphTable::load_graph_split_config(const std::string &path) { + VLOG(4) << "in server side load graph split config\n"; + std::ifstream file(path); + std::string line; + while (std::getline(file, line)) { + auto values = paddle::string::split_string(line, "\t"); + if (values.size() < 2) continue; + size_t index = (size_t)std::stoi(values[0]); + if (index != _shard_idx) continue; + auto dst_id = std::stoull(values[1]); + extra_nodes.insert(dst_id); + } + if (extra_nodes.size() != 0) use_duplicate_nodes = true; + return 0; +} + int32_t GraphTable::load(const std::string &path, const std::string ¶m) { bool load_edge = (param[0] == 'e'); bool load_node = (param[0] == 'n'); @@ -154,7 +190,7 @@ int32_t GraphTable::get_nodes_ids_by_ranges( res.clear(); std::vector>> tasks; for (size_t i = 0; i < shards.size() && index < (int)ranges.size(); i++) { - end = total_size + shards[i].get_size(); + end = total_size + shards[i]->get_size(); start = total_size; while (start < end && index < ranges.size()) { if (ranges[index].second <= start) @@ -169,11 +205,11 @@ int32_t GraphTable::get_nodes_ids_by_ranges( second -= total_size; tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( [this, first, second, i]() -> std::vector { - return shards[i].get_ids_by_range(first, second); + return shards[i]->get_ids_by_range(first, second); })); } } - total_size += shards[i].get_size(); + total_size += shards[i]->get_size(); } for (size_t i = 0; i < tasks.size(); i++) { auto vec = tasks[i].get(); @@ -217,7 +253,7 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { size_t index = shard_id - shard_start; - auto node = shards[index].add_feature_node(id); + auto node = shards[index]->add_feature_node(id); node->set_feature_size(feat_name.size()); @@ -245,7 +281,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { std::string sample_type = "random"; bool is_weighted = false; int valid_count = 0; - + int extra_alloc_index = 0; for (auto path : paths) { std::ifstream file(path); std::string line; @@ -268,8 +304,24 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { size_t src_shard_id = src_id % shard_num; if (src_shard_id >= shard_end || src_shard_id < 
shard_start) { - VLOG(4) << "will not load " << src_id << " from " << path - << ", please check id distribution"; + if (use_duplicate_nodes == false || + extra_nodes.find(src_id) == extra_nodes.end()) { + VLOG(4) << "will not load " << src_id << " from " << path + << ", please check id distribution"; + continue; + } + int index; + if (extra_nodes_to_thread_index.find(src_id) != + extra_nodes_to_thread_index.end()) { + index = extra_nodes_to_thread_index[src_id]; + } else { + index = extra_alloc_index++; + extra_alloc_index %= task_pool_size_; + extra_nodes_to_thread_index[src_id] = index; + } + extra_shards[index]->add_graph_node(src_id)->build_edges(is_weighted); + extra_shards[index]->add_neighbor(src_id, dst_id, weight); + valid_count++; continue; } if (count % 1000000 == 0) { @@ -278,36 +330,130 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { } size_t index = src_shard_id - shard_start; - shards[index].add_graph_node(src_id)->build_edges(is_weighted); - shards[index].add_neighbor(src_id, dst_id, weight); + shards[index]->add_graph_node(src_id)->build_edges(is_weighted); + shards[index]->add_neighbor(src_id, dst_id, weight); valid_count++; } } VLOG(0) << valid_count << "/" << count << " edges are loaded successfully in " << path; + std::vector used(task_pool_size_, 0); // Build Sampler j for (auto &shard : shards) { - auto bucket = shard.get_bucket(); + auto bucket = shard->get_bucket(); for (size_t i = 0; i < bucket.size(); i++) { bucket[i]->build_sampler(sample_type); + used[get_thread_pool_index(bucket[i]->get_id())]++; } } + /*----------------------- + relocate the duplicate nodes to make them distributed evenly among threads. +*/ + for (auto &shard : extra_shards) { + auto bucket = shard->get_bucket(); + for (size_t i = 0; i < bucket.size(); i++) { + bucket[i]->build_sampler(sample_type); + } + } + int size = extra_nodes_to_thread_index.size(); + if (size == 0) return 0; + std::vector index; + for (int i = 0; i < used.size(); i++) index.push_back(i); + sort(index.begin(), index.end(), + [&](int &a, int &b) { return used[a] < used[b]; }); + + std::vector alloc(index.size(), 0), has_alloc(index.size(), 0); + int t = 1, aim = 0, mod = 0; + for (; t < used.size(); t++) { + if ((used[index[t]] - used[index[t - 1]]) * t >= size) { + break; + } else { + size -= (used[index[t]] - used[index[t - 1]]) * t; + } + } + aim = used[index[t - 1]] + size / t; + mod = size % t; + for (int x = t - 1; x >= 0; x--) { + alloc[index[x]] = aim; + if (t - x <= mod) alloc[index[x]]++; + alloc[index[x]] -= used[index[x]]; + } + std::vector vec[index.size()]; + for (auto p : extra_nodes_to_thread_index) { + has_alloc[p.second]++; + vec[p.second].push_back(p.first); + } + sort(index.begin(), index.end(), [&](int &a, int &b) { + return has_alloc[a] - alloc[a] < has_alloc[b] - alloc[b]; + }); + int left = 0, right = index.size() - 1; + while (left < right) { + if (has_alloc[index[right]] - alloc[index[right]] == 0) break; + int x = std::min(alloc[index[left]] - has_alloc[index[left]], + has_alloc[index[right]] - alloc[index[right]]); + has_alloc[index[left]] += x; + has_alloc[index[right]] -= x; + uint64_t id; + while (x--) { + id = vec[index[right]].back(); + vec[index[right]].pop_back(); + extra_nodes_to_thread_index[id] = index[left]; + vec[index[left]].push_back(id); + } + if (has_alloc[index[right]] - alloc[index[right]] == 0) right--; + if (alloc[index[left]] - has_alloc[index[left]] == 0) left++; + } + std::vector extra_shards_copy; + for (int i = 0; i < task_pool_size_; 
++i) { + extra_shards_copy.push_back(new GraphShard()); + } + for (auto &shard : extra_shards) { + auto &bucket = shard->get_bucket(); + auto &node_location = shard->get_node_location(); + while (bucket.size()) { + Node *temp = bucket.back(); + bucket.pop_back(); + node_location.erase(temp->get_id()); + extra_shards_copy[extra_nodes_to_thread_index[temp->get_id()]] + ->add_graph_node(temp); + } + } + for (int i = 0; i < task_pool_size_; ++i) { + delete extra_shards[i]; + extra_shards[i] = extra_shards_copy[i]; + } return 0; } Node *GraphTable::find_node(uint64_t id) { size_t shard_id = id % shard_num; if (shard_id >= shard_end || shard_id < shard_start) { - return nullptr; + if (use_duplicate_nodes == false || extra_nodes_to_thread_index.size() == 0) + return nullptr; + auto iter = extra_nodes_to_thread_index.find(id); + if (iter == extra_nodes_to_thread_index.end()) + return nullptr; + else { + return extra_shards[iter->second]->find_node(id); + } } size_t index = shard_id - shard_start; - Node *node = shards[index].find_node(id); + Node *node = shards[index]->find_node(id); return node; } uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { - return node_id % shard_num % shard_num_per_server % task_pool_size_; + if (use_duplicate_nodes == false || extra_nodes_to_thread_index.size() == 0) + return node_id % shard_num % shard_num_per_server % task_pool_size_; + size_t src_shard_id = node_id % shard_num; + if (src_shard_id >= shard_end || src_shard_id < shard_start) { + auto iter = extra_nodes_to_thread_index.find(node_id); + if (iter != extra_nodes_to_thread_index.end()) { + return iter->second; + } + } + return src_shard_id % shard_num_per_server % task_pool_size_; } uint32_t GraphTable::get_thread_pool_index_by_shard_index( @@ -319,11 +465,16 @@ int32_t GraphTable::clear_nodes() { std::vector> tasks; for (size_t i = 0; i < shards.size(); i++) { tasks.push_back( - _shards_task_pool[get_thread_pool_index_by_shard_index(i)]->enqueue( - [this, i]() -> int { - this->shards[i].clear(); - return 0; - })); + _shards_task_pool[i % task_pool_size_]->enqueue([this, i]() -> int { + this->shards[i]->clear(); + return 0; + })); + } + for (size_t i = 0; i < extra_shards.size(); i++) { + tasks.push_back(_shards_task_pool[i]->enqueue([this, i]() -> int { + this->extra_shards[i]->clear(); + return 0; + })); } for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); return 0; @@ -334,7 +485,7 @@ int32_t GraphTable::random_sample_nodes(int sample_size, int &actual_size) { int total_size = 0; for (int i = 0; i < shards.size(); i++) { - total_size += shards[i].get_size(); + total_size += shards[i]->get_size(); } if (sample_size > total_size) sample_size = total_size; int range_num = random_sample_nodes_ranges; @@ -401,8 +552,8 @@ int32_t GraphTable::random_sample_neighbors( size_t node_num = buffers.size(); std::function char_del = [](char *c) { delete[] c; }; std::vector> tasks; - std::vector> seq_id(shard_end - shard_start); - std::vector> id_list(shard_end - shard_start); + std::vector> seq_id(task_pool_size_); + std::vector> id_list(task_pool_size_); size_t index; for (size_t idx = 0; idx < node_num; ++idx) { index = get_thread_pool_index(node_ids[idx]); @@ -524,7 +675,7 @@ int32_t GraphTable::set_node_feat( tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( [&, idx, node_id]() -> int { size_t index = node_id % this->shard_num - this->shard_start; - auto node = shards[index].add_feature_node(node_id); + auto node = shards[index]->add_feature_node(node_id); 
node->set_feature_size(this->feat_name.size()); for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { const std::string &feature_name = feature_names[feat_idx]; @@ -581,7 +732,7 @@ int32_t GraphTable::pull_graph_list(int start, int total_size, int size = 0, cur_size; std::vector>> tasks; for (size_t i = 0; i < shards.size() && total_size > 0; i++) { - cur_size = shards[i].get_size(); + cur_size = shards[i]->get_size(); if (size + cur_size <= start) { size += cur_size; continue; @@ -590,7 +741,7 @@ int32_t GraphTable::pull_graph_list(int start, int total_size, int end = start + (count - 1) * step + 1; tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( [this, i, start, end, step, size]() -> std::vector { - return this->shards[i].get_batch(start - size, end - size, step); + return this->shards[i]->get_batch(start - size, end - size, step); })); start += count * step; total_size -= count; @@ -665,7 +816,14 @@ int32_t GraphTable::initialize() { shard_end = shard_start + shard_num_per_server; VLOG(0) << "in init graph table shard idx = " << _shard_idx << " shard_start " << shard_start << " shard_end " << shard_end; - shards = std::vector(shard_num_per_server, GraphShard(shard_num)); + for (int i = 0; i < shard_num_per_server; i++) { + shards.push_back(new GraphShard()); + } + use_duplicate_nodes = false; + for (int i = 0; i < task_pool_size_; i++) { + extra_shards.push_back(new GraphShard()); + } + return 0; } } // namespace distributed diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h index 9ca59db3bb268..b76ab0ae95060 100644 --- a/paddle/fluid/distributed/table/common_graph_table.h +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -47,7 +47,6 @@ class GraphShard { public: size_t get_size(); GraphShard() {} - GraphShard(int shard_num) { this->shard_num = shard_num; } ~GraphShard(); std::vector &get_bucket() { return bucket; } std::vector get_batch(int start, int end, int step); @@ -60,18 +59,18 @@ class GraphShard { } GraphNode *add_graph_node(uint64_t id); + GraphNode *add_graph_node(Node *node); FeatureNode *add_feature_node(uint64_t id); Node *find_node(uint64_t id); void delete_node(uint64_t id); void clear(); void add_neighbor(uint64_t id, uint64_t dst_id, float weight); - std::unordered_map get_node_location() { + std::unordered_map &get_node_location() { return node_location; } private: std::unordered_map node_location; - int shard_num; std::vector bucket; }; @@ -355,7 +354,7 @@ class ScaledLRU { class GraphTable : public SparseTable { public: GraphTable() { use_cache = false; } - virtual ~GraphTable() {} + virtual ~GraphTable(); virtual int32_t pull_graph_list(int start, int size, std::unique_ptr &buffer, int &actual_size, bool need_feature, @@ -374,6 +373,7 @@ class GraphTable : public SparseTable { virtual int32_t initialize(); int32_t load(const std::string &path, const std::string ¶m); + int32_t load_graph_split_config(const std::string &path); int32_t load_edges(const std::string &path, bool reverse); @@ -434,7 +434,7 @@ class GraphTable : public SparseTable { } protected: - std::vector shards; + std::vector shards, extra_shards; size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num; const int task_pool_size_ = 24; const int random_sample_nodes_ranges = 3; @@ -449,7 +449,9 @@ class GraphTable : public SparseTable { std::vector> _shards_task_pool; std::vector> _shards_task_rng_pool; std::shared_ptr> scaled_lru; - bool use_cache; + std::unordered_set 
extra_nodes; + std::unordered_map extra_nodes_to_thread_index; + bool use_cache, use_duplicate_nodes; mutable std::mutex mutex_; }; } // namespace distributed diff --git a/paddle/fluid/distributed/table/graph/graph_node.cc b/paddle/fluid/distributed/table/graph/graph_node.cc index e2311cc307b60..52c708be88488 100644 --- a/paddle/fluid/distributed/table/graph/graph_node.cc +++ b/paddle/fluid/distributed/table/graph/graph_node.cc @@ -65,6 +65,9 @@ void GraphNode::build_edges(bool is_weighted) { } } void GraphNode::build_sampler(std::string sample_type) { + if (sampler != nullptr) { + return; + } if (sample_type == "random") { sampler = new RandomSampler(); } else if (sample_type == "weighted") { diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index 597a08973b957..62de82832e133 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -21,6 +21,9 @@ cc_test(brpc_utils_test SRCS brpc_utils_test.cc DEPS brpc_utils scope math_funct set_source_files_properties(graph_node_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(graph_node_test SRCS graph_node_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) +set_source_files_properties(graph_node_split_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(graph_node_split_test SRCS graph_node_split_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) + set_source_files_properties(feature_value_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(feature_value_test SRCS feature_value_test.cc DEPS ${COMMON_DEPS} boost table) diff --git a/paddle/fluid/distributed/test/graph_node_split_test.cc b/paddle/fluid/distributed/test/graph_node_split_test.cc new file mode 100644 index 0000000000000..3fcddde787f69 --- /dev/null +++ b/paddle/fluid/distributed/test/graph_node_split_test.cc @@ -0,0 +1,275 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include // NOLINT +#include +#include +#include +#include // NOLINT +#include +#include +#include "google/protobuf/text_format.h" + +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/service/env.h" +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/graph_py_service.h" +#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/service/service.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace math = paddle::operators::math; +namespace memory = paddle::memory; +namespace distributed = paddle::distributed; + +std::vector edges = { + std::string("37\t45\t0.34"), std::string("37\t145\t0.31"), + std::string("37\t112\t0.21"), std::string("96\t48\t1.4"), + std::string("96\t247\t0.31"), std::string("96\t111\t1.21"), + std::string("59\t45\t0.34"), std::string("59\t145\t0.31"), + std::string("59\t122\t0.21"), std::string("97\t48\t0.34"), + std::string("97\t247\t0.31"), std::string("97\t111\t0.21")}; +char edge_file_name[] = "edges.txt"; + +std::vector nodes = { + std::string("user\t37\ta 0.34\tb 13 14\tc hello\td abc"), + std::string("user\t96\ta 0.31\tb 15 10\tc 96hello\td abcd"), + std::string("user\t59\ta 0.11\tb 11 14"), + std::string("user\t97\ta 0.11\tb 12 11"), + std::string("item\t45\ta 0.21"), + std::string("item\t145\ta 0.21"), + std::string("item\t112\ta 0.21"), + std::string("item\t48\ta 0.21"), + std::string("item\t247\ta 0.21"), + std::string("item\t111\ta 0.21"), + std::string("item\t46\ta 0.21"), + std::string("item\t146\ta 0.21"), + std::string("item\t122\ta 0.21"), + std::string("item\t49\ta 0.21"), + std::string("item\t248\ta 0.21"), + std::string("item\t113\ta 0.21")}; +char node_file_name[] = "nodes.txt"; + +std::vector graph_split = {std::string("0\t97")}; +char graph_split_file_name[] = "graph_split.txt"; + +void prepare_file(char file_name[], std::vector data) { + std::ofstream ofile; + ofile.open(file_name); + for (auto x : data) { + ofile << x << std::endl; + } + + ofile.close(); +} +void GetDownpourSparseTableProto( + ::paddle::distributed::TableParameter* sparse_table_proto) { + sparse_table_proto->set_table_id(0); + sparse_table_proto->set_table_class("GraphTable"); + sparse_table_proto->set_shard_num(127); + sparse_table_proto->set_type(::paddle::distributed::PS_SPARSE_TABLE); + ::paddle::distributed::TableAccessorParameter* accessor_proto = + sparse_table_proto->mutable_accessor(); + accessor_proto->set_accessor_class("CommMergeAccessor"); +} + +::paddle::distributed::PSParameter GetServerProto() { + // Generate server proto desc + ::paddle::distributed::PSParameter server_fleet_desc; + ::paddle::distributed::ServerParameter* server_proto = + 
server_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + ::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + ::paddle::distributed::TableParameter* sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(sparse_table_proto); + return server_fleet_desc; +} + +::paddle::distributed::PSParameter GetWorkerProto() { + ::paddle::distributed::PSParameter worker_fleet_desc; + ::paddle::distributed::WorkerParameter* worker_proto = + worker_fleet_desc.mutable_worker_param(); + + ::paddle::distributed::DownpourWorkerParameter* downpour_worker_proto = + worker_proto->mutable_downpour_worker_param(); + + ::paddle::distributed::TableParameter* worker_sparse_table_proto = + downpour_worker_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(worker_sparse_table_proto); + + ::paddle::distributed::ServerParameter* server_proto = + worker_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + ::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + ::paddle::distributed::TableParameter* server_sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(server_sparse_table_proto); + + return worker_fleet_desc; +} + +/*-------------------------------------------------------------------------*/ + +std::string ip_ = "127.0.0.1", ip2 = "127.0.0.1"; +uint32_t port_ = 5209, port2 = 5210; + +std::vector host_sign_list_; + +std::shared_ptr pserver_ptr_, + pserver_ptr2; + +std::shared_ptr worker_ptr_; + +void RunServer() { + LOG(INFO) << "init first server"; + ::paddle::distributed::PSParameter server_proto = GetServerProto(); + + auto _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&host_sign_list_, 2); // test + pserver_ptr_ = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto)); + std::vector empty_vec; + framework::ProgramDesc empty_prog; + empty_vec.push_back(empty_prog); + pserver_ptr_->configure(server_proto, _ps_env, 0, empty_vec); + LOG(INFO) << "first server, run start(ip,port)"; + pserver_ptr_->start(ip_, port_); + pserver_ptr_->build_peer2peer_connection(0); + LOG(INFO) << "init first server Done"; +} + +void RunServer2() { + LOG(INFO) << "init second server"; + ::paddle::distributed::PSParameter server_proto2 = GetServerProto(); + + auto _ps_env2 = paddle::distributed::PaddlePSEnvironment(); + _ps_env2.set_ps_servers(&host_sign_list_, 2); // test + pserver_ptr2 = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto2)); + std::vector 
empty_vec2; + framework::ProgramDesc empty_prog2; + empty_vec2.push_back(empty_prog2); + pserver_ptr2->configure(server_proto2, _ps_env2, 1, empty_vec2); + pserver_ptr2->start(ip2, port2); + pserver_ptr2->build_peer2peer_connection(1); +} + +void RunClient( + std::map>& dense_regions, + int index, paddle::distributed::PsBaseService* service) { + ::paddle::distributed::PSParameter worker_proto = GetWorkerProto(); + paddle::distributed::PaddlePSEnvironment _ps_env; + auto servers_ = host_sign_list_.size(); + _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&host_sign_list_, servers_); + worker_ptr_ = std::shared_ptr( + (paddle::distributed::GraphBrpcClient*) + paddle::distributed::PSClientFactory::create(worker_proto)); + worker_ptr_->configure(worker_proto, dense_regions, _ps_env, 0); + worker_ptr_->set_shard_num(127); + worker_ptr_->set_local_channel(index); + worker_ptr_->set_local_graph_service( + (paddle::distributed::GraphBrpcService*)service); +} + +void RunGraphSplit() { + setenv("http_proxy", "", 1); + setenv("https_proxy", "", 1); + prepare_file(edge_file_name, edges); + prepare_file(node_file_name, nodes); + prepare_file(graph_split_file_name, graph_split); + auto ph_host = paddle::distributed::PSHost(ip_, port_, 0); + host_sign_list_.push_back(ph_host.serialize_to_string()); + + // test-start + auto ph_host2 = paddle::distributed::PSHost(ip2, port2, 1); + host_sign_list_.push_back(ph_host2.serialize_to_string()); + // test-end + // Srart Server + std::thread* server_thread = new std::thread(RunServer); + + std::thread* server_thread2 = new std::thread(RunServer2); + + sleep(2); + std::map> dense_regions; + dense_regions.insert( + std::pair>(0, {})); + auto regions = dense_regions[0]; + + RunClient(dense_regions, 0, pserver_ptr_->get_service()); + + /*-----------------------Test Server Init----------------------------------*/ + + auto pull_status = worker_ptr_->load_graph_split_config( + 0, std::string(graph_split_file_name)); + pull_status.wait(); + pull_status = + worker_ptr_->load(0, std::string(edge_file_name), std::string("e>")); + srand(time(0)); + pull_status.wait(); + std::vector> _vs; + std::vector> vs; + pull_status = worker_ptr_->batch_sample_neighbors( + 0, std::vector(1, 10240001024), 4, _vs, vs, true); + pull_status.wait(); + ASSERT_EQ(0, _vs[0].size()); + _vs.clear(); + vs.clear(); + pull_status = worker_ptr_->batch_sample_neighbors( + 0, std::vector(1, 97), 4, _vs, vs, true); + pull_status.wait(); + ASSERT_EQ(3, _vs[0].size()); + std::remove(edge_file_name); + std::remove(node_file_name); + std::remove(graph_split_file_name); + LOG(INFO) << "Run stop_server"; + worker_ptr_->stop_server(); + LOG(INFO) << "Run finalize_worker"; + worker_ptr_->finalize_worker(); +} + +TEST(RunGraphSplit, Run) { RunGraphSplit(); } \ No newline at end of file diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index e8cb55b7afeb9..d5abf639c83db 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -2,7 +2,7 @@ set(eager_deps pten pten_api hook_utils tensor_utils utils global_utils backward set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) set(generated_deps dygraph_function dygraph_node) -if(NOT DEFINED ON_INFER) +if(NOT ON_INFER) message("Performing Eager Dygraph Auto Code Generation") add_subdirectory(auto_code_generator) endif() diff --git a/paddle/fluid/eager/accumulation/gradient_accumulation.cc b/paddle/fluid/eager/accumulation/gradient_accumulation.cc 
index 7345c3612381b..9d475d96e56ce 100644 --- a/paddle/fluid/eager/accumulation/gradient_accumulation.cc +++ b/paddle/fluid/eager/accumulation/gradient_accumulation.cc @@ -193,13 +193,14 @@ void TensorAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) { // TODO(jiabin): Support NPU here PADDLE_TENSOR_ADD(float); - // NOTE(phlrain): xpu only support float +// NOTE(phlrain): xpu only support float +#ifndef PADDLE_WITH_XPU PADDLE_TENSOR_ADD(double); // NOTE(chenweihang): only support complex grad tensor accumulated, // support selected rows if needed in the future PADDLE_TENSOR_ADD(paddle::platform::complex); PADDLE_TENSOR_ADD(paddle::platform::complex); - +#endif #undef PADDLE_TENSOR_ADD if (data_type == paddle::framework::proto::VarType::FP16) { @@ -268,13 +269,14 @@ void VariableAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) { // TODO(jiabin): Support NPU here PADDLE_TENSOR_ADD(float); - // NOTE(phlrain): xpu only support float +// NOTE(phlrain): xpu only support float +#ifndef PADDLE_WITH_XPU PADDLE_TENSOR_ADD(double); // NOTE(chenweihang): only support complex grad tensor accumulated, // support selected rows if needed in the future PADDLE_TENSOR_ADD(paddle::platform::complex); PADDLE_TENSOR_ADD(paddle::platform::complex); - +#endif #undef PADDLE_TENSOR_ADD if (data_type == paddle::framework::proto::VarType::FP16) { diff --git a/paddle/fluid/eager/api/generated/CMakeLists.txt b/paddle/fluid/eager/api/generated/CMakeLists.txt index 407a8d69e52da..ebbef286f7923 100644 --- a/paddle/fluid/eager/api/generated/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/CMakeLists.txt @@ -1,5 +1,5 @@ add_subdirectory(eager_generated) -if(NOT DEFINED ON_INFER) +if(NOT ON_INFER) add_subdirectory(fluid_generated) endif() diff --git a/paddle/fluid/eager/api/utils/hook_utils.cc b/paddle/fluid/eager/api/utils/hook_utils.cc index 7f85d014fa842..85ff6687e0dbe 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.cc +++ b/paddle/fluid/eager/api/utils/hook_utils.cc @@ -20,6 +20,7 @@ #include "paddle/pten/core/dense_tensor.h" namespace egr { +namespace egr_utils_api { void RegisterGradientHookForTensor( const egr::EagerTensor& tensor, @@ -90,4 +91,5 @@ void RetainGradForTensor(const egr::EagerTensor& tensor) { } } +} // namespace egr_utils_api } // namespace egr diff --git a/paddle/fluid/eager/api/utils/hook_utils.h b/paddle/fluid/eager/api/utils/hook_utils.h index bf320f0b15d4a..7e4faa5a2c701 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.h +++ b/paddle/fluid/eager/api/utils/hook_utils.h @@ -18,6 +18,7 @@ #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/pten/api/all.h" namespace egr { +namespace egr_utils_api { void RegisterGradientHookForTensor( const egr::EagerTensor& tensor, @@ -27,4 +28,5 @@ void RegisterReduceHookForTensor(const egr::EagerTensor& tensor, const std::function& hook); void RetainGradForTensor(const egr::EagerTensor& tensor); +} // namespace egr_utils_api } // namespace egr diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc index 9dbb308a2c906..ad6c34b7cf86c 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.cc +++ b/paddle/fluid/eager/api/utils/tensor_utils.cc @@ -26,6 +26,7 @@ #include "paddle/fluid/framework/variable.h" namespace egr { +namespace egr_utils_api { bool IsLeafTensor(const egr::EagerTensor& target) { std::shared_ptr grad_node = EagerUtils::grad_node(target); @@ -58,4 +59,5 @@ egr::EagerTensor CreateTensorWithValue(const pten::DDim& ddim, return out; } +} // namespace egr_utils_api } // 
namespace egr diff --git a/paddle/fluid/eager/api/utils/tensor_utils.h b/paddle/fluid/eager/api/utils/tensor_utils.h index a0d8caf3cb307..b3c4b59682320 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.h +++ b/paddle/fluid/eager/api/utils/tensor_utils.h @@ -18,6 +18,7 @@ #include "paddle/pten/api/all.h" namespace egr { +namespace egr_utils_api { // If and only if the tensor holds an AccumulationNode // Then it's treated as a leaf tensor @@ -29,4 +30,5 @@ egr::EagerTensor CreateTensorWithValue(const pten::DDim& ddim, const pten::DataLayout& layout, float value, bool is_leaf = true); +} // namespace egr_utils_api } // namespace egr diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt index 5d31c9139baa8..187c3db445222 100644 --- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt @@ -17,13 +17,42 @@ execute_process( ) if(WIN32) + set(EAGER_CODEGEN_DEPS eager_generator) + if("${CMAKE_GENERATOR}" STREQUAL "Ninja") + set(eager_generator_path "${CMAKE_CURRENT_BINARY_DIR}") + else() + set(eager_generator_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}") + endif() + + if(${CBLAS_PROVIDER} STREQUAL MKLML) + message("Copied libiomp5md.dll for Eager AutoCodeGen") + ADD_CUSTOM_COMMAND(OUTPUT ${eager_generator_path}/libiomp5md.dll + COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${eager_generator_path} + DEPENDS mklml) + list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/libiomp5md.dll) + else(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS) + message("Copied openblas.dll for Eager AutoCodeGen") + ADD_CUSTOM_COMMAND(OUTPUT ${eager_generator_path}/openblas.dll + COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_SHARED_LIB} ${eager_generator_path} + DEPENDS extern_openblas) + list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/openblas.dll) + endif() + + if(WITH_MKLDNN) + message("Copied mkldnn.dll for Eager AutoCodeGen") + ADD_CUSTOM_COMMAND(OUTPUT ${eager_generator_path}/mkldnn.dll + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${eager_generator_path} + DEPENDS mkldnn) + list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/mkldnn.dll) + endif() + add_custom_target(eager_codegen - COMMAND "${CMAKE_CURRENT_BINARY_DIR}/eager_generator.exe" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" - DEPENDS eager_generator + COMMAND "${eager_generator_path}/eager_generator.exe" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/op_list.txt" + DEPENDS ${EAGER_CODEGEN_DEPS} VERBATIM) else() add_custom_target(eager_codegen - COMMAND "${CMAKE_CURRENT_BINARY_DIR}/eager_generator" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" + COMMAND "${CMAKE_CURRENT_BINARY_DIR}/eager_generator" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/op_list.txt" DEPENDS eager_generator VERBATIM) endif() diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index c0714775da852..fe29792b6e75c 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -22,33 +22,28 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/variable.h" +#include 
"paddle/fluid/pybind/op_function_generator.h" #include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/string/string_helper.h" +namespace paddle { +namespace framework { + +static std::unordered_map + operators_with_attrs = {}; + static std::unordered_set operators_to_skip = { - "fused_elemwise_add_activation", // No Default Attr - "fused_elemwise_activation", // No Default Attr - "reverse", // Attr Error - "flip", // Attr Error - "cast", // Attr Error - "sum", - "minus", // Multiple ops_ - "pull_sparse", - "pull_box_extended_sparse", - "pull_sparse_v2", - "pull_box_sparse", - "fused_attention", - "diag_v2", + "minus", }; -static std::unordered_set operators_to_codegen = { - "sigmoid", "matmul_v2", "reduce_sum", "elementwise_add", - "share_buffer", "var_conv_2d", "split"}; - +static std::unordered_set operators_to_codegen = {}; static std::unordered_set skipped_operators = {}; -namespace paddle { -namespace framework { +static std::string LegalizeVariableName(const std::string& var_name) { + std::string ret = var_name; + std::replace(ret.begin(), ret.end(), '-', '_'); // replace all '-' to '_' + return ret; +} static std::string AttrTypeToString(const proto::AttrType& type) { std::string ret; @@ -358,15 +353,81 @@ static bool CheckOpProto(proto::OpProto* op_proto) { return true; } -/* -------------------------------- */ -/* --------- Collect Info --------- */ -/* -------------------------------- */ -static bool CollectInformationFromOpInfo( - const paddle::framework::OpInfo& op_info, - std::vector* grad_node_default_attr_maps, - std::vector* grad_op_types, +/* --------------------------------------- */ +/* --------- Preprocess Ins/Outs --------- */ +/* --------------------------------------- */ +static void PurifyForwardOpProto( + const proto::OpProto& op_proto, std::unordered_map* fwd_inputs_name_pos_map, std::unordered_map* fwd_outputs_name_pos_map, + std::vector* in_vars, + std::vector* out_vars) { + // Op Name + const std::string op_name = op_proto.type(); + + // Handle dispensable inputs + for (const proto::OpProto::Var& input : op_proto.inputs()) { + std::string input_name = input.name(); + + // Delete dispensable tensor unless specified in op_ins_map + if (input.dispensable()) { + if (!op_ins_map.count(op_name) || + !op_ins_map[op_name].count(input_name)) { + VLOG(6) << "Removing Dispensable Input: " << input_name; + + // in_vars + auto iter = in_vars->begin(); + for (iter = in_vars->begin(); iter != in_vars->end(); iter++) { + if (iter->name() == input_name) { + break; + } + } + in_vars->erase(iter); + } + } + } + + for (const proto::OpProto::Var& output : op_proto.outputs()) { + std::string output_name = output.name(); + + // Delete dispensable tensor unless specified in op_outs_map + if (output.dispensable()) { + if (!op_outs_map.count(op_name) || + !op_outs_map[op_name].count(output_name)) { + VLOG(6) << "Removing Dispensable Output: " << output_name; + + // out_vars + auto iter = out_vars->begin(); + for (iter = out_vars->begin(); iter != out_vars->end(); iter++) { + if (iter->name() == output_name) { + break; + } + } + out_vars->erase(iter); + } + } + } + + /* ------ Maping forward slot name to fwd position ------ */ + size_t in_pos = 0; + for (const auto& var : *in_vars) { + VLOG(6) << "Mapping input tensor: " << var.name() + << " To position: " << in_pos; + (*fwd_inputs_name_pos_map)[var.name()] = in_pos; + in_pos++; + } + + size_t out_pos = 0; + for (const auto& var : *out_vars) { + VLOG(6) << "Mapping output tensor: " << var.name() + << " To position: " << out_pos; + 
(*fwd_outputs_name_pos_map)[var.name()] = out_pos; + out_pos++; + } +} + +static void PurifyGradOpProto( + const proto::OpProto& op_proto, std::map* grad_outs_slotname_map, std::map* grad_ins_fwd_slotname_map, std::map* grad_ins_grad_slotname_map, @@ -376,6 +437,114 @@ static bool CollectInformationFromOpInfo( std::map>>* grad_outs) { + // Op Name + const std::string op_name = op_proto.type(); + + // Handle dispensable inputs + for (const proto::OpProto::Var& input : op_proto.inputs()) { + std::string input_name = input.name(); + + // Delete dispensable tensor unless specified in op_ins_map + if (input.dispensable()) { + if (!op_ins_map.count(op_name) || + !op_ins_map[op_name].count(input_name)) { + VLOG(6) << "Removing Dispensable Input: " << input_name; + + // grad_outs_slotname_map + auto grad_outs_slotname_map_purified = *grad_outs_slotname_map; + for (const auto& iter : *grad_outs_slotname_map) { + const std::string& grad_output_name = iter.first; + const std::string& matched_input_name = iter.second; + if (matched_input_name == input_name) { + grad_outs_slotname_map_purified.erase(grad_output_name); + + PADDLE_ENFORCE( + grad_outs->count(grad_output_name) > 0, + paddle::platform::errors::Fatal( + "Unable to find gradient output name in grad_outs.")); + // grad_outs + grad_outs->erase(grad_output_name); + } + } + *grad_outs_slotname_map = grad_outs_slotname_map_purified; + + // grad_ins_fwd_slotname_map: output as tensorwrapper + if (grad_ins_fwd_slotname_map->count(input_name)) + grad_ins_fwd_slotname_map->erase(input_name); + + // grad_ins: output as tensorwrapper + if (grad_ins->count(input_name)) grad_ins->erase(input_name); + } + } + } + + for (const proto::OpProto::Var& output : op_proto.outputs()) { + std::string output_name = output.name(); + + // Delete dispensable tensor unless specified in op_outs_map + if (output.dispensable()) { + if (!op_outs_map.count(op_name) || + !op_outs_map[op_name].count(output_name)) { + VLOG(6) << "Removing Dispensable Output: " << output_name; + + // grad_ins_grad_slotname_map + auto grad_ins_grad_slotname_map_purified = *grad_ins_grad_slotname_map; + for (const auto& iter : *grad_ins_grad_slotname_map) { + const std::string& grad_input_name = iter.first; + const std::string& matched_output_name = iter.second; + if (matched_output_name == output_name) { + grad_ins_grad_slotname_map_purified.erase(grad_input_name); + + PADDLE_ENFORCE( + grad_ins->count(grad_input_name) > 0, + paddle::platform::errors::Fatal( + "Unable to find gradient input name in grad_ins.")); + // grad_ins + grad_ins->erase(grad_input_name); + } + } + *grad_ins_grad_slotname_map = grad_ins_grad_slotname_map_purified; + + // grad_ins_fwd_slotname_map: output as tensorwrapper + if (grad_ins_fwd_slotname_map->count(output_name)) + grad_ins_fwd_slotname_map->erase(output_name); + + // grad_ins: output as tensorwrapper + if (grad_ins->count(output_name)) grad_ins->erase(output_name); + } + } + } +} + +/* -------------------------------- */ +/* --------- Collect Info --------- */ +/* -------------------------------- */ +static void CollectForwardInformationFromOpInfo( + const paddle::framework::OpInfo& op_info, + std::vector* in_vars, + std::vector* out_vars) { + const proto::OpProto& op_proto = *op_info.proto_; + for (const proto::OpProto::Var& input : op_proto.inputs()) { + in_vars->push_back(input); + } + for (const proto::OpProto::Var& output : op_proto.outputs()) { + out_vars->push_back(output); + } +} + +static bool CollectGradInformationFromOpInfo( + const 
paddle::framework::OpInfo& op_info, bool* generate_forward_only, + std::vector* grad_op_types, // grad + std::map* grad_outs_slotname_map, // grad + std::map* grad_ins_fwd_slotname_map, // grad + std::map* grad_ins_grad_slotname_map, // grad + std::map>>* + grad_ins, // grad + std::map>>* + grad_outs // grad + ) { const proto::OpProto& op_proto = *op_info.proto_; const std::string& op_type = op_proto.type(); std::vector dims = {1, 1, 1, 1}; @@ -429,13 +598,23 @@ static bool CollectInformationFromOpInfo( paddle::framework::AttributeMap default_attrs; auto* attr_checker = op_info.Checker(); if (attr_checker) { + VLOG(6) << "Checking AttributeMap Settings"; attr_checker->Check(&attrs, true, /*only_check_exist_value=*/true); default_attrs = attr_checker->GetDefaultAttrMap(); + VLOG(6) << "AttributeMap Checking Passed"; } else { VLOG(6) << "Detected Null Attribute Checker, use empty default_attrs"; } + if (operators_with_attrs.count(op_type)) { + VLOG(6) << "Found operator " << op_type << " using special AttributeMap"; + attrs = operators_with_attrs[op_type]; + } + VLOG(6) << "Prepared Default Attributes Map, size = " << default_attrs.size(); + for (const auto& iter : default_attrs) { + VLOG(6) << iter.first; + } /* ---------------------------- */ /* --------- Backward --------- */ @@ -465,8 +644,8 @@ static bool CollectInformationFromOpInfo( /* ------ Run GradOpMaker ------ */ if (!op_info.dygraph_grad_op_maker_) { - VLOG(6) << op_type << " has no GradOpMaker, skip it"; - skipped_operators.insert(op_type); + VLOG(6) << op_type << " has no GradOpMaker"; + *generate_forward_only = true; return false; } @@ -476,17 +655,19 @@ static bool CollectInformationFromOpInfo( if (!grad_node) { VLOG(6) << "Got nullptr GradOpNode for " << op_type - << " likely registered EmptyGradOpMaker, skip it"; - skipped_operators.insert(op_type); + << " likely registered EmptyGradOpMaker"; + *generate_forward_only = true; return false; } + /* if (grad_node->size() > 1) { // Backward attributes can be super complicated VLOG(6) << "Skip GradOpNode with multiple OpBases for now: " << op_type; skipped_operators.insert(op_type); return false; } + */ VLOG(6) << "Prepared GradOpNode"; @@ -494,7 +675,6 @@ static bool CollectInformationFromOpInfo( for (auto iter = grad_node->begin(); iter < grad_node->end(); iter++) { // Each OpBase paddle::imperative::OpBase& op_base = *iter; - grad_node_default_attr_maps->push_back(op_base.DefaultAttrsMap()); grad_op_types->push_back(op_base.Type()); } @@ -538,22 +718,6 @@ static bool CollectInformationFromOpInfo( grad_outs_slotname_map); VLOG(6) << "Finished Slotname Matching for Grad_Outs"; - /* ------ Maping forward slot name to fwd position ------ */ - size_t in_pos = 0; - for (const auto& iter : ins) { - VLOG(6) << "Mapping input tensor: " << iter.first - << " To position: " << in_pos; - (*fwd_inputs_name_pos_map)[iter.first] = in_pos; - in_pos++; - } - size_t out_pos = 0; - for (const auto& iter : outs) { - VLOG(6) << "Mapping output tensor: " << iter.first - << " To position: " << out_pos; - (*fwd_outputs_name_pos_map)[iter.first] = out_pos; - out_pos++; - } - return true; } @@ -561,16 +725,13 @@ static bool CollectInformationFromOpInfo( /* --------- CodeGen: Forward GradNode Creation ------ */ /* --------------------------------------------------- */ static std::string GenerateGradNodeCreationContent( - const std::vector& - grad_node_default_attr_maps, const std::unordered_map& fwd_inputs_name_pos_map, const std::unordered_map& fwd_outputs_name_pos_map, const std::map& 
grad_ins_fwd_slotname_map, - const proto::OpProto& op_proto) { + const std::string& op_type, const std::vector& in_vars, + const std::vector& out_vars) { VLOG(6) << "Generating GradNode Creation codes"; - const std::string& op_type = op_proto.type(); - // [Generation] Construct GradOpNode // Run ComputeRequiredGrad @@ -578,7 +739,7 @@ static std::string GenerateGradNodeCreationContent( // then generate: "egr::AutogradMeta* p_autograd_out = // egr::EagerUtils::autograd_meta("op_proto->outputs()[0].name()")" std::string get_autograd_meta_str = " // Prepare Autograd Meta \n"; - for (const proto::OpProto::Var& input : op_proto.inputs()) { + for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); const std::string& input_autograd_name = "p_autograd_" + input_name; @@ -602,7 +763,7 @@ static std::string GenerateGradNodeCreationContent( // If single output slotname and not duplicable, // then generate: "egr::AutogradMeta* p_autograd_out = // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")" - for (const proto::OpProto::Var& output : op_proto.outputs()) { + for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); const std::string& output_autograd_name = "p_autograd_" + output_name; @@ -636,8 +797,8 @@ static std::string GenerateGradNodeCreationContent( // [GradOpNode] Generation std::string grad_node_creation_str = ""; - size_t bwd_in_slot_num = op_proto.outputs().size(); - size_t bwd_out_slot_num = op_proto.inputs().size(); + size_t bwd_in_slot_num = out_vars.size(); + size_t bwd_out_slot_num = in_vars.size(); const char* GRAD_OP_NODE_TEMPLATE = " auto grad_node = std::make_shared(%d, %d);\n"; grad_node_creation_str += " // Create GradOpNode\n"; @@ -669,7 +830,7 @@ static std::string GenerateGradNodeCreationContent( // [GradOpNode] SetGradOutMeta // [GradOpNode] Add Edges std::string compute_require_grad_args = "trace_backward"; - for (const proto::OpProto::Var& input : op_proto.inputs()) { + for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); const std::string& input_autograd_name = "p_autograd_" + input_name; compute_require_grad_args += ", &" + input_autograd_name; @@ -689,7 +850,7 @@ static std::string GenerateGradNodeCreationContent( // [AutogradMeta] SetOutRank // [AutogradMeta] SetHistory std::string pass_stop_gradient_args = "false"; - for (const proto::OpProto::Var& output : op_proto.outputs()) { + for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); const std::string& output_autograd_name = "p_autograd_" + output_name; pass_stop_gradient_args += ", &" + output_autograd_name; @@ -727,24 +888,11 @@ static std::string GenerateGradNodeCreationContent( return grad_node_creation_body_str; } -static std::string AppendUseOp(const std::string& op_type) { - // [Generation] Append USE_OP - const char* USE_OP_TEMPLATE = "USE_OP(%s);\n"; - std::string return_str = paddle::string::Sprintf(USE_OP_TEMPLATE, op_type); - - // Special Ops - if (op_type == "reduce_sum") - return_str += paddle::string::Sprintf(USE_OP_TEMPLATE, "reduce_sum_grad"); - - return return_str; -} - /* -------------------------------- */ /* --------- CodeGen: Forward ----- */ /* -------------------------------- */ static std::pair GenerateForwardFunctionContents( - const std::vector& - grad_node_default_attr_maps, + bool generate_forward_only, const std::unordered_map& fwd_inputs_name_pos_map, const std::unordered_map& 
fwd_outputs_name_pos_map, const std::map& grad_ins_fwd_slotname_map, @@ -758,7 +906,8 @@ static std::pair GenerateForwardFunctionContents( std::string, std::vector>>& grad_outs, - const proto::OpProto& op_proto) { + const std::string& op_type, const std::vector& in_vars, + const std::vector& out_vars) { /* // Forward Function Example: std::tuple, Tensor, vector> @@ -779,6 +928,7 @@ static std::pair GenerateForwardFunctionContents( ,ConstructDuplicableOutput(Out1Num)} }; // According to op_proto->attrs() + egr::legacy::RunOp("op_type", ins, outs, attr_map, Controller.Instance().GetExpectedPlace(), {}); @@ -795,8 +945,6 @@ static std::pair GenerateForwardFunctionContents( */ VLOG(6) << "Generating Dygraph Forward Function"; - const std::string& op_type = op_proto.type(); - std::string generated_function_body = ""; std::string dygraph_function_args_str = ""; @@ -806,8 +954,8 @@ static std::pair GenerateForwardFunctionContents( // [Generation] Get Ins Map std::string ins_contents_str = ""; - std::vector input_args_str_list(op_proto.inputs().size()); - for (const proto::OpProto::Var& input : op_proto.inputs()) { + std::vector input_args_str_list(in_vars.size()); + for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); size_t input_position = fwd_inputs_name_pos_map.at(input_name); if (input.duplicable()) { @@ -848,7 +996,7 @@ static std::pair GenerateForwardFunctionContents( // [Generation] Get Outs Map std::string outs_contents_str = ""; - for (const proto::OpProto::Var& output : op_proto.outputs()) { + for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); std::string outnum = "1"; if (output.duplicable()) { @@ -859,7 +1007,7 @@ static std::pair GenerateForwardFunctionContents( paddle::string::Sprintf(FWD_NUM_ARG_TEMPLATE, outnum); dygraph_function_args_str += arg_str; const char* FWD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", egr::ConstructDuplicableOutput(%s) },"; + "{ \"%s\", egr::EagerUtils::ConstructDuplicableOutput(%s) },"; outs_contents_str += paddle::string::Sprintf(FWD_OUTS_CONTENT_TEMPLATE, output_name, outnum); } else { @@ -888,7 +1036,6 @@ static std::pair GenerateForwardFunctionContents( // [Generation] Get Attrs dygraph_function_args_str += ", const paddle::framework::AttributeMap& attr_map"; - generated_function_body += "\n"; // [Generation] Get TraceOp const char* FWD_TRACE_OP_TEMPLATE = @@ -898,54 +1045,57 @@ static std::pair GenerateForwardFunctionContents( " egr::Controller::Instance().GetExpectedPlace(),\n" " &default_attrs, true, {});\n"; std::string trace_op_str = - paddle::string::Sprintf(FWD_TRACE_OP_TEMPLATE, op_proto.type()); + paddle::string::Sprintf(FWD_TRACE_OP_TEMPLATE, op_type); generated_function_body += trace_op_str; generated_function_body += "\n"; VLOG(6) << "Generated AttrMap & TraceOp"; // [Generation] Convert output VarBase to Vector/Tensor - size_t output_size = op_proto.outputs().size(); + size_t output_size = out_vars.size(); std::vector return_contents(output_size); std::vector return_types(output_size); - for (const proto::OpProto::Var& output : op_proto.outputs()) { + for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); std::string out_tensor_str; size_t return_position = fwd_outputs_name_pos_map.at(output_name); + std::string output_varname = LegalizeVariableName(output_name); if (output.duplicable()) { const char* FWD_OUT_TENSORS_TEMPLATE = " std::vector %s = " "egr::EagerUtils::GetOutputs(outs[\"%s\"]);\n"; 
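+      // Illustrative expansion of the template above (assuming a duplicable
+      // forward output named "Out"), roughly:
+      //   std::vector<egr::EagerTensor> Out =
+      //       egr::EagerUtils::GetOutputs(outs["Out"]);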
out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSORS_TEMPLATE, - output_name, output_name); + output_varname, output_name); return_types[return_position] = "std::vector"; } else { const char* FWD_OUT_TENSOR_TEMPLATE = " egr::EagerTensor %s = " "egr::EagerUtils::GetOutput(outs[\"%s\"][0]);\n"; out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, - output_name, output_name); + output_varname, output_name); return_types[return_position] = "egr::EagerTensor"; } - return_contents[return_position] = output_name; + return_contents[return_position] = output_varname; generated_function_body += out_tensor_str; } generated_function_body += "\n"; VLOG(6) << "Converted Output VarBase to EagerTensor(s)"; // [Generation] ComputeRequireGrad -> GradNodeCreation - std::string grad_node_creation_body_str = GenerateGradNodeCreationContent( - grad_node_default_attr_maps, fwd_inputs_name_pos_map, - fwd_outputs_name_pos_map, grad_ins_fwd_slotname_map, op_proto); - generated_function_body += grad_node_creation_body_str; - generated_function_body += "\n"; - VLOG(6) << "Generated GradNode Creation codes"; + if (!generate_forward_only) { + std::string grad_node_creation_body_str = GenerateGradNodeCreationContent( + fwd_inputs_name_pos_map, fwd_outputs_name_pos_map, + grad_ins_fwd_slotname_map, op_type, in_vars, out_vars); + generated_function_body += grad_node_creation_body_str; + generated_function_body += "\n"; + VLOG(6) << "Generated GradNode Creation codes"; + } // [Generation] Handle return: Tuple/Vector/Tensor generated_function_body += "\n"; - std::string return_str; + std::string return_str = ""; std::string return_type_str = ""; std::string function_proto_return_type_str = ""; if (return_contents.size() > 1) { @@ -968,14 +1118,20 @@ static std::pair GenerateForwardFunctionContents( const char* FWD_FUNCTION_PROTO_RETURN_TEMPLATE = "std::tuple<%s>"; function_proto_return_type_str = paddle::string::Sprintf( FWD_FUNCTION_PROTO_RETURN_TEMPLATE, return_type_str); - } else { + + } else if (return_contents.size() == 1) { // Return vector or Tensor return_type_str = return_types[0]; const char* FWD_TENSOR_RETURN_TEMPLATE = " return %s;"; return_str = paddle::string::Sprintf(FWD_TENSOR_RETURN_TEMPLATE, return_contents[0]); function_proto_return_type_str = return_type_str; + + } else { + return_str = "return nullptr;"; + function_proto_return_type_str = "void*"; } + generated_function_body += return_str; generated_function_body += "\n"; VLOG(6) << "Generated return codes"; @@ -983,14 +1139,16 @@ static std::pair GenerateForwardFunctionContents( // [Generation] Get Full Function std::string function_name = op_type + "_dygraph_function"; + if (dygraph_function_args_str.size() > 0) { + auto iter = dygraph_function_args_str.begin(); + if ((*iter) == ',') dygraph_function_args_str.erase(iter); + } + const char* FWD_FUNCTION_TEMPLATE = "%s %s(%s) {\n\n%s\n}\n\n"; std::string fwd_function_str = paddle::string::Sprintf( FWD_FUNCTION_TEMPLATE, function_proto_return_type_str, function_name, dygraph_function_args_str, generated_function_body); - // [Generation] Append USE_OP - fwd_function_str += AppendUseOp(op_type); - // [Generation] Generate forward functions header const char* FWD_HEADER_TEMPLATE = "%s %s(%s);\n"; std::string dygraph_function_declaration_str = paddle::string::Sprintf( @@ -1004,8 +1162,6 @@ static std::pair GenerateForwardFunctionContents( /* --------- CodeGen: GradNode::operator() ------ */ /* ---------------------------------------------- */ static std::string GenerateGradNodeCCContents( - 
const std::vector& - grad_node_default_attr_maps, const std::vector& grad_op_types, const std::unordered_map& fwd_inputs_name_pos_map, const std::unordered_map& fwd_outputs_name_pos_map, @@ -1020,7 +1176,8 @@ static std::string GenerateGradNodeCCContents( std::string, std::vector>>& grad_outs, - const proto::OpProto& op_proto) { + const std::string& op_type, const std::vector& in_vars, + const std::vector& out_vars) { VLOG(6) << "Generating Grad Node CC"; /* [Outline] @@ -1066,7 +1223,6 @@ static std::string GenerateGradNodeCCContents( } */ - const std::string& op_type = op_proto.type(); std::string generated_grad_function_body = ""; // [Generation] Get Tracer @@ -1122,7 +1278,7 @@ static std::string GenerateGradNodeCCContents( // [Generation] Get Outs Map std::unordered_set duplicable_input_name_set; - for (const auto& in : op_proto.inputs()) { + for (const auto& in : in_vars) { if (in.duplicable()) duplicable_input_name_set.insert(in.name()); } @@ -1132,23 +1288,76 @@ static std::string GenerateGradNodeCCContents( if (grad_outs_slotname_map.count(grad_output_name)) { // Fwd Tensor - const std::string& fwd_input_name = - grad_outs_slotname_map.at(grad_output_name); - size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_input_name); - - if (duplicable_input_name_set.count(fwd_input_name)) { - const char* GRAD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", egr::ConstructDuplicableOutput( " - "this->OutputMeta()[%d].Size() ) },"; + const std::string& fwd_name = grad_outs_slotname_map.at(grad_output_name); + + /* Handle Special Case: "PullSparseOp", etc + + Forward: + + Ids W + | | + PullSparseOp + | + Out + + Backward: + + Ids GradOut W + | | | + PullSparseGradOp + | + GradOut + + Its grad output "GradOut" corresponds to forward output "Out", + where there is a hiden inplace involved. So we find "GradOut"'s index + in + grads, and perform the inplace operation by constructing outs = + {{"Out", grads[i]}} + + GradOut -> Out -> fwd_output_pos -> grads position -> grads[i] + outs = {{"Out", grads[i]}} + + For returns, append "GradOut" to the very end of return list. + */ + if (!fwd_inputs_name_pos_map.count(fwd_name)) { + PADDLE_ENFORCE(fwd_outputs_name_pos_map.count(fwd_name), + paddle::platform::errors::Fatal( + "fwd_name not found in fwd_inputs_name_pos_map nor " + "fwd_outputs_name_pos_map")); + + size_t grads_position = fwd_outputs_name_pos_map.at(fwd_name); + std::string grad_ptr_name = fwd_name + "_ptrs"; + const char* GET_GRADS_PTR_TEMPLATE = + " std::vector> %s;\n" + " for(const auto& t : grads[%d]) {\n " + "%s.emplace_back(std::move(std::make_shared(t)));" + "\n }\n"; + std::string grads_ptr_str = + paddle::string::Sprintf(GET_GRADS_PTR_TEMPLATE, grad_ptr_name, + grads_position, grad_ptr_name); + generated_grad_function_body += grads_ptr_str; + generated_grad_function_body += "\n"; + + const char* GRAD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", %s },"; outs_contents_str += paddle::string::Sprintf( - GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, fwd_input_position); + GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, grad_ptr_name); + } else { - const char* GRAD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", " - "{std::make_shared(egr::Controller::Instance()." 
- "GenerateUniqueName())}},"; - outs_contents_str += paddle::string::Sprintf(GRAD_OUTS_CONTENT_TEMPLATE, - grad_output_name); + size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); + if (duplicable_input_name_set.count(fwd_name)) { + const char* GRAD_OUTS_CONTENT_TEMPLATE = + "{ \"%s\", egr::EagerUtils::ConstructDuplicableOutput( " + "this->OutputMeta()[%d].Size() ) },"; + outs_contents_str += paddle::string::Sprintf( + GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, fwd_input_position); + } else { + const char* GRAD_OUTS_CONTENT_TEMPLATE = + "{ \"%s\", " + "{std::make_shared(egr::Controller::Instance()." + "GenerateUniqueName())}},"; + outs_contents_str += paddle::string::Sprintf( + GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name); + } } } else { PADDLE_THROW(platform::errors::Fatal( @@ -1173,7 +1382,7 @@ static std::string GenerateGradNodeCCContents( // [Generation] Get Attrs Map std::string trace_opbase_str = ""; - for (size_t i = 0; i < grad_node_default_attr_maps.size(); i++) { + for (size_t i = 0; i < grad_op_types.size(); i++) { const std::string& op_base_type = grad_op_types[i]; const char* TRACE_OP_TEMPLATE = @@ -1192,15 +1401,39 @@ static std::string GenerateGradNodeCCContents( // [Generation] Get Return std::string outputs_str = ""; + size_t num_appended_outputs = 0; for (auto iter : grad_outs) { const std::string& grad_out_name = iter.first; - size_t fwd_input_position = - fwd_inputs_name_pos_map.at(grad_outs_slotname_map.at(grad_out_name)); + const std::string& fwd_name = grad_outs_slotname_map.at(grad_out_name); + + if (fwd_inputs_name_pos_map.count(fwd_name)) { + size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); + const char* BWD_OUTPUT_TEMPLATE = + " outputs[%d] = egr::EagerUtils::GetOutputs(outs[\"%s\"]);\n"; + outputs_str += paddle::string::Sprintf(BWD_OUTPUT_TEMPLATE, + fwd_input_position, grad_out_name); + num_appended_outputs++; + } else { + PADDLE_ENFORCE(fwd_outputs_name_pos_map.count(fwd_name), + paddle::platform::errors::Fatal( + "fwd_name not found in fwd_inputs_name_pos_map nor " + "fwd_outputs_name_pos_map")); + } + } - const char* BWD_OUTPUT_TEMPLATE = - " outputs[%d] = egr::EagerUtils::GetOutputs(outs[\"%s\"]);\n"; - outputs_str += paddle::string::Sprintf(BWD_OUTPUT_TEMPLATE, - fwd_input_position, grad_out_name); + /* Handle Special Case: "PullSparseOp", etc + For returns, append "GradOut" to the very end of return list. 
*/ + for (auto iter : grad_outs) { + const std::string& grad_out_name = iter.first; + const std::string& fwd_name = grad_outs_slotname_map.at(grad_out_name); + + if (fwd_outputs_name_pos_map.count(fwd_name)) { + const char* BWD_OUTPUT_TEMPLATE = + " outputs[%d] = egr::EagerUtils::GetOutputs(outs[\"%s\"]);\n"; + outputs_str += paddle::string::Sprintf( + BWD_OUTPUT_TEMPLATE, num_appended_outputs, grad_out_name); + num_appended_outputs++; + } } const char* BWD_RETURN_TEMPLATE = @@ -1230,10 +1463,9 @@ static std::string GenerateGradNodeCCContents( /* --------- CodeGen: GradNode Header ------ */ /* ----------------------------------------- */ static std::string GenerateGradNodeHeaderContents( - const std::vector& - grad_node_default_attr_maps, const std::map& grad_ins_fwd_slotname_map, - const proto::OpProto& op_proto) { + const std::string& op_type, const std::vector& in_vars, + const std::vector& out_vars) { VLOG(6) << "Generating Grad Node Header"; const char* GRAD_NODE_TEMPLATE = @@ -1261,8 +1493,6 @@ static std::string GenerateGradNodeHeaderContents( "%s\n" "};"; - const std::string& op_type = op_proto.type(); - // [Generation] Handle Attributes std::string set_attr_map_str = " void SetAttrMap(paddle::framework::AttributeMap&& attr_map) {\n " @@ -1279,12 +1509,12 @@ static std::string GenerateGradNodeHeaderContents( // [Generation] Handle TensorWrappers std::unordered_set duplicable_tensors; - for (const proto::OpProto::Var& input : op_proto.inputs()) { + for (const proto::OpProto::Var& input : in_vars) { if (input.duplicable()) { duplicable_tensors.insert(input.name()); } } - for (const proto::OpProto::Var& output : op_proto.outputs()) { + for (const proto::OpProto::Var& output : out_vars) { if (output.duplicable()) { duplicable_tensors.insert(output.name()); } @@ -1363,34 +1593,31 @@ static void GenerateForwardHFile(const std::string& output_dir, forward_header_stream.close(); } -static void GenerateForwardDygraphFile(const std::string& op_type, - const std::string& output_dir, +static void GenerateForwardDygraphFile(const std::string& output_dir, const std::string& fwd_function_str) { std::string forwards_dir = output_dir + "/forwards/"; - std::string node_h_filename = op_type + "_node.h"; - std::string forward_cc_filename = op_type + "_dygraph.cc"; + std::string forward_cc_filename = "dygraph_forward_functions.cc"; std::string forward_cc_path = forwards_dir + forward_cc_filename; const char* FORWARD_INCLUDE_TEMPLATE = "#include " "\"paddle/fluid/eager/api/generated/fluid_generated/" "dygraph_forward_api.h\"\n" "#include " - "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/%s\"\n\n" + "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n\n" "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n" "#include \"paddle/fluid/eager/legacy/op_runner.h\"\n"; std::string forward_cc_include_str = - paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE, node_h_filename); + paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE); std::ofstream forward_cc_stream(forward_cc_path, std::ios::out); forward_cc_stream << forward_cc_include_str; forward_cc_stream << fwd_function_str; forward_cc_stream.close(); } -static void GenerateNodeHFile(const std::string& op_type, - const std::string& output_dir, +static void GenerateNodeHFile(const std::string& output_dir, const std::string& grad_node_str) { std::string nodes_dir = output_dir + "/nodes/"; - std::string node_h_filename = op_type + "_node.h"; + std::string node_h_filename = "nodes.h"; std::string node_h_path = nodes_dir + 
node_h_filename; std::string node_h_include_str = "#pragma once\n" @@ -1403,12 +1630,10 @@ static void GenerateNodeHFile(const std::string& op_type, node_h_stream.close(); } -static void GenerateNodeCCFile(const std::string& op_type, - const std::string& output_dir, +static void GenerateNodeCCFile(const std::string& output_dir, const std::string& grad_function_str) { std::string nodes_dir = output_dir + "/nodes/"; - std::string node_h_filename = op_type + "_node.h"; - std::string node_cc_filename = op_type + "_node.cc"; + std::string node_cc_filename = "nodes.cc"; std::string node_cc_path = nodes_dir + node_cc_filename; const char* NODE_CC_INCLUDE_TEMPLATE = "#include \"glog/logging.h\"\n" @@ -1418,9 +1643,9 @@ static void GenerateNodeCCFile(const std::string& op_type, "#include \"paddle/fluid/eager/utils.h\"\n" "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n" "#include " - "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/%s\"\n\n"; + "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n\n"; std::string node_cc_include_str = - paddle::string::Sprintf(NODE_CC_INCLUDE_TEMPLATE, node_h_filename); + paddle::string::Sprintf(NODE_CC_INCLUDE_TEMPLATE); std::ofstream node_cc_stream(node_cc_path, std::ios::out); node_cc_stream << node_cc_include_str; node_cc_stream << grad_function_str; @@ -1441,6 +1666,9 @@ static std::string GenerateDygraphHFileIncludes() { static void DygraphCodeGeneration(const std::string& output_dir) { std::string dygraph_forward_api_str = GenerateDygraphHFileIncludes(); + std::string fwd_function_str = ""; + std::string grad_node_h_str = ""; + std::string grad_node_cc_str = ""; auto& op_info_map = paddle::framework::OpInfoMap::Instance().map(); @@ -1454,10 +1682,9 @@ static void DygraphCodeGeneration(const std::string& output_dir) { /* ----------------------------- */ /* ---- Collect Information ---- */ /* ----------------------------- */ - std::vector grad_node_default_attr_maps; std::vector grad_op_types; - std::unordered_map fwd_inputs_name_pos_map; - std::unordered_map fwd_outputs_name_pos_map; + std::vector in_vars; + std::vector out_vars; std::map grad_outs_slotname_map; std::map grad_ins_fwd_slotname_map; std::map grad_ins_grad_slotname_map; @@ -1469,65 +1696,152 @@ static void DygraphCodeGeneration(const std::string& output_dir) { grad_outs; VLOG(6) << "-------- CollectInformationFromOpInfo -------"; - bool is_available = CollectInformationFromOpInfo( - op_info, &grad_node_default_attr_maps, &grad_op_types, - &fwd_inputs_name_pos_map, &fwd_outputs_name_pos_map, + + CollectForwardInformationFromOpInfo(op_info, &in_vars, &out_vars); + + bool generate_forward_only = false; + bool is_available = CollectGradInformationFromOpInfo( + op_info, &generate_forward_only, &grad_op_types, &grad_outs_slotname_map, &grad_ins_fwd_slotname_map, &grad_ins_grad_slotname_map, &grad_ins, &grad_outs); - if (!is_available) continue; + if (!is_available && !generate_forward_only) { + VLOG(6) << "Skipped operator: " << op_type; + continue; + } + + VLOG(6) << "-------- PurifyOpProto -------"; + std::unordered_map fwd_inputs_name_pos_map; + std::unordered_map fwd_outputs_name_pos_map; + PurifyForwardOpProto(*op_proto, &fwd_inputs_name_pos_map, + &fwd_outputs_name_pos_map, &in_vars, &out_vars); + + if (!generate_forward_only) { + PurifyGradOpProto(*op_proto, &grad_outs_slotname_map, + &grad_ins_fwd_slotname_map, &grad_ins_grad_slotname_map, + &grad_ins, &grad_outs); + } /* --------------------------- */ /* --------- CodeGen --------- */ /* 
--------------------------- */ - /* ---- xxx_dygraph.cc ---- */ + /* ---- forward_dygraph_functions.cc ---- */ VLOG(6) << "-------- GenerateForwardFunctionContents -------"; std::pair body_and_declaration = GenerateForwardFunctionContents( - grad_node_default_attr_maps, fwd_inputs_name_pos_map, + generate_forward_only, fwd_inputs_name_pos_map, fwd_outputs_name_pos_map, grad_ins_fwd_slotname_map, grad_ins_grad_slotname_map, grad_outs_slotname_map, grad_ins, - grad_outs, *op_proto); - std::string fwd_function_str = body_and_declaration.first; - GenerateForwardDygraphFile(op_type, output_dir, fwd_function_str); + grad_outs, op_type, in_vars, out_vars); + + fwd_function_str += body_and_declaration.first + "\n"; /* ---- dygraph_forward_api.h ---- */ std::string fwd_function_declare_str = body_and_declaration.second; dygraph_forward_api_str += fwd_function_declare_str; - /* ---- xxx_node.h ---- */ + if (generate_forward_only) continue; + + /* ---- nodes.h ---- */ VLOG(6) << "-------- GenerateGradNodeHeaderContents -------"; - std::string grad_node_h_str = GenerateGradNodeHeaderContents( - grad_node_default_attr_maps, grad_ins_fwd_slotname_map, *op_proto); - GenerateNodeHFile(op_type, output_dir, grad_node_h_str); + grad_node_h_str += + GenerateGradNodeHeaderContents(grad_ins_fwd_slotname_map, op_type, + in_vars, out_vars) + + "\n"; - /* ---- xxx_node.cc ---- */ + /* ---- nodes.cc ---- */ VLOG(6) << "-------- GenerateGradNodeCCContents -------"; - std::string grad_node_cc_str = GenerateGradNodeCCContents( - grad_node_default_attr_maps, grad_op_types, fwd_inputs_name_pos_map, - fwd_outputs_name_pos_map, grad_ins_fwd_slotname_map, - grad_ins_grad_slotname_map, grad_outs_slotname_map, grad_ins, grad_outs, - *op_proto); - GenerateNodeCCFile(op_type, output_dir, grad_node_cc_str); - - VLOG(6) << op_type << ": Finished Generation"; + grad_node_cc_str += GenerateGradNodeCCContents( + grad_op_types, fwd_inputs_name_pos_map, + fwd_outputs_name_pos_map, grad_ins_fwd_slotname_map, + grad_ins_grad_slotname_map, grad_outs_slotname_map, + grad_ins, grad_outs, op_type, in_vars, out_vars) + + "\n"; + + VLOG(6) << op_type << ": Finished Generating Op: " << op_type; } + /* ---- dygraph_forward_function.cc ---- */ + VLOG(6) << "-------- GenerateDygraphForwardCCFile -------"; + GenerateForwardDygraphFile(output_dir, fwd_function_str); /* ---- dygraph_forward_api.h ---- */ VLOG(6) << "-------- GenerateForwardHFile -------"; GenerateForwardHFile(output_dir, dygraph_forward_api_str); + + /* ---- nodes.h ---- */ + VLOG(6) << "-------- GenerateNodeHFile -------"; + GenerateNodeHFile(output_dir, grad_node_h_str); + + /* ---- nodes.cc ---- */ + VLOG(6) << "-------- GenerateNodeCCFile -------"; + GenerateNodeCCFile(output_dir, grad_node_cc_str); +} + +static void PrepareAttrMapForOps() { + // Handle "fused_elemwise_add_activation" + std::vector functor_list = {"a", "b"}; + operators_with_attrs["fused_elemwise_add_activation"] = {}; + operators_with_attrs["fused_elemwise_add_activation"]["functor_list"] = + functor_list; + + // Handle "fused_elemwise_activation" + operators_with_attrs["fused_elemwise_activation"] = {}; + operators_with_attrs["fused_elemwise_activation"]["functor_list"] = + functor_list; + + // Handle "reverse" + std::vector axis = {0}; + operators_with_attrs["reverse"] = {}; + operators_with_attrs["reverse"]["axis"] = axis; + + // Handle "flip" + operators_with_attrs["flip"] = {}; + operators_with_attrs["flip"]["axis"] = axis; + + // Handle "cast" + operators_with_attrs["cast"] = {}; + 
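+  // The value 5 is assumed to be framework::proto::VarType::FP32; it only needs
+  // to satisfy the attr checker while grad info is collected during codegen.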
operators_with_attrs["cast"]["out_dtype"] = 5; + operators_with_attrs["cast"]["in_dtype"] = 5; + + // Handle "transfer_dtype" + operators_with_attrs["transfer_dtype"] = {}; + operators_with_attrs["transfer_dtype"]["out_dtype"] = 5; + operators_with_attrs["transfer_dtype"]["in_dtype"] = 5; + + // Handle "c_split" + operators_with_attrs["c_split"] = {}; + operators_with_attrs["c_split"]["nranks"] = 1; +} + +static void CollectOperatorsToCodeGen(const std::string& op_list_path) { + std::string line; + std::ifstream op_list_file(op_list_path); + if (op_list_file.is_open()) { + while (getline(op_list_file, line)) { + operators_to_codegen.insert(line); + } + op_list_file.close(); + } else { + PADDLE_THROW( + paddle::platform::errors::Fatal("Unable to open op_list.txt file")); + } } } // namespace framework } // namespace paddle int main(int argc, char* argv[]) { - if (argc != 2) { - std::cerr << "argc must be 2" << std::endl; + if (argc != 3) { + std::cerr << "argc must be 3" << std::endl; return -1; } std::string eager_root = argv[1]; + std::string op_list_path = argv[2]; + + paddle::framework::CollectOperatorsToCodeGen(op_list_path); + paddle::framework::PrepareAttrMapForOps(); + paddle::framework::DygraphCodeGeneration(eager_root); return 0; diff --git a/paddle/fluid/eager/auto_code_generator/generate_file_structures.py b/paddle/fluid/eager/auto_code_generator/generate_file_structures.py index af6cf2cec0246..56ec287561c56 100644 --- a/paddle/fluid/eager/auto_code_generator/generate_file_structures.py +++ b/paddle/fluid/eager/auto_code_generator/generate_file_structures.py @@ -18,12 +18,6 @@ if __name__ == "__main__": assert len(sys.argv) == 2 eager_dir = sys.argv[1] - - op_list = [] - with open(f"{eager_dir}/auto_code_generator/op_list.txt", "r") as f: - for line in f: - line = str(line.strip()) - op_list.append(line) """ paddle/fluid/eager |- generated @@ -31,15 +25,15 @@ | | "add_subdirectory(forwards), add_subdirectory(nodes)" | | |- forwards - | |- op_name + "_dygraph.cc" + | |- "dygraph_forward_functions.cc" | |- CMakeLists.txt - | | "cc_library(dygraph_function SRCS op_name+"_dygraph.cc" DEPS ${eager_deps} ${fluid_deps} GLOB_OP_LIB)" + | | "cc_library(dygraph_function SRCS dygraph_forward_functions.cc DEPS ${eager_deps} ${fluid_deps} GLOB_OP_LIB)" | | |- nodes - | |- op_name + "_node.cc" - | |- op_name + "_node.h" + | |- "nodes.cc" + | |- "nodes.h" | |- CMakeLists.txt - | | "cc_library(dygraph_node SRCS op_name+"_node.cc" DEPS ${eager_deps} ${fluid_deps})" + | | "cc_library(dygraph_node SRCS nodes.cc DEPS ${eager_deps} ${fluid_deps})" | | |- dygraph_forward_api.h """ @@ -56,10 +50,10 @@ dygraph_forward_api_h_path = os.path.join(generated_dir, "dygraph_forward_api.h") empty_files = [dygraph_forward_api_h_path] - for op_name in op_list: - empty_files.append(os.path.join(forwards_dir, op_name + "_dygraph.cc")) - empty_files.append(os.path.join(nodes_dir, op_name + "_node.cc")) - empty_files.append(os.path.join(nodes_dir, op_name + "_node.h")) + empty_files.append( + os.path.join(forwards_dir, "dygraph_forward_functions.cc")) + empty_files.append(os.path.join(nodes_dir, "nodes.cc")) + empty_files.append(os.path.join(nodes_dir, "nodes.h")) for path in empty_files: if not os.path.exists(path): @@ -73,14 +67,14 @@ with open(nodes_level_cmakelist_path, "w") as f: f.write( - "cc_library(dygraph_node SRCS %s DEPS ${eager_deps} ${fluid_deps})\n" - % " ".join([op_name + '_node.cc' for op_name in op_list])) + "cc_library(dygraph_node SRCS nodes.cc DEPS ${eager_deps} ${fluid_deps})\n" + ) 
f.write("add_dependencies(dygraph_node eager_codegen)") with open(forwards_level_cmakelist_path, "w") as f: f.write( - "cc_library(dygraph_function SRCS %s DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB})\n" - % " ".join([op_name + '_dygraph.cc' for op_name in op_list])) + "cc_library(dygraph_function SRCS dygraph_forward_functions.cc DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})\n" + ) f.write("add_dependencies(dygraph_function eager_codegen)") with open(generated_level_cmakelist_path, "w") as f: diff --git a/paddle/fluid/eager/auto_code_generator/op_list.txt b/paddle/fluid/eager/auto_code_generator/op_list.txt index 00a9abde156fb..699a84169d700 100644 --- a/paddle/fluid/eager/auto_code_generator/op_list.txt +++ b/paddle/fluid/eager/auto_code_generator/op_list.txt @@ -1,4 +1,542 @@ -sigmoid +rsqrt +multihead_matmul +addmm +gru +round +rank_attention +fused_embedding_fc_lstm +where_index +bicubic_interp +arg_min +tile +bilinear_tensor_product +ctc_align +pow2_decay_with_linear_warmup +split +fc +clear_float_status matmul_v2 -reduce_sum +load +c_embedding +elementwise_max +adadelta +chunk_eval +check_finite_and_unscale +sparse_momentum +tan +adam +fsp +where +logical_xor +multiclass_nms3 +one_hot_v2 +sequence_softmax +affine_channel +triangular_solve +sequence_topk_avg_pooling +space_to_depth +reverse +fused_embedding_eltwise_layernorm +expand_v2 +lgamma +solve +deformable_psroi_pooling +transfer_layout +instance_norm +decode_jpeg +distributed_push_sparse +gather_nd +reduce_prod +matrix_rank +asin +lstmp +iou_similarity +huber_loss +one_hot +sequence_slice +lookup_table +softplus +depthwise_conv2d +c_allreduce_sum +fused_fc_elementwise_layernorm +sigmoid_cross_entropy_with_logits +exp +scatter +c_allreduce_min +equal_all +searchsorted +fusion_squared_mat_sub +unique +log +conv_shift +smooth_l1_loss +linear_interp_v2 +momentum +temporal_shift +nce +mv +global_scatter +proximal_gd +memcpy_h2d +add_position_encoding +cosh +hash +grad_add +sign +prelu +linspace +fill_diagonal +logsigmoid +load_combine +fetch_v2 +randperm +sequence_scatter +partial_sum +relu6 +partial_allgather +c_scatter +alltoall +conv3d +lstm_unit +not_equal +transpose2 +c_sync_comm_stream +uniform_random_batch_size_like +unfold +lrn +isclose +softmax_with_cross_entropy +isfinite_v2 +bernoulli +max_pool3d_with_index +gaussian_random +flatten2 +matmul +cvm +recv_v2 +adamax +masked_select +range +bitwise_not +trace +multinomial +modified_huber_loss +c_reduce_prod +roll +squared_l2_distance +conv3d_transpose +share_data +fake_quantize_abs_max +unique_with_counts +fill +concat +fill_zeros_like +hierarchical_sigmoid +isinf_v2 +squeeze +multiclass_nms2 +bpr_loss +fft_c2c +bicubic_interp_v2 +angle +reshape +coalesce_tensor +dgc +roi_align +reshape2 +reduce_any +unstack +scatter_nd_add +sequence_reshape +bilateral_slice +fill_any_like +empty +partial_recv +pad_constant_like +pool2d +size +imag +eigh +stack +dgc_momentum +lamb +generate_proposals_v2 +c_sync_calc_stream +bitwise_or +gru_unit +fake_channel_wise_quantize_dequantize_abs_max +sampling_id +unsqueeze2 +transfer_dtype +allreduce +average_accumulates +sequence_enumerate +fusion_seqconv_eltadd_relu +bce_loss +generate_proposal_labels +im2sequence +isinf +c_reducescatter +adagrad +linear_chain_crf +retinanet_target_assign +fusion_group +teacher_student_sigmoid_loss +random_crop +lookup_table_v2 +detection_map +l1_norm +sqrt +partial_send +fused_elemwise_activation +slogdeterminant +share_buffer +bitwise_and +diag_embed +unbind +dropout 
+moving_average_abs_max_scale +beam_search +log_loss +greater_than +kron +sigmoid_focal_loss +rmsprop +conv2d +uniform_random_inplace +maxout +linear_interp +auc +logical_or +batch_norm +c_reduce_sum elementwise_add +acos +send_and_recv +unpool +cumprod +sample_logits +pull_box_extended_sparse +crop_tensor +fill_constant +deformable_conv +generate_mask_labels +locality_aware_nms +expand_as +matrix_power +greater_equal +generate_proposals +bilinear_interp +sigmoid +inplace_abn +softshrink +mul +data_norm +get_tensor_from_selected_rows +spp +floor +gelu +retinanet_detection_output +push_dense +silu +sequence_erase +real +nearest_interp_v2 +dgc_clip_by_norm +squeeze2 +strided_slice +conj +precision_recall +save +fusion_seqexpand_concat_fc +fake_quantize_range_abs_max +depthwise_conv2d_transpose +positive_negative_pair +square +var_conv_2d +log1p +fused_softmax_mask_upper_triangle +clip_by_norm +atan2 +box_decoder_and_assign +fft_r2c +roi_pool +overlap_add +fill_constant_batch_size_like +fill_any +dequantize_log +c_split +barrier +max_pool2d_with_index +pad3d +norm +viterbi_decode +mish +box_coder +flatten +elementwise_mod +margin_cross_entropy +pull_sparse +logical_and +pow +stanh +label_smooth +merged_momentum +c_reduce_min +ascend_trigger +fused_feedforward +rpn_target_assign +roi_perspective_transform +expand +prroi_pool +pool3d +memcpy +distribute_fpn_proposals +frame +bincount +shape +group_norm +c_softmax_with_cross_entropy +resnet_unit +sequence_expand_as +cos_sim +eigvals +save_combine +class_center_sample +read_file +isfinite +arg_max +equal +fake_dequantize_max_abs +qr +anchor_generator +layer_norm +merge_selected_rows +less_equal +rnn +fusion_lstm +lars_momentum +hard_sigmoid +isnan +elementwise_floordiv +correlation +histogram +gather_tree +segment_pool +sync_batch_norm +fusion_repeated_fc_relu +nop +fused_attention +expand_as_v2 +filter_by_instag +diag_v2 +pull_box_sparse +nll_loss +dot +scale +ncclBcast +shuffle_batch +ncclReduce +diag +multiplex +leaky_relu +allclose +adamw +elementwise_pow +prior_box +p_norm +c_concat +unique_consecutive +lod_reset +pad +sequence_conv +log10 +set_value +bitwise_xor +center_loss +randint +attention_lstm +uniform_random +slice +meshgrid +hard_swish +sin +mean_iou +pad2d +inverse +spectral_norm +shuffle_channel +send_v2 +psroi_pool +seed +ceil +eig +reduce_min +cos +ncclAllReduce +cudnn_lstm +reduce_sum +digamma +assign_value +increment +tdm_sampler +fused_softmax_mask +sequence_reverse +eigvalsh +diagonal +trunc +log2 +marker +tanh +yolov3_loss +graph_send_recv +accuracy +atan +less_than +unsqueeze +crf_decoding +global_gather +c_allreduce_prod +log_softmax +ftrl +matrix_nms +top_k_v2 +cast +tanh_shrink +hard_shrink +multiclass_nms +c_broadcast +fusion_transpose_flatten_concat +sequence_unpad +fused_elemwise_add_activation +pull_sparse_v2 +frobenius_norm +crop +cross_entropy2 +skip_layernorm +tdm_child +fused_embedding_seq_pool +erf +conv2d_inception_fusion +trilinear_interp +logsumexp +fusion_seqpool_concat +alloc_float_status +sequence_concat +fusion_seqpool_cvm_concat +similarity_focus +c_allreduce_max +argsort +sequence_expand +sgd +fused_bn_add_activation +bilinear_interp_v2 +clip +deformable_conv_v1 +hinge_loss +determinant +conv2d_transpose +memcpy_d2h +softsign +fake_quantize_dequantize_abs_max +broadcast_tensors +grid_sampler +fft_c2r +pyramid_hash +fake_quantize_dequantize_moving_average_abs_max +multi_dot +sequence_pool +broadcast +transpose +top_k +dist +affine_grid +gaussian_random_batch_size_like 
+fake_channel_wise_dequantize_max_abs +reciprocal +sequence_mask +fill_diagonal_tensor +abs +partial_concat +elu +index_select +row_conv +cross +elementwise_mul +decayed_adagrad +bipartite_match +run_program +fake_quantize_moving_average_abs_max +mine_hard_examples +target_assign +lstm +truncated_gaussian_random +match_matrix_tensor +elementwise_div +kldiv_loss +cumsum +sum +proximal_adagrad +update_loss_scaling +shard_index +selu +mean +gumbel_softmax +sequence_pad +tree_conv +assign +flatten_contiguous_range +tril_triu +brelu +celu +reduce_mean +sinh +rank_loss +reduce_max +fusion_gru +fill_zeros_like2 +expm1 +squared_l2_norm +elementwise_sub +margin_rank_loss +faster_tokenizer +c_identity +c_reduce_max +relu +is_empty +reduce_all +edit_distance +distributed_lookup_table +bmm +yolo_box +soft_relu +density_prior_box +eye +swish +cross_entropy +dpsgd +cholesky +batch_fc +nearest_interp +gather +trilinear_interp_v2 +box_clip +c_allgather +isnan_v2 +softmax +conv2d_fusion +fused_batch_norm_act +get_float_status +index_sample +elementwise_min +logical_not +collect_fpn_proposals +pixel_shuffle +thresholded_relu +polygon_box_transform +lookup_table_dequant +warpctc +fake_channel_wise_quantize_abs_max +dequantize_abs_max +svd +flip diff --git a/paddle/fluid/eager/tests/CMakeLists.txt b/paddle/fluid/eager/tests/CMakeLists.txt index 289f24dfa6367..c1506d8139b43 100644 --- a/paddle/fluid/eager/tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/CMakeLists.txt @@ -1,2 +1,6 @@ add_subdirectory(data_structure_tests) add_subdirectory(task_tests) + +if(NOT ON_INFER) + add_subdirectory(performance_tests) +endif() diff --git a/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt b/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt new file mode 100644 index 0000000000000..8811aa8ad38a5 --- /dev/null +++ b/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt @@ -0,0 +1,7 @@ +cc_library(performance_benchmark_utils SRCS benchmark_utils.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node scale_op matmul_v2_op) + +cc_test(test_egr_performance_benchmark_eager_cpu SRCS benchmark_eager_cpu.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) +cc_test(test_egr_performance_benchmark_fluid_cpu SRCS benchmark_fluid_cpu.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) + +cc_test(test_egr_performance_benchmark_eager_cuda SRCS benchmark_eager_cuda.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) +cc_test(test_egr_performance_benchmark_fluid_cuda SRCS benchmark_fluid_cuda.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index 0a84f3b523aee..c100e3b70f384 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -14,6 +14,7 @@ // Eager Dygraph +#include #include #include "gtest/gtest.h" @@ -25,15 +26,15 @@ #include "paddle/fluid/imperative/tracer.h" -#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" +#include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" #include "paddle/fluid/eager/tests/test_utils.h" #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" #endif -// TODO(jiabin): remove nolint here!!! 
-using namespace egr; // NOLINT +using namespace egr; // NOLINT +using namespace egr_utils_api; // NOLINT // Disable pten path DECLARE_bool(run_pten_kernel); @@ -42,11 +43,11 @@ TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; } TEST(Benchmark, EagerScaleCPU) { // Prepare Device Contexts - egr::InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); for (const std::string& mode : {"Accuracy", "Performance"}) { paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4}); - egr::EagerTensor tensor = EagerUtils::CreateTensorWithValue( + egr::EagerTensor tensor = CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0, true); RetainGradForTensor(tensor); @@ -78,20 +79,20 @@ TEST(Benchmark, EagerScaleCPU) { TEST(Benchmark, EagerIntermediateMatmulCPU) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); auto tracer = std::make_shared(); paddle::imperative::SetCurrentTracer(tracer); for (const std::string& mode : {"Accuracy", "Performance"}) { paddle::framework::DDim ddimX = paddle::framework::make_ddim({2, 2}); - egr::EagerTensor X = EagerUtils::CreateTensorWithValue( + egr::EagerTensor X = CreateTensorWithValue( ddimX, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0, true); RetainGradForTensor(X); paddle::framework::DDim ddimY = paddle::framework::make_ddim({2, 2}); - egr::EagerTensor Y = EagerUtils::CreateTensorWithValue( + egr::EagerTensor Y = CreateTensorWithValue( ddimY, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 2.0, true); RetainGradForTensor(Y); @@ -122,7 +123,7 @@ TEST(Benchmark, EagerIntermediateMatmulCPU) { TEST(Benchmark, EagerIntermediateMLPCPU) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); auto tracer = std::make_shared(); paddle::imperative::SetCurrentTracer(tracer); @@ -130,7 +131,7 @@ TEST(Benchmark, EagerIntermediateMLPCPU) { for (const std::string& mode : {"Accuracy", "Performance"}) { paddle::framework::DDim ddimX = paddle::framework::make_ddim({MLP_M, MLP_N}); - egr::EagerTensor X = EagerUtils::CreateTensorWithValue( + egr::EagerTensor X = CreateTensorWithValue( ddimX, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, MLP_X_VAL, true); RetainGradForTensor(X); @@ -140,13 +141,13 @@ TEST(Benchmark, EagerIntermediateMLPCPU) { for (size_t i = 0; i < MLP_NUM_LINEAR; i++) { paddle::framework::DDim ddimW = paddle::framework::make_ddim({MLP_N, MLP_K}); - egr::EagerTensor W = EagerUtils::CreateTensorWithValue( + egr::EagerTensor W = CreateTensorWithValue( ddimW, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, MLP_W_VAL, true); RetainGradForTensor(W); paddle::framework::DDim ddimB = paddle::framework::make_ddim({MLP_K}); - egr::EagerTensor B = EagerUtils::CreateTensorWithValue( + egr::EagerTensor B = CreateTensorWithValue( ddimB, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, MLP_B_VAL, true); RetainGradForTensor(B); @@ -178,3 +179,8 @@ TEST(Benchmark, EagerIntermediateMLPCPU) { } } } + +USE_OP(scale); +USE_OP(elementwise_add); +USE_OP(matmul_v2); +USE_OP(reduce_sum); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index b373802c79eb4..c8f4b1b32e453 100644 --- 
a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -13,6 +13,7 @@ // limitations under the License. // Eager Dygraph +#include #include #include "gtest/gtest.h" @@ -24,26 +25,28 @@ #include "paddle/fluid/imperative/tracer.h" -#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" +#include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" #include "paddle/fluid/eager/tests/test_utils.h" #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" #endif -// TODO(jiabin): remove nolint here!!! -using namespace egr; // NOLINT +using namespace egr; // NOLINT +using namespace egr_utils_api; // NOLINT DECLARE_bool(run_pten_kernel); TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + TEST(Benchmark, EagerScaleCUDA) { - egr::InitEnv(paddle::platform::CUDAPlace()); + eager_test::InitEnv(paddle::platform::CUDAPlace()); for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4}); - egr::EagerTensor tensor = EagerUtils::CreateTensorWithValue( + egr::EagerTensor tensor = CreateTensorWithValue( ddim, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); RetainGradForTensor(tensor); @@ -77,7 +80,7 @@ TEST(Benchmark, EagerScaleCUDA) { TEST(Benchmark, EagerIntermediateMatmulCUDA) { paddle::platform::CUDAPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); auto tracer = std::make_shared(); tracer->SetExpectedPlace(place); @@ -85,13 +88,13 @@ TEST(Benchmark, EagerIntermediateMatmulCUDA) { for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { paddle::framework::DDim ddimX = paddle::framework::make_ddim({2, 2}); - egr::EagerTensor X = EagerUtils::CreateTensorWithValue( + egr::EagerTensor X = CreateTensorWithValue( ddimX, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0, true); RetainGradForTensor(X); paddle::framework::DDim ddimY = paddle::framework::make_ddim({2, 2}); - egr::EagerTensor Y = EagerUtils::CreateTensorWithValue( + egr::EagerTensor Y = CreateTensorWithValue( ddimY, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 2.0, true); RetainGradForTensor(Y); @@ -125,7 +128,7 @@ TEST(Benchmark, EagerIntermediateMatmulCUDA) { TEST(Benchmark, EagerIntermediateMLPCUDA) { paddle::platform::CUDAPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); auto tracer = std::make_shared(); tracer->SetExpectedPlace(place); @@ -134,7 +137,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { paddle::framework::DDim ddimX = paddle::framework::make_ddim({MLP_M, MLP_N}); - egr::EagerTensor X = EagerUtils::CreateTensorWithValue( + egr::EagerTensor X = CreateTensorWithValue( ddimX, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, MLP_X_VAL, true); RetainGradForTensor(X); @@ -144,13 +147,13 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { for (size_t i = 0; i < MLP_NUM_LINEAR; i++) { paddle::framework::DDim ddimW = paddle::framework::make_ddim({MLP_N, MLP_K}); - egr::EagerTensor W = EagerUtils::CreateTensorWithValue( + egr::EagerTensor W = CreateTensorWithValue( ddimW, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, MLP_W_VAL, true); RetainGradForTensor(W); 
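+      // Per-layer bias, shaped {MLP_K} and filled with MLP_B_VAL on the CUDAPlace.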
paddle::framework::DDim ddimB = paddle::framework::make_ddim({MLP_K}); - egr::EagerTensor B = EagerUtils::CreateTensorWithValue( + egr::EagerTensor B = CreateTensorWithValue( ddimB, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, MLP_B_VAL, true); RetainGradForTensor(B); @@ -185,3 +188,11 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { } } } + +USE_OP(scale); +USE_OP(matmul_v2); +USE_OP(reduce_sum); +USE_OP(reduce_sum_grad); +USE_OP(elementwise_add); + +#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc index 20844055e300d..68e7512eedbde 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc @@ -24,7 +24,7 @@ #include "glog/logging.h" #include "gtest/gtest.h" -#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" +#include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/tracer.h" @@ -45,7 +45,7 @@ namespace imperative { TEST(Benchmark, FluidScaleCPU) { // Prepare Device Contexts platform::CPUPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); for (const std::string& mode : {"Accuracy", "Performance"}) { std::shared_ptr X(new imperative::VarBase(true, "X")); @@ -88,7 +88,7 @@ TEST(Benchmark, FluidScaleCPU) { TEST(Benchmark, FluidMatmulCPU) { // Prepare Device Contexts platform::CPUPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); for (const std::string& mode : {"Accuracy", "Performance"}) { std::shared_ptr X(new imperative::VarBase(true, "X")); @@ -141,7 +141,7 @@ TEST(Benchmark, FluidMatmulCPU) { TEST(Benchmark, FluidMLPCPU) { // Prepare Device Contexts platform::CPUPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); for (const std::string& mode : {"Accuracy", "Performance"}) { std::vector x_src_data(MLP_M * MLP_N, MLP_X_VAL); @@ -217,5 +217,6 @@ TEST(Benchmark, FluidMLPCPU) { } // namespace paddle USE_OP(scale); +USE_OP(elementwise_add); USE_OP(matmul_v2); USE_OP(reduce_sum); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index 620a4d1cd128d..50423b5a64fcf 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -24,7 +24,7 @@ #include "glog/logging.h" #include "gtest/gtest.h" -#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" +#include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/tracer.h" @@ -39,13 +39,15 @@ DECLARE_bool(run_pten_kernel); TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + namespace paddle { namespace imperative { TEST(Benchmark, FluidScaleCUDA) { // Prepare Device Contexts platform::CUDAPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { std::shared_ptr X(new imperative::VarBase(true, "X")); @@ -98,7 +100,7 @@ TEST(Benchmark, FluidScaleCUDA) { TEST(Benchmark, FluidMatmulCUDA) { // Prepare Device 
Contexts platform::CUDAPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { std::shared_ptr X(new imperative::VarBase(true, "X")); @@ -161,7 +163,7 @@ TEST(Benchmark, FluidMatmulCUDA) { TEST(Benchmark, FluidMLPCUDA) { // Prepare Device Contexts platform::CUDAPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { paddle::platform::DeviceContextPool& pool = @@ -252,3 +254,6 @@ USE_OP(scale); USE_OP(matmul_v2); USE_OP(reduce_sum); USE_OP(reduce_sum_grad); +USE_OP(elementwise_add); + +#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc index ae5d02c1e943f..baa99dc93c2dd 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -36,10 +36,6 @@ #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" - -#include "paddle/pten/core/kernel_registry.h" - static size_t max_num_benchmark_runs = 5000; namespace egr { @@ -64,9 +60,9 @@ void benchmark_eager_scale(const EagerTensor& tensor, bool accuracy_check) { if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 10) - CompareTensorWithValue(input_tensor, 8189.0); + eager_test::CompareTensorWithValue(input_tensor, 8189.0); // Examine Backward Grad (w.r.t max_num_runs = 10) - CompareGradTensorWithValue(tensor, 1024.0); + eager_test::CompareGradTensorWithValue(tensor, 1024.0); } } @@ -89,10 +85,10 @@ void benchmark_eager_intermediate_matmul(const EagerTensor& X, if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 2) - CompareVariableWithValue(input_tensor0, 16); + eager_test::CompareVariableWithValue(input_tensor0, 16); // Examine Backward Grad (w.r.t max_num_runs = 2) - CompareGradVariableWithValue(X, 16); - CompareGradVariableWithValue(Y, 16); + eager_test::CompareGradVariableWithValue(X, 16); + eager_test::CompareGradVariableWithValue(Y, 16); } } @@ -122,11 +118,11 @@ void benchmark_eager_intermediate_mlp(const EagerTensor& X, compute_mlp_expected_results(); // Examine Forward Grad (w.r.t max_num_runs = 2) - CompareVariableWithValue(Out, result["Out"]); + eager_test::CompareVariableWithValue(Out, result["Out"]); // Examine Backward Grad (w.r.t max_num_runs = 2) - CompareGradVariableWithValue(X, result["GradX"]); - CompareGradVariableWithValue(Ws[0], result["GradW"]); + eager_test::CompareGradVariableWithValue(X, result["GradX"]); + eager_test::CompareGradVariableWithValue(Ws[0], result["GradW"]); } } @@ -141,6 +137,8 @@ static void FluidCheckTensorValue(const std::shared_ptr& X, auto* tensor = X->MutableVar()->GetMutable(); float* t_ptr = tensor->mutable_data(place); std::vector host_data(tensor->numel()); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (place == paddle::platform::CUDAPlace()) { paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); @@ -153,6 +151,8 @@ static void FluidCheckTensorValue(const std::shared_ptr& X, sizeof(float) * tensor->numel(), stream); t_ptr = host_data.data(); } +#endif + VLOG(6) << "Tensor Value: " << t_ptr[0] << ", Expected Value: " << value; PADDLE_ENFORCE( t_ptr[0] == value, @@ -166,6 +166,8 @@ static void FluidCheckGradTensorValue( auto* grad_tensor = 
X->MutableGradVar()->GetMutable(); float* g_ptr = grad_tensor->mutable_data(place); std::vector g_host_data(grad_tensor->numel()); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (place == paddle::platform::CUDAPlace()) { paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); @@ -178,6 +180,8 @@ static void FluidCheckGradTensorValue( sizeof(float) * grad_tensor->numel(), stream); g_ptr = g_host_data.data(); } +#endif + VLOG(6) << "Tensor Value: " << g_ptr[0] << ", Expected Value: " << value; PADDLE_ENFORCE( g_ptr[0] == value, diff --git a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt index 3921ce5b69cd7..c03db1a1575df 100644 --- a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt @@ -6,6 +6,6 @@ cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} ea cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) -if(NOT DEFINED ON_INFER) +if(NOT ON_INFER) cc_test(test_egr_task_autocodegen SRCS generated_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps}) endif() diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index d63cff23ba9c8..0ec86b7cc360c 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -30,19 +30,17 @@ #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/tensor_meta.h" -using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { TEST(Backward, SingleNodeEmptyGrad) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor target_tensor = CreateTensorWithValue( + egr::EagerTensor target_tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); @@ -67,7 +65,7 @@ TEST(Backward, SingleNodeEmptyGrad) { std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - egr::RetainGradForTensor(leaf_tensor); + egr_utils_api::RetainGradForTensor(leaf_tensor); // Connect Node0 -> AccumulationNode via Edge auto meta = egr::AutogradMeta(); @@ -80,26 +78,26 @@ TEST(Backward, SingleNodeEmptyGrad) { RunBackward(outs, {}); // Check Output Value - CompareGradTensorWithValue(leaf_tensor, 5.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 5.0); } TEST(Backward, SingleNodeCustomGrad) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(tensor)); std::vector grad_tensors; // Create Grad Tensor - egr::EagerTensor 
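In the `task_tests/CMakeLists.txt` hunk above, `if(NOT DEFINED ON_INFER)` only tested whether the variable existed, so `test_egr_task_autocodegen` was skipped whenever `ON_INFER` was defined at all, even with value `OFF`; `if(NOT ON_INFER)` evaluates the value itself, which is presumably the intended gate for this test.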
grad_tensor = CreateTensorWithValue( + egr::EagerTensor grad_tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/); grad_tensors.emplace_back(std::move(grad_tensor)); @@ -128,7 +126,7 @@ TEST(Backward, SingleNodeCustomGrad) { std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - egr::RetainGradForTensor(leaf_tensor); + egr_utils_api::RetainGradForTensor(leaf_tensor); // Connect Node0 -> AccumulationNode via Edge auto meta = egr::AutogradMeta(); @@ -141,7 +139,7 @@ TEST(Backward, SingleNodeCustomGrad) { RunBackward(target_tensors, grad_tensors); // Check Output Value - CompareGradTensorWithValue(leaf_tensor, 50.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 50.0); } /* @@ -153,14 +151,14 @@ Node0 */ TEST(Backward, LinearNodes) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(tensor)); @@ -202,7 +200,7 @@ TEST(Backward, LinearNodes) { std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - egr::RetainGradForTensor(leaf_tensor); + egr_utils_api::RetainGradForTensor(leaf_tensor); // Connect Node1 -> AccumulationNode via Edge auto meta1 = egr::AutogradMeta(); @@ -215,7 +213,7 @@ TEST(Backward, LinearNodes) { RunBackward(target_tensors, {}); // Check Output Value - CompareGradTensorWithValue(leaf_tensor, 50.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 50.0); } /* @@ -227,17 +225,17 @@ Node0 Node1 */ TEST(Backward, WithAccumulation) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor std::vector target_tensors; - egr::EagerTensor tensor0 = CreateTensorWithValue( + egr::EagerTensor tensor0 = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); - egr::EagerTensor tensor1 = CreateTensorWithValue( + egr::EagerTensor tensor1 = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(tensor0)); @@ -245,10 +243,10 @@ TEST(Backward, WithAccumulation) { // Create Grad Tensor std::vector grad_tensors; - egr::EagerTensor grad_tensor0 = CreateTensorWithValue( + egr::EagerTensor grad_tensor0 = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/); - egr::EagerTensor grad_tensor1 = CreateTensorWithValue( + egr::EagerTensor grad_tensor1 = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/); grad_tensors.emplace_back(std::move(grad_tensor0)); @@ -303,7 +301,7 @@ TEST(Backward, 
WithAccumulation) { std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta2->SetSingleOutRankWithSlot(0, 0); - egr::RetainGradForTensor(leaf_tensor); + egr_utils_api::RetainGradForTensor(leaf_tensor); // Connect Node2 -> AccumulationNode via Edge auto meta2 = egr::AutogradMeta(); @@ -314,7 +312,7 @@ TEST(Backward, WithAccumulation) { RunBackward(target_tensors, grad_tensors); - CompareGradTensorWithValue(leaf_tensor, 2500.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 2500.0); } -} // namespace eager_test +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index e1e138cdee8ba..52e10b2b1b8a0 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -31,17 +31,15 @@ #include "paddle/fluid/eager/tests/test_utils.h" -using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { TEST(CrossBatchAccumulation, SingleScaleNode) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(tensor)); @@ -60,7 +58,7 @@ TEST(CrossBatchAccumulation, SingleScaleNode) { auto_grad_meta->SetGradNode( std::dynamic_pointer_cast(scale_node_ptr)); auto_grad_meta->SetSingleOutRankWithSlot(0, 0); - RetainGradForTensor(target_tensor); // result: 1.0 + egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0 auto meta = AutogradMeta(); meta.SetSingleOutRankWithSlot(0, 0); @@ -71,18 +69,18 @@ TEST(CrossBatchAccumulation, SingleScaleNode) { auto_grad_meta1->SetGradNode( std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - RetainGradForTensor(leaf_tensor); + egr_utils_api::RetainGradForTensor(leaf_tensor); } RunBackward(target_tensors, {}); - CompareGradTensorWithValue(target_tensor, 1.0); - CompareGradTensorWithValue(leaf_tensor, 5.0); + eager_test::CompareGradTensorWithValue(target_tensor, 1.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 5.0); RunBackward(target_tensors, {}); - CompareGradTensorWithValue(target_tensor, 1.0); - CompareGradTensorWithValue(leaf_tensor, 10.0); + eager_test::CompareGradTensorWithValue(target_tensor, 1.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 10.0); } -} // namespace eager_test +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc index 4d93f0188a746..ea9aae83ff189 100644 --- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc @@ -24,10 +24,7 @@ #include "paddle/pten/api/lib/utils/allocator.h" -// TODO(jiabin): remove nolint here!!! 
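The backward and cross-batch test hunks above all apply the same mechanical change: the file-level `using namespace egr` is removed, the tests move into `namespace egr`, and the shared helpers are called through their home namespaces. A minimal sketch of the resulting call pattern inside one of these test files (the test name is illustrative; helper signatures are the ones visible in these hunks):

    namespace egr {

    TEST(Backward, NamespaceSketch) {
      eager_test::InitEnv(paddle::platform::CPUPlace());

      paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32});
      egr::EagerTensor t = egr_utils_api::CreateTensorWithValue(
          ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
          pten::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/);
      egr_utils_api::RetainGradForTensor(t);

      eager_test::CompareTensorWithValue(t, 1.0);
    }

    }  // namespace egr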
-using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { TEST(EagerUtils, AutoGradMeta) { // Construct Eager Tensor @@ -63,7 +60,7 @@ TEST(EagerUtils, AutoGradMeta) { std::vector autograd_metas = EagerUtils::multi_autograd_meta(&ets); std::vector unsafe_autograd_metas = - EagerUtils::unsafe_autograd_meta(&ets); + EagerUtils::unsafe_autograd_meta(ets); CHECK_NOTNULL(unsafe_autograd_metas[0]); CHECK_NOTNULL(unsafe_autograd_metas[1]); @@ -167,7 +164,7 @@ TEST(EagerUtils, PassStopGradient) { TEST(EagerUtils, SyncToVarsSingle) { paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4}); - auto tensor = eager_test::CreateTestCPUTensor(5.0f, ddim); + auto tensor = CreateTestCPUTensor(5.0f, ddim); std::vector> var_bases = egr::EagerUtils::SyncToVars(tensor); @@ -185,9 +182,8 @@ TEST(EagerUtils, SyncToVarsSingle) { TEST(EagerUtils, SyncToVarsMultiple) { paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4}); - std::vector tensors = { - eager_test::CreateTestCPUTensor(1.0f, ddim), - eager_test::CreateTestCPUTensor(2.0f, ddim)}; + std::vector tensors = {CreateTestCPUTensor(1.0f, ddim), + CreateTestCPUTensor(2.0f, ddim)}; std::vector> var_bases = egr::EagerUtils::SyncToVars(tensors); @@ -280,4 +276,4 @@ TEST(EagerUtils, ConstructDuplicableOutput) { CHECK(outs[0]->initialized() == false); } -} // namespace eager_test +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc index 6e23226cde432..205f231eceeed 100644 --- a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc +++ b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc @@ -27,21 +27,18 @@ #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/tensor_meta.h" -// TODO(jiabin): remove nolint here!!! 
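The `unsafe_autograd_meta(&ets)` to `unsafe_autograd_meta(ets)` call-site change above matches the new declaration in `utils.h`/`utils.cc` further down, which takes the tensors by const reference instead of by pointer. A usage sketch, assuming `et0` and `et1` are initialized `egr::EagerTensor`s that already carry autograd meta:

    std::vector<egr::EagerTensor> ets = {et0, et1};
    std::vector<egr::AutogradMeta*> metas =
        egr::EagerUtils::unsafe_autograd_meta(ets);  // previously: unsafe_autograd_meta(&ets)
    CHECK_NOTNULL(metas[0]);
    CHECK_NOTNULL(metas[1]);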
-using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { TEST(Forward, SingleNode) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor t = CreateTensorWithValue( + egr::EagerTensor t = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(t)); @@ -55,7 +52,7 @@ TEST(Forward, SingleNode) { tensor, scale, bias, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output - CompareTensorWithValue(out, 13.0); + eager_test::CompareTensorWithValue(out, 13.0); // Examine GradNode { @@ -80,14 +77,14 @@ Node1 out */ TEST(Forward, LinearNodes) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor t = CreateTensorWithValue( + egr::EagerTensor t = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(t)); @@ -108,10 +105,10 @@ TEST(Forward, LinearNodes) { out0, scale1, bias1, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output 0 - CompareTensorWithValue(out0, 13.0); + eager_test::CompareTensorWithValue(out0, 13.0); // Examine Forward Output 1 - CompareTensorWithValue(out1, 75.0); + eager_test::CompareTensorWithValue(out1, 75.0); // Examine GradNode { @@ -156,14 +153,14 @@ TEST(Forward, LinearNodes) { out1 out2 */ TEST(Forward, BranchedNodes) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor t = CreateTensorWithValue( + egr::EagerTensor t = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(t)); @@ -190,13 +187,13 @@ TEST(Forward, BranchedNodes) { out0, scale2, bias2, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output 0 - CompareTensorWithValue(out0, 13.0); + eager_test::CompareTensorWithValue(out0, 13.0); // Examine Forward Output 1 - CompareTensorWithValue(out1, 75.0); + eager_test::CompareTensorWithValue(out1, 75.0); // Examine Forward Output 2 - CompareTensorWithValue(out2, 150.0); + eager_test::CompareTensorWithValue(out2, 150.0); // Examine GradNode { @@ -248,4 +245,4 @@ TEST(Forward, BranchedNodes) { } } -} // namespace eager_test +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index 751e95487659c..e292844c8ee58 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -29,10 +29,7 @@ #include "paddle/fluid/eager/tests/test_utils.h" -// TODO(jiabin): remove nolint here!!! 
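The forward tests keep exercising the eager `scale` kernel directly; with the input `tensor` filled with 5.0 as in the hunks above, the single-node case reduces to the following check (5.0 * 2.0 + 3.0 = 13.0):

    float scale = 2.0;
    float bias = 3.0;
    egr::EagerTensor out = egr::scale(
        tensor, scale, bias, true /*bias_after_scale*/, true /*trace_backward*/);
    eager_test::CompareTensorWithValue(out, 13.0);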
-using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { egr::EagerTensor hook_function(const egr::EagerTensor& t) { auto t_dense = std::dynamic_pointer_cast(t.impl()); @@ -61,14 +58,14 @@ egr::EagerTensor hook_function(const egr::EagerTensor& t) { } TEST(FwdBwdJoint, SingleNode) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // 1. Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); // 3. Run Forward float scale = 2.0; @@ -77,7 +74,7 @@ TEST(FwdBwdJoint, SingleNode) { tensor, scale, bias, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output - CompareTensorWithValue(out, 13.0); + eager_test::CompareTensorWithValue(out, 13.0); std::vector outs = {out}; // 4. Run Backward @@ -88,7 +85,7 @@ TEST(FwdBwdJoint, SingleNode) { EagerUtils::unsafe_autograd_meta(tensor)->Grad().impl()) ->data()[0]; // Examine Backward Grad - CompareGradTensorWithValue(tensor, 2.0); + eager_test::CompareGradTensorWithValue(tensor, 2.0); } /* @@ -101,14 +98,14 @@ Node1 out */ TEST(FwdBwdJoint, LinearNodes) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // 1. Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); // 3. Run Forward // Run Forward Node 0 @@ -125,17 +122,17 @@ TEST(FwdBwdJoint, LinearNodes) { out0, scale1, bias1, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output 0 - CompareTensorWithValue(out0, 13.0); + eager_test::CompareTensorWithValue(out0, 13.0); // Examine Forward Output 1 - CompareTensorWithValue(out1, 75.0); + eager_test::CompareTensorWithValue(out1, 75.0); std::vector outs = {out1}; // 4. Run Backward RunBackward(outs, {}); // Examine Backward Grad - CompareGradTensorWithValue(tensor, 10.0); + eager_test::CompareGradTensorWithValue(tensor, 10.0); } /* @@ -149,14 +146,14 @@ TEST(FwdBwdJoint, LinearNodes) { out1 out2 */ TEST(FwdBwdJoint, BranchedNodes) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // 1. Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); // 3. 
Run Forward // Run Forward Node 0 @@ -179,10 +176,10 @@ TEST(FwdBwdJoint, BranchedNodes) { out0, scale2, bias2, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output 0 - CompareTensorWithValue(out0, 13.0); + eager_test::CompareTensorWithValue(out0, 13.0); // Examine Forward Output 1 - CompareTensorWithValue(out1, 75.0); + eager_test::CompareTensorWithValue(out1, 75.0); // Examine Forward Output 2 { @@ -201,7 +198,7 @@ TEST(FwdBwdJoint, BranchedNodes) { RunBackward(outs, {}); // Examine Backward Grad - CompareGradTensorWithValue(tensor, 30.0); + eager_test::CompareGradTensorWithValue(tensor, 30.0); } /* @@ -215,14 +212,14 @@ TEST(FwdBwdJoint, BranchedNodes) { out1 out2 */ TEST(FwdBwdJoint, GradientHook) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // 1. Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); std::function hook = &hook_function; @@ -234,24 +231,24 @@ TEST(FwdBwdJoint, GradientHook) { egr::EagerTensor out0 = egr::scale(tensor, scale0, bias0, true /*bias_after_scale*/, true /*trace_backward*/); - RetainGradForTensor(out0); // hook: +5 - RegisterGradientHookForTensor(out0, hook); // hook: +5 + egr_utils_api::RetainGradForTensor(out0); // hook: +5 + egr_utils_api::RegisterGradientHookForTensor(out0, hook); // hook: +5 // Run Forward Node 1 float scale1 = 5.0; float bias1 = 10.0; egr::EagerTensor out1 = egr::scale( out0, scale1, bias1, true /*bias_after_scale*/, true /*trace_backward*/); - RetainGradForTensor(out1); // hook: +5 - RegisterGradientHookForTensor(out1, hook); // hook: +5 + egr_utils_api::RetainGradForTensor(out1); // hook: +5 + egr_utils_api::RegisterGradientHookForTensor(out1, hook); // hook: +5 // Run Forward Node 2 float scale2 = 10.0; float bias2 = 20.0; egr::EagerTensor out2 = egr::scale( out0, scale2, bias2, true /*bias_after_scale*/, true /*trace_backward*/); - RetainGradForTensor(out2); // hook: +5 - RegisterGradientHookForTensor(out2, hook); // hook: +5 + egr_utils_api::RetainGradForTensor(out2); // hook: +5 + egr_utils_api::RegisterGradientHookForTensor(out2, hook); // hook: +5 // 4. Run Backward std::vector outs = {out1, out2}; @@ -259,16 +256,16 @@ TEST(FwdBwdJoint, GradientHook) { // Examine Backward Grad // leaf grad - CompareGradTensorWithValue(tensor, 190.0); + eager_test::CompareGradTensorWithValue(tensor, 190.0); // out0 grad - CompareGradTensorWithValue(out0, 90.0); + eager_test::CompareGradTensorWithValue(out0, 90.0); // out1 grad - CompareGradTensorWithValue(out1, 1.0); + eager_test::CompareGradTensorWithValue(out1, 1.0); // out2 grad - CompareGradTensorWithValue(out2, 1.0); + eager_test::CompareGradTensorWithValue(out2, 1.0); } /* @@ -282,14 +279,14 @@ TEST(FwdBwdJoint, GradientHook) { out1 out2 */ TEST(FwdBwdJoint, CrossBatchAccumulation) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // 1. 
Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); // 3. Run Forward // Run Forward Node 0 @@ -316,13 +313,13 @@ TEST(FwdBwdJoint, CrossBatchAccumulation) { RunBackward(outs, {}); // Examine Backward Grad - CompareGradTensorWithValue(tensor, 30.0); + eager_test::CompareGradTensorWithValue(tensor, 30.0); // Cross Batch Accumulation RunBackward(outs, {}); // Examine Backward Grad - CompareGradTensorWithValue(tensor, 60.0); + eager_test::CompareGradTensorWithValue(tensor, 60.0); } /* ---------------------------------------------------- */ @@ -331,14 +328,14 @@ TEST(FwdBwdJoint, CrossBatchAccumulation) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(FwdBwdJoint, SingleNodeCUDA) { - InitEnv(paddle::platform::CUDAPlace()); + eager_test::InitEnv(paddle::platform::CUDAPlace()); // 1. Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); // 3. Run Forward float scale = 2.0; @@ -347,14 +344,14 @@ TEST(FwdBwdJoint, SingleNodeCUDA) { tensor, scale, bias, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output - CompareTensorWithValue(out, 13.0); + eager_test::CompareTensorWithValue(out, 13.0); std::vector outs = {out}; // 4. Run Backward RunBackward(outs, {}); // Examine Backward Grad - CompareGradTensorWithValue(tensor, 2.0); + eager_test::CompareGradTensorWithValue(tensor, 2.0); } /* @@ -368,14 +365,14 @@ TEST(FwdBwdJoint, SingleNodeCUDA) { out1 out2 */ TEST(FwdBwdJoint, BranchedNodesCUDA) { - InitEnv(paddle::platform::CUDAPlace()); + eager_test::InitEnv(paddle::platform::CUDAPlace()); // 1. Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); // 3. Run Forward // Run Forward Node 0 @@ -398,11 +395,11 @@ TEST(FwdBwdJoint, BranchedNodesCUDA) { out0, scale2, bias2, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output 0 - CompareTensorWithValue(out0, 13.0); + eager_test::CompareTensorWithValue(out0, 13.0); // Examine Forward Output 1 - CompareTensorWithValue(out1, 75.0); + eager_test::CompareTensorWithValue(out1, 75.0); // Examine Forward Output 2 - CompareTensorWithValue(out2, 150.0); + eager_test::CompareTensorWithValue(out2, 150.0); // TODO(jiabin): fix this with add functor // 4. 
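The expected gradients in these branched forward/backward tests follow from the scale chain: with the input filled with 5.0, `out0 = 2*x + 3`, `out1 = 5*out0 + 10`, and `out2 = 10*out0 + 20`, so backpropagating from both branches gives d(out1 + out2)/dx = 5*2 + 10*2 = 30, and a second `RunBackward` over the same graph accumulates to 60, which is what `CompareGradTensorWithValue` checks with 30.0 and 60.0.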
Run Backward @@ -410,8 +407,8 @@ TEST(FwdBwdJoint, BranchedNodesCUDA) { RunBackward(outs, {}); // Examine Backward Grad - CompareGradTensorWithValue(tensor, 30.0); + eager_test::CompareGradTensorWithValue(tensor, 30.0); } #endif -} // namespace eager_test +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index eb8d1e517eaf3..a06091247bf7a 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -30,66 +30,98 @@ #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" #include "paddle/pten/core/kernel_registry.h" -// TODO(jiabin): remove nolint here!!! -using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { TEST(Generated, Sigmoid) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); VLOG(6) << "Init Env"; // 1. Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4}); VLOG(6) << "Make Dim"; - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 0.0, true); VLOG(6) << "Make EagerTensor"; - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); VLOG(6) << "Retain Grad for Tensor"; auto output_tensor = sigmoid_dygraph_function(tensor, {}); VLOG(6) << "Run Backward"; - CompareVariableWithValue(output_tensor, 0.5); + eager_test::CompareVariableWithValue(output_tensor, 0.5); std::vector target_tensors = {output_tensor}; VLOG(6) << "Runing Backward"; RunBackward(target_tensors, {}); VLOG(6) << "Finish Backward"; - CompareGradVariableWithValue(tensor, 0.25); + eager_test::CompareGradVariableWithValue(tensor, 0.25); } TEST(Generated, Matmul_v2) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); auto tracer = std::make_shared(); paddle::imperative::SetCurrentTracer(tracer); // 1. Prepare Input paddle::framework::DDim ddimX = paddle::framework::make_ddim({4, 16}); - egr::EagerTensor X = CreateTensorWithValue( + egr::EagerTensor X = egr_utils_api::CreateTensorWithValue( ddimX, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 3.0, true); - RetainGradForTensor(X); + egr_utils_api::RetainGradForTensor(X); paddle::framework::DDim ddimY = paddle::framework::make_ddim({16, 20}); - egr::EagerTensor Y = CreateTensorWithValue( + egr::EagerTensor Y = egr_utils_api::CreateTensorWithValue( ddimY, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 2.0, true); - RetainGradForTensor(Y); + egr_utils_api::RetainGradForTensor(Y); auto output_tensor = matmul_v2_dygraph_function( X, Y, {{"trans_x", false}, {"trans_y", false}}); - CompareVariableWithValue(output_tensor, 96); + eager_test::CompareVariableWithValue(output_tensor, 96); + + std::vector target_tensors = {output_tensor}; + RunBackward(target_tensors, {}); + + eager_test::CompareGradVariableWithValue(X, 2.0 * 20); + eager_test::CompareGradVariableWithValue(Y, 3.0 * 4); +} + +TEST(Generated, ElementwiseAdd) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + auto tracer = std::make_shared(); + paddle::imperative::SetCurrentTracer(tracer); + + // 1. 
Prepare Input + paddle::framework::DDim ddimX = paddle::framework::make_ddim({4, 16}); + egr::EagerTensor X = egr_utils_api::CreateTensorWithValue( + ddimX, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 3.0, true); + egr_utils_api::RetainGradForTensor(X); + + paddle::framework::DDim ddimY = paddle::framework::make_ddim({4, 16}); + egr::EagerTensor Y = egr_utils_api::CreateTensorWithValue( + ddimY, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 2.0, true); + egr_utils_api::RetainGradForTensor(Y); + + auto output_tensor = elementwise_add_dygraph_function(X, Y, {}); + + eager_test::CompareVariableWithValue(output_tensor, 5); std::vector target_tensors = {output_tensor}; RunBackward(target_tensors, {}); - CompareGradVariableWithValue(X, 2.0 * 20); - CompareGradVariableWithValue(Y, 3.0 * 4); + eager_test::CompareGradVariableWithValue(X, 1.0); + eager_test::CompareGradVariableWithValue(Y, 1.0); } -} // namespace eager_test +} // namespace egr + +USE_OP(sigmoid); +USE_OP(elementwise_add); +USE_OP(matmul_v2); diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index 326240d0cb7b9..32b28d8efd21b 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -30,9 +30,7 @@ #include "paddle/fluid/eager/tests/test_utils.h" -using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { egr::EagerTensor hook_function(const egr::EagerTensor& t) { auto t_dense = std::dynamic_pointer_cast(t.impl()); @@ -61,14 +59,14 @@ egr::EagerTensor hook_function(const egr::EagerTensor& t) { } TEST(RetainGrad, HookBeforeRetainGrad) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(tensor)); @@ -99,8 +97,9 @@ TEST(RetainGrad, HookBeforeRetainGrad) { std::dynamic_pointer_cast( auto_grad_meta)); - RegisterGradientHookForTensor(target_tensor, hook); - RetainGradForTensor(target_tensor); // result: 1.0 + 3.0 = 4.0 + egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook); + egr_utils_api::RetainGradForTensor( + target_tensor); // result: 1.0 + 3.0 = 4.0 } // Connect ScaleNode -> AccumulationNode via Edge @@ -126,25 +125,26 @@ TEST(RetainGrad, HookBeforeRetainGrad) { std::dynamic_pointer_cast( auto_grad_meta)); - RegisterGradientHookForTensor(leaf_tensor, hook); - RetainGradForTensor(leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0 + egr_utils_api::RegisterGradientHookForTensor(leaf_tensor, hook); + egr_utils_api::RetainGradForTensor( + leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0 } RunBackward(target_tensors, {}); - CompareGradTensorWithValue(target_tensor, 4.0); - CompareGradTensorWithValue(leaf_tensor, 23.0); + eager_test::CompareGradTensorWithValue(target_tensor, 4.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 23.0); } TEST(RetainGrad, HookAfterRetainGrad) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = 
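The new `Generated, ElementwiseAdd` test above follows the same shape as `Matmul_v2`, where the expected gradients 2.0 * 20 and 3.0 * 4 come from the constant operands: dX picks up Y's value 2.0 summed over the 20 output columns, and dY picks up X's value 3.0 summed over the 4 rows. The essential sequence of the new test, with X filled with 3.0 and Y with 2.0:

    auto out = elementwise_add_dygraph_function(X, Y, {});
    eager_test::CompareVariableWithValue(out, 5);      // 3.0 + 2.0, elementwise

    std::vector<egr::EagerTensor> targets = {out};
    RunBackward(targets, {});

    eager_test::CompareGradVariableWithValue(X, 1.0);  // d(X + Y)/dX = 1
    eager_test::CompareGradVariableWithValue(Y, 1.0);  // d(X + Y)/dY = 1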
paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(tensor)); @@ -173,8 +173,8 @@ TEST(RetainGrad, HookAfterRetainGrad) { std::dynamic_pointer_cast( auto_grad_meta)); - RetainGradForTensor(target_tensor); // result: 1.0 - RegisterGradientHookForTensor(target_tensor, hook); + egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0 + egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook); } // Connect ScaleNode -> AccumulationNode via Edge @@ -200,15 +200,15 @@ TEST(RetainGrad, HookAfterRetainGrad) { std::dynamic_pointer_cast( auto_grad_meta)); - RetainGradForTensor(leaf_tensor); // RetainGrad for leaf tensor gets - // postponed, result: 4.0*5.0 + 3.0 = - // 23.0 - RegisterGradientHookForTensor(leaf_tensor, hook); + egr_utils_api::RetainGradForTensor( + leaf_tensor); // RetainGrad for leaf tensor gets + // postponed, result: 4.0*5.0 + 3.0 = + // 23.0 + egr_utils_api::RegisterGradientHookForTensor(leaf_tensor, hook); } RunBackward(target_tensors, {}); - CompareGradTensorWithValue(target_tensor, 1.0); - CompareGradTensorWithValue(leaf_tensor, 23.0); + eager_test::CompareGradTensorWithValue(target_tensor, 1.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 23.0); } - -} // namespace eager_test +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc index 5b96c726b2228..5e86cac83a285 100644 --- a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc @@ -23,39 +23,34 @@ #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/pten/api/lib/utils/allocator.h" -#include "paddle/pten/core/kernel_registry.h" - -// TODO(jiabin): remove nolint here!!! 
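The hook tests above only change how the API is reached; registration itself is unchanged. With this file's `hook_function` adding 3.0 to the incoming gradient (per the inline comments), the order-sensitive pattern is:

    std::function<egr::EagerTensor(const egr::EagerTensor&)> hook = &hook_function;

    egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook);
    egr_utils_api::RetainGradForTensor(target_tensor);  // retained grad: 1.0 + 3.0 = 4.0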
-using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { TEST(TensorUtils, Test) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor t = CreateTensorWithValue( + egr::EagerTensor t = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - egr::EagerTensor t_grad = CreateTensorWithValue( + egr::EagerTensor t_grad = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); - CHECK_EQ(IsLeafTensor(t), true); + CHECK_EQ(egr_utils_api::IsLeafTensor(t), true); // Test Utils - CompareTensorWithValue(t, 5.0); + eager_test::CompareTensorWithValue(t, 5.0); egr::AutogradMeta* meta = egr::EagerUtils::autograd_meta(&t); *meta->MutableGrad() = t_grad; - CompareGradTensorWithValue(t, 1.0); + eager_test::CompareGradTensorWithValue(t, 1.0); } -} // namespace eager_test +} // namespace egr diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 28eefd62c5aa0..be06bf9eb344b 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -48,9 +48,9 @@ AutogradMeta* EagerUtils::unsafe_autograd_meta(const egr::EagerTensor& target) { } std::vector EagerUtils::unsafe_autograd_meta( - std::vector* targets) { + const std::vector& targets) { std::vector metas; - for (const egr::EagerTensor& t : *targets) { + for (const egr::EagerTensor& t : targets) { metas.push_back(unsafe_autograd_meta(t)); } return metas; diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index f7e226a2aba36..03f922e5bf9ba 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -114,7 +114,7 @@ class EagerUtils { // This method will return an AutogradMeta pointer unsafely. static AutogradMeta* unsafe_autograd_meta(const egr::EagerTensor& target); static std::vector unsafe_autograd_meta( - std::vector* targets); + const std::vector& targets); template static bool ComputeRequireGrad(T trace_backward, Args&&... args) { diff --git a/paddle/fluid/framework/conv_search_cache.h b/paddle/fluid/framework/conv_search_cache.h index db8dc22f68663..51446f287e94b 100644 --- a/paddle/fluid/framework/conv_search_cache.h +++ b/paddle/fluid/framework/conv_search_cache.h @@ -17,11 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator_kernel_configs.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index c511526c3159d..1b5db8380514d 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -463,6 +463,11 @@ void DatasetImpl::WaitPreLoadDone() { // release memory data template void DatasetImpl::ReleaseMemory() { + release_thread_ = new std::thread(&DatasetImpl::ReleaseMemoryFun, this); +} + +template +void DatasetImpl::ReleaseMemoryFun() { VLOG(3) << "DatasetImpl::ReleaseMemory() begin"; if (input_channel_) { input_channel_->Clear(); diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index b41f701548f3f..58223a2f28b4f 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -63,6 +63,7 @@ class Dataset { virtual void SetTrainerNum(int trainer_num) = 0; // set fleet send batch size virtual void SetFleetSendBatchSize(int64_t size) = 0; + virtual void ReleaseMemoryFun() = 0; // set fs name and ugi virtual void SetHdfsConfig(const std::string& fs_name, const std::string& fs_ugi) = 0; @@ -168,8 +169,13 @@ template class DatasetImpl : public Dataset { public: DatasetImpl(); - virtual ~DatasetImpl() {} + virtual ~DatasetImpl() { + if (release_thread_ != nullptr) { + release_thread_->join(); + } + } virtual void SetFileList(const std::vector& filelist); + virtual void ReleaseMemoryFun(); virtual void SetThreadNum(int thread_num); virtual void SetTrainerNum(int trainer_num); virtual void SetFleetSendBatchSize(int64_t size); @@ -295,6 +301,7 @@ class DatasetImpl : public Dataset { int64_t fleet_send_batch_size_; int64_t fleet_send_sleep_seconds_; std::vector preload_threads_; + std::thread* release_thread_ = nullptr; bool merge_by_insid_; bool parse_ins_id_; bool parse_content_; diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 3429677a2403e..b1573093ec333 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -291,13 +291,9 @@ void AllReduceOpHandle::SyncNCCLAllReduce() { nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, use_hierarchical_allreduce_); auto &nccl_ctx = nccl_ctxs->at(dev_id); auto stream = nccl_ctx.stream(); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError()); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError()); -#endif + + platform::GpuStreamSync(stream); + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); } } } diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h index 033d9396e9bf2..02e35895205b7 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -33,7 +33,7 @@ class NCCLCommunicator; } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include 
"paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/framework/details/bkcl_op_handle.h" #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 36b840e4945a0..a11a244214d4f 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -111,7 +111,7 @@ void BroadcastOpHandle::BroadcastOneVar( broadcast_calls.emplace_back( [send_recv_buffer, numel, type, root_id, &nccl_ctx] { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( send_recv_buffer, numel, static_cast(type), root_id, nccl_ctx.comm_, nccl_ctx.stream())); }); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 0b062b1a3f49a..055c7e63863b3 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -44,7 +44,7 @@ struct BKCLContextMap; } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #endif diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index 6ca4baa6d8b04..2e82fe22dba73 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -95,7 +95,7 @@ struct TestBroadcastOpHandle { #endif } else if (use_device_ == p::kCUDA) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - int count = p::GetCUDADeviceCount(); + int count = p::GetGPUDeviceCount(); if (count <= 1) { LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " "device count is " diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 68c5daaac5d78..f9c28cbee50c3 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -40,7 +40,7 @@ class NCCLCommunicator; } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #endif diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 07f7bbdb97a8d..bcdd6129230b0 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -49,10 +49,10 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( platform::CUDADeviceGuard guard( BOOST_GET_CONST(platform::CUDAPlace, place).device); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); #endif PADDLE_ENFORCE_NOT_NULL(event_, platform::errors::InvalidArgument( @@ -75,9 +75,9 @@ 
EagerDeletionOpHandle::~EagerDeletionOpHandle() { auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace()); platform::CUDADeviceGuard guard(gpu_place.device); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event_)); #endif } #endif @@ -160,12 +160,12 @@ void EagerDeletionOpHandle::ClearGarbages( reinterpret_cast(gc_)->stream(); auto callback_func = [=]() { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event_, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(callback_stream, event_, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(callback_stream, event_, 0)); #endif }; diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index 94507140a81d6..bd153f24fa318 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -55,9 +55,9 @@ FusedAllReduceOpHandle::~FusedAllReduceOpHandle() { auto destroy_event = [](gpuEvent_t event) { if (event == nullptr) return; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); #endif }; destroy_event(start_event_); @@ -87,10 +87,10 @@ void FusedAllReduceOpHandle::RunImpl() { auto create_event = [](gpuEvent_t *event) { if (*event) return; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(event, hipEventDisableTiming)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(event, cudaEventDisableTiming)); #endif }; @@ -109,12 +109,12 @@ void FusedAllReduceOpHandle::RunImpl() { auto &nccl_ctx = flat_nccl_ctxs->at(gpu_place.device); nccl_stream = nccl_ctx.stream(); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(start_event_, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(start_event_, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(nccl_stream, start_event_, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(start_event_, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(start_event_, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(nccl_stream, start_event_, 0)); #endif } else { @@ -169,12 +169,12 @@ void FusedAllReduceOpHandle::RunImpl() { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (FLAGS_allreduce_record_one_event) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(end_event_, nccl_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(end_event_, nccl_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(compute_stream, end_event_, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(end_event_, nccl_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + 
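The pattern repeated through these op-handle hunks is a pure rename of the status-check macro, applied to both the CUDA and HIP branches; for example, the event-creation helper in `fused_all_reduce_op_handle.cc` now reads:

    auto create_event = [](gpuEvent_t *event) {
      if (*event) return;
    #ifdef PADDLE_WITH_HIP
      PADDLE_ENFORCE_GPU_SUCCESS(
          hipEventCreateWithFlags(event, hipEventDisableTiming));
    #else
      PADDLE_ENFORCE_GPU_SUCCESS(
          cudaEventCreateWithFlags(event, cudaEventDisableTiming));
    #endif
    };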
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(end_event_, nccl_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(compute_stream, end_event_, 0)); #endif } diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h index 31336b92c4dfb..d522981c77fa1 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h @@ -35,7 +35,7 @@ class NCCLCommunicator; } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #endif diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h index 2fd1e0e7e9889..e08a768f8ce07 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h @@ -37,7 +37,7 @@ struct NCCLContextMap; } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index 98c37ca3c406a..38e20127f1612 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -48,7 +48,7 @@ struct TestGatherOpHandle { void InitCtxOnGpu(bool use_gpu) { if (use_gpu) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - int count = p::GetCUDADeviceCount(); + int count = p::GetGPUDeviceCount(); if (count <= 1) { LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " "device count is " diff --git a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h index c59f61347303d..9cfc3ada6ac3d 100644 --- a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h @@ -35,7 +35,7 @@ class NCCLCommunicator; } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index a9ea336e42545..8255707654416 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -40,7 +40,7 @@ static std::vector& multi_op_var2gpu_str_mutex() { } static void InitMultiGPUOpVarMap() { - int dev_count = platform::GetCUDADeviceCount(); + int dev_count = platform::GetGPUDeviceCount(); PADDLE_ENFORCE_GT(dev_count, 0, platform::errors::NotFound( "cuda device must > 0, now dev_count=%d", dev_count)); @@ -161,11 +161,11 @@ void TensorCheckerVisitor::apply( op_var)); #ifdef __HIPCC__ - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1, hipMemcpyHostToDevice, 
dev_ctx->stream())); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1, cudaMemcpyHostToDevice, dev_ctx->stream())); #endif diff --git a/paddle/fluid/framework/details/nccl_op_handle.h b/paddle/fluid/framework/details/nccl_op_handle.h index 762f4071b5cab..324d39ed8bb77 100644 --- a/paddle/fluid/framework/details/nccl_op_handle.h +++ b/paddle/fluid/framework/details/nccl_op_handle.h @@ -27,7 +27,7 @@ #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/dynload/rccl.h" #endif -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" DECLARE_bool(sync_nccl_allreduce); @@ -52,16 +52,16 @@ class NCCLOpHandleBase : public OpHandleBase { virtual ~NCCLOpHandleBase() { for (auto& ev : inter_events_) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(ev.second)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(ev.second)); #endif } for (auto& ev : exter_events_) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(ev.second)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(ev.second)); #endif } } @@ -109,14 +109,14 @@ class NCCLOpHandleBase : public OpHandleBase { platform::SetDeviceId(dev_id); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventCreateWithFlags( + PADDLE_ENFORCE_GPU_SUCCESS(hipEventCreateWithFlags( &inter_events_[dev_id], hipEventDisableTiming)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventCreateWithFlags( + PADDLE_ENFORCE_GPU_SUCCESS(hipEventCreateWithFlags( &exter_events_[dev_id], hipEventDisableTiming)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags( + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreateWithFlags( &inter_events_[dev_id], cudaEventDisableTiming)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags( + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreateWithFlags( &exter_events_[dev_id], cudaEventDisableTiming)); #endif VLOG(10) << "Create events on dev_id:" << dev_id @@ -142,7 +142,7 @@ class NCCLOpHandleBase : public OpHandleBase { << ", dev_id:" << dev_id << ", dtype:" << datatype << ", place:" << place; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, count, datatype, op, comm, stream)); } @@ -192,7 +192,7 @@ class NCCLOpHandleBase : public OpHandleBase { << ", dtype:" << datatype << ", place:" << place << ", stream:" << stream; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream)); #ifdef PADDLE_WITH_HIP @@ -202,11 +202,7 @@ class NCCLOpHandleBase : public OpHandleBase { #endif if (FLAGS_sync_nccl_allreduce) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } } @@ -230,26 +226,21 @@ class NCCLOpHandleBase : public OpHandleBase { #ifdef PADDLE_WITH_HIP hipStreamWaitEvent(stream, inter_events_.at(dev_id), 0); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, count, datatype, op, 
comm, stream)); hipEventRecord(exter_events_.at(dev_id), stream); - - if (FLAGS_sync_nccl_allreduce) { - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); - } #else cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, count, datatype, op, comm, stream)); cudaEventRecord(exter_events_.at(dev_id), stream); - +#endif if (FLAGS_sync_nccl_allreduce) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); + platform::GpuStreamSync(stream); } -#endif } void InterBroadCast(platform::Place place, void* sendbuff, size_t count, @@ -269,7 +260,7 @@ class NCCLOpHandleBase : public OpHandleBase { #else cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0); #endif - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( sendbuff, count, datatype, 0, comm, stream)); } diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 4b5d0563d7394..25b5eefc05cda 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -35,9 +35,9 @@ OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW { for (auto &ev : events_) { if (ev.second) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(ev.second)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(ev.second)); #endif } } @@ -50,10 +50,10 @@ void OpHandleBase::InitCUDA() { int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device; platform::SetDeviceId(dev_id); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&events_[dev_id], hipEventDisableTiming)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming)); #endif } @@ -182,9 +182,9 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { static_cast(waited_ctx)->stream(); for (auto &ev : events_) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0)); #endif } } @@ -221,10 +221,10 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { static_cast(dev_ctxes_.at(place)) ->stream(); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #endif #else @@ -250,11 +250,7 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { auto stream = static_cast(pool.Get(place)) ->stream(); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "Not compiled with CUDA.")); @@ -279,10 +275,10 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { 
dev_ctxes_.at(in_var_handle->place())) ->stream(); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #endif #else @@ -319,10 +315,10 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { auto *cuda_dev_ctx = static_cast(p.second); VLOG(10) << "cudadevicecontext:" << cuda_dev_ctx << ", dev_id:" << dev_id; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); #endif } diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index a485838a95942..bbc458804a195 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -193,7 +193,7 @@ void ReduceOpHandle::RunImpl() { size_t numel = static_cast(lod_tensor.numel()); all_reduce_calls.emplace_back( [buffer, recvbuffer, type, numel, root_id, &nccl_ctx] { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( buffer, recvbuffer, numel, static_cast(type), ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream())); }); diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index d56b6b3663003..4b9f289eaa787 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -41,7 +41,7 @@ struct NCCLContextMap; } // namespace platform } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #endif diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index 82f5ea6a66891..35dba48845472 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -59,7 +59,7 @@ struct TestReduceOpHandle { use_gpu_ = use_gpu; if (use_gpu) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - int count = p::GetCUDADeviceCount(); + int count = p::GetGPUDeviceCount(); if (count <= 1) { LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " "device count is " diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc index ccc64a9cdc335..1225e2ee025b2 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc @@ -39,14 +39,14 @@ ShareTensorBufferFunctor::ShareTensorBufferFunctor( Scope *scope, size_t scope_idx, const std::string &op_type, const std::vector &in_var_infos, const std::vector &out_var_names, const bool &is_variant_scope, - bool share_dims) + bool share_dims_and_dtype) : scope_(scope), scope_idx_(scope_idx), op_type_(op_type), in_var_infos_(in_var_infos), out_var_names_(out_var_names), is_variant_scope_(is_variant_scope), - share_dims_(share_dims) { + 
share_dims_and_dtype_(share_dims_and_dtype) { PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size(), platform::errors::PreconditionNotMet( "The number of input variables and output variables " @@ -147,12 +147,14 @@ void ShareTensorBufferFunctor::operator()(Scope *exec_scope) { // NOTE(zhiqiu): In the case of inplace addto, if the operator of // the in_out_vars is skipped during running, we should set the dims of // output as the same as input. - if (share_dims_) { + if (share_dims_and_dtype_) { out_tensor->Resize(in_tensor.dims()); + out_tensor->ShareDataTypeWith(in_tensor); } VLOG(2) << "Share tensor buffer when running " << op_type_ << " : " - << in_var_info->Name() << " -> " << out_var_names_[i]; + << in_var_info->Name() << " -> " << out_var_names_[i] + << " share_dims_and_dtype = " << share_dims_and_dtype_; } } } diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.h b/paddle/fluid/framework/details/share_tensor_buffer_functor.h index 528b047bccc13..f0ddb3f0137a2 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_functor.h +++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.h @@ -73,12 +73,14 @@ class ShareTensorBufferFunctor { Scope *scope, size_t scope_idx, const std::string &op_type, const std::vector &in_var_infos, const std::vector &out_var_names, - const bool &is_variant_scope, bool share_dims = false); + const bool &is_variant_scope, bool share_dims_and_dtype = false); void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name); - void SetShareDims(bool share_dims) { share_dims_ = share_dims; } + void SetShareDimsAndDtype(bool share_dims_and_dtype) { + share_dims_and_dtype_ = share_dims_and_dtype; + } void operator()(Scope *exec_scope); @@ -108,7 +110,7 @@ class ShareTensorBufferFunctor { // NOTE(zhiqiu): In the case of inplace addto, if the operator of // the in_out_vars is skipped during running, we should set the dims of output // as the same as input. 
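The rename above widens what the functor shares: with `share_dims_and_dtype_` enabled, an output whose producing op was skipped under inplace addto now inherits the input's dtype as well as its dims (via `Resize` plus `ShareDataTypeWith`). A minimal self-contained sketch of that behaviour follows; it is an illustration only, not part of the patch, and `DenseTensorLike` is a hypothetical stand-in for `framework::Tensor`:

#include <cassert>

// Hypothetical stand-in for the tensor metadata touched by the functor.
struct DenseTensorLike {
  int dims = 0;   // simplified "shape"
  int dtype = 0;  // simplified type id
};

// Mirrors the share_dims_and_dtype_ branch: when enabled, the output takes
// over both the shape and the dtype of the reused input.
void ShareMeta(const DenseTensorLike& in, DenseTensorLike* out,
               bool share_dims_and_dtype) {
  if (share_dims_and_dtype) {
    out->dims = in.dims;    // corresponds to out_tensor->Resize(in_tensor.dims())
    out->dtype = in.dtype;  // corresponds to out_tensor->ShareDataTypeWith(in_tensor)
  }
}

int main() {
  DenseTensorLike in{8, 5}, out{};
  ShareMeta(in, &out, /*share_dims_and_dtype=*/true);
  assert(out.dims == 8 && out.dtype == 5);
  return 0;
}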
- bool share_dims_{false}; + bool share_dims_and_dtype_{false}; }; } // namespace details diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc index 7e10c669ac478..aa942415fb404 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc @@ -64,10 +64,10 @@ ComputationOpHandle *GetUniquePendingComputationOpHandle( ShareTensorBufferOpHandle::ShareTensorBufferOpHandle( ir::Node *node, Scope *scope, size_t scope_idx, const std::string &op_type, const std::vector &in_var_infos, - const std::vector &out_var_names, bool share_dims) + const std::vector &out_var_names, bool share_dims_and_dtype) : OpHandleBase(node), functor_(scope, scope_idx, op_type, in_var_infos, out_var_names, - is_variant_scope_, share_dims) {} + is_variant_scope_, share_dims_and_dtype) {} std::unordered_map ShareTensorBufferOpHandle::ReusedVars() const { @@ -79,8 +79,9 @@ void ShareTensorBufferOpHandle::AddReuseVarPair( functor_.AddReuseVarPair(in_var_info, out_var_name); } -void ShareTensorBufferOpHandle::SetShareDims(bool share_dims) { - functor_.SetShareDims(share_dims); +void ShareTensorBufferOpHandle::SetShareDimsAndDtype( + bool share_dims_and_dtype) { + functor_.SetShareDimsAndDtype(share_dims_and_dtype); } void ShareTensorBufferOpHandle::InitCUDA() { diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h index dd2364fec4af5..d3852a85d019b 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h @@ -56,7 +56,7 @@ class ShareTensorBufferOpHandle : public OpHandleBase { void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name); - void SetShareDims(bool share_dims); + void SetShareDimsAndDtype(bool share_dims_and_dtype); const ShareTensorBufferFunctor &Functor() const { return functor_; } diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index 37399e5ddc09d..d916b9bc26276 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -23,7 +23,7 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/cuda_device_guard.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/profiler.h" DECLARE_bool(sync_nccl_allreduce); @@ -182,7 +182,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() { << ", k:" << k << ", place:" << place << ", dtype:" << dtype; all_gather_calls.emplace_back([=] { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( in_tensor_buf, gather_buff, 2 * k, static_cast(dtype), comm, stream)); }); diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h index 8bfea0f1ae8b8..5c3aef71ec40e 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h @@ -21,7 +21,7 @@ #include "paddle/fluid/framework/details/dgc_const_values.h" #include "paddle/fluid/framework/lod_tensor.h" #include 
"paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 600d75db53c7e..15acedf3cf50a 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -54,7 +54,7 @@ class DeviceContext; } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 5aef43263575e..739e05e1d7971 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -181,7 +181,7 @@ enum TableType { message TableParameter { optional uint64 table_id = 1; optional string table_class = 2; - optional uint64 shard_num = 3; + optional uint64 shard_num = 3 [ default = 1000 ]; optional TableType type = 4; optional TableAccessorParameter accessor = 5; } @@ -190,42 +190,73 @@ message TableAccessorParameter { optional string accessor_class = 1; optional SGDParameter embed_sgd_param = 2; optional SGDParameter embedx_sgd_param = 3; - optional uint32 fea_dim = 4; // for sparse table, this means field size of one - // value; for dense table, this means total value - // num - optional uint32 embedx_dim = 5; // embedx feature size - optional uint32 embedx_threshold = 6; // embedx feature create threshold + optional uint32 fea_dim = 4 [ default = 11 ]; // field size of one value + optional uint32 embedx_dim = 5 [ default = 8 ]; // embedx feature size + optional uint32 embedx_threshold = 6 + [ default = 10 ]; // embedx feature create threshold optional CtrAccessorParameter ctr_accessor_param = 7; + repeated TableAccessorSaveParameter table_accessor_save_param = 8; } // TODO(guanqun): add NaiveSGD/Adam... 
message SGDParameter { optional string name = 1; - optional SGDRuleParameter adagrad = 2; + optional SparseNaiveSGDRuleParameter naive = 2; + optional SparseAdagradSGDRuleParameter adagrad = 3; + optional SparseAdamSGDParameter adam = 4; } -message SGDRuleParameter { - optional double learning_rate = 1; - optional double initial_g2sum = 2; - optional double initial_range = 3 [ default = 0 ]; +message SparseNaiveSGDRuleParameter { // SparseNaiveSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + repeated float weight_bounds = 3; +} + +message + SparseAdagradSGDRuleParameter { // SparseAdaGradSGDRule|StdAdaGradSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_g2sum = 2 [ default = 3.0 ]; + optional double initial_range = 3 [ default = 0.0001 ]; repeated float weight_bounds = 4; } +message SparseAdamSGDParameter { // SparseAdamSGDRule + optional double learning_rate = 1 [ default = 0.001 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + optional double beta1_decay_rate = 3 [ default = 0.9 ]; + optional double beta2_decay_rate = 4 [ default = 0.999 ]; + optional double ada_epsilon = 5 [ default = 1e-08 ]; + repeated float weight_bounds = 6; +} + message CtrAccessorParameter { - optional float nonclk_coeff = 1; // to calculate show_click_score - optional float click_coeff = 2; // to calculate show_click_score - optional float base_threshold = - 3; // show_click_score > base_threshold, this feature can be saved - optional float delta_threshold = - 4; // delta_score > delta_threshold, this feature can be saved - optional float delta_keep_days = - 5; // unseen_day < delta_keep_days, this feature can be saved - optional float show_click_decay_rate = 6; // show/click will update to - // show/click * - // show_click_decay_rate after a day - optional float delete_threshold = 7; // threshold to shrink a feasign - optional float delete_after_unseen_days = 8; - optional int32 ssd_unseenday_threshold = 9; + optional float nonclk_coeff = 1 + [ default = 0.1 ]; // to calculate show_click_score + optional float click_coeff = 2 + [ default = 1 ]; // to calculate show_click_score + optional float base_threshold = 3 [ + default = 1.5 + ]; // show_click_score > base_threshold, this feature can be saved + optional float delta_threshold = 4 + [ default = + 0.25 ]; // delta_score > delta_threshold, this feature can be saved + optional float delta_keep_days = 5 + [ default = + 16 ]; // unseen_day < delta_keep_days, this feature can be saved + optional float show_click_decay_rate = 6 + [ default = 0.98 ]; // show/click will update to + // show/click * + // show_click_decay_rate after a day + optional float delete_threshold = 7 + [ default = 0.8 ]; // threshold to shrink a feasign + optional float delete_after_unseen_days = 8 [ default = 30 ]; + optional int32 ssd_unseenday_threshold = 9 [ default = 1 ]; +} + +message TableAccessorSaveParameter { + optional uint32 param = 1; + optional string converter = 2; + optional string deconverter = 3; } message FsClientParameter { @@ -274,6 +305,7 @@ message DistributedStrategy { optional bool semi_auto = 35 [ default = false ]; optional bool adam_d2sum = 36 [ default = true ]; optional bool auto_search = 37 [ default = false ]; + optional bool heter_ccl_mode = 38 [ default = false ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc 
b/paddle/fluid/framework/dlpack_tensor_test.cc index 4e2d7bb979b61..9b8bdebe706eb 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -115,7 +115,7 @@ void TestMainLoop() { std::vector places{platform::CPUPlace(), platform::CUDAPlace(0), platform::CUDAPinnedPlace()}; - if (platform::GetCUDADeviceCount() > 1) { + if (platform::GetGPUDeviceCount() > 1) { places.emplace_back(platform::CUDAPlace(1)); } #else diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h index f749ee8cfa0ba..82ce3b28776f1 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.h +++ b/paddle/fluid/framework/fleet/ascend_wrapper.h @@ -24,7 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/timer.h" diff --git a/paddle/fluid/framework/fleet/box_wrapper.cc b/paddle/fluid/framework/fleet/box_wrapper.cc index 37fbf47f854ad..8564a42165961 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cc +++ b/paddle/fluid/framework/fleet/box_wrapper.cc @@ -19,7 +19,7 @@ #include #include #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu index c9b5abf7a9bef..c91d371f5a155 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cu +++ b/paddle/fluid/framework/fleet/box_wrapper.cu @@ -19,7 +19,7 @@ #include #include "paddle/fluid/framework/fleet/box_wrapper.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h index 645d725871a06..b043edca138a8 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.h +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -40,7 +40,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/timer.h" #include "paddle/fluid/string/string_helper.h" @@ -397,7 +397,7 @@ class BoxWrapper { if (nullptr != s_instance_) { VLOG(3) << "Begin InitializeGPU"; std::vector stream_list; - for (int i = 0; i < platform::GetCUDADeviceCount(); ++i) { + for (int i = 0; i < platform::GetGPUDeviceCount(); ++i) { VLOG(3) << "before get context i[" << i << "]"; platform::CUDADeviceContext* context = dynamic_cast( @@ -416,7 +416,7 @@ class BoxWrapper { slot_name_omited_in_feedpass_.insert(slot_name); } slot_vector_ = slot_vector; - keys_tensor.resize(platform::GetCUDADeviceCount()); + keys_tensor.resize(platform::GetGPUDeviceCount()); } } diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 66c043e137a24..225c2656fbfd1 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -740,10 +740,10 @@ void FleetWrapper::PushDenseVarsAsync( BOOST_GET_CONST(platform::CUDAPlace, place), g_data, sizeof(float) * count, stream); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, stream)); hipEventSynchronize(event); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, stream)); cudaEventSynchronize(event); #endif diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index 6fddedccf0258..deb2b90c93353 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -35,7 +35,7 @@ limitations under the License. */ #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #ifdef PADDLE_WITH_HETERPS -#include "paddle/fluid/platform/type_defs.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index 646a2e97d319f..e7f098320c6c7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -28,7 +28,7 @@ limitations under the License. 
*/ // #include "cudf/concurrent_unordered_map.cuh.h" #include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h" #ifdef PADDLE_WITH_HETERPS -#include "paddle/fluid/platform/type_defs.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index ec852ec83ca09..c293b07e8995c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -347,7 +347,7 @@ void HeterComm::build_ps(int num, KeyType* h_keys, gpuStream_t streams[stream_num]; for (int i = 0; i < stream_num; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(streams[i]))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&(streams[i]))); auto d_k_buf = memory::AllocShared(place, chunk_size * sizeof(KeyType)); auto d_v_buf = memory::AllocShared(place, chunk_size * sizeof(ValType)); d_key_bufs.push_back(d_k_buf); @@ -360,11 +360,11 @@ void HeterComm::build_ps(int num, KeyType* h_keys, while (cur_len < len) { cur_stream = cur_stream % stream_num; int tmp_len = cur_len + chunk_size > len ? len - cur_len : chunk_size; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpyAsync(d_key_bufs[cur_stream]->ptr(), h_keys + cur_len, sizeof(KeyType) * tmp_len, cudaMemcpyHostToDevice, streams[cur_stream])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpyAsync(d_val_bufs[cur_stream]->ptr(), h_vals + cur_len, sizeof(ValType) * tmp_len, cudaMemcpyHostToDevice, streams[cur_stream])); @@ -378,7 +378,7 @@ void HeterComm::build_ps(int num, KeyType* h_keys, for (int i = 0; i < stream_num; ++i) { cudaStreamSynchronize(streams[i]); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(streams[i])); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams[i])); } } @@ -402,14 +402,14 @@ void HeterComm::merge_grad(int gpu_num, GradType* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); - PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceRadixSort::SortPairs( + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( NULL, temp_storage_bytes, d_keys, d_merge_keys_ptr, d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false)); void* d_buff = NULL; auto d_temp_storage = memory::AllocShared(place, temp_storage_bytes); - PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceRadixSort::SortPairs( + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false)); temp_storage_bytes = 0; @@ -417,7 +417,7 @@ void HeterComm::merge_grad(int gpu_num, auto d_num_runs_out_mem = memory::AllocShared(place, sizeof(int)); int* d_num_runs_out = reinterpret_cast(d_num_runs_out_mem->ptr()); - PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceReduce::ReduceByKey( + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::ReduceByKey( NULL, temp_storage_bytes, d_merge_keys_ptr, d_keys, d_merge_grads_ptr, d_grads, d_num_runs_out, merger_, len, stream, false)); @@ -426,13 +426,13 @@ void HeterComm::merge_grad(int gpu_num, d_temp_storage = memory::AllocShared(place, temp_storage_bytes); } - PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceReduce::ReduceByKey( + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::ReduceByKey( d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, d_merge_grads_ptr, d_grads, d_num_runs_out, merger_, len, stream, false)); 
cudaMemcpyAsync(&uniq_len, d_num_runs_out, sizeof(int), cudaMemcpyDeviceToHost, stream); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); } template @@ -461,12 +461,12 @@ void HeterComm::split_input_to_shard( size_t temp_storage_bytes; const int num_bits = 1 + log2i(total_gpu); - PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceRadixSort::SortPairs( + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( NULL, temp_storage_bytes, d_shard_index_tmp_ptr, d_shard_index_ptr, d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream)); auto d_temp_storage = memory::AllocShared(place, temp_storage_bytes); - PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceRadixSort::SortPairs( + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, d_shard_index_tmp_ptr, d_shard_index_ptr, d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream)); calc_shard_offset<<>>(d_shard_index_ptr, @@ -720,12 +720,12 @@ int HeterComm::gather_one_node_grad( cudaMemcpyHostToDevice); // allgather grad len - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( (const void*)(d_node_len + gpu_num), (void*)d_node_len, 1, ncclInt, nccl_inner_comm, stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); cudaMemcpy(h_node_len, d_node_len, sizeof(int) * total_gpu, cudaMemcpyDeviceToHost); @@ -737,15 +737,15 @@ int HeterComm::gather_one_node_grad( storage.alloc(max_size * total_gpu); // allgather keys and grads - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( d_keys, storage.all_keys, max_size, ncclUint64, nccl_inner_comm, stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( d_grads, storage.all_grads, max_size * sizeof(GradType), ncclUint8, nccl_inner_comm, stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); int h_left[total_gpu]; int h_right[total_gpu]; @@ -802,11 +802,11 @@ int HeterComm::gather_multi_node_grad( cudaMemcpy(d_node_len, h_node_len, sizeof(int), cudaMemcpyHostToDevice); // allgather grad len - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( d_node_len, d_node_len, 1, ncclInt, nccl_inter_comm, stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); cudaMemcpy(h_node_len, d_node_len, sizeof(int) * 
node_size_, cudaMemcpyDeviceToHost); @@ -818,15 +818,15 @@ int HeterComm::gather_multi_node_grad( storage.alloc(max_size * node_size_); // allgather keys and grads - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( d_keys, storage.all_keys, max_size, ncclUint64, nccl_inter_comm, stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( d_grads, storage.all_grads, max_size * sizeof(GradType), ncclUint8, nccl_inter_comm, stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); int merge_num = 0; for (int i = 0; i < node_size_; ++i) { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc index a369a612d4935..ccdb6c5cdd64e 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc @@ -30,11 +30,11 @@ GPUResource::GPUResource(std::vector& dev_ids, int index) { remote_streams_.resize(dev_ids_.size()); for (size_t i = 0; i < dev_ids_.size(); ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithFlags(&local_streams_[i], cudaStreamNonBlocking)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithFlags(&comm_streams_[i], cudaStreamNonBlocking)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithFlags(&remote_streams_[i], cudaStreamNonBlocking)); } } @@ -42,13 +42,13 @@ GPUResource::GPUResource(std::vector& dev_ids, int index) { GPUResource::~GPUResource() { platform::CUDADeviceGuard guard(dev_id_); for (size_t i = 0; i < local_streams_.size(); ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(local_streams_[i])); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(local_streams_[i])); } for (size_t i = 0; i < comm_streams_.size(); ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(comm_streams_[i])); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(comm_streams_[i])); } for (size_t i = 0; i < remote_streams_.size(); ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(remote_streams_[i])); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(remote_streams_[i])); } } @@ -58,7 +58,7 @@ void HeterPsResource::enable_p2p() { for (size_t j = 0; j < dev_ids_.size(); ++j) { if (i != j) { int p2p_flag; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaDeviceCanAccessPeer(&p2p_flag, dev_ids_[i], dev_ids_[j])); if (p2p_flag == 1) { cudaError_t ret = cudaDeviceEnablePeerAccess(dev_ids_[j], 0); diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.cc b/paddle/fluid/framework/fleet/nccl_wrapper.cc index 3ac95632de6bf..cbd06deeafc75 100644 --- a/paddle/fluid/framework/fleet/nccl_wrapper.cc +++ b/paddle/fluid/framework/fleet/nccl_wrapper.cc @@ -22,7 +22,7 @@ bool NCCLWrapper::is_initialized_ = false; void NCCLWrapper::InitNCCL() { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclCommInitRank( &(nccl_info_.comm_), nccl_info_.global_ranks_, 
nccl_info_.nccl_id_, nccl_info_.my_global_rank_)); #endif @@ -38,7 +38,7 @@ void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) { NCCLInfo NCCLWrapper::GetNCCLId() { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_))); #endif return nccl_info_; @@ -52,9 +52,9 @@ void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank, nccl_info_.global_ranks_ = ranks; platform::SetDeviceId(local_rank); #ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&(nccl_info_.stream_))); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&(nccl_info_.stream_))); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_))); #endif #endif return; @@ -67,7 +67,7 @@ void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope, auto var = scope.FindVar(name); LoDTensor* tensor = var->GetMutable(); int32_t total_size = tensor->numel(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(tensor->data()), total_size, ncclFloat, root_rank, nccl_info_.comm_, nccl_info_.stream_)); #ifdef PADDLE_WITH_RCCL diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 6519a514ff3b6..a0954ef0709dc 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index b726a629586e1..c163c2de11019 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -37,8 +37,8 @@ limitations under the License. */ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/dynload/nccl.h" -#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_PSCORE @@ -230,7 +230,7 @@ class PSGPUWrapper { ? 
1.0 : config["mf_max_bound"]; for (size_t i = 0; i < heter_devices_.size(); i++) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(heter_devices_[i])); + PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(heter_devices_[i])); this->SetSparseSGD(nonclk_coeff, clk_coeff, min_bound, max_bound, learning_rate, initial_g2sum, initial_range); this->SetEmbedxSGD(mf_create_thresholds, mf_learning_rate, diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 9ab6b5d8c178b..06d1ef84c1955 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -53,6 +53,15 @@ void XPUGarbageCollector::ClearCallback(const std::function &callback) { } #endif +#ifdef PADDLE_WITH_IPU +IPUGarbageCollector::IPUGarbageCollector(const platform::IPUPlace &place, + size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} +void IPUGarbageCollector::ClearCallback(const std::function &callback) { + callback(); +} +#endif + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector( const platform::CUDAPlace &place, size_t max_memory_size) @@ -83,9 +92,9 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, : GarbageCollector(place, max_memory_size) { platform::CUDADeviceGuard guard(place.device); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream_)); callback_manager_.reset( new platform::StreamCallbackManager(stream_)); #endif @@ -94,13 +103,8 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, StreamGarbageCollector::~StreamGarbageCollector() { auto place = BOOST_GET_CONST(platform::CUDAPlace, this->dev_ctx_->GetPlace()); platform::CUDADeviceGuard guard(place.device); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(stream_)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_)); -#endif + platform::GpuStreamSync(stream_); + platform::GpuDestroyStream(stream_); } gpuStream_t StreamGarbageCollector::stream() const { return stream_; } diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 2c2b57bbe420a..0cfeda37c222e 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -80,6 +80,16 @@ class XPUGarbageCollector : public GarbageCollector { }; #endif +#ifdef PADDLE_WITH_IPU +class IPUGarbageCollector : public GarbageCollector { + public: + IPUGarbageCollector(const platform::IPUPlace &place, size_t max_memory_size); + + protected: + void ClearCallback(const std::function &callback) override; +}; +#endif + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class UnsafeFastGPUGarbageCollector : public GarbageCollector { public: diff --git a/paddle/fluid/framework/generator.cc b/paddle/fluid/framework/generator.cc index 154154fc79517..a020bda823167 100644 --- a/paddle/fluid/framework/generator.cc +++ b/paddle/fluid/framework/generator.cc @@ -18,8 +18,8 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" namespace paddle { namespace framework { @@ -33,7 +33,7 @@ const std::shared_ptr& GetDefaultCUDAGenerator(int64_t device_id) { static std::vector> default_cuda_generators; std::call_once(num_devices_init_flag, []() { - num_cuda_devices = paddle::platform::GetCUDADeviceCount(); + num_cuda_devices = paddle::platform::GetGPUDeviceCount(); cuda_device_flags.resize(num_cuda_devices); default_cuda_generators.resize(num_cuda_devices); }); diff --git a/paddle/fluid/framework/heter_pipeline_trainer_test.cc b/paddle/fluid/framework/heter_pipeline_trainer_test.cc index af8eca32ee2f4..417c7685bcbeb 100644 --- a/paddle/fluid/framework/heter_pipeline_trainer_test.cc +++ b/paddle/fluid/framework/heter_pipeline_trainer_test.cc @@ -115,8 +115,6 @@ TEST(HeterPipelineTrainerTest, GPU) { t3.add_trainers(1); t3.add_trainers(1); t3.add_trainers(1); - t3.add_dump_fields("hello"); - t3.add_dump_param("fc_0"); auto* heter_section_param3 = t3.mutable_heter_section_param(); heter_section_param3->set_num_pipeline_stages(3); heter_section_param3->set_pipeline_stage(2); diff --git a/paddle/fluid/framework/heter_section_worker.cc b/paddle/fluid/framework/heter_section_worker.cc index a8db38f8077dd..69a4a180a9071 100644 --- a/paddle/fluid/framework/heter_section_worker.cc +++ b/paddle/fluid/framework/heter_section_worker.cc @@ -277,7 +277,7 @@ void HeterSectionWorker::CopyParameters(int microbatch_id, void HeterSectionWorker::Run() { if (debug_) { size_t total_ops_size = forward_ops_.size() + backward_ops_.size(); - op_name_.resize(total_ops_size); + op_name_.reserve(total_ops_size); op_total_time_.resize(total_ops_size); platform::SetNumThreads(1); // forward op + backward op diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc index 8049a1c9424be..93b7869cc1d25 100644 --- a/paddle/fluid/framework/heterxpu_trainer.cc +++ b/paddle/fluid/framework/heterxpu_trainer.cc @@ -51,11 +51,11 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc, platform::CUDAPlace place = platform::CUDAPlace(num); platform::CUDADeviceGuard guard(place.device); cudaStream_t stream; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream)); copy_streams_.push_back(stream); places_.push_back(place); cudaEvent_t event; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); events_.push_back(event); #endif @@ -104,7 +104,7 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc, // platform::CUDAPlace place = platform::CUDAPlace(num); // platform::CUDADeviceGuard guard(place.device); // cudaStream_t stream; - // PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); + // PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream)); // copy_streams_.push_back(stream); // places_.push_back(place); // } @@ -157,7 +157,7 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) { } } #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, stream)); cudaEventSynchronize(event); #endif } @@ -287,7 +287,7 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) { #ifdef PADDLE_WITH_CUDA auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; platform::CUDADeviceGuard guard(dev_id); - 
PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming)); #endif object_pool_.Push(context); @@ -441,7 +441,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, #ifdef PADDLE_WITH_CUDA auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; platform::CUDADeviceGuard guard(dev_id); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming)); #endif } @@ -461,7 +461,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, #endif } #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(context->event_, copy_streams_[context->place_num_])); while (cudaEventQuery(context->event_) != cudaSuccess) { VLOG(3) << "wait for kernel"; @@ -481,7 +481,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, #ifdef PADDLE_WITH_CUDA auto* dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(context->event_, dev_ctx->stream())); // cudaEventSynchronize(context->event_); { diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index e384cb4633794..b98a228868266 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -161,7 +161,7 @@ cc_test(test_skip_layernorm_fuse_pass SRCS skip_layernorm_fuse_pass_tester.cc DE cc_test(test_multihead_matmul_fuse_pass SRCS multihead_matmul_fuse_pass_tester.cc DEPS multihead_matmul_fuse_pass) cc_test(test_conv_bn_fuse_pass_cc SRCS conv_bn_fuse_pass_tester.cc DEPS conv_bn_fuse_pass) cc_test(test_adaptive_pool2d_convert_global_pass SRCS adaptive_pool2d_convert_global_pass_tester.cc DEPS adaptive_pool2d_convert_global_pass) -cc_test(test_unsqueeze2_eltwise_fuse_pass SRCS unsqueeze2_eltwise_fuse_pass_tester.cc DEPS unsqueeze2_eltwise_fuse_pass) +cc_test(test_unsqueeze2_eltwise_fuse_pass_cc SRCS unsqueeze2_eltwise_fuse_pass_tester.cc DEPS unsqueeze2_eltwise_fuse_pass) cc_test(test_layer_norm_fuse_pass_cc SRCS layer_norm_fuse_pass_tester.cc DEPS layer_norm_fuse_pass pass_test_util naive_executor) cc_test(test_generate_pass_cc SRCS generate_pass_tester.cc DEPS generate_pass pass_desc_proto) if(WITH_GPU OR WITH_ROCM) diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index bb78cdab67752..e246a10961c0c 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -130,6 +130,32 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const { GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern); + + // Only support 2D-Tensor as weight for FC + std::vector w_shape = w->Var()->GetShape(); + size_t w_rank = w_shape.size(); + if (w_rank != 2) return; + + // axis of elementwise_add should be -1 or x_num_col_dims + auto x_num_col_dims = + BOOST_GET_CONST(int, mul->Op()->GetAttr("x_num_col_dims")); + auto axis = BOOST_GET_CONST(int, elementwise_add->Op()->GetAttr("axis")); + if (axis != -1 && axis != x_num_col_dims) return; + + // Shape of bias should be [1, out_size] or [out_size] + std::vector b_shape = bias->Var()->GetShape(); + if (b_shape.size() == 1) { + if (b_shape[0] != w_shape[1]) { + return; + } + } else if (b_shape.size() == 2) { + if (b_shape[0] != 1 || 
b_shape[1] != w_shape[1]) { + return; + } + } else { + return; + } + Node* relu = nullptr; Node* relu_out = nullptr; if (with_relu) { diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc index 5046911036818..39b544e716079 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc @@ -55,14 +55,14 @@ TEST(FCFusePass, basic) { auto* bias_0 = layers.data("conv2d_bias_0", {}, true); auto* conv2d_out = layers.conv2d(a, filters_0, bias_0, false); auto* relu_out_0 = layers.relu(conv2d_out); - auto* weights_0 = layers.data("weights_0", {}, true); + auto* weights_0 = layers.data("weights_0", {5, 4}, true); auto* mul_out_0 = layers.mul(relu_out_0, weights_0); - auto* bias_1 = layers.data("bias_1", {}, true); + auto* bias_1 = layers.data("bias_1", {4}, true); auto* add_out_0 = layers.elementwise_add(mul_out_0, bias_1, nullptr, 1); auto* relu_out_1 = layers.relu(add_out_0); - auto* weights_1 = layers.data("weights_1", {}, true); + auto* weights_1 = layers.data("weights_1", {8, 9}, true); auto* mul_out_1 = layers.mul(relu_out_1, weights_1); - auto* bias_2 = layers.data("bias_2", {}, true); + auto* bias_2 = layers.data("bias_2", {1, 9}, true); auto* add_out_1 = layers.elementwise_add(mul_out_1, bias_2, nullptr, 1); VLOG(4) << add_out_1; diff --git a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc index ae662c64af331..f12273e94dddd 100644 --- a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc @@ -24,12 +24,8 @@ class Node; } // namespace ir } // namespace framework } // namespace paddle -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif + +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc index ec014d331fa44..005f006ab0478 100644 --- a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc @@ -15,13 +15,8 @@ #include "paddle/fluid/framework/ir/fuse_bn_add_act_pass.h" #include #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/enforce.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index dd0ffe8b9fd0d..5334b08248992 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1619,6 +1619,26 @@ PDNode *patterns::Reshape::operator()() { return reshape_out; } +PDNode *patterns::Slice::operator()() { + auto prev_op = pattern->NewNode(prev_op_repr())->assert_is_op(); + + auto slice_op = pattern->NewNode(slice_op_repr())->assert_is_op("slice"); + + auto slice_in = pattern->NewNode(slice_in_repr()) + ->AsInput() + ->assert_is_op_input("slice", "Input"); + auto slice_out = pattern->NewNode(slice_out_repr()) + ->AsOutput() + ->assert_is_op_output("slice", "Out"); + + auto next_op = pattern->NewNode(next_op_repr())->assert_is_op(); + + prev_op->LinksTo({slice_in}); + 
slice_op->LinksFrom({slice_in}).LinksTo({slice_out}); + next_op->LinksFrom({slice_out}); + return slice_out; +} + PDNode *patterns::Matmul::operator()() { auto matmul_op = pattern->NewNode(matmul_op_repr())->assert_is_op("matmul"); @@ -2315,7 +2335,7 @@ PDNode *patterns::QuantizePlacement::operator()( std::unordered_set({"concat", "conv2d", "elementwise_add", "fc", "matmul", "pool2d", "prior_box", "reshape2", "transpose2", "fusion_gru", - "fusion_lstm", "multi_gru"}); + "fusion_lstm", "multi_gru", "slice"}); if (!quantize_enabled_op_types.empty()) { supported_op_types = quantize_enabled_op_types; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index d7bfdc57d1c7e..fa8504d074a88 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -980,6 +980,20 @@ struct Reshape : public PatternBase { PATTERN_DECL_NODE(reshape_out); PATTERN_DECL_NODE(next_op); }; +// Slice op +// Forward pass for slice. +// slice_out is a result of the operator. +struct Slice : public PatternBase { + Slice(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "slice") {} + + PDNode* operator()(); + PATTERN_DECL_NODE(prev_op); + PATTERN_DECL_NODE(slice_in); + PATTERN_DECL_NODE(slice_op); + PATTERN_DECL_NODE(slice_out); + PATTERN_DECL_NODE(next_op); +}; // Matmul op // Forward pass for matmul. diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc index bf7cd55fab268..1ca6e989f275c 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc @@ -283,7 +283,8 @@ void BufferSharedInplaceOpPass::ApplyImpl(ProgramDesc *main_program, op->SetInput("X", inputs); op->SetOutput("Out", outputs); op->SetOutput("XOut", inputs); // add necessary dependency - op->SetAttr("share_dims", std::vector(inputs.size(), false)); + op->SetAttr("share_dims_and_dtype", + std::vector(inputs.size(), false)); } block->Flush(); } diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc index d09de5be84c35..0ed2ec51b89cb 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc @@ -277,7 +277,7 @@ static void BuildInplaceAddToGraph(Node *in_var_0, Node *in_var_1, grad_add_op_desc->SetInput("X", {in_var_1->Name()}); grad_add_op_desc->SetOutput("Out", {out_var->Name()}); grad_add_op_desc->SetOutput("XOut", {in_var_1->Name()}); - grad_add_op_desc->SetAttr("share_dims", std::vector(1, true)); + grad_add_op_desc->SetAttr("share_dims_and_dtype", std::vector(1, true)); // Add share_buffer op between in_var_0 and in_var_1 OpDesc share_buffer_op; @@ -285,7 +285,7 @@ static void BuildInplaceAddToGraph(Node *in_var_0, Node *in_var_1, share_buffer_op.SetInput("X", {in_var_0->Name()}); share_buffer_op.SetOutput("Out", {in_var_1->Name()}); share_buffer_op.SetOutput("XOut", {in_var_0->Name()}); - share_buffer_op.SetAttr("share_dims", std::vector(1, false)); + share_buffer_op.SetAttr("share_dims_and_dtype", std::vector(1, false)); auto *new_share_buffer_op = graph->CreateOpNode(&share_buffer_op); new_share_buffer_op->inputs.push_back(in_var_0); diff --git 
a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc index f6465d385841d..9d1e2301704b3 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc @@ -329,7 +329,7 @@ bool MemoryReusePass::IsVarPairReusable( void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var, details::VarHandle *out_var, - bool share_dims) const { + bool share_dims_and_dtype) const { PADDLE_ENFORCE_GT( (*var_infos_)[op->GetScopeIdx()].count(in_var->Name()), 0, platform::errors::NotFound("Var(%s) does not in mem opt var infos.", @@ -349,8 +349,8 @@ void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op, share_buffer_op->AddInput(in_var); } - if (share_dims) { - share_buffer_op->SetShareDims(true); + if (share_dims_and_dtype) { + share_buffer_op->SetShareDimsAndDtype(true); } share_buffer_op->AddReuseVarPair( diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 2bf8a3b64f0a7..3df4a84470524 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -676,6 +676,57 @@ void CPUQuantizePass::QuantizeReshape(Graph* graph) const { PrettyLogDetail("--- quantized %d reshape ops", quantize_reshape_count); } +void CPUQuantizePass::QuantizeSlice(Graph* graph) const { + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); + patterns::Slice slice_pattern{pattern, name_scope_}; + slice_pattern(); + + int quantize_slice_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "Quantize slice op"; + GET_IR_NODE_FROM_SUBGRAPH(slice_op, slice_op, slice_pattern); + + // skip if should not be quantized + if (!platform::HasOpINT8DataType(slice_op->Op())) { + LogQuantizationDisabled(slice_op); + return; + } + GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, slice_pattern); + GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, slice_pattern); + + // skip if prev op and next op is not quantized + if (!IsOpDequantized(prev_op) && !IsOpQuantized(next_op)) { + return; + } + GET_IR_NODE_FROM_SUBGRAPH(slice_in, slice_in, slice_pattern); + GET_IR_NODE_FROM_SUBGRAPH(slice_out, slice_out, slice_pattern); + + if (!AreScalesPresentForNodes({slice_out})) { + LogCannotQuantizeOp(slice_op); + return; + } + + bool is_input_unsigned{false}; + auto input_scale = GetScaleValueForNode(slice_out, &is_input_unsigned); + QuantizeInput(g, slice_op, slice_in, "Input", input_scale, + is_input_unsigned); + + bool is_output_unsigned{false}; + auto output_scale = GetScaleValueForNode(slice_out, &is_output_unsigned); + DequantizeOutput(g, slice_op, slice_out, "Out", output_scale, + is_output_unsigned); + + ++quantize_slice_count; + }; + + gpd(graph, handler); + AddStatis(quantize_slice_count); + + PrettyLogDetail("--- quantized %d slice ops", quantize_slice_count); +} + void CPUQuantizePass::QuantizeMatmul(Graph* graph) const { GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); @@ -1024,6 +1075,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeFusionGru(graph); QuantizeMultiGru(graph); QuantizeFusionLSTM(graph); + QuantizeSlice(graph); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index 18735633c0d69..b3ee98263c0c0 
100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -61,6 +61,7 @@ class CPUQuantizePass : public FusePassBase { void QuantizeFusionGru(Graph* graph) const; void QuantizeMultiGru(Graph* graph) const; void QuantizeFusionLSTM(Graph* graph) const; + void QuantizeSlice(Graph* graph) const; void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name, double scale_to_one, bool is_input_unsigned, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index b6a8de263aa2a..838912f659ff7 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -55,6 +55,10 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetInput("X", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); op->SetAttr("mkldnn_data_type", mkldnn_data_type); + } else if (type == "slice") { + op->SetInput("Input", {inputs[0]}); + op->SetOutput("Out", {outputs[0]}); + op->SetAttr("mkldnn_data_type", mkldnn_data_type); } else if (type == "dropout") { op->SetInput("X", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); @@ -784,6 +788,113 @@ TEST(CpuQuantizePass, reshapeBetweenNonQuantizedOp) { added_nodes_count, 2.0f * 127); } +static const std::initializer_list variable_names_slice = { + "a", "b", "c", "d"}; + +// a->Dequantize->b +// b->Slice->c +// c->Dropout->d +ProgramDesc BuildProgramDescSlice() { + ProgramDesc prog; + for (auto& v : variable_names_slice) { + prog.MutableBlock(0)->Var(v); + } + SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true); + SetOp(&prog, "slice", "Slice", {"b"}, {"c"}, true, "int8"); + SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, "float32"); + + return prog; +} + +// a->Transpose->b +// b->slice->c +// c->Dropout->d +ProgramDesc BuildProgramDescSliceBetweenNonQuantizedOp() { + ProgramDesc prog; + for (auto& v : variable_names_slice) { + prog.MutableBlock(0)->Var(v); + } + + SetOp(&prog, "transpose2", "Transpose2", {"a"}, {"b"}, true, "float32"); + SetOp(&prog, "slice", "Slice", {"b"}, {"c"}, true, "int8"); + SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, "float32"); + + return prog; +} + +void MainTestSlice(const ProgramDesc& prog, int transpose_count, + int slice_count, int quant_count, int dequant_count, + int added_nodes_count, float scale) { + std::unique_ptr graph(new ir::Graph(prog)); + int original_nodes_num, current_nodes_num; + PreparePass(&graph, prog, variable_names_slice, &original_nodes_num, + ¤t_nodes_num); + + float quant_scale = 1.0f; + float dequant_scale = 1.0f; + int quantize_nodes_count = 0; + int dequantize_nodes_count = 0; + int transpose_nodes_count = 0; + int slice_nodes_count = 0; + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + if (op->Type() == "transpose2") { + transpose_nodes_count++; + } else if (op->Type() == "slice") { + slice_nodes_count++; + } else if (op->Type() == "quantize") { + quantize_nodes_count++; + quant_scale = BOOST_GET_CONST(float, op->GetAttr("Scale")); + EXPECT_EQ(quant_scale, scale) << "Scale for node '" + op->Type() + "'."; + } else if (op->Type() == "dequantize") { + dequantize_nodes_count++; + auto op_name = op->GetAttrIfExists("name"); + VLOG(3) << op_name << "\n"; + if (op_name != "Dequantize1") { + dequant_scale = BOOST_GET_CONST(float, op->GetAttr("Scale")); + EXPECT_EQ(dequant_scale, scale) + << "Scale 
for node '" + op->Type() + "'."; + } + } + } + } + EXPECT_EQ(transpose_nodes_count, transpose_count); + EXPECT_EQ(slice_nodes_count, slice_count); + EXPECT_EQ(quantize_nodes_count, quant_count); + EXPECT_EQ(dequantize_nodes_count, dequant_count); + EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); +} + +TEST(CpuQuantizePass, slice) { + // a->Dequantize->b + // b2->Quant->b3->slice->c1->Dequant->c2 + // c2->Dropout->d + int slice_count = 1; + int transpose_count = 0; + int quant_count = 1; + int dequant_count = 2; + // 1 Quant + 1 IN + 1 DeQuant + 1 OUT + int added_nodes_count = 4; + MainTestSlice(BuildProgramDescSlice(), transpose_count, slice_count, + quant_count, dequant_count, added_nodes_count, 2.0f * 127); +} + +TEST(CpuQuantizePass, sliceBetweenNonQuantizedOp) { + // a->Transpos2->b + // b->slice->c + // c->Dropout->d + int slice_count = 1; + int transpose_count = 1; + int quant_count = 0; + int dequant_count = 0; + // 0 Quant + 0 IN + 0 DeQuant + 0 OUT + int added_nodes_count = 0; + MainTestSlice(BuildProgramDescSliceBetweenNonQuantizedOp(), transpose_count, + slice_count, quant_count, dequant_count, added_nodes_count, + 2.0f * 127); +} + static const std::initializer_list variable_names_matmul = { "a", "b", "c", "d", "e", "f"}; diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc index b2b1a7515f0a5..2d60129165a60 100644 --- a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc +++ b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -231,3 +232,7 @@ void SimplifyWithBasicOpsPass::ReplaceOutputVar(Node* op, Node* old_var, REGISTER_PASS(simplify_with_basic_ops_pass, paddle::framework::ir::SimplifyWithBasicOpsPass); +REGISTER_PASS_CAPABILITY(simplify_with_basic_ops_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination().EQ( + "scale", 0)); diff --git a/paddle/fluid/framework/library_type.h b/paddle/fluid/framework/library_type.h index 8fe314cf5f18c..f7539aa485957 100644 --- a/paddle/fluid/framework/library_type.h +++ b/paddle/fluid/framework/library_type.h @@ -61,6 +61,8 @@ inline LibraryType StringToLibraryType(const char* ctype) { return LibraryType::kPlain; } else if (s == std::string("XPU")) { return LibraryType::kPlain; + } else if (s == std::string("IPU")) { + return LibraryType::kPlain; } else if (s == std::string("NPU")) { return LibraryType::kPlain; } else if (s == std::string("CUDA")) { @@ -68,7 +70,7 @@ inline LibraryType StringToLibraryType(const char* ctype) { } else { PADDLE_THROW(platform::errors::Unimplemented( "Unknown LibraryType string (%s), only support library type string " - "include PLAIN, MKLDNN, CUDNN, CPU and CUDA.", + "include PLAIN, MKLDNN, CUDNN, CPU, CUDA and IPU.", s.c_str())); } } diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu index 8fb59d682e40f..10e7ed0fb6021 100644 --- a/paddle/fluid/framework/mixed_vector_test.cu +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -24,7 +24,7 @@ #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/mixed_vector.h" -#include "paddle/fluid/platform/gpu_info.h" +#include 
"paddle/fluid/platform/device/gpu/gpu_info.h" template using vec = paddle::framework::Vector; @@ -63,7 +63,7 @@ TEST(mixed_vector, GPU_VECTOR) { } TEST(mixed_vector, MultiGPU) { - if (paddle::platform::GetCUDADeviceCount() < 2) { + if (paddle::platform::GetGPUDeviceCount() < 2) { LOG(WARNING) << "Skip mixed_vector.MultiGPU since there are not multiple " "GPUs in your machine."; return; diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc index 15e6b2a1ff939..064dfa0170bdb 100644 --- a/paddle/fluid/framework/new_executor/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/data_transfer.cc @@ -137,7 +137,7 @@ std::shared_ptr TransferLayout(const std::string& var_name, // 1. Generate new_var_name and Initialize it *new_var_name = var_name + "_layout_" + std::to_string(var_scope->VarSize() + 1); - auto* ptr = local_scope->Var(new_var_name); + auto* ptr = local_scope->Var(*new_var_name); auto var_type = var_scope->Var(var_name)->Type(); InitializeVariable(ptr, static_cast(var_type)); @@ -171,8 +171,8 @@ std::shared_ptr TransferDtype(const std::string& var_name, // 1. Generate new_var_name and Initialize it *new_var_name = var_name + "_dtype_" + std::to_string(var_scope->VarSize() + 1); - auto* ptr = local_scope->Var(new_var_name); - var_scope->SetVarDesc(var_name, nullptr); + auto* ptr = local_scope->Var(*new_var_name); + auto var_type = var_scope->Var(var_name)->Type(); InitializeVariable(ptr, static_cast(var_type)); @@ -211,7 +211,7 @@ std::shared_ptr TransferDevice(const std::string& var_name, // 1. Generate new_var_name and Initialize it *new_var_name = var_name + "_device_" + std::to_string(var_scope->VarSize() + 1); - auto* ptr = local_scope->Var(new_var_name); + auto* ptr = local_scope->Var(*new_var_name); auto var_type = var_scope->Var(var_name)->Type(); InitializeVariable(ptr, static_cast(var_type)); diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 94b2118ba9d73..dcbdd12f88fb7 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -398,13 +398,8 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { /*For profiling/benchmark only*/ if (FLAGS_benchmark) { instr_node.DeviceContext().Wait(); -#if defined(PADDLE_WITH_CUDA) - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError()); - VLOG(4) << "Operator(" << op->Type() - << "): context wait and get last error"; -#endif -#if defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError()); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op->Type() << "): context wait and get last error"; #endif @@ -439,6 +434,7 @@ void InterpreterCore::ExecuteInstructionList( if (UNLIKELY(exception_holder_.IsCaught())) { VLOG(4) << "Exception caught " << exception_holder_.Type(); + async_work_queue_->Cancel(); exception_holder_.ReThrow(); } @@ -514,7 +510,7 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) { ready_ops.pop(); auto& instr_node = vec_instruction_.at(instr_id); auto* op = instr_node.OpBase(); - platform::RecordEvent instruction_event(op->Type()); + platform::RecordEvent instruction_event(op->Type().c_str()); interpreter::WaitEvent(instr_node, place_); try { diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc 
index 0501522a7a810..3817a11b9afe4 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -20,9 +20,26 @@ #include "paddle/fluid/operators/controlflow/recurrent_op_helper.h" #include "paddle/fluid/operators/controlflow/while_op_helper.h" +PADDLE_DEFINE_EXPORTED_bool( + new_executor_sequential_run, false, + "Enable sequential execution for standalone executor, used for debug"); namespace paddle { namespace framework { namespace interpreter { + +void AsyncWorkQueue::AddTask(const OpFuncType& op_func_type, + std::function fn) { + // NOTE(zhiqiu): use the second queue, so only one thread is used. + if (FLAGS_new_executor_sequential_run) { + VLOG(4) << "FLAGS_new_executor_sequential_run:" + << FLAGS_new_executor_sequential_run; + queue_group_->AddTask(static_cast(OpFuncType::kQueueAsync), + std::move(fn)); + } else { + queue_group_->AddTask(static_cast(op_func_type), std::move(fn)); + } +} + using VariableIdMap = std::map>; AtomicVectorSizeT& AsyncWorkQueue::PrepareAtomicDeps( diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h index c92cea6c97c86..8f27c7e1811fb 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.h +++ b/paddle/fluid/framework/new_executor/interpretercore_util.h @@ -77,9 +77,7 @@ class AsyncWorkQueue { // void WaitEmpty() { queue_group_->WaitQueueGroupEmpty(); } - void AddTask(const OpFuncType& op_func_type, std::function fn) { - queue_group_->AddTask(static_cast(op_func_type), std::move(fn)); - } + void AddTask(const OpFuncType& op_func_type, std::function fn); void Cancel() { queue_group_->Cancel(); } diff --git a/paddle/fluid/framework/new_executor/profiler.h b/paddle/fluid/framework/new_executor/profiler.h index 51c9e3d66a6f0..8df8db35592bb 100644 --- a/paddle/fluid/framework/new_executor/profiler.h +++ b/paddle/fluid/framework/new_executor/profiler.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/timer.h" namespace paddle { @@ -45,7 +45,7 @@ class ProfilerGuard { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, place); cost_info_->device_memory_bytes = - platform::RecordedCudaMallocSize(cuda_place.device); + platform::RecordedGpuMallocSize(cuda_place.device); #endif } } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 6ef44fb127afb..4236fcf8dc134 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1181,9 +1181,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } BuildPtenKernelContext(*runtime_ctx, dev_ctx); (*pt_kernel_)(pt_kernel_context_.get()); - WriteBackToOutputs(runtime_ctx); - pt_kernel_context_->ClearData(); } else { (*kernel_func_)( @@ -1214,14 +1212,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope, /*For profiling/benchmark only*/ if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError()); - VLOG(4) << "Operator(" << Type() << "): context wait and get last error"; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); #endif -#if defined(PADDLE_WITH_HIP) - 
PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError()); VLOG(4) << "Operator(" << Type() << "): context wait and get last error"; -#endif } if (FLAGS_check_nan_inf) { @@ -1814,45 +1808,31 @@ void OperatorWithKernel::BuildPtenKernelContext( size_t start_idx = (i == 0 ? 0 : pt_kernel_context_->InputRangeAt(i - 1).second); size_t end_idx = start_idx + ins_vector.size(); - - // The current size of input/output in pt_kernel_context_ is at least equal - // the start_idx. For the reason of reusing the allocted of inputs or - // outputs in pt_kernel_context_, the current size of input/output can be - // greater then the index of which the tensort wanted to set to, so it will - // use ReMakePtenDenseTensorFromVar to make pten tensor. - if (pt_kernel_context_->InputsSize() == start_idx) { - paddle::SmallVector> tmp_inputs; - for (auto* var : ins_vector) { - tmp_inputs.emplace_back( - experimental::MakePtenTensorBaseFromVar(*var, in_def)); - } - pt_kernel_context_->EmplaceBackInputs(std::move(tmp_inputs)); - } else if (pt_kernel_context_->InputsSize() > start_idx) { - size_t input_size = pt_kernel_context_->InputsSize(); - for (size_t j = 0; j < ins_vector.size(); ++j) { - if (input_size > start_idx + j) { + auto current_vector_size = pt_kernel_context_->InputsSize(); + + // If the memory needed is less than the current memory allocated, we will + // reuse the current memory by using ReMakePtenDenseTensorFromVar. + // Otherwise,we will create new storage. + for (size_t offset = 0; offset < ins_vector.size(); ++offset) { + if (current_vector_size > start_idx + offset) { + auto& input_ptr = + pt_kernel_context_->MutableInputPtrAt(start_idx + offset); + if (input_ptr == nullptr) { + input_ptr = experimental::MakePtenTensorBaseFromVar( + *ins_vector[offset], in_def); + } else { experimental::ReMakePtenDenseTensorFromVar( - *ins_vector[j], in_def, + *ins_vector[offset], in_def, pt_kernel_context_->MutableInputAt(start_idx + - j)); - // TODO(chentianyu03): When multi input kernel, open this code - /* - } else { - pt_kernel_context_->EmplaceBackInputWithoutSetRange( - experimental::MakePtenTensorBaseFromVar(*ins_vector[j], - in_def)); - */ + offset)); } + } else { + pt_kernel_context_->EmplaceBackInputWithoutSetRange( + experimental::MakePtenTensorBaseFromVar(*ins_vector[offset], + in_def)); } - pt_kernel_context_->MutableInputRangeAt(i) = - std::make_pair(start_idx, end_idx); - } else { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Error start index when trying to set new tensor to inputs, start " - "index is `%d`, but current pt_kernel_context_.inputs.size() is " - "`%d`.", - start_idx, pt_kernel_context_->InputsSize())); } + pt_kernel_context_->AssignInputRange(std::make_pair(start_idx, end_idx), i); } for (size_t i = 0; i < output_names.size(); ++i) { @@ -1862,46 +1842,25 @@ void OperatorWithKernel::BuildPtenKernelContext( size_t start_idx = (i == 0 ? 0 : pt_kernel_context_->OutputRangeAt(i - 1).second); size_t end_idx = start_idx + outs_vector.size(); - - // The current size of input/output in pt_kernel_context_ is at least equal - // the start_idx. For the reason of reusing the allocted of inputs or - // outputs in pt_kernel_context_, the current size of input/output can be - // greater then the index of which the tensort wanted to set to, so it will - // use ReMakePtenDenseTensorFromVar to make pten tensor. 
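The rewritten input loop in BuildPtenKernelContext above makes a per-slot decision: when the kernel context already owns an entry at start_idx + offset it is reused (a null slot is filled, a live one is re-made from the variable), and only when the flat vector is too short is new storage appended; the (start, end) range for argument i is then recorded via AssignInputRange. A self-contained sketch of that reuse-or-append pattern follows; the types and function are illustrative stand-ins, not the pten KernelContext API.

#include <cstddef>
#include <memory>
#include <vector>

struct ToyTensor { int value = 0; };

// Rebuild the slots for one argument from `vars`, reusing entries that already
// exist in the flat vector and appending fresh storage otherwise.
void RebuildRange(std::vector<std::unique_ptr<ToyTensor>>* slots,
                  const std::vector<int>& vars, std::size_t start_idx) {
  const std::size_t current_vector_size = slots->size();
  for (std::size_t offset = 0; offset < vars.size(); ++offset) {
    if (current_vector_size > start_idx + offset) {
      auto& slot = (*slots)[start_idx + offset];
      if (slot == nullptr) slot = std::make_unique<ToyTensor>();
      slot->value = vars[offset];  // "re-make" the existing tensor in place
    } else {
      auto fresh = std::make_unique<ToyTensor>();
      fresh->value = vars[offset];  // fresh storage appended at the back
      slots->push_back(std::move(fresh));
    }
  }
}

int main() {
  std::vector<std::unique_ptr<ToyTensor>> slots;
  RebuildRange(&slots, {1, 2, 3}, 0);  // first run: everything appended
  RebuildRange(&slots, {4, 5, 6}, 0);  // second run: all three slots reused
  return 0;
}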
- if (pt_kernel_context_->OutputsSize() == start_idx) { - paddle::SmallVector> tmp_outputs; - for (auto* var : outs_vector) { - tmp_outputs.emplace_back( - experimental::MakePtenTensorBaseFromVar(var, out_def)); - } - pt_kernel_context_->EmplaceBackOutputs(std::move(tmp_outputs)); - } else if (pt_kernel_context_->OutputsSize() > start_idx) { - size_t output_size = pt_kernel_context_->OutputsSize(); - for (size_t j = 0; j < outs_vector.size(); ++j) { - if (output_size > start_idx + j) { - experimental::ReMakePtenDenseTensorFromVar( - outs_vector[j], out_def, - pt_kernel_context_->MutableOutputAt(start_idx + - j)); - - // TODO(chentianyu03): When multi output kernel, open this code - /* - } else { - pt_kernel_context_->EmplaceBackOutputWithoutSetRange( - experimental::MakePtenTensorBaseFromVar(outs_vector[j], - out_def)); - */ - } + auto current_vector_size = pt_kernel_context_->OutputsSize(); + + // If the memory needed is less than the current memory allocated, we will + // reuse the current memory by using ReMakePtenDenseTensorFromVar. + // Otherwise,we will create new storage. + for (size_t offset = 0; offset < outs_vector.size(); ++offset) { + if (current_vector_size > start_idx + offset) { + experimental::ReMakePtenDenseTensorFromVar( + outs_vector[offset], out_def, + pt_kernel_context_->MutableOutputAt(start_idx + + offset)); + } else { + pt_kernel_context_->EmplaceBackOutputWithoutSetRange( + experimental::MakePtenTensorBaseFromVar(outs_vector[offset], + out_def)); } - pt_kernel_context_->MutableOutputRangeAt(i) = - std::make_pair(start_idx, end_idx); - } else { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Error start index when trying to set new tensor to inputs, start " - "index is `%d`, but current pt_kernel_context_.outputs.size() is " - "`%d`.", - start_idx, pt_kernel_context_->OutputsSize())); } + pt_kernel_context_->AssignOutputRange(std::make_pair(start_idx, end_idx), + i); } for (size_t i = 0; i < attr_names.size(); ++i) { diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt index b80a265f8a41b..b13166cff60aa 100644 --- a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt +++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt @@ -4,20 +4,22 @@ cc_library(transform_desc SRCS transform_desc.cc DEPS proto_desc cinn) cc_library(cinn_graph_symbolization SRCS cinn_graph_symbolization.cc DEPS lod_tensor graph transform_desc cinn) cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS framework_proto graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn) -cc_test(cinn_lib_test SRCS cinn_lib_test.cc DEPS cinn) -set_tests_properties(cinn_lib_test PROPERTIES LABELS "RUN_TYPE=CINN") +if (WITH_TESTING) + cc_test(cinn_lib_test SRCS cinn_lib_test.cc DEPS cinn) + set_tests_properties(cinn_lib_test PROPERTIES LABELS "RUN_TYPE=CINN") -cc_test(cinn_cache_key_test SRCS cinn_cache_key_test.cc DEPS cinn_cache_key) -set_tests_properties(cinn_cache_key_test PROPERTIES LABELS "RUN_TYPE=CINN") + cc_test(cinn_cache_key_test SRCS cinn_cache_key_test.cc DEPS cinn_cache_key) + set_tests_properties(cinn_cache_key_test PROPERTIES LABELS "RUN_TYPE=CINN") -cc_test(build_cinn_pass_test SRCS build_cinn_pass_test.cc DEPS build_cinn_pass cinn_compiler) -set_tests_properties(build_cinn_pass_test PROPERTIES LABELS "RUN_TYPE=CINN") + cc_test(build_cinn_pass_test SRCS build_cinn_pass_test.cc DEPS build_cinn_pass cinn_compiler) + set_tests_properties(build_cinn_pass_test PROPERTIES LABELS "RUN_TYPE=CINN") -cc_test(transform_desc_test SRCS 
transform_desc_test.cc DEPS transform_desc) -set_tests_properties(transform_desc_test PROPERTIES LABELS "RUN_TYPE=CINN") + cc_test(transform_desc_test SRCS transform_desc_test.cc DEPS transform_desc) + set_tests_properties(transform_desc_test PROPERTIES LABELS "RUN_TYPE=CINN") -cc_test(cinn_graph_symbolization_test SRCS cinn_graph_symbolization_test.cc DEPS cinn_graph_symbolization) -set_tests_properties(cinn_graph_symbolization_test PROPERTIES LABELS "RUN_TYPE=CINN") + cc_test(cinn_graph_symbolization_test SRCS cinn_graph_symbolization_test.cc DEPS cinn_graph_symbolization) + set_tests_properties(cinn_graph_symbolization_test PROPERTIES LABELS "RUN_TYPE=CINN") -cc_test(cinn_compiler_test SRCS cinn_compiler_test.cc DEPS cinn_compiler place proto_desc graph_viz_pass build_cinn_pass cinn) -set_tests_properties(cinn_compiler_test PROPERTIES LABELS "RUN_TYPE=CINN") + cc_test(cinn_compiler_test SRCS cinn_compiler_test.cc DEPS cinn_compiler place proto_desc graph_viz_pass build_cinn_pass cinn) + set_tests_properties(cinn_compiler_test PROPERTIES LABELS "RUN_TYPE=CINN") +endif() diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 3f1b6c78d8417..7fc8eff3d31c9 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -193,6 +193,8 @@ std::unique_ptr CinnCompiler::CompileGraph( CinnGraphSymbolization symbol{compiled_num, graph, target, input_tensors}; auto frontend_program = symbol(); ProgramPass::Apply(&frontend_program, target, {"Decomposer"}); + auto fetch_ids = symbol.GetFetchIds(); + ::cinn::frontend::ApplyPass(&frontend_program, fetch_ids, "RemoveIdentity"); auto cinn_graph = std::make_shared<::cinn::hlir::framework::Graph>( frontend_program, target); VLOG(1) << "-- The " << compiled_num << "-th compilation (" @@ -201,7 +203,6 @@ std::unique_ptr CinnCompiler::CompileGraph( ApplyPass(cinn_graph.get(), "OpFusion"); auto scope = BuildScope(target, cinn_graph); - auto fetch_ids = symbol.GetFetchIds(); VLOG(4) << "All fetch var ids in CINN: " << string::join_strings(fetch_ids, ','); @@ -209,6 +210,7 @@ std::unique_ptr CinnCompiler::CompileGraph( std::make_unique(target, scope, cinn_graph); GraphCompiler::CompileOptions options; options.with_instantiate_variables = false; + options.with_buffer_handle_instruction_inserted = true; auto compiled_res = graph_compiler->Build(options, std::move(fetch_ids), stream); auto compiled_obj = std::make_unique(); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 78774f0489638..18d0ee78ffbbc 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -34,7 +34,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device_context.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc index b423d0e05e174..51a2d641bb00a 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/pten_utils.cc @@ -136,17 +136,17 @@ KernelArgsNameMakerByOpProto::GetInputArgsNames() { auto& in = op_proto_->inputs()[i]; auto& in_name = in.name(); if ((in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) { - VLOG(1) << "Parse PtenKernel input: skip extra & quant input - " + VLOG(3) << "Parse PtenKernel input: skip extra & quant input - " << in_name; continue; } // If contains dispensable input, we should override the // GetExpectedPtenKernelArgs method self if (in.has_dispensable() && in.dispensable()) { - VLOG(1) << "Parse PtenKernel input: skip dispensable input - " << in_name; + VLOG(3) << "Parse PtenKernel input: skip dispensable input - " << in_name; continue; } - VLOG(1) << "Parse PtenKernel input: " << in_name; + VLOG(3) << "Parse PtenKernel input: " << in_name; input_names_.emplace_back(in_name); } return input_names_; @@ -158,7 +158,7 @@ KernelArgsNameMakerByOpProto::GetOutputArgsNames() { auto& out = op_proto_->outputs()[i]; auto& out_name = out.name(); // TODO(chenweihang): outputs also need skip some cases - VLOG(1) << "Parse PtenKernel output: " << out_name; + VLOG(3) << "Parse PtenKernel output: " << out_name; output_names_.emplace_back(out_name); } return output_names_; @@ -172,17 +172,17 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { if (attr_name == "use_mkldnn" || attr_name == "op_role" || attr_name == "op_role_var" || attr_name == "op_namescope" || attr_name == "op_callstack" || attr_name == "op_device") { - VLOG(1) << "Parse PtenKernel attribute: skip needless attr - " + VLOG(3) << "Parse PtenKernel attribute: skip needless attr - " << attr_name; continue; } if ((attr.has_extra() && attr.extra()) || (attr.has_quant() && attr.quant())) { - VLOG(1) << "Parse PtenKernel attribute: skip extra & quant attr - " + VLOG(3) << "Parse PtenKernel attribute: skip extra & quant attr - " << attr_name; continue; } - VLOG(1) << "Parse PtenKernel attribute: " << attr_name; + VLOG(3) << "Parse PtenKernel attribute: " << attr_name; attr_names_.emplace_back(attr_name); } diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 8d927b87c9abe..cbbc020989d1e 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" +DECLARE_bool(use_stream_safe_cuda_allocator); + namespace paddle { namespace memory { namespace allocation { @@ -60,14 +62,7 @@ void* Tensor::mutable_data(const platform::Place& place, "The Tensor's shape is [", dims(), "] now")); size_t size = numel() * SizeOfType(type); - if (requested_size) { - PADDLE_ENFORCE_GE( - requested_size, size, - platform::errors::InvalidArgument( - "The requested memory size is less than the memory size of Tensor. 
" - "But received requested memory size is %d, " - "memory size of Tensor is %d.", - requested_size, size)); + if (requested_size && (requested_size > size)) { size = requested_size; } /* some versions of boost::variant don't have operator!= */ @@ -89,6 +84,35 @@ void* Tensor::mutable_data(const platform::Place& place, return mutable_data(place, type_, requested_size); } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +void* Tensor::mutable_data(const platform::CUDAPlace& place, + proto::VarType::Type type, + const gpuStream_t& stream) { + if (!FLAGS_use_stream_safe_cuda_allocator) { + return mutable_data(place, type); + } + + type_ = type; + PADDLE_ENFORCE_GE( + numel(), 0, + platform::errors::PreconditionNotMet( + "The Tensor's element number must be equal or greater than zero. " + "The Tensor's shape is [", + dims(), "] now")); + size_t size = numel() * SizeOfType(type); + + /* some versions of boost::variant don't have operator!= */ + if (holder_ == nullptr || !(holder_->place() == place) || + holder_->size() < size + offset_) { + holder_.reset(); + holder_ = memory::AllocShared(place, size, stream); + offset_ = 0; + } + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} +#endif + Tensor& Tensor::ShareDataWith(const Tensor& src) { src.check_memory_size(); *this = src; diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 7f8d7bffa986e..4b1ae041fc4ca 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -81,6 +81,7 @@ class TensorInplaceVersion { bool IsUnique() const { return inplace_version_ == 0; } void Bump() { ++inplace_version_; } uint32_t CurrentVersion() const { return inplace_version_; } + void SetInplaceVersionToZero() { inplace_version_ = 0; } private: uint32_t inplace_version_; @@ -149,6 +150,11 @@ class Tensor { void* mutable_data(const platform::Place& place, size_t requested_size = 0); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + void* mutable_data(const platform::CUDAPlace& place, + proto::VarType::Type type, const gpuStream_t& stream); +#endif + /** * @brief Return a pointer to mutable memory block. * @@ -260,6 +266,8 @@ class Tensor { // should not be copied. 
} + void ShareDataTypeWith(const Tensor& tensor) { type_ = tensor.type_; } + bool IsSharedBufferWith(const Tensor& src) const { return holder_ && holder_ == src.Holder(); } diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index 1d5e638729361..eb8a1e4cea9fb 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -22,7 +22,7 @@ #ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif #include #include "paddle/fluid/operators/conv_cudnn_op_cache.h" @@ -30,8 +30,8 @@ #endif #ifdef PADDLE_WITH_HIP #if defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT -#include "paddle/fluid/platform/nccl_helper.h" // NOLINT +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT #endif #include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT #include "paddle/fluid/operators/miopen_rnn_cache.h" diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc index ae7ae85207d84..9a9b90cd81179 100644 --- a/paddle/fluid/framework/var_type_traits_test.cc +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -23,15 +23,15 @@ #ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/cudnn_rnn_cache.h" #endif #ifdef PADDLE_WITH_HIP #if defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT -#include "paddle/fluid/platform/nccl_helper.h" // NOLINT +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT #endif #include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT #include "paddle/fluid/operators/miopen_rnn_cache.h" diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 792a2accd41d6..f8ad990a668ce 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -75,6 +75,7 @@ class Variable { framework::TensorInplaceVersion* InplaceVersionCounter(); public: + void SetInplaceVersionToZero(); uint32_t CurrentInplaceVersion(); void BumpInplaceVersion(); @@ -134,6 +135,12 @@ inline framework::TensorInplaceVersion* Variable::InplaceVersionCounter() { return version_counter_ptr; } +inline void Variable::SetInplaceVersionToZero() { + auto inplace_version_counter = this->InplaceVersionCounter(); + if (inplace_version_counter) + inplace_version_counter->SetInplaceVersionToZero(); +} + inline uint32_t Variable::CurrentInplaceVersion() { auto version_counter_ptr = InplaceVersionCounter(); if (version_counter_ptr) { diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 9121610d29eaa..594b0d48a8aad 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -30,6 +30,9 @@ if(NOT WIN32) cc_library(hccl_context SRCS hccl_context.cc DEPS collective_helper device_context tensor var_type_traits) cc_library(reducer SRCS reducer.cc DEPS layer) endif() + if(WITH_NCCL OR WITH_RCCL 
OR WITH_XPU_BKCL OR WITH_ASCEND_CL) + cc_library(heter_ccl_context SRCS heter_ccl_context.cc DEPS collective_helper device_context tensor var_type_traits) + endif() cc_library(data_loader SRCS data_loader.cc DEPS enforce) endif(NOT WIN32) if(WITH_GLOO) diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index b922811b4f104..31da214fbc39a 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -28,8 +28,8 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/parallel_context.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/string/string_helper.h" namespace paddle { @@ -64,7 +64,7 @@ static void AllReduce(const framework::Tensor &src, framework::Tensor *dst, dst->Resize(src.dims()); auto *dst_ptr = dst->mutable_data(src.place(), src.type()); auto nccl_dtype = platform::ToNCCLDataType(src.type()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( src_ptr, dst_ptr, src.numel(), nccl_dtype, ncclSum, comm->comm(), stream)); } @@ -100,16 +100,12 @@ static void AllReduce(const framework::SelectedRows &src, if (!use_calc_stream) { dev_ctx->Wait(); } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( gpu_rows_num_ptr + strategy.local_rank_, gpu_rows_num_ptr, 1, ncclInt64, comm->comm(), stream)); if (!use_calc_stream) { -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } const auto *cpu_rows_num_ptr = rows_num_vector.data(); @@ -146,11 +142,11 @@ static void AllReduce(const framework::SelectedRows &src, // allgather is used to speed up the allreduce by replacing broadcast. auto row_sendcount = cpu_rows_num_ptr[0]; VLOG(3) << "allgather replaces broadcast to speed up in sparse allreduce"; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( src_rows_ptr, dst_rows_ptr, row_sendcount, ncclInt64, comm->comm(), stream)); auto value_sendcount = cpu_rows_num_ptr[0] * feature_size; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( src_tensor_ptr, dst_tensor_ptr, value_sendcount, nccl_dtype, comm->comm(), stream)); return; @@ -158,13 +154,13 @@ static void AllReduce(const framework::SelectedRows &src, for (int i = 0; i < strategy.nranks_; ++i) { if (cpu_rows_num_ptr[i] > 0) { // 2. Broadcast the rows of SelectedRows - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBroadcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBroadcast( src_rows_ptr, dst_rows_ptr + row_offset, cpu_rows_num_ptr[i], ncclInt64, i, comm->comm(), stream)); // 3. 
Broadcast the tensor data of SelectedRows auto *dst_tensor_ptr_i = reinterpret_cast(dst_tensor_ptr) + row_offset * feature_size * sizeof_dtype; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBroadcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBroadcast( src_tensor_ptr, dst_tensor_ptr_i, cpu_rows_num_ptr[i] * feature_size, nccl_dtype, i, comm->comm(), stream)); row_offset += cpu_rows_num_ptr[i]; @@ -209,12 +205,8 @@ void AllReduce(const framework::Variable &src, framework::Variable *dst, AllReduce(src.Get(), tmp_dst.GetMutable(), strategy, stream, comm); -// stream must synchronize to ensure accuracy of the move operation -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + // stream must synchronize to ensure accuracy of the move operation + platform::GpuStreamSync(stream); *dst = std::move(tmp_dst); } #endif diff --git a/paddle/fluid/imperative/bkcl_context.cc b/paddle/fluid/imperative/bkcl_context.cc index 8c6b840f60a59..6569929d6f5d7 100644 --- a/paddle/fluid/imperative/bkcl_context.cc +++ b/paddle/fluid/imperative/bkcl_context.cc @@ -150,6 +150,23 @@ void BKCLParallelContext::AllReduceByStream(const framework::Variable &src, } } +void BKCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { + VLOG(3) << "/// DEBUG /// start inter broadcast with ring_id: " << ring_id; + framework::Tensor *src_tensor = src->GetMutable(); + const auto &place = src_tensor->place(); + platform::BKCLComm *comm = + platform::BKCLCommContext::Instance().Get(ring_id, place); + XPUStream stream = comm->stream(); + + void *src_ptr = src_tensor->data(); + auto data_type = platform::ToBKCLDataType(src_tensor->type()); + + PADDLE_ENFORCE_EQ(bkcl_broadcast(comm->comm(), src_ptr, src_ptr, + src_tensor->numel(), data_type, 0, stream), + BKCL_SUCCESS, + platform::errors::Unavailable("bkcl_broadcast failed")); +} + paddle::platform::DeviceContext *BKCLParallelContext::GetDeviceContext( int ring_id) { return static_cast( diff --git a/paddle/fluid/imperative/bkcl_context.h b/paddle/fluid/imperative/bkcl_context.h index 652b7689666c6..a5a10b19389c0 100644 --- a/paddle/fluid/imperative/bkcl_context.h +++ b/paddle/fluid/imperative/bkcl_context.h @@ -42,6 +42,8 @@ class BKCLParallelContext : public ParallelContext { framework::Variable* dst, int ring_id, bool use_calc_stream) override; + void Broadcast(framework::Variable* src, int ring_id) override; + paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override; void WaitCompute(int ring_id) override; diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index ef1bf0d158787..1eaf0c6538043 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -37,7 +37,7 @@ void GLOOParallelContext::Init() { gloo_wrapper->SetSize(strategy_.nranks_); gloo_wrapper->SetRank(strategy_.local_rank_); gloo_wrapper->SetPrefix(""); - gloo_wrapper->SetIface("lo"); + gloo_wrapper->SetIface(""); auto addr = paddle::string::Split(strategy_.trainer_endpoints_[0], ':'); VLOG(4) << "Server is" << strategy_.trainer_endpoints_[0]; std::string host = addr[0]; @@ -176,6 +176,11 @@ void GLOOParallelContext::AllReduce(const framework::SelectedRows &src, } } +void GLOOParallelContext::Broadcast(framework::Variable *src, int ring_id) { + PADDLE_THROW(platform::errors::Unimplemented( + "Unimplemented inter-broadcast for CPU now.")); +} + paddle::platform::DeviceContext 
*GLOOParallelContext::GetDeviceContext( int ring_id) { // return the CPUDeviceContext diff --git a/paddle/fluid/imperative/gloo_context.h b/paddle/fluid/imperative/gloo_context.h index 305a75a881153..e7c9ba4cfddb6 100644 --- a/paddle/fluid/imperative/gloo_context.h +++ b/paddle/fluid/imperative/gloo_context.h @@ -47,6 +47,8 @@ class GLOOParallelContext : public ParallelContext { framework::Variable* dst, int ring_id, bool use_calc_stream) override; + void Broadcast(framework::Variable* src, int ring_id) override; + paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override; void WaitCompute(int ring_id) override; diff --git a/paddle/fluid/imperative/hccl_context.cc b/paddle/fluid/imperative/hccl_context.cc index 4f1135fa9ddd4..55c52ae6c11de 100644 --- a/paddle/fluid/imperative/hccl_context.cc +++ b/paddle/fluid/imperative/hccl_context.cc @@ -158,6 +158,29 @@ void HCCLParallelContext::AllReduceByStream(const framework::Variable &src, } } +void HCCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { + VLOG(3) << "/// DEBUG /// start inter broadcast with ring_id: " << ring_id; + if (src->IsType()) { + framework::Tensor *src_tensor = src->GetMutable(); + const auto &place = src_tensor->place(); + platform::HCCLComm *comm = + platform::HCCLCommContext::Instance().Get(ring_id, place); + aclrtStream stream = comm->stream(); + + void *src_ptr = + reinterpret_cast(const_cast(src_tensor->data())); + auto hccl_dtype = platform::ToHCCLDataType(src_tensor->type()); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + src_ptr, src_tensor->numel(), hccl_dtype, 0, comm->comm(), + reinterpret_cast(stream))); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported variable type %s for imperative allreduce, only " + "LoDTensor is supported.", + platform::demangle(framework::ToTypeName(src->Type())))); + } +} + paddle::platform::DeviceContext *HCCLParallelContext::GetDeviceContext( int ring_id) { return static_cast( diff --git a/paddle/fluid/imperative/hccl_context.h b/paddle/fluid/imperative/hccl_context.h index b7f22f3a0b0f1..e5f58dea9fb06 100644 --- a/paddle/fluid/imperative/hccl_context.h +++ b/paddle/fluid/imperative/hccl_context.h @@ -50,6 +50,8 @@ class HCCLParallelContext : public ParallelContext { framework::Variable* dst, int ring_id, bool use_calc_stream) override; + void Broadcast(framework::Variable* src, int ring_id) override; + paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override; void WaitCompute(int ring_id) override; diff --git a/paddle/fluid/imperative/heter_ccl_context.cc b/paddle/fluid/imperative/heter_ccl_context.cc new file mode 100644 index 0000000000000..896f29fdd0c25 --- /dev/null +++ b/paddle/fluid/imperative/heter_ccl_context.cc @@ -0,0 +1,205 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
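The SelectedRows branch of AllReduce in the all_reduce.cc hunk above first allgathers every rank's row count, then either allgathers rows and values directly (the fast path noted in the comment, where allgather replaces broadcast) or broadcasts each rank's rows and values into the proper offset of the merged result. The single-process toy below only sketches the merge that ends up on every rank; it is not the NCCL-based implementation.

#include <iostream>
#include <numeric>
#include <vector>

int main() {
  // Row ids contributed by three "ranks" of a toy SelectedRows allreduce.
  std::vector<std::vector<int>> rank_rows = {{0, 2}, {5}, {1, 3, 7}};

  // Step 1: allgather the per-rank row counts so every rank can size the output.
  std::vector<std::size_t> rows_num;
  for (const auto& rows : rank_rows) rows_num.push_back(rows.size());

  // Step 2: place each rank's rows at its offset in the merged row list.
  std::vector<int> dst_rows;
  dst_rows.reserve(std::accumulate(rows_num.begin(), rows_num.end(), std::size_t{0}));
  for (const auto& rows : rank_rows) {
    dst_rows.insert(dst_rows.end(), rows.begin(), rows.end());
  }

  std::cout << "merged row count: " << dst_rows.size() << "\n";  // 6
  return 0;
}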
+ +#include "paddle/fluid/imperative/heter_ccl_context.h" + +// NCCL first +#ifdef PADDLE_WITH_NCCL +#include "paddle/fluid/imperative/all_reduce.h" +#endif + +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace framework { +class Variable; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace imperative { + +HeterParallelContext::HeterParallelContext(const ParallelStrategy &strategy, + const int &device_id) +#ifdef PADDLE_WITH_NCCL + : ParallelContext(strategy, platform::CUDAPlace(device_id)) +#elif PADDLE_WITH_XPU_BKCL + : ParallelContext(strategy, platform::XPUPlace(device_id)) +#elif PADDLE_WITH_ASCEND_CL + : ParallelContext(strategy, platform::NPUPlace(device_id)) +#else + : ParallelContext(strategy, platform::CPUPlace()) +#endif +{ + // construct node_strategy_ from global strategy by selecting the + // endpoints with same ip address. + std::string node_ip = strategy_.current_endpoint_.substr( + 0, strategy_.current_endpoint_.find(':')); + int node_nranks = 0; + int inter_rank = -1; + + std::vector all_eps = strategy_.trainer_endpoints_; + std::vector inter_endpoints; + std::set nodes_ips; + for (auto ep : all_eps) { + std::string ip = ep.substr(0, ep.find(':')); + // record ip of different nodes + if (nodes_ips.find(ip) == nodes_ips.end()) { + if (ep == strategy_.current_endpoint_) { + inter_rank = nodes_ips.size(); + } + inter_endpoints.push_back(ep); + nodes_ips.emplace(ip); + } + + if (ip == node_ip) { + if (ep == strategy_.current_endpoint_) { + node_strategy_.local_rank_ = node_nranks; + } + node_nranks++; + node_strategy_.trainer_endpoints_.push_back(ep); + } + } + + VLOG(0) << "init node size " << node_nranks << " rank " + << node_strategy_.local_rank_; + + PADDLE_ENFORCE_NE(node_nranks, 0, + platform::errors::InvalidArgument( + "The number of local nranks should not be zero.")); + node_strategy_.nranks_ = node_nranks; + node_strategy_.current_endpoint_ = strategy_.current_endpoint_; + + if (inter_rank >= 0 && inter_endpoints.size() > 1) { + inter_strategy_.nranks_ = inter_endpoints.size(); + inter_strategy_.local_rank_ = inter_rank; + inter_strategy_.current_endpoint_ = strategy_.current_endpoint_; + inter_strategy_.trainer_endpoints_ = inter_endpoints; +#ifdef PADDLE_WITH_GLOO + inter_parallel_ctx_ = std::make_shared( + inter_strategy_, platform::CPUPlace()); +#endif + } + + VLOG(0) << "init inter size " << inter_endpoints.size() << " rank " + << inter_rank; + +#ifdef PADDLE_WITH_NCCL + node_place_ = platform::CUDAPlace(device_id); + node_parallel_ctx_ = + std::make_shared(node_strategy_, node_place_); +#endif +#ifdef PADDLE_WITH_XPU_BKCL + node_place_ = platform::XPUPlace(device_id); + node_parallel_ctx_ = + std::make_shared(node_strategy_, node_place_); +#endif +#ifdef PADDLE_WITH_ASCEND_CL + node_place_ = platform::NPUPlace(device_id); + node_parallel_ctx_ = + std::make_shared(node_strategy_, node_place_); +#endif +} + +void HeterParallelContext::Init() { + PADDLE_ENFORCE_NE( + node_parallel_ctx_, nullptr, + platform::errors::Unavailable( + "The heter parallel context has not been initialized.")); + + if (inter_parallel_ctx_ != nullptr) { + inter_parallel_ctx_->Init(); + } + + node_parallel_ctx_->Init(); + + VLOG(3) << "/// 
DEBUG /// heter parallel env init done..." << std::endl; +} + +void HeterParallelContext::InitWithRingID(int ring_id) { + PADDLE_THROW(platform::errors::Unimplemented( + "Unimplemented InitWithRingID from heter ctx.")); +} + +void HeterParallelContext::AllReduceByStream(const framework::Variable &src, + framework::Variable *dst, + int ring_id, + bool use_calc_stream) { + // step 1: call reduce within node + VLOG(3) << "/// DEBUG /// step 1: reduce in node... "; + node_parallel_ctx_->AllReduceByStream(src, dst, ring_id, false); + node_parallel_ctx_->WaitComm(ring_id); + + // step 2: call allreduce between nodes with gloo + if (inter_parallel_ctx_ != nullptr) { + // copy src to cpu + // dst is now the src + auto src_tensor = dst->Get(); + framework::Variable src_cpu; + auto src_cpu_tensor = src_cpu.GetMutable(); + framework::TensorCopySync(src_tensor, platform::CPUPlace(), src_cpu_tensor); + + // allreduce src/cpu to dst/cpu + framework::Variable dst_cpu; + inter_parallel_ctx_->AllReduceByStream(src_cpu, &dst_cpu, ring_id, false); + inter_parallel_ctx_->WaitComm(ring_id); + + // copy dst/cpu to dst + auto dst_cpu_tensor = dst_cpu.Get(); + auto dst_tensor = dst->GetMutable(); + framework::TensorCopySync(dst_cpu_tensor, dst_tensor->place(), dst_tensor); + + inter_parallel_ctx_->WaitComm(ring_id); + } + + // step 3: call broadcast within node + VLOG(3) << "/// DEBUG /// step 3: broadcast within node... "; + node_parallel_ctx_->WaitComm(ring_id); + node_parallel_ctx_->Broadcast(dst, ring_id); + node_parallel_ctx_->WaitComm(ring_id); +} + +void HeterParallelContext::Broadcast(framework::Variable *src, int ring_id) { + PADDLE_THROW(platform::errors::Unimplemented("Unimplemented function.")); +} + +paddle::platform::DeviceContext *HeterParallelContext::GetDeviceContext( + int ring_id) { + // directly call the implementation of target parallel ctx. + return node_parallel_ctx_->GetDeviceContext(ring_id); +} + +void HeterParallelContext::WaitCompute(int ring_id) { + // directly call the implementation of target parallel ctx. + node_parallel_ctx_->WaitCompute(ring_id); +} + +void HeterParallelContext::WaitComm(int ring_id) { + // directly call the implementation of target parallel ctx. + node_parallel_ctx_->WaitComm(ring_id); +} + +void HeterParallelContext::SynchronizeCompute() { + // directly call the implementation of target parallel ctx. + node_parallel_ctx_->SynchronizeCompute(); +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/heter_ccl_context.h b/paddle/fluid/imperative/heter_ccl_context.h new file mode 100644 index 0000000000000..8ea5e85603ab5 --- /dev/null +++ b/paddle/fluid/imperative/heter_ccl_context.h @@ -0,0 +1,78 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
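HeterParallelContext::AllReduceByStream above builds the result in three stages: an allreduce inside each node, a CPU copy that is allreduced across nodes through the Gloo-based inter context (held only by the ranks that lead a node), and a broadcast of the combined tensor back to every rank in the node. The toy program below runs the same three stages on plain integers to show why every rank ends up with the global sum; it is an illustration, not the Paddle context classes.

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  // values[node][rank_in_node] for 2 toy nodes with 2 ranks each
  std::vector<std::vector<int>> values = {{1, 2}, {3, 4}};

  // Step 1: allreduce (sum) within each node.
  std::vector<int> node_sum(values.size(), 0);
  for (std::size_t n = 0; n < values.size(); ++n) {
    for (int v : values[n]) node_sum[n] += v;
  }

  // Step 2: allreduce across nodes (the Gloo/CPU stage between node leaders).
  int global_sum = 0;
  for (int s : node_sum) global_sum += s;

  // Step 3: broadcast the combined result back to every rank in each node.
  for (auto& node : values) {
    for (int& v : node) v = global_sum;
  }

  std::cout << "every rank now holds " << values[0][0] << "\n";  // 10
  return 0;
}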
+#pragma once + +#include +#include +#include + +#ifdef PADDLE_WITH_NCCL +#include "paddle/fluid/imperative/nccl_context.h" +#endif + +#ifdef PADDLE_WITH_XPU_BKCL +#include "paddle/fluid/imperative/bkcl_context.h" +#endif + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/imperative/hccl_context.h" +#endif + +#include "paddle/fluid/imperative/gloo_context.h" +#include "paddle/fluid/imperative/parallel_context.h" + +namespace paddle { +namespace framework { +class Variable; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace imperative { + +class HeterParallelContext : public ParallelContext { + public: + explicit HeterParallelContext(const ParallelStrategy& strategy, + const int& device_id); + + ~HeterParallelContext() override = default; + + void Init() override; + + void InitWithRingID(int ring_id) override; + + void AllReduceByStream(const framework::Variable& src, + framework::Variable* dst, int ring_id, + bool use_calc_stream) override; + + void Broadcast(framework::Variable* src, int ring_id) override; + + paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override; + + void WaitCompute(int ring_id) override; + + void WaitComm(int ring_id) override; + + void SynchronizeCompute() override; + + private: + ParallelStrategy inter_strategy_; + ParallelStrategy node_strategy_; + platform::Place node_place_; + std::shared_ptr node_parallel_ctx_{nullptr}; + std::shared_ptr inter_parallel_ctx_{nullptr}; +}; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index ec5fb63f0d933..892c864027d11 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -281,16 +281,6 @@ class VarBase { static ThreadSafeNameSet name_set_; }; -class Layer { - public: - virtual ~Layer() {} - - virtual std::vector> Forward( - const std::vector>& inputs) { - return {}; - } -}; - std::shared_ptr CreateGradOpNode( const framework::OperatorBase& op, const NameVarBaseMap& ins, const NameVarBaseMap& outs, const framework::AttributeMap& attrs, diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index 32becda4edc95..15146f6c1204e 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -20,6 +20,14 @@ #include "paddle/fluid/platform/gen_comm_id_helper.h" #endif +#ifdef PADDLE_WITH_NCCL +#include +#include "paddle/fluid/platform/dynload/nccl.h" +#endif + +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" @@ -127,6 +135,20 @@ void NCCLParallelContext::AllReduceByStream(const framework::Variable &src, AllReduce(src, dst, strategy_, ring_id, use_calc_stream); } +void NCCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { + VLOG(3) << "/// DEBUG /// start inter broadcast with ring_id: " << ring_id; + framework::Tensor *src_tensor = src->GetMutable(); + const auto &place = src_tensor->place(); + platform::NCCLComm *comm = + platform::NCCLCommContext::Instance().Get(ring_id, place); + gpuStream_t stream = comm->stream(); + + void *src_ptr = src_tensor->data(); + auto nccl_dtype = platform::ToNCCLDataType(src_tensor->type()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( + src_ptr, src_tensor->numel(), nccl_dtype, 0, comm->comm(), stream)); +} + paddle::platform::DeviceContext 
*NCCLParallelContext::GetDeviceContext( int ring_id) { return static_cast( @@ -153,11 +175,11 @@ void NCCLParallelContext::WaitCompute(int ring_id) { // compute_stream-->event-->comm_stream #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); #endif } @@ -179,11 +201,11 @@ void NCCLParallelContext::WaitComm(int ring_id) { // comm_stream-->event-->compute_stream #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, comm_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, comm_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, comm_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, comm_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); #endif } diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h index 1eee393aa714b..bb5b8ea32df4f 100644 --- a/paddle/fluid/imperative/nccl_context.h +++ b/paddle/fluid/imperative/nccl_context.h @@ -18,7 +18,7 @@ #include #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/cuda_resource_pool.h" +#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #endif #ifdef PADDLE_WITH_NCCL @@ -60,6 +60,8 @@ class NCCLParallelContext : public ParallelContext { framework::Variable* dst, int ring_id, bool use_calc_stream) override; + void Broadcast(framework::Variable* src, int ring_id) override; + paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override; void WaitCompute(int ring_id) override; diff --git a/paddle/fluid/imperative/parallel_context.h b/paddle/fluid/imperative/parallel_context.h index f537a316014d6..8bdfccc144243 100644 --- a/paddle/fluid/imperative/parallel_context.h +++ b/paddle/fluid/imperative/parallel_context.h @@ -56,6 +56,8 @@ class ParallelContext { framework::Variable* dst, int ring_id, bool use_calc_stream) = 0; + virtual void Broadcast(framework::Variable* src, int ring_id) = 0; + virtual paddle::platform::DeviceContext* GetDeviceContext(int ring_id) = 0; // comm_stream[ring_id] wait compute_stream. diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 604f9d2be9e48..8875ef74bce14 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -24,6 +24,8 @@ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" + DECLARE_bool(check_nan_inf); DECLARE_bool(run_pten_kernel); DECLARE_bool(benchmark); @@ -299,44 +301,28 @@ static void BuildDygraphPtenKernelContext( size_t start_idx = (i == 0 ? 
0 : kernel_ctx->InputRangeAt(i - 1).second); size_t end_idx = start_idx + ins_vector.size(); - - // The current size of input/output in pt_kernel_context_ is at least equal - // the start_idx. For the reason of reusing the allocted of inputs or - // outputs in pt_kernel_context_, the current size of input/output can be - // greater then the index of which the tensort wanted to set to, so it will - // use ReMakePtenDenseTensorFromVar to make pten tensor. - if (kernel_ctx->InputsSize() == start_idx) { - paddle::SmallVector> tmp_inputs; - for (const auto& var : ins_vector) { - const auto& variable = var->Var(); - tmp_inputs.emplace_back( - experimental::MakePtenTensorBaseFromVar(variable, in_def)); - } - kernel_ctx->EmplaceBackInputs(std::move(tmp_inputs)); - } else if (kernel_ctx->InputsSize() > start_idx) { - size_t input_size = kernel_ctx->InputsSize(); - for (size_t j = 0; j < ins_vector.size(); ++j) { - if (input_size > start_idx + j) { + auto current_vector_size = kernel_ctx->InputsSize(); + + // If the memory needed is less than the current memory allocated, we will + // reuse the current memory by using ReMakePtenDenseTensorFromVar. + // Otherwise,we will create new storage. + for (size_t offset = 0; offset < ins_vector.size(); ++offset) { + const auto& variable = ins_vector[offset]->Var(); + if (current_vector_size > start_idx + offset) { + auto& input_ptr = kernel_ctx->MutableInputPtrAt(start_idx + offset); + if (input_ptr == nullptr) { + input_ptr = experimental::MakePtenTensorBaseFromVar(variable, in_def); + } else { experimental::ReMakePtenDenseTensorFromVar( - ins_vector[j]->Var(), in_def, - kernel_ctx->MutableInputAt(start_idx + j)); - // TODO(chentianyu03): When multi input kernel, open this code - /* - } else { - kernel_ctx->EmplaceBackInputWithoutSetRange( - experimental::MakePtenTensorBaseFromVar(ins_vector[j]->Var(), - in_def)); - */ + variable, in_def, kernel_ctx->MutableInputAt( + start_idx + offset)); } + } else { + kernel_ctx->EmplaceBackInputWithoutSetRange( + experimental::MakePtenTensorBaseFromVar(variable, in_def)); } - kernel_ctx->MutableInputRangeAt(i) = std::make_pair(start_idx, end_idx); - } else { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Error start index when trying to set new tensor to inputs, start " - "index is `%d`, but current pt_kernel_context_.inputs.size() is " - "`%d`.", - start_idx, kernel_ctx->InputsSize())); } + kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); } for (size_t i = 0; i < output_names.size(); ++i) { @@ -345,44 +331,22 @@ static void BuildDygraphPtenKernelContext( size_t start_idx = (i == 0 ? 0 : kernel_ctx->OutputRangeAt(i - 1).second); size_t end_idx = start_idx + outs_vector.size(); - - // The current size of input/output in pt_kernel_context_ is at least equal - // the start_idx. For the reason of reusing the allocted of inputs or - // outputs in pt_kernel_context_, the current size of input/output can be - // greater then the index of which the tensort wanted to set to, so it will - // use ReMakePtenDenseTensorFromVar to make pten tensor. 
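The rewritten loops in this hunk replace the old three-way branch with a single reuse-or-create rule: kernel-context slots that already exist are refilled in place, and only offsets past the current size get fresh storage. A minimal sketch of that rule, using a hypothetical `Slot` type and `FillSlots` helper rather than Paddle's tensor classes:

```cpp
#include <cstddef>
#include <memory>
#include <vector>

struct Slot { int payload = 0; };  // stand-in for a pten dense tensor

// Refill slots that already exist in the context; append new storage only
// for offsets beyond the current size (the reuse-or-create rule above).
void FillSlots(std::vector<std::unique_ptr<Slot>>* ctx_slots,
               const std::vector<int>& payloads, std::size_t start_idx) {
  const std::size_t current_size = ctx_slots->size();
  for (std::size_t offset = 0; offset < payloads.size(); ++offset) {
    if (current_size > start_idx + offset) {
      (*ctx_slots)[start_idx + offset]->payload = payloads[offset];  // reuse
    } else {
      auto slot = std::make_unique<Slot>();
      slot->payload = payloads[offset];
      ctx_slots->push_back(std::move(slot));                         // create
    }
  }
}
```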
- if (kernel_ctx->OutputsSize() == start_idx) { - paddle::SmallVector> tmp_outputs; - for (auto& var : outs_vector) { - auto* variable = var->MutableVar(); - tmp_outputs.emplace_back( - experimental::MakePtenTensorBaseFromVar(variable, out_def)); - } - kernel_ctx->EmplaceBackOutputs(std::move(tmp_outputs)); - } else if (kernel_ctx->OutputsSize() > start_idx) { - size_t output_size = kernel_ctx->OutputsSize(); - for (size_t j = 0; j < outs_vector.size(); ++j) { - if (output_size > i + j) { - experimental::ReMakePtenDenseTensorFromVar( - outs_vector[j]->MutableVar(), out_def, - kernel_ctx->MutableOutputAt(i + j)); - // TODO(chentianyu03): When multi output kernel, open this code - /* - } else { - kernel_ctx->EmplaceBackOutputWithoutSetRange( - experimental::MakePtenTensorBaseFromVar( - outs_vector[j]->MutableVar(), out_def)); - */ - } + auto current_vector_size = kernel_ctx->OutputsSize(); + // If the memory needed is less than the current memory allocated, we will + // reuse the current memory by using ReMakePtenDenseTensorFromVar. + // Otherwise,we will create new storage. + for (size_t offset = 0; offset < outs_vector.size(); ++offset) { + if (current_vector_size > start_idx + offset) { + experimental::ReMakePtenDenseTensorFromVar( + outs_vector[offset]->MutableVar(), out_def, + kernel_ctx->MutableOutputAt(start_idx + offset)); + } else { + kernel_ctx->EmplaceBackOutputWithoutSetRange( + experimental::MakePtenTensorBaseFromVar( + outs_vector[offset]->MutableVar(), out_def)); } - kernel_ctx->MutableOutputRangeAt(i) = std::make_pair(start_idx, end_idx); - } else { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Error start index when trying to set new tensor to inputs, start " - "index is `%d`, but current pt_kernel_context_.outputs.size() is " - "`%d`.", - start_idx, kernel_ctx->OutputsSize())); } + kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } for (size_t i = 0; i < attr_names.size(); ++i) { @@ -561,12 +525,8 @@ static void PreparedOpRunPtImpl( if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError()); - VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; -#endif -#if defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError()); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; #endif } diff --git a/paddle/fluid/imperative/py_layer_fwd.h b/paddle/fluid/imperative/py_layer_fwd.h index 1baf73ab3b95d..159371970dcac 100644 --- a/paddle/fluid/imperative/py_layer_fwd.h +++ b/paddle/fluid/imperative/py_layer_fwd.h @@ -101,6 +101,28 @@ py::object PyLayerApply(const platform::Place& place, const py::handle& cls, "`%s` type argument can not be cast into `Tensor`.", ptr->ptr()->ob_type->tp_name)); } + } else if (py::isinstance(*ptr) || + py::isinstance(*ptr)) { + try { + auto tuple_arg = ptr->cast(); + for (auto iter = tuple_arg.begin(); iter != tuple_arg.end(); ++iter) { + try { + auto t = iter->cast>(); + input_vars.push_back(t); + } catch (py::cast_error& err) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function contains invalid argument, " + "the " + "`%s` type argument can not be cast into `Tensor`.", + ptr->ptr()->ob_type->tp_name)); + } + } + } catch (py::cast_error& err) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function contains invalid argument, 
the " + "`%s` type argument can not be cast into `Tensor`.", + ptr->ptr()->ob_type->tp_name)); + } } } } @@ -119,6 +141,28 @@ py::object PyLayerApply(const platform::Place& place, const py::handle& cls, "`%s` type argument can not be cast into `Tensor`.", ptr->second.ptr()->ob_type->tp_name)); } + } else if (py::isinstance(*ptr->second) || + py::isinstance(*ptr->second)) { + try { + auto tuple_arg = ptr->second.cast(); + for (auto iter = tuple_arg.begin(); iter != tuple_arg.end(); ++iter) { + try { + auto t = iter->cast>(); + input_vars.push_back(t); + } catch (py::cast_error& err) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function contains invalid argument, " + "the " + "`%s` type argument can not be cast into `Tensor`.", + ptr->second.ptr()->ob_type->tp_name)); + } + } + } catch (py::cast_error& err) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function contains invalid argument, the " + "`%s` type argument can not be cast into `Tensor`.", + ptr->second.ptr()->ob_type->tp_name)); + } } } } @@ -182,6 +226,15 @@ py::object PyLayerApply(const platform::Place& place, const py::handle& cls, } } if (if_inplace) { + // when pylayer forward is inplace strategy, check whether tensor is leaf + for (auto& t : input_vars) { + PADDLE_ENFORCE_EQ(t->IsLeaf() && !t->OverridedStopGradient(), false, + platform::errors::InvalidArgument( + "Leaf Var (%s) that doesn't stop gradient can't " + "use inplace strategy.", + t->Name())); + } + inplace_map["X"] = "Out"; } diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 2f023f644fd06..068de4f0435bb 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -27,8 +27,9 @@ namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ + defined(PADDLE_WITH_ASCEND_CL) // div the nranks void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { framework::Tensor *tensor = @@ -41,6 +42,9 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) DivNRanks(tensor, nranks, context); #endif + } else if (platform::is_npu_place(tensor->place())) { + // TODO(kuizhiqing) + VLOG(4) << "divnrank for npu not support yet"; } else if (platform::is_cpu_place(tensor->place())) { VLOG(4) << "before div 2" << *tensor; VLOG(4) << "NDiv for cpu devices : rank = " << nranks; @@ -207,6 +211,70 @@ void SplitTensorsWithType( } #endif +// NOTE(liubo48): Only implement operators::math::SplitFunctor for npu now. +// If later the operators::StridedMemcpyWithAxis0 is supported, +// then this specific SplitTensorsForAllReduce can be removed. 
+#ifdef PADDLE_WITH_ASCEND_CL +template <> +void SplitTensorsForAllReduce( + const platform::NPUDeviceContext &context, + framework::Variable *p_dense_contents, + std::vector *p_dense_tensors) { + auto *in = p_dense_contents->GetMutable(); + std::vector outs; + std::vector shape_refer; + + outs.reserve(p_dense_tensors->size()); + shape_refer.reserve(p_dense_tensors->size()); + + for (auto &tensor : *p_dense_tensors) { + outs.emplace_back(&tensor); + shape_refer.emplace_back(&tensor); + } + operators::math::SplitFunctor + split_functor_; + split_functor_(context, *in, shape_refer, 0, &outs); +} + +template <> +void ConcatTensorsWithType( + const platform::NPUDeviceContext &context, + const std::vector &dense_tensors_, + framework::Variable *p_dense_contents, + framework::proto::VarType::Type type) { + switch (type) { + case framework::proto::VarType::FP32: + ConcatTensorsForAllReduce( + context, dense_tensors_, p_dense_contents); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it concats tensors for " + "allreduce.", + framework::DataTypeToString(type))); + } +} + +template <> +void SplitTensorsWithType( + const platform::NPUDeviceContext &context, + framework::Variable *p_dense_contents, + std::vector *p_dense_tensors, + framework::proto::VarType::Type type) { + switch (type) { + case framework::proto::VarType::FP32: + SplitTensorsForAllReduce( + context, p_dense_contents, p_dense_tensors); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it splits tensors for " + "allreduce.", + framework::DataTypeToString(type))); + } +} +#endif + void Group::ConcatTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { @@ -831,7 +899,7 @@ void Reducer::MarkGroupReady(size_t group_index) { } }); #elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \ - defined(PADDLE_WITH_GLOO) + defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) FusedAllReduceSchedule(run_order, group, next_group_); #else PADDLE_THROW(platform::errors::PreconditionNotMet( @@ -1014,7 +1082,7 @@ void Reducer::FinalizeBackward() { if (find_unused_vars_each_step_) { // TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_GLOO) + defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) ProcessUnusedDenseVars(); #endif // Initialize local used vars diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index b5a7dd149f09f..3c03babc52cbe 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -48,8 +48,9 @@ class VariableWrapper; namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ + defined(PADDLE_WITH_ASCEND_CL) template struct DivNRanksFunctor { diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index adb560df77c78..32e982f1f15ca 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -1,8 +1,10 @@ if(WIN32) cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS device_context) else() - if (WITH_NCCL OR WITH_RCCL) + if 
(WITH_GLOO AND (WITH_NCCL OR WITH_RCCL)) cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context) + cc_test(heter_ccl_context_test SRCS heter_ccl_context_test.cc DEPS heter_ccl_context nccl_context imperative_gloo_context gloo_context gloo_wrapper gloo fs shell) + #set_tests_properties(heter_ccl_context_test PROPERTIES LABELS "RUN_TYPE=DIST") endif() if (WITH_XPU_BKCL) cc_test(bkcl_context_test SRCS bkcl_context_test.cc DEPS bkcl_context) diff --git a/paddle/fluid/imperative/tests/heter_ccl_context_test.cc b/paddle/fluid/imperative/tests/heter_ccl_context_test.cc new file mode 100644 index 0000000000000..d36743510e5ba --- /dev/null +++ b/paddle/fluid/imperative/tests/heter_ccl_context_test.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include // NOLINT + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/imperative/heter_ccl_context.h" + +#include "gtest/gtest.h" + +namespace imperative = paddle::imperative; +namespace platform = paddle::platform; +namespace framework = paddle::framework; + +imperative::ParallelStrategy GetStrategy(int local_rank) { + std::vector eps = {"127.0.0.1:37580", "127.0.0.1:37581"}; + imperative::ParallelStrategy strategy; + strategy.trainer_endpoints_ = eps; + strategy.current_endpoint_ = eps[local_rank]; + strategy.nranks_ = eps.size(); + strategy.local_rank_ = local_rank; + return strategy; +} + +#ifdef PADDLE_WITH_NCCL +void AllReduceByStream(int local_rank, int device_id) { + int data_size = 32; + const auto& place = platform::CUDAPlace(device_id); + platform::CUDADeviceContext ctx(place); + + // heter_parallel_ctx + imperative::HeterParallelContext hpc(GetStrategy(local_rank), device_id); + + // init + hpc.Init(); + + // input and output data + framework::Variable* src_dev_var(new framework::Variable()); + auto* src_dev_tensor = src_dev_var->GetMutable(); + src_dev_tensor->mutable_data(framework::make_ddim({data_size}), place); + + std::vector src_vec; + for (int i = 0; i < data_size; i++) { + src_vec.push_back(1.0 + local_rank); + } + framework::TensorFromVector(src_vec, ctx, src_dev_tensor); + ctx.Wait(); + + framework::Variable* dst_dev_var(new framework::Variable()); + auto* dst_dev_tensor = dst_dev_var->GetMutable(); + dst_dev_tensor->mutable_data(framework::make_ddim({data_size}), place); + + // call allreduce + hpc.AllReduceByStream(*src_dev_var, dst_dev_var, 0, false); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + + // check result + std::vector dst_vec; + framework::TensorToVector(*dst_dev_tensor, ctx, &dst_vec); + ctx.Wait(); + + EXPECT_EQ(dst_vec.size(), src_vec.size()); + for (int i = 0; i < data_size; i++) { + EXPECT_EQ(dst_vec[i], 3.0); + } +} + +TEST(AllReduceByStream, Run) { + if (platform::GetGPUDeviceCount() >= 2) { + std::thread t0(AllReduceByStream, 0, 0); + std::thread t1(AllReduceByStream, 1, 1); + t0.join(); + t1.join(); + } +} 
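The expected value asserted in the test above comes straight from the fill pattern: rank r writes `1.0 + r` into every element, so a two-rank sum-allreduce yields `1.0 + 2.0 = 3.0` in every slot. A tiny standalone sketch of that arithmetic (hypothetical helper, not part of the test):

```cpp
#include <cassert>

// Each rank contributes (1.0 + rank), so the reduced value is
// nranks + nranks * (nranks - 1) / 2 in every element.
float ExpectedAllReduceValue(int nranks) {
  float expected = 0.0f;
  for (int rank = 0; rank < nranks; ++rank) {
    expected += 1.0f + static_cast<float>(rank);
  }
  return expected;
}

int main() {
  assert(ExpectedAllReduceValue(2) == 3.0f);  // matches EXPECT_EQ(dst_vec[i], 3.0)
  return 0;
}
```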
+#endif diff --git a/paddle/fluid/imperative/tests/nccl_context_test.cc b/paddle/fluid/imperative/tests/nccl_context_test.cc index 2d8a08217b0b8..401e4e324eb89 100644 --- a/paddle/fluid/imperative/tests/nccl_context_test.cc +++ b/paddle/fluid/imperative/tests/nccl_context_test.cc @@ -14,6 +14,8 @@ #include // NOLINT +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/nccl_context.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" @@ -21,6 +23,7 @@ namespace imperative = paddle::imperative; namespace platform = paddle::platform; +namespace framework = paddle::framework; int nrings = 2; imperative::ParallelStrategy GetStrategy(int local_rank) { @@ -68,4 +71,51 @@ TEST(BcastNCCLId, Run) { NCCL_UNIQUE_ID_BYTES)); } } + +void Broadcast(int local_rank, int device_id) { + int data_size = 4; + float test_data = 7; + const auto& place = platform::CUDAPlace(device_id); + platform::CUDADeviceContext ctx(place); + + imperative::NCCLParallelContext npc(GetStrategy(local_rank), place); + + // init + npc.Init(); + + framework::Variable* src_dev_var(new framework::Variable()); + auto* src_dev_tensor = src_dev_var->GetMutable(); + src_dev_tensor->mutable_data(framework::make_ddim({data_size}), place); + + // fill data for rank 0 only + std::vector src_vec; + if (local_rank == 0) { + for (int i = 0; i < data_size; i++) { + src_vec.push_back(test_data); + } + framework::TensorFromVector(src_vec, ctx, src_dev_tensor); + } + ctx.Wait(); + + npc.Broadcast(src_dev_var, 0); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + + // check result + std::vector dst_vec; + framework::TensorToVector(*src_dev_tensor, ctx, &dst_vec); + ctx.Wait(); + + for (int i = 0; i < data_size; i++) { + EXPECT_EQ(dst_vec[i], test_data); + } +} + +TEST(Broadcast, Run) { + if (platform::GetGPUDeviceCount() >= 2) { + std::thread t0(Broadcast, 0, 0); + std::thread t1(Broadcast, 1, 1); + t0.join(); + t1.join(); + } +} #endif diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index 9fbbe7d06f8ad..c257191a546e4 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -209,13 +209,23 @@ class VariableWrapper { uint32_t InplaceVersionSnapshot() const { return inplace_version_snapshot_; } - void ResetInplaceVersion() { - auto new_version = var_.CurrentInplaceVersion(); + void ResetInplaceVersion(bool set_to_zero = false) { + if (!set_to_zero) { + auto new_version = var_.CurrentInplaceVersion(); - VLOG(6) << "The wrapper version of VariableWrapper '" << name_ - << "' will be updated from " << inplace_version_snapshot_ << "to " - << new_version; - inplace_version_snapshot_ = new_version; + VLOG(6) << "The wrapper version of VariableWrapper '" << name_ + << "' will be updated from " << inplace_version_snapshot_ << "to " + << new_version; + inplace_version_snapshot_ = new_version; + + } else { + // Reset Snapshot & InplaceVersion to zero + inplace_version_snapshot_ = 0; + auto var = this->MutableVar(); + if (var) { + var->SetInplaceVersionToZero(); + } + } } bool hasCacheKey(const paddle::framework::OpKernelType& key) { diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 2202b94bee727..3fa417c2ea631 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -52,11 +52,11 @@ 
typedef struct { // The traversal order also affect the lifecycles, so different sort_kind is // used. void MemoryOptimizePass::CollectLifeCycle( - std::unordered_map* lifecycles, + Graph* graph, std::unordered_map* lifecycles, int sort_kind) const { - max_lifecycle_ = 0; + int max_lifecycle = 0; for (auto* op_node : framework::ir::TopologyVarientSort( - *graph_, static_cast(sort_kind))) { + *graph, static_cast(sort_kind))) { if (!op_node->IsOp()) continue; auto reads = op_node->inputs; auto writes = op_node->outputs; @@ -77,20 +77,20 @@ void MemoryOptimizePass::CollectLifeCycle( if (node->Var()->Persistable()) continue; std::string var = node->Name(); if (!lifecycles->count(var)) { - (*lifecycles)[var] = std::make_pair(max_lifecycle_, max_lifecycle_); + (*lifecycles)[var] = std::make_pair(max_lifecycle, max_lifecycle); } else { (*lifecycles)[var].second = - std::max(max_lifecycle_, lifecycles->at(var).second); // max() + std::max(max_lifecycle, lifecycles->at(var).second); // max() } } } - ++max_lifecycle_; + ++max_lifecycle; } } void MemoryOptimizePass::CollectVarMemorySize( - space_table_t* space_table) const { + Graph* graph, space_table_t* space_table) const { const int fake_batch_size = 1; auto valid_var = [&](framework::ir::Node* node) -> bool { @@ -130,7 +130,7 @@ void MemoryOptimizePass::CollectVarMemorySize( // although it's not always the case. so black list is the best compromise // between performance and underlying principle. std::unordered_set black_list; - for (auto* node : graph_->Nodes()) { + for (auto* node : graph->Nodes()) { if (node->IsVar() && node->Var()->GetType() == framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) { @@ -141,7 +141,7 @@ void MemoryOptimizePass::CollectVarMemorySize( } // Collect tensors from graph. - for (auto* node : graph_->Nodes()) { + for (auto* node : graph->Nodes()) { if (node->IsVar() && node->Var()->GetType() == framework::proto::VarType::Type::VarType_Type_LOD_TENSOR && @@ -304,7 +304,10 @@ void MemoryOptimizePass::RunImpl(Argument* argument) { // 3. Perform reuse plan: Replace all var's name in the model according to the // mapping table. if (!argument->enable_memory_optim()) return; - graph_ = argument->main_graph_ptr(); + // Because of pass is a singleton, graph can not be member + // variables,otherwise,errors will be caused under multithreading + // conditions. 
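The comment below is the core of this change: the pass object is a singleton, so per-run state kept in mutable members (the removed `graph_` and `max_lifecycle_`) can race when several predictors run the pass concurrently. A minimal sketch of the pattern the fix adopts, with hypothetical `Graph` and pass names standing in for the real types:

```cpp
#include <string>
#include <unordered_map>
#include <utility>

struct Graph {};  // stand-in for framework::ir::Graph

class MemoryPassSketch {
 public:
  static MemoryPassSketch& Instance() {
    static MemoryPassSketch pass;  // one instance shared by every thread
    return pass;
  }

  // All per-run state (the graph and the lifecycle counter) lives in
  // arguments and locals, so concurrent Run() calls cannot race.
  void Run(Graph* graph) const {
    int max_lifecycle = 0;
    std::unordered_map<std::string, std::pair<int, int>> lifecycles;
    CollectLifeCycle(graph, &lifecycles, &max_lifecycle);
  }

 private:
  void CollectLifeCycle(
      Graph* /*graph*/,
      std::unordered_map<std::string, std::pair<int, int>>* lifecycles,
      int* max_lifecycle) const {
    (*lifecycles)["var"] = {*max_lifecycle, *max_lifecycle};
    ++(*max_lifecycle);
  }
};

int main() {
  Graph g;
  MemoryPassSketch::Instance().Run(&g);  // safe to call from multiple threads
  return 0;
}
```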
+ auto graph = argument->main_graph_ptr(); int sort_kind = 0; std::unordered_map lifecycles; @@ -312,10 +315,10 @@ void MemoryOptimizePass::RunImpl(Argument* argument) { std::unordered_map node2cluster; std::unordered_map cluster_size; - CollectLifeCycle(&lifecycles, sort_kind); - CollectVarMemorySize(&space_table); + CollectLifeCycle(graph, &lifecycles, sort_kind); + CollectVarMemorySize(graph, &space_table); MakeSimpleReusePlan(lifecycles, space_table, &node2cluster, &cluster_size); - UpdateOpDescsByReuse(graph_, node2cluster, sort_kind); + UpdateOpDescsByReuse(graph, node2cluster, sort_kind); return; } diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h index 6d20aee295b7c..57052243d2f18 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h @@ -57,17 +57,15 @@ class MemoryOptimizePass : public AnalysisPass { private: void CollectLifeCycle( + framework::ir::Graph *graph, std::unordered_map *lifecycles, int sort_kind) const; - void CollectVarMemorySize(space_table_t *space_table) const; + void CollectVarMemorySize(framework::ir::Graph *graph, + space_table_t *space_table) const; public: std::string repr() const override; - - private: - mutable framework::ir::Graph *graph_{nullptr}; - mutable int max_lifecycle_{-1}; }; } // namespace analysis diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index ceca7e8146a79..49c4b8d7372e2 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -19,8 +19,8 @@ #include "paddle/fluid/inference/api/paddle_pass_builder.h" #include "paddle/fluid/inference/utils/table_printer.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" #ifdef PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/helper.h" diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index b1408995fa157..2293b70246853 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -41,8 +41,8 @@ #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/pten/api/ext/op_meta_info.h" diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index c0038f6c3f038..d5452f82d08b5 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -19,9 +19,8 @@ PADDLE_ROOT=$1 TURN_ON_MKL=$2 # use MKL or Openblas TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode DATA_DIR=$4 # dataset -TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, default to /usr/local/TensorRT/include -TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib -MSVC_STATIC_CRT=$7 +TENSORRT_ROOT_DIR=$5 # TensorRT root dir, default to /usr +MSVC_STATIC_CRT=$6 inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir WIN_DETECT=$(echo `uname` 
| grep "Win") # detect current platform @@ -39,7 +38,7 @@ else fi USE_TENSORRT=OFF -if [ -d "$TENSORRT_INCLUDE_DIR" -a -d "$TENSORRT_LIB_DIR" ]; then +if [ -d "$TENSORRT_ROOT_DIR" ]; then USE_TENSORRT=ON fi @@ -132,6 +131,28 @@ for WITH_STATIC_LIB in ON OFF; do fi done done + + # --------tensorrt mobilenet on windows------ + if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then + rm -rf * + cmake .. -G "Visual Studio 15 2017" -A x64 -T host=x64 -DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ + -DDEMO_NAME=trt_mobilenet_demo \ + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ + -DUSE_TENSORRT=$USE_TENSORRT \ + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR + msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln + Release/trt_mobilenet_demo.exe \ + --modeldir=$DATA_DIR/mobilenet/model \ + --data=$DATA_DIR/mobilenet/data.txt \ + --refer=$DATA_DIR/mobilenet/result.txt + if [ $? -ne 0 ]; then + echo "trt demo trt_mobilenet_demo runs fail." + exit 1 + fi + fi else # -----simple_on_word2vec on linux/mac----- rm -rf * @@ -183,8 +204,7 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ -DUSE_TENSORRT=$USE_TENSORRT \ - -DTENSORRT_INCLUDE_DIR=$TENSORRT_INCLUDE_DIR \ - -DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR make -j$(nproc) ./trt_mobilenet_demo \ --modeldir=$DATA_DIR/mobilenet/model \ diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index 654b58a2ded34..aa29b779e471b 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -134,6 +134,16 @@ void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForOpOutputs( scales_[var_name] = scales_[input_var_name]; } compute_scale = false; + } else if (op->Type() == "slice") { + auto input_var_name = op->Input("Input")[0]; + PADDLE_ENFORCE_NE(scales_.find(input_var_name), scales_.end(), + platform::errors::PreconditionNotMet( + "Input scales must be calculated before the " + "output scales to infer if output is unsigned.")); + if (scales_.find(input_var_name) != scales_.end()) { + scales_[var_name] = scales_[input_var_name]; + } + compute_scale = false; } else if (op->Type() == "concat") { // output of ops with unsigned input must be unsigned is_unsigned = true; diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc index 5a07cc7e240d5..6642a2c030b26 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc @@ -42,6 +42,9 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() { rules_["transpose2"]["X"] = ScaleAlgo::KL; rules_["transpose2"]["Out"] = ScaleAlgo::NONE; + rules_["slice"]["Input"] = ScaleAlgo::KL; + rules_["slice"]["Out"] = ScaleAlgo::NONE; + rules_["fc"]["Input"] = ScaleAlgo::KL; rules_["fc"]["W"] = ScaleAlgo::MAX_CH_T; rules_["fc"]["Bias"] = ScaleAlgo::NONE; diff --git a/paddle/fluid/inference/api/paddle_infer_contrib.cc b/paddle/fluid/inference/api/paddle_infer_contrib.cc index 57b5167337e25..d27f20a93b3a4 100644 --- a/paddle/fluid/inference/api/paddle_infer_contrib.cc +++ b/paddle/fluid/inference/api/paddle_infer_contrib.cc @@ -27,7 +27,7 @@ using paddle::PaddleDType; void* TensorUtils::CudaMallocPinnedMemory(size_t size) { #if defined(PADDLE_WITH_CUDA) void* ptr = nullptr; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMallocHost(&ptr, size)); 
+ PADDLE_ENFORCE_GPU_SUCCESS(cudaMallocHost(&ptr, size)); return ptr; #else return nullptr; @@ -36,7 +36,7 @@ void* TensorUtils::CudaMallocPinnedMemory(size_t size) { void TensorUtils::CudaFreePinnedMemory(void* ptr) { #if defined(PADDLE_WITH_CUDA) - PADDLE_ENFORCE_CUDA_SUCCESS(cudaFreeHost(ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(ptr)); #endif } diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.cc b/paddle/fluid/inference/tensorrt/convert/io_converter.cc index d9cf9e2e86001..b468518fa5a3c 100644 --- a/paddle/fluid/inference/tensorrt/convert/io_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/io_converter.cc @@ -45,7 +45,7 @@ class DefaultIOConverter : public EngineIOConverter { "the input max_size. But in's memory_size = %u, max_size = %u.", size, max_size)); if (is_cpu_place(place)) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( out, in.data(), size, cudaMemcpyHostToDevice, *stream_)); } else if (is_gpu_place(place)) { PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 35c9658108ab5..26d87e4832f5f 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -162,20 +162,6 @@ class Pool2dOpConverter : public OpConverter { } layer = pool_layer; } else if (!adaptive && !global_pooling && ceil_mode) { - nvinfer1::DimsHW pre_pad(0, 0); - nvinfer1::DimsHW post_pad(0, 0); - // If ceil mode is true, we will pad the appropriate size to the input. - DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad, - input_dims); - auto *pad_layer = TRT_ENGINE_ADD_LAYER( - engine_, Padding, *const_cast(input1), pre_pad, - post_pad); - PADDLE_ENFORCE_NOT_NULL( - pad_layer, platform::errors::Fatal( - "Pad layer in poolOp converter could not be " - "created. The pointer to pad layer is `NULL`.")); - input1 = pad_layer->getOutput(0); - auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, nv_pool_type, nv_ksize); pool_layer->setStride(nv_strides); @@ -183,6 +169,8 @@ class Pool2dOpConverter : public OpConverter { pool_layer->setAverageCountExcludesPadding(exclusive); if (padding_algorithm == "SAME") { pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } else { + pool_layer->setPaddingMode(nvinfer1::PaddingMode::kEXPLICIT_ROUND_UP); } layer = pool_layer; } else if (global_pooling) { diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 64116b7973e71..2addff52829c8 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -20,8 +20,8 @@ limitations under the License. 
*/ #include "cuda_runtime_api.h" // NOLINT #include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu index 0f32183c0fbc1..70e5a7bcc7b4f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu @@ -43,16 +43,16 @@ nvinfer1::Weights DeformableConvPlugin::copyToDevice(const void* hostData, size_t count) { int num_bytes = (data_type_ == nvinfer1::DataType::kFLOAT ? 4 : 2); void* deviceData; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(&deviceData, count * num_bytes)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy( - deviceData, hostData, count * num_bytes, cudaMemcpyHostToDevice)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc(&deviceData, count * num_bytes)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(deviceData, hostData, count * num_bytes, + cudaMemcpyHostToDevice)); return nvinfer1::Weights{data_type_, deviceData, int64_t(count)}; } void DeformableConvPlugin::serializeFromDevice( void** hostBuffer, const nvinfer1::Weights& deviceWeights) const { int num_bytes = (data_type_ == nvinfer1::DataType::kFLOAT ? 4 : 2); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpy(static_cast(*hostBuffer), deviceWeights.values, deviceWeights.count * num_bytes, cudaMemcpyDeviceToHost)); hostBuffer += deviceWeights.count * num_bytes; diff --git a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu index a9a50543e7bb7..a4880a9997a53 100644 --- a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu @@ -17,7 +17,7 @@ #include #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu index 88e075386d093..7cab12b625d23 100644 --- a/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu @@ -33,31 +33,31 @@ void Ltgemm_int8_linear( cublasLtMatmulDesc_t matmulDesc, void* alpha_scale, void* alpha_zero, void* alpha_one, void* workspace, cudaStream_t stream) { if (transA_) { - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransform( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform( ltHandle, transformDescT, alpha_one, A, Adesc, alpha_zero, nullptr, nullptr, Atransform, AtransformDesc, stream)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransform( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform( ltHandle, transformDescN, alpha_one, A, Adesc, alpha_zero, nullptr, nullptr, Atransform, AtransformDesc, stream)); } if (transB_) { - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransform( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform( ltHandle, transformDescN, alpha_one, B, Bdesc, alpha_zero, nullptr, nullptr, Btransform, BtransformDesc, stream)); } else { - 
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransform( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform( ltHandle, transformDescT, alpha_one, B, Bdesc, alpha_zero, nullptr, nullptr, Btransform, BtransformDesc, stream)); } - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmul( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul( ltHandle, matmulDesc, alpha_scale, Atransform, AtransformDesc, Btransform, BtransformDesc, nullptr, Ctransform, CtransformDesc, Ctransform, CtransformDesc, nullptr, workspace, 0, stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransform( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform( ltHandle, transformDescN, alpha_one, Ctransform, CtransformDesc, alpha_zero, nullptr, nullptr, C, Cdesc, stream)); } @@ -69,7 +69,7 @@ void Ltgemm_fp32_linear(cublasLtHandle_t ltHandle, const float* A, cublasLtMatmulDesc_t matmulDesc, void* alpha_scale, void* alpha_zero, void* workspace, cudaStream_t stream) { - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmul( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul( ltHandle, matmulDesc, alpha_scale, A, Adesc, B, Bdesc, alpha_zero, C, Cdesc, C, Cdesc, nullptr, workspace, 0, stream)); } @@ -81,7 +81,7 @@ void Ltgemm_fp16_linear(cublasLtHandle_t ltHandle, const half* A, cublasLtMatmulDesc_t matmulDesc, void* alpha_scale, void* alpha_zero, void* workspace, cudaStream_t stream) { - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmul( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul( ltHandle, matmulDesc, alpha_scale, A, Adesc, B, Bdesc, alpha_zero, C, Cdesc, C, Cdesc, nullptr, workspace, 0, stream)); } @@ -182,98 +182,98 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, int const ldatransform = 32 * n_; int const ldbtransform = 32 * ((m_ + 8 - 1) / 8 * 8); int const ldctransform = 32 * n_; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( (void**)&Atransform_, sizeof(int8_t) * ((k_ + 32 - 1) / 32 * 32) / 32 * ldatransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( (void**)&Btransform_, sizeof(int8_t) * ((k_ + 32 - 1) / 32 * 32) / 32 * ldbtransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( (void**)&Ctransform_, sizeof(int8_t) * ((m_ + 32 - 1) / 32 * 32) / 32 * ldctransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Adesc_, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n_ : k_, AopTranspose == CUBLAS_OP_N ? k_ : n_, AopTranspose == CUBLAS_OP_N ? n_ : k_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Bdesc_, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k_ : m_, BopTranspose == CUBLAS_OP_N ? m_ : k_, BopTranspose == CUBLAS_OP_N ? 
k_ : m_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), sizeof(strideb))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc_, cudadataTypeIO, n_, m_, n_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), sizeof(stridec))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &AtransformDesc_, cudadataTypeIO, n_, k_, ldatransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( AtransformDesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( AtransformDesc_, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &BtransformDesc_, cudadataTypeIO, m_, k_, ldbtransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( BtransformDesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( BtransformDesc_, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL4_4R2_8C, sizeof(COL4_4R2_8C))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &CtransformDesc_, cudadataTypeIO, n_, m_, ldctransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( CtransformDesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( CtransformDesc_, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32))); cublasOperation_t Transpose = CUBLAS_OP_T; cublasLtPointerMode_t transform_model = CUBLASLT_POINTER_MODE_DEVICE; - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescCreate( + 
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescCreate( &transformDescT_, cudaDataTypeS)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescT_, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE, &cudaDataTypeS, sizeof(cudaDataTypeS))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescT_, CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &Transpose, sizeof(Transpose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescT_, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, &transform_model, sizeof(transform_model))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescCreate( &transformDescN_, cudaDataTypeS)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescN_, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE, &cudaDataTypeS, sizeof(cudaDataTypeS))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescN_, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, &transform_model, sizeof(transform_model))); @@ -282,20 +282,20 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO; #if CUBLAS_VER_MAJOR < 11 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatmulDescCreate(&matmulDesc_, cudaComputeType)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescCreate( &matmulDesc_, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSA, &ATranspose, sizeof(ATranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSB, &BTranspose, sizeof(BTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, sizeof(matmul_model))); @@ -303,17 +303,16 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, for (int i = 0; i < n_; i++) { alpha_tem[i] = alpha_ * inscale_0 * inscale_1 / outscale; } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc((void**)&alpha_scale_, n_ * sizeof(float))); cudaMemcpyAsync(alpha_scale_, &alpha_tem[0], n_ * sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMalloc((void**)&alpha_zero_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(float))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); float one_tem = 1; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc((void**)&alpha_one_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_one_, sizeof(float))); cudaMemcpyAsync(alpha_one_, &one_tem, sizeof(float), 
cudaMemcpyHostToDevice); } else if (type_ == nvinfer1::DataType::kHALF) { @@ -324,70 +323,69 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, #else cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_16F; #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Adesc_, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n_ : k_, AopTranspose == CUBLAS_OP_N ? k_ : n_, AopTranspose == CUBLAS_OP_N ? n_ : k_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Bdesc_, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k_ : m_, BopTranspose == CUBLAS_OP_N ? m_ : k_, BopTranspose == CUBLAS_OP_N ? k_ : m_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), sizeof(strideb))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc_, cudadataTypeIO, n_, m_, n_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), sizeof(stridec))); cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE; #if CUBLAS_VER_MAJOR < 11 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatmulDescCreate(&matmulDesc_, cudaComputeType)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescCreate( &matmulDesc_, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose, sizeof(AopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + 
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose, sizeof(BopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, sizeof(matmul_model))); half alpha_tem = static_cast(alpha_); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMalloc((void**)&alpha_scale_, sizeof(half))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_scale_, sizeof(half))); cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(half), cudaMemcpyHostToDevice); half zero_tem = static_cast(zero); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(half))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(half))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(half), cudaMemcpyHostToDevice); } else { @@ -398,71 +396,70 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, #else cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32F_FAST_16F; #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Adesc_, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n_ : k_, AopTranspose == CUBLAS_OP_N ? k_ : n_, AopTranspose == CUBLAS_OP_N ? n_ : k_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Bdesc_, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k_ : m_, BopTranspose == CUBLAS_OP_N ? m_ : k_, BopTranspose == CUBLAS_OP_N ? 
k_ : m_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), sizeof(strideb))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc_, cudadataTypeIO, n_, m_, n_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), sizeof(stridec))); cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE; #if CUBLAS_VER_MAJOR < 11 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatmulDescCreate(&matmulDesc_, cudaComputeType)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescCreate( &matmulDesc_, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose, sizeof(AopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose, sizeof(BopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, sizeof(matmul_model))); float alpha_tem = alpha_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc((void**)&alpha_scale_, sizeof(float))); cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMalloc((void**)&alpha_zero_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(float))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); } @@ -613,13 +610,13 @@ void MatmulPluginDynamic::configurePlugin( int const ldatransform = 32 * n_max; int const ldbtransform = 32 * ((m_max + 8 - 1) / 8 * 8); int const ldctransform = 32 * n_max; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( (void**)&Atransform_, sizeof(int8_t) * ((k_max + 32 - 1) / 32 * 32) / 32 * ldatransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( (void**)&Btransform_, sizeof(int8_t) * ((k_max + 32 - 1) / 32 * 32) / 32 * ldbtransform)); - 
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( (void**)&Ctransform_, sizeof(int8_t) * ((m_max + 32 - 1) / 32 * 32) / 32 * ldctransform)); @@ -628,38 +625,35 @@ void MatmulPluginDynamic::configurePlugin( for (int i = 0; i < n_max; i++) { alpha_tem[i] = alpha_ * inscale_0 * inscale_1 / outscale; } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc((void**)&alpha_scale_, n_max * sizeof(float))); cudaMemcpyAsync(alpha_scale_, &alpha_tem[0], n_max * sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMalloc((void**)&alpha_zero_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(float))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); float one_tem = 1; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc((void**)&alpha_one_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_one_, sizeof(float))); cudaMemcpyAsync(alpha_one_, &one_tem, sizeof(float), cudaMemcpyHostToDevice); } else if (type_ == nvinfer1::DataType::kHALF) { half alpha_tem = static_cast(alpha_); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMalloc((void**)&alpha_scale_, sizeof(half))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_scale_, sizeof(half))); cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(half), cudaMemcpyHostToDevice); half zero_tem = static_cast(zero); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(half))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(half))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(half), cudaMemcpyHostToDevice); } else { float alpha_tem = alpha_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc((void**)&alpha_scale_, sizeof(float))); cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMalloc((void**)&alpha_zero_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(float))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); } @@ -766,88 +760,88 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, cublasLtOrder_t COL32 = CUBLASLT_ORDER_COL32; cublasLtOrder_t COL4_4R2_8C = CUBLASLT_ORDER_COL4_4R2_8C; - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Adesc, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n : k, AopTranspose == CUBLAS_OP_N ? k : n, AopTranspose == CUBLAS_OP_N ? n : k)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Bdesc, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k : m, BopTranspose == CUBLAS_OP_N ? m : k, BopTranspose == CUBLAS_OP_N ? 
k : m)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), sizeof(strideb))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc, cudadataTypeIO, n, m, n)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), sizeof(stridec))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &AtransformDesc, cudadataTypeIO, n, k, ldatransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( AtransformDesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( AtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &BtransformDesc, cudadataTypeIO, m, k, ldbtransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( BtransformDesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( BtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL4_4R2_8C, sizeof(COL4_4R2_8C))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &CtransformDesc, cudadataTypeIO, n, m, ldctransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( CtransformDesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( CtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32))); cublasOperation_t Transpose = CUBLAS_OP_T; cublasLtPointerMode_t transform_model = CUBLASLT_POINTER_MODE_DEVICE; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixTransformDescCreate(&transformDescT, cudaDataTypeS)); - 
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescT, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE, &cudaDataTypeS, sizeof(cudaDataTypeS))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescT, CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &Transpose, sizeof(Transpose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescT, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, &transform_model, sizeof(transform_model))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixTransformDescCreate(&transformDescN, cudaDataTypeS)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescN, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE, &cudaDataTypeS, sizeof(cudaDataTypeS))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescN, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, &transform_model, sizeof(transform_model))); @@ -856,20 +850,20 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO; #if CUBLAS_VER_MAJOR < 11 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatmulDescCreate(&matmulDesc, cudaComputeType)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescCreate( &matmulDesc, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &ATranspose, sizeof(ATranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &BTranspose, sizeof(BTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, sizeof(matmul_model))); @@ -889,60 +883,60 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, #else cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_16F; #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Adesc, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n : k, AopTranspose == CUBLAS_OP_N ? k : n, AopTranspose == CUBLAS_OP_N ? 
n : k)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Bdesc, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k : m, BopTranspose == CUBLAS_OP_N ? m : k, BopTranspose == CUBLAS_OP_N ? k : m)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), sizeof(strideb))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc, cudadataTypeIO, n, m, n)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), sizeof(stridec))); cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE; #if CUBLAS_VER_MAJOR < 11 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatmulDescCreate(&matmulDesc, cudaComputeType)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescCreate( &matmulDesc, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose, sizeof(AopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose, sizeof(BopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, sizeof(matmul_model))); @@ -959,60 +953,60 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, #else cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32F_FAST_16F; #endif - 
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Adesc, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n : k, AopTranspose == CUBLAS_OP_N ? k : n, AopTranspose == CUBLAS_OP_N ? n : k)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Bdesc, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k : m, BopTranspose == CUBLAS_OP_N ? m : k, BopTranspose == CUBLAS_OP_N ? k : m)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), sizeof(strideb))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc, cudadataTypeIO, n, m, n)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), sizeof(stridec))); cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE; #if CUBLAS_VER_MAJOR < 11 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatmulDescCreate(&matmulDesc, cudaComputeType)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescCreate( &matmulDesc, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose, sizeof(AopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose, sizeof(BopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, 
CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, sizeof(matmul_model))); diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 091680ff672d0..ec4fcca6d74d0 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -136,7 +136,7 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs, float const* input_ptr = reinterpret_cast(inputs[0]); float* const* h_odatas = reinterpret_cast(outputs); float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( output_ptrs, h_odatas, d_output_ptrs_.size() * sizeof(float*), cudaMemcpyHostToDevice, stream)); @@ -263,7 +263,7 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, float* const* h_odatas = reinterpret_cast(outputs); float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(float*), cudaMemcpyHostToDevice, stream)); @@ -279,7 +279,7 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, half* const* h_odatas = reinterpret_cast(outputs); half** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(half*), cudaMemcpyHostToDevice, stream)); diff --git a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu index c3b4a6ff4af1c..74a6c3cdf3e4e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu @@ -107,8 +107,13 @@ bool StackPluginDynamic::supportsFormatCombination( const nvinfer1::PluginTensorDesc& in = in_out[pos]; if (pos == 0) { if (with_fp16_) { - return (in.type == nvinfer1::DataType::kFLOAT || - in.type == nvinfer1::DataType::kHALF) && + return ( +// This is a workaround for the Ernie fixed-length model. +// Enabling float and half at the same time will cause TensorRT to hang.
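The plugin hunks above (and the matmul plugin hunks before them) mechanically replace PADDLE_ENFORCE_CUDA_SUCCESS with PADDLE_ENFORCE_GPU_SUCCESS, so a single check macro appears to cover CUDA, ROCm/HIP, and cuBLAS(Lt) status codes alike. The sketch below only illustrates that overload-based pattern; it is not Paddle's actual macro, and FakeCudaError, FakeHipError, IsGpuSuccess, and GPU_CHECK are made-up names chosen so the example compiles without a GPU toolkit installed.

```cpp
// Illustrative sketch only, not Paddle's real PADDLE_ENFORCE_GPU_SUCCESS.
// Toy status enums stand in for cudaError_t / hipError_t.
#include <iostream>
#include <stdexcept>
#include <string>

enum class FakeCudaError { Success = 0, OutOfMemory = 2 };
enum class FakeHipError { Success = 0, OutOfMemory = 1002 };

// One overload per backend lets a single macro accept either status type.
inline bool IsGpuSuccess(FakeCudaError e) { return e == FakeCudaError::Success; }
inline bool IsGpuSuccess(FakeHipError e) { return e == FakeHipError::Success; }

#define GPU_CHECK(expr)                                                   \
  do {                                                                    \
    auto _status = (expr);                                                \
    if (!IsGpuSuccess(_status)) {                                         \
      throw std::runtime_error(std::string("GPU call failed: ") + #expr); \
    }                                                                     \
  } while (0)

int main() {
  GPU_CHECK(FakeCudaError::Success);        // passes silently
  try {
    GPU_CHECK(FakeHipError::OutOfMemory);   // throws
  } catch (const std::runtime_error& e) {
    std::cout << e.what() << "\n";
  }
  return 0;
}
```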
+#if IS_TRT_VERSION_LT(8000) + in.type == nvinfer1::DataType::kFLOAT || +#endif + in.type == nvinfer1::DataType::kHALF) && (in.format == nvinfer1::TensorFormat::kLINEAR); } else { return (in.type == nvinfer1::DataType::kFLOAT) && diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc index 86666950bc36e..c330867607f8e 100644 --- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc @@ -85,7 +85,7 @@ bool TRTInt8Calibrator::setBatch( engine_name_, it.first)); } const auto& d = dataptr->second; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice)); } diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 6fd3944a6c528..a28b0c172aff0 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -94,6 +94,17 @@ function(inference_analysis_api_test target install_dir filename) ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt --refer_result=${install_dir}/result.txt) endfunction() +function(inference_analysis_api_int8_test target install_dir filename) + inference_analysis_test(${target} SRCS ${filename} + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${install_dir}/model + --infer_data=${install_dir}/data.txt + --refer_result=${install_dir}/result.txt + --accuracy=0.8 + --batch_size=5 + --enable_int8=true) +endfunction() + function(inference_multiple_models_analysis_api_test target install_dir filename) inference_analysis_test(${target} SRCS ${filename} EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} @@ -284,13 +295,14 @@ set(PYRAMID_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/pyramid_dnn") download_model_and_data_without_verify(${PYRAMID_DNN_INSTALL_DIR} "PyramidDNN_model.tar.gz" "PyramidDNN_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_pyramid_dnn ${PYRAMID_DNN_INSTALL_DIR} analyzer_pyramid_dnn_tester.cc) -#Ernie +# Ernie set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie") download_model_and_data(${ERNIE_INSTALL_DIR} "Ernie_model.tar.gz" aa59192dd41ed377f9f168e3a1309fa6 "Ernie_data.txt.tar.gz" 5396e63548edad7ca561e7e26a9476d1) download_result(${ERNIE_INSTALL_DIR} "Ernie_result.txt.tar.gz" 73beea65abda2edb61c1662cd3180c62) inference_analysis_api_test(test_analyzer_ernie ${ERNIE_INSTALL_DIR} analyzer_ernie_tester.cc) +inference_analysis_api_int8_test(test_analyzer_ernie_int8 ${ERNIE_INSTALL_DIR} analyzer_ernie_int8_tester.cc) -#Ernie large +# Ernie large set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie_Large") download_model_and_data(${ERNIE_INSTALL_DIR} "Ernie_large_model.tar.gz" af7715245ed32cc77374625d4c80f7ef "Ernie_large_data.txt.tar.gz" edb2113eec93783cad56ed76d47ba57f) download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz" 1facda98eef1085dc9d435ebf3f23a73) @@ -426,7 +438,7 @@ if(WITH_MKLDNN) # TODO(grygielski) Enable after MKL-DNN 1.0 merge set(INT8_VGG16_MODEL_DIR "${INT8_DATA_DIR}/vgg16") download_int8_data_without_verify(${INT8_VGG16_MODEL_DIR} "VGG16_int8_model.tar.gz" ) -# inference_analysis_api_int8_test_run(test_analyzer_int8_vgg16 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG16_MODEL_DIR} ${IMAGENET_DATA_PATH}) +# inference_analysis_api_int8_test_run(test_analyzer_int8_vgg16 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG16_MODEL_DIR} ${IMAGENET_DATA_PATH}) # vgg19 int8 # TODO(grygielski) Enable after 
MKL-DNN 1.0 merge @@ -730,6 +742,7 @@ set_tests_properties(test_analyzer_mobilenet_transpose PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_resnet50 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_ner PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_ernie PROPERTIES TIMEOUT 120) +set_tests_properties(test_analyzer_ernie_int8 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_googlenet PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_small_dam PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_transformer PROPERTIES TIMEOUT 120) diff --git a/paddle/fluid/inference/tests/api/analyzer_ernie_int8_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ernie_int8_tester.cc new file mode 100644 index 0000000000000..b85726647b548 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_ernie_int8_tester.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/analyzer_ernie_tester.h" + +namespace paddle { +namespace inference { + +using paddle::PaddleTensor; + +#ifdef PADDLE_WITH_MKLDNN +void SetInt8Config(AnalysisConfig *cfg, + std::vector data) { + cfg->SetModel(FLAGS_infer_model); + cfg->EnableMKLDNN(); + cfg->EnableMkldnnQuantizer(); + auto warmup_data = std::make_shared>(data); + cfg->mkldnn_quantizer_config()->SetWarmupData(warmup_data); + cfg->mkldnn_quantizer_config()->SetWarmupBatchSize(FLAGS_batch_size); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); + cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads); +} + +// Compare result of NativeConfig and AnalysisConfig +void compare_int8(bool use_mkldnn = false) { + std::vector> inputs; + LoadInputData(&inputs); + + AnalysisConfig cfg; + SetInt8Config(&cfg, inputs[0]); + + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), inputs); +} + +TEST(Analyzer_ernie, compare_int8_mkldnn) { + compare_int8(true /* use_mkldnn */); +} +#endif + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc index 0c2a140023e29..d6ff3e422368b 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc @@ -12,142 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
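The new analyzer_ernie_int8_tester.cc above exercises post-training INT8 quantization through MKL-DNN: it enables MKLDNN and the MKLDNN quantizer, then hands the quantizer one warm-up batch so calibration scales can be collected. Below is a hedged sketch of that configuration flow using the AnalysisConfig calls visible in the patch; the header path, model path, and sizes are placeholders, and SetInt8LikeConfig is an illustrative helper, not part of the patch.

```cpp
// Hedged sketch of the INT8 setup used by the new tester.
// The include path may differ per install; values below are placeholders.
#include <memory>
#include <string>
#include <vector>

#include "paddle/include/paddle_inference_api.h"

void SetInt8LikeConfig(paddle::AnalysisConfig* cfg,
                       const std::vector<paddle::PaddleTensor>& warmup_batch,
                       const std::string& model_dir, int batch_size,
                       int cpu_threads) {
  cfg->SetModel(model_dir);
  cfg->EnableMKLDNN();
  cfg->EnableMkldnnQuantizer();
  // The quantizer calibrates scales from one representative warm-up batch.
  auto warmup =
      std::make_shared<std::vector<paddle::PaddleTensor>>(warmup_batch);
  cfg->mkldnn_quantizer_config()->SetWarmupData(warmup);
  cfg->mkldnn_quantizer_config()->SetWarmupBatchSize(batch_size);
  cfg->SwitchSpecifyInputNames();
  cfg->SwitchIrOptim();
  cfg->SetCpuMathLibraryNumThreads(cpu_threads);
}
```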
-#include "paddle/fluid/inference/tests/api/tester_helper.h" +#include "paddle/fluid/inference/tests/api/analyzer_ernie_tester.h" namespace paddle { namespace inference { using paddle::PaddleTensor; -template -void GetValueFromStream(std::stringstream *ss, T *t) { - (*ss) >> (*t); -} - -template <> -void GetValueFromStream(std::stringstream *ss, std::string *t) { - *t = ss->str(); -} - -// Split string to vector -template -void Split(const std::string &line, char sep, std::vector *v) { - std::stringstream ss; - T t; - for (auto c : line) { - if (c != sep) { - ss << c; - } else { - GetValueFromStream(&ss, &t); - v->push_back(std::move(t)); - ss.str({}); - ss.clear(); - } - } - - if (!ss.str().empty()) { - GetValueFromStream(&ss, &t); - v->push_back(std::move(t)); - ss.str({}); - ss.clear(); - } -} - -// Parse tensor from string -template -bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) { - std::vector data; - Split(field, ':', &data); - if (data.size() < 2) return false; - - std::string shape_str = data[0]; - - std::vector shape; - Split(shape_str, ' ', &shape); - - std::string mat_str = data[1]; - - std::vector mat; - Split(mat_str, ' ', &mat); - - tensor->shape = shape; - auto size = - std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * - sizeof(T); - tensor->data.Resize(size); - std::copy(mat.begin(), mat.end(), static_cast(tensor->data.data())); - tensor->dtype = GetPaddleDType(); - - return true; -} - -// Parse input tensors from string -bool ParseLine(const std::string &line, - std::vector *tensors) { - std::vector fields; - Split(line, ';', &fields); - - tensors->clear(); - tensors->reserve(4); - - int i = 0; - auto input_name = FLAGS_ernie_large ? "eval_placeholder_" : "placeholder_"; - for (; i < 3; i++) { - paddle::PaddleTensor temp; - ParseTensor(fields[i], &temp); - temp.name = input_name + std::to_string(i); - tensors->push_back(temp); - } - - // input_mask - paddle::PaddleTensor input_mask; - ParseTensor(fields[i], &input_mask); - input_mask.name = input_name + std::to_string(i); - tensors->push_back(input_mask); - - return true; -} - -bool LoadInputData(std::vector> *inputs) { - if (FLAGS_infer_data.empty()) { - LOG(ERROR) << "please set input data path"; - return false; - } - - std::ifstream fin(FLAGS_infer_data); - std::string line; - int sample = 0; - - // The unit-test dataset only have 10 samples, each sample have 5 feeds. 
- while (std::getline(fin, line)) { - std::vector feed_data; - ParseLine(line, &feed_data); - inputs->push_back(std::move(feed_data)); - sample++; - if (!FLAGS_test_all_data && sample == FLAGS_batch_size) break; - } - LOG(INFO) << "number of samples: " << sample; - return true; -} - -void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false, - bool use_gpu = false) { - cfg->SetModel(FLAGS_infer_model); - if (use_mkldnn) { - cfg->EnableMKLDNN(); - } - if (use_gpu) { - cfg->EnableUseGpu(100, 0); - } else { - cfg->DisableGpu(); - } - cfg->SwitchSpecifyInputNames(); - cfg->SwitchIrOptim(); - cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads); -} - void profile(bool use_mkldnn = false, bool use_gpu = false) { AnalysisConfig config; + SetConfig(&config, use_mkldnn, use_gpu); std::vector> outputs; @@ -189,11 +63,12 @@ TEST(Analyzer_Ernie, fuse_statis) { // Compare result of NativeConfig and AnalysisConfig void compare(bool use_mkldnn = false) { + std::vector> inputs; + LoadInputData(&inputs); + AnalysisConfig cfg; SetConfig(&cfg, use_mkldnn, false); - std::vector> inputs; - LoadInputData(&inputs); CompareNativeAndAnalysis( reinterpret_cast(&cfg), inputs); } diff --git a/paddle/fluid/inference/tests/api/analyzer_ernie_tester.h b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.h new file mode 100644 index 0000000000000..dd3faac759210 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.h @@ -0,0 +1,152 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +using paddle::PaddleTensor; + +template +void GetValueFromStream(std::stringstream *ss, T *t) { + (*ss) >> (*t); +} + +template <> +void GetValueFromStream(std::stringstream *ss, std::string *t) { + *t = ss->str(); +} + +// Split string to vector +template +void Split(const std::string &line, char sep, std::vector *v) { + std::stringstream ss; + T t; + for (auto c : line) { + if (c != sep) { + ss << c; + } else { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } + } + + if (!ss.str().empty()) { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } +} + +// Parse tensor from string +template +bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) { + std::vector data; + Split(field, ':', &data); + if (data.size() < 2) return false; + + std::string shape_str = data[0]; + + std::vector shape; + Split(shape_str, ' ', &shape); + + std::string mat_str = data[1]; + + std::vector mat; + Split(mat_str, ' ', &mat); + + tensor->shape = shape; + auto size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * + sizeof(T); + tensor->data.Resize(size); + std::copy(mat.begin(), mat.end(), static_cast(tensor->data.data())); + tensor->dtype = GetPaddleDType(); + + return true; +} + +// Parse input tensors from string +bool ParseLine(const std::string &line, + std::vector *tensors) { + std::vector fields; + Split(line, ';', &fields); + + tensors->clear(); + tensors->reserve(4); + + int i = 0; + auto input_name = FLAGS_ernie_large ? "eval_placeholder_" : "placeholder_"; + for (; i < 3; i++) { + paddle::PaddleTensor temp; + ParseTensor(fields[i], &temp); + temp.name = input_name + std::to_string(i); + tensors->push_back(temp); + } + + // input_mask + paddle::PaddleTensor input_mask; + ParseTensor(fields[i], &input_mask); + input_mask.name = input_name + std::to_string(i); + tensors->push_back(input_mask); + + return true; +} + +bool LoadInputData(std::vector> *inputs) { + if (FLAGS_infer_data.empty()) { + LOG(ERROR) << "please set input data path"; + return false; + } + + std::ifstream fin(FLAGS_infer_data); + std::string line; + int sample = 0; + + // The unit-test dataset only have 10 samples, each sample have 5 feeds. 
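The helpers moved into analyzer_ernie_tester.h above (GetValueFromStream, Split, ParseTensor, ParseLine) parse one text line of the form "shape:values;..." into PaddleTensor feeds. The toy program below sketches only the split-and-convert idea with standard-library types; it is a simplification, not the templated implementation from the header.

```cpp
// Toy reimplementation of the Split/ParseTensor pattern from the tester header.
// Names and behavior are simplified for illustration.
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

template <typename T>
std::vector<T> Split(const std::string& line, char sep) {
  std::vector<T> out;
  std::stringstream ss(line);
  std::string field;
  while (std::getline(ss, field, sep)) {
    if (field.empty()) continue;
    std::stringstream conv(field);
    T value;
    conv >> value;
    out.push_back(value);
  }
  return out;
}

int main() {
  // One "shape:data" field, like those consumed by ParseTensor above.
  const std::string tensor_field = "1 128:0.5 0.25 0.125";
  const auto colon = tensor_field.find(':');
  const auto shape = Split<int>(tensor_field.substr(0, colon), ' ');
  const auto data = Split<float>(tensor_field.substr(colon + 1), ' ');
  std::cout << "dims=" << shape.size() << " values=" << data.size() << "\n";
  return 0;
}
```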
+ while (std::getline(fin, line)) { + std::vector feed_data; + ParseLine(line, &feed_data); + inputs->push_back(std::move(feed_data)); + sample++; + if (!FLAGS_test_all_data && sample == FLAGS_batch_size) break; + } + LOG(INFO) << "number of samples: " << sample; + return true; +} + +void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false, + bool use_gpu = false) { + cfg->SetModel(FLAGS_infer_model); + if (use_mkldnn) { + cfg->EnableMKLDNN(); + } + if (use_gpu) { + cfg->EnableUseGpu(100, 0); + } else { + cfg->DisableGpu(); + } + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); + cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 69134e1c76bb7..97952e4b71641 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -19,14 +19,13 @@ if (WITH_GPU) DEPS device_context malloc) nv_test(stream_safe_cuda_alloc_test SRCS stream_safe_cuda_alloc_test.cu - DEPS malloc) + DEPS malloc cuda_graph_with_memory_pool) if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test) set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES - ENVIRONMENT "FLAGS_use_system_allocator=false" - ENVIRONMENT "FLAGS_enable_stream_safe_cuda_allocator=true" - ENVIRONMENT "FLAGS_allocator_strategy=auto_growth") - endif() + ENVIRONMENT "FLAGS_use_stream_safe_cuda_allocator=true; + FLAGS_allocator_strategy=auto_growth") + endif() endif() if (WITH_ROCM) diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 4d44c533b7456..b3351f44dc35a 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -48,6 +48,8 @@ if (WITH_GPU OR WITH_ROCM) endif() elseif(WITH_XPU) set(AllocatorFacadeDeps xpu_info) +elseif(WITH_IPU) + set(AllocatorFacadeDeps ipu_info) elseif(WITH_ASCEND) set(AllocatorFacadeDeps ascend_npu_info) else () diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index b7b238bd0bf53..2aed7ec001d2a 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -30,13 +30,10 @@ #include "paddle/fluid/memory/allocation/pinned_allocator.h" #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" #include "paddle/fluid/memory/allocation/thread_local_allocator.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #ifdef PADDLE_WITH_CUDA -#include -#include "paddle/fluid/platform/cuda_graph.h" -#else -#include +#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" #endif #if CUDA_VERSION >= 10020 @@ -54,6 +51,10 @@ #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" #endif +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#endif + PADDLE_DEFINE_EXPORTED_int64( gpu_allocator_retry_time, 10000, "The retry time (milliseconds) when allocator fails " @@ -70,7 +71,7 @@ PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false, // NOTE(Ruibiao): This FLAGS is just to be compatibled with // the old single-stream CUDA allocator. It will be removed // after StreamSafeCudaAllocator has been fully tested. 
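The CMake hunk above wires FLAGS_use_stream_safe_cuda_allocator=true and FLAGS_allocator_strategy=auto_growth into the environment of stream_safe_cuda_alloc_test, while the flag definition just below flips the default to false. PADDLE_DEFINE_EXPORTED_bool appears to wrap an ordinary gflags boolean, so such a switch can also be toggled on the command line, and later hunks in this patch suggest exporting FLAGS_use_stream_safe_cuda_allocator=true in the terminal. The standalone gflags sketch below only illustrates that flag pattern and is not Paddle code.

```cpp
// Standalone gflags sketch; the assumption is that Paddle's
// PADDLE_DEFINE_EXPORTED_bool behaves like the plain DEFINE_bool below.
#include <iostream>

#include "gflags/gflags.h"

DEFINE_bool(use_stream_safe_cuda_allocator, false,
            "Enable the stream-safe CUDA allocator (default off in this sketch).");

int main(int argc, char* argv[]) {
  // e.g. ./flag_demo --use_stream_safe_cuda_allocator=true
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  std::cout << "stream-safe allocator enabled: " << std::boolalpha
            << FLAGS_use_stream_safe_cuda_allocator << "\n";
  return 0;
}
```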
-PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, true, +PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, false, "Enable StreamSafeCUDAAllocator"); DECLARE_string(allocator_strategy); @@ -139,14 +140,18 @@ class AllocatorFacadePrivate { switch (strategy_) { case AllocatorStrategy::kNaiveBestFit: { InitNaiveBestFitCPUAllocator(); +#ifdef PADDLE_WITH_IPU + for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) { + InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator) { LOG(WARNING) << "FLAGS_use_stream_safe_cuda_allocator is invalid for " "naive_best_fit strategy"; FLAGS_use_stream_safe_cuda_allocator = false; } - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); - ++dev_id) { + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); } InitNaiveBestFitCUDAPinnedAllocator(); @@ -172,13 +177,13 @@ class AllocatorFacadePrivate { if (FLAGS_use_stream_safe_cuda_allocator) { // TODO(Ruibiao): Support multi-stream allocator for other strategies default_stream_ = nullptr; - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitStreamSafeCUDAAllocator(platform::CUDAPlace(dev_id), default_stream_); } } else { - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), allow_free_idle_chunk_); @@ -190,6 +195,11 @@ class AllocatorFacadePrivate { for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); } +#endif +#ifdef PADDLE_WITH_IPU + for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) { + InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); + } #endif break; } @@ -201,6 +211,11 @@ class AllocatorFacadePrivate { InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); } #endif +#ifdef PADDLE_WITH_IPU + for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) { + InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator) { LOG(WARNING) << "FLAGS_use_stream_safe_cuda_allocator is invalid for " @@ -208,8 +223,7 @@ class AllocatorFacadePrivate { FLAGS_use_stream_safe_cuda_allocator = false; } - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); - ++dev_id) { + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id)); } InitNaiveBestFitCUDAPinnedAllocator(); @@ -399,10 +413,10 @@ class AllocatorFacadePrivate { CUdevice device; int val; try { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId())); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cuDeviceGetAttribute( &val, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, device)); @@ -476,10 +490,10 @@ class AllocatorFacadePrivate { CUdevice device; int val; try { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId())); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( 
paddle::platform::dynload::cuDeviceGetAttribute( &val, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, device)); @@ -575,6 +589,12 @@ class AllocatorFacadePrivate { } #endif +#ifdef PADDLE_WITH_IPU + void InitNaiveBestFitIPUAllocator(platform::IPUPlace p) { + allocators_[p] = std::make_shared(p); + } +#endif + #ifdef PADDLE_WITH_ASCEND_CL void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) { allocators_[p] = std::make_shared(p); @@ -596,10 +616,17 @@ class AllocatorFacadePrivate { system_allocators_[p] = std::make_shared(p); } #endif +#ifdef PADDLE_WITH_IPU + int device_count = platform::GetIPUDeviceCount(); + for (int i = 0; i < device_count; ++i) { + platform::IPUPlace p(i); + system_allocators_[p] = std::make_shared(p); + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) system_allocators_[platform::CUDAPinnedPlace()] = std::make_shared(); - int device_count = platform::GetCUDADeviceCount(); + int device_count = platform::GetGPUDeviceCount(); for (int i = 0; i < device_count; ++i) { platform::CUDAPlace p(i); system_allocators_[p] = std::make_shared(p); @@ -612,7 +639,7 @@ class AllocatorFacadePrivate { std::vector places; places.emplace_back(platform::CPUPlace()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - int device_count = platform::GetCUDADeviceCount(); + int device_count = platform::GetGPUDeviceCount(); for (int dev_id = 0; dev_id < device_count; ++dev_id) { places.emplace_back(platform::CUDAPlace(dev_id)); } @@ -630,6 +657,12 @@ class AllocatorFacadePrivate { places.emplace_back(platform::NPUPlace(dev_id)); } #endif +#ifdef PADDLE_WITH_IPU + int device_count = platform::GetIPUDeviceCount(); + for (int dev_id = 0; dev_id < device_count; ++dev_id) { + places.emplace_back(platform::IPUPlace(dev_id)); + } +#endif for (auto& p : places) { zero_size_allocators_[p] = std::make_shared(p); @@ -704,10 +737,18 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + return m_->GetAllocator(place, + /* A non-zero num to choose allocator_ */ 1); + } +#endif + return m_->GetAllocator(BOOST_GET_CONST(platform::CUDAPlace, place), m_->GetDefaultStream()); } #endif + return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); } @@ -721,10 +762,17 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && size > 0 && FLAGS_use_system_allocator == false) { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + return m_->GetAllocator(place, size)->Allocate(size); + } +#endif + return Alloc(BOOST_GET_CONST(platform::CUDAPlace, place), size, m_->GetDefaultStream()); } #endif + return m_->GetAllocator(place, size)->Allocate(size); } @@ -732,6 +780,14 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + return m_ + ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) + ->Release(place); + } +#endif + return 
Release(BOOST_GET_CONST(platform::CUDAPlace, place), m_->GetDefaultStream()); } @@ -750,6 +806,14 @@ std::shared_ptr AllocatorFacade::AllocShared( "multi-stream 'AllocaShared' function. " "To enable it, you can enter 'export " "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + return std::shared_ptr(Alloc(place, size, stream)); } @@ -762,6 +826,14 @@ AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place, "multi-stream 'Alloca' function. " "To enable it, you can enter 'export " "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { return m_->GetAllocator(place, stream, /* creat_if_not_found = */ true) ->Allocate(size); @@ -779,6 +851,14 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, "multi-stream 'Release' function. " "To enable it, you can enter 'export " "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + return m_->GetAllocator(place, stream)->Release(place); } @@ -791,6 +871,14 @@ void AllocatorFacade::RecordStream(Allocation* allocation, "'RecordStream' function. " "To enable it, you can enter 'export " "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + m_->RecordStream(allocation, stream); } diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 4cd8b4e91e614..0d9f1043d9e86 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -19,7 +19,7 @@ #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" #endif #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index 9f34f5198a179..dd2a65d889d8d 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -100,11 +100,13 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) { VLOG(2) << "Not found and reallocate " << realloc_size << "(" << static_cast(p) << "), and remaining " << remaining_size; } + VLOG(10) << "Alloc " << block_it->size_ << " bytes, ptr = " << block_it->ptr_; return new BlockAllocation(block_it); } void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { - VLOG(10) << "Free " << allocation->size() << " bytes"; + VLOG(10) << "Free " << allocation->size() + << " bytes, ptr = " << allocation->ptr(); 
std::lock_guard guard(spinlock_); auto block_it = static_cast(allocation)->block_it_; auto &blocks = block_it->chunk_->blocks_; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc index 193ef5a0cb922..4469673b305bf 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc @@ -19,7 +19,7 @@ #include // NOLINT #include "gflags/gflags.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) DECLARE_double(fraction_of_gpu_memory_to_use); diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index b1a45afa99d9a..4242083f2e617 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -25,8 +25,8 @@ #include #include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" namespace paddle { namespace memory { @@ -37,8 +37,8 @@ void CUDAAllocator::FreeImpl(Allocation* allocation) { BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_, platform::errors::PermissionDenied( "GPU memory is freed in incorrect device. This may be a bug")); - platform::RecordedCudaFree(allocation->ptr(), allocation->size(), - place_.device); + platform::RecordedGpuFree(allocation->ptr(), allocation->size(), + place_.device); delete allocation; } @@ -46,13 +46,13 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) { std::call_once(once_flag_, [this] { platform::SetDeviceId(place_.device); }); void* ptr; - auto result = platform::RecordedCudaMalloc(&ptr, size, place_.device); + auto result = platform::RecordedGpuMalloc(&ptr, size, place_.device); if (LIKELY(result == gpuSuccess)) { return new Allocation(ptr, size, platform::Place(place_)); } size_t avail, total, actual_avail, actual_total; - bool is_limited = platform::RecordedCudaMemGetInfo( + bool is_limited = platform::RecordedGpuMemGetInfo( &avail, &total, &actual_avail, &actual_total, place_.device); size_t allocated = total - avail; diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 3d6f1d7bcbea6..9e04fd3f0619e 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -81,10 +81,10 @@ class CUDADeviceContextAllocator : public Allocator { : place_(place), default_stream_(default_stream) { platform::CUDADeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreate(&event_, cudaEventDisableTiming)); #endif } @@ -93,9 +93,9 @@ class CUDADeviceContextAllocator : public Allocator { if (event_) { platform::CUDADeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_)); + 
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event_)); #endif } } @@ -111,12 +111,11 @@ class CUDADeviceContextAllocator : public Allocator { new CUDADeviceContextAllocation(memory::Alloc(place_, size)); // Wait for the event on stream #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event_, default_stream_)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(default_stream_, event_, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, default_stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(default_stream_, event_, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, default_stream_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamWaitEvent(default_stream_, event_, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, default_stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(default_stream_, event_, 0)); #endif return allocation; } diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc index e3780f2f11359..f4baca8288f03 100644 --- a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc @@ -23,8 +23,8 @@ #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/dynload/cuda_driver.h" -#include "paddle/fluid/platform/gpu_info.h" #endif #if CUDA_VERSION >= 10020 @@ -49,10 +49,10 @@ CUDAVirtualMemAllocator::CUDAVirtualMemAllocator( // Prepare the access descriptor array indicating where and how the backings // should be visible. - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { if (place.device != dev_id) { int capable = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaDeviceCanAccessPeer(&capable, place.device, dev_id)); if (!capable) { VLOG(1) << "device(" << place.device @@ -73,10 +73,10 @@ CUDAVirtualMemAllocator::CUDAVirtualMemAllocator( // Get the minimum granularity needed for all devices // (the max of the minimum granularity of each participating device) granularity_ = 0; - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { size_t granularity; prop.location.id = dev_id; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cuMemGetAllocationGranularity( &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); granularity_ = std::max(granularity, granularity_); @@ -84,7 +84,7 @@ CUDAVirtualMemAllocator::CUDAVirtualMemAllocator( size_t actual_avail, actual_total; paddle::platform::CUDADeviceGuard guard(place.device); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total)); virtual_mem_size_ = AlignedSize(actual_total, granularity_); @@ -93,7 +93,7 @@ CUDAVirtualMemAllocator::CUDAVirtualMemAllocator( // GPU, // so the virtual address space size we reserve is equal to the GPU video // memory size - PADDLE_ENFORCE_CUDA_SUCCESS(paddle::platform::dynload::cuMemAddressReserve( + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cuMemAddressReserve( &virtual_mem_base_, virtual_mem_size_, 0, 0, 0)); virtual_mem_alloced_offset_ = 0; @@ -123,11 +123,11 @@ void CUDAVirtualMemAllocator::FreeImpl(Allocation* allocation) { auto result = 
paddle::platform::dynload::cuMemUnmap(iter->first, iter->second.second); if (result != CUDA_ERROR_DEINITIALIZED) { - PADDLE_ENFORCE_CUDA_SUCCESS(result); + PADDLE_ENFORCE_GPU_SUCCESS(result); } if (result != CUDA_ERROR_DEINITIALIZED) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::RecordedCuMemRelease( + PADDLE_ENFORCE_GPU_SUCCESS(platform::RecordedGpuMemRelease( iter->second.first, iter->second.second, place_.device)); } @@ -166,12 +166,12 @@ Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) { // Create physical memory backing allocation. auto result = - platform::RecordedCuMemCreate(&handle, size, &prop_, 0, place_.device); + platform::RecordedGpuMemCreate(&handle, size, &prop_, 0, place_.device); if (result != CUDA_SUCCESS) { if (result == CUDA_ERROR_OUT_OF_MEMORY) { size_t actual_avail, actual_total; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total)); size_t actual_allocated = actual_total - actual_avail; PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( @@ -186,7 +186,7 @@ Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) { string::HumanReadableSize(actual_allocated), string::HumanReadableSize(actual_avail), place_.device)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS(result); + PADDLE_ENFORCE_GPU_SUCCESS(result); } return nullptr; } @@ -197,8 +197,8 @@ Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) { result = paddle::platform::dynload::cuMemMap(ptr, size, 0, handle, 0); if (result != CUDA_SUCCESS) { - platform::RecordedCuMemRelease(handle, size, place_.device); - PADDLE_ENFORCE_CUDA_SUCCESS(result); + platform::RecordedGpuMemRelease(handle, size, place_.device); + PADDLE_ENFORCE_GPU_SUCCESS(result); return nullptr; } @@ -208,8 +208,8 @@ Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) { if (result != CUDA_SUCCESS) { paddle::platform::dynload::cuMemUnmap(ptr, size); - platform::RecordedCuMemRelease(handle, size, place_.device); - PADDLE_ENFORCE_CUDA_SUCCESS(result); + platform::RecordedGpuMemRelease(handle, size, place_.device); + PADDLE_ENFORCE_GPU_SUCCESS(result); return nullptr; } diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 3bdd856759dc1..6de32335c62b2 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -20,8 +20,8 @@ #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 5aa0514432844..c56a7235c109c 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -20,18 +20,18 @@ namespace allocation { bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } void CPUPinnedAllocator::FreeImpl(Allocation *allocation) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipHostFree(allocation->ptr())); + PADDLE_ENFORCE_GPU_SUCCESS(hipHostFree(allocation->ptr())); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaFreeHost(allocation->ptr())); + 
PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr())); #endif delete allocation; } Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { void *ptr; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable)); + PADDLE_ENFORCE_GPU_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); #endif return new Allocation(ptr, size, platform::CUDAPinnedPlace()); } diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index b2e13af6ef956..86f3135ee4d14 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -103,6 +103,8 @@ uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) { for (StreamSafeCUDAAllocator* allocator : allocators) { release_size += allocator->ProcessEventsAndFreeWithRelease(); } + VLOG(8) << "Release " << release_size + << " bytes memory from all stream for place " << place; return release_size; } @@ -112,13 +114,13 @@ void StreamSafeCUDAAllocator::CreateEventForAllRecordedStream( for (gpuStream_t stream : *recorded_streams) { gpuEvent_t event; #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, stream)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event, hipEventDisableTiming)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, stream)); #endif outstanding_events->emplace_back(event); VLOG(9) << "Record event " << event << " in stream " << stream; @@ -162,8 +164,8 @@ void StreamSafeCUDAAllocator::ProcessEventsAndFree() { outstanding_events.erase(outstanding_events.begin(), deque_it); break; } - PADDLE_ENFORCE_CUDA_SUCCESS(err); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(*deque_it)); + PADDLE_ENFORCE_GPU_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(*deque_it)); #else gpuError_t err = hipEventQuery(*deque_it); if (err == hipErrorNotReady) { @@ -173,8 +175,8 @@ void StreamSafeCUDAAllocator::ProcessEventsAndFree() { outstanding_events.erase(outstanding_events.begin(), deque_it); break; } - PADDLE_ENFORCE_CUDA_SUCCESS(err); - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(*deque_it)); + PADDLE_ENFORCE_GPU_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(*deque_it)); #endif ++deque_it; } diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h index 654fb3fe7bc04..c55f579981b00 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator.h +++ b/paddle/fluid/memory/allocation/thread_local_allocator.h @@ -20,7 +20,7 @@ #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index 88dbec2bcfd0c..b7be895b35830 100644 
--- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -25,8 +25,8 @@ limitations under the License. */ #include "paddle/fluid/memory/detail/memory_block.h" #include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/npu/npu_info.h" -#include "paddle/fluid/platform/gpu_info.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc index 8b3d776cef210..cd152843553a9 100644 --- a/paddle/fluid/memory/detail/buddy_allocator_test.cc +++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc @@ -24,8 +24,8 @@ limitations under the License. */ #include "gflags/gflags.h" #include "gtest/gtest.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/npu/npu_info.h" -#include "paddle/fluid/platform/gpu_info.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_ASCEND_CL) diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 75b93088e5502..b300f936f7a68 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -27,9 +27,9 @@ limitations under the License. */ #include "gflags/gflags.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" @@ -115,7 +115,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { if (size <= 0) return nullptr; void* p; - auto result = platform::RecordedCudaMalloc(&p, size, gpu_id_); + auto result = platform::RecordedGpuMalloc(&p, size, gpu_id_); if (result == gpuSuccess) { *index = 0; @@ -123,7 +123,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { return p; } else { size_t avail, total, actual_avail, actual_total; - bool is_limited = platform::RecordedCudaMemGetInfo( + bool is_limited = platform::RecordedGpuMemGetInfo( &avail, &total, &actual_avail, &actual_total, gpu_id_); size_t allocated = total - avail; @@ -166,7 +166,7 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) { size, gpu_alloc_size_)); gpu_alloc_size_ -= size; - platform::RecordedCudaFree(p, size, gpu_id_); + platform::RecordedGpuFree(p, size, gpu_id_); } bool GPUAllocator::UseGpu() const { return true; } diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc index ead188341dac4..bb7f47f9d30ec 100644 --- a/paddle/fluid/memory/detail/system_allocator_test.cc +++ b/paddle/fluid/memory/detail/system_allocator_test.cc @@ -19,6 +19,9 @@ limitations under the License. 
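The system_allocator hunks above switch the per-device accounting helpers from RecordedCudaMalloc/RecordedCudaMemGetInfo/RecordedCudaFree to their RecordedGpu* counterparts, so one set of entry points serves both CUDA and ROCm builds. A minimal sketch of that call pattern, assuming only the signatures visible in this patch (the free-standing helper names below are illustrative, not part of the patch):

#include "paddle/fluid/platform/device/gpu/gpu_info.h"

// Illustrative wrapper around the renamed bookkeeping helpers.
void* TrackedDeviceAlloc(size_t size, int gpu_id) {
  void* ptr = nullptr;
  // Allocates and records `size` bytes against `gpu_id` in one call.
  auto result = paddle::platform::RecordedGpuMalloc(&ptr, size, gpu_id);
  if (result == gpuSuccess) return ptr;
  // On failure, query how much memory the device (and the per-device limit,
  // if any) actually has left, mirroring GPUAllocator::Alloc above.
  size_t avail = 0, total = 0, actual_avail = 0, actual_total = 0;
  paddle::platform::RecordedGpuMemGetInfo(&avail, &total, &actual_avail,
                                          &actual_total, gpu_id);
  return nullptr;
}

void TrackedDeviceFree(void* ptr, size_t size, int gpu_id) {
  // Keeps the recorded usage for `gpu_id` in sync with the release.
  paddle::platform::RecordedGpuFree(ptr, size, gpu_id);
}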
*/ #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/allocator.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#endif DECLARE_bool(use_pinned_memory); @@ -77,11 +80,7 @@ TEST(GPUAllocator, AllocFailure) { allocator.Alloc(&index, alloc_size); ASSERT_TRUE(false); } catch (paddle::memory::allocation::BadAlloc&) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError()); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError()); -#endif + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::GpuGetLastError()); } } #endif diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 574b152054399..fe38200efa8e2 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -33,6 +33,32 @@ void Copy(platform::CPUPlace, void* dst, VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num; std::memcpy(dst, src, num); } +#ifdef PADDLE_WITH_IPU +template <> +void Copy(platform::IPUPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, size_t num) { + if (UNLIKELY(num == 0)) return; + std::memcpy(dst, src, num); +} +template <> +void Copy(platform::CPUPlace dst_place, + void* dst, + platform::IPUPlace src_place, + const void* src, size_t num) { + if (UNLIKELY(num == 0)) return; + std::memcpy(dst, src, num); +} +template <> +void Copy(platform::IPUPlace dst_place, + void* dst, + platform::IPUPlace src_place, + const void* src, size_t num) { + if (UNLIKELY(num == 0)) return; + std::memcpy(dst, src, num); +} +#endif #ifdef PADDLE_WITH_XPU template <> diff --git a/paddle/fluid/memory/memcpy.h b/paddle/fluid/memory/memcpy.h index c630437224cd0..7d2d2526ab124 100644 --- a/paddle/fluid/memory/memcpy.h +++ b/paddle/fluid/memory/memcpy.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/place.h" namespace paddle { diff --git a/paddle/fluid/memory/pinned_memory_test.cu b/paddle/fluid/memory/pinned_memory_test.cu index 76a880755e21b..837c964e2ad32 100644 --- a/paddle/fluid/memory/pinned_memory_test.cu +++ b/paddle/fluid/memory/pinned_memory_test.cu @@ -19,7 +19,7 @@ limitations under the License. 
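The AllocFailure test above also collapses a backend #ifdef into a single call: PADDLE_ENFORCE_GPU_SUCCESS accepts the error codes of both runtimes, and platform::GpuGetLastError() hides the cudaGetLastError/hipGetLastError split. The before/after shape of such a call site, taken directly from that hunk:

// Before: one branch per backend at every call site.
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError());
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError());
#endif

// After: a single macro plus a platform helper that dispatches internally.
  PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::GpuGetLastError());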
*/ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/cpu_info.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/place.h" // This unit test is an example comparing the performance between using pinned diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu index 6a5818fd9603b..134c368d4340e 100644 --- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -25,8 +25,10 @@ #include #include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace memory { @@ -38,6 +40,14 @@ __global__ void add_kernel(int *x, int n) { } } +void CheckMemLeak(const platform::CUDAPlace &place) { + uint64_t cuda_malloc_size = + platform::RecordedGpuMallocSize(place.GetDeviceId()); + ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size + << " bytes memory that not released yet," + << " there may be a memory leak problem"; +} + class StreamSafeCUDAAllocTest : public ::testing::Test { protected: void SetUp() override { @@ -53,9 +63,9 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { for (size_t i = 1; i < stream_num_; ++i) { gpuStream_t stream; #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream)); #endif streams_.emplace_back(stream); } @@ -65,10 +75,10 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { std::shared_ptr allocation = AllocShared(place_, allocation_size, streams_[i]); #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemset(allocation->ptr(), 0, allocation->size())); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipMemset(allocation->ptr(), 0, allocation->size())); #endif allocations_.emplace_back(allocation); @@ -111,13 +121,13 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { // tricky code, the allocations are still accessible even though // allocations_.clear() has been called #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpy(host_x.get(), allocations_[i]->ptr(), data_num_ * sizeof(int), cudaMemcpyDeviceToHost)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( - hipMemcpy(host_x.get(), allocations_[i]->ptr(), - data_num_ * sizeof(int), hipMemcpyDeviceToHost)); + PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(host_x.get(), allocations_[i]->ptr(), + data_num_ * sizeof(int), + hipMemcpyDeviceToHost)); #endif for (int j = 0; j < data_num_; ++j) { EXPECT_TRUE(host_x[j] == (j % thread_num) * stream_num_); @@ -127,9 +137,9 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { void TearDown() override { #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); #else - PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif for (gpuStream_t stream : streams_) { Release(place_, stream); @@ -137,17 +147,13 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { for (size_t 
i = 1; i < stream_num_; ++i) { #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(streams_[i])); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i])); #else - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(streams_[i])); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i])); #endif } - uint64_t cuda_malloc_size = - platform::RecordedCudaMallocSize(place_.GetDeviceId()); - ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size - << " bytes memory that not released yet," - << " there may be a memory leak problem"; + CheckMemLeak(place_); } size_t stream_num_; @@ -186,17 +192,70 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { Alloc(place, alloc_size, default_stream); EXPECT_GE(allocation_unique->size(), alloc_size); EXPECT_EQ(allocation_unique->ptr(), address); + allocation_unique.reset(); + + Release(place); + CheckMemLeak(place); } +TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) { + platform::CUDAPlace place = platform::CUDAPlace(); + auto &instance = allocation::AllocatorFacade::Instance(); + const std::shared_ptr &allocator = instance.GetAllocator(place); + + size_t alloc_size = 256; + std::shared_ptr allocation_from_allocator = + allocator->Allocate(alloc_size); + EXPECT_GE(allocation_from_allocator->size(), alloc_size); + void *address = allocation_from_allocator->ptr(); + allocation_from_allocator.reset(); + + std::shared_ptr allocation_implicit_stream = + AllocShared(place, alloc_size); + EXPECT_GE(allocation_implicit_stream->size(), alloc_size); + EXPECT_EQ(allocation_implicit_stream->ptr(), address); + allocation_implicit_stream.reset(); + + Release(place); + CheckMemLeak(place); +} + +#ifdef PADDLE_WITH_CUDA +TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) { + platform::CUDAPlace place = platform::CUDAPlace(); + size_t alloc_size = 1; + std::shared_ptr allocation = AllocShared(place, alloc_size); + + platform::BeginCUDAGraphCapture(place, cudaStreamCaptureModeGlobal); + EXPECT_THROW(AllocShared(place, alloc_size), paddle::platform::EnforceNotMet); + EXPECT_THROW(Alloc(place, alloc_size), paddle::platform::EnforceNotMet); + EXPECT_THROW(Release(place), paddle::platform::EnforceNotMet); + EXPECT_THROW(allocation::AllocatorFacade::Instance().GetAllocator(place), + paddle::platform::EnforceNotMet); + EXPECT_THROW(AllocShared(place, alloc_size, nullptr), + paddle::platform::EnforceNotMet); + EXPECT_THROW(Alloc(place, alloc_size, nullptr), + paddle::platform::EnforceNotMet); + EXPECT_THROW(Release(place, nullptr), paddle::platform::EnforceNotMet); + EXPECT_THROW(RecordStream(allocation.get(), nullptr), + paddle::platform::EnforceNotMet); + platform::EndCUDAGraphCapture(); + + allocation.reset(); + Release(place); + CheckMemLeak(place); +} +#endif + TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { platform::CUDAPlace place = platform::CUDAPlace(); gpuStream_t stream1, stream2; #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream1)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream2)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream1)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream2)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream1)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream2)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream1)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream2)); #endif size_t available_size = platform::GpuAvailableMemToAlloc(); // alloc_size < available_size < 2 * alloc_size @@ -216,13 +275,14 @@ 
TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { allocation2.reset(); #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); #else - PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif Release(place, stream1); Release(place, stream2); + CheckMemLeak(place); } } // namespace memory diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index a8f35d61f3c4c..f0621af9bbda5 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -169,8 +169,10 @@ endif() if (WITH_CINN) op_library(cinn_launch_op SRCS cinn_launch_op.cc cinn_launch_op.cu.cc DEPS transform_desc cinn_compiler cinn ${OP_HEADER_DEPS}) - cc_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op elementwise_add_op) - set_tests_properties(cinn_launch_op_test PROPERTIES ENVIRONMENT OMP_NUM_THREADS=1) + if (WITH_TESTING) + cc_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op elementwise_add_op) + set_tests_properties(cinn_launch_op_test PROPERTIES ENVIRONMENT OMP_NUM_THREADS=1) + endif() endif() # FIXME(typhoonzero): operator deps may not needed. @@ -203,6 +205,7 @@ elseif(WITH_ROCM) else() cc_test(test_leaky_relu_grad_grad_functor SRCS test_leaky_relu_grad_grad_functor.cc DEPS tensor device_context eigen3) endif() +cc_test(share_buffer_op_cpp_test SRCS share_buffer_op_test.cc DEPS lod_tensor device_context share_buffer_op) cc_library(tensor_formatter SRCS tensor_formatter.cc DEPS ${OP_HEADER_DEPS}) if (WITH_PYTHON) diff --git a/paddle/fluid/operators/activation_cudnn.cu.cc b/paddle/fluid/operators/activation_cudnn.cu.cc index 38499783eb492..2ad92e36272b3 100644 --- a/paddle/fluid/operators/activation_cudnn.cu.cc +++ b/paddle/fluid/operators/activation_cudnn.cu.cc @@ -14,11 +14,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_desc.h" -#else -#include "paddle/fluid/platform/cudnn_desc.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index b197d3511f96b..2776fe9c13132 100644 --- a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -14,11 +14,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_desc.h" -#else -#include "paddle/fluid/platform/cudnn_desc.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace platform { @@ -64,13 +60,13 @@ struct CudnnActivationFunctor { x_desc.set(x); out_desc.set(GET_DATA_SAFELY(out, "Output", "Out", "CudnnActivation")); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenActivationForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenActivationForward( ctx_.cudnn_handle(), act_desc.desc(), platform::CudnnDataType::kOne(), x_desc.desc(), x.data(), platform::CudnnDataType::kZero(), out_desc.desc(), out->mutable_data(ctx_.GetPlace()))); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnActivationForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnActivationForward( ctx_.cudnn_handle(), 
act_desc.desc(), platform::CudnnDataType::kOne(), x_desc.desc(), x.data(), platform::CudnnDataType::kZero(), out_desc.desc(), @@ -108,14 +104,14 @@ struct CudnnActivationGradFunctor { dout_desc.set(dout); dx_desc.set(GET_DATA_SAFELY(dx, "Output", "X@GRAD", "CudnnActivationGrad")); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenActivationBackward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenActivationBackward( ctx_.cudnn_handle(), act_desc.desc(), platform::CudnnDataType::kOne(), out_desc.desc(), out.data(), dout_desc.desc(), dout.data(), x_desc.desc(), x.data(), platform::CudnnDataType::kZero(), dx_desc.desc(), dx->mutable_data(ctx_.GetPlace()))); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnActivationBackward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnActivationBackward( ctx_.cudnn_handle(), act_desc.desc(), platform::CudnnDataType::kOne(), out_desc.desc(), out.data(), dout_desc.desc(), dout.data(), x_desc.desc(), x.data(), diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 0294bfd5b05d5..07cf516c476e8 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/operators/math/math_cuda_utils.h" #include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index 5fa1e18553bd5..cf4041f721af2 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -23,7 +23,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc index b8ce52387b959..31801b14564d3 100644 --- a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc +++ b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ // HIP not support cudnnSpatialTfGridGeneratorForward #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -108,7 +108,7 @@ class CUDNNAffineGridGradOpKernel : public framework::OpKernel { const T* output_grad_data = output_grad->data(); T* theta_grad_data = theta_grad->mutable_data(ctx.GetPlace()); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnSpatialTfGridGeneratorBackward( handle, cudnn_st_desc, output_grad_data, theta_grad_data)); } diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index 3125e005174de..d1da11028c05c 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -18,12 +18,7 @@ limitations under the License. 
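Beyond the macro rename, these operator hunks mostly swap per-backend headers for the consolidated ones under platform/device/gpu/. The mapping below is read off the includes changed in the hunks above, collected for reference (other headers may move elsewhere in the series):

// cudnn_desc.h / miopen_desc.h, cudnn_helper.h / miopen_helper.h
//     -> paddle/fluid/platform/device/gpu/gpu_dnn.h
// cuda_device_function.h -> paddle/fluid/platform/device/gpu/gpu_device_function.h
// cuda_primitives.h      -> paddle/fluid/platform/device/gpu/gpu_primitives.h
// gpu_info.h             -> paddle/fluid/platform/device/gpu/gpu_info.h
//
// A kernel that previously carried an #ifdef PADDLE_WITH_HIP include branch
// can now include the single wrapper header:
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"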
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/affine_grid_op.cu b/paddle/fluid/operators/affine_grid_op.cu index 58b56bdcf5614..bcf7deefc98f0 100644 --- a/paddle/fluid/operators/affine_grid_op.cu +++ b/paddle/fluid/operators/affine_grid_op.cu @@ -14,9 +14,9 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/affine_grid_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/angle_op.cc b/paddle/fluid/operators/angle_op.cc new file mode 100644 index 0000000000000..3cb0148681496 --- /dev/null +++ b/paddle/fluid/operators/angle_op.cc @@ -0,0 +1,125 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/angle_op.h" + +#include +#include +#include +#include +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +class AngleOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "angle"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "angle"); + + auto in_dims = ctx->GetInputDim("X"); + + ctx->SetOutputDim("Out", in_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class AngleOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of angle op."); + AddOutput("Out", "(Tensor), The output tensor of angle op."); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("use_cudnn", + "(bool, default false) Only used in cudnn kernel, need " + "install cudnn") + .SetDefault(false); + AddComment(R"DOC( +Angle Operator. + +This operator is used to perform elementwise angle for input $X$. 
+$$out = angle(x)$$ + +)DOC"); + } +}; + +class AngleGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@Grad", "angle_grad"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "Out@Grad", "angle_grad"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + "X@Grad", "angle_grad"); + + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(framework::GradVarName("X"), dout_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(dtype, ctx.GetPlace()); + } +}; + +template +class AngleGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr retv) const override { + retv->SetType("angle_grad"); + retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + retv->SetInput("X", this->Input("X")); + retv->SetAttrMap(this->Attrs()); + retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(angle, ops::AngleOp, ops::AngleOpMaker, + ops::AngleGradMaker, + ops::AngleGradMaker); + +REGISTER_OP_CPU_KERNEL( + angle, ops::AngleKernel, + ops::AngleKernel, + ops::AngleKernel>, + ops::AngleKernel>); + +REGISTER_OPERATOR(angle_grad, ops::AngleGradOp); + +REGISTER_OP_CPU_KERNEL( + angle_grad, ops::AngleGradKernel, + ops::AngleGradKernel, + ops::AngleGradKernel>, + ops::AngleGradKernel>); diff --git a/paddle/fluid/operators/angle_op.cu b/paddle/fluid/operators/angle_op.cu new file mode 100644 index 0000000000000..3264f426a77d1 --- /dev/null +++ b/paddle/fluid/operators/angle_op.cu @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/angle_op.h" +#include "paddle/fluid/platform/complex.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + angle, ops::AngleKernel, + ops::AngleKernel, + ops::AngleKernel>, + ops::AngleKernel>); + +REGISTER_OP_CUDA_KERNEL( + angle_grad, ops::AngleGradKernel, + ops::AngleGradKernel, + ops::AngleGradKernel>, + ops::AngleGradKernel>); diff --git a/paddle/fluid/operators/angle_op.h b/paddle/fluid/operators/angle_op.h new file mode 100644 index 0000000000000..093a04f03df95 --- /dev/null +++ b/paddle/fluid/operators/angle_op.h @@ -0,0 +1,147 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES +#endif +#include +#include "paddle/fluid/operators/math/complex_functors.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +namespace math { +template +struct AngleFunctor; + +// angel function for complex +template +struct AngleFunctor>> { + AngleFunctor(const T* input, Real* output, int64_t numel) + : input_(input), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = arg(input_[idx]); + } + + const T* input_; + Real* output_; + int64_t numel_; +}; + +// angel function for real +template +struct AngleFunctor>> { + AngleFunctor(const T* input, T* output, int64_t numel) + : input_(input), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = input_[idx] < static_cast(0) ? M_PI : 0; + } + + const T* input_; + T* output_; + int64_t numel_; +}; + +template +struct AngleGradFunctor; + +// angle grad for complex +template +struct AngleGradFunctor>> { + AngleGradFunctor(const math::Real* dout, const T* x, T* dx, int64_t numel) + : dout_(dout), x_(x), dx_(dx), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + if (x_[idx] == T(0)) { + dx_[idx] = T(0); + } else { + const math::Real r_square = + x_[idx].real * x_[idx].real + x_[idx].imag * x_[idx].imag; + dx_[idx] = T(-dout_[idx] * x_[idx].imag / r_square, + dout_[idx] * x_[idx].real / r_square); + } + } + + const math::Real* dout_; + const T* x_; + T* dx_; + int64_t numel_; +}; + +// angle grad for real +template +struct AngleGradFunctor>> { + AngleGradFunctor(const math::Real* dout, const T* x, T* dx, int64_t numel) + : dout_(dout), x_(x), dx_(dx), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { dx_[idx] = 0; } + + const math::Real* dout_; + const T* x_; + T* dx_; + int64_t numel_; +}; +} // namespace math + +using Tensor = framework::Tensor; +template +class AngleKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + Tensor* out = context.Output("Out"); + + auto numel = x->numel(); + auto* x_data = x->data(); + auto* out_data = out->mutable_data>( + context.GetPlace(), size_t(x->numel() * sizeof(math::Real))); + + auto& dev_ctx = context.template device_context(); + platform::ForRange for_range(dev_ctx, numel); + math::AngleFunctor functor(x_data, out_data, numel); + for_range(functor); + } +}; + +template +class AngleGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + const framework::Tensor* d_out = + ctx.Input(framework::GradVarName("Out")); + const framework::Tensor* x = ctx.Input("X"); + framework::Tensor* d_x = + ctx.Output(framework::GradVarName("X")); + + auto numel = d_out->numel(); + auto* dout_data = d_out->data>(); + auto* x_data = x->data(); + auto* dx_data = d_x->mutable_data( + ctx.GetPlace(), 
static_cast(numel * sizeof(T))); + + auto& dev_ctx = ctx.template device_context(); + platform::ForRange for_range(dev_ctx, numel); + math::AngleGradFunctor functor(dout_data, x_data, dx_data, numel); + for_range(functor); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu index f50d5e619ebea..6236a07de4bc6 100644 --- a/paddle/fluid/operators/argsort_op.cu +++ b/paddle/fluid/operators/argsort_op.cu @@ -26,8 +26,8 @@ namespace cub = hipcub; #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/argsort_op.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #ifdef __HIPCC__ namespace rocprim { @@ -169,7 +169,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, cu_stream); } - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(err); Tensor temp_storage; temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); @@ -188,7 +188,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, cu_stream); } - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(err); } template diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu index 2796a6b2239b9..3bffe0a05a8f7 100644 --- a/paddle/fluid/operators/average_accumulates_op.cu +++ b/paddle/fluid/operators/average_accumulates_op.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/average_accumulates_op.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/batch_fc_op.cu b/paddle/fluid/operators/batch_fc_op.cu index b686c766e0f8b..c326929a14680 100644 --- a/paddle/fluid/operators/batch_fc_op.cu +++ b/paddle/fluid/operators/batch_fc_op.cu @@ -16,8 +16,8 @@ limitations under the License. 
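The new angle kernels above reduce to the two functors in angle_op.h: for complex inputs the output is the argument of each element, and for real inputs it is pi for negative values and 0 otherwise (with a zero gradient). A standalone sketch of those expected values using only the standard library, purely illustrative and independent of the Paddle kernels:

#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES  // needed for M_PI on some toolchains, as in angle_op.h
#endif
#include <cmath>
#include <complex>
#include <cstdio>

int main() {
  // Real input: matches the real-typed AngleFunctor above.
  const double xs[] = {-2.0, 0.0, 3.5};
  for (double x : xs) {
    double angle = (x < 0.0) ? M_PI : 0.0;
    std::printf("angle(%g) = %g\n", x, angle);
  }
  // Complex input: arg(z) == atan2(imag, real), which is what the complex
  // AngleFunctor computes element-wise.
  const std::complex<double> z(1.0, 1.0);
  std::printf("angle(1+1i) = %g (pi/4 = %g)\n", std::arg(z), M_PI / 4.0);
  return 0;
}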
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/batch_fc_op.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index b4cf9c48df2a8..e3dc54e17cd7f 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -197,18 +197,18 @@ class BatchNormKernel // miopenTensorDescriptor_t bn_param_desc_; // miopenBatchNormMode_t mode_; -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; cudnnBatchNormMode_t mode_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); #endif @@ -251,23 +251,22 @@ class BatchNormKernel #ifdef PADDLE_WITH_HIP // TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( // data_desc_, CudnnDataType::type, // x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), // const_cast(strides.data()))); // Note: PERSISTENT not implemented for inference -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDeriveBNTensorDescriptor( // bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); // Note: PERSISTENT not implemented for inference - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, - test_mode ? CUDNN_BATCHNORM_SPATIAL : mode_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, + test_mode ? 
CUDNN_BATCHNORM_SPATIAL : mode_)); #endif const auto *scale = ctx.Input("Scale"); @@ -341,7 +340,7 @@ class BatchNormKernel } // TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenBatchNormalizationForwardInference( // handle, miopenBNSpatial, // const_cast( @@ -364,7 +363,7 @@ class BatchNormKernel // est_var->template data>())), // epsilon)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardInference( handle, // Note: PERSISTENT not implemented for inference @@ -426,7 +425,7 @@ class BatchNormKernel "The argument ReserveSpace of batch_norm op is not found.")); // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( /*handle=*/handle, @@ -440,7 +439,7 @@ class BatchNormKernel /*sizeInBytes=*/&workspace_size)); // -------------- cudnn batchnorm reserve space -------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnGetBatchNormalizationTrainingExReserveSpaceSize( /*handle=*/handle, @@ -454,7 +453,7 @@ class BatchNormKernel ctx.GetPlace(), transformed_x.type(), reserve_space_size); workspace_ptr = workspace_tensor.mutable_data( ctx.GetPlace(), transformed_x.type(), workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTrainingEx( handle, mode_, CUDNN_BATCHNORM_OPS_BN, CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, @@ -508,7 +507,7 @@ class BatchNormKernel } // TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenBatchNormalizationForwardTraining( // handle, mode_, const_cast(static_cast( // CudnnDataType::kOne())), @@ -537,7 +536,7 @@ class BatchNormKernel // static_cast(saved_variance->template mutable_data< // BatchNormParamType>(ctx.GetPlace())))); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTraining( handle, mode_, CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, @@ -568,15 +567,15 @@ class BatchNormKernel #ifdef PADDLE_WITH_HIP // TODO(wangran16): wait for MIOpen to improve the performance of BN // clean when exit. -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. 
- PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); #endif } @@ -981,18 +980,18 @@ class BatchNormGradKernel // miopenTensorDescriptor_t bn_param_desc_; // miopenBatchNormMode_t mode_; -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; cudnnBatchNormMode_t mode_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); #endif if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { @@ -1022,18 +1021,18 @@ class BatchNormGradKernel #ifdef PADDLE_WITH_HIP // TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( // data_desc_, CudnnDataType::type, // x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), // const_cast(strides.data()))); -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, // data_desc_, mode_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? 
x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, data_desc_, mode_)); #endif @@ -1063,7 +1062,7 @@ class BatchNormGradKernel Tensor workspace_tensor; auto reserve_space_size = reserve_space->memory_size(); // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnGetBatchNormalizationBackwardExWorkspaceSize( /*handle=*/dev_ctx.cudnn_handle(), @@ -1081,7 +1080,7 @@ class BatchNormGradKernel workspace_ptr = workspace_tensor.mutable_data( ctx.GetPlace(), transformed_x.type(), workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationBackwardEx( /*handle=*/dev_ctx.cudnn_handle(), /*mode=*/mode_, @@ -1151,7 +1150,7 @@ class BatchNormGradKernel } // TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenBatchNormalizationBackward( // dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), // CudnnDataType::kZero(), CudnnDataType::kOne(), @@ -1166,7 +1165,7 @@ class BatchNormGradKernel // ctx.GetPlace()), // epsilon, saved_mean_data, saved_var_data)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationBackward( dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), CudnnDataType::kZero(), CudnnDataType::kOne(), @@ -1231,15 +1230,15 @@ class BatchNormGradKernel #ifdef PADDLE_WITH_HIP // TODO(wangran16): wait for MIOpen to improve the performance of BN // clean when exit. -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); #endif } else { diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu index 8bd2b7fe2d127..73f73a81c088e 100644 --- a/paddle/fluid/operators/bce_loss_op.cu +++ b/paddle/fluid/operators/bce_loss_op.cu @@ -14,8 +14,8 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/bce_loss_op.h" #include "paddle/fluid/operators/math.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/bilateral_slice_op.cu b/paddle/fluid/operators/bilateral_slice_op.cu index 3c64ed1acc847..3fd8995745acb 100644 --- a/paddle/fluid/operators/bilateral_slice_op.cu +++ b/paddle/fluid/operators/bilateral_slice_op.cu @@ -12,8 +12,8 @@ #include #include #include "paddle/fluid/operators/bilateral_slice_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/bincount_op.cu b/paddle/fluid/operators/bincount_op.cu index 757f728629106..34facf1ea1fa9 100644 --- a/paddle/fluid/operators/bincount_op.cu +++ b/paddle/fluid/operators/bincount_op.cu @@ -14,8 +14,8 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/bincount_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index bb4246e3e9b84..6b393b5666bb2 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -14,8 +14,8 @@ limitations under the License. */ #include "paddle/fluid/operators/cast_op.h" #include "paddle/fluid/platform/aligned_vector.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/gpu_launch_config.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/center_loss_op.cu b/paddle/fluid/operators/center_loss_op.cu index f15d1fe5e02ac..549bb5ae75aff 100644 --- a/paddle/fluid/operators/center_loss_op.cu +++ b/paddle/fluid/operators/center_loss_op.cu @@ -14,8 +14,8 @@ limitations under the License. 
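In the batch_norm hunks the only functional change is the enforce-macro rename; the cuDNN descriptor lifecycle they wrap is unchanged. A compressed sketch of that lifecycle with the new macro, assuming the dynload signatures used above (the descriptor name below is illustrative):

cudnnTensorDescriptor_t data_desc;
PADDLE_ENFORCE_GPU_SUCCESS(
    platform::dynload::cudnnCreateTensorDescriptor(&data_desc));
// ... cudnnSetTensorNdDescriptor / cudnnDeriveBNTensorDescriptor and the
// forward/backward batch-norm calls follow, each wrapped the same way ...
PADDLE_ENFORCE_GPU_SUCCESS(
    platform::dynload::cudnnDestroyTensorDescriptor(data_desc));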
*/ #include #include "paddle/fluid/operators/center_loss_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/cholesky_op.cu b/paddle/fluid/operators/cholesky_op.cu index 4426057305249..0bfddf8b5f386 100644 --- a/paddle/fluid/operators/cholesky_op.cu +++ b/paddle/fluid/operators/cholesky_op.cu @@ -131,27 +131,26 @@ class CholeskyGPUKernel : public framework::OpKernel { int lda, int* info) const { \ auto handle = dev_ctx.cusolver_dn_handle(); \ int workspace_size = 0; \ - PADDLE_ENFORCE_CUDA_SUCCESS( \ + PADDLE_ENFORCE_GPU_SUCCESS( \ platform::dynload::cusolverDn##C##potrf_bufferSize( \ handle, uplo, n, A, lda, &workspace_size)); \ auto workspace = memory::Alloc(dev_ctx, workspace_size); \ T* workspace_ptr = reinterpret_cast(workspace->ptr()); \ - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDn##C##potrf( \ + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDn##C##potrf( \ handle, uplo, n, A, lda, workspace_ptr, workspace_size, info)); \ } FUNC_WITH_TYPES(POTRF_INSTANCE); #if CUDA_VERSION >= 9020 && !defined(_WIN32) -#define POTRF_BATCH_INSTANCE(T, C) \ - template <> \ - void CholeskyGPUKernel::PotrfBatched( \ - const platform::CUDADeviceContext& dev_ctx, cublasFillMode_t uplo, \ - int n, T* Aarray[], int lda, int* info_array, int batch_size) const { \ - auto handle = dev_ctx.cusolver_dn_handle(); \ - PADDLE_ENFORCE_CUDA_SUCCESS( \ - platform::dynload::cusolverDn##C##potrfBatched( \ - handle, uplo, n, Aarray, lda, info_array, batch_size)); \ +#define POTRF_BATCH_INSTANCE(T, C) \ + template <> \ + void CholeskyGPUKernel::PotrfBatched( \ + const platform::CUDADeviceContext& dev_ctx, cublasFillMode_t uplo, \ + int n, T* Aarray[], int lda, int* info_array, int batch_size) const { \ + auto handle = dev_ctx.cusolver_dn_handle(); \ + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDn##C##potrfBatched( \ + handle, uplo, n, Aarray, lda, info_array, batch_size)); \ } FUNC_WITH_TYPES(POTRF_BATCH_INSTANCE); diff --git a/paddle/fluid/operators/cinn_launch_op.cc b/paddle/fluid/operators/cinn_launch_op.cc index e70a51d880516..f0ad5b3c3bf99 100644 --- a/paddle/fluid/operators/cinn_launch_op.cc +++ b/paddle/fluid/operators/cinn_launch_op.cc @@ -13,7 +13,10 @@ // limitations under the License. 
#include "paddle/fluid/operators/cinn_launch_op.h" + +#include #include + #include "paddle/fluid/string/string_helper.h" DECLARE_bool(cudnn_deterministic); @@ -108,33 +111,9 @@ std::unordered_set CinnLaunchContext::GetInternalVariableNames() { return all_parameters; } -void CinnLaunchContext::MutableTensorData(const std::string& var_name, - const platform::Place& place, - LoDTensor* paddle_tensor, - bool is_internal_var) { - auto cinn_name = var_name; - if (!is_internal_var) { - PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, - platform::errors::InvalidArgument( - "Paddle variable(%s) not used by cinn", var_name)); - cinn_name = paddle2cinn_varmap_.at(var_name); - } - - auto cinn_tensor = GetCinnTensor(cinn_name); - // TODO(CtfGo): support mutable corresponding c++ type after CINN ready - VLOG(4) << "Only support float in cinn_launch op now."; - paddle_tensor->mutable_data( - framework::make_ddim(cinn_tensor->shape().data()), place); -} - void CinnLaunchContext::CheckTensorEquivalent(const std::string& paddle_name, const LoDTensor& paddle_tensor, const CinnTensor& cinn_tensor) { - PADDLE_ENFORCE_EQ( - paddle_tensor.IsInitialized(), true, - platform::errors::InvalidArgument( - "Tensor in variable(%s) is not initialized.", paddle_name)); - // check dimension auto cinn_dims = framework::make_ddim(cinn_tensor->shape().data()); PADDLE_ENFORCE_EQ(paddle_tensor.dims(), cinn_dims, @@ -147,27 +126,39 @@ void CinnLaunchContext::CheckTensorEquivalent(const std::string& paddle_name, } void CinnLaunchContext::AssignExternalVariable(const std::string& paddle_name, + const platform::Place& place, LoDTensor* paddle_tensor) { PADDLE_ENFORCE_EQ(IsVariableUsed(paddle_name), true, platform::errors::InvalidArgument( "Paddle variable(%s) not used by cinn", paddle_name)); const auto& cinn_name = paddle2cinn_varmap_.at(paddle_name); - CheckTensorEquivalent(paddle_name, *paddle_tensor, GetCinnTensor(cinn_name)); - return SetArgument(cinn_name, paddle_tensor); + CinnTensor cinn_tensor = GetCinnTensor(cinn_name); + if (!paddle_tensor->IsInitialized()) { + paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data())); + } + CheckTensorEquivalent(paddle_name, *paddle_tensor, cinn_tensor); + return SetArgument(cinn_name, place, /* free_mem_callback = */ false, + paddle_tensor); } void CinnLaunchContext::AssignInternalVariable(const std::string& cinn_name, + const platform::Place& place, LoDTensor* paddle_tensor) { PADDLE_ENFORCE_GT(cinn_variable_names_.count(cinn_name), 0, platform::errors::InvalidArgument( "Variable(%s) not found in cinn socpe.", cinn_name)); - CheckTensorEquivalent(cinn_name, *paddle_tensor, GetCinnTensor(cinn_name)); - return SetArgument(cinn_name, paddle_tensor); + CinnTensor cinn_tensor = GetCinnTensor(cinn_name); + if (!paddle_tensor->IsInitialized()) { + paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data())); + } + CheckTensorEquivalent(cinn_name, *paddle_tensor, cinn_tensor); + return SetArgument(cinn_name, place, /* free_mem_callback = */ true, + paddle_tensor); } std::unique_ptr CinnLaunchContext::ShareTensorWithCinnBuffer( - LoDTensor* tensor) { + const platform::Place& place, bool free_mem_callback, LoDTensor* tensor) { // convert paddle dimensions array to cinn format std::vector cinn_dims(tensor->dims().size()); for (auto i = 0; i < tensor->dims().size(); ++i) { @@ -177,19 +168,42 @@ std::unique_ptr CinnLaunchContext::ShareTensorWithCinnBuffer( auto cinn_buffer = std::make_unique(); // assign size and memory cinn_buffer->resize(cinn_dims.data(), 
cinn_dims.size()); - cinn_buffer->memory = reinterpret_cast(tensor->data()); + + cinn_buffer->external_malloc = new std::function( + [place, tensor](void* ctx, cinn_buffer_t* buffer) { + buffer->memory = + reinterpret_cast(tensor->mutable_data(place)); + return 0; + }); + + if (free_mem_callback) { + cinn_buffer->external_free = new std::function( + [tensor](void* ctx, cinn_buffer_t* buffer) { + tensor->clear(); + return 0; + }); + return cinn_buffer; + } + + cinn_buffer->external_free = new std::function( + [](void* ctx, cinn_buffer_t* buffer) { + // Do nothing + return 0; + }); return cinn_buffer; } void CinnLaunchContext::SetArgument(const std::string& cinn_name, + const platform::Place& place, + bool free_mem_callback, LoDTensor* paddle_tensor) { - auto buffer = ShareTensorWithCinnBuffer(paddle_tensor); + auto buffer = + ShareTensorWithCinnBuffer(place, free_mem_callback, paddle_tensor); name2argument_.emplace(cinn_name, buffer.get()); hold_buffers_.emplace_back(std::move(buffer)); VLOG(4) << "SetArgument-" << name2argument_.size() << ": " - << "name(" << cinn_name << "), " - << "type(" << framework::DataTypeToString(paddle_tensor->type()) - << "), dims(" << paddle_tensor->dims() << ")."; + << "name(" << cinn_name << "), dims(" << paddle_tensor->dims() + << ")."; } const std::map& diff --git a/paddle/fluid/operators/cinn_launch_op.cu.cc b/paddle/fluid/operators/cinn_launch_op.cu.cc index d557cfc7c0892..fae2d6ddb487d 100644 --- a/paddle/fluid/operators/cinn_launch_op.cu.cc +++ b/paddle/fluid/operators/cinn_launch_op.cu.cc @@ -18,9 +18,9 @@ limitations under the License. */ #include "cinn/runtime/cinn_runtime.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/type_defs.h" #ifdef PADDLE_WITH_CUDA #include @@ -45,9 +45,9 @@ void CUDART_CB ReleaseBuffers(void* data) { template <> void ReleaseResource( const std::vector& resources, void* stream) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaLaunchHostFunc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc( static_cast(stream), ReleaseScope, resources[0])); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaLaunchHostFunc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc( static_cast(stream), ReleaseBuffers, resources[1])); } diff --git a/paddle/fluid/operators/cinn_launch_op.h b/paddle/fluid/operators/cinn_launch_op.h index 53e6ff0d61387..2b1bf89197dff 100644 --- a/paddle/fluid/operators/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn_launch_op.h @@ -49,16 +49,13 @@ class CinnLaunchContext { // Return whether a Paddle variable used on compiled kernels bool IsVariableUsed(const std::string& var_name); - // Allocate buffer to a Paddle tensor with assginment information from CINN - void MutableTensorData(const std::string& var_name, - const platform::Place& place, LoDTensor* paddle_tensor, - bool is_internal_var = false); - // Assign tensor buffer to input or output variables - void AssignExternalVariable(const std::string& var_name, LoDTensor* tensor); + void AssignExternalVariable(const std::string& var_name, + const platform::Place& place, LoDTensor* tensor); // Assign tensor buffer to internal variables - void AssignInternalVariable(const std::string& var_name, LoDTensor* tensor); + void AssignInternalVariable(const std::string& var_name, + const platform::Place& place, LoDTensor* tensor); // Extract internal variable names from CinnScope // by 
excluding used input and output variables @@ -83,10 +80,12 @@ class CinnLaunchContext { // Share the buffer of a Paddle tensor to CINN by delivering memory address // to a cinn_buffer_t object - std::unique_ptr ShareTensorWithCinnBuffer(LoDTensor* tensor); + std::unique_ptr ShareTensorWithCinnBuffer( + const platform::Place& place, bool free_mem_callback, LoDTensor* tensor); // Set an argument with (cinn name)->(paddle tensor) pair - void SetArgument(const std::string& cinn_name, LoDTensor* paddle_tensor); + void SetArgument(const std::string& cinn_name, const platform::Place& place, + bool free_mem_callback, LoDTensor* paddle_tensor); private: // a variable name map from paddle to cinn @@ -198,7 +197,7 @@ class CinnLaunchOpKernel : public framework::OpKernel { } launch_context->AssignExternalVariable( - var_name, scope.GetVar(var_name)->GetMutable()); + var_name, place, scope.GetVar(var_name)->GetMutable()); } // 3.2 Prepare output variables: all output variables should @@ -215,11 +214,7 @@ class CinnLaunchOpKernel : public framework::OpKernel { "Output variable(%s) not used by cinn", var_name)); auto* tensor = scope.GetVar(var_name)->GetMutable(); - if (!tensor->IsInitialized()) { - launch_context->MutableTensorData(var_name, place, tensor); - } - launch_context->AssignExternalVariable( - var_name, scope.GetVar(var_name)->GetMutable()); + launch_context->AssignExternalVariable(var_name, place, tensor); } // 3.3 Prepare internal or temporary variables: Create a temporary @@ -232,8 +227,7 @@ class CinnLaunchOpKernel : public framework::OpKernel { framework::Scope* temp_scope = scope.NewTmpScope().release(); for (const auto& var_name : internal_variable_names) { auto* tensor = temp_scope->Var(var_name)->GetMutable(); - launch_context->MutableTensorData(var_name, place, tensor, true); - launch_context->AssignInternalVariable(var_name, tensor); + launch_context->AssignInternalVariable(var_name, place, tensor); } // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. 
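The cinn_launch_op changes above replace the eager MutableTensorData path with lazily bound buffers: AssignExternalVariable/AssignInternalVariable now take the target place, resize uninitialized tensors from the CINN shape, and install external_malloc/external_free callbacks so memory is materialized only when the CINN runtime touches the buffer, and temporaries are released afterwards. A minimal sketch of that binding, following the lambdas added in ShareTensorWithCinnBuffer; the std::function template arguments and the float element type are reconstructed here (this view of the patch drops template brackets), so treat them as assumptions:

auto cinn_buffer = std::make_unique<cinn_buffer_t>();
cinn_buffer->resize(cinn_dims.data(), cinn_dims.size());

// Allocation is deferred until the CINN runtime invokes the callback.
cinn_buffer->external_malloc = new std::function<int(void*, cinn_buffer_t*)>(
    [place, tensor](void* ctx, cinn_buffer_t* buffer) {
      buffer->memory =
          reinterpret_cast<uint8_t*>(tensor->mutable_data<float>(place));
      return 0;
    });

// Internal (temporary) variables also free their memory once the compiled
// kernel finishes; external inputs/outputs install a no-op here instead.
cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>(
    [tensor](void* ctx, cinn_buffer_t* buffer) {
      tensor->clear();
      return 0;
    });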
diff --git a/paddle/fluid/operators/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn_launch_op_test.cc index 5a07a49a5969a..5e0b87d06afea 100644 --- a/paddle/fluid/operators/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn_launch_op_test.cc @@ -222,30 +222,9 @@ TEST(CinnLaunchContextTest, TestGetInternalVariableNames) { auto launch_context = std::make_unique(GetDefaultCompiledObj()); auto internal_variable_names = launch_context->GetInternalVariableNames(); - ASSERT_EQ(internal_variable_names.size(), 1); - EXPECT_EQ(*internal_variable_names.begin(), "cinn_var2"); -} - -TEST(CinnLaunchContextTest, TestMutableTensorData) { - platform::CPUPlace place; - framework::Scope scope; - auto* tensor1 = scope.Var("var1")->GetMutable(); - auto* tensor2 = scope.Var("var2")->GetMutable(); - - auto launch_context = - std::make_unique(GetDefaultCompiledObj()); - // mutable_data on external variable - ASSERT_NO_THROW(launch_context->MutableTensorData("var1", place, tensor1)); - ASSERT_TRUE(tensor1->IsInitialized()); - ASSERT_EQ(tensor1->dims(), framework::make_ddim({3, 4})); - ASSERT_THROW(launch_context->MutableTensorData("not_exist", place, tensor1), - paddle::platform::EnforceNotMet); - - // mutable_data on internal variable - ASSERT_NO_THROW( - launch_context->MutableTensorData("cinn_var2", place, tensor2, true)); - ASSERT_TRUE(tensor2->IsInitialized()); - ASSERT_EQ(tensor2->dims(), framework::make_ddim({6, 7, 8})); + ASSERT_EQ(internal_variable_names.size(), 3); + EXPECT_NE(internal_variable_names.find("cinn_var2"), + internal_variable_names.end()); } TEST(CinnLaunchContextTest, TestCheckTensorEquivalent) { @@ -255,12 +234,9 @@ TEST(CinnLaunchContextTest, TestCheckTensorEquivalent) { framework::Scope scope; auto* tensor1 = scope.Var("var1")->GetMutable(); - // CheckTensorEquivalent: tensor is not initialized - ASSERT_THROW(launch_context->AssignExternalVariable("var1", tensor1), - paddle::platform::EnforceNotMet); // CheckTensorEquivalent: tensor dimension not equivalent tensor1->mutable_data(framework::make_ddim({3, 5}), place); - ASSERT_THROW(launch_context->AssignExternalVariable("var1", tensor1), + ASSERT_THROW(launch_context->AssignExternalVariable("var1", place, tensor1), paddle::platform::EnforceNotMet); } @@ -272,11 +248,12 @@ TEST(CinnLaunchContextTest, TestAssignVariablePreCondition) { auto* tensor4 = scope.Var("var4")->GetMutable(); // not used - ASSERT_THROW(launch_context->AssignExternalVariable("var4", tensor4), + ASSERT_THROW(launch_context->AssignExternalVariable("var4", place, tensor4), paddle::platform::EnforceNotMet); // not found - ASSERT_THROW(launch_context->AssignExternalVariable("cinn_var4", tensor4), - paddle::platform::EnforceNotMet); + ASSERT_THROW( + launch_context->AssignExternalVariable("cinn_var4", place, tensor4), + paddle::platform::EnforceNotMet); } TEST(CinnLaunchContextTest, TestSetArgument) { @@ -286,22 +263,25 @@ TEST(CinnLaunchContextTest, TestSetArgument) { platform::CPUPlace place; framework::Scope scope; auto* tensor1 = scope.Var("var1")->GetMutable(); - tensor1->mutable_data(framework::make_ddim({3, 4}), place); - auto* data1 = tensor1->data(); + float* data1 = + tensor1->mutable_data(framework::make_ddim({3, 4}), place); data1[0] = 9.99f; data1[10] = 19.99f; // assign external variable - ASSERT_NO_THROW(launch_context->AssignExternalVariable("var1", tensor1)); + ASSERT_NO_THROW( + launch_context->AssignExternalVariable("var1", place, tensor1)); auto* tensor2 = scope.Var("var2")->GetMutable(); tensor2->mutable_data(framework::make_ddim({6, 7, 8}), 
place); - ASSERT_NO_THROW(launch_context->AssignInternalVariable("cinn_var2", tensor2)); + ASSERT_NO_THROW( + launch_context->AssignInternalVariable("cinn_var2", place, tensor2)); // FinalizeArguments not missed check ASSERT_THROW(launch_context->FinalizeArguments(), paddle::platform::EnforceNotMet); auto* tensor3 = scope.Var("var3")->GetMutable(); tensor3->mutable_data(framework::make_ddim({10, 16}), place); - ASSERT_NO_THROW(launch_context->AssignExternalVariable("var3", tensor3)); + ASSERT_NO_THROW( + launch_context->AssignExternalVariable("var3", place, tensor3)); auto name2argument = launch_context->FinalizeArguments(); ASSERT_EQ(name2argument.size(), 3); @@ -310,6 +290,8 @@ TEST(CinnLaunchContextTest, TestSetArgument) { auto* cinn_buffer = static_cast(name2argument.at("cinn_var1")); + ASSERT_EQ(cinn_buffer->memory, nullptr); + cinn_buffer->external_malloc->operator()(nullptr, cinn_buffer); ASSERT_NE(cinn_buffer->memory, nullptr); ASSERT_EQ(cinn_buffer->num_elements(), 12); auto* shadow_data = reinterpret_cast(cinn_buffer->memory); diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index cfcfd04e6fc7c..29286be0dd6b2 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -30,7 +30,7 @@ namespace cub = hipcub; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -335,7 +335,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { static_cast( platform::DeviceContextPool::Instance().Get(ctx.GetPlace())) ->stream(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( num_classes_per_device_ptr, num_classes_per_device_ptr, num_classes_per_device.numel(), platform::ToNCCLDataType(num_classes_per_device.type()), ncclSum, @@ -346,13 +346,13 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { // step 2: Determine temporary device storage requirements int num_buffer_ele = std::max(batch_size, num_classes); size_t cub_sort_temp_store_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceRadixSort::SortPairs( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceRadixSort::SortPairs( nullptr, cub_sort_temp_store_size, nullptr, nullptr, nullptr, nullptr, num_buffer_ele, 0, sizeof(T) * 8, ctx.cuda_device_context().stream()))); size_t cub_sum_temp_store_size = 0; NotEqualToPreviousAdjacentIterator unique_counting_iter_temp(nullptr, 0); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( (cub::DeviceScan::InclusiveSum, T*>( nullptr, cub_sum_temp_store_size, unique_counting_iter_temp, @@ -360,7 +360,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { size_t cub_scan_temp_store_size = 0; ActualNumSampledFunctor actual_num_sampled_op_temp(num_samples); - PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceScan::InclusiveScan( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceScan::InclusiveScan( nullptr, cub_scan_temp_store_size, num_classes_per_device_ptr, num_classes_per_device_ptr, actual_num_sampled_op_temp, nranks + 1, ctx.cuda_device_context().stream()))); @@ -384,7 +384,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { void* cub_temp_storage_ptr = memory_buffer.cub_temp_storage_ptr(); // step 4: Calculate class interval among nranks - 
PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceScan::InclusiveSum( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceScan::InclusiveSum( cub_temp_storage_ptr, cub_temp_storage_bytes, num_classes_per_device_ptr, class_interval_ptr, nranks + 1, ctx.cuda_device_context().stream()))); @@ -415,13 +415,13 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { // step 7: sort class center by ascending, so that positive class center // always be sampled. - PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceRadixSort::SortPairs( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceRadixSort::SortPairs( cub_temp_storage_ptr, cub_temp_storage_bytes, cub_sort_keys_ptr, cub_sort_keys_out_ptr, cub_sort_values_ptr, cub_sort_values_out_ptr, num_classes, 0, sizeof(T) * 8, ctx.cuda_device_context().stream()))); // step 8: sort input label ascending - PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceRadixSort::SortPairs( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceRadixSort::SortPairs( cub_temp_storage_ptr, cub_temp_storage_bytes, label->data(), cub_sort_keys_out_ptr, cub_sort_values_ptr, cub_sort_keys_ptr, batch_size, 0, sizeof(T) * 8, ctx.cuda_device_context().stream()))); @@ -430,8 +430,8 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { // label NotEqualToPreviousAdjacentIterator unique_counting_iter( cub_sort_keys_out_ptr, 0); - PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceScan::InclusiveSum< - NotEqualToPreviousAdjacentIterator, T*>( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceScan::InclusiveSum< + NotEqualToPreviousAdjacentIterator, T*>( cub_temp_storage_ptr, cub_temp_storage_bytes, unique_counting_iter, cub_sort_values_ptr, batch_size, ctx.cuda_device_context().stream()))); @@ -445,13 +445,13 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { // Since maybe num_positive_class_center > num_samples, // we need to ensure all positive class center per device are sampled. ActualNumSampledFunctor actual_num_sampled_op(num_samples); - PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceScan::InclusiveScan( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceScan::InclusiveScan( cub_temp_storage_ptr, cub_temp_storage_bytes, bound_value_ptr, num_classes_per_device_ptr, actual_num_sampled_op, nranks + 1, ctx.cuda_device_context().stream()))); // step 12: Calculate actual sampled class interval among nranks - PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceScan::InclusiveSum( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceScan::InclusiveSum( cub_temp_storage_ptr, cub_temp_storage_bytes, num_classes_per_device_ptr, class_interval_ptr, nranks + 1, ctx.cuda_device_context().stream()))); diff --git a/paddle/fluid/operators/collective/allreduce_op.h b/paddle/fluid/operators/collective/allreduce_op.h index 157924f08546b..4e6d86d49e863 100644 --- a/paddle/fluid/operators/collective/allreduce_op.h +++ b/paddle/fluid/operators/collective/allreduce_op.h @@ -22,7 +22,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -69,15 +69,11 @@ class AllReduceOpKernel : public framework::OpKernel { red_type = ncclMin; break; } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, numel, static_cast(dtype), red_type, comm, stream)); if (ctx.Attr("sync_mode")) { -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/alltoall_op.cu.cc b/paddle/fluid/operators/collective/alltoall_op.cu.cc index 1bcb47fc686cf..02b10f17da5a3 100644 --- a/paddle/fluid/operators/collective/alltoall_op.cu.cc +++ b/paddle/fluid/operators/collective/alltoall_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -62,15 +62,15 @@ class AllToAllOpCUDAKernel : public framework::OpKernel { auto recv_buf = out->mutable_data(out_dims, place); size_t offset = 0; send_numel /= nranks; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (auto i = 0; i < nranks; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( send_buf + offset, send_numel, dtype, i, comm->comm(), stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( recv_buf + offset, send_numel, dtype, i, comm->comm(), stream)); offset += send_numel; } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); #else PADDLE_THROW( platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); diff --git a/paddle/fluid/operators/collective/barrier_op.cu.cc b/paddle/fluid/operators/collective/barrier_op.cu.cc index b8631b44f14ca..c9aef237699f3 100644 --- a/paddle/fluid/operators/collective/barrier_op.cu.cc +++ b/paddle/fluid/operators/collective/barrier_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -41,13 +41,9 @@ class BarrierOpCUDAKernel : public framework::OpKernel { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto stream = static_cast(dev_ctx)->stream(); ncclRedOp_t nccl_red_type = ncclSum; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream)); -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); #else PADDLE_THROW(platform::errors::Unavailable( "PaddlePaddle should compile with NCCL.")); diff --git a/paddle/fluid/operators/collective/broadcast_op.cu.cc b/paddle/fluid/operators/collective/broadcast_op.cu.cc index fa4d7ee4cce5d..daaaf8b7a2e41 100644 --- a/paddle/fluid/operators/collective/broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/broadcast_op.cu.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace ops = paddle::operators; @@ -54,7 +54,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel { auto comm = dev_ctx.nccl_comm(); auto stream = dev_ctx.stream(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( send_recv_buffer, static_cast(in->numel()), platform::ToNCCLDataType(in->type()), root_dev_id, comm, stream)); @@ -62,11 +62,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel { << " From " << root_dev_id << " to " << dev_id; if (ctx.Attr("sync_mode")) { -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc index 597e4321d66bd..f174473c049ec 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -56,7 +56,7 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel { stream = comm->stream(); } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( send_buff, recv_buff, send_numel, static_cast(dtype), comm->comm(), stream)); #else diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 6d569b454e691..714dc4e19f9b1 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -29,7 +29,7 @@ limitations under the License. */ #endif #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif #if defined(PADDLE_WITH_XPU_BKCL) @@ -386,7 +386,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { "Invalid reduce type: %d", red_type)); } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream)); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index b37bd250c1558..6deb837069761 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -46,7 +46,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { int root = ctx.Attr("root"); if (root == comm->rank()) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(const_cast(x->data())), numel, dtype, root, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " invoke Bcast. sent " @@ -59,7 +59,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { static_cast(out)); } } else { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclBcast(out->mutable_data(place), numel, dtype, root, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved " diff --git a/paddle/fluid/operators/collective/c_comm_init_all_op.cc b/paddle/fluid/operators/collective/c_comm_init_all_op.cc index 60a9b1ee44fcc..db9a8428e3d03 100644 --- a/paddle/fluid/operators/collective/c_comm_init_all_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_all_op.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/threadpool.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { diff --git a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc index aee10dcdc2732..f69fe8f1e3f1f 100644 --- a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc @@ -26,7 +26,7 @@ limitations under the License. */ // #include "paddle/fluid/operators/distributed/request_handler_impl.h" #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc index bfdc49c440aae..738ed16286131 100644 --- a/paddle/fluid/operators/collective/c_concat_op.cu.cc +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -19,7 +19,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -71,7 +71,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); stream = static_cast(dev_ctx)->stream(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( send_buff, recv_buff, send_numel, static_cast(dtype), comm->comm(), stream)); diff --git a/paddle/fluid/operators/collective/c_embedding_op.cu b/paddle/fluid/operators/collective/c_embedding_op.cu index 858ca79f85b0e..9b343b34a3e51 100644 --- a/paddle/fluid/operators/collective/c_embedding_op.cu +++ b/paddle/fluid/operators/collective/c_embedding_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/collective/c_embedding_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" namespace paddle { diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index 0a0a824b77586..d392beb3a4834 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -30,7 +30,7 @@ namespace operators { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void GenNCCLID(std::vector* nccl_ids) { for (size_t i = 0; i < nccl_ids->size(); ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclGetUniqueId(&(*nccl_ids)[i])); } } diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index 74f41bff9dc86..b950339bd22be 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -30,7 +30,7 @@ limitations under the License. 
*/ #endif #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif #if defined(PADDLE_WITH_XPU_BKCL) @@ -316,7 +316,7 @@ class CReduceOpCUDAKernel : public framework::OpKernel { "kRedMax, kRedMin, kRedProd.")); } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( sendbuff, recvbuff, numel, dtype, nccl_red_type, root, comm->comm(), stream)); #else diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc index 4d19ee42641f4..141fa760413b3 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -57,7 +57,7 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel { stream = comm->stream(); } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduceScatter( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter( send_buff, recv_buff, recv_numel, static_cast(dtype), ncclSum, comm->comm(), stream)); #else diff --git a/paddle/fluid/operators/collective/c_scatter_op.cu.cc b/paddle/fluid/operators/collective/c_scatter_op.cu.cc index 0c9dc2af14f39..4d4dc0c12af55 100644 --- a/paddle/fluid/operators/collective/c_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_scatter_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -66,7 +66,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel { framework::Tensor temp; auto out_ptr = temp.mutable_data(out_dims, place); if (root_id == comm->rank()) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(const_cast(x->data())), numel, dtype, root_id, comm->comm(), stream)); @@ -74,7 +74,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel { *platform::DeviceContextPool::Instance().Get(place), static_cast(&temp)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( out_ptr, numel, dtype, root_id, comm->comm(), stream)); } diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index 77db86e711111..6371d523cfa4a 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/softmax_impl.h" #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/string/string_helper.h" namespace paddle { @@ -119,7 +119,7 @@ class CSoftmaxWithCrossEntropyOpCUDAKernel : public framework::OpKernel { Eigen::DSizes along_axis(1); eigen_logits_max.device(*dev_ctx.eigen_device()) = eigen_logits.maximum(along_axis); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( logits_max_buff, logits_max_buff, logits_max.numel(), platform::ToNCCLDataType(logits_max.type()), ncclMax, comm->comm(), stream)); @@ -160,7 +160,7 @@ class CSoftmaxWithCrossEntropyOpCUDAKernel : public framework::OpKernel { } void* predict_logits_buff = predicted_logits.mutable_data(place); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( predict_logits_buff, predict_logits_buff, predicted_logits.numel(), platform::ToNCCLDataType(predicted_logits.type()), ncclSum, comm->comm(), stream)); @@ -178,7 +178,7 @@ class CSoftmaxWithCrossEntropyOpCUDAKernel : public framework::OpKernel { eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) = eigen_softmax.sum(along_axis); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(), platform::ToNCCLDataType(sum_exp_logits.type()), ncclSum, comm->comm(), stream)); diff --git a/paddle/fluid/operators/collective/c_split_op.cu b/paddle/fluid/operators/collective/c_split_op.cu index 034accbb480c7..a8c4eafede41b 100644 --- a/paddle/fluid/operators/collective/c_split_op.cu +++ b/paddle/fluid/operators/collective/c_split_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_split_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index 72faf4298cf60..72339bbd48752 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -55,11 +55,7 @@ class CSyncCalcStreamKernel : public framework::OpKernel { auto dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(dev_ctx->stream())); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream())); -#endif + platform::GpuStreamSync(dev_ctx->stream()); #elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32) auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc index 03894b24a913b..21bad096c2d49 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif #if defined(PADDLE_WITH_ASCEND_CL) @@ -67,11 +67,7 @@ class CSyncCommStreamKernel : public framework::OpKernel { auto stream = platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); #elif defined(PADDLE_WITH_ASCEND_CL) PADDLE_ENFORCE_EQ(is_npu_place(place), true, diff --git a/paddle/fluid/operators/collective/c_wait_comm_op.cc b/paddle/fluid/operators/collective/c_wait_comm_op.cc index d0dfc3bb1c2e5..dfa4dcd0fac59 100644 --- a/paddle/fluid/operators/collective/c_wait_comm_op.cc +++ b/paddle/fluid/operators/collective/c_wait_comm_op.cc @@ -54,11 +54,11 @@ class CWaitCommOp : public framework::OperatorBase { // comm_stream-->event-->compute_stream #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, comm_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, comm_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, comm_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, comm_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); #endif #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/c_wait_compute_op.cc b/paddle/fluid/operators/collective/c_wait_compute_op.cc index 12a28040ef1c5..e038617bf3d6a 100644 --- a/paddle/fluid/operators/collective/c_wait_compute_op.cc +++ b/paddle/fluid/operators/collective/c_wait_compute_op.cc @@ -57,11 +57,11 @@ class CWaitComputeOp : public framework::OperatorBase { // compute_stream-->event-->comm_stream #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); #endif #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/gen_nccl_id_op.cc b/paddle/fluid/operators/collective/gen_nccl_id_op.cc index 99a92469e8502..7a5b6b5f429b2 100644 --- a/paddle/fluid/operators/collective/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/gen_nccl_id_op.cc @@ -20,9 +20,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -37,7 +37,7 @@ namespace operators { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void GenNCCLID(std::vector* nccl_ids) { for (size_t i = 0; i < nccl_ids->size(); ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclGetUniqueId(&(*nccl_ids)[i])); } } diff --git a/paddle/fluid/operators/collective/global_gather_op.cu.cc b/paddle/fluid/operators/collective/global_gather_op.cu.cc index 70b5d0244d385..e2ff823420aef 100644 --- a/paddle/fluid/operators/collective/global_gather_op.cu.cc +++ b/paddle/fluid/operators/collective/global_gather_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -103,24 +103,24 @@ class GlobalGatherOpCUDAKernel : public framework::OpKernel { auto recv_buf = out->mutable_data(out_dims, place); for (auto i = 0; i < n_expert; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (auto j = 0; j < nranks; ++j) { int idx = i + j * n_expert; if (cpu_global_count_data[idx]) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclSend(send_buf + send_ptr * in_feat, cpu_global_count_data[idx] * in_feat, dtype, j, comm->comm(), stream)); send_ptr += cpu_global_count_data[idx]; } if (cpu_local_count_data[idx]) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclRecv(recv_buf + expert_ptr[idx] * in_feat, cpu_local_count_data[idx] * in_feat, dtype, j, comm->comm(), stream)); } } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); } #else PADDLE_THROW( diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc index bec984c6b57e1..c47d27366c5f2 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -102,24 +102,24 @@ class GlobalScatterOpCUDAKernel : public framework::OpKernel { auto recv_buf = out->mutable_data(out_dims, place); for (auto i = 0; i < n_expert; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (auto j = 0; j < nranks; ++j) { int idx = i + j * n_expert; if (cpu_local_count_data[idx]) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclSend(send_buf + expert_ptr[idx] * in_feat, cpu_local_count_data[idx] * in_feat, dtype, j, comm->comm(), stream)); } if (cpu_global_count_data[idx]) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclRecv(recv_buf + recv_ptr * in_feat, cpu_global_count_data[idx] * in_feat, dtype, j, comm->comm(), stream)); recv_ptr += cpu_global_count_data[idx]; } } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); } #else diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc index 8c32f8c41bbf2..094847beca214 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -67,7 +67,7 @@ class PartialAllGatherOpCUDAKernel : public framework::OpKernel { stream = comm->stream(); } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( send_buff, recv_buff, send_numel, static_cast(dtype), comm->comm(), stream)); #else diff --git a/paddle/fluid/operators/collective/partial_recv_op.cu.cc b/paddle/fluid/operators/collective/partial_recv_op.cu.cc index 49eafa5c7c4f5..d59c062a31b8c 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_recv_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -80,7 +80,7 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel { int recv_numel = numel / num; int offset = recv_numel * id; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclRecv(out->data() + offset, recv_numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " recv " << recv_numel diff --git a/paddle/fluid/operators/collective/partial_send_op.cu.cc b/paddle/fluid/operators/collective/partial_send_op.cu.cc index 2463f208746ed..8a4f7f750a15b 100644 --- a/paddle/fluid/operators/collective/partial_send_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_send_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -74,7 +74,7 @@ class PartialSendCUDAKernel : public framework::OpKernel { int send_numel = numel / num; int offset = send_numel * id; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( x->data() + offset, send_numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " send " << send_numel << " from offset[" << offset << "] to " << peer; diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index df94fee5223c6..18d6af4c2aaa1 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -69,7 +69,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { auto out_dims = out->dims(); out->mutable_data(out_dims, place, 0); auto numel = out->numel(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( out->data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " recv " << framework::product(out_dims) << " from " << peer; @@ -83,7 +83,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { auto numel = out->numel(); out->mutable_data(out_dims, place); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( out->data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " recv " << framework::product(out->dims()) << " from " << peer; diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index dc28910e9ec9c..952fcf2065d59 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -62,7 +62,7 @@ class SendOpV2CUDAKernel : public framework::OpKernel { auto& x = x_array.at(idx); int numel = x.numel(); ncclDataType_t dtype = platform::ToNCCLDataType(x.type()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( x.data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " send " << framework::product(x.dims()) << " to " << peer; @@ -73,7 +73,7 @@ class SendOpV2CUDAKernel : public framework::OpKernel { int numel = x->numel(); ncclDataType_t dtype = platform::ToNCCLDataType(x->type()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( x->data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " send " << framework::product(x->dims()) << " to " << peer; diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc index dec0e789776a4..55bd4879ab794 100644 --- a/paddle/fluid/operators/controlflow/get_places_op.cc +++ b/paddle/fluid/operators/controlflow/get_places_op.cc @@ -27,7 +27,7 @@ class OpBase; } // namespace imperative } // namespace paddle #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif namespace paddle { @@ -35,7 +35,7 @@ namespace operators { static size_t CUDADevCount() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return platform::GetCUDADeviceCount(); + return platform::GetGPUDeviceCount(); #else return 0UL; #endif diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index f4183bf570926..a783a619473ef 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -25,7 +25,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" -#include "paddle/fluid/platform/cudnn_desc.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + namespace paddle { namespace operators { @@ -98,7 +99,7 @@ std::ostream& operator<<(std::ostream& out, const std::vector& v) { inline int MaxBwdFilterAlgos(cudnnHandle_t cudnn_handle) { int max_algos = 0; #if CUDNN_VERSION_MIN(7, 0, 1) - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( cudnn_handle, &max_algos)); #endif @@ -176,22 +177,22 @@ static void SetConvMathType(const framework::ExecutionContext& ctx, #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) auto& dev_ctx = ctx.template device_context(); if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_TENSOR_OP_MATH)); VLOG(5) << "use cudnn_tensor_op_math"; #if CUDA_VERSION >= 11000 #if CUDNN_VERSION_MIN(8, 1, 0) } else if (dev_ctx.GetComputeCapability() >= 80 && dtype == CUDNN_DATA_BFLOAT16) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_TENSOR_OP_MATH)); #endif // CUDNN_VERSION_MIN(8, 1, 0) } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_FMA_MATH)); #endif // CUDA_VERSION >= 11000 } else { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_DEFAULT_MATH)); VLOG(5) << "NOT use cudnn_tensor_op_math"; } @@ -245,7 +246,7 @@ struct SearchAlgorithm { int perf_count; int best_algo_idx = 0; std::unique_ptr perf_results(new perf_t[kNUM_CUDNN_FWD_ALGS]); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), kNUM_CUDNN_FWD_ALGS, @@ -264,7 +265,7 @@ struct SearchAlgorithm { "the workspace size request(" << workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm( args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), @@ -273,7 +274,7 @@ struct SearchAlgorithm { #endif } #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm( args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), @@ -306,7 +307,7 @@ struct SearchAlgorithm { std::array perf_stat; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( args.handle, args.idesc.desc(), args.x->data(), args.wdesc.desc(), args.w->data(), args.cdesc.desc(), @@ -332,7 +333,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { size_t 
workspace_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), algo, &workspace_size)); @@ -362,7 +363,7 @@ struct SearchAlgorithm { int best_algo_idx = 0; std::unique_ptr perf_results( new perf_t[kNUM_CUDNN_BWD_DATA_ALGS]); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7( args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), kNUM_CUDNN_BWD_DATA_ALGS, @@ -395,7 +396,7 @@ struct SearchAlgorithm { "the workspace size request(" << workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), @@ -404,7 +405,7 @@ struct SearchAlgorithm { #endif } #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), @@ -435,7 +436,7 @@ struct SearchAlgorithm { std::array perf_stat; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnFindConvolutionBackwardDataAlgorithmEx( args.handle, args.wdesc.desc(), args.w->data(), @@ -464,7 +465,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { size_t workspace_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), algo, &workspace_size)); @@ -496,7 +497,7 @@ struct SearchAlgorithm { int best_algo_idx = 0; std::unique_ptr perf_results( new perf_t[kNUM_CUDNN_BWD_FILTER_ALGS]); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7( args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), kNUM_CUDNN_BWD_FILTER_ALGS, @@ -515,7 +516,7 @@ struct SearchAlgorithm { "the workspace size request(" << workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), @@ -524,7 +525,7 @@ struct SearchAlgorithm { #endif } #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), @@ -553,7 +554,7 @@ struct SearchAlgorithm { int returned_algo_count; std::array perf_stat; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnFindConvolutionBackwardFilterAlgorithmEx( args.handle, args.idesc.desc(), args.x->data(), @@ -584,7 +585,7 @@ struct SearchAlgorithm { algo_t chosen_algo; std::vector perf_results(max_algos); int actual_algos = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnFindConvolutionBackwardFilterAlgorithm( args.handle, 
args.idesc.desc(), args.odesc.desc(), @@ -605,7 +606,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { platform::CUDAGraphCaptureModeGuard guard; size_t workspace_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), algo, &workspace_size)); diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index 275e81fc7f33a..566e99c357fbe 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -261,9 +261,8 @@ class CUDNNConvOpKernel : public framework::OpKernel { // cudnn 7 can support groups, no need to do it manually // FIXME(typhoonzero): find a better way to disable groups // rather than setting it to 1. - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionGroupCount(args.cdesc.desc(), - groups)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionGroupCount( + args.cdesc.desc(), groups)); groups = 1; #endif #ifdef PADDLE_WITH_HIP @@ -328,7 +327,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP workspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForward( handle, &alpha, args.idesc.desc(), input_data, args.wdesc.desc(), filter_data, args.cdesc.desc(), algo, @@ -340,7 +339,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionForward( handle, &alpha, args.idesc.desc(), input_data + i * group_offset_in, args.wdesc.desc(), @@ -718,7 +717,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { T* temp_tensor_data = temp_tensor.mutable_data(ctx.GetPlace()); workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardData( handle, &alpha, args1.odesc.desc(), output_grad_data, args1.wdesc.desc(), filter_data, args1.cdesc.desc(), @@ -726,7 +725,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { cudnn_workspace_ptr, workspace_size)); }, workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor( handle, miopenTensorOpAdd, &alpha, args1.idesc.desc(), transformed_input_grad_data, &alpha, args1.idesc.desc(), temp_tensor_data, &beta, args1.idesc.desc(), @@ -734,7 +733,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { } else { workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardData( handle, &alpha, args1.odesc.desc(), output_grad_data, args1.wdesc.desc(), filter_data, args1.cdesc.desc(), @@ -749,7 +748,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, args1.wdesc.desc(), filter_data + i * group_offset_filter, args1.odesc.desc(), @@ -796,7 +795,7 @@ class CUDNNConvGradOpKernel : public 
framework::OpKernel { #ifdef PADDLE_WITH_HIP workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardWeights( handle, &alpha, args2.odesc.desc(), output_grad_data, args2.idesc.desc(), input_data, args2.cdesc.desc(), @@ -808,7 +807,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, args2.idesc.desc(), input_data + i * group_offset_in, args2.odesc.desc(), @@ -1228,7 +1227,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForward( handle, &alpha, args1.idesc.desc(), ddx, args1.wdesc.desc(), w, args1.cdesc.desc(), fwd_algo1, @@ -1240,7 +1239,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionForward( handle, &alpha, args1.idesc.desc(), ddx + i * group_offset_in, args1.wdesc.desc(), @@ -1258,7 +1257,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { // MIOPEN ONLY support beta to be 0.0f wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForward( handle, &alpha, args2.idesc.desc(), x, args2.wdesc.desc(), ddw, args2.cdesc.desc(), fwd_algo2, &beta, @@ -1270,7 +1269,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionForward( handle, &alpha, args2.idesc.desc(), x + i * group_offset_in, args2.wdesc.desc(), @@ -1294,7 +1293,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardWeights( handle, &alpha, args3.odesc.desc(), transformed_dy_channel, args3.idesc.desc(), ddx, args3.cdesc.desc(), filter_algo, @@ -1306,7 +1305,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, args3.idesc.desc(), ddx + i * group_offset_in, args3.odesc.desc(), @@ -1325,7 +1324,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardData( handle, &alpha, args4.odesc.desc(), transformed_dy_channel, args4.wdesc.desc(), ddw, args4.cdesc.desc(), data_algo, @@ -1337,7 +1336,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( 
platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, args4.wdesc.desc(), ddw + i * group_offset_filter, args4.odesc.desc(), diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h index 23a471cfa0067..291e5f92f322c 100644 --- a/paddle/fluid/operators/conv_cudnn_op_cache.h +++ b/paddle/fluid/operators/conv_cudnn_op_cache.h @@ -18,11 +18,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/operator.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" DECLARE_uint64(conv_workspace_size_limit); DECLARE_bool(cudnn_exhaustive_search); diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h index befe09c8e6beb..9c9795143eb78 100644 --- a/paddle/fluid/operators/conv_miopen_helper.h +++ b/paddle/fluid/operators/conv_miopen_helper.h @@ -23,7 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/conv_search_cache.h" #include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" -#include "paddle/fluid/platform/miopen_desc.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -137,7 +137,7 @@ struct SearchAlgorithm { int find_count; miopenConvAlgoPerf_t find_result; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenFindConvolutionForwardAlgorithm( args.handle, args.idesc.desc(), args.x->data(), args.wdesc.desc(), args.w->data(), args.cdesc.desc(), @@ -154,7 +154,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForwardGetWorkSpaceSize( args.handle, args.wdesc.desc(), args.idesc.desc(), args.cdesc.desc(), args.odesc.desc(), &workspace_size)); @@ -179,7 +179,7 @@ struct SearchAlgorithm { int find_count; miopenConvAlgoPerf_t find_result; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenFindConvolutionBackwardDataAlgorithm( args.handle, args.odesc.desc(), args.o->data(), args.wdesc.desc(), args.w->data(), args.cdesc.desc(), @@ -196,7 +196,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardDataGetWorkSpaceSize( args.handle, args.odesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.idesc.desc(), &workspace_size)); @@ -221,7 +221,7 @@ struct SearchAlgorithm { int find_count; miopenConvAlgoPerf_t find_result; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenFindConvolutionBackwardWeightsAlgorithm( args.handle, args.odesc.desc(), args.o->data(), args.idesc.desc(), args.x->data(), args.cdesc.desc(), @@ -238,7 +238,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardWeightsGetWorkSpaceSize( args.handle, args.odesc.desc(), args.idesc.desc(), 
args.cdesc.desc(), args.wdesc.desc(), &workspace_size)); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 1610705c4694c..41f6f75200697 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -20,13 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_version_registry.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif - -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -222,7 +216,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( if (input_data_type == framework::proto::VarType::BF16 && library == framework::LibraryType::kCUDNN) { PADDLE_ENFORCE_GE( - platform::CudnnVersion(), 8100, + platform::DnnVersion(), 8100, platform::errors::InvalidArgument( "bfloat16 can only be used when CUDNN_VERSION >= 8100")); } diff --git a/paddle/fluid/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu index 314d33310588e..2289104d2dbfb 100644 --- a/paddle/fluid/operators/conv_shift_op.cu +++ b/paddle/fluid/operators/conv_shift_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_shift_op.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index c4cd5854c0f78..19c0be44a1d0b 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -265,7 +265,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { for (int g = 0; g < groups; g++) { #ifdef PADDLE_WITH_HIP auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardData( handle, &alpha, args.odesc.desc(), input_data + input_offset * g, args.wdesc.desc(), @@ -275,7 +275,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { }; #else // PADDLE_WITH_HIP auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, args.wdesc.desc(), filter_data + filter_offset * g, args.odesc.desc(), @@ -549,7 +549,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { for (int g = 0; g < groups; g++) { #ifdef PADDLE_WITH_HIP auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForward( handle, &alpha, args1.idesc.desc(), output_grad_data + output_grad_offset * g, args1.wdesc.desc(), @@ -560,13 +560,12 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { }; #else // PADDLE_WITH_HIP auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args1.idesc.desc(), - output_grad_data + output_grad_offset * g, args1.wdesc.desc(), - filter_data + filter_offset * g, args1.cdesc.desc(), - data_algo, cudnn_workspace, workspace_size, &beta, - args1.odesc.desc(), input_grad_data + input_offset * g)); + 
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnConvolutionForward( + handle, &alpha, args1.idesc.desc(), + output_grad_data + output_grad_offset * g, args1.wdesc.desc(), + filter_data + filter_offset * g, args1.cdesc.desc(), data_algo, + cudnn_workspace, workspace_size, &beta, args1.odesc.desc(), + input_grad_data + input_offset * g)); }; #endif // PADDLE_WITH_HIP workspace_handle.RunFunc(cudnn_func, workspace_size); @@ -598,7 +597,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { for (int g = 0; g < groups; g++) { #ifdef PADDLE_WITH_HIP auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardWeights( handle, &alpha, args2.odesc.desc(), input_data + input_offset * g, args2.idesc.desc(), @@ -609,7 +608,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { }; #else // PADDLE_WITH_HIP auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, args2.idesc.desc(), output_grad_data + output_grad_offset * g, args2.odesc.desc(), @@ -1054,7 +1053,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardData( handle, &alpha, args1.odesc.desc(), ddx + i * group_offset_in, args1.wdesc.desc(), @@ -1067,7 +1066,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #else // PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, args1.wdesc.desc(), w + i * group_offset_filter, args1.odesc.desc(), @@ -1089,7 +1088,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { T* conv_x_ddw_data = conv_x_ddw.mutable_data(ctx.GetPlace()); wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardData( handle, &alpha, args2.odesc.desc(), x + i * group_offset_in, args2.wdesc.desc(), @@ -1099,7 +1098,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { workspace_size)); }, workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor( handle, miopenTensorOpAdd, &alpha, args2.idesc.desc(), transformed_ddy_channel + i * group_offset_out, &alpha, args2.idesc.desc(), conv_x_ddw_data + i * group_offset_out, &beta, @@ -1108,7 +1107,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #else // PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, args2.wdesc.desc(), ddw + i * group_offset_filter, args2.odesc.desc(), @@ -1152,7 +1151,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardWeights( handle, &alpha, args3.odesc.desc(), ddx + i * group_offset_in, args3.idesc.desc(), @@ -1165,7 +1164,7 @@ class 
CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #else // PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, args3.idesc.desc(), transformed_dy_channel + i * group_offset_out, @@ -1185,7 +1184,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForward( handle, &alpha, args4.idesc.desc(), transformed_dy_channel + i * group_offset_out, @@ -1198,7 +1197,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #else // PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionForward( handle, &alpha, args4.idesc.desc(), transformed_dy_channel + i * group_offset_out, diff --git a/paddle/fluid/operators/cudnn_lstm_cache.h b/paddle/fluid/operators/cudnn_lstm_cache.h index b7859237e737a..5451cf815cae3 100644 --- a/paddle/fluid/operators/cudnn_lstm_cache.h +++ b/paddle/fluid/operators/cudnn_lstm_cache.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/dynload/cudnn.h" namespace paddle { @@ -77,7 +77,7 @@ class ScopedRNNBase { // ------------------- cudnn dropout descriptors --------------------- size_t state_size; if (!initialized_) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size)); dropout_state->mutable_data({static_cast(state_size)}, place); @@ -86,7 +86,7 @@ class ScopedRNNBase { dropout_state, seed_, state_size); // ------------------- cudnn rnn descriptors --------------------- - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), CUDNN_LINEAR_INPUT, is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, @@ -94,14 +94,14 @@ class ScopedRNNBase { #if CUDNN_VERSION >= 7201 if (!sequence_length.empty()) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode( rnn_desc_.desc(), CUDNN_RNN_PADDED_IO_ENABLED)); } #endif // ------------------- cudnn weights_size --------------------- size_t weights_size_; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnGetRNNParamsSize( handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); PADDLE_ENFORCE_EQ( weights_size_, sizeof(T) * weight_numel_, @@ -113,10 +113,10 @@ class ScopedRNNBase { std::vector dim_w = {dim_tmp, 1, 1}; weight_desc_.descriptor(layout, dim_w); // ------------------- cudnn workspace, reserve size --------------------- - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), workspace_size)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetRNNTrainingReserveSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size)); diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index 27f64b41948be..6f696afa23886 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -111,14 +111,14 @@ void LSTMInferece(const bool &has_seq_length, const cudnnHandle_t &handle, // for inference // This interface is used when the input/output is unpadded. #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNForwardInference( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNForwardInference( handle, rnn->rnn_desc(), seq_length, rnn->x_descs(), x_data, rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data, rnn->weight_desc(), w_data, rnn->y_descs(), out_data, rnn->last_h_desc(), last_h_data, rnn->last_c_desc(), last_c_data, workspace_data->data(), workspace_size)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardInference( handle, rnn->rnn_desc(), seq_length, rnn->x_descs(), x_data, rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data, rnn->weight_desc(), w_data, rnn->y_descs(), out_data, @@ -129,7 +129,7 @@ void LSTMInferece(const bool &has_seq_length, const cudnnHandle_t &handle, #if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201 // for inference // This interface is used when the input/output is padded. - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInferenceEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardInferenceEx( handle, rnn->rnn_desc(), rnn->x_seq_desc(), x_data, rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data, rnn->weight_desc(), w_data, rnn->y_seq_desc(), out_data, rnn->last_h_desc(), last_h_data, @@ -277,7 +277,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { // for train // This interface is used when the input/output is unpadded. 
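Editor's note: the hunks above (conv_cudnn_op_cache.h, conv_miopen_helper.h, cudnn_lstm_cache.h) replace the per-backend includes `cudnn_helper.h` / `miopen_helper.h` with a single `paddle/fluid/platform/device/gpu/gpu_dnn.h`. The contents of that umbrella header are not part of this diff; the sketch below is only a plausible minimal shape for such a header, reusing the backend header names that do appear in the patch.

```cpp
// Hypothetical sketch of an umbrella header in the spirit of gpu_dnn.h:
// operator code includes one header, and the backend split is resolved here.
#pragma once

#ifdef PADDLE_WITH_HIP
// ROCm build: MIOpen descriptor and handle helpers.
#include "paddle/fluid/platform/miopen_desc.h"
#include "paddle/fluid/platform/miopen_helper.h"
#else
// CUDA build: cuDNN descriptor and handle helpers.
#include "paddle/fluid/platform/cudnn_desc.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
```

The payoff visible throughout this patch is that call sites no longer carry `#ifdef PADDLE_WITH_HIP` / `#else` pairs just to pick an include.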
#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNForwardTraining( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNForwardTraining( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), x_data, rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, rnn.weight_desc(), w_data, rnn.y_descs(), out_data, @@ -285,7 +285,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { workspace_data_.data(), workspace_size, reserve_data, reserve_size)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardTraining( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), x_data, rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, rnn.weight_desc(), w_data, rnn.y_descs(), out_data, @@ -297,15 +297,13 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { #if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201 // for train // This interface is used when the input/output is padded. - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnRNNForwardTrainingEx( - handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, - rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, - rnn.weight_desc(), w_data, rnn.y_seq_desc(), out_data, - rnn.last_h_desc(), last_h_data, rnn.last_c_desc(), last_c_data, - nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, - nullptr, workspace_data_.data(), workspace_size, - reserve_data, reserve_size)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardTrainingEx( + handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, rnn.init_h_desc(), + init_h_data, rnn.init_c_desc(), init_c_data, rnn.weight_desc(), + w_data, rnn.y_seq_desc(), out_data, rnn.last_h_desc(), last_h_data, + rnn.last_c_desc(), last_c_data, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, workspace_data_.data(), + workspace_size, reserve_data, reserve_size)); #else PADDLE_THROW(platform::errors::Unavailable( "The padded input is supported by " @@ -433,7 +431,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { if (!has_seq_length) { // This interface is used when the input/output is unpadded. 
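Editor's note: the dominant change in these files is the mechanical rename of `PADDLE_ENFORCE_CUDA_SUCCESS` to `PADDLE_ENFORCE_GPU_SUCCESS`, reflecting that one macro now checks status codes on both CUDA and ROCm builds. Paddle's real macro also accepts cuDNN/MIOpen/NCCL status types and raises framework exceptions; the self-contained sketch below covers only the runtime error type and aborts instead, purely to illustrate the pattern. `GPU_CHECK`, `kGpuSuccess`, and `GpuErrorString` are names invented for this sketch.

```cpp
#include <cstdio>
#include <cstdlib>

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuError_t = hipError_t;
constexpr gpuError_t kGpuSuccess = hipSuccess;
inline const char* GpuErrorString(gpuError_t e) { return hipGetErrorString(e); }
#else
#include <cuda_runtime.h>
using gpuError_t = cudaError_t;
constexpr gpuError_t kGpuSuccess = cudaSuccess;
inline const char* GpuErrorString(gpuError_t e) { return cudaGetErrorString(e); }
#endif

// Evaluate the runtime call once; if it did not succeed, report the decoded
// error string with source location and stop. A framework macro would throw
// a rich exception instead of aborting.
#define GPU_CHECK(call)                                                      \
  do {                                                                       \
    gpuError_t err__ = (call);                                               \
    if (err__ != kGpuSuccess) {                                              \
      std::fprintf(stderr, "GPU call failed at %s:%d: %s\n", __FILE__,       \
                   __LINE__, GpuErrorString(err__));                         \
      std::abort();                                                          \
    }                                                                        \
  } while (0)
```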
#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNBackwardData( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNBackwardData( handle, rnn.rnn_desc(), seq_length, rnn.y_descs(), out_data, rnn.y_descs(), out_grad_data, rnn.last_h_desc(), last_h_grad_data, rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data, @@ -442,13 +440,13 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { rnn.init_c_desc(), init_c_grad_data, workspace_data_.data(), workspace_size, const_cast(reserve_data), reserve_size)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNBackwardWeights( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNBackwardWeights( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data(), rnn.init_h_desc(), init_h->data(), rnn.y_descs(), out->data(), rnn.weight_desc(), weight_grad_data, workspace_data_.data(), workspace_size, const_cast(reserve_data), reserve_size)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardData( handle, rnn.rnn_desc(), seq_length, rnn.y_descs(), out_data, rnn.y_descs(), out_grad_data, rnn.last_h_desc(), last_h_grad_data, rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data, @@ -457,7 +455,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { rnn.init_c_desc(), init_c_grad_data, workspace_data_.data(), workspace_size, const_cast(reserve_data), reserve_size)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardWeights( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data(), rnn.init_h_desc(), init_h->data(), rnn.y_descs(), out->data(), workspace_data_.data(), workspace_size, rnn.weight_desc(), @@ -467,7 +465,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { #if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201 // for train // This interface is used when the input/output is padded. - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardDataEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardDataEx( handle, rnn.rnn_desc(), rnn.y_seq_desc(), out_data, rnn.y_seq_desc(), out_grad_data, nullptr, nullptr, rnn.last_h_desc(), last_h_grad_data, rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data, @@ -477,7 +475,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { workspace_data_.data(), workspace_size, const_cast(reserve_data), reserve_size)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeightsEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardWeightsEx( handle, rnn.rnn_desc(), rnn.x_seq_desc(), input->data(), rnn.init_h_desc(), init_h->data(), rnn.y_seq_desc(), out->data(), workspace_data_.data(), workspace_size, diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h index a6a23a91c76c0..6c059257b94e8 100644 --- a/paddle/fluid/operators/cudnn_rnn_cache.h +++ b/paddle/fluid/operators/cudnn_rnn_cache.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -92,15 +92,15 @@ struct CudnnRNNCache { std::vector strides_y = {hidden_size_ * numDirections, 1, 1}; for (size_t i = 0; i < seq_length_; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i])); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( x_desc_[i], cudnn_type, 3, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( y_desc_[i], cudnn_type, 3, dims_y.data(), strides_y.data())); } @@ -108,78 +108,78 @@ struct CudnnRNNCache { hidden_size_}; std::vector strides_hx = {hidden_size_ * batch_size_, hidden_size_, 1}; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( hx_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( cx_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( hy_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( cy_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( dhx_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( dcx_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + 
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( dhy_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( dcy_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_)); size_t state_size; if (!initialized) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size)); dropout_state_->Resize({static_cast(state_size)}); uint8_t *dropout_state_data = dropout_state_->mutable_data(place); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetDropoutDescriptor( dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, seed_)); } else { uint8_t *dropout_state_data = dropout_state_->data(); auto dropout_state_dims = dropout_state_->dims(); state_size = dropout_state_dims[0]; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnRestoreDropoutDescriptor( dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, 0)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, CUDNN_RNN_ALGO_STANDARD, cudnn_type)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnGetRNNParamsSize( handle, rnn_desc_, x_desc_[0], &weights_size_, cudnn_type)); PADDLE_ENFORCE_EQ( @@ -191,14 +191,14 @@ struct CudnnRNNCache { dim_w[0] = weights_size_ / cudnn_size; dim_w[1] = 1; dim_w[2] = 1; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor( w_desc_, cudnn_type, CUDNN_TENSOR_NCHW, 3, dim_w)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor( dw_desc_, cudnn_type, CUDNN_TENSOR_NCHW, 3, dim_w)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize( handle, rnn_desc_, seq_length_, x_desc_, &workspace_size_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetRNNTrainingReserveSize( handle, rnn_desc_, seq_length_, x_desc_, reserve_size_)); @@ -208,40 +208,40 @@ struct CudnnRNNCache { void release() { for (size_t i = 0; i < seq_length_; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i])); } delete[] x_desc_; delete[] y_desc_; - 
PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); } }; diff --git a/paddle/fluid/operators/cumsum_op.cu b/paddle/fluid/operators/cumsum_op.cu index d9e19eb7f61a6..977e301f13663 100644 --- a/paddle/fluid/operators/cumsum_op.cu +++ b/paddle/fluid/operators/cumsum_op.cu @@ -24,7 +24,7 @@ limitations under the License. */ namespace cub = hipcub; #endif #include "paddle/fluid/operators/cum_op.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" using Tensor = paddle::framework::Tensor; using LoDTensor = paddle::framework::LoDTensor; diff --git a/paddle/fluid/operators/cvm_op.cu b/paddle/fluid/operators/cvm_op.cu index 75976c968c9e8..ad96dc24b9206 100644 --- a/paddle/fluid/operators/cvm_op.cu +++ b/paddle/fluid/operators/cvm_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/cvm_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index 1043faa56f01b..5d157a77b3dd1 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -16,10 +16,10 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/data_norm_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -176,23 +176,19 @@ class DataNormGradKernel if (need_sync_stats) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto comm = platform::NCCLCommContext::Instance().Get(0, ctx.GetPlace()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_size), reinterpret_cast(d_batch_size), C, platform::ToNCCLDataType(x->type()), ncclSum, comm->comm(), stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_sum), reinterpret_cast(d_batch_sum), C, platform::ToNCCLDataType(x->type()), ncclSum, comm->comm(), stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_square_sum), reinterpret_cast(d_batch_square_sum), C, platform::ToNCCLDataType(x->type()), ncclSum, comm->comm(), stream)); -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU, and need_sync_stats connot be " diff --git a/paddle/fluid/operators/deformable_conv_op.cu b/paddle/fluid/operators/deformable_conv_op.cu index 67f5ee332eeb2..924adafa4b8d8 100644 --- a/paddle/fluid/operators/deformable_conv_op.cu +++ b/paddle/fluid/operators/deformable_conv_op.cu @@ -27,7 +27,7 @@ #include "paddle/fluid/operators/deformable_conv_op.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/deformable_conv_v1_op.cu b/paddle/fluid/operators/deformable_conv_v1_op.cu index e399a1fafdb71..c252700528c49 100644 --- a/paddle/fluid/operators/deformable_conv_v1_op.cu +++ b/paddle/fluid/operators/deformable_conv_v1_op.cu @@ -30,7 +30,7 @@ #include "paddle/fluid/operators/deformable_conv_v1_op.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index c1d4cc9d17ab4..6489c1f9784cf 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -32,7 +32,7 @@ #include "paddle/fluid/operators/deformable_psroi_pooling_op.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" 
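Editor's note: in the data_norm_op.cu hunk above, the paired `#ifdef PADDLE_WITH_RCCL` / `#else` stream-synchronize calls are collapsed into one `platform::GpuStreamSync(stream)` call. A minimal, self-contained sketch of such a wrapper is shown below, assuming only the standard CUDA/HIP runtime APIs; the error handling here is a stand-in, not Paddle's.

```cpp
#include <cassert>

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
#endif

// Illustrative wrapper in the spirit of platform::GpuStreamSync: block until
// all work queued on `stream` has finished, with no per-backend #ifdef at
// the call site.
inline void GpuStreamSync(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
  hipError_t err = hipStreamSynchronize(stream);
  assert(err == hipSuccess);
#else
  cudaError_t err = cudaStreamSynchronize(stream);
  assert(err == cudaSuccess);
#endif
  (void)err;  // keep release builds (NDEBUG) warning-free
}
```

Moving the backend split into the wrapper is the same design choice as the include consolidation: the operator files stay backend-neutral.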
namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/dequantize_log_op.cu b/paddle/fluid/operators/dequantize_log_op.cu index 9f63f8ed6f520..39f4fdb71b69d 100644 --- a/paddle/fluid/operators/dequantize_log_op.cu +++ b/paddle/fluid/operators/dequantize_log_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/dequantize_log_op.h" #include "paddle/fluid/operators/math.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 08c44a2d39ecf..a85bca3646499 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -18,11 +18,20 @@ endfunction() if (WITH_ASCEND_CL) detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu box_coder_op_npu.cc) detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu density_prior_box_op_npu.cc) - detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu prior_box_op_npu.cc) else() detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu) detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu) - detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) +endif() + +if(WITH_XPU) + detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_xpu.cc) + detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_xpu.cc) +elseif(WITH_ASCEND_CL) + detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_npu.cc) + detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu prior_box_op_npu.cc) +else() + detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op.cu) + detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) endif() detection_library(bipartite_match_op SRCS bipartite_match_op.cc) @@ -63,14 +72,6 @@ else() detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc) endif() -if(WITH_XPU) - detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_xpu.cc) -elseif(WITH_ASCEND_CL) - detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_npu.cc) -else() - detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op.cu) -endif() - detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu) #Export local libraries to parent # set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index 725983f8153e4..6f5137be62011 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -18,15 +18,14 @@ limitations under the License. 
*/ #include #ifdef __NVCC__ #include "cub/cub.cuh" -#include "paddle/fluid/platform/cudnn_helper.h" #endif #ifdef __HIPCC__ #include -#include "paddle/fluid/platform/miopen_helper.h" namespace cub = hipcub; #endif #include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/for_range.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu index e02f99a613c01..17013efcc98b7 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cu +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/box_clip_op.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu index 0693029eaea9c..6e5fa1e293353 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cu +++ b/paddle/fluid/operators/detection/box_coder_op.cu @@ -13,7 +13,7 @@ limitations under the License. */ #include #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/detection/box_coder_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu index 70767f1d7b115..ed97559aa8bb5 100644 --- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu @@ -11,7 +11,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index ffd9ac6b2af80..bd5703022db90 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -26,7 +26,7 @@ namespace cub = hipcub; #include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index 7ccb354e1773a..1df7dcbe670c0 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -26,7 +26,7 @@ namespace cub = hipcub; #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" #include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu index 5977a434a6023..5ff479eac8df0 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/prior_box_op_xpu.cc b/paddle/fluid/operators/detection/prior_box_op_xpu.cc new file mode 100644 index 0000000000000..bab394689546e --- /dev/null +++ b/paddle/fluid/operators/detection/prior_box_op_xpu.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/detection/prior_box_op.h" + +namespace paddle { +namespace operators { + +template +class PriorBoxOpXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); + + auto min_sizes = ctx.Attr>("min_sizes"); + auto max_sizes = ctx.Attr>("max_sizes"); + auto input_aspect_ratio = ctx.Attr>("aspect_ratios"); + auto variances = ctx.Attr>("variances"); + auto flip = ctx.Attr("flip"); + auto clip = ctx.Attr("clip"); + auto min_max_aspect_ratios_order = + ctx.Attr("min_max_aspect_ratios_order"); + + std::vector aspect_ratios; + ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); + + K step_w = static_cast(ctx.Attr("step_w")); + K step_h = static_cast(ctx.Attr("step_h")); + K offset = static_cast(ctx.Attr("offset")); + + auto img_width = image->dims()[3]; + auto img_height = image->dims()[2]; + + auto feature_width = input->dims()[3]; + auto feature_height = input->dims()[2]; + + K step_width, step_height; + if (step_w == 0 || step_h == 0) { + step_width = static_cast(img_width) / feature_width; + step_height = static_cast(img_height) / feature_height; + } else { + step_width = step_w; + step_height = step_h; + } + + int num_priors = aspect_ratios.size() * min_sizes.size(); + if (max_sizes.size() > 0) { + num_priors += max_sizes.size(); + } + + boxes->mutable_data(ctx.GetPlace()); + vars->mutable_data(ctx.GetPlace()); + + const auto& dev_ctx = + ctx.template device_context(); + auto boxes_data = boxes->data(); + auto vars_data = vars->data(); + xpu::VectorParam aspect_ratios_param{ + aspect_ratios.data(), static_cast(aspect_ratios.size()), nullptr}; + xpu::VectorParam min_sizes_param{ + min_sizes.data(), static_cast(min_sizes.size()), nullptr}; + xpu::VectorParam max_sizes_param{ + max_sizes.data(), static_cast(max_sizes.size()), nullptr}; + + int ret = xpu::gen_prior_box( + dev_ctx.x_context(), boxes_data, aspect_ratios_param, min_sizes_param, + max_sizes_param, feature_height, feature_width, img_height, img_width, + offset, step_height, step_width, clip, min_max_aspect_ratios_order); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU gen_prior_box kernel return wrong value[%d %s]", + ret, XPUAPIErrorMsg[ret])); + + int box_num = feature_height * feature_width * num_priors; + int vlen = variances.size(); + for (int i = 0; i < box_num; ++i) { + ret = xpu_memcpy(vars_data + i * vlen, variances.data(), vlen * sizeof(K), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, platform::errors::External( + "XPU xpu_memcpy return wrong " + "value[%d %s] in prior_box.", + ret, XPUAPIErrorMsg[ret])); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL(prior_box, ops::PriorBoxOpXPUKernel); + +#endif diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index 7b34e197ffe21..2ddcc7a06f679 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" using paddle::platform::PADDLE_CUDA_NUM_THREADS; diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu index ed1676200dc47..10c402e5a4078 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h" #include "paddle/fluid/operators/math.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu index 83a0eb87d02dd..23bd6af6bd2e8 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/detection/yolo_box_op.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/diagonal_op.cu b/paddle/fluid/operators/diagonal_op.cu index e2b5f24d6619e..b1268e903df19 100644 --- a/paddle/fluid/operators/diagonal_op.cu +++ b/paddle/fluid/operators/diagonal_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/diagonal_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index bd4d690577a6f..c97a523caa767 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -33,7 +33,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/dropout_impl_util.h" #include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/platform/aligned_vector.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" namespace paddle { namespace operators { @@ -167,14 +167,14 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, auto* y_data = y->data(); if (dropout_prob == 1.0f) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream)); #endif return; diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu index 80490af33a1f9..f28fa4d6338d7 100644 --- a/paddle/fluid/operators/edit_distance_op.cu +++ b/paddle/fluid/operators/edit_distance_op.cu @@ -17,8 +17,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/edit_distance_op.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index d3ab8ad9d6985..ad5a55aede751 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -26,7 +26,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/elementwise/elementwise_functor.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/transform.h" // only can include the headers in paddle/pten/include dirs @@ -43,8 +43,8 @@ limitations under the License. */ #include #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #ifdef __HIPCC__ constexpr int ELEMWISE_MAX_BLOCK_DIM = 256; diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu index 00562767c97a5..2b44c81a4550d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" @@ -30,12 +32,69 @@ static __global__ void SimpleElemwiseSubGradCUDAKernel(const T* dout, int col = blockIdx.x * blockDim.x + threadIdx.x; while (col < size) { - dx[col] = dout[col]; + if (dx != nullptr) { + dx[col] = dout[col]; + } dy[col] = -dout[col]; col += blockDim.x * gridDim.x; } } +template +typename std::enable_if< + std::is_same::value>::type +default_elementwise_sub_grad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, + const framework::Tensor* y, + const framework::Tensor* out, + const framework::Tensor* dout, + framework::Tensor* dx, framework::Tensor* dy) { + int axis = ctx.Attr("axis"); + auto* dout_data = dout->data(); + // dx + if (dx != nullptr) { + auto* dx_data = dx->mutable_data(ctx.GetPlace()); + if (dx->dims() == dout->dims()) { + if (dx_data != dout_data) { + framework::TensorCopy( + *dout, ctx.GetPlace(), + ctx.template device_context(), dx); + } + } else { + // For inplace strategy, dx will be stored in addr of dout, which makes + // the result of dy wrong. + if (dx->IsSharedBufferWith(*dout)) { + dx->clear(); + dx->mutable_data(x->dims(), ctx.GetPlace()); + } + std::vector reduce_dims = GetReduceDim(x->dims(), out->dims(), axis); + gpuStream_t stream = ctx.cuda_device_context().stream(); + TensorReduceFunctorImpl(*dout, dx, reduce_dims, stream); + } + } + // dy + if (dy != nullptr) { + auto* dy_data = dy->mutable_data(ctx.GetPlace()); + if (dy->dims() == dout->dims()) { + if (dy_data != dout_data) { + dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1); + auto size = dy->numel(); + dim3 grid_size = dim3( + (size + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE, 1); + SimpleElemwiseSubGradCUDAKernel<<< + grid_size, block_size, 0, + ctx.template device_context().stream()>>>( + dout->data(), size, nullptr, + dy->mutable_data(ctx.GetPlace())); + } + } else { + std::vector reduce_dims = GetReduceDim(y->dims(), out->dims(), axis); + gpuStream_t stream = ctx.cuda_device_context().stream(); + TensorReduceFunctorImpl(*dout, dy, reduce_dims, stream); + } + } +} + template typename std::enable_if< std::is_same::value>::type diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index 94c8edf24a127..08a4e709a37ad 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -71,6 +71,21 @@ struct SubGradDY { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; } }; +template +typename std::enable_if< + std::is_same::value>::type +default_elementwise_sub_grad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, + const framework::Tensor* y, + const framework::Tensor* out, + const framework::Tensor* dout, + framework::Tensor* dx, framework::Tensor* dy) { + int axis = ctx.Attr("axis"); + + ElemwiseExplicitGradCompute, SubGradDY>( + ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX(), SubGradDY()); +} + template typename std::enable_if< std::is_same::value>::type @@ -79,13 +94,21 @@ elementwise_sub_grad(const framework::ExecutionContext& ctx, const framework::Tensor* out, const framework::Tensor* dout, framework::Tensor* dx, 
framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - ElemwiseExplicitGradCompute, SubGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX(), SubGradDY()); + default_elementwise_sub_grad(ctx, x, y, out, dout, dx, dy); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // cuda definition +template +typename std::enable_if< + std::is_same::value>::type +default_elementwise_sub_grad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, + const framework::Tensor* y, + const framework::Tensor* out, + const framework::Tensor* dout, + framework::Tensor* dx, framework::Tensor* dy); + template typename std::enable_if< std::is_same::value>::type @@ -108,15 +131,13 @@ class ElementwiseSubGradKernel : public ElemwiseGradKernel { auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); // skip out auto* out = dout; if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { elementwise_sub_grad(ctx, x, y, out, dout, dx, dy); } else { - ElemwiseExplicitGradCompute, SubGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX(), - SubGradDY()); + default_elementwise_sub_grad(ctx, x, y, out, dout, dx, + dy); } } }; diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc index ab45b6f4de276..706475bc82fad 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc @@ -30,10 +30,9 @@ namespace operators { static void Memcpy(void *dst, const void *src, size_t n, bool copy_to_gpu) { if (copy_to_gpu) { #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemcpy(dst, src, n, cudaMemcpyHostToDevice)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(dst, src, n, cudaMemcpyHostToDevice)); #elif defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_CUDA_SUCCESS(hipMemcpy(dst, src, n, hipMemcpyHostToDevice)); + PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(dst, src, n, hipMemcpyHostToDevice)); #else PADDLE_THROW( platform::errors::InvalidArgument("Check your paddle version, current " diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 8f2235c7e3d21..b95bbc775a0d7 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/fake_quantize_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h index 77bc9e466e808..fd0f42df11875 100644 --- a/paddle/fluid/operators/filter_by_instag_op.h +++ b/paddle/fluid/operators/filter_by_instag_op.h @@ -65,19 +65,26 @@ class FilterByInstagKernel : public framework::OpKernel { // expected auto = const int64_t auto* x2_data = x2->data(); // e.g get [0, 1, 2, 3, ...] 
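Editor's note: the elementwise_sub_op changes above split `default_elementwise_sub_grad` into a CPU overload and a CUDA overload, selected at compile time with `std::enable_if` on the `DeviceContext` template parameter. The library-free sketch below shows just that selection mechanism; the context types, function name, and the trivial CPU loop are illustrative, and the GPU branch elides the kernel launch / reduction that the real patch performs.

```cpp
#include <iostream>
#include <type_traits>

struct CPUDeviceContext {};
struct CUDADeviceContext {};

// Chosen only when DeviceContext is the CPU context: enable_if<...>::type is
// 'void' in that case and ill-formed otherwise, so the non-matching overload
// silently drops out of overload resolution (SFINAE).
template <typename DeviceContext, typename T>
typename std::enable_if<
    std::is_same<DeviceContext, CPUDeviceContext>::value>::type
default_sub_grad(const T* dout, T* dx, T* dy, int n) {
  for (int i = 0; i < n; ++i) {
    if (dx != nullptr) dx[i] = dout[i];  // d(x - y)/dx = 1
    dy[i] = -dout[i];                    // d(x - y)/dy = -1
  }
  std::cout << "CPU gradient path\n";
}

// Chosen only when DeviceContext is the CUDA context; a real implementation
// would launch a kernel or a broadcast-aware reduction here.
template <typename DeviceContext, typename T>
typename std::enable_if<
    std::is_same<DeviceContext, CUDADeviceContext>::value>::type
default_sub_grad(const T* dout, T* dx, T* dy, int n) {
  (void)dout; (void)dx; (void)dy; (void)n;
  std::cout << "GPU gradient path (kernel launch elided in this sketch)\n";
}

int main() {
  float dout[3] = {1.f, 2.f, 3.f}, dx[3], dy[3];
  default_sub_grad<CPUDeviceContext, float>(dout, dx, dy, 3);   // CPU overload
  default_sub_grad<CUDADeviceContext, float>(dout, dx, dy, 3);  // GPU overload
}
```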
- auto x2_lods = x2->lod()[0]; + size_t x2_lods_size = x2->dims()[0]; Vector x1_lods(1, 0); if (!is_x1_lod) { for (int i = 0; i < x1->dims()[0]; i++) { x1_lods.push_back(i + 1); } } else { - x1_lods = context.Input("Ins")->lod()[0]; + // new: lod_level=0 => lod() return {} + if (x1->lod().size() != 0) { + x1_lods = x1->lod()[0]; + } else { + for (int i = 0; i < x1->dims()[0]; i++) { + x1_lods.push_back(i + 1); + } + } } std::unordered_map mmap_aux; Vector out_lods(1, 0); - for (size_t i = 0; i < x2_lods.size() - 1; i++) { - for (size_t j = x2_lods[i]; j < x2_lods[i + 1]; j++) { + for (size_t i = 0; i < x2_lods_size; i++) { + for (size_t j = i; j < i + 1; j++) { if (filter_tag.find(x2_data[j]) != filter_tag.end()) { size_t batch_len = x1_lods[i + 1] - x1_lods[i]; mmap_aux[out_lods.back()] = x1_lods[i]; diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index f7478364cdfc5..990ac8dbc8121 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -22,11 +22,7 @@ limitations under the License. */ namespace cub = hipcub; #endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef __HIPCC__ #define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim) diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cc b/paddle/fluid/operators/fused/conv_fusion_op.cc index 6b94f4ea5bdd2..f2ce0bccd2fb5 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cc +++ b/paddle/fluid/operators/fused/conv_fusion_op.cc @@ -15,9 +15,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/conv_op.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index f5ee7f5599184..38326e7560c0d 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -18,11 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/math/padding.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" DECLARE_int64(cudnn_exhaustive_search_times); @@ -169,7 +165,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP miopenConvolutionDescriptor_t cudnn_conv_desc = conv_desc.descriptor(padding_common, strides, dilations); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenSetConvolutionGroupCount(cudnn_conv_desc, groups)); // Now only support NCHW @@ -194,14 +190,14 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { auto f_dims = framework::vectorize(filter->dims()); size_t workspace_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForwardGetWorkSpaceSize( handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, cudnn_output_desc, &workspace_size)); int find_count; miopenConvAlgoPerf_t find_result; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenFindConvolutionForwardAlgorithm( handle, cudnn_input_desc, input_data, cudnn_filter_desc, filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, @@ -215,23 +211,23 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { { ScalingParamType alpha = 1.0f, beta = 0.0f; auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenConvolutionForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenConvolutionForward( handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, filter_data, cudnn_conv_desc, algo, &beta, cudnn_output_desc, output_data, cudnn_workspace, workspace_size)); }; workspace_handle.RunFunc(cudnn_func, workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForwardBias( handle, &alpha, cudnn_bias_desc, bias_data, &beta, cudnn_output_desc, output_data)); if (activation != "identity") { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenActivationForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenActivationForward( handle, cudnn_act_desc, &alpha, cudnn_output_desc, output_data, &beta, cudnn_output_desc, output_data)); } if (residual) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor( handle, miopenTensorOpAdd, &alpha, cudnn_output_desc, output_data, &alpha, cudnn_output_desc, residual_data, &beta, cudnn_output_desc, output_data)); @@ -240,9 +236,8 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { #else // PADDLE_WITH_HIP cudnnConvolutionDescriptor_t cudnn_conv_desc = conv_desc.descriptor(padding_common, strides, dilations); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionGroupCount(cudnn_conv_desc, - groups)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionGroupCount( + cudnn_conv_desc, groups)); cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( layout, framework::vectorize(transformed_input.dims())); @@ -273,13 +268,12 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { auto handle = dev_ctx.cudnn_handle(); auto workspace_handle = 
dev_ctx.cudnn_workspace_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cudnn_conv_desc, CUDNN_DEFAULT_MATH)); #if CUDA_VERSION >= 11000 && CUDNN_VERSION >= 8000 if (!platform::allow_tf32_cudnn) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionMathType(cudnn_conv_desc, - CUDNN_FMA_MATH)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cudnn_conv_desc, CUDNN_FMA_MATH)); } #endif // CUDA_VERSION >= 11000 && CUDNN_VERSION >= 8000 @@ -292,20 +286,20 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { size_t tmp_size = 0; std::unique_ptr perf_results( new cudnnConvolutionFwdAlgoPerf_t[kNUM_CUDNN_FWD_ALGS]); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, kNUM_CUDNN_FWD_ALGS, &perf_count, perf_results.get())); algo = (perf_results.get())[best_algo_idx].algo; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); if (workspace_size_in_bytes > workspace_size_limit) workspace_size_limit = workspace_size_in_bytes; #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, @@ -319,7 +313,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { std::array fwd_perf_stat; auto cudnn_find_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( handle, cudnn_input_desc, input_data, cudnn_filter_desc, filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, @@ -355,7 +349,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { VLOG(3) << "choose algo " << algo; } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); @@ -375,13 +369,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { // ------------- cudnn conv forward and bias add --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnConvolutionForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnConvolutionForward( handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, filter_data, cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); }; workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnAddTensor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnAddTensor( handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc, output_data)); } else { @@ -392,7 +386,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { ScalingParamType alpha1 = 1.0f; ScalingParamType alpha2 = residual ? 
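// Most of the remaining hunks in this file (and in the files below) only
// rename PADDLE_ENFORCE_CUDA_SUCCESS to PADDLE_ENFORCE_GPU_SUCCESS so one
// status check covers both the CUDA and ROCm code paths. A rough sketch of
// what such a check does, assuming a CUDA-only build and runtime-API calls;
// the macro name and message format below are illustrative, not Paddle's:
#include <cuda_runtime.h>
#include <sstream>
#include <stdexcept>

#define CHECK_GPU_SUCCESS(expr)                                      \
  do {                                                               \
    cudaError_t status__ = (expr);                                   \
    if (status__ != cudaSuccess) {                                   \
      std::ostringstream msg__;                                      \
      msg__ << "GPU call failed: " << cudaGetErrorString(status__);  \
      throw std::runtime_error(msg__.str());                         \
    }                                                                \
  } while (0)

// Example: CHECK_GPU_SUCCESS(cudaMemsetAsync(ptr, 0, bytes, stream));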
1.0f : 0.0f; auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBiasActivationForward( handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, filter_data, cudnn_conv_desc, algo, diff --git a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h index dc703f9a822b5..913772fb65050 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h +++ b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h @@ -15,8 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/operators/fused/cudnn_fusion_helper.h" -#include "paddle/fluid/platform/cudnn_desc.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/cudnn_fusion_helper.h b/paddle/fluid/operators/fused/cudnn_fusion_helper.h index 1de64cf5ad947..13fad0b7cbb3d 100644 --- a/paddle/fluid/operators/fused/cudnn_fusion_helper.h +++ b/paddle/fluid/operators/fused/cudnn_fusion_helper.h @@ -31,19 +31,19 @@ class CudnnFusionOp { public: explicit CudnnFusionOp(cudnnFusedOps_t op_id) : plan_created_(false) { // New 'fused op' descriptor creation - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFusedOpsPlan(&op_, op_id)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateFusedOpsPlan(&op_, op_id)); + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnCreateFusedOpsConstParamPack(&op_const_params_, op_id)); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFusedOpsVariantParamPack( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateFusedOpsVariantParamPack( &op_variant_params_, op_id)); } ~CudnnFusionOp() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnDestroyFusedOpsVariantParamPack(op_variant_params_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnDestroyFusedOpsConstParamPack(op_const_params_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFusedOpsPlan(op_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyFusedOpsPlan(op_)); } // Execute fused op @@ -53,7 +53,7 @@ class CudnnFusionOp { platform::errors::Fatal( "CudnnFusionOp exec requested without a valid 'plan', need: " ", GetWorkspaceSizeBytes(), Execute().")); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnFusedOpsExecute(cudnn_handle, op_, op_variant_params_)); } @@ -61,9 +61,8 @@ class CudnnFusionOp { template void SetOpConstParamDesc(cudnnFusedOpsConstParamLabel_t param_label, T *param_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cudnnSetFusedOpsConstParamPackAttribute( - op_const_params_, param_label, param_ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetFusedOpsConstParamPackAttribute( + op_const_params_, param_label, param_ptr)); plan_created_ = false; } @@ -81,9 +80,8 @@ class CudnnFusionOp { template void SetOpConstParamAttr(cudnnFusedOpsConstParamLabel_t param_label, T param) { - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cudnnSetFusedOpsConstParamPackAttribute(op_const_params_, - param_label, ¶m)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetFusedOpsConstParamPackAttribute( + op_const_params_, param_label, ¶m)); plan_created_ = false; } @@ -101,7 +99,7 @@ class CudnnFusionOp { template void SetOpVariantParamAttrPtr(cudnnFusedOpsVariantParamLabel_t param_label, T *param_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + 
PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnSetFusedOpsVariantParamPackAttribute( op_variant_params_, param_label, param_ptr)); } @@ -120,7 +118,7 @@ class CudnnFusionOp { size_t GetWorkspaceSizeInBytes(cudnnHandle_t cudnn_handle) { if (!plan_created_) { workspace_bytes_ = 0U; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnMakeFusedOpsPlan( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnMakeFusedOpsPlan( cudnn_handle, op_, op_const_params_, &workspace_bytes_)); plan_created_ = true; } diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h index 9b9328a5ca620..c8871388dd450 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -15,8 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/operators/fused/cudnn_fusion_helper.h" -#include "paddle/fluid/platform/cudnn_desc.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -320,7 +319,7 @@ class CudnnNormConvolutionGrad { ScalingParamType beta = use_addto ? 1.0f : 0.0f; ctx.cudnn_workspace_handle().RunFunc( [&](void *cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardData( cudnn_handle, &alpha, args_.filter_desc.desc(), filter_ptr, args_.out_desc.desc(), output_grad_ptr, @@ -370,7 +369,7 @@ class CudnnNormConvolutionGrad { size_t GetWorkspaceSizeBwdData(const platform::CUDADeviceContext &ctx) { size_t workspace_size = 0U; auto handle = ctx.cudnn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( handle, args_.filter_desc.desc(), args_.out_desc.desc(), args_.conv_desc.desc(), args_.in_desc.desc(), dgrad_algo_, diff --git a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h index 5166ff27234f2..d0205208acc47 100644 --- a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h +++ b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h @@ -15,8 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/operators/fused/cudnn_fusion_helper.h" -#include "paddle/fluid/platform/cudnn_desc.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index 9f6d6e2270673..173ef48b83dc2 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -16,8 +16,8 @@ limitations under the License. 
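// CudnnFusionOp above caches its fused-ops plan: every SetOpConstParam* call
// resets plan_created_, and GetWorkspaceSizeInBytes() rebuilds the plan (and
// the workspace size) only when that flag is false. The same lazy-rebuild
// pattern in isolation, with a stand-in for cudnnMakeFusedOpsPlan (names are
// illustrative):
#include <cstddef>

class LazyPlanSketch {
 public:
  void SetParam(int value) {
    param_ = value;
    plan_created_ = false;  // any configuration change invalidates the plan
  }
  size_t WorkspaceBytes() {
    if (!plan_created_) {
      // Stand-in for cudnnMakeFusedOpsPlan(), which reports the workspace size.
      workspace_bytes_ = static_cast<size_t>(param_) * 64;
      plan_created_ = true;
    }
    return workspace_bytes_;
  }

 private:
  int param_ = 0;
  bool plan_created_ = false;
  size_t workspace_bytes_ = 0;
};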
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu index 9339ae8e470de..83328caf3844f 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu @@ -22,7 +22,7 @@ #include "paddle/fluid/operators/fused/fused_bn_activation_op.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/norm_utils.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/float16.h" DECLARE_bool(cudnn_batchnorm_spatial_persistent); @@ -107,22 +107,21 @@ class FusedBatchNormActKernel cudnnTensorDescriptor_t bn_param_desc_; cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); VLOG(3) << "Setting descriptors."; std::vector dims = {N, C, H, W, D}; std::vector strides = {H * W * D * C, 1, W * D * C, D * C, C}; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); double this_factor = 1. - momentum; cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION; @@ -144,7 +143,7 @@ class FusedBatchNormActKernel "The argument ReserveSpace of batch_norm op is not found.")); // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( /*handle=*/handle, @@ -158,7 +157,7 @@ class FusedBatchNormActKernel /*sizeInBytes=*/&workspace_size)); // -------------- cudnn batchnorm reserve space -------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize( /*handle=*/handle, /*mode=*/mode_, @@ -171,7 +170,7 @@ class FusedBatchNormActKernel reserve_space_size); workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTrainingEx( handle, mode_, bnOps_, CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, x->template data(), @@ -190,9 +189,9 @@ class FusedBatchNormActKernel reserve_space_size)); // clean when exit. 
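// fused_attention_op.cu and fused_bn_activation_op.cu above (and several files
// below) stop including cudnn_helper.h / miopen_helper.h directly and pull in
// the single umbrella header platform/device/gpu/gpu_dnn.h instead. That
// header's body is not part of this diff; a plausible sketch of the dispatch
// it performs (the include paths below are assumptions):
#pragma once
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h"  // assumed path
#else
#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h"   // assumed path
#endif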
- PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); } }; @@ -271,9 +270,9 @@ class FusedBatchNormActGradKernel cudnnTensorDescriptor_t bn_param_desc_; cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { LOG(ERROR) << "Provided epsilon is smaller than " @@ -282,12 +281,11 @@ class FusedBatchNormActGradKernel } epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); const auto *saved_mean = ctx.Input("SavedMean"); const auto *saved_var = ctx.Input("SavedVariance"); @@ -305,7 +303,7 @@ class FusedBatchNormActGradKernel cudnnActivationDescriptor_t activation_desc_ = scope_act_desc.descriptor(act_type); // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize( /*handle=*/dev_ctx.cudnn_handle(), /*mode=*/mode_, @@ -322,7 +320,7 @@ class FusedBatchNormActGradKernel workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationBackwardEx( /*handle=*/dev_ctx.cudnn_handle(), /*mode=*/mode_, @@ -358,9 +356,9 @@ class FusedBatchNormActGradKernel /*reserveSpaceSizeInBytes=*/reserve_space_size)); // clean when exit. 
- PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); } }; diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu index c92b13b5f5847..7c124a0d6b661 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu @@ -21,7 +21,7 @@ #include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/norm_utils.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/float16.h" DECLARE_bool(cudnn_batchnorm_spatial_persistent); @@ -87,20 +87,19 @@ class FusedBatchNormAddActKernel cudnnTensorDescriptor_t bn_param_desc_; cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); std::vector dims = {N, C, H, W, D}; std::vector strides = {H * W * D * C, 1, W * D * C, D * C, C}; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, in_dims.size() > 3 ? in_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); double this_factor = 1. - momentum; cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION; @@ -122,7 +121,7 @@ class FusedBatchNormAddActKernel "The argument ReserveSpace of batch_norm op is not found.")); // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( /*handle=*/handle, @@ -136,7 +135,7 @@ class FusedBatchNormAddActKernel /*sizeInBytes=*/&workspace_size)); // -------------- cudnn batchnorm reserve space -------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize( /*handle=*/handle, /*mode=*/mode_, @@ -149,7 +148,7 @@ class FusedBatchNormAddActKernel reserve_space_size); workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTrainingEx( handle, mode_, bnOps_, CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, x->template data(), @@ -169,9 +168,9 @@ class FusedBatchNormAddActKernel reserve_space_size)); // clean when exit. 
- PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); } }; @@ -231,9 +230,9 @@ class FusedBatchNormAddActGradKernel cudnnTensorDescriptor_t bn_param_desc_; cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { LOG(ERROR) << "Provided epsilon is smaller than " @@ -242,12 +241,11 @@ class FusedBatchNormAddActGradKernel } epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, in_dims.size() > 3 ? in_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); const auto *saved_mean = ctx.Input("SavedMean"); const auto *saved_var = ctx.Input("SavedVariance"); @@ -265,7 +263,7 @@ class FusedBatchNormAddActGradKernel cudnnActivationDescriptor_t activation_desc_ = scope_act_desc.descriptor(act_type); // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize( /*handle=*/dev_ctx.cudnn_handle(), /*mode=*/mode_, @@ -281,7 +279,7 @@ class FusedBatchNormAddActGradKernel workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationBackwardEx( /*handle=*/dev_ctx.cudnn_handle(), /*mode=*/mode_, @@ -315,9 +313,9 @@ class FusedBatchNormAddActGradKernel /*reserveSpaceSizeInBytes=*/reserve_space_size)); // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); } }; diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index 049c37f1ea0c4..eb651e4ea7b4f 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -23,10 +23,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/layer_norm_kernel.cu.h" #include "paddle/fluid/operators/math/functors.h" #include "paddle/fluid/platform/aligned_vector.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/gpu_launch_config.h" namespace paddle { namespace operators { @@ -93,7 +93,7 @@ __forceinline__ __device__ void RandVec<8>(curandStatePhilox4_32_10_t *state, template inline void SetZero(const platform::CUDADeviceContext &ctx, T *ptr, const size_t size) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(ptr, 0, size * sizeof(T), ctx.stream())); } diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu index dc068e02be4ec..c5b1fd9392950 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu @@ -22,7 +22,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index f257d3efa433e..1827e137c15f1 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -169,7 +169,7 @@ void LaunchLayernormResidualDropoutBias( auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); memory::Copy(cuda_place, dst, cuda_place, residual, rows * cols * sizeof(T), ctx.stream()); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync( mask_data, 0, rows * cols * sizeof(MaskType), ctx.stream())); // call layernorm forward diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc index ea1e9512ca519..eeeb004003c9c 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #endif #include "paddle/fluid/platform/cudnn_workspace_helper.h" diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index b3796f1df5fdf..44312be797398 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -14,7 +14,7 @@ limitations under the License. 
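// SetZero above now routes its cudaMemsetAsync call through
// PADDLE_ENFORCE_GPU_SUCCESS. The same asynchronous zeroing of a device
// buffer, outside of Paddle and with the status simply returned to the caller
// (minimal sketch):
#include <cstddef>
#include <cuda_runtime.h>

template <typename T>
cudaError_t ZeroDeviceBuffer(T* dev_ptr, size_t count, cudaStream_t stream) {
  // Enqueue the memset on the given stream; the caller checks the status
  // (Paddle wraps this in PADDLE_ENFORCE_GPU_SUCCESS).
  return cudaMemsetAsync(dev_ptr, 0, count * sizeof(T), stream);
}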
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" DECLARE_uint64(conv_workspace_size_limit); @@ -95,15 +95,15 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { cudnnConvolutionDescriptor_t* conv_desc = new cudnnConvolutionDescriptor_t[4]; for (int i = 0; i < 4; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateFilterDescriptor(&filter_desc[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&bias_desc[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&in_desc[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&out_desc[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateConvolutionDescriptor(&conv_desc[i])); } @@ -127,11 +127,11 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { for (int i = 0; i < 4; ++i) { filter_dims.push_back(framework::vectorize(filters[i]->dims())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor( filter_desc[i], cudnn_dtype, format, 4, filter_dims[i].data())); bias_dims.push_back({1, filter_dims[i][0], 1, 1}); bias_strides.push_back({filter_dims[i][0], 1, 1, 1}); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( bias_desc[i], cudnn_dtype, 4, bias_dims[i].data(), bias_strides[i].data())); in_dims.push_back({n, filter_dims[i][1], h, w}); @@ -140,22 +140,21 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { out_strides.push_back({oc * h * w, h * w, w, 1}); if (i < 2) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnSetConvolutionNdDescriptor( conv_desc[i], 2, k0x0.data(), k1x1.data(), k1x1.data(), CUDNN_CROSS_CORRELATION, compute_type)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnSetConvolutionNdDescriptor( conv_desc[i], 2, k1x1.data(), k1x1.data(), k1x1.data(), CUDNN_CROSS_CORRELATION, compute_type)); } - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionMathType(conv_desc[i], - CUDNN_DEFAULT_MATH)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + conv_desc[i], CUDNN_DEFAULT_MATH)); #if CUDA_VERSION >= 11000 && CUDNN_VERSION >= 8000 if (!platform::allow_tf32_cudnn) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnSetConvolutionMathType(conv_desc[i], CUDNN_FMA_MATH)); } @@ -165,7 +164,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { in_strides[2][0] = oc * h * w; out_strides[2][0] = filter_dims[2][0] * h * w; // this out is continuous. 
in_strides[3][0] = filter_dims[2][0] * h * w; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnSetConvolutionGroupCount(conv_desc[2], 2)); cudnnConvolutionFwdAlgo_t algo[4]; @@ -181,9 +180,9 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { } for (int i = 0; i < 4; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( in_desc[i], cudnn_dtype, 4, in_dims[i].data(), in_strides[i].data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( out_desc[i], cudnn_dtype, 4, out_dims[i].data(), out_strides[i].data())); @@ -192,13 +191,13 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { size_t tmp_size = 0; std::unique_ptr perf_results( new cudnnConvolutionFwdAlgoPerf_t[kNUM_CUDNN_FWD_ALGS]); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i], kNUM_CUDNN_FWD_ALGS, &perf_count, perf_results.get())); algo[i] = (perf_results.get())[best_algo_idx].algo; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i], algo[i], &tmp_size)); @@ -215,7 +214,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { // branch1: pool + 1x1 conv ScalingParamType alpha = 1.0f, beta = 0.0f; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnPoolingForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingForward( handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta, pool_out_desc, temp_data)); @@ -237,7 +236,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { for (int i = 0; i < 4; ++i) { auto func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBiasActivationForward( handle, &alpha, in_desc[i], in_datas[i], filter_desc[i], static_cast(filters[i]->data()), conv_desc[i], @@ -252,34 +251,34 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { cudnnTensorDescriptor_t x_desc; cudnnTensorDescriptor_t y_desc; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&x_desc)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&y_desc)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( x_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[2].data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( y_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[3].data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnTransformTensor( handle, CudnnDataType::kOne(), x_desc, static_cast(out_datas[2]), CudnnDataType::kZero(), y_desc, static_cast(output_data + (oc0 + oc1) * h * w))); for (int i = 0; i < 4; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(in_desc[i])); - 
PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(out_desc[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyFilterDescriptor(filter_desc[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(bias_desc[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyConvolutionDescriptor(conv_desc[i])); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(x_desc)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(y_desc)); } }; diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc index 37a442a781571..1fa4225934d39 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace platform { @@ -50,9 +50,9 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { cudnnTensorDescriptor_t in_desc; cudnnTensorDescriptor_t out_desc; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&in_desc)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&out_desc)); cudnnDataType_t cudnn_dtype = CudnnDataType::type; @@ -92,12 +92,12 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { dims_y[i] = 1; } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnTransformTensor( handle, CudnnDataType::kOne(), in_desc, static_cast(ins[k]->data()), CudnnDataType::kZero(), out_desc, static_cast(odata))); @@ -108,9 +108,9 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { odata += flat_shape[1]; } } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(in_desc)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(out_desc)); } }; diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h index 05af4ff150f39..700de8074ff8a 100644 --- a/paddle/fluid/operators/gather.cu.h +++ b/paddle/fluid/operators/gather.cu.h @@ -19,8 +19,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/place.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/graph_send_recv_op.cu b/paddle/fluid/operators/graph_send_recv_op.cu index d9f56ec4dc038..6e5e203e2d943 100644 --- a/paddle/fluid/operators/graph_send_recv_op.cu +++ b/paddle/fluid/operators/graph_send_recv_op.cu @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/graph_send_recv_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/place.h" namespace paddle { diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc index d2002b487ca33..080dadeacaae7 100644 --- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc +++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ // HIP not support cudnnSpatialTfGridGeneratorForward #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace framework { @@ -70,7 +70,7 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel { cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( DataLayout::kNCHW, framework::vectorize(output->dims())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSpatialTfSamplerForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSpatialTfSamplerForward( handle, cudnn_st_desc, CudnnDataType::kOne(), cudnn_input_desc, input_data, grid_data, CudnnDataType::kZero(), cudnn_output_desc, output_data)); @@ -123,13 +123,12 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel { output_grad_desc.descriptor( DataLayout::kNCHW, framework::vectorize(output_grad->dims())); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSpatialTfSamplerBackward( - handle, cudnn_st_dest, CudnnDataType::kOne(), cudnn_input_desc, - input_data, CudnnDataType::kZero(), cudnn_input_grad_desc, - input_grad_data, CudnnDataType::kOne(), cudnn_output_grad_desc, - output_grad_data, grid_data, CudnnDataType::kZero(), - grid_grad_data)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSpatialTfSamplerBackward( + handle, cudnn_st_dest, CudnnDataType::kOne(), cudnn_input_desc, + input_data, CudnnDataType::kZero(), cudnn_input_grad_desc, + input_grad_data, CudnnDataType::kOne(), cudnn_output_grad_desc, + output_grad_data, grid_data, CudnnDataType::kZero(), + grid_grad_data)); } }; diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 0b410f07fcb57..04aa6a3e10f6e 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -17,12 +17,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu index 762d14096a5ab..8e9f445f3b116 100644 --- a/paddle/fluid/operators/grid_sampler_op.cu +++ b/paddle/fluid/operators/grid_sampler_op.cu @@ -15,9 +15,9 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/grid_sampler_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index e029c84090af1..055fd791af5a3 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -21,8 +21,8 @@ namespace cub = hipcub; #endif #include "paddle/fluid/operators/group_norm_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/histogram_op.cu b/paddle/fluid/operators/histogram_op.cu index 6a9183a8b465b..b9419cbcc57b5 100644 --- a/paddle/fluid/operators/histogram_op.cu +++ b/paddle/fluid/operators/histogram_op.cu @@ -14,8 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/histogram_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/index_sample_op.cu b/paddle/fluid/operators/index_sample_op.cu index 46dd91fed6cbc..40a968b8a397d 100644 --- a/paddle/fluid/operators/index_sample_op.cu +++ b/paddle/fluid/operators/index_sample_op.cu @@ -15,8 +15,8 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/index_sample_op.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/index_select_op.cu b/paddle/fluid/operators/index_select_op.cu index 2353781daaa39..acf959896f949 100644 --- a/paddle/fluid/operators/index_select_op.cu +++ b/paddle/fluid/operators/index_select_op.cu @@ -15,7 +15,7 @@ #pragma once #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/index_select_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -110,22 +110,14 @@ class IndexSelectCUDAKernel : public framework::OpKernel { (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, index_data, numel, stride, size, delta); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } else { const int* index_data = index->data(); index_select_cuda_kernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( in_data, out_data, index_data, numel, stride, size, delta); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } } }; @@ -181,11 +173,7 @@ class IndexSelectGradCUDAKernel : public framework::OpKernel { PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data, index_data, index_nums, out_nums, stride, size, delta); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } else { const int* index_data = index->data(); index_select_grad_cuda_kernel<<< @@ -193,11 +181,7 @@ class IndexSelectGradCUDAKernel : public framework::OpKernel { PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data, index_data, index_nums, out_nums, stride, size, delta); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } } }; diff --git a/paddle/fluid/operators/instance_norm_op.cu b/paddle/fluid/operators/instance_norm_op.cu index affd0b7e1edd7..e0401366693b1 100644 --- 
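// The index_select kernels above replace their per-backend
// #ifdef PADDLE_WITH_HIP / cudaStreamSynchronize blocks with a single
// platform::GpuStreamSync(stream) call. Its definition is not shown in this
// diff; a minimal sketch of such a wrapper (the alias and function name here
// are illustrative, error handling elided):
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
#endif

inline void GpuStreamSyncSketch(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
  hipStreamSynchronize(stream);
#else
  cudaStreamSynchronize(stream);
#endif
}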
a/paddle/fluid/operators/instance_norm_op.cu +++ b/paddle/fluid/operators/instance_norm_op.cu @@ -26,12 +26,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/instance_norm_op.h" #include "paddle/fluid/operators/math/math_function.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -114,17 +109,17 @@ class InstanceNormKernel miopenTensorDescriptor_t data_desc_; miopenTensorDescriptor_t in_param_desc_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t in_param_desc_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); #endif if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { @@ -143,20 +138,19 @@ class InstanceNormKernel auto &dev_ctx = ctx.template device_context(); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), const_cast(strides.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDeriveBNTensorDescriptor( in_param_desc_, data_desc_, miopenBNSpatial)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? 
x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); #endif const auto *scale = ctx.Input("Scale"); @@ -202,7 +196,7 @@ class InstanceNormKernel functor(dev_ctx, saved_variance, static_cast>(0)); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenBatchNormalizationForwardTraining( handle, miopenBNSpatial, const_cast( @@ -225,12 +219,12 @@ class InstanceNormKernel saved_variance->template mutable_data>( ctx.GetPlace())))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTraining( handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, x_tmp.template data(), @@ -243,9 +237,9 @@ class InstanceNormKernel saved_variance->template mutable_data>( ctx.GetPlace()))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); #endif } @@ -396,17 +390,17 @@ class InstanceNormGradKernel miopenTensorDescriptor_t data_desc_; miopenTensorDescriptor_t in_param_desc_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t in_param_desc_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); #endif @@ -418,20 +412,19 @@ class InstanceNormGradKernel epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), const_cast(strides.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDeriveBNTensorDescriptor( in_param_desc_, data_desc_, miopenBNSpatial)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? 
x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); #endif const auto *saved_mean = ctx.Input("SavedMean"); @@ -442,7 +435,7 @@ class InstanceNormGradKernel saved_var->template data>(); if (d_scale && d_bias) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenBatchNormalizationBackward( dev_ctx.cudnn_handle(), miopenBNSpatial, CudnnDataType::kOne(), CudnnDataType::kZero(), CudnnDataType::kOne(), @@ -456,7 +449,7 @@ class InstanceNormGradKernel ctx.GetPlace()), epsilon, saved_mean_data, saved_var_data)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationBackward( dev_ctx.cudnn_handle(), CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), CudnnDataType::kZero(), @@ -487,14 +480,14 @@ class InstanceNormGradKernel } #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); #endif } diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index 6be7dbdc110d5..3c857eb326ace 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -12,8 +12,8 @@ #include #include #include "paddle/fluid/operators/interpolate_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index fe9228135606d..bc1ab704aafe3 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -13,9 +13,9 @@ #include #include "paddle/fluid/operators/interpolate_v2_op.h" #include "paddle/fluid/operators/math/math_cuda_utils.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/kernel_primitives/compute_primitives.h b/paddle/fluid/operators/kernel_primitives/compute_primitives.h index 73316d66b6cf2..2320b9e0b2fbf 100644 --- a/paddle/fluid/operators/kernel_primitives/compute_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/compute_primitives.h @@ -21,7 +21,7 @@ #include #endif -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/float16.h" namespace 
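// The new InverseFunctor added to functor_primitives.h above negates its
// argument; upstream it is templated on the input and output types (Tx, Ty).
// A host-side usage sketch of the same behavior (illustrative struct name):
#include <cassert>

struct InverseFunctorSketch {
  float operator()(const float& x) const { return -x; }
};

int main() {
  InverseFunctorSketch negate;
  assert(negate(3.5f) == -3.5f);  // additive inverse, matching the name
  return 0;
}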
paddle { diff --git a/paddle/fluid/operators/kernel_primitives/functor_primitives.h b/paddle/fluid/operators/kernel_primitives/functor_primitives.h index 3fce3b1c0920a..d7aed8595ba05 100644 --- a/paddle/fluid/operators/kernel_primitives/functor_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/functor_primitives.h @@ -86,6 +86,20 @@ struct DivideFunctor { Tx n_inv; }; +/** + * @brief Default inverse functor + */ +template +struct InverseFunctor { + HOSTDEVICE inline InverseFunctor() {} + + HOSTDEVICE explicit inline InverseFunctor(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(-x); + } +}; + /** * @brief Default unary square functor */ diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 4280c86ca99ab..3656bd1a18167 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -23,13 +23,8 @@ namespace cub = hipcub; #endif #include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu index a4f0693323297..4bf2a7cb372cb 100644 --- a/paddle/fluid/operators/linspace_op.cu +++ b/paddle/fluid/operators/linspace_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/linspace_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lite/lite_engine_op.h b/paddle/fluid/operators/lite/lite_engine_op.h index ec9f5dd95d4d0..5d2a1683d381b 100644 --- a/paddle/fluid/operators/lite/lite_engine_op.h +++ b/paddle/fluid/operators/lite/lite_engine_op.h @@ -26,7 +26,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/inference/lite/engine.h" #include "paddle/fluid/inference/lite/tensor_utils.h" diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu index 7c47ad90502eb..6676cde1cafca 100644 --- a/paddle/fluid/operators/log_softmax_op.cu +++ b/paddle/fluid/operators/log_softmax_op.cu @@ -16,7 +16,7 @@ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/log_softmax_op.h" #include "paddle/fluid/operators/math/functors.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 3edea025b2a04..5aa546cbcc21a 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/lookup_table_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" namespace paddle { diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 493966ecda7bd..317f9eeb94f39 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/lookup_table_v2_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" namespace paddle { diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index 7c5e64d2afa46..1deaa3ef1ee7c 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -30,7 +30,7 @@ namespace cub = hipcub; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -69,7 +69,7 @@ void GetClassInterval(const gpuStream_t& stream, const platform::Place& place, platform::DeviceContextPool::Instance().Get(place)) ->stream(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( num_classes_per_device_ptr, num_classes_per_device_ptr, num_classes_per_device.numel(), platform::ToNCCLDataType(num_classes_per_device.type()), ncclSum, @@ -314,7 +314,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( logits_max_buff, logits_max_buff, logits_max.numel(), platform::ToNCCLDataType(logits_max.type()), ncclMax, comm->comm(), stream)); @@ -335,7 +335,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(), platform::ToNCCLDataType(sum_exp_logits.type()), ncclSum, comm->comm(), stream)); @@ -368,7 +368,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( loss_ptr, loss_ptr, loss->numel(), platform::ToNCCLDataType(loss->type()), ncclSum, comm->comm(), stream)); diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index ed3ead47d171e..0cc552d34c587 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the 
License. */ #include "paddle/fluid/operators/math/beam_search.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 70c6cf9dcab03..92162e639ff86 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -17,7 +17,7 @@ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/dynload/cublas.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" DECLARE_bool(enable_cublas_tensor_op_math); @@ -32,33 +32,33 @@ template <> struct CUBlas { template static void GEMM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSgemm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgemm(args...)); } template static void AXPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSaxpy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSaxpy(args...)); } template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSscal(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSscal(args...)); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasScopy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasScopy(args...)); } template static void GEMV(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSgemv(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgemv(args...)); } template static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasSgemmStridedBatched(args...)); #else PADDLE_THROW(platform::errors::Unimplemented( @@ -82,7 +82,7 @@ struct CUBlas { VLOG(5) << "use_tensor_op_math: " << (dev_ctx->tensor_core_available() ? "True" : "False"); dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSgemmEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgemmEx( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc)); }); @@ -94,36 +94,33 @@ struct CUBlas { template static void TRSM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasStrsm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasStrsm(args...)); } template static void GETRF_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cublasSgetrfBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgetrfBatched(args...)); } template static void GETRI_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cublasSgetriBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgetriBatched(args...)); } template static void MATINV_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasSmatinvBatched(args...)); } template static void GETRS_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cublasSgetrsBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgetrsBatched(args...)); } template static void TRSM_BATCH(ARGS... 
args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasStrsmBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasStrsmBatched(args...)); } }; @@ -131,33 +128,33 @@ template <> struct CUBlas { template static void GEMM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDgemm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDgemm(args...)); } template static void AXPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDaxpy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDaxpy(args...)); } template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDscal(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDscal(args...)); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDcopy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDcopy(args...)); } template static void GEMV(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDgemv(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDgemv(args...)); } template static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasDgemmStridedBatched(args...)); #else PADDLE_THROW(platform::errors::Unimplemented( @@ -173,36 +170,33 @@ struct CUBlas { template static void TRSM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDtrsm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDtrsm(args...)); } template static void GETRF_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cublasDgetrfBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDgetrfBatched(args...)); } template static void GETRI_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cublasDgetriBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDgetriBatched(args...)); } template static void MATINV_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasDmatinvBatched(args...)); } template static void GETRS_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cublasDgetrsBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDgetrsBatched(args...)); } template static void TRSM_BATCH(ARGS... 
args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDtrsmBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDtrsmBatched(args...)); } }; @@ -215,7 +209,7 @@ struct CUBlas { const float16 *alpha, const float16 *A, int lda, const float16 *B, int ldb, const float16 *beta, float16 *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasHgemm(handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -235,7 +229,7 @@ struct CUBlas { long long int strideC, // NOLINT int batchCount) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasHgemmStridedBatched( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasHgemmStridedBatched( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, strideA, @@ -270,7 +264,7 @@ struct CUBlas { #endif // CUDA_VERSION >= 9000 dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasGemmEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo)); }); @@ -289,7 +283,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCgemv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasCgemv( handle, transa, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, reinterpret_cast(B), ldb, @@ -301,7 +295,7 @@ struct CUBlas> { const platform::complex *alpha, const platform::complex *X, const int incX, platform::complex *Y, const int incY) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCaxpy( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasCaxpy( handle, n, reinterpret_cast(alpha), reinterpret_cast(X), incX, reinterpret_cast(Y), incY)); @@ -320,7 +314,7 @@ struct CUBlas> { long long int strideC, // NOLINT int batchCount) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCgemmStridedBatched( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasCgemmStridedBatched( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, strideA, @@ -340,7 +334,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCgemm( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasCgemm( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -355,7 +349,7 @@ struct CUBlas> { const paddle::platform::complex *alpha, const paddle::platform::complex *A, int lda, paddle::platform::complex *B, int ldb) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCtrsm( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasCtrsm( handle, side, uplo, transa, diag, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -384,7 +378,7 @@ struct CUBlas> { #endif // CUDA_VERSION >= 9000 dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasGemmEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo)); }); @@ -401,7 +395,7 @@ struct CUBlas> { const paddle::platform::complex **A, int lda, paddle::platform::complex **B, int ldb, int 
batch_size) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCtrsmBatched( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasCtrsmBatched( handle, side, uplo, transa, diag, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -417,7 +411,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZgemv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasZgemv( handle, transa, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, reinterpret_cast(B), ldb, @@ -429,7 +423,7 @@ struct CUBlas> { const platform::complex *alpha, const platform::complex *X, const int incX, platform::complex *Y, const int incY) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZaxpy( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasZaxpy( handle, n, reinterpret_cast(alpha), reinterpret_cast(X), incX, reinterpret_cast(Y), incY)); @@ -448,7 +442,7 @@ struct CUBlas> { long long int strideC, // NOLINT int batchCount) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZgemmStridedBatched( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasZgemmStridedBatched( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, strideA, @@ -468,7 +462,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZgemm( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasZgemm( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -483,7 +477,7 @@ struct CUBlas> { const paddle::platform::complex *alpha, const paddle::platform::complex *A, int lda, paddle::platform::complex *B, int ldb) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZtrsm( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasZtrsm( handle, side, uplo, transa, diag, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -497,7 +491,7 @@ struct CUBlas> { const paddle::platform::complex **A, int lda, paddle::platform::complex **B, int ldb, int batch_size) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZtrsmBatched( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasZtrsmBatched( handle, side, uplo, transa, diag, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -526,7 +520,7 @@ struct CUBlas> { #endif // CUDA_VERSION >= 9000 dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasGemmEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo)); }); @@ -842,7 +836,7 @@ void Blas::BatchedGEMM( auto fp = std::is_same::value ? 
CUDA_R_32F : CUDA_R_16F; context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( handle, cuTransB, cuTransA, N, M, K, &alpha, B, fp, ldb, strideB, A, fp, lda, strideA, &beta, C, fp, ldc, strideC, batchCount, fp, algo)); }); diff --git a/paddle/fluid/operators/math/blas_impl.hip.h b/paddle/fluid/operators/math/blas_impl.hip.h index f972d38adda5f..32479189eea58 100644 --- a/paddle/fluid/operators/math/blas_impl.hip.h +++ b/paddle/fluid/operators/math/blas_impl.hip.h @@ -15,8 +15,8 @@ #pragma once #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/dynload/rocblas.h" -#include "paddle/fluid/platform/gpu_info.h" DECLARE_bool(enable_cublas_tensor_op_math); @@ -31,32 +31,32 @@ template <> struct CUBlas { template static void GEMM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_sgemm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_sgemm(args...)); } template static void AXPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_saxpy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_saxpy(args...)); } template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_sscal(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_sscal(args...)); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_scopy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_scopy(args...)); } template static void GEMV(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_sgemv(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_sgemv(args...)); } template static void GEMM_STRIDED_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::rocblas_sgemm_strided_batched(args...)); } @@ -70,7 +70,7 @@ struct CUBlas { template static void TRSM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_strsm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_strsm(args...)); } template @@ -102,32 +102,32 @@ template <> struct CUBlas { template static void GEMM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_dgemm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_dgemm(args...)); } template static void AXPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_daxpy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_daxpy(args...)); } template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_dscal(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_dscal(args...)); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_dcopy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_dcopy(args...)); } template static void GEMV(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_dgemv(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_dgemv(args...)); } template static void GEMM_STRIDED_BATCH(ARGS... 
args) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::rocblas_dgemm_strided_batched(args...)); } @@ -139,7 +139,7 @@ struct CUBlas { template static void TRSM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_dtrsm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_dtrsm(args...)); } template @@ -176,7 +176,7 @@ struct CUBlas { const float16 *alpha, const float16 *A, int lda, const float16 *B, int ldb, const float16 *beta, float16 *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_hgemm( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_hgemm( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -195,14 +195,13 @@ struct CUBlas { const float16 *beta, float16 *C, int ldc, long long int strideC, // NOLINT int batchCount) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::rocblas_hgemm_strided_batched( - handle, transa, transb, m, n, k, - reinterpret_cast(alpha), - reinterpret_cast(A), lda, strideA, - reinterpret_cast(B), ldb, strideB, - reinterpret_cast(beta), - reinterpret_cast(C), ldc, strideC, batchCount)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_hgemm_strided_batched( + handle, transa, transb, m, n, k, + reinterpret_cast(alpha), + reinterpret_cast(A), lda, strideA, + reinterpret_cast(B), ldb, strideB, + reinterpret_cast(beta), + reinterpret_cast(C), ldc, strideC, batchCount)); } // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. @@ -217,7 +216,7 @@ struct CUBlas { rocblas_datatype computeType) { rocblas_gemm_algo algo = rocblas_gemm_algo_standard; dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_gemm_ex( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); }); @@ -232,7 +231,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_cgemv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_cgemv( handle, transa, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -245,7 +244,7 @@ struct CUBlas> { const platform::complex *alpha, const platform::complex *X, const int incX, platform::complex *Y, const int incY) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_caxpy( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_caxpy( handle, n, reinterpret_cast(alpha), reinterpret_cast(X), incX, reinterpret_cast(Y), incY)); @@ -263,15 +262,14 @@ struct CUBlas> { platform::complex *C, int ldc, long long int strideC, // NOLINT int batchCount) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::rocblas_cgemm_strided_batched( - handle, transa, transb, m, n, k, - reinterpret_cast(alpha), - reinterpret_cast(A), lda, strideA, - reinterpret_cast(B), ldb, strideB, - reinterpret_cast(beta), - reinterpret_cast(C), ldc, strideC, - batchCount)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_cgemm_strided_batched( + handle, transa, transb, m, n, k, + reinterpret_cast(alpha), + reinterpret_cast(A), lda, strideA, + reinterpret_cast(B), ldb, strideB, + reinterpret_cast(beta), + reinterpret_cast(C), ldc, strideC, + batchCount)); } static void GEMM(rocblas_handle handle, rocblas_operation transa, @@ -281,7 +279,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const 
platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_cgemm( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_cgemm( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -302,7 +300,7 @@ struct CUBlas> { rocblas_datatype computeType) { rocblas_gemm_algo algo = rocblas_gemm_algo_standard; dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_gemm_ex( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); }); @@ -317,7 +315,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_zgemv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_zgemv( handle, transa, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -330,7 +328,7 @@ struct CUBlas> { const platform::complex *alpha, const platform::complex *X, const int incX, platform::complex *Y, const int incY) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_zaxpy( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_zaxpy( handle, n, reinterpret_cast(alpha), reinterpret_cast(X), incX, reinterpret_cast(Y), incY)); @@ -348,15 +346,14 @@ struct CUBlas> { platform::complex *C, int ldc, long long int strideC, // NOLINT int batchCount) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::rocblas_zgemm_strided_batched( - handle, transa, transb, m, n, k, - reinterpret_cast(alpha), - reinterpret_cast(A), lda, strideA, - reinterpret_cast(B), ldb, strideB, - reinterpret_cast(beta), - reinterpret_cast(C), ldc, strideC, - batchCount)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_zgemm_strided_batched( + handle, transa, transb, m, n, k, + reinterpret_cast(alpha), + reinterpret_cast(A), lda, strideA, + reinterpret_cast(B), ldb, strideB, + reinterpret_cast(beta), + reinterpret_cast(C), ldc, strideC, + batchCount)); } static void GEMM(rocblas_handle handle, rocblas_operation transa, @@ -366,7 +363,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_zgemm( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_zgemm( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -387,7 +384,7 @@ struct CUBlas> { rocblas_datatype computeType) { rocblas_gemm_algo algo = rocblas_gemm_algo_standard; dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_gemm_ex( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); }); diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index 614ae93d9fa82..bc2d496a3e76a 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -19,7 +19,7 @@ limitations under the License. 
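Note on the blas_impl.cu.h / blas_impl.hip.h hunks above: the CUDA-only PADDLE_ENFORCE_CUDA_SUCCESS macro is replaced by PADDLE_ENFORCE_GPU_SUCCESS, and GPU headers such as gpu_info.h move under paddle/fluid/platform/device/gpu/, so the same wrapper code checks cuBLAS and rocBLAS status codes without backend-specific macros. The snippet below is only a self-contained analogue of that call-site shape, not Paddle code; the enum, the macro name, and FakeGemm are illustrative stand-ins.

#include <cstdio>
#include <cstdlib>

// Stand-in for cudaError_t / hipError_t / cublasStatus_t and their success value.
enum GpuStatus { kGpuSuccess = 0, kGpuError = 1 };

// Analogue of PADDLE_ENFORCE_GPU_SUCCESS: evaluate the vendor call once and
// abort with the failing expression if it did not return the success code.
#define ENFORCE_GPU_SUCCESS(expr)                                      \
  do {                                                                 \
    GpuStatus _s = (expr);                                             \
    if (_s != kGpuSuccess) {                                           \
      std::fprintf(stderr, "GPU call failed: %s -> %d\n", #expr, _s);  \
      std::abort();                                                    \
    }                                                                  \
  } while (0)

GpuStatus FakeGemm() { return kGpuSuccess; }  // placeholder for cublasSgemm / rocblas_sgemm

int main() {
  ENFORCE_GPU_SUCCESS(FakeGemm());  // identical call-site shape for CUDA and ROCm builds
  return 0;
}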
*/ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -287,13 +287,11 @@ class ConcatFunctor { const T** dev_ins_data = nullptr; if (!has_same_shape || in_num < 2 || in_num > 4) { tmp_dev_ins_data = memory::Alloc(context, in_num * sizeof(T*)); - { - platform::SkipCUDAGraphCaptureGuard guard; - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_dev_ins_data->ptr(), platform::CPUPlace(), - static_cast(inputs_data), in_num * sizeof(T*), - context.stream()); - } + auto* restored = + platform::RestoreHostMemIfCapturingCUDAGraph(inputs_data, in_num); + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), + tmp_dev_ins_data->ptr(), platform::CPUPlace(), restored, + in_num * sizeof(T*), context.stream()); dev_ins_data = reinterpret_cast(tmp_dev_ins_data->ptr()); } @@ -317,13 +315,12 @@ class ConcatFunctor { } else { auto tmp_dev_ins_col_data = memory::Alloc(context, inputs_col_num * sizeof(int64_t)); - { - platform::SkipCUDAGraphCaptureGuard guard; - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), - static_cast(inputs_col), - inputs_col_num * sizeof(int64_t), context.stream()); - } + + auto* restored = platform::RestoreHostMemIfCapturingCUDAGraph( + inputs_col, inputs_col_num); + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), + tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), restored, + inputs_col_num * sizeof(int64_t), context.stream()); int64_t* dev_ins_col_data = static_cast(tmp_dev_ins_col_data->ptr()); @@ -422,13 +419,11 @@ class SplitFunctor { T** dev_out_gpu_data = nullptr; if (!has_same_shape || o_num < 2 || o_num > 4) { tmp_dev_outs_data = memory::Alloc(context, o_num * sizeof(T*)); - { - platform::SkipCUDAGraphCaptureGuard guard; - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_dev_outs_data->ptr(), platform::CPUPlace(), - reinterpret_cast(outputs_data), o_num * sizeof(T*), - context.stream()); - } + auto* restored = + platform::RestoreHostMemIfCapturingCUDAGraph(outputs_data, o_num); + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), + tmp_dev_outs_data->ptr(), platform::CPUPlace(), restored, + o_num * sizeof(T*), context.stream()); dev_out_gpu_data = reinterpret_cast(tmp_dev_outs_data->ptr()); } @@ -452,13 +447,11 @@ class SplitFunctor { } else { auto tmp_dev_ins_col_data = memory::Alloc(context, outputs_cols_num * sizeof(int64_t)); - { - platform::SkipCUDAGraphCaptureGuard guard; - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), - reinterpret_cast(outputs_cols), - outputs_cols_num * sizeof(int64_t), context.stream()); - } + auto* restored = platform::RestoreHostMemIfCapturingCUDAGraph( + outputs_cols, outputs_cols_num); + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), + tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), restored, + outputs_cols_num * sizeof(int64_t), context.stream()); int64_t* dev_outs_col_data = reinterpret_cast(tmp_dev_ins_col_data->ptr()); diff --git a/paddle/fluid/operators/math/cos_sim_functor.cu b/paddle/fluid/operators/math/cos_sim_functor.cu index 537c7e47155fe..56ba145da1cad 100644 
--- a/paddle/fluid/operators/math/cos_sim_functor.cu +++ b/paddle/fluid/operators/math/cos_sim_functor.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/cos_sim_functor.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index 55662e1d0aad7..3e80e40f3577c 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -14,8 +14,8 @@ limitations under the License. */ #include "paddle/fluid/operators/math.h" #include "paddle/fluid/operators/math/cross_entropy.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index 6da1bfb964f24..6ff2ddaa338df 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -23,8 +23,8 @@ namespace cub = hipcub; #endif #include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h index 62c45f4dc098b..75d4809a462cb 100644 --- a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h +++ b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/gru_compute.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { diff --git a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h index 24885d37020dc..851a62dbe9a48 100644 --- a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h @@ -17,7 +17,7 @@ limitations under the License. 
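The ConcatFunctor/SplitFunctor hunks above drop the SkipCUDAGraphCaptureGuard block and instead route the host-side pointer and column arrays through platform::RestoreHostMemIfCapturingCUDAGraph before memory::Copy. The likely intent is that, while a CUDA graph is being captured, the host-to-device copy is recorded against host memory that stays valid for replay rather than being skipped. The sketch below is only an analogue of that idea with invented names; it is not the Paddle implementation.

#include <cstring>
#include <deque>
#include <vector>

static bool g_capturing = false;                       // stand-in for the CUDA-graph capture flag
static std::deque<std::vector<unsigned char>> g_kept;  // host copies kept alive for graph replay

// Analogue of RestoreHostMemIfCapturingCUDAGraph: outside capture, hand back the
// caller's transient buffer; during capture, snapshot it into long-lived storage
// and return that pointer, so the recorded H2D copy reads valid memory on replay.
template <typename T>
const T* RestoreHostMemIfCapturing(const T* host_ptr, size_t n) {
  if (!g_capturing) return host_ptr;
  g_kept.emplace_back(n * sizeof(T));
  std::memcpy(g_kept.back().data(), host_ptr, n * sizeof(T));
  return reinterpret_cast<const T*>(g_kept.back().data());
}

int main() {
  const long long cols[3] = {0, 4, 8};                 // e.g. inputs_col in ConcatFunctor
  const long long* src = RestoreHostMemIfCapturing(cols, 3);
  (void)src;                                           // would be what memory::Copy reads from
  return 0;
}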
*/ #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/lstm_compute.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index 01f05530e34e6..b24f5d40e8dca 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -184,14 +184,12 @@ struct MatrixEighFunctor { values_stride >= 32 && values_stride <= 512); syevjInfo_t syevj_params; if (use_syevj) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnCreateSyevjInfo(&syevj_params)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cusolverDnSsyevj_bufferSize( - dev_ctx.cusolver_dn_handle(), jobz, uplo, n, - reinterpret_cast(input_vector), lda, - reinterpret_cast(out_value), &lwork, - syevj_params)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj_bufferSize( + dev_ctx.cusolver_dn_handle(), jobz, uplo, n, + reinterpret_cast(input_vector), lda, + reinterpret_cast(out_value), &lwork, syevj_params)); } else { EvdBuffer(dev_ctx.cusolver_dn_handle(), jobz, uplo, n, input_vector, lda, out_value, &lwork); @@ -203,7 +201,7 @@ struct MatrixEighFunctor { auto *value_data = out_value + i * values_stride; auto handle = dev_ctx.cusolver_dn_handle(); if (use_syevj) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSsyevj( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj( handle, jobz, uplo, n, reinterpret_cast(input_data), lda, reinterpret_cast(value_data), reinterpret_cast(work_ptr), lwork, info_ptr, @@ -220,7 +218,7 @@ struct MatrixEighFunctor { } if (use_syevj) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDestroySyevjInfo(syevj_params)); } if (has_vectors) { @@ -255,7 +253,7 @@ struct MatrixEighFunctor { cusolverDnHandle_t handle, cusolverEigMode_t jobz, \ cublasFillMode_t uplo, int n, const T *A, int lda, const ValueType *W, \ int *lwork) const { \ - PADDLE_ENFORCE_CUDA_SUCCESS( \ + PADDLE_ENFORCE_GPU_SUCCESS( \ platform::dynload::cusolverDn##C##evd_bufferSize( \ handle, jobz, uplo, n, reinterpret_cast(A), lda, \ W, lwork)); \ @@ -269,7 +267,7 @@ FUNC_WITH_TYPES(EVDBUFFER_INSTANCE); cusolverDnHandle_t handle, cusolverEigMode_t jobz, \ cublasFillMode_t uplo, int n, T *A, int lda, ValueType *W, T *work, \ int lwork, int *devInfo) const { \ - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDn##C##evd( \ + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDn##C##evd( \ handle, jobz, uplo, n, reinterpret_cast(A), lda, W, \ reinterpret_cast(work), lwork, devInfo)); \ } diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu index 3eadaa2677ab4..f616e116d0aee 100644 --- a/paddle/fluid/operators/math/im2col.cu +++ b/paddle/fluid/operators/math/im2col.cu @@ -15,8 +15,8 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 71080bf424a01..54a37db1df71a 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -42,7 +42,7 @@ static void CubInclusiveScan(InputIterator x_iter, OutputIterator y_iter, void *temp_storage = nullptr; size_t temp_storage_bytes = 0; for (size_t i = 0; i < 2; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceScan::InclusiveScan( + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::InclusiveScan( temp_storage, temp_storage_bytes, x_iter, y_iter, op, static_cast(n), // Maybe overflow? dev_ctx.stream())); diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu index 8b134a29d81cf..1856fb4eb48c7 100644 --- a/paddle/fluid/operators/math/maxouting.cu +++ b/paddle/fluid/operators/math/maxouting.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/maxouting.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index 84a970a9a2606..076d3aa3361f0 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -16,10 +16,10 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/pooling.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/fast_divmod.h" -#include "paddle/fluid/platform/gpu_launch_config.h" #ifdef __HIPCC__ #define POOLING_BLOCK_SIZE 256 diff --git a/paddle/fluid/operators/math/prelu.h b/paddle/fluid/operators/math/prelu.h index dc1e3c1c3ded1..70aae2ba59e2c 100644 --- a/paddle/fluid/operators/math/prelu.h +++ b/paddle/fluid/operators/math/prelu.h @@ -16,11 +16,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/math/math_function.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu index 446acc033eb7f..f596c1bc3dcf3 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -144,13 +144,13 @@ void GPUSampleWithProb::operator()( VLOG(1) << "num_tries: " << num_tries; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipMemcpy(samples_data + num_true, s_data, - sizeof(int64_t) * num_samples, - hipMemcpyHostToDevice)); + PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(samples_data + num_true, s_data, + sizeof(int64_t) * num_samples, + hipMemcpyHostToDevice)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy(samples_data + num_true, s_data, - sizeof(int64_t) * num_samples, - cudaMemcpyHostToDevice)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(samples_data + num_true, s_data, + sizeof(int64_t) * num_samples, + cudaMemcpyHostToDevice)); #endif int threads = 512; diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/fluid/operators/math/segment_pooling.cu index b49b5036ac42e..67cf316246007 100644 --- a/paddle/fluid/operators/math/segment_pooling.cu +++ b/paddle/fluid/operators/math/segment_pooling.cu @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/segment_pooling.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index f3ef537a31b44..0e04c37ed2b12 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" namespace paddle { diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index cba8dd935ef1b..b3e1922e10657 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -16,7 +16,7 @@ limitations under the License. 
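The prelu.h hunk above (and softmax.cu further on) replaces the per-file "#ifdef PADDLE_WITH_HIP ? miopen_helper.h : cudnn_helper.h" pair with the single paddle/fluid/platform/device/gpu/gpu_dnn.h header, which presumably performs that dispatch once. A minimal self-contained illustration of the pattern; the tag types stand in for miopenHandle_t / cudnnHandle_t and the real includes are omitted.

struct MiopenHandleTag;  // stand-in for miopenHandle_t
struct CudnnHandleTag;   // stand-in for cudnnHandle_t

#if defined(PADDLE_WITH_HIP)
using dnnHandle_t = MiopenHandleTag*;  // gpu_dnn.h would pull in the MIOpen wrapper here
#else
using dnnHandle_t = CudnnHandleTag*;   // ...and the cuDNN wrapper here
#endif

int main() {
  dnnHandle_t handle = nullptr;  // downstream kernels are written once against the unified alias
  (void)handle;
  return 0;
}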
*/ #include #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_pooling.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/macros.h" namespace paddle { diff --git a/paddle/fluid/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu index 5578f1f0138c4..1807c77e37ca1 100644 --- a/paddle/fluid/operators/math/sequence_scale.cu +++ b/paddle/fluid/operators/math/sequence_scale.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/sequence_scale.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index 9e9fe5b9c1020..bc32e068f566d 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -16,11 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/operators/math/softmax_impl.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -54,7 +50,7 @@ void SoftmaxCUDNNFunctor::operator()( xDesc.descriptor(layout, cudnn_tensor_dims); miopenTensorDescriptor_t cudnn_y_desc = xDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( context.cudnn_handle(), CudnnDataType::kOne(), cudnn_x_desc, X->data(), CudnnDataType::kZero(), cudnn_y_desc, Y->mutable_data(context.GetPlace()), MIOPEN_SOFTMAX_ACCURATE, @@ -64,7 +60,7 @@ void SoftmaxCUDNNFunctor::operator()( xDesc.descriptor(layout, cudnn_tensor_dims); cudnnTensorDescriptor_t cudnn_y_desc = xDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxForward( context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType::kOne(), cudnn_x_desc, X->data(), CudnnDataType::kZero(), cudnn_y_desc, @@ -97,7 +93,7 @@ void SoftmaxGradCUDNNFunctor::operator()( dxDesc.descriptor(layout, cudnn_tensor_dims); miopenTensorDescriptor_t cudnn_ygrad_desc = dyDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( context.cudnn_handle(), CudnnDataType::kOne(), cudnn_y_desc, Y->data(), cudnn_ygrad_desc, YGrad->data(), CudnnDataType::kZero(), cudnn_xgrad_desc, @@ -110,7 +106,7 @@ void SoftmaxGradCUDNNFunctor::operator()( dxDesc.descriptor(layout, cudnn_tensor_dims); cudnnTensorDescriptor_t cudnn_ygrad_desc = dyDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxBackward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxBackward( context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType::kOne(), cudnn_y_desc, Y->data(), cudnn_ygrad_desc, YGrad->data(), diff --git 
a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu index ad23892f37903..dbb3d64350cae 100644 --- a/paddle/fluid/operators/math/unpooling.cu +++ b/paddle/fluid/operators/math/unpooling.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/unpooling.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu index d83b5b0fe3afb..d9c757544a9c6 100644 --- a/paddle/fluid/operators/math/vol2col.cu +++ b/paddle/fluid/operators/math/vol2col.cu @@ -15,8 +15,8 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/math/vol2col.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/matrix_rank_op.cu b/paddle/fluid/operators/matrix_rank_op.cu index d85a262b5e910..757c780b4ea53 100644 --- a/paddle/fluid/operators/matrix_rank_op.cu +++ b/paddle/fluid/operators/matrix_rank_op.cu @@ -162,9 +162,9 @@ void MatrixRankGPUKernel::GesvdjBatched( int ldt = n; int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgesvdj_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj_bufferSize( handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, gesvdj_params)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); @@ -173,7 +173,7 @@ void MatrixRankGPUKernel::GesvdjBatched( int stride_U = ldu * (thin_UV ? k : m); int stride_V = ldt * (thin_UV ? k : n); for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgesvdj( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj( handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, info, gesvdj_params)); @@ -186,7 +186,7 @@ void MatrixRankGPUKernel::GesvdjBatched( platform::errors::PreconditionNotMet( "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); } @@ -203,9 +203,9 @@ void MatrixRankGPUKernel::GesvdjBatched( int ldt = n; int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgesvdj_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj_bufferSize( handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, gesvdj_params)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); @@ -214,7 +214,7 @@ void MatrixRankGPUKernel::GesvdjBatched( int stride_U = ldu * (thin_UV ? k : m); int stride_V = ldt * (thin_UV ? 
k : n); for (int i = 0; i < batchSize; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgesvdj( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj( handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, info, gesvdj_params)); @@ -228,7 +228,7 @@ void MatrixRankGPUKernel::GesvdjBatched( platform::errors::PreconditionNotMet( "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); } @@ -247,14 +247,14 @@ void MatrixRankGPUKernel::SyevjBatched( int stride_A = lda * n; int lwork = 0; syevjInfo_t params = NULL; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnCreateSyevjInfo(¶ms)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSsyevj_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj_bufferSize( handle, jobz, uplo, n, A, lda, W, &lwork, params)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); float* workspace_ptr = reinterpret_cast(workspace->ptr()); for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSsyevj( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj( handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr, lwork, info, params)); @@ -268,7 +268,7 @@ void MatrixRankGPUKernel::SyevjBatched( "For batch [%d]: CUSolver eigenvalues is not zero. [%d]", i, error_info)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDestroySyevjInfo(params)); } @@ -285,15 +285,15 @@ void MatrixRankGPUKernel::SyevjBatched( int stride_A = lda * n; int lwork = 0; syevjInfo_t params = NULL; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnCreateSyevjInfo(¶ms)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDsyevj_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDsyevj_bufferSize( handle, jobz, uplo, n, A, lda, W, &lwork, params)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); double* workspace_ptr = reinterpret_cast(workspace->ptr()); for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDsyevj( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDsyevj( handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr, lwork, info, params)); int error_info; @@ -306,7 +306,7 @@ void MatrixRankGPUKernel::SyevjBatched( "For batch [%d]: CUSolver eigenvalues is not zero. [%d]", i, error_info)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDestroySyevjInfo(params)); } diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu index 7098a720cc3a0..79aff52a16fa9 100644 --- a/paddle/fluid/operators/mean_iou_op.cu +++ b/paddle/fluid/operators/mean_iou_op.cu @@ -15,8 +15,8 @@ limitations under the License. 
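GesvdjBatched and SyevjBatched above keep cuSOLVER's usual four-step sequence: create the params object, query the workspace size with the *_bufferSize call, run the solver per batch item, then destroy the params; only the error-check macro changes. For reference, a standalone sketch of that sequence for a single 2x2 symmetric eigenproblem; error handling is trimmed to asserts and it needs the CUDA toolkit to build.

#include <cassert>
#include <cstdio>
#include <cuda_runtime.h>
#include <cusolverDn.h>

int main() {
  // Column-major symmetric matrix {{2,1},{1,2}}; its eigenvalues are 1 and 3.
  const int n = 2, lda = 2;
  const float hA[4] = {2.f, 1.f, 1.f, 2.f};
  float *dA = nullptr, *dW = nullptr, *dWork = nullptr;
  int *dInfo = nullptr, lwork = 0, hInfo = -1;

  cusolverDnHandle_t handle;
  syevjInfo_t params;
  assert(cusolverDnCreate(&handle) == CUSOLVER_STATUS_SUCCESS);
  assert(cusolverDnCreateSyevjInfo(&params) == CUSOLVER_STATUS_SUCCESS);  // step 1: params

  cudaMalloc(&dA, sizeof(hA));
  cudaMalloc(&dW, n * sizeof(float));
  cudaMalloc(&dInfo, sizeof(int));
  cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);

  assert(cusolverDnSsyevj_bufferSize(handle, CUSOLVER_EIG_MODE_VECTOR,    // step 2: size query
                                     CUBLAS_FILL_MODE_LOWER, n, dA, lda,
                                     dW, &lwork, params) == CUSOLVER_STATUS_SUCCESS);
  cudaMalloc(&dWork, lwork * sizeof(float));

  assert(cusolverDnSsyevj(handle, CUSOLVER_EIG_MODE_VECTOR,               // step 3: solve
                          CUBLAS_FILL_MODE_LOWER, n, dA, lda, dW,
                          dWork, lwork, dInfo, params) == CUSOLVER_STATUS_SUCCESS);
  cudaMemcpy(&hInfo, dInfo, sizeof(int), cudaMemcpyDeviceToHost);
  std::printf("syevj info = %d (0 means converged)\n", hInfo);

  cusolverDnDestroySyevjInfo(params);                                     // step 4: cleanup
  cusolverDnDestroy(handle);
  cudaFree(dWork); cudaFree(dInfo); cudaFree(dW); cudaFree(dA);
  return 0;
}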
*/ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/mean_iou_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu index 430036bc67de7..1a10b7033f69e 100644 --- a/paddle/fluid/operators/mean_op.cu +++ b/paddle/fluid/operators/mean_op.cu @@ -19,7 +19,7 @@ limitations under the License. */ namespace cub = hipcub; #endif #include "paddle/fluid/operators/mean_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -65,14 +65,14 @@ class MeanCUDAKernel : public framework::OpKernel { auto err = cub::DeviceReduce::Sum(nullptr, temp_storage_bytes, trans_x, out_data, size_prob, stream); - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(err); framework::Tensor tmp; auto* temp_storage = tmp.mutable_data( framework::make_ddim({static_cast(temp_storage_bytes)}), context.GetPlace()); err = cub::DeviceReduce::Sum(temp_storage, temp_storage_bytes, trans_x, out_data, size_prob, stream); - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(err); } }; diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index 3998db6731b3d..43ac5984bc8c8 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -41,7 +41,12 @@ class MemcpyH2DFunctor { void operator()(const framework::LoDTensor &lod_tensor) const { auto &out_tensor = *out_->GetMutable(); - +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + out_tensor.mutable_data( + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_.GetPlace()), + lod_tensor.type(), + static_cast(&dev_ctx_)->stream()); +#endif if (dst_place_type_ == 0 || dst_place_type_ == 1) { framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor); diff --git a/paddle/fluid/operators/metrics/accuracy_op.cu b/paddle/fluid/operators/metrics/accuracy_op.cu index 3d22fc60993c7..6f19100fa9d37 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cu +++ b/paddle/fluid/operators/metrics/accuracy_op.cu @@ -15,9 +15,9 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/metrics/accuracy_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/gpu_info.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/metrics/auc_op.cu b/paddle/fluid/operators/metrics/auc_op.cu index 40609381c17ae..1cb7eba8775e8 100644 --- a/paddle/fluid/operators/metrics/auc_op.cu +++ b/paddle/fluid/operators/metrics/auc_op.cu @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/metrics/auc_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/miopen_lstm_cache.h b/paddle/fluid/operators/miopen_lstm_cache.h index a357e6e5af6af..c307218baa406 100644 --- a/paddle/fluid/operators/miopen_lstm_cache.h +++ b/paddle/fluid/operators/miopen_lstm_cache.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/miopen_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -66,7 +66,7 @@ class ScopedRNNBase { // ------------------- miopen dropout descriptors --------------------- size_t state_size; if (!initialized_) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDropoutGetStatesSize(handle, &state_size)); dropout_state->mutable_data({static_cast(state_size)}, place); @@ -75,7 +75,7 @@ class ScopedRNNBase { dropout_state, seed_, state_size); // ------------------- miopen rnn descriptors --------------------- - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetRNNDescriptor_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetRNNDescriptor_V2( rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), miopenRNNlinear, is_bidirec_ ? miopenRNNbidirection : miopenRNNunidirection, miopenLSTM, @@ -83,7 +83,7 @@ class ScopedRNNBase { // ------------------- miopen weights_size --------------------- size_t weights_size_; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenGetRNNParamsSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenGetRNNParamsSize( handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, miopen_type)); PADDLE_ENFORCE_EQ( weights_size_, sizeof(T) * weight_numel_, @@ -95,10 +95,10 @@ class ScopedRNNBase { std::vector dim_w = {dim_tmp, 1, 1}; weight_desc_.descriptor(layout, dim_w); // ------------------- miopen workspace, reserve size --------------------- - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenGetRNNWorkspaceSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenGetRNNWorkspaceSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), workspace_size)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenGetRNNTrainingReserveSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size)); diff --git a/paddle/fluid/operators/miopen_rnn_cache.h b/paddle/fluid/operators/miopen_rnn_cache.h index 97d608331ccb5..38cea39abd5de 100644 --- a/paddle/fluid/operators/miopen_rnn_cache.h +++ b/paddle/fluid/operators/miopen_rnn_cache.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/miopen_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -95,16 +95,16 @@ struct CudnnRNNCache { std::vector strides_y = {hidden_size_ * numDirections, 1, 1}; for (size_t i = 0; i < seq_length_; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&x_desc_[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&y_desc_[i])); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( x_desc_[i], miopen_type, 3, const_cast(dims.data()), const_cast(strides.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( y_desc_[i], miopen_type, 3, const_cast(dims_y.data()), const_cast(strides_y.data()))); } @@ -113,85 +113,85 @@ struct CudnnRNNCache { hidden_size_}; std::vector strides_hx = {hidden_size_ * batch_size_, hidden_size_, 1}; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&hx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&cx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&hy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&cy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&dhx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&dcx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&dhy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&dcy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( hx_desc_, miopen_type, 3, const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( cx_desc_, miopen_type, 3, const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( hy_desc_, miopen_type, 3, const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( cy_desc_, miopen_type, 3, const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( dhx_desc_, miopen_type, 3, const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( dcx_desc_, miopen_type, 3, 
const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( dhy_desc_, miopen_type, 3, const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( dcy_desc_, miopen_type, 3, const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateDropoutDescriptor(&dropout_desc_)); size_t state_size; if (!initialized) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDropoutGetStatesSize(handle, &state_size)); dropout_state_->Resize({static_cast(state_size)}); uint8_t *dropout_state_data = dropout_state_->mutable_data(place); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetDropoutDescriptor( dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, seed_, false, false, MIOPEN_RNG_PSEUDO_XORWOW)); } else { uint8_t *dropout_state_data = dropout_state_->data(); auto dropout_state_dims = dropout_state_->dims(); state_size = dropout_state_dims[0]; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenRestoreDropoutDescriptor( dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, 0, false, false, MIOPEN_RNG_PSEUDO_XORWOW)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateRNNDescriptor(&rnn_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetRNNDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetRNNDescriptor( rnn_desc_, hidden_size_, num_layers_, miopenRNNlinear, is_bidirec_ ? 
miopenRNNbidirection : miopenRNNunidirection, miopenLSTM, miopenRNNNoBias, miopenRNNdefault, miopen_type)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&w_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&dw_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenGetRNNParamsSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenGetRNNParamsSize( handle, rnn_desc_, x_desc_[0], &weights_size_, miopen_type)); PADDLE_ENFORCE_EQ( @@ -208,14 +208,14 @@ struct CudnnRNNCache { dim_s[1] = 1; dim_s[0] = dim_w[1]; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( w_desc_, miopen_type, 3, dim_w, dim_s)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( dw_desc_, miopen_type, 3, dim_w, dim_s)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenGetRNNWorkspaceSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenGetRNNWorkspaceSize( handle, rnn_desc_, seq_length_, x_desc_, &workspace_size_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenGetRNNTrainingReserveSize( handle, rnn_desc_, seq_length_, x_desc_, reserve_size_)); @@ -225,40 +225,40 @@ struct CudnnRNNCache { void release() { for (size_t i = 0; i < seq_length_; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(x_desc_[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(y_desc_[i])); } delete[] x_desc_; delete[] y_desc_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(hx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(cx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(hy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(cy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(dhx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(dcx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(dhy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(dcy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyDropoutDescriptor(dropout_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyRNNDescriptor(rnn_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(w_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(dw_desc_)); } }; diff --git a/paddle/fluid/operators/mish_op.cu b/paddle/fluid/operators/mish_op.cu index 6513e5d95e4ac..4ca07b650c80a 100644 --- a/paddle/fluid/operators/mish_op.cu +++ b/paddle/fluid/operators/mish_op.cu @@ -11,8 +11,8 @@ limitations under the License. 
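The CudnnRNNCache changes above keep the existing structure: every descriptor is created in the constructor, configured with a Set call, and destroyed in release(), with each MIOpen call wrapped in the status check. The fragment below is a hypothetical, self-contained RAII variant (not what the operator does), included only to make that create/set/destroy contract explicit; FakeStatus and the three free functions stand in for the miopen* entry points.

#include <cstdio>
#include <cstdlib>

struct FakeStatus { int code; };                        // stand-in for miopenStatus_t
static FakeStatus CreateDesc(int** d) { *d = new int(0); return {0}; }
static FakeStatus SetDesc(int* d, int rank) { *d = rank; return {0}; }
static FakeStatus DestroyDesc(int* d) { delete d; return {0}; }
static void Check(FakeStatus s) {
  if (s.code != 0) std::abort();                        // ENFORCE_GPU_SUCCESS analogue
}

class ScopedTensorDesc {
 public:
  ScopedTensorDesc() { Check(CreateDesc(&desc_)); }
  void Set(int rank) { Check(SetDesc(desc_, rank)); }
  ~ScopedTensorDesc() { DestroyDesc(desc_); }           // mirrors release(), never throws
  const int* get() const { return desc_; }

 private:
  int* desc_ = nullptr;
};

int main() {
  ScopedTensorDesc x;
  x.Set(3);                                             // rank-3 descriptor, as in the cache
  std::printf("descriptor rank = %d\n", *x.get());
  return 0;
}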
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mish_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 35e35eb4bcb55..4a3d1f455bd26 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -26,14 +26,12 @@ using Tensor = framework::Tensor; using framework::DataLayout; inline dnnl::memory::dims GetWeightsTz(const Tensor* filter, const int groups) { - auto iohw_weights_tz = framework::vectorize(filter->dims()); - auto weights_tz = iohw_weights_tz; - - // IOHW -> OIHW - weights_tz[0] = iohw_weights_tz[1]; - weights_tz[1] = iohw_weights_tz[0]; + auto weights_tz = framework::vectorize(filter->dims()); int g = std::max(groups, 1); + int g_dim = (g > 1) ? 1 : 0; platform::GetGroupConvWeightsTz(weights_tz, g); + // gIOHW -> gOIHW || IOHW -> OIHW + std::swap(weights_tz[g_dim + 0], weights_tz[g_dim + 1]); return weights_tz; } diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index 0457aeed616fa..0266edac75d1e 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -25,9 +25,9 @@ using paddle::platform::MKLDNNDeviceContext; using paddle::platform::MKLDNNGetDataType; using paddle::platform::to_void_cast; using Tensor = paddle::framework::Tensor; -using paddle::framework::vectorize; -using paddle::framework::make_ddim; using paddle::framework::GradVarName; +using paddle::framework::make_ddim; +using paddle::framework::vectorize; template class MatMulV2MKLDNNHandler @@ -123,45 +123,58 @@ class MatMulV2MKLDNNHandler } }; -template -class MatMulV2MKLDNNKernel - : public paddle::operators::MatMulGradMKLDNNKernel { - public: - void Compute(const ExecutionContext& ctx) const override { RunKernel(ctx); } +bool IsOutputFused(const ExecutionContext& ctx) { + auto& fused_reshape_Out = ctx.Attr>("fused_reshape_Out"); + auto& fused_transpose_Out = ctx.Attr>("fused_transpose_Out"); + return !fused_reshape_Out.empty() && !fused_transpose_Out.empty(); +} + +float ComputeOutputScale(const ExecutionContext& ctx) { + float scale_x = ctx.Attr("Scale_x"); + float scale_y = ctx.Attr("Scale_y"); + bool force_fp32_out = ctx.Attr("force_fp32_output"); + float scale_out = force_fp32_out ? 
1.f : ctx.Attr("Scale_out"); + return scale_out / (scale_x * scale_y); +} - protected: - void ExecuteMatMul(const ExecutionContext& ctx, +template +void ExecuteMatMulV2(const ExecutionContext& ctx, const MKLDNNDeviceContext& dev_ctx, const dnnl::engine onednn_engine, paddle::platform::Place cpu_place, const Tensor* x, std::vector& x_dims, bool trans_x, const Tensor* y, std::vector& y_dims, bool trans_y, Tensor* out, std::vector& out_dims, - int execution_number = 0) const { - MatMulV2MKLDNNHandler handler(onednn_engine, ctx.GetPlace(), x_dims, - trans_x, y_dims, trans_y, - IsOutputFused(ctx)); + int execution_number = 0) { + MatMulV2MKLDNNHandler handler(onednn_engine, ctx.GetPlace(), x_dims, + trans_x, y_dims, trans_y, + IsOutputFused(ctx)); - const auto src_memory_p = handler.AcquireSrcMemory(x); - const auto weights_memory_p = handler.AcquireWeightsMemory(y); - const auto dst_memory_p = handler.AcquireDstMemory(out); + const auto src_memory_p = handler.AcquireSrcMemory(x); + const auto weights_memory_p = handler.AcquireWeightsMemory(y); + const auto dst_memory_p = handler.AcquireDstMemory(out); - auto matmul_p = handler.AcquireForwardPrimitive(); + auto matmul_p = handler.AcquireForwardPrimitive(); - std::unordered_map matmul_args = { - {DNNL_ARG_SRC, *src_memory_p}, - {DNNL_ARG_WEIGHTS, *weights_memory_p}, - {DNNL_ARG_DST, *dst_memory_p}}; + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; - auto& astream = MKLDNNDeviceContext::tls().get_stream(); - matmul_p->execute(astream, matmul_args); - astream.wait(); + auto& astream = MKLDNNDeviceContext::tls().get_stream(); + matmul_p->execute(astream, matmul_args); + astream.wait(); - auto format = paddle::platform::MKLDNNFormatForSize( - out->dims().size(), dnnl::memory::format_tag::nchw); - out->set_layout(paddle::framework::DataLayout::kMKLDNN); - out->set_format(format); - } + auto format = paddle::platform::MKLDNNFormatForSize( + out->dims().size(), dnnl::memory::format_tag::nchw); + out->set_layout(paddle::framework::DataLayout::kMKLDNN); + out->set_format(format); +} + +template +class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel { + public: + void Compute(const ExecutionContext& ctx) const override { RunKernel(ctx); } private: void CalculateMatrixDims(const ExecutionContext& ctx, @@ -207,13 +220,6 @@ class MatMulV2MKLDNNKernel } } - bool IsOutputFused(const ExecutionContext& ctx) const { - auto& fused_reshape_Out = ctx.Attr>("fused_reshape_Out"); - auto& fused_transpose_Out = - ctx.Attr>("fused_transpose_Out"); - return !fused_reshape_Out.empty() && !fused_transpose_Out.empty(); - } - void RunKernel(const ExecutionContext& ctx) const { const auto& dev_ctx = ctx.template device_context(); const auto& onednn_engine = dev_ctx.GetEngine(); @@ -237,13 +243,14 @@ class MatMulV2MKLDNNKernel CalculateMatrixDims(ctx, x_dims, y_dims, x_bd_dims, y_bd_dims, out_dims, out); - ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, x_bd_dims, - trans_x, y, y_bd_dims, trans_y, out, out_dims); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, + x_bd_dims, trans_x, y, y_bd_dims, trans_y, out, + out_dims); } }; template -class MatMulV2GradMKLDNNKernel : public MatMulV2MKLDNNKernel { +class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { public: void Compute(const ExecutionContext& ctx) const override { RunKernel(ctx); } @@ -316,7 +323,7 @@ class MatMulV2GradMKLDNNKernel : public MatMulV2MKLDNNKernel { // if no 
broadcasting is needed, we can simply use matmul's grad and avoid // using reduce_sum if (!is_broadcast) { - paddle::operators::MatMulGradMKLDNNKernel::Compute(ctx); + matmul_v1_grad_mkldnn_kernel.Compute(ctx); return; } @@ -342,33 +349,29 @@ class MatMulV2GradMKLDNNKernel : public MatMulV2MKLDNNKernel { dy_bd_dims); if (trans_x && trans_y) { - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), y, - y_dims, true, dout, dout_dims, true, &dx_tmp, - dx_bd_dims, 1); - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, - dout_dims, true, x, x_dims, true, &dy_tmp, dy_bd_dims, - 2); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), y, y_dims, + true, dout, dout_dims, true, &dx_tmp, dx_bd_dims, 1); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, + dout_dims, true, x, x_dims, true, &dy_tmp, dy_bd_dims, + 2); } else if (trans_x) { - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), y, - y_dims, false, dout, dout_dims, true, &dx_tmp, - dx_bd_dims, 1); - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, - x_dims, false, dout, dout_dims, false, &dy_tmp, - dy_bd_dims, 2); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), y, y_dims, + false, dout, dout_dims, true, &dx_tmp, dx_bd_dims, 1); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, x_dims, + false, dout, dout_dims, false, &dy_tmp, dy_bd_dims, 2); } else if (trans_y) { - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, - dout_dims, false, y, y_dims, false, &dx_tmp, - dx_bd_dims, 1); - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, - dout_dims, true, x, x_dims, false, &dy_tmp, - dy_bd_dims, 2); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, + dout_dims, false, y, y_dims, false, &dx_tmp, + dx_bd_dims, 1); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, + dout_dims, true, x, x_dims, false, &dy_tmp, dy_bd_dims, + 2); } else { - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, - dout_dims, false, y, y_dims, true, &dx_tmp, - dx_bd_dims, 1); - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, - x_dims, true, dout, dout_dims, false, &dy_tmp, - dy_bd_dims, 2); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, + dout_dims, false, y, y_dims, true, &dx_tmp, dx_bd_dims, + 1); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, x_dims, + true, dout, dout_dims, false, &dy_tmp, dy_bd_dims, 2); } if (x_dims != dx_bd_dims) { @@ -389,8 +392,12 @@ class MatMulV2GradMKLDNNKernel : public MatMulV2MKLDNNKernel { dy->set_layout(paddle::framework::DataLayout::kMKLDNN); dy->set_format(y->format()); } + + private: + paddle::operators::MatMulGradMKLDNNKernel matmul_v1_grad_mkldnn_kernel; }; } // anonymous namespace + namespace ops = paddle::operators; REGISTER_OP_KERNEL(matmul_v2, MKLDNN, ::paddle::platform::CPUPlace, diff --git a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc index d9bd843a9d0cf..e5f70fa10e375 100644 --- a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc @@ -227,6 +227,8 @@ class SliceGradMKLDNNKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(slice, MKLDNN, paddle::platform::CPUPlace, ops::SliceMKLDNNKernel, + ops::SliceMKLDNNKernel, + ops::SliceMKLDNNKernel, ops::SliceMKLDNNKernel); namespace ops = paddle::operators; diff 
--git a/paddle/fluid/operators/mv_op.cu b/paddle/fluid/operators/mv_op.cu index ee638ede22b64..cec17f1324313 100644 --- a/paddle/fluid/operators/mv_op.cu +++ b/paddle/fluid/operators/mv_op.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/mv_op.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.cc b/paddle/fluid/operators/nccl/nccl_gpu_common.cc index 169af47e95acd..bcbc96ea1b6d1 100644 --- a/paddle/fluid/operators/nccl/nccl_gpu_common.cc +++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc @@ -50,7 +50,7 @@ void Communicator::InitAll(const std::vector& gpus) { for (size_t i = 0; i < gpus.size(); ++i) { (*comm_id_map)[gpus[i]] = i; } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::ncclCommInitAll(global_comms->data(), gpus.size(), gpus.data())); inited = true; } diff --git a/paddle/fluid/operators/nccl/nccl_op.cu.cc b/paddle/fluid/operators/nccl/nccl_op.cu.cc index 9a4a036077f58..f319ce159f6dd 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cu.cc @@ -74,7 +74,7 @@ class NCCLAllReduceKernel : public framework::OpKernel { VLOG(3) << "gpu : " << " invoke allreduce. send " << x->numel() << " recv " << out->numel(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( x->data(), out->mutable_data(ctx.GetPlace()), out->numel(), NCCLTypeWrapper::type, reduction_op_, comm->comms().at(idx), ctx.cuda_device_context().stream())); @@ -111,7 +111,7 @@ class NCCLReduceKernel : public framework::OpKernel { } VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel() << " recv " << out->numel(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( x->data(), recvbuffer, x->numel(), NCCLTypeWrapper::type, reduction_op_, root, comm->comms().at(idx), ctx.cuda_device_context().stream())); @@ -136,7 +136,7 @@ class NCCLBcastKernel : public framework::OpKernel { if (idx == root) { auto* x = ctx.Input("X"); VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(const_cast(x->data())), x->numel(), NCCLTypeWrapper::type, root, comm->comms().at(idx), ctx.cuda_device_context().stream())); @@ -145,7 +145,7 @@ class NCCLBcastKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. recv buffer " << framework::product(out->dims()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( out->mutable_data(ctx.GetPlace()), out->numel(), NCCLTypeWrapper::type, root, comm->comms().at(idx), ctx.cuda_device_context().stream())); diff --git a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc index 6c7fba8d4ac78..41c1b4d7a8f81 100644 --- a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc @@ -23,9 +23,9 @@ limitations under the License. 
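The NCCL hunks follow the same rule: ncclCommInitAll, ncclAllReduce, ncclReduce and ncclBcast are now wrapped in PADDLE_ENFORCE_GPU_SUCCESS. Below is a minimal, single-rank usage sketch of the same call shape (it assumes NCCL and the CUDA runtime are available, d_send/d_recv are already device pointers, and real error handling is reduced to an exit-on-failure check).

#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <nccl.h>

#define CHECK_OK(cond)                              \
  do {                                              \
    if (!(cond)) {                                  \
      std::fprintf(stderr, "failed: %s\n", #cond);  \
      std::exit(1);                                 \
    }                                               \
  } while (0)

// One-rank all-reduce: enough to show the argument order used in nccl_op.cu.cc.
void AllReduceSingleRank(const float* d_send, float* d_recv, size_t count) {
  int dev = 0;
  ncclComm_t comm;
  CHECK_OK(cudaSetDevice(dev) == cudaSuccess);
  CHECK_OK(ncclCommInitAll(&comm, 1, &dev) == ncclSuccess);
  CHECK_OK(ncclAllReduce(d_send, d_recv, count, ncclFloat, ncclSum, comm,
                         /*stream=*/0) == ncclSuccess);
  CHECK_OK(cudaStreamSynchronize(0) == cudaSuccess);
  ncclCommDestroy(comm);
}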
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/place.h" @@ -44,7 +44,7 @@ const f::DDim kDims = {20, 20}; class NCCLTester : public ::testing::Test { public: void SetUp() override { - int count = p::GetCUDADeviceCount(); + int count = p::GetGPUDeviceCount(); if (count <= 0) { LOG(WARNING) << "Cannot test gpu nccl, because the CUDA device count is " << count; diff --git a/paddle/fluid/operators/nll_loss_op.cu b/paddle/fluid/operators/nll_loss_op.cu index b6e7cd256e18d..03af45634149d 100644 --- a/paddle/fluid/operators/nll_loss_op.cu +++ b/paddle/fluid/operators/nll_loss_op.cu @@ -13,7 +13,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math.h" #include "paddle/fluid/operators/nll_loss_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index 843736833f815..241c634e3fc98 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -26,11 +26,7 @@ namespace cub = hipcub; #endif #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/math/math_function.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef __HIPCC__ #define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim) diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu index bffd1d5305127..3da7a3afcc93d 100644 --- a/paddle/fluid/operators/one_hot_op.cu +++ b/paddle/fluid/operators/one_hot_op.cu @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/fluid/operators/one_hot_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/one_hot_v2_op.cu b/paddle/fluid/operators/one_hot_v2_op.cu index 2366f1422244e..22eb6c81845d1 100644 --- a/paddle/fluid/operators/one_hot_v2_op.cu +++ b/paddle/fluid/operators/one_hot_v2_op.cu @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/fluid/operators/one_hot_v2_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cu b/paddle/fluid/operators/optimizers/adagrad_op.cu index 5043468d4c5f7..8b939b7c6b3ba 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/adagrad_op.cu @@ -14,7 +14,7 @@ limitations under the License. 
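Alongside the macro rename, device-query helpers move as well: the NCCL test above switches from p::GetCUDADeviceCount() to p::GetGPUDeviceCount() out of platform/device/gpu/gpu_info.h. A rough sketch of what such a helper can look like (assumed shape only, not the actual gpu_info implementation), selecting the runtime at build time so callers never see the backend:

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

inline int GetGPUDeviceCountSketch() {
  int count = 0;
#ifdef PADDLE_WITH_HIP
  if (hipGetDeviceCount(&count) != hipSuccess) count = 0;
#else
  if (cudaGetDeviceCount(&count) != cudaSuccess) count = 0;
#endif
  return count;  // tests skip (as NCCLTester::SetUp does) when this is <= 0
}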
*/ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/optimizers/adagrad_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op.cc b/paddle/fluid/operators/optimizers/merged_momentum_op.cc index 6c63376b5eb42..1733150f27128 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/merged_momentum_op.cc @@ -50,7 +50,8 @@ class MergedMomentumOpMaker : public framework::OpProtoAndCheckerMaker { .AsDuplicable(); AddInput("LearningRate", "(Tensor, default Tensor) " - "Input learning rate"); + "Input learning rate") + .AsDuplicable(); AddInput("MasterParam", "FP32 master weight for AMP.") .AsDispensable() .AsDuplicable(); @@ -68,6 +69,18 @@ class MergedMomentumOpMaker : public framework::OpProtoAndCheckerMaker { .AsDispensable() .AsDuplicable(); AddAttr("mu", "(float) Momentum coefficient"); + AddAttr("use_nesterov", + "(bool, default false) " + "Use Nesterov Momentum or not.") + .SetDefault(false); + AddAttr>( + "regularization_method", + "(string) regularization_method, right now only " + "support l2decay or none") + .SetDefault({}); + AddAttr>("regularization_coeff", + "(float) regularization_coeff") + .SetDefault({}); AddAttr("multi_precision", "(bool, default false) " "Whether to use multi-precision during weight updating.") diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op.h b/paddle/fluid/operators/optimizers/merged_momentum_op.h index 4dfaa4de3ad44..7560b4fd8e5f9 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op.h +++ b/paddle/fluid/operators/optimizers/merged_momentum_op.h @@ -18,6 +18,7 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/optimizers/momentum_op.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/macros.h" @@ -85,33 +86,43 @@ class MergedMomentumOpKernel : public framework::OpKernel { auto params = ctx.MultiInput("Param"); auto params_out = ctx.MultiOutput("ParamOut"); size_t n = params.size(); - PADDLE_ENFORCE_EQ( - n, params_out.size(), - platform::errors::InvalidArgument( - "Output(ParamOut) number must be equal to Input(Param) number.")); + PADDLE_ENFORCE_EQ(n, params_out.size(), + platform::errors::InvalidArgument( + "The size of Output(ParamOut) must be equal to " + "Input(Param), but got the size of Output(ParamOut) " + "is %d, the size of Input(Param) is %d.", + params_out.size(), n)); for (size_t i = 0; i < n; ++i) { - PADDLE_ENFORCE_EQ( - params[i], params_out[i], - platform::errors::InvalidArgument( - "Input(Param) and Output(ParamOut) must be the same Tensors.")); + PADDLE_ENFORCE_EQ(params[i], params_out[i], + platform::errors::InvalidArgument( + "The size of Input(Param) and Output(ParamOut) " + "must be the same Tensors.")); } auto grads = ctx.MultiInput("Grad"); PADDLE_ENFORCE_EQ( n, grads.size(), platform::errors::InvalidArgument( - "Input(Grad) number must be equal to Input(Param) number.")); + "The size of Input(Grad) must be equal to Input(Param), but got " + "the size of Input(Grad) is %d, the size of Input(Param) is %d.", + grads.size(), n)); auto velocitys = ctx.MultiInput("Velocity"); PADDLE_ENFORCE_EQ(n, velocitys.size(), 
platform::errors::InvalidArgument( - "Input(Velocity) number and Input(Param) number.")); + "The size of Input(Velocity) must be equal to " + "Input(Param), but got the size of Input(Velocity) " + "is %d, the size of Input(Param) is %d.", + velocitys.size(), n)); auto velocitys_out = ctx.MultiOutput("VelocityOut"); PADDLE_ENFORCE_EQ( n, velocitys_out.size(), - platform::errors::InvalidArgument("Output(VelocityOut) number must be " - "equal to Input(Param) number.")); + platform::errors::InvalidArgument( + "The size of Output(VelocityOut) must be " + "equal to Input(Param), but got the size of Output(VelocityOut) is " + "%d, the size of Input(Param) is %d.", + velocitys_out.size(), n)); for (size_t i = 0; i < n; ++i) { PADDLE_ENFORCE_EQ(velocitys[i], velocitys_out[i], platform::errors::InvalidArgument( @@ -126,12 +137,18 @@ class MergedMomentumOpKernel : public framework::OpKernel { if (multi_precision) { PADDLE_ENFORCE_EQ( n, master_params.size(), - platform::errors::InvalidArgument("Input(MasterParam) number must be " - "equal to Input(Param) number.")); - PADDLE_ENFORCE_EQ(n, master_params_out.size(), - platform::errors::InvalidArgument( - "Output(MasterParamOut) number must be equal to " - "Input(MasterParam) number.")); + platform::errors::InvalidArgument( + "The size of Input(MasterParam) must be " + "equal to Input(Param), but got the size of Input(MasterParam) " + "is %d, the size of Input(Param) is %d.", + master_params.size(), n)); + PADDLE_ENFORCE_EQ( + n, master_params_out.size(), + platform::errors::InvalidArgument( + "The size of Output(MasterParamOut) must be equal to " + "Input(MasterParam), but got the size of Output(MasterParamOut) " + "is %d, the size of Input(Param) is %d.", + master_params_out.size(), n)); for (size_t i = 0; i < n; ++i) { PADDLE_ENFORCE_EQ(master_params[i], master_params_out[i], platform::errors::InvalidArgument( @@ -147,20 +164,61 @@ class MergedMomentumOpKernel : public framework::OpKernel { master_params_out.clear(); } - auto lr = ctx.Input("LearningRate"); auto mu = ctx.Attr("mu"); auto rescale_grad = ctx.Attr("rescale_grad"); + auto lrs = ctx.MultiInput("LearningRate"); + if (lrs.size() != 1) { + PADDLE_ENFORCE_EQ( + n, lrs.size(), + platform::errors::InvalidArgument( + "If the size of Input(LearningRate) is not 1, the size of " + "Input(LearningRate) must be " + "equal to Input(Param), but got the size of Input(LearningRate) " + "is %d, the size of Input(Param) is %d.", + lrs.size(), n)); + } + auto use_nesterov = ctx.Attr("use_nesterov"); + auto regularization_methods = + ctx.Attr>("regularization_method"); + auto regularization_coeffs = + ctx.Attr>("regularization_coeff"); + if (regularization_methods.size() != 0) { + PADDLE_ENFORCE_EQ( + n, regularization_methods.size(), + platform::errors::InvalidArgument( + "The size of Attr(regularization_method) must be equal " + "to Input(Param), but got the size of " + "Attr(regularization_method) is %d, the size of Input(Param) is " + "%d.", + regularization_methods.size(), n)); + PADDLE_ENFORCE_EQ( + n, regularization_coeffs.size(), + platform::errors::InvalidArgument( + "The size of Attr(regularization_coeff) must be equal " + "to Input(Param), but got the size of Attr(regularization_coeff) " + "is %d, the size of Input(Param) is %d.", + regularization_coeffs.size(), n)); + } + + VLOG(5) << "use_nesterov: " << use_nesterov + << ", regularization_methods.size(): " + << regularization_methods.size() + << ", regularization_coeffs.size(): " + << regularization_coeffs.size(); + using MPType = typename 
operators::details::MPTypeTrait::Type; auto &dev_ctx = ctx.template device_context(); + if (lrs.size() == 1 && use_nesterov == false && + regularization_methods.size() == 0) { #define PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(kMultiPrecision) \ MergedMomentumKernelParam kernel_params; \ constexpr auto kMaxMergedNum = decltype(kernel_params)::N; \ size_t kernel_num = (n + kMaxMergedNum - 1) / kMaxMergedNum; \ kernel_params.mu = static_cast(mu); \ kernel_params.rescale_grad = static_cast(rescale_grad); \ - kernel_params.lr = lr->data(); \ + kernel_params.lr = lrs[0]->data(); \ for (size_t i = 0; i < kernel_num; ++i) { \ size_t start = i * kMaxMergedNum; \ size_t end = std::min((i + 1) * kMaxMergedNum, n); \ @@ -182,14 +240,78 @@ class MergedMomentumOpKernel : public framework::OpKernel { VLOG(10) << "Launch MergedMomentum kernel " << i << " " \ << kernel_params.param_num; \ } - - if (multi_precision) { - PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(true); + if (multi_precision) { + PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(true); + } else { + PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(false); + } +#undef PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL } else { - PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(false); - } + for (size_t idx = 0; idx < n; idx++) { + RegularizationType regularization_flag = + regularization_methods.size() > 0 && + regularization_methods[idx] == "l2_decay" + ? RegularizationType::kL2DECAY + : RegularizationType::kNONE; -#undef PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL + MPType regularization_coeff = static_cast(0.0); + if (regularization_coeffs.size() != 0) { + regularization_coeff = + static_cast(regularization_coeffs[idx]); + } + auto lr_temp = lrs.size() > 1 ? lrs[idx] : lrs[0]; + + const MPType *master_in_data = + multi_precision ? master_params[idx]->data() : nullptr; + MPType *master_out_data = + multi_precision ? 
master_params_out[idx]->data() : nullptr; + if (platform::is_cpu_place(ctx.GetPlace())) { + CPUDenseMomentumFunctor functor; + functor(params[idx], grads[idx], velocitys[idx], lr_temp, mu, + use_nesterov, regularization_flag, regularization_coeff, + params_out[idx], velocitys_out[idx]); + VLOG(10) << "Launch MergedMomentum cpu kernel."; + } else if (platform::is_gpu_place(ctx.GetPlace())) { + platform::ForRange for_range( + static_cast(ctx.device_context()), + params[idx]->numel()); +#define PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL(__nesterov, __reg_type) \ + DenseMomentumFunctor functor( \ + params[idx]->data(), grads[idx]->data(), \ + velocitys[idx]->data(), lr_temp->data(), master_in_data, \ + mu, rescale_grad, params[idx]->numel(), regularization_coeff, \ + params_out[idx]->data(), velocitys_out[idx]->data(), \ + master_out_data); \ + for_range(functor); + if (use_nesterov) { + if (regularization_flag == RegularizationType::kL2DECAY) { + PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL( + UseNesterov, RegularizationType::kL2DECAY); + VLOG(10) + << "Launch MergedMomentum gpu kernel use_nesterov kL2DECAY."; + } else { + PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL(UseNesterov, + RegularizationType::kNONE); + VLOG(10) + << "Launch MergedMomentum gpu kernel use_nesterov kNONE."; + } + } else { + if (regularization_flag == RegularizationType::kL2DECAY) { + PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL( + NoNesterov, RegularizationType::kL2DECAY); + VLOG(10) + << "Launch MergedMomentum gpu kernel no_nesterov kL2DECAY."; + } else { + PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL(NoNesterov, + RegularizationType::kNONE); + VLOG(10) << "Launch MergedMomentum gpu kernel no_nesterov kNONE."; + } + } + } + } + VLOG(10) + << "Launch MergedMomentum kernel with multi_lr and regularization."; + } } }; diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index a5d9ad271f23a..3582e939f30ac 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/optimizers/sgd_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.h b/paddle/fluid/operators/optimizers/sparse_momentum_op.h index f1516320ec573..23e37ea27b54f 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.h +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.h @@ -445,12 +445,12 @@ class SparseMomentumOpKernel : public framework::OpKernel { for_range_index(range_functor); size_t temp_storage_bytes = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( (cub::DeviceRadixSort::SortPairs( nullptr, temp_storage_bytes, nullptr, nullptr, nullptr, nullptr, static_cast(num_index)))); auto d_temp_storage = memory::Alloc(ctx.GetPlace(), temp_storage_bytes); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( (cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, index->data(), sorted_index_ptr, sort_value_ptr, grad_index_ptr, diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu index a77d0a5650ef3..a854fa6091ab4 100644 --- a/paddle/fluid/operators/pad2d_op.cu +++ b/paddle/fluid/operators/pad2d_op.cu @@ -15,8 +15,8 @@ limitations under the License. 
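The merged_momentum changes above add per-parameter learning rates (LearningRate is now duplicable), use_nesterov, and optional per-parameter L2 regularization, and the kernel now dispatches: the fused multi-tensor launch is kept for the original case (one learning rate, no Nesterov, no regularization), while everything else falls back to a per-parameter momentum update. Below is a simplified CPU-only sketch of that dispatch, with plain vectors instead of framework::Tensor and with AMP master weights and rescale_grad omitted.

#include <cstddef>
#include <string>
#include <vector>

struct MergedMomentumInputs {
  std::vector<std::vector<float>*> params, grads, velocitys;
  std::vector<const float*> lrs;                     // size 1 or params.size()
  std::vector<std::string> regularization_methods;   // empty or one per param
  std::vector<float> regularization_coeffs;          // empty or one per param
  bool use_nesterov = false;
  float mu = 0.9f;
};

// Standard momentum update with optional L2 decay and Nesterov correction.
void MomentumUpdate(std::vector<float>& p, const std::vector<float>& g,
                    std::vector<float>& v, float lr, float mu, bool nesterov,
                    float l2) {
  for (std::size_t i = 0; i < p.size(); ++i) {
    float grad = g[i] + l2 * p[i];
    v[i] = mu * v[i] + grad;
    p[i] -= nesterov ? lr * (grad + mu * v[i]) : lr * v[i];
  }
}

void MergedMomentumSketch(MergedMomentumInputs& in) {
  const std::size_t n = in.params.size();
  const bool fused_case = in.lrs.size() == 1 && !in.use_nesterov &&
                          in.regularization_methods.empty();
  if (fused_case) {
    // The real kernel packs many parameters into one fused GPU launch here.
    for (std::size_t i = 0; i < n; ++i)
      MomentumUpdate(*in.params[i], *in.grads[i], *in.velocitys[i],
                     *in.lrs[0], in.mu, /*nesterov=*/false, /*l2=*/0.f);
    return;
  }
  // General path added in this diff: per-parameter learning rate and
  // regularization, handled one tensor at a time.
  for (std::size_t i = 0; i < n; ++i) {
    float lr = *(in.lrs.size() > 1 ? in.lrs[i] : in.lrs[0]);
    bool l2_decay = !in.regularization_methods.empty() &&
                    in.regularization_methods[i] == "l2_decay";
    float coeff =
        in.regularization_coeffs.empty() ? 0.f : in.regularization_coeffs[i];
    MomentumUpdate(*in.params[i], *in.grads[i], *in.velocitys[i], lr, in.mu,
                   in.use_nesterov, l2_decay ? coeff : 0.f);
  }
}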
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pad3d_op.cu b/paddle/fluid/operators/pad3d_op.cu index f243a78e5578b..1567251236550 100644 --- a/paddle/fluid/operators/pad3d_op.cu +++ b/paddle/fluid/operators/pad3d_op.cu @@ -15,8 +15,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc index 8fcd40a9a2df4..bbe3174012947 100644 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -16,14 +16,11 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/pool_op.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif #ifdef PADDLE_WITH_HIP #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/miopen_helper.h" #endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -204,17 +201,17 @@ class PoolCUDNNOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP char *pool_workspace; size_t pool_worksize = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenPoolingGetWorkSpaceSizeV2( cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenPoolingForward( + PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenPoolingForward( handle, cudnn_pool_desc, &alpha, cudnn_input_desc, tranformed_input_data, &beta, cudnn_output_desc, tranformed_output_data, false, pool_workspace, pool_worksize)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipFree(pool_workspace)); + PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnPoolingForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingForward( handle, cudnn_pool_desc, &alpha, cudnn_input_desc, tranformed_input_data, &beta, cudnn_output_desc, tranformed_output_data)); @@ -468,17 +465,17 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP char *pool_workspace; size_t pool_worksize = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenPoolingGetWorkSpaceSizeV2( cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenPoolingBackward( + PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenPoolingBackward( 
handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, &beta, cudnn_input_desc, input_grad_data, pool_workspace)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipFree(pool_workspace)); + PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnPoolingBackward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingBackward( handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, &beta, cudnn_input_desc, input_grad_data)); diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 9d8f086ce0f18..fa98e76e39338 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -15,12 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/pool_op.h" #include -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu index ce3f5969cef49..06cc9ed7a96e5 100644 --- a/paddle/fluid/operators/prelu_op.cu +++ b/paddle/fluid/operators/prelu_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/prelu.h" #include "paddle/fluid/operators/prelu_op.h" #include "paddle/fluid/operators/reduce_ops/cub_reduce.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h index f9e2b78d5d31a..38f8d6542ac32 100644 --- a/paddle/fluid/operators/prroi_pool_op.h +++ b/paddle/fluid/operators/prroi_pool_op.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #endif namespace paddle { diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu index f69edfc1fcfec..5a0d1a700417c 100644 --- a/paddle/fluid/operators/psroi_pool_op.cu +++ b/paddle/fluid/operators/psroi_pool_op.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/psroi_pool_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.cu b/paddle/fluid/operators/pull_box_extended_sparse_op.cu index 5bde6bc2e5cbb..26a02ea622479 100644 --- a/paddle/fluid/operators/pull_box_extended_sparse_op.cu +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.cu @@ -13,8 +13,8 @@ // limitations under the License. 
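The pooling hunks above keep MIOpen's explicit workspace protocol: query the size with miopenPoolingGetWorkSpaceSizeV2, hipMalloc that many bytes, hand the buffer to miopenPoolingForward/Backward, then hipFree it. A generic stand-alone sketch of that query-allocate-run-free sequence, with mock functions in place of the MIOpen and HIP calls:

#include <cstdio>
#include <cstdlib>

// Stand-ins for miopenPoolingGetWorkSpaceSizeV2 and miopenPoolingForward.
static size_t PoolingWorkspaceSize(int n, int c, int h, int w) {
  return static_cast<size_t>(n) * c * h * w * sizeof(float);
}
static void PoolingForward(const float* x, float* y, void* ws, size_t ws_bytes) {
  (void)x; (void)y; (void)ws; (void)ws_bytes;  // the real call consumes the scratch buffer
}

void RunPooling(const float* x, float* y, int n, int c, int h, int w) {
  size_t ws_bytes = PoolingWorkspaceSize(n, c, h, w);  // 1. query
  void* ws = std::malloc(ws_bytes);                    // 2. allocate (hipMalloc in the op)
  PoolingForward(x, y, ws, ws_bytes);                  // 3. run with scratch space
  std::free(ws);                                       // 4. release (hipFree)
  std::printf("used %zu workspace bytes\n", ws_bytes);
}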
#include "paddle/fluid/operators/pull_box_extended_sparse_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pull_box_sparse_op.cu b/paddle/fluid/operators/pull_box_sparse_op.cu index 8bba9db5426b7..96a1b1c08b79c 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.cu +++ b/paddle/fluid/operators/pull_box_sparse_op.cu @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/fluid/operators/pull_box_sparse_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu index 992df172ace0c..3eb5f72b5b117 100644 --- a/paddle/fluid/operators/qr_op.cu +++ b/paddle/fluid/operators/qr_op.cu @@ -167,7 +167,7 @@ void QrGPUKernel::BatchedGeqrf( int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgeqrf_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgeqrf_bufferSize( handle, m, n, a, lda, &lwork)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); float* workspace_ptr = reinterpret_cast(workspace->ptr()); @@ -178,7 +178,7 @@ void QrGPUKernel::BatchedGeqrf( float* a_working_ptr = &a[i * a_stride]; float* tau_working_ptr = &tau[i * tau_stride]; // compute geqrf - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgeqrf( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgeqrf( handle, m, n, a_working_ptr, lda, tau_working_ptr, workspace_ptr, lwork, info_d)); // Do we need synchronized here? @@ -201,7 +201,7 @@ void QrGPUKernel::BatchedGeqrf( int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgeqrf_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgeqrf_bufferSize( handle, m, n, a, lda, &lwork)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); double* workspace_ptr = reinterpret_cast(workspace->ptr()); @@ -212,7 +212,7 @@ void QrGPUKernel::BatchedGeqrf( double* a_working_ptr = &a[i * a_stride]; double* tau_working_ptr = &tau[i * tau_stride]; // compute geqrf - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgeqrf( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgeqrf( handle, m, n, a_working_ptr, lda, tau_working_ptr, workspace_ptr, lwork, info_d)); // Do we need synchronized here? 
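The qr_op hunks only change the enforcement macro, but the surrounding cuSOLVER pattern is worth isolating: one cusolverDnSgeqrf_bufferSize query sizes a workspace that a loop of cusolverDnSgeqrf calls then reuses across the batch. A compile-only sketch of the same flow (assumes the CUDA toolkit with cuSOLVER; per-call status checks reduced to early returns):

#include <cuda_runtime.h>
#include <cusolverDn.h>

bool BatchedGeqrfSketch(cusolverDnHandle_t handle, int batch, int m, int n,
                        float* a, int lda, int a_stride, float* tau,
                        int tau_stride, int* dev_info) {
  int lwork = 0;
  if (cusolverDnSgeqrf_bufferSize(handle, m, n, a, lda, &lwork) !=
      CUSOLVER_STATUS_SUCCESS)
    return false;

  float* workspace = nullptr;
  if (cudaMalloc(reinterpret_cast<void**>(&workspace),
                 sizeof(float) * lwork) != cudaSuccess)
    return false;

  bool ok = true;
  for (int i = 0; i < batch && ok; ++i) {
    // Each matrix in the batch reuses the same workspace buffer.
    ok = cusolverDnSgeqrf(handle, m, n, a + i * a_stride, lda,
                          tau + i * tau_stride, workspace, lwork,
                          dev_info) == CUSOLVER_STATUS_SUCCESS;
  }
  cudaFree(workspace);
  return ok;
}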
@@ -235,7 +235,7 @@ void QrGPUKernel::BatchedOrgqr( int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSorgqr_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSorgqr_bufferSize( handle, m, n, k, a, lda, tau, &lwork)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); float* workspace_ptr = reinterpret_cast(workspace->ptr()); @@ -246,7 +246,7 @@ void QrGPUKernel::BatchedOrgqr( float* a_working_ptr = &a[i * a_stride]; float* tau_working_ptr = &tau[i * tau_stride]; // compute orggr - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSorgqr( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSorgqr( handle, m, n, k, a_working_ptr, lda, tau_working_ptr, workspace_ptr, lwork, info_d)); // Do we need synchronized here? @@ -270,7 +270,7 @@ void QrGPUKernel::BatchedOrgqr( int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDorgqr_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDorgqr_bufferSize( handle, m, n, k, a, lda, tau, &lwork)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); double* workspace_ptr = reinterpret_cast(workspace->ptr()); @@ -281,7 +281,7 @@ void QrGPUKernel::BatchedOrgqr( double* a_working_ptr = &a[i * a_stride]; double* tau_working_ptr = &tau[i * tau_stride]; // compute orggr - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDorgqr( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDorgqr( handle, m, n, k, a_working_ptr, lda, tau_working_ptr, workspace_ptr, lwork, info_d)); // Do we need synchronized here? diff --git a/paddle/fluid/operators/range_op.cu b/paddle/fluid/operators/range_op.cu index 6250d68730e13..23a0f2d0a24e3 100644 --- a/paddle/fluid/operators/range_op.cu +++ b/paddle/fluid/operators/range_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/range_op.h" #include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/rank_attention_op.cu b/paddle/fluid/operators/rank_attention_op.cu index aaa4eec7c1bf3..23b4475e1f7c1 100644 --- a/paddle/fluid/operators/rank_attention_op.cu +++ b/paddle/fluid/operators/rank_attention_op.cu @@ -17,8 +17,8 @@ limitations under the License. 
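BatchedOrgqr is the second half of the QR computation: after geqrf has left the Householder reflectors in A and tau, cusolverDnSorgqr expands them into an explicit Q. A companion compile-only sketch, with the same conventions as the geqrf sketch above:

#include <cuda_runtime.h>
#include <cusolverDn.h>

bool OrgqrSketch(cusolverDnHandle_t handle, int m, int n, int k, float* a,
                 int lda, const float* tau, int* dev_info) {
  int lwork = 0;
  if (cusolverDnSorgqr_bufferSize(handle, m, n, k, a, lda, tau, &lwork) !=
      CUSOLVER_STATUS_SUCCESS)
    return false;
  float* workspace = nullptr;
  if (cudaMalloc(reinterpret_cast<void**>(&workspace),
                 sizeof(float) * lwork) != cudaSuccess)
    return false;
  bool ok = cusolverDnSorgqr(handle, m, n, k, a, lda, tau, workspace, lwork,
                             dev_info) == CUSOLVER_STATUS_SUCCESS;
  cudaFree(workspace);
  return ok;
}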
*/ #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/rank_attention.cu.h" #include "paddle/fluid/operators/rank_attention_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 58af6309e3d28..6c28daa7eac72 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -161,14 +161,14 @@ void BufferedReader::ReadAsync(size_t i) { platform::SetDeviceId( BOOST_GET_CONST(platform::CUDAPlace, place_).device); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventRecord(events_[i].get(), compute_stream_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream_.get(), events_[i].get(), 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(events_[i].get(), compute_stream_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0)); #endif @@ -199,19 +199,12 @@ void BufferedReader::ReadAsync(size_t i) { memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), gpu_ptr, cuda_pinned_place, cuda_pinned_ptr, size, stream_.get()); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_.get())); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get())); -#endif + + platform::GpuStreamSync(stream_.get()); } cuda[i].set_lod(cpu[i].lod()); } -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_.get())); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get())); -#endif + platform::GpuStreamSync(stream_.get()); } } #endif diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index c433cac56a431..3d42486c6df88 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -22,8 +22,8 @@ #include "ThreadPool.h" #include "paddle/fluid/framework/reader.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/cuda_resource_pool.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #endif #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_info.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h index 90adea60927c0..dc79666b72fa6 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -64,6 +64,17 @@ struct CustomSum { } }; +template +struct CustomSub { + using Transformer = kps::InverseFunctor; + + inline Ty initial() { return static_cast(0.0f); } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { + return b + a; + } +}; + template struct CustomMean { using Transformer = kps::DivideFunctor; diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 6b3b484320018..9c348477963b4 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ 
b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -36,7 +36,8 @@ namespace cub = hipcub; #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/cast_op.h" #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/fast_divmod.h" // Reduce split or not, Whether to use ReduceHigherDim @@ -464,9 +465,9 @@ struct ReduceConfig { reduce_num_per_thread = details::AlignUp(reduce_num, block_dim->y); } int device_id = platform::GetCurrentDeviceId(); - int max_mp = platform::GetCUDAMultiProcessors(device_id); + int max_mp = platform::GetGPUMultiProcessors(device_id); int max_threads_per_mp = - platform::GetCUDAMaxThreadsPerMultiProcessor(device_id); + platform::GetGPUMaxThreadsPerMultiProcessor(device_id); int max_threads = max_threads_per_mp * max_mp; int num_threads = block_dim->x * block_dim->y; int max_num_blocks = max_threads / num_threads; @@ -506,9 +507,9 @@ struct ReduceConfig { left_num = last_dim_num; grid_dim->z = grid_z; int device_id = platform::GetCurrentDeviceId(); - int max_mp = platform::GetCUDAMultiProcessors(device_id); + int max_mp = platform::GetGPUMultiProcessors(device_id); int max_threads_per_mp = - platform::GetCUDAMaxThreadsPerMultiProcessor(device_id); + platform::GetGPUMaxThreadsPerMultiProcessor(device_id); int max_threads = max_threads_per_mp * max_mp; // init int num_block = (max_threads / left_num); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 5148e3b0940c9..c12db1293856b 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -383,13 +383,13 @@ class ReshapeKernel { // 3. out tensor is view of input // We can't MakePtenDenseTensor for case 2, so we solve this case by // creating a temporary tensor here: - const auto alloc = std::make_shared( - ctx.GetPlace()); pten::DenseTensorMeta meta{pten::TransToPtenDataType(in->type()), in->dims(), pten::TransToPtenDataLayout(in->layout())}; - auto pt_out_tmp = - std::make_shared(alloc, std::move(meta)); + auto pt_out_tmp = std::make_shared( + pten::make_intrusive( + ctx.GetPlace()), + std::move(meta)); pten::DenseTensor *pt_out = nullptr; if (in == out) { pt_out = pt_x.get(); @@ -484,7 +484,8 @@ class ReshapeKernel { // non-inplace need move all result from pt_out to out, inplace need set // result dims. if (in != out) { - paddle::experimental::MovesStorage(pt_out, static_cast(out)); + paddle::experimental::MovesSharedStorage(pt_out, + static_cast(out)); } else { out->Resize(pt_out->dims()); } diff --git a/paddle/fluid/operators/rnn_op.cu.cc b/paddle/fluid/operators/rnn_op.cu.cc index 07329a9175e52..de4847ddc4590 100644 --- a/paddle/fluid/operators/rnn_op.cu.cc +++ b/paddle/fluid/operators/rnn_op.cu.cc @@ -16,12 +16,7 @@ limitations under the License. 
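In reduce_op.cu.h the renamed helpers GetGPUMultiProcessors and GetGPUMaxThreadsPerMultiProcessor still feed the same occupancy bound: the grid is capped by the number of threads the device can keep resident divided by the block size. A CUDA sketch of that calculation using the raw device attributes (assumed here to return the same quantities as the platform helpers):

#include <cuda_runtime.h>

inline int MaxActiveBlocks(int device_id, int threads_per_block) {
  int sm_count = 0, max_threads_per_sm = 0;
  cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_id);
  cudaDeviceGetAttribute(&max_threads_per_sm,
                         cudaDevAttrMaxThreadsPerMultiProcessor, device_id);
  int max_threads = sm_count * max_threads_per_sm;
  return max_threads / threads_per_block;  // cap used when picking grid.x / grid.y
}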
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/utils.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -97,12 +92,12 @@ class RNNDescriptors { bool is_initialized = dropout_state->IsInitialized(); if (!is_test_ && !is_initialized) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDropoutGetStatesSize(handle, &state_size)); dropout_state->mutable_data({static_cast(state_size)}, place); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size)); dropout_state->mutable_data({static_cast(state_size)}, place); @@ -114,19 +109,19 @@ class RNNDescriptors { // ------------------- cudnn rnn descriptors --------------------- #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetRNNDescriptor_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetRNNDescriptor_V2( rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), miopenRNNlinear, is_bidirec_ ? miopenRNNbidirection : miopenRNNunidirection, mode_, miopenRNNwithBias, miopenRNNdefault, cudnn_type)); #elif CUDNN_VERSION >= 6000 - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), CUDNN_LINEAR_INPUT, is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, mode_, CUDNN_RNN_ALGO_STANDARD, cudnn_type)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetRNNDescriptor( rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), CUDNN_LINEAR_INPUT, is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, mode_, @@ -135,7 +130,7 @@ class RNNDescriptors { #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201 if (!sequence_length.empty()) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode( rnn_desc_.desc(), CUDNN_RNN_PADDED_IO_ENABLED)); } #endif @@ -143,10 +138,10 @@ class RNNDescriptors { // ------------------- cudnn weights_size --------------------- size_t weights_size_; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenGetRNNParamsSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenGetRNNParamsSize( handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnGetRNNParamsSize( handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); #endif PADDLE_ENFORCE_EQ( @@ -160,18 +155,18 @@ class RNNDescriptors { weight_desc_.descriptor(layout, dim_w); // ------------------- cudnn workspace, reserve size --------------------- #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenGetRNNWorkspaceSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenGetRNNWorkspaceSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), workspace_size)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenGetRNNTrainingReserveSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), workspace_size)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetRNNTrainingReserveSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size)); @@ -557,7 +552,7 @@ class RNNCudnnKernel : public framework::OpKernel { // for train // This interface is used when the input/output is unpadded. #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNForwardTraining( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNForwardTraining( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), x_data, rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, rnn.weight_desc(), w_data, rnn.y_descs(), out_data, @@ -565,7 +560,7 @@ class RNNCudnnKernel : public framework::OpKernel { workspace_data_.data(), workspace_size, reserve_data, reserve_size)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardTraining( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), x_data, rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, rnn.weight_desc(), w_data, rnn.y_descs(), out_data, @@ -577,15 +572,13 @@ class RNNCudnnKernel : public framework::OpKernel { #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201 // for train // This interface is used when the input/output is padded. 
- PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnRNNForwardTrainingEx( - handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, - rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, - rnn.weight_desc(), w_data, rnn.y_seq_desc(), out_data, - rnn.last_h_desc(), last_h_data, rnn.last_c_desc(), last_c_data, - nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, - nullptr, workspace_data_.data(), workspace_size, - reserve_data, reserve_size)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardTrainingEx( + handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, rnn.init_h_desc(), + init_h_data, rnn.init_c_desc(), init_c_data, rnn.weight_desc(), + w_data, rnn.y_seq_desc(), out_data, rnn.last_h_desc(), last_h_data, + rnn.last_c_desc(), last_c_data, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, workspace_data_.data(), + workspace_size, reserve_data, reserve_size)); #else PADDLE_THROW(platform::errors::Unavailable( "The padded input is supported by " @@ -606,14 +599,14 @@ class RNNCudnnKernel : public framework::OpKernel { // for inference // This interface is used when the input/output is unpadded. #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNForwardInference( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNForwardInference( handle, rnn->rnn_desc(), seq_length, rnn->x_descs(), x_data, rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data, rnn->weight_desc(), w_data, rnn->y_descs(), out_data, rnn->last_h_desc(), last_h_data, rnn->last_c_desc(), last_c_data, workspace_data->data(), workspace_size)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardInference( handle, rnn->rnn_desc(), seq_length, rnn->x_descs(), x_data, rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data, rnn->weight_desc(), w_data, rnn->y_descs(), out_data, @@ -624,7 +617,7 @@ class RNNCudnnKernel : public framework::OpKernel { #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201 // for inference // This interface is used when the input/output is padded. - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInferenceEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardInferenceEx( handle, rnn->rnn_desc(), rnn->x_seq_desc(), x_data, rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data, rnn->weight_desc(), w_data, rnn->y_seq_desc(), out_data, @@ -831,7 +824,7 @@ class RNNGradCudnnKernel : public framework::OpKernel { if (!has_seq_length) { if (in_grad) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNBackwardData( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNBackwardData( handle, rnn.rnn_desc(), seq_length, rnn.y_descs(), out_data, rnn.y_descs(), out_grad_data, rnn.last_h_desc(), last_h_grad_data, rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data, @@ -842,7 +835,7 @@ class RNNGradCudnnKernel : public framework::OpKernel { const_cast(reserve_data), reserve_size)); #else // This interface is used when the input/output is unpadded. 
- PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardData( handle, rnn.rnn_desc(), seq_length, rnn.y_descs(), out_data, rnn.y_descs(), out_grad_data, rnn.last_h_desc(), last_h_grad_data, rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data, @@ -855,7 +848,7 @@ class RNNGradCudnnKernel : public framework::OpKernel { } if (!weight_grad_list.empty()) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNBackwardWeights( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNBackwardWeights( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data(), rnn.init_h_desc(), init_h_data, rnn.y_descs(), out->data(), rnn.weight_desc(), weight_grad_data, @@ -865,7 +858,7 @@ class RNNGradCudnnKernel : public framework::OpKernel { tensor_to_permuted_weight(place, stream, weight_grad, &weight_grad_list, rnn_mode, is_bidirec); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardWeights( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data(), rnn.init_h_desc(), init_h_data, rnn.y_descs(), out->data(), workspace_data_.data(), workspace_size, rnn.weight_desc(), @@ -878,7 +871,7 @@ class RNNGradCudnnKernel : public framework::OpKernel { // for train // This interface is used when the input/output is padded. if (in_grad) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardDataEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardDataEx( handle, rnn.rnn_desc(), rnn.y_seq_desc(), out_data, rnn.y_seq_desc(), out_grad_data, nullptr, nullptr, rnn.last_h_desc(), last_h_grad_data, rnn.last_c_desc(), @@ -891,13 +884,12 @@ class RNNGradCudnnKernel : public framework::OpKernel { } if (!weight_grad_list.empty()) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnRNNBackwardWeightsEx( - handle, rnn.rnn_desc(), rnn.x_seq_desc(), input->data(), - rnn.init_h_desc(), init_h_data, rnn.y_seq_desc(), - out->data(), workspace_data_.data(), workspace_size, - rnn.weight_desc(), weight_grad_data, - const_cast(reserve_data), reserve_size)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardWeightsEx( + handle, rnn.rnn_desc(), rnn.x_seq_desc(), input->data(), + rnn.init_h_desc(), init_h_data, rnn.y_seq_desc(), out->data(), + workspace_data_.data(), workspace_size, rnn.weight_desc(), + weight_grad_data, const_cast(reserve_data), + reserve_size)); } #else PADDLE_THROW(platform::errors::Unavailable( diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu index 111828005222b..a08339d776ff1 100644 --- a/paddle/fluid/operators/roi_align_op.cu +++ b/paddle/fluid/operators/roi_align_op.cu @@ -15,8 +15,8 @@ limitations under the License. */ #include #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/roi_align_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu index 562ff8d576b7d..0a4a076c6caae 100644 --- a/paddle/fluid/operators/roi_pool_op.cu +++ b/paddle/fluid/operators/roi_pool_op.cu @@ -14,7 +14,7 @@ limitations under the License. 
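Editor's note on the hunks above: nearly every change is the mechanical rename of PADDLE_ENFORCE_CUDA_SUCCESS to PADDLE_ENFORCE_GPU_SUCCESS, so one enforcement macro covers both the cuDNN and MIOpen code paths. The sketch below only illustrates the dispatch idea behind such a backend-neutral check for runtime errors; it is not Paddle's actual macro, which also understands cuDNN/NCCL/cuFFT status codes and produces far richer error messages.

// Illustrative sketch only: a GPU-success check that compiles against either
// CUDA or ROCm, assuming the usual PADDLE_WITH_HIP build-flag convention.
#include <cstdio>
#include <cstdlib>
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuError_t = hipError_t;
static constexpr gpuError_t kGpuSuccess = hipSuccess;
inline const char* GpuErrorString(gpuError_t e) { return hipGetErrorString(e); }
#else
#include <cuda_runtime.h>
using gpuError_t = cudaError_t;
static constexpr gpuError_t kGpuSuccess = cudaSuccess;
inline const char* GpuErrorString(gpuError_t e) { return cudaGetErrorString(e); }
#endif

#define ENFORCE_GPU_SUCCESS_SKETCH(expr)                              \
  do {                                                                \
    gpuError_t err__ = (expr);                                        \
    if (err__ != kGpuSuccess) {                                       \
      std::fprintf(stderr, "GPU call `%s` failed: %s\n", #expr,       \
                   GpuErrorString(err__));                            \
      std::abort();                                                   \
    }                                                                 \
  } while (0)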
*/ #include #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/roi_pool_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/roll_op.cu b/paddle/fluid/operators/roll_op.cu index d70bd58887f84..57986d262820d 100644 --- a/paddle/fluid/operators/roll_op.cu +++ b/paddle/fluid/operators/roll_op.cu @@ -17,7 +17,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/roll_op.h" #include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index a712878854298..586cf3239b575 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -13,7 +13,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/row_conv_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index c24f924313fb9..4e9c84ef4c950 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -75,14 +75,14 @@ class ScaleOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { if (ctx.InputVar("X")->IsType() || ctx.InputVar("X")->IsType()) { + std::string scale_attr; if (ctx.HasInput("ScaleTensor")) { - return framework::KernelSignature("scale.host", {"X", "ScaleTensor"}, - {"bias", "bias_after_scale"}, - {"Out"}); + scale_attr = "ScaleTensor"; } else { - return framework::KernelSignature( - "scale", {"X"}, {"scale", "bias", "bias_after_scale"}, {"Out"}); + scale_attr = "scale"; } + return framework::KernelSignature( + "scale", {"X"}, {scale_attr, "bias", "bias_after_scale"}, {"Out"}); } // TODO(chenweihang): support other cases after selected rows added return framework::KernelSignature("scale.unregistered", {}, {}, {}); diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h index e3791351cefb3..6c7a0a8886ef0 100644 --- a/paddle/fluid/operators/scatter.cu.h +++ b/paddle/fluid/operators/scatter.cu.h @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "math/math_function.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/place.h" namespace paddle { diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc index de368e6e80219..8d92ea4166513 100644 --- a/paddle/fluid/operators/scatter_op_npu.cc +++ b/paddle/fluid/operators/scatter_op_npu.cc @@ -48,18 +48,49 @@ class ScatterNPUKernel : public framework::OpKernel { index = &tmp_tensor; } - auto stream = - ctx.template device_context() - .stream(); + const auto& dev_ctx = + ctx.template device_context(); + auto op_func_update = [](const std::vector& inputs, + const std::vector& outputs, + const NPUAttributeMap& attrs, + const platform::NPUDeviceContext& dev_ctx) { + const auto& runner = + NpuOpRunner("TensorScatterUpdate", inputs, outputs, attrs); + runner.Run(dev_ctx.stream()); + }; + auto op_func_add = [](const std::vector& inputs, + const std::vector& outputs, + const NPUAttributeMap& attrs, + const platform::NPUDeviceContext& dev_ctx) { + const auto& runner = + NpuOpRunner("TensorScatterAdd", inputs, outputs, attrs); + runner.Run(dev_ctx.stream()); + }; if (overwrite) { - const auto& runner_update = NpuOpRunner( - "TensorScatterUpdate", {*x, *index, *updates}, {*out}, {}); - runner_update.Run(stream); + if (x->type() == framework::proto::VarType::INT64) { + NpuOpRunner::TypeAdapter( + {*x, *index, *updates}, {*out}, {}, dev_ctx, op_func_update, + {framework::proto::VarType::INT32, framework::proto::VarType::INT32, + framework::proto::VarType::INT32}, + {framework::proto::VarType::INT32}); + } else { + const auto& runner_update = NpuOpRunner( + "TensorScatterUpdate", {*x, *index, *updates}, {*out}, {}); + runner_update.Run(dev_ctx.stream()); + } } else { - const auto& runner_add = - NpuOpRunner("TensorScatterAdd", {*x, *index, *updates}, {*out}, {}); - runner_add.Run(stream); + if (x->type() == framework::proto::VarType::INT64) { + NpuOpRunner::TypeAdapter( + {*x, *index, *updates}, {*out}, {}, dev_ctx, op_func_add, + {framework::proto::VarType::INT32, framework::proto::VarType::INT32, + framework::proto::VarType::INT32}, + {framework::proto::VarType::INT32}); + } else { + const auto& runner_add = + NpuOpRunner("TensorScatterAdd", {*x, *index, *updates}, {*out}, {}); + runner_add.Run(dev_ctx.stream()); + } } } }; @@ -70,6 +101,10 @@ namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL( scatter, ops::ScatterNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::ScatterNPUKernel, +#endif + ops::ScatterNPUKernel, ops::ScatterNPUKernel); #endif diff --git a/paddle/fluid/operators/segment_pool_op.cu b/paddle/fluid/operators/segment_pool_op.cu index 379a07a26dd5c..4e20844dc3275 100644 --- a/paddle/fluid/operators/segment_pool_op.cu +++ b/paddle/fluid/operators/segment_pool_op.cu @@ -14,8 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/segment_pool_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( diff --git a/paddle/fluid/operators/segment_pool_op.h b/paddle/fluid/operators/segment_pool_op.h index 5f9635c8ae111..307bf4010f7ff 100644 --- a/paddle/fluid/operators/segment_pool_op.h +++ b/paddle/fluid/operators/segment_pool_op.h @@ -72,11 +72,11 @@ void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) { const IndexT* segment_ids = segment->data(); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), hipMemcpyDeviceToHost)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), cudaMemcpyDeviceToHost)); #endif diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu index 6d8f60ce932ab..8092a40d19b19 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu @@ -15,7 +15,7 @@ #include #include #include "paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu index bacaaeadbf576..bb928cf401c33 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu index c8b6156881c96..1c4265a71d4ea 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu index d4f4051c3a460..f63fa5be7f496 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/shard_index_op.cu b/paddle/fluid/operators/shard_index_op.cu index f2800c60c3304..115b3f47d664b 100644 --- a/paddle/fluid/operators/shard_index_op.cu +++ b/paddle/fluid/operators/shard_index_op.cu @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/fluid/operators/shard_index_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/share_buffer_op.cc b/paddle/fluid/operators/share_buffer_op.cc index a161b9272b7b2..f6a6c9695b2ad 100644 --- a/paddle/fluid/operators/share_buffer_op.cc +++ b/paddle/fluid/operators/share_buffer_op.cc @@ -49,7 +49,8 @@ class ShareBufferOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor), The output tensors which are the same as X. It is " "used to build the graph dependency") .AsDuplicable(); - AddAttr>("share_dims", "Whether to share dims") + AddAttr>("share_dims_and_dtype", + "Whether to share dims and data type") .SetDefault(std::vector()); AddComment( R"DOC(Operator used to perform inplace memory reuse. It should be not exposed to Python APIs.)DOC"); diff --git a/paddle/fluid/operators/share_buffer_op.h b/paddle/fluid/operators/share_buffer_op.h index 5138ad9d54b79..1d0abf14f577e 100644 --- a/paddle/fluid/operators/share_buffer_op.h +++ b/paddle/fluid/operators/share_buffer_op.h @@ -29,12 +29,13 @@ class ShareBufferOpKernel : public framework::OpKernel { size_t n = inputs.size(); PADDLE_ENFORCE_EQ(n, outputs.size(), platform::errors::PermissionDenied( "Variable number not match.")); - const auto &share_dims = ctx.Attr>("share_dims"); - if (!share_dims.empty()) { - PADDLE_ENFORCE_EQ( - n, share_dims.size(), - platform::errors::PermissionDenied( - "Attribute share_dims number not match input variable number.")); + const auto &share_dims_and_dtype = + ctx.Attr>("share_dims_and_dtype"); + if (!share_dims_and_dtype.empty()) { + PADDLE_ENFORCE_EQ(n, share_dims_and_dtype.size(), + platform::errors::PermissionDenied( + "Attribute share_dims_and_dtype number not match " + "input variable number.")); } const std::vector *input_args = nullptr, @@ -50,8 +51,9 @@ class ShareBufferOpKernel : public framework::OpKernel { outputs[i]->ShareBufferWith(*inputs[i]); VLOG(10) << "Share tensor buffer " << (*input_args)[i] << " -> " << (*output_args)[i]; - if (!share_dims.empty() && share_dims[i]) { + if (!share_dims_and_dtype.empty() && share_dims_and_dtype[i]) { outputs[i]->Resize(inputs[i]->dims()); + outputs[i]->ShareDataTypeWith(*inputs[i]); } } } diff --git a/paddle/fluid/operators/share_buffer_op_test.cc b/paddle/fluid/operators/share_buffer_op_test.cc new file mode 100644 index 0000000000000..60220981cab1d --- /dev/null +++ b/paddle/fluid/operators/share_buffer_op_test.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/place.h" + +USE_OP(share_buffer); + +namespace paddle { +namespace framework { + +TEST(test_share_buffer_op, test_share_buffer_op) { + std::vector inputs = {"X1", "X2"}; + std::vector outputs = {"Y1", "Y2"}; + std::vector dims = {{2, 3, 4}, {5, 6}}; + std::vector share_dims_and_dtype = {false, true}; + + size_t n = inputs.size(); + EXPECT_EQ(n, outputs.size()); + EXPECT_EQ(n, dims.size()); + EXPECT_EQ(n, share_dims_and_dtype.size()); + + OpDesc desc; + desc.SetType("share_buffer"); + desc.SetInput("X", inputs); + desc.SetOutput("Out", outputs); + desc.SetOutput("XOut", inputs); + desc.SetAttr("share_dims_and_dtype", share_dims_and_dtype); + + auto op = OpRegistry::CreateOp(desc); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + platform::Place place = platform::CUDAPlace(0); +#else + platform::Place place = platform::CPUPlace(); +#endif + + Scope scope; + for (size_t i = 0; i < n; ++i) { + auto *in_tensor = scope.Var(inputs[i])->GetMutable(); + in_tensor->Resize(dims[i]); + in_tensor->mutable_data(place); + scope.Var(outputs[i])->GetMutable(); + } + op->Run(scope, place); + platform::DeviceContextPool::Instance().Get(place)->Wait(); + + for (size_t i = 0; i < n; ++i) { + const auto &in_tensor = scope.Var(inputs[i])->Get(); + const auto &out_tensor = scope.Var(outputs[i])->Get(); + EXPECT_TRUE(out_tensor.IsSharedBufferWith(in_tensor)); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu index dbc3e1a7ebe26..582d1ea0f26af 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cu +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -10,8 +10,8 @@ See the License for the specific language governing permissions and limitations under the License. 
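The new share_buffer_op_test.cc above exercises the renamed share_dims_and_dtype attribute end to end. Conceptually the kernel only aliases each output onto its input's allocation and, where the per-pair flag is set, also copies shape and data-type metadata. A toy sketch of that aliasing idea, not Paddle's Tensor API:

#include <cstdint>
#include <memory>
#include <vector>

// Toy tensor: a shared buffer plus independent metadata. Hypothetical type,
// used only to illustrate "share buffer, optionally share dims and dtype".
struct ToyTensor {
  std::shared_ptr<std::vector<char>> buffer;  // underlying allocation
  std::vector<int64_t> dims;
  int dtype = 0;  // stands in for the framework's dtype enum
};

void ShareBuffer(const ToyTensor& in, ToyTensor* out, bool share_dims_and_dtype) {
  out->buffer = in.buffer;    // alias the same memory, no copy
  if (share_dims_and_dtype) {
    out->dims = in.dims;      // propagate shape
    out->dtype = in.dtype;    // propagate data type
  }
}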
*/ #include "paddle/fluid/operators/shuffle_channel_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu index 8611249a29f63..cc012230c1062 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu @@ -21,7 +21,7 @@ namespace cub = hipcub; #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math.h" #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index a5513ba648776..4965e5e156c34 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -244,7 +244,7 @@ class SliceOpMaker : public framework::OpProtoAndCheckerMaker { "mkldnn_data_type", "(string, default \"float32\"). Data type of mkldnn kernel") .SetDefault("float32") - .InEnum({"float32", "bfloat16"}) + .InEnum({"float32", "int8", "bfloat16"}) .AsExtra(); AddComment(R"DOC( Slice Operator. diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.h b/paddle/fluid/operators/softmax_cudnn_op.cu.h index 68b694a59f47d..533488896dfcd 100644 --- a/paddle/fluid/operators/softmax_cudnn_op.cu.h +++ b/paddle/fluid/operators/softmax_cudnn_op.cu.h @@ -18,12 +18,8 @@ limitations under the License. */ #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/operators/math/math_cuda_utils.h" #include "paddle/fluid/operators/softmax_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -453,7 +449,7 @@ void SoftmaxForwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, const int N = SizeToAxis(axis, dims); const int D = SizeOutAxis(axis, dims); - constexpr int max_dim = 320; + constexpr int max_dim = 512; constexpr int warps_per_block = 4; if (D == 1 && dim <= max_dim && sizeof(T) <= 4) { @@ -503,12 +499,12 @@ void SoftmaxForwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, auto mode = axis == rank - 1 ? 
MIOPEN_SOFTMAX_MODE_INSTANCE : MIOPEN_SOFTMAX_MODE_CHANNEL; if (LogMode) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( handle, platform::CudnnDataType::kOne(), desc_, x.data(), platform::CudnnDataType::kZero(), desc_, out_data, MIOPEN_SOFTMAX_LOG, mode)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( handle, platform::CudnnDataType::kOne(), desc_, x.data(), platform::CudnnDataType::kZero(), desc_, out_data, MIOPEN_SOFTMAX_ACCURATE, mode)); @@ -517,12 +513,12 @@ void SoftmaxForwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE : CUDNN_SOFTMAX_MODE_CHANNEL; if (LogMode) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxForward( handle, CUDNN_SOFTMAX_LOG, mode, platform::CudnnDataType::kOne(), desc_, x.data(), platform::CudnnDataType::kZero(), desc_, out_data)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxForward( handle, CUDNN_SOFTMAX_ACCURATE, mode, platform::CudnnDataType::kOne(), desc_, x.data(), platform::CudnnDataType::kZero(), desc_, out_data)); @@ -544,7 +540,7 @@ void SoftmaxBackwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, const int N = SizeToAxis(axis, dims); const int D = SizeOutAxis(axis, dims); - constexpr int max_dim = 320; + constexpr int max_dim = 512; constexpr int warps_per_block = 4; if (D == 1 && dim <= max_dim && sizeof(T) <= 4) { @@ -591,12 +587,12 @@ void SoftmaxBackwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE : MIOPEN_SOFTMAX_MODE_CHANNEL; if (LogMode) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( handle, platform::CudnnDataType::kOne(), desc_, out.data(), desc_, dout.data(), platform::CudnnDataType::kZero(), desc_, dx_data, MIOPEN_SOFTMAX_LOG, mode)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( handle, platform::CudnnDataType::kOne(), desc_, out.data(), desc_, dout.data(), platform::CudnnDataType::kZero(), desc_, dx_data, MIOPEN_SOFTMAX_ACCURATE, mode)); @@ -605,12 +601,12 @@ void SoftmaxBackwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, auto mode = axis == rank - 1 ? 
CUDNN_SOFTMAX_MODE_INSTANCE : CUDNN_SOFTMAX_MODE_CHANNEL; if (LogMode) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxBackward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxBackward( handle, CUDNN_SOFTMAX_LOG, mode, platform::CudnnDataType::kOne(), desc_, out.data(), desc_, dout.data(), platform::CudnnDataType::kZero(), desc_, dx_data)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxBackward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxBackward( handle, CUDNN_SOFTMAX_ACCURATE, mode, platform::CudnnDataType::kOne(), desc_, out.data(), desc_, dout.data(), platform::CudnnDataType::kZero(), desc_, dx_data)); diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 3b1753b49b11d..cb97a0bb27cb5 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -18,13 +18,7 @@ limitations under the License. */ #include #include -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif - -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 6a9dca9fe2a6a..520c95b6f3484 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -20,12 +20,8 @@ namespace cub = hipcub; #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/softmax_cudnn_op.cu.h" #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/for_range.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif namespace paddle { namespace operators { @@ -453,14 +449,14 @@ static void SoftmaxWithCrossEntropyHardLabel( #ifdef PADDLE_WITH_HIP auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE : MIOPEN_SOFTMAX_MODE_CHANNEL; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( handle, platform::CudnnDataType::kOne(), descp, logits_data, platform::CudnnDataType::kZero(), descp, softmax_data, MIOPEN_SOFTMAX_LOG, mode)); #else auto mode = axis == rank - 1 ? 
CUDNN_SOFTMAX_MODE_INSTANCE : CUDNN_SOFTMAX_MODE_CHANNEL; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxForward( handle, CUDNN_SOFTMAX_LOG, mode, platform::CudnnDataType::kOne(), descp, logits_data, platform::CudnnDataType::kZero(), descp, softmax_data)); diff --git a/paddle/fluid/operators/spectral_helper.h b/paddle/fluid/operators/spectral_helper.h index 924ec7cd52d50..39639768241d4 100644 --- a/paddle/fluid/operators/spectral_helper.h +++ b/paddle/fluid/operators/spectral_helper.h @@ -66,7 +66,7 @@ class CuFFTHandle { public: CuFFTHandle() { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftCreate(&handle_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftCreate(&handle_)); } CuFFTHandle(const CuFFTHandle& other) = delete; @@ -79,7 +79,7 @@ class CuFFTHandle { const ::cufftHandle& get() const { return handle_; } ~CuFFTHandle() { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftDestroy(handle_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftDestroy(handle_)); } }; @@ -136,12 +136,12 @@ class FFTConfig { } // disable auto allocation of workspace to use allocator from the framework - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftSetAutoAllocation( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetAutoAllocation( plan(), /* autoAllocate */ 0)); size_t ws_size_t; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftXtMakePlanMany( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtMakePlanMany( plan(), signal_ndim, signal_sizes.data(), /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype, /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype, @@ -176,7 +176,7 @@ class HIPFFTHandle { public: HIPFFTHandle() { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftCreate(&handle_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftCreate(&handle_)); } HIPFFTHandle(const HIPFFTHandle& other) = delete; @@ -189,7 +189,7 @@ class HIPFFTHandle { const ::hipfftHandle& get() const { return handle_; } ~HIPFFTHandle() { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftDestroy(handle_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftDestroy(handle_)); } }; using plan_size_type = int; @@ -248,12 +248,12 @@ class FFTConfig { }(); // disable auto allocation of workspace to use allocator from the framework - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftSetAutoAllocation( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetAutoAllocation( plan(), /* autoAllocate */ 0)); size_t ws_size_t; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftMakePlanMany( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftMakePlanMany( plan(), signal_ndim, signal_sizes.data(), /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type, diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu index e97af7cea7e08..4ad99724fd622 100644 --- a/paddle/fluid/operators/spectral_op.cu +++ b/paddle/fluid/operators/spectral_op.cu @@ -96,7 +96,7 @@ static void exec_cufft_plan_raw(const FFTConfig& config, void* in_data, void* out_data, bool forward) { auto& plan = config.plan(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftXtExec( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtExec( plan, in_data, out_data, forward ? 
CUFFT_FORWARD : CUFFT_INVERSE)); } @@ -167,20 +167,20 @@ static void exec_hipfft_plan_raw(const FFTConfig& config, void* in_data, if (value_type == framework::proto::VarType::FP32) { switch (config.transform_type()) { case FFTTransformType::C2C: { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecC2C( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2C( plan, static_cast(in_data), static_cast(out_data), forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); return; } case FFTTransformType::R2C: { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecR2C( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecR2C( plan, static_cast(in_data), static_cast(out_data))); return; } case FFTTransformType::C2R: { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecC2R( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2R( plan, static_cast(in_data), static_cast(out_data))); return; @@ -189,20 +189,20 @@ static void exec_hipfft_plan_raw(const FFTConfig& config, void* in_data, } else if (value_type == framework::proto::VarType::FP64) { switch (config.transform_type()) { case FFTTransformType::C2C: { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecZ2Z( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2Z( plan, static_cast(in_data), static_cast(out_data), forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); return; } case FFTTransformType::R2C: { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecD2Z( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecD2Z( plan, static_cast(in_data), static_cast(out_data))); return; } case FFTTransformType::C2R: { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecZ2D( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2D( plan, static_cast(in_data), static_cast(out_data))); return; @@ -332,11 +332,11 @@ void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out, } // prepare cufft for execution - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cufftSetStream(config->plan(), ctx.stream())); framework::Tensor workspace_tensor; workspace_tensor.mutable_data(tensor_place, config->workspace_size()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftSetWorkArea( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetWorkArea( config->plan(), workspace_tensor.data())); // execute transform plan exec_cufft_plan(ctx, *config, &collapsed_input, @@ -355,11 +355,11 @@ void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out, config = &(plan_cache.lookup(key)); // prepare cufft for execution - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::hipfftSetStream(config->plan(), ctx.stream())); framework::Tensor workspace_tensor; workspace_tensor.mutable_data(tensor_place, config->workspace_size()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftSetWorkArea( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetWorkArea( config->plan(), workspace_tensor.data())); // execute transform plan exec_hipfft_plan(ctx, *config, &collapsed_input, diff --git a/paddle/fluid/operators/stack_op.cu b/paddle/fluid/operators/stack_op.cu index 9e5e45f4d22d9..5b3f03445d352 100644 --- a/paddle/fluid/operators/stack_op.cu +++ b/paddle/fluid/operators/stack_op.cu @@ -16,7 +16,7 @@ #include #include #include "paddle/fluid/operators/stack_op.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" namespace plat = paddle::platform; namespace ops = paddle::operators; diff 
--git a/paddle/fluid/operators/svd_op.cu b/paddle/fluid/operators/svd_op.cu index ade7496d64622..0a7ed093ad0b8 100644 --- a/paddle/fluid/operators/svd_op.cu +++ b/paddle/fluid/operators/svd_op.cu @@ -91,9 +91,9 @@ void SvdGPUKernel::GesvdjBatched( int ldt = n; int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgesvdj_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj_bufferSize( handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, gesvdj_params)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); @@ -102,7 +102,7 @@ void SvdGPUKernel::GesvdjBatched( int stride_U = ldu * (thin_UV ? k : m); int stride_V = ldt * (thin_UV ? k : n); for (int i = 0; i < batchSize; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgesvdj( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj( handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, info, gesvdj_params)); @@ -116,7 +116,7 @@ void SvdGPUKernel::GesvdjBatched( platform::errors::PreconditionNotMet( "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); } @@ -134,9 +134,9 @@ void SvdGPUKernel::GesvdjBatched( int ldt = n; int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgesvdj_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj_bufferSize( handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, gesvdj_params)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); @@ -145,7 +145,7 @@ void SvdGPUKernel::GesvdjBatched( int stride_U = ldu * (thin_UV ? k : m); int stride_V = ldt * (thin_UV ? k : n); for (int i = 0; i < batchSize; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgesvdj( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj( handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, info, gesvdj_params)); @@ -159,7 +159,7 @@ void SvdGPUKernel::GesvdjBatched( platform::errors::PreconditionNotMet( "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); } diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu.h b/paddle/fluid/operators/sync_batch_norm_op.cu.h index 69617b7e208a8..201de5ac1a428 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cu.h +++ b/paddle/fluid/operators/sync_batch_norm_op.cu.h @@ -21,19 +21,18 @@ limitations under the License. 
*/ #include #ifdef __NVCC__ #include "cub/cub.cuh" -#include "paddle/fluid/platform/cudnn_helper.h" #endif #ifdef __HIPCC__ #include namespace cub = hipcub; -#include "paddle/fluid/platform/miopen_helper.h" #endif #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/norm_utils.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/nccl_helper.h" namespace paddle { namespace operators { @@ -192,7 +191,7 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx, if (comm) { int dtype = platform::ToNCCLDataType(mean_out->type()); // In-place operation - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( stats, stats, 2 * C + 1, static_cast(dtype), ncclSum, comm, stream)); } @@ -466,7 +465,7 @@ void SyncBatchNormGradFunctor( if (comm) { int dtype = platform::ToNCCLDataType(scale->type()); // In-place operation - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( stats, stats, 2 * C + 1, static_cast(dtype), ncclSum, comm, stream)); } diff --git a/paddle/fluid/operators/temporal_shift_op.cu b/paddle/fluid/operators/temporal_shift_op.cu index cb1ff5335cdf0..eb5a78f9dc0ec 100644 --- a/paddle/fluid/operators/temporal_shift_op.cu +++ b/paddle/fluid/operators/temporal_shift_op.cu @@ -10,8 +10,8 @@ limitations under the License. */ #include "paddle/fluid/operators/temporal_shift_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index 07749f90ebaa2..05ae5c9188ceb 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -24,7 +24,7 @@ limitations under the License. */ #endif #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/top_k_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/float16.h" #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/transpose_op.cu.h b/paddle/fluid/operators/transpose_op.cu.h index 784d97b543fbd..6c637effee2cb 100644 --- a/paddle/fluid/operators/transpose_op.cu.h +++ b/paddle/fluid/operators/transpose_op.cu.h @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/gpu_utils.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/trunc_op.cu b/paddle/fluid/operators/trunc_op.cu index a284e0ea6e393..68d8c608f6338 100644 --- a/paddle/fluid/operators/trunc_op.cu +++ b/paddle/fluid/operators/trunc_op.cu @@ -12,8 +12,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/trunc_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc index f38f5d9f72357..1426c799007a0 100644 --- a/paddle/fluid/operators/warpctc_op.cc +++ b/paddle/fluid/operators/warpctc_op.cc @@ -16,12 +16,7 @@ limitations under the License. */ #include -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/where_index_op.cu b/paddle/fluid/operators/where_index_op.cu index b1cd172923ee6..feb8e83864e84 100644 --- a/paddle/fluid/operators/where_index_op.cu +++ b/paddle/fluid/operators/where_index_op.cu @@ -24,7 +24,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/where_index_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" namespace paddle { diff --git a/paddle/fluid/operators/where_op.cu b/paddle/fluid/operators/where_op.cu index 721c6e5390e85..54b0d5b69086c 100644 --- a/paddle/fluid/operators/where_op.cu +++ b/paddle/fluid/operators/where_op.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/operators/where_op.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" namespace platform = paddle::platform; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 280674f9ab147..d8d41e9d9185a 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -47,18 +47,11 @@ cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS}) cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) IF(WITH_GPU) - nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade) - nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda) - nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce) nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph) ELSE() cc_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade) ENDIF() -IF(WITH_ROCM) - hip_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda) -ENDIF() - cc_library(place SRCS place.cc DEPS enforce boost) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) @@ -78,6 +71,12 @@ IF(WITH_GPU OR WITH_ROCM) set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream) ENDIF() +IF(WITH_IPU) + set(IPU_CTX_DEPS ipu_backend) +ELSE() + set(IPU_CTX_DEPS) +ENDIF(WITH_IPU) + IF(WITH_ASCEND_CL) set(NPU_CTX_DEPS npu_stream npu_info) ENDIF() @@ -116,7 +115,7 @@ cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost) # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS} - place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} + place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS}) cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) @@ -125,8 +124,7 @@ if(WITH_ASCEND_CL) endif() if(WITH_GPU OR WITH_ROCM) - cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info) - target_link_libraries(device_context cuda_resource_pool) + target_link_libraries(device_context gpu_resource_pool) endif() if(WITH_ASCEND_CL) @@ -147,8 +145,6 @@ if(WITH_GPU) nv_test(device_event_test SRCS device_event_test.cc DEPS device_event_gpu) nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) - nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda pten) - nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) endif() @@ -158,8 +154,6 @@ if(WITH_ROCM) hip_test(device_event_test SRCS device_event_test.cc DEPS device_event_gpu) hip_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) - hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda) - hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda tensor) hip_test(transform_test SRCS transform_test.cu DEPS memory place device_context) endif() @@ -172,11 +166,9 @@ cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_pri 
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) if(WITH_GPU) nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce dynload_cuda) - nv_test(cuda_helper_test SRCS cuda_helper_test.cu) nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) elseif(WITH_ROCM) hip_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce) - hip_test(cuda_helper_test SRCS cuda_helper_test.cu) hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) else() cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce) diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index 03359d932b5ab..25f8f3ed9f3d8 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/platform/collective_helper.h" #include -#include "paddle/fluid/platform/cuda_resource_pool.h" +#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" namespace paddle { namespace platform { @@ -96,7 +96,7 @@ NCCLComm* NCCLCommContext::CreateComm(ncclUniqueId* nccl_id, int nranks, ncclComm_t comm = nullptr; SetDeviceId(dev_id); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclCommInitRank(&comm, nranks, *nccl_id, rank)); auto* comm_wrapper = AssignNCCLComm(comm, nranks, rank, dev_id, ring_id); @@ -121,7 +121,7 @@ void NCCLCommContext::CreateAllNCCLComms(const std::vector& dev_ids, const int kDevices = dev_ids.size(); ncclComm_t comms[kDevices]; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitAll( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclCommInitAll( comms, dev_ids.size(), dev_ids.data())); PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), 0, @@ -153,18 +153,18 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( << ", rind_id: " << ring_id; ncclComm_t comms[kDevices]; { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupStart()); for (int i = 0; i < kDevices; i++) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipSetDevice(i)); + PADDLE_ENFORCE_GPU_SUCCESS(hipSetDevice(i)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(i)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(i)); #endif platform::dynload::ncclCommInitRank(comms + i, kDevices * ntrainers, *nccl_id, train_id * kDevices + i); VLOG(1) << "ncclCommInitRank: " << i; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupEnd()); VLOG(1) << "nccl group end seccessss"; } PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), 0, diff --git a/paddle/fluid/platform/complex.h b/paddle/fluid/platform/complex.h index 35de34086c57d..e50b74133847c 100644 --- a/paddle/fluid/platform/complex.h +++ b/paddle/fluid/platform/complex.h @@ -401,6 +401,16 @@ HOSTDEVICE inline T abs(const complex& a) { #endif } +template +HOSTDEVICE inline T arg(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return thrust::arg(thrust::complex(a)); +#else + return std::arg(std::complex(a)); +#endif +} + template HOSTDEVICE inline complex pow(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ diff --git a/paddle/fluid/platform/cuda_device_guard.h b/paddle/fluid/platform/cuda_device_guard.h index a85ebf4b81366..40204c0ed83f9 100644 --- 
a/paddle/fluid/platform/cuda_device_guard.h +++ b/paddle/fluid/platform/cuda_device_guard.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.h b/paddle/fluid/platform/cuda_graph_with_memory_pool.h index 6586146c5aefb..7a9e1a3a1419c 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.h +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.h @@ -17,7 +17,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cuda_graph.h" +#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" #endif namespace paddle { @@ -60,6 +60,23 @@ inline void AddResetCallbackIfCapturingCUDAGraph(Callback &&callback) { callback(); } +template +inline T *RestoreHostMemIfCapturingCUDAGraph(T *host_mem, size_t size) { + static_assert(std::is_trivial::value, "T must be trivial type"); + static_assert(!std::is_same::value, "T cannot be void"); +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(IsCUDAGraphCapturing())) { + size_t nbytes = size * sizeof(T); + void *new_host_mem = new uint8_t[nbytes]; + std::memcpy(new_host_mem, host_mem, nbytes); + AddResetCallbackIfCapturingCUDAGraph( + [new_host_mem] { delete[] reinterpret_cast(new_host_mem); }); + return reinterpret_cast(new_host_mem); + } +#endif + return host_mem; +} + class SkipCUDAGraphCaptureGuard { DISABLE_COPY_AND_ASSIGN(SkipCUDAGraphCaptureGuard); diff --git a/paddle/fluid/platform/device/CMakeLists.txt b/paddle/fluid/platform/device/CMakeLists.txt index 515453afb63be..0cd07dec20e3e 100644 --- a/paddle/fluid/platform/device/CMakeLists.txt +++ b/paddle/fluid/platform/device/CMakeLists.txt @@ -2,8 +2,16 @@ IF(WITH_XPU) add_subdirectory(xpu) ENDIF() +IF(WITH_GPU OR WITH_ROCM) + add_subdirectory(gpu) +ENDIF() # NPU IF(WITH_ASCEND OR WITH_ASCEND_CL) add_subdirectory(npu) ENDIF() + +# IPU +IF(WITH_IPU) + add_subdirectory(ipu) +ENDIF() diff --git a/paddle/fluid/platform/device/gpu/CMakeLists.txt b/paddle/fluid/platform/device/gpu/CMakeLists.txt new file mode 100644 index 0000000000000..5cf2258204fda --- /dev/null +++ b/paddle/fluid/platform/device/gpu/CMakeLists.txt @@ -0,0 +1,15 @@ +IF(WITH_GPU) + add_subdirectory(cuda) + nv_library(gpu_info SRCS gpu_info.cc DEPS cuda_info gflags glog enforce monitor dynload_cuda) + + nv_test(cuda_helper_test SRCS cuda_helper_test.cu) + nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) +ELSEIF(WITH_ROCM) + add_subdirectory(rocm) + hip_library(gpu_info SRCS gpu_info.cc DEPS rocm_info gflags glog enforce monitor dynload_cuda) + + hip_test(cuda_helper_test SRCS cuda_helper_test.cu) + hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) +ENDIF() + +cc_library(gpu_resource_pool SRCS gpu_resource_pool.cc DEPS gpu_info) diff --git a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt new file mode 100644 index 0000000000000..5df1de1b00fac --- /dev/null +++ b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt @@ -0,0 +1,5 @@ +nv_library(cuda_info SRCS cuda_info.cc DEPS gflags glog enforce monitor dynload_cuda) +nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade) +nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce) + +nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda pten) diff --git 
a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h similarity index 67% rename from paddle/fluid/platform/cuda_device_function.h rename to paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h index 352143302388a..e7d807573957f 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h @@ -22,16 +22,11 @@ limitations under the License. */ namespace paddle { namespace platform { -#ifdef PADDLE_WITH_HIP -#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate)) -#else #define FULL_WARP_MASK 0xFFFFFFFF #define CREATE_SHFL_MASK(mask, predicate) \ mask = __ballot_sync(FULL_WARP_MASK, (predicate)) -#endif inline static int RoundToPowerOfTwo(int dim) { -#ifdef PADDLE_WITH_CUDA if (dim > 512) { return 1024; } else if (dim > 256) { @@ -45,17 +40,6 @@ inline static int RoundToPowerOfTwo(int dim) { } else { return 32; } -#else // HIP results in error or nan if > 256 - if (dim > 128) { - return 256; - } else if (dim > 64) { - return 128; - } else if (dim > 32) { - return 64; - } else { - return 32; - } -#endif } #define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ @@ -76,71 +60,15 @@ template __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { -#if defined(PADDLE_WITH_HIP) - return __shfl_down(val, delta, width); -#else return __shfl_down_sync(mask, val, static_cast(delta), width); -#endif } template __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val, int width = warpSize) { -#if defined(PADDLE_WITH_HIP) - return __shfl_xor(val, width); -#else return __shfl_xor_sync(mask, val, width); -#endif -} - -#if defined(PADDLE_WITH_HIP) -template <> -__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, - float16 val, int delta, - int width) { - return float16(__shfl_down(static_cast(val), - static_cast(delta), width)); } -template <> -__forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( - unsigned mask, paddle::platform::complex val, int delta, int width) { - float real = __shfl_down(val.real, delta, width); - float imag = __shfl_down(val.imag, delta, width); - return paddle::platform::complex(real, imag); -} - -template <> -__forceinline__ __device__ paddle::platform::complex -CudaShuffleDownSync(unsigned mask, paddle::platform::complex val, - int delta, int width) { - double real = __shfl_down(val.real, delta, width); - double imag = __shfl_down(val.imag, delta, width); - return paddle::platform::complex(real, imag); -} - -template <> -__forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, - float16 val, int width) { - return float16(__shfl_xor(static_cast(val), width)); -} - -template <> -__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( - unsigned mask, paddle::platform::complex val, int width) { - float real = __shfl_xor(val.real, width); - float imag = __shfl_xor(val.imag, width); - return paddle::platform::complex(real, imag); -} - -template <> -__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( - unsigned mask, paddle::platform::complex val, int width) { - double real = __shfl_xor(val.real, width); - double imag = __shfl_xor(val.imag, width); - return paddle::platform::complex(real, imag); -} -#else template <> __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, float16 val, int delta, @@ -197,16 +125,11 @@ __forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( 
__shfl_xor_sync(mask, static_cast(val.imag), width)); return paddle::platform::complex(real, imag); } -#endif template __forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { -#if defined(PADDLE_WITH_HIP) - return __shfl(val, src_line, width); -#else return __shfl_sync(mask, val, src_line, width); -#endif } template @@ -216,17 +139,13 @@ HOSTDEVICE T Infinity() { template __device__ T reduceSum(T val, int tid, int len) { -// NOTE(zcd): The warp size should be taken from the -// parameters of the GPU but not specified as 32 simply. -// To make the reduceSum more efficiently, -// I use Warp-Level Parallelism and assume the Warp size -// is 32 which may be different for different GPU, -// but most card's warp size is 32. -#ifdef PADDLE_WITH_HIP - const int warpSize = 64; -#else + // NOTE(zcd): The warp size should be taken from the + // parameters of the GPU but not specified as 32 simply. + // To make the reduceSum more efficiently, + // I use Warp-Level Parallelism and assume the Warp size + // is 32 which may be different for different GPU, + // but most card's warp size is 32. const int warpSize = 32; -#endif __shared__ T shm[warpSize]; unsigned mask = 0u; CREATE_SHFL_MASK(mask, tid < len); diff --git a/paddle/fluid/platform/cuda_graph.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc similarity index 90% rename from paddle/fluid/platform/cuda_graph.cc rename to paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc index 6f3d452ef5c50..3970acf82d3ea 100644 --- a/paddle/fluid/platform/cuda_graph.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/platform/cuda_graph.h" +#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" namespace paddle { namespace platform { @@ -23,11 +23,11 @@ void CUDAGraph::Reset() { if (is_reset_) return; #if CUDA_VERSION >= 10010 for (auto graph : graphs_) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphDestroy(graph)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphDestroy(graph)); } graphs_.clear(); for (auto exec_graph : exec_graphs_) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphExecDestroy(exec_graph)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphExecDestroy(exec_graph)); } exec_graphs_.clear(); #endif @@ -46,7 +46,7 @@ void CUDAGraph::Replay() { errors::PermissionDenied( "Cannot replay the CUDA Graph after reset is called.")); for (auto exec_graph : exec_graphs_) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphLaunch(exec_graph, stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphLaunch(exec_graph, stream_)); } #endif } @@ -58,7 +58,7 @@ void CUDAGraph::BeginSegmentCapture() { IsCapturing(), true, errors::PermissionDenied("BeginSegmentCapture should be called when CUDA " "Graph is capturing.")); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamBeginCapture( + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamBeginCapture( capturing_graph_->stream_, capturing_graph_->capture_mode_)); PADDLE_ENFORCE_EQ(IsValidCapturing(), true, platform::errors::PermissionDenied( @@ -92,19 +92,19 @@ void CUDAGraph::EndSegmentCapture() { PADDLE_ENFORCE_EQ(IsCapturing(), true, errors::PermissionDenied("No CUDA Graph is capturing.")); cudaGraph_t graph; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamEndCapture(capturing_graph_->stream_, &graph)); auto num_nodes = static_cast(-1); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphGetNodes(graph, nullptr, &num_nodes)); + 
PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphGetNodes(graph, nullptr, &num_nodes)); if (num_nodes == 0) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphDestroy(graph)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphDestroy(graph)); VLOG(10) << "Skip empty CUDA Graph with ID " << capturing_graph_->id_ << ", segment id " << capturing_graph_->graphs_.size(); return; } cudaGraphExec_t exec_graph; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaGraphInstantiate(&exec_graph, graph, nullptr, nullptr, 0)); VLOG(10) << "End to capture CUDA Graph with ID " << capturing_graph_->id_ << ", segment id " << capturing_graph_->graphs_.size(); @@ -123,7 +123,7 @@ bool CUDAGraph::IsValidCapturing() { if (!IsCapturing()) return false; cudaStreamCaptureStatus status; CUDAGraphID id; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamGetCaptureInfo(capturing_graph_->stream_, &status, &id)); return status == cudaStreamCaptureStatusActive; #else @@ -154,7 +154,7 @@ void CUDAGraph::PrintToDotFiles(const std::string &dirname, ConcatPath(dirname, "segment_" + std::to_string(i) + ".dot"); VLOG(10) << "Save the " << i << "-th segment of graph " << id_ << " to " << filename; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaGraphDebugDotPrint(graphs_[i], filename.c_str(), flags)); } #else diff --git a/paddle/fluid/platform/cuda_graph.h b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h similarity index 96% rename from paddle/fluid/platform/cuda_graph.h rename to paddle/fluid/platform/device/gpu/cuda/cuda_graph.h index f70a66f76242f..0856e0fad1900 100644 --- a/paddle/fluid/platform/cuda_graph.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h @@ -21,7 +21,7 @@ #include #include "cuda.h" // NOLINT #include "cuda_runtime.h" // NOLINT -#include "paddle/fluid/platform/type_defs.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" @@ -129,7 +129,7 @@ class CUDAGraphCaptureModeGuard { explicit CUDAGraphCaptureModeGuard( cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) { if (UNLIKELY(CUDAGraph::IsCapturing())) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaThreadExchangeStreamCaptureMode(&mode)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaThreadExchangeStreamCaptureMode(&mode)); // After cudaThreadExchangeStreamCaptureMode is called, // the variable "mode" would be set to the old capturing mode. 
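      // Saving the returned old mode here lets the destructor below exchange
      // it back with a second cudaThreadExchangeStreamCaptureMode call, so the
      // thread's original capture mode is restored when the guard leaves scope.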
old_mode_ = mode; @@ -138,7 +138,7 @@ class CUDAGraphCaptureModeGuard { ~CUDAGraphCaptureModeGuard() PADDLE_MAY_THROW { if (UNLIKELY(CUDAGraph::IsCapturing())) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaThreadExchangeStreamCaptureMode(&old_mode_)); } } diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h similarity index 78% rename from paddle/fluid/platform/cuda_helper.h rename to paddle/fluid/platform/device/gpu/cuda/cuda_helper.h index 202be920c5595..3199af9c97520 100644 --- a/paddle/fluid/platform/cuda_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h @@ -16,12 +16,7 @@ #include // NOLINT -#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/dynload/cublas.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/dynload/rocblas.h" -#endif #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" @@ -72,28 +67,13 @@ namespace platform { * */ -#ifdef __HIPCC__ -#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ - int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; \ - for (index_type i = __index__; __index__ < (num); \ - __index__ += hipBlockDim_x * hipGridDim_x, i = __index__) -#else #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \ for (index_type i = __index__; __index__ < (num); \ __index__ += blockDim.x * gridDim.x, i = __index__) -#endif - -#define CUDA_KERNEL_LOOP(i, num) CUDA_KERNEL_LOOP_TYPE(i, num, int) class CublasHandleHolder { public: -#ifdef PADDLE_WITH_HIP - explicit CublasHandleHolder(hipStream_t stream) { - PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_create_handle(&handle_)); - PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_set_stream(handle_, stream)); - } -#else CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) { PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasCreate(&handle_)); PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream)); @@ -109,20 +89,11 @@ class CublasHandleHolder { } #endif // CUDA_VERSION >= 9000 } -#endif -#ifdef PADDLE_WITH_HIP - const rocblas_handle& GetCublasHandle() const { return handle_; } -#else const cublasHandle_t& GetCublasHandle() const { return handle_; } -#endif ~CublasHandleHolder() PADDLE_MAY_THROW { -#ifdef PADDLE_WITH_HIP - PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_destroy_handle(handle_)); -#else PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_)); -#endif } template @@ -134,11 +105,7 @@ class CublasHandleHolder { private: DISABLE_COPY_AND_ASSIGN(CublasHandleHolder); -#ifdef PADDLE_WITH_HIP - rocblas_handle handle_; -#else cublasHandle_t handle_; -#endif mutable std::mutex mtx_; }; diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_info.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_info.cc new file mode 100644 index 0000000000000..6109ed6554318 --- /dev/null +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_info.cc @@ -0,0 +1,268 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/lock_guard_ptr.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" + +static std::once_flag g_device_props_size_init_flag; +static std::vector> g_device_props_init_flags; +static std::vector g_device_props; + +namespace paddle { +namespace platform { +int DnnVersion() { + if (!dynload::HasCUDNN()) return -1; + return dynload::cudnnGetVersion(); +} + +static int GetGPUDeviceCountImpl() { + int driverVersion = 0; + cudaError_t status = cudaDriverGetVersion(&driverVersion); + + if (!(status == gpuSuccess && driverVersion != 0)) { + // No GPU driver + VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!"; + return 0; + } + + const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES"); + + if (cuda_visible_devices != nullptr) { + std::string cuda_visible_devices_str(cuda_visible_devices); + if (!cuda_visible_devices_str.empty()) { + cuda_visible_devices_str.erase( + 0, cuda_visible_devices_str.find_first_not_of('\'')); + cuda_visible_devices_str.erase( + cuda_visible_devices_str.find_last_not_of('\'') + 1); + cuda_visible_devices_str.erase( + 0, cuda_visible_devices_str.find_first_not_of('\"')); + cuda_visible_devices_str.erase( + cuda_visible_devices_str.find_last_not_of('\"') + 1); + } + if (std::all_of(cuda_visible_devices_str.begin(), + cuda_visible_devices_str.end(), + [](char ch) { return ch == ' '; })) { + VLOG(2) << "CUDA_VISIBLE_DEVICES is set to be " + "empty. No GPU detected."; + return 0; + } + } + int count; + PADDLE_ENFORCE_GPU_SUCCESS(cudaGetDeviceCount(&count)); + return count; +} + +int GetGPUDeviceCount() { + // cache the count + static auto dev_cnt = GetGPUDeviceCountImpl(); + return dev_cnt; +} + +int GetGPUComputeCapability(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int major, minor; + auto major_error_code = + cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id); + auto minor_error_code = + cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id); + + PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); + PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); + return major * 10 + minor; +} + +int GetGPURuntimeVersion(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int runtime_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(cudaRuntimeGetVersion(&runtime_version)); + return runtime_version; +} + +int GetGPUDriverVersion(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, GetGPUDeviceCount())); + int driver_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(cudaDriverGetVersion(&driver_version)); + return driver_version; +} + +bool TensorCoreAvailable() { + int device = GetCurrentDeviceId(); + int driver_version = GetGPUComputeCapability(device); + return driver_version >= 70; +} + +int GetGPUMultiProcessors(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id)); + return count; +} + +int GetGPUMaxThreadsPerMultiProcessor(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute( + &count, cudaDevAttrMaxThreadsPerMultiProcessor, id)); + + return count; +} + +int GetGPUMaxThreadsPerBlock(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id)); + return count; +} + +int GetCurrentDeviceId() { + int device_id; + PADDLE_ENFORCE_GPU_SUCCESS(cudaGetDevice(&device_id)); + return device_id; +} + +dim3 GetGpuMaxGridDimSize(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + dim3 ret; + int size; + auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); + ret.x = size; + + auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); + ret.y = size; + + auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); + ret.z = size; + return ret; +} + +const gpuDeviceProp &GetDeviceProperties(int id) { + std::call_once(g_device_props_size_init_flag, [&] { + int gpu_num = 0; + gpu_num = platform::GetGPUDeviceCount(); + g_device_props_init_flags.resize(gpu_num); + g_device_props.resize(gpu_num); + for (int i = 0; i < gpu_num; ++i) { + g_device_props_init_flags[i] = std::make_unique(); + } + }); + + if (id == -1) { + id = platform::GetCurrentDeviceId(); + } + + if (id < 0 || id >= static_cast(g_device_props.size())) { + PADDLE_THROW(platform::errors::OutOfRange( + "The device id %d is out of range [0, %d), where %d is the number of " + "devices on this machine. Because the device id should be greater than " + "or equal to zero and smaller than the number of gpus. 
Please input " + "appropriate device again!", + id, static_cast(g_device_props.size()), + static_cast(g_device_props.size()))); + } + + std::call_once(*(g_device_props_init_flags[id]), [&] { + PADDLE_ENFORCE_GPU_SUCCESS( + cudaGetDeviceProperties(&g_device_props[id], id)); + }); + + return g_device_props[id]; +} + +void SetDeviceId(int id) { + // TODO(qijun): find a better way to cache the cuda device count + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id)); +} + +void GpuMemcpyAsync(void *dst, const void *src, size_t count, + gpuMemcpyKind kind, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream)); +} + +void GpuMemcpySync(void *dst, const void *src, size_t count, + gpuMemcpyKind kind) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(dst, src, count, kind)); +} + +void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, + int src_device, size_t count, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); +} + +void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, + int src_device, size_t count) { + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemcpyPeer(dst, dst_device, src, src_device, count)); +} + +void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(dst, value, count, stream)); +} + +void GpuStreamSync(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); +} + +void GpuDestroyStream(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); +} + +void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); } + +gpuError_t GpuGetLastError() { return cudaGetLastError(); } +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cuda_profiler.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc similarity index 85% rename from paddle/fluid/platform/cuda_profiler.cc rename to paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc index 998dd80dc5e7d..42351fe097a9d 100644 --- a/paddle/fluid/platform/cuda_profiler.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/platform/cuda_profiler.h" +#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" namespace paddle { namespace platform { @@ -25,13 +25,13 @@ void CudaProfilerInit(std::string output_file, std::string output_mode, "`csv`, but received `%s`.", output_mode)); cudaOutputMode_t mode = output_mode == "csv" ? 
cudaCSV : cudaKeyValuePair; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode)); } -void CudaProfilerStart() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaProfilerStart()); } +void CudaProfilerStart() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStart()); } -void CudaProfilerStop() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaProfilerStop()); } +void CudaProfilerStop() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStop()); } #ifndef _WIN32 void CudaNvtxRangePush(std::string name) { diff --git a/paddle/fluid/platform/cuda_profiler.h b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h similarity index 100% rename from paddle/fluid/platform/cuda_profiler.h rename to paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h diff --git a/paddle/fluid/platform/cudnn_desc.h b/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h similarity index 84% rename from paddle/fluid/platform/cudnn_desc.h rename to paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h index 318c85ee484be..7bff2c69381e6 100644 --- a/paddle/fluid/platform/cudnn_desc.h +++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h @@ -23,7 +23,7 @@ #include #include -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { @@ -99,7 +99,7 @@ class ActivationDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnDestroyActivationDescriptor(t)); t = nullptr; } @@ -107,13 +107,13 @@ class ActivationDescriptor { }; ActivationDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnCreateActivationDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } template void set(cudnnActivationMode_t mode, const T& coef) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetActivationDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetActivationDescriptor( desc_.get(), mode, CUDNN_NOT_PROPAGATE_NAN, static_cast(coef))); } @@ -130,14 +130,14 @@ class TensorDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyTensorDescriptor(t)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyTensorDescriptor(t)); t = nullptr; } } }; TensorDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateTensorDescriptor(&raw_ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateTensorDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } T* desc() { return desc_.get(); } @@ -153,7 +153,7 @@ class TensorDescriptor { if (groups > 1) { dims_with_group[1] = dims_with_group[1] / groups; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetTensorNdDescriptor( desc_.get(), ToCudnnDataType(tensor.type()), dims_with_group.size(), dims_with_group.data(), strides.data())); } @@ -166,7 +166,7 @@ class TensorDescriptor { } else { transformed_dims = dims; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptorEx( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetTensorNdDescriptorEx( desc_.get(), format, dtype, transformed_dims.size(), transformed_dims.data())); } @@ -187,14 +187,14 @@ class FilterDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFilterDescriptor(t)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyFilterDescriptor(t)); t = nullptr; } } }; 
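  // Like the other descriptor wrappers in this header, the raw cuDNN handle
  // is created once in the constructor below, owned by a std::unique_ptr that
  // uses the Deleter above, and therefore released exactly once when the
  // wrapper goes out of scope.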
FilterDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFilterDescriptor(&raw_ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateFilterDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } T* desc() { return desc_.get(); } @@ -211,7 +211,7 @@ class FilterDescriptor { if (groups > 1) { transformed_dims[1] = transformed_dims[1] / groups; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetFilterNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetFilterNdDescriptor( desc_.get(), dtype, format, transformed_dims.size(), transformed_dims.data())); } @@ -233,7 +233,7 @@ class ConvolutionDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnDestroyConvolutionDescriptor(t)); t = nullptr; } @@ -241,7 +241,7 @@ class ConvolutionDescriptor { }; ConvolutionDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnCreateConvolutionDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } @@ -255,28 +255,26 @@ class ConvolutionDescriptor { cudnnDataType_t compute_type = (dtype == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT; T* desc = desc_.get(); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor( desc, pads.size(), pads.data(), strides.data(), dilations.data(), CUDNN_CROSS_CORRELATION, compute_type)); #if CUDNN_VERSION_MIN(7, 0, 1) - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnSetConvolutionGroupCount(desc, groups)); #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( desc, CUDNN_DEFAULT_MATH)); if (dtype == CUDNN_DATA_HALF) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionMathType(desc, - CUDNN_TENSOR_OP_MATH)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + desc, CUDNN_TENSOR_OP_MATH)); #if CUDA_VERSION >= 11000 #if CUDNN_VERSION_MIN(8, 1, 0) } else if (dtype == CUDNN_DATA_BFLOAT16) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionMathType(desc, - CUDNN_TENSOR_OP_MATH)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + desc, CUDNN_TENSOR_OP_MATH)); #endif // CUDNN_VERSION_MIN(8,1,0) } else if (dtype == CUDNN_DATA_FLOAT && !allow_tf32) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnSetConvolutionMathType(desc, CUDNN_FMA_MATH)); #endif // CUDA_VERSION >= 11000 } diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h similarity index 88% rename from paddle/fluid/platform/cudnn_helper.h rename to paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h index 65dd69a37d37f..2bcdbaa201889 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h @@ -191,10 +191,10 @@ inline cudnnTensorFormat_t GetCudnnTensorFormat( class ScopedTensorDescriptor { public: ScopedTensorDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateTensorDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateTensorDescriptor(&desc_)); } ~ScopedTensorDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyTensorDescriptor(desc_)); + 
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyTensorDescriptor(desc_)); } inline cudnnTensorDescriptor_t descriptor(const cudnnTensorFormat_t format, @@ -216,20 +216,20 @@ class ScopedTensorDescriptor { if (dims.size() == 4) { if (format == CUDNN_TENSOR_NCHW) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetTensorNdDescriptor( desc_, type, dims_with_group.size(), dims_with_group.data(), strides.data())); } else { // CUDNN_TENSOR_NHWC - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensor4dDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetTensor4dDescriptor( desc_, format, type, dims[0], dims[3], dims[1], dims[2])); } } else if (dims.size() == 5) { if (format == CUDNN_TENSOR_NCHW) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetTensorNdDescriptor( desc_, type, dims_with_group.size(), dims_with_group.data(), strides.data())); } else { // CUDNN_TENSOR_NHWC - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptorEx( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetTensorNdDescriptorEx( desc_, format, type, dims.size(), dims.data())); } } @@ -247,7 +247,7 @@ class ScopedTensorDescriptor { inline cudnnTensorDescriptor_t descriptor(const cudnnDataType_t cudnn_type, const std::vector& dim, const std::vector& stride) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetTensorNdDescriptor( desc_, cudnn_type, dim.size(), dim.data(), stride.data())); return desc_; } @@ -269,11 +269,11 @@ class ScopedTensorDescriptor { class ScopedRNNTensorDescriptor { public: ScopedRNNTensorDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateRNNDataDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateRNNDataDescriptor(&desc_)); } ~ScopedRNNTensorDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyRNNDataDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyRNNDataDescriptor(desc_)); } inline cudnnRNNDataDescriptor_t descriptor( @@ -288,7 +288,7 @@ class ScopedRNNTensorDescriptor { layout = CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetRNNDataDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetRNNDataDescriptor( desc_, cudnn_type, layout, max_seq_length, batch_size, input_size, seq_length.data(), static_cast(&padding_fill))); @@ -314,10 +314,10 @@ class ScopedRNNTensorDescriptor { class ScopedDropoutDescriptor { public: ScopedDropoutDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateDropoutDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateDropoutDescriptor(&desc_)); } ~ScopedDropoutDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyDropoutDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyDropoutDescriptor(desc_)); } inline cudnnDropoutDescriptor_t descriptor(const cudnnHandle_t& handle, @@ -327,19 +327,19 @@ class ScopedDropoutDescriptor { framework::Tensor* dropout_state_, int seed, size_t state_size) { if (dropout_state_ == nullptr) { // for no dropout or test - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetDropoutDescriptor( desc_, handle, 0 /* dropout */, nullptr, 0 /* state_size */, 0 /* seed */)); return desc_; } auto* dropout_state_data = dropout_state_->data(); if (!initialized) { - 
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetDropoutDescriptor( desc_, handle, dropout_prob_, dropout_state_data, state_size, seed)); } else { auto dropout_state_dims = dropout_state_->dims(); state_size = dropout_state_dims[0]; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnRestoreDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnRestoreDropoutDescriptor( desc_, handle, dropout_prob_, dropout_state_data, state_size, 0)); } return desc_; @@ -354,10 +354,10 @@ class ScopedDropoutDescriptor { class ScopedRNNDescriptor { public: ScopedRNNDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateRNNDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateRNNDescriptor(&desc_)); } ~ScopedRNNDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyRNNDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyRNNDescriptor(desc_)); } inline cudnnRNNDescriptor_t desc() { return desc_; } @@ -370,10 +370,10 @@ class ScopedRNNDescriptor { class ScopedFilterDescriptor { public: ScopedFilterDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFilterDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateFilterDescriptor(&desc_)); } ~ScopedFilterDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFilterDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyFilterDescriptor(desc_)); } inline cudnnFilterDescriptor_t descriptor(const cudnnTensorFormat_t format, @@ -389,7 +389,7 @@ class ScopedFilterDescriptor { kernel_with_group[0] /= groups; // NOTE: input filter(C) of the filter is already asserted to be C/groups. } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetFilterNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetFilterNdDescriptor( desc_, type, format, kernel_with_group.size(), kernel_with_group.data())); return desc_; @@ -413,11 +413,11 @@ class ScopedFilterDescriptor { class ScopedConvolutionDescriptor { public: ScopedConvolutionDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnCreateConvolutionDescriptor(&desc_)); } ~ScopedConvolutionDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnDestroyConvolutionDescriptor(desc_)); } @@ -438,7 +438,7 @@ class ScopedConvolutionDescriptor { cudnnDataType_t compute_type = (type == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor( desc_, pads.size(), pads.data(), strides.data(), dilations.data(), CUDNN_CROSS_CORRELATION, compute_type)); return desc_; @@ -459,10 +459,10 @@ class ScopedConvolutionDescriptor { class ScopedPoolingDescriptor { public: ScopedPoolingDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreatePoolingDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreatePoolingDescriptor(&desc_)); } ~ScopedPoolingDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyPoolingDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyPoolingDescriptor(desc_)); } inline cudnnPoolingDescriptor_t descriptor(const PoolingMode& mode, @@ -480,7 +480,7 @@ class ScopedPoolingDescriptor { "The size of kernel and strides should be equal. 
But " "received size of kernel is %d, size of strides is %d.", kernel.size(), strides.size())); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetPoolingNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetPoolingNdDescriptor( desc_, (GetPoolingMode(mode)), CUDNN_PROPAGATE_NAN, // Always propagate nans. kernel.size(), kernel.data(), pads.data(), strides.data())); @@ -495,18 +495,18 @@ class ScopedPoolingDescriptor { class ScopedSpatialTransformerDescriptor { public: ScopedSpatialTransformerDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnCreateSpatialTransformerDescriptor(&desc_)); } ~ScopedSpatialTransformerDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnDestroySpatialTransformerDescriptor(desc_)); } template inline cudnnSpatialTransformerDescriptor_t descriptor(const int nbDims, const int dimA[]) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetSpatialTransformerNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetSpatialTransformerNdDescriptor( desc_, CUDNN_SAMPLER_BILINEAR, CudnnDataType::type, nbDims, dimA)); return desc_; } @@ -519,11 +519,11 @@ class ScopedSpatialTransformerDescriptor { class ScopedActivationDescriptor { public: ScopedActivationDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnCreateActivationDescriptor(&desc_)); } ~ScopedActivationDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnDestroyActivationDescriptor(desc_)); } @@ -561,7 +561,7 @@ class ScopedActivationDescriptor { "Unrecognized CUDNN activation mode: %d.", static_cast(activation_mode))); } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetActivationDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetActivationDescriptor( desc_, mode, CUDNN_NOT_PROPAGATE_NAN, relu_ceiling)); return desc_; } @@ -587,15 +587,15 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { class ScopedCTCLossDescriptor { public: ScopedCTCLossDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateCTCLossDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateCTCLossDescriptor(&desc_)); } ~ScopedCTCLossDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyCTCLossDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyCTCLossDescriptor(desc_)); } template inline cudnnCTCLossDescriptor_t descriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnSetCTCLossDescriptor(desc_, CudnnDataType::type)); return desc_; } diff --git a/paddle/fluid/platform/cudnn_helper_test.cc b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper_test.cc similarity index 98% rename from paddle/fluid/platform/cudnn_helper_test.cc rename to paddle/fluid/platform/device/gpu/cuda/cudnn_helper_test.cc index 98ec2be87755c..851d0d18c604c 100644 --- a/paddle/fluid/platform/cudnn_helper_test.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper_test.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #define GLOG_NO_ABBREVIATED_SEVERITIES #define GOOGLE_GLOG_DLL_DECL -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu similarity index 98% rename from paddle/fluid/platform/cuda_helper_test.cu rename to paddle/fluid/platform/device/gpu/cuda_helper_test.cu index fd46aa2393403..ab8bb2cad8c51 100644 --- a/paddle/fluid/platform/cuda_helper_test.cu +++ b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu @@ -21,11 +21,11 @@ #include #define PADDLE_CUDA_FP16 -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_helper.h" using paddle::platform::PADDLE_CUDA_NUM_THREADS; using paddle::platform::float16; diff --git a/paddle/fluid/platform/cudnn_desc_test.cc b/paddle/fluid/platform/device/gpu/cudnn_desc_test.cc similarity index 90% rename from paddle/fluid/platform/cudnn_desc_test.cc rename to paddle/fluid/platform/device/gpu/cudnn_desc_test.cc index db5362f5cb1f5..8ea30027e8ade 100644 --- a/paddle/fluid/platform/cudnn_desc_test.cc +++ b/paddle/fluid/platform/device/gpu/cudnn_desc_test.cc @@ -12,11 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_desc.h" -#else -#include "paddle/fluid/platform/cudnn_desc.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include diff --git a/paddle/fluid/platform/device/gpu/gpu_device_function.h b/paddle/fluid/platform/device/gpu/gpu_device_function.h new file mode 100644 index 0000000000000..a8daa5e87fdc3 --- /dev/null +++ b/paddle/fluid/platform/device/gpu/gpu_device_function.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h" +#else +#include "paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h" +#endif + +#endif diff --git a/paddle/fluid/platform/device/gpu/gpu_dnn.h b/paddle/fluid/platform/device/gpu/gpu_dnn.h new file mode 100644 index 0000000000000..3f9bc5e6de80b --- /dev/null +++ b/paddle/fluid/platform/device/gpu/gpu_dnn.h @@ -0,0 +1,27 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/device/gpu/rocm/miopen_desc.h" +#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h" +#else // CUDA +#include "paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h" +#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h" +#endif + +#endif diff --git a/paddle/fluid/platform/device/gpu/gpu_helper.h b/paddle/fluid/platform/device/gpu/gpu_helper.h new file mode 100644 index 0000000000000..6077a7b625d25 --- /dev/null +++ b/paddle/fluid/platform/device/gpu/gpu_helper.h @@ -0,0 +1,26 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/device/gpu/rocm/rocm_helper.h" +#else +#include "paddle/fluid/platform/device/gpu/cuda/cuda_helper.h" +#endif + +#define CUDA_KERNEL_LOOP(i, num) CUDA_KERNEL_LOOP_TYPE(i, num, int) + +#endif diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc new file mode 100644 index 0000000000000..e68277cc37b38 --- /dev/null +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -0,0 +1,356 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include +#include +#include + +#include "gflags/gflags.h" +#include "paddle/fluid/platform/cuda_device_guard.h" +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/dynload/miopen.h" +#else +#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" +#include "paddle/fluid/platform/dynload/cudnn.h" +#endif +#include "paddle/fluid/memory/malloc.h" +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10020 +#include "paddle/fluid/platform/dynload/cuda_driver.h" +#endif +#endif +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/lock_guard_ptr.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" + +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); +DECLARE_bool(enable_cublas_tensor_op_math); +DECLARE_string(selected_gpus); +DECLARE_uint64(gpu_memory_limit_mb); + +constexpr static float fraction_reserve_gpu_memory = 0.05f; + +USE_GPU_MEM_STAT; +namespace paddle { +namespace platform { +//! Get a list of device ids from environment variable or use all. +std::vector GetSelectedDevices() { + // use user specified GPUs in single-node multi-process mode. + std::vector devices; + if (!FLAGS_selected_gpus.empty()) { + auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ','); + for (auto id : devices_str) { + devices.push_back(atoi(id.c_str())); + } + } else { + int count = GetGPUDeviceCount(); + for (int i = 0; i < count; ++i) { + devices.push_back(i); + } + } + return devices; +} + +void GpuMemoryUsage(size_t *available, size_t *total) { + size_t actual_available, actual_total; + RecordedGpuMemGetInfo(available, total, &actual_available, &actual_total, + platform::GetCurrentDeviceId()); +} + +size_t GpuAvailableMemToAlloc() { + size_t total = 0; + size_t available = 0; + GpuMemoryUsage(&available, &total); + size_t reserving = + static_cast(fraction_reserve_gpu_memory * available); + // If available size is less than minimum chunk size, no usable memory exists + size_t available_to_alloc = available - reserving; + size_t min_chunk_size = GpuMinChunkSize(); + if (available_to_alloc < min_chunk_size) { + available_to_alloc = 0; + } + VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20) + << "M, " << (available_to_alloc >> 20) << "M available to allocate"; + return available_to_alloc; +} + +size_t GpuMaxAllocSize() { + return std::max(GpuInitAllocSize(), GpuReallocSize()); +} + +static size_t GpuAllocSize(bool realloc) { + size_t available_to_alloc = GpuAvailableMemToAlloc(); + PADDLE_ENFORCE_GT( + available_to_alloc, 0, + platform::errors::ResourceExhausted("Not enough available GPU memory.")); + // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be + // allocated by fraction + size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb + : FLAGS_initial_gpu_memory_in_mb; + size_t alloc_bytes = + (flag_mb > 0ul ? 
flag_mb << 20 : available_to_alloc * + FLAGS_fraction_of_gpu_memory_to_use); + PADDLE_ENFORCE_GE( + available_to_alloc, alloc_bytes, + platform::errors::ResourceExhausted("Not enough available GPU memory.")); + VLOG(10) << "Alloc size is " << (alloc_bytes >> 20) + << " MiB, is it Re-alloc: " << realloc; + return alloc_bytes; +} + +size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); } + +size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); } + +size_t GpuMinChunkSize() { + // Allow to allocate the minimum chunk size is 256 bytes. + return 1 << 8; +} + +size_t GpuMaxChunkSize() { + size_t max_chunk_size = GpuMaxAllocSize(); + VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M"; + return max_chunk_size; +} + +static void RaiseNonOutOfMemoryError(gpuError_t *status) { + if (*status == gpuErrorOutOfMemory) { + *status = gpuSuccess; + } + PADDLE_ENFORCE_GPU_SUCCESS(*status); + + *status = platform::GpuGetLastError(); + if (*status == gpuErrorOutOfMemory) { + *status = gpuSuccess; + } + PADDLE_ENFORCE_GPU_SUCCESS(*status); +} + +class RecordedGpuMallocHelper { + private: + explicit RecordedGpuMallocHelper(int dev_id, uint64_t limit_size = 0) + : dev_id_(dev_id), limit_size_(limit_size) { + if (NeedRecord()) { + mtx_.reset(new std::mutex()); + } + } + + DISABLE_COPY_AND_ASSIGN(RecordedGpuMallocHelper); + + public: + static RecordedGpuMallocHelper *Instance(int dev_id) { + std::call_once(once_flag_, [] { + int dev_cnt = GetGPUDeviceCount(); + instances_.reserve(dev_cnt); + for (int i = 0; i < dev_cnt; ++i) { + instances_.emplace_back( + new RecordedGpuMallocHelper(i, FLAGS_gpu_memory_limit_mb << 20)); + } + }); + + PADDLE_ENFORCE_GE( + dev_id, 0, + platform::errors::OutOfRange( + "Device id must be not less than 0, but got %d.", dev_id)); + PADDLE_ENFORCE_LT( + dev_id, instances_.size(), + platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.", + dev_id, instances_.size())); + return instances_[dev_id].get(); + } + + /** + * Try to allocate `size` gpu memory. Only cudaErrorMemoryAllocation + * or cudaSuccess would be returned, and the cudaGetLastError() flag + * would be clear. + */ + gpuError_t Malloc(void **ptr, size_t size) { + LockGuardPtr lock(mtx_); + if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) { + return gpuErrorOutOfMemory; + } + + CUDADeviceGuard guard(dev_id_); +#ifdef PADDLE_WITH_HIP + auto result = hipMalloc(ptr, size); +#else + CUDAGraphCaptureModeGuard capture_mode_guard; + auto result = cudaMalloc(ptr, size); +#endif + if (result == gpuSuccess) { + cur_size_.fetch_add(size); + STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); + return gpuSuccess; + } else { + RaiseNonOutOfMemoryError(&result); + // Non out of memory error would be raised inside + // RaiseNonOutOfMemoryError. Therefore, we can + // return cudaErrorMemoryAllocation directly here. + return gpuErrorOutOfMemory; + } + } + + /** + * Free gpu memory. Usually, free is not allowed to raise error. + * If it does raise error, the process should be crashed. + */ + void Free(void *ptr, size_t size) { + // Purposefully allow cudaErrorCudartUnloading, because + // that is returned if you ever call cudaFree after the + // driver has already shutdown. This happens only if the + // process is terminating, in which case we don't care if + // cudaFree succeeds. 
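    // On the normal free path below, cur_size_ is decremented and the
    // per-device memory stat updated, keeping RecordedSize() consistent with
    // what Malloc() previously added.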
+ CUDADeviceGuard guard(dev_id_); +#ifdef PADDLE_WITH_HIP + auto err = hipFree(ptr); + if (err != hipErrorDeinitialized) { +#else + auto err = cudaFree(ptr); + if (err != cudaErrorCudartUnloading) { +#endif + PADDLE_ENFORCE_GPU_SUCCESS(err); + cur_size_.fetch_sub(size); + STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); + } else { + platform::GpuGetLastError(); // clear the error flag when + // cudaErrorCudartUnloading / + // hipErrorDeinitialized + } + } + + bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail, + size_t *actual_total) { + { + CUDADeviceGuard guard(dev_id_); +#ifdef PADDLE_WITH_HIP + auto result = hipMemGetInfo(actual_avail, actual_total); +#else + auto result = cudaMemGetInfo(actual_avail, actual_total); +#endif + if (result != gpuSuccess) { + *actual_avail = 0; + } + RaiseNonOutOfMemoryError(&result); + } + + if (NeedRecord()) { + std::lock_guard guard(*mtx_); + *avail = std::min(*actual_avail, limit_size_ - cur_size_.load()); + *total = std::min(*actual_total, limit_size_); + return *total < *actual_total; + } else { + *avail = *actual_avail; + *total = *actual_total; + return false; + } + } + + inline bool NeedRecord() const { return limit_size_ != 0; } + + uint64_t RecordedSize() const { return cur_size_.load(); } + + uint64_t LimitSize() const { return limit_size_; } + +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10020 + CUresult MemCreate(CUmemGenericAllocationHandle *handle, size_t size, + const CUmemAllocationProp *prop, + unsigned long long flags) { // NOLINT + auto result = + paddle::platform::dynload::cuMemCreate(handle, size, prop, flags); + if (result == CUDA_SUCCESS) { + cur_size_.fetch_add(size); + } + return result; + } + + CUresult MemRelease(CUmemGenericAllocationHandle handle, size_t size) { + auto result = paddle::platform::dynload::cuMemRelease(handle); + if (result == CUDA_SUCCESS) { + cur_size_.fetch_sub(size); + } + return result; + } + +#endif +#endif + + private: + const int dev_id_; + const uint64_t limit_size_; + std::atomic cur_size_{0}; + + mutable std::unique_ptr mtx_; + + static std::once_flag once_flag_; + static std::vector> instances_; +}; // NOLINT + +std::once_flag RecordedGpuMallocHelper::once_flag_; +std::vector> + RecordedGpuMallocHelper::instances_; + +gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->Malloc(ptr, size); +} + +void RecordedGpuFree(void *p, size_t size, int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->Free(p, size); +} + +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10020 +CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, + const CUmemAllocationProp *prop, + unsigned long long flags, int dev_id) { // NOLINT + return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(handle, size, + prop, flags); +} + +CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle, size_t size, + int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->MemRelease(handle, size); +} +#endif +#endif + +bool RecordedGpuMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail, + size_t *actual_total, int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->GetMemInfo( + avail, total, actual_avail, actual_total); +} + +uint64_t RecordedGpuMallocSize(int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->RecordedSize(); +} + +bool IsGpuMallocRecorded(int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->NeedRecord(); +} + +void 
EmptyCache(void) { + std::vector devices = GetSelectedDevices(); + for (auto device : devices) { + memory::Release(CUDAPlace(device)); + } +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h similarity index 70% rename from paddle/fluid/platform/gpu_info.h rename to paddle/fluid/platform/device/gpu/gpu_info.h index 93e787fcf36f5..18e6ac83295f8 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/device/gpu/gpu_info.h @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -14,49 +11,42 @@ limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_CUDA -#include -#endif - -#ifdef PADDLE_WITH_HIP -#include -#endif - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -// Note: this header for simplify HIP and CUDA type string + #include #include #include -#include "paddle/fluid/platform/type_defs.h" + +#include "paddle/fluid/platform/device/gpu/gpu_types.h" namespace paddle { namespace platform { -//! Get the version of cudnn -int CudnnVersion(); +//! Get the version of dnn +int DnnVersion(); //! Get the total number of GPU devices in system. -int GetCUDADeviceCount(); +int GetGPUDeviceCount(); //! Get the compute capability of the ith GPU (format: major * 10 + minor) -int GetCUDAComputeCapability(int i); +int GetGPUComputeCapability(int id); //! Get the runtime version of the ith GPU -int GetCUDARuntimeVersion(int id); +int GetGPURuntimeVersion(int id); //! Get the driver version of the ith GPU -int GetCUDADriverVersion(int id); +int GetGPUDriverVersion(int id); //! Wheter the current device support TensorCore bool TensorCoreAvailable(); //! Get the MultiProcessors of the ith GPU. -int GetCUDAMultiProcessors(int i); +int GetGPUMultiProcessors(int id); //! Get the MaxThreads of each MultiProcessor of the ith GPU. -int GetCUDAMaxThreadsPerMultiProcessor(int i); +int GetGPUMaxThreadsPerMultiProcessor(int id); //! Get the MaxThreads of each block of the ith GPU. -int GetCUDAMaxThreadsPerBlock(int i); +int GetGPUMaxThreadsPerBlock(int id); //! Get the current GPU device id in system. int GetCurrentDeviceId(); @@ -97,19 +87,11 @@ size_t GpuMaxChunkSize(); //! Copy memory from address src to dst asynchronously. void GpuMemcpyAsync(void *dst, const void *src, size_t count, -#ifdef PADDLE_WITH_HIP - enum hipMemcpyKind kind, hipStream_t stream); -#else - enum cudaMemcpyKind kind, cudaStream_t stream); -#endif + gpuMemcpyKind kind, gpuStream_t stream); //! Copy memory from address src to dst synchronously. void GpuMemcpySync(void *dst, const void *src, size_t count, -#ifdef PADDLE_WITH_HIP - enum hipMemcpyKind kind); -#else - enum cudaMemcpyKind kind); -#endif + gpuMemcpyKind kind); //! Copy memory from one device to another device asynchronously. void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, @@ -125,34 +107,40 @@ void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream); //! Blocks until stream has completed all operations. 
void GpuStreamSync(gpuStream_t stream); +void GpuDestroyStream(gpuStream_t stream); + +// ! Blocks until device has completed all operations. +void GpuDeviceync(); + //! CudaMalloc with recorded info -gpuError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id); +gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id); //! CudaFree with recorded info -void RecordedCudaFree(void *p, size_t size, int dev_id); +void RecordedGpuFree(void *p, size_t size, int dev_id); + +gpuError_t GpuGetLastError(); #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10020 - //! cuMemCreate with recorded info -CUresult RecordedCuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, - const CUmemAllocationProp *prop, - unsigned long long flags, int dev_id); // NOLINT +CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, + const CUmemAllocationProp *prop, + unsigned long long flags, int dev_id); // NOLINT //! cuMemRelease with recorded info -CUresult RecordedCuMemRelease(CUmemGenericAllocationHandle handle, size_t size, - int dev_id); +CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle, size_t size, + int dev_id); #endif #endif //! Get available and total gpu memory with considering limitation -bool RecordedCudaMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail, - size_t *actual_total, int dev_id); +bool RecordedGpuMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail, + size_t *actual_total, int dev_id); //! Get recorded cudaMalloc size. If record is disabled, return 0. -uint64_t RecordedCudaMallocSize(int dev_id); +uint64_t RecordedGpuMallocSize(int dev_id); -bool IsCudaMallocRecorded(int dev_id); +bool IsGpuMallocRecorded(int dev_id); //! Empty idle cached memory held by the allocator. void EmptyCache(void); diff --git a/paddle/fluid/platform/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h similarity index 98% rename from paddle/fluid/platform/gpu_launch_config.h rename to paddle/fluid/platform/device/gpu/gpu_launch_config.h index 399f1dbaa03e1..55f4c8eb4cd55 100644 --- a/paddle/fluid/platform/gpu_launch_config.h +++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h @@ -28,6 +28,7 @@ #include #include #include +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h similarity index 100% rename from paddle/fluid/platform/cuda_primitives.h rename to paddle/fluid/platform/device/gpu/gpu_primitives.h diff --git a/paddle/fluid/platform/cuda_resource_pool.cc b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc similarity index 84% rename from paddle/fluid/platform/cuda_resource_pool.cc rename to paddle/fluid/platform/device/gpu/gpu_resource_pool.cc index 70d2ec5505798..2c55eb972b765 100644 --- a/paddle/fluid/platform/cuda_resource_pool.cc +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc @@ -13,24 +13,24 @@ // limitations under the License. 
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/cuda_resource_pool.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace platform { CudaStreamResourcePool::CudaStreamResourcePool() { - int dev_cnt = platform::GetCUDADeviceCount(); + int dev_cnt = platform::GetGPUDeviceCount(); pool_.reserve(dev_cnt); for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { auto creator = [dev_idx] { platform::SetDeviceId(dev_idx); gpuStream_t stream; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); #endif return stream; @@ -39,9 +39,9 @@ CudaStreamResourcePool::CudaStreamResourcePool() { auto deleter = [dev_idx](gpuStream_t stream) { platform::SetDeviceId(dev_idx); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); #endif }; @@ -69,17 +69,17 @@ std::shared_ptr CudaStreamResourcePool::New(int dev_idx) { } CudaEventResourcePool::CudaEventResourcePool() { - int dev_cnt = platform::GetCUDADeviceCount(); + int dev_cnt = platform::GetGPUDeviceCount(); pool_.reserve(dev_cnt); for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { auto creator = [dev_idx] { platform::SetDeviceId(dev_idx); gpuEvent_t event; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event, hipEventDisableTiming)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); #endif return event; @@ -88,9 +88,9 @@ CudaEventResourcePool::CudaEventResourcePool() { auto deleter = [dev_idx](gpuEvent_t event) { platform::SetDeviceId(dev_idx); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); #endif }; diff --git a/paddle/fluid/platform/cuda_resource_pool.h b/paddle/fluid/platform/device/gpu/gpu_resource_pool.h similarity index 100% rename from paddle/fluid/platform/cuda_resource_pool.h rename to paddle/fluid/platform/device/gpu/gpu_resource_pool.h diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h new file mode 100644 index 0000000000000..d7362fe9cbd81 --- /dev/null +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -0,0 +1,94 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
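CudaStreamResourcePool and CudaEventResourcePool above are built from one creator lambda and one deleter lambda per device, each of which switches to that device before creating or destroying the handle. Below is a self-contained sketch of that creator/deleter shape with plain C++ stand-ins (an int plays the role of the stream handle); the real pool additionally caches and reuses released objects.

```cpp
// Creator/deleter pattern sketch; the handle type and device count are mocks.
#include <functional>
#include <iostream>
#include <memory>
#include <vector>

template <typename T>
class ResourcePool {
 public:
  ResourcePool(std::function<T()> creator, std::function<void(T)> deleter)
      : creator_(std::move(creator)), deleter_(std::move(deleter)) {}

  // Hand out a shared_ptr whose custom deleter releases the resource.
  std::shared_ptr<T> New() {
    T* raw = new T(creator_());
    auto del = deleter_;
    return std::shared_ptr<T>(raw, [del](T* p) { del(*p); delete p; });
  }

 private:
  std::function<T()> creator_;
  std::function<void(T)> deleter_;
};

int main() {
  int dev_cnt = 2;  // GetGPUDeviceCount() in the real code
  std::vector<ResourcePool<int>> pools;
  for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
    pools.emplace_back(
        [dev_idx] { std::cout << "create handle on dev " << dev_idx << "\n"; return dev_idx * 100; },
        [dev_idx](int) { std::cout << "destroy handle on dev " << dev_idx << "\n"; });
  }
  auto s = pools[1].New();  // "stream" bound to device 1
  std::cout << "handle value: " << *s << "\n";
}  // the deleter for device 1 runs here
```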
+ +#pragma once + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifdef PADDLE_WITH_HIP +#include +#include "paddle/fluid/platform/dynload/miopen.h" +#include "paddle/fluid/platform/dynload/rocblas.h" + +#else +#include +#include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/fluid/platform/dynload/cudnn.h" +#endif + +namespace paddle { + +#ifdef PADDLE_WITH_HIP +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = ROCM_TYPE; +#else // CDUA + +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = CUDA_TYPE; +#endif + +DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t); +DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t); +DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t); +DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind); +DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t); + +DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t); +DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, cudnnActivationStruct, + miopenActivationDescriptor); +DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, cudnnActivationMode_t, + miopenActivationMode_t); +DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor, cudnnTensorStruct, + miopenTensorDescriptor); +DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, cudnnTensorFormat_t, + miopenTensorFormat_t); +DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor, cudnnFilterStruct, + miopenTensorDescriptor); +DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t, cudnnFilterDescriptor_t, + miopenTensorDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor, cudnnConvolutionStruct, + miopenConvolutionDescriptor); +DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t, cudnnConvolutionDescriptor_t, + miopenConvolutionDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, cudnnPoolingDescriptor_t, + miopenPoolingDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); +DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, cudnnDropoutDescriptor_t, + miopenDropoutDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); + +DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); + +using CUDAGraphID = unsigned long long; // NOLINT + +#undef DECLARE_TYPE_FOR_GPU + +#ifdef PADDLE_WITH_HIP +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ + constexpr auto GPU_CV = ROCM_CV; +#else // CDUA + +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ + constexpr auto GPU_CV = CUDA_CV; +#endif + +DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, cudaErrorMemoryAllocation, + hipErrorOutOfMemory); +DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); +DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); + +#undef DECLARE_CONSTANT_FOR_GPU +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h similarity index 99% rename from paddle/fluid/platform/nccl_helper.h rename to paddle/fluid/platform/device/gpu/nccl_helper.h index e297e7203c698..f26116749077e 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -70,11 +70,11 @@ class NCCLGroupGuard { inline NCCLGroupGuard() { NCCLMutex().lock(); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupStart()); } inline ~NCCLGroupGuard() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd()); + 
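The new gpu_types.h above centralizes the CUDA-versus-ROCm type selection in the DECLARE_TYPE_FOR_GPU macro so the rest of the code base can be written once against gpuStream_t, dnnHandle_t and friends. The sketch below reproduces only that selection pattern with toy stand-in types; it is not the Paddle header itself.

```cpp
// Backend-neutral type aliasing sketch; the two vendor stream types are mocks.
#include <iostream>

struct cudaStreamImpl {};  using cudaStream_t = cudaStreamImpl*;  // stand-in for the CUDA SDK type
struct hipStreamImpl {};   using hipStream_t = hipStreamImpl*;    // stand-in for the HIP SDK type

#ifdef PADDLE_WITH_HIP
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
  using GPU_TYPE = ROCM_TYPE;
#else  // CUDA build
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
  using GPU_TYPE = CUDA_TYPE;
#endif

DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t);

#undef DECLARE_TYPE_FOR_GPU

// Everything below is backend-neutral: it only ever names gpuStream_t.
void Launch(gpuStream_t stream) {
  std::cout << "launch on stream " << static_cast<const void*>(stream) << "\n";
}

int main() {
  gpuStream_t s = nullptr;
  Launch(s);
}
```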
PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupEnd()); NCCLMutex().unlock(); } }; diff --git a/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt b/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt new file mode 100644 index 0000000000000..86b9ecd5f5445 --- /dev/null +++ b/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt @@ -0,0 +1,3 @@ +hip_library(rocm_info SRCS rocm_info.cc DEPS gflags glog enforce monitor dynload_cuda) + +hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda) diff --git a/paddle/fluid/platform/miopen_desc.h b/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h similarity index 88% rename from paddle/fluid/platform/miopen_desc.h rename to paddle/fluid/platform/device/gpu/rocm/miopen_desc.h index c82e61ceb122c..d2389ba409e5e 100644 --- a/paddle/fluid/platform/miopen_desc.h +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h @@ -23,8 +23,8 @@ #include #include +#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/miopen_helper.h" namespace paddle { namespace framework { @@ -88,7 +88,7 @@ class ActivationDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenDestroyActivationDescriptor(t)); t = nullptr; } @@ -96,13 +96,13 @@ class ActivationDescriptor { }; ActivationDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenCreateActivationDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } template void set(miopenActivationMode_t mode, const T& coef) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetActivationDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetActivationDescriptor( desc_.get(), mode, static_cast(coef), 0.0, 0.0)); } @@ -119,15 +119,14 @@ class TensorDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(t)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyTensorDescriptor(t)); t = nullptr; } } }; TensorDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::miopenCreateTensorDescriptor(&raw_ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreateTensorDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } T* desc() { return desc_.get(); } @@ -144,7 +143,7 @@ class TensorDescriptor { if (groups > 1) { dims_with_group[1] = dims_with_group[1] / groups; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( (miopenTensorDescriptor_t)(desc_.get()), ToCudnnDataType(tensor.type()), static_cast(dims_with_group.size()), const_cast(dims_with_group.data()), @@ -166,7 +165,7 @@ class TensorDescriptor { if (groups > 1) { dims_with_group[1] = dims_with_group[1] / groups; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( (miopenTensorDescriptor_t)(desc_.get()), ToCudnnDataType(tensor.type()), static_cast(dims_with_group.size()), const_cast(dims_with_group.data()), @@ -183,15 +182,14 @@ class FilterDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(t)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyTensorDescriptor(t)); t = nullptr; } } }; FilterDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::miopenCreateTensorDescriptor(&raw_ptr)); + 
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreateTensorDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } T* desc() { return desc_.get(); } @@ -212,7 +210,7 @@ class FilterDescriptor { if (groups > 1) { dims_with_group[1] = dims_with_group[1] / groups; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( (miopenTensorDescriptor_t)(desc_.get()), ToCudnnDataType(tensor.type()), static_cast(dims_with_group.size()), const_cast(dims_with_group.data()), @@ -229,7 +227,7 @@ class ConvolutionDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenDestroyConvolutionDescriptor(t)); t = nullptr; } @@ -237,7 +235,7 @@ class ConvolutionDescriptor { }; ConvolutionDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenCreateConvolutionDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } @@ -247,12 +245,12 @@ class ConvolutionDescriptor { void set(miopenDataType_t dtype, const std::vector& pads, const std::vector& strides, const std::vector& dilations, bool allow_tf32, const int groups = 1) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenInitConvolutionNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenInitConvolutionNdDescriptor( (miopenConvolutionDescriptor_t)desc_.get(), static_cast(pads.size()), const_cast(pads.data()), const_cast(strides.data()), const_cast(dilations.data()), miopenConvolution)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenSetConvolutionGroupCount( (miopenConvolutionDescriptor_t)desc_.get(), groups)); } diff --git a/paddle/fluid/platform/miopen_helper.h b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h similarity index 89% rename from paddle/fluid/platform/miopen_helper.h rename to paddle/fluid/platform/device/gpu/rocm/miopen_helper.h index 46c7da8397041..bd8d05f8124a1 100644 --- a/paddle/fluid/platform/miopen_helper.h +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h @@ -18,6 +18,7 @@ limitations under the License. 
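The descriptor wrappers in miopen_desc.h above all follow the same RAII idiom: a C-style create/destroy pair is held in a std::unique_ptr with a custom Deleter, so each MIOpen handle is released exactly once even when an enforce macro throws. A self-contained sketch of the idiom, with a fake C API standing in for the MIOpen calls:

```cpp
// unique_ptr + custom Deleter around a C-style create/destroy handle API.
#include <cstdio>
#include <memory>

struct fakeDescriptor { int id; };
int fakeCreateDescriptor(fakeDescriptor** out) { *out = new fakeDescriptor{42}; return 0; }
int fakeDestroyDescriptor(fakeDescriptor* d) { std::puts("descriptor destroyed"); delete d; return 0; }

class DescriptorHolder {
 public:
  struct Deleter {
    void operator()(fakeDescriptor* t) const {
      if (t != nullptr) fakeDestroyDescriptor(t);  // PADDLE_ENFORCE_GPU_SUCCESS(...) in the real code
    }
  };

  DescriptorHolder() {
    fakeDescriptor* raw_ptr = nullptr;
    fakeCreateDescriptor(&raw_ptr);  // also checked with PADDLE_ENFORCE_GPU_SUCCESS in the real code
    desc_.reset(raw_ptr);
  }

  fakeDescriptor* desc() { return desc_.get(); }

 private:
  std::unique_ptr<fakeDescriptor, Deleter> desc_;
};

int main() {
  DescriptorHolder holder;
  std::printf("descriptor id: %d\n", holder.desc()->id);
}  // "descriptor destroyed" prints here, when holder leaves scope
```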
*/ #include #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" @@ -36,13 +37,6 @@ DECLARE_bool(cudnn_deterministic); namespace paddle { namespace platform { - -// MIOPEN only support NCHW, just for compatibility with CUDNN API -typedef enum { - MIOPEN_TENSOR_NCHW = 0, - MIOPEN_TENSOR_NHWC = 1, -} miopenTensorFormat_t; - inline const char* miopenGetErrorString(miopenStatus_t status) { switch (status) { case miopenStatusSuccess: @@ -188,10 +182,10 @@ inline miopenTensorFormat_t GetCudnnTensorFormat(const DataLayout& order) { class ScopedTensorDescriptor { public: ScopedTensorDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_)); } ~ScopedTensorDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_)); } inline miopenTensorDescriptor_t descriptor(const miopenTensorFormat_t format, @@ -216,12 +210,12 @@ class ScopedTensorDescriptor { platform::errors::InvalidArgument( "format should ONLY be NCHW in MIOPEN.")); if (dims.size() == 4) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( desc_, type, dims_with_group.size(), const_cast(dims_with_group.data()), const_cast(strides.data()))); } else if (dims.size() == 5) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( desc_, type, dims_with_group.size(), const_cast(dims_with_group.data()), const_cast(strides.data()))); @@ -240,7 +234,7 @@ class ScopedTensorDescriptor { inline miopenTensorDescriptor_t descriptor(const miopenDataType_t miopen_type, const std::vector& dim, const std::vector& stride) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( desc_, miopen_type, dim.size(), const_cast(dim.data()), const_cast(stride.data()))); return desc_; @@ -262,10 +256,10 @@ class ScopedTensorDescriptor { class ScopedDropoutDescriptor { public: ScopedDropoutDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateDropoutDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreateDropoutDescriptor(&desc_)); } ~ScopedDropoutDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyDropoutDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyDropoutDescriptor(desc_)); } inline miopenDropoutDescriptor_t descriptor(const miopenHandle_t& handle, @@ -275,20 +269,20 @@ class ScopedDropoutDescriptor { framework::Tensor* dropout_state_, int seed, size_t state_size) { if (dropout_state_ == nullptr) { // for no dropout or test - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetDropoutDescriptor( desc_, handle, 0 /* dropout */, nullptr, 0 /* state_size */, 0 /* seed */, false, false, MIOPEN_RNG_PSEUDO_XORWOW)); return desc_; } auto* dropout_state_data = dropout_state_->data(); if (!initialized) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetDropoutDescriptor( desc_, handle, dropout_prob_, dropout_state_data, 
state_size, seed, false, false, MIOPEN_RNG_PSEUDO_XORWOW)); } else { auto dropout_state_dims = dropout_state_->dims(); state_size = dropout_state_dims[0]; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenRestoreDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenRestoreDropoutDescriptor( desc_, handle, dropout_prob_, dropout_state_data, state_size, 0, false, false, MIOPEN_RNG_PSEUDO_XORWOW)); } @@ -304,10 +298,10 @@ class ScopedDropoutDescriptor { class ScopedRNNDescriptor { public: ScopedRNNDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateRNNDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreateRNNDescriptor(&desc_)); } ~ScopedRNNDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyRNNDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyRNNDescriptor(desc_)); } inline miopenRNNDescriptor_t desc() { return desc_; } @@ -320,10 +314,10 @@ class ScopedRNNDescriptor { class ScopedFilterDescriptor { public: ScopedFilterDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_)); } ~ScopedFilterDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_)); } inline miopenTensorDescriptor_t descriptor(const miopenTensorFormat_t format, @@ -344,7 +338,7 @@ class ScopedFilterDescriptor { for (int k = kernel_with_group.size() - 2; k >= 0; k--) { stride_dim[k] = stride_dim[k + 1] * kernel_with_group[k + 1]; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( desc_, type, kernel_with_group.size(), const_cast(kernel_with_group.data()), const_cast(stride_dim.data()))); @@ -369,11 +363,11 @@ class ScopedFilterDescriptor { class ScopedConvolutionDescriptor { public: ScopedConvolutionDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenCreateConvolutionDescriptor(&desc_)); } ~ScopedConvolutionDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenDestroyConvolutionDescriptor(desc_)); } @@ -391,7 +385,7 @@ class ScopedConvolutionDescriptor { "The size of pads and dilations should be equal. But received size " "of pads is %d, size of dilations is %d.", pads.size(), dilations.size())); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenInitConvolutionNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenInitConvolutionNdDescriptor( desc_, pads.size(), const_cast(pads.data()), const_cast(strides.data()), const_cast(dilations.data()), miopenConvolution)); @@ -413,10 +407,10 @@ class ScopedConvolutionDescriptor { class ScopedPoolingDescriptor { public: ScopedPoolingDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreatePoolingDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreatePoolingDescriptor(&desc_)); } ~ScopedPoolingDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyPoolingDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyPoolingDescriptor(desc_)); } inline miopenPoolingDescriptor_t descriptor(const PoolingMode& mode, @@ -434,7 +428,7 @@ class ScopedPoolingDescriptor { "The size of kernel and strides should be equal. 
But " "received size of kernel is %d, size of strides is %d.", kernel.size(), strides.size())); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetNdPoolingDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetNdPoolingDescriptor( desc_, GetPoolingMode(mode), kernel.size(), const_cast(kernel.data()), const_cast(pads.data()), const_cast(strides.data()))); @@ -449,11 +443,11 @@ class ScopedPoolingDescriptor { class ScopedActivationDescriptor { public: ScopedActivationDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenCreateActivationDescriptor(&desc_)); } ~ScopedActivationDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenDestroyActivationDescriptor(desc_)); } @@ -489,7 +483,7 @@ class ScopedActivationDescriptor { "Unrecognized MIOPEN activation mode: %d.", static_cast(activation_mode))); } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetActivationDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetActivationDescriptor( desc_, mode, relu_ceiling, 0.0, 0.0)); return desc_; } @@ -514,15 +508,15 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { class ScopedCTCLossDescriptor { public: ScopedCTCLossDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateCTCLossDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreateCTCLossDescriptor(&desc_)); } ~ScopedCTCLossDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyCTCLossDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyCTCLossDescriptor(desc_)); } template inline miopenCTCLossDescriptor_t descriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetCTCLossDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetCTCLossDescriptor( desc_, CudnnDataType::type, 0, false)); return desc_; } diff --git a/paddle/fluid/platform/miopen_helper_test.cc b/paddle/fluid/platform/device/gpu/rocm/miopen_helper_test.cc similarity index 98% rename from paddle/fluid/platform/miopen_helper_test.cc rename to paddle/fluid/platform/device/gpu/rocm/miopen_helper_test.cc index e201f4893f577..13cf52dc2c6a3 100644 --- a/paddle/fluid/platform/miopen_helper_test.cc +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_helper_test.cc @@ -15,7 +15,7 @@ limitations under the License. */ #define GLOG_NO_ABBREVIATED_SEVERITIES #define GOOGLE_GLOG_DLL_DECL -#include "paddle/fluid/platform/miopen_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h new file mode 100644 index 0000000000000..2263383f8fabb --- /dev/null +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h @@ -0,0 +1,160 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +// NOTE(): support float16 to half in header file. 
+#define PADDLE_CUDA_FP16 +#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace platform { + +#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate)) + +inline static int RoundToPowerOfTwo(int dim) { + // HIP results in error or nan if > 256 + if (dim > 128) { + return 256; + } else if (dim > 64) { + return 128; + } else if (dim > 32) { + return 64; + } else { + return 32; + } +} + +#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ + case (dim): { \ + constexpr auto kPowerOfTwoDim = (dim); \ + __VA_ARGS__; \ + } break + +#define CUDA_LAUNCH_KERNEL_HELPER(...) \ + CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); + +template +__forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, + int delta, + int width = warpSize) { + return __shfl_down(val, delta, width); +} + +template +__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val, + int width = warpSize) { + return __shfl_xor(val, width); +} + +template <> +__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, + float16 val, int delta, + int width) { + return float16(__shfl_down(static_cast(val), + static_cast(delta), width)); +} + +template <> +__forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( + unsigned mask, paddle::platform::complex val, int delta, int width) { + float real = __shfl_down(val.real, delta, width); + float imag = __shfl_down(val.imag, delta, width); + return paddle::platform::complex(real, imag); +} + +template <> +__forceinline__ __device__ paddle::platform::complex +CudaShuffleDownSync(unsigned mask, paddle::platform::complex val, + int delta, int width) { + double real = __shfl_down(val.real, delta, width); + double imag = __shfl_down(val.imag, delta, width); + return paddle::platform::complex(real, imag); +} + +template <> +__forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, + float16 val, int width) { + return float16(__shfl_xor(static_cast(val), width)); +} + +template <> +__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( + unsigned mask, paddle::platform::complex val, int width) { + float real = __shfl_xor(val.real, width); + float imag = __shfl_xor(val.imag, width); + return paddle::platform::complex(real, imag); +} + +template <> +__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( + unsigned mask, paddle::platform::complex val, int width) { + double real = __shfl_xor(val.real, width); + double imag = __shfl_xor(val.imag, width); + return paddle::platform::complex(real, imag); +} + +template +__forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line, + int width = 32) { + return __shfl(val, src_line, width); +} + +template +HOSTDEVICE T Infinity() { + return INFINITY; +} + +template +__device__ T reduceSum(T val, int tid, int len) { + // NOTE(zcd): The warp size should be taken from the + // parameters of the GPU but not specified as 32 simply. + // To make the reduceSum more efficiently, + // I use Warp-Level Parallelism and assume the Warp size + // is 32 which may be different for different GPU, + // but most card's warp size is 32. 
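RoundToPowerOfTwo above (the ROCm variant) rounds a requested dimension up to one of {32, 64, 128, 256}, with the 256 cap noted in its comment. A quick host-side check of that mapping, duplicated here purely for illustration:

```cpp
// Host-side check of the ROCm RoundToPowerOfTwo mapping shown above.
#include <cstdio>

int RoundToPowerOfTwo(int dim) {
  if (dim > 128) return 256;  // HIP cap
  if (dim > 64) return 128;
  if (dim > 32) return 64;
  return 32;
}

int main() {
  const int samples[] = {1, 32, 33, 100, 128, 129, 500};
  for (int d : samples) {
    std::printf("dim=%3d -> %d\n", d, RoundToPowerOfTwo(d));
  }
  // Prints 32, 32, 64, 128, 128, 256, 256 respectively.
}
```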
+ const int warpSize = 32; + __shared__ T shm[warpSize]; + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, tid < len); + + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += platform::CudaShuffleDownSync(mask, val, offset); + + if (tid < warpSize) shm[tid] = 0; + __syncthreads(); + + if (tid % warpSize == 0) { + shm[tid / warpSize] = val; + } + __syncthreads(); + + CREATE_SHFL_MASK(mask, tid < warpSize); + + if (tid < warpSize) { + val = shm[tid]; + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += platform::CudaShuffleDownSync(mask, val, offset); + } + return val; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h b/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h new file mode 100644 index 0000000000000..a0f3fb0f73ba5 --- /dev/null +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h @@ -0,0 +1,102 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include // NOLINT + +#include "paddle/fluid/platform/dynload/rocblas.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace platform { + +/* + * Summary: Grid stride looping macro in CUDA kernel + * + * [ Why need this macro? ] + * + * The original looping in CUDA kernel is: + * + * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + * i += blockDim.x * gridDim.x)` + * + * This for condition is risky. The value of `blockIdx.x * blockDim.x` + * may be large, such as over 1GB, the first iteration is no problem here, + * but when `i += blockDim.x * gridDim.x` is executed, the value of i + * will greater than INT_MAX and overflow becomes negative value, at + * this time, the cycle condition `i < (n)` is still satisfied, so it + * will cause illegal access to cuda memory. + * + * Here is a real example in ERINE, it will trigger above error. + * The related data are: + * - blockIdx.x = 2172938 + * - blockDim.x = 512 + * - blockIdx.x * blockDim.x = 1112543864 + * - INT_MAX = 2147483647 + * + * So we polish the for condition as follow, the int64_t __index__ will + * prevent overflow in the loop increment. 
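The arithmetic behind the overflow warning above, done on the host in 64-bit where it is well defined: the blockIdx.x and blockDim.x values are the ones quoted in the comment, while the gridDim.x value is an assumed figure of similar magnitude.

```cpp
// Why the grid-stride index must be 64-bit: after one stride the index can
// exceed INT_MAX. gridDim.x below is an assumption chosen for illustration.
#include <climits>
#include <cstdint>
#include <cstdio>

int main() {
  const std::int64_t block_idx = 2172938, block_dim = 512, grid_dim = 2200000;
  const std::int64_t first = block_idx * block_dim;        // initial __index__
  const std::int64_t next = first + block_dim * grid_dim;  // after one grid-stride step
  std::printf("first index:      %lld\n", static_cast<long long>(first));
  std::printf("after one stride: %lld\n", static_cast<long long>(next));
  std::printf("INT_MAX:          %d\n", INT_MAX);
  std::printf("int32 would overflow after the first stride: %s\n",
              next > INT_MAX ? "yes" : "no");
}
```

Keeping __index__ as int64_t, as the CUDA_KERNEL_LOOP_TYPE macro below does, keeps both the comparison against num and the increment valid past INT_MAX.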
+ * + * Parameters: + * - i: loop index + * - num: total element numbers + * + * Examples: + * template + * __global__ void Scale(T* logit_grad, const T* loss_grad, const int num, + * const int d, const int remain) { + * CUDA_KERNEL_LOOP(index, num) { + * int idx_n = index / d; + * int idx_remain = index % remain; + * logit_grad[index] *= loss_grad[idx_n * remain + idx_remain]; + * } + * } + * +*/ + +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += hipBlockDim_x * hipGridDim_x, i = __index__) + +class CublasHandleHolder { + public: + explicit CublasHandleHolder(hipStream_t stream) { + PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_create_handle(&handle_)); + PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_set_stream(handle_, stream)); + } + + const rocblas_handle& GetCublasHandle() const { return handle_; } + + ~CublasHandleHolder() PADDLE_MAY_THROW { + PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_destroy_handle(handle_)); + } + + template + inline void Call(Callback&& callback) const { + std::lock_guard guard(mtx_); + callback(handle_); + } + + private: + DISABLE_COPY_AND_ASSIGN(CublasHandleHolder); + + rocblas_handle handle_; + mutable std::mutex mtx_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_info.cc b/paddle/fluid/platform/device/gpu/rocm/rocm_info.cc new file mode 100644 index 0000000000000..06dba8ce423ef --- /dev/null +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_info.cc @@ -0,0 +1,269 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/lock_guard_ptr.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" + +static std::once_flag g_device_props_size_init_flag; +static std::vector> g_device_props_init_flags; +static std::vector g_device_props; + +namespace paddle { +namespace platform { +int DnnVersion() { + if (!dynload::HasCUDNN()) return -1; + size_t version_major, version_minor, version_patch; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( + &version_major, &version_minor, &version_patch)); + return version_major * 100 + version_minor * 10 + version_patch; +} + +static int GetGPUDeviceCountImpl() { + int driverVersion = 0; + hipError_t status = hipDriverGetVersion(&driverVersion); + + if (!(status == gpuSuccess && driverVersion != 0)) { + // No GPU driver + VLOG(2) << "GPU Driver Version can't be detected. 
No GPU driver!"; + return 0; + } + + const auto *cuda_visible_devices = std::getenv("HIP_VISIBLE_DEVICES"); + + if (cuda_visible_devices != nullptr) { + std::string cuda_visible_devices_str(cuda_visible_devices); + if (!cuda_visible_devices_str.empty()) { + cuda_visible_devices_str.erase( + 0, cuda_visible_devices_str.find_first_not_of('\'')); + cuda_visible_devices_str.erase( + cuda_visible_devices_str.find_last_not_of('\'') + 1); + cuda_visible_devices_str.erase( + 0, cuda_visible_devices_str.find_first_not_of('\"')); + cuda_visible_devices_str.erase( + cuda_visible_devices_str.find_last_not_of('\"') + 1); + } + if (std::all_of(cuda_visible_devices_str.begin(), + cuda_visible_devices_str.end(), + [](char ch) { return ch == ' '; })) { + VLOG(2) << "HIP_VISIBLE_DEVICES is set to be " + "empty. No GPU detected."; + return 0; + } + } + int count; + PADDLE_ENFORCE_GPU_SUCCESS(hipGetDeviceCount(&count)); + return count; +} + +int GetGPUDeviceCount() { + // cache the count + static auto dev_cnt = GetGPUDeviceCountImpl(); + return dev_cnt; +} + +int GetGPUComputeCapability(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int major, minor; + auto major_error_code = hipDeviceGetAttribute( + &major, hipDeviceAttributeComputeCapabilityMajor, id); + auto minor_error_code = hipDeviceGetAttribute( + &minor, hipDeviceAttributeComputeCapabilityMinor, id); + + PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); + PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); + return major * 100 + minor; +} + +int GetGPURuntimeVersion(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int runtime_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(hipRuntimeGetVersion(&runtime_version)); + return runtime_version; +} + +int GetGPUDriverVersion(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int driver_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(hipDriverGetVersion(&driver_version)); + return driver_version; +} + +bool TensorCoreAvailable() { return false; } + +int GetGPUMultiProcessors(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, id)); + return count; +} + +int GetGPUMaxThreadsPerMultiProcessor(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute( + &count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id)); + + return count; +} + +int GetGPUMaxThreadsPerBlock(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + hipDeviceGetAttribute(&count, hipDeviceAttributeMaxThreadsPerBlock, id)); + return count; +} + +int GetCurrentDeviceId() { + int device_id; + PADDLE_ENFORCE_GPU_SUCCESS(hipGetDevice(&device_id)); + return device_id; +} + +dim3 GetGpuMaxGridDimSize(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + dim3 ret; + int size; + auto error_code_x = + hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimX, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); + ret.x = size; + + auto error_code_y = + hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimY, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); + ret.y = size; + + auto error_code_z = + hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimZ, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); + ret.z = size; + return ret; +} + +const gpuDeviceProp &GetDeviceProperties(int id) { + std::call_once(g_device_props_size_init_flag, [&] { + int gpu_num = 0; + gpu_num = platform::GetGPUDeviceCount(); + g_device_props_init_flags.resize(gpu_num); + g_device_props.resize(gpu_num); + for (int i = 0; i < gpu_num; ++i) { + g_device_props_init_flags[i] = std::make_unique(); + } + }); + + if (id == -1) { + id = platform::GetCurrentDeviceId(); + } + + if (id < 0 || id >= static_cast(g_device_props.size())) { + PADDLE_THROW(platform::errors::OutOfRange( + "The device id %d is out of range [0, %d), where %d is the number of " + "devices on this machine. Because the device id should be greater than " + "or equal to zero and smaller than the number of gpus. Please input " + "appropriate device again!", + id, static_cast(g_device_props.size()), + static_cast(g_device_props.size()))); + } + + std::call_once(*(g_device_props_init_flags[id]), [&] { + PADDLE_ENFORCE_GPU_SUCCESS(hipGetDeviceProperties(&g_device_props[id], id)); + }); + + return g_device_props[id]; +} + +void SetDeviceId(int id) { + // TODO(qijun): find a better way to cache the cuda device count + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, GetGPUDeviceCount())); + PADDLE_RETRY_CUDA_SUCCESS(hipSetDevice(id)); +} + +void GpuMemcpyAsync(void *dst, const void *src, size_t count, + gpuMemcpyKind kind, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(dst, src, count, kind, stream)); +} + +void GpuMemcpySync(void *dst, const void *src, size_t count, + gpuMemcpyKind kind) { + PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(dst, src, count, kind)); +} + +void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, + int src_device, size_t count, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); +} + +void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, + int src_device, size_t count) { + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemcpyPeer(dst, dst_device, src, src_device, count)); +} + +void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(hipMemsetAsync(dst, value, count, stream)); +} + +void GpuStreamSync(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream)); +} + +void GpuDestroyStream(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); +} + +void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); } + +gpuError_t GpuGetLastError() { return hipGetLastError(); } +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt new file mode 100644 index 0000000000000..25629ba74d915 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt @@ -0,0 +1,12 @@ +# IPU +IF(WITH_IPU) + cc_library(ipu_device SRCS device.cc DEPS enforce popart) + cc_library(ipu_utils SRCS ipu_utils.cc DEPS memory framework_proto popart) + cc_library(ipu_strategy SRCS ipu_strategy.cc DEPS popart graph framework_proto enforce) + cc_library(ipu_optimizer SRCS ipu_optimizer.cc DEPS popart enforce) + cc_library(ipu_executor SRCS ipu_executor.cc DEPS ipu_optimizer ipu_utils popart graph framework_proto) + cc_library(popart_canonicalization_utils SRCS ${POPART_CANONICALIZATION_SRC} DEPS framework_proto enforce ipu_utils) + cc_library(ipu_compiler SRCS ipu_compiler.cc DEPS popart graph ipu_utils graph_helper) + cc_library(ipu_backend SRCS ipu_backend.cc DEPS popart ipu_compiler graph framework_proto enforce ipu_utils ipu_strategy ipu_device ipu_executor graph_helper) + cc_library(ipu_info SRCS ipu_info.cc DEPS ipu_backend) +ENDIF() diff --git a/paddle/fluid/platform/device/ipu/common.h b/paddle/fluid/platform/device/ipu/common.h new file mode 100644 index 0000000000000..7d62f10abd201 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/common.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
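GetDeviceProperties() in rocm_info.cc above fills its per-device cache lazily: one global std::once_flag sizes the vectors, and a per-device once_flag guards the single hipGetDeviceProperties call for each id. A minimal stand-alone sketch of that two-level call_once layering; FakeDeviceProp and GetDeviceCount are stand-ins for the real driver calls.

```cpp
// Lazy, thread-safe per-device property cache using std::call_once twice.
#include <cstdio>
#include <memory>
#include <mutex>
#include <vector>

struct FakeDeviceProp { int major = 9, minor = 0; };  // stand-in for hipDeviceProp_t

static std::once_flag g_size_init_flag;
static std::vector<std::unique_ptr<std::once_flag>> g_init_flags;
static std::vector<FakeDeviceProp> g_props;

int GetDeviceCount() { return 2; }  // stand-in for GetGPUDeviceCount()

const FakeDeviceProp& GetDeviceProperties(int id) {
  std::call_once(g_size_init_flag, [] {
    const int n = GetDeviceCount();
    g_init_flags.resize(n);
    g_props.resize(n);
    for (int i = 0; i < n; ++i) g_init_flags[i] = std::make_unique<std::once_flag>();
  });
  std::call_once(*g_init_flags[id], [&] {
    std::printf("querying driver for device %d (happens once)\n", id);
    g_props[id] = FakeDeviceProp{};  // hipGetDeviceProperties(...) in the real code
  });
  return g_props[id];
}

int main() {
  const FakeDeviceProp& p0 = GetDeviceProperties(0);
  GetDeviceProperties(0);  // cached: no second "querying driver" line for device 0
  std::printf("device 0 capability: %d.%d\n", p0.major, p0.minor);
  GetDeviceProperties(1);  // first query for device 1
}
```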
*/ + +#pragma once + +#include + +namespace paddle { +namespace platform { +namespace ipu { + +static constexpr const char *sIpuIndexAttr = "ipu_index"; +static constexpr const char *sIpuStageAttr = "ipu_stage"; +static constexpr const char *sOpIdentifyIdAttr = "op_identify_id"; +static constexpr const char *sDebugInfoId = "__debug_info_id"; + +static constexpr const char *sBeta1 = "beta1"; +static constexpr const char *sBeta2 = "beta2"; +static constexpr const char *sBeta1Pow = "Beta1Pow"; +static constexpr const char *sBeta2Pow = "Beta2Pow"; + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/device.cc b/paddle/fluid/platform/device/ipu/device.cc new file mode 100644 index 0000000000000..47e6475089d3f --- /dev/null +++ b/paddle/fluid/platform/device/ipu/device.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/device/ipu/device.h" + +namespace paddle { +namespace platform { +namespace ipu { + +Device::Device(const popart::DeviceInfo& device_info) + : id_(device_info.getId()), is_attached_(device_info.isAttached()) { + popart::DeviceType popart_device_type = device_info.getType(); + switch (popart_device_type) { + case popart::DeviceType::IpuModel: + device_type_ = DeviceType::IpuModel; + break; + case popart::DeviceType::Ipu: + device_type_ = DeviceType::Ipu; + break; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "popart::DeviceType:Unsupported type %d", popart_device_type)); + } +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/device.h b/paddle/fluid/platform/device/ipu/device.h new file mode 100644 index 0000000000000..24a8bdec3087c --- /dev/null +++ b/paddle/fluid/platform/device/ipu/device.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
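device.cc above translates popart::DeviceType into the framework's own ipu::DeviceType with an explicit switch and rejects anything it does not model yet. A toy analogue of that translation, with a stand-in vendor enum in place of the PopART one:

```cpp
// Vendor-enum to framework-enum translation with an explicit failure path.
#include <cstdio>
#include <stdexcept>

enum class VendorDeviceType { IpuModel, Ipu, Sim };              // stand-in for popart::DeviceType
enum class DeviceType { IpuModel = 0, Cpu, Ipu, OfflineIpu, Sim };

DeviceType Translate(VendorDeviceType t) {
  switch (t) {
    case VendorDeviceType::IpuModel: return DeviceType::IpuModel;
    case VendorDeviceType::Ipu:      return DeviceType::Ipu;
    default:
      throw std::invalid_argument("unsupported vendor device type");
  }
}

int main() {
  std::printf("translated to %d\n", static_cast<int>(Translate(VendorDeviceType::Ipu)));
}
```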
*/ + +#pragma once + +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { + +enum class DeviceType { IpuModel = 0, Cpu, Ipu, OfflineIpu, Sim }; + +class Device { + public: + Device() {} + explicit Device(const popart::DeviceInfo& device_info); + + int getId() const { return id_; } + bool isAttached() const { return is_attached_; } + DeviceType getType() const { return device_type_; } + + private: + int id_; + bool is_attached_; + DeviceType device_type_; + /* TODO:: Add more elements in the future */ +}; + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.cc b/paddle/fluid/platform/device/ipu/ipu_backend.cc new file mode 100644 index 0000000000000..cd0f5ae554cb4 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_backend.cc @@ -0,0 +1,195 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/ipu/ipu_backend.h" +#include "paddle/fluid/platform/ipu/ipu_utils.h" + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/node.h" + +namespace paddle { +namespace platform { +namespace ipu { + +std::shared_ptr IpuBackend::instance_ = nullptr; + +IpuBackend::IpuBackend() { + compiler_ = std::make_shared(); + executor_ = std::make_unique(); +} + +void IpuBackend::Clear() { + executor_.reset(); + // detach device + if (device_ != nullptr && device_->isAttached()) { + device_->detach(); + device_.reset(); + device_ = nullptr; + } +} + +IpuBackend::~IpuBackend() { Clear(); } + +std::shared_ptr IpuBackend::GetInstance() { + if (!instance_) { + instance_.reset(new IpuBackend()); + } + return instance_; +} + +// This api should only call from python, always return a new object +std::shared_ptr IpuBackend::GetNewInstance() { + instance_.reset(new IpuBackend()); + return instance_; +} + +void IpuBackend::Compile(framework::ir::Graph* graph, + const std::vector& feed_list, + const std::vector& fetch_list) { + VLOG(10) << "enter IpuBackend::Compile"; + compiler_->InitInputs(graph, feed_list); + compiler_->LowerWeights(graph, scope_); + compiler_->LowerBody(graph); + compiler_->InitOutputs(fetch_list); + executor_->SetWeights(compiler_->GetWeights()); + VLOG(10) << "leave IpuBackend::Compile"; +} + +void IpuBackend::Run(const std::vector& inputs, + const std::vector& outputs, + const framework::ExecutionContext& ctx) { + Prepare(); + auto inputs_id = compiler_->GetInputs(); + auto outputs_id = compiler_->GetOutputs(); + executor_->Run(inputs_id, inputs, outputs_id, outputs, ctx); +} + +void IpuBackend::Prepare() { + if (is_prepared_) { + return; + } else { + is_prepared_ = true; + } + // convert Model to fp16 + if (ipu_strategy_->enable_fp16) { + compiler_->ConvertProtoToFp16(); + } + auto proto = compiler_->GetModelProto(); + auto tensors = compiler_->GetTensors(); 
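IpuBackend above is handed out as a shared_ptr singleton: GetInstance() lazily creates the one shared backend, while GetNewInstance() (intended to be called from Python) always replaces it. A stripped-down sketch of that pair, simplified and, like the original, not guarded against concurrent first use:

```cpp
// shared_ptr singleton with a lazy getter and a "reset" getter.
#include <cstdio>
#include <memory>

class Backend {
 public:
  static std::shared_ptr<Backend> GetInstance() {
    if (!instance_) instance_.reset(new Backend());  // lazily create the shared backend
    return instance_;
  }
  static std::shared_ptr<Backend> GetNewInstance() {
    instance_.reset(new Backend());                  // always construct a fresh backend
    return instance_;
  }

 private:
  Backend() { std::puts("backend constructed"); }
  static std::shared_ptr<Backend> instance_;
};

std::shared_ptr<Backend> Backend::instance_ = nullptr;

int main() {
  auto a = Backend::GetInstance();
  auto b = Backend::GetInstance();     // same object, no second construction
  std::printf("same instance: %s\n", a == b ? "yes" : "no");
  auto c = Backend::GetNewInstance();  // constructs and installs a new backend
  std::printf("replaced: %s\n", a == c ? "no" : "yes");
}
```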
+ auto outputs = compiler_->GetOutputs(); + executor_->Prepare(proto, tensors, outputs, device_); +} + +void IpuBackend::SetScope(const framework::Scope& scope) { + scope_ = &scope; + executor_->SetScope(&scope); +} + +void IpuBackend::SetIpuStrategy(const IpuStrategy& strategy) { + ipu_strategy_ = &strategy; + executor_->SetIpuStrategy(strategy); + compiler_->SetIpuStrategy(strategy); +} + +size_t IpuBackend::GetNumDevices() { + // IpuModel + bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); + if (ipu_model) return 1; + // Real dev + size_t num_devices = + popart::DeviceManager::createDeviceManager().enumerateDevices().size(); + PADDLE_ENFORCE_GT( + num_devices, 0, + platform::errors::Unavailable( + "Do not found any IPU devices, please make " + "sure Poplar sdk is enabled or enable ENV \"POPLAR_IPUMODEL=1\"")); + return num_devices; +} + +std::vector IpuBackend::GetDeviceIds() { + bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); + if (ipu_model) { + return {0}; + } + std::vector device_ids; + auto devices = + popart::DeviceManager::createDeviceManager().enumerateDevices(); + PADDLE_ENFORCE_GT( + devices.size(), 0, + platform::errors::Unavailable("Do not found any IPU devices, please make " + "sure Poplar sdk is enabled.")); + + for (auto device : devices) { + device_ids.push_back(device->getId()); + } + + return device_ids; +} + +Device IpuBackend::GetDevice(int id) { + bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); + if (ipu_model) { + std::map deviceOpts{{"numIPUs", "1 "}}; + device_ = popart::DeviceManager::createDeviceManager().createIpuModelDevice( + deviceOpts); + Device device(*device_.get()); + return device; + } + size_t num_devices = GetNumDevices(); + if (id < 0 || id >= num_devices) { + PADDLE_THROW(platform::errors::InvalidArgument( + "device id %d is invalid, number devices is %d", id, num_devices)); + } + std::shared_ptr popart_device_info = + popart::DeviceManager::createDeviceManager().getDevice( + popart::SyncPattern::Full, id); + Device device(*popart_device_info.get()); + return device; +} + +void IpuBackend::AttachDevice(int id) { + // trick here + // Compiler ipu is not same as the runtime ipu. + VLOG(10) << "comile ipu id = " << id; + bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); + if (ipu_model) { + return; + } + device_ = popart::DeviceManager::createDeviceManager().acquireAvailableDevice( + UpperIpuNum()); + PADDLE_ENFORCE_NOT_NULL( + device_, platform::errors::Unavailable("Can't attach IPU, ipu_num = %d.", + UpperIpuNum())); +} + +bool IpuBackend::DeviceIsAttached() { return device_ != nullptr; } + +// num_ipus must be pow(2,n); +int IpuBackend::UpperIpuNum() { + PADDLE_ENFORCE_GT(ipu_strategy_->num_ipus, 0, + platform::errors::Unavailable( + "The ipu num get is wrong, please make sure the " + "sharding or pipline parameter is right.")); + int i = 0; + while (std::pow(2, i) < ipu_strategy_->num_ipus) { + i++; + } + return std::pow(2, i); +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.h b/paddle/fluid/platform/device/ipu/ipu_backend.h new file mode 100644 index 0000000000000..769a1b5b52ab8 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_backend.h @@ -0,0 +1,103 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
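UpperIpuNum() above rounds the requested ipu_strategy_->num_ipus up to the next power of two, since the code expects device counts of that form when acquiring IPUs. A host-side check of that rounding, duplicated here only for illustration:

```cpp
// Round a requested IPU count up to the next power of two, as UpperIpuNum does.
#include <cmath>
#include <cstdio>

int UpperIpuNum(int num_ipus) {
  int i = 0;
  while (std::pow(2, i) < num_ipus) i++;
  return static_cast<int>(std::pow(2, i));
}

int main() {
  const int samples[] = {1, 2, 3, 5, 8, 9};
  for (int n : samples) std::printf("num_ipus=%d -> acquire %d\n", n, UpperIpuNum(n));
  // Prints 1, 2, 4, 8, 8, 16.
}
```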
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/ipu/device.h" +#include "paddle/fluid/platform/ipu/ipu_compiler.h" +#include "paddle/fluid/platform/ipu/ipu_executor.h" +#include "paddle/fluid/platform/ipu/ipu_strategy.h" + +namespace paddle { +namespace platform { +namespace ipu { + +class IpuBackend { + // IpuBackend is the center of paddle-ipu, its function include: + // 1. Compile paddle model to popart model + // 2. Run popart model, inference or training + // 3. Request and release device + // 4. Other helper function + + public: + IpuBackend(); + ~IpuBackend(); + + void Clear(); + + // return if exsits, else create and return + static std::shared_ptr GetInstance(); + + // always return a new instance_ + static std::shared_ptr GetNewInstance(); + + // what compile does include(call compiler_): + // 1. map paddle-op -> poart op + // 2. construct popart onnx compute graph + void Compile(framework::ir::Graph *graph, + const std::vector &feed_list, + const std::vector &fetch_list); + + // what run does include: + // 1. construct forward onnx graph + // 2. graph-level optimization + // 3. autodiff + void Run(const std::vector &inputs, + const std::vector &outputs, + const framework::ExecutionContext &ctx); + + Executor &GetExecutor() { return *executor_; } + + void SetScope(const framework::Scope &scope); + const framework::Scope *GetScope() { return scope_; } + void SetIpuStrategy(const IpuStrategy &strategy); + const IpuStrategy *GetIpuStrategy() { return ipu_strategy_; } + + // Device + size_t GetNumDevices(); + std::vector GetDeviceIds(); + Device GetDevice(int id); + void AttachDevice(int id); + bool DeviceIsAttached(); + + private: + int UpperIpuNum(); + void Prepare(); + + private: + std::shared_ptr compiler_; + std::unique_ptr executor_; + std::shared_ptr device_; + bool is_prepared_ = false; + + // not own + const framework::Scope *scope_ = nullptr; + const IpuStrategy *ipu_strategy_ = nullptr; + + private: + static std::shared_ptr instance_; +}; + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc new file mode 100644 index 0000000000000..a1c5ed4fefbf3 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -0,0 +1,397 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/ipu/ipu_compiler.h" + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/platform/ipu/ipu_utils.h" + +namespace paddle { +namespace platform { +namespace ipu { + +template +T GetAttrAllowNull(std::string attr, framework::OpDesc* op_desc) { + if (op_desc->HasAttr(attr)) { + return BOOST_GET_CONST(T, op_desc->GetAttr(attr)); + } else { + return {}; + } +} + +template +nonstd::optional GetOptAttrAllowNull(std::string attr, + framework::OpDesc* op_desc) { + if (op_desc->HasAttr(attr)) { + return BOOST_GET_CONST(T, op_desc->GetAttr(attr)); + } else { + return {}; + } +} + +Compiler::Compiler() { + builder_ = popart::Builder::create(); + RegisterOpFunc(); +} + +Compiler::~Compiler() {} + +void Compiler::RegisterOpFunc() { + VLOG(10) << "enter Compiler::RegisterOpFunc"; +#define INT_VEC std::vector +#define FLOAT_VEC std::vector +#define FLOAT float +#define INT std::int64_t +#define BOOL bool +#define STRING std::string +#define STRING_VEC std::vector +#define NONE + +#define ARG(Type, Name) , GetAttrAllowNull(#Name, op_desc) +#define OPT_ARG(Type, Name) , GetOptAttrAllowNull(#Name, op_desc) +#define POPART_CONST_ARG(Name) , const PopartConstant& Name +#define HOST_SIDE_CONST_ARG(Name) , const HostSideConstant& Name +#define POPART_ATTRIB_VEC_ARG(Name) +#define BODY_ARG(Name) NONE + + name_function_ = { +#define OP_DECL(FuncName, OnnxImpl, Args) \ + {#FuncName, [&](framework::OpDesc* op_desc) { \ + auto op_type = op_desc->Type(); \ + VLOG(10) << "build op:" << op_type << " args " << #Args; \ + auto inputs = GetOpInputs(op_desc); \ + auto output_names = GetOpOutputs(op_desc); \ + auto debug_context = BuildDebugContext(op_desc); \ + auto aiGraphcoreOpset = builder_->aiGraphcoreOpset1(); \ + auto aiOnnxOpset = builder_->aiOnnxOpset11(); \ + auto output_ids = OnnxImpl(inputs Args, debug_context); \ + SetIpuIndexStage(output_ids, op_desc); \ + InsertTensors(output_names, output_ids); \ + }}, // NOLINT +#include "paddle/fluid/platform/ipu/supported_ops_autogen.h" + }; + +#undef OP_DECL +#undef BODY_ARG +#undef POPART_ATTRIB_VEC_ARG +#undef HOST_SIDE_CONST_ARG +#undef POPART_CONST_ARG +#undef OPT_ARG +#undef ARG +#undef NONE +#undef STRING_VEC +#undef STRING +#undef BOOL +#undef INT +#undef FLOAT +#undef FLOAT_VEC +#undef INT_VEC +} + +void Compiler::LowerBody(const framework::ir::Graph* graph) { + VLOG(10) << "enter Compiler::LowerBody"; + auto nodes = framework::ir::TopologySortOperations(*graph); + for (auto* node : nodes) { + auto* op_desc = node->Op(); + auto op_type = op_desc->Type(); + VLOG(10) << "node->type: " << op_type; + + if (op_type == "popart_constant") { + auto dims = + BOOST_GET_CONST(std::vector, op_desc->GetAttr("dims")); + auto dtype_ = BOOST_GET_CONST(int, op_desc->GetAttr("dtype")); + auto dtype = OnnxDtype2PopartType(dtype_); + popart::TensorInfo tensor_info{dtype, dims}; + auto value_attr = op_desc->GetAttr("value"); + auto const_data = std::unique_ptr{}; + switch (dtype) { + case popart::DataType::FLOAT: + const_data.reset(new popart::ConstVoidData( + BOOST_GET_CONST(std::vector, value_attr).data(), + tensor_info)); + break; + case popart::DataType::INT32: + const_data.reset(new popart::ConstVoidData( + BOOST_GET_CONST(std::vector, value_attr).data(), + tensor_info)); + break; + case popart::DataType::DOUBLE: + const_data.reset(new popart::ConstVoidData( + BOOST_GET_CONST(std::vector, value_attr).data(), + 
tensor_info)); + break; + case popart::DataType::INT64: + const_data.reset(new popart::ConstVoidData( + BOOST_GET_CONST(std::vector, value_attr).data(), + tensor_info)); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "The popart datatype is not supported, popart::DataType is %d", + dtype)); + } + popart::TensorId result = builder_->aiOnnxOpset11().constant(*const_data); + SetIpuIndexStage(result, op_desc); + InsertTensors(GetOpOutputs(op_desc), result); + } else if (op_type == "popart_batchnormalization") { + auto inputs = GetOpInputs(op_desc); + auto outputs = GetOpOutputs(op_desc); + auto num_outputs = outputs.size(); + auto epsilon = BOOST_GET_CONST(float, op_desc->GetAttr("epsilon")); + auto momentum = BOOST_GET_CONST(float, op_desc->GetAttr("momentum")); + auto result = builder_->aiOnnxOpset11().batchnormalization( + inputs, num_outputs, epsilon, momentum); + SetIpuIndexStage(result, op_desc); + InsertTensors(GetOpOutputs(op_desc), result); + } else if (op_type == "popart_nllloss") { + auto inputs = GetOpInputs(op_desc); + auto ignoreIndex = BOOST_GET_CONST(int, op_desc->GetAttr("ignoreIndex")); + auto result = builder_->aiGraphcoreOpset1().nllloss( + inputs, popart::ReductionType::NoReduction, ignoreIndex); + SetIpuIndexStage(result, op_desc); + InsertTensors(GetOpOutputs(op_desc), result); + } else if (op_type == "popart_topk") { + auto inputs = GetOpInputs(op_desc); + auto outputs = GetOpOutputs(op_desc); + int64_t axis = BOOST_GET_CONST(int64_t, op_desc->GetAttr("axis")); + int sorted_INT32 = BOOST_GET_CONST(int, op_desc->GetAttr("sorted")); + int64_t sorted = int64_t{sorted_INT32}; + + auto aiOnnxOpset = builder_->aiOnnxOpset11(); + + popart::ConvInputs result; + if (inputs.size() == 2) { + VLOG(10) + << "[Compiler::LowerBody] size of inputs for is 2"; + result = aiOnnxOpset.topk(inputs, axis, sorted); + } else if (inputs.size() == 1) { + VLOG(10) + << "[Compiler::LowerBody] size of inputs for is 1"; + int64_t k = BOOST_GET_CONST(int64_t, op_desc->GetAttr("k")); + popart::TensorInfo kShape{"INT64", std::vector{1}}; + popart::ConstVoidData kData = {&k, kShape}; + auto K_t = aiOnnxOpset.constant(kData); + result = aiOnnxOpset.topk({inputs[0], K_t}, axis, sorted); + } + result[1] = aiOnnxOpset.cast({result[1]}, "INT32"); + SetIpuIndexStage(result, op_desc); + VLOG(10) << "[Compiler::LowerBody] output[1]: " << outputs[1]; + VLOG(10) << "[Compiler::LowerBody] output[1]: " + << GetOpOutputs(op_desc)[1] << " -> " << result[1]; + tensors_.emplace(GetOpOutputs(op_desc)[1], result[1]); // topk indices + VLOG(10) << "[Compiler::LowerBody] output[0]: " << outputs[0]; + VLOG(10) << "[Compiler::LowerBody] output[0]: " + << GetOpOutputs(op_desc)[0] << " -> " << result[0]; + tensors_.emplace(GetOpOutputs(op_desc)[0], result[0]); // topk values + } else { + auto itr = name_function_.find(op_type); + if (itr != name_function_.end()) { + itr->second(node->Op()); + } else { + PADDLE_THROW(platform::errors::NotFound( + "Op %s is not registered in popart canonicalization", op_type)); + } + } + } + VLOG(10) << "leave Compiler::LowerBody"; +} + +void Compiler::InitInputs(framework::ir::Graph* graph, + const std::vector& feed_list) { + for (const auto& feed_name : feed_list) { + feed_list_.push_back(feed_name); + for (const framework::ir::Node* n : graph->Nodes()) { + if (n->IsVar()) { + auto* var_desc = n->Var(); + if (feed_name == var_desc->Name()) { + VLOG(10) << "feed_name= " << var_desc->Name(); + auto data_type = VarType2PopartType(var_desc->GetDataType()); + if 
(ipu_strategy_->enable_fp16) {
+            data_type = popart::DataType::FLOAT16;
+          }
+          popart::TensorInfo input_info{data_type, var_desc->GetShape()};
+          VLOG(10) << "popart input_info = " << input_info;
+          popart::TensorId tensor_id =
+              builder_->addInputTensor(input_info, feed_name);
+          VLOG(10) << "popart input tensor id = " << tensor_id;
+          inputs_.push_back(tensor_id);
+          tensors_.emplace(var_desc->Name(), tensor_id);
+        }
+      }
+    }
+  }
+}
+
+void Compiler::InitOutputs(const std::vector& fetch_list) {
+  for (const auto& fetch_name : fetch_list) {
+    fetch_list_.push_back(fetch_name);
+    auto tensor = tensors_.find(fetch_name);
+    PADDLE_ENFORCE_NE(tensor, tensors_.end(),
+                      platform::errors::NotFound(
+                          "output tensor %s does not exist.", fetch_name));
+    VLOG(10) << "fetch_name= " << fetch_name;
+    VLOG(10) << "popart output tensor id = " << tensor->second;
+    builder_->addOutputTensor(tensor->second);
+    outputs_.push_back(tensor->second);
+  }
+}
+
+void Compiler::LowerWeights(const framework::ir::Graph* graph,
+                            const framework::Scope* scope_) {
+  PADDLE_ENFORCE_NOT_NULL(scope_,
+                          platform::errors::PreconditionNotMet(
+                              "You should call set_scope before LowerWeights"));
+  // at this step, the graph does not contain optimizer-related states
+  for (const auto* node : graph->Nodes()) {
+    if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
+      if (node->Var()->Persistable() && node->inputs.empty()) {
+        auto var_name = node->Var()->Name();
+        // workaround: https://github.com/graphcore/Paddle/issues/151
+        if (tensors_.count(var_name) != 0) {
+          continue;
+        }
+
+        auto var = scope_->FindVar(var_name);
+        if (var) {
+          auto tensor = var->Get();
+          auto dtype = VarType2PopartType(tensor.type());
+          auto shape = std::vector();
+          for (size_t i = 0; i < tensor.dims().size(); ++i) {
+            shape.push_back(tensor.dims().at(i));
+          }
+          popart::TensorInfo tensor_info(dtype, shape);
+          popart::ConstVoidData const_data{tensor.data(), tensor_info};
+          popart::TensorId result =
+              builder_->addInitializedInputTensor(const_data, var_name);
+          tensors_.emplace(var_name, result);
+          weights_.push_back(result);
+        }
+      }
+    }
+  }
+}
+
+void Compiler::InsertTensors(const std::vector& output_names,
+                             const std::vector& tensor_ids) {
+  PADDLE_ENFORCE_EQ(output_names.size(), tensor_ids.size(),
+                    platform::errors::Fatal("InsertTensors size mismatch"));
+  for (size_t i = 0; i < tensor_ids.size(); i++) {
+    tensors_.emplace(output_names[i], tensor_ids[i]);
+  }
+}
+
+void Compiler::InsertTensors(const std::vector& output_names,
+                             const std::string& tensor_id) {
+  PADDLE_ENFORCE_EQ(output_names.size(), 1,
+                    platform::errors::Fatal("InsertTensors size mismatch"));
+  tensors_.emplace(output_names[0], tensor_id);
+}
+
+void Compiler::SetIpuIndexStage(const std::vector& tensor_ids,
+                                const framework::OpDesc* op_desc) {
+  VLOG(10) << "enter Compiler::SetIpuIndexStage";
+  auto tensor_ids_set =
+      std::set(tensor_ids.begin(), tensor_ids.end());
+
+  if (op_desc->HasAttr(sIpuIndexAttr)) {
+    auto ipu_index = BOOST_GET_CONST(int, op_desc->GetAttr(sIpuIndexAttr));
+    builder_->virtualGraph(tensor_ids_set, ipu_index);
+    VLOG(10) << "set " << sIpuIndexAttr << " = " << ipu_index
+             << " for op: " << op_desc->Type();
+    if (op_desc->HasAttr(sIpuStageAttr)) {
+      auto ipu_stage = BOOST_GET_CONST(int, op_desc->GetAttr(sIpuStageAttr));
+      builder_->pipelineStage(tensor_ids_set, ipu_stage);
+      VLOG(10) << "set " << sIpuStageAttr << " = " << ipu_stage
+               << " for op: " << op_desc->Type();
+    }
+  }
+  VLOG(10) << "leave Compiler::SetIpuIndexStage";
+}
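+
+// For reference, each handler generated by RegisterOpFunc uses the helpers in
+// this file roughly as follows (illustrative sketch of the OP_DECL expansion
+// for popart_scale_v2; the real code is produced by the macro, not this sketch):
+//
+//   {"popart_scale_v2", [&](framework::OpDesc* op_desc) {
+//     auto inputs = GetOpInputs(op_desc);
+//     auto output_ids = builder_->aiGraphcoreOpset1().scale(
+//         inputs, GetAttrAllowNull<float>("scale", op_desc),
+//         BuildDebugContext(op_desc));
+//     SetIpuIndexStage(output_ids, op_desc);
+//     InsertTensors(GetOpOutputs(op_desc), output_ids);
+//   }},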
+ +void Compiler::SetIpuIndexStage(const std::string& tensor_id, + const framework::OpDesc* op_desc) { + VLOG(10) << "enter Compiler::SetIpuIndexStage"; + + if (op_desc->HasAttr(sIpuIndexAttr)) { + auto ipu_index = BOOST_GET_CONST(int, op_desc->GetAttr(sIpuIndexAttr)); + builder_->virtualGraph(tensor_id, ipu_index); + VLOG(10) << "set " << sIpuIndexAttr << " = " << ipu_index + << " for op: " << op_desc->Type(); + if (op_desc->HasAttr(sIpuStageAttr)) { + auto ipu_stage = BOOST_GET_CONST(int, op_desc->GetAttr(sIpuStageAttr)); + builder_->pipelineStage(tensor_id, ipu_stage); + VLOG(10) << "set " << sIpuStageAttr << "= " << ipu_stage + << " for op: " << op_desc->Type(); + } + } + VLOG(10) << "leave Compiler::SetIpuIndexStage"; +} + +std::vector& Compiler::GetWeights() { return weights_; } + +// convertFloatsToHalfs +void Compiler::ConvertProtoToFp16() { + popart::GraphTransformer graph_transformer(builder_->getModelProto()); + graph_transformer.convertFloatsToHalfs(); + converted_proto_ = graph_transformer.getModelProto(); +} + +std::string Compiler::GetModelProto() { + if (converted_proto_.length()) { + return converted_proto_; + } + return builder_->getModelProto(); +} + +void Compiler::SaveModelProto(const std::string& path) { + builder_->saveModelProto(path); +} + +void Compiler::SaveModelProtoNoCheck(const std::string& path) { + auto proto = GetModelProto(); + std::ofstream onnxfile(path, std::ios_base::binary); + onnxfile.write(proto.data(), proto.size()); + onnxfile.close(); +} + +std::vector Compiler::GetOpInputs(const framework::OpDesc* op) { + auto ins = op->Input("__inputs__"); + std::vector inputs; + for (const auto& in : ins) { + if (tensors_.find(in) != tensors_.end()) { + inputs.push_back(tensors_[in]); + } else { + inputs.push_back(in); + } + } + return inputs; +} + +const std::vector& Compiler::GetOpOutputs( + const framework::OpDesc* op) { + return op->Output("__outputs__"); +} + +popart::DebugContext Compiler::BuildDebugContext(const framework::OpDesc* op) { + auto op_identify_id = + BOOST_GET_CONST(std::string, op->GetAttr(sOpIdentifyIdAttr)); + VLOG(10) << "op_identify_id of op: " << op->Type() << " is " + << op_identify_id; + return popart::DebugContext(op_identify_id); +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.h b/paddle/fluid/platform/device/ipu/ipu_compiler.h new file mode 100644 index 0000000000000..ecee1595bb892 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.h @@ -0,0 +1,93 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/ipu/common.h" +#include "paddle/fluid/platform/ipu/ipu_strategy.h" + +namespace paddle { +namespace platform { +namespace ipu { + +class Compiler { + public: + Compiler(); + ~Compiler(); + void RegisterOpFunc(); + void LowerBody(const framework::ir::Graph *graph); + void InitInputs(framework::ir::Graph *graph, + const std::vector &feed_list); + void InitOutputs(const std::vector &fetch_list); + void LowerWeights(const framework::ir::Graph *graph, + const framework::Scope *scope_); + + void InsertTensors(const std::vector &output_names, + const std::vector &tensor_ids); + void InsertTensors(const std::vector &output_names, + const std::string &tensor_id); + void SetIpuIndexStage(const std::vector &tensor_ids, + const framework::OpDesc *op_desc); + void SetIpuIndexStage(const std::string &tensor_id, + const framework::OpDesc *op_desc); + + std::vector GetInputs() { return inputs_; } + std::vector GetOutputs() { return outputs_; } + std::map GetTensors() { return tensors_; } + std::vector &GetWeights(); + + std::string GetModelProto(); + void SetIpuStrategy(const IpuStrategy &strategy) { + ipu_strategy_ = &strategy; + }; + void SaveModelProto(const std::string &path); + void SaveModelProtoNoCheck(const std::string &path); + void ConvertProtoToFp16(); + + private: + std::vector GetOpInputs(const framework::OpDesc *op); + const std::vector &GetOpOutputs(const framework::OpDesc *op); + popart::DebugContext BuildDebugContext(const framework::OpDesc *op); + + private: + std::unique_ptr builder_; + + using OpFunc = std::function; + std::unordered_map name_function_; + + // stateful variable + std::map tensors_; + + // feed_list_ & fetch_list save paddle tensor id + std::vector feed_list_; + std::vector fetch_list_; + + // inputs_ & outputs_ save popart tensor id + std::vector inputs_; + std::vector outputs_; + + // weights info map + std::vector weights_; + + std::string converted_proto_ = ""; + const IpuStrategy *ipu_strategy_ = nullptr; +}; + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc new file mode 100644 index 0000000000000..a7978ba6f37b1 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -0,0 +1,209 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/ipu/ipu_executor.h" + +namespace paddle { +namespace platform { +namespace ipu { + +Executor::Executor() {} + +Executor::~Executor() {} + +void Executor::Prepare(const std::string &proto, + const std::map &tensors, + const std::vector &outputs, + std::shared_ptr device) { + auto art = popart::AnchorReturnType("All"); + std::map anchor_ids; + for (const auto &id : outputs) { + anchor_ids.emplace(id, art); + } + + auto dataFlow = popart::DataFlow(ipu_strategy_->batches_per_step, anchor_ids); + + PADDLE_ENFORCE_NOT_NULL(device, platform::errors::Unavailable( + "IPU device isn't attached, please call " + "IpuBackend::AttachDevice(id) first.")); + + if (ipu_strategy_ != nullptr && ipu_strategy_->is_training) { + VLOG(10) << "Creating TrainingSession from Onnx Model..."; + auto popart_optimizer = GetPopartOptimizer(opt_info); + + auto it = tensors.find(opt_info.GetLoss()); + PADDLE_ENFORCE_NE( + it, tensors.end(), + paddle::platform::errors::InvalidArgument( + "loss_id = %s doesn't exist in popart graph.", opt_info.GetLoss())); + + session_ = popart::TrainingSession::createFromOnnxModel( + proto, dataFlow, it->second, *popart_optimizer, device, + popart::InputShapeInfo(), ipu_strategy_->popart_options_, + popart::Patterns(popart::PatternsLevel::Default)); + } else { + VLOG(10) << "Creating InferenceSession from Onnx Model..."; + session_ = popart::InferenceSession::createFromOnnxModel( + proto, dataFlow, device, popart::InputShapeInfo(), + ipu_strategy_->popart_options_, + popart::Patterns(popart::PatternsLevel::Default)); + } + VLOG(10) << "Creating session from Onnx Model...done"; + + VLOG(10) << "Preparing session device..."; + session_->prepareDevice(); + VLOG(10) << "Preparing session device...done"; + + SetWeightsIO(); + + VLOG(10) << "Copy weights from paddle to popart..."; + WeightsFromPaddle(); + VLOG(10) << "Copy weights from paddle to popart...done"; + + VLOG(10) << "Copy weights from host to device..."; + session_->weightsFromHost(); + VLOG(10) << "Copy weights from host to device...done"; + + if (ipu_strategy_->save_init_onnx) { + session_->modelToHost("test_init.onnx"); + } +} + +void Executor::Run(const std::vector &inputs_id, + const std::vector &inputs, + const std::vector &outputs_id, + const std::vector &outputs, + const framework::ExecutionContext &ctx) { + // inputs + std::map popart_inputs; + std::map input_wrappers; + for (size_t i = 0; i < inputs.size(); i++) { + auto tensor_id = inputs_id[i]; + framework::Tensor *tensor = nullptr; + tensor->ShareDataWith(*inputs[i]); + input_wrappers.emplace(tensor_id, PaddleIArray(tensor)); + popart_inputs.emplace(tensor_id, input_wrappers.at(tensor_id)); + } + // anchors + std::map popart_anchors; + std::map anchor_wrappers; + for (size_t i = 0; i < outputs.size(); i++) { + auto tensor_id = outputs_id[i]; + framework::Tensor *tensor = nullptr; + tensor->ShareDataWith(*outputs[i]); + // get dims & dtype from session + auto fetch_info = session_->getInfo(tensor_id); + auto output_shape = fetch_info.shape(); + if (ipu_strategy_->batches_per_step > 1) { + output_shape.insert(output_shape.begin(), + ipu_strategy_->batches_per_step); + } + tensor->Resize(framework::make_ddim(output_shape)); + auto fetch_dtype = fetch_info.dataType(); + auto paddle_type = PopartType2VarType(fetch_dtype); + tensor->mutable_data(ctx.GetPlace(), paddle_type); + anchor_wrappers.emplace(tensor_id, PaddleIArray(tensor)); + popart_anchors.emplace(tensor_id, anchor_wrappers.at(tensor_id)); + } + + if (ipu_strategy_ != nullptr && 
ipu_strategy_->is_training) { + VLOG(10) << "Update optimizer learning rate..."; + SetLR(GetLRFromScope()); + auto popart_optimizer = GetPopartOptimizer(opt_info); + auto &session = dynamic_cast(*session_); + session.updateOptimizerFromHost(popart_optimizer.get()); + } + + popart::StepIO stepio(popart_inputs, popart_anchors); + VLOG(10) << "Running..."; + session_->run(stepio); + VLOG(10) << "Running...done"; + + if (ipu_strategy_ != nullptr && ipu_strategy_->is_training) { + session_->weightsToHost(); + WeightsToPaddle(); + if (ipu_strategy_->save_last_onnx) { + session_->modelToHost("test_last.onnx"); + } + } +} + +void Executor::SetOptimizerType(const std::string &type) { + opt_info.SetType(type); +} + +void Executor::SetLR(float lr_rate) { opt_info.SetLR(lr_rate); } + +void Executor::SetOptimizerAttr(const std::string &attr, float value) { + opt_info.SetAttr(attr, value); +} + +void Executor::SetLoss(const std::string &loss) { opt_info.SetLoss(loss); } + +void Executor::SetLRVarName(const std::string &name) { + opt_info.SetLRVarName(name); +} + +void Executor::SetWeights(const std::vector &weights) { + weights_ = weights; +} + +void Executor::SetWeightsIO() { + auto opt_type = opt_info.GetType(); + auto pre_post_fix = GetOptPrePostfix(opt_type); + for (const auto &weight_id : weights_) { + for (const auto &pair : pre_post_fix) { + if (!IsOptimizerSupported(opt_type)) { + continue; + } + + // pair.first : popart prefix, pair.second : paddle postfix + auto popart_var_name = pair.first + weight_id; + auto paddle_var_name = weight_id + pair.second; + + if (scope_->FindVar(paddle_var_name) == nullptr) { + continue; + } + + auto var = scope_->GetVar(paddle_var_name); + auto data_ptr = var->GetMutable()->data(); + + auto tensor_info = session_->getInfo(popart_var_name); + weights_io_.insert(popart_var_name, {data_ptr, tensor_info}); + } + } +} + +void Executor::WeightsFromPaddle() { session_->writeWeights(weights_io_); } + +void Executor::WeightsToPaddle() { session_->readWeights(weights_io_); } + +void Executor::SetIpuStrategy(const IpuStrategy &strategy) { + ipu_strategy_ = &strategy; +} + +float Executor::GetLRFromScope() { + auto lr_var = scope_->GetVar(opt_info.GetLRVarName()); + auto tensor = lr_var->Get(); + + PADDLE_ENFORCE_EQ(tensor.type(), framework::proto::VarType::FP32, + platform::errors::InvalidArgument( + "LR requiree float, but got (%s).", tensor.type())); + + return tensor.data()[0]; +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.h b/paddle/fluid/platform/device/ipu/ipu_executor.h new file mode 100644 index 0000000000000..400884a2c2b0f --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_executor.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/ipu/common.h" +#include "paddle/fluid/platform/ipu/ipu_optimizer.h" +#include "paddle/fluid/platform/ipu/ipu_strategy.h" +#include "paddle/fluid/platform/ipu/ipu_utils.h" + +namespace paddle { +namespace platform { +namespace ipu { + +class Executor { + public: + Executor(); + ~Executor(); + + void Prepare(const std::string &proto, + const std::map &tensors, + const std::vector &outputs, + std::shared_ptr device); + + void Run(const std::vector &inputs_id, + const std::vector &inputs, + const std::vector &outputs_id, + const std::vector &outputs, + const framework::ExecutionContext &ctx); + + // Optimizer + void SetOptimizerType(const std::string &type); + void SetOptimizerAttr(const std::string &attr, float value); + void SetLoss(const std::string &loss); + void SetLR(float lr_rate); + void SetLRVarName(const std::string &name); + + void SetWeights(const std::vector &info); + + void SetWeightsIO(); + void WeightsFromPaddle(); + void WeightsToPaddle(); + + // Scope + void SetScope(const framework::Scope *scope) { scope_ = scope; } + + // Strategy + void SetIpuStrategy(const IpuStrategy &strategy); + + private: + float GetLRFromScope(); + + public: + OptmizerMetaInfo opt_info; + std::unique_ptr session_; + + private: + const framework::Scope *scope_ = nullptr; + const IpuStrategy *ipu_strategy_ = nullptr; + popart::WeightsIO weights_io_; + std::vector weights_; +}; + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_info.cc b/paddle/fluid/platform/device/ipu/ipu_info.cc new file mode 100644 index 0000000000000..c184149a9d38d --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_info.cc @@ -0,0 +1,32 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +namespace paddle { +namespace platform { + +//! Get a list of device ids from environment variable or use all. +std::vector GetSelectedIPUDevices() { + std::shared_ptr ipu_backend = + platform::ipu::IpuBackend::GetInstance(); + return ipu_backend->GetDeviceIds(); +} + +//! Get the total number of IPU devices in system. +int GetIPUDeviceCount() { + std::shared_ptr ipu_backend = + platform::ipu::IpuBackend::GetInstance(); + return ipu_backend->GetNumDevices(); +} +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/type_defs.h b/paddle/fluid/platform/device/ipu/ipu_info.h similarity index 50% rename from paddle/fluid/platform/type_defs.h rename to paddle/fluid/platform/device/ipu/ipu_info.h index 88a2d16472fa7..3d032eeb4bfc1 100644 --- a/paddle/fluid/platform/type_defs.h +++ b/paddle/fluid/platform/device/ipu/ipu_info.h @@ -1,40 +1,24 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
- +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once -#ifdef PADDLE_WITH_HIP -#include -#else -#include -#endif +#ifdef PADDLE_WITH_IPU +#include +#include +#include "glog/logging.h" namespace paddle { - -#ifdef PADDLE_WITH_HIP -#define gpuSuccess hipSuccess -using gpuStream_t = hipStream_t; -using gpuError_t = hipError_t; -using gpuEvent_t = hipEvent_t; -using gpuDeviceProp = hipDeviceProp_t; -#else -#define gpuSuccess cudaSuccess -using gpuStream_t = cudaStream_t; -using gpuError_t = cudaError_t; -using gpuEvent_t = cudaEvent_t; -using gpuDeviceProp = cudaDeviceProp; -#endif - -using CUDAGraphID = unsigned long long; // NOLINT +namespace platform { +std::vector GetSelectedIPUDevices(); +int GetIPUDeviceCount(); +} // namespace platform } // namespace paddle +#endif diff --git a/paddle/fluid/platform/device/ipu/ipu_optimizer.cc b/paddle/fluid/platform/device/ipu/ipu_optimizer.cc new file mode 100644 index 0000000000000..92bb2ca3afcf8 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_optimizer.cc @@ -0,0 +1,136 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/device/ipu/ipu_optimizer.h" + +namespace paddle { +namespace platform { +namespace ipu { + +OptmizerMetaInfo::OptmizerMetaInfo() {} + +OptmizerMetaInfo::~OptmizerMetaInfo() {} + +void OptmizerMetaInfo::SetType(const std::string &type) { + type_ = OptTypeStr2Enum(type); +} + +float OptmizerMetaInfo::GetAttr(const std::string &attr, + float default_value) const { + if (attrs_.count(attr) == 0) { + return default_value; + } + return attrs_.at(attr); +} + +void OptmizerMetaInfo::SetAttr(const std::string &attr, float value) { + attrs_[attr] = value; +} + +OptimizerType OptTypeStr2Enum(const std::string type) { + if (type == "sgd") { + return OptimizerType::SGD; + } else if (type == "adam") { + return OptimizerType::Adam; + } else if (type == "lamb") { + return OptimizerType::Lamb; + } else { + return OptimizerType::Undefined; + } +} + +std::unique_ptr GetPopartOptimizer( + const OptmizerMetaInfo &opt_meta_info) { + auto opt_type = opt_meta_info.GetType(); + PADDLE_ENFORCE_NE( + opt_type, OptimizerType::Undefined, + platform::errors::InvalidArgument("Optimizer type have not been set.")); + + if (opt_type == OptimizerType::SGD) { + auto optimizer = std::make_unique( + popart::OptimizerValue(opt_meta_info.GetLR(), false), + popart::OptimizerValue(popart::SGD::getUnsetWeightDecay()), + popart::OptimizerValue(popart::SGD::getUnsetMomentum()), + popart::OptimizerValue(popart::SGD::getUnsetDampening()), + popart::OptimizerValue(popart::SGD::getUnsetVelocityScaling()), + popart::OptimizerValue(popart::SGD::getUnsetLossScaling())); + return optimizer; + } else if (opt_type == OptimizerType::Adam) { + auto optimizer = std::make_unique( + popart::OptimizerValue(opt_meta_info.GetLR(), false), + popart::OptimizerValue(popart::Adam::getUnsetWeightDecay()), + popart::OptimizerValue(opt_meta_info.GetAttr("beta1"), false), + popart::OptimizerValue(opt_meta_info.GetAttr("beta2"), false), + popart::OptimizerValue(opt_meta_info.GetAttr("epsilon"), false), + popart::OptimizerValue(popart::Adam::getUnsetLossScaling()), + popart::AdamMode::Adam, popart::WeightDecayMode::Decay, + popart::DataType::FLOAT, popart::DataType::FLOAT, + popart::DataType::FLOAT); + return optimizer; + } else if (opt_type == OptimizerType::Lamb) { + auto optimizer = std::make_unique( + popart::OptimizerValue(opt_meta_info.GetLR(), false), + popart::OptimizerValue(opt_meta_info.GetAttr("weight_decay"), false), + popart::OptimizerValue(opt_meta_info.GetAttr("beta1"), false), + popart::OptimizerValue(opt_meta_info.GetAttr("beta2"), false), + popart::OptimizerValue(opt_meta_info.GetAttr("epsilon"), false), + popart::OptimizerValue(popart::Adam::getUnsetLossScaling()), + popart::AdamMode::Lamb, popart::WeightDecayMode::Decay, + popart::DataType::FLOAT, popart::DataType::FLOAT, + popart::DataType::FLOAT); + return optimizer; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Optimizer %d is not implemented now.", static_cast(opt_type))); + } +} + +bool IsOptimizerSupported(OptimizerType type) { + switch (type) { + case OptimizerType::SGD: + case OptimizerType::Adam: + case OptimizerType::Lamb: + return true; + default: + return false; + } +} + +std::vector> GetOptPrePostfix( + OptimizerType opt_type) { + // format: {popart_tensor_id, paddle_tensor_id}, ... 
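+  // e.g. (illustrative, with a hypothetical Adam weight named "fc_0.w_0"):
+  // the pair {"Accl1___", "_moment1_0"} links the popart tensor
+  // "Accl1___fc_0.w_0" to the paddle variable "fc_0.w_0_moment1_0"
+  // (see how Executor::SetWeightsIO combines the prefix and postfix).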
+ std::vector> pre_post_fix; + + switch (opt_type) { + case OptimizerType::SGD: + pre_post_fix.push_back(std::make_pair("", "")); + break; + case OptimizerType::Adam: + case OptimizerType::Lamb: + pre_post_fix.push_back(std::make_pair("", "")); + pre_post_fix.push_back(std::make_pair("Accl1___", "_moment1_0")); + pre_post_fix.push_back(std::make_pair("Accl2___", "_moment2_0")); + pre_post_fix.push_back(std::make_pair("Step___", "_beta1_pow_acc_0")); + break; + default: + pre_post_fix.push_back(std::make_pair("", "")); + break; + } + + return pre_post_fix; +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_optimizer.h b/paddle/fluid/platform/device/ipu/ipu_optimizer.h new file mode 100644 index 0000000000000..ee16abce398fb --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_optimizer.h @@ -0,0 +1,76 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { + +enum class OptimizerType { SGD = 0, Adam, Lamb, Undefined }; + +class OptmizerMetaInfo { + public: + OptmizerMetaInfo(); + ~OptmizerMetaInfo(); + + void SetType(const std::string &type); + OptimizerType GetType() const { return type_; } + + void SetAttr(const std::string &attr, float value); + float GetAttr(const std::string &attr, float default_value = 0.0f) const; + + void SetLoss(const std::string &loss) { loss_ = loss; } + std::string GetLoss() const { return loss_; } + + void SetLR(float lr_rate) { lr_rate_ = lr_rate; } + float GetLR() const { return lr_rate_; } + + void SetLRVarName(const std::string &name) { lr_var_name_ = name; } + std::string GetLRVarName() const { return lr_var_name_; } + + private: + // type: adam, sgd, ... + OptimizerType type_ = OptimizerType::Undefined; + + // loss: loss TensorId + std::string loss_; + + // attrs: beta1, beta2, ... + std::map attrs_; + + // learning rate + float lr_rate_ = 1.0; + std::string lr_var_name_; +}; + +OptimizerType OptTypeStr2Enum(const std::string type); + +std::unique_ptr GetPopartOptimizer( + const OptmizerMetaInfo &info); + +bool IsOptimizerSupported(OptimizerType type); + +std::vector> GetOptPrePostfix( + OptimizerType type); + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc new file mode 100644 index 0000000000000..47e7e332c8fba --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/ipu/ipu_strategy.h" + +namespace paddle { +namespace platform { +namespace ipu {} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h new file mode 100644 index 0000000000000..7e07d517e1031 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +namespace paddle { +namespace platform { +namespace ipu { + +using VirtualGraphMode = popart::VirtualGraphMode; + +struct IpuStrategy { + int num_ipus = 1; + int batches_per_step = 1; + int batch_size = 1; + bool is_training = true; + bool save_init_onnx = false; + bool save_last_onnx = true; + popart::SessionOptions popart_options_; + bool need_avg_shard = false; + bool enable_fp16 = false; +}; + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_utils.cc b/paddle/fluid/platform/device/ipu/ipu_utils.cc new file mode 100644 index 0000000000000..08ba50415dd5f --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_utils.cc @@ -0,0 +1,155 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/ipu/ipu_utils.h" + +namespace paddle { +namespace platform { +namespace ipu { + +void* PaddleIArray::data() { return tensor_->data(); } + +popart::DataType PaddleIArray::dataType() const { + return VarType2PopartType(tensor_->type()); +} + +std::size_t PaddleIArray::rank() const { return tensor_->dims().size(); } + +int64_t PaddleIArray::dim(size_t index) const { + return tensor_->dims().at(index); +} + +std::size_t PaddleIArray::nelms() const { + return std::accumulate(shape_.begin(), shape_.end(), static_cast(1), + std::multiplies()); +} + +const popart::Shape PaddleIArray::shape() const { return shape_; } + +popart::DataType VarType2PopartType( + const framework::proto::VarType::Type type) { + switch (type) { + case framework::proto::VarType::UINT8: + return popart::DataType::UINT8; + case framework::proto::VarType::INT8: + return popart::DataType::INT8; + case framework::proto::VarType::INT16: + return popart::DataType::INT16; + case framework::proto::VarType::INT32: + return popart::DataType::INT32; + case framework::proto::VarType::INT64: + return popart::DataType::INT64; + case framework::proto::VarType::BOOL: + return popart::DataType::BOOL; + case framework::proto::VarType::FP64: + return popart::DataType::DOUBLE; + case framework::proto::VarType::FP32: + return popart::DataType::FLOAT; + case framework::proto::VarType::FP16: + return popart::DataType::FLOAT16; + case framework::proto::VarType::BF16: + return popart::DataType::BFLOAT16; + case framework::proto::VarType::COMPLEX64: + return popart::DataType::COMPLEX64; + case framework::proto::VarType::COMPLEX128: + return popart::DataType::COMPLEX128; + default: + PADDLE_THROW(paddle::platform::errors::Unavailable( + "Unsupported Paddle var type.")); + } +} + +framework::proto::VarType::Type PopartType2VarType( + const popart::DataType type) { + switch (type) { + case popart::DataType::UINT8: + return framework::proto::VarType::UINT8; + case popart::DataType::INT8: + return framework::proto::VarType::INT8; + case popart::DataType::INT16: + return framework::proto::VarType::INT16; + case popart::DataType::INT32: + return framework::proto::VarType::INT32; + case popart::DataType::INT64: + return framework::proto::VarType::INT64; + case popart::DataType::BOOL: + return framework::proto::VarType::BOOL; + case popart::DataType::DOUBLE: + return framework::proto::VarType::FP64; + case popart::DataType::FLOAT: + return framework::proto::VarType::FP32; + case popart::DataType::FLOAT16: + return framework::proto::VarType::FP16; + case popart::DataType::BFLOAT16: + return framework::proto::VarType::BF16; + case popart::DataType::COMPLEX64: + return framework::proto::VarType::COMPLEX64; + case popart::DataType::COMPLEX128: + return framework::proto::VarType::COMPLEX128; + default: + PADDLE_THROW(paddle::platform::errors::Unavailable( + "Unsupported Paddle var type.")); + } +} + +popart::DataType OnnxDtype2PopartType(const int type) { + auto dtype = static_cast(type); + switch (dtype) { + case ONNXDataType::BOOL: + return popart::DataType::BOOL; + case ONNXDataType::INT16: + return popart::DataType::INT16; + case ONNXDataType::INT32: + return popart::DataType::INT32; + case ONNXDataType::INT64: + return popart::DataType::INT64; + case ONNXDataType::FLOAT16: + return popart::DataType::FLOAT16; + case ONNXDataType::FLOAT: + return popart::DataType::FLOAT; + case ONNXDataType::DOUBLE: + return popart::DataType::DOUBLE; + case ONNXDataType::UINT8: + return popart::DataType::UINT8; + case ONNXDataType::INT8: + 
return popart::DataType::INT8; + case ONNXDataType::BFLOAT16: + return popart::DataType::BFLOAT16; + case ONNXDataType::COMPLEX64: + return popart::DataType::COMPLEX64; + case ONNXDataType::COMPLEX128: + return popart::DataType::COMPLEX128; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported ONNX data type: %d.", dtype)); + } +} + +// count num should > 0 +bool GetBoolEnv(std::string str) { + char* str_val = getenv(str.c_str()); + if (str_val == NULL) { + return false; + } else { + bool val = false; + if (strcmp(str_val, "1") == 0 || strcmp(str_val, "true") == 0 || + strcmp(str_val, "True") == 0 || strcmp(str_val, "TRUE") == 0) + val = true; + return val; + } +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_utils.h b/paddle/fluid/platform/device/ipu/ipu_utils.h new file mode 100644 index 0000000000000..670427128b870 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_utils.h @@ -0,0 +1,101 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace platform { +namespace ipu { + +// onnx dtype +// https://github.com/onnx/onnx/blob/master/onnx/onnx-ml.proto3 +enum ONNXDataType : int { + UNDEFINED = 0, + FLOAT = 1, + UINT8 = 2, + INT8 = 3, + UINT16 = 4, + INT16 = 5, + INT32 = 6, + INT64 = 7, + STRING = 8, + BOOL = 9, + FLOAT16 = 10, + DOUBLE = 11, + UINT32 = 12, + UINT64 = 13, + COMPLEX64 = 14, + COMPLEX128 = 15, + BFLOAT16 = 16 +}; + +class PaddleIArray final : public popart::IArray { + public: + explicit PaddleIArray(framework::Tensor *tensor) : tensor_(tensor) { + for (int i = 0; i < tensor->dims().size(); ++i) { + shape_.push_back(tensor->dims().at(i)); + } + } + + public: + void *data(); + popart::DataType dataType() const; + std::size_t rank() const; + int64_t dim(size_t index) const; + std::size_t nelms() const; + const popart::Shape shape() const; + + private: + framework::Tensor *tensor_; + std::vector shape_; +}; + +popart::DataType VarType2PopartType(const framework::proto::VarType::Type type); +framework::proto::VarType::Type PopartType2VarType(const popart::DataType type); +popart::DataType OnnxDtype2PopartType(const int type); +bool GetBoolEnv(std::string str); + +template +std::unique_ptr> Tensor2IArray( + const framework::Tensor &tensor) { + auto dtype = VarType2PopartType(tensor.type()); + auto shape = std::vector(); + for (size_t i = 0; i < tensor.dims().size(); ++i) { + shape.push_back(tensor.dims().at(i)); + } + popart::TensorInfo tensor_info(dtype, shape); + + return std::make_unique>( + reinterpret_cast(tensor.data()), tensor_info); +} + +template +std::unique_ptr> LoDTensor2IArray( + framework::LoDTensor const &lod_tensor) { + PADDLE_ENFORCE_EQ( + lod_tensor.lod().size(), 0UL, + platform::errors::InvalidArgument("LoDTensor2IArray is Unimplemented")); + return 
Tensor2IArray(lod_tensor); +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/supported_ops_autogen.h b/paddle/fluid/platform/device/ipu/supported_ops_autogen.h new file mode 100644 index 0000000000000..4cd7f928f6e22 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/supported_ops_autogen.h @@ -0,0 +1,197 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// clang-format off + +#pragma once + +// Ops from AiGraphcoreOpset1 +OP_DECL(popart_groupnormalization_v2, aiGraphcoreOpset.groupnormalization, ARG(INT,num_groups) ARG(FLOAT,epsilon) ) // NOLINT +OP_DECL(popart_subsample_v2, aiGraphcoreOpset.subsample, ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_nop_v2, aiGraphcoreOpset.nop, NONE) // NOLINT +OP_DECL(popart_scale_v2, aiGraphcoreOpset.scale, ARG(FLOAT,scale) ) // NOLINT +OP_DECL(popart_scaledadd_v2, aiGraphcoreOpset.scaledadd, ARG(FLOAT,scale0) ARG(FLOAT,scale1) ) // NOLINT +OP_DECL(popart_gelu_v2, aiGraphcoreOpset.gelu, NONE) // NOLINT +OP_DECL(popart_detach_v2, aiGraphcoreOpset.detach, NONE) // NOLINT +OP_DECL(popart_depthtospace_v2, aiGraphcoreOpset.depthtospace, ARG(INT,blocksize) ARG(STRING,mode) ) // NOLINT +OP_DECL(popart_round_v2, aiGraphcoreOpset.round, NONE) // NOLINT +OP_DECL(popart_dynamicslice_v2, aiGraphcoreOpset.dynamicslice, ARG(INT_VEC,axes) ARG(INT_VEC,sizes) ARG(INT,noOverlap) ) // NOLINT +OP_DECL(popart_dynamicupdate_v2, aiGraphcoreOpset.dynamicupdate, ARG(INT_VEC,axes) ARG(INT_VEC,sizes) ARG(INT,noOverlap) ) // NOLINT +OP_DECL(popart_dynamiczero_v2, aiGraphcoreOpset.dynamiczero, ARG(INT_VEC,axes) ARG(INT_VEC,sizes) ) // NOLINT +OP_DECL(popart_dynamicadd_v2, aiGraphcoreOpset.dynamicadd, ARG(INT_VEC,axes) ARG(INT_VEC,sizes) ) // NOLINT +OP_DECL(popart_sequenceslice_v2, aiGraphcoreOpset.sequenceslice, ARG(INT,zeroUnused) ) // NOLINT +OP_DECL(popart_replicatedallreduce_v2, aiGraphcoreOpset.replicatedallreduce, OPT_ARG(INT_VEC,commGroup) ) // NOLINT +OP_DECL(popart_ctcbeamsearchdecoder_v2, aiGraphcoreOpset.ctcbeamsearchdecoder, ARG(INT,blank) ARG(INT,beamWidth) ARG(INT,topPaths) ) // NOLINT +OP_DECL(popart_shapeddropout_v2, aiGraphcoreOpset.shapeddropout, ARG(INT_VEC,shape) ARG(FLOAT,ratio) ) // NOLINT +OP_DECL(popart_atan2_v2, aiGraphcoreOpset.atan2, NONE) // NOLINT +OP_DECL(popart_expm1_v2, aiGraphcoreOpset.expm1, NONE) // NOLINT +OP_DECL(popart_log1p_v2, aiGraphcoreOpset.log1p, NONE) // NOLINT +OP_DECL(popart_fmod_v2, aiGraphcoreOpset.fmod, NONE) // NOLINT +OP_DECL(popart_remainder_v2, aiGraphcoreOpset.remainder, NONE) // NOLINT +OP_DECL(popart_reverse_v2, aiGraphcoreOpset.reverse, ARG(INT_VEC,dimensions) ) // NOLINT +OP_DECL(popart_bitwisenot_v2, aiGraphcoreOpset.bitwisenot, NONE) // NOLINT +OP_DECL(popart_bitwiseand_v2, aiGraphcoreOpset.bitwiseand, NONE) // NOLINT +OP_DECL(popart_bitwiseor_v2, aiGraphcoreOpset.bitwiseor, NONE) // NOLINT +OP_DECL(popart_bitwisexor_v2, aiGraphcoreOpset.bitwisexor, NONE) // NOLINT 
+OP_DECL(popart_bitwisexnor_v2, aiGraphcoreOpset.bitwisexnor, NONE) // NOLINT +OP_DECL(popart_reducemedian_v2, aiGraphcoreOpset.reducemedian, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +// Ops from AiOnnxOpset11 +OP_DECL(popart_argmax, aiOnnxOpset.argmax, ARG(INT,axis) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_argmin, aiOnnxOpset.argmin, ARG(INT,axis) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_averagepool, aiOnnxOpset.averagepool, ARG(INT_VEC,kernel_shape) ARG(INT,ceil_mode) ARG(INT,count_include_pad) ARG(INT_VEC,pads) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_bitshift, aiOnnxOpset.bitshift, ARG(STRING,direction) ) // NOLINT +OP_DECL(popart_clip, aiOnnxOpset.clip, NONE) // NOLINT +OP_DECL(popart_compress, aiOnnxOpset.compress, OPT_ARG(INT,axis) ) // NOLINT +OP_DECL(popart_concat, aiOnnxOpset.concat, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_concatfromsequence, aiOnnxOpset.concatfromsequence, ARG(INT,axis) ARG(INT,new_axis) ) // NOLINT +OP_DECL(popart_conv, aiOnnxOpset.conv, ARG(INT_VEC,dilations) ARG(INT,group) ARG(INT_VEC,kernel_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_convtranspose, aiOnnxOpset.convtranspose, ARG(INT_VEC,dilations) ARG(INT,group) ARG(INT_VEC,kernel_shape) ARG(INT_VEC,output_padding) ARG(INT_VEC,output_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_cumsum, aiOnnxOpset.cumsum, ARG(INT,exclusive) ARG(INT,reverse) ) // NOLINT +OP_DECL(popart_depthtospace, aiOnnxOpset.depthtospace, ARG(INT,blocksize) ARG(STRING,mode) ) // NOLINT +OP_DECL(popart_det, aiOnnxOpset.det, NONE) // NOLINT +OP_DECL(popart_dynamicquantizelinear, aiOnnxOpset.dynamicquantizelinear, NONE) // NOLINT +OP_DECL(popart_equal, aiOnnxOpset.equal, NONE) // NOLINT +OP_DECL(popart_flatten, aiOnnxOpset.flatten, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_gather, aiOnnxOpset.gather, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_gatherelements, aiOnnxOpset.gatherelements, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_gathernd, aiOnnxOpset.gathernd, NONE) // NOLINT +OP_DECL(popart_gemm, aiOnnxOpset.gemm, ARG(FLOAT,alpha) ARG(FLOAT,beta) ARG(INT,transA) ARG(INT,transB) ) // NOLINT +OP_DECL(popart_hardmax, aiOnnxOpset.hardmax, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_logsoftmax, aiOnnxOpset.logsoftmax, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_lppool, aiOnnxOpset.lppool, ARG(INT_VEC,kernel_shape) ARG(INT,p) ARG(INT_VEC,pads) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_maxpool, aiOnnxOpset.maxpool, ARG(INT,num_outputs) ARG(INT_VEC,kernel_shape) ARG(INT,ceil_mode) ARG(INT_VEC,dilations) ARG(INT_VEC,pads) ARG(INT,storage_order) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_maxunpool, aiOnnxOpset.maxunpool, ARG(INT_VEC,kernel_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_nonmaxsuppression, aiOnnxOpset.nonmaxsuppression, ARG(INT,center_point_box) ) // NOLINT +OP_DECL(popart_onehot, aiOnnxOpset.onehot, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_pad, aiOnnxOpset.pad, ARG(STRING,mode) ) // NOLINT +OP_DECL(popart_range, aiOnnxOpset.range, NONE) // NOLINT +OP_DECL(popart_reducel1, aiOnnxOpset.reducel1, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducel2, aiOnnxOpset.reducel2, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducelogsum, aiOnnxOpset.reducelogsum, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducelogsumexp, aiOnnxOpset.reducelogsumexp, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducemax, aiOnnxOpset.reducemax, 
OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducemean, aiOnnxOpset.reducemean, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducemin, aiOnnxOpset.reducemin, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reduceprod, aiOnnxOpset.reduceprod, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducesum, aiOnnxOpset.reducesum, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducesumsquare, aiOnnxOpset.reducesumsquare, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_resize, aiOnnxOpset.resize, ARG(STRING,coordinate_transformation_mode) ARG(FLOAT,cubic_coeff_a) ARG(INT,exclude_outside) ARG(FLOAT,extrapolation_value) ARG(STRING,mode) ARG(STRING,nearest_mode) ) // NOLINT +OP_DECL(popart_round, aiOnnxOpset.round, NONE) // NOLINT +OP_DECL(popart_scatter, aiOnnxOpset.scatter, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_scatterelements, aiOnnxOpset.scatterelements, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_scatternd, aiOnnxOpset.scatternd, NONE) // NOLINT +OP_DECL(popart_sequenceat, aiOnnxOpset.sequenceat, NONE) // NOLINT +OP_DECL(popart_sequenceconstruct, aiOnnxOpset.sequenceconstruct, NONE) // NOLINT +OP_DECL(popart_sequenceerase, aiOnnxOpset.sequenceerase, NONE) // NOLINT +OP_DECL(popart_sequenceinsert, aiOnnxOpset.sequenceinsert, NONE) // NOLINT +OP_DECL(popart_sequencelength, aiOnnxOpset.sequencelength, NONE) // NOLINT +OP_DECL(popart_slice, aiOnnxOpset.slice, NONE) // NOLINT +OP_DECL(popart_softmax, aiOnnxOpset.softmax, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_split, aiOnnxOpset.split, ARG(INT,num_outputs) ARG(INT,axis) ARG(INT_VEC,split) ) // NOLINT +OP_DECL(popart_splittosequence, aiOnnxOpset.splittosequence, ARG(INT,axis) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_squeeze, aiOnnxOpset.squeeze, ARG(INT_VEC,axes) ) // NOLINT +OP_DECL(popart_topk, aiOnnxOpset.topk, ARG(INT,axis) ARG(INT,largest) ARG(INT,sorted) ) // NOLINT +OP_DECL(popart_unique, aiOnnxOpset.unique, ARG(INT,num_outputs) OPT_ARG(INT,axis) ARG(INT,sorted) ) // NOLINT +OP_DECL(popart_unsqueeze, aiOnnxOpset.unsqueeze, ARG(INT_VEC,axes) ) // NOLINT +// Ops from AiOnnxOpset10 +OP_DECL(popart_convinteger, aiOnnxOpset.convinteger, ARG(INT_VEC,dilations) ARG(INT,group) ARG(INT_VEC,kernel_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_dequantizelinear, aiOnnxOpset.dequantizelinear, NONE) // NOLINT +OP_DECL(popart_dropout, aiOnnxOpset.dropout, ARG(INT,num_outputs) ARG(FLOAT,ratio) ) // NOLINT +OP_DECL(popart_isinf, aiOnnxOpset.isinf, ARG(INT,detect_negative) ARG(INT,detect_positive) ) // NOLINT +OP_DECL(popart_matmulinteger, aiOnnxOpset.matmulinteger, NONE) // NOLINT +OP_DECL(popart_mod, aiOnnxOpset.mod, ARG(INT,fmod) ) // NOLINT +OP_DECL(popart_qlinearconv, aiOnnxOpset.qlinearconv, ARG(INT_VEC,dilations) ARG(INT,group) ARG(INT_VEC,kernel_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_qlinearmatmul, aiOnnxOpset.qlinearmatmul, NONE) // NOLINT +OP_DECL(popart_quantizelinear, aiOnnxOpset.quantizelinear, NONE) // NOLINT +OP_DECL(popart_reversesequence, aiOnnxOpset.reversesequence, ARG(INT,batch_axis) ARG(INT,time_axis) ) // NOLINT +OP_DECL(popart_roialign, aiOnnxOpset.roialign, ARG(STRING,mode) ARG(INT,output_height) ARG(INT,output_width) ARG(INT,sampling_ratio) ARG(FLOAT,spatial_scale) ) // NOLINT +OP_DECL(popart_thresholdedrelu, aiOnnxOpset.thresholdedrelu, ARG(FLOAT,alpha) ) // NOLINT +OP_DECL(popart_upsample, aiOnnxOpset.upsample, ARG(STRING,mode) ) // NOLINT 
+// Ops from AiOnnxOpset9 +OP_DECL(popart_acosh, aiOnnxOpset.acosh, NONE) // NOLINT +OP_DECL(popart_asinh, aiOnnxOpset.asinh, NONE) // NOLINT +OP_DECL(popart_atanh, aiOnnxOpset.atanh, NONE) // NOLINT +OP_DECL(popart_batchnormalization, aiOnnxOpset.batchnormalization, ARG(INT,num_outputs) ARG(FLOAT,epsilon) ARG(FLOAT,momentum) ) // NOLINT +OP_DECL(popart_cast, aiOnnxOpset.cast, ARG(STRING,to) ) // NOLINT +OP_DECL(popart_cosh, aiOnnxOpset.cosh, NONE) // NOLINT +OP_DECL(popart_erf, aiOnnxOpset.erf, NONE) // NOLINT +OP_DECL(popart_eyelike, aiOnnxOpset.eyelike, OPT_ARG(INT,dtype) ARG(INT,k) ) // NOLINT +OP_DECL(popart_greater, aiOnnxOpset.greater, NONE) // NOLINT +OP_DECL(popart_isnan, aiOnnxOpset.isnan, NONE) // NOLINT +OP_DECL(popart_less, aiOnnxOpset.less, NONE) // NOLINT +OP_DECL(popart_matmul, aiOnnxOpset.matmul, NONE) // NOLINT +OP_DECL(popart_meanvariancenormalization, aiOnnxOpset.meanvariancenormalization, ARG(INT_VEC,axes) ) // NOLINT +OP_DECL(popart_nonzero, aiOnnxOpset.nonzero, NONE) // NOLINT +OP_DECL(popart_prelu, aiOnnxOpset.prelu, NONE) // NOLINT +OP_DECL(popart_shrink, aiOnnxOpset.shrink, ARG(FLOAT,bias) ARG(FLOAT,lambd) ) // NOLINT +OP_DECL(popart_sign, aiOnnxOpset.sign, NONE) // NOLINT +OP_DECL(popart_sinh, aiOnnxOpset.sinh, NONE) // NOLINT +OP_DECL(popart_where, aiOnnxOpset.where, NONE) // NOLINT +// Ops from AiOnnxOpset8 +OP_DECL(popart_expand, aiOnnxOpset.expand, NONE) // NOLINT +OP_DECL(popart_max, aiOnnxOpset.max, NONE) // NOLINT +OP_DECL(popart_mean, aiOnnxOpset.mean, NONE) // NOLINT +OP_DECL(popart_min, aiOnnxOpset.min, NONE) // NOLINT +OP_DECL(popart_sum, aiOnnxOpset.sum, NONE) // NOLINT +// Ops from AiOnnxOpset7 +OP_DECL(popart_acos, aiOnnxOpset.acos, NONE) // NOLINT +OP_DECL(popart_add, aiOnnxOpset.add, NONE) // NOLINT +OP_DECL(popart_logical_and, aiOnnxOpset.logical_and, NONE) // NOLINT +OP_DECL(popart_asin, aiOnnxOpset.asin, NONE) // NOLINT +OP_DECL(popart_atan, aiOnnxOpset.atan, NONE) // NOLINT +OP_DECL(popart_cos, aiOnnxOpset.cos, NONE) // NOLINT +OP_DECL(popart_div, aiOnnxOpset.div, NONE) // NOLINT +OP_DECL(popart_mul, aiOnnxOpset.mul, NONE) // NOLINT +OP_DECL(popart_multinomial, aiOnnxOpset.multinomial, ARG(INT,dtype) ARG(INT,sample_size) OPT_ARG(FLOAT,seed) ) // NOLINT +OP_DECL(popart_logical_or, aiOnnxOpset.logical_or, NONE) // NOLINT +OP_DECL(popart_pow, aiOnnxOpset.pow, NONE) // NOLINT +OP_DECL(popart_sin, aiOnnxOpset.sin, NONE) // NOLINT +OP_DECL(popart_sub, aiOnnxOpset.sub, NONE) // NOLINT +OP_DECL(popart_tan, aiOnnxOpset.tan, NONE) // NOLINT +OP_DECL(popart_logical_xor, aiOnnxOpset.logical_xor, NONE) // NOLINT +// Ops from AiOnnxOpset6 +OP_DECL(popart_abs, aiOnnxOpset.abs, NONE) // NOLINT +OP_DECL(popart_ceil, aiOnnxOpset.ceil, NONE) // NOLINT +OP_DECL(popart_elu, aiOnnxOpset.elu, ARG(FLOAT,alpha) ) // NOLINT +OP_DECL(popart_exp, aiOnnxOpset.exp, NONE) // NOLINT +OP_DECL(popart_floor, aiOnnxOpset.floor, NONE) // NOLINT +OP_DECL(popart_globalaveragepool, aiOnnxOpset.globalaveragepool, NONE) // NOLINT +OP_DECL(popart_globallppool, aiOnnxOpset.globallppool, ARG(INT,p) ) // NOLINT +OP_DECL(popart_globalmaxpool, aiOnnxOpset.globalmaxpool, NONE) // NOLINT +OP_DECL(popart_hardsigmoid, aiOnnxOpset.hardsigmoid, ARG(FLOAT,alpha) ARG(FLOAT,beta) ) // NOLINT +OP_DECL(popart_identity, aiOnnxOpset.identity, NONE) // NOLINT +OP_DECL(popart_instancenormalization, aiOnnxOpset.instancenormalization, ARG(FLOAT,epsilon) ) // NOLINT +OP_DECL(popart_lrn, aiOnnxOpset.lrn, ARG(INT,size) ARG(FLOAT,alpha) ARG(FLOAT,beta) ARG(FLOAT,bias) ) // NOLINT +OP_DECL(popart_leakyrelu, 
aiOnnxOpset.leakyrelu, ARG(FLOAT,alpha) ) // NOLINT +OP_DECL(popart_log, aiOnnxOpset.log, NONE) // NOLINT +OP_DECL(popart_lpnormalization, aiOnnxOpset.lpnormalization, ARG(INT,axis) ARG(INT,p) ) // NOLINT +OP_DECL(popart_maxroipool, aiOnnxOpset.maxroipool, ARG(INT_VEC,pooled_shape) ARG(FLOAT,spatial_scale) ) // NOLINT +OP_DECL(popart_neg, aiOnnxOpset.neg, NONE) // NOLINT +OP_DECL(popart_logical_not, aiOnnxOpset.logical_not, NONE) // NOLINT +OP_DECL(popart_randomnormallike, aiOnnxOpset.randomnormallike, OPT_ARG(INT,dtype) ARG(FLOAT,mean) ARG(FLOAT,scale) OPT_ARG(FLOAT,seed) ) // NOLINT +OP_DECL(popart_randomuniformlike, aiOnnxOpset.randomuniformlike, OPT_ARG(INT,dtype) ARG(FLOAT,high) ARG(FLOAT,low) OPT_ARG(FLOAT,seed) ) // NOLINT +OP_DECL(popart_reciprocal, aiOnnxOpset.reciprocal, NONE) // NOLINT +OP_DECL(popart_relu, aiOnnxOpset.relu, NONE) // NOLINT +OP_DECL(popart_reshape, aiOnnxOpset.reshape, NONE) // NOLINT +OP_DECL(popart_selu, aiOnnxOpset.selu, ARG(FLOAT,alpha) ARG(FLOAT,gamma) ) // NOLINT +OP_DECL(popart_shape, aiOnnxOpset.shape, NONE) // NOLINT +OP_DECL(popart_sigmoid, aiOnnxOpset.sigmoid, NONE) // NOLINT +OP_DECL(popart_size, aiOnnxOpset.size, NONE) // NOLINT +OP_DECL(popart_softplus, aiOnnxOpset.softplus, NONE) // NOLINT +OP_DECL(popart_softsign, aiOnnxOpset.softsign, NONE) // NOLINT +OP_DECL(popart_spacetodepth, aiOnnxOpset.spacetodepth, ARG(INT,blocksize) ) // NOLINT +OP_DECL(popart_sqrt, aiOnnxOpset.sqrt, NONE) // NOLINT +OP_DECL(popart_tanh, aiOnnxOpset.tanh, NONE) // NOLINT +OP_DECL(popart_tile, aiOnnxOpset.tile, NONE) // NOLINT +OP_DECL(popart_transpose, aiOnnxOpset.transpose, ARG(INT_VEC,perm) ) // NOLINT diff --git a/paddle/fluid/platform/device/npu/hccl_helper.h b/paddle/fluid/platform/device/npu/hccl_helper.h index f1ef8650be4c1..69cea31446680 100644 --- a/paddle/fluid/platform/device/npu/hccl_helper.h +++ b/paddle/fluid/platform/device/npu/hccl_helper.h @@ -66,11 +66,11 @@ inline HcclDataType ToHCCLDataType(framework::proto::VarType::Type type) { // inline HCCLGroupGuard() { // HCCLMutex().lock(); -// PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupStart()); +// PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupStart()); // } // inline ~HCCLGroupGuard() PADDLE_MAY_THROW { -// PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd()); +// PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupEnd()); // HCCLMutex().unlock(); // } // }; diff --git a/paddle/fluid/platform/device/xpu/xpu1_op_list.h b/paddle/fluid/platform/device/xpu/xpu1_op_list.h index 1cc7bba132e59..d6b466ff92c5b 100644 --- a/paddle/fluid/platform/device/xpu/xpu1_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu1_op_list.h @@ -29,40 +29,35 @@ using XPUOpMap = std::unordered_map; XPUOpMap& get_kl1_ops() { // KL1支持的op,通过op_name, data_type, place来索引 static XPUOpMap s_xpu1_kernels{ - {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sigmoid_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"gelu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"gelu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sqrt", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sqrt_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"square", 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"square_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"hard_switch", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"hard_switch_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"leaky_relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"leaky_relu_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"log", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"pow", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"abs", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"affine_channel", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"accuracy", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"affine_channel_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"affine_channel", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"assign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace())})}, - {"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"batch_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"bilinear_interp", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"bilinear_interp_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"bilinear_interp_v2", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"bilinear_interp_v2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"broadcast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, {"cast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, @@ -72,188 +67,197 @@ XPUOpMap& get_kl1_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, - {"c_reduce_sum", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"c_allreduce_sum", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"broadcast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, {"concat", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"concat_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"logicalor", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::INT16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"logicaland", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::INT16, XPUPlace()), - 
pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"logicalnot", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::INT16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"depthwise_conv2d", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"depthwise_conv2d_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"deformable_conv", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"deformable_conv_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"depthwise_conv2d", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"depthwise_conv2d_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"dropout", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"dropout_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_sub", + {"c_allreduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_sub_grad", + {"c_reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"elementwise_add", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_add_grad", + {"elementwise_div_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"elementwise_div", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_div_grad", + {"elementwise_floordiv", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_pow", + {"elementwise_max_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_floordiv", + {"elementwise_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_mul", + {"elementwise_min_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_min", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"elementwise_mul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_max", + {"elementwise_mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_max_grad", + {"elementwise_pow", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_min", + {"elementwise_sub_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_min_grad", + {"elementwise_sub", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"expand_as_v2", + XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"expand_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"fill_constant", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"gather", 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"gather_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"gather", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"gaussian_random", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"bilinear_interp", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"bilinear_interp_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"nearest_interp", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"nearest_interp_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"bilinear_interp_v2", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"bilinear_interp_v2_grad", + {"gelu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"gelu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"hard_switch_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"nearest_interp_v2", + {"hard_switch", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"iou_similarity", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"nearest_interp_v2_grad", + {"lamb", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"layer_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"layer_norm_grad", + {"leaky_relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"leaky_relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"load", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"log_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logicaland", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::INT16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logicalnot", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::INT16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logicalor", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::INT16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"log_loss_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"lookup_table_v2", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logsumexp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"lookup_table_v2_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"lookup_table_v2", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"matmul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul_v2", XPUKernelSet({pOpKernelType(vartype::FP32, 
XPUPlace())})}, {"matmul_v2_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"matmul_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"matmul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"mean_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"accuracy", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"momuntem", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"mul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"one_hot", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, + {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"nearest_interp_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"nearest_interp_v2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"nearest_interp_v2", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"nearest_interp", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"one_hot_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace())})}, - {"sgd", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"rmsprop", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"lamb", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"pool2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"one_hot", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, {"pool2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"pool2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"pow", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"range", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_sum_grad", + {"reduce_max_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"logsumexp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_max_grad", + {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_sum_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reshape2_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, 
XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"rmsprop", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"rnn_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"roi_align", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"roi_align_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"roi_align", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sgd", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"shape", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sigmoid_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"sign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"slice", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, - {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"softmax_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"softmax_with_cross_entropy", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"squeeze", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"squeeze_grad", + {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sqrt_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sqrt", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"square_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"square", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"squeeze2_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), @@ -268,7 +272,7 @@ XPUOpMap& get_kl1_ops() { pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"squeeze2_grad", + {"squeeze_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), @@ -276,27 +280,29 @@ XPUOpMap& get_kl1_ops() { pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"squeeze", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + 
pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"top_k", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"transpose", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"transpose_grad", + {"transpose2_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"transpose2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"transpose2_grad", + {"transpose_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"transpose", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"truncated_gaussian_random", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"uniform_random", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"unsqueeze", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"unsqueeze_grad", + {"unsqueeze2_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), @@ -311,7 +317,7 @@ XPUOpMap& get_kl1_ops() { pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"unsqueeze2_grad", + {"unsqueeze_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), @@ -319,21 +325,13 @@ XPUOpMap& get_kl1_ops() { pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"momuntem", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"iou_similarity", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"expand_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + {"unsqueeze", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"expand_as_v2", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, // AddMore }; diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 78fc53cfc8535..74f519c7a8617 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -29,141 +29,109 @@ using XPUOpMap = std::unordered_map; XPUOpMap& get_kl2_ops() { // KL1支持的op,通过op_name, data_type, place来索引 static XPUOpMap 
s_xpu2_kernels{ - {"label_smooth", + {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"assign_value", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_sub", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_sub_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_add", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, + {"batch_norm_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"cast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, + {"clip", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"concat_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"concat", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"depthwise_conv2d_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"depthwise_conv2d", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"dropout_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"dropout", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"elementwise_add_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_div", + {"elementwise_add", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, + {"elementwise_div_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"elementwise_div_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_pow", + {"elementwise_div", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_div", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"elementwise_floordiv", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_mul", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_mul_grad", + {"elementwise_max_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"elementwise_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_max_grad", + {"elementwise_min_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"elementwise_min", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_min_grad", + {"elementwise_mul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), 
pOpKernelType(vartype::FP16, XPUPlace())})}, - {"momentum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"batch_norm_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"layer_norm_grad", + {"elementwise_mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"mean_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_sum_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"softmax_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"softmax_with_cross_entropy", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"softmax_with_cross_entropy_grad", + {"elementwise_pow", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"transpose", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"transpose_grad", + {"elementwise_sub_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"transpose2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"transpose2_grad", + {"elementwise_sub", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"iou_similarity", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_mean_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"slice", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, {"equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"not_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"less_than", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"less_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"greater_than", - XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), + 
{"expand_as_v2", + XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"greater_equal", + {"expand_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"fill_any_like", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"clip", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"cast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"fill_any_like", + {"fill_constant", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT16, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"flatten", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"flatten_grad", + pOpKernelType(vartype::BF16, XPUPlace()), + pOpKernelType(vartype::COMPLEX64, XPUPlace()), + pOpKernelType(vartype::COMPLEX128, XPUPlace())})}, + {"flatten2_grad", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT8, XPUPlace()), @@ -172,123 +140,205 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"flatten2_grad", + {"flatten_contiguous_range_grad", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul_v2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"assign_value", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"dropout", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"dropout_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_div", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_div_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"range", 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reshape2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"shape", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"one_hot_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"layer_norm_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"lookup_table_v2", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"lookup_table_v2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"flatten_contiguous_range", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"flatten_contiguous_range_grad", + {"flatten_grad", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + {"flatten", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"gather_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"gather_nd", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"gather", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"gaussian_random", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"gelu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"gelu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"gelu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"gather", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"gather_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"fill_constant", + {"greater_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT16, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::BF16, XPUPlace()), - pOpKernelType(vartype::COMPLEX64, 
XPUPlace()), - pOpKernelType(vartype::COMPLEX128, XPUPlace())})}, - {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"softmax_grad", + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"greater_than", + XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"iou_similarity", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"label_smooth", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"layer_norm_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"layer_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"gather_nd", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), + {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"less_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"less_than", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"tile", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"where", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"where_index", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"lookup_table_v2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"lookup_table_v2", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"masked_select", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"expand_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), + {"matmul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"matmul_v2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"matmul_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"matmul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"mean_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"momentum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"not_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"expand_as_v2", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + {"one_hot_v2", 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, + {"pool2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"pool2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"prior_box", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"range", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, + {"reduce_max_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_mean_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_sum_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reshape2_grad", + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"depthwise_conv2d", + {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, + {"shape", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, + {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, + {"slice", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, + {"softmax_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"depthwise_conv2d_grad", + {"softmax_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"softmax_with_cross_entropy_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"softmax_with_cross_entropy", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"squeeze2_grad", + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + 
{"squeeze2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, + {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"tile", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"transpose2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"transpose2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"transpose_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"transpose", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"unsqueeze2_grad", + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"unsqueeze2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"where_index", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"where", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, // AddMore }; diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index c2dc60a29fe42..a0c9ff09460af 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -370,10 +370,10 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { char* scratch = static_cast(scratchpad()) + Eigen::kGpuScratchSize; semaphore_ = reinterpret_cast(scratch); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_)); #endif } @@ -439,14 +439,14 @@ CUDAContext::~CUDAContext() { CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { CUDADeviceGuard guard(place_.device); - compute_capability_ = GetCUDAComputeCapability(place_.device); - multi_process_ = 
GetCUDAMultiProcessors(place_.device); - max_threads_per_mp_ = GetCUDAMaxThreadsPerMultiProcessor(place_.device); + compute_capability_ = GetGPUComputeCapability(place_.device); + multi_process_ = GetGPUMultiProcessors(place_.device); + max_threads_per_mp_ = GetGPUMaxThreadsPerMultiProcessor(place_.device); max_grid_dim_size_ = GetGpuMaxGridDimSize(place_.device); - max_threads_per_block_ = GetCUDAMaxThreadsPerBlock(place_.device); + max_threads_per_block_ = GetGPUMaxThreadsPerBlock(place_.device); - driver_version_ = GetCUDADriverVersion(place_.device); - runtime_version_ = GetCUDARuntimeVersion(place_.device); + driver_version_ = GetGPUDriverVersion(place_.device); + runtime_version_ = GetGPURuntimeVersion(place_.device); LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " << place_.device << ", GPU Compute Capability: " @@ -459,7 +459,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { << (runtime_version_ % 100) / 10; #ifdef PADDLE_WITH_HIP size_t version_major, version_minor, version_patch; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( &version_major, &version_minor, &version_patch)); LOG_FIRST_N(WARNING, 1) << "device: " << place_.device << ", MIOpen Version: " << version_major << "." @@ -499,7 +499,7 @@ CUDADeviceContext::~CUDADeviceContext() { SetDeviceId(place_.device); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nccl_comm_) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_)); } #endif } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 73232994516b6..875132dfe89c4 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/memory/malloc.h" #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_helper.h" #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/cusolver.h" @@ -28,17 +28,17 @@ limitations under the License. */ #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/dynload/nccl.h" #endif -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif #ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/cuda_helper.h" // NOLINT +#include "paddle/fluid/platform/device/gpu/gpu_helper.h" // NOLINT #include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/dynload/rocblas.h" #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/dynload/rccl.h" #endif -#include "paddle/fluid/platform/gpu_info.h" // NOLINT +#include "paddle/fluid/platform/device/gpu/gpu_info.h" // NOLINT #endif #if defined(PADDLE_WITH_XPU_BKCL) @@ -62,6 +62,9 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/npu/enforce_npu.h" #include "paddle/fluid/platform/device/npu/npu_stream.h" #endif +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/device.h" +#endif #include "unsupported/Eigen/CXX11/Tensor" namespace Eigen { @@ -99,8 +102,8 @@ enum DeviceType { CUDA = 1, XPU = 2, NPU = 3, - - MAX_DEVICE_TYPES = 4, + IPU = 4, + MAX_DEVICE_TYPES = 5, }; DeviceType Place2DeviceType(const platform::Place& place); @@ -109,6 +112,7 @@ constexpr DeviceType kCPU = DeviceType::CPU; constexpr DeviceType kCUDA = DeviceType::CUDA; constexpr DeviceType kXPU = DeviceType::XPU; constexpr DeviceType kNPU = DeviceType::NPU; +constexpr DeviceType kIPU = DeviceType::IPU; class DeviceContext { public: @@ -140,6 +144,30 @@ struct DefaultDeviceContextType { using TYPE = CPUDeviceContext; }; +// Graphcore IPU +#ifdef PADDLE_WITH_IPU +class IPUDeviceContext : public DeviceContext { + public: + IPUDeviceContext() = delete; + explicit IPUDeviceContext(IPUPlace place); + virtual ~IPUDeviceContext(); + Eigen::DefaultDevice* eigen_device() const { return nullptr; } + Place GetPlace() const override; + /*! \brief Wait for all operations completion in the stream. */ + void Wait() const override; + int DeviceId() const { return device_.getId(); } + + private: + IPUPlace place_; + platform::ipu::Device device_; +}; +template <> +struct DefaultDeviceContextType { + using TYPE = IPUDeviceContext; +}; + +#endif + #ifdef PADDLE_WITH_XPU namespace xpu = baidu::xpu::api; class XPUDeviceContext : public DeviceContext { @@ -371,7 +399,7 @@ class CUDAContext { if (dynload::HasCUDNN()) { #ifdef PADDLE_WITH_HIP size_t miopen_major, miopen_minor, miopen_patch; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( &miopen_major, &miopen_minor, &miopen_patch)); auto local_miopen_version = (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10; @@ -388,8 +416,8 @@ class CUDAContext { << "Please recompile or reinstall Paddle with compatible MIOPEN " "version."; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreate(&cudnn_handle_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(&cudnn_handle_)); + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenSetStream(cudnn_handle_, RawStream())); #else auto local_cudnn_version = dynload::cudnnGetVersion() / 100; @@ -425,9 +453,9 @@ class CUDAContext { void DestoryCuDNNContext() { if (cudnn_handle_) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroy(cudnn_handle_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroy(cudnn_handle_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroy(cudnn_handle_)); #endif } cudnn_handle_ = nullptr; @@ -442,7 +470,7 @@ class CUDAContext { #ifndef PADDLE_WITH_HIP void DestoryCuSolverContext() { if (cusolver_dn_handle_) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cusolverDnDestroy(cusolver_dn_handle_)); } } diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu index 2f9413c4f3ea7..cf617a478eb71 100644 --- a/paddle/fluid/platform/device_context_test.cu +++ b/paddle/fluid/platform/device_context_test.cu @@ -23,7 +23,7 @@ TEST(Device, Init) { using paddle::platform::CUDADeviceContext; using paddle::platform::CUDAPlace; - int count = paddle::platform::GetCUDADeviceCount(); + int count = paddle::platform::GetGPUDeviceCount(); for (int i = 0; i < count; i++) 
{ CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); Eigen::GpuDevice* gpu_device = device_context->eigen_device(); @@ -36,7 +36,7 @@ TEST(Device, CUDADeviceContext) { using paddle::platform::CUDADeviceContext; using paddle::platform::CUDAPlace; - int count = paddle::platform::GetCUDADeviceCount(); + int count = paddle::platform::GetGPUDeviceCount(); for (int i = 0; i < count; i++) { CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); Eigen::GpuDevice* gpu_device = device_context->eigen_device(); @@ -70,7 +70,7 @@ TEST(Device, DeviceContextPool) { ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1); std::vector gpu_places; - int count = paddle::platform::GetCUDADeviceCount(); + int count = paddle::platform::GetGPUDeviceCount(); for (int i = 0; i < count; ++i) { auto dev_ctx = pool.Get(CUDAPlace(i)); ASSERT_NE(dev_ctx, nullptr); diff --git a/paddle/fluid/platform/device_memory_aligment.h b/paddle/fluid/platform/device_memory_aligment.h index f42eb7ece1a72..a3f88592b7649 100644 --- a/paddle/fluid/platform/device_memory_aligment.h +++ b/paddle/fluid/platform/device_memory_aligment.h @@ -17,12 +17,10 @@ limitations under the License. */ #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/place.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/gpu_info.h" -#endif #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/device/npu/npu_info.h" #endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h index f72eb6731f627..34845f24ff50d 100644 --- a/paddle/fluid/platform/dynload/miopen.h +++ b/paddle/fluid/platform/dynload/miopen.h @@ -25,6 +25,12 @@ limitations under the License. */ (MIOPEN_VERSION_MAJOR * 1000 + MIOPEN_VERSION_MINOR * 10 + \ MIOPEN_VERSION_PATCH) // NOLINT +// MIOPEN only support NCHW, just for compatibility with CUDNN API +typedef enum { + MIOPEN_TENSOR_NCHW = 0, + MIOPEN_TENSOR_NHWC = 1, +} miopenTensorFormat_t; + namespace paddle { namespace platform { namespace dynload { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 86f71fdf64fba..530ae6ba79889 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -96,7 +96,7 @@ limitations under the License. 
*/ #include "paddle/fluid/imperative/type_defs.h" // Note: this header for simplify HIP and CUDA type string #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/type_defs.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #endif #include "paddle/fluid/platform/flags.h" @@ -944,7 +944,7 @@ inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { } #endif // not(__APPLE__) and PADDLE_WITH_NCCL -#define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \ +#define PADDLE_ENFORCE_GPU_SUCCESS(COND) \ do { \ auto __cond__ = (COND); \ using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ @@ -1150,7 +1150,7 @@ DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess); } // namespace details -#define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \ +#define PADDLE_ENFORCE_GPU_SUCCESS(COND) \ do { \ auto __cond__ = (COND); \ using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 6ff9e6ea903cd..b9e4239299169 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -294,14 +294,14 @@ TEST(EOF_EXCEPTION, THROW_EOF) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template bool CheckCudaStatusSuccess(T value, const std::string& msg = "success") { - PADDLE_ENFORCE_CUDA_SUCCESS(value); + PADDLE_ENFORCE_GPU_SUCCESS(value); return true; } template bool CheckCudaStatusFailure(T value, const std::string& msg) { try { - PADDLE_ENFORCE_CUDA_SUCCESS(value); + PADDLE_ENFORCE_GPU_SUCCESS(value); return false; } catch (paddle::platform::EnforceNotMet& error) { std::string ex_msg = error.what(); diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index 2b11de48a1ec7..136dc2d725208 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -148,9 +148,9 @@ class CudaEvent { void Record(const paddle::platform::stream::CUDAStream& stream) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event_, stream.raw_stream())); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream.raw_stream())); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, stream.raw_stream())); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream.raw_stream())); #endif } @@ -172,15 +172,15 @@ class CudaEvent { return false; } #endif - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(err); return false; } void Synchronize() { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventSynchronize(event_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(event_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventSynchronize(event_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_)); #endif } gpuEvent_t GetRawCudaEvent() { return event_; } diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index 6e5c7f4e91660..5518dabbf92a4 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/gpu_launch_config.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc deleted file mode 100644 index 9dc6254234a97..0000000000000 --- a/paddle/fluid/platform/gpu_info.cc +++ /dev/null @@ -1,734 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/gpu_info.h" -#include -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/platform/cuda_device_guard.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/dynload/miopen.h" -#else -#include "paddle/fluid/platform/cuda_graph.h" -#include "paddle/fluid/platform/dynload/cudnn.h" -#endif -#include "paddle/fluid/memory/malloc.h" -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 10020 -#include "paddle/fluid/platform/dynload/cuda_driver.h" -#endif -#endif -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/lock_guard_ptr.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/fluid/platform/monitor.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" - -DECLARE_double(fraction_of_gpu_memory_to_use); -DECLARE_uint64(initial_gpu_memory_in_mb); -DECLARE_uint64(reallocate_gpu_memory_in_mb); -DECLARE_bool(enable_cublas_tensor_op_math); -DECLARE_string(selected_gpus); -DECLARE_uint64(gpu_memory_limit_mb); - -constexpr static float fraction_reserve_gpu_memory = 0.05f; - -static std::once_flag g_device_props_size_init_flag; -static std::vector> g_device_props_init_flags; -static std::vector g_device_props; - -USE_GPU_MEM_STAT; -namespace paddle { -namespace platform { - -int CudnnVersion() { - if (!dynload::HasCUDNN()) return -1; - -#ifdef PADDLE_WITH_HIP - size_t version_major, version_minor, version_patch; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion( - &version_major, &version_minor, &version_patch)); - return version_major * 100 + version_minor * 10 + version_patch; -#else - return dynload::cudnnGetVersion(); -#endif -} -static int GetCUDADeviceCountImpl() { - int driverVersion = 0; -#ifdef PADDLE_WITH_HIP - hipError_t status = hipDriverGetVersion(&driverVersion); -#else - cudaError_t status = cudaDriverGetVersion(&driverVersion); -#endif - - if (!(status == gpuSuccess && driverVersion != 0)) { - // No GPU driver - VLOG(2) << "GPU Driver Version can't be detected. 
No GPU driver!"; - return 0; - } - -#ifdef PADDLE_WITH_HIP - const auto *cuda_visible_devices = std::getenv("HIP_VISIBLE_DEVICES"); -#else - const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES"); -#endif - if (cuda_visible_devices != nullptr) { - std::string cuda_visible_devices_str(cuda_visible_devices); - if (!cuda_visible_devices_str.empty()) { - cuda_visible_devices_str.erase( - 0, cuda_visible_devices_str.find_first_not_of('\'')); - cuda_visible_devices_str.erase( - cuda_visible_devices_str.find_last_not_of('\'') + 1); - cuda_visible_devices_str.erase( - 0, cuda_visible_devices_str.find_first_not_of('\"')); - cuda_visible_devices_str.erase( - cuda_visible_devices_str.find_last_not_of('\"') + 1); - } - if (std::all_of(cuda_visible_devices_str.begin(), - cuda_visible_devices_str.end(), - [](char ch) { return ch == ' '; })) { - VLOG(2) << "CUDA_VISIBLE_DEVICES or HIP_VISIBLE_DEVICES is set to be " - "empty. No GPU detected."; - return 0; - } - } - int count; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipGetDeviceCount(&count)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDeviceCount(&count)); -#endif - return count; -} - -int GetCUDADeviceCount() { - // cache the count - static auto dev_cnt = GetCUDADeviceCountImpl(); - return dev_cnt; -} - -/* Here is a very simple CUDA “pro tip”: cudaDeviceGetAttribute() is a much -faster way to query device properties. You can see details in -https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/ -*/ -int GetCUDAComputeCapability(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); - int major, minor; - -#ifdef PADDLE_WITH_HIP - auto major_error_code = hipDeviceGetAttribute( - &major, hipDeviceAttributeComputeCapabilityMajor, id); - auto minor_error_code = hipDeviceGetAttribute( - &minor, hipDeviceAttributeComputeCapabilityMinor, id); -#else - auto major_error_code = - cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id); - auto minor_error_code = - cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id); -#endif - PADDLE_ENFORCE_CUDA_SUCCESS(major_error_code); - PADDLE_ENFORCE_CUDA_SUCCESS(minor_error_code); -#ifdef PADDLE_WITH_HIP - return major * 100 + minor; -#else - return major * 10 + minor; -#endif -} - -dim3 GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, GetCUDADeviceCount())); - dim3 ret; - int size; -#ifdef PADDLE_WITH_HIP - auto error_code_x = - hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimX, id); -#else - auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id); -#endif - PADDLE_ENFORCE_CUDA_SUCCESS(error_code_x); - ret.x = size; - -#ifdef PADDLE_WITH_HIP - auto error_code_y = - hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimY, id); -#else - auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id); -#endif - PADDLE_ENFORCE_CUDA_SUCCESS(error_code_y); - ret.y = size; - -#ifdef PADDLE_WITH_HIP - auto error_code_z = - hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimZ, id); -#else - auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id); -#endif - PADDLE_ENFORCE_CUDA_SUCCESS(error_code_z); - ret.z = size; - return ret; -} - -int GetCUDARuntimeVersion(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); - int runtime_version = 0; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipRuntimeGetVersion(&runtime_version)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaRuntimeGetVersion(&runtime_version)); -#endif - return runtime_version; -} - -int GetCUDADriverVersion(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); - int driver_version = 0; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipDriverGetVersion(&driver_version)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDriverGetVersion(&driver_version)); -#endif - return driver_version; -} - -bool TensorCoreAvailable() { -#if !defined(PADDLE_WITH_HIP) && CUDA_VERSION >= 9000 - int device = GetCurrentDeviceId(); - int driver_version = GetCUDAComputeCapability(device); - return driver_version >= 70; -#else - return false; -#endif -} - -int GetCUDAMultiProcessors(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); - int count; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, id)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id)); -#endif - return count; -} - -int GetCUDAMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); - int count; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceGetAttribute( - &count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceGetAttribute( - &count, cudaDevAttrMaxThreadsPerMultiProcessor, id)); -#endif - return count; -} - -int GetCUDAMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, GetCUDADeviceCount())); - int count; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - hipDeviceGetAttribute(&count, hipDeviceAttributeMaxThreadsPerBlock, id)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id)); -#endif - return count; -} - -int GetCurrentDeviceId() { - int device_id; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipGetDevice(&device_id)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDevice(&device_id)); -#endif - return device_id; -} - -//! Get a list of device ids from environment variable or use all. -std::vector GetSelectedDevices() { - // use user specified GPUs in single-node multi-process mode. - std::vector devices; - if (!FLAGS_selected_gpus.empty()) { - auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ','); - for (auto id : devices_str) { - devices.push_back(atoi(id.c_str())); - } - } else { - int count = GetCUDADeviceCount(); - for (int i = 0; i < count; ++i) { - devices.push_back(i); - } - } - return devices; -} - -const gpuDeviceProp &GetDeviceProperties(int id) { - std::call_once(g_device_props_size_init_flag, [&] { - int gpu_num = 0; - gpu_num = platform::GetCUDADeviceCount(); - g_device_props_init_flags.resize(gpu_num); - g_device_props.resize(gpu_num); - for (int i = 0; i < gpu_num; ++i) { - g_device_props_init_flags[i] = std::make_unique(); - } - }); - - if (id == -1) { - id = platform::GetCurrentDeviceId(); - } - - if (id < 0 || id >= static_cast(g_device_props.size())) { - PADDLE_THROW(platform::errors::OutOfRange( - "The device id %d is out of range [0, %d), where %d is the number of " - "devices on this machine. Because the device id should be greater than " - "or equal to zero and smaller than the number of gpus. Please input " - "appropriate device again!", - id, static_cast(g_device_props.size()), - static_cast(g_device_props.size()))); - } - - std::call_once(*(g_device_props_init_flags[id]), [&] { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaGetDeviceProperties(&g_device_props[id], id)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS( - hipGetDeviceProperties(&g_device_props[id], id)); -#endif - }); - - return g_device_props[id]; -} - -void SetDeviceId(int id) { - // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, GetCUDADeviceCount())); -#ifdef PADDLE_WITH_HIP - PADDLE_RETRY_CUDA_SUCCESS(hipSetDevice(id)); -#else - PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id)); -#endif -} - -void GpuMemoryUsage(size_t *available, size_t *total) { - size_t actual_available, actual_total; - RecordedCudaMemGetInfo(available, total, &actual_available, &actual_total, - platform::GetCurrentDeviceId()); -} - -size_t GpuAvailableMemToAlloc() { - size_t total = 0; - size_t available = 0; - GpuMemoryUsage(&available, &total); - size_t reserving = - static_cast(fraction_reserve_gpu_memory * available); - // If available size is less than minimum chunk size, no usable memory exists - size_t available_to_alloc = available - reserving; - size_t min_chunk_size = GpuMinChunkSize(); - if (available_to_alloc < min_chunk_size) { - available_to_alloc = 0; - } - VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20) - << "M, " << (available_to_alloc >> 20) << "M available to allocate"; - return available_to_alloc; -} - -size_t GpuMaxAllocSize() { - return std::max(GpuInitAllocSize(), GpuReallocSize()); -} - -static size_t GpuAllocSize(bool realloc) { - size_t available_to_alloc = GpuAvailableMemToAlloc(); - PADDLE_ENFORCE_GT( - available_to_alloc, 0, - platform::errors::ResourceExhausted("Not enough available GPU memory.")); - // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be - // allocated by fraction - size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb - : FLAGS_initial_gpu_memory_in_mb; - size_t alloc_bytes = - (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc * - FLAGS_fraction_of_gpu_memory_to_use); - PADDLE_ENFORCE_GE( - available_to_alloc, alloc_bytes, - platform::errors::ResourceExhausted("Not enough available GPU memory.")); - VLOG(10) << "Alloc size is " << (alloc_bytes >> 20) - << " MiB, is it Re-alloc: " << realloc; - return alloc_bytes; -} - -size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); } - -size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); } - -size_t GpuMinChunkSize() { - // Allow to allocate the minimum chunk size is 256 bytes. 
- return 1 << 8; -} - -size_t GpuMaxChunkSize() { - size_t max_chunk_size = GpuMaxAllocSize(); - VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M"; - return max_chunk_size; -} - -#ifdef PADDLE_WITH_HIP -void GpuMemcpyAsync(void *dst, const void *src, size_t count, - enum hipMemcpyKind kind, hipStream_t stream) { - PADDLE_ENFORCE_CUDA_SUCCESS(hipMemcpyAsync(dst, src, count, kind, stream)); -} -#else -void GpuMemcpyAsync(void *dst, const void *src, size_t count, - enum cudaMemcpyKind kind, cudaStream_t stream) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream)); -} -#endif - -#ifdef PADDLE_WITH_HIP -void GpuMemcpySync(void *dst, const void *src, size_t count, - enum hipMemcpyKind kind) { - PADDLE_ENFORCE_CUDA_SUCCESS(hipMemcpy(dst, src, count, kind)); -} -#else -void GpuMemcpySync(void *dst, const void *src, size_t count, - enum cudaMemcpyKind kind) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy(dst, src, count, kind)); -} -#endif - -void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, - int src_device, size_t count, gpuStream_t stream) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - hipMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); -#endif -} - -void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, - int src_device, size_t count) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - hipMemcpyPeer(dst, dst_device, src, src_device, count)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemcpyPeer(dst, dst_device, src, src_device, count)); -#endif -} - -void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipMemsetAsync(dst, value, count, stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync(dst, value, count, stream)); -#endif -} - -void GpuStreamSync(gpuStream_t stream) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif -} - -static void RaiseNonOutOfMemoryError(gpuError_t *status) { -#ifdef PADDLE_WITH_HIP - if (*status == hipErrorOutOfMemory) { - *status = hipSuccess; - } -#else - if (*status == cudaErrorMemoryAllocation) { - *status = cudaSuccess; - } -#endif - PADDLE_ENFORCE_CUDA_SUCCESS(*status); - -#ifdef PADDLE_WITH_HIP - *status = hipGetLastError(); - if (*status == hipErrorOutOfMemory) { - *status = hipSuccess; - } -#else - *status = cudaGetLastError(); - if (*status == cudaErrorMemoryAllocation) { - *status = cudaSuccess; - } -#endif - PADDLE_ENFORCE_CUDA_SUCCESS(*status); -} - -class RecordedCudaMallocHelper { - private: - explicit RecordedCudaMallocHelper(int dev_id, uint64_t limit_size = 0) - : dev_id_(dev_id), limit_size_(limit_size) { - if (NeedRecord()) { - mtx_.reset(new std::mutex()); - } - } - - DISABLE_COPY_AND_ASSIGN(RecordedCudaMallocHelper); - - public: - static RecordedCudaMallocHelper *Instance(int dev_id) { - std::call_once(once_flag_, [] { - int dev_cnt = GetCUDADeviceCount(); - instances_.reserve(dev_cnt); - for (int i = 0; i < dev_cnt; ++i) { - instances_.emplace_back( - new RecordedCudaMallocHelper(i, FLAGS_gpu_memory_limit_mb << 20)); - } - }); - - PADDLE_ENFORCE_GE( - dev_id, 0, - platform::errors::OutOfRange( - "Device id must be not less than 0, but got %d.", dev_id)); - PADDLE_ENFORCE_LT( - dev_id, instances_.size(), - 
platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.", - dev_id, instances_.size())); - return instances_[dev_id].get(); - } - - /** - * Try to allocate `size` gpu memory. Only cudaErrorMemoryAllocation - * or cudaSuccess would be returned, and the cudaGetLastError() flag - * would be clear. - */ - gpuError_t Malloc(void **ptr, size_t size) { - LockGuardPtr lock(mtx_); - if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) { -#ifdef PADDLE_WITH_HIP - return hipErrorOutOfMemory; -#else - return cudaErrorMemoryAllocation; -#endif - } - - CUDADeviceGuard guard(dev_id_); -#ifdef PADDLE_WITH_HIP - auto result = hipMalloc(ptr, size); -#else - CUDAGraphCaptureModeGuard capture_mode_guard; - auto result = cudaMalloc(ptr, size); -#endif - if (result == gpuSuccess) { - cur_size_.fetch_add(size); - STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); - return gpuSuccess; - } else { - RaiseNonOutOfMemoryError(&result); -// Non out of memory error would be raised inside -// RaiseNonOutOfMemoryError. Therefore, we can -// return cudaErrorMemoryAllocation directly here. -#ifdef PADDLE_WITH_HIP - return hipErrorOutOfMemory; -#else - return cudaErrorMemoryAllocation; -#endif - } - } - - /** - * Free gpu memory. Usually, free is not allowed to raise error. - * If it does raise error, the process should be crashed. - */ - void Free(void *ptr, size_t size) { - // Purposefully allow cudaErrorCudartUnloading, because - // that is returned if you ever call cudaFree after the - // driver has already shutdown. This happens only if the - // process is terminating, in which case we don't care if - // cudaFree succeeds. - CUDADeviceGuard guard(dev_id_); -#ifdef PADDLE_WITH_HIP - auto err = hipFree(ptr); - if (err != hipErrorDeinitialized) { -#else - auto err = cudaFree(ptr); - if (err != cudaErrorCudartUnloading) { -#endif - PADDLE_ENFORCE_CUDA_SUCCESS(err); - cur_size_.fetch_sub(size); - STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); - } else { -#ifdef PADDLE_WITH_HIP - hipGetLastError(); // clear the error flag when hipErrorDeinitialized -#else - cudaGetLastError(); // clear the error flag when cudaErrorCudartUnloading -#endif - } - } - - bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail, - size_t *actual_total) { - { - CUDADeviceGuard guard(dev_id_); -#ifdef PADDLE_WITH_HIP - auto result = hipMemGetInfo(actual_avail, actual_total); -#else - auto result = cudaMemGetInfo(actual_avail, actual_total); -#endif - if (result != gpuSuccess) { - *actual_avail = 0; - } - RaiseNonOutOfMemoryError(&result); - } - - if (NeedRecord()) { - std::lock_guard guard(*mtx_); - *avail = std::min(*actual_avail, limit_size_ - cur_size_.load()); - *total = std::min(*actual_total, limit_size_); - return *total < *actual_total; - } else { - *avail = *actual_avail; - *total = *actual_total; - return false; - } - } - - inline bool NeedRecord() const { return limit_size_ != 0; } - - uint64_t RecordedSize() const { return cur_size_.load(); } - - uint64_t LimitSize() const { return limit_size_; } - -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 10020 - CUresult MemCreate(CUmemGenericAllocationHandle *handle, size_t size, - const CUmemAllocationProp *prop, - unsigned long long flags) { // NOLINT - auto result = - paddle::platform::dynload::cuMemCreate(handle, size, prop, flags); - if (result == CUDA_SUCCESS) { - cur_size_.fetch_add(size); - } - return result; - } - - CUresult MemRelease(CUmemGenericAllocationHandle handle, size_t size) { - auto result = 
paddle::platform::dynload::cuMemRelease(handle); - if (result == CUDA_SUCCESS) { - cur_size_.fetch_sub(size); - } - return result; - } - -#endif -#endif - - private: - const int dev_id_; - const uint64_t limit_size_; - std::atomic cur_size_{0}; - - mutable std::unique_ptr mtx_; - - static std::once_flag once_flag_; - static std::vector> instances_; -}; // NOLINT - -std::once_flag RecordedCudaMallocHelper::once_flag_; -std::vector> - RecordedCudaMallocHelper::instances_; - -gpuError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id) { - return RecordedCudaMallocHelper::Instance(dev_id)->Malloc(ptr, size); -} - -void RecordedCudaFree(void *p, size_t size, int dev_id) { - return RecordedCudaMallocHelper::Instance(dev_id)->Free(p, size); -} - -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 10020 -CUresult RecordedCuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, - const CUmemAllocationProp *prop, - unsigned long long flags, int dev_id) { // NOLINT - return RecordedCudaMallocHelper::Instance(dev_id)->MemCreate(handle, size, - prop, flags); -} - -CUresult RecordedCuMemRelease(CUmemGenericAllocationHandle handle, size_t size, - int dev_id) { - return RecordedCudaMallocHelper::Instance(dev_id)->MemRelease(handle, size); -} -#endif -#endif - -bool RecordedCudaMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail, - size_t *actual_total, int dev_id) { - return RecordedCudaMallocHelper::Instance(dev_id)->GetMemInfo( - avail, total, actual_avail, actual_total); -} - -uint64_t RecordedCudaMallocSize(int dev_id) { - return RecordedCudaMallocHelper::Instance(dev_id)->RecordedSize(); -} - -bool IsCudaMallocRecorded(int dev_id) { - return RecordedCudaMallocHelper::Instance(dev_id)->NeedRecord(); -} - -void EmptyCache(void) { - std::vector devices = GetSelectedDevices(); - for (auto device : devices) { - memory::Release(CUDAPlace(device)); - } -} - -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 698563a53d255..b642f160da21a 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -45,6 +45,10 @@ limitations under the License. */ #include "DbgHelp.h" #endif +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#endif + DECLARE_int32(paddle_num_threads); PADDLE_DEFINE_EXPORTED_int32( multiple_of_cupti_buffer_size, 1, @@ -164,6 +168,15 @@ void InitDevices() { LOG(WARNING) << "Compiled with PADDLE_WITH_ASCEND_CL, but no NPU found in runtime."; } +#endif +#ifdef PADDLE_WITH_IPU + try { + // use user specified IPUs. 
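The IPU branch that continues just below mirrors the NPU branch above it: ask the platform layer for the user-selected device ids, and log a warning if none are found at runtime. Selection of this kind ultimately reduces to parsing a comma-separated id list (the removed GetSelectedDevices earlier in this patch split FLAGS_selected_gpus on ','); a self-contained sketch with hypothetical names:

    #include <sstream>
    #include <string>
    #include <vector>

    // Hypothetical stand-in for GetSelectedDevices()-style parsing:
    // "0,2,3" -> {0, 2, 3}; an empty flag means "use every visible device",
    // where device_count is supplied by the caller.
    std::vector<int> ParseSelectedDevices(const std::string& flag, int device_count) {
      std::vector<int> devices;
      if (flag.empty()) {
        for (int i = 0; i < device_count; ++i) devices.push_back(i);
        return devices;
      }
      std::stringstream ss(flag);
      std::string token;
      while (std::getline(ss, token, ',')) {
        if (!token.empty()) devices.push_back(std::stoi(token));
      }
      return devices;
    }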
+ devices = platform::GetSelectedIPUDevices(); + } catch (const std::exception &exp) { + LOG(WARNING) + << "Compiled with PADDLE_WITH_IPU, but no IPU found in runtime."; + } #endif InitDevices(devices); } @@ -185,6 +198,9 @@ void InitDevices(const std::vector devices) { #ifdef PADDLE_WITH_XPU places.emplace_back(platform::XPUPlace(devices[i])); #endif +#ifdef PADDLE_WITH_IPU + places.emplace_back(platform::IPUPlace(devices[i])); +#endif #ifdef PADDLE_WITH_ASCEND_CL places.emplace_back(platform::NPUPlace(devices[i])); #endif diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc index 965fe7b6db45c..dbca7d1549546 100644 --- a/paddle/fluid/platform/init_test.cc +++ b/paddle/fluid/platform/init_test.cc @@ -32,7 +32,7 @@ TEST(InitDevices, CUDA) { using paddle::platform::DeviceContextPool; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - int count = paddle::platform::GetCUDADeviceCount(); + int count = paddle::platform::GetGPUDeviceCount(); InitDevices(); DeviceContextPool& pool = DeviceContextPool::Instance(); ASSERT_EQ(pool.size(), 2U + static_cast(count)); diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 40d9bb99f44f5..f6d9c8f64fd35 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -14,7 +14,9 @@ limitations under the License. */ #include // NOLINT #include +#include #include +#include #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/enforce.h" @@ -30,6 +32,290 @@ PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false, namespace paddle { namespace platform { +struct DurationEvent { + public: + DurationEvent(const char *name, uint64_t start_ns, uint64_t end_ns, + EventRole role) + : name(name), start_ns(start_ns), end_ns(end_ns), role(role) {} + + DurationEvent(std::function &arena_allocator, + const std::string &name_str, uint64_t start_ns, uint64_t end_ns, + EventRole role, const std::string &attr_str) + : start_ns(start_ns), end_ns(end_ns), role(role) { + auto buf = static_cast(arena_allocator(name_str.length() + 1)); + strncpy(buf, name_str.c_str(), name_str.length() + 1); + name = buf; + buf = static_cast(arena_allocator(attr_str.length() + 1)); + strncpy(buf, attr_str.c_str(), attr_str.length() + 1); + attr = buf; + } + + DurationEvent(const std::function &arena_allocator, + const std::string &name_str, uint64_t start_ns, uint64_t end_ns, + EventRole role) + : start_ns(start_ns), end_ns(end_ns), role(role) { + auto buf = static_cast(arena_allocator(name_str.length() + 1)); + strncpy(buf, name_str.c_str(), name_str.length() + 1); + name = buf; + } + + const char *name = nullptr; // not owned, designed for performance + uint64_t start_ns = 0; + uint64_t end_ns = 0; + EventRole role = EventRole::kOrdinary; + const char *attr = nullptr; // not owned, designed for performance +}; + +template +struct ContainsStdString + : std::conditional_t< + std::is_same>>::value, + std::true_type, ContainsStdString> {}; + +template +struct ContainsStdString + : std::is_same>> {}; + +template +class EventContainer { + public: + EventContainer() { + event_blocks_ = cur_event_block_ = new EventBlock; + str_blocks_ = cur_str_block_ = new StringBlock; + } + ~EventContainer() { + Reduce(); + delete event_blocks_; + for (auto cur = str_blocks_; cur != nullptr;) { + auto next = cur->next; + delete cur; + cur = next; + } + } + DISABLE_COPY_AND_ASSIGN(EventContainer); + + public: + // Record an event + template + void Record(Args &&... 
args) { + DoRecord(ContainsStdString(), std::forward(args)...); + } + + // Get all events and clear the container + std::vector Reduce(); + + // Return a buffer to store the string attribute of Event. + // HostEventRecorder locates in the static data section. + // So it's safe to use arena to avoid fragmented allocations. + char *GetStrBufFromArena(size_t size) { return GetStringStorage(size); } + + private: + struct EventBlock { + union InitDeferedEvent { + InitDeferedEvent() {} + ~InitDeferedEvent() {} + + EventType event; + }; + + static constexpr size_t kBlockSize = 1 << 24; // 16 MB + static constexpr size_t kAvailSize = + kBlockSize - sizeof(size_t) - sizeof(nullptr); + static constexpr size_t kNumEvents = kAvailSize / sizeof(InitDeferedEvent); + static constexpr size_t kPadSize = + kAvailSize - kNumEvents * sizeof(InitDeferedEvent); + static constexpr size_t kMinimumEventsPerBlock = 1024; + static_assert( + kNumEvents >= kMinimumEventsPerBlock, + "EventType is too large for kBlockSize, make kBlockSize larger"); + + size_t offset = 0; + EventBlock *next = nullptr; + InitDeferedEvent events[kNumEvents]; + char padding[kPadSize]; + }; + static_assert(sizeof(EventBlock) == EventBlock::kBlockSize, + "sizeof EventBlock must equal to kBlockSize"); + + struct StringBlock { + static constexpr size_t kBlockSize = 1 << 22; // 4 MB + static constexpr size_t kAvailSize = + kBlockSize - sizeof(size_t) - sizeof(nullptr); + + size_t offset = 0; + StringBlock *next = nullptr; + char storage[kAvailSize]; + }; + static_assert(sizeof(StringBlock) == StringBlock::kBlockSize, + "sizeof StringBlock must equal to kBlockSize"); + + // Record an event with string arguments + template + void DoRecord(std::true_type, Args &&... args) { + auto *storage = GetEventStorage(); + std::function allocator = [this](size_t size) { + return GetStrBufFromArena(size); + }; + new (storage) EventType(allocator, std::forward(args)...); + } + + // Record an event without any string argument + template + void DoRecord(std::false_type, Args &&... 
args) { + auto *storage = GetEventStorage(); + new (storage) EventType(std::forward(args)...); + } + + EventType *GetEventStorage(); + + char *GetStringStorage(size_t sz); + + EventBlock *event_blocks_ = nullptr; + EventBlock *cur_event_block_ = nullptr; + StringBlock *str_blocks_ = nullptr; + StringBlock *cur_str_block_ = nullptr; +}; + +template +std::vector EventContainer::Reduce() { + std::vector all_events; + size_t event_cnt = 0; + for (auto cur = event_blocks_; cur != nullptr; cur = cur->next) { + event_cnt += cur->offset; + } + all_events.reserve(event_cnt); + for (auto cur = event_blocks_; cur != nullptr;) { + for (size_t i = 0; i < cur->offset; ++i) { + all_events.emplace_back(cur->events[i].event); + } + auto next = cur->next; + delete cur; + cur = next; + } + event_blocks_ = cur_event_block_ = new EventBlock; + return std::move(all_events); +} + +template +EventType *EventContainer::GetEventStorage() { + if (UNLIKELY(cur_event_block_->offset >= + EventBlock::kNumEvents)) { // another block + cur_event_block_->next = new EventBlock; + cur_event_block_ = cur_event_block_->next; + } + auto &obj = cur_event_block_->events[cur_event_block_->offset].event; + ++cur_event_block_->offset; + return &obj; +} + +template +char *EventContainer::GetStringStorage(size_t sz) { + if (UNLIKELY(cur_str_block_->offset + sz > + StringBlock::kAvailSize)) { // another block + cur_str_block_->next = new StringBlock; + cur_str_block_ = cur_str_block_->next; + } + char *storage = cur_str_block_->storage + cur_str_block_->offset; + cur_str_block_->offset += sz; + return storage; +} + +struct ThreadEventSection { + std::string thread_name; + uint64_t thread_id; + std::vector events; +}; + +class ThreadEventRecorder { + public: + ThreadEventRecorder(); + DISABLE_COPY_AND_ASSIGN(ThreadEventRecorder); + + public: + // Forward call to EventContainer::Record + template + void RecordEvent(Args &&... args) { + base_evt_cntr_.Record(std::forward(args)...); + } + + ThreadEventSection GatherEvents() { + ThreadEventSection thr_sec; + thr_sec.thread_name = thread_name_; + thr_sec.thread_id = thread_id_; + thr_sec.events = std::move(base_evt_cntr_.Reduce()); + return std::move(thr_sec); + } + + private: + uint64_t thread_id_; + std::string thread_name_; + EventContainer base_evt_cntr_; +}; + +struct HostEventSection { + std::string process_name; + uint64_t process_id; + std::vector thr_sections; +}; + +class HostEventRecorder { + public: + // singleton + static HostEventRecorder &GetInstance() { + static HostEventRecorder instance; + return instance; + } + + // If your string argument has a longer lifetime than the Event, + // use 'const char*'. e.g.: string literal, op name, etc. + // Do your best to avoid using 'std::string' as the argument type. + // It will cause deep-copy to harm performance. + template + void RecordEvent(Args &&... 
args) { + GetThreadLocalRecorder().RecordEvent(std::forward(args)...); + } + + // Poor performance, call it at the ending + HostEventSection GatherEvents(); + + void RegisterThreadRecorder(uint64_t tid, ThreadEventRecorder *recorder) { + const std::lock_guard guard(thread_recorders_lock_); + thread_recorders_[tid] = recorder; + } + + private: + HostEventRecorder() = default; + DISABLE_COPY_AND_ASSIGN(HostEventRecorder); + + ThreadEventRecorder &GetThreadLocalRecorder() { + static thread_local ThreadEventRecorder tls_recorder; + return tls_recorder; + } + + std::mutex thread_recorders_lock_; + std::unordered_map thread_recorders_; +}; + +static uint64_t GetThreadId() { + return std::hash{}(std::this_thread::get_id()); +} + +ThreadEventRecorder::ThreadEventRecorder() { + thread_id_ = GetThreadId(); + HostEventRecorder::GetInstance().RegisterThreadRecorder(thread_id_, this); +} + +HostEventSection HostEventRecorder::GatherEvents() { + HostEventSection host_sec; + host_sec.thr_sections.reserve(thread_recorders_.size()); + for (auto &kv : thread_recorders_) { + host_sec.thr_sections.emplace_back(std::move(kv.second->GatherEvents())); + } + return std::move(host_sec); +} + MemEvenRecorder MemEvenRecorder::recorder; Event::Event(EventType type, std::string name, uint32_t thread_id, @@ -57,8 +343,44 @@ double Event::CudaElapsedMs(const Event &e) const { #endif } +RecordEvent::RecordEvent(const char *name, const EventRole role) { +#ifndef _WIN32 +#ifdef PADDLE_WITH_CUDA + if (g_enable_nvprof_hook) { + dynload::nvtxRangePushA(name); + is_pushed_ = true; + } +#endif +#endif + if (UNLIKELY(g_enable_host_event_recorder_hook == false)) { + RecordEvent(name, role, "none"); + return; + } + shallow_copy_name_ = name; + role_ = role; + start_ns_ = PosixInNsec(); +} + +RecordEvent::RecordEvent(const std::string &name, const EventRole role) { +#ifndef _WIN32 +#ifdef PADDLE_WITH_CUDA + if (g_enable_nvprof_hook) { + dynload::nvtxRangePushA(name.c_str()); + is_pushed_ = true; + } +#endif +#endif + if (UNLIKELY(g_enable_host_event_recorder_hook == false)) { + RecordEvent(name, role, "none"); + return; + } + name_ = new std::string(name); + role_ = role; + start_ns_ = PosixInNsec(); +} + RecordEvent::RecordEvent(const std::string &name, const EventRole role, - const std::string attr) { + const std::string &attr) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook) { @@ -67,17 +389,26 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role, } #endif #endif + if (g_enable_host_event_recorder_hook) { + name_ = new std::string(name); + start_ns_ = PosixInNsec(); + attr_ = new std::string(attr); + return; + } + if (g_state == ProfilerState::kDisabled || name.empty()) return; // do some initialization + name_ = new std::string(name); start_ns_ = PosixInNsec(); role_ = role; + attr_ = new std::string(attr); is_enabled_ = true; // lock is not needed, the code below is thread-safe // Maybe need the same push/pop behavior. 
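The host-event machinery added above is designed so the per-event hot path never takes a lock: each thread lazily constructs a thread_local ThreadEventRecorder, which registers itself once with the HostEventRecorder singleton under a mutex; after that, RecordEvent only touches the thread's own arena-backed EventContainer, and the mutex is revisited only by the slow GatherEvents path. A distilled, self-contained sketch of that pattern (simplified names, not the real classes):

    #include <mutex>
    #include <thread>
    #include <unordered_map>
    #include <vector>

    struct Registry;

    // Per-thread recorder: the hot path appends without any locking.
    struct Recorder {
      explicit Recorder(Registry& reg);
      void Record(const char* name) { events.push_back(name); }
      std::vector<const char*> events;
    };

    // Global registry: locked only when a thread registers or when gathering.
    struct Registry {
      void Register(std::thread::id tid, Recorder* r) {
        std::lock_guard<std::mutex> guard(mu);
        recorders[tid] = r;
      }
      std::mutex mu;
      std::unordered_map<std::thread::id, Recorder*> recorders;
    };

    Recorder::Recorder(Registry& reg) { reg.Register(std::this_thread::get_id(), this); }

    Registry g_registry;

    void RecordHostEvent(const char* name) {
      static thread_local Recorder tls_recorder(g_registry);  // built on first use per thread
      tls_recorder.Record(name);
    }

The real implementation additionally places events into 16 MB blocks and copies string attributes into a 4 MB string arena, so recording an event avoids per-event heap allocations as well.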
Event *e = PushEvent(name, role, attr); SetCurAnnotation(e); - name_ = e->name(); + // name_ = e->name(); } RecordEvent::~RecordEvent() { @@ -88,15 +419,36 @@ RecordEvent::~RecordEvent() { } #endif #endif + uint64_t end_ns = PosixInNsec(); + if (LIKELY(g_enable_host_event_recorder_hook)) { + if (LIKELY(shallow_copy_name_ != nullptr)) { + HostEventRecorder::GetInstance().RecordEvent(shallow_copy_name_, + start_ns_, end_ns, role_); + } else if (name_ != nullptr) { + if (attr_ == nullptr) { + HostEventRecorder::GetInstance().RecordEvent(*name_, start_ns_, end_ns, + role_); + } else { + HostEventRecorder::GetInstance().RecordEvent(*name_, start_ns_, end_ns, + role_, *attr_); + } + } + delete name_; + delete attr_; + return; + } + if (g_state == ProfilerState::kDisabled || !is_enabled_) return; // lock is not needed, the code below is thread-safe DeviceTracer *tracer = GetDeviceTracer(); if (tracer) { - tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(), - BlockDepth(), g_thread_id); + tracer->AddCPURecords(CurAnnotationName(), start_ns_, end_ns, BlockDepth(), + g_thread_id); } ClearCurAnnotation(); - PopEvent(name_, role_); + PopEvent(*name_, role_); + delete name_; + delete attr_; } void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place, @@ -148,11 +500,11 @@ MemEvenRecorder::RecordMemEvent::~RecordMemEvent() { PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free); } -RecordRPCEvent::RecordRPCEvent(const std::string &name) { +/*RecordRPCEvent::RecordRPCEvent(const std::string &name) { if (FLAGS_enable_rpc_profiler) { event_.reset(new platform::RecordEvent(name)); } -} +}*/ RecordBlock::RecordBlock(int block_id) : is_enabled_(false), start_ns_(PosixInNsec()) { @@ -362,5 +714,20 @@ void NvprofEnableRecordEvent() { void NvprofDisableRecordEvent() { g_enable_nvprof_hook = false; } +void EnableHostEventRecorder() { g_enable_host_event_recorder_hook = true; } + +std::string PrintHostEvents() { + std::ostringstream oss; + auto host_evt_sec = HostEventRecorder::GetInstance().GatherEvents(); + for (const auto &thr_evt_sec : host_evt_sec.thr_sections) { + oss << thr_evt_sec.thread_id << std::endl; + for (const auto &evt : thr_evt_sec.events) { + oss << "{ " << evt.name << " | " << evt.start_ns << " | " << evt.end_ns + << " }" << std::endl; + } + } + return oss.str(); +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler.cu b/paddle/fluid/platform/profiler.cu index 02930627d41e3..5d1caffd45326 100644 --- a/paddle/fluid/platform/profiler.cu +++ b/paddle/fluid/platform/profiler.cu @@ -29,7 +29,7 @@ __global__ void DummyKernel(int *a) { a[0] = 0; } static void ForEachDevice(std::function func) { auto original_device = platform::GetCurrentDeviceId(); - int count = platform::GetCUDADeviceCount(); + int count = platform::GetGPUDeviceCount(); for (int i = 0; i < count; i++) { platform::SetDeviceId(i); func(i); @@ -43,13 +43,13 @@ void DummyKernelAndEvent() { ForEachDevice([](int d) { platform::SetDeviceId(d); hipStream_t stream; - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream)); Mark("_cuda_startup_"); int *ptr; - PADDLE_ENFORCE_CUDA_SUCCESS(hipMalloc(&ptr, sizeof(int))); + PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&ptr, sizeof(int))); hipLaunchKernelGGL(DummyKernel, dim3(1), dim3(1), 0, stream, ptr); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipFree(ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream)); + 
PADDLE_ENFORCE_GPU_SUCCESS(hipFree(ptr)); }); } #else @@ -57,13 +57,13 @@ void DummyKernelAndEvent() { ForEachDevice([](int d) { platform::SetDeviceId(d); cudaStream_t stream; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream)); Mark("_cuda_startup_"); int *ptr; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(&ptr, sizeof(int))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc(&ptr, sizeof(int))); DummyKernel<<<1, 1, 0, stream>>>(ptr); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaFree(ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaFree(ptr)); }); } #endif diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index fbae6165e313a..317991160b798 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -31,7 +31,7 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.pb.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif namespace paddle { @@ -128,31 +128,38 @@ struct MemEvenRecorder { }; struct RecordEvent { - RecordEvent(const std::string& name, - const EventRole role = EventRole::kOrdinary, - const std::string attr = "none"); + explicit RecordEvent(const std::string& name, + const EventRole role = EventRole::kOrdinary); + + explicit RecordEvent(const char* name, + const EventRole role = EventRole::kOrdinary); + + RecordEvent(const std::string& name, const EventRole role, + const std::string& attr); ~RecordEvent(); bool is_enabled_{false}; bool is_pushed_{false}; - uint64_t start_ns_; // Event name - std::string name_; + const std::string* name_{nullptr}; + const char* shallow_copy_name_{nullptr}; + uint64_t start_ns_; // Need to distinguish name by op type, block_id, program_id and perhaps // different kernel invocations within an op. 
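Given the constructor split declared here, hot call sites should prefer the const char* overload: a string literal outlives the event, so only the pointer is kept (shallow_copy_name_), while the std::string overload deep-copies the name, as the comment in profiler.cc warns. A usage sketch, assuming it is compiled inside the Paddle tree with the header above; the surrounding function and block_id are made up:

    #include <string>
    #include "paddle/fluid/platform/profiler.h"

    void TimedSection(int block_id) {
      {  // hot path: literal name, no copy
        paddle::platform::RecordEvent guard("elementwise_add_compute");
        // ... timed work ...
      }
      {  // dynamic name: deep-copied by the std::string overload
        std::string name = "block_" + std::to_string(block_id) + "/fetch";
        paddle::platform::RecordEvent guard(name, paddle::platform::EventRole::kOrdinary);
      }
    }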
- std::string full_name_; + // std::string full_name_; EventRole role_{EventRole::kOrdinary}; + const std::string* attr_{nullptr}; }; -class RecordRPCEvent { +/*class RecordRPCEvent { public: explicit RecordRPCEvent(const std::string& name); ~RecordRPCEvent() {} private: std::unique_ptr event_; -}; +};*/ struct RecordBlock { explicit RecordBlock(int block_id); @@ -242,5 +249,10 @@ int64_t ListenerId(); void NvprofEnableRecordEvent(); void NvprofDisableRecordEvent(); +void EnableHostEventRecorder(); + +// Defined for UT +std::string PrintHostEvents(); + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index a8438263cb97b..4277f7d4dc63e 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -47,6 +47,8 @@ static TracerOption g_tracer_option = TracerOption::kDefault; static ProfilerState g_state = ProfilerState::kDisabled; // To hook RecordEvent's events, use it to nvtx timeline static bool g_enable_nvprof_hook = false; +// To hook RecordEvent, use HostEventRecorder +static bool g_enable_host_event_recorder_hook = false; // The thread local event list only can be accessed by the specific thread // The thread index of each thread static thread_local int32_t g_thread_id; @@ -119,17 +121,17 @@ std::vector> GetMemEvents() { void SynchronizeAllDevice() { #ifdef PADDLE_WITH_CUDA - int count = GetCUDADeviceCount(); + int count = GetGPUDeviceCount(); for (int i = 0; i < count; i++) { SetDeviceId(i); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); } #endif #ifdef PADDLE_WITH_HIP - int count = GetCUDADeviceCount(); + int count = GetGPUDeviceCount(); for (int i = 0; i < count; i++) { SetDeviceId(i); - PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); } #endif } diff --git a/paddle/fluid/platform/stream/cuda_stream.cc b/paddle/fluid/platform/stream/cuda_stream.cc index 212d99f6a78ed..dafb61fe0aaf4 100644 --- a/paddle/fluid/platform/stream/cuda_stream.cc +++ b/paddle/fluid/platform/stream/cuda_stream.cc @@ -30,18 +30,18 @@ bool CUDAStream::Init(const Place& place, const Priority& priority, CUDADeviceGuard guard(BOOST_GET_CONST(CUDAPlace, place_).device); if (priority == Priority::kHigh) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreateWithPriority( + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreateWithPriority( &stream_, static_cast(flag), -1)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreateWithPriority( + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreateWithPriority( &stream_, static_cast(flag), -1)); #endif } else if (priority == Priority::kNormal) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreateWithPriority( + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreateWithPriority( &stream_, static_cast(flag), 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreateWithPriority( + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreateWithPriority( &stream_, static_cast(flag), 0)); #endif } @@ -58,9 +58,9 @@ void CUDAStream::Destroy() { WaitCallback(); if (stream_) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_)); #endif } stream_ = nullptr; @@ -89,7 +89,7 @@ void CUDAStream::Wait() const { #endif #endif // PADDLE_WITH_HIP - 
PADDLE_ENFORCE_CUDA_SUCCESS(e_sync); + PADDLE_ENFORCE_GPU_SUCCESS(e_sync); } CUDAStream* get_current_stream(int deviceId) { diff --git a/paddle/fluid/platform/stream/cuda_stream.h b/paddle/fluid/platform/stream/cuda_stream.h index 472d6bbab0c6c..36f31c46673b2 100644 --- a/paddle/fluid/platform/stream/cuda_stream.h +++ b/paddle/fluid/platform/stream/cuda_stream.h @@ -17,7 +17,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/stream_callback_manager.h" @@ -64,32 +64,32 @@ class CUDAStream final { #ifdef PADDLE_WITH_HIP void RecordEvent(hipEvent_t ev, Callback callback) const { callback(); - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(ev, stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_)); } #else void RecordEvent(cudaEvent_t ev, Callback callback) const { callback(); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_)); } #endif #ifdef PADDLE_WITH_HIP void RecordEvent(hipEvent_t ev) const { - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(ev, stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_)); } #else void RecordEvent(cudaEvent_t ev) const { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_)); } #endif #ifdef PADDLE_WITH_HIP void WaitEvent(hipEvent_t ev) const { - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(stream_, ev, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream_, ev, 0)); } #else void WaitEvent(cudaEvent_t ev) const { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream_, ev, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream_, ev, 0)); } #endif @@ -122,17 +122,11 @@ class CUDAStream final { } #endif - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(err); return false; } - void Synchronize() const { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_)); -#endif - } + void Synchronize() const { platform::GpuStreamSync(stream_); } const Place& GetPlace() const { return place_; } diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index 3f0c5ace900d1..28aa022fe2f13 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -13,9 +13,9 @@ // limitations under the License. 
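The Synchronize() change at the end of the cuda_stream.h hunk folds the HIP/CUDA #ifdef pair into a single platform::GpuStreamSync call. Judging from the GpuStreamSync deleted from gpu_info.cc earlier in this patch, the relocated helper is presumably the same two-way dispatch with the renamed macro; a sketch only, not copied from the new header:

    // Presumed shape of platform::GpuStreamSync after the move to device/gpu/.
    void GpuStreamSync(gpuStream_t stream) {
    #ifdef PADDLE_WITH_HIP
      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream));
    #else
      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
    #endif
    }

The same helper lets StreamCallbackManager::Wait() in the next hunk collapse its two backend-specific branches into one guarded call.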
#include "paddle/fluid/platform/stream_callback_manager.h" -#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/npu/npu_info.h" -#endif +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { @@ -59,15 +59,15 @@ void StreamCallbackManager::AddCallback( }); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); #endif #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaLaunchHostFunc(stream_, StreamCallbackFunc, func)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); #endif #endif @@ -81,11 +81,8 @@ void StreamCallbackManager::AddCallback( template void StreamCallbackManager::Wait() const { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_)); -#endif -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_)); +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) + platform::GpuStreamSync(stream_); #endif #ifdef PADDLE_WITH_ASCEND_CL NPUStreamSync(stream_); diff --git a/paddle/fluid/platform/test_limit_gpu_memory.cu b/paddle/fluid/platform/test_limit_gpu_memory.cu index 81b766182337f..684cb78073551 100644 --- a/paddle/fluid/platform/test_limit_gpu_memory.cu +++ b/paddle/fluid/platform/test_limit_gpu_memory.cu @@ -15,7 +15,7 @@ #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/platform/cuda_device_guard.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" DECLARE_uint64(gpu_memory_limit_mb); @@ -30,32 +30,24 @@ TEST(test_record_malloc, test_limit_gpu_memory) { size_t limit = FLAGS_gpu_memory_limit_mb << 20; { - ASSERT_TRUE(IsCudaMallocRecorded(DEVICE_ID)); - ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), 0UL); + ASSERT_TRUE(IsGpuMallocRecorded(DEVICE_ID)); + ASSERT_EQ(RecordedGpuMallocSize(DEVICE_ID), 0UL); } size_t avail, total; { size_t actual_avail, actual_total; - RecordedCudaMemGetInfo(&avail, &total, &actual_avail, &actual_total, - DEVICE_ID); + RecordedGpuMemGetInfo(&avail, &total, &actual_avail, &actual_total, + DEVICE_ID); ASSERT_EQ(total, limit); -#ifdef PADDLE_WITH_HIP - ASSERT_EQ(hipGetLastError(), gpuSuccess); -#else - ASSERT_EQ(cudaGetLastError(), gpuSuccess); -#endif + ASSERT_EQ(paddle::platform::GpuGetLastError(), gpuSuccess); } { CUDADeviceGuard guard(DEVICE_ID); GpuMemoryUsage(&avail, &total); ASSERT_EQ(total, limit); -#ifdef PADDLE_WITH_HIP - ASSERT_EQ(hipGetLastError(), gpuSuccess); -#else - ASSERT_EQ(cudaGetLastError(), gpuSuccess); -#endif + ASSERT_EQ(paddle::platform::GpuGetLastError(), gpuSuccess); } gpuError_t err = gpuSuccess; @@ -63,54 +55,41 @@ TEST(test_record_malloc, test_limit_gpu_memory) { void *p1 = nullptr; size_t size1 = limit / 4 * 3; { - err = platform::RecordedCudaMalloc(&p1, size1, DEVICE_ID); + err = platform::RecordedGpuMalloc(&p1, size1, DEVICE_ID); ASSERT_EQ(err, gpuSuccess); -#ifdef PADDLE_WITH_HIP - ASSERT_EQ(hipGetLastError(), gpuSuccess); -#else - ASSERT_EQ(cudaGetLastError(), gpuSuccess); -#endif + ASSERT_EQ(paddle::platform::GpuGetLastError(), gpuSuccess); ASSERT_NE(p1, nullptr); - ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), size1); + ASSERT_EQ(RecordedGpuMallocSize(DEVICE_ID), size1); } void *p2 = nullptr; size_t size2 = limit / 2; { - err = platform::RecordedCudaMalloc(&p2, 
size2, DEVICE_ID); -#ifdef PADDLE_WITH_HIP - ASSERT_EQ(err, hipErrorOutOfMemory); - ASSERT_EQ(hipGetLastError(), gpuSuccess); -#else - ASSERT_EQ(err, cudaErrorMemoryAllocation); - ASSERT_EQ(cudaGetLastError(), gpuSuccess); -#endif + err = platform::RecordedGpuMalloc(&p2, size2, DEVICE_ID); + ASSERT_EQ(err, gpuErrorOutOfMemory); + ASSERT_EQ(paddle::platform::GpuGetLastError(), gpuSuccess); ASSERT_EQ(p2, nullptr); - ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), size1); + ASSERT_EQ(RecordedGpuMallocSize(DEVICE_ID), size1); } { - platform::RecordedCudaFree(p1, size1, DEVICE_ID); - ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), 0UL); + platform::RecordedGpuFree(p1, size1, DEVICE_ID); + ASSERT_EQ(RecordedGpuMallocSize(DEVICE_ID), 0UL); } { - err = platform::RecordedCudaMalloc(&p2, size2, DEVICE_ID); + err = platform::RecordedGpuMalloc(&p2, size2, DEVICE_ID); ASSERT_EQ(err, gpuSuccess); -#ifdef PADDLE_WITH_HIP - ASSERT_EQ(hipGetLastError(), hipSuccess); -#else - ASSERT_EQ(cudaGetLastError(), cudaSuccess); -#endif + ASSERT_EQ(paddle::platform::GpuGetLastError(), gpuSuccess); ASSERT_NE(p2, nullptr); - ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), size2); + ASSERT_EQ(RecordedGpuMallocSize(DEVICE_ID), size2); } { - platform::RecordedCudaFree(p2, size2, DEVICE_ID); - ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), 0UL); + platform::RecordedGpuFree(p2, size2, DEVICE_ID); + ASSERT_EQ(RecordedGpuMallocSize(DEVICE_ID), 0UL); } } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 588caed5a452e..e6d0a096b2d80 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -16,6 +16,9 @@ endif() if (WITH_GPU) set(PYBIND_DEPS ${PYBIND_DEPS} cuda_profiler) endif() +if (WITH_IPU) + set(PYBIND_DEPS ${PYBIND_DEPS} ipu_info) +endif() if (WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper) @@ -25,6 +28,13 @@ endif() if (WITH_XPU_BKCL) set(PYBIND_DEPS ${PYBIND_DEPS} reducer) set(PYBIND_DEPS ${PYBIND_DEPS} bkcl_context) + set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) +endif() + +if (WITH_ASCEND_CL) + set(PYBIND_DEPS ${PYBIND_DEPS} reducer) + set(PYBIND_DEPS ${PYBIND_DEPS} hccl_context) + set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) endif() if(NOT WIN32) @@ -32,9 +42,7 @@ if(NOT WIN32) set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator) if (WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context) - endif() - if (WITH_ASCEND_CL) - set(PYBIND_DEPS ${PYBIND_DEPS} hccl_context) + set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) endif() endif(NOT WIN32) @@ -126,17 +134,25 @@ if(WITH_PYTHON) add_executable(op_function_generator op_function_generator.cc) target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS}) + add_executable(eager_op_function_generator eager_op_function_generator.cc) + target_link_libraries(eager_op_function_generator ${OP_FUNCTION_GENERETOR_DEPS}) get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(op_function_generator ${os_dependency_modules}) + target_link_libraries(eager_op_function_generator ${os_dependency_modules}) if(WITH_ROCM) target_link_libraries(op_function_generator ${ROCM_HIPRTC_LIB}) + target_link_libraries(eager_op_function_generator ${ROCM_HIPRTC_LIB}) endif() set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function_impl.h) set(tmp_impl_file ${impl_file}.tmp) + set(eager_impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/eager_op_function_impl.h) + set(tmp_eager_impl_file ${eager_impl_file}.tmp) set(OP_IMPL_DEPS 
op_function_generator) + set(EAGER_OP_IMPL_DEPS eager_op_function_generator) + if(WIN32) if("${CMAKE_GENERATOR}" STREQUAL "Ninja") set(op_impl_path "${CMAKE_CURRENT_BINARY_DIR}") @@ -160,22 +176,41 @@ if(WITH_PYTHON) ")\n" "exit /b 0") + file(WRITE ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/eager_op_function_generator_retry.bat "" + "set build_times=1\n" + ":retry\n" + "ECHO eager_op_function_generator run %build_times% time\n" + "taskkill /f /im eager_op_function_generator.exe 2>NUL\n" + "${op_impl_path}/eager_op_function_generator.exe ${tmp_eager_impl_file}\n" + "if %ERRORLEVEL% NEQ 0 (\n" + " set /a build_times=%build_times%+1\n" + " if %build_times% GEQ 10 (\n" + " exit /b 1\n" + " ) else (\n" + " goto :retry\n" + " )\n" + ")\n" + "exit /b 0") + if(${CBLAS_PROVIDER} STREQUAL MKLML) ADD_CUSTOM_COMMAND(OUTPUT ${op_impl_path}/libiomp5md.dll COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${op_impl_path} DEPENDS mklml) list(APPEND OP_IMPL_DEPS ${op_impl_path}/libiomp5md.dll) + list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/libiomp5md.dll) else(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS) ADD_CUSTOM_COMMAND(OUTPUT ${op_impl_path}/openblas.dll COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_SHARED_LIB} ${op_impl_path} DEPENDS extern_openblas) list(APPEND OP_IMPL_DEPS ${op_impl_path}/openblas.dll) + list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/openblas.dll) endif() if(WITH_MKLDNN) ADD_CUSTOM_COMMAND(OUTPUT ${op_impl_path}/mkldnn.dll COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${op_impl_path} DEPENDS mkldnn) list(APPEND OP_IMPL_DEPS ${op_impl_path}/mkldnn.dll) + list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/mkldnn.dll) endif() add_custom_command(OUTPUT ${impl_file} @@ -183,6 +218,13 @@ if(WITH_PYTHON) COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file} COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" DEPENDS ${OP_IMPL_DEPS}) + if(NOT ON_INFER) + add_custom_command(OUTPUT ${eager_impl_file} + COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/eager_op_function_generator_retry.bat + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_eager_impl_file} ${eager_impl_file} + COMMENT "copy_if_different ${tmp_eager_impl_file} to ${eager_impl_file}" + DEPENDS ${EAGER_OP_IMPL_DEPS}) + endif() else(WIN32) # If there are no *.so in /usr/lib or LD_LIBRARY_PATH, # copy these *.so to current directory and append current directory to @@ -193,12 +235,14 @@ if(WITH_PYTHON) COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${CMAKE_CURRENT_BINARY_DIR} DEPENDS mklml) list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libiomp5.so) + list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libiomp5.so) endif() if(WITH_MKLDNN) ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0 COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${CMAKE_CURRENT_BINARY_DIR} DEPENDS mkldnn) list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0) + list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0) endif() add_custom_command(OUTPUT ${impl_file} COMMAND ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:." @@ -208,10 +252,35 @@ if(WITH_PYTHON) COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" DEPENDS ${OP_IMPL_DEPS} VERBATIM) + if(NOT ON_INFER) + add_custom_command(OUTPUT ${eager_impl_file} + COMMAND ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:." 
+ "${CMAKE_CURRENT_BINARY_DIR}/eager_op_function_generator" + "${tmp_eager_impl_file}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_eager_impl_file} ${eager_impl_file} + COMMENT "copy_if_different ${tmp_eager_impl_file} to ${eager_impl_file}" + DEPENDS ${EAGER_OP_IMPL_DEPS} + VERBATIM) + endif() endif(WIN32) add_custom_target(op_function_generator_cmd ALL DEPENDS ${impl_file}) + if(NOT ON_INFER) + add_custom_target(eager_op_function_generator_cmd ALL DEPENDS ${eager_impl_file}) + endif() list(APPEND PYBIND_DEPS interpretercore standalone_executor) + cc_library(op_function_common SRCS op_function_common.cc DEPS ${PYBIND_DEPS}) + list(APPEND PYBIND_DEPS op_function_common) + + if(NOT ON_INFER) + cc_library(paddle_eager + SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc eager_utils.cc + DEPS eager_api autograd_meta backward grad_node_info pten op_function_common dygraph_function dygraph_node math_cpu linalg_cpu creation_cpu utils_cpu manipulation_cpu accumulation_node global_utils utils python) + add_dependencies(paddle_eager eager_codegen) + add_dependencies(paddle_eager eager_op_function_generator_cmd) + list(APPEND PYBIND_DEPS paddle_eager) + endif() + cc_library(paddle_pybind SHARED SRCS ${PYBIND_SRCS} DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index 115be1b8ba8b4..6fc9b2a494f61 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -16,6 +16,7 @@ #include #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/place.h" @@ -32,8 +33,8 @@ void BindFleetExecutor(py::module* m) { py::class_(*m, "FleetExecutor") .def(py::init()) .def("init", &FleetExecutor::Init) - .def("run", &FleetExecutor::Run) - .def("release", &FleetExecutor::Release); + .def("run", &FleetExecutor::Run, + py::call_guard()); py::class_(*m, "TaskNode") .def(py::init()) diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index 311fb872ac103..21571e17a2b48 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -61,9 +61,9 @@ void BindCudaStream(py::module *m_ptr) { int curr_device_id = paddle::platform::GetCurrentDeviceId(); paddle::platform::SetDeviceId(device_id); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); #endif paddle::platform::SetDeviceId(curr_device_id); #else @@ -264,7 +264,7 @@ void BindCudaStream(py::module *m_ptr) { auto stream_flag = paddle::platform::stream::StreamFlag::kStreamNonBlocking; - int device_count = platform::GetCUDADeviceCount(); + int device_count = platform::GetGPUDeviceCount(); if (device < 0) { device = platform::GetCurrentDeviceId(); } diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc new file mode 100644 index 0000000000000..0714080382205 --- /dev/null +++ b/paddle/fluid/pybind/eager.cc @@ -0,0 +1,135 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +// disable numpy compile error +#include + +#include +#include + +#include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/pten/common/data_type.h" +#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/core.h" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#include "paddle/fluid/pybind/eager_op_function_impl.h" + +namespace paddle { +namespace pybind { + +namespace py = ::pybind11; + +PyTypeObject* p_eager_tensor_type; + +PyObject* eagertensor_new(PyTypeObject* type, PyObject* args, + PyObject* kwargs) { + PyObject* obj = type->tp_alloc(type, 0); + if (obj) { + auto v = reinterpret_cast(obj); + new (&(v->eagertensor)) egr::EagerTensor(); + } + return obj; +} + +static void eagertensor_dealloc(EagerTensorObject* self) { + self->eagertensor.~EagerTensor(); + Py_TYPE(self)->tp_free(reinterpret_cast(self)); +} + +extern struct PyGetSetDef variable_properties[]; + +extern PyMethodDef variable_methods[]; + +PyTypeObject eager_tensor_type = { + PyVarObject_HEAD_INIT(NULL, 0) "core_avx.eager.EagerTensor", /* tp_name */ + sizeof(EagerTensorObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)eagertensor_dealloc, /* tp_dealloc */ + 0, /* tp_vectorcall_offset */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_reserved */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | + Py_TPFLAGS_HEAPTYPE, /* tp_flags */ + 0, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + variable_methods, /* tp_methods */ + 0, /* tp_members */ + variable_properties, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + eagertensor_new, /* tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ + 0, /* tp_del */ + 0, /* tp_version_tag */ + 0 /* tp_finalize */ +}; + +void BindEager(pybind11::module* module) { + auto m = module->def_submodule("eager"); + + p_eager_tensor_type = &eager_tensor_type; + if (PyType_Ready(&eager_tensor_type) < 0) { + PADDLE_THROW(platform::errors::Fatal( + "Init Paddle erroe in BindEager(PyType_Ready).")); + return; + } + + Py_INCREF(&eager_tensor_type); + if 
(PyModule_AddObject(m.ptr(), "EagerTensor", + reinterpret_cast(&eager_tensor_type)) < 0) { + Py_DECREF(&eager_tensor_type); + Py_DECREF(m.ptr()); + PADDLE_THROW(platform::errors::Fatal( + "Init Paddle erroe in BindEager(PyModule_AddObject).")); + return; + } + + BindFunctions(m.ptr()); + BindEagerOpFunctions(&m); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/pten/api/include/manipulation.h b/paddle/fluid/pybind/eager.h similarity index 66% rename from paddle/pten/api/include/manipulation.h rename to paddle/fluid/pybind/eager.h index 579fa5cdf945a..c1a869d9b89fa 100644 --- a/paddle/pten/api/include/manipulation.h +++ b/paddle/fluid/pybind/eager.h @@ -1,28 +1,24 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once -#include "paddle/pten/api/include/tensor.h" +#include +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" namespace paddle { -namespace experimental { - -PD_DLL_DECL Tensor flatten(const Tensor& x, int start_axis, int stop_axis); +namespace pybind { -PD_DLL_DECL Tensor cast(const Tensor& x, DataType out_dtype); +void BindEager(pybind11::module* m); +void BindFunctions(PyObject* module); -PD_DLL_DECL Tensor reshape(const Tensor& x, const std::vector& shape); -} // namespace experimental +} // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc new file mode 100644 index 0000000000000..8c0f9ddf19f12 --- /dev/null +++ b/paddle/fluid/pybind/eager_functions.cc @@ -0,0 +1,223 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +// disable numpy compile error +#include + +#include +#include + +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" + +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/backward.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/api/lib/utils/storage.h" +#include "paddle/pten/api/lib/utils/tensor_utils.h" +#include "paddle/pten/common/data_type.h" +#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/core.h" + +namespace paddle { +namespace pybind { + +namespace py = ::pybind11; + +extern PyTypeObject* p_eager_tensor_type; + +size_t PyArray_Size_(PyObject* numpy_data) { + size_t res = 1; + auto dims = pybind11::detail::array_proxy(numpy_data)->dimensions; + auto nd = pybind11::detail::array_proxy(numpy_data)->nd; + while (nd--) { + res *= (*dims++); + } + return res; +} + +class EagerNumpyAllocation : public paddle::memory::allocation::Allocation { + public: + explicit EagerNumpyAllocation(PyObject* numpy_data, pten::DataType dtype) + : Allocation( + static_cast(pybind11::detail::array_proxy(numpy_data)->data), + pten::DataTypeSize(dtype) * PyArray_Size_(numpy_data), + paddle::platform::CPUPlace()), + arr_(numpy_data) { + PADDLE_ENFORCE_NOT_NULL(arr_, platform::errors::InvalidArgument( + "The underlying PyObject pointer of " + "numpy array cannot be nullptr")); + PADDLE_ENFORCE_NE( + arr_, Py_None, + platform::errors::PreconditionNotMet( + "The underlying PyObject pointer of numpy array cannot be None")); + Py_INCREF(arr_); + } + ~EagerNumpyAllocation() override { + py::gil_scoped_acquire gil; + Py_DECREF(arr_); + } + + private: + PyObject* arr_; +}; + +static PyObject* eager_api_set_expected_place(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 0), 0); + egr::Controller::Instance().SetExpectedPlace(place); + + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_api_scale(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + // TODO(jiabin): Sync Tensor and Variable here when we support + egr::EagerTensor ret = + egr::scale(reinterpret_cast(PyTuple_GET_ITEM(args, 0)) + ->eagertensor, + CastPyArg2AttrFloat(PyTuple_GET_ITEM(args, 1), 1), + CastPyArg2AttrFloat(PyTuple_GET_ITEM(args, 2), 2), + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3), + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4)); + return ToPyObject(ret); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_api_numpy_to_tensor(PyObject* numpy_data, + pten::DataType dtype, + const paddle::platform::Place& place, + bool stop_gradient) { + std::vector vec_dims; + auto numpy_shape = pybind11::detail::array_proxy(numpy_data)->dimensions; + int rank = pybind11::detail::array_proxy(numpy_data)->nd; + for (int i = 0; i < rank; i++) { + vec_dims.push_back(static_cast(numpy_shape[i])); + } + paddle::framework::DDim dims = paddle::framework::make_ddim(vec_dims); + + // TODO(jiabin): Support GPU later + auto meta = pten::DenseTensorMeta(dtype, dims); + 
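+  // The numpy buffer is wrapped rather than copied: EagerNumpyAllocation
+  // Py_INCREFs the array in its constructor so it stays alive as long as the
+  // DenseTensor's storage does, and Py_DECREFs it (re-acquiring the GIL) when
+  // the allocation is destroyed.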
auto holder = std::make_shared(numpy_data, dtype); + auto shared_storage = + pten::make_intrusive(holder, 0); + std::shared_ptr densetensor( + new pten::DenseTensor(std::move(shared_storage), std::move(meta))); + + PyObject* obj = p_eager_tensor_type->tp_alloc(p_eager_tensor_type, 0); + if (obj) { + auto v = reinterpret_cast(obj); + new (&(v->eagertensor)) egr::EagerTensor(); + v->eagertensor.set_impl(densetensor); + v->eagertensor.set_name(egr::Controller::Instance().GenerateUniqueName()); + auto meta = egr::EagerUtils::autograd_meta(&(v->eagertensor)); + meta->SetStopGradient(stop_gradient); + + // Created tensor will be leaf tensor + // So we append AccumulationNode to it. + auto accumulation_node = std::make_shared(); + meta->SetGradNode(accumulation_node); + + // TODO(jiabin): Shall we increase ref cnt here to make python ref cnt num + // correctly? + } else { + PADDLE_THROW(platform::errors::Fatal( + "tp_alloc return null, can not new a PyObject.")); + } + + return obj; +} + +static PyObject* eager_api_to_tensor(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + // TODO(jiabin): Support Kwargs here + PyObject* data = PyTuple_GET_ITEM(args, 0); + auto str_dtype = CastPyArg2AttrString(PyTuple_GET_ITEM(args, 1), 1); + pten::DataType dtype = pten::String2DataType(str_dtype); + auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 2), 2); + bool stop_gradient = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3); + // TODO(jiabin): Support this when python given name + // auto str_name = CastPyArg2AttrString(PyTuple_GET_ITEM(args, 4), 4); + + if (pybind11::detail::npy_api::get().PyArray_Check_(data)) { + return eager_api_numpy_to_tensor(data, dtype, place, stop_gradient); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Eater to_tensor only support numpy to tensor.")); + Py_INCREF(Py_None); + return Py_None; + } + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_api_retain_grad_for_tensor(PyObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + egr::egr_utils_api::RetainGradForTensor( + CastPyArg2EagerTensor(PyTuple_GET_ITEM(args, 0), 0)); + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_api_run_backward(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto tensors = CastPyArg2VectorOfEagerTensor(PyTuple_GET_ITEM(args, 0), 0); + auto grad_tensors = + CastPyArg2VectorOfEagerTensor(PyTuple_GET_ITEM(args, 1), 1); + RunBackward(tensors, grad_tensors, + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2)); + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +PyMethodDef variable_functions[] = { + {"to_tensor", (PyCFunction)(void (*)(void))eager_api_to_tensor, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"scale", (PyCFunction)(void (*)(void))eager_api_scale, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_set_expected_place", + (PyCFunction)(void (*)(void))eager_api_set_expected_place, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"retain_grad_for_tensor", + (PyCFunction)(void (*)(void))eager_api_retain_grad_for_tensor, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"run_backward", (PyCFunction)(void (*)(void))eager_api_run_backward, + METH_VARARGS | METH_KEYWORDS, NULL}, + {NULL, NULL, 0, NULL}}; + +void BindFunctions(PyObject* module) { + if (PyModule_AddFunctions(module, variable_functions) < 0) { + PADDLE_THROW(platform::errors::Fatal( + "Init Paddle erroe in BindFunctions(PyModule_AddFunctions).")); + return; + } +} + +} // namespace pybind +} // 
namespace paddle diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc new file mode 100644 index 0000000000000..75fd8c7fabe63 --- /dev/null +++ b/paddle/fluid/pybind/eager_method.cc @@ -0,0 +1,113 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +// disable numpy compile error +#include + +#include +#include + +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" + +#include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/pten/common/data_type.h" +#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/core.h" +namespace paddle { +namespace pybind { + +extern PyTypeObject* pEagerTensorType; + +static PyObject* eager_tensor_method_numpy(EagerTensorObject* self, + PyObject* args, PyObject* kwargs) { + EAGER_TRY + self->eagertensor.SyncToTensor(); + if (!self->eagertensor.initialized()) { + Py_INCREF(Py_None); + return Py_None; + } + auto tensor_dims = self->eagertensor.shape(); + auto numpy_dtype = TensorDtype2NumpyDtype(self->eagertensor.type()); + auto sizeof_dtype = pten::DataTypeSize(self->eagertensor.type()); + Py_intptr_t py_dims[paddle::framework::DDim::kMaxRank]; + Py_intptr_t py_strides[paddle::framework::DDim::kMaxRank]; + size_t numel = 1; + for (int i = tensor_dims.size() - 1; i >= 0; --i) { + py_dims[i] = static_cast(tensor_dims[i]); + py_strides[i] = sizeof_dtype * numel; + numel *= py_dims[i]; + } + auto& api = pybind11::detail::npy_api::get(); + PyObject* array = api.PyArray_NewFromDescr_( + api.PyArray_Type_, api.PyArray_DescrFromType_(numpy_dtype), + tensor_dims.size(), py_dims, py_strides, nullptr, + pybind11::detail::npy_api::NPY_ARRAY_ALIGNED_ | + pybind11::detail::npy_api::NPY_ARRAY_WRITEABLE_, + nullptr); + + if (self->eagertensor.is_cpu()) { + auto dense_tensor = + std::dynamic_pointer_cast(self->eagertensor.impl()); + platform::CPUPlace place; + // deep copy + paddle::memory::Copy(place, reinterpret_cast( + pybind11::detail::array_proxy(array)->data), + place, dense_tensor->data(), sizeof_dtype * numel); +#if defined(PADDLE_WITH_CUDA) + } else if (self->eagertensor.is_cuda()) { + auto dense_tensor = + std::dynamic_pointer_cast(self->eagertensor.impl()); + + paddle::platform::GpuMemcpySync( + pybind11::detail::array_proxy(array)->data, dense_tensor->data(), + pten::DataTypeSize(dense_tensor->dtype()) * dense_tensor->numel(), + cudaMemcpyDeviceToHost); +#endif + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Tensor.numpy() only support cpu tensor.")); + Py_INCREF(Py_None); + return Py_None; + } + + return array; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* 
eager_tensor_method_is_initialized(EagerTensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + if (self->eagertensor.Var().IsInitialized()) { + self->eagertensor.SyncToTensor(); + } + return ToPyObject(self->eagertensor.initialized()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +PyMethodDef variable_methods[] = { + {"numpy", (PyCFunction)(void (*)(void))eager_tensor_method_numpy, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_is_initialized", + (PyCFunction)(void (*)(void))eager_tensor_method_is_initialized, + METH_VARARGS | METH_KEYWORDS, NULL}, + {NULL, NULL, 0, NULL}}; + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc new file mode 100644 index 0000000000000..46d0bdcb46de7 --- /dev/null +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -0,0 +1,397 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#ifndef _WIN32 +#include +#endif + +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/pybind/pybind.h" +#include "paddle/fluid/string/string_helper.h" +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/framework/fleet/ascend_wrapper.h" +#endif +#include "paddle/fluid/pybind/op_function_generator.h" + +std::set gen_list = {"elementwise_add", "reduce_sum", "matmul_v2", + "sigmoid"}; + +// clang-format off +const char* OUT_INITIALIZER_TEMPLATE = + R"({"%s", {std::shared_ptr(new imperative::VarBase("auto_"+std::to_string(VarBaseUniqueNameID++)+"_"))}})"; +const char* OUT_DUPLICABLE_INITIALIZER_TEMPLATE = R"({"%s", ConstructDuplicableOutput(%s)})"; + +const char* INPUT_INITIALIZER_TEMPLATE = R"({"%s", {%s}})"; +const char* INPUT_LIST_INITIALIZER_TEMPLATE = R"({"%s", %s})"; + +const char* INPUT_INITIALIZER_TEMPLATE_WITH_NULL = R"( + if (%s != nullptr) { + ins["%s"] = {%s}; + } +)"; + +const char* INPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST = R"( + if (%s.size() != 0) { + ins["%s"] = %s; + } +)"; + +const char* OUTPUT_INITIALIZER_TEMPLATE_WITH_NULL = R"( + outs["%s"] = {%s}; +)"; + +const char* OUTPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST = R"( + outs["%s"] = %s; +)"; +// if inputs is list, no need {} +const char* ARG_OUT_NUM = R"(%sNum)"; +const char* ARG_OUT_NUM_TYPE = R"(size_t )"; + +const char* IN_VAR_TYPE = R"(py::handle)"; +const char* IN_VAR_LIST_TYPE = R"(py::handle)"; + +const char* OUT_VAR_TYPE = R"(std::shared_ptr)"; +const char* OUT_VAR_LIST_TYPE = R"(std::vector>)"; + +const char* CAST_VAR_TEMPLATE = R"( + auto %s = GetEagerTensorFromArgs("%s", "%s", args, %d, %s);)"; + +const char* CAST_VAR_LIST_TEMPLATE = R"( + auto %s = GetEagerTensorListFromArgs("%s", "%s", args, %d, %s);)"; + +const char* CAST_SIZE_T_TEMPLATE = R"( + auto %s = GetUnsignedLongFromArgs("%s", "%s", 
args, %d, %s);)"; + +const char* ARG_TEMPLATE = R"(const %s& %s)"; + +const char* RETURN_TUPLE_TYPE = R"(std::tuple<%s>)"; +const char* RETURN_TUPLE_TEMPLATE = R"(std::make_tuple(%s))"; +const char* RETURN_LIST_TEMPLATE = R"(outs["%s"])"; +const char* RETURN_TEMPLATE = R"(outs["%s"][0])"; + +const char* FUNCTION_ARGS = R"(%s, const py::args& args)"; +const char* FUNCTION_ARGS_NO_INPUT = R"(const py::args& args)"; + +const char* HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT = R"( + if (ins.count("%s") && outs.count("%s")) { + HandleViewBetweenInputAndOutput(ins["%s"][0], outs["%s"][0]); + })"; + +const char* OP_FUNCTION_TEMPLATE = +R"( +static PyObject * %s(PyObject *self, PyObject *args, PyObject *kwargs) +{ + PyThreadState *tstate = nullptr; + try + { + %s + framework::AttributeMap attrs; + ConstructAttrMapFromPyArgs("%s", args, %d, PyTuple_GET_SIZE(args) , attrs); + tstate = PyEval_SaveThread(); + %s + PyEval_RestoreThread(tstate); + tstate = nullptr; + %s + } + catch(...) { + if (tstate) { + PyEval_RestoreThread(tstate); + } + ThrowExceptionToPython(std::current_exception()); + return nullptr; + } +})"; + +const char* PYBIND_ITEM_TEMPLATE = R"( {"%s", (PyCFunction)(void(*)(void))%s, METH_VARARGS | METH_KEYWORDS, "C++ interface function for %s in dygraph."},)"; + +// clang-format on +static inline bool FindInsMap(const std::string& op_type, + const std::string& in_name) { + return op_ins_map[op_type].count(in_name); +} + +static inline bool FindOutsMap(const std::string& op_type, + const std::string& out_name) { + return op_outs_map[op_type].count(out_name); +} + +static inline bool FindPassingOutsMap(const std::string& op_type, + const std::string& out_name) { + return op_passing_outs_map[op_type].count(out_name); +} + +static inline bool FindViewOpMap(const std::string& op_type) { + return view_op_map.count(op_type); +} + +static inline std::string TempName(const std::string& name) { + return name + '_'; +} + +std::string GenerateOpFunctionsBody( + const paddle::framework::proto::OpProto* op_proto, std::string func_name, + bool use_inplace_strategy = false, + std::map inplace_map = {}) { + auto& op_type = op_proto->type(); + std::string input_args = ""; + std::string call_api_str = "auto out = " + op_type + "_dygraph_function("; + std::string ins_initializer_with_null = ""; + std::string py_arg = ""; + int arg_idx = 0; + int input_args_num = 0; + std::string ins_cast_str = ""; + std::string view_strategy_str = ""; + for (auto& input : op_proto->inputs()) { + auto& in_name = input.name(); + // skip those dispensable inputs, like ResidualData in conv2d + if (input.dispensable() && !FindInsMap(op_type, in_name)) { + continue; + } + const auto in_type = input.duplicable() ? IN_VAR_LIST_TYPE : IN_VAR_TYPE; + auto input_arg = + paddle::string::Sprintf(ARG_TEMPLATE, in_type, TempName(in_name)); + input_args += input_arg; + input_args += ","; + input_args_num++; + const auto in_cast_type = + input.duplicable() ? CAST_VAR_LIST_TEMPLATE : CAST_VAR_TEMPLATE; + auto dispensable = input.dispensable() ? "true" : "false"; + ins_cast_str += paddle::string::Sprintf(in_cast_type, in_name, op_type, + in_name, arg_idx++, dispensable); + + if (input.dispensable()) { + const auto in_template = input.duplicable() + ? 
INPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST + : INPUT_INITIALIZER_TEMPLATE_WITH_NULL; + ins_initializer_with_null += + paddle::string::Sprintf(in_template, in_name, in_name, in_name); + } else { + call_api_str += in_name + ", "; + } + } + + if (!input_args.empty() && input_args.back() == ',') { + input_args.pop_back(); + } + + // Generate outs initializer + std::string outs_initializer = "{"; + std::string outs_initializer_with_null = ""; + std::string return_str = ""; + + int outs_num = 0; + for (auto& output : op_proto->outputs()) { + auto& out_name = output.name(); + + // skip those dispensable oututs + if (output.dispensable() && !FindOutsMap(op_type, out_name)) { + continue; + } + const auto out_type = + output.duplicable() ? OUT_VAR_LIST_TYPE : OUT_VAR_TYPE; + + if (FindPassingOutsMap(op_type, out_name)) { + if (input_args != "") { + input_args += ","; + } + input_args += out_type; + input_args += out_name; + input_args_num++; + + if (output.dispensable()) { + const auto out_template = + output.duplicable() ? OUTPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST + : OUTPUT_INITIALIZER_TEMPLATE_WITH_NULL; + outs_initializer_with_null += + paddle::string::Sprintf(out_template, out_name, out_name); + } else { + const auto out_template = output.duplicable() + ? INPUT_LIST_INITIALIZER_TEMPLATE + : INPUT_INITIALIZER_TEMPLATE; + outs_initializer += + paddle::string::Sprintf(out_template, out_name, out_name); + outs_initializer += ","; + } + + const auto in_cast_type = + output.duplicable() ? CAST_VAR_LIST_TEMPLATE : CAST_VAR_TEMPLATE; + auto dispensable = output.dispensable() ? "true" : "false"; + ins_cast_str += paddle::string::Sprintf(in_cast_type, out_name, op_type, + out_name, arg_idx++, dispensable); + } else { + // There are few Operators that have duplicable output, like `Out` in + // split op. We need to specify the number of variables for the + // duplicable output, as the argument OutNum; + if (output.duplicable()) { + if (input_args != "") { + input_args += ","; + } + auto out_num_str = paddle::string::Sprintf(ARG_OUT_NUM, out_name); + input_args += ARG_OUT_NUM_TYPE; + input_args += out_num_str; + input_args_num++; + outs_initializer += paddle::string::Sprintf( + OUT_DUPLICABLE_INITIALIZER_TEMPLATE, out_name, out_num_str); + + auto dispensable = output.dispensable() ? 
"true" : "false"; + ins_cast_str += + paddle::string::Sprintf(CAST_SIZE_T_TEMPLATE, out_num_str, op_type, + out_num_str, arg_idx++, dispensable); + call_api_str += out_num_str + ", "; + } else { + outs_initializer += + paddle::string::Sprintf(OUT_INITIALIZER_TEMPLATE, out_name); + } + outs_initializer += ","; + } + + // return_str += paddle::string::Sprintf(return_template, out_name); + // return_str += ","; + outs_num += 1; + } + call_api_str += "attrs);"; + if (outs_initializer.back() == ',') { + outs_initializer.pop_back(); + // return_str.pop_back(); + } + outs_initializer += "}"; + if (FindViewOpMap(op_type)) { + std::string viwe_input_name = view_op_map[op_type].first; + std::string viwe_output_name = view_op_map[op_type].second; + view_strategy_str += paddle::string::Sprintf( + HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT, viwe_input_name, viwe_output_name, + viwe_input_name, viwe_output_name); + } + if (outs_num == 0) { + return_str = "Py_INCREF(Py_None);\n return Py_None;"; + } else { + return_str = "return ToPyObject(out);"; + } + std::string function_args = ""; + if (input_args == "") { + function_args = FUNCTION_ARGS_NO_INPUT; + } else { + function_args = paddle::string::Sprintf(FUNCTION_ARGS, input_args); + } + + // generate op funtcion body + auto op_function_str = paddle::string::Sprintf( + OP_FUNCTION_TEMPLATE, func_name, ins_cast_str, op_type, input_args_num, + call_api_str, return_str); + + return op_function_str; +} + +static std::tuple, std::vector> +GenerateOpFunctions() { + auto& op_info_map = paddle::framework::OpInfoMap::Instance().map(); + + std::vector op_function_list, bind_function_list; + auto& all_kernels = paddle::framework::OperatorWithKernel::AllOpKernels(); + + for (auto& pair : op_info_map) { + auto& op_info = pair.second; + auto op_proto = op_info.proto_; + if (op_proto == nullptr) { + continue; + } + auto& op_type = op_proto->type(); + // Skip ooerator which is not inherit form OperatorWithKernel, like while, + // since only OperatorWithKernel can run in dygraph mode. 
+ // if the pten lib contains op kernel, we still generate ops method + if (!all_kernels.count(op_type) && + !pten::KernelFactory::Instance().HasCompatiblePtenKernel(op_type)) { + continue; + } + if (!gen_list.count(op_type)) { + continue; + } + std::string func_name = "eager_api_" + op_type; + std::string op_function_str = GenerateOpFunctionsBody(op_proto, func_name); + + // generate pybind item + auto bind_function_str = paddle::string::Sprintf( + PYBIND_ITEM_TEMPLATE, op_type, func_name, op_type); + + op_function_list.emplace_back(std::move(op_function_str)); + bind_function_list.emplace_back(std::move(bind_function_str)); + } + return std::make_tuple(op_function_list, bind_function_list); +} + +int main(int argc, char* argv[]) { + if (argc != 2) { + std::cerr << "argc must be 2" << std::endl; + return -1; + } + +#ifdef PADDLE_WITH_ASCEND_CL + auto ascend_ptr = paddle::framework::AscendInstance::GetInstance(); + ascend_ptr->InitGEForUT(); +#endif + + std::vector headers{ + "\"pybind11/detail/common.h\"", + "\"paddle/fluid/pybind/op_function_common.h\"", + "\"paddle/fluid/pybind/exception.h\"", ""}; + + std::ofstream out(argv[1], std::ios::out); + + out << "#pragma once\n\n"; + + for (auto& header : headers) { + out << "#include " + header + "\n"; + } + + out << "\n\n"; + + auto op_funcs = GenerateOpFunctions(); + + out << "namespace paddle {\n" + << "namespace pybind {\n\n"; + out << paddle::string::join_strings(std::get<0>(op_funcs), '\n'); + out << "\n\n"; + + out << "static PyMethodDef ExtestMethods[] = {\n" + << paddle::string::join_strings(std::get<1>(op_funcs), '\n') + << "\n {nullptr,nullptr,0,nullptr}" + << "};\n\n"; + + out << "inline void BindEagerOpFunctions(pybind11::module *module) {\n" + << " auto m = module->def_submodule(\"ops\");\n" + << " if (PyModule_AddFunctions(m.ptr(), ExtestMethods) < 0) {\n" + << " PADDLE_THROW(platform::errors::Fatal (\"Add functions to " + "core.eager.ops failed!\"));\n" + << " }\n\n" + << " InitOpsAttrTypeMap();" + << "}\n\n" + << "} // namespace pybind\n" + << "} // namespace paddle\n"; + + out.close(); + +#ifdef PADDLE_WITH_ASCEND_CL + ge::GEFinalize(); +#endif + + return 0; +} diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc new file mode 100644 index 0000000000000..7f20f32e81a5e --- /dev/null +++ b/paddle/fluid/pybind/eager_properties.cc @@ -0,0 +1,166 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +// disable numpy compile error +#include + +#include +#include + +#include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/pten/common/data_type.h" +#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/core.h" +#pragma GCC diagnostic ignored "-Wwrite-strings" + +namespace paddle { +namespace pybind { + +extern PyTypeObject* p_eager_tensor_type; + +PyObject* eager_tensor_properties_get_name(EagerTensorObject* self, + void* closure) { + EAGER_TRY + self->eagertensor.SyncToTensor(); + return ToPyObject(self->eagertensor.name()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +int eager_tensor_properties_set_name(EagerTensorObject* self, PyObject* value, + void* closure) { + EAGER_TRY + self->eagertensor.SyncToTensor(); + self->eagertensor.set_name(CastPyArg2AttrString(value, 0)); + return 0; + EAGER_CATCH_AND_THROW_RETURN_ZERO +} + +PyObject* eager_tensor_properties_get_stop_gradient(EagerTensorObject* self, + void* closure) { + EAGER_TRY + self->eagertensor.SyncToTensor(); + auto meta = egr::EagerUtils::autograd_meta(&self->eagertensor); + return ToPyObject(meta->StopGradient()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +PyObject* eager_tensor_properties_get_grad(EagerTensorObject* self, + void* closure) { + EAGER_TRY + self->eagertensor.SyncToTensor(); + auto meta = egr::EagerUtils::unsafe_autograd_meta(self->eagertensor); + return ToPyObject(meta->Grad()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +int eager_tensor_properties_set_stop_gradient(EagerTensorObject* self, + PyObject* value, void* closure) { + EAGER_TRY + self->eagertensor.SyncToTensor(); + auto meta = egr::EagerUtils::autograd_meta(&self->eagertensor); + meta->SetStopGradient(CastPyArg2AttrBoolean(value, 0)); + return 0; + EAGER_CATCH_AND_THROW_RETURN_ZERO +} + +PyObject* eager_tensor_properties_get_persistable(EagerTensorObject* self, + void* closure) { + EAGER_TRY + self->eagertensor.SyncToTensor(); + auto meta = egr::EagerUtils::autograd_meta(&self->eagertensor); + return ToPyObject(meta->Persistable()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +int eager_tensor_properties_set_persistable(EagerTensorObject* self, + PyObject* value, void* closure) { + EAGER_TRY + self->eagertensor.SyncToTensor(); + auto meta = egr::EagerUtils::autograd_meta(&self->eagertensor); + meta->SetPersistable(CastPyArg2AttrBoolean(value, 0)); + return 0; + EAGER_CATCH_AND_THROW_RETURN_ZERO +} + +PyObject* eager_tensor_properties_get_shape(EagerTensorObject* self, + void* closure) { + EAGER_TRY + self->eagertensor.SyncToTensor(); + auto ddim = self->eagertensor.shape(); + std::vector value; + size_t rank = static_cast(ddim.size()); + value.resize(rank); + for (size_t i = 0; i < rank; i++) { + value[i] = ddim[i]; + } + + return ToPyObject(value); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +PyObject* eager_tensor_properties_get_place(EagerTensorObject* self, + void* closure) { + EAGER_TRY + self->eagertensor.SyncToTensor(); + return ToPyObject(self->eagertensor.place()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +PyObject* eager_tensor_properties_get_place_str(EagerTensorObject* self, + void* closure) { + EAGER_TRY + 
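+  // Note: every accessor in this file calls SyncToTensor() first, presumably so
+  // that data still held in the wrapped framework::Variable is synchronized
+  // into the eager tensor before the property is read or written.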
self->eagertensor.SyncToTensor(); + std::stringstream ostr; + ostr << self->eagertensor.place(); + return ToPyObject(ostr.str()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +PyObject* eager_tensor_properties_get_dtype(EagerTensorObject* self, + void* closure) { + EAGER_TRY + self->eagertensor.SyncToTensor(); + return ToPyObject(pten::DataType2String(self->eagertensor.type())); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +struct PyGetSetDef variable_properties[] = { + {"grad", (getter)eager_tensor_properties_get_grad, nullptr, nullptr, + nullptr}, + {"name", (getter)eager_tensor_properties_get_name, + (setter)eager_tensor_properties_set_name, nullptr, nullptr}, + {"stop_gradient", (getter)eager_tensor_properties_get_stop_gradient, + (setter)eager_tensor_properties_set_stop_gradient, nullptr, nullptr}, + {"persistable", (getter)eager_tensor_properties_get_persistable, + (setter)eager_tensor_properties_set_persistable, nullptr, nullptr}, + {"shape", (getter)eager_tensor_properties_get_shape, nullptr, nullptr, + nullptr}, + // {"is_leaf", (getter)eager_tensor_properties_get_is_leaf, nullptr, + // nullptr, + // nullptr}, + {"place", (getter)eager_tensor_properties_get_place, nullptr, nullptr, + nullptr}, + {"_place_str", (getter)eager_tensor_properties_get_place_str, nullptr, + nullptr, nullptr}, + {"dtype", (getter)eager_tensor_properties_get_dtype, nullptr, nullptr, + nullptr}, + {nullptr, nullptr, nullptr, nullptr, nullptr}}; + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc new file mode 100644 index 0000000000000..eb53884186ffc --- /dev/null +++ b/paddle/fluid/pybind/eager_utils.cc @@ -0,0 +1,451 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include +#include + +#include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/operators/py_func_op.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/op_function_common.h" +#include "paddle/fluid/pybind/tensor_py.h" +#include "paddle/pten/common/data_type.h" +#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/core.h" + +namespace paddle { +namespace pybind { + +extern PyTypeObject* p_eager_tensor_type; + +extern PyTypeObject* g_place_pytype; +extern PyTypeObject* g_cudaplace_pytype; +extern PyTypeObject* g_cpuplace_pytype; +extern PyTypeObject* g_xpuplace_pytype; +extern PyTypeObject* g_npuplace_pytype; +extern PyTypeObject* g_cudapinnedplace_pytype; + +int TensorDtype2NumpyDtype(pten::DataType dtype) { + switch (dtype) { + case pten::DataType::BOOL: + return pybind11::detail::npy_api::NPY_BOOL_; + case pten::DataType::INT8: + return pybind11::detail::npy_api::NPY_INT8_; + case pten::DataType::UINT8: + return pybind11::detail::npy_api::NPY_UINT8_; + case pten::DataType::INT16: + return pybind11::detail::npy_api::NPY_INT16_; + case pten::DataType::INT32: + return pybind11::detail::npy_api::NPY_INT32_; + case pten::DataType::INT64: + return pybind11::detail::npy_api::NPY_INT64_; + case pten::DataType::FLOAT16: + return pybind11::detail::NPY_FLOAT16_; + case pten::DataType::FLOAT32: + return pybind11::detail::npy_api::NPY_FLOAT_; + case pten::DataType::FLOAT64: + return pybind11::detail::npy_api::NPY_DOUBLE_; + case pten::DataType::COMPLEX64: + return pybind11::detail::NPY_COMPLEX64; + case pten::DataType::COMPLEX128: + return pybind11::detail::NPY_COMPLEX128; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unknow pten::DataType, the int value = %d.", + static_cast(dtype))); + return 0; + } +} + +bool PyObject_CheckLongOrConvertToLong(PyObject** obj) { + if ((PyLong_Check(*obj) && !PyBool_Check(*obj))) { + return true; + } + + if (std::string((reinterpret_cast((*obj)->ob_type))->tp_name) + .find("numpy") != std::string::npos) { + auto to = PyNumber_Long(*obj); + if (to) { + *obj = to; + return true; + } + } + + return false; +} + +bool PyObject_CheckFloatOrConvertToFloat(PyObject** obj) { + // sometimes users provide PyLong or numpy.int64 but attr is float + if (PyFloat_Check(*obj) || PyLong_Check(*obj)) { + return true; + } + if (std::string((reinterpret_cast((*obj)->ob_type))->tp_name) + .find("numpy") != std::string::npos) { + auto to = PyNumber_Float(*obj); + if (to) { + *obj = to; + return true; + } + } + return false; +} + +bool PyObject_CheckStr(PyObject* obj) { return PyUnicode_Check(obj); } + +bool CastPyArg2AttrBoolean(PyObject* obj, ssize_t arg_pos) { + if (obj == Py_None) { + return false; // To be compatible with QA integration testing. Some + // test case pass in None. 
+ } else if (obj == Py_True) { + return true; + } else if (obj == Py_False) { + return false; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "bool, but got %s", + arg_pos + 1, (reinterpret_cast(obj->ob_type))->tp_name)); + } +} + +int CastPyArg2AttrInt(PyObject* obj, ssize_t arg_pos) { + if (PyObject_CheckLongOrConvertToLong(&obj)) { + return static_cast(PyLong_AsLong(obj)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "int, but got %s", + arg_pos + 1, (reinterpret_cast(obj->ob_type))->tp_name)); + } +} + +int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos) { + if (PyObject_CheckLongOrConvertToLong(&obj)) { + return (int64_t)PyLong_AsLong(obj); // NOLINT + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "long, but got %s", + arg_pos + 1, (reinterpret_cast(obj->ob_type))->tp_name)); + } +} + +float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos) { + if (PyObject_CheckFloatOrConvertToFloat(&obj)) { + return static_cast(PyFloat_AsDouble(obj)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "float, but got %s", + arg_pos + 1, (reinterpret_cast(obj->ob_type))->tp_name)); + } +} + +std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos) { + if (PyObject_CheckStr(obj)) { + Py_ssize_t size; + const char* data; + data = PyUnicode_AsUTF8AndSize(obj, &size); + return std::string(data, static_cast(size)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "str, but got %s", + arg_pos + 1, (reinterpret_cast(obj->ob_type))->tp_name)); + return ""; + } +} + +egr::EagerTensor CastPyArg2EagerTensor(PyObject* obj, ssize_t arg_pos) { + if (PyObject_IsInstance(obj, + reinterpret_cast(p_eager_tensor_type))) { + return reinterpret_cast(obj)->eagertensor; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "EagerTensor, but got %s", + arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); + } +} + +std::vector CastPyArg2VectorOfEagerTensor(PyObject* obj, + ssize_t arg_pos) { + std::vector result; + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (PyObject_IsInstance( + item, reinterpret_cast(p_eager_tensor_type))) { + result.emplace_back( + reinterpret_cast(item)->eagertensor); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "list of bool, but got %s at pos %d", + arg_pos + 1, + reinterpret_cast(item->ob_type)->tp_name, i)); + } + } + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GetItem(obj, i); + if (PyObject_IsInstance( + item, reinterpret_cast(p_eager_tensor_type))) { + result.emplace_back( + reinterpret_cast(item)->eagertensor); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "list of bool, but got %s at pos %d", + arg_pos + 1, + reinterpret_cast(item->ob_type)->tp_name, i)); + } + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "list or tuple, but got %s", + arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); + } + return result; +} + +platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos) { + 
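+  // Try each of the pybind11-registered place types in turn; the
+  // g_*place_pytype pointers declared extern above are filled in elsewhere in
+  // the pybind bindings, so the eager API accepts the same Place objects as
+  // the existing dygraph path.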
platform::Place place; + if (PyObject_IsInstance(obj, reinterpret_cast(g_place_pytype))) { + place = ::pybind11::handle(obj).cast(); + } else if (PyObject_IsInstance( + obj, reinterpret_cast(g_cudaplace_pytype))) { + place = ::pybind11::handle(obj).cast(); + } else if (PyObject_IsInstance( + obj, reinterpret_cast(g_cpuplace_pytype))) { + place = ::pybind11::handle(obj).cast(); + } else if (PyObject_IsInstance( + obj, reinterpret_cast(g_xpuplace_pytype))) { + place = ::pybind11::handle(obj).cast(); + } else if (PyObject_IsInstance( + obj, reinterpret_cast(g_npuplace_pytype))) { + place = ::pybind11::handle(obj).cast(); + } else if (PyObject_IsInstance( + obj, reinterpret_cast(g_cudapinnedplace_pytype))) { + place = ::pybind11::handle(obj).cast(); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "one of(Place,CUDAPlace,CPUPlace,XPUPlace,NPUPlace,CUDAPinnedPlace), " + "but got %s", + arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); + } + return place; +} + +PyObject* ToPyObject(bool value) { + if (value) { + Py_INCREF(Py_True); + return Py_True; + } else { + Py_INCREF(Py_False); + return Py_False; + } +} + +PyObject* ToPyObject(int value) { return PyLong_FromLong(value); } + +PyObject* ToPyObject(int64_t value) { return PyLong_FromLongLong(value); } + +PyObject* ToPyObject(float value) { return PyLong_FromDouble(value); } + +PyObject* ToPyObject(double value) { return PyLong_FromDouble(value); } + +PyObject* ToPyObject(const char* value) { return PyUnicode_FromString(value); } + +PyObject* ToPyObject(const std::string& value) { + return PyUnicode_FromString(value.c_str()); +} + +PyObject* ToPyObject(const egr::EagerTensor& value) { + PyObject* obj = p_eager_tensor_type->tp_alloc(p_eager_tensor_type, 0); + if (obj) { + auto v = reinterpret_cast(obj); + new (&(v->eagertensor)) egr::EagerTensor(); + v->eagertensor = value; + } else { + PADDLE_THROW(platform::errors::Fatal( + "tp_alloc return null, can not new a PyObject.")); + } + return obj; +} + +PyObject* ToPyObject(const std::vector& value) { + PyObject* result = PyList_New((Py_ssize_t)value.size()); + + for (size_t i = 0; i < value.size(); i++) { + PyList_SET_ITEM(result, static_cast(i), ToPyObject(value[i])); + } + + return result; +} + +PyObject* ToPyObject(const std::vector& value) { + PyObject* result = PyList_New((Py_ssize_t)value.size()); + + for (size_t i = 0; i < value.size(); i++) { + PyList_SET_ITEM(result, static_cast(i), ToPyObject(value[i])); + } + + return result; +} + +PyObject* ToPyObject(const std::vector& value) { + PyObject* result = PyList_New((Py_ssize_t)value.size()); + + for (size_t i = 0; i < value.size(); i++) { + PyList_SET_ITEM(result, (Py_ssize_t)i, ToPyObject(value[i])); + } + + return result; +} + +PyObject* ToPyObject(const std::vector& value) { + PyObject* result = PyList_New((Py_ssize_t)value.size()); + + for (size_t i = 0; i < value.size(); i++) { + PyList_SET_ITEM(result, static_cast(i), ToPyObject(value[i])); + } + + return result; +} + +PyObject* ToPyObject(const std::vector& value) { + PyObject* result = PyList_New((Py_ssize_t)value.size()); + + for (size_t i = 0; i < value.size(); i++) { + PyList_SET_ITEM(result, static_cast(i), ToPyObject(value[i])); + } + + return result; +} + +PyObject* ToPyObject(const std::vector& value) { + PyObject* result = PyList_New((Py_ssize_t)value.size()); + + for (size_t i = 0; i < value.size(); i++) { + PyObject* obj = p_eager_tensor_type->tp_alloc(p_eager_tensor_type, 0); + if (obj) { + auto v = 
reinterpret_cast(obj); + new (&(v->eagertensor)) egr::EagerTensor(); + v->eagertensor = value[i]; + } else { + PADDLE_THROW(platform::errors::Fatal( + "tp_alloc return null, can not new a PyObject.")); + } + PyList_SET_ITEM(result, static_cast(i), obj); + } + + return result; +} + +PyObject* ToPyObject(const platform::Place& value) { + auto obj = ::pybind11::cast(value); + obj.inc_ref(); + return obj.ptr(); +} + +egr::EagerTensor GetEagerTensorFromArgs(const std::string& op_type, + const std::string& arg_name, + PyObject* args, ssize_t arg_idx, + bool dispensable) { + PyObject* obj = PyTuple_GET_ITEM(args, arg_idx); + + if (PyTuple_Check(obj)) { + obj = PyTuple_GET_ITEM(obj, 0); + } + + if (obj == nullptr || obj == Py_None) { + if (!dispensable) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be Tensor, but got None", + op_type, arg_name, arg_idx)); + } + egr::EagerTensor emptytensor; + return emptytensor; + } + + return reinterpret_cast(obj)->eagertensor; +} + +std::vector GetEagerTensorListFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable) { + PyObject* list = PyTuple_GET_ITEM(args, arg_idx); + + if (list == nullptr) { + if (!dispensable) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensor, but got " + "None", + op_type, arg_name, arg_idx)); + } + return {}; + } + + std::vector result; + + if (PyList_Check(list)) { + Py_ssize_t len = PyList_Size(list); + if (len == 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but got " + "empty list", + op_type, arg_name, arg_idx)); + } + for (Py_ssize_t i = 0; i < len; i++) { + result.emplace_back( + reinterpret_cast(PyList_GetItem(list, i)) + ->eagertensor); + } + } else if (PyTuple_Check(list)) { + Py_ssize_t len = PyTuple_Size(list); + if (len == 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but got " + "empty list", + op_type, arg_name, arg_idx)); + } + for (Py_ssize_t i = 0; i < len; i++) { + result.emplace_back( + reinterpret_cast(PyTuple_GetItem(list, i)) + ->eagertensor); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but got " + "%s", + op_type, arg_name, arg_idx, + (reinterpret_cast(list->ob_type))->tp_name)); + } + + return result; +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h new file mode 100644 index 0000000000000..e72820c4dbe8c --- /dev/null +++ b/paddle/fluid/pybind/eager_utils.h @@ -0,0 +1,89 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace paddle { +namespace pybind { + +typedef struct { + PyObject_HEAD egr::EagerTensor eagertensor; +} EagerTensorObject; + +int TensorDtype2NumpyDtype(pten::DataType dtype); + +bool PyObject_CheckLongOrConvertToLong(PyObject** obj); +bool PyObject_CheckFloatOrConvertToFloat(PyObject** obj); +bool PyObject_CheckStr(PyObject* obj); +bool CastPyArg2AttrBoolean(PyObject* obj, ssize_t arg_pos); +int CastPyArg2AttrInt(PyObject* obj, ssize_t arg_pos); +int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos); +float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos); +std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos); +egr::EagerTensor CastPyArg2EagerTensor(PyObject* obj, ssize_t arg_pos); +std::vector CastPyArg2VectorOfEagerTensor(PyObject* obj, + ssize_t arg_pos); +platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos); + +PyObject* ToPyObject(int value); +PyObject* ToPyObject(bool value); +PyObject* ToPyObject(int64_t value); +PyObject* ToPyObject(float value); +PyObject* ToPyObject(double value); +PyObject* ToPyObject(const char* value); +PyObject* ToPyObject(const std::string& value); +PyObject* ToPyObject(const egr::EagerTensor& value); +PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const platform::Place& value); + +template +struct TupleEagerTensorResult { + static void Run(const Tuple& out, PyObject* result) { + TupleEagerTensorResult::Run(out, result); + PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out))); + } +}; + +template +struct TupleEagerTensorResult { + static void Run(const Tuple& out, PyObject* result) { + PyTuple_SET_ITEM(result, 0, ToPyObject(std::get<0>(out))); + } +}; + +template +PyObject* ToPyObject(const std::tuple& out) { + auto len = sizeof...(Args); + PyObject* result = PyTuple_New(len); + + TupleEagerTensorResult::Run(out, result); + + return result; +} + +egr::EagerTensor GetEagerTensorFromArgs(const std::string& op_type, + const std::string& arg_name, + PyObject* args, ssize_t arg_idx, + bool dispensable = false); +std::vector GetEagerTensorListFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable = false); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/exception.cc b/paddle/fluid/pybind/exception.cc index 3d07985ff654e..362a3e44fab62 100644 --- a/paddle/fluid/pybind/exception.cc +++ b/paddle/fluid/pybind/exception.cc @@ -81,5 +81,48 @@ void BindException(pybind11::module* m) { }); } +void ThrowExceptionToPython(std::exception_ptr p) { + static PyObject* EOFExceptionException = + PyErr_NewException("paddle.EOFException", PyExc_Exception, NULL); + static PyObject* EnforceNotMetException = + PyErr_NewException("paddle.EnforceNotMet", PyExc_Exception, NULL); + try { + if (p) std::rethrow_exception(p); + } catch (const platform::EOFException& e) { + PyErr_SetString(EOFExceptionException, e.what()); + } catch (const platform::EnforceNotMet& e) { + switch (e.code()) { + case paddle::platform::error::INVALID_ARGUMENT: + PyErr_SetString(PyExc_ValueError, e.what()); + break; + case paddle::platform::error::NOT_FOUND: + case paddle::platform::error::ALREADY_EXISTS: + case 
paddle::platform::error::PRECONDITION_NOT_MET: + case paddle::platform::error::PERMISSION_DENIED: + case paddle::platform::error::EXECUTION_TIMEOUT: + case paddle::platform::error::UNAVAILABLE: + PyErr_SetString(PyExc_RuntimeError, e.what()); + break; + case paddle::platform::error::OUT_OF_RANGE: + PyErr_SetString(PyExc_IndexError, e.what()); + break; + case paddle::platform::error::RESOURCE_EXHAUSTED: + PyErr_SetString(PyExc_MemoryError, e.what()); + break; + case paddle::platform::error::UNIMPLEMENTED: + PyErr_SetString(PyExc_NotImplementedError, e.what()); + break; + case paddle::platform::error::FATAL: + PyErr_SetString(PyExc_SystemError, e.what()); + break; + case paddle::platform::error::EXTERNAL: + PyErr_SetString(PyExc_OSError, e.what()); + break; + default: + PyErr_SetString(EnforceNotMetException, e.what()); + break; + } + } +} } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/exception.h b/paddle/fluid/pybind/exception.h index 5e054267361f2..cf82f464a11f2 100644 --- a/paddle/fluid/pybind/exception.h +++ b/paddle/fluid/pybind/exception.h @@ -18,10 +18,26 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "pybind11/pybind11.h" +#define EAGER_TRY try { +#define EAGER_CATCH_AND_THROW_RETURN_NULL \ + } \ + catch (...) { \ + ThrowExceptionToPython(std::current_exception()); \ + return nullptr; \ + } + +#define EAGER_CATCH_AND_THROW_RETURN_ZERO \ + } \ + catch (...) { \ + ThrowExceptionToPython(std::current_exception()); \ + return 0; \ + } + namespace paddle { namespace pybind { void BindException(pybind11::module* m); +void ThrowExceptionToPython(std::exception_ptr p); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 5ff0e58d85801..dc97d98e8c47f 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -37,6 +37,7 @@ limitations under the License. */ #include "paddle/fluid/imperative/data_loader.h" #include "paddle/fluid/imperative/gloo_context.h" #include "paddle/fluid/imperative/hccl_context.h" +#include "paddle/fluid/imperative/heter_ccl_context.h" #include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/nccl_context.h" @@ -59,18 +60,6 @@ PyTypeObject *g_varbase_pytype = nullptr; namespace py = ::pybind11; -class Layer : public imperative::Layer { - public: - using imperative::Layer::Layer; // Inherit constructors - - std::vector> Forward( - const std::vector> &inputs) - override { - PYBIND11_OVERLOAD(std::vector>, Layer, - Forward, inputs); // NOLINT - } -}; - template static T PyObjectCast(PyObject *obj) { try { @@ -1549,7 +1538,7 @@ void BindImperative(py::module *m_ptr) { self.MutableGradVarBase()->SetType(type); }) .def("_reset_grad_inplace_version", - [](imperative::VarBase &self) { + [](imperative::VarBase &self, bool set_to_zero) { /* *** This interfaceis a complete hack *** reset_grad_inplace_version removes all inplace related records to @@ -1561,15 +1550,20 @@ void BindImperative(py::module *m_ptr) { Make sure you fully understand what you're doing before make use of this interface, and prepare for the worst. 
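For reference, a minimal sketch of how the EAGER_TRY / EAGER_CATCH_AND_THROW_RETURN_NULL macros introduced in exception.h above are intended to be used: they bracket a raw Python C-API entry point so that any C++ exception is converted by ThrowExceptionToPython into the corresponding Python exception. The function name and body below are purely illustrative and are not part of this patch.

  static PyObject* eager_api_demo(PyObject* self, PyObject* args,
                                  PyObject* kwargs) {
    EAGER_TRY
    // A PADDLE_THROW(platform::errors::InvalidArgument(...)) raised in here
    // would surface on the Python side as a ValueError, per the mapping in
    // ThrowExceptionToPython.
    Py_RETURN_NONE;
    EAGER_CATCH_AND_THROW_RETURN_NULL
  }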
*/ + py::gil_scoped_release release; + if (self.HasGradVar()) { auto grad_var = self.GradVarBase(); auto var_wrapper = grad_var->SharedVar(); - if (var_wrapper) var_wrapper->ResetInplaceVersion(); + if (var_wrapper) { + var_wrapper->ResetInplaceVersion(set_to_zero); + } } }) .def("_grad_ivar", [](const imperative::VarBase &self) { auto &grad_var = self.GradVarBase(); + if (grad_var && grad_var->Var().IsInitialized()) { auto *tensor = grad_var->MutableVar()->IsType() @@ -1578,6 +1572,7 @@ void BindImperative(py::module *m_ptr) { : grad_var->MutableVar() ->GetMutable() ->mutable_value(); + if (tensor->IsInitialized()) { return grad_var; } @@ -1756,7 +1751,7 @@ void BindImperative(py::module *m_ptr) { "Cannot copy this Tensor to GPU in CPU version Paddle, " "Please recompile or reinstall Paddle with CUDA support.")); #else - int device_count = platform::GetCUDADeviceCount(); + int device_count = platform::GetGPUDeviceCount(); int device_id = 0; if (handle == py::none()) { if (platform::is_gpu_place(self->Place())) { @@ -1975,10 +1970,6 @@ void BindImperative(py::module *m_ptr) { .def("_numel", [](std::shared_ptr &self) { auto *t = self->MutableVar()->GetMutable(); - PADDLE_ENFORCE_EQ( - t->IsInitialized(), true, - platform::errors::InvalidArgument( - "Tensor %s has not been initialized!", self->Name())); return t->numel(); }) .def_property("name", &imperative::VarBase::Name, @@ -2051,18 +2042,6 @@ void BindImperative(py::module *m_ptr) { .def_property_readonly("type", &imperative::VarBase::Type) .def_property_readonly("dtype", &imperative::VarBase::DataType); - // NOTE(zhiqiu): set the metaclass of Layer. - // See details: https://github.com/pybind/pybind11/pull/679 - // https://github.com/pybind/pybind11/blob/028812ae7eee307dca5f8f69d467af7b92cc41c8/tests/test_methods_and_attributes.cpp#L284 - py::class_ layer( - m, "Layer", py::metaclass((PyObject *)&PyType_Type)); // NOLINT - layer.def(py::init<>()) - .def("forward", - [](imperative::Layer &self, - const std::vector> &inputs) { - return self.Forward(inputs); - }); - py::class_(m, "ProgramDescTracer", "") .def("create_program_desc", &imperative::jit::ProgramDescTracer::CreateProgramDesc) @@ -2360,6 +2339,15 @@ void BindImperative(py::module *m_ptr) { py::arg("ring_id")); #endif +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) + py::class_>( + m, "HeterParallelContext") + .def(py::init()) + .def("init", [](imperative::HeterParallelContext &self) { self.Init(); }); +#endif + m.def("pylayer_apply", [](const platform::CPUPlace &place, const py::object &cls, const py::args args, const py::kwargs kwargs) { diff --git a/paddle/fluid/pybind/op_function.h b/paddle/fluid/pybind/op_function.h index 997cb610fafca..7b9379df6be2c 100644 --- a/paddle/fluid/pybind/op_function.h +++ b/paddle/fluid/pybind/op_function.h @@ -29,34 +29,14 @@ #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/imperative.h" +#include "paddle/fluid/pybind/op_function_common.h" namespace py = pybind11; namespace paddle { namespace pybind { -class OpAttrTypeMap { - public: - static OpAttrTypeMap& Instance() { - static OpAttrTypeMap g_op_attr_type_map; - return g_op_attr_type_map; - } - - std::unordered_map< - std::string, - std::unordered_map>& - Map() { - return ops_attrtype_map_; - } - - private: - OpAttrTypeMap() = default; - 
std::unordered_map< - std::string, - std::unordered_map> - ops_attrtype_map_; -}; - static inline std::shared_ptr CastPyHandleToVarBase( const std::string& op_type, const std::string& arg_name, int arg_idx, const py::handle& handle, bool dispensable = false) { @@ -197,737 +177,7 @@ static inline void HandleViewBetweenInputAndOutput( } } -extern PyTypeObject* g_varbase_pytype; -extern PyTypeObject* g_vartype_pytype; -extern PyTypeObject* g_blockdesc_pytype; - -inline bool PyObject_CheckBool(PyObject** obj) { return PyBool_Check(*obj); } - -inline bool PyObject_CheckLongOrToLong(PyObject** obj) { - if ((PyLong_Check(*obj) && !PyBool_Check(*obj)) || - PyObject_IsInstance(*obj, (PyObject*)g_vartype_pytype) || // NOLINT - PyObject_IsInstance(*obj, (PyObject*)g_varbase_pytype)) { // NOLINT - return true; - } - - if (std::string(((PyTypeObject*)(*obj)->ob_type)->tp_name) // NOLINT - .find("numpy") != std::string::npos) { - auto to = PyNumber_Long(*obj); - if (to) { - *obj = to; - return true; - } - } - - return false; -} - -inline bool PyObject_CheckFloatOrToFloat(PyObject** obj) { - // sometimes users provide PyLong or numpy.int64 but attr is float - if (PyFloat_Check(*obj) || PyLong_Check(*obj) || - PyObject_IsInstance(*obj, (PyObject*)g_varbase_pytype)) { // NOLINT - return true; - } - if (std::string(((PyTypeObject*)(*obj)->ob_type)->tp_name) // NOLINT - .find("numpy") != std::string::npos) { - auto to = PyNumber_Float(*obj); - if (to) { - *obj = to; - return true; - } - } - return false; -} - -inline bool PyObject_CheckString(PyObject* obj) { return PyUnicode_Check(obj); } - -static inline void CastPyArg2AttrBoolean( - PyObject* obj, - paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - if (obj == Py_None) { - attrs[key] = false; // To be compatible with QA integration testing. Some - // test case pass in None. 
- } else if (obj == Py_True) { - attrs[key] = true; - } else if (obj == Py_False) { - attrs[key] = false; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "bool, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } -} - -static inline void CastPyArg2AttrInt( - PyObject* obj, - paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - if (PyObject_CheckLongOrToLong(&obj)) { - attrs[key] = (int)PyLong_AsLong(obj); // NOLINT - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "int, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } -} - -static inline void CastPyArg2AttrLong( - PyObject* obj, - paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - if (PyObject_CheckLongOrToLong(&obj)) { - attrs[key] = (int64_t)PyLong_AsLong(obj); // NOLINT - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "long, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } -} - -static inline void CastPyArg2AttrFloat( - PyObject* obj, - paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - if (PyObject_CheckFloatOrToFloat(&obj)) { - attrs[key] = (float)PyFloat_AsDouble(obj); // NOLINT - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "float, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } -} - -static inline void CastPyArg2AttrString( - PyObject* obj, - paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - if (PyObject_CheckString(obj)) { - Py_ssize_t size; - const char* data; - data = PyUnicode_AsUTF8AndSize(obj, &size); - attrs[key] = std::string(data, static_cast(size)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "str, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } -} - -static inline void CastPyArg2AttrBooleans( - PyObject* obj, paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - if (PyList_Check(obj)) { - Py_ssize_t len = PyList_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyList_GetItem(obj, i); - if (PyObject_CheckBool(&item)) { - value.emplace_back(PyLong_AsLong(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of bool, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else if (PyTuple_Check(obj)) { - Py_ssize_t len = PyTuple_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyTuple_GetItem(obj, i); - if (PyObject_CheckBool(&item)) { - value.emplace_back(PyLong_AsLong(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of bool, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // 
NOLINT - i)); - } - } - attrs[key] = value; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list or tuple, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } -} - -static inline void CastPyArg2AttrInts( - PyObject* obj, - paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - if (PyList_Check(obj)) { - Py_ssize_t len = PyList_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyList_GetItem(obj, i); - if (PyObject_CheckLongOrToLong(&item)) { - value.emplace_back(PyLong_AsLong(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of int, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else if (PyTuple_Check(obj)) { - Py_ssize_t len = PyTuple_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyTuple_GetItem(obj, i); - if (PyObject_CheckLongOrToLong(&item)) { - value.emplace_back(PyLong_AsLong(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of int, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else if (PySequence_Check(obj)) { - Py_ssize_t len = PySequence_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PySequence_GetItem(obj, i); - if (PyObject_CheckLongOrToLong(&item)) { - value.emplace_back(PyLong_AsLong(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of int, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list or tuple, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } -} - -static inline void CastPyArg2AttrLongs( - PyObject* obj, - paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - if (PyList_Check(obj)) { - Py_ssize_t len = PyList_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyList_GetItem(obj, i); - if (PyObject_CheckLongOrToLong(&item)) { - value.emplace_back(PyLong_AsLong(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of int, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else if (PyTuple_Check(obj)) { - Py_ssize_t len = PyTuple_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyTuple_GetItem(obj, i); - if (PyObject_CheckLongOrToLong(&item)) { - value.emplace_back(PyLong_AsLong(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of int, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else if 
(PySequence_Check(obj)) { - Py_ssize_t len = PySequence_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PySequence_GetItem(obj, i); - if (PyObject_CheckLongOrToLong(&item)) { - value.emplace_back(PyLong_AsLong(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of int, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list or tuple, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } -} - -static inline void CastPyArg2AttrFloats( - PyObject* obj, - paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - if (PyList_Check(obj)) { - Py_ssize_t len = PyList_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyList_GetItem(obj, i); - if (PyObject_CheckFloatOrToFloat(&item)) { - value.emplace_back(PyFloat_AsDouble(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of float, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else if (PyTuple_Check(obj)) { - Py_ssize_t len = PyTuple_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyTuple_GetItem(obj, i); - if (PyObject_CheckFloatOrToFloat(&item)) { - value.emplace_back(PyFloat_AsDouble(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of float, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else if (PySequence_Check(obj)) { - Py_ssize_t len = PySequence_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PySequence_GetItem(obj, i); - if (PyObject_CheckFloatOrToFloat(&item)) { - value.emplace_back(PyFloat_AsDouble(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of float, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list or tuple, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } -} - -static inline void CastPyArg2AttrFloat64s( - PyObject* obj, paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - if (PyList_Check(obj)) { - Py_ssize_t len = PyList_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyList_GetItem(obj, i); - if (PyObject_CheckFloatOrToFloat(&item)) { - value.emplace_back(PyFloat_AsDouble(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of float, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else if (PyTuple_Check(obj)) { 
- Py_ssize_t len = PyTuple_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyTuple_GetItem(obj, i); - if (PyObject_CheckFloatOrToFloat(&item)) { - value.emplace_back(PyFloat_AsDouble(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of float, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else if (PySequence_Check(obj)) { - Py_ssize_t len = PySequence_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PySequence_GetItem(obj, i); - if (PyObject_CheckFloatOrToFloat(&item)) { - value.emplace_back(PyFloat_AsDouble(item)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of float, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list or tuple, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } -} - -static inline void CastPyArg2AttrStrings( - PyObject* obj, - paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - if (PyList_Check(obj)) { - Py_ssize_t len = PyList_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyList_GetItem(obj, i); - if (PyObject_CheckString(item)) { - Py_ssize_t size; - const char* data; - data = PyUnicode_AsUTF8AndSize(item, &size); - value.emplace_back(std::string(data, (size_t)size)); // NOLINT - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of str, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else if (PyTuple_Check(obj)) { - Py_ssize_t len = PyTuple_Size(obj); - PyObject* item = nullptr; - std::vector value; - for (Py_ssize_t i = 0; i < len; i++) { - item = PyTuple_GetItem(obj, i); - if (PyObject_CheckString(item)) { - Py_ssize_t size; - const char* data; - data = PyUnicode_AsUTF8AndSize(item, &size); - value.emplace_back(std::string(data, static_cast(size))); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list of str, but got %s at pos %d", - op_type, arg_pos + 1, - ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT - i)); - } - } - attrs[key] = value; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list or tuple, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } -} - -static inline void CastPyArg2AttrBlock( - PyObject* obj, paddle::framework::AttributeMap& attrs, // NOLINT - const std::string& key, const std::string& op_type, ssize_t arg_pos) { - ::pybind11::detail::instance* inst = - (::pybind11::detail::instance*)obj; // NOLINT - - if (!PyObject_IsInstance((PyObject*)inst, // NOLINT - (PyObject*)g_blockdesc_pytype)) { // NOLINT - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "BlockDesc, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } - void** vh = 
inst->simple_layout ? inst->simple_value_holder - : &inst->nonsimple.values_and_holders[0]; - attrs[key] = reinterpret_cast(vh[0]); -} - -static inline void ConstructAttrMapFromPyArgs( - const std::string& op_type, PyObject* args, ssize_t attr_start, - ssize_t attr_end, paddle::framework::AttributeMap& attrs) { // NOLINT - PADDLE_ENFORCE_EQ( - (attr_end - attr_start) % 2, 0, - platform::errors::InvalidArgument( - "The number of arguments for attributes should be even.")); - - auto attr_type_map = &(OpAttrTypeMap::Instance().Map()[op_type]); - - PyObject* obj = nullptr; - for (ssize_t arg_pos = attr_start; arg_pos < attr_end; arg_pos += 2) { - Py_ssize_t key_len; - const char* key_ptr; - obj = PyTuple_GET_ITEM(args, arg_pos); - if (PyObject_CheckString(obj)) { - key_ptr = PyUnicode_AsUTF8AndSize(obj, &key_len); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be str, but got " - "%s", - op_type, arg_pos, ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } - - std::string key(key_ptr, static_cast(key_len)); - auto iter = attr_type_map->find(key); - if (iter == attr_type_map->end()) { - continue; - } - - obj = PyTuple_GET_ITEM(args, arg_pos + 1); - - switch (iter->second) { - case paddle::framework::proto::AttrType::INT: - CastPyArg2AttrInt(obj, attrs, key, op_type, arg_pos); - break; - case paddle::framework::proto::AttrType::FLOAT: - CastPyArg2AttrFloat(obj, attrs, key, op_type, arg_pos); - break; - case paddle::framework::proto::AttrType::STRING: - CastPyArg2AttrString(obj, attrs, key, op_type, arg_pos); - break; - case paddle::framework::proto::AttrType::INTS: - CastPyArg2AttrInts(obj, attrs, key, op_type, arg_pos); - break; - case paddle::framework::proto::AttrType::FLOATS: - CastPyArg2AttrFloats(obj, attrs, key, op_type, arg_pos); - break; - case paddle::framework::proto::AttrType::STRINGS: - CastPyArg2AttrStrings(obj, attrs, key, op_type, arg_pos); - break; - case paddle::framework::proto::AttrType::BOOLEAN: - CastPyArg2AttrBoolean(obj, attrs, key, op_type, arg_pos); - break; - case paddle::framework::proto::AttrType::BOOLEANS: - CastPyArg2AttrBooleans(obj, attrs, key, op_type, arg_pos); - break; - case paddle::framework::proto::AttrType::LONG: - CastPyArg2AttrLong(obj, attrs, key, op_type, arg_pos); - break; - case paddle::framework::proto::AttrType::LONGS: - CastPyArg2AttrLongs(obj, attrs, key, op_type, arg_pos); - break; - case paddle::framework::proto::AttrType::FLOAT64S: - CastPyArg2AttrFloat64s(obj, attrs, key, op_type, arg_pos); - break; - case paddle::framework::proto::AttrType::BLOCK: - CastPyArg2AttrBlock(obj, attrs, key, op_type, arg_pos); - break; - default: - break; - } - } -} - -static inline std::shared_ptr GetVarBaseFromArgs( - const std::string& op_type, const std::string& arg_name, PyObject* args, - ssize_t arg_idx, bool dispensable = false) { - ::pybind11::detail::instance* inst = - (::pybind11::detail::instance*)PyTuple_GET_ITEM(args, arg_idx); - - if (PyTuple_Check((PyObject*)inst)) { // NOLINT - inst = (::pybind11::detail::instance*)PyTuple_GET_ITEM(inst, 0); - } - - if (inst == nullptr || (PyObject*)inst == Py_None) { // NOLINT - if (!dispensable) { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument '%s' (position %d) must be Tensor, but got None", - op_type, arg_name, arg_idx)); - } - return nullptr; - } - - if (!PyObject_IsInstance((PyObject*)inst, // NOLINT - (PyObject*)g_varbase_pytype)) { // NOLINT - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument '%s' (position %d) must 
be Tensor, but got " - "%s", - op_type, arg_name, arg_idx, - ((PyTypeObject*)((PyObject*)inst)->ob_type)->tp_name)); // NOLINT - } - - void** vh = inst->simple_layout ? inst->simple_value_holder - : &inst->nonsimple.values_and_holders[0]; - return reinterpret_cast&>(vh[1]); -} - -static inline std::vector> -GetVarBaseListFromArgs(const std::string& op_type, const std::string& arg_name, - PyObject* args, ssize_t arg_idx, - bool dispensable = false) { - PyObject* list = PyTuple_GET_ITEM(args, arg_idx); - - if (list == nullptr) { - if (!dispensable) { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument '%s' (position %d) must be list of Tensor, but got " - "None", - op_type, arg_name, arg_idx)); // NOLINT - } - return {}; - } - - std::vector> result; - - if (PyList_Check(list)) { - Py_ssize_t len = PyList_Size(list); - if (len == 0) { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument '%s' (position %d) must be list of Tensors, but got " - "empty list", - op_type, arg_name, arg_idx)); - } - ::pybind11::detail::instance* item = nullptr; - for (Py_ssize_t i = 0; i < len; i++) { - item = (::pybind11::detail::instance*)PyList_GetItem(list, i); - if (!PyObject_IsInstance((PyObject*)item, // NOLINT - (PyObject*)g_varbase_pytype)) { // NOLINT - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument '%s' (position %d) must be list of Tensors, but " - "got list of " - "%s", - op_type, arg_name, arg_idx, - ((PyTypeObject*)((PyObject*)item)->ob_type)->tp_name)); // NOLINT - } - void** vh = item->simple_layout ? item->simple_value_holder - : &item->nonsimple.values_and_holders[0]; - result.emplace_back( - reinterpret_cast&>( - vh[1])); - } - } else if (PyTuple_Check(list)) { - Py_ssize_t len = PyTuple_Size(list); - if (len == 0) { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument '%s' (position %d) must be list of Tensors, but got " - "empty list", - op_type, arg_name, arg_idx)); - } - ::pybind11::detail::instance* item = nullptr; - for (Py_ssize_t i = 0; i < len; i++) { - item = (::pybind11::detail::instance*)PyTuple_GetItem(list, i); // NOLINT - if (!PyObject_IsInstance((PyObject*)item, // NOLINT - (PyObject*)g_varbase_pytype)) { // NOLINT - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument '%s' (position %d) must be list of Tensors, but " - "got list of " - "%s", - op_type, arg_name, arg_idx, - ((PyTypeObject*)((PyObject*)item)->ob_type)->tp_name)); // NOLINT - } - void** vh = item->simple_layout ? 
item->simple_value_holder - : &item->nonsimple.values_and_holders[0]; - result.emplace_back( - reinterpret_cast&>( - vh[1])); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument '%s' (position %d) must be list of Tensors, but got " - "%s", - op_type, arg_name, arg_idx, - ((PyTypeObject*)list->ob_type)->tp_name)); // NOLINT - } - - return result; -} - -static inline unsigned long GetUnsignedLongFromArgs( // NOLINT - const std::string& op_type, const std::string& arg_name, PyObject* args, - ssize_t arg_idx, bool dispensable = false) { - PyObject* item = PyTuple_GET_ITEM(args, arg_idx); - - if (item == nullptr) { - if (!dispensable) { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument '%s' (position %d) must be long, but got None", - op_type, arg_name, arg_idx)); - } - return 0; - } - - if (PyObject_CheckLongOrToLong(&item)) { - return PyLong_AsUnsignedLong(item); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument '%s' (position %d) must be " - "long, but got %s", - op_type, arg_name, arg_idx, - ((PyTypeObject*)item->ob_type)->tp_name)); // NOLINT - } -} - -static inline PyObject* MakeReturnPyObject( +PyObject* MakeReturnPyObject( const std::shared_ptr& out) { return ::pybind11::detail::type_caster_base::cast_holder( ::pybind11::detail::holder_helper< @@ -936,7 +186,7 @@ static inline PyObject* MakeReturnPyObject( .ptr(); } -static inline PyObject* MakeReturnPyObject( +PyObject* MakeReturnPyObject( const std::vector>& out) { PyObject* result = PyList_New((Py_ssize_t)out.size()); @@ -969,7 +219,7 @@ struct TupleVarBasesResult { }; template -static inline PyObject* MakeReturnPyObject(const std::tuple& out) { +PyObject* MakeReturnPyObject(const std::tuple& out) { auto len = sizeof...(Args); PyObject* result = PyTuple_New(len); @@ -978,64 +228,6 @@ static inline PyObject* MakeReturnPyObject(const std::tuple& out) { return result; } -void InitOpsAttrTypeMap() { - auto op_info_map = paddle::framework::OpInfoMap::Instance().map(); - for (auto iter = op_info_map.begin(); iter != op_info_map.end(); ++iter) { - auto op_proto = iter->second.proto_; - if (op_proto == nullptr) { - continue; - } - auto attrs_proto = op_proto->attrs(); - for (auto& attr : attrs_proto) { - OpAttrTypeMap::Instance().Map()[iter->first][attr.name()] = attr.type(); - } - } -} - -void ThrowExceptionToPython(std::exception_ptr p) { - static PyObject* EOFExceptionException = - PyErr_NewException("paddle.EOFException", PyExc_Exception, NULL); - static PyObject* EnforceNotMetException = - PyErr_NewException("paddle.EnforceNotMet", PyExc_Exception, NULL); - try { - if (p) std::rethrow_exception(p); - } catch (const platform::EOFException& e) { - PyErr_SetString(EOFExceptionException, e.what()); - } catch (const platform::EnforceNotMet& e) { - switch (e.code()) { - case paddle::platform::error::INVALID_ARGUMENT: - PyErr_SetString(PyExc_ValueError, e.what()); - break; - case paddle::platform::error::NOT_FOUND: - case paddle::platform::error::ALREADY_EXISTS: - case paddle::platform::error::PRECONDITION_NOT_MET: - case paddle::platform::error::PERMISSION_DENIED: - case paddle::platform::error::EXECUTION_TIMEOUT: - case paddle::platform::error::UNAVAILABLE: - PyErr_SetString(PyExc_RuntimeError, e.what()); - break; - case paddle::platform::error::OUT_OF_RANGE: - PyErr_SetString(PyExc_IndexError, e.what()); - break; - case paddle::platform::error::RESOURCE_EXHAUSTED: - PyErr_SetString(PyExc_MemoryError, e.what()); - break; - case 
paddle::platform::error::UNIMPLEMENTED: - PyErr_SetString(PyExc_NotImplementedError, e.what()); - break; - case paddle::platform::error::FATAL: - PyErr_SetString(PyExc_SystemError, e.what()); - break; - case paddle::platform::error::EXTERNAL: - PyErr_SetString(PyExc_OSError, e.what()); - break; - default: - PyErr_SetString(EnforceNotMetException, e.what()); - break; - } - } -} - } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc new file mode 100644 index 0000000000000..1f761ae29c2af --- /dev/null +++ b/paddle/fluid/pybind/op_function_common.cc @@ -0,0 +1,806 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include +#include +#include + +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/pybind/imperative.h" +#include "paddle/fluid/pybind/op_function_common.h" + +namespace py = pybind11; +namespace paddle { +namespace pybind { + +class OpAttrTypeMap { + public: + static OpAttrTypeMap& Instance() { + static OpAttrTypeMap g_op_attr_type_map; + return g_op_attr_type_map; + } + + std::unordered_map< + std::string, + std::unordered_map>& + Map() { + return ops_attrtype_map_; + } + + private: + OpAttrTypeMap() = default; + std::unordered_map< + std::string, + std::unordered_map> + ops_attrtype_map_; +}; + +extern PyTypeObject* g_varbase_pytype; +extern PyTypeObject* g_vartype_pytype; +extern PyTypeObject* g_blockdesc_pytype; + +bool PyObject_CheckBool(PyObject** obj) { return PyBool_Check(*obj); } + +bool PyObject_CheckLongOrToLong(PyObject** obj) { + if ((PyLong_Check(*obj) && !PyBool_Check(*obj)) || + PyObject_IsInstance(*obj, (PyObject*)g_vartype_pytype) || // NOLINT + PyObject_IsInstance(*obj, (PyObject*)g_varbase_pytype)) { // NOLINT + return true; + } + + if (std::string(((PyTypeObject*)(*obj)->ob_type)->tp_name) // NOLINT + .find("numpy") != std::string::npos) { + auto to = PyNumber_Long(*obj); + if (to) { + *obj = to; + return true; + } + } + + return false; +} + +bool PyObject_CheckFloatOrToFloat(PyObject** obj) { + // sometimes users provide PyLong or numpy.int64 but attr is float + if (PyFloat_Check(*obj) || PyLong_Check(*obj) || + PyObject_IsInstance(*obj, (PyObject*)g_varbase_pytype)) { // NOLINT + return true; + } + if (std::string(((PyTypeObject*)(*obj)->ob_type)->tp_name) // NOLINT + .find("numpy") != std::string::npos) { + auto to = PyNumber_Float(*obj); + if (to) { + *obj = to; + return true; + } + } + return false; +} + +bool PyObject_CheckString(PyObject* obj) { return PyUnicode_Check(obj); } + +void CastPyArg2AttrBoolean(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const 
std::string& op_type, + ssize_t arg_pos) { + if (obj == Py_None) { + attrs[key] = false; // To be compatible with QA integration testing. Some + // test case pass in None. + } else if (obj == Py_True) { + attrs[key] = true; + } else if (obj == Py_False) { + attrs[key] = false; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "bool, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +void CastPyArg2AttrInt(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos) { + if (PyObject_CheckLongOrToLong(&obj)) { + attrs[key] = (int)PyLong_AsLong(obj); // NOLINT + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "int, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +void CastPyArg2AttrLong(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos) { + if (PyObject_CheckLongOrToLong(&obj)) { + attrs[key] = (int64_t)PyLong_AsLong(obj); // NOLINT + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "long, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +void CastPyArg2AttrFloat(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos) { + if (PyObject_CheckFloatOrToFloat(&obj)) { + attrs[key] = (float)PyFloat_AsDouble(obj); // NOLINT + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "float, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +void CastPyArg2AttrString(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos) { + if (PyObject_CheckString(obj)) { + Py_ssize_t size; + const char* data; + data = PyUnicode_AsUTF8AndSize(obj, &size); + attrs[key] = std::string(data, (size_t)size); // NOLINT + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "str, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +void CastPyArg2AttrBooleans(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos) { + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (PyObject_CheckBool(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of bool, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GetItem(obj, i); + if (PyObject_CheckBool(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of 
bool, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list or tuple, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +void CastPyArg2AttrInts(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos) { + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (PyObject_CheckLongOrToLong(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of int, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GetItem(obj, i); + if (PyObject_CheckLongOrToLong(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of int, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PySequence_Check(obj)) { + Py_ssize_t len = PySequence_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PySequence_GetItem(obj, i); + if (PyObject_CheckLongOrToLong(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of int, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list or tuple, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +void CastPyArg2AttrLongs(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos) { + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (PyObject_CheckLongOrToLong(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of int, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GetItem(obj, i); + if (PyObject_CheckLongOrToLong(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of int, but got %s at pos %d", + op_type, arg_pos + 1, + 
((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PySequence_Check(obj)) { + Py_ssize_t len = PySequence_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PySequence_GetItem(obj, i); + if (PyObject_CheckLongOrToLong(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of int, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list or tuple, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +void CastPyArg2AttrFloats(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos) { + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (PyObject_CheckFloatOrToFloat(&item)) { + value.emplace_back(PyFloat_AsDouble(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of float, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GetItem(obj, i); + if (PyObject_CheckFloatOrToFloat(&item)) { + value.emplace_back(PyFloat_AsDouble(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of float, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PySequence_Check(obj)) { + Py_ssize_t len = PySequence_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PySequence_GetItem(obj, i); + if (PyObject_CheckFloatOrToFloat(&item)) { + value.emplace_back(PyFloat_AsDouble(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of float, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list or tuple, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +void CastPyArg2AttrFloat64s(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos) { + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (PyObject_CheckFloatOrToFloat(&item)) { + value.emplace_back(PyFloat_AsDouble(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of float, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT 
+ i)); + } + } + attrs[key] = value; + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GetItem(obj, i); + if (PyObject_CheckFloatOrToFloat(&item)) { + value.emplace_back(PyFloat_AsDouble(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of float, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PySequence_Check(obj)) { + Py_ssize_t len = PySequence_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PySequence_GetItem(obj, i); + if (PyObject_CheckFloatOrToFloat(&item)) { + value.emplace_back(PyFloat_AsDouble(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of float, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list or tuple, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +void CastPyArg2AttrStrings(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos) { + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (PyObject_CheckString(item)) { + Py_ssize_t size; + const char* data; + data = PyUnicode_AsUTF8AndSize(item, &size); + value.emplace_back(std::string(data, (size_t)size)); // NOLINT + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of str, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GetItem(obj, i); + if (PyObject_CheckString(item)) { + Py_ssize_t size; + const char* data; + data = PyUnicode_AsUTF8AndSize(item, &size); + value.emplace_back(std::string(data, (size_t)size)); // NOLINT + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of str, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list or tuple, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +void CastPyArg2AttrBlock(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos) { + ::pybind11::detail::instance* inst = + (::pybind11::detail::instance*)obj; // NOLINT + + if (!PyObject_IsInstance((PyObject*)inst, // NOLINT + (PyObject*)g_blockdesc_pytype)) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "BlockDesc, but got %s", + op_type, arg_pos + 1, + 
((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } + void** vh = inst->simple_layout ? inst->simple_value_holder + : &inst->nonsimple.values_and_holders[0]; + attrs[key] = reinterpret_cast(vh[0]); +} + +void ConstructAttrMapFromPyArgs( + const std::string& op_type, PyObject* args, ssize_t attr_start, + ssize_t attr_end, paddle::framework::AttributeMap& attrs) { // NOLINT + PADDLE_ENFORCE_EQ( + (attr_end - attr_start) % 2, 0, + platform::errors::InvalidArgument( + "The number of arguments for attributes should be even.")); + + auto attr_type_map = &(OpAttrTypeMap::Instance().Map()[op_type]); + + PyObject* obj = nullptr; + for (ssize_t arg_pos = attr_start; arg_pos < attr_end; arg_pos += 2) { + Py_ssize_t key_len; + const char* key_ptr; + obj = PyTuple_GET_ITEM(args, arg_pos); + if (PyObject_CheckString(obj)) { + key_ptr = PyUnicode_AsUTF8AndSize(obj, &key_len); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be str, but got " + "%s", + op_type, arg_pos, ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } + + std::string key(key_ptr, (size_t)key_len); // NOLINT + auto iter = attr_type_map->find(key); + if (iter == attr_type_map->end()) { + continue; + } + + obj = PyTuple_GET_ITEM(args, arg_pos + 1); + + switch (iter->second) { + case paddle::framework::proto::AttrType::INT: + CastPyArg2AttrInt(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::FLOAT: + CastPyArg2AttrFloat(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::STRING: + CastPyArg2AttrString(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::INTS: + CastPyArg2AttrInts(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::FLOATS: + CastPyArg2AttrFloats(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::STRINGS: + CastPyArg2AttrStrings(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::BOOLEAN: + CastPyArg2AttrBoolean(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::BOOLEANS: + CastPyArg2AttrBooleans(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::LONG: + CastPyArg2AttrLong(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::LONGS: + CastPyArg2AttrLongs(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::FLOAT64S: + CastPyArg2AttrFloat64s(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::BLOCK: + CastPyArg2AttrBlock(obj, attrs, key, op_type, arg_pos); + break; + default: + break; + } + } +} + +std::shared_ptr GetVarBaseFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable) { + ::pybind11::detail::instance* inst = + (::pybind11::detail::instance*)PyTuple_GET_ITEM(args, arg_idx); + + if (PyTuple_Check((PyObject*)inst)) { // NOLINT + inst = (::pybind11::detail::instance*)PyTuple_GET_ITEM(inst, 0); + } + + if (inst == nullptr || (PyObject*)inst == Py_None) { // NOLINT + if (!dispensable) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be Tensor, but got None", + op_type, arg_name, arg_idx)); + } + return nullptr; + } + + if (!PyObject_IsInstance((PyObject*)inst, // NOLINT + (PyObject*)g_varbase_pytype)) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + 
"%s(): argument '%s' (position %d) must be Tensor, but got " + "%s", + op_type, arg_name, arg_idx, + ((PyTypeObject*)((PyObject*)inst)->ob_type)->tp_name)); // NOLINT + } + + void** vh = inst->simple_layout ? inst->simple_value_holder + : &inst->nonsimple.values_and_holders[0]; + return reinterpret_cast&>(vh[1]); +} + +std::vector> GetVarBaseListFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable) { + PyObject* list = PyTuple_GET_ITEM(args, arg_idx); + + if (list == nullptr) { + if (!dispensable) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensor, but got " + "None", + op_type, arg_name, arg_idx)); // NOLINT + } + return {}; + } + + std::vector> result; + + if (PyList_Check(list)) { + Py_ssize_t len = PyList_Size(list); + if (len == 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but got " + "empty list", + op_type, arg_name, arg_idx)); + } + ::pybind11::detail::instance* item = nullptr; + for (Py_ssize_t i = 0; i < len; i++) { + item = (::pybind11::detail::instance*)PyList_GetItem(list, i); + if (!PyObject_IsInstance((PyObject*)item, // NOLINT + (PyObject*)g_varbase_pytype)) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but " + "got list of " + "%s", + op_type, arg_name, arg_idx, + ((PyTypeObject*)((PyObject*)item)->ob_type)->tp_name)); // NOLINT + } + void** vh = item->simple_layout ? item->simple_value_holder + : &item->nonsimple.values_and_holders[0]; + result.emplace_back( + reinterpret_cast&>( + vh[1])); + } + } else if (PyTuple_Check(list)) { + Py_ssize_t len = PyTuple_Size(list); + if (len == 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but got " + "empty list", + op_type, arg_name, arg_idx)); + } + ::pybind11::detail::instance* item = nullptr; + for (Py_ssize_t i = 0; i < len; i++) { + item = (::pybind11::detail::instance*)PyTuple_GetItem(list, i); // NOLINT + if (!PyObject_IsInstance((PyObject*)item, // NOLINT + (PyObject*)g_varbase_pytype)) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but " + "got list of " + "%s", + op_type, arg_name, arg_idx, + ((PyTypeObject*)((PyObject*)item)->ob_type)->tp_name)); // NOLINT + } + void** vh = item->simple_layout ? 
item->simple_value_holder + : &item->nonsimple.values_and_holders[0]; + result.emplace_back( + reinterpret_cast&>( + vh[1])); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but got " + "%s", + op_type, arg_name, arg_idx, + ((PyTypeObject*)list->ob_type)->tp_name)); // NOLINT + } + + return result; +} + +unsigned long GetUnsignedLongFromArgs( // NOLINT + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable) { + PyObject* item = PyTuple_GET_ITEM(args, arg_idx); + + if (item == nullptr) { + if (!dispensable) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be long, but got None", + op_type, arg_name, arg_idx)); + } + return 0; + } + + if (PyObject_CheckLongOrToLong(&item)) { + return PyLong_AsUnsignedLong(item); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be " + "long, but got %s", + op_type, arg_name, arg_idx, + ((PyTypeObject*)item->ob_type)->tp_name)); // NOLINT + } +} + +void InitOpsAttrTypeMap() { + auto op_info_map = paddle::framework::OpInfoMap::Instance().map(); + for (auto iter = op_info_map.begin(); iter != op_info_map.end(); ++iter) { + auto op_proto = iter->second.proto_; + if (op_proto == nullptr) { + continue; + } + auto attrs_proto = op_proto->attrs(); + for (auto& attr : attrs_proto) { + OpAttrTypeMap::Instance().Map()[iter->first][attr.name()] = attr.type(); + } + } +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/op_function_common.h b/paddle/fluid/pybind/op_function_common.h new file mode 100644 index 0000000000000..9dc3a71a6ccf9 --- /dev/null +++ b/paddle/fluid/pybind/op_function_common.h @@ -0,0 +1,126 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
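To make the relationship between these pieces concrete, here is a rough, hypothetical sketch of how an auto-generated imperative op binding is expected to use the helpers defined in op_function_common.cc: parse the positional Tensor arguments with GetVarBaseFromArgs, collect the trailing flattened (name, value) attribute pairs with ConstructAttrMapFromPyArgs, and turn any C++ error into a Python exception. The op name "my_relu" and the argument layout are illustrative only, not the real generated code.

  // Assumes it lives in namespace paddle::pybind, like the generated code.
  static PyObject* imperative_my_relu(PyObject* self, PyObject* args,
                                      PyObject* kwargs) {
    try {
      // Positional slot 0: the (non-dispensable) input Tensor "X".
      auto x = GetVarBaseFromArgs("my_relu", "X", args, 0, false);
      // The remaining positional slots hold attribute name/value pairs.
      framework::AttributeMap attrs;
      ConstructAttrMapFromPyArgs("my_relu", args, 1, PyTuple_GET_SIZE(args),
                                 attrs);
      // ... trace the op with {x} and attrs, then wrap the outputs via
      // MakeReturnPyObject(...) in the real generated code ...
      Py_RETURN_NONE;  // placeholder return for this sketch
    } catch (...) {
      ThrowExceptionToPython(std::current_exception());
      return nullptr;
    }
  }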
+ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/pybind/imperative.h" + +namespace py = pybind11; +namespace paddle { +namespace pybind { + +bool PyObject_CheckBool(PyObject** obj); + +bool PyObject_CheckLongOrToLong(PyObject** obj); + +bool PyObject_CheckFloatOrToFloat(PyObject** obj); + +bool PyObject_CheckString(PyObject* obj); + +void CastPyArg2AttrBoolean(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void CastPyArg2AttrInt(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void CastPyArg2AttrLong(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void CastPyArg2AttrFloat(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void CastPyArg2AttrString(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void CastPyArg2AttrBooleans(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void CastPyArg2AttrInts(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void CastPyArg2AttrLongs(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void CastPyArg2AttrFloats(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void CastPyArg2AttrFloat64s(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void CastPyArg2AttrStrings(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void CastPyArg2AttrBlock(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, + ssize_t arg_pos); + +void ConstructAttrMapFromPyArgs( + const std::string& op_type, PyObject* args, ssize_t attr_start, + ssize_t attr_end, + paddle::framework::AttributeMap& attrs); // NOLINT + +std::shared_ptr GetVarBaseFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable = false); + +std::vector> GetVarBaseListFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable = false); + +unsigned long GetUnsignedLongFromArgs( // NOLINT + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable = false); + +void InitOpsAttrTypeMap(); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/op_function_generator.cc 
b/paddle/fluid/pybind/op_function_generator.cc index 850f208359e05..5587952facc53 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/pybind/op_function_generator.h" + #include #include #include @@ -30,179 +32,6 @@ #include "paddle/fluid/framework/fleet/ascend_wrapper.h" #endif -// NOTE(zhiqiu): Commonly, the inputs in auto-generated OP function are -// determined by the OP`s proto automatically, i.e., all the inputs registered -// in OpMaker. -// However, some OPs have dispensable inputs, which means the input can -// be none for some conditions. It is discovered that most dispensable inputs -// is not used in imperative mode, so we drop those inputs when generating OP -// functions. While, for very few OPs, the dispensable inputs are used, we -// need to manually specify them in this map. -std::map> op_ins_map = { - {"layer_norm", {"X", "Scale", "Bias"}}, - {"bincount", {"X", "Weights"}}, - {"fused_attention", - {"X", "LnScale", "LnBias", "QKVW", "QKVBias", "SrcMask", "OutLinearW", - "OutLinearBias", "Ln2Scale", "Ln2Bias"}}, - {"instance_norm", {"X", "Scale", "Bias"}}, - {"gru_unit", {"Input", "HiddenPrev", "Weight", "Bias"}}, - {"label_smooth", {"X", "PriorDist"}}, - {"assign", {"X"}}, - {"reshape2", {"X", "Shape"}}, - {"expand", {"X", "ExpandTimes"}}, - {"slice", {"Input", "StartsTensor", "EndsTensor"}}, - {"fake_quantize_dequantize_moving_average_abs_max", - {"X", "InScale", "InAccum", "InState"}}, - {"nll_loss", {"X", "Label", "Weight"}}, - {"bilinear_tensor_product", {"X", "Y", "Weight", "Bias"}}, - {"gather", {"X", "Index", "Axis"}}, - {"roi_pool", {"X", "ROIs", "RoisNum"}}, - {"roi_align", {"X", "ROIs", "RoisNum"}}, - {"psroi_pool", {"X", "ROIs", "RoisNum"}}, - {"collect_fpn_proposals", - {"MultiLevelRois", "MultiLevelScores", "MultiLevelRoIsNum"}}, - {"distribute_fpn_proposals", {"FpnRois", "RoisNum"}}, - {"warpctc", {"Logits", "Label", "LogitsLength", "LabelLength"}}, - {"hierarchical_sigmoid", - {"X", "W", "Label", "PathTable", "PathCode", "Bias"}}, - {"moving_average_abs_max_scale", {"X", "InAccum", "InState"}}, - {"multiclass_nms3", {"BBoxes", "Scores", "RoisNum"}}, - {"box_coder", {"PriorBox", "PriorBoxVar", "TargetBox"}}, - {"momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}}, - {"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}}, - {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}}, - {"run_program", {"X", "Params"}}, - {"fused_feedforward", - {"Dropout1Seed", "Dropout2Seed", "Linear1Bias", "Linear2Bias", "Ln1Scale", - "Ln1Bias", "Ln2Scale", "Ln2Bias"}}, - {"faster_tokenizer", {"Text", "Vocab", "TextPair"}}, - {"matrix_rank", {"X", "TolTensor"}}, - {"adam", - {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", - "Beta2Pow", "MasterParam"}}, - {"adamw", - {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", - "Beta2Pow", "MasterParam"}}, -}; - -// NOTE(zhiqiu): Like op_ins_map. -// Commonly, the outputs in auto-generated OP function are determined by the -// OP`s proto automatically, i.e., all the outputs registered in OpMaker. -// However, some OPs have dispensable outputs, which means the output can -// be none for some conditions. It is discovered that most dispensable outputs -// is not used in imperative mode, so we drop those outputs when generating OP -// functions. 
While, for very few OPs, the dispensable outputs are used, we -// need to manually specify them in this map. -std::map> op_outs_map = { - {"fake_quantize_dequantize_moving_average_abs_max", - {"Out", "OutScale", "OutAccum", "OutState"}}, - {"batch_norm", - {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", - "ReserveSpace"}}, - {"fused_attention", - {"LnMean", "LnVariance", "LnOut", "QKVOut", "QKVBiasOut", "TransposeOut2", - "QKOut", "QKTVOut", "SoftmaxOut", "AttnDropoutMaskOut", "AttnDropoutOut", - "SrcMaskOut", "FMHAOut", "OutLinearOut", "DropoutMaskOut", "Ln2Mean", - "Ln2Variance", "BiasDropoutResidualOut", "Y"}}, - {"sync_batch_norm", - {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", - "ReserveSpace"}}, - {"unique", {"Out", "Index", "Indices", "Counts"}}, - {"unique_consecutive", {"Out", "Index", "Counts"}}, - {"generate_proposals", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, - {"collect_fpn_proposals", {"FpnRois", "RoisNum"}}, - {"matrix_nms", {"Out", "Index", "RoisNum"}}, - {"distribute_fpn_proposals", - {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}}, - {"moving_average_abs_max_scale", - {"Out", "OutScale", "OutAccum", "OutState"}}, - {"multiclass_nms3", {"Out", "NmsRoisNum"}}, - {"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, - {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, - {"sparse_momentum", {"ParamOut", "VelocityOut"}}, - {"rnn", {"DropoutState", "Reserve", "Out", "State"}}, - {"lamb", - {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, - {"run_program", {"DOut"}}, - {"adam", - {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", - "MasterParamOut"}}, - {"adamw", - {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", - "MasterParamOut"}}, -}; - -// NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are -// generated in C++ automatically. -// However, some OPs need to pass the outputs from Python instead of generating -// them in C++. There are mainly 2 reasons for that, -// (1) Optimizer OPs need to update the input param in-place, like sgd. -// So they need to pass the output which is same as input param. -// (2) Very few python APIs has out in their arguments, like fill_constant. -// So they need to pass the python output to C++. -// Actually, this is not a good design, since it may break the SSA graph, -// especially in declarative mode. -// For those OPs, we need to manually specify the outs need to pass in this map. 
-std::map> op_passing_outs_map = { - {"sgd", {"ParamOut"}}, - {"adam", - {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", - "MasterParamOut"}}, - {"adamw", - {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", - "MasterParamOut"}}, - {"average_accumulates", - {"out_sum_1", "out_sum_2", "out_sum_3", "out_num_accumulates", - "out_old_num_accumulates", "out_num_updates"}}, - {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, - {"sparse_momentum", {"ParamOut", "VelocityOut"}}, - {"batch_norm", {"MeanOut", "VarianceOut"}}, - {"sync_batch_norm", {"MeanOut", "VarianceOut"}}, - {"accuracy", {"Correct", "Total"}}, - {"fill_constant", {"Out"}}, - {"recv_v2", {"Out"}}, - {"partial_recv", {"Out"}}, - {"matmul", {"Out"}}, - {"c_broadcast", {"Out"}}, - {"c_sync_calc_stream", {"Out"}}, - {"c_sync_comm_stream", {"Out"}}, - {"c_reduce_sum", {"Out"}}, - {"c_reduce_max", {"Out"}}, - {"c_reduce_min", {"Out"}}, - {"c_reduce_prod", {"Out"}}, - {"c_reduce", {"Out"}}, - {"c_scatter", {"Out"}}, - {"barrier", {"Out"}}, - {"fake_quantize_dequantize_moving_average_abs_max", - {"Out", "OutScale", "OutAccum", "OutState"}}, - {"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}}, - {"fake_channel_wise_quantize_dequantize_abs_max", {"Out", "OutScale"}}, - {"check_finite_and_unscale", {"Out", "FoundInfinite"}}, - {"update_loss_scaling", - {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}}, - {"moving_average_abs_max_scale", - {"Out", "OutScale", "OutAccum", "OutState"}}, - {"lamb", - {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, - {"rnn", {"DropoutState"}}, - {"run_program", {"Out", "DOut", "OutScope"}}, - {"clear_float_status", {"FloatStatusOut"}}, - {"get_float_status", {"FloatStatusOut"}}, -}; - -// NOTE(pangyoki): Tensor View Strategy. -// In this case, a new output varbase will be created, and this varbase will -// reuse the input varbase's allocation. -// It's a map. The key of outer map is the view op name, the value is -// a pair which implies the mapping relationship between the input and -// output varbase. -std::map> view_op_map = { - {"squeeze2", {"X", "Out"}}, // "X" -> "Out" - {"unsqueeze2", {"X", "Out"}}, - {"reshape2", {"X", "Out"}}, - {"flatten_contiguous_range", {"X", "Out"}}, -}; - // NOTE(pangyoki): Inplace OP with duplicable input. // The set includes inplace ops that have duplicable input. // The first Varbase in input needs to be specified for the inplace strategy diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h new file mode 100644 index 0000000000000..7000097e0abcb --- /dev/null +++ b/paddle/fluid/pybind/op_function_generator.h @@ -0,0 +1,192 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
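As a rough illustration of the Tensor View Strategy note for view_op_map above, the fragment below shows how a generator could consult the map when emitting code for a view op; the actual sharing mechanism is only described in a comment and is an assumption here.

    // Illustrative fragment, not generator output.
    auto view_it = view_op_map.find(op_type);        // e.g. op_type == "squeeze2"
    if (view_it != view_op_map.end()) {
      const std::string& view_in = view_it->second.first;    // "X"
      const std::string& view_out = view_it->second.second;  // "Out"
      // Emit code so that the output VarBase named `view_out` reuses the
      // allocation of the input VarBase named `view_in`, instead of getting a
      // freshly allocated buffer.
    }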
+ +#pragma once + +#include +#include +#include + +// NOTE(zhiqiu): Commonly, the inputs in auto-generated OP function are +// determined by the OP`s proto automatically, i.e., all the inputs registered +// in OpMaker. +// However, some OPs have dispensable inputs, which means the input can +// be none for some conditions. It is discovered that most dispensable inputs +// is not used in imperative mode, so we drop those inputs when generating OP +// functions. While, for very few OPs, the dispensable inputs are used, we +// need to manually specify them in this map. +std::map> op_ins_map = { + {"layer_norm", {"X", "Scale", "Bias"}}, + {"bincount", {"X", "Weights"}}, + {"fused_attention", + {"X", "LnScale", "LnBias", "QKVW", "QKVBias", "SrcMask", "OutLinearW", + "OutLinearBias", "Ln2Scale", "Ln2Bias"}}, + {"instance_norm", {"X", "Scale", "Bias"}}, + {"gru_unit", {"Input", "HiddenPrev", "Weight", "Bias"}}, + {"label_smooth", {"X", "PriorDist"}}, + {"assign", {"X"}}, + {"reshape2", {"X", "Shape"}}, + {"expand", {"X", "ExpandTimes"}}, + {"slice", {"Input", "StartsTensor", "EndsTensor"}}, + {"fake_quantize_dequantize_moving_average_abs_max", + {"X", "InScale", "InAccum", "InState"}}, + {"nll_loss", {"X", "Label", "Weight"}}, + {"bilinear_tensor_product", {"X", "Y", "Weight", "Bias"}}, + {"gather", {"X", "Index", "Axis"}}, + {"roi_pool", {"X", "ROIs", "RoisNum"}}, + {"roi_align", {"X", "ROIs", "RoisNum"}}, + {"psroi_pool", {"X", "ROIs", "RoisNum"}}, + {"collect_fpn_proposals", + {"MultiLevelRois", "MultiLevelScores", "MultiLevelRoIsNum"}}, + {"distribute_fpn_proposals", {"FpnRois", "RoisNum"}}, + {"warpctc", {"Logits", "Label", "LogitsLength", "LabelLength"}}, + {"hierarchical_sigmoid", + {"X", "W", "Label", "PathTable", "PathCode", "Bias"}}, + {"moving_average_abs_max_scale", {"X", "InAccum", "InState"}}, + {"multiclass_nms3", {"BBoxes", "Scores", "RoisNum"}}, + {"box_coder", {"PriorBox", "PriorBoxVar", "TargetBox"}}, + {"momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}}, + {"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}}, + {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}}, + {"run_program", {"X", "Params"}}, + {"fused_feedforward", + {"Dropout1Seed", "Dropout2Seed", "Linear1Bias", "Linear2Bias", "Ln1Scale", + "Ln1Bias", "Ln2Scale", "Ln2Bias"}}, + {"faster_tokenizer", {"Text", "Vocab", "TextPair"}}, + {"matrix_rank", {"X", "TolTensor"}}, + {"adam", + {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", + "Beta2Pow", "MasterParam"}}, + {"adamw", + {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", + "Beta2Pow", "MasterParam"}}, +}; + +// NOTE(zhiqiu): Like op_ins_map. +// Commonly, the outputs in auto-generated OP function are determined by the +// OP`s proto automatically, i.e., all the outputs registered in OpMaker. +// However, some OPs have dispensable outputs, which means the output can +// be none for some conditions. It is discovered that most dispensable outputs +// is not used in imperative mode, so we drop those outputs when generating OP +// functions. While, for very few OPs, the dispensable outputs are used, we +// need to manually specify them in this map. 
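To make the intent of op_ins_map concrete, here is a hedged sketch of the lookup a generator can perform when deciding which inputs to expose; GetOpProtoInputs is a hypothetical helper standing in for reading the op's proto, and `in.name()` / `in.dispensable()` mirror the OpProto accessors.

    // Sketch under stated assumptions; needs <algorithm>, <string>, <vector>.
    std::vector<std::string> exposed_inputs;
    for (const auto& in : GetOpProtoInputs(op_type)) {  // hypothetical helper
      bool keep = !in.dispensable();                    // default: drop dispensable
      auto it = op_ins_map.find(op_type);
      if (it != op_ins_map.end()) {
        // A curated entry overrides the default rule for this op.
        keep = std::find(it->second.begin(), it->second.end(), in.name()) !=
               it->second.end();
      }
      if (keep) exposed_inputs.push_back(in.name());
    }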
+std::map> op_outs_map = { + {"fake_quantize_dequantize_moving_average_abs_max", + {"Out", "OutScale", "OutAccum", "OutState"}}, + {"batch_norm", + {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", + "ReserveSpace"}}, + {"fused_attention", + {"LnMean", "LnVariance", "LnOut", "QKVOut", "QKVBiasOut", "TransposeOut2", + "QKOut", "QKTVOut", "SoftmaxOut", "AttnDropoutMaskOut", "AttnDropoutOut", + "SrcMaskOut", "FMHAOut", "OutLinearOut", "DropoutMaskOut", "Ln2Mean", + "Ln2Variance", "BiasDropoutResidualOut", "Y"}}, + {"sync_batch_norm", + {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", + "ReserveSpace"}}, + {"unique", {"Out", "Index", "Indices", "Counts"}}, + {"unique_consecutive", {"Out", "Index", "Counts"}}, + {"generate_proposals", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, + {"collect_fpn_proposals", {"FpnRois", "RoisNum"}}, + {"matrix_nms", {"Out", "Index", "RoisNum"}}, + {"distribute_fpn_proposals", + {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}}, + {"moving_average_abs_max_scale", + {"Out", "OutScale", "OutAccum", "OutState"}}, + {"multiclass_nms3", {"Out", "NmsRoisNum"}}, + {"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, + {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, + {"sparse_momentum", {"ParamOut", "VelocityOut"}}, + {"rnn", {"DropoutState", "Reserve", "Out", "State"}}, + {"lamb", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, + {"run_program", {"DOut"}}, + {"adam", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", + "MasterParamOut"}}, + {"adamw", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", + "MasterParamOut"}}, +}; + +// NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are +// generated in C++ automatically. +// However, some OPs need to pass the outputs from Python instead of generating +// them in C++. There are mainly 2 reasons for that, +// (1) Optimizer OPs need to update the input param in-place, like sgd. +// So they need to pass the output which is same as input param. +// (2) Very few python APIs has out in their arguments, like fill_constant. +// So they need to pass the python output to C++. +// Actually, this is not a good design, since it may break the SSA graph, +// especially in declarative mode. +// For those OPs, we need to manually specify the outs need to pass in this map. 
+std::map> op_passing_outs_map = { + {"sgd", {"ParamOut"}}, + {"adam", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", + "MasterParamOut"}}, + {"adamw", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", + "MasterParamOut"}}, + {"average_accumulates", + {"out_sum_1", "out_sum_2", "out_sum_3", "out_num_accumulates", + "out_old_num_accumulates", "out_num_updates"}}, + {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, + {"sparse_momentum", {"ParamOut", "VelocityOut"}}, + {"batch_norm", {"MeanOut", "VarianceOut"}}, + {"sync_batch_norm", {"MeanOut", "VarianceOut"}}, + {"accuracy", {"Correct", "Total"}}, + {"fill_constant", {"Out"}}, + {"recv_v2", {"Out"}}, + {"partial_recv", {"Out"}}, + {"matmul", {"Out"}}, + {"c_broadcast", {"Out"}}, + {"c_sync_calc_stream", {"Out"}}, + {"c_sync_comm_stream", {"Out"}}, + {"c_reduce_sum", {"Out"}}, + {"c_reduce_max", {"Out"}}, + {"c_reduce_min", {"Out"}}, + {"c_reduce_prod", {"Out"}}, + {"c_reduce", {"Out"}}, + {"c_scatter", {"Out"}}, + {"barrier", {"Out"}}, + {"fake_quantize_dequantize_moving_average_abs_max", + {"Out", "OutScale", "OutAccum", "OutState"}}, + {"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}}, + {"fake_channel_wise_quantize_dequantize_abs_max", {"Out", "OutScale"}}, + {"check_finite_and_unscale", {"Out", "FoundInfinite"}}, + {"update_loss_scaling", + {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}}, + {"moving_average_abs_max_scale", + {"Out", "OutScale", "OutAccum", "OutState"}}, + {"lamb", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, + {"rnn", {"DropoutState"}}, + {"run_program", {"Out", "DOut", "OutScope"}}, + {"clear_float_status", {"FloatStatusOut"}}, + {"get_float_status", {"FloatStatusOut"}}, +}; + +// NOTE(pangyoki): Tensor View Strategy. +// In this case, a new output varbase will be created, and this varbase will +// reuse the input varbase's allocation. +// It's a map. The key of outer map is the view op name, the value is +// a pair which implies the mapping relationship between the input and +// output varbase. +std::map> view_op_map = { + {"squeeze2", {"X", "Out"}}, // "X" -> "Out" + {"unsqueeze2", {"X", "Out"}}, + {"reshape2", {"X", "Out"}}, + {"flatten_contiguous_range", {"X", "Out"}}, +}; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 617c724f63f38..1e29820e08d5c 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -75,6 +75,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/pybind/cuda_streams_py.h" +#include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/io.h" #include "paddle/utils/none.h" #ifdef PADDLE_WITH_ASCEND @@ -113,9 +114,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif #ifndef PADDLE_WITH_HIP -#include "paddle/fluid/platform/cuda_profiler.h" +#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" #endif -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif #ifdef PADDLE_WITH_ASCEND_CL @@ -150,6 +151,14 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType); namespace paddle { namespace pybind { + +PyTypeObject *g_place_pytype = nullptr; +PyTypeObject *g_cudaplace_pytype = nullptr; +PyTypeObject *g_cpuplace_pytype = nullptr; +PyTypeObject *g_xpuplace_pytype = nullptr; +PyTypeObject *g_npuplace_pytype = nullptr; +PyTypeObject *g_cudapinnedplace_pytype = nullptr; + bool IsCompiledWithCUDA() { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) return false; @@ -198,6 +207,14 @@ bool IsCompiledWithMKLDNN() { #endif } +bool IsCompiledWithCINN() { +#ifndef PADDLE_WITH_CINN + return false; +#else + return true; +#endif +} + bool IsCompiledWithHETERPS() { #ifndef PADDLE_WITH_HETERPS return false; @@ -498,7 +515,7 @@ static void AssertStaticGraphAndDygraphGradMakerNoDiff() { static int GetNCCLVersion() { #if NCCL_VERSION_CODE >= 2304 int ver; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGetVersion(&ver)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetVersion(&ver)); return ver; #else PADDLE_THROW(platform::errors::External( @@ -524,6 +541,7 @@ PYBIND11_MODULE(core_avx, m) { PYBIND11_MODULE(core_noavx, m) { #endif + BindEager(&m); BindCudaStream(&m); // Not used, just make sure cpu_info.cc is linked. @@ -546,7 +564,7 @@ PYBIND11_MODULE(core_noavx, m) { m.def("disable_signal_handler", &DisableSignalHandler); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - m.def("cudnn_version", &platform::CudnnVersion); + m.def("cudnn_version", &platform::DnnVersion); m.def("gpu_memory_available", []() { size_t available = 0; size_t total = 0; @@ -554,6 +572,7 @@ PYBIND11_MODULE(core_noavx, m) { return available; }); #endif + #ifdef PADDLE_WITH_NCCL m.def("nccl_version", &GetNCCLVersion); #endif @@ -1611,7 +1630,7 @@ All parameter, weight, gradient are variables in Paddle. #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) py::class_(m, "Communicator").def(py::init<>()); #endif - py::class_(m, "CUDAPlace", R"DOC( + py::class_ cudaplace(m, "CUDAPlace", R"DOC( CUDAPlace is a descriptor of a device. It represents a GPU device allocated or to be allocated with Tensor or LoDTensor. @@ -1634,7 +1653,9 @@ All parameter, weight, gradient are variables in Paddle. place = paddle.CUDAPlace(0) - )DOC") + )DOC"); + g_cudaplace_pytype = reinterpret_cast(cudaplace.ptr()); + cudaplace .def("__init__", [](platform::CUDAPlace &self, int dev_id) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -1646,8 +1667,8 @@ All parameter, weight, gradient are variables in Paddle. std::exit(-1); } - if (UNLIKELY(dev_id >= platform::GetCUDADeviceCount())) { - if (platform::GetCUDADeviceCount() == 0) { + if (UNLIKELY(dev_id >= platform::GetGPUDeviceCount())) { + if (platform::GetGPUDeviceCount() == 0) { LOG(ERROR) << "Cannot use GPU because there is no GPU " "detected on your " "machine."; @@ -1656,8 +1677,8 @@ All parameter, weight, gradient are variables in Paddle. 
LOG(ERROR) << string::Sprintf( "Invalid CUDAPlace(%d), must inside [0, %d), because GPU " "number on your machine is %d", - dev_id, platform::GetCUDADeviceCount(), - platform::GetCUDADeviceCount()); + dev_id, platform::GetGPUDeviceCount(), + platform::GetGPUDeviceCount()); std::exit(-1); } } @@ -1692,13 +1713,15 @@ All parameter, weight, gradient are variables in Paddle. .def("__repr__", string::to_string) .def("__str__", string::to_string); - py::class_(m, "XPUPlace", R"DOC( + py::class_ xpuplace(m, "XPUPlace", R"DOC( **Note**: Examples: .. code-block:: python import paddle.fluid as fluid xpu_place = fluid.XPUPlace(0) - )DOC") + )DOC"); + g_xpuplace_pytype = reinterpret_cast(xpuplace.ptr()); + xpuplace .def("__init__", [](platform::XPUPlace &self, int dev_id) { #ifdef PADDLE_WITH_XPU @@ -1768,7 +1791,7 @@ All parameter, weight, gradient are variables in Paddle. }); #endif - py::class_(m, "CPUPlace", R"DOC( + py::class_ cpuplace(m, "CPUPlace", R"DOC( CPUPlace is a descriptor of a device. It represents a CPU device on which a tensor will be allocated and a model will run. @@ -1778,8 +1801,9 @@ All parameter, weight, gradient are variables in Paddle. import paddle cpu_place = paddle.CPUPlace() - )DOC") - .def(py::init<>()) + )DOC"); + g_cpuplace_pytype = reinterpret_cast(cpuplace.ptr()); + cpuplace.def(py::init<>()) .def("_type", &PlaceIndex) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) @@ -1791,7 +1815,8 @@ All parameter, weight, gradient are variables in Paddle. .def("__repr__", string::to_string) .def("__str__", string::to_string); - py::class_(m, "CUDAPinnedPlace", R"DOC( + py::class_ cudapinnedplace( + m, "CUDAPinnedPlace", R"DOC( CUDAPinnedPlace is a descriptor of a device. It refers to the page locked memory allocated by the CUDA function `cudaHostAlloc()` in the host memory. The host operating system will not paging and exchanging the memory. @@ -1805,7 +1830,10 @@ All parameter, weight, gradient are variables in Paddle. import paddle place = paddle.CUDAPinnedPlace() - )DOC") + )DOC"); + g_cudapinnedplace_pytype = + reinterpret_cast(cudapinnedplace.ptr()); + cudapinnedplace .def("__init__", [](platform::CUDAPinnedPlace &self) { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) @@ -1831,7 +1859,7 @@ All parameter, weight, gradient are variables in Paddle. .def("__str__", string::to_string); // NPUPlace - py::class_(m, "NPUPlace", R"DOC( + py::class_ npuplace(m, "NPUPlace", R"DOC( NPUPlace is a descriptor of a device. It represents a NPU device on which a tensor will be allocated and a model will run. @@ -1840,7 +1868,9 @@ All parameter, weight, gradient are variables in Paddle. import paddle npu_place = paddle.NPUPlace(0) - )DOC") + )DOC"); + g_npuplace_pytype = reinterpret_cast(npuplace.ptr()); + npuplace .def("__init__", [](platform::NPUPlace &self, int dev_id) { #ifdef PADDLE_WITH_ASCEND_CL @@ -1891,8 +1921,9 @@ All parameter, weight, gradient are variables in Paddle. [](const platform::NPUPlace &self) { return self.GetDeviceId(); }) .def("__str__", string::to_string); - py::class_(m, "Place") - .def(py::init<>()) + py::class_ platformplace(m, "Place"); + g_place_pytype = reinterpret_cast(platformplace.ptr()); + platformplace.def(py::init<>()) .def("_type", &PlaceIndex) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) @@ -2180,6 +2211,7 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("is_compiled_with_npu", IsCompiledWithNPU); m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); + m.def("is_compiled_with_cinn", IsCompiledWithCINN); m.def("_is_compiled_with_heterps", IsCompiledWithHETERPS); m.def("supports_bfloat16", SupportsBfloat16); m.def("supports_bfloat16_fast_performance", SupportsBfloat16FastPerformance); @@ -2229,7 +2261,7 @@ All parameter, weight, gradient are variables in Paddle. #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { // Only GPUs with Compute Capability >= 53 support float16 - return platform::GetCUDAComputeCapability(place.device) >= 53; + return platform::GetGPUComputeCapability(place.device) >= 53; }); #endif @@ -2409,7 +2441,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("op_support_gpu", OpSupportGPU); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - m.def("get_cuda_device_count", platform::GetCUDADeviceCount); + m.def("get_cuda_device_count", platform::GetGPUDeviceCount); m.def("cuda_empty_cache", [] { for (int dev_id : platform::GetSelectedDevices()) { auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace( diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt new file mode 100644 index 0000000000000..b8f6f4738d3e7 --- /dev/null +++ b/paddle/infrt/CMakeLists.txt @@ -0,0 +1,79 @@ +if (NOT WITH_INFRT) + return() +endif() + +set(infrt_src CACHE INTERNAL "" FORCE) + +# Gather headers for library publish. +function(core_gather_headers) + file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h) + + foreach(header ${includes}) + set(core_includes "${core_includes};${header}" CACHE INTERNAL "") + endforeach() +endfunction() + +function(gather_srcs SRC_GROUP) + set(options) + set(oneValueArgs) + set(multiValueArgs "SRCS") + cmake_parse_arguments(prefix "" "" "${multiValueArgs}" ${ARGN}) + foreach(cpp ${prefix_SRCS}) + set(${SRC_GROUP} "${${SRC_GROUP}};${CMAKE_CURRENT_SOURCE_DIR}/${cpp}" CACHE INTERNAL "") + endforeach() +endfunction() + +# This method is similar to the global cc_test, but discard the huge amount default dependencies those are +# not needed by INFRT. 
+function(cc_test_tiny TARGET_NAME) + if(WITH_TESTING) + set(options SERIAL) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS ARGS) + cmake_parse_arguments(cc_test_tiny "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_executable(${TARGET_NAME} ${cc_test_tiny_SRCS}) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(${TARGET_NAME} ${cc_test_tiny_DEPS} ${os_dependency_modules} infrt_gtest_main gtest ) + add_dependencies(${TARGET_NAME} ${cc_test_tiny_DEPS} infrt_gtest_main gtest extern_gtest) + + add_test(NAME ${TARGET_NAME} + COMMAND ${TARGET_NAME} "${cc_test_tiny_ARGS}" + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + if (${cc_test_tiny_SERIAL}) + set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) + endif() + endif() + +endfunction() + +if (WITH_TESTING) + cc_library(infrt_gtest_main SRCS gtest_main.cc DEPS gtest glog gflags) +endif() + + +add_subdirectory(api) +add_subdirectory(common) +add_subdirectory(dialect) +add_subdirectory(host_context) +add_subdirectory(kernel) +add_subdirectory(tensor) +add_subdirectory(support) +add_subdirectory(external_kernels) +add_subdirectory(paddle) + + +# MLIR td file generations +set(infrt_mlir_incs + ops_inc + basic_kernels_inc + test_kernels_inc + infrt_base_inc + tensor_shape_inc + dense_tensor_inc + pd_ops_inc + rewrite_inc + ) +message(STATUS "infrt srcs:\n${infrt_src}") + +cc_library(infrt SRCS ${infrt_src} DEPS glog ${mlir_libs} paddle_framework_proto) +add_dependencies(infrt ${infrt_mlir_incs}) diff --git a/paddle/infrt/api/CMakeLists.txt b/paddle/infrt/api/CMakeLists.txt new file mode 100644 index 0000000000000..93a7ae8369521 --- /dev/null +++ b/paddle/infrt/api/CMakeLists.txt @@ -0,0 +1,8 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + infrt_api.cc + ) + +# Disable temporarily for the external-kernel's mkldnn is outdate +# cc_test(test_infrt_api SRCS infrt_api_test.cc DEPS infrt ${MLIR_IR_LIBS}) diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc new file mode 100644 index 0000000000000..c2a4e0aff7a08 --- /dev/null +++ b/paddle/infrt/api/infrt_api.cc @@ -0,0 +1,246 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/api/infrt_api.h" + +#include +#include +#include +#include + +#include +#include + +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_function_executable.h" +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" +#include "paddle/infrt/host_context/op_executable.h" +#include "paddle/infrt/host_context/value.h" +#include "paddle/infrt/kernel/basic_kernels.h" +#include "paddle/infrt/kernel/control_flow_kernels.h" +#include "paddle/infrt/kernel/tensor_kernels.h" +#include "paddle/infrt/kernel/tensor_shape_kernels.h" +#include "paddle/infrt/kernel/test_kernels.h" +#include "paddle/infrt/tensor/tensor_map.h" + +using namespace infrt::host_context; // NOLINT +using namespace infrt::tensor; // NOLINT +using namespace infrt::tensor; // NOLINT +using infrt::dt::TensorMapType; // NOLINT +using infrt::dt::TensorType; // NOLINT + +namespace infrt { + +template +std::string DumpToString(T& op) { // NOLINT + std::string buffer; + llvm::raw_string_ostream os(buffer); + op.print(os); + os.flush(); + return buffer; +} + +struct MlirToRuntimeTranslator::Impl { + mlir::ModuleOp module; + // The runtime for a function call. + CoreRuntimeBuilder* runtime{}; + + // The current working op, the translator process the ops one by one, each + // time it updates `cur_op` here to current op + // working on. + OpExecutableBuilder* cur_op{}; + + // record the current function name. + std::string cur_func_name; + + // Name to function definitions. + std::unordered_map func_defs; + + // Map from an operation to its results. + std::unordered_map> op_results; + llvm::DenseMap value_map; +}; + +/** + * Execute the mlir program in predict mode. + */ +class PredictExecutor : public MlirToRuntimeTranslator { + public: + CoreRuntimeBuilder core_runtime; + + PredictExecutor(mlir::ModuleOp module, + KernelRegistry* registry, + TensorMap* map) + : MlirToRuntimeTranslator(module, &core_runtime), + core_runtime(registry), + registry_(registry) { + CHECK(registry_); + Init(map); + } + + void Run() { + auto arguments = llvm::makeArrayRef(arguments_); + auto results = llvm::makeMutableArrayRef(results_.begin(), results_.size()); + function_executable_->Execute(arguments, results); + } + + int GetInputNum() { return inputs_.size(); } + + DenseHostTensor* GetInput(int i) { return inputs_[i]; } + + int GetOutputNum() { return outputs_.size(); } + + DenseHostTensor* GetOutput(int i) { return outputs_[i]; } + + private: + void Init(TensorMap* map) { + EmitFunctions(); + llvm::Optional predict_func_ = llvm::None; + for (auto func_op : impl_->module.getOps()) { + if (func_op.getName().str() != "predict") continue; + predict_func_ = func_op; + break; + } + if (!predict_func_) { + std::cout << "ERROR: init failed, no predict function found in mlir." 
+ << std::endl; + return; + } + auto& predict_func = predict_func_.getValue(); + function_executable_ = + new MlirFunctionExecutable(predict_func, registry_, impl_->func_defs); + + // process parammeters + for (size_t i = 0; i < predict_func.getNumArguments(); ++i) { + auto arg = predict_func.getArgument(i); + auto type = arg.getType(); + // this param is TensorMap + if (type.isa()) { + auto* value = new host_context::Value(std::move(*map)); + arguments_.push_back(value); + AddValue(predict_func.getArgument(i), value); + } else { + // this param is an input Tensor + auto dht = DenseHostTensor(); + auto* value = new host_context::Value(std::move(dht)); + arguments_.push_back(value); + inputs_.push_back(&(value->get())); + } + } + + // process results + auto& last_op = predict_func.front().back(); + if (last_op.getName().getStringRef() == "infrt.return") { + for (size_t i = 0; i < last_op.getNumOperands(); ++i) { + auto* value = AddValue(mlir::Value(last_op.getOperand(i))); + results_.push_back(ValueRef(value)); + outputs_.push_back(&(value->get())); + } + } + } + + protected: + std::unordered_map func_def_table; + + void EmitFunction(mlir::FuncOp op) override { + CHECK(!impl_->func_defs.count(op.getName().str())) + << "Duplicate function defition found for function [" + << op.getName().str(); + impl_->func_defs.emplace(op.getName().str(), op); + } + + private: + KernelRegistry* registry_{}; + MlirFunctionExecutable* function_executable_; + llvm::SmallVector inputs_; + llvm::SmallVector arguments_; + llvm::SmallVector outputs_; + llvm::SmallVector results_; +}; + +std::shared_ptr CreateInfRtPredictor( + const InfRtConfig& config) { + auto x = std::make_shared(); + x->Init(config); + return x; +} + +struct InfRtPredictor::Impl { + mlir::OwningModuleRef module_ref; + std::unique_ptr executor; +}; + +InfRtPredictor::InfRtPredictor() : impl_(new Impl) {} +InfRtPredictor::~InfRtPredictor() {} + +void InfRtPredictor::Run() { impl_->executor->Run(); } + +int InfRtPredictor::Init(const InfRtConfig& config) { + mlir::MLIRContext* context = infrt::Global::getMLIRContext(); + auto module_ref = dialect::LoadMlirFile(config.mlir_path(), context); + + KernelRegistry* registry = new KernelRegistry(); + + kernel::RegisterBasicKernels(registry); + kernel::RegisterTestKernels(registry); + kernel::RegisterTensorShapeKernels(registry); + kernel::RegisterTensorKernels(registry); + kernel::RegisterControlFlowKernels(registry); + + impl_->module_ref = std::move(module_ref); + + // load extra shared library + for (const std::string& lib_path : config.shared_libs()) { + std::string err; + llvm::sys::DynamicLibrary dynLib = + llvm::sys::DynamicLibrary::getPermanentLibrary(lib_path.c_str(), &err); + if (!dynLib.isValid()) { + llvm::errs() << "Load shared library failed. Error: " << err << "\n"; + return 1; + } + if (auto reg_sym = dynLib.SearchForAddressOfSymbol("RegisterKernels")) { + auto reg_func = reinterpret_cast(reg_sym); + reg_func(registry); + } else { + llvm::outs() << "Symbol \"RegisterKernels\" not found in \"" << lib_path + << "\". 
Skip.\n"; + } + } + + // Load params + TensorMap* tensor_map = LoadParams(config.model_dir()); + + // Create PredictExecutor + impl_->executor.reset( + new PredictExecutor(impl_->module_ref.get(), registry, tensor_map)); + return 0; +} + +int InfRtPredictor::GetInputNum() { return impl_->executor->GetInputNum(); } + +DenseHostTensor* InfRtPredictor::GetInput(int i) { + return impl_->executor->GetInput(i); +} + +int InfRtPredictor::GetOutputNum() { return impl_->executor->GetOutputNum(); } + +DenseHostTensor* InfRtPredictor::GetOutput(int i) { + return impl_->executor->GetOutput(i); +} + +} // namespace infrt diff --git a/paddle/infrt/api/infrt_api.h b/paddle/infrt/api/infrt_api.h new file mode 100644 index 0000000000000..82b6cb8df91ff --- /dev/null +++ b/paddle/infrt/api/infrt_api.h @@ -0,0 +1,63 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +#include "paddle/infrt/tensor/dense_host_tensor.h" + +namespace infrt { + +class InfRtConfig { + std::string model_dir_; + std::string mlir_path_; + std::vector shared_libs_; + + public: + InfRtConfig() = default; + void set_model_dir(const std::string& model_dir) { model_dir_ = model_dir; } + const std::string& model_dir() const { return model_dir_; } + + void set_mlir_path(const std::string& mlir_path) { mlir_path_ = mlir_path; } + const std::string& mlir_path() const { return mlir_path_; } + + void set_shared_libs(const std::vector& shared_libs) { + shared_libs_ = shared_libs; + } + const std::vector& shared_libs() const { return shared_libs_; } + + virtual ~InfRtConfig() = default; +}; + +class InfRtPredictor { + public: + InfRtPredictor(); + ~InfRtPredictor(); + void Run(); + int Init(const InfRtConfig& config); + int GetInputNum(); + tensor::DenseHostTensor* GetInput(int i); + int GetOutputNum(); + tensor::DenseHostTensor* GetOutput(int i); + + protected: + struct Impl; + std::unique_ptr impl_; +}; + +std::shared_ptr CreateInfRtPredictor(const InfRtConfig& config); + +} // namespace infrt diff --git a/paddle/infrt/api/infrt_api_test.cc b/paddle/infrt/api/infrt_api_test.cc new file mode 100644 index 0000000000000..92e069f47521b --- /dev/null +++ b/paddle/infrt/api/infrt_api_test.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/api/infrt_api.h" + +#include + +#include +#include + +#include "llvm/Support/raw_ostream.h" +#include "paddle/infrt/common/buffer.h" +#include "paddle/infrt/common/dtype.h" + +using infrt::InfRtConfig; +using infrt::InfRtPredictor; +using infrt::CreateInfRtPredictor; + +namespace infrt { + +TEST(InfRtPredictor, predictor) { + std::vector shared_libs; + shared_libs.push_back("../../paddle/libexternal_kernels.so"); + + InfRtConfig config; + + // set external shared libraries that contain kernels. + config.set_shared_libs(shared_libs); + // set model dir + config.set_model_dir("../../paddle/paddle_1.8_fc_model"); + // set mlir path + config.set_mlir_path("../../../infrt/dialect/mlir_tests/tensor_map.mlir"); + + std::shared_ptr predictor = CreateInfRtPredictor(config); + + auto* input = predictor->GetInput(0); + std::vector shape = {3, 3}; + input->Init(shape, infrt::GetDType()); + llvm::outs() << input->shape() << "\n"; + + // init input tensor + auto* input_data = reinterpret_cast(input->buffer()->data()->memory); + for (int i = 0; i < input->shape().GetNumElements(); i++) input_data[i] = 1.0; + + predictor->Run(); + + // get and print output tensor + auto* output = predictor->GetOutput(0); + auto* output_data = + reinterpret_cast(output->buffer()->data()->memory); + + std::vector ans = {0.428458, + 0.244493, + 0.572342, + 0.572008, + 0.509771, + 0.495599, + 0.651287, + 0.326426, + 0.404649}; + + ASSERT_EQ(output->shape().GetNumElements(), ans.size()); + for (int i = 0; i < output->shape().GetNumElements(); ++i) { + ASSERT_NEAR(output_data[i], ans[i], 0.000001); + } +} + +} // namespace infrt diff --git a/paddle/infrt/common/CMakeLists.txt b/paddle/infrt/common/CMakeLists.txt new file mode 100644 index 0000000000000..931e3e42307eb --- /dev/null +++ b/paddle/infrt/common/CMakeLists.txt @@ -0,0 +1,14 @@ +core_gather_headers() +set(core_includes "${core_includes};infrt/common/dtype.def" CACHE INTERNAL "") + +gather_srcs(infrt_src SRCS + dtype.cc + global.cc + target.cc + type.cc + shared.cc + object.cc + string.cc + buffer.cc + memory.cc + ) diff --git a/paddle/infrt/common/buffer.cc b/paddle/infrt/common/buffer.cc new file mode 100644 index 0000000000000..bc4ec7feada87 --- /dev/null +++ b/paddle/infrt/common/buffer.cc @@ -0,0 +1,98 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/common/buffer.h" + +#include +#include + +#include + +namespace infrt { +void Buffer::Resize(uint32_t size) { + if (size_ > 0) { + Free(); + size_ = 0; + } + + if (size_ != size) { + data_.memory = reinterpret_cast(Malloc(size)); + size_ = size; + } +} + +void Buffer::Resize(uint32_t alignment, uint32_t size) { + if (size_ > 0) { + Free(); + size_ = 0; + } + + if (size_ != size) { + data_.memory = reinterpret_cast(AlignedAlloc(alignment, size)); + size_ = size; + } +} + +void Buffer::SetTarget(const infrt::common::Target& target) { + target_ = target; + memory_mng_cache_ = MemoryManager::Global().RetrieveSafely(target_.arch); +} + +void Buffer::ResizeLazy(uint32_t size) { + if (size <= size_) return; + Resize(size); +} + +void Buffer::ResizeLazy(uint32_t alignment, uint32_t size) { + if (size <= size_) return; + Resize(alignment, size); +} + +void Buffer::Resize(uint32_t size, const infrt::common::Target& target) { + if (target.arch != target_.arch) { + Free(); + SetTarget(target); + } + Resize(size); +} + +void Buffer::Resize(uint32_t alignment, + uint32_t size, + const infrt::common::Target& target) { + if (target.arch != target_.arch) { + Free(); + SetTarget(target); + } + Resize(alignment, size); +} + +void Buffer::ResizeLazy(uint32_t size, const infrt::common::Target& target) { + if (target.arch != target_.arch) { + Free(); + SetTarget(target); + } + ResizeLazy(size); +} + +void Buffer::ResizeLazy(uint32_t alignment, + uint32_t size, + const infrt::common::Target& target) { + if (target.arch != target_.arch) { + Free(); + SetTarget(target); + } + ResizeLazy(alignment, size); +} + +} // namespace infrt diff --git a/paddle/infrt/common/buffer.h b/paddle/infrt/common/buffer.h new file mode 100644 index 0000000000000..cae2a7ead96ab --- /dev/null +++ b/paddle/infrt/common/buffer.h @@ -0,0 +1,296 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include + +#include "paddle/infrt/common/macros.h" +#include "paddle/infrt/common/memory.h" +#include "paddle/infrt/common/target.h" + +namespace infrt { + +#ifdef __cplusplus +extern "C" { +#endif + +#define INFRT_ALWAYS_INLINE __attribute__((always_inline)) inline + +//! Code for the primitive types supported in INFRT. +typedef enum infrt_type_code_t { + infrt_type_unk = -1, //! Unknown type + infrt_type_int = 0, //! signed int + infrt_type_uint = 1, //! unsigned int + infrt_type_float = 2, //! floating point + infrt_type_handle = 3 //! void* +} infrt_type_code_t; + +#ifndef INFRT_ATTRIBUTE_ALIGN +#define INFRT_ATTRIBUTE_ALIGN(n) __attribute__((aligned(n))) +#endif + +/** + * A tuntime tag for type in INFRT system. + */ +typedef struct infrt_type_t { +#if __cplusplus >= 201103L + INFRT_ATTRIBUTE_ALIGN(1) infrt_type_code_t code; +#else + uint8_t code; +#endif + + //! Number of bits. + uint8_t bits; + + //! Number of elements in a vector, 1 for scalar. + uint16_t lanes; + + //! Number of '*', e.g. 
for `float*`, the num_asterisks is 1, `float**` it is + //! 2. + uint8_t num_asterisks{0}; + +#ifdef __cplusplus + INFRT_ALWAYS_INLINE infrt_type_t() + : code(infrt_type_int), bits(0), lanes(0) {} + INFRT_ALWAYS_INLINE infrt_type_t(infrt_type_code_t code, + uint8_t bits, + uint16_t lanes = 1, + uint8_t num_asterisks = 0) + : code(code), bits(bits), lanes(lanes), num_asterisks(num_asterisks) {} + INFRT_ALWAYS_INLINE bool operator==(const infrt_type_t& other) const { + return code == other.code && bits == other.bits && lanes == other.lanes; + } + INFRT_ALWAYS_INLINE bool operator!=(const infrt_type_t& other) const { + return !(*this == other); + } + INFRT_ALWAYS_INLINE uint16_t bytes() const { return (bits + 7) / 8; } +#endif // __cplusplus +} infrt_type_t; + +//! Help to define the size of a dimension, due to polyhedral representation, we +//! no need to record the extend or +//! min(default to 0). +typedef int infrt_dimension_t; + +//! Help to tell the kind of the device. +typedef enum infrt_device_kind_t { + infrt_unk_device = -1, // Undefined device. + infrt_x86_device = 0, // X86 device + infrt_opencl_device = 1, // OpenCL device + infrt_arm_device = 2 // ARM device +} infrt_device_kind_t; + +struct infrt_buffer_t; + +/** + * All INFRT backends implementation should provide an interface to be used. + */ +struct infrt_device_interface_impl_t; + +struct infrt_device_interface_t { + int (*malloc)(void* context, struct infrt_buffer_t* buf); + int (*free)(void* context, struct infrt_buffer_t* buf); + int (*sync)(void* context, struct infrt_buffer_t* buf); + int (*release)(void* context, + const struct infrt_device_interface_t* device_interface); + int (*copy_to_host)(void* context, struct infrt_buffer_t* buf); + int (*copy_to_device)(void* context, struct infrt_buffer_t* buf); + int (*buffer_copy)(void* context, + struct infrt_buffer_t* src, + struct infrt_buffer_t* dst); + struct infrt_device_interface_impl_t* impl; +}; + +//! The raw representation of a buffer,used in the generated code/lib. +#define INFRT_BUFFER_MAX_DIMS 8 +typedef struct infrt_buffer_t { + //! Tell which kind of device this buffer locates. + infrt_device_kind_t device; + + //! The interface used to operate on device. + const struct infrt_device_interface_t* device_interface; + + //! A pointer to the memory in host. + uint8_t* memory; + + //! Extra flags. + uint64_t flag; + + //! Data type. + infrt_type_t type; + + //! Number of dimensions. + int32_t dimensions; + infrt_dimension_t dims[INFRT_BUFFER_MAX_DIMS]; + + //! Allocate and deallocate lazily, default true. + char lazy; + + //! The actual memory size(in bytes). + uint64_t memory_size; + + uint16_t align; + +#ifdef __cplusplus + infrt_buffer_t() + : device(infrt_unk_device), + device_interface(NULL), + memory(NULL), + flag(0UL), + type(infrt_type_t()), + dimensions(0), + lazy(true), + memory_size(0), + align(0) {} + + static void delete_(struct infrt_buffer_t* x) { delete x; } + + ~infrt_buffer_t() {} + + // NOTE the buffer should be resized first. + static void alloc(struct infrt_buffer_t*); + + //! Set the shape of the buffer. NOTE this just record the shape, not allocate + //! the memory. 
+ INFRT_ALWAYS_INLINE void resize(const infrt_dimension_t* dims, + int dimensions) { + this->dimensions = dimensions; + memcpy(this->dims, dims, dimensions * sizeof(infrt_dimension_t)); + } + + INFRT_ALWAYS_INLINE uint64_t num_elements() const { + uint64_t res = 1; + for (int i = 0; i < dimensions; i++) { + res *= dims[i]; + } + return res; + } + + INFRT_ALWAYS_INLINE int device_sync(void* ctx = NULL) { + if (device_interface && device_interface->sync) { + return device_interface->sync(ctx, this); + } + return 0; + } + + INFRT_ALWAYS_INLINE uint8_t* begin() const { return 0; } + INFRT_ALWAYS_INLINE uint8_t* end() const { + return memory + num_elements() * type.bytes(); + } + +#endif // __cplusplus +} infrt_buffer_t; + +#ifdef __cplusplus +struct infrt_device_interface_impl_t { + int (*malloc)(void* context, struct infrt_buffer_t* buf); + int (*free)(void* context, struct infrt_buffer_t* buf); + int (*sync)(void* context, struct infrt_buffer_t* buf); + int (*release)(void* context); + int (*copy_to_host)(void* context, struct infrt_buffer_t* buf); + int (*copy_to_device)(void* context, struct infrt_buffer_t* buf); + int (*buffer_copy)(void* context, + struct infrt_buffer_t* src, + struct infrt_buffer_t* dst); +}; + +// The device implementations +extern struct infrt_device_interface_t* infrt_x86_device_interface(); +#endif // __cplusplus + +#ifdef __cplusplus +} // extern "C" +#endif + +#define INFRT_LOG(fmt, ...) \ + do { \ + fprintf(stderr, \ + "%s:%d:%s(): " fmt, \ + __FILE__, \ + __LINE__, \ + __func__, \ + __VA_ARGS__); \ + } while (0) + +#define INFRT_CHECK(cond) \ + if (!(cond)) { \ + INFRT_LOG("check %s failed", #cond); \ + abort(); \ + } +/** + * Buffer helps to hold the memory, and offers a set of methods to help manage + * the memory. + */ +struct Buffer final { + Buffer() = default; + explicit Buffer(const common::Target& target) { SetTarget(target); } + + //! Resize the memory hold by this buffer *exactlly* to \p size. + void Resize(uint32_t size); + void Resize(uint32_t alignment, uint32_t size); + + //! Lazily resize the memory. + void ResizeLazy(uint32_t size); + void ResizeLazy(uint32_t alignment, uint32_t size); + + //! Resize the memory to \p size in target \p target. + void Resize(uint32_t size, const common::Target& target); + void Resize(uint32_t alignment, uint32_t size, const common::Target& target); + + //! Lazily resize the memory to \p size in target \p target. + void ResizeLazy(uint32_t size, const common::Target& target); + void ResizeLazy(uint32_t alignment, + uint32_t size, + const common::Target& target); + + void SetTarget(const common::Target& target); + + const infrt_buffer_t* data() const { return &data_; } + infrt_buffer_t* data() { return &data_; } + + //! Free all the memory owned by this buffer. + void Free() { + if (!data_.memory) return; + memory_mng_cache_->free(data_.memory); + } + + private: + inline void* Malloc(uint32_t size) INFRT_RESULT_SHOULD_USE { + CHECK(memory_mng_cache_) << "Should set target first"; + return memory_mng_cache_->malloc(size); + } + + inline void* AlignedAlloc(uint32_t alignment, + uint32_t size) INFRT_RESULT_SHOULD_USE { + CHECK(memory_mng_cache_) << "Should set target first"; + return memory_mng_cache_->aligned_alloc(alignment, size); + } + + private: + infrt_buffer_t data_; + + //! The place where this buffer locates. + common::Target target_; + + //! Number of bytes of this buffer. + uint32_t size_{}; + + //! Hold the corresponding memory manager for speed. 
+ MemoryInterface* memory_mng_cache_{}; +}; + +} // namespace infrt diff --git a/paddle/infrt/common/common.h b/paddle/infrt/common/common.h new file mode 100644 index 0000000000000..a15bc69b6030a --- /dev/null +++ b/paddle/infrt/common/common.h @@ -0,0 +1,61 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/infrt/common/macros.h" +#include "paddle/infrt/common/shared.h" +#include "paddle/infrt/common/target.h" +#include "paddle/infrt/common/type.h" + +namespace infrt { + +// export some general concepts. +using common::make_shared; +using common::Object; +using common::ref_count; +using common::Shared; + +// Type related. +using common::Bool; +using common::Float; +using common::Int; +using common::UInt; +using common::Void; + +using common::type_of; + +using common::Target; +using common::Type; +using common::UnkTarget; + +template +T& Reference(const T* x) { + return *const_cast(x); +} + +static void CheckVarNameValid(const std::string& name) { + CHECK(!name.empty()); + CHECK(name.find(' ') == std::string::npos && // + name.find('.') == std::string::npos && // + name.find('/') == std::string::npos && // + name.find('\t') == std::string::npos && // + name.find('\n') == std::string::npos && // + name.find('\r') == std::string::npos) + << "Some invalid character found"; +} + +} // namespace infrt diff --git a/paddle/infrt/common/dtype.cc b/paddle/infrt/common/dtype.cc new file mode 100644 index 0000000000000..d5cf67d8a3c40 --- /dev/null +++ b/paddle/infrt/common/dtype.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
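A small usage sketch for the C-level descriptor infrt_buffer_t defined in buffer.h above; it only exercises the inline helpers (resize, num_elements, bytes) and does not touch device memory.

    #include <cstdio>
    // plus "paddle/infrt/common/buffer.h" for infrt_buffer_t / infrt_type_t

    void BufferDescriptorSketch() {
      infrt_buffer_t buf;
      buf.type = infrt_type_t(infrt_type_float, /*bits=*/32);  // float32 elements
      infrt_dimension_t dims[] = {3, 4};
      buf.resize(dims, 2);                         // records the shape only
      uint64_t elems = buf.num_elements();         // 3 * 4 = 12
      uint64_t nbytes = elems * buf.type.bytes();  // 12 * 4 = 48
      std::printf("%llu elements, %llu bytes\n",
                  static_cast<unsigned long long>(elems),
                  static_cast<unsigned long long>(nbytes));
    }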
+ +#include "paddle/infrt/common/dtype.h" + +namespace infrt { + +const char* DType::name() const { + switch (kind_) { + case Kind::Unk: + return "Unk"; + break; +#define INFRT_DTYPE(enum__, value__) \ + case Kind::enum__: \ + return #enum__; \ + break; +#include "paddle/infrt/common/dtype.def" +#undef INFRT_DTYPE + } + + return ""; +} + +size_t DType::GetHostSize() const { + switch (kind_) { +#define INFRT_DTYPE(enum__, value__) \ + case DType::Kind::enum__: \ + return sizeof(DTypeInternal::type); +#include "paddle/infrt/common/dtype.def" // NOLINT +#undef INFRT_DTYPE + + case Kind::Unk: + return 0; + break; + } + return 0; +} + +} // namespace infrt diff --git a/paddle/infrt/common/dtype.def b/paddle/infrt/common/dtype.def new file mode 100644 index 0000000000000..32df72aa764a3 --- /dev/null +++ b/paddle/infrt/common/dtype.def @@ -0,0 +1,18 @@ +// Define all INFRT dtypes +// DTYPE(ENUM, VALUE) +#ifdef INFRT_DTYPE + +INFRT_DTYPE(UI8, 1) +INFRT_DTYPE(UI16, 2) +INFRT_DTYPE(UI32, 3) +INFRT_DTYPE(UI64, 4) +INFRT_DTYPE(I1, 5) +INFRT_DTYPE(I8, 6) +INFRT_DTYPE(I16, 7) +INFRT_DTYPE(I32, 8) +INFRT_DTYPE(I64, 9) +INFRT_DTYPE(F32, 10) +INFRT_DTYPE(F64, 11) +INFRT_DTYPE(STRING, 12) + +#endif \ No newline at end of file diff --git a/paddle/infrt/common/dtype.h b/paddle/infrt/common/dtype.h new file mode 100644 index 0000000000000..8b57299fa94fd --- /dev/null +++ b/paddle/infrt/common/dtype.h @@ -0,0 +1,85 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include + +namespace infrt { +class DType { + public: + enum class Kind : uint8_t { + Unk = 0, + +// Automatically generate the enum definition +#define INFRT_DTYPE(enum__, value__) enum__ = value__, +#include "paddle/infrt/common/dtype.def" +#undef INFRT_DTYPE + + BOOL = I1, + }; + + DType() = default; + explicit constexpr DType(Kind kind) : kind_(kind) { assert(IsValid()); } + + DType(const DType&) = default; + DType& operator=(const DType&) = default; + bool operator==(DType other) const { return kind_ == other.kind_; } + bool operator!=(DType other) const { return !(*this == other); } + + constexpr Kind kind() const { return kind_; } + + bool IsValid() const { return kind_ != Kind::Unk; } + bool IsInvalid() const { return !IsValid(); } + + const char* name() const; + + size_t GetHostSize() const; + + private: + Kind kind_{Kind::Unk}; +}; + +template +constexpr DType GetDType(); + +template +struct DTypeInternal; + +#define INFRT_IMPL_GET_DTYPE(cpp_type__, enum__) \ + template <> \ + inline constexpr DType GetDType() { \ + return DType{DType::Kind::enum__}; \ + } \ + template <> \ + struct DTypeInternal { \ + using type = cpp_type__; \ + }; + +INFRT_IMPL_GET_DTYPE(bool, I1); +INFRT_IMPL_GET_DTYPE(int8_t, I8); +INFRT_IMPL_GET_DTYPE(int16_t, I16); +INFRT_IMPL_GET_DTYPE(int32_t, I32); +INFRT_IMPL_GET_DTYPE(int64_t, I64); +INFRT_IMPL_GET_DTYPE(uint8_t, UI8); +INFRT_IMPL_GET_DTYPE(uint16_t, UI16); +INFRT_IMPL_GET_DTYPE(uint32_t, UI32); +INFRT_IMPL_GET_DTYPE(uint64_t, UI64); +INFRT_IMPL_GET_DTYPE(float, F32); +INFRT_IMPL_GET_DTYPE(double, F64); +INFRT_IMPL_GET_DTYPE(std::string, STRING); + +} // namespace infrt diff --git a/paddle/infrt/common/global.cc b/paddle/infrt/common/global.cc new file mode 100644 index 0000000000000..54ecf1589aa14 --- /dev/null +++ b/paddle/infrt/common/global.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/common/global.h" + +namespace infrt { + +Global::Global() {} + +mlir::MLIRContext* Global::context = nullptr; + +mlir::MLIRContext* Global::getMLIRContext() { + if (nullptr == context) { + context = new mlir::MLIRContext(); + } + return context; +} + +} // namespace infrt diff --git a/paddle/pten/api/include/linalg.h b/paddle/infrt/common/global.h similarity index 62% rename from paddle/pten/api/include/linalg.h rename to paddle/infrt/common/global.h index 259cf66493203..f89164d03f31d 100644 --- a/paddle/pten/api/include/linalg.h +++ b/paddle/infrt/common/global.h @@ -14,17 +14,19 @@ #pragma once -#include "paddle/pten/api/include/tensor.h" +#include "mlir/IR/MLIRContext.h" +#include "paddle/infrt/tensor/dense_host_tensor.h" -namespace paddle { -namespace experimental { +namespace infrt { -PD_DLL_DECL Tensor dot(const Tensor& x, const Tensor& y); +// global variables +class Global { + private: + static mlir::MLIRContext *context; + Global(); -PD_DLL_DECL Tensor matmul(const Tensor& x, - const Tensor& y, - bool transpose_x = false, - bool transpose_y = false); + public: + static mlir::MLIRContext *getMLIRContext(); +}; // class Global -} // namespace experimental -} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/common/macros.h b/paddle/infrt/common/macros.h new file mode 100644 index 0000000000000..4481f6b38aed3 --- /dev/null +++ b/paddle/infrt/common/macros.h @@ -0,0 +1,52 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if !defined(NDEBUG) +#define INFRT_DEBUG +#endif + +#define INFRT_DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&) = delete; \ + void operator=(const TypeName&) = delete + +#ifndef INFRT_NOT_IMPLEMENTED +#define INFRT_NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented"; +#endif + +#define INFRT_RESULT_SHOULD_USE __attribute__((warn_unused_result)) + +/** + * A trick to enforce the registry. + * + * usage: + * + * INFRT_REGISTER_HELPER(some_key) { + * // register methods + * } + * + * INFRT_USE_REGISTER(some_key); + */ +#define INFRT_REGISTER_HELPER(symbol__) bool __infrt__##symbol__##__registrar() +#define INFRT_USE_REGISTER(symbol__) \ + extern bool __infrt__##symbol__##__registrar(); \ + [[maybe_unused]] static bool __infrt_extern_registrar_##symbol__ = \ + __infrt__##symbol__##__registrar(); + +#if __cplusplus >= 201703L +#define INFRT_NODISCARD [[nodiscard]] +#else +#define INFRT_NODISCARD +#endif diff --git a/paddle/infrt/common/memory.cc b/paddle/infrt/common/memory.cc new file mode 100644 index 0000000000000..aa5983a56c434 --- /dev/null +++ b/paddle/infrt/common/memory.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/infrt/common/memory.h"
+
+namespace infrt {
+
+using infrt::common::Target;
+
+namespace {
+
+class X86MemoryMng : public MemoryInterface {
+ public:
+  void* malloc(size_t nbytes) override { return ::malloc(nbytes); }
+  void free(void* data) override {
+    if (!data) return;
+    ::free(data);
+  }
+  void* aligned_alloc(size_t alignment, size_t nbytes) override {
+    return ::aligned_alloc(alignment, nbytes);
+  }
+};
+
+}  // namespace
+
+MemoryManager::MemoryManager() {
+  Register(Target::Arch::Unk, new X86MemoryMng);
+  Register(Target::Arch::X86, new X86MemoryMng);
+}
+
+}  // namespace infrt
diff --git a/paddle/infrt/common/memory.h b/paddle/infrt/common/memory.h
new file mode 100644
index 0000000000000..678529b8b785c
--- /dev/null
+++ b/paddle/infrt/common/memory.h
@@ -0,0 +1,76 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+
+#include <glog/logging.h>
+
+#include "paddle/infrt/common/macros.h"
+#include "paddle/infrt/common/target.h"
+
+namespace infrt {
+
+class MemoryInterface {
+ public:
+  virtual void* malloc(size_t nbytes) = 0;
+  virtual void free(void* data) = 0;
+  virtual void* aligned_alloc(size_t alignment, size_t nbytes) {
+    return nullptr;
+  }
+  virtual ~MemoryInterface() {}
+};
+
+/**
+ * MemoryManager holds a map of MemoryInterface for each architecture.
+ */
+class MemoryManager final {
+ public:
+  using key_t = common::Target::Arch;
+
+  static MemoryManager& Global() {
+    static auto* x = new MemoryManager;
+    return *x;
+  }
+
+  MemoryInterface* Retrieve(key_t key) INFRT_RESULT_SHOULD_USE {
+    auto it = memory_mngs_.find(key);
+    if (it != memory_mngs_.end()) return it->second.get();
+    return nullptr;
+  }
+
+  MemoryInterface* RetrieveSafely(key_t key) {
+    auto* res = Retrieve(key);
+    CHECK(res) << "no MemoryInterface for architecture [" << key << "]";
+    return res;
+  }
+
+  MemoryInterface* Register(key_t key, MemoryInterface* item) {
+    CHECK(!memory_mngs_.count(key)) << "Duplicate register [" << key << "]";
+    memory_mngs_[key].reset(item);
+    return item;
+  }
+
+ private:
+  MemoryManager();
+
+  std::unordered_map<key_t, std::unique_ptr<MemoryInterface>> memory_mngs_;
+
+  INFRT_DISALLOW_COPY_AND_ASSIGN(MemoryManager);
+};
+
+}  // namespace infrt
diff --git a/paddle/infrt/common/object.cc b/paddle/infrt/common/object.cc
new file mode 100644
index 0000000000000..6842ff7ba007d
--- /dev/null
+++ b/paddle/infrt/common/object.cc
@@ -0,0 +1,19 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/infrt/common/object.h"
+
+namespace infrt {
+namespace common {}  // namespace common
+}  // namespace infrt
diff --git a/paddle/infrt/common/object.h b/paddle/infrt/common/object.h
new file mode 100644
index 0000000000000..ab2d00cce985c
--- /dev/null
+++ b/paddle/infrt/common/object.h
@@ -0,0 +1,81 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <cstring>
+#include <string>
+
+#include "paddle/infrt/common/shared.h"
+
+namespace infrt {
+namespace common {
+
+template <typename T>
+class Shared;
+/**
+ * Object is the basic element in the INFRT. With the `Shared` wrapper, an
+ * object can be shared across the system.
+ */
+struct Object {
+  //! Get the type representation of this object.
+  virtual const char* type_info() const = 0;
+  virtual ~Object() {}
+
+  //! Cast to a derived type.
+  template <typename T>
+  T* as() {
+    return static_cast<T*>(this);
+  }
+
+  //! Cast to a derived type.
+  template <typename T>
+  const T* as() const {
+    return static_cast<const T*>(this);
+  }
+
+  //! Type safe cast.
+  template <typename T>
+  T* safe_as() {
+    if (std::strcmp(type_info(), T::__type_info__) == 0) {
+      return static_cast<T*>(this);
+    }
+    return nullptr;
+  }
+  //! Type safe cast.
+  template <typename T>
+  const T* safe_as() const {
+    if (std::strcmp(type_info(), T::__type_info__) == 0) {
+      return static_cast<const T*>(this);
+    }
+    return nullptr;
+  }
+
+  //! Check if the type is right.
+  template <typename T>
+  bool is_type() const {
+    if (std::strcmp(type_info(), T::__type_info__) == 0) {
+      return true;
+    }
+    return false;
+  }
+
+  //! The reference count, which makes all the derived types shareable.
+  mutable RefCount __ref_count__;
+};
+
+using object_ptr = Object*;
+using shared_object = Shared<Object>;
+
+}  // namespace common
+}  // namespace infrt
diff --git a/paddle/infrt/common/shared.cc b/paddle/infrt/common/shared.cc
new file mode 100644
index 0000000000000..78457b7ed352b
--- /dev/null
+++ b/paddle/infrt/common/shared.cc
@@ -0,0 +1,15 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/infrt/common/shared.h"
diff --git a/paddle/infrt/common/shared.h b/paddle/infrt/common/shared.h
new file mode 100644
index 0000000000000..dbcf2b0597888
--- /dev/null
+++ b/paddle/infrt/common/shared.h
@@ -0,0 +1,153 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <atomic>
+#include <string>
+#include <type_traits>
+
+namespace infrt {
+namespace common {
+
+class RefCount {
+ public:
+  using value_type = int32_t;
+  RefCount() = default;
+
+  value_type Inc() { return ++count_; }
+  value_type Dec() { return --count_; }
+  bool is_zero() const { return 0 == count_; }
+  std::string to_string() { return std::to_string(count_.load()); }
+  int32_t val() const { return count_; }
+
+ private:
+  std::atomic<value_type> count_{0};
+};
+
+class Object;
+/**
+ * The templated methods are used to unify the way to get the RefCount instance
+ * in client classes.
+ */
+template <typename T>
+RefCount& ref_count(const T* t) {
+  static_assert(std::is_base_of<Object, T>::value, "T is not an Object");
+  return t->__ref_count__;
+}
+template <typename T>
+void Destroy(const T* t) {
+  delete t;
+}
+
+template <typename T>
+struct Shared {
+  using object_ptr = T*;
+
+  Shared() = default;
+  explicit Shared(T* p) : p_(p) {
+    if (p) IncRef(p);
+  }
+  Shared(const Shared& other) : p_(other.p_) { IncRef(p_); }
+  Shared(Shared&& other) : p_(other.p_) { other.p_ = nullptr; }
+  Shared& operator=(const Shared& other);
+
+  //! Reset to another pointer \p x.
+  void Reset(T* x = nullptr);
+
+  //! Access the pointer in various ways.
+  // @{
+  inline T* get() const { return p_; }
+  inline T& operator*() const { return *p_; }
+  inline T* operator->() const { return p_; }
+  inline T* self() { return p_; }
+  inline const T* self() const { return p_; }
+  // @}
+
+  inline bool same_as(const Shared& other) { return p_ == other.p_; }
+  inline bool defined() const { return p_; }
+  inline bool operator<(const Shared& other) const { return p_ < other.p_; }
+  inline Shared& operator=(T* x);
+  inline bool operator==(const Shared& other) const { return p_ == other.p_; }
+
+  ~Shared();
+
+ private:
+  //! Increase the share count.
+  void IncRef(T* p);
+
+  //! Decrease the share count.
+  void DecRef(T* p);
+
+ protected:
+  T* p_{};
+};
+
+template <typename T>
+void Shared<T>::IncRef(T* p) {
+  if (p) {
+    ref_count(p).Inc();
+  }
+}
+template <typename T>
+void Shared<T>::DecRef(T* p) {
+  if (p) {
+    if (ref_count(p).Dec() == 0) {
+      Destroy(p);
+    }
+  }
+}
+template <typename T>
+Shared<T>& Shared<T>::operator=(const Shared<T>& other) {
+  if (other.p_ == p_) return *this;
+  // Other can be inside of something owned by this, so we should be careful to
+  // incref other before we decref ourselves.
+  T* tmp = other.p_;
+  IncRef(tmp);
+  DecRef(p_);
+  p_ = tmp;
+  return *this;
+}
+
+template <typename T, typename... Args>
+T* make_shared(Args&&... args) {
+  return new T(args...);
+}
+
+template <typename T>
+Shared<T>& Shared<T>::operator=(T* x) {
+  if (p_ == x) return *this;
+
+  T* tmp = x;
+  IncRef(tmp);
+  DecRef(p_);
+  p_ = tmp;
+  return *this;
+}
+
+template <typename T>
+Shared<T>::~Shared() {
+  DecRef(p_);
+  p_ = nullptr;
+}
+
+template <typename T>
+void Shared<T>::Reset(T* x) {
+  if (x) IncRef(x);
+  DecRef(p_);
+  p_ = x;
+}
+
+}  // namespace common
+}  // namespace infrt
diff --git a/paddle/infrt/common/string.cc b/paddle/infrt/common/string.cc
new file mode 100644
index 0000000000000..d02643825a7c8
--- /dev/null
+++ b/paddle/infrt/common/string.cc
@@ -0,0 +1,128 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/infrt/common/string.h"
+
+#include <stdarg.h>
+
+#include <cstring>
+
+namespace infrt {
+namespace infrt {
+
+std::string StringFormat(const std::string &fmt_str, ...)
{ + /* Reserve two times as much as the length of the fmt_str */ + int final_n, n = (static_cast(fmt_str.size())) * 2; + std::unique_ptr formatted; + va_list ap; + while (1) { + formatted.reset( + new char[n]); /* Wrap the plain char array into the unique_ptr */ + std::strcpy(&formatted[0], fmt_str.c_str()); // NOLINT + va_start(ap, fmt_str); + final_n = vsnprintf(&formatted[0], n, fmt_str.c_str(), ap); + va_end(ap); + if (final_n < 0 || final_n >= n) + n += abs(final_n - n + 1); + else + break; + } + return std::string(formatted.get()); +} + +std::string Trim(const std::string &s, const char *empty) { + if (s.empty()) return s; + auto start = s.find_first_not_of(empty); + if (start == std::string::npos) return ""; + auto end = s.find_last_not_of(empty); + return s.substr(start, end - start + 1); +} + +std::string Uppercase(const std::string &x) { + auto res = x; + for (auto &c : res) { + c = toupper(c); + } + return res; +} + +bool Startswith(const std::string &x, const std::string &str) { + return x.find(str) == 0; +} +bool Endswith(const std::string &x, const std::string &str) { + if (x.length() >= str.length()) { + return std::equal(str.rbegin(), str.rend(), x.rbegin()); + } + return false; +} + +std::vector Split(const std::string &str, + const std::string &splitter) { + std::vector results; + std::string::size_type pos1, pos2; + pos2 = str.find(splitter); + pos1 = 0; + while (std::string::npos != pos2) { + results.push_back(str.substr(pos1, pos2 - pos1)); + pos1 = pos2 + splitter.size(); + pos2 = str.find(splitter, pos1); + } + if (pos1 != str.length()) { + results.push_back(str.substr(pos1)); + } + return results; +} + +void Replace(std::string *s, const std::string &from, const std::string &to) { + size_t pos = 0; + while ((pos = s->find(from, pos)) != std::string::npos) { + s->replace(pos, from.size(), to); + pos += to.length(); + } +} + +size_t Count(std::string *s, const std::string &sub) { + size_t pos = 0; + size_t times = 0; + while ((pos = s->find(sub, pos)) != std::string::npos) { + if ((pos == 0 || !IsPrefix(s->at(pos - 1))) && + (pos + sub.length() == s->size() || + !IsSuffix(s->at(pos + sub.length())))) { + pos += sub.length(); + times++; + } else { + pos++; + } + } + return times; +} + +bool IsPrefix(const char &c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c == '_'); +} + +bool IsSuffix(const char &c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c == '_') || + (c >= '0' && c <= '9') || (c == '\''); +} + +std::string TransValidVarName(std::string name) { + Replace(&name, ".", "__"); + Replace(&name, "/", "___"); + name.erase(0, name.find_first_not_of("_")); + return name; +} + +} // namespace infrt +} // namespace infrt diff --git a/paddle/infrt/common/string.h b/paddle/infrt/common/string.h new file mode 100644 index 0000000000000..f744470603f80 --- /dev/null +++ b/paddle/infrt/common/string.h @@ -0,0 +1,84 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace infrt {
+namespace infrt {
+
+//! Get the content of a stream.
+template <typename T>
+std::string GetStreamCnt(const T& x);
+
+/**
+ * Construct a formatted string with arguments.
+ * @param fmt_str The format.
+ * @param ... The parameters of the format.
+ * @return The formatted string.
+ */
+std::string StringFormat(const std::string& fmt_str, ...);
+
+/**
+ * Join multiple fields into a single string. Similar to Python's str.join method.
+ */
+template <typename T>
+std::string Join(const std::vector<T>& fields, const std::string& splitter) {
+  if (fields.empty()) return "";
+  std::stringstream ss;
+  for (int i = 0; i < fields.size() - 1; i++) ss << fields[i] << splitter;
+  ss << fields.back();
+  return ss.str();
+}
+
+std::vector<std::string> Split(const std::string& str,
+                               const std::string& splitter);
+
+std::string Trim(const std::string& s, const char* empty = " \n\r\t");
+
+//! Convert a string to its uppercase.
+std::string Uppercase(const std::string& x);
+
+//! Replace a substr 'from' to 'to' in string s.
+void Replace(std::string* s, const std::string& from, const std::string& to);
+
+//! Count how many times substr 'sub' appears in string s.
+size_t Count(std::string* s, const std::string& sub);
+
+//! Tell if a char is a prefix of a tensor's name.
+bool IsPrefix(const char& c);
+
+//! Tell if a char is a suffix of a tensor's name.
+bool IsSuffix(const char& c);
+
+//! Tell if a string \p x starts with \p str.
+bool Startswith(const std::string& x, const std::string& str);
+
+//! Tell if a string \p x ends with \p str.
+bool Endswith(const std::string& x, const std::string& str);
+
+template <typename T>
+std::string GetStreamCnt(const T& x) {
+  std::stringstream os;
+  os << x;
+  return os.str();
+}
+
+std::string TransValidVarName(std::string name);
+
+}  // namespace infrt
+}  // namespace infrt
diff --git a/paddle/infrt/common/target.cc b/paddle/infrt/common/target.cc
new file mode 100644
index 0000000000000..d376ad7db0241
--- /dev/null
+++ b/paddle/infrt/common/target.cc
@@ -0,0 +1,118 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/infrt/common/target.h"
+
+#include <glog/logging.h>
+
+namespace infrt {
+namespace common {
+
+bool Target::operator==(const Target &other) const {
+  return os == other.os &&      //
+         arch == other.arch &&  //
+         bits == other.bits &&  //
+         features == other.features;
+}
+
+int Target::max_num_threads() const {
+  CHECK(arch == Arch::NVGPU)
+      << "The target is not NVGPU!
Cannot get max number of threads."; + return 1024; +} + +std::vector Target::get_target_libs() const { return libs; } + +int Target::get_target_bits() const { + switch (bits) { + case Bit::k32: + return 32; + case Bit::k64: + return 64; + case Bit::Unk: + return 0; + default: + LOG(FATAL) << "Not supported Bit"; + } + return -1; +} + +std::ostream &operator<<(std::ostream &os, const Target &target) { + os << "Target<"; + switch (target.os) { + case Target::OS::Linux: + os << "linux"; + break; + case Target::OS::Windows: + os << "windows"; + break; + case Target::OS::Unk: + os << "unk"; + break; + } + + os << ","; + + switch (target.arch) { + case Target::Arch::X86: + os << "x86"; + break; + case Target::Arch::ARM: + os << "arm"; + break; + case Target::Arch::NVGPU: + os << "nvgpu"; + break; + case Target::Arch::Unk: + os << "unk"; + break; + } + os << ","; + + switch (target.bits) { + case Target::Bit::k32: + os << "32"; + break; + case Target::Bit::k64: + os << "64"; + break; + case Target::Bit::Unk: + os << "unk"; + break; + } + os << ">"; + + return os; +} + +std::ostream &operator<<(std::ostream &os, Target::Arch arch) { + switch (arch) { + case Target::Arch::Unk: + os << "Unk"; + break; + case Target::Arch::X86: + os << "X86"; + break; + case Target::Arch::ARM: + os << "ARM"; + break; + case Target::Arch::NVGPU: + os << "NVGPU"; + break; + } + return os; +} + +} // namespace common +} // namespace infrt diff --git a/paddle/infrt/common/target.h b/paddle/infrt/common/target.h new file mode 100644 index 0000000000000..eaf19efbfe7a8 --- /dev/null +++ b/paddle/infrt/common/target.h @@ -0,0 +1,112 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace infrt { +namespace common { + +struct Target { + /** + * The operating system used by the target. Determines which system calls to + * generate. + */ + enum class OS : int { + Unk = -1, + Linux, + Windows, + }; + + /** + * The architecture used by the target. Determines the instruction set to use. + */ + enum class Arch : int { + Unk = -1, + X86, + ARM, + NVGPU, + }; + + enum class Bit : int { + Unk = -1, + k32, + k64, + }; + + OS os{OS::Unk}; + Arch arch{Arch::Unk}; + Bit bits{Bit::Unk}; + + enum class Feature : int { + JIT = 0, + Debug, + }; + + /** + * The library used by the target. 
+ */ + enum class Lib : int { + Unk = -1, + MKL, + }; + std::vector features; + std::vector libs; + + explicit Target(OS o = OS::Linux, + Arch a = Arch::Unk, + Bit b = Bit::Unk, + const std::vector& features = {}, + const std::vector& libs = {}) + : os(o), arch(a), bits(b), features(features), libs(libs) {} + + bool defined() const { + return os != OS::Unk && arch != Arch::Unk && bits != Bit::Unk; + } + + int max_num_threads() const; + + int get_target_bits() const; + + std::vector get_target_libs() const; + + bool operator==(const Target& other) const; + bool operator!=(const Target& other) const { return !(*this == other); } + friend std::ostream& operator<<(std::ostream& os, const Target& target); +}; + +static const Target& UnkTarget() { + static Target target( + Target::OS::Unk, Target::Arch::Unk, Target::Bit::Unk, {}, {}); + return target; +} + +static const Target& DefaultHostTarget() { + static Target target( + Target::OS::Linux, Target::Arch::X86, Target::Bit::k64, {}, {}); + return target; +} + +static const Target& DefaultNVGPUTarget() { + static Target target( + Target::OS::Linux, Target::Arch::NVGPU, Target::Bit::k64, {}, {}); + return target; +} + +std::ostream& operator<<(std::ostream& os, Target::Arch arch); + +} // namespace common +} // namespace infrt diff --git a/paddle/infrt/common/type.cc b/paddle/infrt/common/type.cc new file mode 100644 index 0000000000000..f262bd4697b36 --- /dev/null +++ b/paddle/infrt/common/type.cc @@ -0,0 +1,358 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/common/type.h" + +#include + +namespace infrt { +namespace common { + +struct Type::Storage { + Storage() = default; + Storage(type_t t, int b, int w) : type_(t), bits_(b), lanes_(w) {} + + type_t type_{type_t::Unk}; + cpp_type_t cpp_type_{cpp_type_t::None}; + + //! How many bits per element. + int bits_{}; + + //! How many elements(if a vector type), for scalar types, it should be 1. + int lanes_{1}; + + //! Name of the customized type. 
+ std::string customized_type_; +}; + +Type::~Type() {} + +std::ostream &operator<<(std::ostream &os, const Type &t) { + if (t.is_cpp_const()) os << "const "; + switch (t.type()) { + case Type::type_t::Int: + if (t.bits() == 1) { + os << "bool"; + } else { + os << "int" << t.bits(); + } + + break; + case Type::type_t::UInt: + os << "uint" << t.bits(); + break; + + case Type::type_t::Float: + os << "float" << t.bits(); + break; + case Type::type_t::Void: + os << "void"; + break; + case Type::type_t::Customized: + os << t.customized_type(); + break; + case Type::type_t::String: + os << "string"; + break; + case Type::type_t::Unk: + os << "unk"; + break; + } + + if (t.lanes() > 1) os << "<" << t.lanes() << ">"; + if (t.is_cpp_handle()) os << "*"; + if (t.is_cpp_handle2()) os << "**"; + + return os; +} + +std::ostream &operator<<(std::ostream &os, Type::type_t t) { + switch (t) { + case Type::type_t::String: + os << "String"; + break; + case Type::type_t::Void: + os << "Void"; + break; + case Type::type_t::UInt: + os << "UInt"; + break; + case Type::type_t::Int: + os << "Int"; + break; + case Type::type_t::Float: + os << "Float"; + break; + case Type::type_t::Unk: + os << "Unk"; + break; + case Type::type_t::Customized: + os << "Customized"; + } + return os; +} + +Type &Type::set_cpp_handle(bool x) { + // unset the other handle-related bits. + set_cpp_handle2(false); + + auto &v = (*reinterpret_cast(&GetStorage().cpp_type_)); + // unset the other handle-related bits. + v &= ~static_cast(cpp_type_t::Handle); + v &= ~static_cast(cpp_type_t::HandleHandle); + + if (x) + v |= static_cast(cpp_type_t::Handle); + else + v &= ~static_cast(cpp_type_t::Handle); + + return *this; +} + +Type &Type::set_cpp_handle2(bool x) { + auto &v = (*reinterpret_cast(&GetStorage().cpp_type_)); + + // unset the other handle-related bits. 
+ v &= ~static_cast(cpp_type_t::Handle); + v &= ~static_cast(cpp_type_t::HandleHandle); + + if (x) + v |= static_cast(cpp_type_t::HandleHandle); + else + v &= ~static_cast(cpp_type_t::HandleHandle); + + return *this; +} + +Type Type::VectorOf(int w) const { + CheckTypeValid(); + return Type(type(), w, bits()); +} + +Type::Type(const Type &other) { + if (other.storage_) storage_.reset(new Storage(*other.storage_)); +} + +Type Type::ElementOf() const { + CheckTypeValid(); + auto type = *this; + type.storage_->lanes_ = 1; + return type; +} + +void Type::CheckTypeValid() const { CHECK_NE(GetStorage().type_, type_t::Unk); } + +Type Type::PointerOf() const { + CheckTypeValid(); + auto x = *this; + CHECK(!x.is_cpp_handle2()) << "Not support three level of PointerOf"; + if (x.is_cpp_handle()) + x.set_cpp_handle2(); + else + x.set_cpp_handle(); + return x; +} + +Type Type::ConstOf() const { + CheckTypeValid(); + auto x = *this; + x.set_cpp_const(); + return x; +} + +Type Type::IgnoreConst() const { + CheckTypeValid(); + auto x = *this; + x.set_cpp_const(false); + return x; +} + +Type Type::with_bits(int x) const { + CHECK(is_primitive()); + Type type = *this; + type.GetStorage().bits_ = x; + return type; +} + +Type Type::with_type(Type::type_t x) const { + Type type = *this; + type.GetStorage().type_ = x; + return type; +} + +Type Type::with_lanes(int x) const { + CHECK(valid()); + Type type = *this; + type.GetStorage().lanes_ = x; + return type; +} + +Type Type::with_cpp_const(bool x) const { + Type type = *this; + type.set_cpp_const(x); + return type; +} + +Type &Type::set_cpp_const(bool is_const) { + uint8_t &data = *reinterpret_cast(&GetStorage().cpp_type_); + if (is_const) { + data |= static_cast(cpp_type_t::Const); + } else { + data &= ~(static_cast(cpp_type_t::Const)); + } + + return *this; +} +Type &Type::set_customized_type(const std::string &t) { + GetStorage().type_ = type_t::Customized; + GetStorage().customized_type_ = t; + + return *this; +} + +bool Type::valid() const { + if (is_unk()) return false; + if (is_customized()) { + return !GetStorage().customized_type_.empty(); + } + if (is_primitive()) { + return bits() != 0; + } + return true; +} + +Type::Type(Type::type_t t, int b, int w) : storage_(new Storage(t, b, w)) {} +bool Type::is_primitive() const { + return !is_unk() && type() != type_t::Customized; +} +bool Type::is_customized() const { + return !is_unk() && type() == type_t::Customized; +} +bool Type::is_unk() const { return type() == type_t::Unk; } +bool Type::is_bool() const { return type() == type_t::UInt && bits() == 1; } +bool Type::is_void() const { return type() == type_t::Void; } +bool Type::is_vector() const { return lanes() > 1; } +bool Type::is_scalar() const { return lanes() == 1; } +bool Type::is_float(int bits) const { + return type() == type_t::Float && (bits < 0 || bits == this->bits()); +} +bool Type::is_uint(int bits) const { + return type() == type_t::UInt && (bits < 0 || bits == this->bits()); +} +bool Type::is_int(int bits) const { + return type() == type_t::Int && (bits < 0 || bits == this->bits()); +} +bool Type::is_integer(int bits) const { + return (type() == type_t::Int || type() == type_t::UInt) && + (bits < 0 || bits == this->bits()); +} +bool Type::is_index_type() { + return is_int() && lanes() == 1 && (bits() == 32 || bits() == 64); +} +bool Type::is_cpp_handle() const { + return static_cast(GetStorage().cpp_type_) & + static_cast(cpp_type_t::Handle); +} +bool Type::is_cpp_handle2() const { + return static_cast(GetStorage().cpp_type_) & + 
static_cast(cpp_type_t::HandleHandle); +} +bool Type::is_cpp_const() const { + return static_cast(cpp_type_t::Const) & + static_cast(GetStorage().cpp_type_); +} +const std::string &Type::customized_type() const { + return GetStorage().customized_type_; +} +bool Type::is_customized_type() const { + return !GetStorage().customized_type_.empty(); +} +Type::type_t Type::type() const { return GetStorage().type_; } +int Type::bits() const { return GetStorage().bits_; } +int Type::lanes() const { return GetStorage().lanes_; } +Type::cpp_type_t Type::cpp_type() const { return GetStorage().cpp_type_; } +bool Type::operator==(const Type &other) const { + return type() == other.type() && bits() == other.bits() && + lanes() == other.lanes() && + GetStorage().cpp_type_ == other.GetStorage().cpp_type_ && + customized_type() == other.customized_type(); +} +bool Type::is_string() const { return type() == type_t::String; } + +Type &Type::operator=(const Type &other) { + if (other.storage_) storage_.reset(new Storage(*other.storage_)); + return *this; +} + +Type::Storage &Type::GetStorage() { return *storage_; } +const Type::Storage &Type::GetStorage() const { return *storage_; } + +Type::Type() : storage_(new Storage) {} +Type::Type(Type &&other) : storage_(std::move(other.storage_)) {} + +const Type &F16() { + static auto t = Float(16); + return t; +} +const Type &F32() { + static auto t = Float(32); + return t; +} +const Type &F64() { + static auto t = Float(64); + return t; +} +const Type &I8() { + static auto t = Int(8); + return t; +} +const Type &I16() { + static auto t = Int(16); + return t; +} +const Type &I32() { + static auto t = Int(32); + return t; +} +const Type &I64() { + static auto t = Int(64); + return t; +} +const Type &UI8() { + static auto t = UInt(8); + return t; +} +const Type &UI16() { + static auto t = UInt(16); + return t; +} +const Type &UI32() { + static auto t = UInt(32); + return t; +} +const Type &UI64() { + static auto t = UInt(64); + return t; +} +const Type &I1() { + static auto t = Int(1); + return t; +} +const Type &UI1() { + static auto t = UInt(1); + return t; +} + +} // namespace common +} // namespace infrt diff --git a/paddle/infrt/common/type.h b/paddle/infrt/common/type.h new file mode 100644 index 0000000000000..b532fc154ff02 --- /dev/null +++ b/paddle/infrt/common/type.h @@ -0,0 +1,223 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include +#include + +#include "paddle/infrt/common/macros.h" + +//! Much of the concepts are borrowed from Halide project. + +namespace infrt { +namespace common { + +/** + * Types in the INFRT type system. They can be ints, unsigned ints, or floats of + * various bit-widths. + * They can also be vectors of the same (by setting the `lanes` field to + * something larger than one). + * NOTE: Front-end code other than vectorize shouldn't use vector types. 
+ */ +struct Type { + enum class type_t { + Unk = -1, + Int, + UInt, + Float, + String, + Void, + // stupid idea to mix the Customized with other primitive types, large + // refactor needs here. + Customized, // Customized type + }; + + //! type decorators in C++, the different code can used together. + enum class cpp_type_t : uint8_t { + None = 0, // None information. + Const = 1, // const. + Handle = 1 << 1, // pointer type, such as `infrt_buffer_t*`. + HandleHandle = 1 << 2, // pointer of pointer, such as `infrt_buffer_t**`. + }; + + Type(); + Type(type_t t, int b, int w); + Type(const Type& other); + explicit Type(Type&& other); + Type& operator=(const Type& other); + + INFRT_NODISCARD bool is_primitive() const; + INFRT_NODISCARD bool is_customized() const; + INFRT_NODISCARD bool valid() const; + + //! Some helper functions to check a type. + // @{ + INFRT_NODISCARD bool is_unk() const; + INFRT_NODISCARD bool is_void() const; + INFRT_NODISCARD bool is_bool() const; + INFRT_NODISCARD bool is_vector() const; + INFRT_NODISCARD bool is_scalar() const; + INFRT_NODISCARD bool is_float(int bits = -1) const; + INFRT_NODISCARD bool is_int(int bits = -1) const; + INFRT_NODISCARD bool is_integer(int bits = -1) const; + INFRT_NODISCARD bool is_uint(int bits = -1) const; + INFRT_NODISCARD bool is_string() const; + INFRT_NODISCARD bool is_index_type(); + // @} + + Type& set_cpp_handle(bool x = true); + INFRT_NODISCARD bool is_cpp_handle() const; + + Type& set_cpp_handle2(bool x = true); + INFRT_NODISCARD bool is_cpp_handle2() const; + + Type& set_cpp_const(bool is_const = true); + INFRT_NODISCARD bool is_cpp_const() const; + + Type& set_customized_type(const std::string& t); + const std::string& customized_type() const; + INFRT_NODISCARD bool is_customized_type() const; + + // Get a new type with bits set to \p x. + Type with_bits(int x) const; + // Get a new type with type set to \p x. + Type with_type(type_t x) const; + // Get a new type with lanes set to \p x. + Type with_lanes(int x) const; + // Get a new type with cpp_const set to \p x. + Type with_cpp_const(bool x = true) const; + + //! Getters + // @{ + type_t type() const; + int bits() const; + int lanes() const; + cpp_type_t cpp_type() const; + // @} + + //! Compare two types for equality. + bool operator==(const Type& other) const; + + //! Compare two types for inequality. + bool operator!=(const Type& other) const { return !(*this == other); } + + //! Generate a vector of this type, with `w` elements. + Type VectorOf(int w) const; + //! Generate a element type of this type. + Type ElementOf() const; + //! Generate the address type. + Type PointerOf() const; + //! Ignore const. + Type IgnoreConst() const; + //! Add const. + Type ConstOf() const; + + friend std::ostream& operator<<(std::ostream& os, const Type& t); + + ~Type(); + + private: + void CheckTypeValid() const; + + struct Storage; + Storage& GetStorage(); + const Storage& GetStorage() const; + + std::unique_ptr storage_; +}; // namespace common + +inline Type Void() { return Type(Type::type_t::Void, 1, 0); } +inline Type Int(int bits, int lanes = 1) { + return Type(Type::type_t::Int, bits, lanes); +} +inline Type UInt(int bits, int lanes = 1) { + return Type(Type::type_t::UInt, bits, lanes); +} +inline Type Float(int bits, int lanes = 1) { + return Type(Type::type_t::Float, bits, lanes); +} +inline Type Bool(int lanes = 1) { return Type(Type::type_t::UInt, 1, lanes); } +inline Type String() { return Type(Type::type_t::String, 1, 1); } + +//! 
Builtin native types as global singletons. +// @{ +const Type& F16(); +const Type& F32(); +const Type& F64(); +const Type& I8(); +const Type& I16(); +const Type& I32(); +const Type& I64(); +const Type& UI8(); +const Type& UI16(); +const Type& UI32(); +const Type& UI64(); +const Type& I1(); +const Type& UI1(); +// @} + +template +Type type_of(); + +// clang-format off +template <> inline Type type_of() { return F32(); } +template <> inline Type type_of() { return F64(); } +template <> inline Type type_of() { return UI8(); } +template <> inline Type type_of() { return UI16(); } +template <> inline Type type_of() { return I32(); } +template <> inline Type type_of() { return UI32(); } +template <> inline Type type_of() { return UI1(); } +template <> inline Type type_of() { return I8(); } +template <> inline Type type_of() { return I64(); } +template <> inline Type type_of() { return UI64(); } +template <> inline Type type_of() { return I8(); } +template <> inline Type type_of() { return Void(); } +// clang-format on +template <> +inline Type type_of() { + Type x = Int(8); + x.set_cpp_handle(); + return x; +} +template <> +inline Type type_of() { + Type x = type_of(); + x.set_cpp_handle(); + return x; +} +template <> +inline Type type_of() { + Type x = type_of(); + x.set_cpp_handle2(); + return x; +} +template <> +inline Type type_of() { + Type x = type_of(); + x.set_cpp_handle(); + return x; +} +template <> +inline Type type_of() { + Type x = type_of(); + x.set_cpp_handle(); + return x; +} + +std::ostream& operator<<(std::ostream& os, Type::type_t t); + +} // namespace common +} // namespace infrt diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt new file mode 100644 index 0000000000000..c1517beab0662 --- /dev/null +++ b/paddle/infrt/dialect/CMakeLists.txt @@ -0,0 +1,61 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + dialect.cc + types.cc + basic_kernels.cc + test_kernels.cc + infrt_base.cc + init_infrt_dialects.cc + tensor_shape.cc + dense_tensor.cc + mlir_loader.cc + diagnostic_utils.cc + pd_types.cc + pd_ops.cc + ) + +mlir_tablegen_on(ops) +mlir_tablegen_on(basic_kernels) +mlir_tablegen_on(test_kernels) +mlir_tablegen_on(infrt_base DIALECT infrt) +mlir_tablegen_on(tensor_shape DIALECT ts) +mlir_tablegen_on(dense_tensor DIALECT dt) +mlir_tablegen_on(pd_op_base DIALECT pd) +mlir_tablegen_on(pd_ops) +mlir_add_rewriter(rewrite) + +# TODO(Superjomn) add a cmake function cc_executable to ecapsulate the following code +add_executable(infrtopt opt.cc) +target_link_libraries(infrtopt infrt ${mlir_libs}) +add_dependencies(infrtopt infrt) + +add_executable(print-ir print_ir.cc) +target_link_libraries(print-ir infrt ${mlir_libs}) +add_dependencies(print-ir pd_ops_inc) + + +# MLIR opt tests +# %{ +set(infrt_opt_path ${CMAKE_BINARY_DIR}/infrt/dialect/infrtopt) + +add_test(test_infrt_mlir_opt_on_basic ${infrt_opt_path} + ${CMAKE_SOURCE_DIR}/infrt/dialect/mlir_tests/basic.mlir) +add_test(test_infrt_mlir_opt_on_tensor_shape ${infrt_opt_path} + ${CMAKE_SOURCE_DIR}/infrt/dialect/mlir_tests/tensor_shape.mlir) +add_test(test_infrt_mlir_opt_on_paddle_ops + ${infrt_opt_path} + ${CMAKE_SOURCE_DIR}/infrt/dialect/mlir_tests/paddle_ops.mlir) +# %} + +cc_test_tiny(test_infrt_mlir_loader SRCS mlir_loader_test.cc DEPS infrt ${MLIR_IR_LIBS}) + +# execute mlir and run FileCheck +infrt_exec_check(run_and_check_tensor_type mlir_tests/tensor_type.mlir) +infrt_exec_check(run_and_check_basic mlir_tests/basic.mlir) +infrt_exec_check(run_and_check_benchmark 
mlir_tests/benchmark.mlir) +#infrt_exec_check(run_and_check_dense_tensor mlir_tests/dense_tensor.mlir) +add_test(test_infrt_mlir_dense_tensor + ${CMAKE_BINARY_DIR}/infrt/host_context/infrt-exec + -i + ${CMAKE_CURRENT_SOURCE_DIR}/mlir_tests/dense_tensor.mlir) diff --git a/paddle/infrt/dialect/basic_kernels.cc b/paddle/infrt/dialect/basic_kernels.cc new file mode 100644 index 0000000000000..b4d2b9182b0c5 --- /dev/null +++ b/paddle/infrt/dialect/basic_kernels.cc @@ -0,0 +1,164 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/basic_kernels.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/infrt/dialect/dense_tensor.h" + +namespace infrt::dialect { +using namespace mlir; // NOLINT + +static ParseResult parseCallOp(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + SymbolRefAttr callee_attr; + FunctionType callee_type; + SmallVector operands; + auto callee_loc = parser.getNameLoc(); + if (parser.parseAttribute(callee_attr, "callee", result.attributes) || + parser.parseOperandList(operands, OpAsmParser::Delimiter::Paren) || + parser.parseOptionalAttrDict(result.attributes) || + parser.parseColonType(callee_type) || + parser.addTypesToList(callee_type.getResults(), result.types) || + parser.resolveOperands( + operands, callee_type.getInputs(), callee_loc, result.operands)) + return failure(); + return success(); +} + +static ParseResult parseConstantOp(Type attrType, + OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + Attribute valueAttr; + if (parser.parseOptionalAttrDict(result.attributes) || + parser.parseAttribute(valueAttr, attrType, "value", result.attributes) || + parser.addTypeToList(attrType, result.types)) + return failure(); + return success(); +} + +static ParseResult parseConstantF32Op(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + return parseConstantOp( + FloatType::getF32(result.getContext()), parser, result); +} +static ParseResult parseConstantF64Op(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + return parseConstantOp( + FloatType::getF64(result.getContext()), parser, result); +} +static ParseResult parseConstantI32Op(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + return parseConstantOp( + IntegerType::get(32, result.getContext()), parser, result); +} +static ParseResult parseConstantI64Op(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + return parseConstantOp( + IntegerType::get(64, result.getContext()), parser, result); +} + +static ParseResult parseReturnOp(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + SmallVector opInfo; + SmallVector types; + llvm::SMLoc loc = parser.getCurrentLocation(); + return failure(parser.parseOperandList(opInfo) || + (!opInfo.empty() && parser.parseColonTypeList(types)) || + parser.resolveOperands(opInfo, types, 
loc, result.operands)); +} + +static void print(OpAsmPrinter &p, CallOp op) { // NOLINT + p << "infrt.call " << op.getAttr("callee") << "("; + p.printOperands(op.getOperands()); + p << ")"; + p.printOptionalAttrDict(op.getAttrs(), {"callee"}); + p << " : "; +} + +static void printConstant(OpAsmPrinter &p, mlir::Operation *op) { // NOLINT + p << op->getName() << " "; + p.printOptionalAttrDict(op->getAttrs(), /*elidedAttrs=*/{"value"}); + + if (op->getAttrs().size() > 1) p << ' '; + Attribute attr = op->getAttr("value"); + if (auto int_attr = attr.dyn_cast()) { + bool is_signed = int_attr.getType().isIndex() || + int_attr.getType().getIntOrFloatBitWidth() != 1; + int_attr.getValue().print(p.getStream(), is_signed); + } else if (auto float_attr = attr.dyn_cast()) { + p << float_attr.getValue().convertToFloat(); + } else { + op->emitOpError("unknown attribute type"); + } +} + +static void print(OpAsmPrinter &p, ConstantF32Op op) { // NOLINT + printConstant(p, op); +} +static void print(OpAsmPrinter &p, ConstantF64Op op) { // NOLINT + printConstant(p, op); +} +static void print(OpAsmPrinter &p, ConstantI32Op op) { // NOLINT + printConstant(p, op); +} +static void print(OpAsmPrinter &p, ConstantI64Op op) { // NOLINT + printConstant(p, op); +} + +static void print(OpAsmPrinter &p, ReturnOp op) { // NOLINT + p << "infrt.return"; + if (op.getNumOperands() > 0) { + p << ' '; + p.printOperands(op.getOperands()); + p << " : "; + llvm::interleaveComma(op.getOperands(), p); + } +} + +static LogicalResult verify(CallOp op) { return success(); } + +static LogicalResult verify(ConstantF32Op op) { return success(); } +static LogicalResult verify(ConstantI32Op op) { return success(); } +static LogicalResult verify(ConstantF64Op op) { return success(); } +static LogicalResult verify(ConstantI64Op op) { return success(); } + +static LogicalResult verify(ReturnOp op) { + auto function = dyn_cast(op.getParentOp()); + + if (!function) return success(); + + auto results = function.getType().getResults(); + if (op.getNumOperands() != results.size()) + return op.emitOpError("has ") + << op.getNumOperands() + << " operands, but enclosing function returns " << results.size(); + + return success(); +} + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/basic_kernels.cpp.inc" + +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/basic_kernels.h b/paddle/infrt/dialect/basic_kernels.h new file mode 100644 index 0000000000000..65316bc1437c0 --- /dev/null +++ b/paddle/infrt/dialect/basic_kernels.h @@ -0,0 +1,24 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include + +using namespace mlir; // NOLINT + +namespace infrt::dialect { +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/basic_kernels.hpp.inc" +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/basic_kernels.td b/paddle/infrt/dialect/basic_kernels.td new file mode 100644 index 0000000000000..df5e4d8a2c6a1 --- /dev/null +++ b/paddle/infrt/dialect/basic_kernels.td @@ -0,0 +1,139 @@ +// Operation definitions for basic kernels. + +#ifdef BASIC_OPS +#else +#define BASIC_OPS + +include "paddle/infrt/dialect/infrt_base.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +class INFRT_Op traits = []> : Op { + + // Each registered op needs to provide all of a printer, parser and verifier. + let printer = [{ return infrt::dialect::print(p, *this); }]; + let verifier = [{ return infrt::dialect::verify(*this); }]; + let parser = [{ return infrt::dialect::parse$cppClass(parser, result); }]; +} + +def CallOp : INFRT_Op<"call"> { + let summary = "call a host operation"; + let description = [{ + The "infrt.call" operation represents a direct call to a function. The operands and result types of the call must match the specified function type. + + %2 = infrt.call @add(%0, %1) : (f32, f32) -> f32 + }]; + + let arguments = (ins FlatSymbolRefAttr:$callee, Variadic:$operands); + let results = (outs Variadic); + + let extraClassDeclaration = [{ + StringRef getCallee() { return callee(); } + mlir::FunctionType getCalleeType(); + }]; +} + +class ConstantOp + : INFRT_Op<"constant." # suffix, [NoSideEffect]> { + let summary = "constant value constructor in host"; + + let arguments = (ins attr:$value); + let results = (outs baseType); +} + +def ConstantI32Op : ConstantOp<"i32", I32, I32Attr>; +def ConstantI64Op : ConstantOp<"i64", I64, I64Attr>; +def ConstantF32Op : ConstantOp<"f32", F32, F32Attr>; +def ConstantF64Op : ConstantOp<"f64", F64, F64Attr>; + +def ReturnOp : INFRT_Op<"return", [Terminator]> { + let summary = "host executor return operation"; + let description = [{ + The "infrt.return" operation represents a return operation within a function. + + func @foo() : (i32, f8) { + infrt.return %0, %1 : i32, f8 + } + }]; + + let arguments = (ins Variadic:$operands); + + let builders = [OpBuilder< + "OpBuilder &b, OperationState &result", + [{ build(b, result, llvm::None); }]>]; +} + +class AddOp : INFRT_Op<"add." # suffix, [NoSideEffect]> { + let summary = "infrt.add operation"; + let description = [{ + An operation that takes two inputs and returns their sum as result. + }]; + + let arguments = (ins type, type); + let results = (outs type); + let assemblyFormat = "operands attr-dict"; + let verifier = ?; +} + +def AddI32Op : AddOp<"i32", I32>; +def AddI64Op : AddOp<"i64", I64>; +def AddF32Op : AddOp<"f32", F32>; +def AddF64Op : AddOp<"f64", F64>; + +class MulOp : INFRT_Op<"mul." # suffix, [NoSideEffect]> { + let summary = "infrt.mul operation"; + let description = [{ + An operation that takes two inputs and returns their mul as result. + }]; + + let arguments = (ins type, type); +let results = (outs type); +let assemblyFormat = "operands attr-dict"; +let verifier = ?; +} + +def MulI32Op : MulOp<"i32", I32>; +def MulI64Op : MulOp<"i64", I64>; +def MulF32Op : MulOp<"f32", F32>; +def MulF64Op : MulOp<"f64", F64>; + +class PrintOp : INFRT_Op<"print." # suffix> { + let summary = "infrt.print operation"; + let description = [{ + An operation takes a number as input and prints to stdout. 
+ }]; + + let arguments = (ins type); + let assemblyFormat = "operands attr-dict"; + let verifier = ?; +} + +//def PrintI32Op : PrintOp<"i32", I32>; +//def PrintI64Op : PrintOp<"i64", I64>; +def PrintF32Op : PrintOp<"f32", F32>; +//def PrintF64Op : PrintOp<"f64", F64>; + +def GetStringOp : INFRT_Op<"get_string"> { + let summary = "infrt.get_string"; + let description = [{ + Get a !infrt.string value from the given string attribute. + }]; + + let arguments = (ins StrAttr:$value); + let results = (outs StringType); + let assemblyFormat = "`(` $value `)` attr-dict"; + let verifier = ?; +} + +def PrintStringOp : INFRT_Op<"print_string"> { + let summary = "infrt.print_string"; + let description = [{ + An operation that prints a string. + }]; + + let arguments = (ins StringType:$input); + let results = (outs); + let assemblyFormat = "`(` $input `)` attr-dict"; + let verifier = ?; +} + +#endif // basic kernels diff --git a/paddle/infrt/dialect/dense_tensor.cc b/paddle/infrt/dialect/dense_tensor.cc new file mode 100644 index 0000000000000..629a7b16523fc --- /dev/null +++ b/paddle/infrt/dialect/dense_tensor.cc @@ -0,0 +1,277 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/dense_tensor.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/tensor_shape.h" + +namespace infrt::dt { + +void DTDialect::initialize() { + allowUnknownTypes(); + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/dense_tensor.cpp.inc" + >(); +} + +namespace detail { +struct TensorTypeStorage : public mlir::TypeStorage { + TensorTypeStorage(TargetType target, + LayoutType layout, + PrecisionType precision) + : target_(target), layout_(layout), precision_(precision) {} + + using KeyTy = std::tuple; + + bool operator==(const KeyTy &key) const { + return key == KeyTy(target_, layout_, precision_); + } + + static llvm::hash_code hashKey(const KeyTy &key) { + return llvm::hash_value(key); + } + + static TensorTypeStorage *construct( + mlir::TypeStorageAllocator &allocator, // NOLINT + const KeyTy &key) { + return new (allocator.allocate()) + TensorTypeStorage(std::get<0>(key), std::get<1>(key), std::get<2>(key)); + } + + TargetType target_; + LayoutType layout_; + PrecisionType precision_; +}; +} // namespace detail + +llvm::Optional GetTargetType(mlir::StringRef key) { + if (key.equals_lower("x86")) + return TargetType::X86; + else if (key.equals_lower("cuda")) + return TargetType::CUDA; + else + return llvm::None; +} + +llvm::Optional GetLayoutType(mlir::StringRef key) { + if (key.equals_lower("nchw")) + return LayoutType::NCHW; + else if (key.equals_lower("nhwc")) + return LayoutType::NHWC; + else + return llvm::None; +} + +llvm::Optional GetPrecisionType(mlir::StringRef key) { + if (key.equals_lower("i32")) + return PrecisionType::I32; + else if 
(key.equals_lower("f32")) + return PrecisionType::F32; + else + return llvm::None; +} + +TensorType TensorType::get(TargetType target, + LayoutType layout, + PrecisionType precision) { + return Base::get( + ::infrt::Global::getMLIRContext(), target, layout, precision); +} + +TargetType TensorType::target() { return getImpl()->target_; } + +LayoutType TensorType::layout() { return getImpl()->layout_; } + +PrecisionType TensorType::precision() { return getImpl()->precision_; } + +raw_ostream &operator<<(raw_ostream &os, TensorType tensorType) { + os << "TensorType<" << tensorType.target() << ", " << tensorType.layout() + << ", " << tensorType.precision() << ">"; + return os; +} + +TensorMapType TensorMapType::get() { + return Base::get(::infrt::Global::getMLIRContext()); +} + +TensorMapType TensorMapType::get(mlir::MLIRContext *context) { + return Base::get(context); +} + +StringType StringType::get() { + return Base::get(::infrt::Global::getMLIRContext()); +} + +StringType StringType::get(mlir::MLIRContext *context) { + return Base::get(context); +} + +raw_ostream &operator<<(raw_ostream &os, TargetType type) { + switch (type) { + case (TargetType::X86): + os << "X86"; + break; + case (TargetType::CUDA): + os << "CUDA"; + break; + default: + os << "Unsupported"; + } + return os; +} + +raw_ostream &operator<<(raw_ostream &os, LayoutType type) { + switch (type) { + case (LayoutType::NCHW): + os << "NCHW"; + break; + case (LayoutType::NHWC): + os << "NHWC"; + break; + default: + os << "Unsupported"; + } + return os; +} + +raw_ostream &operator<<(raw_ostream &os, PrecisionType type) { + switch (type) { + case (PrecisionType::I32): + os << "I32"; + break; + case (PrecisionType::F32): + os << "F32"; + break; + default: + os << "Unsupported"; + } + return os; +} + +static Type getTensorType(mlir::MLIRContext *context) { + auto t_dialect = Identifier::get("t", context); + return OpaqueType::get(t_dialect, "tensor", context); +} + +static ParseResult parseCreateUninitTensorOp( + OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + auto loc = parser.getCurrentLocation(); + ::mlir::Type outputRawTypes[1]; + ::llvm::ArrayRef<::mlir::Type> outputTypes(outputRawTypes); + + mlir::ArrayAttr shapeAttr; + if (parser.parseAttribute(shapeAttr, + parser.getBuilder().getI64Type(), + "shape", + result.attributes)) + return failure(); + if (parser.parseOptionalAttrDict(result.attributes)) return failure(); + + if (parser.parseArrow()) return failure(); + if (parser.parseType(outputRawTypes[0])) return failure(); + if (!outputRawTypes[0].isa()) + return parser.emitError(loc, "invalid kind of type specified"); + result.addTypes(outputTypes); + return success(); +} + +template +static void printCreateUninitTensorOp(OpAsmPrinter &p, // NOLINT + CreateUninitTensorOp op) { + p << CreateUninitTensorOp::getOperationName(); + p << " "; + p.printAttributeWithoutType(op.shapeAttr()); + p.printOptionalAttrDict(op.getAttrs(), /*elidedAttrs=*/{"shape"}); + p << " -> "; + p << op.getOperation()->getResultTypes(); +} + +// TODO(shibo): can be removed? 
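+
+// A sketch of the textual form the CreateUninitTensorOp parser/printer above
+// round-trip (cf. mlir_tests/dense_tensor.mlir); the tensor type parameters
+// follow the target/layout/precision scheme parsed in infrt_base.cc:
+//   %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.tensor<X86, NCHW, F32>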
+// static ParseResult parseFillTensorWithConstantOp(OpAsmParser& parser, +// OperationState& result) { +// auto loc = parser.getCurrentLocation(); +// ::mlir::OpAsmParser::OperandType inputRawOperands[1]; +// ::llvm::ArrayRef<::mlir::OpAsmParser::OperandType> +// inputOperands(inputRawOperands); +// ::mlir::Type inputRawTypes[1]; +// ::llvm::ArrayRef<::mlir::Type> inputTypes(inputRawTypes); +// +// if (parser.parseOperand(inputRawOperands[0])) return failure(); +// +// if (parser.parseColon()) return failure(); +// if (parser.parseType(inputRawTypes[0])) return failure(); +// if (!inputRawTypes[0].isa()) +// return parser.emitError(loc, "invalid kind of type specified"); +// +// Attribute value_attr; +// if (parser.resolveOperands(inputOperands, inputTypes, loc, result.operands)) +// return failure(); +// if (parser.parseAttribute(value_attr, "value", result.attributes)) return +// failure(); +// return success(); +//} + +// TODO(shibo): can be removed? +// template +// static void printFillTensorWithConstantOp(OpAsmPrinter& p, FillTensorOp op) { +// p << FillTensorOp::getOperationName(); +// p << " "; +// p.printOperand(op.getOperand()); +// p << " : "; +// p << op.getOperation()->getOperandTypes(); +// p << " "; +// p << op.getAttr("value"); +//} + +static ParseResult parseSetTensorOp(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + SmallVector operands; + if (parser.parseOperandList(operands, 1)) return failure(); + + auto tensor_type = getTensorType(result.getContext()); + + Attribute value_attr; + return failure( + parser.resolveOperand(operands[0], tensor_type, result.operands) || + parser.parseAttribute(value_attr, "values", result.attributes)); +} + +template +static void printSetTensorOp(OpAsmPrinter &p, SetTensorOp op) { // NOLINT + p << SetTensorOp::getOperationName() << " "; + p.printOperand(op.getOperand()); + p << " " << op.getAttr("values"); +} + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/dense_tensor.cpp.inc" // NOLINT + +} // namespace infrt::dt diff --git a/paddle/infrt/dialect/dense_tensor.h b/paddle/infrt/dialect/dense_tensor.h new file mode 100644 index 0000000000000..866c62213ab05 --- /dev/null +++ b/paddle/infrt/dialect/dense_tensor.h @@ -0,0 +1,79 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
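+//
+// This header declares the hand-written pieces of the dt (DenseTensor)
+// dialect: the TargetType/LayoutType/PrecisionType enums and the TensorType,
+// TensorMapType and StringType types. Their textual forms, as handled by
+// INFRTDialect::parseType/printType in infrt_base.cc, look like:
+//
+//   !infrt.tensor<X86, NCHW, F32>
+//   !infrt.tensor_map
+//   !infrt.string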
+ +#pragma once +#include +#include +#include + +#include + +using namespace mlir; // NOLINT +namespace infrt::dt { + +namespace detail { +struct TensorTypeStorage; +} // namespace detail + +enum class TargetType : uint8_t { X86, CUDA }; +enum class LayoutType : uint8_t { NCHW, NHWC }; +enum class PrecisionType : uint8_t { I32, F32 }; + +llvm::Optional GetTargetType(mlir::StringRef key); +llvm::Optional GetLayoutType(mlir::StringRef key); +llvm::Optional GetPrecisionType(mlir::StringRef key); + +raw_ostream &operator<<(raw_ostream &os, TargetType type); +raw_ostream &operator<<(raw_ostream &os, LayoutType type); +raw_ostream &operator<<(raw_ostream &os, PrecisionType type); + +class TensorType : public mlir::Type::TypeBase { + public: + using Base::Base; + static TensorType get(TargetType target, + LayoutType layout, + PrecisionType precision); + + TargetType target(); + LayoutType layout(); + PrecisionType precision(); +}; + +raw_ostream &operator<<(raw_ostream &os, TensorType tensorType); + +class TensorMapType : public mlir::Type::TypeBase { + public: + using Base::Base; + static TensorMapType get(); + static TensorMapType get(mlir::MLIRContext *context); +}; + +class StringType + : public mlir::Type::TypeBase { + public: + using Base::Base; + static StringType get(); + static StringType get(mlir::MLIRContext *context); +}; + +#include "paddle/infrt/dialect/dense_tensor_dialect.hpp.inc" + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/dense_tensor.hpp.inc" + +} // namespace infrt::dt diff --git a/paddle/infrt/dialect/dense_tensor.td b/paddle/infrt/dialect/dense_tensor.td new file mode 100644 index 0000000000000..07e70cb2ca1ee --- /dev/null +++ b/paddle/infrt/dialect/dense_tensor.td @@ -0,0 +1,150 @@ +#ifdef DT_OPS +#else +#define DT_OPS + +include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/tensor_shape_base.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +def DT_Dialect : Dialect { + let name = "dt"; + + let description = [{ + The DenseTensor dialect. + }]; + + let cppNamespace = "::infrt::dt"; +} + +class DT_Op traits = []> : + Op; + +class CreateUninitTensorOp + : DT_Op<"create_uninit_tensor." # dtype, [NoSideEffect]> { + let summary = "dt.create_uninit_tensor operation"; + + let description = [{ + An operation that creates an uninitialized tensor. + }]; + + let arguments = (ins I64ArrayAttr:$shape); + let results = (outs TensorType:$output); + + let parser = [{ return infrt::dt::parseCreateUninitTensorOp(parser, result); }]; + let printer = [{ return infrt::dt::printCreateUninitTensorOp(p, *this); }]; +} + + +def ShallowCopyTensorOp + : DT_Op<"shallow_copy_tensor", [NoSideEffect]> { + let summary = "dt.shallow_copy_tensor operation"; + + let description = [{ + An operation that copy a tensor shallowly. + }]; + + let arguments = (ins TensorType:$input); + let results = (outs TensorType:$output); + + let assemblyFormat = "$input attr-dict `:` type($input) `->` type($output)"; +} + + +class FillTensorWithConstantOp : + DT_Op<"fill_tensor_with_constant." # dtype> { + let summary = "dt.fill_tensor_with_constant operation"; + + let description = [{ + An operation that fills an input tensor with a value. + }]; + + let arguments = (ins + TensorType:$input, + AnyAttr:$value + ); + let results = (outs); + + // TODO: can be removed? 
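+  // With the declarative assemblyFormat below, the op is written, e.g., as
+  // (see mlir_tests/tensor_type.mlir):
+  //   dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor<X86, NCHW, F32>) {value=1.0:f32}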
+ //let parser = [{ return infrt::dt::parseFillTensorWithConstantOp(parser, result); }]; + //let printer = [{ return infrt::dt::printFillTensorWithConstantOp(p, *this); }]; + let assemblyFormat = "`(` $input `:` type($input) `)` attr-dict"; +} + +def PrintTensorOp : DT_Op<"print_tensor"> { + let summary = "dt.print_tensor operation"; + + let description = [{ + An operation that prints a tensor. + }]; + + let arguments = (ins TensorType:$input); + let results = (outs); + let assemblyFormat = "`(` $input `:` type($input) `)` attr-dict"; +} + +class SetTensorOp : + DT_Op<"set_tensor_with_constant_values." # dtype> { + let summary = "dt.set_tensor_with_constant_values operation"; + + let description = [{ + An operation that sets an input tensor with given values. + }]; + + let arguments = (ins TensorType); + let results = (outs); + + let parser = [{ return infrt::dt::parseSetTensorOp(parser, result); }]; + let printer = [{ return infrt::dt::printSetTensorOp(p, *this); }]; +} + +def LoadParamsOp : DT_Op<"load_params", [NoSideEffect]> { + let summary = "dt.load_params operation"; + + let description = [{ + An operation that can load tensors to TensorMap. + }]; + + // input path of model params. + let arguments = (ins StringType:$path); + let results = (outs TensorMapType); + + let assemblyFormat = "`(` operands `)` attr-dict"; + let verifier = ?; +} + +def GetParamOp : DT_Op<"get_param", [NoSideEffect]> { + let summary = "dt.get_param operation"; + + let description = [{ + An operation that can get a tensor from TensorMap. + }]; + + // input path of model params. + let arguments = (ins + TensorMapType:$map, + StrAttr:$name + ); + let results = (outs TensorType:$output); + let assemblyFormat = "`(` $map `,` $name `)` attr-dict `->` type($output)"; + let verifier = ?; +} + +def GetTensorShapeOp : DT_Op<"get_tensor_shape", [NoSideEffect]> { + let summary = "dt.get_tensor_shape operation"; + + let description = [{ + An operation that returns the shape of the input tensor. + }]; + + let arguments = (ins TensorType:$input); + let results = (outs TS_Shape:$output); + let assemblyFormat = "$input attr-dict `:` type($input) `->` type($output)"; +} + +foreach dtype = ["ui8", "ui16", "ui32", "ui64", "i32", "f32", "f64", "i64"] in { + def DT_CreateUninitTensorOp_#dtype : CreateUninitTensorOp; + def DT_FillTensorOp_#dtype : FillTensorWithConstantOp; + def DT_SetTensorOp_#dtype : SetTensorOp; +} + +#endif // DT_OPS diff --git a/paddle/infrt/dialect/diagnostic_utils.cc b/paddle/infrt/dialect/diagnostic_utils.cc new file mode 100644 index 0000000000000..a28176e38fdc7 --- /dev/null +++ b/paddle/infrt/dialect/diagnostic_utils.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/diagnostic_utils.h" + +#include + +namespace infrt::dialect { + +struct MyScopedDiagnosicHandler::Impl { + Impl() : diag_stream_(diag_str_) {} + + // String stream to assemble the final error message. 
+ std::string diag_str_; + llvm::raw_string_ostream diag_stream_; + + // A SourceMgr to use for the base handler class. + llvm::SourceMgr source_mgr_; + + // Log detail information. + bool log_info_{}; +}; + +MyScopedDiagnosicHandler::MyScopedDiagnosicHandler(mlir::MLIRContext *ctx, + bool propagate) + : mlir::SourceMgrDiagnosticHandler( + impl_->source_mgr_, ctx, impl_->diag_stream_), + impl_(new Impl) { + setHandler([this](mlir::Diagnostic &diag) { return this->handler(&diag); }); +} + +mlir::LogicalResult MyScopedDiagnosicHandler::handler(mlir::Diagnostic *diag) { + if (diag->getSeverity() != mlir::DiagnosticSeverity::Error && + !impl_->log_info_) + return mlir::success(); + emitDiagnostic(*diag); + impl_->diag_stream_.flush(); + return mlir::failure(true); +} + +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/diagnostic_utils.h b/paddle/infrt/dialect/diagnostic_utils.h new file mode 100644 index 0000000000000..3a8098cf75181 --- /dev/null +++ b/paddle/infrt/dialect/diagnostic_utils.h @@ -0,0 +1,39 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include + +namespace infrt::dialect { + +/** + * A scoped diagnostic handler to help debug MLIR process. + */ +class MyScopedDiagnosicHandler : public mlir::SourceMgrDiagnosticHandler { + public: + MyScopedDiagnosicHandler(mlir::MLIRContext* ctx, bool propagate); + + mlir::LogicalResult handler(mlir::Diagnostic* diag); + + ~MyScopedDiagnosicHandler(); + + private: + class Impl; + std::unique_ptr impl_; +}; + +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/dialect.cc b/paddle/infrt/dialect/dialect.cc new file mode 100644 index 0000000000000..cbcd5d0f0fa78 --- /dev/null +++ b/paddle/infrt/dialect/dialect.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace infrt::hlir::dialect { + +class CinnDialect : public ::mlir::Dialect { + public: + explicit CinnDialect(::mlir::MLIRContext* ctx); + + //! 
We should register this function in dialect + static llvm::StringRef getDialectNamespace() { + return "infrt::hlir::dialect"; + } +}; + +} // namespace infrt::hlir::dialect diff --git a/paddle/infrt/dialect/infrt_base.cc b/paddle/infrt/dialect/infrt_base.cc new file mode 100644 index 0000000000000..b28ad5ad4b5a5 --- /dev/null +++ b/paddle/infrt/dialect/infrt_base.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/infrt_base.h" + +#include "paddle/infrt/dialect/basic_kernels.h" +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/test_kernels.h" + +namespace infrt::dialect { + +// ----INFRTDialect definition begin---- +void INFRTDialect::initialize() { + allowUnknownTypes(); + allowUnknownOperations(); + + addTypes(); + addTypes(); + addTypes(); + + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/basic_kernels.cpp.inc" + >(); + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/test_kernels.cpp.inc" + >(); +} + +mlir::Type INFRTDialect::parseType(mlir::DialectAsmParser &parser) const { + llvm::StringRef keyword; + if (parser.parseKeyword(&keyword)) return mlir::Type(); + // parse TensorType, for example: !infrt.tensor + if (keyword == "tensor") { + llvm::StringRef target; + llvm::StringRef layout; + llvm::StringRef precision; + + // parse "<" + if (parser.parseLess()) return mlir::Type(); + // parse target + if (parser.parseKeyword(&target)) return mlir::Type(); + auto targetType = infrt::dt::GetTargetType(target); + if (!targetType) { + parser.emitError(parser.getCurrentLocation(), "unknown target type: ") + << target; + return mlir::Type(); + } + + // parse "," + if (parser.parseComma()) return mlir::Type(); + // parse layout + if (parser.parseKeyword(&layout)) return mlir::Type(); + auto layoutType = infrt::dt::GetLayoutType(layout); + if (!layoutType) { + parser.emitError(parser.getCurrentLocation(), "unknown layout type: ") + << layout; + return mlir::Type(); + } + + // parse "," + if (parser.parseComma()) return mlir::Type(); + // parse precision + if (parser.parseKeyword(&precision)) return mlir::Type(); + auto precisionType = infrt::dt::GetPrecisionType(precision); + if (!precisionType) { + parser.emitError(parser.getCurrentLocation(), "unknown precision type: ") + << precision; + return mlir::Type(); + } + + // parse ">" + if (parser.parseGreater()) return mlir::Type(); + + return infrt::dt::TensorType::get(*targetType, *layoutType, *precisionType); + } + // parse TensorMapType, for example: !infrt.tensor_map + if (keyword == "tensor_map") { + return infrt::dt::TensorMapType::get(); + } + // parse StringType, for example: !infrt.string + if (keyword == "string") { + return infrt::dt::StringType::get(); + } + + parser.emitError(parser.getCurrentLocation(), "unknown infrt type: ") + << keyword; + return mlir::Type(); +} + +void INFRTDialect::printType(mlir::Type type, + mlir::DialectAsmPrinter 
&printer) const { + // print TensorType, for example: !infrt.tensor + if (type.isa()) { + auto tensorType = type.cast(); + printer << "tensor<" << tensorType.target() << ", " << tensorType.layout() + << ", " << tensorType.precision() << ">"; + return; + } + // print TensorMapType, for example: !infrt.tensor_map + if (type.isa()) { + printer << "tensor_map"; + return; + } + // print StringType, for example: !infrt.string + if (type.isa()) { + printer << "string"; + return; + } + llvm_unreachable("unknown infrt type."); +} + +// ----INFRTDialect definition end---- + +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/infrt_base.h b/paddle/infrt/dialect/infrt_base.h new file mode 100644 index 0000000000000..1398378957069 --- /dev/null +++ b/paddle/infrt/dialect/infrt_base.h @@ -0,0 +1,73 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/infrt/dialect/infrt_base.hpp.inc" + +namespace infrt::dialect { + +class INFRTDialect : public ::mlir::Dialect { + explicit INFRTDialect(::mlir::MLIRContext *context) + : ::mlir::Dialect(getDialectNamespace(), + context, + ::mlir::TypeID::get()) { + initialize(); + } + + // parse types registered to the dialect. + mlir::Type parseType(mlir::DialectAsmParser &parser) const override; + // print types registered to the dialect. + void printType(mlir::Type type, + mlir::DialectAsmPrinter &printer) const override; + + void initialize(); + friend class ::mlir::MLIRContext; + + public: + static ::llvm::StringRef getDialectNamespace() { return "infrt"; } +}; + +} // namespace infrt::dialect + +namespace mlir { + +template +static mlir::IntegerAttr createI32Attr(mlir::OpBuilder &b, // NOLINT + mlir::Location loc, + T constant) { + return b.getIntegerAttr(b.getI32Type(), constant); +} + +static mlir::ValueRange cvtValueToValueRange(const mlir::Value &operand) { + return mlir::ValueRange(operand); +} + +static mlir::ValueRange concatTwoValueRange(mlir::ValueRange operand_0, + mlir::ValueRange operand_1) { + mlir::SmallVector<::mlir::Value, 4> operands; + operands.append(operand_0.begin(), operand_0.end()); + operands.append(operand_1.begin(), operand_1.end()); + return operands; +} + +} // namespace mlir diff --git a/paddle/infrt/dialect/infrt_base.td b/paddle/infrt/dialect/infrt_base.td new file mode 100644 index 0000000000000..61dcfe5bfb1c3 --- /dev/null +++ b/paddle/infrt/dialect/infrt_base.td @@ -0,0 +1,42 @@ +#ifndef INFRT_BASE +#define INFRT_BASE + +include "mlir/IR/OpBase.td" + +def INFRT_Dialect : Dialect { + let name = "infrt"; + + let description = [{ + The INFRT host dialect. 
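+
+    Besides the infrt.* basic kernels, it provides the types parsed and
+    printed in infrt_base.cc:
+
+      !infrt.tensor<X86, NCHW, F32>
+      !infrt.tensor_map
+      !infrt.string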
+ }]; + + let cppNamespace = "::infrt::dialect"; +} + +// Type definitions +def StringType : + Type()">, "!infrt.string type">, + BuildableType<"$_builder.getType<::infrt::dt::StringType>()">; + +def TensorType : + Type()">, "!infrt.tensor type">; + +def TensorMapType : + Type()">, "!infrt.tensor_map type">, + BuildableType<"$_builder.getType<::infrt::dt::TensorMapType>()">; + +def BufferType : OpaqueType<"b", "buffer", "buffer">; + +class INFRT_createI32Attr : NativeCodeCall< + "mlir::createI32Attr($_builder, $_loc, " # value # ")">; + +def INFRT_cvtValueToValueRange : NativeCodeCall< + "mlir::cvtValueToValueRange($0)">; + +def INFRT_concatTwoValueRange : NativeCodeCall< + "mlir::concatTwoValueRange($0, $1)">; + +class IsBoolAttrEq : Constraint< + CPred<"($0.getValue() ==" # value # ")">, + "Bool attrbute value constraint">; +#endif // INFRT_BASE diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_infrt_dialects.cc new file mode 100644 index 0000000000000..4bc2bf70942d2 --- /dev/null +++ b/paddle/infrt/dialect/init_infrt_dialects.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/init_infrt_dialects.h" + +#include + +#include "paddle/infrt/dialect/basic_kernels.h" +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/infrt_base.h" +#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/tensor_shape.h" + +namespace infrt { + +void RegisterCinnDialects(mlir::DialectRegistry& registry) { // NOLINT + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); +} + +} // namespace infrt diff --git a/paddle/infrt/dialect/init_infrt_dialects.h b/paddle/infrt/dialect/init_infrt_dialects.h new file mode 100644 index 0000000000000..50caca018980d --- /dev/null +++ b/paddle/infrt/dialect/init_infrt_dialects.h @@ -0,0 +1,23 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "mlir/IR/Dialect.h" + +namespace infrt { + +void RegisterCinnDialects(mlir::DialectRegistry& registry); // NOLINT + +} // namespace infrt diff --git a/paddle/infrt/dialect/mlir_loader.cc b/paddle/infrt/dialect/mlir_loader.cc new file mode 100644 index 0000000000000..8df8727dbe2b0 --- /dev/null +++ b/paddle/infrt/dialect/mlir_loader.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/mlir_loader.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "paddle/infrt/dialect/diagnostic_utils.h" +#include "paddle/infrt/dialect/init_infrt_dialects.h" + +namespace infrt::dialect { + +mlir::OwningModuleRef LoadMlirSource(mlir::MLIRContext* context, + const std::string& mlir_source) { + context->allowUnregisteredDialects(); + RegisterCinnDialects(context->getDialectRegistry()); + context->getDialectRegistry().insert(); + + mlir::ScopedDiagnosticHandler scope_handler( + context, [](mlir::Diagnostic& diag) { + if (diag.getSeverity() != mlir::DiagnosticSeverity::Error) + return mlir::success(); + LOG(INFO) << "diag: " << diag.str(); + return mlir::failure(true); + }); + + auto res = mlir::parseSourceString( + llvm::StringRef(mlir_source.data(), mlir_source.length()), context); + CHECK(*res) << "failed to parse MLIR string"; + return res; +} + +mlir::OwningModuleRef LoadMlirFile(const std::string& file_name, + mlir::MLIRContext* context) { + context->allowUnregisteredDialects(); + RegisterCinnDialects(context->getDialectRegistry()); + context->getDialectRegistry().insert(); + + mlir::ScopedDiagnosticHandler scope_handler( + context, [](mlir::Diagnostic& diag) { + if (diag.getSeverity() != mlir::DiagnosticSeverity::Error) + return mlir::success(); + LOG(INFO) << "diag: " << diag.str(); + return mlir::failure(true); + }); + + return mlir::parseSourceFile(std::string(file_name), context); +} + +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/mlir_loader.h b/paddle/infrt/dialect/mlir_loader.h new file mode 100644 index 0000000000000..092da7d9ce03f --- /dev/null +++ b/paddle/infrt/dialect/mlir_loader.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
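+//
+// A minimal usage sketch, mirroring mlir_loader_test.cc:
+//
+//   mlir::MLIRContext context;
+//   auto module = infrt::dialect::LoadMlirSource(&context, mlir_source);
+//   module->verify();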
+ +#pragma once + +#include +#include +#include + +#include + +namespace infrt::dialect { + +mlir::OwningModuleRef LoadMlirSource(mlir::MLIRContext* context, + const std::string& mlir_source); +mlir::OwningModuleRef LoadMlirFile(const std::string& file_name, + mlir::MLIRContext* context); + +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/mlir_loader_test.cc b/paddle/infrt/dialect/mlir_loader_test.cc new file mode 100644 index 0000000000000..1b622d585ad8e --- /dev/null +++ b/paddle/infrt/dialect/mlir_loader_test.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/mlir_loader.h" + +#include +#include +#include +#include +#include + +#include + +#include "paddle/infrt/dialect/init_infrt_dialects.h" + +namespace infrt::dialect { + +TEST(MlirLoader, basic) { + mlir::MLIRContext context; + + auto source = R"ROC( +func @main() -> f32 { + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %value = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + + "infrt.print.f32"(%v0) : (f32) -> () + + infrt.return %value : f32 +} +)ROC"; + + auto module = LoadMlirSource(&context, source); + module->verify(); + + LOG(INFO) << "module name: " << module->getOperationName().data(); + for (auto func : module->getOps()) { + LOG(INFO) << "get func " << func.getName().str(); + int num_args = func.getNumArguments(); + for (int i = 0; i < num_args; i++) { + LOG(INFO) << "arg: " << func.getArgument(i).getArgNumber(); + } + } +} + +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/mlir_tests/basic.mlir b/paddle/infrt/dialect/mlir_tests/basic.mlir new file mode 100644 index 0000000000000..84b9b0fbd71cb --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/basic.mlir @@ -0,0 +1,40 @@ +// CHECK-LABEL: @basic_f32 +func @basic_f32() -> f32 { + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %value = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + + // CHECK-NEXT: 3 + "infrt.print.f32"(%value) : (f32) -> () + + infrt.return %value : f32 +} + +/// ================================================================ +/// @caller call the other function @callee +func @callee.add.f32(%x : f32, %y : f32, %y1 : f32) -> f32 { + %z = "infrt.add.f32"(%x, %y) : (f32, f32) -> f32 + %z1 = "infrt.add.f32"(%z, %y1) : (f32, f32) -> f32 + infrt.return %z1 : f32 +} + +// CHECK-LABEL: @caller.add.f32 +func @caller.add.f32() -> f32 { + %x = infrt.constant.f32 1.0 + %y = infrt.constant.f32 2.0 + %y1 = infrt.constant.f32 3.0 + %z = infrt.call @callee.add.f32(%x, %y, %y1) : (f32, f32, f32) -> f32 + + // CHECK-NEXT: 6 + "infrt.print.f32"(%z) : (f32) -> () + infrt.return %z : f32 +} +/// <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +// CHECK-LABEL: @string_test +func @string_test() { + %path = infrt.get_string("this is get_string op.") + // CHECK-LABEL: string = this is get_string op. 
+ infrt.print_string(%path) + infrt.return +} diff --git a/paddle/infrt/dialect/mlir_tests/benchmark.mlir b/paddle/infrt/dialect/mlir_tests/benchmark.mlir new file mode 100644 index 0000000000000..8b4530689df7e --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/benchmark.mlir @@ -0,0 +1,23 @@ +// CHECK-LABEL: @benchmark +func @benchmark() { + // CHECK-LABEL: BM:add.f32:Count: 3 + // CHECK-LABEL: BM:add.f32:Duration(ns) + // CHECK-LABEL: BM:add.f32:Time Min(ns) + // CHECK-LABEL: BM:add.f32:Time 50%(ns) + // CHECK-LABEL: BM:add.f32:Time 95%(ns) + // CHECK-LABEL: BM:add.f32:Time 99%(ns) + // CHECK-LABEL: BM:add.f32:CPU Min(ns) + // CHECK-LABEL: BM:add.f32:CPU 50%(ns) + // CHECK-LABEL: BM:add.f32:CPU 95%(ns) + // CHECK-LABEL: BM:add.f32:CPU 99%(ns) + // CHECK-LABEL: BM:add.f32:CPU utilization(percent) + infrt.benchmark "add.f32"() duration_secs = 1, max_count = 3, num_warmup_runs = 3 + { + %0 = infrt.constant.f32 1.0 + %1 = infrt.constant.f32 2.0 + %res = "infrt.add.f32"(%0, %1) : (f32, f32) -> f32 + "infrt.print.f32"(%res) : (f32) -> () + infrt.return %res : f32 + } + infrt.return +} diff --git a/paddle/infrt/dialect/mlir_tests/dense_tensor.mlir b/paddle/infrt/dialect/mlir_tests/dense_tensor.mlir new file mode 100644 index 0000000000000..cca7445cd58d8 --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/dense_tensor.mlir @@ -0,0 +1,22 @@ +func @dense_shape0() { + %shape = ts.build_shape [1:i64, 57:i64] + %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.tensor + + infrt.return +} + +func @predict(%a: !infrt.tensor, %b: !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) { + %a0 = dt.shallow_copy_tensor %a : !infrt.tensor -> !infrt.tensor + %b0 = dt.shallow_copy_tensor %b : !infrt.tensor -> !infrt.tensor + + infrt.return %a0, %b0: !infrt.tensor, !infrt.tensor +} + + +func @main() { + %shape = ts.build_shape [1:i64, 57:i64] + %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.tensor + + %b, %c = infrt.call @predict(%a, %a) : (!infrt.tensor, !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) + infrt.return +} diff --git a/paddle/infrt/dialect/mlir_tests/paddle_ops.mlir b/paddle/infrt/dialect/mlir_tests/paddle_ops.mlir new file mode 100644 index 0000000000000..1855a68dd91c3 --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/paddle_ops.mlir @@ -0,0 +1,8 @@ +func @ops() { + %a = pd.Feed() : tensor + %b = pd.Feed() : tensor + + %c = "pd.Matmul"(%a, %b) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor + + infrt.return +} diff --git a/paddle/infrt/dialect/mlir_tests/rewrite.mlir b/paddle/infrt/dialect/mlir_tests/rewrite.mlir new file mode 100644 index 0000000000000..c984fda3e6211 --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/rewrite.mlir @@ -0,0 +1,24 @@ +// CHECK-LABEL: @main +func @main() -> tensor { + %a = "pd.Feed"() : () -> tensor + %b = "pd.Feed"() : () -> tensor + %bias = "pd.Feed"() : () -> tensor + + %b1 = "pd.Feed"() : () -> tensor + %b2 = "pd.Feed"() : () -> tensor + %bias1 = "pd.Feed"() : () -> tensor + %bias2 = "pd.Feed"() : () -> tensor + + %c = "pd.Matmul"(%a, %b) {transpose_y=false} : (tensor, tensor) -> tensor + %d = "pd.ElementwiseAdd"(%c, %bias) {axis=1:i32} : (tensor, tensor) -> tensor + %e = "pd.Relu6"(%d) {} : (tensor) -> tensor + + %c1 = "pd.Matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (tensor, tensor) -> tensor + %d1 = "pd.ElementwiseAdd"(%c1, %bias1) {axis=1:i32} : (tensor, tensor) -> tensor + %e1 = "pd.Relu"(%d1) {} : (tensor) -> tensor + + %c2 = "pd.Matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (tensor, tensor) 
-> tensor + %d2 = "pd.ElementwiseAdd"(%c2, %bias2) {axis=1:i32} : (tensor, tensor) -> tensor + %e2 = "pd.Relu"(%d2) {} : (tensor) -> tensor + infrt.return %e2 : tensor +} \ No newline at end of file diff --git a/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir b/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir new file mode 100644 index 0000000000000..d41d4b2f9f6bc --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir @@ -0,0 +1,15 @@ +// CHECK-LABEL: @main +func @main() -> tensor { + %a = "pd.Feed"() : () -> tensor + %filter = "pd.Constant"(){value = dense<1.000000e+00> : tensor<3x64x3x3xf32>} : () -> tensor<3x64x3x3xf32> + %bias = "pd.Constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> + + %scale = "pd.Constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> + %bias2 = "pd.Constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> + %mean = "pd.Constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> + %var = "pd.Constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> + + %c = "pd.conv2d"(%a, %filter, %bias) {} : (tensor, tensor<3x64x3x3xf32>, tensor<64xf32>) -> tensor + %d = "pd.batch_norm"(%c, %scale, %bias2, %mean, %var) {} : (tensor, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> tensor + infrt.return %d : tensor +} \ No newline at end of file diff --git a/paddle/infrt/dialect/mlir_tests/tensor_map.mlir b/paddle/infrt/dialect/mlir_tests/tensor_map.mlir new file mode 100644 index 0000000000000..111c01c9a108b --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/tensor_map.mlir @@ -0,0 +1,31 @@ +// CHECK-LABEL: @predict +func @predict(%input:!infrt.tensor, %map: !infrt.tensor_map) -> (!infrt.tensor) { + %w = dt.get_param(%map, "create_parameter_0.w_0") -> !infrt.tensor + %bias = dt.get_param(%map, "create_parameter_1.w_0") -> !infrt.tensor + + %out = dt.create_uninit_tensor.f32 [3, 3] -> !infrt.tensor + + // fc + "external.matmul"(%input, %w, %out) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + "external.sigmoid"(%out, %out) {}: (!infrt.tensor, !infrt.tensor) -> () + //dt.print_tensor (%out : !infrt.tensor) + + infrt.return %out : !infrt.tensor +} + +// CHECK-LABEL: @main +func @main() { + %input = dt.create_uninit_tensor.f32 [3, 3] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%input : !infrt.tensor) {value=1.0:f32} + + %path = infrt.get_string("/infrt/build/paddle/paddle_1.8_fc_model") + // CHECK-LABEL: loading params + %map = dt.load_params(%path) + + %out = infrt.call @predict(%input, %map): (!infrt.tensor, !infrt.tensor_map) -> (!infrt.tensor) + dt.print_tensor (%out : !infrt.tensor) + + infrt.return +} + diff --git a/paddle/infrt/dialect/mlir_tests/tensor_shape.mlir b/paddle/infrt/dialect/mlir_tests/tensor_shape.mlir new file mode 100644 index 0000000000000..504b5b36be038 --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/tensor_shape.mlir @@ -0,0 +1,5 @@ +func @build_tensor1() { + %a = ts.build_shape [1:i64, 57:i64, 92:i64] + ts.print_shape %a + infrt.return +} diff --git a/paddle/infrt/dialect/mlir_tests/tensor_type.mlir b/paddle/infrt/dialect/mlir_tests/tensor_type.mlir new file mode 100644 index 0000000000000..c331097ab1072 --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/tensor_type.mlir @@ -0,0 +1,9 @@ +// CHECK-LABEL: test_tensor_type +func @test_tensor_type() { + %a = 
dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor) {value=1.0:f32} + // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + dt.print_tensor (%a : !infrt.tensor) + + infrt.return +} diff --git a/paddle/infrt/dialect/ops.td b/paddle/infrt/dialect/ops.td new file mode 100644 index 0000000000000..264134a447c63 --- /dev/null +++ b/paddle/infrt/dialect/ops.td @@ -0,0 +1,6 @@ +include "mlir/IR/OpBase.td" +include "paddle/infrt/dialect/infrt_base.td" + + +class INFRT_Op traits = []> : + Op; diff --git a/paddle/infrt/dialect/opt.cc b/paddle/infrt/dialect/opt.cc new file mode 100644 index 0000000000000..d90d25230d0c2 --- /dev/null +++ b/paddle/infrt/dialect/opt.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/init_infrt_dialects.h" +#include "paddle/infrt/dialect/mlir_loader.h" + +int main(int argc, char **argv) { + mlir::MLIRContext *context = infrt::Global::getMLIRContext(); + + auto ®istry = context->getDialectRegistry(); + infrt::RegisterCinnDialects(registry); + + mlir::registerCanonicalizerPass(); + + return mlir::failed( + mlir::MlirOptMain(argc, argv, "INFRT mlir pass driver", registry)); +} diff --git a/paddle/infrt/dialect/pd_op_base.td b/paddle/infrt/dialect/pd_op_base.td new file mode 100644 index 0000000000000..af53df113dfb3 --- /dev/null +++ b/paddle/infrt/dialect/pd_op_base.td @@ -0,0 +1,77 @@ +// This file defines some basic elements of Paddle(alias pd) dialect. +// We learned much from TensorFlow mlir dialect https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td + +#ifndef PD_OP_BASE +#define PD_OP_BASE + +include "mlir/IR/OpBase.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +def PD_Dialect : Dialect { + let name = "pd"; + + let description = [{ + The PaddlePaddle dialect. + + This dialect contains the PaddlePaddle operators. 
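+
+    For example (see mlir_tests/paddle_ops.mlir; tensor element types elided):
+
+      %c = "pd.Matmul"(%a, %b) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor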
+ }]; + + let cppNamespace = "::mlir::pd"; +} + +class PD_Op traits = []> : + Op; + + +class PD_PaddleAttr : + Attr()">, + "PaddlePaddle " # description # " attribute">; + + +//===----------------------------------------------------------------------===// +// PaddlePaddle type definitions +//===----------------------------------------------------------------------===// + +def PD_PDDialectType : Type()">, "PaddlePaddle type">; + +class PD_PaddleType : + Type()">, + "Paddle " # description # " type">, + BuildableType<"getType()">; + +//===----------------------------------------------------------------------===// +// Integer types +def PD_Bool : AnyTypeOf<[I<1>], "bool">; +def PD_Int8 : AnyTypeOf<[I8], "8-bit integer">; +def PD_Int16 : AnyTypeOf<[I16], "16-bit integer">; +def PD_Int32 : AnyTypeOf<[I32], "32-bit integer">; +def PD_Int64 : AnyTypeOf<[I64], "64-bit integer">; + +def PD_UInt8 : AnyTypeOf<[UI<8>], "8-bit unsigned integer">; +def PD_UInt16 : AnyTypeOf<[UI<16>], "16-bit unsigned integer">; +def PD_UInt32 : AnyTypeOf<[UI<32>], "32-bit unsigned integer">; +def PD_UInt64 : AnyTypeOf<[UI<64>], "64-bit unsigned integer">; + +def PD_SInt : AnyTypeOf<[PD_Int8, PD_Int16, PD_Int32, PD_Int64], "signed integer">; +def PD_UInt : AnyTypeOf<[PD_UInt8, PD_UInt16, PD_UInt32, PD_UInt64], "unsigned integer">; +def PD_Int : AnyTypeOf<[PD_SInt, PD_UInt], "integer">; + +// Float types +def PD_Float16 : AnyTypeOf<[F16], "16-bit float">; +def PD_Float32 : AnyTypeOf<[F32], "32-bit float">; +def PD_Float64 : AnyTypeOf<[F64], "64-bit float">; + +def PD_Float : AnyTypeOf<[PD_Float16, PD_Float32, PD_Float64], "floating-point">; + + +// Tensor types + +def PD_ElementType : Type, + "pd.dtype">; + +def PD_Tensor : TensorOf<[PD_ElementType]>; + + +#endif // PD_OP_BASE diff --git a/paddle/infrt/dialect/pd_ops.cc b/paddle/infrt/dialect/pd_ops.cc new file mode 100644 index 0000000000000..7ca07dd5fcbba --- /dev/null +++ b/paddle/infrt/dialect/pd_ops.cc @@ -0,0 +1,177 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
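+//
+// Hand-written parts of the pd dialect: dialect registration, constant
+// materialization, InferTypeOpInterface hooks, canonicalization pattern
+// registration and constant folders. For example, ElementwiseAdd::fold below
+// adds two dense float constants element-wise, so a pd.ElementwiseAdd whose
+// operands are pd.Constant ops can be replaced by a single constant at
+// compile time.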
+ +#include "paddle/infrt/dialect/pd_ops.h" + +#include "mlir/IR/Matchers.h" +#include "mlir/IR/PatternMatch.h" +#include "paddle/infrt/dialect/infrt_base.h" + +namespace mlir { +namespace pd { + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd_ops.hpp.inc" +#undef GET_OP_CLASSES + +PaddleDialect::PaddleDialect(MLIRContext *context) + : Dialect("pd", context, TypeID::get()) { + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT + >(); +#undef GET_OP_LIST +} + +mlir::Operation *PaddleDialect::materializeConstant(mlir::OpBuilder &builder, + mlir::Attribute value, + mlir::Type type, + mlir::Location loc) { + return builder.create(loc, value); +} + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT +#undef GET_OP_CLASSES + +#include "paddle/infrt/dialect/rewrite.hpp.inc" // NOLINT + +void ConstantOp::build(OpBuilder &builder, + OperationState &state, + Attribute value) { + if (auto elem_attr = value.dyn_cast()) { + return ConstantOp::build(builder, state, elem_attr); + } else if (value.isa()) { + ShapedType type = RankedTensorType::get(/*shape=*/{}, value.getType()); + state.addAttribute("value", DenseElementsAttr::get(type, value)); + state.addTypes(type); + return; + } + llvm_unreachable("unsupported attribute type for building pd.constant"); +} + +LogicalResult ConstantOp::inferReturnTypes( + MLIRContext *context, + Optional location, + ValueRange operands, + DictionaryAttr attributes, + RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + inferredReturnTypes.push_back(attributes.get("value").getType()); + return success(); +} +::mlir::OpFoldResult ConstantOp::fold( + ::llvm::ArrayRef<::mlir::Attribute> operands) { + return value(); +} + +LogicalResult ElementwiseAdd::inferReturnTypes( + MLIRContext *context, + Optional location, + ValueRange operands, + DictionaryAttr attributes, + RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + inferredReturnTypes.push_back(operands[0].getType()); + return success(); +} +void ElementwiseAdd::getCanonicalizationPatterns( + ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + results.insert(context); +} + +::mlir::OpFoldResult ElementwiseAdd::fold( + llvm::ArrayRef operands) { + if (getElementTypeOrSelf(getType()).isa()) { + if (!operands[0] || !operands[1]) return {}; + DenseElementsAttr lhs = operands[0].dyn_cast(); + DenseElementsAttr rhs = operands[1].dyn_cast(); + if (!lhs || !rhs) return {}; + ShapedType type = getType().template cast(); + if (!type.hasStaticShape()) return {}; + Type etype = type.getElementType(); + if (!etype.isa()) return {}; + SmallVector values; + values.reserve(lhs.getNumElements()); + for (const auto zip : + llvm::zip(lhs.getValues(), rhs.getValues())) { + values.push_back( + std::plus()(std::get<0>(zip), std::get<1>(zip))); + } + return DenseElementsAttr::get(type, values); + } + return {}; +} + +LogicalResult ElementwiseDiv::inferReturnTypes( + MLIRContext *context, + Optional location, + ValueRange operands, + DictionaryAttr attributes, + RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + inferredReturnTypes.push_back(operands[0].getType()); + return success(); +} + +LogicalResult ElementwiseMul::inferReturnTypes( + MLIRContext *context, + Optional location, + ValueRange operands, + DictionaryAttr attributes, + RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + inferredReturnTypes.push_back(operands[0].getType()); + return success(); +} + +LogicalResult 
ElementwiseSub::inferReturnTypes( + MLIRContext *context, + Optional location, + ValueRange operands, + DictionaryAttr attributes, + RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + inferredReturnTypes.push_back(operands[0].getType()); + return success(); +} + +LogicalResult MulOp::inferReturnTypes( + MLIRContext *context, + Optional location, + ValueRange operands, + DictionaryAttr attributes, + RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + inferredReturnTypes.push_back(operands[0].getType()); + return success(); +} + +void ReluOp::getCanonicalizationPatterns( + ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + results.insert(context); +} + +void FusedRepeatedFCRelu::getCanonicalizationPatterns( + ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + results.insert(context); +} + +void BatchNormOp::getCanonicalizationPatterns( + ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + results.insert(context); +} + +} // namespace pd +} // namespace mlir diff --git a/paddle/infrt/dialect/pd_ops.h b/paddle/infrt/dialect/pd_ops.h new file mode 100644 index 0000000000000..d09b6032257a2 --- /dev/null +++ b/paddle/infrt/dialect/pd_ops.h @@ -0,0 +1,57 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "mlir/Dialect/Traits.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/Module.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/Interfaces/CallInterfaces.h" +#include "mlir/Interfaces/DerivedAttributeOpInterface.h" +#include "mlir/Interfaces/InferTypeOpInterface.h" +#include "mlir/Interfaces/LoopLikeInterface.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" + +namespace mlir { +namespace pd { + +class PaddleDialect : public Dialect { + public: + explicit PaddleDialect(MLIRContext* context); + + static StringRef getDialectNamespace() { return "pd"; } + + /// A hook used to materialize constant values with the given type. 
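+  /// The folding framework calls this when an operation (e.g. a
+  /// pd.ElementwiseAdd with constant operands) folds to an Attribute, so the
+  /// Attribute can be turned back into a pd.Constant operation.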
+ Operation* materializeConstant(OpBuilder& builder, + Attribute value, + Type type, + Location loc) override; + + Type parseType(DialectAsmParser& parser) const override { + return Dialect::parseType(parser); + } + void printType(Type type, DialectAsmPrinter& printer) const override { + Dialect::printType(type, printer); + } +}; + +} // namespace pd +} // namespace mlir diff --git a/paddle/infrt/dialect/pd_ops.td b/paddle/infrt/dialect/pd_ops.td new file mode 100644 index 0000000000000..9e906ad0c02cc --- /dev/null +++ b/paddle/infrt/dialect/pd_ops.td @@ -0,0 +1,182 @@ +#ifndef PD_OPS +#define PD_OPS + +include "mlir/Interfaces/InferTypeOpInterface.td" +include "mlir/Interfaces/LoopLikeInterface.td" +include "mlir/IR/OpBase.td" +include "paddle/infrt/dialect/pd_op_base.td" + +def PD_FeedOp : PD_Op<"Feed", [NoSideEffect]> { + let summary = "Feed Op"; + + let description = [{ + Feed a tensor into the model. + }]; + + let arguments = (ins); + let results = (outs PD_Tensor:$out); + + let assemblyFormat = [{ + `(` `)` attr-dict `:` type($out) + }]; +} + +def PD_ConstantOp : PD_Op<"Constant", [NoSideEffect, ConstantLike, DeclareOpInterfaceMethods, AllTypesMatch<["value", "output"]>]> { + let summary = "constant Op"; + let description = [{}]; + + let arguments = (ins ElementsAttr:$value); + let results = (outs PD_Tensor:$output); + let hasFolder = 1; + + let builders = [ + OpBuilder<"OpBuilder &builder, OperationState &state, Attribute value">, + ]; +} + +def PD_AbsOp : PD_Op<"Abs", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Computes the absolute value of a tensor"; + + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$x); + let results = (outs PD_Tensor:$y); +} + +def PD_SqrtOp : PD_Op<"sqrt", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Computes the sqrt value of a tensor"; + + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$x); + let results = (outs PD_Tensor:$y); +} + +def PD_ReluOp : PD_Op<"Relu", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Computes the Relu of a tensor"; + + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$x); + let results = (outs PD_Tensor:$y); + let hasCanonicalizer = 1; +} + +def PD_Relu6Op : PD_Op<"Relu6", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Computes the Relu6 of a tensor"; + + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$x); + let results = (outs PD_Tensor:$y); +} + +def PD_ElementwiseAdd : PD_Op<"ElementwiseAdd", [NoSideEffect, Commutative, DeclareOpInterfaceMethods]> { + let summary = "ElementwiseAdd Op"; + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$x, PD_Tensor:$y, DefaultValuedAttr:$axis); + let results = (outs PD_Tensor:$out); + let hasCanonicalizer = 1; + let hasFolder = 1; +} + +def PD_ElementwiseSub : PD_Op<"ElementwiseSub", [NoSideEffect, DeclareOpInterfaceMethods]> { + let summary = "ElementwiseSub Op"; + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$x, PD_Tensor:$y, DefaultValuedAttr:$axis); + let results = (outs PD_Tensor:$out); +} + +def PD_ElementwiseMul : PD_Op<"ElementwiseMul", [NoSideEffect, Commutative, DeclareOpInterfaceMethods]> { + let summary = "ElementwiseMul Op"; + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$x, PD_Tensor:$y, DefaultValuedAttr:$axis); + let results = (outs PD_Tensor:$out); +} + +def PD_ElementwiseDiv : PD_Op<"ElementwiseDiv", [NoSideEffect, DeclareOpInterfaceMethods]> { + let summary = "ElementwiseDiv Op"; + let description = [{ + }]; + 
+ let arguments = (ins PD_Tensor:$x, PD_Tensor:$y, DefaultValuedAttr:$axis);
+ let results = (outs PD_Tensor:$out);
+}
+
+def PD_MatmulOp : PD_Op<"Matmul", [NoSideEffect]> {
+ let summary = "Computes the matrix multiplication result of two tensors";
+ let description = [{
+ }];
+
+ let arguments = (ins PD_Tensor:$x, PD_Tensor:$y,
+ DefaultValuedAttr:$transpose_x,
+ DefaultValuedAttr:$transpose_y,
+ DefaultValuedAttr:$alpha);
+ let results = (outs PD_Tensor:$out);
+
+ //let hasCanonicalizer = 1;
+}
+
+def PD_MulOp : PD_Op<"mul", [NoSideEffect, DeclareOpInterfaceMethods]> {
+ let summary = "paddle mul op";
+ let description = [{}];
+
+ let arguments = (ins PD_Tensor:$x, PD_Tensor:$y);
+ let results = (outs PD_Tensor:$out);
+
+ //let hasCanonicalizer = 1;
+}
+
+def PD_Conv2dOp : PD_Op<"conv2d", [NoSideEffect]> {
+ let summary = "paddle conv2d operation";
+ let description = [{
+ }];
+
+ let arguments = (ins PD_Tensor:$Input, PD_Tensor:$Filter, PD_Tensor:$Bias);
+ let results = (outs PD_Tensor:$Output);
+
+ //let hasCanonicalizer = 1;
+}
+
+def PD_BatchNormOp : PD_Op<"batch_norm", [NoSideEffect]> {
+ let summary = "paddle batch_norm operation";
+ let description = [{
+ }];
+
+ let arguments = (ins PD_Tensor:$X, PD_Tensor:$Scale, PD_Tensor:$Bias,
+ PD_Tensor:$Mean, PD_Tensor:$Variance,
+ DefaultValuedAttr:$epsilon);
+ let results = (outs PD_Tensor:$Y);
+
+ let hasCanonicalizer = 1;
+}
+
+def PD_FusedFC : PD_Op<"FC", [NoSideEffect]> {
+ let summary = "Computes the Fully Connected result of two tensors";
+ let description = [{
+ }];
+
+ let arguments = (ins PD_Tensor:$input, PD_Tensor:$w, PD_Tensor:$bias, DefaultValuedAttr:$in_num_col_dims);
+ let results = (outs PD_Tensor:$out);
+}
+
+def PD_FusedRepeatedFCRelu : PD_Op<"RepeatedFCRelu", [SameVariadicOperandSize, NoSideEffect]> {
+ let summary = "";
+ let description = [{ }];
+
+ let arguments = (ins PD_Tensor:$input, Variadic:$w, Variadic:$bias);
+ let results = (outs PD_Tensor:$out);
+ let hasCanonicalizer = 1;
+}
+
+#endif // PD_OPS
diff --git a/paddle/infrt/dialect/pd_types.cc b/paddle/infrt/dialect/pd_types.cc
new file mode 100644
index 0000000000000..94856e362d301
--- /dev/null
+++ b/paddle/infrt/dialect/pd_types.cc
@@ -0,0 +1,15 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/infrt/dialect/pd_types.h"
diff --git a/paddle/infrt/dialect/pd_types.h b/paddle/infrt/dialect/pd_types.h
new file mode 100644
index 0000000000000..6f9fe56338a9f
--- /dev/null
+++ b/paddle/infrt/dialect/pd_types.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This file defines the types used in PaddlePaddle MLIR dialect. +// We borrowed much ideas from tensorflow mlir dialect (tf_types.h in +// tensorflow). + +#pragma once + +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/IR/Types.h" + +namespace mlir { +namespace PD { + +class PaddleType : public Type { + public: + using Type::Type; + + static bool classof(Type type); +}; + +namespace detail { + +template +class PaddleTypeImpl : public Type::TypeBase { + public: + using Base = typename Type::TypeBase; + using PDBase = PaddleTypeImpl; + using Base::Base; +}; + +} // namespace detail + +#define HANDLE_PD_TYPE(pdtype, enumerant, name) \ + class pdtype##Type : public detail::PaddleTypeImpl { \ + public: \ + using PDBase::PDBase; \ + }; + +} // namespace PD +} // namespace mlir diff --git a/paddle/infrt/dialect/print_ir.cc b/paddle/infrt/dialect/print_ir.cc new file mode 100644 index 0000000000000..3c5a2b6a7bf90 --- /dev/null +++ b/paddle/infrt/dialect/print_ir.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
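As an aside on pd_types.h above: each concrete Paddle type is meant to derive from the CRTP helper detail::PaddleTypeImpl and simply re-export the inherited constructors with `using Base::Base;`, which is also what the HANDLE_PD_TYPE macro spells out. The standalone sketch below, with made-up names and no MLIR dependency, only illustrates that constructor-forwarding idiom; it is not code from this patch.

#include <iostream>
#include <string>

// Plays the role of mlir::Type::TypeBase in this sketch.
struct TypeBase {
  explicit TypeBase(std::string name) : name_(std::move(name)) {}
  const std::string &name() const { return name_; }

 private:
  std::string name_;
};

// Plays the role of detail::PaddleTypeImpl: a CRTP middle layer that reuses
// the base constructors and adds shared behaviour.
template <typename Derived>
struct TypeImpl : TypeBase {
  using Base = TypeBase;
  using Base::Base;  // same constructor re-export as `using PDBase::PDBase;`
  void Describe() const { std::cout << "type: " << name() << "\n"; }
};

// Roughly what one HANDLE_PD_TYPE(...) instantiation boils down to.
struct DenseTensorType : TypeImpl<DenseTensorType> {
  using Base = TypeImpl<DenseTensorType>;
  using Base::Base;
};

int main() {
  DenseTensorType t("pd.dense_tensor");
  t.Describe();
  return 0;
}
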
+ +#include + +#include "llvm/ADT/Optional.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ScopedPrinter.h" +#include "llvm/Support/raw_os_ostream.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/AsmState.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Module.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Region.h" +#include "mlir/IR/Verifier.h" +#include "mlir/Parser.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Transforms/Passes.h" +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/init_infrt_dialects.h" + +namespace cl = llvm::cl; + +static cl::opt inputFilename(cl::Positional, + cl::desc(""), + cl::init("-"), + cl::value_desc("filename")); + +llvm::raw_ostream &printIndent(int indent = 0) { + for (int i = 0; i < indent; ++i) llvm::outs() << " "; + return llvm::outs(); +} + +void printOperation(mlir::Operation *op, int indent); +void printRegion(mlir::Region ®ion, int indent); // NOLINT +void printBlock(mlir::Block &block, int indent); // NOLINT + +void printOperation(mlir::Operation *op, int indent) { + llvm::Optional module_op = llvm::None; + if (llvm::isa(op)) + module_op = llvm::dyn_cast(op); + llvm::Optional func_op = llvm::None; + if (llvm::isa(op)) func_op = llvm::dyn_cast(op); + + printIndent(indent) << "op: '" << op->getName(); + // This getName is inherited from Operation::getName + if (module_op) { + printIndent() << "@" << module_op->getName(); + } + // This getName is inherited from SymbolOpInterfaceTrait::getName, + // which return value of "sym_name" in ModuleOp or FuncOp attributes. + if (func_op) { + printIndent() << "@" << func_op->getName(); + } + printIndent() << "' with " << op->getNumOperands() << " operands" + << ", " << op->getNumResults() << " results" + << ", " << op->getAttrs().size() << " attributes" + << ", " << op->getNumRegions() << " regions" + << ", " << op->getNumSuccessors() << " successors\n"; + if (!op->getAttrs().empty()) { + printIndent(indent) << op->getAttrs().size() << " attributes:\n"; + for (mlir::NamedAttribute attr : op->getAttrs()) { + printIndent(indent + 1) << "- {" << attr.first << " : " << attr.second + << "}\n"; + } + } + + if (op->getNumRegions() > 0) { + printIndent(indent) << op->getNumRegions() << " nested regions:\n"; + for (mlir::Region ®ion : op->getRegions()) { + printRegion(region, indent + 1); + } + } +} + +void printRegion(mlir::Region ®ion, int indent) { // NOLINT + printIndent(indent) << "Region with " << region.getBlocks().size() + << " blocks:\n"; + for (mlir::Block &block : region.getBlocks()) { + printBlock(block, indent + 1); + } +} + +void printBlock(mlir::Block &block, int indent) { // NOLINT + printIndent(indent) << "Block with " << block.getNumArguments() + << " arguments" + << ", " << block.getNumSuccessors() << " successors" + << ", " << block.getOperations().size() + << " operations\n"; + + for (mlir::Operation &operation : block.getOperations()) { + printOperation(&operation, indent + 1); + } +} + +int main(int argc, char **argv) { + mlir::registerAsmPrinterCLOptions(); + mlir::registerMLIRContextCLOptions(); + mlir::registerPassManagerCLOptions(); + cl::ParseCommandLineOptions(argc, argv, "mlir demo"); + + mlir::MLIRContext *context = infrt::Global::getMLIRContext(); + context->allowUnregisteredDialects(); + auto ®istry = context->getDialectRegistry(); + infrt::RegisterCinnDialects(registry); + + // mlir will verify 
module automatically after parsing.
+ // https://github.com/llvm/llvm-project/blob/38d18d93534d290d045bbbfa86337e70f1139dc2/mlir/lib/Parser/Parser.cpp#L2051
+ // mlir::OwningModuleRef module_ref = mlir::parseSourceString(mlir_source,
+ // context);
+ mlir::OwningModuleRef module_ref =
+ mlir::parseSourceFile(inputFilename, context);
+ std::cout << "----------print IR Structure begin----------" << std::endl;
+ printOperation(module_ref->getOperation(), 0);
+ std::cout << "----------print IR Structure end----------" << std::endl;
+
+ module_ref->dump();
+ return 0;
+}
diff --git a/paddle/infrt/dialect/rewrite.td b/paddle/infrt/dialect/rewrite.td
new file mode 100644
index 0000000000000..aa81dd72d059b
--- /dev/null
+++ b/paddle/infrt/dialect/rewrite.td
@@ -0,0 +1,90 @@
+#ifndef INFRT_REWRITE
+#define INFRT_REWRITE
+
+include "paddle/infrt/dialect/infrt_base.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "paddle/infrt/dialect/pd_ops.td"
+
+//===----------------------------------------------------------------------===//
+// This is to fuse the composition: 'Matmul o ElementwiseAdd' into 'PD_FusedFC'.
+//
+// We have:
+// (Matmul) z = x * y
+// (Add) out = z + bias
+//
+// which corresponds to the following computation:
+// (FusedFC) out = x * y + bias
+//
+// TODO:
+// 1. Make the constraints more complete.
+// 2. Consider the case of: out = bias + z
+//===----------------------------------------------------------------------===//
+def FuseMulAdd : Pat<(PD_ElementwiseAdd (PD_MatmulOp $x, $y, $transpose_x, $transpose_y, $alpha), $bias, $axis),
+ (PD_FusedFC $x, $y, $bias, (INFRT_createI32Attr<"1">)),
+ [(IsBoolAttrEq<"false"> $transpose_x),(IsBoolAttrEq<"false"> $transpose_y)]>;
+
+
+//===----------------------------------------------------------------------===//
+// This is to fuse the composition: 'FusedFC o Relu' into 'FusedRepeatedFCRelu'.
+//
+// We have:
+// (FusedFC) z = fc(x, y, bias)
+// (Relu) out = relu(z)
+//
+// which corresponds to the following computation:
+// (FusedRepeatedFCRelu) out = RepeatedFCRelu(x, [y], [bias])
+//
+//===----------------------------------------------------------------------===//
+def FuseFCRelu : Pat<(PD_ReluOp (PD_FusedFC $x, $y, $bias, $_)),
+ (PD_FusedRepeatedFCRelu $x, (INFRT_cvtValueToValueRange $y), (INFRT_cvtValueToValueRange $bias))>;
+
+//===----------------------------------------------------------------------===//
+// This is to fold 'FusedRepeatedFCRelu' op.
+// +// We have: +// (FusedRepeatedFCRelu) z = RepeatedFCRelu(x, [y, ...], [bias, ...]) +// (FusedRepeatedFCRelu) out = RepeatedFCRelu(z, [y1, ...], [bias1, ...]) +// +// which corresponds to the following computation: +// (FusedRepeatedFCRelu) out = RepeatedFCRelu(x, [y, ..., y1, ...], [bias, ..., bias1, ....]) +// +//===----------------------------------------------------------------------===// +def FuseRepeatedFCRelu2 : Pat<(PD_FusedRepeatedFCRelu (PD_FusedRepeatedFCRelu $x, $y, $bias), $y_2, $bias_2), + (PD_FusedRepeatedFCRelu $x, (INFRT_concatTwoValueRange $y, $y_2), (INFRT_concatTwoValueRange $bias, $bias_2))>; + + +//===----------------------------------------------------------------------===// +// This is to fuse the composition: 'BatchNorm o Conv' into 'Conv' +// by deriving new 'w' and 'b' for 'Conv': +// +// We have: +// (Conv) z = w * x + b +// (BatchNorm) y = scale * (z - mean) / sqrt(var + eps) + bias +// +// which corresponds to the following computation: +// y = w_ * x + b_ +// where +// w_ = scale * w / sqrt(var + eps) +// b_ = B + scale * (b - mean) / sqrt(var + eps) +// +//===----------------------------------------------------------------------===// +def FuseBatchNormWithConvPattern: Pat< + (PD_BatchNormOp + (PD_Conv2dOp $input, $filter, $bias), + $scale, $bias_2, $mean, $var, $epsilon), + (PD_Conv2dOp + $input, + (PD_MulOp $filter, + (PD_ElementwiseDiv:$coefficientW + $scale, + (PD_SqrtOp (PD_ElementwiseAdd $var, (PD_ConstantOp $epsilon), (INFRT_createI32Attr<"1">))), + (INFRT_createI32Attr<"1">))), + (PD_ElementwiseAdd + $bias, + (PD_MulOp + (PD_ElementwiseSub $bias, $mean, (INFRT_createI32Attr<"1">)), + $coefficientW), + (INFRT_createI32Attr<"1">))) +>; + +#endif // INFRT_REWRITE diff --git a/paddle/infrt/dialect/tensor_shape.cc b/paddle/infrt/dialect/tensor_shape.cc new file mode 100644 index 0000000000000..ef5a5525cb22f --- /dev/null +++ b/paddle/infrt/dialect/tensor_shape.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
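Before the tensor-shape dialect sources that follow, here is a small scalar sanity check of the Conv + BatchNorm folding algebra derived in the rewrite.td comment above (w_ = scale * w / sqrt(var + eps), b_ = bias + scale * (b - mean) / sqrt(var + eps)). It is a standalone illustration with arbitrary example values, not part of the patch.

#include <cassert>
#include <cmath>
#include <iostream>

int main() {
  // Arbitrary example values for the scalar case.
  const double x = 0.7, w = 1.3, b = 0.2;           // Conv: z = w * x + b
  const double scale = 0.9, bn_bias = 0.1;          // BatchNorm affine parameters
  const double mean = 0.05, var = 0.4, eps = 1e-5;

  // Unfused: run Conv, then BatchNorm.
  const double z = w * x + b;
  const double y = scale * (z - mean) / std::sqrt(var + eps) + bn_bias;

  // Fused: fold BatchNorm into the Conv weight/bias, as in rewrite.td.
  const double w_fused = scale * w / std::sqrt(var + eps);
  const double b_fused = bn_bias + scale * (b - mean) / std::sqrt(var + eps);
  const double y_fused = w_fused * x + b_fused;

  assert(std::fabs(y - y_fused) < 1e-12);
  std::cout << "y = " << y << ", y_fused = " << y_fused << "\n";
  return 0;
}
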
+ +#include "paddle/infrt/dialect/tensor_shape.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace infrt::ts { +using namespace mlir; // NOLINT + +void TensorShapeDialect::initialize() { + allowUnknownTypes(); + addTypes(); + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/tensor_shape.cpp.inc" + >(); +} + +Type TensorShapeDialect::parseType(DialectAsmParser &parser) const { + StringRef keyword; + if (parser.parseKeyword(&keyword)) return Type(); + if (keyword == "shape") return ShapeType::get(getContext()); + if (keyword == "partial_shape") return PartialShapeType::get(getContext()); + + parser.emitError(parser.getNameLoc(), "unknown shape type: ") << keyword; + return Type(); +} + +void TensorShapeDialect::printType(::mlir::Type type, + ::mlir::DialectAsmPrinter &os) const { + if (type.isa()) { + os << "shape"; + return; + } + + if (type.isa()) { + os << "partial_shape"; + return; + } + llvm_unreachable("unexpected 'shape' type kind"); +} + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/tensor_shape.cpp.inc" // NOLINT + +} // namespace infrt::ts diff --git a/paddle/infrt/dialect/tensor_shape.h b/paddle/infrt/dialect/tensor_shape.h new file mode 100644 index 0000000000000..bd3fa8853675a --- /dev/null +++ b/paddle/infrt/dialect/tensor_shape.h @@ -0,0 +1,40 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +namespace infrt::ts { + +class ShapeType + : public mlir::Type::TypeBase { + public: + using Base::Base; +}; + +class PartialShapeType : public mlir::Type::TypeBase { + public: + using Base::Base; +}; + +using namespace mlir; // NOLINT +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/tensor_shape.hpp.inc" +#include "paddle/infrt/dialect/tensor_shape_dialect.hpp.inc" + +} // namespace infrt::ts diff --git a/paddle/infrt/dialect/tensor_shape.td b/paddle/infrt/dialect/tensor_shape.td new file mode 100644 index 0000000000000..d3714c8ed14d3 --- /dev/null +++ b/paddle/infrt/dialect/tensor_shape.td @@ -0,0 +1,49 @@ +#ifdef INFRT_OPS +#else +#define INFRT_OPS + +include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/tensor_shape_base.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +// Base class for the operation in the TensorShape dialect +class TS_Op traits = []> : + Op { + let parser = [{ return infrt::dialect::parse$cppClass(parser, result); }]; + let printer = " return infrt::dialect::printOpWithOperands(p, *this)" ";"; +} + +def TS_BuildShapeOp : TS_Op<"build_shape", [NoSideEffect]> { + let summary = "Build tensor shape operation"; + let description = [{ + An operation that builds a tensor shape of given ranks and extents. 
+ }]; + + let arguments = (ins I64ArrayAttr:$value); + let results = (outs TS_Shape:$output); + let assemblyFormat = "$value attr-dict"; +} + +def TS_GetNumElementsOp : TS_Op<"get_num_elements"> { + let summary = "Returns the number of elements in the shape"; + + let description = [{ + An operation that returns the number of elements in the given shape. + }]; + + let arguments = (ins TS_Shape); + let results = (outs I64); + let assemblyFormat = "operands attr-dict"; +} + +def TS_PrintShapeOp : TS_Op<"print_shape"> { + let summary = "Print tensor shape operation"; + let description = [{ + An operation that prints a tensor shape. + }]; + + let arguments = (ins TS_Shape:$shape); + let assemblyFormat = "operands attr-dict"; +} + +#endif diff --git a/paddle/infrt/dialect/tensor_shape_base.td b/paddle/infrt/dialect/tensor_shape_base.td new file mode 100644 index 0000000000000..ea1c1854d77ca --- /dev/null +++ b/paddle/infrt/dialect/tensor_shape_base.td @@ -0,0 +1,36 @@ +#ifdef TS_OPS_BASE +#else +#define TS_OPS_BASE + +// Tensor shape dialect. +def TensorShapeDialect : Dialect { + let name = "ts"; + + let description = [{ + The Tensor Shape dialect. + + This dialect contains operations for working with tensor shapes. + }]; + + let cppNamespace = "::infrt::ts"; +} + +// Type definition. +def TS_Shape : DialectType()">, "!ts.shape type">, +BuildableType<"$_builder.getType<::infrt::ts::ShapeType>()"> { + let typeDescription = [{ + `!ts.shape type` represents a static tensor shape. +}]; +} + +def TS_PartialShape : DialectType()">, "!ts.partial_shape type">, +BuildableType<"$_builder.getType<::infrt::ts::PartialShapeType>()"> { + let typeDescription = [{ + `!ts.partial_shape type` represents either a static tensor shape, unranked + tensor shape or a ranked tensor shape with unknown dimension sizes. +}]; +} + +#endif // TS_OPS_BASE diff --git a/paddle/infrt/dialect/test_kernels.cc b/paddle/infrt/dialect/test_kernels.cc new file mode 100644 index 0000000000000..894d96f95ad5c --- /dev/null +++ b/paddle/infrt/dialect/test_kernels.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/test_kernels.h" + +#include "mlir/IR/Builders.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" + +namespace infrt::dialect { + +//===----------------------------------------------------------------------===// +// BenchmarkOp +//===----------------------------------------------------------------------===// + +// Parse the BenchmarkOp in the following format +// infrt.benchmark "add.i32"(%c : i32, %d : f32) +// max_count = 100, duration_secs = 1 { +// ... +// } + +static ParseResult parseBenchmarkOp(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + StringAttr nameAttr; + if (parser.parseAttribute(nameAttr, "name", result.attributes)) + return failure(); + + // Parse the operands, e.g. 
(%c : i32, %d : f32) + if (parser.parseLParen()) return failure(); + + SmallVector operands; + SmallVector types; + llvm::SMLoc type_loc = parser.getCurrentLocation(); + + if (parser.parseOptionalRParen()) { + // Parse non-empty operands + do { + // Parse %c : i32, + OpAsmParser::OperandType operand; + Type type; + + if (parser.parseOperand(operand) || parser.parseColonType(type)) + return failure(); + + operands.push_back(operand); + types.push_back(type); + } while (succeeded(parser.parseOptionalComma())); + + if (parser.parseRParen()) return failure(); + } + + if (parser.resolveOperands(operands, types, type_loc, result.operands)) + return failure(); + + // Parse the keyword attribute, e.g. max_count = 100, duration_secs = 1 + do { + StringRef attr; + Attribute resultAttr; + if (parser.parseKeyword(&attr) || parser.parseEqual() || + parser.parseAttribute(resultAttr, + parser.getBuilder().getIntegerType(32), + attr, + result.attributes)) + return failure(); + } while (succeeded(parser.parseOptionalComma())); + + // Set the default attribute num_warmup_runs to 1 if unset + auto setDefaultAttrIfUnset = [&](const char *attr_name, int value) { + bool found = llvm::any_of(result.attributes, + [attr_name](const NamedAttribute &attr) { + return attr.first == attr_name; + }); + if (!found) { + IntegerAttr default_val = parser.getBuilder().getI32IntegerAttr(value); + result.addAttribute(attr_name, default_val); + } + }; + setDefaultAttrIfUnset("num_warmup_runs", 1); + + Region *target = result.addRegion(); + return parser.parseRegion(*target, + operands, + types, + /*enableNameShadowing=*/true); +} + +// Print the BenchmarkOp in the following format +// infrt.benchmark "add.i32"(%c : i32, %d : f32) +// max_count = 100, duration_secs = 1 { +// ... +// } +static void print(OpAsmPrinter &p, BenchmarkOp op) { // NOLINT + p << "infrt.benchmark "; + + // Print the name attribute, e.g "add.i32" + auto name_attr = op.getAttr("name"); + p << name_attr; + + // Print the operands and types, e.g. (%c : i32, %d : f32) + p << '('; + llvm::interleaveComma(llvm::zip(op.getOperands(), op.getOperandTypes()), + p, + [&](const auto &it) { + p << std::get<0>(it) << " : " << std::get<1>(it); + }); + p << ") "; + + bool need_comma = false; + // Print the attributes, e.g. max_count = 100, duration_secs = 1 + for (auto &name_attr : op.getAttrs()) { + auto id = name_attr.first; + if (id == "name") continue; + if (need_comma) p << ", "; + auto attr = name_attr.second; + p << id << " = "; + if (auto int_attr = attr.dyn_cast()) { + int_attr.getValue().print(p.getStream(), /*isSigned=*/false); + } else { + op.emitOpError("Unexpected attribute"); + } + need_comma = true; + } + p << ' '; + + // Print the region + // Reuse the argument names provided to the op for the bbarg names within + // the region. + p.shadowRegionArgs(op.region(), op.getOperands()); + p.printRegion(op.region(), /*printEntryBlockArgs=*/false); +} + +static LogicalResult verify(BenchmarkOp op) { + // Verify that the target benchmark region has exactly one return value. + auto ®ion = op.region(); + auto &last_op = region.front().back(); + if (last_op.getName().getStringRef() != "infrt.return") { + return op.emitOpError("missing return statement"); + } + if (last_op.getNumOperands() != 1) { + return op.emitOpError( + "incorrect number of return values. 
One return value is expected");
+ }
+
+ return success();
+}
+
+#define GET_OP_CLASSES
+#include "paddle/infrt/dialect/test_kernels.cpp.inc"
+
+} // namespace infrt::dialect
diff --git a/paddle/infrt/dialect/test_kernels.h b/paddle/infrt/dialect/test_kernels.h
new file mode 100644
index 0000000000000..29d4209cb7280
--- /dev/null
+++ b/paddle/infrt/dialect/test_kernels.h
@@ -0,0 +1,23 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "mlir/IR/OpDefinition.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+
+namespace infrt::dialect {
+using namespace mlir; // NOLINT
+#define GET_OP_CLASSES
+#include "paddle/infrt/dialect/test_kernels.hpp.inc"
+} // namespace infrt::dialect
diff --git a/paddle/infrt/dialect/test_kernels.td b/paddle/infrt/dialect/test_kernels.td
new file mode 100644
index 0000000000000..6aa12f252d014
--- /dev/null
+++ b/paddle/infrt/dialect/test_kernels.td
@@ -0,0 +1,65 @@
+// Operation definitions for testing.
+
+#ifdef TEST_OPS
+#else
+#define TEST_OPS
+
+include "paddle/infrt/dialect/infrt_base.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+
+// Base class for Test dialect ops.
+class Test_Op traits = []> :
+ Op {
+
+ // Each registered op in the Test namespace needs to provide a printer,
+ // parser and verifier.
+ let printer = [{ return infrt::dialect::print(p, *this); }];
+ let verifier = [{ return infrt::dialect::verify(*this); }];
+ let parser = [{ return infrt::dialect::parse$cppClass(parser, result); }];
+}
+
+def BenchmarkOp : Test_Op<"benchmark"> {
+ let summary = "benchmark operation";
+ let description = [{
+ The "infrt.benchmark" operation benchmarks the performance of an MLIR
+ region by executing the given MLIR region repeatedly for up to
+ `duration_secs` seconds or `max_count` times. `num_warmup_runs` specifies
+ the number of warm up runs to run the given MLIR region before the
+ benchmark starts.
+
+ The target MLIR region can take an arbitrary number of arguments and
+ should return exactly one value. The arguments for the MLIR region are
+ provided as the operands of the infrt.benchmark op.
+
+ Example:
+ infrt.benchmark "add.i32"(%c : i32, %d : f32) max_count = 100, duration_secs = 1 {
+ // code for benchmarking
+ ...
+ }
+
+ infrt.benchmark "add.i32"(%c : i32)
+ duration_secs = 1,
+ max_count = 100,
+ num_warmup_runs = 10 {
+ // The MLIR code to be benchmarked goes here.
+ // The following code benchmarks the infrt.add.i32 kernel.
+ %x = infrt.add.i32 %c, %c
+ // The benchmarked function needs to return exactly one value.
+ infrt.return %x : i32 + } + }]; + + let regions = (region SizedRegion<1>:$region); + + let arguments = (ins + Variadic, + I32Attr:$duration_secs, + I32Attr:$max_count, + StrAttr:$name, + DefaultValuedAttr:$num_warmup_runs + ); + + let results = (outs); +} + +#endif // TEST_OPS diff --git a/paddle/infrt/dialect/types.cc b/paddle/infrt/dialect/types.cc new file mode 100644 index 0000000000000..6d6f6a20b46c9 --- /dev/null +++ b/paddle/infrt/dialect/types.cc @@ -0,0 +1,17 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/types.h" + +namespace infrt::hlir::mlir {} // namespace infrt::hlir::mlir diff --git a/paddle/infrt/dialect/types.h b/paddle/infrt/dialect/types.h new file mode 100644 index 0000000000000..a9a2b61871cc0 --- /dev/null +++ b/paddle/infrt/dialect/types.h @@ -0,0 +1,16 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include diff --git a/paddle/infrt/external_kernels/CMakeLists.txt b/paddle/infrt/external_kernels/CMakeLists.txt new file mode 100644 index 0000000000000..faffc3909bc1e --- /dev/null +++ b/paddle/infrt/external_kernels/CMakeLists.txt @@ -0,0 +1,13 @@ +set(external_kernels_src "basic_kernels.cc") + +cc_library(external_kernels SHARED SRCS ${external_kernels_src}) +set_target_properties(external_kernels PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + +set(basic_mlir "${CMAKE_CURRENT_SOURCE_DIR}/basic.mlir") +set(external_kernels_lib "${CMAKE_CURRENT_BINARY_DIR}/libexternal_kernels.so") +message(STATUS "basic_mlir: ${basic_mlir}") +message(STATUS "external_kernels_lib: ${external_kernels_lib}") +add_test( + NAME run_and_check_external_kernels + COMMAND sh -c "${CMAKE_BINARY_DIR}/infrt/host_context/infrt-exec -i ${basic_mlir} --shared_libs=${external_kernels_lib} | ${LLVM_PATH}/bin/FileCheck ${basic_mlir}" +) diff --git a/paddle/infrt/external_kernels/basic.mlir b/paddle/infrt/external_kernels/basic.mlir new file mode 100644 index 0000000000000..843b12ced21a9 --- /dev/null +++ b/paddle/infrt/external_kernels/basic.mlir @@ -0,0 +1,21 @@ +// CHECK: basic +func @basic() -> f32 { + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %v2 = "external.add.f32"(%v0, %v1) : (f32, f32) -> f32 + + // CHECK: 1 + "external.print.f32"(%v0) : (f32) -> () + // CHECK: 2 + "external.print.f32"(%v1) : (f32) -> () + + // CHECK: 3 + "external.print.f32"(%v2) : (f32) -> () + + %v3 = "external.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + + // CHECK: 6 + "external.print.f32"(%v3) : (f32) -> () + + infrt.return %v3 : f32 +} diff --git a/paddle/infrt/external_kernels/basic_kernels.cc b/paddle/infrt/external_kernels/basic_kernels.cc new file mode 100644 index 0000000000000..b59a8881fb092 --- /dev/null +++ b/paddle/infrt/external_kernels/basic_kernels.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
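basic_kernels.cc below supplies the RegisterKernels entry point that the test target above loads via --shared_libs. The loader inside infrt-exec is not shown in this part of the patch, so the following is only a plausible minimal sketch of that side, with illustrative names; it relies on RegisterKernels being declared with C linkage in kernel_registry.h (later in this patch) so the symbol can be resolved unmangled.

#include <dlfcn.h>

#include <iostream>
#include <string>

#include "paddle/infrt/host_context/kernel_registry.h"

// Hypothetical loader; the real one lives in the host_context executor.
using RegisterKernelsFn = void (*)(infrt::host_context::KernelRegistry *);

bool LoadExternalKernels(const std::string &so_path,
                         infrt::host_context::KernelRegistry *registry) {
  void *handle = dlopen(so_path.c_str(), RTLD_NOW);
  if (!handle) {
    std::cerr << "dlopen(" << so_path << ") failed: " << dlerror() << "\n";
    return false;
  }
  // Unmangled lookup works because kernel_registry.h declares the hook extern "C".
  auto register_fn =
      reinterpret_cast<RegisterKernelsFn>(dlsym(handle, "RegisterKernels"));
  if (!register_fn) {
    std::cerr << "RegisterKernels symbol not found: " << dlerror() << "\n";
    return false;
  }
  register_fn(registry);  // Adds external.add.f32, external.print.f32, ...
  return true;
}
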
+ +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" + +template +T add(T a, T b) { + return a + b; +} + +template +T sub(T a, T b) { + return a - b; +} + +template +T mul(T a, T b) { + return a * b; +} + +template +T div(T a, T b) { + return a / b; +} + +template +void print(T a) { + std::cout << a << std::endl; +} + +void RegisterKernels(infrt::host_context::KernelRegistry *registry) { + // int32 + registry->AddKernel("external.add.i32", INFRT_KERNEL(add)); + registry->AddKernel("external.sub.i32", INFRT_KERNEL(sub)); + registry->AddKernel("external.mul.i32", INFRT_KERNEL(mul)); + registry->AddKernel("external.div.i32", INFRT_KERNEL(div)); + registry->AddKernel("external.print.i32", INFRT_KERNEL(print)); + + // float + registry->AddKernel("external.add.f32", INFRT_KERNEL(add)); + registry->AddKernel("external.sub.f32", INFRT_KERNEL(sub)); + registry->AddKernel("external.mul.f32", INFRT_KERNEL(mul)); + registry->AddKernel("external.div.f32", INFRT_KERNEL(div)); + registry->AddKernel("external.print.f32", INFRT_KERNEL(print)); +} diff --git a/paddle/infrt/external_kernels/fc.mlir b/paddle/infrt/external_kernels/fc.mlir new file mode 100644 index 0000000000000..bdac9ded2ef65 --- /dev/null +++ b/paddle/infrt/external_kernels/fc.mlir @@ -0,0 +1,43 @@ +// CHECK-LABEL: @fc +func @fc(%input : !infrt.tensor, + %w : !infrt.tensor, + %bias : !infrt.tensor) -> !infrt.tensor +{ + %out = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.tensor + // dt.fill_tensor_with_constant.f32 (%out : !infrt.tensor) {value=0.0:f32} + + // fc1 + "external.matmul"(%input, %w, %out) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + "external.sigmoid"(%out, %out) {}: (!infrt.tensor, !infrt.tensor) -> () + + // fc2 + "external.matmul"(%out, %w, %out) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + "external.sigmoid"(%out, %out) {}: (!infrt.tensor, !infrt.tensor) -> () + + infrt.return %out : !infrt.tensor +} + +// CHECK-LABEL: @benchmark +func @benchmark() { + %input = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%input : !infrt.tensor) {value=1.0:f32} + + %w = dt.create_uninit_tensor.f32 [50, 50] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%w : !infrt.tensor) {value=2.0:f32} + + %bias = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%bias : !infrt.tensor) {value=3.0:f32} + + infrt.benchmark "add.f32"( + %input:!infrt.tensor, + %w:!infrt.tensor, + %bias:!infrt.tensor) + duration_secs = 100, max_count = 300000, num_warmup_runs = 3 + { + %res = infrt.call @fc(%input, %w, %bias) : (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> (!infrt.tensor) + infrt.return %res : !infrt.tensor + } + infrt.return +} diff --git a/paddle/infrt/external_kernels/paddle.mlir b/paddle/infrt/external_kernels/paddle.mlir new file mode 100644 index 0000000000000..e7b8e9efba838 --- /dev/null +++ b/paddle/infrt/external_kernels/paddle.mlir @@ -0,0 +1,50 @@ +// CHECK: paddle_func +func @paddle_func() -> () { + %input = dt.create_uninit_tensor.f32 [3, 5] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%input : !infrt.tensor) {value=1.0:f32} + + %w = dt.create_uninit_tensor.f32 [5, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%w : 
!infrt.tensor) {value=2.0:f32} + + %bias = dt.create_uninit_tensor.f32 [4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%bias : !infrt.tensor) {value=3.0:f32} + + %out = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%out : !infrt.tensor) {value=0.0:f32} + + "external.fc2"(%input, %w, %bias, %out) {in_num_col_dims=3:i32, test_attr=5:i32}: (!infrt.tensor, !infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + // CHECK-LABEL: tensor: shape=shape[3,5], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + dt.print_tensor (%input : !infrt.tensor) + // CHECK-LABEL: tensor: shape=shape[5,4], values=[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] + dt.print_tensor (%w : !infrt.tensor) + dt.print_tensor (%bias : !infrt.tensor) + dt.print_tensor (%out : !infrt.tensor) + + // test external.matmul + %out1 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%out1 : !infrt.tensor) {value=0.0:f32} + "external.matmul"(%input, %w, %out1) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + dt.print_tensor (%out1 : !infrt.tensor) + + // test external.elementwise_add + %out2 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%out2 : !infrt.tensor) {value=0.0:f32} + %bias1 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%bias1 : !infrt.tensor) {value=3.0:f32} + "external.elementwise_add"(%out1, %bias1, %out2) {axis=-1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + dt.print_tensor (%out2 : !infrt.tensor) + + // test external.relu + %out3 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%out3 : !infrt.tensor) {value=0.0:f32} + "external.relu"(%out1, %out3) {}: (!infrt.tensor, !infrt.tensor) -> () + dt.print_tensor (%out3 : !infrt.tensor) + + // test external.sigmoid + %out4 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%out4 : !infrt.tensor) {value=0.0:f32} + "external.sigmoid"(%out1, %out4) {}: (!infrt.tensor, !infrt.tensor) -> () + dt.print_tensor (%out4 : !infrt.tensor) + + infrt.return +} diff --git a/paddle/infrt/gtest_main.cc b/paddle/infrt/gtest_main.cc new file mode 100644 index 0000000000000..26e2b5dcfc61a --- /dev/null +++ b/paddle/infrt/gtest_main.cc @@ -0,0 +1,23 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + gflags::ParseCommandLineFlags(&argc, &argv, false); + + return RUN_ALL_TESTS(); +} diff --git a/paddle/infrt/host_context/CMakeLists.txt b/paddle/infrt/host_context/CMakeLists.txt new file mode 100644 index 0000000000000..fdba9af4a5912 --- /dev/null +++ b/paddle/infrt/host_context/CMakeLists.txt @@ -0,0 +1,29 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + kernel_frame.cc + kernel_registry.cc + value.cc + kernel_utils.cc + symbol_table.cc + op_executable.cc + core_runtime.cc + mlir_to_runtime_translate.cc + function.cc + mlir_function_executable.cc + mlir_program_executor.cc + ) + +cc_test_tiny(test_infrt_host_context_value SRCS value_test.cc DEPS infrt ${MLIR_IR_LIBS}) +cc_test_tiny(test_infrt_kernel_utils SRCS kernel_utils_test.cc DEPS infrt ${MLIR_IR_LIBS}) +cc_test_tiny(test_infrt_kernel_registry SRCS kernel_registry_test.cc DEPS infrt ${MLIR_IR_LIBS}) +cc_test_tiny(test_infrt_op_executable SRCS op_executable_test.cc DEPS infrt ${MLIR_IR_LIBS}) +cc_test_tiny(test_infrt_core_runtime SRCS core_runtime_test.cc DEPS infrt ${MLIR_IR_LIBS}) +cc_test_tiny(test_infrt_mlir_to_runtime_translate SRCS mlir_to_runtime_translate_test.cc DEPS infrt ${MLIR_IR_LIBS}) + +infrt_exec_check(test_infrt_mlir_exec_on_basic mlir_tests/basic.mlir) +infrt_exec_check(test_infrt_mlir_exec_on_shape mlir_tests/shape.mlir) +infrt_exec_check(test_infrt_mlir_exec_on_dense_tensor mlir_tests/dense_tensor.mlir) + +add_executable(infrt-exec mlir_exec.cc) +target_link_libraries(infrt-exec infrt ${MLIR_IR_LIBS}) diff --git a/paddle/infrt/host_context/core_runtime.cc b/paddle/infrt/host_context/core_runtime.cc new file mode 100644 index 0000000000000..cdb8cc99ecb26 --- /dev/null +++ b/paddle/infrt/host_context/core_runtime.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/host_context/core_runtime.h" + +#include + +#include +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/op_executable.h" +#include "paddle/infrt/host_context/symbol_table.h" + +namespace infrt::host_context { + +struct CoreRuntime::Impl { + KernelRegistry* kernel_registry{}; + SymbolTable symbol_table; + std::vector op_executables; + + mutable std::vector results; +}; + +SymbolTable* CoreRuntime::symbol_table() { return &impl_->symbol_table; } + +CoreRuntime::CoreRuntime(CoreRuntime::Impl* impl) : impl_(impl) { CHECK(impl); } + +void CoreRuntime::Execute() { + // std::cout << "CoreRuntime::Execute" << std::endl; + int op_offset = 0; + for (auto& op : impl_->op_executables) { + VLOG(3) << "running op " << op_offset++ << " " << op.name(); + op.Execute(); + } +} + +KernelRegistry* CoreRuntime::kernel_registry() const { + return impl_->kernel_registry; +} + +size_t CoreRuntime::num_ops() const { return impl_->op_executables.size(); } + +CoreRuntimeBuilder::CoreRuntimeBuilder(KernelRegistry* kernel_registry) + : CoreRuntime(new Impl) { + impl_->kernel_registry = + kernel_registry ? kernel_registry : GetCpuKernelRegistry(); +} + +OpExecutableBuilder* CoreRuntimeBuilder::NewOpExecutable( + const std::string& op_name) { + CHECK(impl_.get()); + impl_->op_executables.emplace_back( + op_name, symbol_table(), impl_->kernel_registry); + return &impl_->op_executables.back(); +} + +void CoreRuntimeBuilder::FeedInArgs( + llvm::ArrayRef> args) { + for (auto& item : args) { + symbol_table()->Register(item.first, item.second); + } +} + +void CoreRuntimeBuilder::SetKernelRegistry(KernelRegistry* x) { + CHECK(x); + impl_->kernel_registry = x; +} + +llvm::SmallVector CoreRuntime::GetResults( + llvm::ArrayRef arg_names) { + llvm::SmallVector results; + for (auto& name : arg_names) { + results.push_back(ValueRef(symbol_table()->GetValue(name))); + } + + return results; +} + +CoreRuntime::~CoreRuntime() {} + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/core_runtime.h b/paddle/infrt/host_context/core_runtime.h new file mode 100644 index 0000000000000..802f8b17bb010 --- /dev/null +++ b/paddle/infrt/host_context/core_runtime.h @@ -0,0 +1,86 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include +#include +#include + +#include "paddle/infrt/host_context/value.h" + +namespace infrt::host_context { + +class KernelRegistry; +class OpExecutable; +class OpExecutableBuilder; +class SymbolTable; + +/** + * CoreRuntime encapsulate the execution for a sequence of ops. + * Each function call will bind to a CoreRuntime instance, push the argument + * Values in to the argument-list, and get the + * result Values from the return-list. + */ +class CoreRuntime : public std::enable_shared_from_this { + public: + //! Execute a program. + void Execute(); + + //! Return the number of ops. 
+ size_t num_ops() const; + + //! Get the results of the execution. + llvm::SmallVector // + GetResults(llvm::ArrayRef arg_names); + + std::shared_ptr getptr() { + return std::shared_ptr(this); + } + + KernelRegistry* kernel_registry() const; + + ~CoreRuntime(); + + protected: + //! Get the symbol table. + SymbolTable* symbol_table(); + + class Impl; + explicit CoreRuntime(Impl* impl); + std::unique_ptr impl_; +}; + +/** + * The builder for CoreRuntime, help to construct a function. + */ +class CoreRuntimeBuilder : public CoreRuntime { + public: + explicit CoreRuntimeBuilder(KernelRegistry* kernel_registry); + + using CoreRuntime::symbol_table; + + void SetKernelRegistry(KernelRegistry* x); + + //! Feed the input arguments, each item is a pair of arg-name and arg-value. + void FeedInArgs(llvm::ArrayRef> args); + + llvm::ArrayRef attr_names() const; + + OpExecutableBuilder* NewOpExecutable(const std::string& op_name); +}; + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/core_runtime_test.cc b/paddle/infrt/host_context/core_runtime_test.cc new file mode 100644 index 0000000000000..3c0dadaad42e7 --- /dev/null +++ b/paddle/infrt/host_context/core_runtime_test.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/host_context/core_runtime.h" + +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/host_context/op_executable.h" +#include "paddle/infrt/host_context/symbol_table.h" + +namespace infrt { +namespace host_context { + +int add(int a, int b) { return a + b; } +int sub(int a, int b) { return a - b; } + +TEST(CoreRuntime, basic) { + KernelRegistry registry; + registry.AddKernel("infrt.test.addi32", INFRT_KERNEL(add)); + registry.AddKernel("infrt.test.subi32", INFRT_KERNEL(sub)); + + CoreRuntimeBuilder builder(®istry); + auto* table = builder.symbol_table(); + table->Register("a", 1); + table->Register("b", 2); + table->Register("d", 4); + + // c = a + b + auto* op0 = builder.NewOpExecutable("infrt.test.addi32"); + op0->AppendArgument("a"); + op0->AppendArgument("b"); + op0->SetResults({"c"}); + + // e = c - d + auto* op1 = builder.NewOpExecutable("infrt.test.subi32"); + op1->AppendArgument("c"); + op1->AppendArgument("d"); + op1->SetResults({"e"}); + + builder.Execute(); + + ASSERT_EQ(table->GetValue("d")->get(), 4); + ASSERT_EQ(table->GetValue("c")->get(), 3); + ASSERT_EQ(table->GetValue("e")->get(), -1); +} + +TEST(CoreRuntime, function) { + // The function: + // func(int a, int b) { + // int c = a + b + // return c + // } + KernelRegistry registry; + registry.AddKernel("infrt.test.addi32", INFRT_KERNEL(add)); + registry.AddKernel("infrt.test.subi32", INFRT_KERNEL(sub)); + + CoreRuntimeBuilder builder(®istry); + auto* table = builder.symbol_table(); + + std::vector> feeds{ + {std::make_pair("a", ValueRef(new Value(1))), // + std::make_pair("b", ValueRef(new Value(2)))}}; + builder.FeedInArgs(llvm::ArrayRef>( + feeds.data(), feeds.size())); + + ASSERT_EQ(table->Get("a"), 1); + ASSERT_EQ(table->Get("b"), 2); + ASSERT_EQ(table->size(), 2UL); + + auto* op = builder.NewOpExecutable("infrt.test.addi32"); + op->AppendArgument("a"); + op->AppendArgument("b"); + op->SetResults({"c"}); + + builder.Execute(); + + auto res = builder.GetResults({"c"}); + ASSERT_EQ(res.size(), 1UL); + ASSERT_EQ(res[0].get(), 3); +} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/function.cc b/paddle/infrt/host_context/function.cc new file mode 100644 index 0000000000000..8b111f2645a80 --- /dev/null +++ b/paddle/infrt/host_context/function.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/function.h" + +namespace infrt { +namespace host_context {} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/function.h b/paddle/infrt/host_context/function.h new file mode 100644 index 0000000000000..030e3b6cfbc09 --- /dev/null +++ b/paddle/infrt/host_context/function.h @@ -0,0 +1,62 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include + +namespace infrt { +namespace host_context { + +struct Value; +struct ValueRef; + +/** + * Base class of all executable Function. + * + * This is used by `infrt.call` op, to execute a function. + */ +class Function { + public: + Function(Function&& other) + : name_(other.name_), + num_arguments_(other.num_arguments_), + num_results_(other.num_results_) {} + + Function() = delete; + + std::string name() const { return name_; } + + size_t num_arguments() const { return num_arguments_; } + size_t num_results() const { return num_results_; } + + virtual void Execute(llvm::ArrayRef arguments, + llvm::MutableArrayRef results, + bool is_region = false) const {} + + virtual ~Function() = default; + + protected: + Function(std::string name, size_t num_arguments, size_t num_results) + : name_(name), num_arguments_(num_arguments), num_results_(num_results) {} + + private: + std::string name_; + size_t num_arguments_{}; + size_t num_results_{}; +}; + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_frame.cc b/paddle/infrt/host_context/kernel_frame.cc new file mode 100644 index 0000000000000..1acb35e898308 --- /dev/null +++ b/paddle/infrt/host_context/kernel_frame.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/kernel_frame.h" + +#include + +namespace infrt { +namespace host_context { + +std::ostream& operator<<(std::ostream& os, const KernelFrame& frame) { + os << "KernelFrame: " << frame.GetNumArgs() << " args, " + << frame.GetNumResults() << " res, " << frame.GetNumResults() << " attrs"; + return os; +} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_frame.h b/paddle/infrt/host_context/kernel_frame.h new file mode 100644 index 0000000000000..20cb17dc7fbe2 --- /dev/null +++ b/paddle/infrt/host_context/kernel_frame.h @@ -0,0 +1,166 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include + +#include "llvm/ADT/SmallVector.h" +#include "paddle/infrt/host_context/value.h" + +namespace infrt::host_context { + +/** + * KernelFrame captures the states(input arguments, attributes, results) + * associated with a kernel invocation. + */ +class KernelFrame { + public: + int GetNumArgs() const { return num_arguments_; } + int GetNumResults() const { return num_results_; } + int GetNumAttributes() const { + return value_or_attrs_.size() - num_arguments_ - + (num_results_ == -1 ? 0 : num_results_); + } + + template + T& GetArgAt(int index) { + CHECK_LT(index, GetNumArgs()); + return value_or_attrs_[index]->get(); + } + template + const T& GetArgAt(int index) const { + CHECK_LT(index, GetNumArgs()); + return value_or_attrs_[index]->get(); + } + + Value* GetArgAt(int index) { + CHECK_LT(index, GetNumArgs()); + return value_or_attrs_[index]; + } + + // Get all arguments. + llvm::ArrayRef GetArguments() const { + return GetValues(0, num_arguments_); + } + + Value* GetAttributeAt(int idx) { + CHECK_NE(num_results_, -1) + << "Must call SetNumResults before GetAttributeAt"; + CHECK_LT(idx, + static_cast(value_or_attrs_.size() - num_arguments_ - + num_results_)); + return value_or_attrs_[num_arguments_ + num_results_ + idx]; + } + + void AddAttribute(Value* v) { + CHECK_NE(num_results_, -1) + << "Must call SetNumResults before calling AddAttribute"; + value_or_attrs_.emplace_back(v); + } + + template + void EmplaceResult(Args&&... args) { + EmplaceResult(0, std::forward(args)...); + } + + template + void EmplaceResult(int index, Args&&... 
args) { + SetResultAt(index, T(std::forward(args)...)); + } + + template + void SetResultAt(int index, T&& value) { + CHECK_LT(index, num_results_) << "Invalid result index"; + CHECK(value_or_attrs_[num_arguments_ + index]); + value_or_attrs_[num_arguments_ + index]->set(std::move(value)); + } + + llvm::ArrayRef GetResults() const { + return GetValues(num_arguments_, num_results_); + } + llvm::MutableArrayRef GetResults() { + return GetMutableValues(num_arguments_, num_results_); + } + + llvm::ArrayRef GetValues(size_t from, size_t length) const { + CHECK_LE(static_cast(from + length), num_arguments_ + num_results_); + if (length == 0) return {}; + + return llvm::makeArrayRef(&value_or_attrs_[from], length); + } + + llvm::MutableArrayRef GetMutableValues(size_t from, size_t length) { + CHECK_LE(static_cast(from + length), num_arguments_ + num_results_); + if (length == 0) return {}; + return llvm::makeMutableArrayRef(&value_or_attrs_[from], length); + } + + protected: + int num_arguments_{}; + int num_results_{-1}; + + llvm::SmallVector value_or_attrs_; +}; + +std::ostream& operator<<(std::ostream& os, const KernelFrame& frame); + +class KernelFrameBuilder : public KernelFrame { + public: + void AddArgument(Value* value) { + CHECK(value); + CHECK_EQ(num_results_, -1) + << "Should call AddArgument before calling SetNumResults"; + value_or_attrs_.push_back(value); + ++num_arguments_; + } + + void SetResults(llvm::ArrayRef values) { + CHECK_EQ(num_arguments_, static_cast(value_or_attrs_.size())); + CHECK_EQ(num_results_, -1); + for (Value* x : values) { + value_or_attrs_.push_back(x); + } + num_results_ = values.size(); + } + + void SetNumResults(size_t n) { + CHECK_EQ(num_arguments_, static_cast(value_or_attrs_.size())); + CHECK_EQ(num_results_, -1); + num_results_ = n; + for (size_t i = 0; i < n; i++) { + value_or_attrs_.emplace_back(new Value); + } + } + + void SetResultAt(int result_id, Value* value) { + CHECK_EQ(static_cast(value_or_attrs_.size()), + num_arguments_ + num_results_) + << "Call SetNumResults first"; + CHECK_LT(result_id + num_arguments_, + static_cast(value_or_attrs_.size())); + CHECK(value); + value_or_attrs_[num_arguments_ + result_id]->set(value); + } + + void Reset() { + value_or_attrs_.clear(); + num_arguments_ = 0; + num_results_ = -1; + } +}; + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/kernel_registry.cc b/paddle/infrt/host_context/kernel_registry.cc new file mode 100644 index 0000000000000..f343dfc71b040 --- /dev/null +++ b/paddle/infrt/host_context/kernel_registry.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
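Before the registry implementation below, it may help to see how a kernel consumes a KernelFrame directly. The INFRT_KERNEL convenience macro wraps ordinary C++ functions, but its definition in kernel_utils.h is only partially shown here, so this sketch writes a frame-based kernel by hand using just the KernelFrame/KernelFrameBuilder API above; names are illustrative, and Value is assumed to expose get<T>() as the tests later in this patch use.

#include <iostream>

#include "paddle/infrt/host_context/kernel_frame.h"

namespace example {

// A kernel written directly against KernelFrame: reads two int arguments and
// emplaces one int result at index 0.
void AddI32Kernel(infrt::host_context::KernelFrame *frame) {
  const int a = frame->GetArgAt<int>(0);
  const int b = frame->GetArgAt<int>(1);
  frame->EmplaceResult<int>(a + b);
}

void Demo() {
  infrt::host_context::ValueRef a(1);
  infrt::host_context::ValueRef b(2);

  infrt::host_context::KernelFrameBuilder frame;
  frame.AddArgument(a.get());
  frame.AddArgument(b.get());
  frame.SetNumResults(1);  // must be set before the kernel writes results

  AddI32Kernel(&frame);
  std::cout << frame.GetResults()[0]->get<int>() << "\n";  // prints 3
}

}  // namespace example
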
+ +#include "paddle/infrt/host_context/kernel_registry.h" + +#include + +#include "glog/logging.h" +#include "llvm/ADT/SmallVector.h" + +namespace infrt { +namespace host_context { + +struct KernelRegistry::Impl { + std::unordered_map data; + std::unordered_map> attr_names; +}; + +KernelRegistry::KernelRegistry() : impl_(std::make_unique()) {} + +void KernelRegistry::AddKernel(const std::string &key, + KernelImplementation fn) { + CHECK(!impl_->data.count(key)) << "kernel [" << key + << "] is registered twice"; + impl_->data.emplace(key, fn); +} + +void KernelRegistry::AddKernelAttrNameList( + const std::string &key, const std::vector &names) { + CHECK(!impl_->attr_names.count(key)) + << "kernel [" << key << "] is registered twice in attribute names"; + impl_->attr_names.emplace( + key, llvm::SmallVector(names.begin(), names.end())); +} + +KernelImplementation KernelRegistry::GetKernel(const std::string &key) const { + auto it = impl_->data.find(key); + return it != impl_->data.end() ? it->second : KernelImplementation{}; +} + +std::vector KernelRegistry::GetKernelList() const { + std::vector res(impl_->data.size()); + for (auto i : impl_->data) { + res.push_back(i.first); + } + return res; +} + +KernelRegistry::~KernelRegistry() {} + +size_t KernelRegistry::size() const { return impl_->data.size(); } + +KernelRegistry *GetCpuKernelRegistry() { + static auto registry = std::make_unique(); + return registry.get(); +} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_registry.h b/paddle/infrt/host_context/kernel_registry.h new file mode 100644 index 0000000000000..d65969999f6ed --- /dev/null +++ b/paddle/infrt/host_context/kernel_registry.h @@ -0,0 +1,67 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +namespace infrt { +namespace host_context { + +class KernelFrame; + +using KernelImplementation = void (*)(KernelFrame *frame); + +/** + * Hold the kernels registered in the system. + */ +class KernelRegistry { + public: + KernelRegistry(); + + void AddKernel(const std::string &key, KernelImplementation fn); + void AddKernelAttrNameList(const std::string &key, + const std::vector &names); + + KernelImplementation GetKernel(const std::string &key) const; + std::vector GetKernelList() const; + + size_t size() const; + + ~KernelRegistry(); + + private: + class Impl; + + std::unique_ptr impl_; +}; + +//! The global CPU kernel registry. +KernelRegistry *GetCpuKernelRegistry(); + +} // namespace host_context +} // namespace infrt + +/** + * compile function RegisterKernels in C way to avoid C++ name mangling. 
+ */ +#ifdef __cplusplus +extern "C" { +#endif +void RegisterKernels(infrt::host_context::KernelRegistry *registry); +#ifdef __cplusplus +} +#endif diff --git a/paddle/infrt/host_context/kernel_registry_test.cc b/paddle/infrt/host_context/kernel_registry_test.cc new file mode 100644 index 0000000000000..f36ec2a1cac7d --- /dev/null +++ b/paddle/infrt/host_context/kernel_registry_test.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/kernel_registry.h" + +#include + +#include "paddle/infrt/host_context/kernel_utils.h" + +namespace infrt::host_context { + +int add_i32(int a, int b) { return a + b; } + +TEST(KernelRegistry, basic) { + KernelRegistry registry; + std::string key = "infrt.test.add.i32"; + registry.AddKernel(key, INFRT_KERNEL(add_i32)); + + auto* kernel_impl = registry.GetKernel(key); + ASSERT_TRUE(kernel_impl); + + ValueRef a(1); + ValueRef b(2); + KernelFrameBuilder fbuilder; + fbuilder.AddArgument(a.get()); + fbuilder.AddArgument(b.get()); + fbuilder.SetNumResults(1); + + kernel_impl(&fbuilder); + + auto results = fbuilder.GetResults(); + ASSERT_EQ(results.size(), 1UL); + ASSERT_EQ(results[0]->get(), 3); +} + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/kernel_utils.cc b/paddle/infrt/host_context/kernel_utils.cc new file mode 100644 index 0000000000000..cf9476da032be --- /dev/null +++ b/paddle/infrt/host_context/kernel_utils.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/kernel_utils.h" + +namespace infrt { +namespace host_context {} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_utils.h b/paddle/infrt/host_context/kernel_utils.h new file mode 100644 index 0000000000000..33812912ba029 --- /dev/null +++ b/paddle/infrt/host_context/kernel_utils.h @@ -0,0 +1,352 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include + +#include "paddle/infrt/host_context/kernel_frame.h" +#include "paddle/infrt/host_context/value.h" + +namespace infrt { +namespace host_context { + +template +class Argument { + public: + explicit Argument(ValueRef value) : value_(value) {} + + ValueRef& value() { return value_; } + const ValueRef& value() const { return value_; } + + T& get() const { return value_.get(); } + + private: + ValueRef value_; +}; + +/** + * RemainingArguments collects all remaining arguments in an ArrayRef. + */ +class RemainingArguments { + public: + explicit RemainingArguments(llvm::ArrayRef remaining_arguments) + : remaining_arguments_(remaining_arguments) {} + + llvm::ArrayRef values() const { return remaining_arguments_; } + size_t size() const { return remaining_arguments_.size(); } + const Value* operator[](size_t i) const { return remaining_arguments_[i]; } + + private: + llvm::ArrayRef remaining_arguments_; +}; + +/** + * RemainingResults collects all remaining results in a MutableArrayRef. + */ +class RemainingResults { + public: + explicit RemainingResults(llvm::MutableArrayRef remaining_results) + : remaining_results_(remaining_results) {} + llvm::MutableArrayRef values() { return remaining_results_; } + size_t size() const { return remaining_results_.size(); } + + template + const ValueRef& AllocateAt(int index) { + // eagerly create a ValueRef + if (remaining_results_[index].get()) return remaining_results_[index]; + remaining_results_[index] = ValueRef(new Value); + return remaining_results_[index]; + } + ValueRef& operator[](size_t i) const { return remaining_results_[i]; } + + private: + llvm::MutableArrayRef remaining_results_; +}; + +template +class Result { + public: + explicit Result(ValueRef* result) : result_(result) {} + + template + void Emplace(Args&&... args) { + ValueRef v; + Set(T(std::forward(args)...)); + } + + void Set(Argument argument) { + CHECK(!result_->IsValid()); + *result_ = argument.value(); + } + + private: + ValueRef* result_{}; +}; + +template +class Attribute { + public: + explicit Attribute(const Value* value) : value_(value) {} + + const T& get() const { return value_->get(); } + + private: + const Value* value_; +}; + +template +class ArgumentView { + using UnderlyingT = typename ViewT::UnderlyingT; + + public: + explicit ArgumentView(Value* value) + : value_(value), arg_(&value->template get()) {} + + Value* value() const { return value_; } + ViewT& get() const { return arg_; } + ViewT* operator->() const { return &get(); } + ViewT& operator*() const { return get(); } + + private: + Value* value_{}; + mutable ViewT arg_; +}; + +template +struct KernelImpl; + +template +struct TypeTag {}; + +#define INFRT_KERNEL(...) \ + ::infrt::host_context::KernelImpl::Invoke + +template +struct KernelImpl { + static void Invoke(KernelFrame* frame) { + KernelCallHelper>::template Invoke<0, 0, 0>(frame); + } + + // Helper that introspects the arguments to derive the signature and cast + // parts of the KernelFrame to their type before passing them to impl_fn. 
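  // As an illustrative trace (a sketch, not normative): for a plain function
  // `int add_i32(int a, int b)` wrapped with INFRT_KERNEL(add_i32), Invoke()
  // above unrolls roughly as
  //   KernelCallHelper<int, int, TypeTag<int>>::Invoke<0, 0, 0>(frame)
  //     -> fetches frame->GetArgAt(0) and casts it via Value::get<int>(),
  //        then recurses with in_idx = 1;
  //     -> does the same for the second int, recursing with in_idx = 2;
  //     -> the terminal TypeTag specialization calls KernelReturnHelper, which
  //        invokes add_i32(...) and stores the int result through
  //        HandleReturn -> StoreResultAt -> frame->EmplaceResult<int>(0, ...).
  // The in_idx / out_idx / const_idx counters track how many arguments,
  // results and attributes have been consumed so far; -1 marks that a
  // RemainingArguments / RemainingResults pack has already swallowed the rest.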
+ template + struct KernelCallHelper; + + // Casts the return value of the kernel, if non-void. + // bool _ is an unnecessary parameter to make compiler allow templace specific + // in non-namespace scope. + template + struct KernelReturnHelper { + static void Invoke(KernelFrame* frame, const Args&... args) { + HandleReturn(frame, impl_fn(args...)); + } + }; + + template + struct KernelReturnHelper { + static void Invoke(KernelFrame* frame, const Args&... args) { + impl_fn(args...); + } + }; + + // Specialization to cast a single input argument(Head). + template + struct KernelCallHelper, Tail...> { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + static_assert(in_idx != -1, + "Do not place Arguments after RemainingArguments"); + static_assert(out_idx == 0, "Arguments should appear before results"); + static_assert(const_idx == 0, + "Arguments and results should appear before attributes."); + + Argument arg(frame->GetArgAt(in_idx)); + KernelCallHelper< + Tail...>::template Invoke(frame, + pargs..., + arg); + } + }; + + template + struct KernelCallHelper, Tail...> { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + static_assert(in_idx != -1, + "Do not place Arguments after RemainingArguments"); + static_assert(out_idx == 0, "Arguments should appear before results"); + static_assert(const_idx == 0, + "Arguments and results should appear before attributes."); + + ArgumentView arg(frame->GetArgAt(in_idx)); + KernelCallHelper< + Tail...>::template Invoke(frame, + pargs..., + arg); + } + }; + + // Specialization to cast a single result argument (Head). + template + struct KernelCallHelper, Tail...> { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + static_assert(out_idx != -1, + "Do not place Results after RemainingResults"); + static_assert(const_idx == 0, + "Arguments and results should appear before attributes"); + Result arg(&frame->GetResults()[out_idx]); + KernelCallHelper< + Tail...>::template Invoke(frame, + pargs..., + arg); + } + }; + + // Specialization to cast a single attribute. + template + struct KernelCallHelper, Tail...> { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + static_assert(const_idx != -1, + "Do not place Attributes after RemainingAttributes"); + Attribute arg(frame->GetAttributeAt(const_idx)); + KernelCallHelper< + Tail...>::template Invoke(frame, + pargs..., + arg); + } + }; + + // Treat other pointer as an Argument. + template + struct KernelCallHelper { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + static_assert(in_idx != -1, + "Do not place Arguments after RemainingArguments"); + static_assert(out_idx == 0, "Arguments should appear before results"); + static_assert(const_idx == 0, + "Arguments and results should appear before attributes."); + auto* arg = &frame->GetArgAt(in_idx); + KernelCallHelper< + Tail...>::template Invoke(frame, + pargs..., + arg); + } + }; + + // Treat any other type as an Argument. + template + struct KernelCallHelper { + using ArgT = std::decay_t; + + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... 
pargs) { + static_assert(in_idx != -1, + "Do not place Arguments after RemainingArguments"); + static_assert(out_idx == 0, "Arguments should appear before results"); + static_assert(const_idx == 0, + "Arguments and results should appear before attributes."); + + auto* value = frame->GetArgAt(in_idx); + auto&& arg = value->get(); + + KernelCallHelper< + Tail...>::template Invoke(frame, + pargs..., + arg); + } + }; + + // RemainingArguments provides an ArrayRef containing all + // remaining arguments. Useful for variadic + // kernels. + template + struct KernelCallHelper { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + static_assert(in_idx != -1, + "Do not use more than one RemainingArguments"); + static_assert(out_idx == 0, "Arguments should appear before results."); + static_assert(const_idx == 0, + "Arguments and results should appear before attributes"); + RemainingArguments remaining_arguments( + frame->GetArguments().drop_front(in_idx)); + + KernelCallHelper::template Invoke<-1, out_idx, const_idx>( + frame, pargs..., remaining_arguments); + } + }; + + // RemainingResults provides an MutableArrayRef containing all + // remaining results. + template + struct KernelCallHelper { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + static_assert(out_idx != -1, "Do not use more than one RemainingResults"); + static_assert(const_idx == 0, + "Arguments and results should appear before attributes"); + llvm::MutableArrayRef returned_results = + frame->GetResults().drop_front(out_idx); + + llvm::SmallVector result_values; + for (size_t i = 0; i < returned_results.size(); i++) + result_values.emplace_back(returned_results[i]); + + RemainingResults remaining_results(result_values); + KernelCallHelper::template Invoke( + frame, pargs..., remaining_results); + } + }; + + // No arguments left. + template + struct KernelCallHelper> { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + KernelReturnHelper::Invoke(frame, pargs...); + } + }; + + // Handle pair result + template + static void HandleReturn(KernelFrame* frame, std::pair&& t) { + CHECK_EQ(frame->GetNumResults(), 2); + StoreResultAt(frame, 0, std::move(t.first)); + StoreResultAt(frame, 1, std::move(t.second)); + } + + // Store the function result back to the output Value in KernelFrame. + template + static void HandleReturn(KernelFrame* frame, T&& t) { + assert(frame->GetNumResults() == 1 && "Extra results passed to kernel."); + StoreResultAt(frame, 0, std::forward(t)); + } + + // Store result as an Value output in KernelFrame. + template + static void StoreResultAt(KernelFrame* frame, int index, T&& t) { + frame->EmplaceResult>(index, std::forward(t)); + } +}; + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_utils_test.cc b/paddle/infrt/host_context/kernel_utils_test.cc new file mode 100644 index 0000000000000..1904eb106a293 --- /dev/null +++ b/paddle/infrt/host_context/kernel_utils_test.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/kernel_utils.h" + +#include + +namespace infrt::host_context { + +int add_i32(int a, int b) { return a + b; } +float add_f32(float a, float b) { return a + b; } +std::pair add_pair(int a, float b) { return {a, b}; } + +TEST(KernelImpl, i32) { + KernelFrameBuilder fbuilder; + ValueRef a(new Value(1)); + ValueRef b(new Value(2)); + fbuilder.AddArgument(a.get()); + fbuilder.AddArgument(b.get()); + fbuilder.SetNumResults(1); + + INFRT_KERNEL(add_i32)(&fbuilder); + auto results = fbuilder.GetResults(); + ASSERT_EQ(results.size(), 1UL); + ASSERT_EQ(results.front()->get(), 3); +} + +TEST(KernelImpl, f32) { + KernelFrameBuilder fbuilder; + ValueRef a(new Value(1.f)); + ValueRef b(new Value(2.f)); + fbuilder.AddArgument(a.get()); + fbuilder.AddArgument(b.get()); + fbuilder.SetNumResults(1); + + INFRT_KERNEL(add_f32)(&fbuilder); + auto results = fbuilder.GetResults(); + ASSERT_EQ(results.size(), 1UL); + ASSERT_EQ(results.front()->get(), 3.f); +} + +TEST(KernelImpl, pair) { + KernelFrameBuilder fbuilder; + ValueRef a(new Value(1)); + ValueRef b(new Value(3.f)); + + fbuilder.AddArgument(a.get()); + fbuilder.AddArgument(b.get()); + fbuilder.SetNumResults(2); + + INFRT_KERNEL(add_pair)(&fbuilder); + auto results = fbuilder.GetResults(); + ASSERT_EQ(results.size(), 2UL); + ASSERT_EQ(results[0]->get(), 1); + ASSERT_EQ(results[1]->get(), 3.f); +} + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc new file mode 100644 index 0000000000000..b0d70af5ef9f2 --- /dev/null +++ b/paddle/infrt/host_context/mlir_exec.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
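Beyond the plain-value kernels exercised by the tests above, a kernel can take the wrapper types from kernel_utils.h; the static_asserts in KernelCallHelper require all arguments first, then results, then attributes. A small sketch under that contract; the Scale kernel and the "demo.scale.f32" key are made up for illustration.

#include "paddle/infrt/host_context/kernel_registry.h"
#include "paddle/infrt/host_context/kernel_utils.h"

namespace demo {

// Multiplies the input by an attribute-carried factor; the float return value
// is written into the frame's single result slot via HandleReturn.
float Scale(float x, infrt::host_context::Attribute<float> factor) {
  return x * factor.get();
}

// Illustrative registration helper.
void RegisterDemoKernels(infrt::host_context::KernelRegistry* registry) {
  registry->AddKernel("demo.scale.f32", INFRT_KERNEL(Scale));
}

}  // namespace demo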
+ +#include + +#include +#include + +#include "llvm/Support/DynamicLibrary.h" +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" +#include "paddle/infrt/kernel/basic_kernels.h" +#include "paddle/infrt/kernel/control_flow_kernels.h" +#include "paddle/infrt/kernel/tensor_kernels.h" +#include "paddle/infrt/kernel/tensor_shape_kernels.h" +#include "paddle/infrt/kernel/test_kernels.h" + +static llvm::cl::list cl_shared_libs( // NOLINT + "shared_libs", + llvm::cl::desc("Specify shared library with kernels."), + llvm::cl::ZeroOrMore, + llvm::cl::MiscFlags::CommaSeparated); + +int main(int argc, char** argv) { + using namespace llvm; // NOLINT + using namespace infrt; // NOLINT + cl::opt input_file("i", + cl::desc("Specify input filename"), + cl::value_desc("input file name")); + cl::ParseCommandLineOptions(argc, argv); + + mlir::MLIRContext* context = infrt::Global::getMLIRContext(); + auto module = dialect::LoadMlirFile(input_file.c_str(), context); + + host_context::KernelRegistry registry; + + kernel::RegisterBasicKernels(®istry); + kernel::RegisterTestKernels(®istry); + kernel::RegisterTensorShapeKernels(®istry); + kernel::RegisterTensorKernels(®istry); + kernel::RegisterControlFlowKernels(®istry); + + // load extra shared library + for (const auto& lib_path : cl_shared_libs) { + std::string err; + llvm::sys::DynamicLibrary dynLib = + llvm::sys::DynamicLibrary::getPermanentLibrary(lib_path.c_str(), &err); + if (!dynLib.isValid()) { + llvm::errs() << "Load shared library failed. Error: " << err << "\n"; + return 1; + } + if (auto reg_sym = dynLib.SearchForAddressOfSymbol("RegisterKernels")) { + auto reg_func = + reinterpret_cast(reg_sym); + reg_func(®istry); + } else { + llvm::outs() << "Symbol \"RegisterKernels\" not found in \"" << lib_path + << "\". Skip.\n"; + } + } + + host_context::TestMlir(module.get(), ®istry); + + std::cout << std::endl; + return 0; +} diff --git a/paddle/infrt/host_context/mlir_function_executable.cc b/paddle/infrt/host_context/mlir_function_executable.cc new file mode 100644 index 0000000000000..5f8dacf8e448a --- /dev/null +++ b/paddle/infrt/host_context/mlir_function_executable.cc @@ -0,0 +1,135 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
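The executor above resolves a RegisterKernels symbol in every library passed through its shared_libs option, so extra kernels can be added without rebuilding the tool. A minimal sketch of such a library, assuming only the registry and kernel-utils headers from this patch; the file name, the kernel and its key are invented.

// my_kernels.cc -- built as a shared object and loaded through shared_libs.
#include "paddle/infrt/host_context/kernel_registry.h"
#include "paddle/infrt/host_context/kernel_utils.h"

float sub_f32(float a, float b) { return a - b; }

// Matches the extern "C" hook declared in kernel_registry.h; the executor
// finds it with DynamicLibrary::SearchForAddressOfSymbol("RegisterKernels").
extern "C" void RegisterKernels(
    infrt::host_context::KernelRegistry* registry) {
  registry->AddKernel("external.sub.f32", INFRT_KERNEL(sub_f32));
}

The resulting shared object would then be supplied next to the -i <program.mlir> argument on the tool's command line.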
+ +#include "paddle/infrt/host_context/mlir_function_executable.h" + +#include + +#include // NOLINT + +#include "paddle/infrt/common/common.h" +#include "paddle/infrt/host_context/core_runtime.h" + +namespace infrt { +namespace host_context { + +template +std::string DumpToString(T& op) { // NOLINT + std::string buffer; + llvm::raw_string_ostream os(buffer); + op.print(os); + os.flush(); + return buffer; +} + +MlirFunctionExecutable::MlirFunctionExecutable( + mlir::FuncOp func_op, + KernelRegistry* kernel_registry, + MlirToRuntimeTranslator::function_defs_t& function_table) + : Function(func_op.getName().str(), + func_op.getNumArguments(), + func_op.getNumResults()), + MlirToRuntimeTranslator(&core_runtime_builder_), + region_(&func_op.getRegion()), + core_runtime_builder_(kernel_registry), + function_table_(function_table) {} + +MlirFunctionExecutable::MlirFunctionExecutable( + mlir::Region* region, + mlir::FunctionType func_type, + KernelRegistry* kernel_registry, + MlirToRuntimeTranslator::function_defs_t& function_table) + : Function("", func_type.getNumInputs(), func_type.getNumResults()), + MlirToRuntimeTranslator(&core_runtime_builder_), + region_(region), + core_runtime_builder_(kernel_registry), + function_table_(function_table) {} + +void MlirFunctionExecutable::BuildExecutables( + llvm::ArrayRef arguments, + llvm::MutableArrayRef results, + bool is_region) { + CHECK_EQ(arguments.size(), num_arguments()); + // We use the function call's arguments as op_executable's operands to avoid + // copy. + for (size_t i = 0; i < num_arguments(); i++) { + AddValue(region_->getArgument(i), arguments[i]); + } + + // build the program + auto& blocks = region_->getBlocks(); + CHECK_EQ(blocks.size(), 1UL) + << "function with more than one block is not supported yet"; + + llvm::SmallVector runtime_results; + for (auto& op : blocks.front()) { + if (EmitConstantOp(&op)) continue; + if (EmitBuildShapeOp(&op)) continue; + + llvm::SmallVector mlir_results; + if (EmitReturnOp(&op, &mlir_results)) { + if (!is_region) { + for (auto v : mlir_results) { + runtime_results.push_back(GetValue(v)); + } + } + continue; + } + + if (EmitCallOp(&op, &function_table_)) continue; + + if (EmitGeneralOp(&op)) continue; + LOG(FATAL) << "Not supported op: " << DumpToString(op); + } + + // after the block is built, we can get the result values of the whole + // function call in the runtime_results. + + mlir::SmallVector results_copied; + if (!is_region) { + for (ValueRef& x : results) { + results_copied.push_back(x.get()); + } + } + + // set a lambda function to help copy the results from the runtime results in + // the local function to outer program. + CHECK_EQ(results_copied.size(), runtime_results.size()); + this->copy_res_fn_ = [results_copied, runtime_results] { + VLOG(4) << "copy results to result"; + for (size_t i = 0; i < results_copied.size(); i++) { + VLOG(4) << ".. 
copy " << runtime_results[i] << " to " + << results_copied[i]; + CopyTo(*runtime_results[i], results_copied[i]); + } + }; +} + +void MlirFunctionExecutable::Execute(llvm::ArrayRef arguments, + llvm::MutableArrayRef results, + bool is_region) const { + CHECK_EQ(arguments.size(), num_arguments()); + CHECK_EQ(results.size(), num_results()); + + if (core_runtime_builder_.num_ops() == 0) { + Reference(this).BuildExecutables(arguments, results, is_region); + } + + Reference(&core_runtime_builder_).Execute(); + + copy_res_fn_(); +} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_function_executable.h b/paddle/infrt/host_context/mlir_function_executable.h new file mode 100644 index 0000000000000..ba5fa154d6fcc --- /dev/null +++ b/paddle/infrt/host_context/mlir_function_executable.h @@ -0,0 +1,78 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include +#include + +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/function.h" +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" + +namespace infrt { +namespace host_context { + +struct KernelRegistry; + +/** + * Executable function for a given MLIR function definition, mainly used in two + * scenerios: + * 1. infrt.call op + * 2. main function call + * + * A MlirFunctionExecutable might have one or more arguments and results. + */ +class MlirFunctionExecutable : public Function, public MlirToRuntimeTranslator { + public: + using function_defs_t = std::unordered_map; + + MlirFunctionExecutable(mlir::FuncOp func_op, + KernelRegistry* kernel_registry, + function_defs_t& function_table); // NOLINT + + MlirFunctionExecutable( + mlir::Region* region, + mlir::FunctionType func_type, + KernelRegistry* kernel_registry, + MlirToRuntimeTranslator::function_defs_t& function_table); // NOLINT + + /** + * Execute the function with the given arguments and results. + * NOTE the \param arguments and \param results should not be altered. + */ + void Execute(llvm::ArrayRef arguments, + llvm::MutableArrayRef results, + bool is_region = false) const; + + private: + /** + * Build the runtime executables once the function call arguments and results + * are passed in. + * This will trigger in the first execution. + */ + void BuildExecutables(llvm::ArrayRef arguments, + llvm::MutableArrayRef results, + bool is_region); + + private: + mlir::Region* region_{}; + CoreRuntimeBuilder core_runtime_builder_; + MlirToRuntimeTranslator::function_defs_t& function_table_; + std::function copy_res_fn_; +}; + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_program_executor.cc b/paddle/infrt/host_context/mlir_program_executor.cc new file mode 100644 index 0000000000000..c5009bcc97c5c --- /dev/null +++ b/paddle/infrt/host_context/mlir_program_executor.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/mlir_program_executor.h" + +namespace infrt { +namespace host_context {} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_program_executor.h b/paddle/infrt/host_context/mlir_program_executor.h new file mode 100644 index 0000000000000..b2af4d2d79db5 --- /dev/null +++ b/paddle/infrt/host_context/mlir_program_executor.h @@ -0,0 +1,79 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_function_executable.h" +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" +#include "paddle/infrt/host_context/op_executable.h" + +namespace infrt { +namespace host_context { + +/** + * This get a MLIR program as input, it compiles it into runtime program, and + * one can retrieve the function and execute + * it by passing the input arguments. + */ +class MlirProgramExecutor : public MlirToRuntimeTranslator { + public: + CoreRuntimeBuilder runtime_builder; + mlir::ModuleOp module; + function_defs_t function_defs; + + MlirProgramExecutor(mlir::ModuleOp module, KernelRegistry* registry) + : MlirToRuntimeTranslator(module, &runtime_builder), + runtime_builder(registry), + module(module) {} + + // Build functions and generate executables. 
+ void BuildFunctions() { EmitFunctions(); } + + void EmitFunction(mlir::FuncOp op) override { + LOG(INFO) << "Emit function: " << op.getName().str(); + function_defs[op.getName().str()] = op; + + func_executables_.emplace( + op.getName().str(), + new MlirFunctionExecutable( + op, runtime_builder.kernel_registry(), function_defs)); + } + + MlirFunctionExecutable* LookupFunc(const std::string& name) { + auto it = func_executables_.find(name); + if (it != func_executables_.end()) { + return it->second.get(); + } + return nullptr; + } + + private: + std::unordered_map> + func_executables_; +}; + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_tests/basic.mlir b/paddle/infrt/host_context/mlir_tests/basic.mlir new file mode 100644 index 0000000000000..263d5884134b1 --- /dev/null +++ b/paddle/infrt/host_context/mlir_tests/basic.mlir @@ -0,0 +1,30 @@ +// CHECK-LABEL: basic +func @basic() -> f32 { + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + + // CHECK: 1 + "infrt.print.f32"(%v0) : (f32) -> () + // CHECK: 2 + "infrt.print.f32"(%v1) : (f32) -> () + + // CHECK: 3 + "infrt.print.f32"(%v2) : (f32) -> () + + %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + + // CHECK: 6 + "infrt.print.f32"(%v3) : (f32) -> () + + infrt.return %v3 : f32 +} + +// CHECK-LABEL: basic1 +// Check the mlir executor can work with more than one function in a file. +func @basic1() -> () { + %v0 = infrt.constant.f32 1.0 + "infrt.print.f32"(%v0) : (f32) -> () + // CHECK: 1 + infrt.return +} \ No newline at end of file diff --git a/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir b/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir new file mode 100644 index 0000000000000..83afa1db8a91c --- /dev/null +++ b/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir @@ -0,0 +1,9 @@ +// CHECK-LABEL: build_tensor1 +func @build_tensor1() { + %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor) {value=1.0:f32} + // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + dt.print_tensor (%a : !infrt.tensor) + + infrt.return +} diff --git a/paddle/infrt/host_context/mlir_tests/shape.mlir b/paddle/infrt/host_context/mlir_tests/shape.mlir new file mode 100644 index 0000000000000..a3130857b0ef7 --- /dev/null +++ b/paddle/infrt/host_context/mlir_tests/shape.mlir @@ -0,0 +1,7 @@ +// CHECK-LABEL: build_tensor1 +func @build_tensor1() { + %a = ts.build_shape [1:i64, 57:i64, 92:i64] + // CHECK: shape[1,57,92] + ts.print_shape %a + infrt.return +} \ No newline at end of file diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc new file mode 100644 index 0000000000000..25324b1291582 --- /dev/null +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -0,0 +1,558 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "boost/optional.hpp" +#include "paddle/infrt/common/string.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/dialect/tensor_shape.h" +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/kernel_frame.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_function_executable.h" +#include "paddle/infrt/host_context/op_executable.h" +#include "paddle/infrt/host_context/value.h" +#include "paddle/infrt/tensor/tensor_shape.h" + +namespace infrt::host_context { + +template +std::string DumpToString(T& op) { // NOLINT + std::string buffer; + llvm::raw_string_ostream os(buffer); + op.print(os); + os.flush(); + return buffer; +} + +struct MlirToRuntimeTranslator::Impl { + mlir::ModuleOp module; + // The runtime for a function call. + CoreRuntimeBuilder* runtime{}; + // The current working op, the translator process the ops one by one, each + // time it updates `cur_op` here to current op + // working on. + OpExecutableBuilder* cur_op{}; + + // record the current function name. + std::string cur_func_name; + + // Name to function definitions. + std::unordered_map func_defs; + + // Map from an operation to its results. + std::unordered_map> op_results; + llvm::DenseMap value_map; +}; + +bool MlirToRuntimeTranslator::EmitConstantOp(mlir::Operation* op) { + if (!infrt::Startswith(op->getName().getStringRef().str(), "infrt.constant")) + return false; + VLOG(3) << "Emitting constant op [" << op->getName().getStringRef().str() + << "]"; + + auto attr = op->getAttr("value"); + if (attr.isa()) { + if (attr.getType().isF32()) { + impl_->op_results[op] = {ValueRef( + static_cast(attr.cast().getValueAsDouble()))}; + } else if (attr.getType().isF64()) { + impl_->op_results[op] = {ValueRef(static_cast( + attr.cast().getValueAsDouble()))}; + } else { + LOG(FATAL) << "Not supported attribute type"; + } + return true; + } + + if (attr.isa()) { + if (attr.getType().isInteger(32)) { + impl_->op_results[op] = {ValueRef( + static_cast(attr.cast().getSInt()))}; + } else if (attr.getType().isInteger(64)) { + impl_->op_results[op] = {ValueRef( + static_cast(attr.cast().getSInt()))}; + } else if (attr.getType().isInteger(1)) { + impl_->op_results[op] = { + ValueRef(static_cast(attr.cast().getInt()))}; + } else { + LOG(FATAL) << "Not supported attribute type"; + } + return true; + } + + LOG(FATAL) << "Not supported constant attribute type"; + return true; +} + +template <> +boost::optional MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute* attr) { + if (!attr->isa()) return boost::none; + if (attr->isa()) { + auto val = attr->cast(); + if (val.getType().isInteger(32)) { + return val.getInt(); + } + } + return boost::none; +} +template <> +boost::optional MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute* attr) { + if (!attr->isa()) return boost::none; + if (attr->isa()) { + auto val = attr->cast(); + if (val.getType().isInteger(64)) { + return val.getInt(); + } + } + return boost::none; +} + +// TODO(Superjomn) Make double and float parsing share some thing. 
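// As a concrete example of the dispatch contract used below: for an op
// attribute `value = 2.0 : f32`, EmitAttribute<int32_t>(&attr) yields
// boost::none while EmitAttribute<float>(&attr) yields 2.0f, so EmitGeneralOp
// can simply probe the supported types one after another and wrap the first
// hit in a host_context::Value.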
+template <> +boost::optional MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute* attr) { + if (!attr->isa()) return boost::none; + if (attr->isa()) { + auto val = attr->cast(); + if (val.getType().isF32()) return val.getValueAsDouble(); + } + return boost::none; +} + +template <> +boost::optional MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute* attr) { + if (!attr->isa()) return boost::none; + if (attr->isa()) { + auto val = attr->cast(); + if (val.getType().isF64()) return val.getValueAsDouble(); + } + return boost::none; +} + +template <> +boost::optional MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute* attr) { + if (!attr->isa()) return boost::none; + return attr->cast().getValue().str(); +} + +#define PROCESS_ARRAY_INT(type__, bits__) \ + template <> \ + boost::optional> MlirToRuntimeTranslator::EmitAttribute( \ + const mlir::Attribute* attr) { \ + if (!attr->isa()) return boost::none; \ + auto array = attr->cast(); \ + CHECK(!array.empty()); \ + \ + if (!array[0].getType().isInteger(bits__)) { \ + return boost::none; \ + } \ + \ + std::vector res; \ + for (auto& v : array) { \ + res.push_back(v.cast().getInt()); \ + } \ + return res; \ + } + +PROCESS_ARRAY_INT(int16_t, 16); +PROCESS_ARRAY_INT(int32_t, 32); +PROCESS_ARRAY_INT(int64_t, 64); + +template <> +boost::optional> MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute* attr) { + if (!attr->isa()) return boost::none; + auto array = attr->cast(); + CHECK(!array.empty()); + + if (!array[0].getType().isF32()) return boost::none; + + std::vector res; + for (auto& v : array) { + res.push_back(v.cast().getValueAsDouble()); + } + return res; +} + +template <> +boost::optional> MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute* attr) { + if (!attr->isa()) return boost::none; + auto array = attr->cast(); + CHECK(!array.empty()); + + if (!array[0].getType().isF64()) return boost::none; + + std::vector res; + for (auto& v : array) { + res.push_back(v.cast().getValueAsDouble()); + } + return res; +} + +static bool IsReturn(mlir::Operation* op) { + return op->getName().getStringRef() == "infrt.return"; +} + +bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { + CHECK(impl_->runtime); + impl_->cur_op = + impl_->runtime->NewOpExecutable(op->getName().getStringRef().str()); + + VLOG(3) << "processing general op : " << op->getName().getStringRef().str(); + + // process operands + for (int i = 0, e = op->getNumOperands(); i < e; i++) { + // function argument as value + auto operand = op->getOperand(i); + if (operand.getKind() == mlir::Value::Kind::BlockArgument) { + mlir::BlockArgument arg = operand.dyn_cast(); + Value* arg_value = GetValue(arg); + impl_->cur_op->AppendArgument(arg_value); + VLOG(3) << "* op mlir operand: " << DumpToString(arg) << " " + << GetValue(arg); + continue; + } + + // normal value + Value* arg_value = GetValue(operand); + if (!arg_value) { + auto upstream_op = operand.getDefiningOp(); + arg_value = GetOpResult(upstream_op); + } + CHECK(arg_value) << "No-exist argument value found: " + << DumpToString(operand); + impl_->cur_op->AppendArgument(arg_value); + + VLOG(3) << "* op mlir operand: " << DumpToString(operand) << " " + << GetValue(operand) << " vs " << arg_value; + } + + // process results + llvm::SmallVector res_values; + for (int i = 0, e = op->getNumResults(); i < e; i++) { + auto res = op->getResult(i); + res_values.push_back(AddValue(res)); + + VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res); + } + 
impl_->cur_op->SetResults(res_values); + +#ifdef INFRT_DEBUG + { + VLOG(3) << "check result"; + for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) { + VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i]; + } + } +#endif + + // process attributes + auto attrs = op->getAttrs(); + + for (size_t i = 0; i < attrs.size(); i++) { + auto& attr = attrs[i]; + if (auto v = EmitAttribute(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(*v)); + } else if (auto v = EmitAttribute(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(*v)); + } else if (auto v = EmitAttribute(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(*v)); + } else if (auto v = EmitAttribute(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(*v)); + } else if (auto v = EmitAttribute(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + } else if (auto v = EmitAttribute>(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + } else if (auto v = EmitAttribute>(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + } else if (auto v = EmitAttribute>(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + } else if (auto v = EmitAttribute>(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + } else if (auto v = EmitAttribute>(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + } else { + LOG(FATAL) << "Not supported attribute type"; + } + } + + // process regions, we treat regions as attribute. + auto num_regions = op->getNumRegions(); + if (num_regions > 0) { + CHECK_EQ(num_regions, 1UL) + << "op with more than one region is not supported yet."; + auto& region = op->getRegions().front(); + auto num_blocks = region.getBlocks().size(); + CHECK_EQ(num_blocks, 1UL) + << "region with more than one block is not supported yet."; + + // process arguments + llvm::SmallVector inputs; + auto& block = region.getBlocks().front(); + for (auto arg : block.getArguments()) inputs.push_back(arg.getType()); + + // process results + // NOTE: if an op contains a region, we simply ignore the region's return + // values, + // or its return values will conflict with op's return values. + llvm::SmallVector results; + + auto func_type = + mlir::FunctionType::get(inputs, results, region.getContext()); + auto* function = impl_->cur_op->CreateFunctionExecutable( + ®ion, func_type, &impl_->func_defs); + impl_->cur_op->AppendAttribute(new Value(function)); + } + + return true; +} + +bool MlirToRuntimeTranslator::EmitReturnOp( + mlir::Operation* op, llvm::SmallVectorImpl* results) { + CHECK(results); + if (op->getName().getStringRef() == "infrt.return") { + for (size_t i = 0; i < op->getNumOperands(); i++) { + results->push_back(op->getOperand(i)); + } + + return true; + } + return false; +} + +bool MlirToRuntimeTranslator::EmitFunctions() { + for (auto func_op : impl_->module.getOps()) { + EmitFunction(func_op); + } + return true; +} + +void MlirToRuntimeTranslator::EmitFunction(mlir::FuncOp op) { + impl_->func_defs[op.getName().str()] = op; +} + +Value* MlirToRuntimeTranslator::GetOpResult(mlir::Operation* op) { + auto it = impl_->op_results.find(op); + return it == impl_->op_results.end() ? nullptr : it->second.front().get(); +} + +Value* MlirToRuntimeTranslator::GetValue(mlir::Value value) { + auto it = impl_->value_map.find(value); + return it == impl_->value_map.end() ? 
nullptr : it->second.get(); +} + +Value* MlirToRuntimeTranslator::AddValue(mlir::Value value) { + auto res = impl_->value_map.try_emplace(value, ValueRef(new Value)); + CHECK(res.second) << "Duplicate add mlir value [" << DumpToString(value) + << "]"; + return res.first->second.get(); +} + +MlirToRuntimeTranslator::~MlirToRuntimeTranslator() {} + +void MlirToRuntimeTranslator::UpdateCurFuncName(const std::string& name) { + impl_->cur_func_name = std::string(name); +} + +MlirToRuntimeTranslator::MlirToRuntimeTranslator(mlir::ModuleOp module, + CoreRuntimeBuilder* runtime) + : impl_(new Impl) { + CHECK(runtime); + impl_->module = module; + impl_->runtime = runtime; +} + +bool MlirToRuntimeTranslator::EmitBuildShapeOp(mlir::Operation* op) { + if (op->getName().getStringRef() != "ts.build_shape") return false; + + auto value = op->getAttr("value"); + + CHECK(value.isa()); + auto values = value.cast().getValue(); + std::vector dims; + for (auto& attr_v : values) { + dims.push_back(attr_v.cast().getInt()); + } + impl_->op_results[op] = { + ValueRef(new Value(tensor::TensorShape(llvm::ArrayRef(dims))))}; + + return true; +} + +bool MlirToRuntimeTranslator::EmitCallOp(mlir::Operation* op, + function_defs_t* function_table) { + CHECK(op); + CHECK(function_table); + if (op->getName().getStringRef() != "infrt.call") return false; + + impl_->cur_op = + impl_->runtime->NewOpExecutable(op->getName().getStringRef().str()); + + auto callee = op->getAttr("callee"); + auto callee_name = callee.dyn_cast(); + + // process arguments + for (size_t i = 0; i < op->getNumOperands(); i++) { + auto operand = op->getOperand(i); + auto* arg_value = GetValue(operand); + + if (!arg_value) { + auto upstream_op = operand.getDefiningOp(); + arg_value = GetOpResult(upstream_op); + } + CHECK(arg_value) << "No-exist argument value found: " + << DumpToString(operand); + impl_->cur_op->AppendArgument(arg_value); + } + + // process results + llvm::SmallVector res_values; + for (int i = 0, e = op->getNumResults(); i < e; i++) { + auto res = op->getResult(i); + res_values.push_back(AddValue(res)); + } + impl_->cur_op->SetResults(res_values); + + // process attribute + auto& table = function_table ? *function_table : impl_->func_defs; + { + // lookup the callee function + auto it = table.find(callee_name.getValue().str()); + CHECK(it != table.end()) << "can't find function [" + << callee_name.getValue().str() << "]"; + auto* function = + impl_->cur_op->CreateFunctionExecutable(it->second, &impl_->func_defs); + impl_->cur_op->AppendAttribute(new Value(function)); + } + + VLOG(3) << "Emit call " << callee_name.getValue().str() << " " + << impl_->cur_op->frame(); + return true; +} + +MlirToRuntimeTranslator::MlirToRuntimeTranslator(CoreRuntimeBuilder* runtime) + : impl_(new Impl) { + CHECK(runtime); + impl_->runtime = runtime; +} + +Value* MlirToRuntimeTranslator::AddValue(mlir::Value mlir_value, Value* value) { + auto it = impl_->value_map.try_emplace(mlir_value, ValueRef(value)); + CHECK(it.second) << "duplicate add value " << DumpToString(mlir_value); + return value; +} + +void MlirToRuntimeTranslate(mlir::ModuleOp module, + CoreRuntimeBuilder* runtime) { + MlirToRuntimeTranslator(module, runtime).Run(); +} + +/** + * Execute the mlir program in test mode -- print some debug information to + * stdout. 
+ */ +class MlirProgramTestExecutor : public MlirToRuntimeTranslator { + public: + CoreRuntimeBuilder core_runtime; + + MlirProgramTestExecutor(mlir::ModuleOp module, KernelRegistry* registry) + : MlirToRuntimeTranslator(module, &core_runtime), + core_runtime(registry), + registry(registry) { + CHECK(registry); + } + + void Run() { + EmitFunctions(); + + CHECK(registry); + for (auto func_op : impl_->module.getOps()) { + VLOG(3) << "Running function " << func_op.getName().str(); + EmitAndRunFuncWithoutArguments(func_op); + } + } + + protected: + std::unordered_map func_def_table; + + void EmitFunction(mlir::FuncOp op) override { + CHECK(!impl_->func_defs.count(op.getName().str())) + << "Duplicate function defition found for function [" + << op.getName().str(); + impl_->func_defs.emplace(op.getName().str(), op); + } + + private: + void EmitAndRunFuncWithoutArguments(mlir::FuncOp func) { + // print the function name for llvm FileChecker macro, CHECK-LABEL + std::cout << '@' << func.getName().str() << std::endl; + if (func.getNumArguments() == + 0) { // an entry function, execute it immediately + VLOG(3) << "executing function " << func.getName().str(); + // Emit and execute each function + CoreRuntimeBuilder runtime(registry); + impl_->runtime = &runtime; + + auto& blocks = func.getBlocks(); + CHECK_EQ(blocks.size(), 1UL) + << "function with more than one block is not supported yet"; + + for (auto& op : blocks.front()) { + if (EmitConstantOp(&op)) continue; + if (EmitBuildShapeOp(&op)) continue; + llvm::SmallVector results; + if (EmitReturnOp(&op, &results)) continue; + if (EmitCallOp(&op, &impl_->func_defs)) continue; + if (EmitGeneralOp(&op)) continue; + LOG(FATAL) << "Not supported op: " << DumpToString(op); + } + + runtime.Execute(); + + } else { + VLOG(2) << "get an callable function: " << func.getName().str(); + } + } + + private: + KernelRegistry* registry{}; +}; + +void TestMlir(mlir::ModuleOp module, KernelRegistry* registry) { + MlirProgramTestExecutor execute(module, registry); + execute.Run(); +} + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.h b/paddle/infrt/host_context/mlir_to_runtime_translate.h new file mode 100644 index 0000000000000..598e81bfd96d8 --- /dev/null +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.h @@ -0,0 +1,107 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include // NOLINT +#include //NOLINT +#include // NOLINT + +namespace mlir { +class FuncOp; +class ModuleOp; +class Operation; +class Attribute; +class Value; +} // namespace mlir + +namespace infrt::host_context { + +class CoreRuntimeBuilder; +class Value; +class ValueRef; +class KernelRegistry; + +/** + * MlirToRuntimeTranslator helps to translate a MLIR program to a CoreRuntime. + * This is the base class of all the modules those parse a MLIR program and + * finally generate a CoreRuntime. 
+ */ +class MlirToRuntimeTranslator { + public: + //! Holds all the function definitions. + using function_defs_t = std::unordered_map; + + explicit MlirToRuntimeTranslator(CoreRuntimeBuilder* runtime); + MlirToRuntimeTranslator(mlir::ModuleOp module, CoreRuntimeBuilder* runtime); + + void Run() { EmitFunctions(); } + + virtual ~MlirToRuntimeTranslator(); + + protected: + //! Emit a "infrt.constant.*" operation, return true if succeed. + bool EmitConstantOp(mlir::Operation* op); + //! Emit a "infrt.return" operation. + bool EmitReturnOp(mlir::Operation* op, + llvm::SmallVectorImpl* results); + //! Emit a "ts.build_shape" operation. + bool EmitBuildShapeOp(mlir::Operation* op); + //! Emit an operation other than the special cases above. + bool EmitGeneralOp(mlir::Operation* op); + //! Emit all the functions. + bool EmitFunctions(); + + //! Emit a single function, this is an API that should be implemented by + //! inherients. + virtual void EmitFunction(mlir::FuncOp op); + + bool EmitCallOp(mlir::Operation* op, function_defs_t* function_table); + + template + boost::optional EmitAttribute(const mlir::Attribute* attr); + + Value* GetOpResult(mlir::Operation* op); + + Value* GetValue(mlir::Value value); + + Value* AddValue(mlir::Value value); + + Value* AddValue(mlir::Value mlir_value, Value* value); + + void UpdateCurFuncName(const std::string& name); + + protected: + struct Impl; + std::unique_ptr impl_; +}; + +/** + * Build a CoreRuntime from a MLIR module. + */ +void MlirToRuntimeTranslate(mlir::ModuleOp module, CoreRuntimeBuilder* runtime); + +/** + * Execute a MLIR program, that is execute all the functions without input + * arguments. + * This is mainly used by testcase. + * @param module a MLIR module. + * @param registry the kernel registry containing all the valid kernels. + */ +void TestMlir(mlir::ModuleOp module, KernelRegistry* registry); + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc new file mode 100644 index 0000000000000..9b85be977ab6c --- /dev/null +++ b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc @@ -0,0 +1,160 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
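The translator header above underpins both TestMlir and the MlirProgramExecutor shown earlier; the test file that follows drives MlirProgramExecutor at scale. In miniature, and assuming a module whose @predict function matches the argument and result counts supplied by the caller, the call path looks like this sketch.

#include "paddle/infrt/host_context/kernel_registry.h"
#include "paddle/infrt/host_context/mlir_program_executor.h"

void CallMlirFunc(mlir::ModuleOp module,
                  infrt::host_context::KernelRegistry* registry) {
  using namespace infrt::host_context;  // NOLINT

  MlirProgramExecutor executor(module, registry);
  executor.BuildFunctions();  // one MlirFunctionExecutable per func op

  MlirFunctionExecutable* fn = executor.LookupFunc("predict");
  if (!fn) return;

  // The sizes of both vectors must match the MLIR function's signature.
  std::vector<Value*> in_args;                          // caller-owned inputs
  std::vector<ValueRef> out_args{ValueRef(new Value)};  // result slots

  fn->Execute(
      llvm::ArrayRef<Value*>(in_args.data(), in_args.size()),
      llvm::MutableArrayRef<ValueRef>(out_args.data(), out_args.size()));
}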
+ +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" + +#include +#include + +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/host_context/mlir_program_executor.h" +#include "paddle/infrt/kernel/basic_kernels.h" +#include "paddle/infrt/kernel/control_flow_kernels.h" +#include "paddle/infrt/kernel/tensor_kernels.h" +#include "paddle/infrt/kernel/tensor_shape_kernels.h" +#include "paddle/infrt/kernel/test_kernels.h" + +namespace infrt::host_context { + +TEST(MlirToRuntimeTranslate, basic) { + mlir::MLIRContext context; + + auto source = R"ROC( +func @main() -> () { + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + + "infrt.print.f32"(%v1) : (f32) -> () + + infrt.return +} +)ROC"; + + auto module = dialect::LoadMlirSource(&context, source); + module->verify(); + + KernelRegistry registry; + kernel::RegisterFloatBasicKernels(®istry); + kernel::RegisterIntBasicKernels(®istry); + + TestMlir(module.get(), ®istry); +} + +TEST(TestMlir, basic) { + mlir::MLIRContext context; + + auto source = R"ROC( +func @main() -> () { + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + + "infrt.print.f32"(%v1) : (f32) -> () + + infrt.return +} +)ROC"; + + auto module = dialect::LoadMlirSource(&context, source); + module->verify(); + + KernelRegistry registry; + kernel::RegisterFloatBasicKernels(®istry); + kernel::RegisterIntBasicKernels(®istry); + + TestMlir(module.get(), ®istry); +} + +TEST(TestMlir, shadow_copy_tensor_profile) { + mlir::MLIRContext* context = infrt::Global::getMLIRContext(); + + auto head = R"ROC( +func @predict(%a: !infrt.tensor, %b: !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) { +)ROC"; + + auto tpl0 = + "%a{0} = dt.shallow_copy_tensor %a : !infrt.tensor -> " + "!infrt.tensor"; + auto tpl1 = + "%b{0} = dt.shallow_copy_tensor %b : !infrt.tensor -> " + "!infrt.tensor"; + + auto end = R"ROC( +infrt.return %a0, %b0: !infrt.tensor, !infrt.tensor +} + )ROC"; + + std::stringstream ss; + ss << head; + for (int i = 0; i < 2000; i++) { + ss << llvm::formatv(tpl0, i).str() << "\n"; + ss << llvm::formatv(tpl1, i).str() << "\n"; + } + ss << end; + + auto content = ss.str(); + + // LOG(INFO) << "content: " << content << std::endl; + + auto module = dialect::LoadMlirSource(context, content); + module->verify(); + + host_context::KernelRegistry registry; + + kernel::RegisterBasicKernels(®istry); + kernel::RegisterTestKernels(®istry); + kernel::RegisterTensorShapeKernels(®istry); + kernel::RegisterTensorKernels(®istry); + kernel::RegisterControlFlowKernels(®istry); + + MlirProgramExecutor executor(*module, ®istry); + executor.BuildFunctions(); + + auto* func = executor.LookupFunc("predict"); + ASSERT_TRUE(func); + + std::vector in_args; + std::vector out_args( + {ValueRef(new Value(tensor::DenseHostTensor())), + ValueRef(new Value(tensor::DenseHostTensor()))}); + + auto create_tensor = [] { + tensor::DenseHostTensor a(tensor::TensorShape{{200, 3000}}, + DType(DType::Kind::F32)); + auto* data = reinterpret_cast(a.raw_data()); + for (int i = 0; i < a.shape().GetNumElements(); i++) { + data[i] = i; + } + return a; + }; + + std::vector 
inputs({ValueRef(new Value(create_tensor())), + ValueRef(new Value(create_tensor()))}); + in_args.assign({inputs[0].get(), inputs[1].get()}); + + for (int i = 0; i < 500; i++) { + func->Execute( + llvm::ArrayRef(in_args.data(), in_args.size()), + llvm::MutableArrayRef(out_args.data(), out_args.size())); + } +} + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/op_executable.cc b/paddle/infrt/host_context/op_executable.cc new file mode 100644 index 0000000000000..6b10ed473719e --- /dev/null +++ b/paddle/infrt/host_context/op_executable.cc @@ -0,0 +1,151 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/op_executable.h" + +#include + +#include "paddle/infrt/host_context/kernel_frame.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_function_executable.h" +#include "paddle/infrt/host_context/symbol_table.h" + +namespace infrt::host_context { + +struct OpExecutable::Impl { + Impl(const std::string& op_name, + SymbolTable* symbol_table, + KernelRegistry* kernel_registry) + : name(op_name), + symbol_table(symbol_table), + kernel_registry(kernel_registry ? kernel_registry + : GetCpuKernelRegistry()) { + CHECK(kernel_registry); + } + + inline bool to_execute() const { + return !run_once || run_once && !has_executed; + } + inline void MarkRun() { has_executed = true; } + + std::string name; + SymbolTable* symbol_table{}; + KernelFrameBuilder frame; + KernelRegistry* kernel_registry{}; + + std::unique_ptr mlir_function_executable; + + KernelImplementation kernel_impl{}; + + //! Tell whether this Op should be executed only once. + bool run_once{}; + //! Tell whether this op has been executed. + bool has_executed{}; +}; + +OpExecutable::OpExecutable(OpExecutable::Impl* impl) : impl_(impl) {} + +const std::string& OpExecutable::name() const { return impl_->name; } + +OpExecutableBuilder::OpExecutableBuilder(const std::string& op_name, + SymbolTable* symbol_table, + KernelRegistry* kernel_registry) + : OpExecutable(new Impl(op_name, symbol_table, kernel_registry)) { + CHECK(impl_); + // CPU kernel registry is the default KernelRegistry. + impl_->kernel_impl = impl_->kernel_registry->GetKernel( + std::string(op_name.data(), op_name.size())); + // TODO(Superjomn) support other device other than CPU. 
+ CHECK(impl_->kernel_impl) << "No CPU kernel called " << op_name; + + if (op_name == "dt.get_param") { + impl_->run_once = true; + } +} + +void OpExecutableBuilder::AppendArgument(const std::string& name) { + if (!impl_->symbol_table->GetValue(name)) { + impl_->symbol_table->Register(name); + } + impl_->frame.AddArgument(impl_->symbol_table->GetValue(name)); +} + +void OpExecutableBuilder::AppendArgument(Value* value) { + impl_->frame.AddArgument(value); +} + +KernelFrame& OpExecutable::frame() { return impl_->frame; } +const KernelFrame& OpExecutable::frame() const { return impl_->frame; } + +void OpExecutableBuilder::SetResults(llvm::ArrayRef result_names) { + llvm::SmallVector results; + for (size_t result_id = 0; result_id < result_names.size(); result_id++) { + Value* value = impl_->symbol_table->Register(result_names[result_id]); + results.push_back(value); + } + impl_->frame.SetResults(results); +} + +void OpExecutableBuilder::SetResults(llvm::ArrayRef results) { + impl_->frame.SetResults(results); +} + +void OpExecutableBuilder::AppendAttribute(Value* value) { + impl_->frame.AddAttribute(value); +} + +OpExecutableBuilder::OpExecutableBuilder(OpExecutableBuilder&& other) + : OpExecutable(other.impl_.release()) {} + +MlirFunctionExecutable* OpExecutableBuilder::CreateFunctionExecutable( + mlir::FuncOp op, MlirToRuntimeTranslator::function_defs_t* function_defs) { + CHECK(!impl_->mlir_function_executable); + impl_->mlir_function_executable.reset( + new MlirFunctionExecutable(op, impl_->kernel_registry, *function_defs)); + return impl_->mlir_function_executable.get(); +} + +MlirFunctionExecutable* OpExecutableBuilder::CreateFunctionExecutable( + mlir::Region* region, + mlir::FunctionType func_type, + function_defs_t* function_defs) { + CHECK(!impl_->mlir_function_executable); + impl_->mlir_function_executable.reset(new MlirFunctionExecutable( + region, func_type, impl_->kernel_registry, *function_defs)); + return impl_->mlir_function_executable.get(); +} + +void OpExecutable::Execute() { +#ifndef NDEBUG + VLOG(3) << "execute " << name() + << " --- frame args: " << impl_->frame.GetNumArgs() << " results " + << impl_->frame.GetNumResults() << " attributes " + << impl_->frame.GetNumAttributes(); + for (int i = 0; i < impl_->frame.GetNumArgs(); i++) { + VLOG(3) << "function arg: " << impl_->frame.GetArgAt(i); + } + for (int i = 0; i < impl_->frame.GetNumResults(); i++) { + VLOG(3) << "function result: " << impl_->frame.GetResults()[i]; + } +#endif + + if (impl_->to_execute()) { + impl_->kernel_impl(&impl_->frame); + impl_->MarkRun(); + } +} + +OpExecutable::~OpExecutable() {} + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/op_executable.h b/paddle/infrt/host_context/op_executable.h new file mode 100644 index 0000000000000..e2248225a5caf --- /dev/null +++ b/paddle/infrt/host_context/op_executable.h @@ -0,0 +1,92 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
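Editor's note: OpExecutableBuilder::AppendAttribute above routes attribute Values into the KernelFrame separately from ordinary arguments; on the kernel side such values surface as host_context::Attribute<T> parameters, which is the convention the tensor and test kernels later in this patch follow. A minimal sketch of an attributed kernel under a made-up op name (demo.add_n.i32 is illustrative, not part of this patch):

#include <cstdint>

#include "paddle/infrt/host_context/kernel_registry.h"
#include "paddle/infrt/host_context/kernel_utils.h"

namespace demo {

// "x" arrives as a frame argument, "n" as a frame attribute added via AppendAttribute.
static int32_t AddN(int32_t x, infrt::host_context::Attribute<int32_t> n) {
  return x + n.get();
}

void RegisterDemoKernels(infrt::host_context::KernelRegistry* registry) {
  registry->AddKernel("demo.add_n.i32", INFRT_KERNEL(AddN));
}

}  // namespace demo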
+ +#pragma once +#include + +#include +#include +#include + +#include "mlir/IR/Function.h" +#include "mlir/IR/Region.h" + +namespace mlir { +class FuncOp; +} // namespace mlir + +namespace infrt::host_context { + +class SymbolTable; +class KernelRegistry; +class KernelFrame; +class Value; +class CoreRuntimeBuilder; +class MlirFunctionExecutable; + +/** + * OpExecutable is a runtime executable instance for an operation. It captures + * all the information(Tensors, attributes + * and so on) needed for execution. + * With the SymbolTable and op definition, it create and hold a KernelFrame once + * and execute any times. + */ +class OpExecutable { + public: + KernelFrame& frame(); + const KernelFrame& frame() const; + + void Execute(); + + const std::string& name() const; + + ~OpExecutable(); + + protected: + class Impl; + explicit OpExecutable(Impl* impl); + + std::unique_ptr impl_; +}; + +/** + * Builder to help contruct an OpExecutable. + */ +class OpExecutableBuilder : public OpExecutable { + public: + using function_defs_t = std::unordered_map; + + OpExecutableBuilder(const std::string& op_name, + SymbolTable* symbol_table, + KernelRegistry* kernel_registry = nullptr); + OpExecutableBuilder(OpExecutableBuilder&& other); + + void AppendArgument(const std::string& name); + void AppendArgument(Value* value); + + void SetResults(llvm::ArrayRef result_names); + void SetResults(llvm::ArrayRef results); + + void AppendAttribute(Value* value); + + MlirFunctionExecutable* CreateFunctionExecutable( + mlir::FuncOp op, function_defs_t* function_defs); + + MlirFunctionExecutable* CreateFunctionExecutable( + mlir::Region* region, + mlir::FunctionType func_type, + function_defs_t* function_defs); +}; + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/op_executable_test.cc b/paddle/infrt/host_context/op_executable_test.cc new file mode 100644 index 0000000000000..f981cca4426c1 --- /dev/null +++ b/paddle/infrt/host_context/op_executable_test.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/op_executable.h" + +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/host_context/symbol_table.h" + +namespace infrt { +namespace host_context { + +int add(int a, int b) { return a + b; } + +TEST(OpExecutable, basic) { + // register kernel + KernelRegistry registry; + registry.AddKernel("infrt.test.add.i32", INFRT_KERNEL(add)); + + SymbolTable table; + table.Register("a", 1); + table.Register("b", 2); + + OpExecutableBuilder executable("infrt.test.add.i32", &table, ®istry); + executable.AppendArgument("a"); + executable.AppendArgument("b"); + executable.SetResults({"c"}); + + executable.Execute(); + + // check the kernel frame has the result. 
+ auto results = executable.frame().GetResults(); + ASSERT_EQ(results.size(), 1UL); + ASSERT_EQ(results.front()->get(), 3); + + // check symbol table contains the same result instance. + LOG(INFO) << "type: " << table.GetValue("c")->type_info(); + int c = table.GetValue("c")->get(); + ASSERT_EQ(c, 3); +} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/symbol_table.cc b/paddle/infrt/host_context/symbol_table.cc new file mode 100644 index 0000000000000..318dc0cc55624 --- /dev/null +++ b/paddle/infrt/host_context/symbol_table.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/symbol_table.h" + +#include + +namespace infrt { +namespace host_context { + +struct SymbolTable::Impl { + std::unordered_map data; +}; + +SymbolTable::SymbolTable() : impl_(new Impl) {} + +Value* SymbolTable::Register(const std::string& key) { + CHECK(!impl_->data.count(key)) << "Duplicate register [" << key << "]"; + auto newitem = ValueRef(new Value); + impl_->data.emplace(key, newitem); + return newitem.get(); +} + +Value* SymbolTable::Register(const std::string& key, ValueRef value) { + CHECK(!impl_->data.count(key)) << "Duplicate register [" << key << "]"; + impl_->data.emplace(key, value); + return value.get(); +} + +Value* SymbolTable::GetValue(const std::string& key) const { + auto it = impl_->data.find(std::string(key)); + return it != impl_->data.end() ? it->second.get() : nullptr; +} + +// @{ +#define REGISTER_TYPE__(T) \ + template <> \ + T SymbolTable::Get(const std::string& key) { \ + auto it = impl_->data.find(std::string(key)); \ + CHECK(it != impl_->data.end()) << "No value called " << key; \ + return it->second->get(); \ + } +REGISTER_TYPE__(int32_t); +REGISTER_TYPE__(float); +REGISTER_TYPE__(double); +REGISTER_TYPE__(int64_t); +#undef REGISTER_TYPE__ +// @} + +SymbolTable::~SymbolTable() {} + +size_t SymbolTable::size() const { return impl_->data.size(); } + +// @{ +#define REGISTER_TYPE__(T) \ + template <> \ + Value* SymbolTable::Register(const std::string& key, T&& v) { \ + CHECK(!impl_->data.count(key)) << "Duplicate register [" << key << "]"; \ + auto newitem = ValueRef(v); \ + impl_->data.emplace(key, newitem); \ + return newitem.get(); \ + } +REGISTER_TYPE__(int) +REGISTER_TYPE__(float) +REGISTER_TYPE__(double) +REGISTER_TYPE__(bool) +#undef REGISTER_TYPE__ +// @} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/symbol_table.h b/paddle/infrt/host_context/symbol_table.h new file mode 100644 index 0000000000000..805215a78ce0d --- /dev/null +++ b/paddle/infrt/host_context/symbol_table.h @@ -0,0 +1,65 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+
+#include <string>
+
+#include "paddle/infrt/host_context/value.h"
+
+namespace infrt {
+namespace host_context {
+
+/**
+ * SymbolTable holds all the states of the kernel graph in the runtime.
+ */
+class SymbolTable {
+ public:
+  SymbolTable();
+
+  /**
+   * Register a state called \p key.
+   */
+  Value* Register(const std::string& key);
+
+  Value* Register(const std::string& key, ValueRef value);
+
+  /**
+   * Register a state and set its value.
+   */
+  template <typename T>
+  Value* Register(const std::string& key, T&& v);
+
+  size_t size() const;
+
+  /**
+   * Get a state called \p key.
+   */
+  Value* GetValue(const std::string& key) const;
+
+  template <typename T>
+  T Get(const std::string& key);
+
+  ~SymbolTable();
+
+ private:
+  class Impl;
+
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace host_context
+}  // namespace infrt
diff --git a/paddle/infrt/host_context/value.cc b/paddle/infrt/host_context/value.cc
new file mode 100644
index 0000000000000..8c3ccba3d0ba5
--- /dev/null
+++ b/paddle/infrt/host_context/value.cc
@@ -0,0 +1,69 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
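Editor's note: a short usage sketch of the SymbolTable declared above, based on the Register/Get specializations in symbol_table.cc (int, float, double and bool are the types with Register specializations there); the variable names are illustrative:

#include <cstdint>

#include "paddle/infrt/host_context/symbol_table.h"

void SymbolTableDemo() {
  infrt::host_context::SymbolTable table;
  table.Register("a", 1);     // creates a Value holding an int
  table.Register("b", 2.5f);  // creates a Value holding a float
  table.Register("c");        // creates an empty Value for a kernel to fill in later

  infrt::host_context::Value* c = table.GetValue("c");  // nullptr if the key is absent
  int32_t a = table.Get<int32_t>("a");                  // typed read; CHECK-fails on a missing key
  (void)c;
  (void)a;
}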
+ +#include "paddle/infrt/host_context/value.h" + +#include "paddle/infrt/tensor/dense_tensor_view.h" + +namespace infrt { +namespace host_context { + +ValueRef::ValueRef(int32_t val) : Shared(new Value(val)) {} +ValueRef::ValueRef(int64_t val) : Shared(new Value(val)) {} +ValueRef::ValueRef(float val) : Shared(new Value(val)) {} +ValueRef::ValueRef(double val) : Shared(new Value(val)) {} +ValueRef::ValueRef(bool val) : Shared(new Value(val)) {} + +const char* Value::type_info() const { return __type_info__; } + +void CopyTo(const Value& from, Value* to) { + CHECK(from.valid()) << "from value is not valid, can't be copied"; + CHECK(to) << "to is not valid"; + visit( + [&](auto&& arg) { + using T = std::decay_t; + if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same>::value) + to->data = arg; + else if (std::is_same>::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else + LOG(FATAL) << "Not supported Value copy: " << typeid(T).name(); + }, + from.data); +} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h new file mode 100644 index 0000000000000..4a2b92a7e69c5 --- /dev/null +++ b/paddle/infrt/host_context/value.h @@ -0,0 +1,156 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include +#include +#include + +#include "paddle/infrt/common/object.h" +#include "paddle/infrt/common/shared.h" +#include "paddle/infrt/host_context/function.h" +#include "paddle/infrt/support/variant.h" +#include "paddle/infrt/tensor/dense_host_tensor.h" +#include "paddle/infrt/tensor/dense_tensor_view.h" +#include "paddle/infrt/tensor/tensor_map.h" +#include "paddle/infrt/tensor/tensor_shape.h" + +namespace infrt { +namespace host_context { + +struct MlirFunctionExecutable; + +using ValueVariantType = Variant, + std::vector, + std::vector, + std::vector, + std::vector>; + +//! Copy content from \param from to \param to. +void CopyTo(const Value& from, Value* to); + +/** + * Represents any data type for value in host context. 
+ */ +class Value : public common::Object { + public: + using variant_type = ValueVariantType; + + explicit Value() {} // NOLINT + explicit Value(int32_t x) : data(x) {} + explicit Value(int64_t x) : data(x) {} + explicit Value(float x) : data(x) {} + explicit Value(double x) : data(x) {} + explicit Value(bool x) : data(x) {} + explicit Value(std::string x) : data(x) {} + explicit Value(tensor::TensorMap&& x) : data(x) {} + explicit Value(std::vector&& x) : data(x) {} + explicit Value(std::vector&& x) : data(x) {} + explicit Value(std::vector&& x) : data(x) {} + explicit Value(std::vector&& x) : data(x) {} + explicit Value(std::vector&& x) : data(x) {} + explicit Value(tensor::TensorShape&& x) : data(std::move(x)) {} + explicit Value(tensor::DenseHostTensor&& x) : data(std::move(x)) {} + explicit Value(MlirFunctionExecutable* x) : data(x) {} + + template + const T& get() const { + return data.get(); + } + template + T& get() { + return data.get(); + } + + template + void set(T&& v) { + data = std::move(v); + } + + void set(Value* v) { data = std::move(v->data); } + + bool valid() const { return true; } + + const char* type_info() const override; + + friend void CopyTo(const Value& from, Value* to); + + private: + ValueVariantType data; + static constexpr const char* __type_info__ = "host_context_value"; +}; + +/** + * Represents a counted reference of a Value. + */ +class ValueRef : common::Shared { + public: + ValueRef() = default; + explicit ValueRef(Value* n) : common::Shared(n) {} + explicit ValueRef(int32_t val); + explicit ValueRef(int64_t val); + explicit ValueRef(float val); + explicit ValueRef(double val); + explicit ValueRef(bool val); + + using common::Shared::get; + using common::Shared::Reset; + using common::Shared::operator->; + using common::Shared::operator*; + //! Get a readonly data. + template + const T& get() const { + CHECK(p_); + return p_->get(); + } + + template + T& get() { + CHECK(p_); + return p_->get(); + } + + //! Assign a data. + template + void Assign(const T& x) { + if (!p_) { + p_ = common::make_shared(); + } + *p_ = x; + } + + template + void Assign(Args... args) { + p_ = common::make_shared(std::forward(args)...); + } + + inline bool IsValid() { return p_; } +}; + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/value_test.cc b/paddle/infrt/host_context/value_test.cc new file mode 100644 index 0000000000000..48d49478ce0ef --- /dev/null +++ b/paddle/infrt/host_context/value_test.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
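Editor's note: the unit test that follows only exercises ValueRef construction and typed get(); CopyTo and Value::set from value.h/value.cc above are not covered there, so here is a minimal sketch of them. It assumes the underlying Variant accepts assignment from its element types, which is what CopyTo itself relies on:

#include <cstdint>

#include "paddle/infrt/host_context/value.h"

void ValueDemo() {
  using infrt::host_context::CopyTo;
  using infrt::host_context::Value;
  using infrt::host_context::ValueRef;

  Value a(3.14f);  // holds a float inside ValueVariantType
  Value b(0);      // starts out holding an int32_t
  CopyTo(a, &b);   // b now holds a copy of the float payload
  float f = b.get<float>();

  ValueRef r(new Value(7));  // reference-counted wrapper around a heap-allocated Value
  r->set(42);                // overwrite the payload in place
  int32_t v = r->get<int32_t>();
  (void)f;
  (void)v;
}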
+ +#include "paddle/infrt/host_context/value.h" + +#include + +namespace infrt { +namespace host_context { + +TEST(ValueRef, test) { + ValueRef x(12); + ASSERT_EQ(x.get(), 12); + + ValueRef y(1.2f); + ASSERT_EQ(y.get(), 1.2f); + + ValueRef z(true); + ASSERT_EQ(z.get(), true); +} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/kernel/CMakeLists.txt b/paddle/infrt/kernel/CMakeLists.txt new file mode 100644 index 0000000000000..da858aad28f81 --- /dev/null +++ b/paddle/infrt/kernel/CMakeLists.txt @@ -0,0 +1,9 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + basic_kernels.cc + test_kernels.cc + tensor_shape_kernels.cc + tensor_kernels.cc + control_flow_kernels.cc + ) diff --git a/paddle/infrt/kernel/basic_kernels.cc b/paddle/infrt/kernel/basic_kernels.cc new file mode 100644 index 0000000000000..d7f2c3865157d --- /dev/null +++ b/paddle/infrt/kernel/basic_kernels.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/basic_kernels.h" + +#include +#include + +#include "llvm/Support/raw_ostream.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" + +using infrt::host_context::Attribute; + +namespace infrt::kernel { + +template +T add(T a, T b) { + return a + b; +} + +template +T sub(T a, T b) { + return a - b; +} + +template +T mul(T a, T b) { + return a * b; +} + +template +T div(T a, T b) { + return a / b; +} + +template +void print(T a) { + std::cout << a << std::endl; +} + +static std::string GetString(Attribute value) { + return value.get(); +} + +static void PrintString(const std::string &str) { + llvm::outs() << "string = " << str << '\n'; + llvm::outs().flush(); +} + +void RegisterBasicKernels(host_context::KernelRegistry *registry) { + RegisterIntBasicKernels(registry); + RegisterFloatBasicKernels(registry); + registry->AddKernel("infrt.get_string", INFRT_KERNEL(GetString)); + registry->AddKernel("infrt.print_string", INFRT_KERNEL(PrintString)); +} + +void RegisterIntBasicKernels(host_context::KernelRegistry *registry) { + registry->AddKernel("infrt.add.i32", INFRT_KERNEL(add)); + registry->AddKernel("infrt.sub.i32", INFRT_KERNEL(sub)); + registry->AddKernel("infrt.mul.i32", INFRT_KERNEL(mul)); + registry->AddKernel("infrt.div.i32", INFRT_KERNEL(div)); + registry->AddKernel("infrt.print.i32", INFRT_KERNEL(print)); +} + +void RegisterFloatBasicKernels(host_context::KernelRegistry *registry) { + registry->AddKernel("infrt.add.f32", INFRT_KERNEL(add)); + registry->AddKernel("infrt.sub.f32", INFRT_KERNEL(sub)); + registry->AddKernel("infrt.mul.f32", INFRT_KERNEL(mul)); + registry->AddKernel("infrt.div.f32", INFRT_KERNEL(div)); + registry->AddKernel("infrt.print.f32", INFRT_KERNEL(print)); +} + +} // namespace infrt::kernel diff --git a/paddle/infrt/kernel/basic_kernels.h b/paddle/infrt/kernel/basic_kernels.h new file mode 100644 index 0000000000000..9e98885cf6ebf --- /dev/null 
+++ b/paddle/infrt/kernel/basic_kernels.h @@ -0,0 +1,34 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +namespace infrt::host_context { + +struct KernelRegistry; + +} // namespace infrt::host_context + +namespace infrt::kernel { + +/** + * Register all the basic kernels to \p registry. + */ +void RegisterBasicKernels(host_context::KernelRegistry* registry); + +void RegisterIntBasicKernels(host_context::KernelRegistry* registry); +void RegisterFloatBasicKernels(host_context::KernelRegistry* registry); + +} // namespace infrt::kernel diff --git a/paddle/infrt/kernel/control_flow_kernels.cc b/paddle/infrt/kernel/control_flow_kernels.cc new file mode 100644 index 0000000000000..6cc94dbcce077 --- /dev/null +++ b/paddle/infrt/kernel/control_flow_kernels.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/control_flow_kernels.h" + +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_function_executable.h" + +namespace infrt { +namespace kernel { + +static void INFRTCall( + host_context::RemainingArguments args, + host_context::RemainingResults results, + host_context::Attribute fn) { + VLOG(3) << "running call kernel ..."; + CHECK_EQ(fn.get()->num_arguments(), args.size()); + CHECK_EQ(fn.get()->num_results(), results.size()); + + for (auto& v : results.values()) { + CHECK(v.get()); + } + fn.get()->Execute(args.values(), results.values()); +} + +void RegisterControlFlowKernels(host_context::KernelRegistry* registry) { + registry->AddKernel("infrt.call", INFRT_KERNEL(INFRTCall)); +} + +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/control_flow_kernels.h b/paddle/infrt/kernel/control_flow_kernels.h new file mode 100644 index 0000000000000..5fa6b985f0b17 --- /dev/null +++ b/paddle/infrt/kernel/control_flow_kernels.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/infrt/host_context/function.h" +#include "paddle/infrt/host_context/kernel_utils.h" + +namespace infrt { + +namespace host_context { +struct KernelRegistry; +} // namespace host_context + +namespace kernel { + +void RegisterControlFlowKernels(host_context::KernelRegistry* registry); + +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc new file mode 100644 index 0000000000000..2fa477aa4dbda --- /dev/null +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/tensor_kernels.h" + +#include +#include + +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/tensor/dense_host_tensor.h" +#include "paddle/infrt/tensor/dense_tensor_view.h" +#include "paddle/infrt/tensor/tensor_map.h" +#include "paddle/infrt/tensor/tensor_shape.h" + +namespace infrt::kernel { +using namespace host_context; // NOLINT +using namespace tensor; // NOLINT + +/// ===== Kernel begin ==== + +template +DenseHostTensor CreateUninitTensor(Attribute> shape) { + const auto &shape_data = shape.get(); + auto array = llvm::ArrayRef(shape_data.data(), shape_data.size()); + auto type = GetDType(); + return DenseHostTensor(TensorShape(array), type); +} + +void PrintTensor(const DenseHostTensor &tensor) { + std::cout << tensor << std::endl; +} + +template +void FillTensorWithConstant(DenseHostTensor *tensor, Attribute v) { + MutableDTArrayView(tensor).Fill(v.get()); +} + +TensorMap LoadParams(const std::string &path) { + return *(infrt::tensor::LoadParams(path)); +} + +DenseHostTensor GetParam(TensorMap map, Attribute nameAttr) { + auto &name = nameAttr.get(); + return *(map[name]); +} + +DenseHostTensor ShallowCopyTensor(DenseHostTensor v) { return v; } + +/// ===== Kernel end ==== + +void RegisterTensorKernels(host_context::KernelRegistry *registry) { + registry->AddKernel("dt.create_uninit_tensor.f32", + INFRT_KERNEL(CreateUninitTensor)); + registry->AddKernelAttrNameList("dt.create_uninit_tensor.f32", {"shape"}); + registry->AddKernel("dt.print_tensor", INFRT_KERNEL(PrintTensor)); + registry->AddKernel("dt.fill_tensor_with_constant.f32", + INFRT_KERNEL(FillTensorWithConstant)); + registry->AddKernel("dt.fill_tensor_with_constant.f64", + INFRT_KERNEL(FillTensorWithConstant)); + registry->AddKernel("dt.load_params", 
INFRT_KERNEL(LoadParams)); + registry->AddKernel("dt.get_param", INFRT_KERNEL(GetParam)); + registry->AddKernel("dt.shallow_copy_tensor", + INFRT_KERNEL(ShallowCopyTensor)); +} + +} // namespace infrt::kernel diff --git a/paddle/infrt/kernel/tensor_kernels.h b/paddle/infrt/kernel/tensor_kernels.h new file mode 100644 index 0000000000000..8f2180ba80a4f --- /dev/null +++ b/paddle/infrt/kernel/tensor_kernels.h @@ -0,0 +1,25 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace infrt::host_context { +struct KernelRegistry; +} // namespace infrt::host_context + +namespace infrt::kernel { + +void RegisterTensorKernels(host_context::KernelRegistry* registry); + +} // namespace infrt::kernel diff --git a/paddle/infrt/kernel/tensor_shape_kernels.cc b/paddle/infrt/kernel/tensor_shape_kernels.cc new file mode 100644 index 0000000000000..a04b492819298 --- /dev/null +++ b/paddle/infrt/kernel/tensor_shape_kernels.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/tensor_shape_kernels.h" + +#include +#include +#include + +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/tensor/tensor_shape.h" + +namespace infrt::kernel { + +void PrintShape(const tensor::TensorShape& shape) { + llvm::raw_os_ostream oos(std::cout); + oos << shape << '\n'; +} + +void RegisterTensorShapeKernels(host_context::KernelRegistry* registry) { + registry->AddKernel("ts.print_shape", INFRT_KERNEL(PrintShape)); +} + +} // namespace infrt::kernel diff --git a/paddle/infrt/kernel/tensor_shape_kernels.h b/paddle/infrt/kernel/tensor_shape_kernels.h new file mode 100644 index 0000000000000..e87c6c37e88a0 --- /dev/null +++ b/paddle/infrt/kernel/tensor_shape_kernels.h @@ -0,0 +1,27 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace infrt::host_context { + +class KernelRegistry; + +} // namespace infrt::host_context + +namespace infrt::kernel { + +void RegisterTensorShapeKernels(host_context::KernelRegistry* registry); + +} // namespace infrt::kernel diff --git a/paddle/infrt/kernel/test_kernels.cc b/paddle/infrt/kernel/test_kernels.cc new file mode 100644 index 0000000000000..d5f64d09b602f --- /dev/null +++ b/paddle/infrt/kernel/test_kernels.cc @@ -0,0 +1,200 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/test_kernels.h" + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/host_context/mlir_function_executable.h" +#include "paddle/infrt/tensor/dense_host_tensor.h" + +using infrt::host_context::Attribute; +using infrt::host_context::MlirFunctionExecutable; +using infrt::host_context::RemainingArguments; + +namespace infrt::kernel { +namespace { +class BenchmarkStats { + public: + BenchmarkStats(std::string name, + int num_warmup_runs, + int max_count, + std::chrono::microseconds benchmark_duration) + : name_{name}, + num_warmup_runs_{num_warmup_runs}, + max_count_{max_count}, + benchmark_duration_{benchmark_duration} {} + + void StartRun() { + ++cur_count_; + // Start recording CPU time. + cur_start_walltime_ = std::chrono::steady_clock::now(); + cur_start_cpu_ = std::clock(); + } + + void StopRun() { + // Do not collect the runtime statistics if we are still in the warm up + // period. + if (cur_count_ <= num_warmup_runs_) return; + + // Stop the CPU timer. + std::clock_t cur_stop_cpu_ = std::clock(); + + // Stop the wall clock timer. + auto cur_stop_walltime_ = std::chrono::steady_clock::now(); + + // Collect the wall clock duration. + auto duration_walltime_ = cur_stop_walltime_ - cur_start_walltime_; + run_times_walltime_.push_back(duration_walltime_); + + // Collect the CPU duration in microseconds. + // First cast to integer that represents microseconds with truncation, as + // does std::chrono::duration_cast. Then cast to std::chrono::microseconds. + std::clock_t duration_cpu_raw = cur_stop_cpu_ - cur_start_cpu_; + auto duration_cpu_ = static_cast( + static_cast(1e9 * duration_cpu_raw / CLOCKS_PER_SEC)); + + run_times_cpu_.push_back(duration_cpu_); + + total_duration_walltime_ += duration_walltime_; + total_duration_cpu_ += duration_cpu_; + } + // Return if we should we run more rounds. + bool MoreRun() const { + return cur_count_ < max_count_ + num_warmup_runs_ && + total_duration_walltime_ < benchmark_duration_; + } + + // Summarize the benchmark results. 
+ void Summarize() { + std::sort(run_times_walltime_.begin(), run_times_walltime_.end()); + std::sort(run_times_cpu_.begin(), run_times_cpu_.end()); + + auto percentile = []( + double p, const std::vector &run_times) { + assert(p >= 0.0 && p <= 1.0); + return run_times[run_times.size() * p]; + }; + + // BM: prefix is added to make grepping results from lit output easier. + std::string prefix; + llvm::raw_string_ostream(prefix) << "BM:" << name_ << ':'; + auto cpu_utilization = + total_duration_cpu_.count() * 100.0 / total_duration_walltime_.count(); + + llvm::outs() << prefix << "Count: " << run_times_walltime_.size() << '\n'; + llvm::outs() << prefix + << "Duration(ns): " << total_duration_walltime_.count() + << '\n'; + llvm::outs() << prefix + << "Time Min(ns): " << run_times_walltime_.front().count() + << '\n'; + llvm::outs() << prefix + << "Time Max(ns): " << run_times_walltime_.back().count() + << '\n'; + llvm::outs() << prefix << "Time 50%(ns): " + << percentile(0.5, run_times_walltime_).count() << '\n'; + llvm::outs() << prefix << "Time 95%(ns): " + << percentile(0.95, run_times_walltime_).count() << '\n'; + llvm::outs() << prefix << "Time 99%(ns): " + << percentile(0.99, run_times_walltime_).count() << '\n'; + // Log CPU time statistics. + llvm::outs() << prefix + << "CPU Duration(ns): " << total_duration_cpu_.count() << '\n'; + llvm::outs() << prefix << "CPU Min(ns): " << run_times_cpu_.front().count() + << '\n'; + llvm::outs() << prefix << "CPU Max(ns): " << run_times_cpu_.back().count() + << '\n'; + llvm::outs() << prefix + << "CPU 50%(ns): " << percentile(0.5, run_times_cpu_).count() + << '\n'; + llvm::outs() << prefix + << "CPU 95%(ns): " << percentile(0.95, run_times_cpu_).count() + << '\n'; + llvm::outs() << prefix + << "CPU 99%(ns): " << percentile(0.99, run_times_cpu_).count() + << '\n'; + llvm::outs() << prefix << "CPU utilization(percent): " << cpu_utilization + << "\n"; + llvm::outs().flush(); + } + + private: + const std::string name_; + const int num_warmup_runs_; + const int max_count_; + int cur_count_ = 0; + const std::chrono::nanoseconds benchmark_duration_; + std::chrono::nanoseconds total_duration_walltime_{}; + std::chrono::nanoseconds total_duration_cpu_{}; + std::chrono::time_point cur_start_walltime_{}; + std::clock_t cur_start_cpu_; + std::vector run_times_walltime_; + // CPU run times in microseconds. + std::vector run_times_cpu_; +}; + +} // anonymous namespace + +// This op benchmarks the input function by running the function in a loop +// up to a max count or max time as specified in the function's attributes. +// +// Attributes: +// duration_secs: Benchmark duration in seconds. +// max_count: Max run count of input function. +// name: The name used to tag the benchmark results. +// num_warmup_runs: Number of warm up runs before benchmarking starts. +// fn: The input function to be benchmarked. +static void benchmark(RemainingArguments args, + host_context::RemainingResults results, + Attribute duration_secs, + Attribute max_count, + Attribute name, + Attribute num_warmup_runs, + Attribute fn) { + BenchmarkStats bm_stats{name.get(), + num_warmup_runs.get(), + max_count.get(), + std::chrono::seconds(duration_secs.get())}; + + while (bm_stats.MoreRun()) { + bm_stats.StartRun(); + fn.get()->Execute(args.values(), results.values(), true); + bm_stats.StopRun(); + } + bm_stats.Summarize(); +} + +// Just copy the input to the result. 
+tensor::DenseHostTensor ShadowCopyTensor(tensor::DenseHostTensor src) { + return src; +} + +void RegisterTestKernels(host_context::KernelRegistry *registry) { + registry->AddKernel("infrt.benchmark", INFRT_KERNEL(benchmark)); + registry->AddKernel("infrt.test.shadow_copy_tensor", + INFRT_KERNEL(ShadowCopyTensor)); +} + +} // namespace infrt::kernel diff --git a/paddle/infrt/kernel/test_kernels.h b/paddle/infrt/kernel/test_kernels.h new file mode 100644 index 0000000000000..f42884dfaf2c9 --- /dev/null +++ b/paddle/infrt/kernel/test_kernels.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +namespace infrt::host_context { + +struct KernelRegistry; + +} // namespace infrt::host_context + +namespace infrt::kernel { + +/** + * Register all the test kernels to registry. + */ +void RegisterTestKernels(host_context::KernelRegistry* registry); + +} // namespace infrt::kernel diff --git a/paddle/infrt/paddle/CMakeLists.txt b/paddle/infrt/paddle/CMakeLists.txt new file mode 100644 index 0000000000000..172d78ecde3b8 --- /dev/null +++ b/paddle/infrt/paddle/CMakeLists.txt @@ -0,0 +1,24 @@ +proto_library(paddle_framework_proto SRCS framework.proto) + +add_subdirectory(cpp) +add_subdirectory(pb) + +core_gather_headers() + +gather_srcs(infrt_src SRCS + model_parser.cc + scope.cc + tensor.cc + ) + +foreach(cpp ${SRCS}) + set(infrt_src + "${infrt_src};infrt/paddle/${cpp}" + CACHE INTERNAL "") +endforeach() + +file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h) + +foreach(header ${includes}) + set(core_includes "${core_includes};${header}" CACHE INTERNAL "") +endforeach() diff --git a/paddle/infrt/paddle/cpp/CMakeLists.txt b/paddle/infrt/paddle/cpp/CMakeLists.txt new file mode 100644 index 0000000000000..0feaabd2fa7c9 --- /dev/null +++ b/paddle/infrt/paddle/cpp/CMakeLists.txt @@ -0,0 +1,16 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + ) + +foreach(cpp ${SRCS}) + set(infrt_src + "${infrt_src};infrt/paddle/cpp/${cpp}" + CACHE INTERNAL "") +endforeach() + +file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h) + +foreach(header ${includes}) + set(core_includes "${core_includes};${header}" CACHE INTERNAL "") +endforeach() diff --git a/paddle/infrt/paddle/cpp/desc_api.h b/paddle/infrt/paddle/cpp/desc_api.h new file mode 100644 index 0000000000000..ccd79c048ab14 --- /dev/null +++ b/paddle/infrt/paddle/cpp/desc_api.h @@ -0,0 +1,229 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include + +namespace infrt::paddle::cpp { + +/* + * Compatible interfaces for all the different kinds of XXXDesc. All the XXXDesc + * classes should implement this. + */ +class VarDescAPI { + public: + enum class Type { + // Pod Types + BOOL = 0, + INT16, + INT32, + INT64, + FP16, + FP32, + FP64, + // Tensor is used in C++. + SIZE_T, + UINT8, + INT8, + + // Other types that may need additional descriptions + LOD_TENSOR, + SELECTED_ROWS, + FEED_MINIBATCH, + FETCH_LIST, + STEP_SCOPES, + LOD_RANK_TABLE, + LOD_TENSOR_ARRAY, + PLACE_LIST, + READER, + // Any runtime decided variable type is raw + // raw variables should manage their own allocations + // in operators like nccl_op + RAW, + TUPLE + }; + + using VarDataType = Type; + + virtual ~VarDescAPI() = default; + + // Get var's name + virtual std::string Name() const = 0; + // Set var's name + virtual void SetName(std::string name) = 0; + // Get var's type + virtual Type GetType() const = 0; + // Set var's type + virtual void SetType(Type type) = 0; + // Tell whether var is persistable or not + virtual bool Persistable() const = 0; + // Set var to be persistable or not + virtual void SetPersistable(bool persistable) = 0; + // Get var's shape + virtual std::vector GetShape() const = 0; + // Set var's shape + virtual void SetShape(const std::vector& dims) = 0; +}; + +/* + * NOTE Some interfaces are weried, we remain them unchanged to keep compatible + * with framework::OpDesc in Fluid framework. + */ +class OpDescAPI { + public: + // The AttrType is used to make the proto::AttrType portable. + enum class AttrType { + INT = 0, + FLOAT = 1, + STRING = 2, + INTS = 3, + FLOATS = 4, + STRINGS = 5, + BOOLEAN = 6, + BOOLEANS = 7, + BLOCK = 8, + LONG = 9, + BLOCKS = 10, + LONGS = 11, + UNK, + }; + + virtual ~OpDescAPI() = default; + + /// Get operator's type. + virtual std::string Type() const = 0; + /// Set operator's type. + virtual void SetType(const std::string& type) = 0; + /// Get arguments given the parameter. + virtual std::vector Input(const std::string& param) const = 0; + /// Get parameters. + virtual std::vector InputArgumentNames() const = 0; + /// Get arguments given the parameter. + virtual std::vector Output(const std::string& param) const = 0; + /// Get parameters. + virtual std::vector OutputArgumentNames() const = 0; + /// Set a input given the parameter and arguments. + virtual void SetInput(const std::string& param, + const std::vector& args) = 0; + virtual void SetOutput(const std::string& param, + const std::vector& args) = 0; + /// Tell whether this desc has an attribute. + virtual bool HasAttr(const std::string& name) const = 0; + + /// Get the type of an attribute. + virtual AttrType GetAttrType(const std::string& name) const = 0; + + virtual std::vector AttrNames() const = 0; + + /// Set an attribute. + template + void SetAttr(const std::string& name, const T& v); + + /// Get an attribute. 
+ template + T GetAttr(const std::string& name) const; + + std::string Repr() const { + std::stringstream ss; + ss << Type(); + ss << "("; + for (auto& arg : InputArgumentNames()) { + ss << arg << ":"; + for (auto val : Input(arg)) { + ss << val << " "; + } + } + ss << ") -> ("; + for (auto& arg : OutputArgumentNames()) { + ss << arg << ":"; + for (auto val : Output(arg)) { + ss << val << " "; + } + } + ss << ")"; + return ss.str(); + } +}; + +class BlockDescAPI { + public: + virtual ~BlockDescAPI() = default; + + virtual int32_t Idx() const = 0; + + virtual void SetIdx(int32_t idx) = 0; + + virtual int32_t ParentIdx() const = 0; + + virtual void SetParentIdx(int32_t idx) = 0; + + virtual size_t VarsSize() const = 0; + + virtual void ClearVars() = 0; + + // NOTE: This ugly method is used to compatible interfaces between cpp and + // pb/nb backends + // TODO(sangoly): refine this + template + T* GetVar(int32_t idx); + + template + T* AddVar(); + + virtual size_t OpsSize() const = 0; + + virtual void ClearOps() = 0; + + // NOTE: This ugly method is used to compatible interfaces between cpp and + // pb/nb backends + // TODO(sangoly): refine this + template + T* GetOp(int32_t idx); + + template + T* AddOp(); + + virtual int32_t ForwardBlockIdx() const = 0; + + virtual void SetForwardBlockIdx(int32_t idx) = 0; +}; + +class ProgramDescAPI { + public: + virtual ~ProgramDescAPI() = default; + + virtual size_t BlocksSize() const = 0; + + virtual void ClearBlocks() = 0; + + // NOTE: This ugly method is used to compatible interfaces between cpp and + // pb/nb backends + // TODO(sangoly): refine this + template + T* GetBlock(int32_t idx); + + template + T* AddBlock(); + + virtual bool HasVersion() const = 0; + + virtual int64_t Version() const = 0; + + virtual void SetVersion(int64_t version) = 0; +}; + +} // namespace infrt::paddle::cpp diff --git a/paddle/infrt/paddle/framework.proto b/paddle/infrt/paddle/framework.proto new file mode 100644 index 0000000000000..634ec9665d08e --- /dev/null +++ b/paddle/infrt/paddle/framework.proto @@ -0,0 +1,213 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +syntax = "proto2"; +package paddle.framework.proto; + +// Any incompatible changes to ProgramDesc and its dependencies should +// raise the version defined version.h. +// +// Serailization and Deserialization codes should be modified in a way +// that supports old versions following the version and compatibility policy. +message Version { optional int64 version = 1 [ default = 0 ]; } + +enum AttrType { + INT = 0; + FLOAT = 1; + STRING = 2; + INTS = 3; + FLOATS = 4; + STRINGS = 5; + BOOLEAN = 6; + BOOLEANS = 7; + BLOCK = 8; + LONG = 9; + BLOCKS = 10; + LONGS = 11; +} + +// OpDesc describes an instance of a C++ framework::OperatorBase +// derived class type. 
+message OpDesc { + + message Attr { + required string name = 1; + required AttrType type = 2; + optional int32 i = 3; + optional float f = 4; + optional string s = 5; + repeated int32 ints = 6; + repeated float floats = 7; + repeated string strings = 8; + optional bool b = 10; + repeated bool bools = 11; + optional int32 block_idx = 12; + optional int64 l = 13; + repeated int32 blocks_idx = 14; + repeated int64 longs = 15; + }; + + message Var { + required string parameter = 1; + repeated string arguments = 2; + }; + + required string type = 3; + repeated Var inputs = 1; + repeated Var outputs = 2; + repeated Attr attrs = 4; + optional bool is_target = 5 [ default = false ]; +}; + +// OpProto describes a C++ framework::OperatorBase derived class. +message OpProto { + + // VarProto describes the C++ type framework::Variable. + message Var { + required string name = 1; + required string comment = 2; + + optional bool duplicable = 3 [ default = false ]; + optional bool intermediate = 4 [ default = false ]; + optional bool dispensable = 5 [ default = false ]; + } + + // AttrProto describes the C++ type Attribute. + message Attr { + required string name = 1; + required AttrType type = 2; + required string comment = 3; + // If that attribute is generated, it means the Paddle third + // language binding has responsibility to fill that + // attribute. End-User should not set that attribute. + optional bool generated = 4 [ default = false ]; + } + + required string type = 1; + repeated Var inputs = 2; + repeated Var outputs = 3; + repeated Attr attrs = 4; + required string comment = 5; +} + +message VarType { + enum Type { + // Pod Types + BOOL = 0; + INT16 = 1; + INT32 = 2; + INT64 = 3; + FP16 = 4; + FP32 = 5; + FP64 = 6; + // Tensor is used in C++. + SIZE_T = 19; + UINT8 = 20; + INT8 = 21; + + // Other types that may need additional descriptions + LOD_TENSOR = 7; + SELECTED_ROWS = 8; + FEED_MINIBATCH = 9; + FETCH_LIST = 10; + STEP_SCOPES = 11; + LOD_RANK_TABLE = 12; + LOD_TENSOR_ARRAY = 13; + PLACE_LIST = 14; + READER = 15; + // Any runtime decided variable type is raw + // raw variables should manage their own allocations + // in operators like nccl_op + RAW = 17; + TUPLE = 18; + } + + required Type type = 1; + + message TensorDesc { + // Should only be PODType. 
Is enforced in C++ + required Type data_type = 1; + repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] + } + optional TensorDesc selected_rows = 2; + + message LoDTensorDesc { + required TensorDesc tensor = 1; + optional int32 lod_level = 2 [ default = 0 ]; + } + optional LoDTensorDesc lod_tensor = 3; + + message LoDTensorArrayDesc { + required TensorDesc tensor = 1; + optional int32 lod_level = 2 [ default = 0 ]; + } + optional LoDTensorArrayDesc tensor_array = 4; + + message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; } + optional ReaderDesc reader = 5; + + message Tuple { repeated Type element_type = 1; } + optional Tuple tuple = 7; +} + +message VarDesc { + required string name = 1; + required VarType type = 2; + optional bool persistable = 3 [ default = false ]; + // True if the variable is an input data and + // have to check the feed data shape and dtype + optional bool need_check_feed = 4 [ default = false ]; +} + +message BlockDesc { + required int32 idx = 1; + required int32 parent_idx = 2; + repeated VarDesc vars = 3; + repeated OpDesc ops = 4; + optional int32 forward_block_idx = 5 [ default = -1 ]; +} + +// CompatibleInfo is used to determine if a feature is compatible and +// provides the information. +message CompatibleInfo { + enum Type { + COMPATIBLE = 0; + DEFINITELY_NOT = 1; + POSSIBLE = 2; + BUG_FIX = 3; + PRECISION_CHANGE = 4; + } + required string version = 1; + required Type type = 2; +} + +// In some cases, Paddle Fluid may perform operator definition iterations, +// and the operator uses OpCompatibleMap for compatibility testing. +message OpCompatibleMap { + message OpCompatiblePair { + required string op_name = 1; + required CompatibleInfo compatible_info = 2; + } + repeated OpCompatiblePair pair = 1; + optional string default_required_version = 2; +} + +// Please refer to +// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md +// for more details. +// TODO(panyx0718): A model can have multiple programs. Need a +// way to distinguish them. Maybe ID or name? +message ProgramDesc { + reserved 2; // For backward compatibility. + repeated BlockDesc blocks = 1; + optional Version version = 4; + optional OpCompatibleMap op_compatible_map = 3; +} \ No newline at end of file diff --git a/paddle/infrt/paddle/model_parser.cc b/paddle/infrt/paddle/model_parser.cc new file mode 100644 index 0000000000000..285280e69435b --- /dev/null +++ b/paddle/infrt/paddle/model_parser.cc @@ -0,0 +1,172 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
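Editor's note: model_parser.cc below reads these messages back from a saved model; as a standalone illustration, loading a whole serialized ProgramDesc with the generated protobuf API could look like the sketch below. The generated header path is an assumption based on the proto_library() rule earlier in this patch, and error handling is reduced to glog CHECKs:

#include <fstream>
#include <iterator>
#include <string>

#include <glog/logging.h>

#include "paddle/infrt/paddle/framework.pb.h"  // assumed output name of framework.proto

paddle::framework::proto::ProgramDesc LoadProgramDesc(const std::string& path) {
  std::ifstream fin(path, std::ios::binary);
  CHECK(fin.is_open()) << "Cannot open " << path;
  std::string buf((std::istreambuf_iterator<char>(fin)),
                  std::istreambuf_iterator<char>());
  paddle::framework::proto::ProgramDesc program;
  CHECK(program.ParseFromString(buf)) << "Cannot parse ProgramDesc from " << path;
  return program;
}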
+ +#include "paddle/infrt/paddle/model_parser.h" + +#include +#include + +#include "paddle/infrt/common/common.h" +#include "paddle/infrt/common/string.h" +#include "paddle/infrt/common/target.h" +#include "paddle/infrt/common/type.h" + +namespace infrt::paddle { + +int SizeOfType(framework_proto::VarType::Type type) { + using Type = framework_proto::VarType::Type; + switch (static_cast(type)) { +#define DO(desc, type) \ + case Type::VarType_Type_##desc: \ + return sizeof(type); + DO(BOOL, bool); + DO(FP16, float); + DO(FP32, float); + DO(INT8, int8_t); + DO(INT16, int16_t); + DO(INT32, int); + DO(INT64, int64_t); +#undef DO + default: + LOG(FATAL) << "unknown data type " << type; + } + return -1; +} + +void TensorFromStream(std::istream &is, + _Tensor_ *tensor, + const common::Target &target) { + using Type = framework_proto::VarType::Type; + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + CHECK_EQ(version, 0U) << "Only version 0 is supported"; + // read tensor desc + framework_proto::VarType::TensorDesc desc; + { + // int32_t size + // proto buffer + int32_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::unique_ptr buf(new char[size]); + is.read(reinterpret_cast(buf.get()), size); + CHECK(desc.ParseFromArray(buf.get(), size)) << "Cannot parse tensor desc"; + } + + // read tensor + std::vector dims_vec; + std::copy( + desc.dims().begin(), desc.dims().end(), std::back_inserter(dims_vec)); + Shape dims(dims_vec); + tensor->Resize(dims); + void *buf; + size_t size = tensor->shape().numel() * SizeOfType(desc.data_type()); + // alllocate memory + if (target.arch == Target::Arch::X86) { + switch (static_cast(desc.data_type())) { +#define SET_TENSOR(desc, type, precision) \ + case Type::VarType_Type_##desc: \ + buf = tensor->mutable_data(target); \ + tensor->set_type(precision); \ + break + + SET_TENSOR(FP32, float, Float(32)); + SET_TENSOR(INT8, int8_t, Int(8)); + SET_TENSOR(INT16, int16_t, Int(16)); + SET_TENSOR(INT32, int32_t, Int(32)); + SET_TENSOR(INT64, int64_t, Int(64)); +#undef SET_TENSOR + default: + LOG(FATAL) << "unknown type " << desc.data_type(); + } + // tensor->set_persistable(true); + is.read(static_cast(buf), size); + } else if (target.arch == Target::Arch::NVGPU) { +#ifdef INFRT_WITH_CUDA + if (desc.data_type() != Type::VarType_Type_FP32) + LOG(FATAL) << "[CUDA] The type is not fp32!!"; + auto *data = tensor->mutable_data(target); + tensor->set_type(infrt::common::Float(32)); + std::vector temp(tensor->shape().numel()); + // LOG(INFO) <<"[CUDA] The tensor's size is "<< tensor->shape().numel(); + is.read(reinterpret_cast(temp.data()), size); + CUDA_CALL(cudaMemcpy(reinterpret_cast(data), + temp.data(), + tensor->shape().numel() * sizeof(float), + cudaMemcpyHostToDevice)); +#else + LOG(FATAL) << "To use CUDA backends, you need to set WITH_CUDA ON!"; +#endif + } else { + INFRT_NOT_IMPLEMENTED + } +} + +void LoadLoDTensor(std::istream &is, _Variable *var, const Target &target) { + auto &tensor = var->get(); + uint32_t version{}; + is.read(reinterpret_cast(&version), sizeof(version)); + VLOG(3) << "model version " << version; + + // Load LoD information + uint64_t lod_level{}; + is.read(reinterpret_cast(&lod_level), sizeof(lod_level)); + + for (uint64_t i = 0; i < lod_level; ++i) { + uint64_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::vector tmp(size / sizeof(uint64_t)); + is.read(reinterpret_cast(tmp.data()), + static_cast(size)); + // lod[i] = tmp; + } + + TensorFromStream(is, tensor.operator->(), target); +} + +void 
ReadBinaryFile(const std::string &filename, std::string *contents) { + std::ifstream fin(filename, std::ios::in | std::ios::binary); + CHECK(fin.is_open()) << "Cannot open file: " << filename; + fin.seekg(0, std::ios::end); + auto size = fin.tellg(); + contents->clear(); + contents->resize(size); + fin.seekg(0, std::ios::beg); + fin.read(&(contents->at(0)), contents->size()); + fin.close(); +} + +std::unique_ptr LoadProgram( + const std::string &path, bool program_from_memory) { + std::unique_ptr main_program( + new framework_proto::ProgramDesc); + if (!program_from_memory) { + std::string desc_str; + ReadBinaryFile(path, &desc_str); + main_program->ParseFromString(desc_str); + } else { + main_program->ParseFromString(path); + } + return main_program; +} + +void LoadParams(const std::string &path) {} + +// Load directly to CPU, and latter transfer to other devices. +void LoadParam(const std::string &path, _Variable *out, const Target &target) { + std::ifstream fin(path, std::ios::binary); + CHECK(fin.is_open()) << "failed to open file " << path; + LoadLoDTensor(fin, out, target); +} + +} // namespace infrt::paddle diff --git a/paddle/infrt/paddle/model_parser.h b/paddle/infrt/paddle/model_parser.h new file mode 100644 index 0000000000000..73125fadedb82 --- /dev/null +++ b/paddle/infrt/paddle/model_parser.h @@ -0,0 +1,55 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include + +#include "paddle/infrt/paddle/framework.pb.h" +#include "paddle/infrt/paddle/pb/block_desc.h" +#include "paddle/infrt/paddle/pb/op_desc.h" +#include "paddle/infrt/paddle/pb/program_desc.h" +#include "paddle/infrt/paddle/scope.h" +#include "paddle/infrt/paddle/tensor.h" + +namespace infrt::paddle { +namespace framework_proto = ::paddle::framework::proto; + +// Read a __model__ file. +std::unique_ptr LoadProgram( + const std::string& path, bool program_from_memory = false); + +void LoadLoDTensor(std::istream& is, + _Variable* var, + const common::Target& target); + +// Read a single file containing all the parameters. +void LoadParams(const std::string& path); + +// Load a single parameter to an output tensor. 
+void LoadParam(const std::string& path, + _Variable* out, + const common::Target& target); + +// LoDTensor to ostream +void TensorToStream(std::ostream& os, const _Tensor_& tensor); +void TensorFromStream( + std::istream& is, + _Tensor_* tensor, + const common::Target& target = common::DefaultHostTarget()); +void ReadBinaryFile(const std::string& filename, std::string* contents); + +} // namespace infrt::paddle diff --git a/paddle/infrt/paddle/pb/CMakeLists.txt b/paddle/infrt/paddle/pb/CMakeLists.txt new file mode 100644 index 0000000000000..fac38afa62db2 --- /dev/null +++ b/paddle/infrt/paddle/pb/CMakeLists.txt @@ -0,0 +1,20 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + var_desc.cc + op_desc.cc + block_desc.cc + program_desc.cc + ) + +foreach(cpp ${SRCS}) + set(infrt_src + "${infrt_src};infrt/paddle/pb/${cpp}" + CACHE INTERNAL "") +endforeach() + +file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h) + +foreach(header ${includes}) + set(core_includes "${core_includes};${header}" CACHE INTERNAL "") +endforeach() diff --git a/paddle/infrt/paddle/pb/block_desc.cc b/paddle/infrt/paddle/pb/block_desc.cc new file mode 100644 index 0000000000000..11186bc68af16 --- /dev/null +++ b/paddle/infrt/paddle/pb/block_desc.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/paddle/pb/block_desc.h" + +namespace infrt::paddle::pb { + +template <> +framework_proto::VarDesc* BlockDesc::GetVar( + int32_t idx) { + CHECK_LT(idx, static_cast(VarsSize())) << "idx >= vars.size()"; + return desc_->mutable_vars(idx); +} + +template <> +framework_proto::VarDesc* BlockDesc::AddVar() { + return desc_->add_vars(); +} + +template <> +framework_proto::OpDesc* BlockDesc::GetOp( + int32_t idx) { + CHECK_LT(idx, static_cast(OpsSize())) << "idx >= ops.size()"; + return desc_->mutable_ops(idx); +} + +template <> +framework_proto::OpDesc* BlockDesc::AddOp() { + return desc_->add_ops(); +} + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/pb/block_desc.h b/paddle/infrt/paddle/pb/block_desc.h new file mode 100644 index 0000000000000..9c1b7f9adf172 --- /dev/null +++ b/paddle/infrt/paddle/pb/block_desc.h @@ -0,0 +1,77 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
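Editor's note: a brief usage sketch (not part of the patch) of the pb::BlockDesc wrapper, whose template specializations appear above and whose declaration follows in block_desc.h; the variable name and op type are hypothetical.

// Editorial sketch only.
#include "paddle/infrt/paddle/pb/block_desc.h"

void BuildBlock(::paddle::framework::proto::BlockDesc* raw_block) {
  infrt::paddle::pb::BlockDesc block(raw_block);

  auto* var = block.AddVar<::paddle::framework::proto::VarDesc>();
  var->set_name("w");  // hypothetical variable name

  auto* op = block.AddOp<::paddle::framework::proto::OpDesc>();
  op->set_type("relu");  // hypothetical op type

  // Assuming raw_block started out empty.
  CHECK_EQ(block.VarsSize(), 1u);
  CHECK_EQ(block.OpsSize(), 1u);
}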
+ +#pragma once +#include + +#include "paddle/infrt/paddle/cpp/desc_api.h" +#include "paddle/infrt/paddle/framework.pb.h" + +namespace infrt::paddle::pb { + +namespace framework_proto = ::paddle::framework::proto; + +class BlockDesc : public cpp::BlockDescAPI { + public: + BlockDesc() = delete; + + explicit BlockDesc(framework_proto::BlockDesc* desc) : desc_(desc) { + CHECK(desc_); + } + + framework_proto::BlockDesc* Proto() { return desc_; } + + const framework_proto::BlockDesc& ReadonlyProto() const { return *desc_; } + + int32_t Idx() const override { return desc_->idx(); } + + void SetIdx(int32_t idx) override { desc_->set_idx(idx); } + + int32_t ParentIdx() const override { return desc_->parent_idx(); } + + void SetParentIdx(int32_t idx) override { desc_->set_parent_idx(idx); } + + size_t VarsSize() const override { return desc_->vars_size(); } + + void ClearVars() override { desc_->clear_vars(); } + + template + T* GetVar(int32_t idx); + + template + T* AddVar(); + + size_t OpsSize() const override { return desc_->ops_size(); } + + void ClearOps() override { desc_->clear_ops(); } + + template + T* GetOp(int32_t idx); + + template + T* AddOp(); + + int32_t ForwardBlockIdx() const override { + return desc_->forward_block_idx(); + } + + void SetForwardBlockIdx(int32_t idx) override { + desc_->set_forward_block_idx(idx); + } + + private: + framework_proto::BlockDesc* desc_; // not_own +}; + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/pb/op_desc.cc b/paddle/infrt/paddle/pb/op_desc.cc new file mode 100644 index 0000000000000..c7b1e66f50642 --- /dev/null +++ b/paddle/infrt/paddle/pb/op_desc.cc @@ -0,0 +1,139 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
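Editor's note: op_desc.cc below supplies the attribute plumbing (FindAttr plus the SetAttr/GetAttr specializations); the sketch here (not part of the patch) shows the intended round trip, with hypothetical op and attribute names.

// Editorial sketch only.
#include "paddle/infrt/paddle/pb/op_desc.h"

void AttrRoundTrip(::paddle::framework::proto::OpDesc* raw_op) {
  raw_op->set_type("softmax");  // hypothetical op type
  infrt::paddle::pb::OpDesc op(raw_op);

  op.SetAttr<int>("axis", -1);       // stored as INT
  op.SetAttr<float>("scale", 0.5f);  // stored as FLOAT
  op.SetAttr<std::string>("name", "softmax_0");

  CHECK(op.HasAttr("axis"));
  int axis = op.GetAttr<int>("axis");
  (void)axis;
  // AttrNames() now returns {"axis", "scale", "name"}.
}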
+ +#include "paddle/infrt/paddle/pb/op_desc.h" + +namespace infrt::paddle::pb { + +google::protobuf::internal::RepeatedPtrIterator +FindAttr(framework_proto::OpDesc *desc, const std::string &name) { + auto &xs = *desc->mutable_attrs(); + auto it = std::find_if( + xs.begin(), xs.end(), [&](const framework_proto::OpDesc_Attr &x) { + return x.name() == name; + }); + if (it == xs.end()) { + auto *attr = xs.Add(); + attr->set_name(name); + it = std::find_if( + xs.begin(), xs.end(), [&](const framework_proto::OpDesc_Attr &x) { + return x.name() == name; + }); + } + return it; +} + +#define SET_IMPL_ONE(T, ty__, pb_f__) \ + template <> \ + void OpDesc::SetAttr(const std::string &name, const T &v) { \ + auto it = FindAttr(desc_, name); \ + it->set_type(framework_proto::ty__); \ + it->set_##pb_f__(v); \ + } +SET_IMPL_ONE(int, INT, i); +SET_IMPL_ONE(float, FLOAT, f); +SET_IMPL_ONE(bool, BOOLEAN, b); +SET_IMPL_ONE(int64_t, LONG, l); + +template <> +void OpDesc::SetAttr>(const std::string &name, + const std::vector &v) { + auto it = FindAttr(desc_, name); + it->set_type(framework_proto::INTS); + it->clear_ints(); + for (auto &i : v) { + it->add_ints(i); + } +} + +template <> +void OpDesc::SetAttr(const std::string &name, + const std::string &v) { + auto it = FindAttr(desc_, name); + it->set_type(framework_proto::STRING); + it->set_s(v.c_str()); +} + +template <> +void OpDesc::SetAttr>(const std::string &name, + const std::vector &v) { + auto it = FindAttr(desc_, name); + it->set_type(framework_proto::FLOATS); + it->clear_floats(); + for (auto &i : v) { + it->add_floats(i); + } +} + +template <> +void OpDesc::SetAttr>( + const std::string &name, const std::vector &v) { + auto it = FindAttr(desc_, name); + it->set_type(framework_proto::STRINGS); + it->clear_strings(); + for (auto &i : v) { + it->add_strings(i); + } +} + +template <> +void OpDesc::SetAttr>(const std::string &name, + const std::vector &v) { + auto it = FindAttr(desc_, name); + it->set_type(framework_proto::LONGS); + it->clear_longs(); + for (auto &i : v) { + it->add_longs(i); + } +} +google::protobuf::internal::RepeatedPtrIterator< + const framework_proto::OpDesc_Attr> +GetFindAttr(const framework_proto::OpDesc &desc, const std::string &name) { + auto &xs = desc.attrs(); + auto it = std::find_if( + xs.begin(), xs.end(), [&](const framework_proto::OpDesc_Attr &x) { + return x.name() == name; + }); + return it; +} + +#define GET_ATTR_IMPL(T, pb_f__) \ + template <> \ + T OpDesc::GetAttr(const std::string &name) const { \ + auto it = GetFindAttr(*desc_, name); \ + return it->pb_f__(); \ + } + +#define GET_ATTRS_IMPL(T, pb_f__) \ + template <> \ + T OpDesc::GetAttr(const std::string &name) const { \ + auto it = GetFindAttr(*desc_, name); \ + T res; \ + for (const auto &v : it->pb_f__()) { \ + res.push_back(v); \ + } \ + return res; \ + } +GET_ATTR_IMPL(int32_t, i); +GET_ATTR_IMPL(int16_t, block_idx); +GET_ATTR_IMPL(float, f); +GET_ATTR_IMPL(bool, b); +GET_ATTR_IMPL(int64_t, l); +GET_ATTRS_IMPL(std::vector, ints); +GET_ATTRS_IMPL(std::vector, floats); +GET_ATTRS_IMPL(std::vector, strings); +GET_ATTR_IMPL(std::string, s); +GET_ATTRS_IMPL(std::vector, longs); + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/pb/op_desc.h b/paddle/infrt/paddle/pb/op_desc.h new file mode 100644 index 0000000000000..81d57d9f32252 --- /dev/null +++ b/paddle/infrt/paddle/pb/op_desc.h @@ -0,0 +1,198 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "paddle/infrt/paddle/cpp/desc_api.h" +#include "paddle/infrt/paddle/framework.pb.h" +#include "paddle/infrt/support/variant.h" + +namespace infrt::paddle::pb { + +namespace framework_proto = ::paddle::framework::proto; + +using Attribute = + Variant, std::vector>; +using VariableNameMap = std::map>; + +/* + * The lite::OpDesc, an light-weight implementation of wrapper of proto::OpDesc. + * Unlike the original one in framework::OpDesc, we remove the local members + * except the desc_, to avoid the inconsistent state, which is normal in the + * original interface and results in bugs. + */ +class OpDesc : public cpp::OpDescAPI { + public: + OpDesc() = delete; + + explicit OpDesc(framework_proto::OpDesc *desc) : desc_(desc) { CHECK(desc_); } + + framework_proto::OpDesc *Proto() { return desc_; } + const framework_proto::OpDesc &ReadonlyProto() const { return *desc_; } + + std::string Type() const override { return desc_->type(); } + + void SetType(const std::string &type) override { desc_->set_type(type); } + + // Get the arguments of parameter called `param` + std::vector Input(const std::string ¶m) const override { + return GetArguments(desc_->inputs(), param); + } + + std::vector InputArgumentNames() const override { + return GetArgumentNames(desc_->inputs()); + } + + void SetInput(const std::string ¶m, + const std::vector &args) override { + SetArgument(desc_->mutable_inputs(), param, args); + } + + std::vector Output(const std::string ¶m) const override { + return GetArguments(desc_->outputs(), param); + } + + std::vector OutputArgumentNames() const override { + return GetArgumentNames(desc_->outputs()); + } + + void SetOutput(const std::string ¶m, + const std::vector &args) override { + SetArgument(desc_->mutable_outputs(), param, args); + } + + bool HasAttr(const std::string &name) const override { + const auto &xs = desc_->attrs(); + auto it = std::find_if( + xs.begin(), xs.end(), [&](const framework_proto::OpDesc_Attr &x) { + return x.name() == name; + }); + return it != xs.end(); + } + + AttrType GetAttrType(const std::string &name) const override { + const auto &xs = desc_->attrs(); + auto it = std::find_if( + xs.begin(), xs.end(), [&](const framework_proto::OpDesc_Attr &x) { + return x.name() == name; + }); + CHECK(it != xs.end()); +#define DEF_ONE(type__) \ + case framework_proto::AttrType::type__: \ + return AttrType::type__; + + switch (it->type()) { + DEF_ONE(INT); + DEF_ONE(FLOAT); + DEF_ONE(STRING); + DEF_ONE(INTS); + DEF_ONE(FLOATS); + DEF_ONE(STRINGS); + DEF_ONE(BOOLEAN); + DEF_ONE(BOOLEANS); + DEF_ONE(BLOCK); + DEF_ONE(LONG); + DEF_ONE(BLOCKS); + DEF_ONE(LONGS); + default: + LOG(FATAL) << "Unknown attribute type"; + return static_cast(-1); + } +#undef DEF_ONE + } + + std::vector AttrNames() const override { + std::vector res; + const auto &xs = desc_->attrs(); + std::transform( + xs.begin(), + xs.end(), + std::back_inserter(res), + [](const framework_proto::OpDesc_Attr &x) { 
return x.name(); }); + return res; + } + + template + void SetAttr(const std::string &name, const T &v); + + template + T GetAttr(const std::string &name) const; + + private: + std::vector GetArguments( + const google::protobuf::RepeatedPtrField &xs, + const std::string ¶m) const { + std::vector res; + auto it = std::find_if( + xs.begin(), xs.end(), [&](const framework_proto::OpDesc_Var &it) { + return it.parameter() == param; + }); + CHECK(it != xs.end()); + + const auto &ys = it->arguments(); + std::transform(ys.begin(), + ys.end(), + std::back_inserter(res), + [](const std::string &x) { return x; }); + return res; + } + + void SetArgument( + google::protobuf::RepeatedPtrField *xs, + const std::string ¶m, + const std::vector &args) { + auto it = std::find_if( + xs->begin(), xs->end(), [&](const framework_proto::OpDesc_Var &it) { + return it.parameter() == param; + }); + if (it == xs->end()) { + auto *new_arg = xs->Add(); + new_arg->set_parameter(param); + for (const auto &arg : args) { + *new_arg->mutable_arguments()->Add() = arg; + } + } else { + it->mutable_arguments()->Clear(); + for (const auto &arg : args) { + *it->mutable_arguments()->Add() = arg; + } + } + } + + std::vector GetArgumentNames( + const google::protobuf::RepeatedPtrField &xs) + const { + std::vector res; + std::transform( + xs.begin(), + xs.end(), + std::back_inserter(res), + [](const framework_proto::OpDesc_Var &x) { return x.parameter(); }); + return res; + } + + private: + framework_proto::OpDesc *desc_; +}; + +template <> +void OpDesc::SetAttr(const std::string &name, + const std::string &v); + +template <> +void OpDesc::SetAttr>(const std::string &name, + const std::vector &v); + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/pb/program_desc.cc b/paddle/infrt/paddle/pb/program_desc.cc new file mode 100644 index 0000000000000..ed8a7e36e0129 --- /dev/null +++ b/paddle/infrt/paddle/pb/program_desc.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/paddle/pb/program_desc.h" + +#include +#include + +namespace infrt::paddle::pb { + +template <> +framework_proto::BlockDesc* ProgramDesc::GetBlock( + int32_t idx) { + CHECK_LT(idx, static_cast(BlocksSize())) << "idx >= blocks.size()"; + return desc_->mutable_blocks(idx); +} + +template <> +framework_proto::BlockDesc* +ProgramDesc::AddBlock() { + return desc_->add_blocks(); +} + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/pb/program_desc.h b/paddle/infrt/paddle/pb/program_desc.h new file mode 100644 index 0000000000000..4adad650c974d --- /dev/null +++ b/paddle/infrt/paddle/pb/program_desc.h @@ -0,0 +1,61 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include +#include + +#include "paddle/infrt/paddle/cpp/desc_api.h" +#include "paddle/infrt/paddle/framework.pb.h" + +namespace infrt::paddle::pb { +namespace framework_proto = ::paddle::framework::proto; + +class ProgramDesc : public cpp::ProgramDescAPI { + public: + ProgramDesc() = delete; + + explicit ProgramDesc(framework_proto::ProgramDesc *desc) : desc_(desc) { + CHECK(desc_); + } + + framework_proto::ProgramDesc *Proto() { return desc_; } + + const framework_proto::ProgramDesc &ReadonlyProto() const { return *desc_; } + + size_t BlocksSize() const override { return desc_->blocks_size(); } + + void ClearBlocks() override { desc_->clear_blocks(); } + + template + T *GetBlock(int32_t idx); + + template + T *AddBlock(); + + bool HasVersion() const override { return desc_->has_version(); } + + int64_t Version() const override { return desc_->version().version(); } + + void SetVersion(int64_t version) override { + desc_->mutable_version()->set_version(version); + } + + private: + framework_proto::ProgramDesc *desc_; // not_own +}; + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/pb/var_desc.cc b/paddle/infrt/paddle/pb/var_desc.cc new file mode 100644 index 0000000000000..cf80df4f1b845 --- /dev/null +++ b/paddle/infrt/paddle/pb/var_desc.cc @@ -0,0 +1,367 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
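Editor's note: with pb::ProgramDesc above in place, a typical read path over a loaded program looks like the sketch below (not part of the patch); it relies only on wrapper methods introduced in this diff.

// Editorial sketch only.
#include "paddle/infrt/paddle/pb/block_desc.h"
#include "paddle/infrt/paddle/pb/program_desc.h"

void DumpOps(::paddle::framework::proto::ProgramDesc* raw_program) {
  infrt::paddle::pb::ProgramDesc program(raw_program);
  for (size_t i = 0; i < program.BlocksSize(); ++i) {
    auto* block_proto = program.GetBlock<::paddle::framework::proto::BlockDesc>(
        static_cast<int32_t>(i));
    infrt::paddle::pb::BlockDesc block(block_proto);
    for (size_t j = 0; j < block.OpsSize(); ++j) {
      auto* op = block.GetOp<::paddle::framework::proto::OpDesc>(
          static_cast<int32_t>(j));
      VLOG(3) << "op: " << op->type();
    }
  }
}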
+ +#include "paddle/infrt/paddle/pb/var_desc.h" + +#include + +#include "paddle/infrt/paddle/cpp/desc_api.h" +#include "paddle/infrt/paddle/framework.pb.h" + +namespace infrt::paddle::pb { + +cpp::VarDescAPI::Type VarDesc::GetType() const { + auto type = desc_->type().type(); + +#define GET_TYPE_CASE_ITEM(type__) \ + case framework_proto::VarType::type__: \ + return cpp::VarDescAPI::Type::type__; + + switch (type) { + GET_TYPE_CASE_ITEM(LOD_TENSOR); + GET_TYPE_CASE_ITEM(LOD_TENSOR_ARRAY); + GET_TYPE_CASE_ITEM(LOD_RANK_TABLE); + GET_TYPE_CASE_ITEM(SELECTED_ROWS); + GET_TYPE_CASE_ITEM(FEED_MINIBATCH); + GET_TYPE_CASE_ITEM(FETCH_LIST); + GET_TYPE_CASE_ITEM(STEP_SCOPES); + GET_TYPE_CASE_ITEM(PLACE_LIST); + GET_TYPE_CASE_ITEM(READER); + default: + LOG(FATAL) << "Unknown var type"; + return VarDescAPI::Type(); + } +#undef GET_TYPE_CASE_ITEM +} + +void VarDesc::SetType(VarDescAPI::Type type) { +#define SET_TYPE_CASE_ITEM(type__) \ + case VarDescAPI::Type::type__: \ + desc_->mutable_type()->set_type(framework_proto::VarType::type__); \ + break; + + switch (type) { + SET_TYPE_CASE_ITEM(LOD_TENSOR); + SET_TYPE_CASE_ITEM(LOD_TENSOR_ARRAY); + SET_TYPE_CASE_ITEM(LOD_RANK_TABLE); + SET_TYPE_CASE_ITEM(SELECTED_ROWS); + SET_TYPE_CASE_ITEM(FEED_MINIBATCH); + SET_TYPE_CASE_ITEM(FETCH_LIST); + SET_TYPE_CASE_ITEM(STEP_SCOPES); + SET_TYPE_CASE_ITEM(PLACE_LIST); + SET_TYPE_CASE_ITEM(READER); + default: + LOG(FATAL) << "Unknown var type"; + } +#undef SET_TYPE_CASE_ITEM +} + +void VarDesc::SetShape(const std::vector &dims) { + VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims()); +} + +void VarDesc::SetTensorDescNum(size_t num) { + switch (desc_->type().type()) { + case framework_proto::VarType::READER: { + auto *lod_tensors_ptr = + desc_->mutable_type()->mutable_reader()->mutable_lod_tensor(); + lod_tensors_ptr->Clear(); + for (size_t i = 0; i < num; ++i) { + lod_tensors_ptr->Add(); + } + return; + } break; + default: + LOG(FATAL) << "Setting 'sub_tensor_number' is not supported by the type " + "of var %s." + << this->Name(); + } +} + +size_t VarDesc::GetTensorDescNum() const { + switch (desc_->type().type()) { + case framework_proto::VarType::READER: + return desc_->type().reader().lod_tensor_size(); + break; + default: + LOG(FATAL) << "Getting 'sub_tensor_number' is not supported by the type " + "of var %s." + << this->Name(); + } + return 0; +} + +void VarDesc::SetShapes( + const std::vector> &multiple_dims) { + if (multiple_dims.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given shapes(" << multiple_dims.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). 
The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_dims.size()); + } + std::vector tensors = + mutable_tensor_descs(); + for (size_t i = 0; i < multiple_dims.size(); ++i) { + VectorToRepeated(multiple_dims[i], tensors[i]->mutable_dims()); + } +} + +std::vector VarDesc::GetShape() const { + return RepeatedToVector(tensor_desc().dims()); +} + +std::vector> VarDesc::GetShapes() const { + std::vector descs = tensor_descs(); + std::vector> res; + res.reserve(descs.size()); + for (const auto &tensor_desc : descs) { + res.push_back(RepeatedToVector(tensor_desc.dims())); + } + return res; +} + +void VarDesc::SetDataType(VarDescAPI::VarDataType data_type) { +#define SET_DATA_TYPE_CASE_ITEM(type__) \ + case cpp::VarDescAPI::Type::type__: \ + mutable_tensor_desc()->set_data_type(framework_proto::VarType::type__); \ + break; + + switch (data_type) { + SET_DATA_TYPE_CASE_ITEM(BOOL); + SET_DATA_TYPE_CASE_ITEM(SIZE_T); + SET_DATA_TYPE_CASE_ITEM(UINT8); + SET_DATA_TYPE_CASE_ITEM(INT8); + SET_DATA_TYPE_CASE_ITEM(INT16); + SET_DATA_TYPE_CASE_ITEM(INT32); + SET_DATA_TYPE_CASE_ITEM(INT64); + SET_DATA_TYPE_CASE_ITEM(FP16); + SET_DATA_TYPE_CASE_ITEM(FP32); + SET_DATA_TYPE_CASE_ITEM(FP64); + default: + LOG(FATAL) << "Unknown var type: " << static_cast(data_type); + } +#undef SET_DATA_TYPE_CASE_ITEM +} + +void VarDesc::SetDataTypes( + const std::vector &multiple_data_type) { + if (multiple_data_type.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given data types(" + << multiple_data_type.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_data_type.size()); + } + std::vector tensor_descs = + mutable_tensor_descs(); + for (size_t i = 0; i < multiple_data_type.size(); ++i) { + tensor_descs[i]->set_data_type(multiple_data_type[i]); + } +} + +// proto::VarType::Type VarDesc::GetDataType() const { +// return tensor_desc().data_type(); +// } +cpp::VarDescAPI::VarDataType VarDesc::GetDataType() const { + CHECK(desc_->has_type()) << "The var's type hasn't been set."; + CHECK(desc_->type().has_type()) << "The var type hasn't been set."; + if (desc_->type().type() != framework_proto::VarType::LOD_TENSOR) { + return VarDescAPI::Type(); + } + auto type = tensor_desc().data_type(); +#define GET_DATA_TYPE_CASE_ITEM(type__) \ + case framework_proto::VarType::Type::VarType_Type_##type__: \ + return VarDescAPI::Type::type__ + + switch (type) { + GET_DATA_TYPE_CASE_ITEM(BOOL); + GET_DATA_TYPE_CASE_ITEM(SIZE_T); + GET_DATA_TYPE_CASE_ITEM(UINT8); + GET_DATA_TYPE_CASE_ITEM(INT8); + GET_DATA_TYPE_CASE_ITEM(INT16); + GET_DATA_TYPE_CASE_ITEM(INT32); + GET_DATA_TYPE_CASE_ITEM(INT64); + GET_DATA_TYPE_CASE_ITEM(FP16); + GET_DATA_TYPE_CASE_ITEM(FP32); + GET_DATA_TYPE_CASE_ITEM(FP64); + default: + LOG(FATAL) << "Unknown var type: " << static_cast(type); + return VarDescAPI::Type(); + } +#undef GET_DATA_TYPE_CASE_ITEM +} + +std::vector VarDesc::GetDataTypes() const { + std::vector descs = tensor_descs(); + std::vector res; + res.reserve(descs.size()); + for (const auto &tensor_desc : descs) { + res.push_back(tensor_desc.data_type()); + } + return res; +} + +void VarDesc::SetLoDLevel(int32_t lod_level) { + switch (desc_->type().type()) { + case framework_proto::VarType::LOD_TENSOR: + desc_->mutable_type()->mutable_lod_tensor()->set_lod_level(lod_level); + break; + case framework_proto::VarType::LOD_TENSOR_ARRAY: + desc_->mutable_type()->mutable_tensor_array()->set_lod_level(lod_level); + break; 
+ default: + LOG(FATAL) + << "Setting 'lod_level' is not supported by the type of var %s." + << this->Name(); + } +} + +void VarDesc::SetLoDLevels(const std::vector &multiple_lod_level) { + if (multiple_lod_level.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given lod_levels(" + << multiple_lod_level.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_lod_level.size()); + } + switch (desc_->type().type()) { + case framework_proto::VarType::READER: { + size_t i = 0; + for (auto &lod_tensor : + *desc_->mutable_type()->mutable_reader()->mutable_lod_tensor()) { + lod_tensor.set_lod_level(multiple_lod_level[i++]); + } + } break; + default: + LOG(FATAL) + << "Setting 'lod_levels' is not supported by the type of var %s." + << this->Name(); + } +} + +int32_t VarDesc::GetLoDLevel() const { + switch (desc_->type().type()) { + case framework_proto::VarType::LOD_TENSOR: + return desc_->type().lod_tensor().lod_level(); + case framework_proto::VarType::LOD_TENSOR_ARRAY: + return desc_->type().tensor_array().lod_level(); + default: + LOG(FATAL) + << "Getting 'lod_level' is not supported by the type of var %s." + << this->Name(); + } + return 0; +} + +std::vector VarDesc::GetLoDLevels() const { + std::vector res; + switch (desc_->type().type()) { + case framework_proto::VarType::READER: + res.reserve(desc_->type().reader().lod_tensor_size()); + for (auto &lod_tensor : desc_->type().reader().lod_tensor()) { + res.push_back(lod_tensor.lod_level()); + } + return res; + break; + default: + LOG(FATAL) + << "Getting 'lod_levels' is not supported by the type of var %s." + << this->Name(); + } + return std::vector(); +} + +const framework_proto::VarType::TensorDesc &VarDesc::tensor_desc() const { + CHECK(desc_->has_type()) << "The var's type hasn't been set."; + CHECK(desc_->type().has_type()) << "The var type hasn't been set."; + switch (desc_->type().type()) { + case framework_proto::VarType::SELECTED_ROWS: + return desc_->type().selected_rows(); + case framework_proto::VarType::LOD_TENSOR: + return desc_->type().lod_tensor().tensor(); + case framework_proto::VarType::LOD_TENSOR_ARRAY: + return desc_->type().tensor_array().tensor(); + default: + LOG(FATAL) + << "Getting 'tensor_desc' is not supported by the type of var %s." + << this->Name(); + } + return framework_proto::VarDesc().type().lod_tensor().tensor(); +} + +std::vector VarDesc::tensor_descs() + const { + CHECK(desc_->has_type()) << "The var type hasn't been set."; + std::vector res; + res.reserve(GetTensorDescNum()); + switch (desc_->type().type()) { + case framework_proto::VarType::READER: + for (const auto &lod_tensor : desc_->type().reader().lod_tensor()) { + res.push_back(lod_tensor.tensor()); + } + return res; + default: + LOG(FATAL) + << "Getting 'tensor_descs' is not supported by the type of var " + "%s." 
+ << this->Name(); + } + return std::vector(); +} + +framework_proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() { + CHECK(desc_->has_type()) << "The var type hasn't been set."; + CHECK(desc_->type().has_type()) << "The var type hasn't been set."; + switch (desc_->type().type()) { + case framework_proto::VarType::SELECTED_ROWS: + return desc_->mutable_type()->mutable_selected_rows(); + case framework_proto::VarType::LOD_TENSOR: + return desc_->mutable_type()->mutable_lod_tensor()->mutable_tensor(); + case framework_proto::VarType::LOD_TENSOR_ARRAY: + return desc_->mutable_type()->mutable_tensor_array()->mutable_tensor(); + default: + LOG(FATAL) << "Getting 'mutable_tensor_desc' is not supported by the " + "type of var " + "%s." + << this->Name(); + } + return nullptr; +} + +std::vector +VarDesc::mutable_tensor_descs() { + CHECK(desc_->has_type()) << "The var type hasn't been set."; + CHECK(desc_->type().has_type()) << "The var type hasn't been set."; + std::vector res; + res.reserve(GetTensorDescNum()); + switch (desc_->type().type()) { + case framework_proto::VarType::READER: + for (auto &lod_tensor : + *desc_->mutable_type()->mutable_reader()->mutable_lod_tensor()) { + res.push_back(lod_tensor.mutable_tensor()); + } + return res; + default: + LOG(FATAL) + << "Getting 'tensor_descs' is not supported by the type of var " + "%s." + << this->Name(); + } + return std::vector(); +} + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/pb/var_desc.h b/paddle/infrt/paddle/pb/var_desc.h new file mode 100644 index 0000000000000..4cff5fdee0375 --- /dev/null +++ b/paddle/infrt/paddle/pb/var_desc.h @@ -0,0 +1,124 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include +#include +#include + +#include "paddle/infrt/paddle/cpp/desc_api.h" +#include "paddle/infrt/paddle/framework.pb.h" + +namespace infrt::paddle::pb { +namespace framework_proto = ::paddle::framework::proto; + +// convert between std::vector and protobuf repeated. +template +inline std::vector RepeatedToVector( + const google::protobuf::RepeatedField &repeated_field) { + std::vector ret; + ret.reserve(repeated_field.size()); + std::copy( + repeated_field.begin(), repeated_field.end(), std::back_inserter(ret)); + return ret; +} + +template +inline void VectorToRepeated(const std::vector &vec, + RepeatedField *repeated_field) { + repeated_field->Clear(); + repeated_field->Reserve(vec.size()); + for (const auto &elem : vec) { + *repeated_field->Add() = elem; + } +} + +// Specialize vector. 
+template +inline void VectorToRepeated(const std::vector &vec, + RepeatedField *repeated_field) { + repeated_field->Clear(); + repeated_field->Reserve(vec.size()); + for (auto elem : vec) { + *repeated_field->Add() = elem; + } +} + +class VarDesc : public cpp::VarDescAPI { + public: + VarDesc() = delete; + + explicit VarDesc(framework_proto::VarDesc *desc) : desc_(desc) { + CHECK(desc_); + } + + ::paddle::framework::proto::VarDesc *Proto() { return desc_; } + const framework_proto::VarDesc &ReadonlyProto() const { return *desc_; } + + std::string Name() const override { return desc_->name(); } + + void SetName(std::string name) override { desc_->set_name(name); } + + void SetTensorDescNum(size_t num); + + size_t GetTensorDescNum() const; + + void SetShape(const std::vector &dims); + + void SetShapes(const std::vector> &multiple_dims); + + std::vector GetShape() const; + + std::vector> GetShapes() const; + + void SetDataType(VarDescAPI::VarDataType data_type); + + void SetDataTypes( + const std::vector &multiple_data_type); + + VarDescAPI::VarDataType GetDataType() const; + + std::vector GetDataTypes() const; + + void SetLoDLevel(int32_t lod_level); + + void SetLoDLevels(const std::vector &multiple_lod_level); + + int32_t GetLoDLevel() const; + + std::vector GetLoDLevels() const; + + VarDescAPI::Type GetType() const override; + + void SetType(VarDescAPI::Type type) override; + + bool Persistable() const override { return desc_->persistable(); } + + void SetPersistable(bool persistable) override { + desc_->set_persistable(persistable); + } + + private: + const framework_proto::VarType::TensorDesc &tensor_desc() const; + std::vector tensor_descs() const; + framework_proto::VarType::TensorDesc *mutable_tensor_desc(); + std::vector mutable_tensor_descs(); + + framework_proto::VarDesc *desc_; +}; + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/scope.cc b/paddle/infrt/paddle/scope.cc new file mode 100644 index 0000000000000..d7bab9f749591 --- /dev/null +++ b/paddle/infrt/paddle/scope.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
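Editor's note: a sketch (not part of the patch) of driving the pb::VarDesc wrapper declared above; the parameter name is hypothetical, and the VarDescAPI enum spellings are assumed from cpp/desc_api.h, which is outside this hunk.

// Editorial sketch only.
#include "paddle/infrt/paddle/pb/var_desc.h"

void DescribeWeight(::paddle::framework::proto::VarDesc* raw_var) {
  raw_var->set_name("fc_0.w_0");  // hypothetical parameter name
  infrt::paddle::pb::VarDesc var(raw_var);

  // Set the type first: SetShape()/SetDataType() route through
  // mutable_tensor_desc(), which dispatches on the variable type.
  var.SetType(infrt::paddle::cpp::VarDescAPI::Type::LOD_TENSOR);
  var.SetShape({256, 128});
  var.SetDataType(infrt::paddle::cpp::VarDescAPI::Type::FP32);
  var.SetPersistable(true);

  std::vector<int64_t> dims = var.GetShape();  // {256, 128}
  (void)dims;
}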
+ +#include "paddle/infrt/paddle/scope.h" + +#include "paddle/infrt/common/common.h" + +namespace infrt { +namespace paddle { + +_Variable* Scope::FindVar(const std::string& name) const { + auto it = data_.find(name); + if (it != data_.end()) return it->second.get(); + return nullptr; +} + +Tensor Scope::GetTensor(const std::string& name) const { + CheckVarNameValid(name); + auto* var = FindVar(name); + CHECK(var) << "No variable called [" << name << "] found"; + return var->get(); +} + +std::vector Scope::var_names() const { + std::vector names; + for (auto& item : data_) { + names.push_back(item.first); + } + return names; +} + +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/scope.h b/paddle/infrt/paddle/scope.h new file mode 100644 index 0000000000000..4ebf846374c6f --- /dev/null +++ b/paddle/infrt/paddle/scope.h @@ -0,0 +1,68 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include +#include +#include + +#include "paddle/infrt/common/macros.h" +#include "paddle/infrt/paddle/tensor.h" +#include "paddle/infrt/support/variant.h" + +namespace infrt { +namespace paddle { + +using _Variable = Variant; + +struct _Tensor_; + +class Scope { + public: + static std::shared_ptr Create() { return std::make_shared(); } + + //! Get or create a variable. + template + _Variable* Var(const std::string& name); + + //! Find a variable, get null if not exists. + _Variable* FindVar(const std::string& name) const; + + Tensor GetTensor(const std::string& name) const; + + //! Get variable names. + std::vector var_names() const; + + Scope() = default; + + private: + std::unordered_map> data_; + + INFRT_DISALLOW_COPY_AND_ASSIGN(Scope); +}; + +template +_Variable* Scope::Var(const std::string& name) { + VLOG(4) << "Scope insert Var [" << name << "]"; + _Variable* x = FindVar(name); + if (x) return x; + auto* data = new _Variable(T()); + data_[name].reset(data); + return data; +} + +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/tensor.cc b/paddle/infrt/paddle/tensor.cc new file mode 100644 index 0000000000000..072701ee9077d --- /dev/null +++ b/paddle/infrt/paddle/tensor.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/paddle/tensor.h" + +namespace infrt { +namespace paddle {} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/tensor.h b/paddle/infrt/paddle/tensor.h new file mode 100644 index 0000000000000..5c4458bb62d73 --- /dev/null +++ b/paddle/infrt/paddle/tensor.h @@ -0,0 +1,107 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "paddle/infrt/common/buffer.h" +#include "paddle/infrt/common/common.h" +#include "paddle/infrt/common/object.h" + +namespace infrt { +namespace paddle { +using common::Target; + +struct Shape { + using dim_t = int; + + Shape() = default; + explicit Shape(const std::vector& data) : data_(data) {} + + void SetData(const std::vector& data) { data_ = data; } + + const std::vector& data() const INFRT_RESULT_SHOULD_USE { + return data_; + } + std::vector& data() INFRT_RESULT_SHOULD_USE { return data_; } + size_t size() const INFRT_RESULT_SHOULD_USE { return data_.size(); } + uint32_t numel() const INFRT_RESULT_SHOULD_USE { + return std::accumulate( + data_.begin(), data_.end(), 1, [](dim_t a, dim_t b) { return a * b; }); + } + + private: + std::vector data_; +}; + +class _Tensor_ : public common::Object { + public: + _Tensor_() : buffer_(std::make_shared()) {} + + Shape& shape() { return shape_; } + + void Resize(const Shape& shape) { + shape_ = shape; + buffer_->data()->resize( + reinterpret_cast(shape.data().data()), + shape.size()); + } + + template + inline T* mutable_data(const Target& target) { + set_type(type_of()); + if (target == common::DefaultHostTarget()) { + int alignment = type_of().ElementOf().bits(); + buffer_->ResizeLazy(alignment, shape_.numel() * sizeof(T), target); + } else { + buffer_->ResizeLazy(shape_.numel() * sizeof(T), target); + } + return reinterpret_cast(buffer_->data()->memory); + } + + template + const T* data() const { + return reinterpret_cast(buffer_->data()->memory); + } + + const Type& type() { return type_; } + + void set_type(Type type) { type_ = type; } + const Type& type() const { return type_; } + + infrt_buffer_t* buffer() { return buffer_->data(); } + + const char* type_info() const override { return __type_info__; } + + private: + common::Type type_; + // A shared ptr to make it easier to share buffer between tensors. 
+ std::shared_ptr buffer_; + Shape shape_; + + static constexpr const char* __type_info__ = "_frontend_tensor_"; +}; + +class Tensor : public Shared<_Tensor_> { + public: + Tensor() : Shared(new _Tensor_) {} + explicit Tensor(_Tensor_* x) : Shared(x) {} +}; + +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/support/CMakeLists.txt b/paddle/infrt/support/CMakeLists.txt new file mode 100644 index 0000000000000..9bcce6cab368d --- /dev/null +++ b/paddle/infrt/support/CMakeLists.txt @@ -0,0 +1 @@ +core_gather_headers() diff --git a/paddle/infrt/support/type_traits.h b/paddle/infrt/support/type_traits.h new file mode 100644 index 0000000000000..341dabb7c1c4a --- /dev/null +++ b/paddle/infrt/support/type_traits.h @@ -0,0 +1,147 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This file defines type traits related utilities. + +#pragma once + +#include +#include +#include + +#include "llvm/ADT/STLExtras.h" + +namespace infrt { + +// Utility template for tag dispatching. +template +struct TypeTag {}; + +// This is the equivalent of std::void_t in C++17. +template +struct make_void { + typedef void type; +}; +template +using void_t = typename make_void::type; + +// The same as std::disjunction in C++17. +template +struct disjunction : std::false_type {}; +template +struct disjunction : B1 {}; +template +struct disjunction + : std::conditional_t> {}; + +// Check whether T may be a base class. +template +using MaybeBase = + llvm::conjunction, llvm::negation>>; + +// Find the index of a type in a tuple. +// +// Example: +// using Tuple = std::tuple; +// static_assert(TupleIndexOf::value == 0); +// static_assert(TupleIndexOf::value == 2); +template +struct TupleIndexOf; + +template +struct TupleIndexOf> + : std::integral_constant {}; + +template +struct TupleIndexOf> + : std::integral_constant>::value> { +}; + +template +struct TupleHasType; + +template +struct TupleHasType> + : disjunction...> {}; + +// The detector pattern in C++ that can be used for checking whether a type has +// a specific property, e.g. whether an internal type is present or whether a +// particular operation is valid. +// +// Sample usage: +// +// struct Foo { +// using difference_type = int; +// int get(); +// }; +// struct Bar {}; +// +// // Check whether a type T has an internal difference_type. +// template +// using diff_t = typename T::difference_type; +// +// static_assert(is_detected_v, "Foo has difference_type"); +// static_assert(!is_detected_v, "Bar has no difference_type"); +// +// // Check whether a type T has a get() member function. +// template +// using has_get_t = decltype(std::declval().get()); +// +// static_assert(is_detected_v, "Foo has get()"); +// static_assert(!is_detected_v, "Bar has no get()"); +// +// See https://en.cppreference.com/w/cpp/experimental/is_detected for details. + +namespace internal { + +// nonesuch is a class type used to indicate detection failure. 
+struct nonesuch {
+  ~nonesuch() = delete;
+  nonesuch(nonesuch const&) = delete;
+  void operator=(nonesuch const&) = delete;
+};
+
+template <class Default,
+          class AlwaysVoid,
+          template <class...> class Op,
+          class... Args>
+struct detector : std::false_type {
+  using value_t = std::false_type;
+  using type = Default;
+};
+
+template <class Default, template <class...> class Op, class... Args>
+struct detector<Default, void_t<Op<Args...>>, Op, Args...> {
+  using value_t = std::true_type;
+  using type = Op<Args...>;
+};
+
+}  // namespace internal
+
+template