diff --git a/.gitignore b/.gitignore index 801790d0a4720..664c45b7202f6 100644 --- a/.gitignore +++ b/.gitignore @@ -52,12 +52,12 @@ tools/__pycache__ # This file is automatically generated. # TODO(zhiqiang) Move this file to build directory. -paddle/infrt/dialect/pd_ops.td +paddle/infrt/dialect/pd/ir/pd_ops.td paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td tools/infrt/kernels.json tools/infrt/kernel_signature.json -paddle/infrt/dialect/pd_ops_info.h +paddle/infrt/dialect/pd/common/pd_ops_info.h .lit_test_times.txt paddle/infrt/tests/dialect/Output paddle/infrt/tests/lit.cfg.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c5f711d2918b..6988434996bcc 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,6 +53,7 @@ option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) # to develop some acl related functionality on x86 option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) +option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF) # Note(zhouwei): It uses the options above, so it is put here include(init) include(generic) # simplify cmake module diff --git a/cmake/external/cinn.cmake b/cmake/external/cinn.cmake index 41b90345c8c5f..d3f330ba9dd0f 100644 --- a/cmake/external/cinn.cmake +++ b/cmake/external/cinn.cmake @@ -26,7 +26,7 @@ add_definitions(-w) ###################################### include(ExternalProject) set(CINN_PREFIX_DIR ${THIRD_PARTY_PATH}/CINN) -set(CINN_GIT_TAG release/v0.1) +set(CINN_GIT_TAG 56879b637e2c4db19091eedad03d7cc674e092a2) set(CINN_OPTIONAL_ARGS -DPY_VERSION=${PY_VERSION} -DWITH_CUDA=${WITH_GPU} -DWITH_CUDNN=${WITH_GPU} diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake index 9f6fd32ad986c..5c48afa2806aa 100644 --- a/cmake/external/llvm.cmake +++ b/cmake/external/llvm.cmake @@ -99,7 +99,8 @@ endfunction() function(mlir_add_rewriter td_base) set(LLVM_TARGET_DEFINITIONS ${td_base}.td) - mlir_tablegen(${td_base}.cpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass") + set(LLVM_TARGET_DEPENDS ${LLVM_TARGET_DEPENDS} ${CMAKE_SOURCE_DIR}/paddle/infrt/dialect/infrt/ir/infrt_base.td) + mlir_tablegen(${td_base}.cpp.inc -gen-rewriters) add_public_tablegen_target(MLIR${td_base}IncGen) add_dependencies(mlir-headers MLIR${td_base}IncGen) endfunction() diff --git a/cmake/external/onnxruntime.cmake b/cmake/external/onnxruntime.cmake new file mode 100644 index 0000000000000..2162f87812d13 --- /dev/null +++ b/cmake/external/onnxruntime.cmake @@ -0,0 +1,94 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +if (NOT WITH_ONNXRUNTIME) + return() +endif () + +if (WITH_ARM) + message(SEND_ERROR "The current onnxruntime backend doesn't support ARM cpu") + return() +endif () + +INCLUDE(ExternalProject) + +add_definitions(-DPADDLE_WITH_ONNXRUNTIME) + +SET(ONNXRUNTIME_PROJECT "extern_onnxruntime") +SET(ONNXRUNTIME_PREFIX_DIR ${THIRD_PARTY_PATH}/onnxruntime) +SET(ONNXRUNTIME_SOURCE_DIR ${THIRD_PARTY_PATH}/onnxruntime/src/${ONNXRUNTIME_PROJECT}) +SET(ONNXRUNTIME_INSTALL_DIR ${THIRD_PARTY_PATH}/install/onnxruntime) +SET(ONNXRUNTIME_INC_DIR "${ONNXRUNTIME_INSTALL_DIR}/include" CACHE PATH "onnxruntime include directory." FORCE) +SET(ONNXRUNTIME_LIB_DIR "${ONNXRUNTIME_INSTALL_DIR}/lib" CACHE PATH "onnxruntime lib directory." FORCE) +SET(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${ONNXRUNTIME_LIB_DIR}") + + +if (WIN32) + SET(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-win-x64-1.10.0.zip") +elseif (APPLE) + SET(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-osx-x86_64-1.10.0.tgz") +else () + SET(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-linux-x64-1.10.0.tgz") +endif() + + +INCLUDE_DIRECTORIES(${ONNXRUNTIME_INC_DIR}) # For ONNXRUNTIME code to include internal headers. +if (WIN32) + SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.dll" CACHE FILEPATH "ONNXRUNTIME source library." FORCE) + SET(ONNXRUNTIME_SHARED_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.dll" CACHE FILEPATH "ONNXRUNTIME shared library." FORCE) + SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.lib" CACHE FILEPATH "ONNXRUNTIME import library." FORCE) +elseif (APPLE) + SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.1.10.0.dylib" CACHE FILEPATH "ONNXRUNTIME source library." FORCE) + SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.1.10.0.dylib" CACHE FILEPATH "ONNXRUNTIME shared library." FORCE) + SET(ONNXRUNTIME_SHARED_LIB ${ONNXRUNTIME_LIB} CACHE FILEPATH "ONNXRUNTIME shared library." FORCE) +else () + SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.so.1.10.0" CACHE FILEPATH "ONNXRUNTIME source library." FORCE) + SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.so.1.10.0" CACHE FILEPATH "ONNXRUNTIME shared library." FORCE) + SET(ONNXRUNTIME_SHARED_LIB ${ONNXRUNTIME_LIB} CACHE FILEPATH "ONNXRUNTIME shared library."
FORCE) +endif () + +if (WIN32) + ExternalProject_Add( + ${ONNXRUNTIME_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${ONNXRUNTIME_URL} + PREFIX ${ONNXRUNTIME_PREFIX_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} ${ONNXRUNTIME_SHARED_LIB} && + ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.lib ${ONNXRUNTIME_LIB} && + ${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include ${ONNXRUNTIME_INC_DIR} + BUILD_BYPRODUCTS ${ONNXRUNTIME_LIB} + ) +else () + ExternalProject_Add( + ${ONNXRUNTIME_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${ONNXRUNTIME_URL} + PREFIX ${ONNXRUNTIME_PREFIX_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} ${ONNXRUNTIME_LIB} && + ${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include ${ONNXRUNTIME_INC_DIR} + BUILD_BYPRODUCTS ${ONNXRUNTIME_LIB} + ) +endif() + +ADD_LIBRARY(onnxruntime STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET onnxruntime PROPERTY IMPORTED_LOCATION ${ONNXRUNTIME_LIB}) +ADD_DEPENDENCIES(onnxruntime ${ONNXRUNTIME_PROJECT}) diff --git a/cmake/external/paddle2onnx.cmake b/cmake/external/paddle2onnx.cmake new file mode 100644 index 0000000000000..ba6f0396008fc --- /dev/null +++ b/cmake/external/paddle2onnx.cmake @@ -0,0 +1,97 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT WITH_ONNXRUNTIME) + return() +endif() + +if (WITH_ARM) + message(SEND_ERROR "The current onnxruntime backend doesn't support ARM cpu") + return() +endif () + +INCLUDE(ExternalProject) + +SET(PADDLE2ONNX_PROJECT "extern_paddle2onnx") +SET(PADDLE2ONNX_PREFIX_DIR ${THIRD_PARTY_PATH}/paddle2onnx) +SET(PADDLE2ONNX_INSTALL_DIR ${THIRD_PARTY_PATH}/install/paddle2onnx) +SET(PADDLE2ONNX_INC_DIR "${PADDLE2ONNX_INSTALL_DIR}/include" CACHE PATH "paddle2onnx include directory." FORCE) +SET(PADDLE2ONNX_REPOSITORY ${GIT_URL}/PaddlePaddle/Paddle2ONNX.git) +SET(PADDLE2ONNX_TAG cpp) +SET(LIBDIR "lib") +SET(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}") + +INCLUDE_DIRECTORIES(${PADDLE2ONNX_INC_DIR}) # For PADDLE2ONNX code to include internal headers. +if(WIN32) + SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.lib" CACHE FILEPATH "paddle2onnx static library." FORCE) + SET(PADDLE2ONNX_SHARED_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.dll" CACHE FILEPATH "paddle2onnx shared library." FORCE) +elseif(APPLE) + SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.dylib" CACHE FILEPATH "PADDLE2ONNX library." FORCE) +else() + SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.so" CACHE FILEPATH "PADDLE2ONNX library." FORCE) +endif(WIN32) + + +# The protoc path is required to compile onnx. 
+string(REPLACE "/" ";" PROTOC_BIN_PATH ${PROTOBUF_PROTOC_EXECUTABLE}) +list(POP_BACK PROTOC_BIN_PATH) +list(JOIN PROTOC_BIN_PATH "/" PROTOC_BIN_PATH) + + +set(PADDLE2ONNX_OPTIONAL_ARGS + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DONNX_CUSTOM_PROTOC_PATH=${PROTOC_BIN_PATH} + -DWITH_STATIC=OFF + -DCMAKE_INSTALL_PREFIX=${PADDLE2ONNX_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} +) + +if (WITH_PYTHON) + set(PADDLE2ONNX_OPTIONAL_ARGS ${PADDLE2ONNX_OPTIONAL_ARGS} + -DPYTHON_EXECUTABLE:FILEPATH=${PYTHON_EXECUTABLE} + -DPYTHON_INCLUDE_DIR:PATH=${PYTHON_INCLUDE_DIR} + -DPYTHON_LIBRARY:FILEPATH=${PYTHON_LIBRARY} + ) +endif () + + +ExternalProject_Add( + ${PADDLE2ONNX_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + GIT_REPOSITORY ${PADDLE2ONNX_REPOSITORY} + GIT_TAG ${PADDLE2ONNX_TAG} + DEPENDS protobuf + PREFIX ${PADDLE2ONNX_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS ${PADDLE2ONNX_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PADDLE2ONNX_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${PADDLE2ONNX_LIB} +) + +ADD_LIBRARY(paddle2onnx STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION ${PADDLE2ONNX_LIB}) +ADD_DEPENDENCIES(paddle2onnx ${PADDLE2ONNX_PROJECT}) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index f7cb7716969f5..58ff5f0d2b715 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -198,7 +198,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}") ENDIF() - if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) + + if(WITH_ONNXRUNTIME) + SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) + SET(PROTOBUF_TAG v3.18.0) + elseif(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) SET(PROTOBUF_TAG v3.8.0) elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) @@ -248,7 +252,9 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ) ENDFUNCTION() -if(WITH_ASCEND OR WITH_ASCEND_CL) +if(WITH_ONNXRUNTIME) + SET(PROTOBUF_VERSION 3.18.0) +elseif(WITH_ASCEND OR WITH_ASCEND_CL) SET(PROTOBUF_VERSION 3.8.0) elseif(WITH_IPU) SET(PROTOBUF_VERSION 3.6.1) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 45a76fdc1f1a2..cfbe68eecbaca 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -36,7 +36,7 @@ ENDIF() if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220228") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220307") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index c48d31f7e4f90..851bd81403a85 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -114,6 +114,24 @@ function(copy_part_of_thrid_party TARGET DST) endif() endif() + if (WITH_ONNXRUNTIME) + set(dst_dir 
"${DST}/third_party/install/onnxruntime") + copy(${TARGET} + SRCS ${ONNXRUNTIME_INC_DIR} ${ONNXRUNTIME_LIB_DIR} + DSTS ${dst_dir} ${dst_dir}) + + set(dst_dir "${DST}/third_party/install/paddle2onnx") + if(WIN32) + copy(${TARGET} + SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_SHARED_LIB} ${PADDLE2ONNX_LIB} + DSTS ${dst_dir}/include ${dst_dir}/lib ${dst_dir}/lib) + else() + copy(${TARGET} + SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_LIB} + DSTS ${dst_dir}/include ${dst_dir}/lib) + endif() + endif() + set(dst_dir "${DST}/third_party/install/gflags") copy(${TARGET} SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES} diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index ac3eff04d5383..7df095c6c2ec0 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -250,6 +250,12 @@ IF(WITH_TESTING OR WITH_DISTRIBUTE) list(APPEND third_party_deps extern_gtest) ENDIF() +if(WITH_ONNXRUNTIME) + include(external/onnxruntime) # download, build, install onnxruntime态paddle2onnx + include(external/paddle2onnx) + list(APPEND third_party_deps extern_onnxruntime extern_paddle2onnx) +endif() + if(WITH_GPU) if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) include(external/cub) # download cub diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index f88c993d85e2f..49ba9479d49e9 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -1,8 +1,9 @@ cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api) +cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup phi phi_api string_helper) + if (WITH_DISTRIBUTE) cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi phi_api eager_api gloo_wrapper) endif() -cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup) if(WITH_NCCL) cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc index 5dc43af117825..cb82677a281e9 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -171,10 +171,10 @@ ProcessGroupGloo::GlooTask::GlooTask(int rank, "Only CPU place is supported for ProcessGroupGloo.")); } -ProcessGroupGloo::ProcessGroupGloo(const std::shared_ptr& store, - int rank, int world_size, - const std::shared_ptr options) - : ProcessGroup(rank, world_size), _tag(0), _store(store) { +ProcessGroupGloo::ProcessGroupGloo( + const std::shared_ptr& store, int rank, + int world_size, const std::shared_ptr options) + : ProcessGroup(rank, world_size), _tag(0), _store(new GlooStore(store)) { _context = std::make_shared(rank, world_size); auto prefix_store = ::gloo::rendezvous::PrefixStore(std::to_string(0), *_store); diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h index 24f156571a427..71e0a40f8a761 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -52,8 +52,7 @@ class ProcessGroupGloo : public ProcessGroup { class GlooStore : public ::gloo::rendezvous::Store { public: - explicit GlooStore( - const std::shared_ptr& store) + explicit GlooStore(const std::shared_ptr& store) : _store(store) {} ~GlooStore() = default; @@ -87,7 +86,7 @@ 
class ProcessGroupGloo : public ProcessGroup { } protected: - std::shared_ptr _store; + std::shared_ptr _store; }; class GlooOptions { @@ -100,9 +99,9 @@ class ProcessGroupGloo : public ProcessGroup { std::shared_ptr<::gloo::transport::Device> device; }; - explicit ProcessGroupGloo(const std::shared_ptr& store, int rank, - int world_size, - std::shared_ptr options); + explicit ProcessGroupGloo( + const std::shared_ptr& store, int rank, + int world_size, std::shared_ptr options); ~ProcessGroupGloo() = default; @@ -145,7 +144,7 @@ class ProcessGroupGloo : public ProcessGroup { protected: uint32_t _tag; std::shared_ptr _context; - std::shared_ptr _store; + std::shared_ptr<::gloo::rendezvous::Store> _store; }; } // namespace distributed diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc index 84f5ca48d25c8..2deeb7ca03003 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc @@ -139,11 +139,9 @@ bool ProcessGroupHCCL::HCCLTask::IsCompleted() { // TODO(sandyhouse): Add timeout for wait, now timeout unused bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) { SynchronizeStreams(); - if (FLAGS_hccl_blocking_wait) { - // NOTE(sandyhouse): It will block host for sync - while (!IsCompleted()) { - std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout)); - } + // NOTE(sandyhouse): It will block host for sync + while (!IsCompleted()) { + std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout)); } return true; } diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h index f2376b4eed760..83d509be2cdd7 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h @@ -84,29 +84,6 @@ class ProcessGroupHCCL : public ProcessGroup { std::vector& tensors, const BroadcastOptions& = BroadcastOptions()) override; - std::shared_ptr Barrier( - const BarrierOptions& = BarrierOptions()) override; - - std::shared_ptr Send(std::vector& tensors, - int dst_rank) override; - - std::shared_ptr Recv(std::vector& tensors, - int src_rank) override; - - std::shared_ptr AllGather( - std::vector& in_tensors, - std::vector& out_tensors) override; - - std::shared_ptr AllToAll( - std::vector& in, std::vector& out) override; - - std::shared_ptr Reduce( - std::vector& tensors, const ReduceOptions& opts) override; - - std::shared_ptr Scatter(std::vector& in_tensors, - std::vector& out_tensors, - const ScatterOptions&) override; - protected: virtual std::shared_ptr CreateTask( std::vector places, int rank, CommType opType, diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 67715f410d443..7f21bcee87ab7 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -88,8 +88,8 @@ void SyncDefaultStream( for (size_t i = 0; i < places.size(); ++i) { auto* default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(places[i])); - ncclEvents[i].Record(*dev_ctx[i]); - ncclEvents[i].Block(*default_ctx); + ncclEvents[i].Record(*default_ctx); + ncclEvents[i].Block(*dev_ctx[i]); } } diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 59f3ea3b0a7d8..be4c5423943f5 100644 --- 
a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -13,11 +13,24 @@ // limitations under the License. #include "paddle/fluid/distributed/collective/reducer.h" -#include "paddle/phi/common/data_type.h" namespace paddle { namespace distributed { +static Backend TransToBackend(platform::Place place) { + static const std::map type_backend = { + {phi::AllocationType::GPU, Backend::GPU}, + {phi::AllocationType::CPU, Backend::CPU}, + }; + + phi::AllocationType type = place.GetType(); + auto it = type_backend.find(type); + PADDLE_ENFORCE_EQ(it != type_backend.end(), true, + platform::errors::InvalidArgument( + "Place type (%s) is not supported. ", place)); + return it->second; +} + std::vector> Eager_AssignGroupBySize( const std::vector tensors, const std::vector &is_sparse_gradient, @@ -127,5 +140,663 @@ std::vector> Eager_AssignGroupBySize( return res; } +template +static void ConcatTensorsForAllReduce( + const DeviceContext &context, + const std::vector &dense_tensors_, + Tensor *p_dense_contents) { + operators::math::ConcatFunctor concat_functor_; + concat_functor_( + context, dense_tensors_, 0, + std::dynamic_pointer_cast(p_dense_contents->impl()) + .get()); +} + +template +static void SplitTensorsForAllReduce( + const DeviceContext &context, Tensor *p_dense_contents, + std::vector *p_dense_tensors) { + auto *in = + std::dynamic_pointer_cast(p_dense_contents->impl()) + .get(); + std::vector outs; + std::vector shape_refer; + + outs.reserve(p_dense_tensors->size()); + shape_refer.reserve(p_dense_tensors->size()); + + for (auto &tensor : *p_dense_tensors) { + outs.emplace_back(&tensor); + shape_refer.emplace_back(&tensor); + } + + operators::math::SplitFunctor split_functor_; + split_functor_(context, *in, shape_refer, 0, &outs); +} + +// context is used to select the stream for concat +template +static void ConcatTensorsWithType( + const DeviceContext &context, + const std::vector &dense_tensors_, + Tensor *p_dense_contents, phi::DataType type) { + switch (type) { + case phi::DataType::FLOAT16: + ConcatTensorsForAllReduce( + context, dense_tensors_, p_dense_contents); + break; + case phi::DataType::FLOAT32: + ConcatTensorsForAllReduce(context, dense_tensors_, + p_dense_contents); + break; + case phi::DataType::FLOAT64: + ConcatTensorsForAllReduce(context, dense_tensors_, + p_dense_contents); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it concats tensors for " + "allreduce.", + type)); + } +} + +// context is used to select the stream for split +template +static void SplitTensorsWithType(const DeviceContext &context, + Tensor *p_dense_contents, + std::vector *p_dense_tensors, + phi::DataType type) { + switch (type) { + case phi::DataType::FLOAT16: + SplitTensorsForAllReduce( + context, p_dense_contents, p_dense_tensors); + break; + case phi::DataType::FLOAT32: + SplitTensorsForAllReduce(context, p_dense_contents, + p_dense_tensors); + break; + case phi::DataType::FLOAT64: + SplitTensorsForAllReduce(context, p_dense_contents, + p_dense_tensors); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it splits tensors for " + "allreduce.", + type)); + } +} + +void EagerGroup::ConcatTensors(const platform::Place &place) { + if (platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + 
ConcatTensorsWithType(*default_ctx, dense_tensors_, &dense_contents_, + dtype_); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't concat grad tensors since it's not compiled with NCCL," + "Please recompile or reinstall Paddle with NCCL support.")); +#endif + } else if (platform::is_cpu_place(place)) { + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + ConcatTensorsWithType(*default_ctx, dense_tensors_, &dense_contents_, + dtype_); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Concat grad tensor not supported on place (%s)", place)); + } +} + +void EagerGroup::SplitTensors(const platform::Place &place) { + if (platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + SplitTensorsWithType(*default_ctx, &dense_contents_, &dense_tensors_, + dtype_); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't split grad tensor since it's not compiled with NCCL," + "Please recompile or reinstall Paddle with NCCL support.")); +#endif + } else if (platform::is_cpu_place(place)) { + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + SplitTensorsWithType(*default_ctx, &dense_contents_, &dense_tensors_, + dtype_); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Split grad tensor not supported on place (%s)", place)); + } +} + +EagerReducer::EagerReducer( + const std::vector tensors, + const std::vector> &group_indices, + const std::vector &is_sparse_gradient, + std::shared_ptr process_group, + const std::vector &group_size_limits, bool find_unused_parameters) + : tensors_(tensors), + group_indices_(group_indices), + is_sparse_gradient_(is_sparse_gradient), + process_group_(process_group), + group_size_limits_(group_size_limits), + find_unused_vars_each_step_(find_unused_parameters) { + VLOG(3) << "Start construct the Reducer ..."; + + nranks_ = process_group_->GetSize(); + + // initialize groups + InitializeGroups(group_indices); + + for (size_t global_var_index = 0; global_var_index < tensors_.size(); + ++global_var_index) { + auto tensor = tensors_[global_var_index]; + auto reduce_hook = [=](void) -> void { + this->AddDistHook(global_var_index); + }; + + const auto &grad_node = GetGradNodeFromTensor(&tensor); + + PADDLE_ENFORCE( + grad_node.get() != nullptr, + paddle::platform::errors::Fatal("Detected NULL grad_node," + "Leaf tensor should have had grad_node " + "with type: GradNodeAccumulation")); + const auto &accumulation_grad_node = + std::dynamic_pointer_cast(grad_node); + accumulation_grad_node->RegisterReduceHook( + std::make_shared(reduce_hook)); + + gradnode_index_map_[grad_node.get()] = global_var_index; + } + + vars_marked_ready_.resize(tensors_.size(), false); + local_used_vars_.resize(tensors_.size(), 0); + + if (find_unused_vars_each_step_) { + global_used_vars_ = paddle::experimental::empty( + ScalarArray({static_cast(tensors_.size())}), DataType::INT32, + TransToBackend(inner_place_)); + } +} + +std::shared_ptr EagerReducer::GetGradNodeFromTensor( + Tensor *tensor) { + auto *autograd_meta = tensor->get_autograd_meta(); + const auto &grad_node = + static_cast(autograd_meta)->GetMutableGradNode(); + return grad_node; +} + +void EagerReducer::InitializeGroups( + const std::vector> &group_indices) { + VLOG(3) << "Start initialize groups .."; + + // clear the group + groups_.clear(); + 
groups_.reserve(group_indices.size()); + + variable_locators_.clear(); + variable_locators_.resize(tensors_.size()); + + auto group_nums = group_indices.size(); + for (size_t group_index = 0; group_index < group_nums; ++group_index) { + const auto &tensor_indices_ = group_indices[group_index]; + PADDLE_ENFORCE_GT( + tensor_indices_.size(), 0, + platform::errors::PreconditionNotMet( + "The number of group[%d]'s elements is 0.", group_index)); + + EagerGroup group; + + // It's just for check the sparse or dense + auto first_var = tensors_[tensor_indices_.front()]; + if (tensor_indices_.size() == 1 && + is_sparse_gradient_[tensor_indices_.front()]) { + // process the sparse gradient. one sparse, one group + group.dtype_ = first_var.dtype(); + } else { + // process the dense gradient. + InitializeDenseGroups(tensor_indices_, &group); + // experimental::Backend backend = TransToBackend(inner_place_); + group.dense_contents_ = paddle::experimental::empty( + ScalarArray({group.all_length_}), group.dtype_, + TransToBackend(inner_place_)); + } + + // map tensors to this group by VariableLocator + size_t inside_group_index = 0; + for (const auto var_index : tensor_indices_) { + TensorLocator tensor_locator; + tensor_locator.group_index = group_index; + tensor_locator.inside_group_index = inside_group_index++; + variable_locators_[var_index] = tensor_locator; + } + group.tensor_indices_ = std::move(tensor_indices_); + groups_.emplace_back(std::move(group)); + + VLOG(3) << "The Group[" << group_index << "]:" << groups_.back(); + } +} + +void EagerReducer::InitializeDenseGroups( + const std::vector &tensor_indices_, EagerGroup *p_group) { + VLOG(3) << "InitializeDenseGroups."; + int64_t all_length = 0; + for (size_t index = 0; index < tensor_indices_.size(); ++index) { + auto tensor_index = tensor_indices_[index]; + auto &tensor = tensors_[tensor_index]; + auto &tensor_name = tensor.name(); + + PADDLE_ENFORCE_EQ(tensor.is_initialized(), true, + platform::errors::PreconditionNotMet( + "Tensor %s is not initialized.", tensor_name)); + const auto size = tensor.numel(); + PADDLE_ENFORCE_GT( + size, 0, platform::errors::PreconditionNotMet( + "The number of tensor %s's elements is 0.", tensor_name)); + all_length += size; + + p_group->length_.push_back(size); + + // for concat operator + p_group->origin_shapes_.push_back(ScalarArray(tensor.shape())); + p_group->dense_tensors_.push_back(phi::DenseTensor()); + + const auto &dtype = tensor.dtype(); + const auto &place = tensor.place(); + const auto &inner_place = tensor.impl()->place(); + if (index > 0) { + PADDLE_ENFORCE_EQ(dtype, p_group->dtype_, + platform::errors::PreconditionNotMet( + "Tensor %s has unexpected dtype.", tensor_name)); + PADDLE_ENFORCE_EQ(place, place_, + platform::errors::PreconditionNotMet( + "Tensor %s has different place. 
Expected place is " + "%s, but actual place is %s", + tensor_name, inner_place_, inner_place)); + } else { + p_group->dtype_ = dtype; + place_ = place; + inner_place_ = inner_place; + } + } + p_group->all_length_ = all_length; +} + +void EagerReducer::TraverseBackwardGraph(const std::vector &outputs) { + std::queue queue; + std::set visited; + + for (const auto &output : outputs) { + auto *auto_grad_meta = + static_cast(output.get_autograd_meta()); + if (!auto_grad_meta) continue; + auto shared_grad_node = auto_grad_meta->GetMutableGradNode(); + if (shared_grad_node == nullptr || shared_grad_node.get() == nullptr || + auto_grad_meta->StopGradient()) { + continue; + } + egr::GradNodeBase *grad_node = shared_grad_node.get(); + queue.emplace(grad_node); + } + + while (!queue.empty()) { + egr::GradNodeBase *node = queue.front(); + queue.pop(); + const std::vector> &edges = node->GetEdges(); + for (size_t i = 0; i < edges.size(); i++) { + for (size_t j = 0; j < edges[i].size(); j++) { + const egr::Edge &edge = edges[i][j]; + auto next_node_shared = edge.GetMutableGradNode(); + if (!next_node_shared || !next_node_shared.get()) { + continue; + } + auto *next_node = next_node_shared.get(); + const bool was_inserted = visited.insert(next_node).second; + if (was_inserted) { + queue.emplace(next_node); + } + } + } + } + + for (const auto &it : gradnode_index_map_) { + if (visited.count(it.first) == 0) { + unused_vars_.push_back(it.second); + VLOG(3) << "[Rank " << process_group_->GetRank() << "]: " + << "Tensor " << tensors_[it.second].name() << " at index " + << it.second << " is marked as unused."; + } + } +} + +void EagerReducer::PrepareForBackward(const std::vector &outputs) { + VLOG(3) << "after forward, then reset count for backward."; + grad_need_hooks_ = true; + next_group_ = 0; + std::for_each(groups_.begin(), groups_.end(), [](EagerGroup &group) { + group.pending_ = group.tensor_indices_.size(); + }); + + // reinitialize vars_marked_ready_ for next iteration + vars_marked_ready_.clear(); + vars_marked_ready_.resize(tensors_.size(), false); + + PADDLE_ENFORCE_EQ( + groups_need_finalize_, false, + platform::errors::PreconditionNotMet( + "A serious error has occurred here. Please " + "set find_unused_parameters=True to traverse backward graph " + "in each step to prepare reduce in advance. If you have " + "set, There may be several reasons for this error: " + "1) Please note that all forward outputs derived from the module " + "parameters must participate in the calculation of losses and " + "subsequent gradient calculations. If not, the wrapper will hang, " + "waiting for autograd to generate gradients for these parameters. " + "you can use detach or stop_gradient to make the unused parameters " + "detached from the autograd graph. " + "2) Used multiple forwards and one backward. You may be able to wrap " + "multiple forwards in a model.")); + + // The first var to trigger the unused parameter + has_marked_unused_vars_ = false; + + if (find_unused_vars_once_ || find_unused_vars_each_step_) { + unused_vars_.clear(); + TraverseBackwardGraph(outputs); + // only check once in first step + find_unused_vars_once_ = false; + } + + if (find_unused_vars_each_step_ && unused_vars_.empty()) { + LOG_FIRST_N(WARNING, 1) + << "All parameters are involved in the backward pass. " + "It is recommended to set find_unused_parameters to False " + "to improve performance. However, if unused parameters " + "appear in subsequent iterative training, then an error " + "will occur. 
Please make it clear that in the subsequent " + "training, there will be no parameters that are not used " + "in the backward pass, and then set find_unused_parameters"; + } + + if (unused_vars_.size() == tensors_.size()) { + LOG_FIRST_N(WARNING, 1) + << "There is no parameter in the device involved " + "in the backward calculation. If there are " + "parameters on other devices involved in the " + "backward, then a serious error will occur here."; + } +} + +void EagerReducer::AddDistHook(size_t var_index) { + PADDLE_ENFORCE_LT(var_index, variable_locators_.size(), + platform::errors::OutOfRange( + "Out of bounds variable index. it must be less" + "than %d, but it is %d", + variable_locators_.size(), var_index)); + + // gradient synchronization is not required when grad_need_hooks_ is false. + if (!grad_need_hooks_) { + return; + } + + auto &tensor = tensors_[var_index]; + const auto &grad_node = GetGradNodeFromTensor(&tensor); + + VLOG(3) << "Tensor[" << var_index << "] [" << tensors_[var_index].name() + << "@Grad] arrived and triggered disthook"; + + local_used_vars_[var_index] = 1; + + if (!has_marked_unused_vars_) { + has_marked_unused_vars_ = true; + for (const auto unused_index : unused_vars_) { + MarkVarReady(unused_index, false); + } + } + MarkVarReady(var_index, true); +} + +void EagerReducer::MarkVarReady(const size_t var_index, + const bool is_used_var) { + VLOG(3) << "Tensor[" << var_index << "][" << tensors_[var_index].name() + << "] is marked ready."; + // error happened, if the var is ready before. + if (vars_marked_ready_[var_index]) { + auto error_info = string::Sprintf( + "Error happened, when parameter[%d][%s] has been ready before. " + "Please set find_unused_parameters=True to traverse backward graph " + "in each step to prepare reduce in advance. If you have set, " + "there may be several reasons for this error: " + "1) In multiple reentrant backward phase, some parameters are reused." + "2) Using model parameters outside of forward function. Please " + "make sure that model parameters are not shared in concurrent " + "forward-backward passes.", + var_index, tensors_[var_index].name()); + + PADDLE_ENFORCE_EQ(has_marked_unused_vars_, false, + platform::errors::PreconditionNotMet(error_info)); + + error_info += + "3) Unused parameters retrieval is incorrect. " + "The return value of forward will be used to retrieve" + " the unused parameters of the entire model. These " + "gradients of unused parameters will not be synchronized " + "between multiple cards. However, if the unused " + "parameters participate in the backward calculation " + "again at a later time (e.g. 
after the forward function, " + "the loss calculation uses the unused " + "paramters of the forward and trigger backward), " + "its gradient will be wrong."; + + PADDLE_ENFORCE_EQ(has_marked_unused_vars_, true, + platform::errors::PreconditionNotMet(error_info)); + } else { + vars_marked_ready_[var_index] = true; + } + groups_need_finalize_ = true; + + const auto &var_locator = variable_locators_[var_index]; + const auto group_index = var_locator.group_index; + const auto inside_group_index = var_locator.inside_group_index; + + auto &group = groups_[group_index]; + auto &group_tensor = group.dense_tensors_[inside_group_index]; + const auto length = group.length_[inside_group_index]; + + if (is_used_var) { + auto *autograd_meta = tensors_[var_index].get_autograd_meta(); + auto &grad_tensor = static_cast(autograd_meta)->Grad(); + group_tensor + .ShareDataWith( + *(std::dynamic_pointer_cast(grad_tensor.impl()))) + .Resize({grad_tensor.numel()}); + } else { + // TODO(shenliang03): maybe save the memory by avoiding tensor construction + if (!group_tensor.initialized()) { + group_tensor.Resize({static_cast(length)}); + group_tensor.mutable_data(inner_place_, group.dtype_); + } + if (HasGrad(var_index)) { + VLOG(3) << "Tensor[" << tensors_[var_index].name() << "] has grad"; + auto grad_tensor = egr::EagerUtils::mutable_grad(tensors_[var_index]); + group_tensor + .ShareDataWith(*( + std::dynamic_pointer_cast(grad_tensor->impl()))) + .Resize({length}); + } else { + VLOG(3) << "Tensor[" << tensors_[var_index].name() + << "] doesn't have grad"; + auto *dev_ctx = platform::DeviceContextPool::Instance().Get(inner_place_); + group_tensor.Resize({static_cast(length)}); + phi::funcs::set_constant(*dev_ctx, &group_tensor, 0.0); + } + } + + if (--group.pending_ == 0) { + // can start allreduce + MarkGroupReady(group_index); + } + + if (next_group_ == groups_.size()) { + FinalizeBackward(); + } +} + +void EagerReducer::MarkGroupReady(size_t group_index) { + VLOG(3) << "Group[" << group_index << "] is ready"; + + PADDLE_ENFORCE_GE( + group_index, next_group_, + platform::errors::PreconditionNotMet( + "The index of the incoming group must be greater " + "than or equal to the previously synchronized group index, " + "expect it to greater than or equal to %d, but got %d.", + next_group_, group_index)); + + if (group_index > next_group_) { + VLOG(3) << "It will adjust the order of group in next batch automatically"; + return; + } + + for (; next_group_ < groups_.size() && groups_[next_group_].pending_ == 0; + ++next_group_) { + UNUSED auto &group = groups_[next_group_]; + FusedAllReduceSchedule(&group, next_group_); + } +} + +bool EagerReducer::HasGrad(size_t var_index) { + auto grad = egr::EagerUtils::mutable_grad(tensors_[var_index]); + if (grad && grad->is_initialized()) { + return true; + } else { + return false; + } +} + +void EagerReducer::ProcessUnusedDenseVars() { + // The calculation stream must be used here to + // avoid conflicts with communication. 
+ VLOG(3) << "Local used vars : " + << string::join_strings(local_used_vars_, ','); + + const auto *dev_ctx = + platform::DeviceContextPool::Instance().Get(inner_place_); + auto *global_used_tensor = + std::dynamic_pointer_cast(global_used_vars_.impl()) + .get(); + framework::TensorFromVector(local_used_vars_, *dev_ctx, + global_used_tensor); + + distributed::AllreduceOptions opts; + opts.reduce_op = ReduceOp::SUM; + std::vector reduce_tensors = {global_used_vars_}; + process_group_->AllReduce(reduce_tensors, opts)->Synchronize(); + + framework::TensorToVector(*global_used_tensor, *dev_ctx, + &local_used_vars_); + dev_ctx->Wait(); + + // sync compute stream to get global used var message, + // but maybe affect speed performance + VLOG(3) << "Global used vars : " + << string::join_strings(local_used_vars_, ','); + + for (const auto var_index : unused_vars_) { + const bool global_unused = (local_used_vars_[var_index] == 0); + + // global used but local unused, set grad + VLOG(3) << "[Rank " << process_group_->GetRank() << "]: " + << "Var [" << var_index << "] [" << tensors_[var_index].name() + << "] global_unused: " << global_unused + << " has grad: " << HasGrad(var_index); + + if (!global_unused) { + VLOG(3) << "Set Tensor[" << var_index << "]'s Grad for [Rank " + << process_group_->GetRank() << "]"; + const auto &var_locator = variable_locators_[var_index]; + const auto group_index = var_locator.group_index; + const auto &group = groups_[group_index]; + const auto inside_group_index = var_locator.inside_group_index; + auto &src_tensor = group.dense_tensors_[inside_group_index]; + + Tensor grad_value(std::make_shared(src_tensor)); + + auto dest_var_base = tensors_[var_index]; + auto grad_tensor = egr::EagerUtils::mutable_grad(dest_var_base); + grad_tensor->copy_(grad_value, inner_place_, true); + grad_tensor->reshape(dest_var_base.shape()); + } + } +} + +void EagerReducer::FinalizeBackward() { + groups_need_finalize_ = false; + grad_need_hooks_ = false; + for (auto &group : groups_) { + group.task->Synchronize(); + } + + for (auto &group : groups_) { + group.SplitTensors(inner_place_); + } + + if (find_unused_vars_each_step_) { + ProcessUnusedDenseVars(); + local_used_vars_.clear(); + local_used_vars_.resize(tensors_.size(), 0); + VLOG(3) << "ProcessUnusedDenseVars is finished."; + } + + VLOG(3) << "In the batch, Reducer is finished."; +} + +void EagerReducer::FusedAllReduceSchedule(EagerGroup *group, + const int curr_group_index) { + // The overall timeline: concat > div_nranks > allreduce > split + distributed::AllreduceOptions opts; + opts.reduce_op = ReduceOp::SUM; + + VLOG(3) << "group [" << curr_group_index << "] start fused_allreduce."; + + // concat tensors + group->ConcatTensors(inner_place_); + + // div nranks + paddle::experimental::scale_(group->dense_contents_, 1.0 / nranks_, 0.0, + false); + + // all_reduce + std::vector reduce_tensors = {group->dense_contents_}; + group->task = process_group_->AllReduce(reduce_tensors, opts); + + // split in FinalizeBackward() +} + +std::ostream &operator<<(std::ostream &out, const EagerGroup &group) { + const auto &tensors_ = group.tensor_indices_; + out << "numel: " << group.all_length_ << " ;var number: " << tensors_.size() + << "\n"; + auto begin = tensors_.begin(); + auto end = tensors_.end(); + out << "["; + for (int i = 0; begin != end && i < 100; ++i, ++begin) { + if (i > 0) out << ' '; + out << *begin; + } + if (begin != end) { + out << " ..."; + } + out << "]\n"; + return out; +} + } // namespace distributed } // namespace paddle 
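Note on reducer.cc above: each group follows a fixed timeline — ConcatTensors flattens the dense gradients into one contiguous buffer, the buffer is scaled by 1.0/nranks_, a single fused allreduce is issued, and FinalizeBackward() splits the result back into the original gradients. The grouping itself comes from Eager_AssignGroupBySize. The standalone C++ sketch below is not part of this patch; it only illustrates the size-based bucketing idea under simplifying assumptions (every gradient dense, one global byte limit, a hypothetical AssignGroupBySize helper), whereas the real Eager_AssignGroupBySize also separates buckets by dtype, gives each sparse gradient its own group, and takes per-group limits via group_size_limits.

#include <cstdint>
#include <iostream>
#include <vector>

// Pack tensor indices into buckets until a bucket's accumulated byte size
// reaches group_limit; each bucket can then be flattened and all-reduced
// with a single collective call instead of one call per tensor.
std::vector<std::vector<size_t>> AssignGroupBySize(
    const std::vector<int64_t> &tensor_bytes, int64_t group_limit) {
  std::vector<std::vector<size_t>> groups;
  std::vector<size_t> cur;
  int64_t cur_bytes = 0;
  for (size_t i = 0; i < tensor_bytes.size(); ++i) {
    cur.push_back(i);
    cur_bytes += tensor_bytes[i];
    if (cur_bytes >= group_limit) {  // bucket is full, seal it
      groups.push_back(std::move(cur));
      cur.clear();
      cur_bytes = 0;
    }
  }
  if (!cur.empty()) groups.push_back(std::move(cur));  // trailing partial bucket
  return groups;
}

int main() {
  // Gradient sizes in bytes: 4 MB, 1 MB, 2 MB, 8 MB; bucket limit 4 MB.
  const std::vector<int64_t> bytes = {4 << 20, 1 << 20, 2 << 20, 8 << 20};
  const auto groups = AssignGroupBySize(bytes, 4 << 20);
  for (size_t g = 0; g < groups.size(); ++g) {
    std::cout << "group " << g << ":";
    for (size_t tensor_idx : groups[g]) std::cout << ' ' << tensor_idx;
    std::cout << '\n';
  }
  return 0;
}

With these sample sizes the 4 MB gradient fills bucket 0 by itself and gradients 1-3 share bucket 1, so the backward pass would issue two allreduce calls instead of four.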
diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h index f8c75385ef8bd..d3ffa8498a14b 100644 --- a/paddle/fluid/distributed/collective/reducer.h +++ b/paddle/fluid/distributed/collective/reducer.h @@ -17,16 +17,126 @@ #include #include #include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/utils/hook_utils.h" #include "paddle/fluid/eager/api/utils/tensor_utils.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/api/lib/ext_compat_utils.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/utils/string/string_helper.h" namespace paddle { namespace distributed { using Tensor = paddle::experimental::Tensor; +using Scalar = paddle::experimental::ScalarBase; +using ScalarArray = + paddle::experimental::ScalarArrayBase; +using Backend = paddle::experimental::Backend; std::vector> Eager_AssignGroupBySize( - const std::vector, const std::vector& is_sparse_gradient, - const std::vector& group_size_limits, - const std::vector& tensor_indices = {}); + const std::vector, const std::vector &is_sparse_gradient, + const std::vector &group_size_limits, + const std::vector &tensor_indices = {}); + +class EagerGroup { + public: + Tensor dense_contents_; + + // for concat kernel + std::vector dense_tensors_; + std::vector length_; + int64_t all_length_{0}; + std::vector origin_shapes_; + + // Global indices of participating tensors in the group + std::vector tensor_indices_; + + // Number of params that haven't been ready. When it is 0, it means + // the group is ready. 
+ size_t pending_ = -1; + + // external message of group + phi::DataType dtype_; + + // help to sync + std::shared_ptr task; + + // context is used to select the stream for concat + void ConcatTensors(const platform::Place &); + + // context is used to select the stream for split + void SplitTensors(const platform::Place &); + + friend std::ostream &operator<<(std::ostream &, const EagerGroup &); +}; + +struct TensorLocator { + // record the index in groups_ + size_t group_index; + size_t inside_group_index; +}; + +class EagerReducer { + public: + explicit EagerReducer( + const std::vector tensors, + const std::vector> &group_indices, + const std::vector &is_sparse_gradient, + std::shared_ptr process_group, + const std::vector &group_size_limits, + bool find_unused_parameters); + + virtual ~EagerReducer() {} + + std::shared_ptr GetGradNodeFromTensor(Tensor *tensor); + + void InitializeGroups(const std::vector> &group_indices); + void InitializeDenseGroups(const std::vector &tensor_indices_, + EagerGroup *p_group); + void PrepareForBackward(const std::vector &outputs); + void AddDistHook(size_t var_index); + void MarkVarReady(const size_t var_index, const bool is_used_var); + void MarkGroupReady(const size_t group_index); + void FusedAllReduceSchedule(EagerGroup *group, const int curr_group_index); + void FinalizeBackward(); + void TraverseBackwardGraph(const std::vector &outputs); + void ProcessUnusedDenseVars(); + bool HasGrad(size_t var_index); + + private: + std::vector tensors_; + std::vector> group_indices_; + std::vector is_sparse_gradient_; + std::shared_ptr process_group_; + std::vector group_size_limits_; + + std::vector groups_; + std::vector variable_locators_; + PlaceType place_; + platform::Place inner_place_; + size_t next_group_ = 0; + int64_t nranks_ = -1; + + bool grad_need_hooks_{false}; + + std::vector vars_marked_ready_; + std::vector local_used_vars_; + + // Following variables are to help unused vars + std::vector unused_vars_; + std::map gradnode_index_map_; + bool has_marked_unused_vars_{false}; + bool find_unused_vars_each_step_{false}; + bool find_unused_vars_once_{true}; + bool groups_need_finalize_{false}; + Tensor global_used_vars_; +}; } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt index 3e734b1b9ed24..8641b36a1be8e 100644 --- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt @@ -4,7 +4,7 @@ if(WITH_PYTHON) endif() proto_library(interceptor_message_proto SRCS interceptor_message.proto) -if(WITH_DISTRIBUTE AND WITH_PSCORE AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) +if(WITH_DISTRIBUTE AND WITH_PSCORE) set(BRPC_DEPS brpc ssl crypto protobuf zlib leveldb snappy gflags glog) else() set(BRPC_DEPS "") diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc index 8d2ec5c41d864..80a6b4667aa1a 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.cc +++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc @@ -67,8 +67,7 @@ bool MessageBus::IsInit() const { return is_init_; } MessageBus::~MessageBus() { VLOG(3) << "Message bus releases resource."; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) server_.Stop(1000); server_.Join(); #endif @@ -87,8 +86,7 @@ bool 
MessageBus::Send(int64_t dst_rank, IsInit(), true, platform::errors::PreconditionNotMet( "Using message bus since it has not been initialized.")); -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) int retry_time = 0; // message bus will retry sending for 10 times while (retry_time < 10) { ++retry_time; @@ -173,8 +171,7 @@ void MessageBus::ListenPort() { LOG(INFO) << "No need listen to port since training on single card."; return; } -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // function keep listen the port and handle the message PADDLE_ENFORCE_EQ( server_.AddService(&message_service_, brpc::SERVER_DOESNT_OWN_SERVICE), 0, @@ -203,8 +200,7 @@ void MessageBus::ListenPort() { #endif } -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) bool MessageBus::SendInterRank(int64_t dst_rank, const InterceptorMessage& interceptor_message) { const auto& dst_addr = GetAddr(dst_rank); diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.h b/paddle/fluid/distributed/fleet_executor/message_bus.h index d805ac81606b8..dfd65fdbc00d4 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.h +++ b/paddle/fluid/distributed/fleet_executor/message_bus.h @@ -20,8 +20,7 @@ #include #include -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "brpc/channel.h" #include "brpc/server.h" #include "paddle/fluid/distributed/fleet_executor/message_service.h" @@ -64,8 +63,7 @@ class MessageBus final { const std::string& GetAddr(int64_t rank) const; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // send the message inter rank (dst is different rank with src) bool SendInterRank(int64_t dst_rank, const InterceptorMessage& interceptor_message); @@ -81,8 +79,7 @@ class MessageBus final { // the ip needs to be listened std::string addr_; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) MessageServiceImpl message_service_; // brpc server brpc::Server server_; diff --git a/paddle/fluid/distributed/fleet_executor/message_service.cc b/paddle/fluid/distributed/fleet_executor/message_service.cc index c3fff98f684ad..1c66d83ea34d7 100644 --- a/paddle/fluid/distributed/fleet_executor/message_service.cc +++ b/paddle/fluid/distributed/fleet_executor/message_service.cc @@ -11,8 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/fleet_executor/message_service.h" #include "brpc/server.h" #include "paddle/fluid/distributed/fleet_executor/global.h" diff --git a/paddle/fluid/distributed/fleet_executor/message_service.h b/paddle/fluid/distributed/fleet_executor/message_service.h index 02f73471e3b91..5ab687ff93dc4 100644 --- a/paddle/fluid/distributed/fleet_executor/message_service.h +++ b/paddle/fluid/distributed/fleet_executor/message_service.h @@ -11,8 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #pragma once #include "brpc/server.h" diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc index 18920d06f3854..ba039385a74ba 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc @@ -24,10 +24,14 @@ limitations under the License. */ #include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(fill_constant); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps.proto b/paddle/fluid/distributed/ps.proto index 0ae87812bce43..fac30e26c388c 100644 --- a/paddle/fluid/distributed/ps.proto +++ b/paddle/fluid/distributed/ps.proto @@ -115,6 +115,7 @@ message TableParameter { optional CommonAccessorParameter common = 6; optional TableType type = 7; optional bool compress_in_save = 8 [ default = false ]; + optional GraphParameter graph_parameter = 9; } message TableAccessorParameter { @@ -211,3 +212,25 @@ message SparseAdamSGDParameter { // SparseAdamSGDRule optional double ada_epsilon = 5 [ default = 1e-08 ]; repeated float weight_bounds = 6; } + +message GraphParameter { + optional int32 task_pool_size = 1 [ default = 24 ]; + optional bool gpups_mode = 2 [ default = false ]; + optional string gpups_graph_sample_class = 3 + [ default = "CompleteGraphSampler" ]; + optional string gpups_graph_sample_args = 4 [ default = "" ]; + optional bool use_cache = 5 [ default = true ]; + optional float cache_ratio = 6 [ default = 0.3 ]; + optional int32 cache_ttl = 7 [ default = 5 ]; + optional GraphFeature graph_feature = 8; + optional string table_name = 9 [ default = "" ]; + optional string table_type = 10 [ default = "" ]; + optional int32 gpups_mode_shard_num = 11 [ default = 127 ]; + optional int32 gpu_num = 12 [ default = 1 ]; +} + +message GraphFeature { + repeated string name = 1; + repeated string dtype = 2; + repeated int32 shape = 3; +} \ No newline at end of file diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc index b8ccd8e744dab..f86b4b706b3e2 100644 --- 
a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc @@ -414,6 +414,16 @@ std::future BrpcPsClient::load(uint32_t table_id, return send_cmd(table_id, PS_LOAD_ONE_TABLE, {epoch, mode}); } +std::future BrpcPsClient::Load(const LoadSaveContext &load_context) { + if (load_context.table_id < 0) { + return send_cmd(-1, PS_LOAD_ALL_TABLE, + {load_context.epoch, load_context.mode}); + } else { + return send_cmd(load_context.table_id, PS_LOAD_ONE_TABLE, + {load_context.epoch, load_context.mode}); + } +} + std::future BrpcPsClient::save(const std::string &epoch, const std::string &mode) { VLOG(1) << "BrpcPsClient::save path " << epoch; @@ -427,6 +437,19 @@ std::future BrpcPsClient::save(uint32_t table_id, return send_save_cmd(table_id, PS_SAVE_ONE_TABLE, {epoch, mode}); } +std::future BrpcPsClient::Save(const LoadSaveContext &save_context) { + if (save_context.table_id < 0) { + VLOG(1) << "BrpcPsClient::save path " << save_context.epoch; + return send_save_cmd(-1, PS_SAVE_ALL_TABLE, + {save_context.epoch, save_context.mode}); + } else { + VLOG(1) << "BrpcPsClient::save one table path " << save_context.epoch + << " table_id " << save_context.table_id; + return send_save_cmd(save_context.table_id, PS_SAVE_ONE_TABLE, + {save_context.epoch, save_context.mode}); + } +} + std::future BrpcPsClient::clear() { return send_cmd(-1, PS_CLEAR_ALL_TABLE, {}); } @@ -505,6 +528,44 @@ std::future BrpcPsClient::barrier(size_t table_id, return send_cmd(table_id, PS_BARRIER, {std::to_string(barrier_type)}); } +std::future BrpcPsClient::Pull(RequestContext &pull_context) { + if (pull_context.value_type == Dense) { // pull dense + Region *dense_region = + reinterpret_cast(pull_context.dense_values); + pull_dense(dense_region, pull_context.num, pull_context.table); + } else { // pull sparse + uint64_t *keys = reinterpret_cast(pull_context.keys); + float **select_values = + reinterpret_cast(pull_context.sparse_values); + size_t table_id = pull_context.table; + size_t num = pull_context.num; + bool is_training = pull_context.is_training; + if (pull_context.training_mode == Geo) { // for geo + pull_sparse_param(select_values, table_id, keys, num, is_training); + } else if (pull_context.training_mode == Async) { // for async + pull_sparse(select_values, table_id, keys, num, is_training); + } + } +} + +std::future BrpcPsClient::Push(RequestContext &push_context) { + if (push_context.value_type == Dense) { // push dense + const Region *dense_region = push_context.push_context.push_dense_values; + push_dense(dense_region, push_context.num, push_context.table); + } else { // push sparse + size_t table_id = push_context.table; + size_t num = push_context.num; + bool is_training = push_context.is_training; + if (push_context.training_mode == Geo) { // for geo + // TODO(zhaocaibei) + } else if (push_context.training_mode == Async) { // for async + const uint64_t *keys = push_context.push_context.keys; + const float **update_values = push_context.push_context.push_values; + push_sparse(table_id, keys, update_values, num); + } + } +} + std::future BrpcPsClient::pull_geo_param(size_t table_id, std::vector *values, std::vector *keys, diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.h b/paddle/fluid/distributed/ps/service/brpc_ps_client.h index 59ed59933db86..8b0cb0741b400 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_client.h +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.h @@ -163,12 +163,17 @@ class BrpcPsClient : public PSClient { 
std::future load(uint32_t table_id, const std::string &epoch, const std::string &mode) override; + std::future Load(const LoadSaveContext &load_context) override; + std::future save(const std::string &epoch, const std::string &mode) override; std::future save(uint32_t table_id, const std::string &epoch, const std::string &mode) override; + virtual std::future Save( + const LoadSaveContext &save_context) override; + std::future clear() override; std::future clear(uint32_t table_id) override; @@ -199,6 +204,10 @@ class BrpcPsClient : public PSClient { const uint64_t *keys, size_t num, bool is_training); + virtual std::future Pull(RequestContext &pull_context) override; + + virtual std::future Push(RequestContext &push_context) override; + virtual std::future print_table_stat(uint32_t table_id); virtual std::future barrier(size_t table_id, uint32_t barrier_type); diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_server.h b/paddle/fluid/distributed/ps/service/brpc_ps_server.h index 4310c247438ce..d81a3a5df07f1 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_server.h +++ b/paddle/fluid/distributed/ps/service/brpc_ps_server.h @@ -51,7 +51,7 @@ class BrpcPsServer : public PSServer { _server.Join(); return 0; } - virtual int32_t port(); + int32_t port(); private: virtual int32_t initialize(); diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_client.cc b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc index 301708f6b7bb3..a3db88e3b679d 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_client.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc @@ -44,7 +44,7 @@ void GraphPsService_Stub::service( } } -int GraphBrpcClient::get_server_index_by_id(uint64_t id) { +int GraphBrpcClient::get_server_index_by_id(int64_t id) { int shard_num = get_shard_num(); int shard_per_server = shard_num % server_size == 0 ? 
shard_num / server_size @@ -53,7 +53,7 @@ int GraphBrpcClient::get_server_index_by_id(uint64_t id) { } std::future GraphBrpcClient::get_node_feat( - const uint32_t &table_id, const std::vector &node_ids, + const uint32_t &table_id, const std::vector &node_ids, const std::vector &feature_names, std::vector> &res) { std::vector request2server; @@ -66,7 +66,7 @@ std::future GraphBrpcClient::get_node_feat( } } size_t request_call_num = request2server.size(); - std::vector> node_id_buckets(request_call_num); + std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { int server_index = get_server_index_by_id(node_ids[query_idx]); @@ -129,7 +129,7 @@ std::future GraphBrpcClient::get_node_feat( closure->request(request_idx) ->add_params((char *)node_id_buckets[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); std::string joint_feature_name = paddle::string::join_strings(feature_names, '\t'); closure->request(request_idx) @@ -179,9 +179,9 @@ std::future GraphBrpcClient::clear_nodes(uint32_t table_id) { return fut; } std::future GraphBrpcClient::add_graph_node( - uint32_t table_id, std::vector &node_id_list, + uint32_t table_id, std::vector &node_id_list, std::vector &is_weighted_list) { - std::vector> request_bucket; + std::vector> request_bucket; std::vector> is_weighted_bucket; bool add_weight = is_weighted_list.size() > 0; std::vector server_index_arr; @@ -191,7 +191,7 @@ std::future GraphBrpcClient::add_graph_node( if (index_mapping[server_index] == -1) { index_mapping[server_index] = request_bucket.size(); server_index_arr.push_back(server_index); - request_bucket.push_back(std::vector()); + request_bucket.push_back(std::vector()); if (add_weight) is_weighted_bucket.push_back(std::vector()); } request_bucket[index_mapping[server_index]].push_back( @@ -229,7 +229,7 @@ std::future GraphBrpcClient::add_graph_node( size_t node_num = request_bucket[request_idx].size(); closure->request(request_idx) ->add_params((char *)request_bucket[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); if (add_weight) { bool weighted[is_weighted_bucket[request_idx].size() + 1]; for (size_t j = 0; j < is_weighted_bucket[request_idx].size(); j++) @@ -248,8 +248,8 @@ std::future GraphBrpcClient::add_graph_node( return fut; } std::future GraphBrpcClient::remove_graph_node( - uint32_t table_id, std::vector &node_id_list) { - std::vector> request_bucket; + uint32_t table_id, std::vector &node_id_list) { + std::vector> request_bucket; std::vector server_index_arr; std::vector index_mapping(server_size, -1); for (size_t query_idx = 0; query_idx < node_id_list.size(); ++query_idx) { @@ -257,7 +257,7 @@ std::future GraphBrpcClient::remove_graph_node( if (index_mapping[server_index] == -1) { index_mapping[server_index] = request_bucket.size(); server_index_arr.push_back(server_index); - request_bucket.push_back(std::vector()); + request_bucket.push_back(std::vector()); } request_bucket[index_mapping[server_index]].push_back( node_id_list[query_idx]); @@ -291,7 +291,7 @@ std::future GraphBrpcClient::remove_graph_node( closure->request(request_idx) ->add_params((char *)request_bucket[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); // PsService_Stub rpc_stub(get_cmd_channel(server_index)); GraphPsService_Stub rpc_stub = getServiceStub(get_cmd_channel(server_index)); @@ -303,9 +303,9 @@ std::future 
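The id-to-server routing used on both client and server reduces to two integer divisions. A small self-contained sketch, assuming the truncated false branch of the ternary above rounds the block size up when shards do not divide evenly (consistent with the server-side id % shard_num / shard_num_per_server later in this patch):

// Sketch: the id -> server mapping used by get_server_index_by_id().
// Shards are assigned to servers in contiguous blocks; when shard_num does
// not divide evenly, each server takes the rounded-up block size.
int server_index_for(int64_t id, int shard_num, int server_size) {
  int shard_per_server = (shard_num + server_size - 1) / server_size;
  return static_cast<int>((id % shard_num) / shard_per_server);
}
// e.g. shard_num = 127, server_size = 2 -> 64 shards per server,
// so node id 100 lives in shard 100 and is served by server 1.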
GraphBrpcClient::remove_graph_node( } // char* &buffer,int &actual_size std::future GraphBrpcClient::batch_sample_neighbors( - uint32_t table_id, std::vector node_ids, int sample_size, - // std::vector>> &res, - std::vector> &res, + uint32_t table_id, std::vector node_ids, int sample_size, + // std::vector>> &res, + std::vector> &res, std::vector> &res_weight, bool need_weight, int server_index) { if (server_index != -1) { @@ -337,7 +337,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( int start = 0; while (start < actual_size) { res[node_idx].emplace_back( - *(uint64_t *)(node_buffer + offset + start)); + *(int64_t *)(node_buffer + offset + start)); start += GraphNode::id_size; if (need_weight) { res_weight[node_idx].emplace_back( @@ -358,7 +358,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( closure->request(0)->set_table_id(table_id); closure->request(0)->set_client_id(_client_id); closure->request(0)->add_params((char *)node_ids.data(), - sizeof(uint64_t) * node_ids.size()); + sizeof(int64_t) * node_ids.size()); closure->request(0)->add_params((char *)&sample_size, sizeof(int)); closure->request(0)->add_params((char *)&need_weight, sizeof(bool)); ; @@ -380,14 +380,14 @@ std::future GraphBrpcClient::batch_sample_neighbors( server2request[server_index] = request2server.size(); request2server.push_back(server_index); } - // res.push_back(std::vector>()); + // res.push_back(std::vector>()); res.push_back({}); if (need_weight) { res_weight.push_back({}); } } size_t request_call_num = request2server.size(); - std::vector> node_id_buckets(request_call_num); + std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { int server_index = get_server_index_by_id(node_ids[query_idx]); @@ -428,7 +428,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( int start = 0; while (start < actual_size) { res[query_idx].emplace_back( - *(uint64_t *)(node_buffer + offset + start)); + *(int64_t *)(node_buffer + offset + start)); start += GraphNode::id_size; if (need_weight) { res_weight[query_idx].emplace_back( @@ -459,7 +459,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( closure->request(request_idx) ->add_params((char *)node_id_buckets[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); closure->request(request_idx) ->add_params((char *)&sample_size, sizeof(int)); closure->request(request_idx) @@ -476,7 +476,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( } std::future GraphBrpcClient::random_sample_nodes( uint32_t table_id, int server_index, int sample_size, - std::vector &ids) { + std::vector &ids) { DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) { int ret = 0; auto *closure = (DownpourBrpcClosure *)done; @@ -490,7 +490,7 @@ std::future GraphBrpcClient::random_sample_nodes( auto size = io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); int index = 0; while (index < bytes_size) { - ids.push_back(*(uint64_t *)(buffer + index)); + ids.push_back(*(int64_t *)(buffer + index)); index += GraphNode::id_size; } delete[] buffer; @@ -633,7 +633,7 @@ std::future GraphBrpcClient::pull_graph_list( } std::future GraphBrpcClient::set_node_feat( - const uint32_t &table_id, const std::vector &node_ids, + const uint32_t &table_id, const std::vector &node_ids, const std::vector &feature_names, const std::vector> &features) { std::vector request2server; @@ -646,7 +646,7 @@ std::future 
GraphBrpcClient::set_node_feat( } } size_t request_call_num = request2server.size(); - std::vector> node_id_buckets(request_call_num); + std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); std::vector>> features_idx_buckets( request_call_num); @@ -696,7 +696,7 @@ std::future GraphBrpcClient::set_node_feat( closure->request(request_idx) ->add_params((char *)node_id_buckets[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); std::string joint_feature_name = paddle::string::join_strings(feature_names, '\t'); closure->request(request_idx) diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_client.h b/paddle/fluid/distributed/ps/service/graph_brpc_client.h index 06e753d028baa..e2b8a518615dc 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_client.h +++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.h @@ -63,8 +63,8 @@ class GraphBrpcClient : public BrpcPsClient { virtual ~GraphBrpcClient() {} // given a batch of nodes, sample graph_neighbors for each of them virtual std::future batch_sample_neighbors( - uint32_t table_id, std::vector node_ids, int sample_size, - std::vector>& res, + uint32_t table_id, std::vector node_ids, int sample_size, + std::vector>& res, std::vector>& res_weight, bool need_weight, int server_index = -1); @@ -75,20 +75,20 @@ class GraphBrpcClient : public BrpcPsClient { virtual std::future random_sample_nodes(uint32_t table_id, int server_index, int sample_size, - std::vector& ids); + std::vector& ids); virtual std::future get_node_feat( - const uint32_t& table_id, const std::vector& node_ids, + const uint32_t& table_id, const std::vector& node_ids, const std::vector& feature_names, std::vector>& res); virtual std::future set_node_feat( - const uint32_t& table_id, const std::vector& node_ids, + const uint32_t& table_id, const std::vector& node_ids, const std::vector& feature_names, const std::vector>& features); virtual std::future clear_nodes(uint32_t table_id); virtual std::future add_graph_node( - uint32_t table_id, std::vector& node_id_list, + uint32_t table_id, std::vector& node_id_list, std::vector& is_weighted_list); virtual std::future use_neighbors_sample_cache(uint32_t table_id, size_t size_limit, @@ -96,11 +96,11 @@ class GraphBrpcClient : public BrpcPsClient { virtual std::future load_graph_split_config(uint32_t table_id, std::string path); virtual std::future remove_graph_node( - uint32_t table_id, std::vector& node_id_list); + uint32_t table_id, std::vector& node_id_list); virtual int32_t initialize(); int get_shard_num() { return shard_num; } void set_shard_num(int shard_num) { this->shard_num = shard_num; } - int get_server_index_by_id(uint64_t id); + int get_server_index_by_id(int64_t id); void set_local_channel(int index) { this->local_channel = get_cmd_channel(index); } diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc index 441f489fb3097..20a55e4d11983 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc @@ -140,9 +140,9 @@ int32_t GraphBrpcService::add_graph_node(Table *table, return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t 
*)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); std::vector is_weighted_list; if (request.params_size() == 2) { size_t weight_list_size = request.params(1).size() / sizeof(bool); @@ -165,9 +165,9 @@ int32_t GraphBrpcService::remove_graph_node(Table *table, "graph_get_node_feat request requires at least 1 argument"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); ((GraphTable *)table)->remove_graph_node(node_ids); return 0; @@ -386,9 +386,9 @@ int32_t GraphBrpcService::graph_random_sample_neighbors( "graph_random_sample_neighbors request requires at least 3 arguments"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - int sample_size = *(uint64_t *)(request.params(1).c_str()); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + int sample_size = *(int64_t *)(request.params(1).c_str()); bool need_weight = *(bool *)(request.params(2).c_str()); std::vector> buffers(node_num); std::vector actual_sizes(node_num, 0); @@ -407,7 +407,7 @@ int32_t GraphBrpcService::graph_random_sample_neighbors( int32_t GraphBrpcService::graph_random_sample_nodes( Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl) { - size_t size = *(uint64_t *)(request.params(0).c_str()); + size_t size = *(int64_t *)(request.params(0).c_str()); std::unique_ptr buffer; int actual_size; if (((GraphTable *)table)->random_sample_nodes(size, buffer, actual_size) == @@ -430,9 +430,9 @@ int32_t GraphBrpcService::graph_get_node_feat(Table *table, "graph_get_node_feat request requires at least 2 arguments"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); std::vector feature_names = paddle::string::split_string(request.params(1), "\t"); @@ -464,16 +464,16 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( "at least 3 arguments"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t), + size_t node_num = request.params(0).size() / sizeof(int64_t), size_of_size_t = sizeof(size_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - int sample_size = *(uint64_t *)(request.params(1).c_str()); - bool need_weight = *(uint64_t *)(request.params(2).c_str()); - // std::vector res = ((GraphTable + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + int sample_size = *(int64_t *)(request.params(1).c_str()); + bool need_weight = *(int64_t *)(request.params(2).c_str()); + // std::vector res = ((GraphTable // *)table).filter_out_non_exist_nodes(node_data, sample_size); std::vector request2server; std::vector server2request(server_size, -1); - std::vector local_id; + std::vector local_id; std::vector local_query_idx; size_t rank = get_rank(); for (int query_idx = 0; query_idx < 
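Every RPC in these hunks moves node ids as a raw array of int64_t bytes in params(0): the client packs with add_params and the server re-materializes a std::vector, as above. A self-contained sketch of that wire convention (helper names are hypothetical):

#include <cstdint>
#include <string>
#include <vector>

// Pack node ids the way the client hunks do with add_params().
std::string pack_ids(const std::vector<int64_t> &ids) {
  return std::string(reinterpret_cast<const char *>(ids.data()),
                     sizeof(int64_t) * ids.size());
}

// Unpack the way the server hunks do from request.params(0).
std::vector<int64_t> unpack_ids(const std::string &param) {
  size_t node_num = param.size() / sizeof(int64_t);
  const int64_t *data = reinterpret_cast<const int64_t *>(param.c_str());
  return std::vector<int64_t>(data, data + node_num);
}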
node_num; ++query_idx) { @@ -496,7 +496,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( std::vector> local_buffers; std::vector local_actual_sizes; std::vector seq; - std::vector> node_id_buckets(request_call_num); + std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); for (int query_idx = 0; query_idx < node_num; ++query_idx) { int server_index = @@ -583,7 +583,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( closure->request(request_idx) ->add_params((char *)node_id_buckets[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); closure->request(request_idx) ->add_params((char *)&sample_size, sizeof(int)); closure->request(request_idx) @@ -618,9 +618,9 @@ int32_t GraphBrpcService::graph_set_node_feat(Table *table, "graph_set_node_feat request requires at least 3 arguments"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); std::vector feature_names = paddle::string::split_string(request.params(1), "\t"); diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.h b/paddle/fluid/distributed/ps/service/graph_brpc_server.h index aee0190850753..a978d97b296b0 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_server.h +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.h @@ -43,7 +43,7 @@ class GraphBrpcServer : public PSServer { _server.Join(); return 0; } - virtual int32_t port(); + int32_t port(); std::condition_variable *export_cv() { return &cv_; } diff --git a/paddle/fluid/distributed/ps/service/ps_client.h b/paddle/fluid/distributed/ps/service/ps_client.h index 21719fbdbf1d6..8a2bfbe31602b 100644 --- a/paddle/fluid/distributed/ps/service/ps_client.h +++ b/paddle/fluid/distributed/ps/service/ps_client.h @@ -26,6 +26,7 @@ #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/distributed/ps/table/table.h" #include "paddle/fluid/platform/timer.h" namespace paddle { @@ -59,6 +60,41 @@ class PSClientClosure : public google::protobuf::Closure { std::vector>> _promises; }; +struct LoadSaveContext { + int table_id; + std::string epoch; + std::string mode; +}; + +enum TrainingMode { Async = 0, Sync = 1, Geo = 3 }; + +enum TrainingPhase { Init = 0, Train = 1, Save = 2 }; + +// enum ValueType { +// Sparse = 0, +// Dense = 1 +// }; + +struct PushContext { + const uint64_t *keys; + const float **push_values; + const Region *push_dense_values; +}; + +struct RequestContext { + int table; + TrainingMode training_mode; // 1 for async, 2 for geo, 3 for sync + TrainingPhase training_phase; // 1 for init, 2 for train + ValueType value_type; // 1 for sparse, 2 for dense + void *keys; + void **sparse_values; // for sparse values + Region *dense_values; // for dense values + PushContext push_context; + size_t num; + bool is_training; + void *callback; +}; + class PSClient { public: PSClient() {} @@ -86,6 +122,9 @@ class PSClient { // ęŒ‡å®štableę•°ę®load virtual std::future load(uint32_t table_id, const std::string &epoch, const std::string &mode) = 0; + // context配ē½®load选锹 + virtual 
std::future Load(const LoadSaveContext &load_context) = 0; + // å…Ø量tableę•°ę®save value_accessorę ¹ę®modeļ¼ŒåÆčƒ½ęœ‰äøåŒēš„saveę”ä»¶ virtual std::future save(const std::string &epoch, const std::string &mode) = 0; @@ -93,6 +132,8 @@ class PSClient { virtual std::future save(uint32_t table_id, const std::string &epoch, const std::string &mode) = 0; + virtual std::future Save(const LoadSaveContext &save_context) = 0; + // ęø…ē©ŗtableę•°ę® virtual std::future clear() = 0; virtual std::future clear(uint32_t table_id) = 0; @@ -107,6 +148,8 @@ class PSClient { virtual std::future pull_dense(Region *regions, size_t region_num, size_t table_id) = 0; // äæē•™ + virtual std::future Push(RequestContext &push_context) = 0; + // firstly push dense param for parameter server // this is neccessary because dense weight initialized in trainer on cold // start @@ -117,6 +160,9 @@ class PSClient { virtual std::future push_dense(const Region *regions, size_t region_num, size_t table_id) = 0; + + virtual std::future Pull(RequestContext &pull_context) = 0; + // ä½æē”Økeysčæ›č”ŒpullčÆ·ę±‚ļ¼Œē»“ęžœå”«å……values // keys和valuesēš„äøŖę•°å‡äøŗnumäøŖļ¼ŒęƏäøŖvalue占ē”Øselect_sizeē©ŗé—“ // futureē»“ęŸå‰keys和valuesē¼“冲åŒŗäøčƒ½å†ę¬”ä½æē”Ø diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.cc b/paddle/fluid/distributed/ps/service/ps_local_client.cc index 972cce135f189..9e364b6d3ed7a 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_client.cc +++ b/paddle/fluid/distributed/ps/service/ps_local_client.cc @@ -56,6 +56,19 @@ ::std::future PsLocalClient::load(uint32_t table_id, return done(); } +std::future PsLocalClient::Load(const LoadSaveContext& load_context) { + if (load_context.table_id < 0) { + for (auto& it : _table_map) { + load(it.first, load_context.epoch, load_context.mode); + } + return done(); + } else { + auto* table_ptr = table(load_context.table_id); + table_ptr->load(load_context.epoch, load_context.mode); + return done(); + } +} + ::std::future PsLocalClient::save(const std::string& epoch, const std::string& mode) { // TODO @@ -74,6 +87,21 @@ ::std::future PsLocalClient::save(uint32_t table_id, return done(); } +::std::future PsLocalClient::Save( + const LoadSaveContext& save_context) { + if (save_context.table_id < 0) { + for (auto& it : _table_map) { + save(it.first, save_context.epoch, save_context.mode); + } + return done(); + } else { + auto* table_ptr = table(save_context.table_id); + table_ptr->flush(); + table_ptr->save(save_context.epoch, save_context.mode); + return done(); + } +} + ::std::future PsLocalClient::clear() { // TODO return done(); @@ -93,6 +121,51 @@ ::std::future PsLocalClient::stop_server() { return done(); } +::std::future PsLocalClient::Pull(RequestContext& pull_context) { + if (pull_context.value_type == Dense) { // pull dense + Region* dense_region = reinterpret_cast(pull_context.dense_values); + pull_dense(dense_region, pull_context.num, pull_context.table); + } else { // pull sparse + uint64_t* keys = reinterpret_cast(pull_context.keys); + char** select_values = reinterpret_cast(pull_context.sparse_values); + size_t table_id = pull_context.table; + size_t num = pull_context.num; + pull_sparse_ptr(select_values, table_id, keys, num); + } +} + +::std::future PsLocalClient::Push(RequestContext& push_context) { + if (push_context.value_type == Dense) { // push dense + if (push_context.training_phase == Init) { + const Region* regions = push_context.push_context.push_dense_values; + size_t region_num = push_context.num; + push_dense_param(regions, 
region_num, push_context.table); + } else { + if (push_context.training_mode == Geo) { // geo + float* total_send_data = + reinterpret_cast(push_context.dense_values); + size_t total_send_data_size = push_context.num; + push_dense_raw_gradient(push_context.table, total_send_data, + total_send_data_size, push_context.callback); + } else { // async and sync + const Region* regions = push_context.push_context.push_dense_values; + size_t region_num = push_context.num; + push_dense(regions, region_num, push_context.table); + } + } + } else { // push sparse + if (push_context.training_mode == Async) { + const uint64_t* keys = push_context.push_context.keys; + const float** update_values = push_context.push_context.push_values; + size_t table_id = push_context.table; + size_t num = push_context.num; + push_sparse(table_id, keys, update_values, num); + } else { + // TODO + } + } +} + ::std::future PsLocalClient::pull_dense(Region* regions, size_t region_num, size_t table_id) { diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.h b/paddle/fluid/distributed/ps/service/ps_local_client.h index e73974ac56286..83ca558e3d2cb 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_client.h +++ b/paddle/fluid/distributed/ps/service/ps_local_client.h @@ -39,12 +39,16 @@ class PsLocalClient : public PSClient { virtual ::std::future load(uint32_t table_id, const std::string& epoch, const std::string& mode) override; + virtual std::future Load( + const LoadSaveContext& load_context) override; virtual ::std::future save(const std::string& epoch, const std::string& mode) override; virtual ::std::future save(uint32_t table_id, const std::string& epoch, const std::string& mode) override; + virtual std::future Save( + const LoadSaveContext& save_context) override; virtual ::std::future clear() override; virtual ::std::future clear(uint32_t table_id) override; @@ -55,6 +59,10 @@ class PsLocalClient : public PSClient { virtual ::std::future pull_dense(Region* regions, size_t region_num, size_t table_id); + virtual ::std::future Pull(RequestContext& pull_context) override; + + virtual ::std::future Push(RequestContext& push_context) override; + virtual ::std::future push_dense(const Region* regions, size_t region_num, size_t table_id); diff --git a/paddle/fluid/distributed/ps/service/ps_local_server.h b/paddle/fluid/distributed/ps/service/ps_local_server.h index 91f8bc4c91271..31b52126fc576 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_server.h +++ b/paddle/fluid/distributed/ps/service/ps_local_server.h @@ -28,7 +28,6 @@ class PsLocalServer : public PSServer { virtual uint64_t start() { return 0; } virtual uint64_t start(const std::string &ip, uint32_t port) { return 0; } virtual int32_t stop() { return 0; } - virtual int32_t port() { return 0; } virtual int32_t configure( const PSParameter &config, PSEnvironment &env, size_t server_rank, const std::vector &server_sub_program = {}) { diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc index 088edcb75bbc6..c8be0f7971090 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc @@ -44,9 +44,9 @@ void GraphPyService::add_table_feat_conf(std::string table_name, } } -void add_graph_node(std::vector node_ids, +void add_graph_node(std::vector node_ids, std::vector weight_list) {} -void remove_graph_node(std::vector node_ids) {} +void 
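For the sparse side, the caller fills the embedded PushContext instead of the dense region pointer. A sketch of an async sparse push through the unified Push() (keys, update_values, table_id and key_num are assumed to be prepared by the trainer; Sparse is assumed to be the other ValueType enumerator alongside Dense):

// Sketch: an async sparse push through the new Push() entry point.
RequestContext ctx;
ctx.value_type = Sparse;               // routes to push_sparse()
ctx.training_mode = Async;             // the Geo sparse branch is still a TODO
ctx.table = table_id;
ctx.num = key_num;
ctx.push_context.keys = keys;                  // const uint64_t*
ctx.push_context.push_values = update_values;  // const float**
local_client->Push(ctx);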
remove_graph_node(std::vector node_ids) {} void GraphPyService::set_up(std::string ips_str, int shard_num, std::vector node_types, std::vector edge_types) { @@ -260,7 +260,7 @@ void GraphPyClient::clear_nodes(std::string name) { } void GraphPyClient::add_graph_node(std::string name, - std::vector& node_ids, + std::vector& node_ids, std::vector& weight_list) { if (this->table_id_map.count(name)) { uint32_t table_id = this->table_id_map[name]; @@ -271,7 +271,7 @@ void GraphPyClient::add_graph_node(std::string name, } void GraphPyClient::remove_graph_node(std::string name, - std::vector& node_ids) { + std::vector& node_ids) { if (this->table_id_map.count(name)) { uint32_t table_id = this->table_id_map[name]; auto status = get_ps_client()->remove_graph_node(table_id, node_ids); @@ -290,13 +290,12 @@ void GraphPyClient::load_node_file(std::string name, std::string filepath) { } } -std::pair>, std::vector> +std::pair>, std::vector> GraphPyClient::batch_sample_neighbors(std::string name, - std::vector node_ids, + std::vector node_ids, int sample_size, bool return_weight, bool return_edges) { - // std::vector>> v; - std::vector> v; + std::vector> v; std::vector> v1; if (this->table_id_map.count(name)) { uint32_t table_id = this->table_id_map[name]; @@ -309,7 +308,7 @@ GraphPyClient::batch_sample_neighbors(std::string name, // res.first[1]: slice index // res.first[2]: src nodes // res.second: edges weight - std::pair>, std::vector> res; + std::pair>, std::vector> res; res.first.push_back({}); res.first.push_back({}); if (return_edges) res.first.push_back({}); @@ -342,10 +341,10 @@ void GraphPyClient::use_neighbors_sample_cache(std::string name, status.wait(); } } -std::vector GraphPyClient::random_sample_nodes(std::string name, - int server_index, - int sample_size) { - std::vector v; +std::vector GraphPyClient::random_sample_nodes(std::string name, + int server_index, + int sample_size) { + std::vector v; if (this->table_id_map.count(name)) { uint32_t table_id = this->table_id_map[name]; auto status = @@ -357,7 +356,7 @@ std::vector GraphPyClient::random_sample_nodes(std::string name, // (name, dtype, ndarray) std::vector> GraphPyClient::get_node_feat( - std::string node_type, std::vector node_ids, + std::string node_type, std::vector node_ids, std::vector feature_names) { std::vector> v( feature_names.size(), std::vector(node_ids.size())); @@ -371,7 +370,7 @@ std::vector> GraphPyClient::get_node_feat( } void GraphPyClient::set_node_feat( - std::string node_type, std::vector node_ids, + std::string node_type, std::vector node_ids, std::vector feature_names, const std::vector> features) { if (this->table_id_map.count(node_type)) { diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h index c25ef5035453d..85707137c1800 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h @@ -70,18 +70,34 @@ class GraphPyService { ::paddle::distributed::TableAccessorParameter* accessor_proto = sparse_table_proto->mutable_accessor(); - ::paddle::distributed::CommonAccessorParameter* common_proto = - sparse_table_proto->mutable_common(); + // ::paddle::distributed::CommonAccessorParameter* common_proto = + // sparse_table_proto->mutable_common(); + ::paddle::distributed::GraphParameter* graph_proto = + sparse_table_proto->mutable_graph_parameter(); + + ::paddle::distributed::GraphFeature* graph_feature = + 
graph_proto->mutable_graph_feature(); + + graph_proto->set_task_pool_size(24); + + graph_proto->set_table_name(table_name); + graph_proto->set_table_type(table_type); + graph_proto->set_use_cache(false); // Set GraphTable Parameter - common_proto->set_table_name(table_name); - common_proto->set_name(table_type); + // common_proto->set_table_name(table_name); + // common_proto->set_name(table_type); + // for (size_t i = 0; i < feat_name.size(); i++) { + // common_proto->add_params(feat_dtype[i]); + // common_proto->add_dims(feat_shape[i]); + // common_proto->add_attributes(feat_name[i]); + // } + for (size_t i = 0; i < feat_name.size(); i++) { - common_proto->add_params(feat_dtype[i]); - common_proto->add_dims(feat_shape[i]); - common_proto->add_attributes(feat_name[i]); + graph_feature->add_dtype(feat_dtype[i]); + graph_feature->add_shape(feat_shape[i]); + graph_feature->add_name(feat_name[i]); } - accessor_proto->set_accessor_class("CommMergeAccessor"); } @@ -143,24 +159,24 @@ class GraphPyClient : public GraphPyService { void load_edge_file(std::string name, std::string filepath, bool reverse); void load_node_file(std::string name, std::string filepath); void clear_nodes(std::string name); - void add_graph_node(std::string name, std::vector& node_ids, + void add_graph_node(std::string name, std::vector& node_ids, std::vector& weight_list); - void remove_graph_node(std::string name, std::vector& node_ids); + void remove_graph_node(std::string name, std::vector& node_ids); int get_client_id() { return client_id; } void set_client_id(int client_id) { this->client_id = client_id; } void start_client(); - std::pair>, std::vector> - batch_sample_neighbors(std::string name, std::vector node_ids, + std::pair>, std::vector> + batch_sample_neighbors(std::string name, std::vector node_ids, int sample_size, bool return_weight, bool return_edges); - std::vector random_sample_nodes(std::string name, int server_index, - int sample_size); + std::vector random_sample_nodes(std::string name, int server_index, + int sample_size); std::vector> get_node_feat( - std::string node_type, std::vector node_ids, + std::string node_type, std::vector node_ids, std::vector feature_names); void use_neighbors_sample_cache(std::string name, size_t total_size_limit, size_t ttl); - void set_node_feat(std::string node_type, std::vector node_ids, + void set_node_feat(std::string node_type, std::vector node_ids, std::vector feature_names, const std::vector> features); std::vector pull_graph_list(std::string name, int server_index, diff --git a/paddle/fluid/distributed/ps/service/server.cc b/paddle/fluid/distributed/ps/service/server.cc index 5f1974e3e610c..893f671359e40 100644 --- a/paddle/fluid/distributed/ps/service/server.cc +++ b/paddle/fluid/distributed/ps/service/server.cc @@ -67,8 +67,6 @@ int32_t PSServer::configure( _config = config.server_param(); _rank = server_rank; _environment = &env; - _shuffled_ins = - paddle::framework::MakeChannel>(); size_t shard_num = env.get_ps_servers().size(); const auto &downpour_param = _config.downpour_server_param(); diff --git a/paddle/fluid/distributed/ps/service/server.h b/paddle/fluid/distributed/ps/service/server.h index 160d4a6128295..d2804405b4198 100644 --- a/paddle/fluid/distributed/ps/service/server.h +++ b/paddle/fluid/distributed/ps/service/server.h @@ -69,11 +69,6 @@ class PSServer { const PSParameter &config, PSEnvironment &env, size_t server_rank, const std::vector &server_sub_program = {}); - // return server_ip - virtual std::string ip() { return 
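After the migration from uint64_t to int64_t ids, the Python-facing client is exercised as below; the pair layout follows the res.first / res.second comment in the sampling hunk above. A hedged usage sketch (client construction and the table name are assumed):

// Sketch: sampling neighbors through GraphPyClient after the int64_t change.
std::vector<int64_t> node_ids = {37, 96};
auto res = client.batch_sample_neighbors(
    "graph_table",            // hypothetical table name registered in set_up()
    node_ids,
    /*sample_size=*/10,
    /*return_weight=*/true,
    /*return_edges=*/false);
auto &neighbor_ids = res.first;  // layout per the res.first[...] comment above
auto &weights = res.second;      // edge weights, present when return_weight is set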
butil::my_ip_cstr(); } - // return server_port - virtual int32_t port() = 0; - virtual uint64_t start(const std::string &ip, uint32_t port) = 0; virtual int32_t stop() = 0; @@ -94,15 +89,6 @@ class PSServer { return &_table_map; } - typedef std::function MsgHandlerFunc; - virtual int registe_pserver2pserver_msg_handler(int msg_type, - MsgHandlerFunc handler) { - _msg_handler_map[msg_type] = handler; - return 0; - } - - paddle::framework::Channel> _shuffled_ins; - protected: virtual int32_t initialize() = 0; @@ -111,7 +97,6 @@ class PSServer { ServerParameter _config; PSEnvironment *_environment; std::unordered_map> _table_map; - std::unordered_map _msg_handler_map; protected: std::shared_ptr scope_; diff --git a/paddle/fluid/distributed/ps/table/CMakeLists.txt b/paddle/fluid/distributed/ps/table/CMakeLists.txt index be916bf2e8003..2fa5ecb4051c5 100644 --- a/paddle/fluid/distributed/ps/table/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/table/CMakeLists.txt @@ -53,7 +53,6 @@ cc_library(memory_sparse_table SRCS memory_sparse_table.cc DEPS ps_framework_pro set_source_files_properties(memory_sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(memory_sparse_geo_table SRCS memory_sparse_geo_table.cc DEPS ps_framework_proto ${TABLE_DEPS} common_table) - cc_library(table SRCS table.cc DEPS memory_sparse_table memory_sparse_geo_table common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost) target_link_libraries(table -fopenmp) diff --git a/paddle/fluid/distributed/ps/table/accessor.h b/paddle/fluid/distributed/ps/table/accessor.h index 7c91a60864980..07c211bb9c128 100644 --- a/paddle/fluid/distributed/ps/table/accessor.h +++ b/paddle/fluid/distributed/ps/table/accessor.h @@ -45,6 +45,17 @@ struct DataConverter { std::string deconverter; }; +struct AccessorInfo { + size_t dim; + size_t size; + size_t select_size; + size_t select_dim; + size_t update_size; + size_t update_dim; + size_t mf_size; + size_t fea_dim; +}; + class ValueAccessor { public: ValueAccessor() {} @@ -68,6 +79,8 @@ class ValueAccessor { } virtual int initialize() = 0; + virtual void GetTableInfo(AccessorInfo& info) = 0; + // valueē»“åŗ¦ virtual size_t dim() = 0; // value各äøŖē»“åŗ¦ēš„size @@ -163,6 +176,7 @@ class ValueAccessor { TableAccessorParameter _config; std::unordered_map> _data_coverter_map; + AccessorInfo _accessor_info; }; REGISTER_PSCORE_REGISTERER(ValueAccessor); } // namespace distributed diff --git a/paddle/fluid/distributed/ps/table/common_dense_table.cc b/paddle/fluid/distributed/ps/table/common_dense_table.cc index 607469e2f7b0d..cc0f5867a3d65 100644 --- a/paddle/fluid/distributed/ps/table/common_dense_table.cc +++ b/paddle/fluid/distributed/ps/table/common_dense_table.cc @@ -128,6 +128,21 @@ int32_t CommonDenseTable::set_global_lr(float* lr) { return 0; } +int32_t CommonDenseTable::Pull(TableContext& context) { + CHECK(context.value_type == Dense); + float* pull_values = context.pull_context.values; + return pull_dense(pull_values, context.num); +} + +int32_t CommonDenseTable::Push(TableContext& context) { + CHECK(context.value_type == Dense); + if (context.pull_context.values != nullptr) { + const float* values = context.push_context.values; + return push_dense(values, context.num); + } + return 0; +} + int32_t CommonDenseTable::pull_dense(float* pull_values, size_t num) { std::copy(values_[param_idx_].begin(), values_[param_idx_].end(), pull_values); diff --git a/paddle/fluid/distributed/ps/table/common_dense_table.h 
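AccessorInfo gives each accessor a single struct through which to report its value layout. A sketch of what a GetTableInfo override might look like, assuming the usual select/update/fea query methods on the concrete accessor (only dim() and size() are visible in this hunk; the rest are assumptions):

// Sketch (assumptions): a GetTableInfo() override filling the new struct.
void GetTableInfo(AccessorInfo &info) override {
  info.dim = dim();
  info.size = size();
  info.select_dim = select_dim();
  info.select_size = select_size();
  info.update_dim = update_dim();
  info.update_size = update_size();
  info.fea_dim = fea_dim();
}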
b/paddle/fluid/distributed/ps/table/common_dense_table.h index a4c0f29ddb877..cad49a0a449c4 100644 --- a/paddle/fluid/distributed/ps/table/common_dense_table.h +++ b/paddle/fluid/distributed/ps/table/common_dense_table.h @@ -40,6 +40,8 @@ class CommonDenseTable : public DenseTable { const std::string& name); virtual int32_t initialize_value(); virtual int32_t initialize_optimizer(); + virtual int32_t Pull(TableContext& context); + virtual int32_t Push(TableContext& context); int32_t pull_dense(float* pull_values, size_t num) override; int32_t push_dense_param(const float* values, size_t num) override; int32_t push_dense(const float* values, size_t num) override; diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc index 54b98cb96ce51..2c07bd65d63d4 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -27,6 +27,288 @@ namespace paddle { namespace distributed { +#ifdef PADDLE_WITH_HETERPS + +int CompleteGraphSampler::run_graph_sampling() { + pthread_rwlock_t *rw_lock = graph_table->rw_lock.get(); + pthread_rwlock_rdlock(rw_lock); + std::cout << "in graph sampling" << std::endl; + sample_nodes.clear(); + sample_neighbors.clear(); + sample_res.clear(); + sample_nodes.resize(gpu_num); + sample_neighbors.resize(gpu_num); + sample_res.resize(gpu_num); + std::vector>> + sample_nodes_ex(graph_table->task_pool_size_); + std::vector>> sample_neighbors_ex( + graph_table->task_pool_size_); + for (int i = 0; i < graph_table->task_pool_size_; i++) { + sample_nodes_ex[i].resize(gpu_num); + sample_neighbors_ex[i].resize(gpu_num); + } + std::vector> tasks; + for (size_t i = 0; i < graph_table->shards.size(); ++i) { + tasks.push_back( + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue([&, i, this]() -> int { + if (this->status == GraphSamplerStatus::terminating) return 0; + paddle::framework::GpuPsGraphNode node; + std::vector &v = + this->graph_table->shards[i]->get_bucket(); + size_t ind = i % this->graph_table->task_pool_size_; + for (size_t j = 0; j < v.size(); j++) { + size_t location = v[j]->get_id() % this->gpu_num; + node.node_id = v[j]->get_id(); + node.neighbor_size = v[j]->get_neighbor_size(); + node.neighbor_offset = + (int)sample_neighbors_ex[ind][location].size(); + sample_nodes_ex[ind][location].emplace_back(node); + for (int k = 0; k < node.neighbor_size; k++) + sample_neighbors_ex[ind][location].push_back( + v[j]->get_neighbor_id(k)); + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + tasks.clear(); + for (size_t i = 0; i < gpu_num; i++) { + tasks.push_back( + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue([&, i, this]() -> int { + if (this->status == GraphSamplerStatus::terminating) return 0; + int total_offset = 0; + size_t ind = i % this->graph_table->task_pool_size_; + for (int j = 0; j < this->graph_table->task_pool_size_; j++) { + for (size_t k = 0; k < sample_nodes_ex[j][ind].size(); k++) { + sample_nodes[ind].push_back(sample_nodes_ex[j][ind][k]); + sample_nodes[ind].back().neighbor_offset += total_offset; + } + size_t neighbor_size = sample_neighbors_ex[j][ind].size(); + total_offset += neighbor_size; + for (size_t k = 0; k < neighbor_size; k++) { + sample_neighbors[ind].push_back( + sample_neighbors_ex[j][ind][k]); + } + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + + if (this->status == 
GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + for (size_t i = 0; i < gpu_num; i++) { + sample_res[i].node_list = sample_nodes[i].data(); + sample_res[i].neighbor_list = sample_neighbors[i].data(); + sample_res[i].node_size = sample_nodes[i].size(); + sample_res[i].neighbor_size = sample_neighbors[i].size(); + } + pthread_rwlock_unlock(rw_lock); + if (this->status == GraphSamplerStatus::terminating) { + return 0; + } + callback(sample_res); + return 0; +} +void CompleteGraphSampler::init(size_t gpu_num, GraphTable *graph_table, + std::vector args) { + this->gpu_num = gpu_num; + this->graph_table = graph_table; +} + +int BasicBfsGraphSampler::run_graph_sampling() { + pthread_rwlock_t *rw_lock = graph_table->rw_lock.get(); + pthread_rwlock_rdlock(rw_lock); + while (rounds > 0 && status == GraphSamplerStatus::running) { + for (size_t i = 0; i < sample_neighbors_map.size(); i++) { + sample_neighbors_map[i].clear(); + } + sample_neighbors_map.clear(); + std::vector nodes_left(graph_table->shards.size(), + node_num_for_each_shard); + std::promise prom; + std::future fut = prom.get_future(); + sample_neighbors_map.resize(graph_table->task_pool_size_); + int task_size = 0; + std::vector> tasks; + int init_size = 0; + //__sync_fetch_and_add + std::function bfs = [&, this](int i, int id) -> int { + VLOG(0) << "in bfs " << i << " " << id; + if (this->status == GraphSamplerStatus::terminating) { + int task_left = __sync_sub_and_fetch(&task_size, 1); + if (task_left == 0) { + prom.set_value(0); + } + return 0; + } + size_t ind = i % this->graph_table->task_pool_size_; + if (nodes_left[i] > 0) { + nodes_left[i]--; + auto iter = sample_neighbors_map[ind].find(id); + if (iter == sample_neighbors_map[ind].end()) { + sample_neighbors_map[ind][id] = std::vector(); + iter = sample_neighbors_map[ind].find(id); + Node *node = graph_table->shards[i]->find_node(id); + if (node != NULL) { + size_t edge_fetch_size = + std::min((size_t) this->edge_num_for_each_node, + node->get_neighbor_size()); + for (size_t k = 0; k < edge_fetch_size; k++) { + int64_t neighbor_id = node->get_neighbor_id(k); + int node_location = neighbor_id % this->graph_table->shard_num % + this->graph_table->task_pool_size_; + __sync_add_and_fetch(&task_size, 1); + graph_table->_shards_task_pool[node_location]->enqueue( + bfs, neighbor_id % this->graph_table->shard_num, neighbor_id); + iter->second.push_back(neighbor_id); + } + } + } + } + int task_left = __sync_sub_and_fetch(&task_size, 1); + if (task_left == 0) { + prom.set_value(0); + } + return 0; + }; + for (size_t i = 0; i < graph_table->shards.size(); ++i) { + std::vector &v = graph_table->shards[i]->get_bucket(); + if (v.size() > 0) { + init_size++; + __sync_add_and_fetch(&task_size, 1); + int64_t id = v[0]->get_id(); + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue(bfs, i, id); + } // if + } + if (init_size == 0) { + prom.set_value(0); + } + fut.get(); + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + std::cout << "bfs over" << std::endl; + sample_nodes.clear(); + sample_neighbors.clear(); + sample_res.clear(); + sample_nodes.resize(gpu_num); + sample_neighbors.resize(gpu_num); + sample_res.resize(gpu_num); + std::vector>> + sample_nodes_ex(graph_table->task_pool_size_); + std::vector>> sample_neighbors_ex( + graph_table->task_pool_size_); + for (int i = 0; i < graph_table->task_pool_size_; i++) { + sample_nodes_ex[i].resize(gpu_num); + 
sample_neighbors_ex[i].resize(gpu_num); + } + tasks.clear(); + for (size_t i = 0; i < (size_t)graph_table->task_pool_size_; ++i) { + tasks.push_back( + graph_table->_shards_task_pool[i]->enqueue([&, i, this]() -> int { + if (this->status == GraphSamplerStatus::terminating) { + return 0; + } + paddle::framework::GpuPsGraphNode node; + auto iter = sample_neighbors_map[i].begin(); + size_t ind = i; + for (; iter != sample_neighbors_map[i].end(); iter++) { + size_t location = iter->first % this->gpu_num; + node.node_id = iter->first; + node.neighbor_size = iter->second.size(); + node.neighbor_offset = + (int)sample_neighbors_ex[ind][location].size(); + sample_nodes_ex[ind][location].emplace_back(node); + for (auto k : iter->second) + sample_neighbors_ex[ind][location].push_back(k); + } + return 0; + })); + } + + for (size_t i = 0; i < tasks.size(); i++) { + tasks[i].get(); + sample_neighbors_map[i].clear(); + } + tasks.clear(); + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + for (size_t i = 0; i < gpu_num; i++) { + tasks.push_back( + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue([&, i, this]() -> int { + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + int total_offset = 0; + size_t ind = i % graph_table->task_pool_size_; + for (int j = 0; j < this->graph_table->task_pool_size_; j++) { + for (size_t k = 0; k < sample_nodes_ex[j][ind].size(); k++) { + sample_nodes[i].push_back(sample_nodes_ex[j][ind][k]); + sample_nodes[i].back().neighbor_offset += total_offset; + // neighbor_offset[i].push_back(total_offset + + // neighbor_offset_ex[j][i][k]); + } + size_t neighbor_size = sample_neighbors_ex[j][ind].size(); + total_offset += neighbor_size; + for (size_t k = 0; k < neighbor_size; k++) { + sample_neighbors[ind].push_back( + sample_neighbors_ex[j][ind][k]); + } + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + // int64_t total_neighbors = + // std::accumulate(shard_neighbor_size.begin(),shard_neighbor_size.end(),0); + for (size_t i = 0; i < gpu_num; i++) { + sample_res[i].node_list = sample_nodes[i].data(); + sample_res[i].neighbor_list = sample_neighbors[i].data(); + sample_res[i].node_size = sample_nodes[i].size(); + sample_res[i].neighbor_size = sample_neighbors[i].size(); + } + pthread_rwlock_unlock(rw_lock); + if (this->status == GraphSamplerStatus::terminating) { + return 0; + } + callback(sample_res); + rounds--; + if (rounds > 0) { + for (int i = 0; + i < interval && this->status == GraphSamplerStatus::running; i++) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + } + } + return 0; +} +void BasicBfsGraphSampler::init(size_t gpu_num, GraphTable *graph_table, + std::vector args) { + this->gpu_num = gpu_num; + this->graph_table = graph_table; + node_num_for_each_shard = args.size() > 0 ? std::stoi(args[0]) : 10; + edge_num_for_each_node = args.size() > 1 ? std::stoi(args[1]) : 10; + rounds = args.size() > 2 ? std::stoi(args[2]) : 1; + interval = args.size() > 3 ? 
std::stoi(args[3]) : 60; +} + +#endif + std::vector GraphShard::get_batch(int start, int end, int step) { if (start < 0) start = 0; std::vector res; @@ -38,10 +320,10 @@ std::vector GraphShard::get_batch(int start, int end, int step) { size_t GraphShard::get_size() { return bucket.size(); } -int32_t GraphTable::add_graph_node(std::vector &id_list, +int32_t GraphTable::add_graph_node(std::vector &id_list, std::vector &is_weight_list) { size_t node_size = id_list.size(); - std::vector>> batch(task_pool_size_); + std::vector>> batch(task_pool_size_); for (size_t i = 0; i < node_size; i++) { size_t shard_id = id_list[i] % shard_num; if (shard_id >= shard_end || shard_id < shard_start) { @@ -65,9 +347,9 @@ int32_t GraphTable::add_graph_node(std::vector &id_list, return 0; } -int32_t GraphTable::remove_graph_node(std::vector &id_list) { +int32_t GraphTable::remove_graph_node(std::vector &id_list) { size_t node_size = id_list.size(); - std::vector> batch(task_pool_size_); + std::vector> batch(task_pool_size_); for (size_t i = 0; i < node_size; i++) { size_t shard_id = id_list[i] % shard_num; if (shard_id >= shard_end || shard_id < shard_start) continue; @@ -98,7 +380,7 @@ void GraphShard::clear() { GraphShard::~GraphShard() { clear(); } -void GraphShard::delete_node(uint64_t id) { +void GraphShard::delete_node(int64_t id) { auto iter = node_location.find(id); if (iter == node_location.end()) return; int pos = iter->second; @@ -110,7 +392,7 @@ void GraphShard::delete_node(uint64_t id) { node_location.erase(id); bucket.pop_back(); } -GraphNode *GraphShard::add_graph_node(uint64_t id) { +GraphNode *GraphShard::add_graph_node(int64_t id) { if (node_location.find(id) == node_location.end()) { node_location[id] = bucket.size(); bucket.push_back(new GraphNode(id)); @@ -126,7 +408,7 @@ GraphNode *GraphShard::add_graph_node(Node *node) { } return (GraphNode *)bucket[node_location[id]]; } -FeatureNode *GraphShard::add_feature_node(uint64_t id) { +FeatureNode *GraphShard::add_feature_node(int64_t id) { if (node_location.find(id) == node_location.end()) { node_location[id] = bucket.size(); bucket.push_back(new FeatureNode(id)); @@ -134,11 +416,11 @@ FeatureNode *GraphShard::add_feature_node(uint64_t id) { return (FeatureNode *)bucket[node_location[id]]; } -void GraphShard::add_neighbor(uint64_t id, uint64_t dst_id, float weight) { +void GraphShard::add_neighbor(int64_t id, int64_t dst_id, float weight) { find_node(id)->add_edge(dst_id, weight); } -Node *GraphShard::find_node(uint64_t id) { +Node *GraphShard::find_node(int64_t id) { auto iter = node_location.find(id); return iter == node_location.end() ? 
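BasicBfsGraphSampler::init above reads its knobs positionally from the comma-separated gpups_graph_sample_args string. A sketch of a matching configuration (graph_proto as in the earlier GraphParameter sketch):

// Sketch: configuring the BFS sampler through the proto fields above.
// The comma-separated slots map, in order, onto node_num_for_each_shard,
// edge_num_for_each_node, rounds and interval (defaults 10, 10, 1, 60s).
graph_proto->set_gpups_graph_sample_class("BasicBfsGraphSampler");
graph_proto->set_gpups_graph_sample_args("100,10000,1,1");
// GraphTable::initialize() below splits this string on ',' and passes the
// slices to the sampler's init().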
nullptr : bucket[iter->second]; } @@ -185,14 +467,14 @@ int32_t GraphTable::load(const std::string &path, const std::string ¶m) { } int32_t GraphTable::get_nodes_ids_by_ranges( - std::vector> ranges, std::vector &res) { + std::vector> ranges, std::vector &res) { int start = 0, end, index = 0, total_size = 0; res.clear(); - std::vector>> tasks; + std::vector>> tasks; for (size_t i = 0; i < shards.size() && index < (int)ranges.size(); i++) { end = total_size + shards[i]->get_size(); start = total_size; - while (start < end && index < ranges.size()) { + while (start < end && index < (int)ranges.size()) { if (ranges[index].second <= start) index++; else if (ranges[index].first >= end) { @@ -204,7 +486,7 @@ int32_t GraphTable::get_nodes_ids_by_ranges( first -= total_size; second -= total_size; tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( - [this, first, second, i]() -> std::vector { + [this, first, second, i]() -> std::vector { return shards[i]->get_ids_by_range(first, second); })); } @@ -276,6 +558,9 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { } int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { +#ifdef PADDLE_WITH_HETERPS + if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get()); +#endif auto paths = paddle::string::split_string(path, ";"); int64_t count = 0; std::string sample_type = "random"; @@ -351,6 +636,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { /*----------------------- relocate the duplicate nodes to make them distributed evenly among threads. */ + if (!use_duplicate_nodes) { +#ifdef PADDLE_WITH_HETERPS + if (gpups_mode) pthread_rwlock_unlock(rw_lock.get()); +#endif + + return 0; + } for (auto &shard : extra_shards) { auto bucket = shard->get_bucket(); for (size_t i = 0; i < bucket.size(); i++) { @@ -360,13 +652,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { int size = extra_nodes_to_thread_index.size(); if (size == 0) return 0; std::vector index; - for (int i = 0; i < used.size(); i++) index.push_back(i); + for (int i = 0; i < (int)used.size(); i++) index.push_back(i); sort(index.begin(), index.end(), [&](int &a, int &b) { return used[a] < used[b]; }); std::vector alloc(index.size(), 0), has_alloc(index.size(), 0); int t = 1, aim = 0, mod = 0; - for (; t < used.size(); t++) { + for (; t < (int)used.size(); t++) { if ((used[index[t]] - used[index[t - 1]]) * t >= size) { break; } else { @@ -380,7 +672,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { if (t - x <= mod) alloc[index[x]]++; alloc[index[x]] -= used[index[x]]; } - std::vector vec[index.size()]; + std::vector vec[index.size()]; for (auto p : extra_nodes_to_thread_index) { has_alloc[p.second]++; vec[p.second].push_back(p.first); @@ -395,7 +687,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { has_alloc[index[right]] - alloc[index[right]]); has_alloc[index[left]] += x; has_alloc[index[right]] -= x; - uint64_t id; + int64_t id; while (x--) { id = vec[index[right]].back(); vec[index[right]].pop_back(); @@ -424,10 +716,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { delete extra_shards[i]; extra_shards[i] = extra_shards_copy[i]; } +#ifdef PADDLE_WITH_HETERPS + if (gpups_mode) pthread_rwlock_unlock(rw_lock.get()); +#endif return 0; } -Node *GraphTable::find_node(uint64_t id) { +Node *GraphTable::find_node(int64_t id) { size_t shard_id = id % shard_num; if (shard_id >= shard_end 
|| shard_id < shard_start) { if (use_duplicate_nodes == false || extra_nodes_to_thread_index.size() == 0) @@ -443,7 +738,7 @@ Node *GraphTable::find_node(uint64_t id) { Node *node = shards[index]->find_node(id); return node; } -uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { +uint32_t GraphTable::get_thread_pool_index(int64_t node_id) { if (use_duplicate_nodes == false || extra_nodes_to_thread_index.size() == 0) return node_id % shard_num % shard_num_per_server % task_pool_size_; size_t src_shard_id = node_id % shard_num; @@ -456,8 +751,7 @@ uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { return src_shard_id % shard_num_per_server % task_pool_size_; } -uint32_t GraphTable::get_thread_pool_index_by_shard_index( - uint64_t shard_index) { +uint32_t GraphTable::get_thread_pool_index_by_shard_index(int64_t shard_index) { return shard_index % shard_num_per_server % task_pool_size_; } @@ -484,7 +778,7 @@ int32_t GraphTable::random_sample_nodes(int sample_size, std::unique_ptr &buffer, int &actual_size) { int total_size = 0; - for (int i = 0; i < shards.size(); i++) { + for (int i = 0; i < (int)shards.size(); i++) { total_size += shards[i]->get_size(); } if (sample_size > total_size) sample_size = total_size; @@ -537,16 +831,16 @@ int32_t GraphTable::random_sample_nodes(int sample_size, } } for (auto &pair : first_half) second_half.push_back(pair); - std::vector res; + std::vector res; get_nodes_ids_by_ranges(second_half, res); - actual_size = res.size() * sizeof(uint64_t); + actual_size = res.size() * sizeof(int64_t); buffer.reset(new char[actual_size]); char *pointer = buffer.get(); memcpy(pointer, res.data(), actual_size); return 0; } int32_t GraphTable::random_sample_neighbors( - uint64_t *node_ids, int sample_size, + int64_t *node_ids, int sample_size, std::vector> &buffers, std::vector &actual_sizes, bool need_weight) { size_t node_num = buffers.size(); @@ -560,10 +854,10 @@ int32_t GraphTable::random_sample_neighbors( seq_id[index].emplace_back(idx); id_list[index].emplace_back(node_ids[idx], sample_size, need_weight); } - for (int i = 0; i < seq_id.size(); i++) { + for (int i = 0; i < (int)seq_id.size(); i++) { if (seq_id[i].size() == 0) continue; tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int { - uint64_t node_id; + int64_t node_id; std::vector> r; LRUResponse response = LRUResponse::blocked; if (use_cache) { @@ -576,7 +870,7 @@ int32_t GraphTable::random_sample_neighbors( std::vector sample_keys; auto &rng = _shards_task_rng_pool[i]; for (size_t k = 0; k < id_list[i].size(); k++) { - if (index < r.size() && + if (index < (int)r.size() && r[index].first.node_key == id_list[i][k].node_key) { idx = seq_id[i][k]; actual_sizes[idx] = r[index].second.actual_size; @@ -597,7 +891,7 @@ int32_t GraphTable::random_sample_neighbors( res.size() * (need_weight ? 
(Node::id_size + Node::weight_size) : Node::id_size); int offset = 0; - uint64_t id; + int64_t id; float weight; char *buffer_addr = new char[actual_size]; if (response == LRUResponse::ok) { @@ -632,13 +926,13 @@ int32_t GraphTable::random_sample_neighbors( return 0; } -int32_t GraphTable::get_node_feat(const std::vector &node_ids, +int32_t GraphTable::get_node_feat(const std::vector &node_ids, const std::vector &feature_names, std::vector> &res) { size_t node_num = node_ids.size(); std::vector> tasks; for (size_t idx = 0; idx < node_num; ++idx) { - uint64_t node_id = node_ids[idx]; + int64_t node_id = node_ids[idx]; tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( [&, idx, node_id]() -> int { Node *node = find_node(node_id); @@ -646,7 +940,8 @@ int32_t GraphTable::get_node_feat(const std::vector &node_ids, if (node == nullptr) { return 0; } - for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + for (int feat_idx = 0; feat_idx < (int)feature_names.size(); + ++feat_idx) { const std::string &feature_name = feature_names[feat_idx]; if (feat_id_map.find(feature_name) != feat_id_map.end()) { // res[feat_idx][idx] = @@ -665,19 +960,20 @@ int32_t GraphTable::get_node_feat(const std::vector &node_ids, } int32_t GraphTable::set_node_feat( - const std::vector &node_ids, + const std::vector &node_ids, const std::vector &feature_names, const std::vector> &res) { size_t node_num = node_ids.size(); std::vector> tasks; for (size_t idx = 0; idx < node_num; ++idx) { - uint64_t node_id = node_ids[idx]; + int64_t node_id = node_ids[idx]; tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( [&, idx, node_id]() -> int { size_t index = node_id % this->shard_num - this->shard_start; auto node = shards[index]->add_feature_node(node_id); node->set_feature_size(this->feat_name.size()); - for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + for (int feat_idx = 0; feat_idx < (int)feature_names.size(); + ++feat_idx) { const std::string &feature_name = feature_names[feat_idx]; if (feat_id_map.find(feature_name) != feat_id_map.end()) { node->set_feature(feat_id_map[feature_name], res[feat_idx][idx]); @@ -771,35 +1067,68 @@ int32_t GraphTable::pull_graph_list(int start, int total_size, return 0; } -int32_t GraphTable::get_server_index_by_id(uint64_t id) { +int32_t GraphTable::get_server_index_by_id(int64_t id) { return id % shard_num / shard_num_per_server; } +int32_t GraphTable::initialize(const TableParameter &config, + const FsClientParameter &fs_config) { + LOG(INFO) << "in graphTable initialize"; + _config = config; + if (initialize_accessor() != 0) { + LOG(WARNING) << "Table accessor initialize failed"; + return -1; + } -int32_t GraphTable::initialize() { + if (_afs_client.initialize(fs_config) != 0) { + LOG(WARNING) << "Table fs_client initialize failed"; + // return -1; + } + auto graph = config.graph_parameter(); + shard_num = _config.shard_num(); + LOG(INFO) << "in graphTable initialize over"; + return initialize(graph); +} +int32_t GraphTable::initialize(const GraphParameter &graph) { +#ifdef PADDLE_WITH_HETERPS + if (graph.gpups_mode()) { + gpups_mode = true; + if (shard_num == 0) { + shard_num = graph.gpups_mode_shard_num(); + server_num = 1; + _shard_idx = 0; + } + auto *sampler = + CREATE_PSCORE_CLASS(GraphSampler, graph.gpups_graph_sample_class()); + auto slices = + string::split_string(graph.gpups_graph_sample_args(), ","); + std::cout << "slices" << std::endl; + for (auto x : slices) std::cout << x << std::endl; + 
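Each per-node buffer produced by random_sample_neighbors packs ids back-to-back, each optionally followed by a float weight, which is exactly what the actual_size formula above accounts for. A self-contained sketch of the matching decoder (id_size/weight_size mirror Node::id_size and Node::weight_size):

#include <cstdint>
#include <cstring>
#include <vector>

// Decode one neighbor buffer written by random_sample_neighbors() above.
void decode_neighbors(const char *buf, int actual_size, bool need_weight,
                      std::vector<int64_t> *ids, std::vector<float> *weights,
                      size_t id_size = sizeof(int64_t),
                      size_t weight_size = sizeof(float)) {
  int offset = 0;
  while (offset < actual_size) {
    int64_t id;
    std::memcpy(&id, buf + offset, id_size);
    ids->push_back(id);
    offset += id_size;
    if (need_weight) {
      float w;
      std::memcpy(&w, buf + offset, weight_size);
      weights->push_back(w);
      offset += weight_size;
    }
  }
}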
sampler->init(graph.gpu_num(), this, slices); + graph_sampler.reset(sampler); + } +#endif + task_pool_size_ = graph.task_pool_size(); _shards_task_pool.resize(task_pool_size_); for (size_t i = 0; i < _shards_task_pool.size(); ++i) { _shards_task_pool[i].reset(new ::ThreadPool(1)); _shards_task_rng_pool.push_back(paddle::framework::GetCPURandomEngine(0)); } - server_num = _shard_num; - // VLOG(0) << "in init graph table server num = " << server_num; - /* - _shard_num is actually server number here - when a server initialize its tables, it sets tables' _shard_num to server_num, - and _shard_idx to server - rank - */ - auto common = _config.common(); - - this->table_name = common.table_name(); - this->table_type = common.name(); + auto graph_feature = graph.graph_feature(); + // this->table_name = common.table_name(); + // this->table_type = common.name(); + this->table_name = graph.table_name(); + this->table_type = graph.table_type(); VLOG(0) << " init graph table type " << this->table_type << " table name " << this->table_name; - int feat_conf_size = static_cast<int>(common.attributes().size()); + // int feat_conf_size = static_cast<int>(common.attributes().size()); + int feat_conf_size = static_cast<int>(graph_feature.name().size()); for (int i = 0; i < feat_conf_size; i++) { - auto &f_name = common.attributes()[i]; - auto &f_shape = common.dims()[i]; - auto &f_dtype = common.params()[i]; + // auto &f_name = common.attributes()[i]; + // auto &f_shape = common.dims()[i]; + // auto &f_dtype = common.params()[i]; + auto &f_name = graph_feature.name()[i]; + auto &f_shape = graph_feature.shape()[i]; + auto &f_dtype = graph_feature.dtype()[i]; this->feat_name.push_back(f_name); this->feat_shape.push_back(f_shape); this->feat_dtype.push_back(f_dtype); @@ -807,8 +1136,6 @@ int32_t GraphTable::initialize() { VLOG(0) << "init graph table feat conf name:" << f_name << " shape:" << f_shape << " dtype:" << f_dtype; } - - shard_num = _config.shard_num(); VLOG(0) << "in init graph table shard num = " << shard_num << " shard_idx" << _shard_idx; shard_num_per_server = sparse_local_shard_num(shard_num, server_num); @@ -826,5 +1153,6 @@ int32_t GraphTable::initialize() { return 0; } + } // namespace distributed }; // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h index c76a62248c8fc..f6f127621b947 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.h +++ b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -38,10 +38,14 @@ #include <vector> #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/common_table.h" +#include "paddle/fluid/distributed/ps/table/graph/class_macro.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/phi/core/utils/rw_lock.h" +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" +#endif namespace paddle { namespace distributed { class GraphShard { @@ -51,37 +55,37 @@ class GraphShard { ~GraphShard(); std::vector<Node *> &get_bucket() { return bucket; } std::vector<Node *> get_batch(int start, int end, int step); - std::vector<uint64_t> get_ids_by_range(int start, int end) { - std::vector<uint64_t> res; + std::vector<int64_t> get_ids_by_range(int start, int end) { + std::vector<int64_t> res; for (int i = start; i < end && i < (int)bucket.size(); i++) { res.push_back(bucket[i]->get_id()); } return res; } - GraphNode *add_graph_node(uint64_t id); + GraphNode *add_graph_node(int64_t id); 
GraphNode *add_graph_node(Node *node); - FeatureNode *add_feature_node(uint64_t id); - Node *find_node(uint64_t id); - void delete_node(uint64_t id); + FeatureNode *add_feature_node(int64_t id); + Node *find_node(int64_t id); + void delete_node(int64_t id); void clear(); - void add_neighbor(uint64_t id, uint64_t dst_id, float weight); - std::unordered_map<uint64_t, int> &get_node_location() { + void add_neighbor(int64_t id, int64_t dst_id, float weight); + std::unordered_map<int64_t, int> &get_node_location() { return node_location; } private: - std::unordered_map<uint64_t, int> node_location; + std::unordered_map<int64_t, int> node_location; std::vector<Node *> bucket; }; enum LRUResponse { ok = 0, blocked = 1, err = 2 }; struct SampleKey { - uint64_t node_key; + int64_t node_key; size_t sample_size; bool is_weighted; - SampleKey(uint64_t _node_key, size_t _sample_size, bool _is_weighted) + SampleKey(int64_t _node_key, size_t _sample_size, bool _is_weighted) : node_key(_node_key), sample_size(_sample_size), is_weighted(_is_weighted) {} @@ -300,7 +304,7 @@ class ScaledLRU { node_size += lru_pool[i].node_size - lru_pool[i].remove_count; } - if (node_size <= size_t(1.1 * size_limit) + 1) return 0; + if ((size_t)node_size <= size_t(1.1 * size_limit) + 1) return 0; if (pthread_rwlock_wrlock(&rwlock) == 0) { // VLOG(0)<<"in shrink\n"; global_count = 0; @@ -308,9 +312,9 @@ class ScaledLRU { global_count += lru_pool[i].node_size - lru_pool[i].remove_count; } // VLOG(0)<<"global_count "<<global_count<<"\n"; - if (global_count > size_limit) { + if ((size_t)global_count > size_limit) { size_t remove = global_count - size_limit; - for (int i = 0; i < lru_pool.size(); i++) { + for (size_t i = 0; i < lru_pool.size(); i++) { lru_pool[i].total_diff = 0; lru_pool[i].remove_count += 1.0 * (lru_pool[i].node_size - lru_pool[i].remove_count) / @@ -352,9 +356,69 @@ class ScaledLRU { friend class RandomSampleLRU<K, V>; }; +#ifdef PADDLE_WITH_HETERPS +enum GraphSamplerStatus { waiting = 0, running = 1, terminating = 2 }; +class GraphTable; +class GraphSampler { + public: + GraphSampler() { + status = GraphSamplerStatus::waiting; + thread_pool.reset(new ::ThreadPool(1)); + callback = [](std::vector<paddle::framework::GpuPsCommGraph> &res) { + return; + }; + } + virtual int run_graph_sampling() = 0; + virtual int start_graph_sampling() { + if (status != GraphSamplerStatus::waiting) { + return -1; + } + std::promise<int> prom; + std::future<int> fut = prom.get_future(); + graph_sample_task_over = thread_pool->enqueue([&prom, this]() { + prom.set_value(0); + status = GraphSamplerStatus::running; + return run_graph_sampling(); + }); + return fut.get(); + } + virtual void init(size_t gpu_num, GraphTable *graph_table, + std::vector<std::string> args) = 0; + virtual void set_graph_sample_callback( + std::function<void(std::vector<paddle::framework::GpuPsCommGraph> &)> + callback) { + this->callback = callback; + } + + virtual int end_graph_sampling() { + if (status == GraphSamplerStatus::running) { + status = GraphSamplerStatus::terminating; + return graph_sample_task_over.get(); + } + return -1; + } + virtual GraphSamplerStatus get_graph_sampler_status() { return status; } + + protected: + std::function<void(std::vector<paddle::framework::GpuPsCommGraph> &)> + callback; + std::shared_ptr<::ThreadPool> thread_pool; + GraphSamplerStatus status; + std::future<int> graph_sample_task_over; + std::vector<paddle::framework::GpuPsCommGraph> sample_res; +}; +#endif + class GraphTable : public SparseTable { public: - GraphTable() { use_cache = false; } + GraphTable() { + use_cache = false; + shard_num = 0; +#ifdef PADDLE_WITH_HETERPS + gpups_mode = false; +#endif + rw_lock.reset(new pthread_rwlock_t()); + } virtual ~GraphTable(); virtual int32_t pull_graph_list(int start, int size, std::unique_ptr<char[]> &buffer, @@ -362,7 +426,7 @@ class 
GraphTable : public SparseTable { int step); virtual int32_t random_sample_neighbors( - uint64_t *node_ids, int sample_size, + int64_t *node_ids, int sample_size, std::vector<std::shared_ptr<char>> &buffers, std::vector<int> &actual_sizes, bool need_weight); @@ -370,9 +434,11 @@ class GraphTable : public SparseTable { int &actual_sizes); virtual int32_t get_nodes_ids_by_ranges( - std::vector<std::pair<int, int>> ranges, std::vector<uint64_t> &res); - virtual int32_t initialize(); - + std::vector<std::pair<int, int>> ranges, std::vector<int64_t> &res); + virtual int32_t initialize() { return 0; } + virtual int32_t initialize(const TableParameter &config, + const FsClientParameter &fs_config); + virtual int32_t initialize(const GraphParameter &config); int32_t load(const std::string &path, const std::string &param); int32_t load_graph_split_config(const std::string &path); @@ -380,13 +446,16 @@ class GraphTable : public SparseTable { int32_t load_nodes(const std::string &path, std::string node_type); - int32_t add_graph_node(std::vector<uint64_t> &id_list, + int32_t add_graph_node(std::vector<int64_t> &id_list, std::vector<bool> &is_weight_list); - int32_t remove_graph_node(std::vector<uint64_t> &id_list); + int32_t remove_graph_node(std::vector<int64_t> &id_list); + + int32_t get_server_index_by_id(int64_t id); + Node *find_node(int64_t id); - int32_t get_server_index_by_id(uint64_t id); - Node *find_node(uint64_t id); + virtual int32_t Pull(TableContext &context) { return 0; } + virtual int32_t Push(TableContext &context) { return 0; } virtual int32_t pull_sparse(float *values, const PullSparseValue &pull_value) { @@ -407,16 +476,27 @@ class GraphTable : public SparseTable { return 0; } virtual int32_t initialize_shard() { return 0; } - virtual uint32_t get_thread_pool_index_by_shard_index(uint64_t shard_index); - virtual uint32_t get_thread_pool_index(uint64_t node_id); + virtual int32_t set_shard(size_t shard_idx, size_t server_num) { + _shard_idx = shard_idx; + /* + _shard_num is not used in graph_table; the assignment below is kept only + for compatibility with the base class table. 
+ */ + _shard_num = server_num; + this->server_num = server_num; + return 0; + } + virtual uint32_t get_thread_pool_index_by_shard_index(int64_t shard_index); + virtual uint32_t get_thread_pool_index(int64_t node_id); virtual std::pair<int32_t, std::string> parse_feature(std::string feat_str); - virtual int32_t get_node_feat(const std::vector<uint64_t> &node_ids, + virtual int32_t get_node_feat(const std::vector<int64_t> &node_ids, const std::vector<std::string> &feature_names, std::vector<std::vector<std::string>> &res); virtual int32_t set_node_feat( - const std::vector<uint64_t> &node_ids, + const std::vector<int64_t> &node_ids, const std::vector<std::string> &feature_names, const std::vector<std::vector<std::string>> &res); @@ -433,11 +513,25 @@ class GraphTable : public SparseTable { } return 0; } - +#ifdef PADDLE_WITH_HETERPS + virtual int32_t start_graph_sampling() { + return this->graph_sampler->start_graph_sampling(); + } + virtual int32_t end_graph_sampling() { + return this->graph_sampler->end_graph_sampling(); + } + virtual int32_t set_graph_sample_callback( + std::function<void(std::vector<paddle::framework::GpuPsCommGraph> &)> + callback) { + graph_sampler->set_graph_sample_callback(callback); + return 0; + } +// virtual GraphSampler *get_graph_sampler() { return graph_sampler.get(); } +#endif protected: std::vector<GraphShard *> shards, extra_shards; size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num; - const int task_pool_size_ = 24; + int task_pool_size_ = 24; const int random_sample_nodes_ranges = 3; std::vector<std::string> feat_name; @@ -450,11 +544,61 @@ class GraphTable : public SparseTable { std::vector<std::shared_ptr<::ThreadPool>> _shards_task_pool; std::vector<std::shared_ptr<std::mt19937_64>> _shards_task_rng_pool; std::shared_ptr<ScaledLRU<SampleKey, SampleResult>> scaled_lru; - std::unordered_set<uint64_t> extra_nodes; - std::unordered_map<uint64_t, int> extra_nodes_to_thread_index; + std::unordered_set<int64_t> extra_nodes; + std::unordered_map<int64_t, int> extra_nodes_to_thread_index; bool use_cache, use_duplicate_nodes; mutable std::mutex mutex_; + std::shared_ptr<pthread_rwlock_t> rw_lock; +#ifdef PADDLE_WITH_HETERPS + // paddle::framework::GpuPsGraphTable gpu_graph_table; + bool gpups_mode; + // std::shared_ptr<::ThreadPool> graph_sample_pool; + std::shared_ptr<GraphSampler> graph_sampler; + REGISTER_GRAPH_FRIEND_CLASS(2, CompleteGraphSampler, BasicBfsGraphSampler) +#endif +}; + +#ifdef PADDLE_WITH_HETERPS +REGISTER_PSCORE_REGISTERER(GraphSampler); +class CompleteGraphSampler : public GraphSampler { + public: + CompleteGraphSampler() {} + ~CompleteGraphSampler() {} + // virtual pthread_rwlock_t *export_rw_lock(); + virtual int run_graph_sampling(); + virtual void init(size_t gpu_num, GraphTable *graph_table, + std::vector<std::string> args_); + + protected: + GraphTable *graph_table; + std::vector<std::vector<paddle::framework::GpuPsGraphNode>> sample_nodes; + std::vector<std::vector<int64_t>> sample_neighbors; + // std::vector<GpuPsCommGraph> sample_res; + // std::shared_ptr<std::mt19937_64> random; + int gpu_num; +}; + +class BasicBfsGraphSampler : public GraphSampler { + public: + BasicBfsGraphSampler() {} + ~BasicBfsGraphSampler() {} + // virtual pthread_rwlock_t *export_rw_lock(); + virtual int run_graph_sampling(); + virtual void init(size_t gpu_num, GraphTable *graph_table, + std::vector<std::string> args_); + + protected: + GraphTable *graph_table; + // std::vector<std::vector<GpuPsGraphNode>> sample_nodes; + std::vector<std::vector<paddle::framework::GpuPsGraphNode>> sample_nodes; + std::vector<std::vector<int64_t>> sample_neighbors; + size_t gpu_num; + int node_num_for_each_shard, edge_num_for_each_node; + int rounds, interval; + std::vector<std::unordered_map<int64_t, std::vector<int64_t>>> + sample_neighbors_map; }; +#endif } // namespace distributed }; // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/common_sparse_table.cc b/paddle/fluid/distributed/ps/table/common_sparse_table.cc index b44d08b937a96..45be53335e1a1 100644 --- a/paddle/fluid/distributed/ps/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/common_sparse_table.cc @@ 
-355,6 +355,32 @@ int32_t CommonSparseTable::pour() { return 0; } +int32_t CommonSparseTable::Pull(TableContext& context) { + CHECK(context.value_type == Sparse); + if (context.use_ptr) { + char** pull_values = context.pull_context.ptr_values; + const uint64_t* keys = context.pull_context.keys; + return pull_sparse_ptr(pull_values, keys, context.num); + } else { + float* pull_values = context.pull_context.values; + const PullSparseValue& pull_value = context.pull_context.pull_value; + return pull_sparse(pull_values, pull_value); + } +} + +int32_t CommonSparseTable::Push(TableContext& context) { + CHECK(context.value_type == Sparse); + if (context.push_context.values != nullptr) { + const float* values = context.push_context.values; + const uint64_t* keys = context.push_context.keys; + return push_sparse(keys, values, context.num); + } else { + const float** values = context.push_context.ptr_values; + const uint64_t* keys = context.push_context.keys; + return push_sparse(keys, values, context.num); + } +} + int32_t CommonSparseTable::pull_sparse(float* pull_values, const PullSparseValue& pull_value) { auto shard_num = task_pool_size_; diff --git a/paddle/fluid/distributed/ps/table/common_sparse_table.h b/paddle/fluid/distributed/ps/table/common_sparse_table.h index 82481dcd584e4..138c544742066 100644 --- a/paddle/fluid/distributed/ps/table/common_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/common_sparse_table.h @@ -121,6 +121,9 @@ class CommonSparseTable : public SparseTable { virtual int32_t push_dense(const float* values, size_t num) { return 0; } // unused method end + virtual int32_t Pull(TableContext& context); + virtual int32_t Push(TableContext& context); + virtual int32_t initialize(); virtual int32_t initialize_shard() { return 0; } virtual int32_t initialize_value(); diff --git a/paddle/fluid/distributed/ps/table/common_table.h b/paddle/fluid/distributed/ps/table/common_table.h index bac826dfe0e20..3d291c0152246 100644 --- a/paddle/fluid/distributed/ps/table/common_table.h +++ b/paddle/fluid/distributed/ps/table/common_table.h @@ -119,6 +119,9 @@ class BarrierTable : public Table { virtual void *get_shard(size_t shard_idx) { return 0; } + virtual int32_t Pull(TableContext &context) { return 0; } + virtual int32_t Push(TableContext &context) { return 0; } + int32_t pull_dense(float *values, size_t num) override { return 0; } int32_t push_dense(const float *values, size_t num) override { return 0; } diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_accessor.cc index 866bd8114ccea..43e143dca901b 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.cc @@ -38,6 +38,16 @@ int CtrCommonAccessor::initialize() { return 0; } +void CtrCommonAccessor::GetTableInfo(AccessorInfo& info) { + info.dim = dim(); + info.size = size(); + info.select_dim = select_dim(); + info.select_size = select_size(); + info.update_dim = update_dim(); + info.update_size = update_size(); + info.fea_dim = fea_dim(); +} + size_t CtrCommonAccessor::dim() { return common_feature_value.dim(); } size_t CtrCommonAccessor::dim_size(size_t dim) { diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.h b/paddle/fluid/distributed/ps/table/ctr_accessor.h index 1e31fec04649b..bc46217955a8a 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.h @@ -126,6 +126,7 @@ class CtrCommonAccessor : public ValueAccessor { virtual int initialize(); 
virtual ~CtrCommonAccessor() {} + virtual void GetTableInfo(AccessorInfo& info); // value dimension virtual size_t dim(); // size of each dimension of value diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc index b07bcf70ad7af..bccf1fdebafa0 100644 --- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc @@ -37,6 +37,16 @@ int DownpourCtrDoubleAccessor::initialize() { return 0; } +void DownpourCtrDoubleAccessor::GetTableInfo(AccessorInfo& info) { + info.dim = dim(); + info.size = size(); + info.select_dim = select_dim(); + info.select_size = select_size(); + info.update_dim = update_dim(); + info.update_size = update_size(); + info.fea_dim = fea_dim(); +} + size_t DownpourCtrDoubleAccessor::dim() { auto embedx_dim = _config.embedx_dim(); return DownpourCtrDoubleFeatureValue::dim(embedx_dim); diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h index d7c717ace0988..d7942634e8600 100644 --- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h @@ -168,6 +168,7 @@ class DownpourCtrDoubleAccessor : public ValueAccessor { DownpourCtrDoubleAccessor() {} virtual ~DownpourCtrDoubleAccessor() {} virtual int initialize(); + virtual void GetTableInfo(AccessorInfo& info); // value dimension virtual size_t dim(); // size of each dimension of value diff --git a/paddle/fluid/distributed/ps/table/depends/sparse_utils.h b/paddle/fluid/distributed/ps/table/depends/sparse_utils.h index 708f7786bf3b0..98e0250acc4d6 100644 --- a/paddle/fluid/distributed/ps/table/depends/sparse_utils.h +++ b/paddle/fluid/distributed/ps/table/depends/sparse_utils.h @@ -58,7 +58,7 @@ struct PullSparseValue { std::vector<int>* offset_shard) const { offset_shard->reserve(numel_ / shard_num + 1); for (int x = 0; x < numel_; ++x) { - if (feasigns_[x] % shard_num == shard_id) { + if (int(feasigns_[x] % shard_num) == shard_id) { offset_shard->push_back(x); } } diff --git a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc index 5f22c3a436f1f..e8ca7430351de 100644 --- a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc +++ b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc @@ -37,6 +37,16 @@ int DownpourCtrAccessor::initialize() { return 0; } +void DownpourCtrAccessor::GetTableInfo(AccessorInfo& info) { + info.dim = dim(); + info.size = size(); + info.select_dim = select_dim(); + info.select_size = select_size(); + info.update_dim = update_dim(); + info.update_size = update_size(); + info.fea_dim = fea_dim(); +} + size_t DownpourCtrAccessor::dim() { auto embedx_dim = _config.embedx_dim(); return DownpourCtrFeatureValue::dim(embedx_dim); } diff --git a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h index 5de7b12e01f0d..11991ad044ff6 100644 --- a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h +++ b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h @@ -160,6 +160,7 @@ class DownpourCtrAccessor : public ValueAccessor { virtual ~DownpourCtrAccessor() {} virtual int initialize(); + virtual void GetTableInfo(AccessorInfo& info); // value dimension virtual size_t dim(); // size of each dimension of value diff --git a/paddle/fluid/distributed/ps/table/graph/class_macro.h b/paddle/fluid/distributed/ps/table/graph/class_macro.h new 
file mode 100644 index 0000000000000..bf59dbacb2537 --- /dev/null +++ b/paddle/fluid/distributed/ps/table/graph/class_macro.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#define DECLARE_GRAPH_FRIEND_CLASS(a) friend class a; +#define DECLARE_1_FRIEND_CLASS(a, ...) DECLARE_GRAPH_FRIEND_CLASS(a) +#define DECLARE_2_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_1_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_3_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_2_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_4_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_3_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_5_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_4_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_6_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_5_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_7_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_6_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_8_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_7_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_9_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_8_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_10_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_9_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_11_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_10_FRIEND_CLASS(__VA_ARGS__) +#define REGISTER_GRAPH_FRIEND_CLASS(n, ...) \ + DECLARE_##n##_FRIEND_CLASS(__VA_ARGS__)
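A note on the new macro header above: REGISTER_GRAPH_FRIEND_CLASS(n, ...) pastes the argument count n into the matching DECLARE_n_FRIEND_CLASS, which unrolls recursively into one friend declaration per argument. As a minimal sketch, the invocation added to common_graph_table.h earlier in this diff,

  REGISTER_GRAPH_FRIEND_CLASS(2, CompleteGraphSampler, BasicBfsGraphSampler)

preprocesses to

  friend class CompleteGraphSampler; friend class BasicBfsGraphSampler;

inside the enclosing class body, giving the samplers access to GraphTable internals without widening its public interface.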
diff --git a/paddle/fluid/distributed/ps/table/graph/graph_edge.cc b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc index d1961b655d882..004a536e8e56c 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_edge.cc +++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc @@ -17,11 +17,11 @@ namespace paddle { namespace distributed { -void GraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { +void GraphEdgeBlob::add_edge(int64_t id, float weight = 1) { id_arr.push_back(id); } -void WeightedGraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { +void WeightedGraphEdgeBlob::add_edge(int64_t id, float weight = 1) { id_arr.push_back(id); weight_arr.push_back(weight); } diff --git a/paddle/fluid/distributed/ps/table/graph/graph_edge.h b/paddle/fluid/distributed/ps/table/graph/graph_edge.h index 3dfe5a6f357a7..5fc785fe25682 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_edge.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.h @@ -24,19 +24,20 @@ class GraphEdgeBlob { GraphEdgeBlob() {} virtual ~GraphEdgeBlob() {} size_t size() { return id_arr.size(); } - virtual void add_edge(uint64_t id, float weight); - uint64_t get_id(int idx) { return id_arr[idx]; } + virtual void add_edge(int64_t id, float weight); + int64_t get_id(int idx) { return id_arr[idx]; } virtual float get_weight(int idx) { return 1; } + std::vector<int64_t>& export_id_array() { return id_arr; } protected: - std::vector<uint64_t> id_arr; + std::vector<int64_t> id_arr; }; class WeightedGraphEdgeBlob : public GraphEdgeBlob { public: WeightedGraphEdgeBlob() {} virtual ~WeightedGraphEdgeBlob() {} - virtual void add_edge(uint64_t id, float weight); + virtual void add_edge(int64_t id, float weight); virtual float get_weight(int idx) { return weight_arr[idx]; } protected: diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.h b/paddle/fluid/distributed/ps/table/graph/graph_node.h index b838c2c1258d8..c6c594036d4fc 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_node.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.h @@ -48,6 +48,7 @@ class Node { virtual void set_feature(int idx, std::string str) {} virtual void set_feature_size(int size) {} virtual int get_feature_size() { return 0; } + virtual size_t get_neighbor_size() { return 0; } protected: uint64_t id; @@ -70,6 +71,7 @@ class GraphNode : public Node { } virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); } virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); } + virtual size_t get_neighbor_size() { return edges->size(); } protected: Sampler *sampler; diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h index 89c4fc15ae279..3b43f99543fdd 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h +++ b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h @@ -48,6 +48,8 @@ class MemorySparseGeoTable : public SparseTable { virtual int32_t save(const std::string& path, const std::string& param) { return 0; } + virtual int32_t Pull(TableContext& context) { return 0; } + virtual int32_t Push(TableContext& context) { return 0; } virtual int32_t flush() { return 0; } virtual int32_t shrink(const std::string& param) { return 0; } virtual void clear() { return; } diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc index 7ce6e9005cf56..98454ca747d31 100644 --- 
a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc @@ -390,6 +390,26 @@ std::pair<int64_t, int64_t> MemorySparseTable::print_table_stat() { return {feasign_size, mf_size}; } +int32_t MemorySparseTable::Pull(TableContext& context) { + CHECK(context.value_type == Sparse); + if (context.use_ptr) { + char** pull_values = context.pull_context.ptr_values; + const uint64_t* keys = context.pull_context.keys; + return pull_sparse_ptr(pull_values, keys, context.num); + } else { + float* pull_values = context.pull_context.values; + const PullSparseValue& pull_value = context.pull_context.pull_value; + return pull_sparse(pull_values, pull_value); + } +} + +int32_t MemorySparseTable::Push(TableContext& context) { + CHECK(context.value_type == Sparse); + + const uint64_t* keys = context.push_context.keys; + return push_sparse(keys, context.push_context.ptr_values, context.num); +} + int32_t MemorySparseTable::pull_sparse(float* pull_values, const PullSparseValue& pull_value) { CostTimer timer("pserver_sparse_select_all"); diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_table.h index 5770f25f8f41d..d26c67319760d 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.h @@ -48,6 +48,9 @@ class MemorySparseTable : public SparseTable { virtual int32_t push_dense(const float* values, size_t num) { return 0; } // unused method end + virtual int32_t Pull(TableContext& context); + virtual int32_t Push(TableContext& context); + virtual int32_t initialize(); virtual int32_t initialize_shard() { return 0; } virtual int32_t initialize_value(); diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc index 60514b4e19ffa..5bc58bc5a1108 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc @@ -61,6 +61,21 @@ int32_t SSDSparseTable::initialize() { return 0; } +int32_t SSDSparseTable::Pull(TableContext& context) { + CHECK(context.value_type == Sparse); + if (context.use_ptr) { + char** pull_values = context.pull_context.ptr_values; + const uint64_t* keys = context.pull_context.keys; + return pull_sparse_ptr(pull_values, keys, context.num); + } else { + float* pull_values = context.pull_context.values; + const PullSparseValue& pull_value = context.pull_context.pull_value; + return pull_sparse(pull_values, pull_value); + } +} + +int32_t SSDSparseTable::Push(TableContext& context) { return 0; } + int32_t SSDSparseTable::pull_sparse(float* pull_values, const PullSparseValue& pull_value) { auto shard_num = task_pool_size_; diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h index f5e8a7067e0e0..3a703d7d966d3 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h @@ -42,6 +42,9 @@ class SSDSparseTable : public CommonSparseTable { // exchange data virtual int32_t update_table(); + virtual int32_t Pull(TableContext& context); + virtual int32_t Push(TableContext& context); + virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value); virtual int32_t pull_sparse_ptr(char** pull_values, const uint64_t* keys, diff --git a/paddle/fluid/distributed/ps/table/table.cc b/paddle/fluid/distributed/ps/table/table.cc index fa8169da07ab7..fc2ea56e95d77 
100644 --- a/paddle/fluid/distributed/ps/table/table.cc +++ b/paddle/fluid/distributed/ps/table/table.cc @@ -37,6 +37,8 @@ REGISTER_PSCORE_CLASS(Table, CommonDenseTable); REGISTER_PSCORE_CLASS(Table, CommonSparseTable); #ifdef PADDLE_WITH_HETERPS REGISTER_PSCORE_CLASS(Table, SSDSparseTable); +REGISTER_PSCORE_CLASS(GraphSampler, CompleteGraphSampler); +REGISTER_PSCORE_CLASS(GraphSampler, BasicBfsGraphSampler); #endif REGISTER_PSCORE_CLASS(Table, SparseGeoTable); REGISTER_PSCORE_CLASS(Table, BarrierTable); diff --git a/paddle/fluid/distributed/ps/table/table.h b/paddle/fluid/distributed/ps/table/table.h index da1bb668ccfa3..2bd2a42b6c58f 100644 --- a/paddle/fluid/distributed/ps/table/table.h +++ b/paddle/fluid/distributed/ps/table/table.h @@ -32,6 +32,30 @@ namespace paddle { namespace distributed { + +enum ValueType { Sparse = 0, Dense = 1 }; + +struct PullContext { + const uint64_t *keys; + const PullSparseValue pull_value; + float *values; + char **ptr_values; +}; + +struct TablePushContext { + const uint64_t *keys; + const float *values; + const float **ptr_values; +}; + +struct TableContext { + ValueType value_type; + PullContext pull_context; + TablePushContext push_context; + size_t num; + bool use_ptr; +}; + class Table { public: Table() {} @@ -39,6 +63,8 @@ class Table { virtual int32_t initialize(const TableParameter &config, const FsClientParameter &fs_config); + virtual int32_t Pull(TableContext &context) = 0; + virtual int32_t Push(TableContext &context) = 0; virtual int32_t pull_dense(float *values, size_t num) = 0; virtual int32_t push_dense(const float *values, size_t num) = 0; // for push global_step diff --git a/paddle/fluid/distributed/ps/table/tensor_accessor.cc b/paddle/fluid/distributed/ps/table/tensor_accessor.cc index 70a580c1e53a9..8c5349bff832c 100644 --- a/paddle/fluid/distributed/ps/table/tensor_accessor.cc +++ b/paddle/fluid/distributed/ps/table/tensor_accessor.cc @@ -20,6 +20,16 @@ namespace distributed { int CommMergeAccessor::initialize() { return 0; } +void CommMergeAccessor::GetTableInfo(AccessorInfo &info) { + info.dim = dim(); + info.size = size(); + info.select_dim = select_dim(); + info.select_size = select_size(); + info.update_dim = update_dim(); + info.update_size = update_size(); + info.fea_dim = fea_dim(); +} + // value dimension size_t CommMergeAccessor::dim() { return 0; } diff --git a/paddle/fluid/distributed/ps/table/tensor_accessor.h b/paddle/fluid/distributed/ps/table/tensor_accessor.h index 5041b8fdf8733..1873b743b44ec 100644 --- a/paddle/fluid/distributed/ps/table/tensor_accessor.h +++ b/paddle/fluid/distributed/ps/table/tensor_accessor.h @@ -30,6 +30,7 @@ class CommMergeAccessor : public ValueAccessor { CommMergeAccessor() {} virtual ~CommMergeAccessor() {} virtual int initialize(); + virtual void GetTableInfo(AccessorInfo &info); // value dimension virtual size_t dim(); // size of each dimension of value diff --git a/paddle/fluid/distributed/ps/table/tensor_table.h b/paddle/fluid/distributed/ps/table/tensor_table.h index 64d81327acc55..23a62365c0f5a 100644 --- a/paddle/fluid/distributed/ps/table/tensor_table.h +++ b/paddle/fluid/distributed/ps/table/tensor_table.h @@ -48,6 +48,8 @@ class TensorTable : public Table { TensorTable() {} virtual ~TensorTable() {} + virtual int32_t Pull(TableContext &context) { return 0; } + virtual int32_t Push(TableContext &context) { return 0; } int32_t pull_dense(float *values, size_t num) override { return 0; } int32_t push_dense(const float *values, size_t num) override { return 0; }
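For context on the REGISTER_PSCORE_CLASS lines in table.cc above: registration plus CREATE_PSCORE_CLASS form a string-keyed factory, which is how GraphTable::initialize(const GraphParameter &) earlier in this diff turns the gpups_graph_sample_class config string into a sampler instance. A minimal sketch of the pattern; the error handling here is illustrative, not part of this patch:

  // at namespace scope, usually next to the other registrations in table.cc:
  REGISTER_PSCORE_CLASS(GraphSampler, BasicBfsGraphSampler);
  // at runtime, instantiate whichever subclass the proto config names:
  GraphSampler *sampler =
      CREATE_PSCORE_CLASS(GraphSampler, graph.gpups_graph_sample_class());
  if (sampler == nullptr) {
    // the config named a sampler class that was never registered (assumed
    // failure mode; the patch itself does not check this)
  }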
diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.cc b/paddle/fluid/distributed/ps/wrapper/fleet.cc index 0588dbdf0fc61..c887cfeb71eef 100644 --- a/paddle/fluid/distributed/ps/wrapper/fleet.cc +++ b/paddle/fluid/distributed/ps/wrapper/fleet.cc @@ -30,6 +30,32 @@ bool FleetWrapper::is_initialized_ = false; std::shared_ptr<paddle::distributed::PSCore> FleetWrapper::pserver_ptr_ = NULL; +void FleetWrapper::Stop() { StopServer(); } + +void FleetWrapper::Load(WrapperContext& context) { + int table_id = static_cast<int>(context.table_id); + if (table_id >= 0 && context.meta != "") { + LoadSparseOnServer(context.path, context.meta, context.table_id); + return; + } + if (table_id < 0) { // load all + LoadModel(context.path, context.mode); + } else { // load one table + LoadModelOneTable(table_id, context.path, context.mode); + } + return; +} + +void FleetWrapper::Save(WrapperContext& context) { + int table_id = static_cast<int>(context.table_id); + if (table_id < 0) { + SaveModel(context.path, context.mode); + } else { + SaveModelOneTable(table_id, context.path, context.mode); + } + return; +} + void FleetWrapper::SetClient2ClientConfig(int request_timeout_ms, int connect_timeout_ms, int max_retry) { diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.h b/paddle/fluid/distributed/ps/wrapper/fleet.h index a535b8c5bf8f9..d68c453c6d51b 100644 --- a/paddle/fluid/distributed/ps/wrapper/fleet.h +++ b/paddle/fluid/distributed/ps/wrapper/fleet.h @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h" #include "paddle/fluid/distributed/ps/service/ps_service/service.h" +#include "paddle/fluid/distributed/ps/wrapper/ps_wrapper.h" #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/framework/io/fs.h" #include "paddle/fluid/framework/io/shell.h" @@ -54,7 +55,7 @@ using framework::Variable; using RpcCtxMap = std::unordered_map<std::string, CommContext>; -class FleetWrapper { +class FleetWrapper : public PSWrapper { public: virtual ~FleetWrapper() {} FleetWrapper() { @@ -68,7 +69,13 @@ class FleetWrapper { // pserver request max retry client2client_max_retry_ = 3; } + virtual int32_t Initialize(InitContext& context) { return 0; } + virtual void Stop() override; + + virtual void Load(WrapperContext& context) override; + + virtual void Save(WrapperContext& context) override; // set client to client communication config void SetClient2ClientConfig(int request_timeout_ms, int connect_timeout_ms, int max_retry); diff --git a/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h b/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h index c92835aa995ad..ca02ad31195ef 100755 --- a/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h +++ b/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h @@ -1,18 +1,84 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_WRAPPER_H_ -#define PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_WRAPPER_H_ - -#endif // PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_WRAPPER_H_ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include <atomic> +#include <ctime> +#include <map> +#include <memory> +#include <random> +#include <string> +#include <unordered_map> +#include <vector> + +#include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h" +#include "paddle/fluid/distributed/ps/service/ps_service/service.h" +#include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/framework/io/fs.h" +#include "paddle/fluid/framework/io/shell.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN + +namespace paddle { +namespace framework { +class Scope; +class SelectedRows; +class Variable; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace distributed { + +class PSCore; + +using framework::LoDTensor; +using framework::Scope; +using phi::SelectedRows; +using framework::Variable; + +using RpcCtxMap = std::unordered_map<std::string, CommContext>; + +struct WrapperContext { + uint32_t table_id; + const std::string path; + const int mode; + const std::string meta; +}; + +struct InitContext { + const std::vector<int> dev_ids; // for gpu +}; + +class PSWrapper { + public: + virtual ~PSWrapper() {} + PSWrapper() {} + // init server + + virtual int32_t Initialize(InitContext& context) = 0; + + virtual void Stop() = 0; + + virtual void Load(WrapperContext& context) = 0; + + virtual void Save(WrapperContext& context) = 0; +}; + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/store/tcp_store.cc b/paddle/fluid/distributed/store/tcp_store.cc index eb98c89c99e47..b0d5add49565f 100644 --- a/paddle/fluid/distributed/store/tcp_store.cc +++ b/paddle/fluid/distributed/store/tcp_store.cc @@ -136,10 +136,6 @@ void MasterDaemon::run() { } for (size_t i = 1; i < fds.size(); i++) { - VLOG(0) << "fds.size:" << fds.size(); - VLOG(0) << "fds.size-i:" << i; - VLOG(0) << "fds[i].revents:" << fds[i].revents; - try { if (fds[i].revents == 0) { continue; }
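A quick usage sketch for the rewritten ps_wrapper.h above: PSWrapper is the minimal serving-side interface, and FleetWrapper (earlier in this diff) is its first implementation. Assuming a wrapper whose server side has already been initialized and started elsewhere, a caller would drive it roughly as follows; the path, mode, and table id are illustrative values, not part of this patch:

  std::shared_ptr<paddle::distributed::PSWrapper> ps =
      std::make_shared<paddle::distributed::FleetWrapper>();
  paddle::distributed::WrapperContext load_ctx{/*table_id=*/0,
                                               /*path=*/"/models/ckpt",
                                               /*mode=*/0, /*meta=*/""};
  ps->Load(load_ctx);  // empty meta, non-negative id -> LoadModelOneTable
  ps->Stop();          // forwards to FleetWrapper::StopServer()

Note that WrapperContext has const members, so it must be brace-initialized up front rather than filled in field by field.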
diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index 2223334ccc442..cb46c38d4de4b 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -24,6 +24,9 @@ cc_test(graph_node_test SRCS graph_node_test.cc DEPS graph_py_service scope serv set_source_files_properties(graph_node_split_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(graph_node_split_test SRCS graph_node_split_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) +set_source_files_properties(graph_table_sample_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(graph_table_sample_test SRCS graph_table_sample_test.cc DEPS scope server communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) + set_source_files_properties(feature_value_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(feature_value_test SRCS feature_value_test.cc DEPS ${COMMON_DEPS} boost table) diff --git a/paddle/fluid/distributed/test/graph_node_split_test.cc b/paddle/fluid/distributed/test/graph_node_split_test.cc index 9949dce4e933b..a2f495de3c953 100644 --- a/paddle/fluid/distributed/test/graph_node_split_test.cc +++ b/paddle/fluid/distributed/test/graph_node_split_test.cc @@ -236,7 +236,7 @@ void RunGraphSplit() { sleep(2); std::map<uint64_t, std::vector<paddle::distributed::Region>> dense_regions; dense_regions.insert( - std::pair<uint64_t, std::vector<paddle::distributed::Region>>(0, {})); + std::pair<int64_t, std::vector<paddle::distributed::Region>>(0, {})); auto regions = dense_regions[0]; RunClient(dense_regions, 0, pserver_ptr_->get_service()); @@ -250,16 +250,16 @@ void RunGraphSplit() { worker_ptr_->load(0, std::string(edge_file_name), std::string("e>")); srand(time(0)); pull_status.wait(); - std::vector<std::vector<uint64_t>> _vs; + std::vector<std::vector<int64_t>> _vs; std::vector<std::vector<float>> vs; pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector<uint64_t>(1, 10240001024), 4, _vs, vs, true); + 0, std::vector<int64_t>(1, 10240001024), 4, _vs, vs, true); pull_status.wait(); ASSERT_EQ(0, _vs[0].size()); _vs.clear(); vs.clear(); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector<uint64_t>(1, 97), 4, _vs, vs, true); + 0, std::vector<int64_t>(1, 97), 4, _vs, vs, true); pull_status.wait(); ASSERT_EQ(3, _vs[0].size()); std::remove(edge_file_name); diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index 22c2d1e60992e..565d51379d5a8 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -48,10 +48,10 @@ namespace distributed = paddle::distributed; void testSampleNodes( std::shared_ptr<paddle::distributed::GraphBrpcClient>& worker_ptr_) { - std::vector<uint64_t> ids; + std::vector<int64_t> ids; auto pull_status = worker_ptr_->random_sample_nodes(0, 0, 6, ids); - std::unordered_set<uint64_t> s; - std::unordered_set<uint64_t> s1 = {37, 59}; + std::unordered_set<int64_t> s; + std::unordered_set<int64_t> s1 = {37, 59}; pull_status.wait(); for (auto id : ids) s.insert(id); ASSERT_EQ(true, s.size() == s1.size()); @@ -106,14 +106,14 @@ void testFeatureNodeSerializeFloat64() { void testSingleSampleNeighboor( std::shared_ptr<paddle::distributed::GraphBrpcClient>& worker_ptr_) { - std::vector<std::vector<uint64_t>> vs; + std::vector<std::vector<int64_t>> vs; std::vector<std::vector<float>> vs1; auto pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector<uint64_t>(1, 37), 4, vs, vs1, true); + 0, std::vector<int64_t>(1, 37), 4, vs, vs1, true); pull_status.wait(); - std::unordered_set<uint64_t> s; - std::unordered_set<uint64_t> s1 = {112, 45, 145}; + std::unordered_set<int64_t> s; + std::unordered_set<int64_t> s1 = {112, 45, 145}; for (auto g : vs[0]) { s.insert(g); } @@ -126,7 +126,7 @@ void testSingleSampleNeighboor( vs.clear(); vs1.clear(); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector<uint64_t>(1, 96), 4, vs, vs1, true); + 0, std::vector<int64_t>(1, 96), 4, vs, vs1, true); pull_status.wait(); s1 = {111, 48, 247}; for (auto g : vs[0]) { @@ -147,30 +147,30 @@ void testAddNode( std::shared_ptr<paddle::distributed::GraphBrpcClient>& worker_ptr_) { worker_ptr_->clear_nodes(0); int total_num = 270000; - uint64_t id; - std::unordered_set<uint64_t> id_set; + int64_t id; + std::unordered_set<int64_t> id_set; for (int i = 0; i < total_num; i++) { while (id_set.find(id = rand()) != id_set.end()) ; id_set.insert(id); } - std::vector<uint64_t> id_list(id_set.begin(), id_set.end()); + std::vector<int64_t> id_list(id_set.begin(), id_set.end()); std::vector<bool> weight_list; auto status = worker_ptr_->add_graph_node(0, 
id_list, weight_list); status.wait(); - std::vector<uint64_t> ids[2]; + std::vector<int64_t> ids[2]; for (int i = 0; i < 2; i++) { auto sample_status = worker_ptr_->random_sample_nodes(0, i, total_num, ids[i]); sample_status.wait(); } - std::unordered_set<uint64_t> id_set_check(ids[0].begin(), ids[0].end()); + std::unordered_set<int64_t> id_set_check(ids[0].begin(), ids[0].end()); for (auto x : ids[1]) id_set_check.insert(x); ASSERT_EQ(id_set.size(), id_set_check.size()); for (auto x : id_set) { ASSERT_EQ(id_set_check.find(x) != id_set_check.end(), true); } - std::vector<uint64_t> remove_ids; + std::vector<int64_t> remove_ids; for (auto p : id_set_check) { if (remove_ids.size() == 0) remove_ids.push_back(p); @@ -187,7 +187,7 @@ void testAddNode( worker_ptr_->random_sample_nodes(0, i, total_num, ids[i]); sample_status.wait(); } - std::unordered_set<uint64_t> id_set_check1(ids[0].begin(), ids[0].end()); + std::unordered_set<int64_t> id_set_check1(ids[0].begin(), ids[0].end()); for (auto x : ids[1]) id_set_check1.insert(x); ASSERT_EQ(id_set_check1.size(), id_set_check.size()); for (auto x : id_set_check1) { @@ -196,14 +196,14 @@ } void testBatchSampleNeighboor( std::shared_ptr<paddle::distributed::GraphBrpcClient>& worker_ptr_) { - std::vector<std::vector<uint64_t>> vs; + std::vector<std::vector<int64_t>> vs; std::vector<std::vector<float>> vs1; - std::vector<uint64_t> v = {37, 96}; + std::vector<int64_t> v = {37, 96}; auto pull_status = worker_ptr_->batch_sample_neighbors(0, v, 4, vs, vs1, false); pull_status.wait(); - std::unordered_set<uint64_t> s; - std::unordered_set<uint64_t> s1 = {112, 45, 145}; + std::unordered_set<int64_t> s; + std::unordered_set<int64_t> s1 = {112, 45, 145}; for (auto g : vs[0]) { s.insert(g); } @@ -417,7 +417,7 @@ void RunBrpcPushSparse() { std::map<uint64_t, std::vector<paddle::distributed::Region>> dense_regions; dense_regions.insert( - std::pair<uint64_t, std::vector<paddle::distributed::Region>>(0, {})); + std::pair<int64_t, std::vector<paddle::distributed::Region>>(0, {})); auto regions = dense_regions[0]; RunClient(dense_regions, 0, pserver_ptr_->get_service()); @@ -427,14 +427,14 @@ void RunBrpcPushSparse() { worker_ptr_->load(0, std::string(edge_file_name), std::string("e>")); srand(time(0)); pull_status.wait(); - std::vector<std::vector<uint64_t>> _vs; + std::vector<std::vector<int64_t>> _vs; std::vector<std::vector<float>> vs; testSampleNodes(worker_ptr_); sleep(5); testSingleSampleNeighboor(worker_ptr_); testBatchSampleNeighboor(worker_ptr_); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector<uint64_t>(1, 10240001024), 4, _vs, vs, true); + 0, std::vector<int64_t>(1, 10240001024), 4, _vs, vs, true); pull_status.wait(); ASSERT_EQ(0, _vs[0].size()); paddle::distributed::GraphTable* g = @@ -445,14 +445,14 @@ void RunBrpcPushSparse() { while (round--) { vs.clear(); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector<uint64_t>(1, 37), 1, _vs, vs, false); + 0, std::vector<int64_t>(1, 37), 1, _vs, vs, false); pull_status.wait(); for (int i = 0; i < ttl; i++) { - std::vector<std::vector<uint64_t>> vs1; + std::vector<std::vector<int64_t>> vs1; std::vector<std::vector<float>> vs2; pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector<uint64_t>(1, 37), 1, vs1, vs2, false); + 0, std::vector<int64_t>(1, 37), 1, vs1, vs2, false); pull_status.wait(); ASSERT_EQ(_vs[0].size(), vs1[0].size()); @@ -540,7 +540,7 @@ void RunBrpcPushSparse() { // Test Pull by step - std::unordered_set<uint64_t> count_item_nodes; + std::unordered_set<int64_t> count_item_nodes; // pull by step 2 for (int test_step = 1; test_step < 4; test_step++) { count_item_nodes.clear(); @@ -558,18 +558,18 @@ void RunBrpcPushSparse() { ASSERT_EQ(count_item_nodes.size(), 12); } - std::pair<std::vector<std::vector<uint64_t>>, std::vector<float>> res; + std::pair<std::vector<std::vector<int64_t>>, std::vector<float>> res; res = client1.batch_sample_neighbors( - std::string("user2item"), std::vector<uint64_t>(1, 96), 4, true, false); + std::string("user2item"), std::vector<int64_t>(1, 96), 4, true, false); ASSERT_EQ(res.first[0].size(), 3); - std::vector<uint64_t> node_ids; + std::vector<int64_t> node_ids; node_ids.push_back(96); 
node_ids.push_back(37); res = client1.batch_sample_neighbors(std::string("user2item"), node_ids, 4, true, false); ASSERT_EQ(res.first[1].size(), 1); - std::vector<uint64_t> nodes_ids = client2.random_sample_nodes("user", 0, 6); + std::vector<int64_t> nodes_ids = client2.random_sample_nodes("user", 0, 6); ASSERT_EQ(nodes_ids.size(), 2); ASSERT_EQ(true, (nodes_ids[0] == 59 && nodes_ids[1] == 37) || (nodes_ids[0] == 37 && nodes_ids[1] == 59)); diff --git a/paddle/fluid/distributed/test/graph_table_sample_test.cc b/paddle/fluid/distributed/test/graph_table_sample_test.cc new file mode 100644 index 0000000000000..65455028247dd --- /dev/null +++ b/paddle/fluid/distributed/test/graph_table_sample_test.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <unistd.h> +#include <condition_variable> // NOLINT +#include <fstream> +#include <iomanip> +#include <string> +#include <thread> // NOLINT +#include <unordered_set> +#include <vector> +#include "google/protobuf/text_format.h" + +#include <chrono> +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/ps/service/env.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/table/common_graph_table.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace memory = paddle::memory; +namespace distributed = paddle::distributed; + +std::vector<std::string> edges = { + std::string("37\t45\t0.34"), std::string("37\t145\t0.31"), + std::string("37\t112\t0.21"), std::string("96\t48\t1.4"), + std::string("96\t247\t0.31"), std::string("96\t111\t1.21"), + std::string("59\t45\t0.34"), std::string("59\t145\t0.31"), + std::string("59\t122\t0.21"), std::string("97\t48\t0.34"), + std::string("97\t247\t0.31"), std::string("97\t111\t0.21")}; +// odd id:96 48 122 112 
char edge_file_name[] = "edges.txt"; + +std::vector<std::string> nodes = { + std::string("user\t37\ta 0.34\tb 13 14\tc hello\td abc"), + std::string("user\t96\ta 0.31\tb 15 10\tc 96hello\td abcd"), + std::string("user\t59\ta 0.11\tb 11 14"), + std::string("user\t97\ta 0.11\tb 12 11"), + std::string("item\t45\ta 0.21"), + std::string("item\t145\ta 0.21"), + std::string("item\t112\ta 0.21"), + std::string("item\t48\ta 0.21"), + std::string("item\t247\ta 0.21"), + std::string("item\t111\ta 0.21"), + std::string("item\t46\ta 0.21"), + std::string("item\t146\ta 0.21"), + std::string("item\t122\ta 0.21"), + std::string("item\t49\ta 0.21"), + std::string("item\t248\ta 0.21"), + std::string("item\t113\ta 0.21")}; +char node_file_name[] = "nodes.txt"; + 
+void prepare_file(char file_name[], std::vector<std::string> data) { + std::ofstream ofile; + ofile.open(file_name); + for (auto x : data) { + ofile << x << std::endl; + } + + ofile.close(); +} + +void testGraphSample() { +#ifdef PADDLE_WITH_HETERPS + ::paddle::distributed::GraphParameter table_proto; + table_proto.set_gpups_mode(true); + table_proto.set_gpups_mode_shard_num(127); + table_proto.set_gpu_num(2); + + distributed::GraphTable graph_table, graph_table1; + graph_table.initialize(table_proto); + prepare_file(edge_file_name, edges); + graph_table.load(std::string(edge_file_name), std::string("e>")); + std::vector<paddle::framework::GpuPsCommGraph> res; + std::promise<int> prom; + std::future<int> fut = prom.get_future(); + graph_table.set_graph_sample_callback( + [&res, &prom](std::vector<paddle::framework::GpuPsCommGraph> &res0) { + res = res0; + prom.set_value(0); + }); + graph_table.start_graph_sampling(); + fut.get(); + graph_table.end_graph_sampling(); + ASSERT_EQ(2, res.size()); + // 37 59 97 + for (int i = 0; i < (int)res[1].node_size; i++) { + std::cout << res[1].node_list[i].node_id << std::endl; + } + ASSERT_EQ(3, res[1].node_size); + + ::paddle::distributed::GraphParameter table_proto1; + table_proto1.set_gpups_mode(true); + table_proto1.set_gpups_mode_shard_num(127); + table_proto1.set_gpu_num(2); + table_proto1.set_gpups_graph_sample_class("BasicBfsGraphSampler"); + table_proto1.set_gpups_graph_sample_args("5,5,1,1"); + graph_table1.initialize(table_proto1); + graph_table1.load(std::string(edge_file_name), std::string("e>")); + std::vector<paddle::framework::GpuPsCommGraph> res1; + std::promise<int> prom1; + std::future<int> fut1 = prom1.get_future(); + graph_table1.set_graph_sample_callback( + [&res1, &prom1](std::vector<paddle::framework::GpuPsCommGraph> &res0) { + res1 = res0; + prom1.set_value(0); + }); + graph_table1.start_graph_sampling(); + fut1.get(); + graph_table1.end_graph_sampling(); + // distributed::BasicBfsGraphSampler *sampler1 = + // (distributed::BasicBfsGraphSampler *)graph_table1.get_graph_sampler(); + // sampler1->start_graph_sampling(); + // std::this_thread::sleep_for (std::chrono::seconds(1)); + // std::vector<paddle::framework::GpuPsCommGraph> res1;// = + // sampler1->fetch_sample_res(); + ASSERT_EQ(2, res1.size()); + // odd id:96 48 122 112 + for (int i = 0; i < (int)res1[0].node_size; i++) { + std::cout << res1[0].node_list[i].node_id << std::endl; + } + ASSERT_EQ(4, res1[0].node_size); +#endif +} + +TEST(testGraphSample, Run) { testGraphSample(); } diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 698a698fc6d18..691a381405e9a 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -1,6 +1,7 @@ -set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) +set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node custom_operator_node) + set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) -set(generated_deps dygraph_function dygraph_node) +set(generated_deps final_dygraph_function final_dygraph_node dygraph_function dygraph_node) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) message("Performing Eager Dygraph Auto Code Generation") @@ -9,6 +10,8 @@ endif() add_subdirectory(api) add_subdirectory(accumulation) +add_subdirectory(custom_operator) + cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor) cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator) diff --git 
a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 3a2ec403c0a59..9c4089af092e4 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -24,7 +24,7 @@ #include "paddle/fluid/platform/errors.h" #include "glog/logging.h" - +DECLARE_bool(retain_grad_for_all_tensor); namespace egr { static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, @@ -39,8 +39,8 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, } std::vector<std::vector<paddle::experimental::Tensor>> GradNodeAccumulation:: -operator()( - const std::vector<std::vector<paddle::experimental::Tensor>>& grads) { +operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads, + bool create_graph) { VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation"; PADDLE_ENFORCE(grads.size() == 1, paddle::platform::errors::Fatal( @@ -62,7 +62,7 @@ operator()( grad_out = grads[0][0]; } - if (!weak_grad_.expired()) { + if (!weak_grad_.expired() && FLAGS_retain_grad_for_all_tensor) { auto grad = weak_grad_.lock(); CopyOrAddTensor(grad.get(), grad_out); } diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index 07fa40165167c..a91a0b6e34c0d 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -35,8 +35,15 @@ class GradNodeAccumulation : public GradNodeBase { // Functor: perform backward computations virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()( - const std::vector<std::vector<paddle::experimental::Tensor>>& grads) - override; + const std::vector<std::vector<paddle::experimental::Tensor>>& grads, + bool create_graph = false) override; + + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } std::string name() { return "GradNodeAccumulation"; } diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 5a2595b9103e4..0bc998a03a80b 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -145,8 +145,8 @@ void GradNodeScale::SetTensorWrappers_X( void GradNodeScale::SetAttributes_scale(float scale) { scale_ = scale; } std::vector<std::vector<paddle::experimental::Tensor>> GradNodeScale:: -operator()( - const std::vector<std::vector<paddle::experimental::Tensor>>& grads) { +operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads, + bool create_graph) { // 1. 
Check Output Size PADDLE_ENFORCE( ((grads.size() == 1) && (grads[0].size() == 1)), diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h index 247fde6ed1f86..e263f73a6b8a4 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h @@ -39,8 +39,15 @@ class GradNodeScale : public GradNodeBase { // Functor: perform backward computations virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()( - const std::vector<std::vector<paddle::experimental::Tensor>>& grads) - override; + const std::vector<std::vector<paddle::experimental::Tensor>>& grads, + bool create_graph = false) override; + + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } void SetTensorWrappers_X( const std::vector<paddle::experimental::Tensor>& tensors); diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc index ba6a936d68651..1be3b31de00a6 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc @@ -86,9 +86,9 @@ paddle::experimental::Tensor scale(const paddle::experimental::Tensor& x, scale_node->SetTensorWrappers_X({x}); // Set Grad out rank as same as fwd input and set stop gradient to bwd - scale_node->SetGradOutMeta(p_autograd_in, /*slot id*/ 0); + scale_node->SetGradOutMeta(x, /*slot id*/ 0); // Set Grad out rank as same as fwd input and set stop gradient to bwd - scale_node->SetGradInMeta(p_autograd_out, /*slot id*/ 0); + scale_node->SetGradInMeta(out, /*slot id*/ 0); // Set History for output set current Grad Node for EagerUtils::SetHistory(p_autograd_out, scale_node); diff --git a/paddle/fluid/eager/api/utils/global_utils.h b/paddle/fluid/eager/api/utils/global_utils.h index 00578d9a359a3..a9a62fcd50e7a 100644 --- a/paddle/fluid/eager/api/utils/global_utils.h +++ b/paddle/fluid/eager/api/utils/global_utils.h @@ -18,7 +18,7 @@ #include <atomic> #include <memory> #include "paddle/fluid/imperative/tracer.h" - +#include "paddle/phi/api/ext/op_meta_info.h" namespace egr { class UniqueNameGenerator { @@ -70,6 +70,21 @@ class Controller { void SetInEagerMode(bool in_eager_mode) { in_eager_mode_ = in_eager_mode; } + const std::unordered_map<std::string, std::vector<paddle::OpMetaInfo>>& + GetOpMetaInfoMap() { + return op_meta_info_map_; + } + + void MergeOpMetaInfoMap(const std::unordered_map< + std::string, std::vector<paddle::OpMetaInfo>>& map) { + op_meta_info_map_.insert(map.begin(), map.end()); + } + + std::unordered_map<std::string, std::vector<std::vector<size_t>>>& + GetCustomEdgesSlotMap() { + return custom_edges_slot_map_; + } + private: Controller() = default; static Controller* controller_; @@ -77,6 +92,11 @@ class Controller { new paddle::imperative::Tracer()}; // TODO(jiabin): remove when we don't need imperative. 
bool in_eager_mode_{false}; + std::unordered_map> + op_meta_info_map_; + /* op_type : {{grad_outputs}, {grad_inputs}, {input}, {output}, {attrs}}*/ + std::unordered_map>> + custom_edges_slot_map_; DISABLE_COPY_AND_ASSIGN(Controller); }; diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc index 77c39d1b0a37c..b485beca57a21 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.cc +++ b/paddle/fluid/eager/api/utils/tensor_utils.cc @@ -30,7 +30,8 @@ namespace egr_utils_api { bool IsLeafTensor(const paddle::experimental::Tensor& target) { std::shared_ptr grad_node = EagerUtils::grad_node(target); - if (std::dynamic_pointer_cast(grad_node)) { + if (!grad_node || + std::dynamic_pointer_cast(grad_node)) { return true; } diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index dc79a8a45a246..df2cdc35626a8 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -56,23 +56,29 @@ static std::string LegalizeVariableName(const std::string& var_name) { return ret; } -static bool IgnoreGradAttribute(const std::string& op_type, - const std::string& attr_name) { - // Attributes in operators_with_attrs are created manually during code - // generation - // We should ignore these arbitrary attrs when setting up grad attribute map - if (operators_with_attrs.count(op_type)) { - if (operators_with_attrs[op_type].count(attr_name)) { - return true; - } - } +static std::string HandleDynamicGradAttributes(const std::string& fwd_op_type, + const std::string& attrs_name) { + std::string additional_grad_attrs_str = ""; + + if (fwd_op_type == "sum") { + const char* GRAD_ATTRS_TEMPLATE = " %s[\"%s\"] = %s;\n"; + additional_grad_attrs_str = paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "scale", "float(1.0)"); + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias", "float(0.0f)"); + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias_after_scale", "bool(true)"); + + } else if (fwd_op_type == "scale") { + const char* GRAD_ATTRS_TEMPLATE = " %s[\"%s\"] = %s;\n"; - // Only allow SumOp - if (op_type != "sum") { - return true; + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias", "float(0.0f)"); + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias_after_scale", "bool(true)"); } - return false; + return additional_grad_attrs_str; } static void PrepareAttrMapForOps() { @@ -973,7 +979,9 @@ static bool CollectGradInformationFromOpInfo( /* --------------------------------------------------- */ static std::string GenerateGradNodeCreationContent( const ForwardGenerationInfo& fwd_info, - const GradNodeGenerationInfo& bwd_info) { + const GradNodeGenerationInfo& bwd_info, + const std::string& trace_op_body_str, + std::map inplace_map = {}) { VLOG(6) << "Generating GradNode Creation codes"; const std::string& op_type = fwd_info.GetOpType(); @@ -992,7 +1000,8 @@ static std::string GenerateGradNodeCreationContent( // If single output slotname and not duplicable, // then generate: "egr::AutogradMeta* p_autograd_out = // egr::EagerUtils::autograd_meta("op_proto->outputs()[0].name()")" - std::string get_autograd_meta_str = " // Prepare Autograd Meta \n"; + std::string get_input_autograd_meta_str = " // Prepare Autograd Meta \n"; + std::string 
get_output_autograd_meta_str = "";
   // If single output slotname and not duplicable,
   // then generate: "egr::AutogradMeta* p_autograd_out =
   // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")"
@@ -1000,22 +1009,39 @@
     const std::string& output_name = output.name();
     const std::string& output_autograd_name = "p_autograd_" + output_name;
 
+    // output autograd_meta should be obtained after running TraceOp.
     if (output.duplicable()) {
       const char* GET_MULTI_AUTOGRAD_META_TEMPLATE =
-          "  std::vector<egr::AutogradMeta*> %s = "
+          "    std::vector<egr::AutogradMeta*> %s = "
           "egr::EagerUtils::autograd_meta(&%s);\n";
-      get_autograd_meta_str += paddle::string::Sprintf(
+      get_output_autograd_meta_str += paddle::string::Sprintf(
           GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
     } else {
-      const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
-          "  egr::AutogradMeta* %s = "
-          "egr::EagerUtils::autograd_meta(&%s);\n";
-      get_autograd_meta_str += paddle::string::Sprintf(
-          GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
+      // In inplace op, the case where output is duplicable is not considered.
+      // Replace output directly with input in inplace op.
+      if (!inplace_map.empty() && inplace_map.count(output_name)) {
+        auto inplace_input_name = inplace_map[output_name];
+        const std::string& inplace_input_autograd_name =
+            "p_autograd_" + inplace_input_name;
+        const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
+            "    %s = egr::EagerUtils::autograd_meta(&%s);\n";
+        get_output_autograd_meta_str += paddle::string::Sprintf(
+            GET_SINGLE_AUTOGRAD_META_TEMPLATE, inplace_input_autograd_name,
+            inplace_input_name);
+      } else {
+        const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
+            "    egr::AutogradMeta* %s = "
+            "egr::EagerUtils::autograd_meta(&%s);\n";
+        get_output_autograd_meta_str +=
+            paddle::string::Sprintf(GET_SINGLE_AUTOGRAD_META_TEMPLATE,
+                                    output_autograd_name, output_name);
+      }
     }
   }
   VLOG(6) << "Generated outputs autograd_meta";
 
+  // input autograd_meta should be obtained before running TraceOp (for
+  // checking inplace).
   for (const proto::OpProto::Var& input : in_vars) {
     const std::string& input_name = input.name();
     const std::string& input_autograd_name = "p_autograd_" + input_name;
@@ -1024,28 +1050,46 @@
       const char* GET_MULTI_AUTOGRAD_META_TEMPLATE =
           "  std::vector<egr::AutogradMeta*> %s = "
           "egr::EagerUtils::nullable_autograd_meta(%s);\n";
-      get_autograd_meta_str += paddle::string::Sprintf(
+      get_input_autograd_meta_str += paddle::string::Sprintf(
           GET_MULTI_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name);
     } else if (input.dispensable()) {
       const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
           "  egr::AutogradMeta* %s = "
          "egr::EagerUtils::nullable_autograd_meta(%s);\n";
-      get_autograd_meta_str += paddle::string::Sprintf(
+      get_input_autograd_meta_str += paddle::string::Sprintf(
          GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name);
     } else {
       const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
          "  egr::AutogradMeta* %s = "
          "egr::EagerUtils::nullable_autograd_meta(%s);\n";
-      get_autograd_meta_str += paddle::string::Sprintf(
+      get_input_autograd_meta_str += paddle::string::Sprintf(
          GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name);
     }
   }
   VLOG(6) << "Generated inputs autograd_meta";
 
+  // check inplace input to avoid inplace operations on leaf nodes with
+  // stop_gradient=False.
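+  // A leaf tensor created with stop_gradient=False owns a buffer that the
+  // autograd graph may still read, so an inplace op must not overwrite it.
+  // As a sketch (op and slot names illustrative), the generated guard for an
+  // inplace op such as relu_ is expected to expand to:
+  //   egr::EagerUtils::CheckInplace(X, p_autograd_X, require_any_grad);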
+ std::string check_inplace_str = ""; + if (!inplace_map.empty()) { + const char* CHECKING_INPLACE_TEMPLATE = + " // Check Inplace\n" + " egr::EagerUtils::CheckInplace(%s, p_autograd_%s, " + "require_any_grad);\n"; + for (auto& inplace_pair : inplace_map) { + std::string inplace_name = inplace_pair.second; + check_inplace_str += paddle::string::Sprintf(CHECKING_INPLACE_TEMPLATE, + inplace_name, inplace_name); + } + VLOG(6) << "Check Inplace Input"; + } + std::string prepare_autograd_meta_str = ""; - prepare_autograd_meta_str += get_autograd_meta_str; + // only generate input autograd_meta in temporary. + // output autograd_meta will be generated after running TraceOP. + prepare_autograd_meta_str += get_input_autograd_meta_str; prepare_autograd_meta_str += "\n"; // [GradOpNode] GetTraceBackward @@ -1060,7 +1104,7 @@ static std::string GenerateGradNodeCreationContent( size_t bwd_in_slot_num = out_vars.size(); size_t bwd_out_slot_num = in_vars.size(); const char* GRAD_OP_NODE_TEMPLATE = - " auto grad_node = std::make_shared(%d, %d);\n"; + " auto grad_node = std::make_shared(%d, %d);\n"; grad_node_creation_str += " // Create GradOpNode\n"; grad_node_creation_str += paddle::string::Sprintf( GRAD_OP_NODE_TEMPLATE, op_type, bwd_in_slot_num, bwd_out_slot_num); @@ -1069,14 +1113,14 @@ static std::string GenerateGradNodeCreationContent( VLOG(6) << "Generated GradOpNode construction"; // [GradOpNode] Set Attrs - grad_node_creation_str += " // Set Attributes\n"; - grad_node_creation_str += " grad_node->SetAttrMap(std::move(attrs));\n"; + grad_node_creation_str += " // Set Attributes\n"; + grad_node_creation_str += " grad_node->SetAttrMap(std::move(attrs));\n"; grad_node_creation_str += - " grad_node->SetDefaultAttrMap(std::move(default_attrs));\n"; + " grad_node->SetDefaultAttrMap(std::move(default_attrs));\n"; grad_node_creation_str += "\n"; // [GradOpNode] Set TensorWrappers - grad_node_creation_str += " // Set Tensor Wrappers\n"; + grad_node_creation_str += " // Set Tensor Wrappers\n"; for (const auto& iter : op_base_infos) { const std::map& grad_ins_fwd_slotname_map = iter.GetGradInsFwdSlotnameMap(); @@ -1088,10 +1132,18 @@ static std::string GenerateGradNodeCreationContent( full_reserved = "true"; } const char* SET_TENSOR_WRAPPER_TEMPLATE = - " grad_node->SetTensorWrapper%s(%s, %s);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, tensor_wrapper_name, - full_reserved); + " grad_node->SetTensorWrapper%s(%s, %s);\n"; + // Replace output directly with input in inplace op. 
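+      // e.g. with inplace_map = {"Out": "X"} the generated line becomes
+      //   grad_node->SetTensorWrapperOut(X, full_reserved);
+      // so the wrapper captures the tensor that is actually modified in
+      // place (slot names here are illustrative).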
+ if (!inplace_map.empty() && inplace_map.count(tensor_wrapper_name)) { + auto inplace_input_name = inplace_map[tensor_wrapper_name]; + grad_node_creation_str += paddle::string::Sprintf( + SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, + inplace_input_name, full_reserved); + } else { + grad_node_creation_str += paddle::string::Sprintf( + SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, + tensor_wrapper_name, full_reserved); + } } } grad_node_creation_str += "\n"; @@ -1109,12 +1161,12 @@ static std::string GenerateGradNodeCreationContent( size_t input_position = fwd_inputs_name_pos_map.at(input_name); const char* SET_GRAD_OUT_META_TEMPLATE = - " grad_node->SetGradOutMeta(%s, %d);\n"; + " grad_node->SetGradOutMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_OUT_META_TEMPLATE, input_autograd_name, input_position); + SET_GRAD_OUT_META_TEMPLATE, input_name, input_position); const char* ADD_EDGES_TEMPLATE = - " if(%s) grad_node->AddEdges(%s, %d);\n"; + " if(%s) grad_node->AddEdges(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf(ADD_EDGES_TEMPLATE, input_autograd_name, input_autograd_name, input_position); @@ -1123,11 +1175,11 @@ static std::string GenerateGradNodeCreationContent( size_t input_position = fwd_inputs_name_pos_map.at(input_name); const char* SET_GRAD_OUT_META_TEMPLATE = - " grad_node->SetGradOutMeta(&%s, %d);\n"; + " grad_node->SetGradOutMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_OUT_META_TEMPLATE, input_autograd_name, input_position); + SET_GRAD_OUT_META_TEMPLATE, input_name, input_position); - const char* ADD_EDGES_TEMPLATE = " grad_node->AddEdges(&%s, %d);\n"; + const char* ADD_EDGES_TEMPLATE = " grad_node->AddEdges(&%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( ADD_EDGES_TEMPLATE, input_autograd_name, input_position); } @@ -1139,73 +1191,125 @@ static std::string GenerateGradNodeCreationContent( std::string pass_stop_gradient_args = "false"; for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); - const std::string& output_autograd_name = "p_autograd_" + output_name; - size_t output_position = fwd_outputs_name_pos_map.at(output_name); - - // Intermediate Tensor does not require SetHistory, nor RetainGrad - - if (output.duplicable()) { - pass_stop_gradient_args += ", &" + output_autograd_name; + // Replace output directly with input in inplace op. 
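+      // e.g. for inplace_map = {"Out": "X"}, SetOutRankWithSlot, SetHistory
+      // and SetGradInMeta below all receive p_autograd_X / X instead of the
+      // usual p_autograd_Out / Out (slot names illustrative).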
+ if (!inplace_map.empty() && inplace_map.count(output_name)) { + auto inplace_input_name = inplace_map[output_name]; + const std::string& inplace_input_autograd_name = + "p_autograd_" + inplace_input_name; + size_t output_position = fwd_outputs_name_pos_map.at(output_name); + + // Intermediate Tensor does not require SetHistory, nor RetainGrad + pass_stop_gradient_args += ", " + inplace_input_autograd_name; const char* SET_OUT_RANK_TEMPLATE = - " egr::EagerUtils::SetOutRankWithSlot(&%s, %d);\n"; + " egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + SET_OUT_RANK_TEMPLATE, inplace_input_autograd_name, output_position); // Intermediate Tensor does not require SetHistory if (!output.intermediate()) { const char* SET_HISTORY_TEMPLATE = - " egr::EagerUtils::SetHistory(&%s, grad_node);\n"; - grad_node_creation_str += - paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); + " egr::EagerUtils::SetHistory(%s, grad_node);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_HISTORY_TEMPLATE, inplace_input_autograd_name); } const char* SET_GRAD_IN_META_TEMPLATE = - " grad_node->SetGradInMeta(&%s, %d);\n"; + " grad_node->SetGradInMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_IN_META_TEMPLATE, output_autograd_name, output_position); + SET_GRAD_IN_META_TEMPLATE, inplace_input_name, output_position); + // Intermediate Tensor does not require CheckAndRetainGrad + if (!output.intermediate()) { + VLOG(6) << "Generated Call RetainGradForTensor"; + const char* RETAIN_GRAD_TEMPLATE = + " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; + grad_node_creation_str += + paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, inplace_input_name); + } } else { - pass_stop_gradient_args += ", " + output_autograd_name; - const char* SET_OUT_RANK_TEMPLATE = - " egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + const std::string& output_autograd_name = "p_autograd_" + output_name; + size_t output_position = fwd_outputs_name_pos_map.at(output_name); - // Intermediate Tensor does not require SetHistory + // Intermediate Tensor does not require SetHistory, nor RetainGrad + + if (output.duplicable()) { + pass_stop_gradient_args += ", &" + output_autograd_name; + const char* SET_OUT_RANK_TEMPLATE = + " egr::EagerUtils::SetOutRankWithSlot(&%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + + // Intermediate Tensor does not require SetHistory + if (!output.intermediate()) { + const char* SET_HISTORY_TEMPLATE = + " egr::EagerUtils::SetHistory(&%s, grad_node);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_HISTORY_TEMPLATE, output_autograd_name); + } + const char* SET_GRAD_IN_META_TEMPLATE = + " grad_node->SetGradInMeta(%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_GRAD_IN_META_TEMPLATE, output_name, output_position); + + } else { + pass_stop_gradient_args += ", " + output_autograd_name; + const char* SET_OUT_RANK_TEMPLATE = + " egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + + // Intermediate Tensor does not require SetHistory + if (!output.intermediate()) { + const char* SET_HISTORY_TEMPLATE = + " 
egr::EagerUtils::SetHistory(%s, grad_node);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_HISTORY_TEMPLATE, output_autograd_name); + } + const char* SET_GRAD_IN_META_TEMPLATE = + " grad_node->SetGradInMeta(%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_GRAD_IN_META_TEMPLATE, output_name, output_position); + } + + // Intermediate Tensor does not require CheckAndRetainGrad if (!output.intermediate()) { - const char* SET_HISTORY_TEMPLATE = - " egr::EagerUtils::SetHistory(%s, grad_node);\n"; + VLOG(6) << "Generated Call RetainGradForTensor"; + const char* RETAIN_GRAD_TEMPLATE = + " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; grad_node_creation_str += - paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); + paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name); } - const char* SET_GRAD_IN_META_TEMPLATE = - " grad_node->SetGradInMeta(%s, %d);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_IN_META_TEMPLATE, output_autograd_name, output_position); - } - - // Intermediate Tensor does not require CheckAndRetainGrad - if (!output.intermediate()) { - VLOG(6) << "Generated Call RetainGradForTensor"; - const char* RETAIN_GRAD_TEMPLATE = - " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; - grad_node_creation_str += - paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name); } } VLOG(6) << "Generated SetGradIn/OutMeta"; // [Generation] GradNode Creation + // After getting require_any_grad, firstly use CheckInplace method for inplace + // op. + // Then execute TraceOp and generate output autograd_meta. + // Finally, Construct GradNode. (Replace output directly with input in inplace + // op.) + // Add event record + std::string event_name = op_type + " node_creation"; const char* GRAD_NODE_CREATION_TEMPLATE = - " %s" + "%s" " bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(%s);\n" - " if(require_any_grad) {\n" - " VLOG(6) << \" Construct Grad for %s \"; \n" - " egr::EagerUtils::PassStopGradient(%s);\n" - "%s\n }"; + "%s\n" + "%s" + " {\n" + " paddle::platform::RecordEvent node_creation_record_event(\"%s\", " + "paddle::platform::TracerEventType::Operator, 1);\n" + "%s" + " if(require_any_grad) {\n" + " VLOG(6) << \" Construct Grad for %s \"; \n" + " egr::EagerUtils::PassStopGradient(%s);\n" + " %s\n" + " }\n" + " }"; std::string grad_node_creation_body_str = paddle::string::Sprintf( GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str, - compute_require_grad_args, op_type, pass_stop_gradient_args, - grad_node_creation_str); + compute_require_grad_args, check_inplace_str, trace_op_body_str, + event_name, get_output_autograd_meta_str, op_type, + pass_stop_gradient_args, grad_node_creation_str); return grad_node_creation_body_str; } @@ -1215,7 +1319,8 @@ static std::string GenerateGradNodeCreationContent( /* -------------------------------- */ static std::pair GenerateForwardFunctionContents( const ForwardGenerationInfo& fwd_info, - const GradNodeGenerationInfo& bwd_info) { + const GradNodeGenerationInfo& bwd_info, + std::map inplace_map = {}) { /* --- Process Forward Info ---*/ const std::string& op_type = fwd_info.GetOpType(); const std::unordered_map& fwd_inputs_name_pos_map = @@ -1295,8 +1400,21 @@ static std::pair GenerateForwardFunctionContents( core_ops_args_type_info[op_type][input_position] = "list"; } else { - const char* FWD_INS_ARG_TEMPLATE = - "const paddle::experimental::Tensor& %s"; + // inplace tensor can't be const + const char* FWD_INS_ARG_TEMPLATE; + bool flag_find_input_name = false; + if 
(!inplace_map.empty()) { + for (auto& inplace_pair : inplace_map) { + if (inplace_pair.second == input_name) { + flag_find_input_name = true; + FWD_INS_ARG_TEMPLATE = "paddle::experimental::Tensor& %s"; + break; + } + } + } + if (!flag_find_input_name) { + FWD_INS_ARG_TEMPLATE = "const paddle::experimental::Tensor& %s"; + } input_args_str_list[input_position] = paddle::string::Sprintf(FWD_INS_ARG_TEMPLATE, input_name); @@ -1356,6 +1474,7 @@ static std::pair GenerateForwardFunctionContents( // [Generation] Get Outs Map std::string outs_contents_str = ""; + std::string inplace_mapping_str = ""; for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); std::string outnum = "1"; @@ -1398,6 +1517,22 @@ static std::pair GenerateForwardFunctionContents( } core_ops_args_info[op_type].push_back(output_var_name); + } else if (!inplace_map.empty() && inplace_map.count(output_name)) { + // In inplace op, replace the output with the input directly. + PADDLE_ENFORCE_NE( + inplace_map[output_name], "", + paddle::platform::errors::InvalidArgument( + "Inplace op %s has no input corresponding to output %s.", op_type, + output_name)); + const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", ins[\"%s\"] },"; + auto inplace_input_name = inplace_map[output_name]; + outs_contents_str += paddle::string::Sprintf( + FWD_OUTS_CONTENT_TEMPLATE, output_name, inplace_input_name); + + // inplace_map used in TraceOp. + const char* INPLACE_MAPPING_TEMPLATE = R"({"%s", "%s"},)"; + inplace_mapping_str += paddle::string::Sprintf( + INPLACE_MAPPING_TEMPLATE, inplace_input_name, output_name); } else { if (output.duplicable()) { outnum = output_name + "Num"; @@ -1424,6 +1559,8 @@ static std::pair GenerateForwardFunctionContents( } if (outs_contents_str.size() > 0) outs_contents_str.pop_back(); // Remove trailing "," + if (inplace_mapping_str.size() > 0) + inplace_mapping_str.pop_back(); // Remove trailing "," const char* FWD_OUTS_MAP_TEMPLATE = " std::map GenerateForwardFunctionContents( dygraph_function_args_str += ", const paddle::framework::AttributeMap& attr_map"; + /* --------- Generate TraceOp ----- */ + // TraceOp should be run after compute require_any_grad. (for checking + // inplace) + // `trace_op_body_str` will be passed as a parameter to + // `GenerateGradNodeCreationContent`. 
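+  // A sketch of the expected ordering in the generated forward function
+  // (X/Out stand in for real slot names):
+  //   bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(...);
+  //   egr::EagerUtils::CheckInplace(X, p_autograd_X, require_any_grad);  // inplace ops only
+  //   ...->TraceOp(op_type, ins, outs, attrs, place, &default_attrs, true, {{"X", "Out"}});
+  //   egr::AutogradMeta* p_autograd_Out = egr::EagerUtils::autograd_meta(&Out);
+  //   if (require_any_grad) { /* grad node creation */ }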
+ std::string trace_op_body_str = ""; // [Generation] Get TraceOp const char* FWD_TRACE_OP_TEMPLATE = " paddle::framework::AttributeMap attrs = attr_map;\n" @@ -1464,11 +1607,12 @@ static std::pair GenerateForwardFunctionContents( " egr::Controller::Instance().GetCurrentTracer()->TraceOp(\"%s\", ins, " "outs, attrs, \n" " egr::Controller::Instance().GetExpectedPlace(),\n" - " &default_attrs, true, {});\n"; - std::string trace_op_str = - paddle::string::Sprintf(FWD_TRACE_OP_TEMPLATE, op_type); - generated_function_body += trace_op_str; - generated_function_body += "\n"; + " &default_attrs, true, {%s});\n"; + std::string trace_op_str = paddle::string::Sprintf( + FWD_TRACE_OP_TEMPLATE, op_type, inplace_mapping_str); + + trace_op_body_str += trace_op_str; + trace_op_body_str += "\n"; VLOG(6) << "Generated AttrMap & TraceOp"; @@ -1533,34 +1677,64 @@ static std::pair GenerateForwardFunctionContents( output_varname, output_var_args_name); } } else { - const char* FWD_OUT_TENSOR_TEMPLATE = - " paddle::experimental::Tensor %s;\n" - " egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n"; - out_tensor_str = - paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname, - output_name, output_varname); + if (!inplace_map.empty() && inplace_map.count(output_name)) { + // Modify meta info of inplace tensor. + // Bump inplace version of inplace tensor. + auto inplace_input_name = inplace_map[output_name]; + const char* FWD_OUT_TENSOR_TEMPLATE = + " egr::EagerUtils::ModifyInplaceInput(outs[\"%s\"][0], &%s);\n" + " %s.bump_inplace_version();\n" + " VLOG(3) << \"Tensor(\" << %s.name() << \") uses Inplace " + "Strategy.\";\n"; + out_tensor_str = paddle::string::Sprintf( + FWD_OUT_TENSOR_TEMPLATE, output_name, inplace_input_name, + inplace_input_name, inplace_input_name); + } else { + const char* FWD_OUT_TENSOR_TEMPLATE = + " paddle::experimental::Tensor %s;\n" + " egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n"; + out_tensor_str = + paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname, + output_name, output_varname); + } } return_types[return_position] = "paddle::experimental::Tensor"; } - return_contents[return_position] = output_varname; - generated_function_body += out_tensor_str; + if (!inplace_map.empty() && inplace_map.count(output_name)) { + // Replace output directly with input in inplace op. + return_contents[return_position] = inplace_map[output_name]; + } else { + return_contents[return_position] = output_varname; + } + trace_op_body_str += out_tensor_str; } - generated_function_body += "\n"; + trace_op_body_str += "\n"; VLOG(6) << "Converted Output VarBase to EagerVariable(s)"; + /* ------ END Generate TraceOp ----- */ // [Generation] Handle core_ops_returns_info - core_ops_returns_info[op_type] = return_contents; + // avoid inplace op changing core_ops_returns_info + if (core_ops_returns_info.empty() || !core_ops_returns_info.count(op_type)) { + core_ops_returns_info[op_type] = return_contents; + } // [Generation] ComputeRequireGrad -> GradNodeCreation + if (!bwd_info.GenerateForwardOnly()) { - std::string grad_node_creation_body_str = - GenerateGradNodeCreationContent(fwd_info, bwd_info); + // If GradNode needs to be generated, pass `trace_op_body_str` + // into `GenerateGradNodeCreationContent`. 
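+    // (Inside GenerateGradNodeCreationContent the node-creation block,
+    // including fetching the output autograd_meta, is wrapped in a
+    // RecordEvent scope per GRAD_NODE_CREATION_TEMPLATE above, while the
+    // TraceOp body runs just before it.)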
+    std::string grad_node_creation_body_str = GenerateGradNodeCreationContent(
+        fwd_info, bwd_info, trace_op_body_str, inplace_map);
+
+    generated_function_body += grad_node_creation_body_str;
     generated_function_body += "\n";
 
     // [Generation] Call RetainGradForTensor
     VLOG(6) << "Generated GradNode Creation codes";
+  } else {
+    // If GradNode doesn't need to be generated, generate TraceOp directly.
+    generated_function_body += trace_op_body_str;
   }
 
   // [Generation] Handle return: Tuple/Vector/Tensor
@@ -1607,17 +1781,33 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
   VLOG(6) << "Generated return codes";
 
   // [Generation] Get Full Function
-  std::string function_name = op_type + "_dygraph_function";
+  std::string function_name;
+  if (inplace_map.empty()) {
+    function_name = op_type + "_dygraph_function";
+  } else {
+    // change function_name for inplace op.
+    function_name = op_type + "__dygraph_function";
+  }
 
   if (dygraph_function_args_str.size() > 0) {
     auto iter = dygraph_function_args_str.begin();
     if ((*iter) == ',') dygraph_function_args_str.erase(iter);
   }
 
-  const char* FWD_FUNCTION_TEMPLATE = "%s %s(%s) {\n\n%s\n}\n\n";
+  const char* DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE =
+      "  paddle::platform::RecordEvent dygraph_entrance_record_event(\"%s\", "
+      "paddle::platform::TracerEventType::Operator, 1);";
+  std::string event_name = op_type + " dygraph";
+  std::string fwd_record_event_str = paddle::string::Sprintf(
+      DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE, event_name);
+  const char* FWD_FUNCTION_TEMPLATE =
+      "%s %s(%s) {\n\n"
+      "%s\n"
+      "%s\n"
+      "}\n\n";
   std::string fwd_function_str = paddle::string::Sprintf(
       FWD_FUNCTION_TEMPLATE, function_proto_return_type_str, function_name,
-      dygraph_function_args_str, generated_function_body);
+      dygraph_function_args_str, fwd_record_event_str, generated_function_body);
 
   // [Generation] Generate forward functions header
   const char* FWD_HEADER_TEMPLATE = "%s %s(%s);\n";
@@ -1804,7 +1994,7 @@ static std::string GenerateSingleOpBase(
         !is_op_base_per_duplicable_input) {
       const char* GRAD_OUTS_CONTENT_TEMPLATE =
           "{ \"%s\", egr::EagerUtils::CreateVars( "
-          "this->OutputMeta()[%d].Size() ) },";
+          "this->OutputMeta()[%d].size() ) },";
       outs_contents_str += paddle::string::Sprintf(
           GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, fwd_input_position);
     } else {
@@ -1842,18 +2032,17 @@ static std::string GenerateSingleOpBase(
   const char* ATTRS_TEMPLATE = "  auto& %s = this->attr_map_;\n";
   std::string grad_attrs_str =
       paddle::string::Sprintf(ATTRS_TEMPLATE, attrs_name);
-  for (const auto& iter : grad_attrs) {
-    if (IgnoreGradAttribute(fwd_op_type, iter.first)) continue;
-    std::pair<std::string, std::string> type_val =
-        GetAttrType(iter.second, false /*is_arg*/);
-    const char* GRAD_ATTRS_TEMPLATE =
-        "  %s %s = %s;\n"
-        "  %s[\"%s\"] = %s;\n";
-    std::string var_name = iter.first + std::to_string(*outs_size);
-    grad_attrs_str += paddle::string::Sprintf(
-        GRAD_ATTRS_TEMPLATE, type_val.first, var_name, type_val.second,
-        attrs_name, iter.first, var_name);
-  }
+  if (fwd_op_type == "cast") {
+    // switch in/out dtype
+    const char* CAST_GRAD =
+        "  auto temp_type = %s[\"in_dtype\"];\n"
+        "  %s[\"in_dtype\"] = %s[\"out_dtype\"];\n"
+        "  %s[\"out_dtype\"] = temp_type;\n";
+    grad_attrs_str += paddle::string::Sprintf(CAST_GRAD, attrs_name, attrs_name,
+                                              attrs_name, attrs_name);
+  }
+  // Handle dynamic grad attributes
+  grad_attrs_str += HandleDynamicGradAttributes(fwd_op_type, attrs_name);
   generated_grad_function_body += grad_attrs_str;
 
   const char* TRACE_OP_TEMPLATE =
@@ -2032,7 +2221,7 @@ static
std::string GenerateGradNodeCCContents( if (is_op_base_per_duplicable_input) { const char* OP_BASE_PER_DUP_INPUT_TEMPLATE = - " for(int i = 0; i < this->OutputMeta()[0].Size(); i++) {\n" + " for(size_t i = 0; i < this->OutputMeta()[0].size(); i++) {\n" " %s\n" " }\n"; generated_grad_function_body = paddle::string::Sprintf( @@ -2044,6 +2233,8 @@ static std::string GenerateGradNodeCCContents( "GradNode%s::ApplyGradientHooks(grads);\n" " std::vector> outputs(%d);\n" " %s\n" + " if(NeedComplexToRealConversion()) " + "HandleComplexGradToRealGrad(&outputs);\n" " return outputs;\n"; generated_grad_function_body = paddle::string::Sprintf(BWD_RETURN_TEMPLATE, fwd_op_type, in_vars.size(), @@ -2053,7 +2244,8 @@ static std::string GenerateGradNodeCCContents( const char* GRAD_FUNCTION_TEMPLATE = "std::vector> " "GradNode%s::operator()(const " - "std::vector>& grads) {\n%s\n}"; + "std::vector>& grads, " + "bool create_graph) {\n%s\n}"; std::string grad_function_str = paddle::string::Sprintf( GRAD_FUNCTION_TEMPLATE, fwd_op_type, generated_grad_function_body); @@ -2088,18 +2280,28 @@ static std::string GenerateGradNodeHeaderContents( "\n" " virtual std::vector> " "operator()(const " - "std::vector>& grads) " + "std::vector>& grads, const " + "bool create_graph = false) " "override;\n" "\n" + " void ClearTensorWrappers() override { \n" + "%s\n" + " is_tensor_wrappers_cleared = true;\n" + " }\n" " std::string name() override { return \" GradNode%s \"; } \n " "\n" " // SetX, SetY, ...\n" "%s\n" " // SetAttrMap\n" "%s\n" + " bool IsTensorWrappersCleared() override { \n" + " return is_tensor_wrappers_cleared;\n" + " }\n" " private:\n" " // TensorWrappers\n" "%s\n" + " bool is_tensor_wrappers_cleared = false;\n" + "\n" " // Attribute Map\n" "%s\n" "};"; @@ -2133,6 +2335,7 @@ static std::string GenerateGradNodeHeaderContents( std::string set_tensor_wrappers_str = ""; std::string tensor_wrapper_members_str = ""; + std::string clear_tensor_wrappers_str = ""; for (const auto& iter : op_base_infos) { const std::map& grad_ins_fwd_slotname_map = iter.GetGradInsFwdSlotnameMap(); @@ -2164,6 +2367,13 @@ static std::string GenerateGradNodeHeaderContents( SET_TENSOR_WRAPPER_BODY_TEMPLATE, tensor_wrapper_name, struct_tensor_wrapper_name); + const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = + "for (auto tw: %s) {\n" + " tw.clear();\n" + " }\n"; + clear_tensor_wrappers_str += paddle::string::Sprintf( + CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name); + } else { const char* ATTR_TENSOR_WRAPPER_ARG_TEMPLATE = "const paddle::experimental::Tensor& %s"; @@ -2176,10 +2386,14 @@ static std::string GenerateGradNodeHeaderContents( TENSOR_WRAPPER_MEMBER_TEMPLATE, struct_tensor_wrapper_name); const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE = - "%s = egr::TensorWrapper(%s, %s /*full_reserved*/);"; + "%s = egr::TensorWrapper(%s, %s /*full_reserved*/);\n"; tensor_wrapper_body_str = paddle::string::Sprintf( SET_TENSOR_WRAPPER_BODY_TEMPLATE, struct_tensor_wrapper_name, tensor_wrapper_name, full_reserved_str); + + const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = " %s.clear();\n"; + clear_tensor_wrappers_str += paddle::string::Sprintf( + CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name); } std::string full_reserved_signature_str = "bool full_reserved"; const char* SET_TENSOR_WRAPPER_TEMPLATE = @@ -2194,8 +2408,8 @@ static std::string GenerateGradNodeHeaderContents( std::string grad_node_str = paddle::string::Sprintf( GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type, - op_type, op_type, set_tensor_wrappers_str, 
set_attr_map_str,
-      tensor_wrapper_members_str, attr_members_str);
+      op_type, clear_tensor_wrappers_str, op_type, set_tensor_wrappers_str,
+      set_attr_map_str, tensor_wrapper_members_str, attr_members_str);
 
   return grad_node_str;
 }
@@ -2240,8 +2454,9 @@ static void GenerateForwardDygraphFile(const std::string& forward_cc_path,
       "\"paddle/fluid/eager/api/generated/fluid_generated/"
       "dygraph_forward_api.h\"\n"
       "#include "
-      "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n\n"
-      "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n";
+      "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n"
+      "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n"
+      "#include \"paddle/fluid/platform/profiler/event_tracing.h\"\n\n";
   std::string forward_cc_include_str =
       paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE);
   std::ofstream forward_cc_stream(forward_cc_path, std::ios::out);
@@ -2379,7 +2594,7 @@ static void DygraphCodeGeneration(const std::string& output_dir) {
     /* --------------------------- */
     VLOG(6) << "-------- GenerateForwardFunctionContents -------";
     std::pair<std::string, std::string> body_and_declaration =
-        GenerateForwardFunctionContents(fwd_info, bwd_info);
+        GenerateForwardFunctionContents(fwd_info, bwd_info, {});
 
     fwd_function_str += body_and_declaration.first + "\n";
 
@@ -2387,6 +2602,30 @@ static void DygraphCodeGeneration(const std::string& output_dir) {
     std::string fwd_function_declare_str = body_and_declaration.second;
     dygraph_forward_api_str += fwd_function_declare_str;
 
+    auto& infer_inplace =
+        paddle::framework::OpInfoMap::Instance().Get(op_type).infer_inplace_;
+    std::map<std::string, std::string> inplace_map;
+    // Inplace Function Generator.
+    // `sum` op has duplicate input. Don't consider adding an inplace strategy
+    // for `sum` for now.
+    if (op_type != "sum" && infer_inplace) {
+      auto in_to_outs = infer_inplace(true);
+      for (auto& inplace_pair : in_to_outs) {
+        inplace_map[inplace_pair.second] = inplace_pair.first;
+      }
+
+      VLOG(6) << "-------- GenerateInplaceForwardFunctionContents -------";
+      std::pair<std::string, std::string> inplace_body_and_declaration =
+          GenerateForwardFunctionContents(fwd_info, bwd_info, inplace_map);
+
+      fwd_function_str += inplace_body_and_declaration.first + "\n";
+
+      VLOG(6) << "-------- GenerateInplaceDygraphForwardAPIContents -------";
+      std::string inplace_fwd_function_declare_str =
+          inplace_body_and_declaration.second;
+      dygraph_forward_api_str += inplace_fwd_function_declare_str;
+    }
+
     if (bwd_info.GenerateForwardOnly()) continue;
 
     VLOG(6) << "-------- GenerateGradNodeHeaderContents -------";
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt
index c6bca01205e19..771351dd4affb 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml")
-set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml")
+set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml")
+set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml")
 set(tmp_forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.cc")
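+# Each generator run now receives comma-separated yaml lists, so the dense and
+# sparse APIs are processed in one pass (the sparse entries land in the
+# `sparse` namespace).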
set(tmp_forwards_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.h") set(tmp_nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.cc") @@ -27,6 +27,7 @@ add_custom_target(eager_final_state_codegen set(tmp_python_c_output_path "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h") set(python_c_output_path "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/eager_final_state_op_function_impl.h") + add_custom_target(eager_final_state_python_c_codegen COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py" "--api_yaml_path=${api_yaml_path}" diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index f56cf8ef24cf6..1685b6f3cb5c3 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -23,10 +23,12 @@ core_ops_args_info = {} core_ops_args_type_info = {} +namespace = "" yaml_types_mapping = { 'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \ 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ + 'str' : 'std::string', \ 'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ 'int64[]' : 'std::vector', 'int[]' : 'std::vector', 'Tensor' : 'Tensor', @@ -125,6 +127,7 @@ def GetAutoGradMetaVectorName(string): def ReadFwdFile(filepath): f = open(filepath, 'r') contents = yaml.load(f, Loader=yaml.FullLoader) + f.close() return contents @@ -133,15 +136,25 @@ def ReadBwdFile(filepath): contents = yaml.load(f, Loader=yaml.FullLoader) ret = {} for content in contents: - assert 'backward_api' in content.keys() - api_name = content['backward_api'] + if 'backward_api' in content.keys(): + api_name = content['backward_api'] + else: + assert False + ret[api_name] = content + f.close() return ret ###################### ### Yaml Parsers ### ###################### +def RemoveSpecialSymbolsInName(string): + # Remove any name after '@' + ret = string.split("@")[0] + return ret + + def IntermediateValidationCheck(intermediate_outputs, forward_returns_list): # intermediate_outputs : [name0, name1, ...] # forward_returns_list : [[ret_name, type, orig_pos], ...] @@ -160,15 +173,19 @@ def IntermediateValidationCheck(intermediate_outputs, forward_returns_list): def ParseDispensable(string): # string: "X, Y" + string = RemoveSpecialSymbolsInName(string) return [v.strip() for v in string.split(",")] def ParseIntermediate(string): + string = RemoveSpecialSymbolsInName(string) return [v.strip() for v in string.split(",")] def ParseNoNeedBuffer(string): # string: "x, y" + string = RemoveSpecialSymbolsInName(string) + no_need_buffer_set = set() for name in string.split(","): no_need_buffer_set.add(name.strip()) @@ -196,8 +213,11 @@ def ParseYamlArgs(string): default_value = m.group(3).split("=")[1].strip() if len( m.group(3).split("=")) > 1 else None - assert arg_type in yaml_types_mapping.keys() + assert arg_type in yaml_types_mapping.keys( + ), f"The argument type {arg_type} in yaml config is not supported in yaml_types_mapping." 
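+    # e.g. a yaml arg declared as "str data_format" maps to std::string via
+    # yaml_types_mapping, and a name like "x@GRAD" is stripped to "x" by
+    # RemoveSpecialSymbolsInName below (both examples illustrative).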
arg_type = yaml_types_mapping[arg_type] + + arg_name = RemoveSpecialSymbolsInName(arg_name) if "Tensor" in arg_type: assert default_value is None inputs_list.append([arg_name, arg_type, i]) @@ -229,10 +249,12 @@ def ParseYamlReturns(string): else: ret_type = ret.strip() - assert ret_type in yaml_types_mapping.keys() + assert ret_type in yaml_types_mapping.keys( + ), f"The return type {ret_type} in yaml config is not supported in yaml_types_mapping." ret_type = yaml_types_mapping[ret_type] assert "Tensor" in ret_type + ret_name = RemoveSpecialSymbolsInName(ret_name) returns_list.append([ret_name, ret_type, i]) return returns_list @@ -456,6 +478,7 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, # SetTensorWrapper Methods & TensorWrapper Members set_tensor_wrapper_methods_str = "" tensor_wrapper_members_str = "" + clear_tensor_wrapper_str = "" for tname, (ttype, is_fwd_input, _) in backward_fwd_input_map.items(): if tname in no_need_buffer_set: no_need_buffer = "true" @@ -477,6 +500,13 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, """ tensor_wrapper_members_str += PLAIN_TENSOR_MEMBER_TEMPLATE.format( tensor_wrapper_name) + + CLEAR_TENSOR_WRAPPERS_TEMPLATE = """ + {}.clear(); +""" + clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format( + tensor_wrapper_name) + else: assert IsVectorTensorType(ttype) SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = """ @@ -494,6 +524,15 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, """ tensor_wrapper_members_str += VECTOR_TENSOR_MEMBER_TEMPLATE.format( tensor_wrapper_name) + + CLEAR_TENSOR_WRAPPERS_TEMPLATE = """ + for (auto tw: {}) { + tw.clear(); + }; +""" + clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format( + tensor_wrapper_name) + # End: SetTensorWrapper Methods & TensorWrapper Members # SetAttributes & Attribute Members @@ -502,7 +541,7 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, for aname, atype, default_val, _ in backward_attrs_list: saved_attr_name = GetSavedName(aname) SET_ATTR_METHOD_TEMPLATE = """ - void SetAttribute{}({} {}) {{ + void SetAttribute{}({} {}) {{ {} = {}; }} """ @@ -533,25 +572,37 @@ class {} : public egr::GradNodeBase {{ ~{}() override = default; virtual std::vector> operator()( - const std::vector>& grads) override; + const std::vector>& grads, bool create_graph = false) override; std::string name() override {{ return \" {} \"; }} + + void ClearTensorWrappers() override {{ + {} + is_tensor_wrappers_cleared = true; + }} + // SetTensorWrapperX, SetTensorWrapperY, ... 
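+  // (each SetTensorWrapper method below stores one forward tensor so the
+  // backward node can reuse it)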
{} // SetAttributes {} + + bool IsTensorWrappersCleared() override {{ + return is_tensor_wrappers_cleared; + }} private: // TensorWrappers {} + bool is_tensor_wrappers_cleared = false; + // Attributes {} }}; """ node_declaration_str = NODE_DECLARATION_TEMPLATE.format( grad_node_name, grad_node_name, grad_node_name, grad_node_name, - grad_node_name, set_tensor_wrapper_methods_str, - set_attribute_methods_str, tensor_wrapper_members_str, - attribute_members_str) + grad_node_name, clear_tensor_wrapper_str, + set_tensor_wrapper_methods_str, set_attribute_methods_str, + tensor_wrapper_members_str, attribute_members_str) return node_declaration_str @@ -605,19 +656,27 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, else: # Rearrange output order accordingly returns_str += f"returns[{fwd_position}] = grad_api_returns[{grad_api_position}];\n" + returns_str += f"if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n" returns_str += f"return returns;\n" grad_node_name = GetGradNodeName(fwd_api_name) + + if len(namespace) > 0: + grad_api_namespace = f"paddle::experimental::{namespace}" + else: + grad_api_namespace = f"paddle::experimental" + FUNCTION_TEMPLATE = """ -std::vector> {}::operator()(const std::vector>& grads) {{ +std::vector> {}::operator()(const std::vector>& grads, bool create_graph) {{ // Call grad_api function - auto grad_api_returns = paddle::experimental::{}({}); + auto grad_api_returns = {}::{}({}); {} }} """ node_definition_str = FUNCTION_TEMPLATE.format( - grad_node_name, bwd_api_name, grad_api_args_str, returns_str) + grad_node_name, grad_api_namespace, bwd_api_name, grad_api_args_str, + returns_str) return node_definition_str @@ -671,7 +730,7 @@ def GenerateNodeCreationCodes( else: # Tuple api_result if IsPlainTensorType(rtype): - output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);" + output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));" else: assert IsVectorTensorType(rtype) output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);\n" @@ -699,18 +758,27 @@ def GenerateNodeCreationCodes( # SetTensorWrappers set_tensor_wrappers_list = [] - for name, (_, is_fwd_input, _) in backward_fwd_input_map.items(): + for name, (atype, is_fwd_input, pos) in backward_fwd_input_map.items(): is_optional = (name in optional_inputs) + if is_fwd_input: if is_optional: set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);" else: set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);" else: + if num_fwd_outputs > 1: + # Aligned with forward output position + assert name in forward_outputs_position_map.keys() + fwd_output_pos = forward_outputs_position_map[name][1] + tw_name = f"std::get<{fwd_output_pos}>(api_result)" + else: + tw_name = f"api_result" + if is_optional: - set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, false);" + set_tensor_wrappers = f" if({tw_name}.is_initialized()) grad_node->SetTensorWrapper{name}({tw_name}, false);" else: - set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, false);" + set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({tw_name}, false);" set_tensor_wrappers_list.append(set_tensor_wrappers) set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list) @@ 
-719,7 +787,7 @@ def GenerateNodeCreationCodes( set_edges_list = [] for name, (_, pos) in forward_inputs_position_map.items(): input_autograd_meta_name = GetAutoGradMetaName(name) - set_grad_out_meta = f" grad_node->SetGradOutMeta({input_autograd_meta_name}, {pos});" + set_grad_out_meta = f" grad_node->SetGradOutMeta({name}, {pos});" set_edges = f" grad_node->AddEdges({input_autograd_meta_name}, {pos});" set_grad_out_meta_list.append(set_grad_out_meta) set_edges_list.append(set_edges) @@ -736,17 +804,18 @@ def GenerateNodeCreationCodes( output_autograd_meta_name = GetAutoGradMetaName(name) set_out_rank = f" egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos});" set_history = f" egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node);" - set_grad_in_meta = f" grad_node->SetGradInMeta({output_autograd_meta_name}, {pos});" + if num_outputs == 1: + set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result);" + set_grad_in_meta = f" grad_node->SetGradInMeta(api_result, {pos});" + else: + set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(std::get<{pos}>(api_result));" + set_grad_in_meta = f" grad_node->SetGradInMeta(std::get<{pos}>(api_result), {pos});" set_out_rank_list.append(set_out_rank) set_history_list.append(set_history) set_grad_in_meta_list.append(set_grad_in_meta) - - if num_outputs == 1: - set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result);" - else: - set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result[{pos}]);" set_retain_grad_list.append(set_retain_grad) + set_out_rank_str = "\n".join(set_out_rank_list) set_history_str = "\n".join(set_history_list) set_grad_in_meta_str = "\n".join(set_grad_in_meta_list) @@ -850,7 +919,11 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, function_name = fwd_api_name else: function_name = fwd_api_name + "_intermediate" - forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});" + + if len(namespace) > 0: + forward_call_str = f"auto api_result = paddle::experimental::{namespace}::{function_name}({inputs_call_args_str});" + else: + forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});" # Get return type list & outputs num_outputs = len(forward_outputs_position_map.keys()) - len( @@ -864,7 +937,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, returns_list[0] = f"api_result" else: # Tuple api_result - returns_list[pos] = f"api_result[{pos}]" + returns_list[pos] = f"std::get<{pos}>(api_result)" if IsPlainTensorType(rtype): returns_type_list[pos] = "paddle::experimental::Tensor" @@ -887,8 +960,20 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map, backward_attrs_list, optional_inputs) + node_event_name = fwd_api_name + " node_creation" + NODE_CREATION_TEMPLATE = """{{\n + paddle::platform::RecordEvent node_creation_record_event(\"{}\", paddle::platform::TracerEventType::Operator, 1);\n + {}\n + }}""" + node_creation_str = NODE_CREATION_TEMPLATE.format(node_event_name, + node_creation_str) + + dygraph_event_str = f"paddle::platform::RecordEvent dygraph_entrance_record_event(\"{fwd_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);" + FORWARD_FUNCTION_TEMPLATE = """ {} {}({}) {{ + {} + // Forward API Call {} @@ -902,7 +987,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, forward_function_name = GetForwardFunctionName(fwd_api_name) 
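+    # dygraph_event_str places a RecordEvent entry at the top of every
+    # generated forward function, mirroring the profiler hook added to the
+    # fluid eager_generator above.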
forward_function_str = FORWARD_FUNCTION_TEMPLATE.format( returns_type_str, forward_function_name, inputs_args_definition_str, - forward_call_str, node_creation_str, returns_str) + dygraph_event_str, forward_call_str, node_creation_str, returns_str) forward_function_declaration_str = f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});" return forward_function_str, forward_function_declaration_str @@ -1002,6 +1087,7 @@ def GenerateNodeCCFile(filepath, node_definition_str): #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" #include "paddle/fluid/eager/to_static/run_program_op_node.h" +#include "paddle/phi/api/backward/sparse_bw_api.h" """ file_contents += node_definition_str with open(filepath, 'a') as f: @@ -1022,10 +1108,13 @@ def GenerateNodeHFile(filepath, node_declaration_str): def GenerateForwardCCFile(filepath, forward_definition_str): file_contents = """ +#include "paddle/phi/api/lib/dygraph_api.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" +#include "paddle/phi/api/include/sparse_api.h" #include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" """ @@ -1055,134 +1144,184 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): if __name__ == "__main__": args = ParseArguments() - api_yaml_path = args.api_yaml_path - backward_yaml_path = args.backward_yaml_path - - fwd_api_list = ReadFwdFile(api_yaml_path) - grad_api_dict = ReadBwdFile(backward_yaml_path) + api_yaml_paths = args.api_yaml_path.split(",") + backward_yaml_paths = args.backward_yaml_path.split(",") # Generate per Dygraph API node_declaration_str = "" node_definition_str = "" forward_definition_str = "" forward_declaration_str = "" - for fwd_api in fwd_api_list: - # We only generate Ops with grad - if 'backward' not in fwd_api.keys(): - continue - assert 'api' in fwd_api.keys() - assert 'args' in fwd_api.keys() - assert 'output' in fwd_api.keys() - assert 'backward' in fwd_api.keys() - - no_need_buffer_set = set() - if 'no_need_buffer' in fwd_api.keys(): - no_need_buffer_set = ParseNoNeedBuffer(fwd_api['no_need_buffer']) - - fwd_api_name = fwd_api['api'] - fwd_args_str = fwd_api['args'] - fwd_returns_str = fwd_api['output'] - - bwd_api_name = fwd_api['backward'] - assert bwd_api_name in grad_api_dict.keys() - bwd_api = grad_api_dict[bwd_api_name] - - assert 'args' in bwd_api.keys() - assert 'output' in bwd_api.keys() - assert 'forward' in bwd_api.keys() - - # Parse Dispensable Inputs - optional_inputs = [] - if 'optional' in fwd_api.keys(): - optional_inputs = ParseDispensable(fwd_api['optional']) - - bwd_forward_str = bwd_api['forward'] - bwd_args_str = bwd_api['args'] - bwd_returns_str = bwd_api['output'] - - # Collect Forward Inputs/Outputs - forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward( - bwd_forward_str) - print("Parsed Forward Inputs List: ", forward_inputs_list) - print("Prased Forward Attrs List: ", forward_attrs_list) - print("Parsed Forward Returns List: ", forward_returns_list) - - intermediate_outputs = [] - if 'intermediate' in fwd_api.keys(): - intermediate_outputs = ParseIntermediate(fwd_api['intermediate']) - - IntermediateValidationCheck(intermediate_outputs, forward_returns_list) - - # Collect Original Forward Inputs/Outputs and then perform validation checks - orig_forward_inputs_list, orig_forward_attrs_list, 
orig_forward_returns_list = ParseYamlForward( - fwd_args_str, fwd_returns_str) - print("Parsed Original Forward Inputs List: ", orig_forward_inputs_list) - print("Prased Original Forward Attrs List: ", orig_forward_attrs_list) - print("Parsed Original Forward Returns List: ", - orig_forward_returns_list) - - # Forward Validation Checks - ForwardsValidationCheck(forward_inputs_list, forward_attrs_list, - forward_returns_list, orig_forward_inputs_list, - orig_forward_attrs_list, - orig_forward_returns_list) - - # Parse Backward Inputs/Outputs - backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward( - bwd_args_str, bwd_returns_str) - print("Parsed Backward Inputs List: ", backward_inputs_list) - print("Prased Backward Attrs List: ", backward_attrs_list) - print("Parsed Backward Returns List: ", backward_returns_list) - - # Determine Forward Inputs/Outputs Position - forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( - forward_inputs_list, forward_returns_list) - print("Generated Forward Input Position Map: ", - forward_inputs_position_map) - print("Generated Forward Output Position Map: ", - forward_outputs_position_map) - - # SlotName Matching - backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching( - backward_inputs_list, backward_returns_list, - forward_inputs_position_map, forward_outputs_position_map) - print("Generated Backward Fwd Input Map: ", backward_fwd_input_map) - print("Generated Backward Grad Input Map: ", backward_grad_input_map) - print("Generated Backward Grad Output Map: ", backward_grad_output_map) - - # Backward Validation Check - BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map, - backward_attrs_list) - - # Node Declaration Generation - node_declaration_str += GenerateNodeDeclaration( - fwd_api_name, backward_fwd_input_map, backward_attrs_list, - no_need_buffer_set) - print("Generated Node Declaration: ", node_declaration_str) - - node_definition_str += GenerateNodeDefinition( - fwd_api_name, bwd_api_name, backward_fwd_input_map, - backward_grad_input_map, backward_grad_output_map, - backward_attrs_list) - print("Generated Node Definition: ", node_definition_str) - - # Node Definition Generation - definition_declaration_pair = GenerateForwardDefinition( - fwd_api_name, bwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list, - backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list, optional_inputs, - intermediate_outputs) - print("Generated Forward Definition: ", forward_definition_str) - print("Generated Forward Declaration: ", forward_declaration_str) - forward_definition_str += definition_declaration_pair[0] - forward_declaration_str += definition_declaration_pair[1] - - # For python-level API dispatch - CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, - forward_attrs_list) + for i in range(len(api_yaml_paths)): + api_yaml_path = api_yaml_paths[i] + backward_yaml_path = backward_yaml_paths[i] + + if "sparse" in api_yaml_path: + assert "sparse" in backward_yaml_path + namespace = "sparse" + else: + namespace = "" + + fwd_api_list = ReadFwdFile(api_yaml_path) + grad_api_dict = ReadBwdFile(backward_yaml_path) + + yaml_forward_definition_str = "" + yaml_forward_declaration_str = "" + yaml_node_declaration_str = "" + yaml_node_definition_str = "" + for fwd_api in fwd_api_list: + # We only generate Ops with grad + 
if 'backward' not in fwd_api.keys(): + continue + + assert 'api' in fwd_api.keys() + assert 'args' in fwd_api.keys() + assert 'output' in fwd_api.keys() + assert 'backward' in fwd_api.keys() + + no_need_buffer_set = set() + if 'no_need_buffer' in fwd_api.keys(): + no_need_buffer_set = ParseNoNeedBuffer(fwd_api[ + 'no_need_buffer']) + + fwd_api_name = fwd_api['api'] + fwd_args_str = fwd_api['args'] + fwd_returns_str = fwd_api['output'] + + bwd_api_name = fwd_api['backward'] + assert bwd_api_name in grad_api_dict.keys() + bwd_api = grad_api_dict[bwd_api_name] + + assert 'args' in bwd_api.keys() + assert 'output' in bwd_api.keys() + assert 'forward' in bwd_api.keys() + + # Parse Dispensable Inputs + optional_inputs = [] + if 'optional' in fwd_api.keys(): + optional_inputs = ParseDispensable(fwd_api['optional']) + + bwd_forward_str = bwd_api['forward'] + bwd_args_str = bwd_api['args'] + bwd_returns_str = bwd_api['output'] + + # Collect Forward Inputs/Outputs + forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward( + bwd_forward_str) + print("Parsed Forward Inputs List: ", forward_inputs_list) + print("Prased Forward Attrs List: ", forward_attrs_list) + print("Parsed Forward Returns List: ", forward_returns_list) + + intermediate_outputs = [] + if 'intermediate' in fwd_api.keys(): + intermediate_outputs = ParseIntermediate(fwd_api[ + 'intermediate']) + + IntermediateValidationCheck(intermediate_outputs, + forward_returns_list) + + # Collect Original Forward Inputs/Outputs and then perform validation checks + orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward( + fwd_args_str, fwd_returns_str) + print("Parsed Original Forward Inputs List: ", + orig_forward_inputs_list) + print("Prased Original Forward Attrs List: ", + orig_forward_attrs_list) + print("Parsed Original Forward Returns List: ", + orig_forward_returns_list) + + # Forward Validation Checks + ForwardsValidationCheck( + forward_inputs_list, forward_attrs_list, forward_returns_list, + orig_forward_inputs_list, orig_forward_attrs_list, + orig_forward_returns_list) + + # Parse Backward Inputs/Outputs + backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward( + bwd_args_str, bwd_returns_str) + print("Parsed Backward Inputs List: ", backward_inputs_list) + print("Prased Backward Attrs List: ", backward_attrs_list) + print("Parsed Backward Returns List: ", backward_returns_list) + + # Determine Forward Inputs/Outputs Position + forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( + forward_inputs_list, forward_returns_list) + print("Generated Forward Input Position Map: ", + forward_inputs_position_map) + print("Generated Forward Output Position Map: ", + forward_outputs_position_map) + + # SlotName Matching + backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching( + backward_inputs_list, backward_returns_list, + forward_inputs_position_map, forward_outputs_position_map) + print("Generated Backward Fwd Input Map: ", backward_fwd_input_map) + print("Generated Backward Grad Input Map: ", + backward_grad_input_map) + print("Generated Backward Grad Output Map: ", + backward_grad_output_map) + + # Backward Validation Check + BackwardValidationCheck(backward_fwd_input_map, + backward_grad_input_map, + backward_attrs_list) + + # Node Declaration Generation + yaml_node_declaration_str += GenerateNodeDeclaration( + fwd_api_name, backward_fwd_input_map, 
backward_attrs_list, + no_need_buffer_set) + print("Generated Node Declaration: ", yaml_node_declaration_str) + + yaml_node_definition_str += GenerateNodeDefinition( + fwd_api_name, bwd_api_name, backward_fwd_input_map, + backward_grad_input_map, backward_grad_output_map, + backward_attrs_list) + print("Generated Node Definition: ", yaml_node_definition_str) + + # Node Definition Generation + definition_declaration_pair = GenerateForwardDefinition( + fwd_api_name, bwd_api_name, forward_inputs_position_map, + forward_outputs_position_map, orig_forward_attrs_list, + backward_fwd_input_map, backward_grad_input_map, + backward_grad_output_map, backward_attrs_list, optional_inputs, + intermediate_outputs) + print("Generated Forward Definition: ", definition_declaration_pair[0]) + print("Generated Forward Declaration: ", definition_declaration_pair[1]) + yaml_forward_definition_str += definition_declaration_pair[0] + yaml_forward_declaration_str += definition_declaration_pair[1] + + # For python-level API dispatch + CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map, + forward_outputs_position_map, + orig_forward_attrs_list) + + if len(namespace) > 0: + forward_definition_str += f"""namespace {namespace} {{ + {yaml_forward_definition_str} +}} +""" + + forward_declaration_str += f"""namespace {namespace} {{ + {yaml_forward_declaration_str} +}} +""" + + node_declaration_str += f"""namespace {namespace} {{ + {yaml_node_declaration_str} +}} +""" + + node_definition_str += f"""namespace {namespace} {{ + {yaml_node_definition_str} +}} +""" + + else: + forward_definition_str += yaml_forward_definition_str + forward_declaration_str += yaml_forward_declaration_str + node_declaration_str += yaml_node_declaration_str + node_definition_str += yaml_node_definition_str # Generate Files nodes_h_path = args.nodes_h_path diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index d0506e45eb476..e1c2cf871ea42 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -14,9 +14,18 @@ import os import argparse -from eager_gen import yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap +import logging +from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap + +########################### +## Global Configurations ## +########################### +skipped_forward_api_names = set(["scale"]) + + +def SkipAPIGeneration(forward_api_name): + return (forward_api_name in skipped_forward_api_names) -skipped_fwd_api_names = set(["scale"]) atype_to_parsing_function = { "bool": "CastPyArg2Boolean", @@ -24,7 +33,7 @@ "long": "CastPyArg2Long", "int64_t": "CastPyArg2Long", "float": "CastPyArg2Float", - "string": "CastPyArg2String", + "std::string": "CastPyArg2String", "std::vector<bool>": "CastPyArg2Booleans", "std::vector<int>": "CastPyArg2Ints", "std::vector<long>": "CastPyArg2Longs", "std::vector<float>": "CastPyArg2Floats", @@ -39,64 +48,35 @@ } -def ParseArguments(): - parser = argparse.ArgumentParser( - description='Eager Code Generator Args Parser') - parser.add_argument('--api_yaml_path', type=str) - parser.add_argument('--output_path', type=str) - - args = parser.parse_args() - return args - - def FindParsingFunctionFromAttributeType(atype): if atype not in
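The atype_to_parsing_function table above is consulted by FindParsingFunctionFromAttributeType, whose definition continues below. An illustrative lookup, as a small self-contained sketch assuming the (reconstructed) mapping above:

```python
# Sketch: the generator turns an attribute's C++ type into the name of the
# CastPyArg2* helper that parses it out of the Python argument tuple.
atype_to_parsing_function = {
    "bool": "CastPyArg2Boolean",
    "std::string": "CastPyArg2String",
    "std::vector<int>": "CastPyArg2Ints",  # reconstructed key, see above
}

def find_parsing_function(atype):
    assert atype in atype_to_parsing_function, \
        f"Unable to find {atype} in atype_to_parsing_function."
    return atype_to_parsing_function[atype]

print(find_parsing_function("std::vector<int>"))  # -> CastPyArg2Ints
```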
atype_to_parsing_function.keys(): - print(f"Unable to find {atype} in atype_to_parsing_function.") - assert False + assert False, f"Unable to find {atype} in atype_to_parsing_function." return atype_to_parsing_function[atype] -def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, - forward_attrs_list, forward_outputs_position_map, - optional_inputs, is_forward_only): - # forward_inputs_position_map = { "name" : [type, fwd_position] } - # forward_outputs_position_map = { "name" : [type, fwd_position] } - # forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] - # optional_inputs = [name0, ...] - - # Get EagerTensor from args - # Get dygraph function call args - num_args = len(forward_inputs_position_map.keys()) + len(forward_attrs_list) - num_input_tensors = len(forward_inputs_position_map.keys()) - dygraph_function_call_list = ["" for i in range(num_args)] - get_eager_tensor_str = "" - for name, (ttype, pos) in forward_inputs_position_map.items(): - is_optional = (name in optional_inputs) - if IsVectorTensorType(ttype): - get_eager_tensor_str += f" auto {name} = GetTensorListFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" - else: - if is_optional: - get_eager_tensor_str += f" auto {name} = GetOptionalTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" - else: - get_eager_tensor_str += f" auto {name} = GetTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" - dygraph_function_call_list[pos] = f"{name}" +########################## +## Refactored Functions ## +########################## +PARSE_PYTHON_C_TENSORS_TEMPLATE = \ +" auto {} = {}(\"{}\", \"{}\", args, {}, false);\n" + - parse_attributes_str = "" - # Get Attributes - for name, atype, _, pos in forward_attrs_list: - parsing_function = FindParsingFunctionFromAttributeType(atype) - key = f"{name}" +PARSE_PYTHON_C_ARGS_TEMPLATE = \ +""" PyObject* {}_obj = PyTuple_GET_ITEM(args, {});\n + {} {} = {}({}_obj, \"{}\", {});\n""" - parse_attributes_str += f" PyObject* {name}_obj = PyTuple_GET_ITEM(args, {pos});\n" - parse_attributes_str += f" {atype} {name} = {parsing_function}({name}_obj, \"{fwd_api_name}\", {pos});\n" - dygraph_function_call_list[pos] = f"{name}" - dygraph_function_call_str = ",".join(dygraph_function_call_list) +RECORD_EVENT_TEMPLATE = \ +" paddle::platform::RecordEvent {}(\"{} {}\", paddle::platform::TracerEventType::Operator, 1);" - PYTHON_C_FUNCTION_TEMPLATE = """ + +PYTHON_C_FUNCTION_TEMPLATE = \ +""" static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObject *kwargs) {{ + {} + PyThreadState *tstate = nullptr; try {{ @@ -126,22 +106,50 @@ def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, }} """ - if is_forward_only: - fwd_function_name = fwd_api_name - else: - fwd_function_name = GetForwardFunctionName(fwd_api_name) - python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( - fwd_api_name, fwd_api_name, get_eager_tensor_str, parse_attributes_str, - fwd_function_name, dygraph_function_call_str) - python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void))eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" +FUNCTION_NAME_TEMPLATE = \ +"{}{}{}" - return python_c_function_str, python_c_function_reg_str +PYTHON_C_FUNCTION_REG_TEMPLATE = \ +"{{\"final_state_{}\", (PyCFunction)(void(*)(void)) {}eager_final_state_api_{}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function 
for {} in dygraph.\"}}" -def GenerateCoreOpsInfoMap(): - result = """ + +PYTHON_C_WRAPPER_TEMPLATE = \ +""" +#pragma once + +#include "pybind11/detail/common.h" +#include "paddle/phi/api/all.h" +#include "paddle/phi/api/lib/dygraph_api.h" +#include "paddle/phi/common/backend.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/api/include/sparse_api.h" +#include "paddle/fluid/pybind/op_function_common.h" +#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include + +namespace paddle {{ +namespace pybind {{ + +{} + +static PyMethodDef EagerFinalStateMethods[] = {{ + {} +}}; + +}} // namespace pybind +}} // namespace paddle +""" + + +CORE_OPS_INFO = \ +""" static PyObject * eager_get_final_state_core_ops_args_info(PyObject *self) { PyThreadState *tstate = nullptr; try @@ -186,10 +194,12 @@ def GenerateCoreOpsInfoMap(): return nullptr; } } - """ +""" - core_ops_infos_registry = """ - ,{\"get_final_state_core_ops_args_info\", + +CORE_OPS_INFO_REGISTRY = \ +""" + {\"get_final_state_core_ops_args_info\", (PyCFunction)(void(*)(void))eager_get_final_state_core_ops_args_info, METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_args_info.\"}, {\"get_final_state_core_ops_args_type_info\", @@ -201,7 +211,259 @@ def GenerateCoreOpsInfoMap(): METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_returns_info.\"}, """ - return result, core_ops_infos_registry +NAMESPACE_WRAPPER_TEMPLATE = \ +"""namespace {} {{ + {} +}} +""" + + +####################### +## Generator Classes ## +####################### +class PythonCSingleFunctionGenerator: + def __init__(self, fwd_api_contents, namespace): + self.fwd_api_contents = fwd_api_contents + self.namespace = namespace + + # Raw Contents + self.forward_api_name = "" + self.forward_args_str = "" + self.forward_returns_str = "" + + # Raw Data + self.forward_attrs_list = None #[ [attr_name, attr_type, default_value, orig_position], ...] + self.forward_inputs_list = None #[ [arg_name, arg_type, orig_position], ...] + self.forward_returns_list = None #[ [ret_name, ret_type, orig_position], ...] + + # Processed Data + self.forward_inputs_position_map = None #{ "name" : [type, fwd_position] } + self.forward_outputs_position_map = None #{ "name" : [type, fwd_position] } + + # Special Op Attributes + self.optional_inputs = [] #[name, ...] 
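The remaining fields of this class continue directly below. Before them, a sketch of how one of these per-API generators is driven end to end; the run() method further below calls the same steps in this order (the api entry is hypothetical, and the snippet assumes this module is importable):

```python
# Sketch: driving one PythonCSingleFunctionGenerator by hand.
fwd_api_contents = {
    "api": "matmul",                 # hypothetical op
    "args": "(Tensor x, Tensor y)",
    "output": "Tensor(out)",
    "backward": "matmul_grad",
}
gen = PythonCSingleFunctionGenerator(fwd_api_contents, namespace="")
gen.CollectIsForwardOnly()       # False here: a 'backward' entry exists
gen.CollectRawContents()         # api / args / output strings
gen.CollectOptionalInputs()      # none for this entry
gen.CollectForwardInOutAttr()    # parse into inputs/attrs/returns lists
gen.CollectForwardPositionMap()  # name -> [type, position]
gen.GeneratePythonCFunction()    # fills python_c_function_str / _reg_str
```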
+ self.is_forward_only = True + + # Generated Results + self.python_c_function_str = "" + self.python_c_function_reg_str = "" + + def CollectRawContents(self): + fwd_api_contents = self.fwd_api_contents + + assert 'api' in fwd_api_contents.keys( + ), "Unable to find \"api\" in fwd_api_contents keys" + assert 'args' in fwd_api_contents.keys( + ), "Unable to find \"args\" in fwd_api_contents keys" + assert 'output' in fwd_api_contents.keys( + ), "Unable to find \"output\" in fwd_api_contents keys" + + self.forward_api_name = fwd_api_contents['api'] + self.forward_args_str = fwd_api_contents['args'] + self.forward_returns_str = fwd_api_contents['output'] + + def CollectIsForwardOnly(self): + fwd_api_contents = self.fwd_api_contents + self.is_forward_only = False if 'backward' in fwd_api_contents.keys( + ) else True + + def CollectOptionalInputs(self): + fwd_api_contents = self.fwd_api_contents + if 'optional' in fwd_api_contents.keys(): + self.optional_inputs = ParseDispensable(fwd_api_contents[ + 'optional']) + + def CollectForwardInOutAttr(self): + forward_args_str = self.forward_args_str + forward_returns_str = self.forward_returns_str + + self.forward_inputs_list, self.forward_attrs_list, self.forward_returns_list = ParseYamlForward( + forward_args_str, forward_returns_str) + + def CollectForwardPositionMap(self): + forward_inputs_list = self.forward_inputs_list + forward_returns_list = self.forward_returns_list + + self.forward_inputs_position_map, self.forward_outputs_position_map = DetermineForwardPositionMap( + forward_inputs_list, forward_returns_list) + + def GeneratePythonCFunction(self): + namespace = self.namespace + forward_api_name = self.forward_api_name + forward_attrs_list = self.forward_attrs_list + forward_inputs_position_map = self.forward_inputs_position_map + forward_outputs_position_map = self.forward_outputs_position_map + optional_inputs = self.optional_inputs + is_forward_only = self.is_forward_only + + # Generate Python-C Tensors Parsing Logic + get_eager_tensor_str = "" + for name, (ttype, pos) in forward_inputs_position_map.items(): + is_optional = (name in optional_inputs) + if IsVectorTensorType(ttype): + get_eager_tensor_str += PARSE_PYTHON_C_TENSORS_TEMPLATE.format( + name, "GetTensorListFromArgs", forward_api_name, name, pos) + else: + if is_optional: + get_eager_tensor_str += PARSE_PYTHON_C_TENSORS_TEMPLATE.format( + name, "GetOptionalTensorFromArgs", forward_api_name, + name, pos) + else: + get_eager_tensor_str += PARSE_PYTHON_C_TENSORS_TEMPLATE.format( + name, "GetTensorFromArgs", forward_api_name, name, pos) + + parse_attributes_str = "" + + # Generate Python-C Attributes Parsing Logic + for name, atype, _, pos in forward_attrs_list: + parsing_function_name = FindParsingFunctionFromAttributeType(atype) + parse_attributes_str += PARSE_PYTHON_C_ARGS_TEMPLATE.format( + name, pos, atype, name, parsing_function_name, name, + forward_api_name, pos) + + # Generate Dygraph Function Call Logic + num_args = len(forward_inputs_position_map.keys()) + len( + forward_attrs_list) + dygraph_function_call_list = ["" for i in range(num_args)] + for name, (_, pos) in forward_inputs_position_map.items(): + dygraph_function_call_list[pos] = f"{name}" + for name, _, _, pos in forward_attrs_list: + dygraph_function_call_list[pos] = f"{name}" + dygraph_function_call_str = ",".join(dygraph_function_call_list) + + # Generate Python-C Function Definitions + if is_forward_only: + fwd_function_name = FUNCTION_NAME_TEMPLATE.format( + "paddle::experimental::", namespace, 
forward_api_name) + else: + fwd_function_name = FUNCTION_NAME_TEMPLATE.format( + "::", namespace, GetForwardFunctionName(forward_api_name)) + + # Generate Record Event for performance profiling + pythonc_record_event_str = RECORD_EVENT_TEMPLATE.format( + "pythonc_record_event", forward_api_name, "pybind_imperative_func") + self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( + forward_api_name, pythonc_record_event_str, forward_api_name, + get_eager_tensor_str, parse_attributes_str, fwd_function_name, + dygraph_function_call_str) + + # Generate Python-C Function Registration + self.python_c_function_reg_str = PYTHON_C_FUNCTION_REG_TEMPLATE.format( + forward_api_name, namespace, forward_api_name, forward_api_name) + + def run(self): + # Initialize is_forward_only + self.CollectIsForwardOnly() + + # Initialize forward_api_name, forward_args_str, forward_returns_str + self.CollectRawContents() + if SkipAPIGeneration(self.forward_api_name): return False + + # Initialize optional_inputs + self.CollectOptionalInputs() + + # Initialize forward_inputs_list, forward_returns_list, forward_attrs_list + self.CollectForwardInOutAttr() + logging.info( + f"Parsed Original Forward Inputs List: \n{self.forward_inputs_list}") + logging.info( + f"Parsed Original Forward Attrs List: \n{self.forward_attrs_list}") + logging.info( + f"Parsed Original Forward Returns List: \n{self.forward_returns_list}" + ) + + # Initialize forward_inputs_position_map, forward_outputs_position_map + self.CollectForwardPositionMap() + logging.info( + f"Generated Forward Input Position Map: {self.forward_inputs_position_map}" + ) + logging.info( + f"Generated Forward Output Position Map: {self.forward_outputs_position_map}" + ) + + # Code Generation + self.GeneratePythonCFunction() + logging.info( + f"Generated Python-C Function: {self.python_c_function_str}") + logging.info( + f"Generated Python-C Function Registration: {self.python_c_function_reg_str}" + ) + + return True + + +class PythonCYamlGenerator: + def __init__(self, path): + self.yaml_path = path + + self.namespace = "" + self.forward_api_list = [] + + # Generated Result + self.python_c_functions_reg_str = "" + self.python_c_functions_str = "" + + def ParseYamlContents(self): + yaml_path = self.yaml_path + self.forward_api_list = ReadFwdFile(yaml_path) + + def GeneratePythonCFunctions(self): + namespace = self.namespace + forward_api_list = self.forward_api_list + + for forward_api_content in forward_api_list: + f_generator = PythonCSingleFunctionGenerator(forward_api_content, + namespace) + status = f_generator.run() + + if status: + self.python_c_functions_reg_str += f_generator.python_c_function_reg_str + ",\n" + self.python_c_functions_str += f_generator.python_c_function_str + "\n" + + def InferNameSpace(self): + yaml_path = self.yaml_path + if "sparse" in yaml_path: + self.namespace = "sparse::" + + def AttachNamespace(self): + namespace = self.namespace + python_c_functions_str = self.python_c_functions_str + + if namespace != "": + if namespace.endswith("::"): + namespace = namespace[:-2] + self.python_c_functions_str = NAMESPACE_WRAPPER_TEMPLATE.format( + namespace, python_c_functions_str) + + def run(self): + # Infer namespace from yaml_path + self.InferNameSpace() + + # Read Yaml file + self.ParseYamlContents() + + # Code Generation + self.GeneratePythonCFunctions() + + # Wrap with namespace + self.AttachNamespace() + + +############################ +## Code Generation Helper ## +############################ +def ParseArguments():
parser = argparse.ArgumentParser( + description='Eager Code Generator Args Parser') + parser.add_argument('--api_yaml_path', type=str) + parser.add_argument('--output_path', type=str) + + args = parser.parse_args() + return args + + +def GenerateCoreOpsInfoMap(): + return CORE_OPS_INFO, CORE_OPS_INFO_REGISTRY def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str): @@ -213,33 +475,6 @@ def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str): python_c_function_reg_str += core_ops_infos_registry python_c_function_reg_str += "\n {nullptr,nullptr,0,nullptr}" - PYTHON_C_WRAPPER_TEMPLATE = """ -#pragma once - -#include "pybind11/detail/common.h" -#include "paddle/phi/api/all.h" -#include "paddle/phi/common/backend.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" -#include "paddle/fluid/pybind/op_function_common.h" -#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" -#include "paddle/fluid/pybind/exception.h" -#include - -namespace paddle {{ -namespace pybind {{ - -{} - -static PyMethodDef EagerFinalStateMethods[] = {{ - {} -}}; - -}} // namespace pybind -}} // namespace paddle - -""" python_c_str = PYTHON_C_WRAPPER_TEMPLATE.format(python_c_function_str, python_c_function_reg_str) @@ -253,63 +488,23 @@ def GeneratePythonCFile(filepath, python_c_str): if __name__ == "__main__": args = ParseArguments() + api_yaml_paths = args.api_yaml_path.split(",") - api_yaml_path = args.api_yaml_path - fwd_api_list = ReadFwdFile(api_yaml_path) - - python_c_function_list = [] - python_c_function_reg_list = [] - for fwd_api in fwd_api_list: - - # We only generate Ops with grad - is_forward_only = False - if 'backward' not in fwd_api.keys(): - is_forward_only = True - - assert 'api' in fwd_api.keys() - assert 'args' in fwd_api.keys() - assert 'output' in fwd_api.keys() - - fwd_api_name = fwd_api['api'] - fwd_args_str = fwd_api['args'] - fwd_returns_str = fwd_api['output'] - - if fwd_api_name in skipped_fwd_api_names: - continue - - # Parse Dispensable Inputs - optional_inputs = [] - if 'optional' in fwd_api.keys(): - optional_inputs = ParseDispensable(fwd_api['optional']) - - # Collect Original Forward Inputs/Outputs and then perform validation checks - forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward( - fwd_args_str, fwd_returns_str) - print("Parsed Original Forward Inputs List: ", forward_inputs_list) - print("Prased Original Forward Attrs List: ", forward_attrs_list) - print("Parsed Original Forward Returns List: ", forward_returns_list) - - forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( - forward_inputs_list, forward_returns_list) - print("Generated Forward Input Position Map: ", - forward_inputs_position_map) - print("Generated Forward Output Position Map: ", - forward_outputs_position_map) + generated_python_c_functions = "" + generated_python_c_registration = "" + for i in range(len(api_yaml_paths)): + api_yaml_path = api_yaml_paths[i] - python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction( - fwd_api_name, forward_inputs_position_map, forward_attrs_list, - forward_outputs_position_map, optional_inputs, is_forward_only) - python_c_function_list.append(python_c_function_str) - python_c_function_reg_list.append(python_c_function_reg_str) - print("Generated Python-C Function: ", python_c_function_str) + y_generator = PythonCYamlGenerator(api_yaml_path) + 
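run(), invoked just below, infers the namespace from the yaml path and wraps the generated functions via NAMESPACE_WRAPPER_TEMPLATE. A self-contained illustration of that wrapping (the generated body is a placeholder):

```python
NAMESPACE_WRAPPER_TEMPLATE = \
"""namespace {} {{
  {}
}}
"""

ns = "sparse::"          # inferred because "sparse" occurs in the yaml path
if ns.endswith("::"):
    ns = ns[:-2]         # AttachNamespace trims the trailing "::"
print(NAMESPACE_WRAPPER_TEMPLATE.format(ns, "/* generated Python-C functions */"))
# namespace sparse {
#   /* generated Python-C functions */
# }
```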
y_generator.run() - python_c_functions_str = "\n".join(python_c_function_list) - python_c_functions_reg_str = ",\n".join(python_c_function_reg_list) + generated_python_c_functions += y_generator.python_c_functions_str + "\n" + generated_python_c_registration += y_generator.python_c_functions_reg_str + "\n" - python_c_str = GeneratePythonCWrappers(python_c_functions_str, - python_c_functions_reg_str) + python_c_str = GeneratePythonCWrappers(generated_python_c_functions, + generated_python_c_registration) - print("Generated Python-C Codes: ", python_c_str) + logging.info(f"Generated Python-C Codes: \n{python_c_str}") output_path = args.output_path for path in [output_path]: diff --git a/paddle/fluid/eager/autograd_meta.h b/paddle/fluid/eager/autograd_meta.h index 9e1dc4f2c8c6b..dca76d3b8a0db 100644 --- a/paddle/fluid/eager/autograd_meta.h +++ b/paddle/fluid/eager/autograd_meta.h @@ -145,8 +145,7 @@ class AutogradMeta : public AbstractAutogradMeta { private: // TODO(jiabin): Should we use pointer instead of object? std::shared_ptr<paddle::experimental::Tensor> grad_{ - std::make_shared<paddle::experimental::Tensor>( - egr::Controller::Instance().GenerateUniqueName("@grad"))}; + std::make_shared<paddle::experimental::Tensor>()}; // GradNodeBase is base class of all grad op which is a // wrapper for grad op. This class will make grad op easy diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 934497d7d179c..ebd3333c52659 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -19,6 +19,8 @@ #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/grad_tensor_holder.h" #include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" @@ -27,6 +29,325 @@ namespace egr { +/* +* GeneralGrad is a helper class to implement custom grad operations between +* outputs and inputs. +* +* **/ +class GeneralGrad { + public: + static GeneralGrad& Instance() { return *general_grad_; } + + // Get inputs' / no_grad_vars' GradNodes and InputMeta Info + void GetTargetNodesInfo( + const std::vector<paddle::experimental::Tensor>& inputs, + bool is_no_grad_vars) { + std::string msg = is_no_grad_vars ?
"no_grad_vars" : "inputs"; + VLOG(6) << "Running in GetTargetNodesInfo."; + if (!inputs.empty()) { + VLOG(6) << msg << " are not empty."; + size_t num_inputs = inputs.size(); + for (size_t i = 0; i < num_inputs; i++) { + AutogradMeta* auto_grad_meta = + EagerUtils::unsafe_autograd_meta(inputs[i]); + auto target_node = auto_grad_meta->GetMutableGradNode().get(); + PADDLE_ENFORCE_NOT_NULL(target_node, + paddle::platform::errors::Fatal( + "There is no grad op for %s:[%d] or its " + "stop_gradient=True.", + msg, i)); + if (is_no_grad_vars) { + (no_grad_var_nodes_inputmeta_map)[target_node] = auto_grad_meta; + } else { // normal input + (input_target_nodes_inputmeta_map)[target_node] = auto_grad_meta; + } + } + } + } + + // Purify potential_startup_nodes, remove nodes that are the same as + // input_target_nodes + void PurifyPotentialStartUpNodes() { + VLOG(6) << "Running in PurifyPotentialStartUpNodes"; + if (input_target_nodes_inputmeta_map.empty()) return; + std::unordered_set<GradNodeBase*> potential_startup_nodes_to_be_erased; + for (auto startup_op : potential_startup_nodes) { + auto iter = input_target_nodes_inputmeta_map.find(startup_op); + if (iter != input_target_nodes_inputmeta_map.end()) { + potential_startup_nodes_to_be_erased.emplace(iter->first); + } + } + if (!potential_startup_nodes_to_be_erased.empty()) { + for (auto nodes : potential_startup_nodes_to_be_erased) { + potential_startup_nodes.erase(nodes); + } + } + } + + // Remove nodes that don't need to be + // stored in potential_stop_nodes / potential_startup_nodes + void UpdateGraphInfo() { + // Update potential_stop_nodes by depending_nodes, + // make sure the path from root to target_node is ok + std::unordered_set<GradNodeBase*> _startup_ops; + VLOG(6) << "Running in UpdateGraphInfo"; + std::queue<GradNodeBase*> queue; + for (auto& target_nodes_inputmeta_pair : input_target_nodes_inputmeta_map) { + queue.emplace(target_nodes_inputmeta_pair.first); + } + + while (!queue.empty()) { + auto* target_node = queue.front(); + queue.pop(); + if (!(depending_nodes)[target_node].empty()) { + auto preceding_nodes = (depending_nodes)[target_node]; + for (auto pre_nodes : preceding_nodes) { + queue.emplace(pre_nodes); + if (potential_stop_nodes.find(pre_nodes) != + potential_stop_nodes.end()) { + potential_stop_nodes.erase(pre_nodes); + } + } + } else { // startup_ops have no preceding nodes + VLOG(6) << "Emplace _startup_ops"; + _startup_ops.emplace(target_node); + } + } + // Purify potential_startup_nodes again, remove potential + // startup_nodes that cannot reach the input target nodes + if (!_startup_ops.empty()) { + std::unordered_set<GradNodeBase*> potential_startup_nodes_to_be_erased; + for (auto node : potential_startup_nodes) { + if (_startup_ops.count(node) == 0) { + VLOG(6) << "Set up potential_startup_nodes_to_be_erased"; + potential_startup_nodes_to_be_erased.emplace(node); + } + } + if (!potential_startup_nodes_to_be_erased.empty()) { + for (auto node : potential_startup_nodes_to_be_erased) { + VLOG(6) << "Erase nodes in potential_startup_nodes_to_be_erased"; + potential_startup_nodes.erase(node); + } + } + } + } + + // Get graph info between input target GradNodes and outputs, + // record depending_nodes, potential_stop_nodes, potential_startup_nodes + void GetGraphInfoBetweenTargets(const std::queue<GradNodeBase*>& init_queue) { + VLOG(6) << "Running in GetGraphInfoBetweenTargets"; + + // Calculate in_degree for each node + std::unordered_map<GradNodeBase*, int> node_in_degree_map; + + // Copy nodes + std::queue<GradNodeBase*> queue = init_queue; + std::unordered_set<GradNodeBase*> visited; + + // Visit each node exactly once in
any order + while (!queue.empty()) { + GradNodeBase* node = queue.front(); + queue.pop(); + + if (visited.count(node)) { + continue; + } + visited.insert(node); + + // Check whether the node is a target node; if it is not, + // all of its next_nodes will be marked in potential_stop_nodes + bool is_potential_stop_nodes = + input_target_nodes_inputmeta_map.count(node); + + // Find and append next nodes + const std::vector<std::vector<Edge>>& edges = node->GetEdges(); + for (const auto& edge_list : edges) { + for (const Edge& edge : edge_list) { + GradNodeBase* next_node = edge.GetMutableGradNode().get(); + + // Next node could be nullptr if it is leaf tensor with no + // AccumulationNode attached, + // or it could also originate from dispensable inputs + if (!next_node) continue; + + // if node not in input_target_nodes, + // all the next_nodes of current node will be inserted to + // potential_stop_node + if (is_potential_stop_nodes) { + potential_stop_nodes.emplace(next_node); + } + + // Update in_degree + if (!node_in_degree_map.count(next_node)) + node_in_degree_map[next_node] = 0; + node_in_degree_map[next_node]++; + + // Record depending relationship + (depending_nodes)[next_node].emplace(node); + queue.push(next_node); + } + } + } + // Update Graph Info, remove some nodes in + // potential_stop_nodes / potential_startup_nodes + UpdateGraphInfo(); + } + + void ModifyReadyQueue(std::queue<GradNodeBase*>* queue) { + std::queue<GradNodeBase*> tmp_queue; + for (auto nodes : potential_startup_nodes) { + tmp_queue.emplace(nodes); + } + tmp_queue.swap(*queue); + } + + // Set result for input target grad_var when potential_startup_nodes is empty + void SetResultForInputTargetVar( + const std::unordered_map<GradNodeBase*, + std::unique_ptr<GradTensorHolder>>& + node_input_buffers_dict) { + if (potential_startup_nodes.size() == 0) { + for (auto input_target_node : *GetInPutTargetNodesInputMetaMap()) { + // out rank_info of forward op + auto rank_info = input_target_node.second->OutRankInfo(); + auto iter = node_input_buffers_dict.find(input_target_node.first); + if (iter != node_input_buffers_dict.end()) { + auto& target_result = + (iter->second)->Buffers()[rank_info.first][rank_info.second]; + // save the target result + results_map[input_target_node.first] = target_result; + } + } + } + } + + // Set input target grad_var from node_input_buffer by inputmeta + void SetResultForInputTargetVar(GradTensorHolder input_buffers, + GradNodeBase* node) { + auto iter = GetInPutTargetNodesInputMetaMap()->find(node); + if (iter != GetInPutTargetNodesInputMetaMap()->end()) { + VLOG(6) << "Get target result by inputmeta"; + // out rank_info of forward op + auto rank_info = (iter->second)->OutRankInfo(); + // rank_info is a pair, first means slot_id, second means rank.
+ auto& target_result = + input_buffers.Buffers()[rank_info.first][rank_info.second]; + // save the target result + results_map[node] = target_result; + } + } + + std::vector<paddle::experimental::Tensor> GetResults( + const std::vector<paddle::experimental::Tensor>& inputs, + bool allow_unused, bool create_graph) { + VLOG(6) << "Running in GetResults"; + if (inputs.empty()) return {}; + + std::vector<paddle::experimental::Tensor> results; + results.reserve(inputs.size()); + + for (size_t i = 0; i < inputs.size(); ++i) { + auto& input = inputs[i]; + AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(input); + auto target_node = auto_grad_meta->GetMutableGradNode().get(); + + auto iter = results_map.find(target_node); + if (iter != results_map.end()) { + // set StopGradient = !create_graph + AutogradMeta* tensor_auto_grad_meta = + EagerUtils::autograd_meta(&(iter->second)); + tensor_auto_grad_meta->SetStopGradient(!create_graph); + results.emplace_back(iter->second); + } else { + PADDLE_ENFORCE_EQ(allow_unused, true, + paddle::platform::errors::InvalidArgument( + "The %d-th input does not appear in the backward " + "graph. Please check the input tensor or set " + "allow_unused=True to get None result.", + i)); + results.emplace_back(); + } + } + Clear(); + return results; + } + + void PreparedForGeneralGrad( + const std::vector<paddle::experimental::Tensor>& inputs, + const std::vector<paddle::experimental::Tensor>& no_grad_vars, + std::queue<GradNodeBase*>* queue, + const std::unordered_map<GradNodeBase*, + std::unique_ptr<GradTensorHolder>>& + node_input_buffers_dict) { + // Get no_grad_vars' GradNodes and InputMeta Info + GetTargetNodesInfo(no_grad_vars, true /* is_no_grad_vars */); + // Get inputs' GradNodes and InputMeta Info + GetTargetNodesInfo(inputs, false /* is_no_grad_vars */); + // Purify potential_startup_ops, remove those nodes that are the same as + // input_target_nodes + PurifyPotentialStartUpNodes(); + // Get graph info between input target grad nodes and outputs + // Record the depending_nodes and + // potential_stop_nodes, potential_startup_nodes + GetGraphInfoBetweenTargets(*queue); + // Reset queue. Queue is empty only when + // 1. input equals output; 2. input cannot reach output. + ModifyReadyQueue(queue); + // Set result for input target grad_var when queue is empty + if (queue->empty()) SetResultForInputTargetVar(node_input_buffers_dict); + } + + bool IsPotentialStopNodes(GradNodeBase* node) { + return potential_stop_nodes.count(node); + } + + std::unordered_map<GradNodeBase*, AutogradMeta*>* + GetNoGradVarNodesInputMetaMap() { + return &no_grad_var_nodes_inputmeta_map; + } + + std::unordered_map<GradNodeBase*, AutogradMeta*>* + GetInPutTargetNodesInputMetaMap() { + return &input_target_nodes_inputmeta_map; + } + + std::unordered_set<GradNodeBase*>* GetPotentialStopNodes() { + return &potential_stop_nodes; + } + + std::unordered_set<GradNodeBase*>* GetPotentialStartupNodes() { + return &potential_startup_nodes; + } + + void Clear() { + no_grad_var_nodes_inputmeta_map.clear(); + input_target_nodes_inputmeta_map.clear(); + potential_startup_nodes.clear(); + potential_stop_nodes.clear(); + depending_nodes.clear(); + results_map.clear(); + } + + private: + GeneralGrad() = default; + static GeneralGrad* general_grad_; + // no_grad_vars' GradNode and GradNode's InputMeta. + std::unordered_map<GradNodeBase*, AutogradMeta*> + no_grad_var_nodes_inputmeta_map; + // inputs' GradNode and GradNode's InputMeta. + std::unordered_map<GradNodeBase*, AutogradMeta*> + input_target_nodes_inputmeta_map; + // Record all the potential startup_nodes, will be changed. + std::unordered_set<GradNodeBase*> potential_startup_nodes; + // Record all the potential stop nodes, will be changed.
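To summarize the control flow of GetResults above in a compact form, a Python sketch (grad_node_of is a hypothetical accessor standing in for EagerUtils::unsafe_autograd_meta plus GetMutableGradNode; not the real API):

```python
def get_results(inputs, results_map, allow_unused, create_graph):
    """Sketch of GeneralGrad::GetResults; illustrative only."""
    results = []
    for i, inp in enumerate(inputs):
        node = grad_node_of(inp)  # hypothetical accessor
        if node in results_map:
            grad = results_map[node]
            # Keep the grad graph alive only for double-grad (create_graph).
            grad.stop_gradient = not create_graph
            results.append(grad)
        elif allow_unused:
            results.append(None)  # input unused by the backward graph
        else:
            raise ValueError(
                f"The {i}-th input does not appear in the backward graph; "
                "set allow_unused=True to get a None result.")
    return results
```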
+ std::unordered_set<GradNodeBase*> potential_stop_nodes; + std::unordered_map<GradNodeBase*, + std::unordered_set<GradNodeBase*> /* pre nodes */> + depending_nodes; + std::unordered_map<GradNodeBase*, paddle::experimental::Tensor> results_map; + DISABLE_COPY_AND_ASSIGN(GeneralGrad); +}; + std::unordered_map<GradNodeBase*, int> getInDegreeMap( const std::queue<GradNodeBase*>& init_queue) { // Calculate in_degree for each node @@ -74,14 +395,51 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap( return node_in_degree_map; } -void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors, - const std::vector<paddle::experimental::Tensor>& grad_tensors, - bool retain_graph) { +// Enforce GradNode has TensorWrappers as Input +void EnforceGradNodeHasInput(GradNodeBase* node) { + VLOG(6) << "Running in EnforceGradNodeHasInput"; + PADDLE_ENFORCE_NE( + node->IsTensorWrappersCleared(), true, + paddle::platform::errors::Fatal( + "The TensorWrappers of %s do not exist. This may be because:\n" + "You calculate backward twice for the same subgraph without " + "setting retain_graph=True. Please set retain_graph=True in the " + "first backward/grad call.\n", + node->name())); +} + +void DuplicateCheck(const std::vector<paddle::experimental::Tensor>& inputs, + bool is_input) { + std::unordered_set<AutogradMeta*> visited_ins; + std::string msg = is_input ? "inputs" : "outputs"; + for (auto in : inputs) { + AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(in); + PADDLE_ENFORCE_EQ( + visited_ins.count(auto_grad_meta), 0, + paddle::platform::errors::AlreadyExists( + "%s contains duplicate tensor %s, please check %s carefully.", msg, + in.name(), msg)); + visited_ins.insert(auto_grad_meta); + } +} + +GeneralGrad* GeneralGrad::general_grad_ = new GeneralGrad(); + +std::vector<paddle::experimental::Tensor> RunBackward( + const std::vector<paddle::experimental::Tensor>& tensors, // output + const std::vector<paddle::experimental::Tensor>& grad_tensors, + bool retain_graph, bool create_graph = false, + const std::vector<paddle::experimental::Tensor>& inputs = {}, + bool allow_unused = false, + const std::vector<paddle::experimental::Tensor>& no_grad_vars = {}) { VLOG(6) << "Start Backward"; // *Gradient Hook should happen at node-level // *Inplace version check should perform at node-level // *Cross-batch accumulation happens at forward pass + // GeneralGrad + bool is_general_grad = !inputs.empty(); + /* --- Initialization --- */ // 1. Init queue with starting nodes // 2.
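getInDegreeMap (context lines above) and the GeneralGrad bookkeeping both rely on the same breadth-first pass over the grad graph: count how many edges feed each reachable node. A minimal Python rendering of that idea (edges_of is a stand-in for GradNodeBase::GetEdges; a sketch, not the real API):

```python
from collections import deque

def get_in_degree_map(start_nodes, edges_of):
    """Sketch of getInDegreeMap: BFS, visiting each node's edges once."""
    in_degree, visited = {}, set()
    queue = deque(start_nodes)
    while queue:
        node = queue.popleft()
        if node in visited:
            continue
        visited.add(node)
        for nxt in edges_of(node):   # successors in the backward graph
            in_degree[nxt] = in_degree.get(nxt, 0) + 1
            queue.append(nxt)
    return in_degree
```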
Prepare initial input buffers @@ -112,7 +470,8 @@ void RunBackward(const std::vector& tensors, // Prepare GradTensorHolder if (!node_input_buffers_dict.count(grad_node)) { - VLOG(6) << "Create Value for grad input tensor " << i; + VLOG(6) << "Create Value for grad input tensor " << i + << " of grad node: " << grad_node->name(); node_input_buffers_dict[grad_node] = std::make_unique(grad_node->InputMeta()); } @@ -123,11 +482,20 @@ void RunBackward(const std::vector& tensors, paddle::platform::errors::Fatal( "Detected size mismatch between tensors and grad_tensors" "grad_tensors should either have " - "size = 0 or same size as tensors")); + "size = 0 or same size as tensors.")); // Feed given tensor if it's provided VLOG(6) << "Fill grad input tensor " << i << "with give grad tensor"; - node_input_buffers_dict[grad_node]->add( - input_info.first, input_info.second, grad_tensors[i]); + + if (grad_tensors[i].is_initialized()) { + // Deep copy + paddle::experimental::Tensor tmp_tensor; + tmp_tensor.copy_(grad_tensors[i], grad_tensors[i].inner_place(), true); + node_input_buffers_dict[grad_node]->add(input_info.first, + input_info.second, tmp_tensor); + } else { + node_input_buffers_dict[grad_node]->add( + input_info.first, input_info.second, grad_tensors[i]); + } } else { VLOG(6) << "Fill grad input tensor " << i << " with 1.0"; @@ -140,8 +508,11 @@ void RunBackward(const std::vector& tensors, input_info.first, input_info.second, tensor, true /*fill_one=true*/); } - // Prepare queue + // Prepare queue, potential startup_nodes queue.push(grad_node); + if (is_general_grad) { + GeneralGrad::Instance().GetPotentialStartupNodes()->emplace(grad_node); + } } VLOG(6) << "Update In degree Map for backward"; @@ -149,37 +520,88 @@ void RunBackward(const std::vector& tensors, std::unordered_map node_in_degree_map = getInDegreeMap(queue); + if (is_general_grad) { + // Prepare several vital preprocess for GeneralGrad + GeneralGrad::Instance().PreparedForGeneralGrad(inputs, no_grad_vars, &queue, + node_input_buffers_dict); + } + + VLOG(6) << " startup_ops' size is :" << queue.size(); + /* --- Topological Visit --- */ // 1. Pop queue // 2. Run node + // |- Check and capture target result // |- node(grads) // |- Prepare for next node // 3. 
Update queue VLOG(6) << "Run Backward"; while (!queue.empty()) { GradNodeBase* node = queue.front(); + VLOG(6) << "Running GradNode:" << node->name(); + + paddle::platform::RecordEvent node_record_event( + std::string(typeid(*node).name()) + " grad_node", + paddle::platform::TracerEventType::Operator, 1); + + if (queue.size() > 1 && node_in_degree_map[node] != 0) { + queue.pop(); + continue; + } queue.pop(); // Run node: This is where Hook happens PADDLE_ENFORCE( node_input_buffers_dict.count(node), paddle::platform::errors::Fatal( - "Unable to find next node in the InputBuufer" - "Trying to run Node without configuring its GradTensorHolder")); + "Unable to find next node in the GradTensorHolder \n" + "Trying to run Node without configuring its GradTensorHolder.")); std::unique_ptr node_input_buffer = std::move(node_input_buffers_dict[node]); - VLOG(6) << "Run Backward Kernel with input_buffer"; + // Set input target grad_var from node_input_buffer by inputmeta + if (!inputs.empty() && is_general_grad) { + GeneralGrad::Instance().SetResultForInputTargetVar(*node_input_buffer, + node); + } + + // no_grad_vars + if (!no_grad_vars.empty() && is_general_grad) { + auto iter = + GeneralGrad::Instance().GetNoGradVarNodesInputMetaMap()->find(node); + if (iter != + GeneralGrad::Instance().GetNoGradVarNodesInputMetaMap()->end()) { + VLOG(6) << "Change the input buffer[slot][rank] by Zeros"; + auto rank_info = (iter->second)->OutRankInfo(); + node_input_buffer->SetBufferSlotRankZeros(rank_info.first, + rank_info.second); + } + } + + VLOG(6) << "Running GradNode:" << node->name(); + + // Check input + EnforceGradNodeHasInput(node); + + VLOG(6) << "Run Backward Kernel with GradTensorHolder."; // Run Pre Backward Node and get outputs std::vector> grad_output_tensors = - (*node)(node_input_buffer->Buffers()); + (*node)(node_input_buffer->Buffers(), create_graph); + + // retain_grad or not + if (!retain_graph) { + VLOG(6) + << "retain_graph is false, need to clear the TensorWrapper of nodes."; + node->ClearTensorWrappers(); + } + // TODO(jiabin): Should we erase it or find a more efficient way. 
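The ClearTensorWrappers/retain_graph interplay above is what the "calculate backward twice" error raised by EnforceGradNodeHasInput refers to. A hedged Python-level illustration of the intended semantics, assuming the usual paddle dygraph API (not part of this diff):

```python
import paddle

x = paddle.to_tensor([1.0, 2.0], stop_gradient=False)
y = (x * x).sum()

# First backward keeps the graph alive so the TensorWrappers are not cleared.
y.backward(retain_graph=True)

# A second backward over the same subgraph is only legal because of
# retain_graph=True above; otherwise the cleared TensorWrappers would
# trigger the error message quoted earlier.
y.backward()
```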
+ node_input_buffers_dict.erase(node); // Prepare GradTensorHolder for next node const std::vector>& edges = node->GetEdges(); - PADDLE_ENFORCE(edges.size() == grad_output_tensors.size() || edges.empty(), paddle::platform::errors::Fatal( "Number of edges should be either empty ( for leaf node " @@ -190,6 +612,7 @@ void RunBackward(const std::vector& tensors, for (size_t i = 0; i < edges.size(); i++) { for (size_t j = 0; j < edges[i].size(); j++) { const Edge& edge = edges[i][j]; + auto edge_rank = edge.GetEdgeRankInfo(); // Since we make edge has as same rank as bwd outputs, we indexing them // with @@ -203,6 +626,7 @@ void RunBackward(const std::vector& tensors, grad_output_tensors[i].empty()) { continue; } + PADDLE_ENFORCE_LT( j, grad_output_tensors[i].size(), paddle::platform::errors::Fatal( @@ -215,9 +639,8 @@ void RunBackward(const std::vector& tensors, if ((!grad_output_tensor.defined() || !grad_output_tensor.initialized())) { - VLOG(6) - << "We get grad_output_tensor with slot: " << i << ", rank: " << j - << " as uninitialized or undefined in both tensor and variable"; + VLOG(6) << "We get grad_output_tensor with slot: " << i + << ", rank: " << j << " as uninitialized or undefined tensor"; } VLOG(6) << "Get Edge and grad_output_tensor with slot: " << i << ", rank: " << j @@ -228,6 +651,8 @@ void RunBackward(const std::vector& tensors, const auto& input_meta = next_node->InputMeta(); auto grad_tensor_holder = std::make_unique(input_meta); + VLOG(6) << "Construct GradTensorHolder for grad node: " + << next_node->name(); node_input_buffers_dict[next_node] = std::move(grad_tensor_holder); } VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first @@ -237,16 +662,54 @@ void RunBackward(const std::vector& tensors, // Update queue node_in_degree_map[next_node]--; - PADDLE_ENFORCE(node_in_degree_map[next_node] >= 0, - paddle::platform::errors::Fatal( - "Detected in-degree value smaller than zero." - "Node's in-degree cannot be negative")); - if (node_in_degree_map[next_node] == 0) { - queue.emplace(std::move(next_node)); + + PADDLE_ENFORCE( + node_in_degree_map[next_node] >= 0, + paddle::platform::errors::Fatal( + "Detected in-degree value smaller than zero. 
For Node: %s" + "Node's in-degree cannot be negative.", + next_node->name())); + + if (is_general_grad) { + bool is_potential_stop_node = + GeneralGrad::Instance().GetPotentialStopNodes()->count(next_node); + if (node_in_degree_map[next_node] == 0 && !is_potential_stop_node) { + queue.emplace(std::move(next_node)); + } + } else { + if (node_in_degree_map[next_node] == 0) { + queue.emplace(std::move(next_node)); + } + } + } + } + } + if (!is_general_grad) return {}; + return GeneralGrad::Instance().GetResults(inputs, allow_unused, create_graph); +} + +void Backward( + const std::vector<paddle::experimental::Tensor>& tensors, // outputs + const std::vector<paddle::experimental::Tensor>& grad_tensors, + bool retain_graph) { + VLOG(6) << "Run in Backward"; + paddle::platform::RecordEvent backward_record_event( + "backward", paddle::platform::TracerEventType::Operator, 1); + RunBackward(tensors, grad_tensors, retain_graph); } +std::vector<paddle::experimental::Tensor> Grad( + const std::vector<paddle::experimental::Tensor>& tensors, // outputs + const std::vector<paddle::experimental::Tensor>& inputs, + const std::vector<paddle::experimental::Tensor>& grad_tensors, + bool retain_graph, bool create_graph, bool only_inputs, bool allow_unused, + const std::vector<paddle::experimental::Tensor>& no_grad_vars) { + VLOG(6) << "Run in Grad"; + + DuplicateCheck(inputs, true /* is_input */); + DuplicateCheck(tensors, false /* is_input */); + + return RunBackward(tensors, grad_tensors, retain_graph, create_graph, inputs, + allow_unused, no_grad_vars); +} } // namespace egr diff --git a/paddle/fluid/eager/backward.h b/paddle/fluid/eager/backward.h index 2856d9fb87f34..bebe664838e6c 100644 --- a/paddle/fluid/eager/backward.h +++ b/paddle/fluid/eager/backward.h @@ -19,12 +19,20 @@ namespace egr { -// run_backward(): +// Backward(): // tensors corresponds to those lived in the backward graph // each grad_tensors[i] keeps the value for its corresponding tensors[i] -void RunBackward(const std::vector<paddle::experimental::Tensor> &tensors, - const std::vector<paddle::experimental::Tensor> &grad_tensors, - bool retain_graph = false); +void Backward(const std::vector<paddle::experimental::Tensor>& tensors, + const std::vector<paddle::experimental::Tensor>& grad_tensors, + bool retain_graph = false); + +std::vector<paddle::experimental::Tensor> Grad( + const std::vector<paddle::experimental::Tensor>& tensors, + const std::vector<paddle::experimental::Tensor>& inputs, + const std::vector<paddle::experimental::Tensor>& grad_tensors = {}, + bool retain_graph = false, bool create_graph = false, + bool only_inputs = false, bool allow_unused = false, + const std::vector<paddle::experimental::Tensor>& no_grad_vars = {}); // Reserved for gradient() diff --git a/paddle/fluid/eager/custom_operator/CMakeLists.txt b/paddle/fluid/eager/custom_operator/CMakeLists.txt new file mode 100644 index 0000000000000..ccc9a03a55660 --- /dev/null +++ b/paddle/fluid/eager/custom_operator/CMakeLists.txt @@ -0,0 +1 @@ +cc_library(custom_operator_node SRCS custom_operator_node.cc DEPS phi_tensor phi_api grad_node_info custom_operator op_meta_info) diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc new file mode 100644 index 0000000000000..72af1cc4b0686 --- /dev/null +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/eager/custom_operator/custom_operator_node.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/framework/op_meta_info_helper.h" +#include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace egr { +std::vector> RunCustomOpNode:: +operator()(const std::vector>& grads, + bool create_graph) { + paddle::CustomOpKernelContext ctx; + auto grad_inputs_name = paddle::framework::OpMetaInfoHelper::GetInputs( + egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]); + auto grad_outputs_names = paddle::framework::OpMetaInfoHelper::GetOutputs( + egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]); + auto map = egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_); + auto kernel_map = egr::Controller::Instance().GetOpMetaInfoMap(); + + std::vector> tmp_ins( + grad_inputs_name.size()); + VLOG(7) << " Prepare Backward inputs of grads with size: " << grads.size() + << ", whose grad_inputs_name size is: " << grad_inputs_name.size(); + for (size_t i = 0; i < grads.size(); i++) { + if (map[1].find(i) != map[1].end()) { + VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[1][i]; + tmp_ins[map[1][i]] = grads[i]; + } + } + + for (auto it : fwd_outs) { + VLOG(7) << "Insert fwd_outs to grad_inputs: " << it.first; + tmp_ins[it.first] = RunCustomOpNode::Recover(&(it.second)); + } + + for (auto it : fwd_ins) { + VLOG(7) << "Insert fwd_ins to grad_inputs: " << it.first; + tmp_ins[it.first] = RunCustomOpNode::Recover(&(it.second)); + } + + VLOG(6) << "Prepare Grad inputs"; + for (const auto& in : tmp_ins) { + ctx.EmplaceBackInputs(in); + } + VLOG(6) << "Prepare Grad attrs"; + ctx.EmplaceBackAttrs(attrs_); + std::vector> outs( + GetEdges().size()); + std::vector> tmp_outs( + grad_outputs_names.size()); + VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size(); + for (size_t i = 0; i < GetEdges().size(); i++) { + if (map[0].find(i) != map[0].end()) { + VLOG(7) << "Insert grad outputs: " << i + << " with size: " << GetEdges()[i].size() + << " to tmp_outputs: " << map[0][i]; + for (size_t j = 0; j < GetEdges()[i].size(); j++) { + outs[i].emplace_back(/* init it incase of copy nullptr of shared_ptr */ + std::make_shared( + phi::DataType::UNDEFINED), + egr::Controller::Instance().GenerateUniqueName( + "custom_tmp_grad")); + } + tmp_outs[map[0][i]] = outs[i]; + } + } + for (size_t i = 0; i < tmp_outs.size(); i++) { + VLOG(7) << "Prepare grad outputs size: " << tmp_outs[i].size(); + ctx.EmplaceBackOutputs(tmp_outs[i]); + } + VLOG(7) << "Run Kernel of Grad Custom Op: " << op_type_; + + (*paddle::framework::OpMetaInfoHelper::GetKernelFn( + kernel_map.at(op_type_)[1]))(&ctx); + return outs; +} +} // namespace egr diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.h b/paddle/fluid/eager/custom_operator/custom_operator_node.h new file mode 100644 index 0000000000000..6ece2658575c7 --- /dev/null +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.h @@ -0,0 +1,83 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" +#include "paddle/fluid/eager/tensor_wrapper.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/utils/any.h" + +namespace egr { +class RunCustomOpNode : public GradNodeBase { + public: + // Constructor: configure fwd input tensors to grad node + explicit RunCustomOpNode(size_t bwd_in_slot_num, size_t bwd_out_slot_num, + const std::string& op_type) + : GradNodeBase(bwd_in_slot_num, bwd_out_slot_num), op_type_(op_type) { + VLOG(6) << "Construct RunCustomOpNode for op: " << op_type; + } + + ~RunCustomOpNode() override { + VLOG(6) << "Destruct RunCustomOpNode for op: " << op_type_; + } + + // Functor: perform backward computations + virtual std::vector> operator()( + const std::vector>& grads, + bool create_graph) override; + + std::string name() { + return paddle::string::Sprintf("RunCustomOpNode: %s_grad", op_type_); + } + + static std::vector ConstructTensorWrapper( + const std::vector& fwd_var) { + std::vector res; + for (auto const& var : fwd_var) { + res.emplace_back(var); + } + return res; + } + + static std::vector Recover( + std::vector* fwd_var) { + std::vector res; + for (size_t i = 0; i < fwd_var->size(); i++) { + res.emplace_back(fwd_var->at(i).recover(nullptr)); + } + return res; + } + + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } + + void SetAttrs(const std::vector& attr) { attrs_ = attr; } + + public: + std::unordered_map> fwd_outs; + std::unordered_map> fwd_ins; + std::unordered_map grads2grad_in_map; + + private: + std::vector attrs_; + std::string op_type_{""}; +}; + +} // namespace egr diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 427be83c3bbee..891ad4d8983b5 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -15,17 +15,23 @@ #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/utils.h" + #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/var_type.h" + #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" #include "glog/logging.h" /** - * Implementation of GradNodeBase, Edge and InputBuffer. + * Implementation of GradNodeBase, Edge and GradTensorHolder. 
**/ namespace egr { @@ -33,7 +39,6 @@ GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) { VLOG(6) << "Construct GradNodeBase"; bwd_in_meta_.resize(bwd_in_slot_num); bwd_out_meta_.resize(bwd_out_slot_num); - // adj_edges has the same num as backward outputs adj_edges_.resize(bwd_out_slot_num); } @@ -44,24 +49,20 @@ void GradNodeBase::AddEdges(std::vector* metas, size_t slot_id) { "Given slot id is out of range of adj_edges outter size, " "adj_edges is designed to has the same size of grad " "inputs's slot num.")); - for (const auto& meta : *metas) { + + for (size_t i = 0; i < metas->size(); i++) { + const auto& meta = (*metas)[i]; // adj_edges has as same rank as fwd inputs, and record it's output rank // from // its pre-ops if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); - if (node && node.get()) { - VLOG(6) << "Add Edges for slot: " << slot_id - << " which is: " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); - } else { + if (!node || !node.get()) { meta->SetGradNode(std::make_shared(meta)); - VLOG(6) << "Add Edges for slot: " << slot_id - << " which is: " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); } + + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); } } } @@ -73,130 +74,205 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { "Given slot id is out of range of adj_edges outter size, " "adj_edges is designed to has the same size of grad " "inputs's slot num.")); + if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); - if (node && node.get()) { - VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " - << this->name() << " to " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); - } else { + if (!node || !node.get()) { meta->SetGradNode(std::make_shared(meta)); - VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " - << this->name() << " to " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); } + VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " + << this->name() << " to " << meta->GetMutableGradNode()->name(); + + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); } } -const std::vector& GradNodeBase::InputMeta() const { +const std::vector>& GradNodeBase::InputMeta() const { return bwd_in_meta_; } -const std::vector& GradNodeBase::OutputMeta() const { +const std::vector>& GradNodeBase::OutputMeta() const { return bwd_out_meta_; } -void GradNodeBase::SetGradInMeta(std::vector* fwd_out, +void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out, size_t slot_rank) { - size_t slot_size = fwd_out->size(); + auto* fwd_out_meta = egr::EagerUtils::nullable_autograd_meta(fwd_out); PADDLE_ENFORCE_LE( slot_rank, (bwd_in_meta_.size() - 1), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_in_meta_ size, since " "bwd_in_meta_ is designed to hold as same num as backward " "inputs.")); - auto& meta = bwd_in_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_in_meta should only be init once, addition " - "initialization for it is forbidden. 
If you got this " - "error, it indicates bugs in framework.")); - // Init stop gradient vector before use to avoid push back - meta.Init(slot_size); - for (size_t i = 0; i < slot_size; i++) { - PADDLE_ENFORCE_NOT_NULL((*fwd_out)[i], - paddle::platform::errors::PreconditionNotMet( - "Bwd_in_meta should only be called while " - "autograd_meta is not null. If you got this " - "error, it indicates bugs in framework.")); - if ((*fwd_out)[i]->StopGradient()) { - // Set Stop Gradient only when its true or non-initialized autograd_meta, - // since all default value is false. - meta.SetStopGradient(i, (*fwd_out)[i]->StopGradient()); + auto& metas = bwd_in_meta_.at(slot_rank); + if (metas.size() == 0) { + metas.resize(1); + } + + auto& meta = metas[0]; + meta.SetStopGradient(fwd_out_meta->StopGradient()); + + // Record TensorMeta + if (phi::DenseTensor::classof(fwd_out.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_out.impl().get()); + + PADDLE_ENFORCE_NE( + dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal( + "Attempting to copy DenseTensorMeta with phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); + + if (paddle::framework::IsComplexType( + paddle::framework::TransToProtoVarType(dense_tensor->type()))) { + need_complex_to_real_ = true; } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " + "non-DenseTensor argument."; } } -void GradNodeBase::SetGradInMeta(AutogradMeta* fwd_out, size_t slot_rank) { +void GradNodeBase::SetGradInMeta( + const std::vector& fwd_out, + size_t slot_rank) { + size_t slot_size = fwd_out.size(); PADDLE_ENFORCE_LE( slot_rank, (bwd_in_meta_.size() - 1), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_in_meta_ size, since " "bwd_in_meta_ is designed to hold as same num as backward " "inputs.")); - auto& meta = bwd_in_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_in_meta should only be init once, Additional " - "initialization for it is forbidden. If you got this " - "error, it indicates bugs in framework.")); + auto& metas = bwd_in_meta_.at(slot_rank); // Init stop gradient vector before use to avoid push back - VLOG(7) << "Init bwd_in_meta_ with slot rank: " << slot_rank; - meta.Init(1); - meta.SetStopGradient(0, fwd_out->StopGradient()); + if (metas.size() < slot_size) { + VLOG(7) << "Init bwd_in_meta_ with slot rank: " << slot_rank; + metas.resize(slot_size); + } + for (size_t i = 0; i < slot_size; i++) { + auto& meta = metas[i]; + const auto& fwd_out_tensor = fwd_out[i]; + auto* fwd_out_meta = + egr::EagerUtils::nullable_autograd_meta(fwd_out_tensor); + PADDLE_ENFORCE_NOT_NULL(fwd_out_meta, + paddle::platform::errors::PreconditionNotMet( + "Bwd_in_meta should only be called while " + "autograd_meta is not null. If you got this " + "error, it indicates bugs in framework.")); + if (fwd_out_meta->StopGradient()) { + // Set Stop Gradient only when its true or non-initialized autograd_meta, + // since all default value is false. 
+ meta.SetStopGradient(fwd_out_meta->StopGradient()); + } + + // Record TensorMeta + if (phi::DenseTensor::classof(fwd_out_tensor.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_out_tensor.impl().get()); + + PADDLE_ENFORCE_NE( + dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal("Attempting to copy DenseTensorMeta " + "with phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); + if (paddle::framework::IsComplexType( + paddle::framework::TransToProtoVarType(dense_tensor->type()))) { + need_complex_to_real_ = true; + } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta " + "with non-DenseTensor argument."; + } + } } -void GradNodeBase::SetGradOutMeta(std::vector* fwd_in, +void GradNodeBase::SetGradOutMeta(const paddle::experimental::Tensor& fwd_in, size_t slot_rank) { - size_t slot_size = fwd_in->size(); + auto* fwd_in_meta = egr::EagerUtils::nullable_autograd_meta(fwd_in); PADDLE_ENFORCE_LE( - slot_rank, (bwd_out_meta_.size() - 1), + (slot_rank + 1), bwd_out_meta_.size(), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_out_meta_ size, " "since bwd_out_meta_ is designed to hold as same num as " "backward outputs.")); - auto& meta = bwd_out_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_out_meta should only be init once. Additional " - "initialization for it is forbidden. If you got this " - "error, it indicates bugs in framework.")); + auto& metas = bwd_out_meta_.at(slot_rank); // Init stop gradient vector before use to avoid push back - meta.Init(slot_size); - for (size_t i = 0; i < slot_size; i++) { - if (!(*fwd_in)[i]) { - meta.SetStopGradient(i, true); - continue; - } - if ((*fwd_in)[i]->StopGradient()) { - // Set Stop Gradient only when its true or non-initialized autograd_meta, - // since all default value is false. - meta.SetStopGradient(i, (*fwd_in)[i]->StopGradient()); + if (metas.size() == 0) { + metas.resize(1); + } + auto& meta = metas[0]; + if (fwd_in_meta) { + meta.SetStopGradient(fwd_in_meta->StopGradient()); + } else { + meta.SetStopGradient(true); + } + + // Record TensorMeta + if (fwd_in.impl() && fwd_in.impl().get()) { + if (phi::DenseTensor::classof(fwd_in.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_in.impl().get()); + PADDLE_ENFORCE_NE( + dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal("Attempting to copy DenseTensorMeta " + "with phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " + "non-DenseTensor argument."; } } -void GradNodeBase::SetGradOutMeta(AutogradMeta* fwd_in, size_t slot_rank) { +void GradNodeBase::SetGradOutMeta( + const std::vector& fwd_in, size_t slot_rank) { + size_t slot_size = fwd_in.size(); PADDLE_ENFORCE_LE( - (slot_rank + 1), bwd_out_meta_.size(), + slot_rank, (bwd_out_meta_.size() - 1), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_out_meta_ size, " "since bwd_out_meta_ is designed to hold as same num as " "backward outputs.")); - auto& meta = bwd_out_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_out_meta should only be init once. 
Additional " - "initialization for it is forbidden. If you got this " - "error, it indicates bugs in framework.")); + auto& metas = bwd_out_meta_.at(slot_rank); // Init stop gradient vector before use to avoid push back - meta.Init(1); - if (fwd_in) { - meta.SetStopGradient(0, fwd_in->StopGradient()); - } else { - meta.SetStopGradient(0, true); + if (metas.size() < slot_size) { + metas.resize(slot_size); + } + for (size_t i = 0; i < slot_size; i++) { + const auto& fwd_in_tensor = fwd_in[i]; + auto& meta = metas[i]; + auto* fwd_in_meta = egr::EagerUtils::nullable_autograd_meta(fwd_in_tensor); + if (fwd_in_meta) { + // Set Stop Gradient only when its true or non-initialized autograd_meta, + // since all default value is false. + meta.SetStopGradient(fwd_in_meta->StopGradient()); + } + + // Record TensorMeta + if (fwd_in_tensor.impl() && fwd_in_tensor.impl().get()) { + if (phi::DenseTensor::classof(fwd_in_tensor.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_in_tensor.impl().get()); + + PADDLE_ENFORCE_NE(dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal( + "Attempting to copy DenseTensorMeta with " + "phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); + } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta " + "with non-DenseTensor argument."; + } } } @@ -207,12 +283,8 @@ void GradNodeBase::SetDefaultGradInOutMeta() { "meta setter, other size of inputs and outputs should " "create with Setter and Getters")); // Default stop_gradient is false and slot id is 0, slot size is 1; - bwd_out_meta_[0].Init(1); - bwd_in_meta_[0].Init(1); -} - -const std::vector>& GradNodeBase::GetEdges() const { - return adj_edges_; + bwd_out_meta_[0].resize(1); + bwd_in_meta_[0].resize(1); } int64_t GradNodeBase::RegisterGradientHook( @@ -222,6 +294,10 @@ int64_t GradNodeBase::RegisterGradientHook( return next_hook_id_++; } +const std::vector>& GradNodeBase::GetEdges() const { + return adj_edges_; +} + std::vector> GradNodeBase::ApplyGradientHooks( const std::vector>& tensors) { @@ -270,4 +346,45 @@ GradNodeBase::ApplyGradientHooks( return outs; } +void GradNodeBase::HandleComplexGradToRealGrad( + std::vector>* out_grads) { + for (size_t slot_id = 0; slot_id < out_grads->size(); slot_id++) { + const std::vector& slot_out_grads = + (*out_grads)[slot_id]; + for (size_t rank_id = 0; rank_id < slot_out_grads.size(); rank_id++) { + const GradSlotMeta& slot_meta = bwd_out_meta_[slot_id][rank_id]; + + PADDLE_ENFORCE( + slot_meta.HasTensorMeta() > 0, + paddle::platform::errors::Fatal( + "We require TensorMeta in GradInputMeta() to obtain forward data " + "types." 
+ "However, no TensorMeta is detected in bwd_out_meta_.")); + + auto fwd_data_type = paddle::framework::TransToProtoVarType( + slot_meta.GetTensorMeta().dtype); + const paddle::experimental::Tensor& grad = slot_out_grads[rank_id]; + + if (paddle::framework::IsComplexType(fwd_data_type)) continue; + + // Only Handle Complex To Real for DenseTensor for now + if (phi::DenseTensor::classof(grad.impl().get())) { + phi::DenseTensor* grad_dense_tensor = + static_cast(grad.impl().get()); + + auto curr_data_type = + paddle::framework::TransToProtoVarType(grad_dense_tensor->type()); + if (!paddle::framework::IsComplexType(curr_data_type)) continue; + + // Convert Complex GradOut to Real + auto out = std::make_shared(); + paddle::framework::TransComplexToReal(fwd_data_type, curr_data_type, + *grad_dense_tensor, out.get()); + + (*out_grads)[slot_id][rank_id].set_impl(out); + } + } + } +} + } // namespace egr diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 16513f05e0777..4b21a193ee021 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -57,21 +57,28 @@ class AutogradMeta; class GradSlotMeta { public: GradSlotMeta() = default; - void Init(size_t size) { - size_ = static_cast(size); - stop_gradient_.resize(size, false); + bool IsStopGradient() const { return stop_gradient_; } + void SetStopGradient(bool stop_gradient = true) { + stop_gradient_ = stop_gradient; } - bool IsInitialized() const { return size_ != -1; } - bool IsStopGradient(size_t rank) const { return stop_gradient_[rank]; } - int Size() const { return size_; } - void SetStopGradient(size_t rank, bool stop_gradient = true) { - stop_gradient_.at(rank) = stop_gradient; + void SetTensorMeta(const phi::DenseTensorMeta& meta) { + meta_ = std::make_shared(meta); + } + bool HasTensorMeta() const { return meta_ && meta_.get(); } + const phi::DenseTensorMeta& GetTensorMeta() const { + if (!HasTensorMeta()) { + PADDLE_THROW(paddle::platform::errors::Fatal( + "meta_ of GradSlotMeta has not been initialized yet." + "You're expected to check Edge availability with HasTensorMeta()" + "before calling GetTensorMeta() interface.")); + } + return *meta_.get(); } private: - int size_{-1}; - std::vector stop_gradient_{false}; + bool stop_gradient_{false}; + std::shared_ptr meta_ = nullptr; }; class GradNodeBase { @@ -95,8 +102,12 @@ class GradNodeBase { * is better choice to fit this format. * **/ virtual std::vector> operator()( - const std::vector>& grads) = 0; + const std::vector>& grads, + bool create_graph = false) = 0; + + virtual void ClearTensorWrappers() = 0; + virtual bool IsTensorWrappersCleared() = 0; /** * AddEdges is designed to set input tensors' backward Node as current * node's Edges. @@ -108,25 +119,30 @@ class GradNodeBase { void AddEdges(std::vector* metas, size_t slot_id); void AddEdges(AutogradMeta* meta, size_t slot_id); - /** - * GetEdges is designed to get all edges of current node**/ - const std::vector>& GetEdges() const; + // adj_edges were moved inside OutputMeta(), so no available direct access + // from GradNodeBase. 
+ // To access Edges, get GradSlotMeta by calling OutputMeta(), then use + // slot_meta.GetEdge() /** * Get Input Meta of current Grad node**/ - const std::vector& InputMeta() const; + const std::vector>& InputMeta() const; /** * Get Output Meta of current Grad node**/ - const std::vector& OutputMeta() const; + const std::vector>& OutputMeta() const; /** * Set bwd ins and outs info with forward vars * **/ - void SetGradInMeta(std::vector* fwd_out, size_t slot_rank); - void SetGradInMeta(AutogradMeta* fwd_out, size_t slot_rank); + void SetGradInMeta(const std::vector& fwd_out, + size_t slot_rank); + void SetGradInMeta(const paddle::experimental::Tensor& fwd_out, + size_t slot_rank); - void SetGradOutMeta(std::vector* fwd_in, size_t slot_rank); - void SetGradOutMeta(AutogradMeta* fwd_in, size_t slot_rank); + void SetGradOutMeta(const std::vector& fwd_in, + size_t slot_rank); + void SetGradOutMeta(const paddle::experimental::Tensor& fwd_in, + size_t slot_rank); /** * Default setters for Grad in/out meta this should be used for same special @@ -158,11 +174,21 @@ class GradNodeBase { std::vector> ApplyGradientHooks( const std::vector>& tensors); + /** + * Handle Complex - Real Type Promotion + * **/ + void HandleComplexGradToRealGrad( + std::vector>* out_grads); + bool NeedComplexToRealConversion() { return need_complex_to_real_; } + virtual std::string name() { return "GradNodeBase"; } - private: - // TODO(jiabin): Use SmallVector instead after merge PR from develop + /** + * GetEdges is designed to get all edges of current node**/ + const std::vector>& GetEdges() const; + private: + // TODO(zhanlve): Merge adj_edges_ into GradOutMeta // Edges recorded the backward related node info, which indicate all edges // linked // by this Grad Node. @@ -170,10 +196,10 @@ class GradNodeBase { std::vector> adj_edges_; // bwd_out_meta_ is used to record Grad output info for backward - std::vector bwd_out_meta_; + std::vector> bwd_out_meta_; // bwd_in_meta_ used to record Grad input info for backward - std::vector bwd_in_meta_; + std::vector> bwd_in_meta_; // Gradient Hooks // Customer may register a list of hooks which will be called in order during // backward @@ -184,6 +210,8 @@ class GradNodeBase { /* hook */ std::shared_ptr>> gradient_hooks_; + // We handle complex to real conversion only if any complex GradIn is involved + bool need_complex_to_real_ = false; int64_t next_hook_id_{0}; }; diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 69fc7df2f1420..038ad09aa4d8b 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -21,6 +21,11 @@ namespace egr { +void GradTensorHolder::SetBufferSlotRankZeros(size_t slot_id, size_t rank) { + buffer_[slot_id][rank] = + paddle::experimental::zeros_like(buffer_[slot_id][rank]); +} + void GradTensorHolder::add(size_t slot_id, size_t rank, const paddle::experimental::Tensor& t, bool fill_one) { @@ -88,7 +93,7 @@ void GradTensorHolder::add(size_t slot_id, size_t rank, // Create new tensor->impl and fill it with 1.0 if (t.defined()) { // Fill 1.0 - buffer_[slot_id][rank] = paddle::experimental::ones_like(t); + buffer_[slot_id][rank] = paddle::experimental::ones_like(t, t.dtype()); } } } diff --git a/paddle/fluid/eager/grad_tensor_holder.h b/paddle/fluid/eager/grad_tensor_holder.h index d66a81fe82859..8c00f9161b629 100644 --- a/paddle/fluid/eager/grad_tensor_holder.h +++ b/paddle/fluid/eager/grad_tensor_holder.h @@ -26,12 +26,13 @@ namespace egr { * GradTensorHolder should 
have as same format as forward output **/ class GradTensorHolder { public: - explicit GradTensorHolder(const std::vector& meta) { - VLOG(7) << "Init GradTensorHolder with meta size: " << meta.size(); - buffer_.resize(meta.size()); + explicit GradTensorHolder( + const std::vector>& metas) { + VLOG(7) << "Init GradTensorHolder with meta size: " << metas.size(); + buffer_.resize(metas.size()); for (size_t i = 0; i < buffer_.size(); i++) { - VLOG(7) << "Init GradTensorHolder with meta rank: " << meta[i].Size(); - buffer_[i].resize(meta[i].Size()); + VLOG(7) << "Init GradTensorHolder with meta rank: " << metas[i].size(); + buffer_[i].resize(metas[i].size()); } } @@ -56,6 +57,8 @@ class GradTensorHolder { return buffer_; } + void SetBufferSlotRankZeros(size_t slot_id, size_t rank); + private: std::vector> buffer_; }; diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 31aaa93c41643..8da27f3bb8a13 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -36,6 +36,15 @@ class TensorWrapper { explicit TensorWrapper(const paddle::experimental::Tensor& tensor, bool full_reserved = false, bool no_need_buffer = false) { + // set inplace_version_snapshot_ according to tensor's current inplace + // version. + if (tensor.impl() && phi::DenseTensor::classof(tensor.impl().get())) { + phi::DenseTensor* dense_tensor = + static_cast(tensor.impl().get()); + auto& inplace_version_counter = dense_tensor->InplaceVersionCounter(); + inplace_version_snapshot_ = inplace_version_counter.CurrentVersion(); + } + /** * Normally, we should fully reserved all non-output or non-leaf fwd tensor * here. And for fwd output tensor, we should not reserve its autogradmeta, @@ -49,6 +58,7 @@ class TensorWrapper { } // shallow copy tensor_impl here + no_need_buffer_ = no_need_buffer; if (no_need_buffer) { if (phi::DenseTensor::classof(tensor.impl().get())) { // Only Copy Meta @@ -86,6 +96,7 @@ class TensorWrapper { // if it's full_reserved just return the full copy of tensor if (full_reserved_) { + check_inplace_version(); return intermidiate_tensor_; } else { std::shared_ptr new_grad_node = grad_node; @@ -94,13 +105,52 @@ class TensorWrapper { intermidiate_tensor_.set_autograd_meta( std::static_pointer_cast( p_ab_autograd_meta)); + check_inplace_version(); return intermidiate_tensor_; } } + void check_inplace_version() { + if (no_need_buffer_) { + VLOG(6) << "There's no need to check inplace_version because " + "no_need_buffer_ is true."; + return; + } + if (intermidiate_tensor_.impl() && + phi::DenseTensor::classof(intermidiate_tensor_.impl().get())) { + phi::DenseTensor* dense_tensor = + static_cast(intermidiate_tensor_.impl().get()); + auto& inplace_version_counter = dense_tensor->InplaceVersionCounter(); + + uint32_t current_inplace_version = + inplace_version_counter.CurrentVersion(); + PADDLE_ENFORCE_EQ( + current_inplace_version, inplace_version_snapshot_, + paddle::platform::errors::PermissionDenied( + "Tensor '%s' used in gradient computation has been " + "modified by an inplace operation. " + "Its version is %d but the expected version is %d. 
" + "Please fix your code to void calling an inplace operator " + "after using the Tensor which will used in gradient " + "computation.", + intermidiate_tensor_.name(), current_inplace_version, + inplace_version_snapshot_)); + VLOG(6) << " The inplace_version_snapshot_ of Tensor '" + << intermidiate_tensor_.name() << "' is [ " + << inplace_version_snapshot_ << " ]"; + VLOG(6) << " The current_inplace_version of Tensor '" + << intermidiate_tensor_.name() << "' is [ " + << current_inplace_version << " ]"; + } + } + + void clear() { intermidiate_tensor_.reset(); } + private: bool full_reserved_ = false; + bool no_need_buffer_ = false; std::pair out_rank_info_; paddle::experimental::Tensor intermidiate_tensor_; + uint32_t inplace_version_snapshot_ = 0; }; } // namespace egr diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc index 1683f4ed5fbe5..c8b2d22dcf951 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc @@ -17,6 +17,14 @@ #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy_sr, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy_sr, GPU, ALL_LAYOUT); +#endif namespace eager_test { using AbstractAutogradMeta = paddle::experimental::AbstractAutogradMeta; @@ -151,5 +159,50 @@ TEST(EagerVariable, Constructor) { CHECK_EQ(dt3_tmp_ptr[1], 10.0f); t4.reset(); CHECK(t4.defined() == false); + + VLOG(6) << "Check Tensor Copy_"; + std::vector rows = {1, 2}; + std::vector dims = {2}; + paddle::experimental::Tensor t7(std::make_shared(rows, 2)); + std::dynamic_pointer_cast(t7.impl()) + ->mutable_value() + ->Resize(phi::make_ddim(dims)); + auto* dt7_tmp_ptr = std::dynamic_pointer_cast(t7.impl()) + ->mutable_value() + ->mutable_data(paddle::platform::CPUPlace()); + dt7_tmp_ptr[0] = 6.0f; + dt7_tmp_ptr[1] = 11.0f; + + paddle::experimental::Tensor t8; + paddle::experimental::Tensor t5; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + paddle::experimental::Tensor t6; + paddle::experimental::Tensor t9; + VLOG(6) << "Check Tensor Copy_ Selected Rows"; + t8.copy_(t7, paddle::platform::CUDAPlace(0), true); + t9.copy_(t8, paddle::platform::CPUPlace(), true); + auto* dt9_tmp_ptr = std::dynamic_pointer_cast(t9.impl()) + ->value() + .data(); + CHECK_EQ(dt9_tmp_ptr[0], 6.0f); + CHECK_EQ(dt9_tmp_ptr[1], 11.0f); + CHECK_EQ(std::dynamic_pointer_cast(t9.impl())->height(), + 2); + + VLOG(6) << "Check Tensor Copy_ Dense Tensor"; + t5.copy_(t3, paddle::platform::CUDAPlace(0), true); + t6.copy_(t5, paddle::platform::CPUPlace(), true); + auto* dt6_tmp_ptr = + std::dynamic_pointer_cast(t6.impl())->data(); + CHECK_EQ(dt6_tmp_ptr[0], 5.0f); + CHECK_EQ(dt6_tmp_ptr[1], 10.0f); +#else + t5.copy_(t3, paddle::platform::CPUPlace(), true); + auto* dt5_tmp_ptr = + std::dynamic_pointer_cast(t5.impl())->data(); + CHECK_EQ(dt5_tmp_ptr[0], 5.0f); + CHECK_EQ(dt5_tmp_ptr[1], 10.0f); +#endif + VLOG(6) << "Finish"; } diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc index e3db309c4016a..d592b5ccf66ff 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc +++ 
b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #include "glog/logging.h" #include "gtest/gtest.h" @@ -23,14 +24,9 @@ TEST(GradNodeInfo, GradSlotMeta) { auto grad_slot = egr::GradSlotMeta(); - CHECK(grad_slot.IsInitialized() == false); - VLOG(6) << "Init GradSlotMeta"; - grad_slot.Init(2); - CHECK(grad_slot.IsInitialized() == true); VLOG(6) << "Set SetStopGradient"; - grad_slot.SetStopGradient(0); - CHECK(grad_slot.IsStopGradient(0) == true); - CHECK_EQ(grad_slot.Size(), 2); + grad_slot.SetStopGradient(); + CHECK(grad_slot.IsStopGradient() == true); } void TestGradNodeBase(bool is_remove_gradient_hook) { @@ -56,18 +52,22 @@ void TestGradNodeBase(bool is_remove_gradient_hook) { ->data()[0], 6.0f); VLOG(6) << "Test Add Edges"; - egr::Edge edge0(grad_test_node1, 1, 2); - auto auto_grad0 = std::make_shared(edge0); + egr::Edge tmp_edge0(grad_test_node1, 1, 2); + auto auto_grad0 = std::make_shared(tmp_edge0); auto_grad0->SetStopGradient(false); - egr::Edge edge1(grad_test_node1, 3, 4); - auto auto_grad1 = std::make_shared(edge1); + + egr::Edge tmp_edge1(grad_test_node1, 3, 4); + auto auto_grad1 = std::make_shared(tmp_edge1); + et1.set_autograd_meta(auto_grad1); auto_grad1->SetStopGradient(false); grad_test_node0->AddEdges(auto_grad0.get(), 0); + CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().first, size_t(1)); CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().second, size_t(2)); std::vector metas = {auto_grad1.get()}; + grad_test_node0->AddEdges(&metas, 1); CHECK_EQ(grad_test_node0->GetEdges()[1][0].GetEdgeRankInfo().first, size_t(3)); @@ -76,22 +76,30 @@ void TestGradNodeBase(bool is_remove_gradient_hook) { VLOG(6) << "Test Set Meta and Get Meta"; auto_grad1->SetStopGradient(true); - grad_test_node0->SetGradInMeta(&metas, 0); - grad_test_node0->SetGradInMeta(auto_grad1.get(), 1); - grad_test_node0->SetGradOutMeta(&metas, 0); - grad_test_node0->SetGradOutMeta(auto_grad1.get(), 1); - CHECK_EQ(grad_test_node0->InputMeta()[0].Size(), 1); - CHECK_EQ(grad_test_node0->InputMeta()[1].Size(), 1); - CHECK(grad_test_node0->OutputMeta()[0].IsStopGradient(0)); - CHECK(grad_test_node0->OutputMeta()[1].IsStopGradient(0)); + grad_test_node0->SetGradInMeta(et1, 0); + grad_test_node0->SetGradInMeta({et1}, 1); + grad_test_node0->SetGradOutMeta(et1, 0); + grad_test_node0->SetGradOutMeta({et1}, 1); + CHECK_EQ(grad_test_node0->InputMeta()[0].size(), size_t(1)); + CHECK_EQ(grad_test_node0->InputMeta()[1].size(), size_t(1)); + CHECK_EQ(grad_test_node0->InputMeta()[0][0].GetTensorMeta().dtype, + meta.dtype); + CHECK_EQ(grad_test_node0->InputMeta()[1][0].GetTensorMeta().dtype, + meta.dtype); + CHECK(grad_test_node0->OutputMeta()[0][0].IsStopGradient()); + CHECK(grad_test_node0->OutputMeta()[1][0].IsStopGradient()); + CHECK_EQ(grad_test_node0->OutputMeta()[0][0].GetTensorMeta().dtype, + meta.dtype); + CHECK_EQ(grad_test_node0->OutputMeta()[1][0].GetTensorMeta().dtype, + meta.dtype); VLOG(6) << "Test Default Set Meta and Get Meta"; auto grad_test_node2 = std::make_shared( /* val */ 5.0, /* in_num */ 1, /* out_num */ 1); grad_test_node2->SetDefaultGradInOutMeta(); - CHECK(grad_test_node2->OutputMeta()[0].IsInitialized()); - CHECK(grad_test_node2->OutputMeta()[0].IsStopGradient(0) == false); - CHECK_EQ(grad_test_node2->OutputMeta()[0].Size(), 1); + 
CHECK_GT(grad_test_node2->OutputMeta()[0].size(), size_t(0)); + CHECK(grad_test_node2->OutputMeta()[0][0].IsStopGradient() == false); + CHECK_EQ(grad_test_node2->OutputMeta()[0].size(), size_t(1)); VLOG(6) << "Test Gradient Hook"; auto gradient_hook = []( @@ -135,7 +143,17 @@ TEST(GradNodeInfo, GradNodeBase) { } TEST(GradNodeInfo, Edge) { + phi::DenseTensorMeta meta = + phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); + std::shared_ptr dt = std::make_shared( + std::make_unique( + paddle::platform::CPUPlace()) + .get(), + meta); + paddle::experimental::Tensor et1(dt); + auto grad_test_node0 = std::make_shared(5, 2, 2); + auto auto_grad1 = std::make_shared(); VLOG(6) << "Test Construct Edge"; egr::Edge edge0 = egr::Edge(); CHECK(edge0.IsInitialized() == false); @@ -145,13 +163,12 @@ TEST(GradNodeInfo, Edge) { egr::Edge(grad_test_node0, std::make_pair(size_t(1), size_t(0))); VLOG(6) << "Test Set Edge's Grad Node"; auto* grad_node = edge1.GetGradNode(); + et1.set_autograd_meta(auto_grad1); + grad_node->SetGradInMeta(et1, 0); + CHECK_EQ(grad_node->InputMeta().size(), size_t(2)); - auto mt_grad_node = edge1.GetMutableGradNode(); - auto auto_grad1 = std::make_shared(); std::vector metas = {auto_grad1.get()}; - // Uninitialized AutogradMeta indicates - mt_grad_node->SetGradInMeta(&metas, 0); - CHECK(grad_node->InputMeta()[0].IsStopGradient(0) == true); + CHECK(grad_node->InputMeta()[0][0].IsStopGradient() == true); VLOG(6) << "Test Get/Set Edge Rank Info"; CHECK_EQ(edge2.GetEdgeRankInfo().first, size_t(1)); CHECK_EQ(edge2.GetEdgeRankInfo().second, size_t(0)); diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h index 535c93ac53b17..0b167203735d6 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h @@ -32,8 +32,8 @@ class GradTestNode : public egr::GradNodeBase { GradTestNode() : GradNodeBase() { val_ = 1.0; } std::string name() override { return "GradTestNode"; } std::vector> operator()( - const std::vector>& grads) - override { + const std::vector>& grads, + bool create_graph = false) override { val_ = std::dynamic_pointer_cast(grads[0][0].impl()) ->data()[0]; phi::DenseTensorMeta meta = @@ -49,6 +49,11 @@ class GradTestNode : public egr::GradNodeBase { std::vector> res = {{et1}}; return res; } + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } float val_; }; } // namespace eager_test diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc index 8c6eeca9d3d5d..645eac06ddda5 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc @@ -24,12 +24,13 @@ #include "paddle/phi/core/kernel_registry.h" +PD_DECLARE_KERNEL(full_like, CPU, ALL_LAYOUT); + // TODO(jiabin): remove nolint here!!! 
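// ---------------------------------------------------------------------------
// A hedged sketch, not part of this patch, of how GradTensorHolder is seeded
// now that GradSlotMeta has lost its Init()/Size() interface: the buffer
// shape comes from a vector<vector<GradSlotMeta>>, one inner entry per rank.
// The 2-slot layout and the tensor `t` below are illustrative only.
//
//   std::vector<std::vector<egr::GradSlotMeta>> metas(2);
//   metas[0].resize(1);                   // slot 0 holds one rank
//   metas[1].resize(3);                   // slot 1 holds three ranks
//   egr::GradTensorHolder holder(metas);  // buffer_[i].size() == metas[i].size()
//   holder.add(/*slot_id=*/0, /*rank=*/0, t, /*fill_one=*/true);
//   // fill_one seeds ones_like(t, t.dtype()); later contributions accumulate
//   holder.SetBufferSlotRankZeros(0, 0);  // reset that entry to zeros_like
// ---------------------------------------------------------------------------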
using namespace egr; // NOLINT TEST(GradTensorHolder, Constructor) { - GradSlotMeta slot_meta; - slot_meta.Init(1); + std::vector slot_meta(1); GradTensorHolder grad_tensor_holder = GradTensorHolder({slot_meta}); GradTensorHolder grad_tensor_holder2 = GradTensorHolder(grad_tensor_holder); @@ -70,8 +71,7 @@ TEST(GradTensorHolder, Interfaces) { paddle::experimental::Tensor et1 = paddle::experimental::Tensor(dt1); // Constructor empty GradTensorHolder - GradSlotMeta slot_meta; - slot_meta.Init(1); + std::vector slot_meta(1); GradTensorHolder grad_tensor_holder = GradTensorHolder({slot_meta, slot_meta}); @@ -136,8 +136,7 @@ TEST(GradTensorHolder, SelectedRowsMergeAdd) { paddle::experimental::Tensor t2(sr2); // Constructor empty GradTensorHolder - GradSlotMeta slot_meta; - slot_meta.Init(1); + std::vector slot_meta(1); GradTensorHolder grad_tensor_holder = GradTensorHolder({slot_meta, slot_meta}); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index 6c4bf9a4f17e6..056c7102f663b 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -33,6 +33,16 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT); + using namespace egr; // NOLINT using namespace egr_utils_api; // NOLINT @@ -72,6 +82,47 @@ TEST(Benchmark, EagerScaleCPU) { } } +TEST(Benchmark, EagerMatmulCPU) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + for (const std::string& mode : {"Accuracy", "Performance"}) { + paddle::framework::DDim ddimX = phi::make_ddim({2, 2}); + paddle::experimental::Tensor X = CreateTensorWithValue( + ddimX, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0, true); + RetainGradForTensor(X); + + paddle::framework::DDim ddimY = phi::make_ddim({2, 2}); + paddle::experimental::Tensor Y = CreateTensorWithValue( + ddimY, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 2.0, true); + RetainGradForTensor(Y); + + if (mode == "Accuracy") { + benchmark_eager_matmul(X, Y, true /* accuracy_check */); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("eager_matmul_cpu.out"); +#endif + benchmark_eager_matmul(X, Y); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + TEST(Benchmark, EagerIntermediateMatmulCPU) { // Prepare Device Contexts eager_test::InitEnv(paddle::platform::CPUPlace()); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index 14e7ce8cfcfb4..5e790389819f5 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ 
b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -32,11 +32,21 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + using namespace egr; // NOLINT using namespace egr_utils_api; // NOLINT #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); + TEST(Benchmark, EagerScaleCUDA) { eager_test::InitEnv(paddle::platform::CUDAPlace()); @@ -74,6 +84,50 @@ TEST(Benchmark, EagerScaleCUDA) { } } +TEST(Benchmark, EagerMatmulCUDA) { + paddle::platform::CUDAPlace place; + eager_test::InitEnv(place); + + for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { + paddle::framework::DDim ddimX = phi::make_ddim({2, 2}); + paddle::experimental::Tensor X = CreateTensorWithValue( + ddimX, paddle::platform::CUDAPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0, true); + RetainGradForTensor(X); + + paddle::framework::DDim ddimY = phi::make_ddim({2, 2}); + paddle::experimental::Tensor Y = CreateTensorWithValue( + ddimY, paddle::platform::CUDAPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 2.0, true); + RetainGradForTensor(Y); + + if (mode == "Accuracy") { + benchmark_eager_matmul(X, Y, true /* accuracy_check */); + + } else if (mode == "WarmUp") { + benchmark_eager_matmul(X, Y); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("eager_matmul_cuda.out"); +#endif + benchmark_eager_matmul(X, Y); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + TEST(Benchmark, EagerIntermediateMatmulCUDA) { paddle::platform::CUDAPlace place; eager_test::InitEnv(place); @@ -186,7 +240,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { USE_OP_ITSELF(scale); USE_OP_ITSELF(matmul_v2); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); +USE_OP_ITSELF(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc index 3292de9363696..b4b47a85f6666 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc @@ -34,6 +34,16 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT); + namespace paddle { namespace imperative { diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index e9b7d10070dbf..a3e393b039425 100644 --- 
a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -34,8 +34,18 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); + namespace paddle { namespace imperative { @@ -248,7 +258,7 @@ TEST(Benchmark, FluidMLPCUDA) { USE_OP_ITSELF(scale); USE_OP_ITSELF(matmul_v2); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); +USE_OP_ITSELF(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc index 96126fa5466aa..c8fb6050e9d45 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -28,6 +28,7 @@ #include "paddle/fluid/eager/utils.h" // Eager Generated +#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" // Fluid @@ -36,7 +37,7 @@ #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" -static size_t max_num_benchmark_runs = 5000; +static size_t max_num_benchmark_runs = 4000; namespace egr { @@ -57,7 +58,7 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor, } std::vector target_tensors = {input_tensor}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 10) @@ -67,6 +68,29 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor, } } +void benchmark_eager_matmul(const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& Y, + bool accuracy_check) { + paddle::experimental::Tensor input_tensor0 = X; + + size_t max_num_runs = accuracy_check ? 
2 : max_num_benchmark_runs; + for (size_t i = 0; i < max_num_runs; i++) { + input_tensor0 = + matmul_final_state_dygraph_function(input_tensor0, Y, false, false); + } + + std::vector target_tensors = {input_tensor0}; + Backward(target_tensors, {}); + + if (accuracy_check) { + // Examine Forward Grad (w.r.t max_num_runs = 2) + eager_test::CompareTensorWithValue(input_tensor0, 16); + // Examine Backward Grad (w.r.t max_num_runs = 2) + eager_test::CompareGradTensorWithValue(X, 16); + eager_test::CompareGradTensorWithValue(Y, 16); + } +} + /* ----------------------------------- */ /* ---- Eager Intermediate Matmul ---- */ /* ----------------------------------- */ @@ -82,7 +106,7 @@ void benchmark_eager_intermediate_matmul(const paddle::experimental::Tensor& X, } std::vector target_tensors = {input_tensor0}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 2) @@ -113,7 +137,7 @@ void benchmark_eager_intermediate_mlp( reduce_sum_dygraph_function(input0, {{"reduce_all", true}}); std::vector target_tensors = {Out}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { std::unordered_map result = diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h index 0086b51b57e15..86bf13707ed40 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h @@ -51,15 +51,10 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor, bool accuracy_check = false); /* ---- Eager MatMul ---- */ -/* -void benchmark_eager_matmul(const paddle::experimental::Tensor& X, const -paddle::experimental::Tensor& Y, +void benchmark_eager_matmul(const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& Y, bool accuracy_check = false); -void benchmark_eager_mlp(const paddle::experimental::Tensor& X, - const std::vector& Ws, - const std::vector& Bs, - bool accuracy_check = false); -*/ + void benchmark_eager_intermediate_matmul(const paddle::experimental::Tensor& X, const paddle::experimental::Tensor& Y, bool accuracy_check = false); diff --git a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt index c65ad4641cf22..52dba6b9218c7 100644 --- a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt @@ -5,6 +5,7 @@ cc_test(test_egr_task_backward SRCS backward_test.cc DEPS ${eager_deps} ${fluid_ cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) +cc_test(test_egr_task_grad SRCS grad_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_test(test_egr_task_hook_intermidiate SRCS hook_test_intermidiate.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} dygraph_node) diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index a4bc56bd606f3..87f8f6eca1f88 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -30,6 +30,11 @@ #include 
"paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); + namespace egr { TEST(Backward, SingleNodeEmptyGrad) { @@ -75,7 +80,7 @@ TEST(Backward, SingleNodeEmptyGrad) { } std::vector outs = {target_tensor}; // Run Backward - RunBackward(outs, {}); + Backward(outs, {}); // Check Output Value eager_test::CompareGradTensorWithValue(leaf_tensor, 5.0); @@ -134,7 +139,7 @@ TEST(Backward, SingleNodeCustomGrad) { } // Run Backward - RunBackward(target_tensors, grad_tensors); + Backward(target_tensors, grad_tensors); // Check Output Value eager_test::CompareGradTensorWithValue(leaf_tensor, 50.0); @@ -207,7 +212,7 @@ TEST(Backward, LinearNodes) { } // Use Empty Grad Tensor - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); // Check Output Value eager_test::CompareGradTensorWithValue(leaf_tensor, 50.0); @@ -311,7 +316,7 @@ TEST(Backward, WithAccumulation) { node2_ptr->AddEdges(&res2, 0); } - RunBackward(target_tensors, grad_tensors); + Backward(target_tensors, grad_tensors); eager_test::CompareGradTensorWithValue(leaf_tensor, 2500.0); } diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index 524872b2e5563..8b0759c17ed37 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -31,6 +31,10 @@ #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { TEST(CrossBatchAccumulation, SingleScaleNode) { @@ -67,12 +71,12 @@ TEST(CrossBatchAccumulation, SingleScaleNode) { std::vector res = {meta}; scale_node_ptr->AddEdges(&res, 0); - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 1.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 5.0); - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 1.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 10.0); diff --git a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc index 49bbfc77741a5..dc44d95daac1d 100644 --- a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc +++ b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc @@ -27,6 +27,10 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { TEST(Forward, SingleNode) { diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index 5a7bafb2fe370..882695e98d109 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -30,6 +30,13 @@ #include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); +#endif + namespace egr { paddle::experimental::Tensor hook_function( @@ -79,7 +86,7 @@ TEST(FwdBwdJoint, SingleNode) { 
std::vector outs = {out}; // 4. Run Backward - RunBackward(outs, {}); + Backward(outs, {}); VLOG(7) << "Target Grad is: " << std::static_pointer_cast( @@ -130,7 +137,7 @@ TEST(FwdBwdJoint, LinearNodes) { std::vector outs = {out1}; // 4. Run Backward - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 10.0); @@ -196,7 +203,7 @@ TEST(FwdBwdJoint, BranchedNodes) { // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 30.0); @@ -253,7 +260,7 @@ TEST(FwdBwdJoint, GradientHook) { // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad // leaf grad @@ -311,13 +318,13 @@ TEST(FwdBwdJoint, CrossBatchAccumulation) { // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 30.0); // Cross Batch Accumulation - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 60.0); @@ -349,7 +356,7 @@ TEST(FwdBwdJoint, SingleNodeCUDA) { std::vector outs = {out}; // 4. Run Backward - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 2.0); @@ -405,7 +412,7 @@ TEST(FwdBwdJoint, BranchedNodesCUDA) { // TODO(jiabin): fix this with add functor // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 30.0); diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index 4b7077b13bdd6..49e517dc9b3f3 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -30,6 +30,12 @@ #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" #include "paddle/phi/core/kernel_registry.h" +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); + namespace egr { TEST(Generated, Sigmoid) { @@ -51,7 +57,7 @@ TEST(Generated, Sigmoid) { std::vector target_tensors = {output_tensor}; VLOG(6) << "Runing Backward"; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); VLOG(6) << "Finish Backward"; eager_test::CompareGradTensorWithValue(tensor, 0.25); @@ -83,7 +89,7 @@ TEST(Generated, Matmul_v2) { eager_test::CompareTensorWithValue(output_tensor, 96); std::vector target_tensors = {output_tensor}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 2.0 * 20); eager_test::CompareGradTensorWithValue(Y, 3.0 * 4); @@ -114,7 +120,7 @@ TEST(Generated, ElementwiseAdd) { eager_test::CompareTensorWithValue(output_tensor, 5); std::vector target_tensors = {output_tensor}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 1.0); eager_test::CompareGradTensorWithValue(Y, 1.0); @@ -122,6 +128,6 @@ TEST(Generated, ElementwiseAdd) { } // namespace egr -USE_OP(sigmoid); +USE_OP_ITSELF(sigmoid); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(matmul_v2); diff --git a/paddle/fluid/eager/tests/task_tests/grad_test.cc 
b/paddle/fluid/eager/tests/task_tests/grad_test.cc new file mode 100644 index 0000000000000..6b03799c48659 --- /dev/null +++ b/paddle/fluid/eager/tests/task_tests/grad_test.cc @@ -0,0 +1,339 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <sstream> + +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" +#include "paddle/fluid/eager/api/utils/tensor_utils.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/backward.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/tests/test_utils.h" + +#include "paddle/fluid/eager/api/all.h" + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); +namespace egr { + +TEST(Grad, SingleNodeEmptyGrad) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor (output) + paddle::experimental::Tensor output_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + + // Create input tensor + const paddle::experimental::Tensor leaf_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/); + + { + // Create Scale Node + auto node0_ptr = std::make_shared<GradNodeScale>(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta + node0_ptr->SetDefaultGradInOutMeta(); + + // Set output_tensor's GradNode, OutRank and StopGradient properties + AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&output_tensor); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast<GradNodeBase>(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + + // Get autograd_meta from input tensor + AutogradMeta* auto_grad_meta1 = + EagerUtils::unsafe_autograd_meta(leaf_tensor); + + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared<GradNodeAccumulation>(auto_grad_meta1); + + // Set input tensor's GradNode, OutRank and StopGradient properties + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); + + // Add edges on grad_node + std::vector<AutogradMeta*> res = {auto_grad_meta1}; + node0_ptr->AddEdges(&res, 0); + } + std::vector<paddle::experimental::Tensor> outs = {output_tensor}; + + // Run Grad + auto result = Grad(outs, {leaf_tensor}, {}); + // Check Output Value + eager_test::CompareTensorWithValue<float>(result[0], 5.0); +} + +TEST(Grad, SingleNodeCustomGrad) { 
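// (Sketch, not part of this patch:) Grad() differs from Backward() in that
// it returns the computed gradients for the requested inputs instead of only
// accumulating them into the leaves' grad buffers, as the test above shows:
//
//   auto result = Grad(/*tensors=*/outs,
//                      /*inputs=*/{leaf_tensor},
//                      /*grad_tensors=*/{});  // empty -> default seeds,
//                                             // consistent with 5.0 above
//   eager_test::CompareTensorWithValue<float>(result[0], 5.0);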
+ // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + std::vector target_tensors; + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor)); + + std::vector grad_tensors; + // Create Grad Tensor + paddle::experimental::Tensor grad_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/); + grad_tensors.emplace_back(std::move(grad_tensor)); + + paddle::experimental::Tensor leaf_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/); + + { + // Create Scale Node + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta + node0_ptr->SetDefaultGradInOutMeta(); + + // Connect Tensor and Node via AutoGradMeta + AutogradMeta* auto_grad_meta = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); + std::vector res = {auto_grad_meta1}; + node0_ptr->AddEdges(&res, 0); + } + + auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors); + + // Check Output Value + eager_test::CompareTensorWithValue(result[0], 50.0); +} + +/* +Node1 + | +Node0 + | + { } // empty grad tensor +*/ +TEST(Grad, LinearNodes) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Target Tensor + std::vector target_tensors; + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor)); + + paddle::experimental::Tensor leaf_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/); + { + // Create Node0 + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta for node0 + node0_ptr->SetDefaultGradInOutMeta(); + + // Create Node1 + auto node1_ptr = std::make_shared(1, 1); + node1_ptr->SetAttributes_scale(10.0 /*scale*/); + + // Set grad in/out meta for node1 + node1_ptr->SetDefaultGradInOutMeta(); + + // Connect Input Tensor and Node0 via AutoGradMeta + AutogradMeta* auto_grad_meta = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + // 
Connect Node0 -> Node1 via Edge + auto meta0 = egr::AutogradMeta(); + meta0.SetStopGradient(false); + meta0.SetSingleOutRankWithSlot(0, 0); + meta0.SetGradNode(node1_ptr); + std::vector res0 = {&meta0}; + node0_ptr->AddEdges(&res0, 0); + + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + + auto_grad_meta1->SetStopGradient(false); + std::vector res1 = {auto_grad_meta1}; + node1_ptr->AddEdges(&res1, 0); + } + + // Use Empty Grad Tensor + auto result = Grad(target_tensors, {leaf_tensor}, {}); + + // Check Output Value + eager_test::CompareTensorWithValue(result[0], 50.0); +} + +/* + Node2 + | | +Node0 Node1 + | | + in0 in1 +*/ +TEST(Grad, WithAccumulation) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + std::vector target_tensors; + paddle::experimental::Tensor tensor0 = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + paddle::experimental::Tensor tensor1 = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor0)); + target_tensors.emplace_back(std::move(tensor1)); + + // Create Grad Tensor + std::vector grad_tensors; + paddle::experimental::Tensor grad_tensor0 = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/); + paddle::experimental::Tensor grad_tensor1 = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/); + grad_tensors.emplace_back(std::move(grad_tensor0)); + grad_tensors.emplace_back(std::move(grad_tensor1)); + + paddle::experimental::Tensor leaf_tensor; + { + // Create Node0 + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + node0_ptr->SetDefaultGradInOutMeta(); + + // Create Node1 + auto node1_ptr = std::make_shared(1, 1); + node1_ptr->SetAttributes_scale(10.0 /*scale*/); + node1_ptr->SetDefaultGradInOutMeta(); + // Create Node2 + auto node2_ptr = std::make_shared(1, 1); + node2_ptr->SetAttributes_scale(20.0 /*scale*/); + node2_ptr->SetDefaultGradInOutMeta(); + // Connect Inp0 and Node0 via AutoGradMeta + AutogradMeta* auto_grad_meta0 = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta0->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta0->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta0->SetStopGradient(false); + // Connect Inp1 and Node1 via AutoGradMeta + AutogradMeta* auto_grad_meta1 = + EagerUtils::autograd_meta(&(target_tensors[1])); + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(node1_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); + + // Connect Node0 -> Node2 via Edge + auto meta0 = egr::AutogradMeta(); + meta0.SetStopGradient(false); + meta0.SetSingleOutRankWithSlot(0, 0); + meta0.SetGradNode(node2_ptr); + std::vector res0 = {&meta0}; 
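// Note on the wiring pattern used in this block: meta0 carries node2_ptr
// together with out-rank (0, 0), so the AddEdges call that follows records an
// Edge from node0's output slot 0 to node2. Backward()/Grad() later follow
// these adj_edges_ entries (built by AddEdges earlier in this patch) to route
// node0's grad outputs into node2's grad inputs.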
+ node0_ptr->AddEdges(&res0, 0); + + // Connect Node1 -> Node2 via Edge + auto meta1 = egr::AutogradMeta(); + meta1.SetStopGradient(false); + meta1.SetSingleOutRankWithSlot(0, 0); + meta1.SetGradNode(node2_ptr); + std::vector res1 = {&meta1}; + node1_ptr->AddEdges(&res1, 0); + + AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta2); + + auto_grad_meta2->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta2->SetSingleOutRankWithSlot(0, 0); + + auto_grad_meta2->SetStopGradient(false); + std::vector res2 = {auto_grad_meta2}; + node2_ptr->AddEdges(&res2, 0); + } + + auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors); + + eager_test::CompareTensorWithValue(result[0], 2500.0); +} + +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index 9cda961741f55..2c53fc89f650e 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -31,6 +31,10 @@ #include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { paddle::experimental::Tensor hook_function( @@ -128,7 +132,7 @@ TEST(RetainGrad, HookBeforeRetainGrad) { leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0 } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 4.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 23.0); @@ -195,7 +199,7 @@ TEST(RetainGrad, HookAfterRetainGrad) { leaf_tensor, std::make_shared(hook_function)); } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 1.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 23.0); } diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc index 15b2a62dca751..b86865e2d126f 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc @@ -27,6 +27,12 @@ #include "paddle/fluid/eager/hooks.h" #include "paddle/phi/core/kernel_registry.h" +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); + namespace egr { paddle::experimental::Tensor hook_function( @@ -102,7 +108,7 @@ void test_sigmoid(bool is_remove_gradient_hook) { } VLOG(6) << "Runing Backward"; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); VLOG(6) << "Finish Backward"; eager_test::CompareGradTensorWithValue( @@ -160,7 +166,7 @@ void test_elementwiseAdd(bool is_remove_gradient_hook) { grad_node_tmp->RemoveGradientHook(hook_id); } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 1.0); eager_test::CompareGradTensorWithValue( @@ -218,7 +224,7 @@ void test_matmul(bool is_remove_gradient_hook) { grad_node_tmp->RemoveGradientHook(hook_id); } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 2.0 * 20); eager_test::CompareGradTensorWithValue( @@ -249,6 +255,6 @@ TEST(Hook_intermidiate, Matmul_v2) { } } // 
namespace egr -USE_OP(sigmoid); +USE_OP_ITSELF(sigmoid); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(matmul_v2); diff --git a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc index ea821d195099f..24e5da060111f 100644 --- a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc @@ -23,6 +23,10 @@ #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { TEST(TensorUtils, Test) { diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h index 6f8bccd64e45f..277319bc700b6 100644 --- a/paddle/fluid/eager/to_static/run_program_op_func.h +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -57,6 +57,7 @@ inline void run_program_dygraph_function( auto grad_node = std::make_shared(1, 2); grad_node->SetFwdOutNames(out_names); + grad_node->SetOut(out); // Set Attributes grad_node->SetAttrMap(attrs); // Set TensorWrappers @@ -65,10 +66,10 @@ inline void run_program_dygraph_function( grad_node->SetStepScope(step_scope); // Set Grad out rank as same as fwd input and set stop gradient to bwd - grad_node->SetGradOutMeta(&p_autograd_x, /*slot id*/ 0); - grad_node->SetGradOutMeta(&p_autograd_params, /*slot id*/ 1); + grad_node->SetGradOutMeta(x, /*slot id*/ 0); + grad_node->SetGradOutMeta(params, /*slot id*/ 1); - grad_node->SetGradInMeta(&p_autograd_outs, 0); + grad_node->SetGradInMeta(deref_out, 0); // Set Next Edges grad_node->AddEdges(&p_autograd_x, /*slot id*/ 0); grad_node->AddEdges(&p_autograd_params, /*slot id*/ 1); diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index ae5d86664a346..4eaa64d3ac659 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -260,9 +260,9 @@ inline void RunProgramAPI( } VLOG(2) << "The number of sub scopes after forward: " << out_scope_vec->front()->kids().size(); - // #ifdef PADDLE_WITH_MKLDNN - // if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); - // #endif +#ifdef PADDLE_WITH_MKLDNN + if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); +#endif } inline void RunProgramGradAPI( @@ -357,7 +357,7 @@ inline void RunProgramGradAPI( details::ShareTensorsFromScope(params_grad, *global_block, &scope); // Step5. 
drop current scope - // global_inner_scope->DeleteScope(&scope); + global_inner_scope->DeleteScope(&scope); VLOG(2) << "The number of sub scopes after backward: " << global_inner_scope->kids().size(); } @@ -370,8 +370,8 @@ class GradNodeRunProgram : public egr::GradNodeBase { ~GradNodeRunProgram() override = default; // Functor: perform backward computations virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()( - const std::vector<std::vector<paddle::experimental::Tensor>> &grads) - override { + const std::vector<std::vector<paddle::experimental::Tensor>> &grads, + bool create_graph) override { VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram"; PADDLE_ENFORCE_EQ( grads.size(), 1, @@ -400,6 +400,10 @@ class GradNodeRunProgram : public egr::GradNodeBase { paddle::platform::errors::InvalidArgument( "The grads[0].size() and fwd_out_names_.size() should be equal.")); for (size_t i = 0; i < fwd_out_names_.size(); ++i) { + auto &out_grad = egr::EagerUtils::unsafe_autograd_meta(*out_[i])->Grad(); + const_cast<paddle::experimental::Tensor &>(out_grad).set_impl( + grads[0][i].impl()); + const_cast<paddle::experimental::Tensor &>(grads[0][i]) .set_name(fwd_out_names_[i] + "@GRAD"); } @@ -411,6 +415,12 @@ class GradNodeRunProgram : public egr::GradNodeBase { // return {x_grad, details::DereferenceTensors(params_grad_ptr)}; } + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } + // SetAttrMap void SetAttrMap(const paddle::framework::AttributeMap &attrs) { attrs_ = attrs; @@ -432,6 +442,10 @@ class GradNodeRunProgram : public egr::GradNodeBase { fwd_out_names_ = out_names; } + void SetOut(const std::vector<paddle::experimental::Tensor *> &out) { + out_ = out; + } + protected: void ConstructGradTensors( + const std::vector<paddle::experimental::Tensor> &fwd_tensors, @@ -440,7 +454,11 @@ class GradNodeRunProgram : public egr::GradNodeBase { // such as: name, tensor type(DenseTensor or SelectedRows). VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); for (auto &fwd_t : fwd_tensors) { - grad_tensors->emplace_back(fwd_t.impl()); + if (phi::DenseTensor::classof(fwd_t.impl().get())) { + grad_tensors->emplace_back(std::make_shared<phi::DenseTensor>()); + } else if (phi::SelectedRows::classof(fwd_t.impl().get())) { + grad_tensors->emplace_back(std::make_shared<phi::SelectedRows>()); + } auto &grad_t = grad_tensors->back(); grad_t.set_name(fwd_t.name() + "@GRAD"); } @@ -462,6 +480,7 @@ class GradNodeRunProgram : public egr::GradNodeBase { std::vector<paddle::framework::Scope *> step_scope_; std::vector<std::string> fwd_out_names_; + std::vector<paddle::experimental::Tensor *> out_; // Attribute Map paddle::framework::AttributeMap attrs_; diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 8a57d2694535e..048087903a47c 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -212,6 +212,27 @@ std::vector<std::shared_ptr<EagerVariable>> EagerUtils::CreateVars( return res; } +void EagerUtils::ModifyInplaceInput( + const std::shared_ptr<EagerVariable>& inplace_variable, + paddle::experimental::Tensor* inplace_tensor) { + // Only modify the meta information of the inplace tensor, because + // EagerVariable cannot modify Tensor's meta information after an inplace + // op (such as ``reshape``) is executed. + PADDLE_ENFORCE_NOT_NULL(inplace_tensor, + paddle::platform::errors::Fatal( + "Inplace Tensor is null and cannot be modified. " + "We are trying to modify the inplace input from its " + "shared_ptr; this error may indicate that the " + "inplace input is a nullptr")); + if (phi::DenseTensor::classof(inplace_variable->GetTensorBase().get())) { + phi::DenseTensor* variable_dense_tensor = + static_cast<phi::DenseTensor*>(inplace_variable->GetTensorBase().get()); + phi::DenseTensor* tensor_dense_tensor = + static_cast<phi::DenseTensor*>(inplace_tensor->impl().get()); + tensor_dense_tensor->set_meta(variable_dense_tensor->meta()); + } +} + std::vector<paddle::experimental::Tensor> EagerUtils::GetOutputs( const std::vector<std::shared_ptr<EagerVariable>>& outs) { std::vector<paddle::experimental::Tensor> res; diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index fa5735e6f32a0..fbd080ef70e25 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" @@ -144,6 +145,19 @@ class EagerUtils { iter.apply(std::forward<Args>(args)...); } + static void CheckInplace(const paddle::experimental::Tensor& target, + const AutogradMeta* autograd_meta, + bool require_any_grad) { + if (require_any_grad && autograd_meta) { + PADDLE_ENFORCE_EQ(!autograd_meta->StopGradient() && + egr::egr_utils_api::IsLeafTensor(target), + false, paddle::platform::errors::InvalidArgument( + "Leaf Var (%s) that doesn't stop gradient " + "can't use inplace strategy.", + target.name())); + } + } + // TensorWrapper Utils static paddle::experimental::Tensor RecoverTensorWrapper( TensorWrapper* tw, const std::shared_ptr<GradNodeBase>& grad_node); @@ -171,6 +185,9 @@ class EagerUtils { static std::vector<std::shared_ptr<egr::EagerVariable>> CreateVars( const size_t num); // Construct Tensor From var + static void ModifyInplaceInput( + const std::shared_ptr<EagerVariable>& inplace_variable, + paddle::experimental::Tensor* inplace_tensor); static std::vector<paddle::experimental::Tensor> GetOutputs( const std::vector<std::shared_ptr<EagerVariable>>& outs); static paddle::experimental::Tensor GetOutput( diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index aa92a3b2226c1..5dc3d9e89c557 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -440,6 +440,7 @@ message(STATUS "branch: ${PADDLE_BRANCH}") configure_file(commit.h.in commit.h) cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper phi_tensor op_meta_info phi_api) + #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index b9e3bee25f6b5..478e39b99dcc9 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -25,6 +25,7 @@ limitations under the License.
*/ #include #include +#include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/op_meta_info_helper.h" @@ -946,15 +947,16 @@ void RegisterOperatorWithMetaInfoMap( ////////////////////// User APIs /////////////////////// // load op api -void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name) { +const std::unordered_map>& +LoadOpMetaInfoAndRegisterOp(const std::string& dso_name) { void* handle = paddle::platform::dynload::GetOpDsoHandle(dso_name); VLOG(3) << "load custom_op lib: " << dso_name; typedef OpMetaInfoMap& get_op_meta_info_map_t(); auto* get_op_meta_info_map = detail::DynLoad(handle, "PD_GetOpMetaInfoMap"); auto& op_meta_info_map = get_op_meta_info_map(); - RegisterOperatorWithMetaInfoMap(op_meta_info_map, handle); + return op_meta_info_map.GetMap(); } } // namespace framework diff --git a/paddle/fluid/framework/custom_operator.h b/paddle/fluid/framework/custom_operator.h index 4310b56437182..fef1e82a14fe6 100644 --- a/paddle/fluid/framework/custom_operator.h +++ b/paddle/fluid/framework/custom_operator.h @@ -20,9 +20,9 @@ limitations under the License. */ namespace paddle { namespace framework { - // Load custom op api: register op after user compiled -void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name); +const std::unordered_map>& +LoadOpMetaInfoAndRegisterOp(const std::string& dso_name); // Register custom op api: register op directly void RegisterOperatorWithMetaInfoMap( @@ -31,6 +31,5 @@ void RegisterOperatorWithMetaInfoMap( // Interface for selective register custom op. void RegisterOperatorWithMetaInfo(const std::vector& op_meta_infos, void* dso_handle = nullptr); - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc index 1a4f283f511da..589d09bf81c1d 100644 --- a/paddle/fluid/framework/data_device_transform.cc +++ b/paddle/fluid/framework/data_device_transform.cc @@ -34,6 +34,14 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place, return; } + // NOTE(hqp): Special case for CPU->MLU, avoid stream sync. + if (platform::is_cpu_place(in.place()) && platform::is_mlu_place(dst_place)) { + paddle::framework::TensorCopy( + in, dst_place, *platform::DeviceContextPool::Instance().Get(dst_place), + out); + return; + } + // NOTE(yy): TransDataDevice should wait for computation of input. 
if (!platform::is_cuda_pinned_place(in.place())) { platform::DeviceContextPool::Instance().Get(in.place())->Wait(); diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 48850d4624a14..f951b5d0f5070 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -174,10 +174,11 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool force_disable_gc, bool keep_kid_scopes) { platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); + auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); #ifdef PADDLE_WITH_MKLDNN platform::AttachPointerHashToMKLDNNKey(this, place_); + platform::RegisterModelLayout(ctx->ops_, place_); #endif - auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars, keep_kid_scopes); } diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 17346f5fd9393..2b8b4b3ff9573 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -10,8 +10,9 @@ IF(WITH_GPU) nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS}) nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) - nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm) + nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table) nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps) + nv_test(test_cpu_graph_sample SRCS test_cpu_graph_sample.cu DEPS graph_gpu_ps) ENDIF() IF(WITH_ROCM) hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h new file mode 100644 index 0000000000000..235f7a226ad17 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -0,0 +1,120 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#ifdef PADDLE_WITH_HETERPS +namespace paddle { +namespace framework { +struct GpuPsGraphNode { + int64_t node_id; + int neighbor_size, neighbor_offset; + // this node's neighbor is stored on [neighbor_offset,neighbor_offset + + // neighbor_size) of int64_t *neighbor_list; +}; + +struct GpuPsCommGraph { + int64_t *neighbor_list; + GpuPsGraphNode *node_list; + int neighbor_size, node_size; + // the size of neighbor array and graph_node_list array + GpuPsCommGraph() + : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} + GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_, + int neighbor_size_, int node_size_) + : neighbor_list(neighbor_list_), + node_list(node_list_), + neighbor_size(neighbor_size_), + node_size(node_size_) {} +}; + +/* +suppose we have a graph like this + +0----3-----5----7 + \ |\ |\ + 17 8 9 1 2 + +we save the nodes in arbitrary order, +in this example,the order is +[0,5,1,2,7,3,8,9,17] +let us name this array u_id; +we record each node's neighbors: +0:3,17 +5:3,7 +1:7 +2:7 +7:1,2,5 +3:0,5,8,9 +8:3 +9:3 +17:0 + +by concatenating each node's neighbor_list in the order we save the node id. +we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0] +this is the neighbor_list of GpuPsCommGraph +given this neighbor_list and the order to save node id, +we know, +node 0's neighbors are in the range [0,1] of neighbor_list +node 5's neighbors are in the range [2,3] of neighbor_list +node 1's neighbors are in the range [4,4] of neighbor_list +node 2:[5,5] +node 7:[6,6] +node 3:[9,12] +node 8:[13,13] +node 9:[14,14] +node 17:[15,15] +... +by the above information, +we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph +of size 9, +where node_list[i].id = u_id[i] +then we have: +node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0 +node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2 +node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4 +node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5 +node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6 +node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9 +node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13 +node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14 +node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15 +*/ +struct NeighborSampleResult { + int64_t *val; + int *actual_sample_size, sample_size, key_size; + NeighborSampleResult(int _sample_size, int _key_size) + : sample_size(_sample_size), key_size(_key_size) { + actual_sample_size = NULL; + val = NULL; + }; + ~NeighborSampleResult() { + if (val != NULL) cudaFree(val); + if (actual_sample_size != NULL) cudaFree(actual_sample_size); + } +}; + +struct NodeQueryResult { + int64_t *val; + int actual_sample_size; + NodeQueryResult() { + val = NULL; + actual_sample_size = 0; + }; + ~NodeQueryResult() { + if (val != NULL) cudaFree(val); + } +}; +} +}; +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index a6508bf96c00f..3d1599a76e8eb 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -13,115 +13,27 @@ // limitations under the License. 
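The comment block above walks through the CSR-style layout of GpuPsCommGraph: node_list[i].neighbor_offset indexes into the shared neighbor_list array and neighbor_size bounds the slice. A minimal host-side sketch of reading one node's neighbors (illustrative only; PrintNeighbors is a hypothetical helper, not part of this patch):

#include <cstdint>
#include <cstdio>
// Assumes the GpuPsGraphNode/GpuPsCommGraph structs from gpu_graph_node.h.
void PrintNeighbors(const paddle::framework::GpuPsCommGraph& g, int i) {
  const paddle::framework::GpuPsGraphNode& n = g.node_list[i];
  std::printf("node %lld:", static_cast<long long>(n.node_id));
  // Neighbors of node i occupy [neighbor_offset, neighbor_offset + neighbor_size).
  for (int j = 0; j < n.neighbor_size; ++j) {
    std::printf(" %lld",
                static_cast<long long>(g.neighbor_list[n.neighbor_offset + j]));
  }
  std::printf("\n");
}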
#pragma once +#include #include "heter_comm.h" +#include "paddle/fluid/distributed/ps/table/common_graph_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" #include "paddle/fluid/platform/enforce.h" #ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { -struct GpuPsGraphNode { - int64_t node_id; - int neighbor_size, neighbor_offset; - // this node's neighbor is stored on [neighbor_offset,neighbor_offset + - // neighbor_size) of int64_t *neighbor_list; -}; - -struct GpuPsCommGraph { - int64_t *neighbor_list; - GpuPsGraphNode *node_list; - int neighbor_size, node_size; - // the size of neighbor array and graph_node_list array - GpuPsCommGraph() - : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} - GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_, - int neighbor_size_, int node_size_) - : neighbor_list(neighbor_list_), - node_list(node_list_), - neighbor_size(neighbor_size_), - node_size(node_size_) {} -}; - -/* -suppose we have a graph like this -0----3-----5----7 - \ |\ |\ - 17 8 9 1 2 - -we save the nodes in arbitrary order, -in this example,the order is -[0,5,1,2,7,3,8,9,17] -let us name this array u_id; -we record each node's neighbors: -0:3,17 -5:3,7 -1:7 -2:7 -7:1,2,5 -3:0,5,8,9 -8:3 -9:3 -17:0 - -by concatenating each node's neighbor_list in the order we save the node id. -we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0] -this is the neighbor_list of GpuPsCommGraph -given this neighbor_list and the order to save node id, -we know, -node 0's neighbors are in the range [0,1] of neighbor_list -node 5's neighbors are in the range [2,3] of neighbor_list -node 1's neighbors are in the range [4,4] of neighbor_list -node 2:[5,5] -node 7:[6,6] -node 3:[9,12] -node 8:[13,13] -node 9:[14,14] -node 17:[15,15] -... 
-by the above information, -we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph -of size 9, -where node_list[i].id = u_id[i] -then we have: -node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0 -node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2 -node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4 -node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5 -node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6 -node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9 -node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13 -node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14 -node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15 -*/ -struct NeighborSampleResult { - int64_t *val; - int *actual_sample_size, sample_size, key_size; - NeighborSampleResult(int _sample_size, int _key_size) - : sample_size(_sample_size), key_size(_key_size) { - actual_sample_size = NULL; - val = NULL; - }; - ~NeighborSampleResult() { - if (val != NULL) cudaFree(val); - if (actual_sample_size != NULL) cudaFree(actual_sample_size); - } -}; - -struct NodeQueryResult { - int64_t *val; - int actual_sample_size; - NodeQueryResult() { - val = NULL; - actual_sample_size = 0; - }; - ~NodeQueryResult() { - if (val != NULL) cudaFree(val); - } -}; class GpuPsGraphTable : public HeterComm { public: GpuPsGraphTable(std::shared_ptr<HeterPsResource> resource) : HeterComm(1, resource) { load_factor_ = 0.25; + rw_lock.reset(new pthread_rwlock_t()); + cpu_table_status = -1; + } + ~GpuPsGraphTable() { + if (cpu_table_status != -1) { + end_graph_sampling(); + } } void build_graph_from_cpu(std::vector<GpuPsCommGraph> &cpu_node_list); NodeQueryResult *graph_node_sample(int gpu_id, int sample_size); @@ -129,14 +41,26 @@ class GpuPsGraphTable : public HeterComm { int sample_size, int len); NodeQueryResult *query_node_list(int gpu_id, int start, int query_size); void clear_graph_info(); - void move_neighbor_sample_result_to_source_gpu(int gpu_id, int gpu_num, - int sample_size, int *h_left, - int *h_right, - int64_t *src_sample_res, - int *actual_sample_size); + void move_neighbor_sample_result_to_source_gpu( + int gpu_id, int gpu_num, int *h_left, int *h_right, + int64_t *src_sample_res, thrust::host_vector<int> &total_sample_size); + void move_neighbor_sample_size_to_source_gpu(int gpu_id, int gpu_num, + int *h_left, int *h_right, + int *actual_sample_size, + int *total_sample_size); + int init_cpu_table(const paddle::distributed::GraphParameter &graph); + int load(const std::string &path, const std::string &param); + virtual int32_t end_graph_sampling() { + return cpu_graph_table->end_graph_sampling(); + } private: std::vector<GpuPsCommGraph> gpu_graph_list; + std::shared_ptr<paddle::distributed::GraphTable> cpu_graph_table; + std::shared_ptr<pthread_rwlock_t> rw_lock; + mutable std::mutex mutex_; + std::condition_variable cv_; + int cpu_table_status; }; } }; diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h index 839c7e5468c6c..acd3f0a290d0b 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h @@ -13,9 +13,23 @@ // limitations under the License.
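The CPU-backed members introduced above (cpu_graph_table, rw_lock, mutex_, cv_) are meant to be driven in a fixed order. A condensed usage sketch, inferred from test_cpu_graph_sample.cu later in this patch (resource and the device pointer keys are assumed to be set up as in that test; error handling elided):

// Sketch of the intended call sequence for the CPU-backed loading path.
::paddle::distributed::GraphParameter table_proto;
table_proto.set_gpups_mode(true);
table_proto.set_gpups_graph_sample_class("BasicBfsGraphSampler");

GpuPsGraphTable g(resource);    // resource: shared_ptr to the HeterPs resource
g.init_cpu_table(table_proto);  // creates the CPU table and registers the
                                // sampling callback that rebuilds the GPU graph
g.load("edges.txt", "e>");      // starts sampling; blocks on cv_ until the
                                // first sampled graph has been moved to GPU
auto* res = g.graph_neighbor_sample(0 /*gpu_id*/, keys /*int64_t* on device*/,
                                    3 /*sample_size*/, 3 /*len*/);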
#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + #ifdef PADDLE_WITH_HETERPS +//#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" namespace paddle { namespace framework { + +constexpr int WARP_SIZE = 32; + /* comment 0 this kernel just serves as an example of how to sample nodes' neighbors. @@ -28,30 +42,116 @@ sample_size; */ -__global__ void neighbor_sample_example(GpuPsCommGraph graph, int* index, - int* actual_size, - int64_t* sample_result, int sample_size, - int len) { - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len) { +struct MaxFunctor { + int sample_size; + HOSTDEVICE explicit inline MaxFunctor(int sample_size) { + this->sample_size = sample_size; + } + HOSTDEVICE inline int operator()(int x) const { + if (x > sample_size) { + return sample_size; + } + return x; + } +}; + +struct DegreeFunctor { + GpuPsCommGraph graph; + HOSTDEVICE explicit inline DegreeFunctor(GpuPsCommGraph graph) { + this->graph = graph; + } + HOSTDEVICE inline int operator()(int i) const { + return graph.node_list[i].neighbor_size; + } +}; + +template <int BLOCK_WARPS, int TILE_SIZE> +__global__ void neighbor_sample(const uint64_t rand_seed, GpuPsCommGraph graph, + int sample_size, int* index, int len, + int64_t* sample_result, int* output_idx, + int* output_offset) { + assert(blockDim.x == WARP_SIZE); + assert(blockDim.y == BLOCK_WARPS); + + int i = blockIdx.x * TILE_SIZE + threadIdx.y; + const int last_idx = min(static_cast<int>(blockIdx.x + 1) * TILE_SIZE, len); + curandState rng; + curand_init(rand_seed * gridDim.x + blockIdx.x, + threadIdx.y * WARP_SIZE + threadIdx.x, 0, &rng); + + while (i < last_idx) { auto node_index = index[i]; - actual_size[i] = graph.node_list[node_index].neighbor_size < sample_size - ? graph.node_list[node_index].neighbor_size - : sample_size; - int offset = graph.node_list[node_index].neighbor_offset; - for (int j = 0; j < actual_size[i]; j++) { - sample_result[sample_size * i + j] = graph.neighbor_list[offset + j]; + int degree = graph.node_list[node_index].neighbor_size; + const int offset = graph.node_list[node_index].neighbor_offset; + int output_start = output_offset[i]; + + if (degree <= sample_size) { + // Just copy + for (int j = threadIdx.x; j < degree; j += WARP_SIZE) { + sample_result[output_start + j] = graph.neighbor_list[offset + j]; + } + } else { + for (int j = threadIdx.x; j < degree; j += WARP_SIZE) { + output_idx[output_start + j] = j; + } + + __syncwarp(); + + for (int j = sample_size + threadIdx.x; j < degree; j += WARP_SIZE) { + const int num = curand(&rng) % (j + 1); + if (num < sample_size) { + atomicMax( + reinterpret_cast<unsigned int*>(output_idx + output_start + num), + static_cast<unsigned int>(j)); + } + } + + __syncwarp(); + + for (int j = threadIdx.x; j < sample_size; j += WARP_SIZE) { + const int perm_idx = output_idx[output_start + j] + offset; + sample_result[output_start + j] = graph.neighbor_list[perm_idx]; + } } + + i += BLOCK_WARPS; } } +int GpuPsGraphTable::init_cpu_table( + const paddle::distributed::GraphParameter& graph) { + cpu_graph_table.reset(new paddle::distributed::GraphTable); + cpu_table_status = cpu_graph_table->initialize(graph); + if (cpu_table_status != 0) return cpu_table_status; + std::function<void(std::vector<GpuPsCommGraph>&)> callback = + [this](std::vector<GpuPsCommGraph>& res) { + pthread_rwlock_wrlock(this->rw_lock.get()); + this->clear_graph_info(); + this->build_graph_from_cpu(res); + pthread_rwlock_unlock(this->rw_lock.get()); + cv_.notify_one(); + }; + cpu_graph_table->set_graph_sample_callback(callback); + return cpu_table_status; +} + 
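The neighbor_sample kernel above implements a warp-cooperative variant of reservoir sampling: when a node's degree exceeds sample_size, slots [0, sample_size) start out holding their own indices, every later index j is offered a random slot in [0, j], and lanes racing for the same slot are resolved with atomicMax (the largest j wins). A single-threaded sketch of the same selection logic (illustrative only, not part of this patch; the plain overwrite stands in for the kernel's atomicMax):

#include <cstdlib>
#include <vector>

// Decide which of a node's `degree` neighbor indices survive the sample.
std::vector<int> SampleIndices(int degree, int sample_size) {
  if (degree <= sample_size) {
    std::vector<int> idx(degree);
    for (int j = 0; j < degree; ++j) idx[j] = j;  // keep everything
    return idx;
  }
  std::vector<int> idx(sample_size);
  for (int j = 0; j < sample_size; ++j) idx[j] = j;  // initial reservoir
  for (int j = sample_size; j < degree; ++j) {
    int num = std::rand() % (j + 1);       // uniform in [0, j]
    if (num < sample_size) idx[num] = j;   // evict a random reservoir slot
  }
  return idx;
}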
+int GpuPsGraphTable::load(const std::string& path, const std::string& param) { + int status = cpu_graph_table->load(path, param); + if (status != 0) { + return status; + } + std::unique_lock<std::mutex> lock(mutex_); + cpu_graph_table->start_graph_sampling(); + cv_.wait(lock); + return 0; +} /* comment 1 gpu i triggers a neighbor_sample task, when this task is done, this function is called to move the sample result on other gpu back - to gup i and aggragate the result. + to gpu i and aggregate the result. the sample_result is saved on src_sample_res and the actual sample size for each node is saved on actual_sample_size. the number of actual sample_result for @@ -68,9 +168,50 @@ __global__ void neighbor_sample_example(GpuPsCommGraph graph, int* index, that's what fill_dvalues does. */ +void GpuPsGraphTable::move_neighbor_sample_size_to_source_gpu( + int gpu_id, int gpu_num, int* h_left, int* h_right, int* actual_sample_size, + int* total_sample_size) { + // This function copies actual_sample_size back to the source gpu + // and computes the total sample size for each gpu. + for (int i = 0; i < gpu_num; i++) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + auto shard_len = h_right[i] - h_left[i] + 1; + auto& node = path_[gpu_id][i].nodes_.front(); + cudaMemcpyAsync(reinterpret_cast<char*>(actual_sample_size + h_left[i]), + node.val_storage + sizeof(int) * shard_len, + sizeof(int) * shard_len, cudaMemcpyDefault, + node.out_stream); + } + for (int i = 0; i < gpu_num; ++i) { + if (h_left[i] == -1 || h_right[i] == -1) { + total_sample_size[i] = 0; + continue; + } + auto& node = path_[gpu_id][i].nodes_.front(); + cudaStreamSynchronize(node.out_stream); + + auto shard_len = h_right[i] - h_left[i] + 1; + thrust::device_vector<int> t_actual_sample_size(shard_len); + thrust::copy(actual_sample_size + h_left[i], + actual_sample_size + h_left[i] + shard_len, + t_actual_sample_size.begin()); + total_sample_size[i] = thrust::reduce(t_actual_sample_size.begin(), + t_actual_sample_size.end()); + } +} + void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( - int gpu_id, int gpu_num, int sample_size, int* h_left, int* h_right, - int64_t* src_sample_res, int* actual_sample_size) { + int gpu_id, int gpu_num, int* h_left, int* h_right, int64_t* src_sample_res, + thrust::host_vector<int>& total_sample_size) { + /* + if total_sample_size is [4, 5, 1, 6], + then cumsum_total_sample_size is [0, 4, 9, 10]; + */ + thrust::host_vector<int> cumsum_total_sample_size(gpu_num, 0); + thrust::exclusive_scan(total_sample_size.begin(), total_sample_size.end(), + cumsum_total_sample_size.begin(), 0); for (int i = 0; i < gpu_num; i++) { if (h_left[i] == -1 || h_right[i] == -1) { continue; @@ -80,14 +221,10 @@ void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( // auto& node = path_[gpu_id][i].nodes_[cur_step]; auto& node = path_[gpu_id][i].nodes_.front(); cudaMemcpyAsync( - reinterpret_cast<char*>(src_sample_res + h_left[i] * sample_size), + reinterpret_cast<char*>(src_sample_res + cumsum_total_sample_size[i]), node.val_storage + sizeof(int64_t) * shard_len, - node.val_bytes_len - sizeof(int64_t) * shard_len, cudaMemcpyDefault, + sizeof(int64_t) * total_sample_size[i], cudaMemcpyDefault, node.out_stream); - cudaMemcpyAsync(reinterpret_cast<char*>(actual_sample_size + h_left[i]), - node.val_storage + sizeof(int) * shard_len, - sizeof(int) * shard_len, cudaMemcpyDefault, - node.out_stream); } for (int i = 0; i < gpu_num; ++i) { if (h_left[i] == -1 || h_right[i] == -1) { @@ -102,17 +239,35 @@ void
GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( TODO: how to optimize it to eliminate the for loop */ -__global__ void fill_dvalues(int64_t* d_shard_vals, int64_t* d_vals, - int* d_shard_actual_sample_size, - int* d_actual_sample_size, int* idx, - int sample_size, int len) { +__global__ void fill_dvalues_actual_sample_size(int* d_shard_actual_sample_size, + int* d_actual_sample_size, + int* idx, int len) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { d_actual_sample_size[idx[i]] = d_shard_actual_sample_size[i]; - // d_vals[idx[i]] = d_shard_vals[i]; - for (int j = 0; j < sample_size; j++) { - d_vals[idx[i] * sample_size + j] = d_shard_vals[i * sample_size + j]; + } +} + +template +__global__ void fill_dvalues_sample_result(int64_t* d_shard_vals, + int64_t* d_vals, + int* d_actual_sample_size, int* idx, + int* offset, int* d_offset, + int len) { + assert(blockDim.x == WARP_SIZE); + assert(blockDim.y == BLOCK_WARPS); + + int i = blockIdx.x * TILE_SIZE + threadIdx.y; + const int last_idx = min(static_cast(blockIdx.x + 1) * TILE_SIZE, len); + while (i < last_idx) { + const int sample_size = d_actual_sample_size[idx[i]]; + for (int j = threadIdx.x; j < sample_size; j += WARP_SIZE) { + d_vals[offset[idx[i]] + j] = d_shard_vals[d_offset[i] + j]; } +#ifdef PADDLE_WITH_CUDA + __syncwarp(); +#endif + i += BLOCK_WARPS; } } @@ -226,14 +381,12 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, h_left = [0,5],h_right = [4,8] */ + NeighborSampleResult* result = new NeighborSampleResult(sample_size, len); if (len == 0) { return result; } - cudaMalloc((void**)&result->val, len * sample_size * sizeof(int64_t)); - cudaMalloc((void**)&result->actual_sample_size, len * sizeof(int)); - int* actual_sample_size = result->actual_sample_size; - int64_t* val = result->val; + int total_gpu = resource_->total_gpu(); int dev_id = resource_->dev_id(gpu_id); platform::CUDAPlace place = platform::CUDAPlace(dev_id); @@ -258,11 +411,6 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t)); int64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - auto d_shard_vals = memory::Alloc(place, len * sizeof(int64_t)); - int64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); - auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int)); - int* d_shard_actual_sample_size_ptr = - reinterpret_cast(d_shard_actual_sample_size->ptr()); split_input_to_shard(key, d_idx_ptr, len, d_left_ptr, d_right_ptr, gpu_id); @@ -302,6 +450,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, of alloc_mem_i, actual_sample_size_of_x equals ((int *)alloc_mem_i)[shard_len + x] */ + create_storage(gpu_id, i, shard_len * sizeof(int64_t), shard_len * (1 + sample_size) * sizeof(int64_t)); } @@ -322,6 +471,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, h_right[i] - h_left[i] + 1, resource_->remote_stream(i, gpu_id)); } + for (int i = 0; i < total_gpu; ++i) { if (h_left[i] == -1) { continue; @@ -335,10 +485,42 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, int* res_array = reinterpret_cast(node.val_storage); int* actual_size_array = res_array + shard_len; int64_t* sample_array = (int64_t*)(res_array + shard_len * 2); - neighbor_sample_example<<remote_stream(i, gpu_id)>>>( - graph, res_array, actual_size_array, sample_array, sample_size, - shard_len); + + // 1. get actual_size_array. + // 2. 
get sum of actual_size. + // 3. get offset ptr + thrust::device_vector<int> t_res_array(shard_len); + thrust::copy(res_array, res_array + shard_len, t_res_array.begin()); + thrust::device_vector<int> t_actual_size_array(shard_len); + thrust::transform(t_res_array.begin(), t_res_array.end(), + t_actual_size_array.begin(), DegreeFunctor(graph)); + + if (sample_size >= 0) { + thrust::transform(t_actual_size_array.begin(), t_actual_size_array.end(), + t_actual_size_array.begin(), MaxFunctor(sample_size)); + } + + thrust::copy(t_actual_size_array.begin(), t_actual_size_array.end(), + actual_size_array); + + int total_sample_sum = + thrust::reduce(t_actual_size_array.begin(), t_actual_size_array.end()); + + thrust::device_vector<int> output_idx(total_sample_sum); + thrust::device_vector<int> output_offset(shard_len); + thrust::exclusive_scan(t_actual_size_array.begin(), + t_actual_size_array.end(), output_offset.begin(), 0); + + constexpr int BLOCK_WARPS = 128 / WARP_SIZE; + constexpr int TILE_SIZE = BLOCK_WARPS * 16; + const dim3 block_(WARP_SIZE, BLOCK_WARPS); + const dim3 grid_((shard_len + TILE_SIZE - 1) / TILE_SIZE); + neighbor_sample< + BLOCK_WARPS, + TILE_SIZE><<<grid_, block_, 0, resource_->remote_stream(i, gpu_id)>>>( + 0, graph, sample_size, res_array, shard_len, sample_array, + thrust::raw_pointer_cast(output_idx.data()), + thrust::raw_pointer_cast(output_offset.data())); } for (int i = 0; i < total_gpu; ++i) { @@ -349,13 +531,56 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, tables_[i]->rwlock_->UNLock(); } // walk_to_src(num, total_gpu, h_left, h_right, d_shard_vals_ptr); - move_neighbor_sample_result_to_source_gpu(gpu_id, total_gpu, sample_size, - h_left, h_right, d_shard_vals_ptr, - d_shard_actual_sample_size_ptr); - fill_dvalues<<>>( - d_shard_vals_ptr, val, d_shard_actual_sample_size_ptr, actual_sample_size, - d_idx_ptr, sample_size, len); + auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int)); + int* d_shard_actual_sample_size_ptr = + reinterpret_cast<int*>(d_shard_actual_sample_size->ptr()); + // Store total sample number of each gpu.
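To make the compacted result layout concrete (an editorial illustration; the numbers mirror the expectations checked in test_graph.cu further down in this patch):

// actual_sample_size per key : {3, 1, 3}
// offset = exclusive_scan(..) : {0, 3, 4}
// val holds 3 + 1 + 3 = 7 values; key k's neighbors occupy
// val[offset[k] .. offset[k] + actual_sample_size[k] - 1].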
+ thrust::host_vector d_shard_total_sample_size(total_gpu, 0); + move_neighbor_sample_size_to_source_gpu( + gpu_id, total_gpu, h_left, h_right, d_shard_actual_sample_size_ptr, + thrust::raw_pointer_cast(d_shard_total_sample_size.data())); + int allocate_sample_num = 0; + for (int i = 0; i < total_gpu; ++i) { + allocate_sample_num += d_shard_total_sample_size[i]; + } + auto d_shard_vals = + memory::Alloc(place, allocate_sample_num * sizeof(int64_t)); + int64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); + move_neighbor_sample_result_to_source_gpu(gpu_id, total_gpu, h_left, h_right, + d_shard_vals_ptr, + d_shard_total_sample_size); + + cudaMalloc((void**)&result->val, allocate_sample_num * sizeof(int64_t)); + cudaMalloc((void**)&result->actual_sample_size, len * sizeof(int)); + cudaMalloc((void**)&result->offset, len * sizeof(int)); + int64_t* val = result->val; + int* actual_sample_size = result->actual_sample_size; + int* offset = result->offset; + + fill_dvalues_actual_sample_size<<>>( + d_shard_actual_sample_size_ptr, actual_sample_size, d_idx_ptr, len); + thrust::device_vector t_actual_sample_size(len); + thrust::copy(actual_sample_size, actual_sample_size + len, + t_actual_sample_size.begin()); + thrust::exclusive_scan(t_actual_sample_size.begin(), + t_actual_sample_size.end(), offset, 0); + int* d_offset; + cudaMalloc(&d_offset, len * sizeof(int)); + thrust::copy(d_shard_actual_sample_size_ptr, + d_shard_actual_sample_size_ptr + len, + t_actual_sample_size.begin()); + thrust::exclusive_scan(t_actual_sample_size.begin(), + t_actual_sample_size.end(), d_offset, 0); + constexpr int BLOCK_WARPS_ = 128 / WARP_SIZE; + constexpr int TILE_SIZE_ = BLOCK_WARPS_ * 16; + const dim3 block__(WARP_SIZE, BLOCK_WARPS_); + const dim3 grid__((len + TILE_SIZE_ - 1) / TILE_SIZE_); + fill_dvalues_sample_result<<>>( + d_shard_vals_ptr, val, actual_sample_size, d_idx_ptr, offset, d_offset, + len); + cudaStreamSynchronize(stream); for (int i = 0; i < total_gpu; ++i) { int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; @@ -364,6 +589,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, } destroy_storage(gpu_id, i); } + cudaFree(d_offset); return result; } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 2cf702969f99a..f85ed330dc8ea 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifdef PADDLE_WITH_HETERPS +//#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" #include namespace paddle { diff --git a/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu new file mode 100644 index 0000000000000..8c7ea10b26565 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu @@ -0,0 +1,108 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" +#include "paddle/fluid/platform/cuda_device_guard.h" + +using namespace paddle::framework; +void prepare_file(char file_name[], std::vector data) { + std::ofstream ofile; + ofile.open(file_name); + for (auto x : data) { + ofile << x << std::endl; + } + + ofile.close(); +} +char edge_file_name[] = "edges.txt"; +TEST(TEST_FLEET, graph_sample) { + std::vector edges; + int gpu_count = 3; + std::vector dev_ids; + dev_ids.push_back(0); + dev_ids.push_back(1); + dev_ids.push_back(2); + + std::shared_ptr resource = + std::make_shared(dev_ids); + resource->enable_p2p(); + GpuPsGraphTable g(resource); + int node_count = 10; + std::vector> neighbors(node_count); + int ind = 0; + int64_t node_id = 0; + // std::vector graph_list(gpu_count); + while (ind < node_count) { + int neighbor_size = ind + 1; + while (neighbor_size--) { + edges.push_back(std::to_string(ind) + "\t" + std::to_string(node_id) + + "\t1.0"); + node_id++; + } + ind++; + } + /* + gpu 0: + 0,3,6,9 + gpu 1: + 1,4,7 + gpu 2: + 2,5,8 + + query(2,6) returns nodes [6,9,1,4,7,2] + */ + ::paddle::distributed::GraphParameter table_proto; + table_proto.set_gpups_mode(true); + table_proto.set_gpups_mode_shard_num(127); + table_proto.set_gpu_num(3); + table_proto.set_gpups_graph_sample_class("BasicBfsGraphSampler"); + table_proto.set_gpups_graph_sample_args("5,5,1,1"); + prepare_file(edge_file_name, edges); + g.init_cpu_table(table_proto); + g.load(std::string(edge_file_name), std::string("e>")); + /* + node x's neighbor list = [(1+x)*x/2,(1+x)*x/2 + 1,.....,(1+x)*x/2 + x] + so node 6's neighbors are [21,22...,27] + node 7's neighbors are [28,29,..35] + node 0's neighbors are [0] + query([7,0,6],sample_size=3) should return [28,29,30,0,x,x,21,22,23] + 6 --index-->2 + 0 --index--->0 + 7 --index-->2 + */ + int64_t cpu_key[3] = {7, 0, 6}; + void *key; + cudaMalloc((void **)&key, 3 * sizeof(int64_t)); + cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); + auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3); + int64_t *res = new int64_t[9]; + cudaMemcpy(res, neighbor_sample_res->val, 72, cudaMemcpyDeviceToHost); + std::sort(res, res + 3); + std::sort(res + 6, res + 9); + int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23}; + for (int i = 0; i < 9; i++) { + if (expected_sample_val[i] != -1) { + ASSERT_EQ(res[i], expected_sample_val[i]); + } + } + delete[] res; + delete neighbor_sample_res; +} diff --git a/paddle/fluid/framework/fleet/heter_ps/test_graph.cu b/paddle/fluid/framework/fleet/heter_ps/test_graph.cu index 697e0ba2cdf34..06c7026eb51ca 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_graph.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_graph.cu @@ -94,19 +94,44 @@ TEST(TEST_FLEET, graph_comm) { 0 
--index--->0 7 --index-->2 */ + int64_t cpu_key[3] = {7, 0, 6}; void *key; cudaMalloc((void **)&key, 3 * sizeof(int64_t)); cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3); - res = new int64_t[9]; - cudaMemcpy(res, neighbor_sample_res->val, 72, cudaMemcpyDeviceToHost); - int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23}; - for (int i = 0; i < 9; i++) { - if (expected_sample_val[i] != -1) { - ASSERT_EQ(res[i], expected_sample_val[i]); + res = new int64_t[7]; + cudaMemcpy(res, neighbor_sample_res->val, 56, cudaMemcpyDeviceToHost); + int *actual_sample_size = new int[3]; + cudaMemcpy(actual_sample_size, neighbor_sample_res->actual_sample_size, 12, + cudaMemcpyDeviceToHost); // 3, 1, 3 + int *cumsum_sample_size = new int[3]; + cudaMemcpy(cumsum_sample_size, neighbor_sample_res->offset, 12, + cudaMemcpyDeviceToHost); // 0, 3, 4 + + std::vector> neighbors_; + std::vector neighbors_7 = {28, 29, 30, 31, 32, 33, 34, 35}; + std::vector neighbors_0 = {0}; + std::vector neighbors_6 = {21, 22, 23, 24, 25, 26, 27}; + neighbors_.push_back(neighbors_7); + neighbors_.push_back(neighbors_0); + neighbors_.push_back(neighbors_6); + for (int i = 0; i < 3; i++) { + for (int j = cumsum_sample_size[i]; + j < cumsum_sample_size[i] + actual_sample_size[i]; j++) { + bool flag = false; + for (int k = 0; k < neighbors_[i].size(); k++) { + if (res[j] == neighbors_[i][k]) { + flag = true; + break; + } + } + ASSERT_EQ(flag, true); } } + delete[] res; + delete[] actual_sample_size; + delete[] cumsum_sample_size; delete neighbor_sample_res; } diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 31a30f72e3aa6..432e57107e84d 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -148,7 +148,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; + VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; } else { CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos); VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset"; @@ -182,7 +182,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; + VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; } timeline.Start(); @@ -300,7 +300,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { int32_t cnt = 0; while (true) { auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - reinterpret_cast(local_ptr[i].data()), this->table_id_, + i, reinterpret_cast(local_ptr[i].data()), this->table_id_, local_keys[i].data(), key_size); bool flag = true; @@ -378,8 +378,8 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { int32_t cnt = 0; while (true) { auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - reinterpret_cast(local_dim_ptr[i][j].data()), this->table_id_, - local_dim_keys[i][j].data(), key_size); + i, reinterpret_cast(local_dim_ptr[i][j].data()), + this->table_id_, local_dim_keys[i][j].data(), key_size); bool flag = true; tt.wait(); @@ -431,7 +431,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec() + VLOG(0) << 
"pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec() << " seconds."; if (multi_node_) { auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance(); @@ -603,7 +603,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() + VLOG(0) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() << " seconds."; } @@ -746,7 +746,7 @@ void PSGPUWrapper::BeginPass() { "[BeginPass] after build_task, current task is not null.")); } - VLOG(1) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s"; + VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s"; } void PSGPUWrapper::EndPass() { @@ -769,7 +769,7 @@ void PSGPUWrapper::EndPass() { current_task_ = nullptr; gpu_free_channel_->Put(current_task_); timer.Pause(); - VLOG(1) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; + VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 91ef59575c3aa..2babecc6ddf93 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -78,6 +78,11 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext { return var_types[0] == proto::VarType::SELECTED_ROWS; } + bool IsDenseTensorVectorInput(const std::string& name) const override { + auto var_types = ctx_.GetInputsVarType(name); + return var_types[0] == proto::VarType::LOD_TENSOR_ARRAY; + } + bool IsDenseTensorOutput(const std::string& name) const override { auto var_types = ctx_.GetOutputsVarType(name); return var_types[0] == proto::VarType::LOD_TENSOR; @@ -90,6 +95,8 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext { bool IsForInferShape() const override { return true; } + bool IsRuntime() const override { return ctx_.IsRuntime(); } + private: const InferShapeContext& ctx_; }; @@ -123,9 +130,14 @@ class CompatMetaTensor : public phi::MetaTensor { return var->Get().dims(); } else if (var->IsType()) { return var->Get().dims(); + } else if (var->IsType()) { + // use tensor array size as dims + auto& tensor_array = var->Get(); + return phi::make_ddim({static_cast(tensor_array.size())}); } else { PADDLE_THROW(platform::errors::Unimplemented( - "Currently, only can get dims from DenseTensor or SelectedRows.")); + "Currently, only can get dims from DenseTensor or SelectedRows or " + "DenseTensorArray.")); } } else { auto* var = BOOST_GET_CONST(VarDesc*, var_); @@ -142,6 +154,10 @@ class CompatMetaTensor : public phi::MetaTensor { return var->Get().dtype(); } else if (var->IsType()) { return var->Get().dtype(); + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported get dtype from LoDTensorArray now + return phi::DataType::UNDEFINED; } else { PADDLE_THROW(platform::errors::Unimplemented( "Currently, only can get dtype from DenseTensor or SelectedRows.")); @@ -155,7 +171,19 @@ class CompatMetaTensor : public phi::MetaTensor { DataLayout layout() const override { if (is_runtime_) { auto* var = BOOST_GET_CONST(Variable*, var_); - return var->Get().layout(); + if (var->IsType()) { + return var->Get().layout(); + } else if (var->IsType()) { + return var->Get().layout(); + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported get layout from LoDTensorArray now + return 
phi::DataLayout::UNDEFINED; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Currently, only can get layout from DenseTensor or " + "SelectedRows.")); + } } else { // NOTE(chenweihang): do nothing // Unsupported get layout for VarDesc now @@ -172,6 +200,16 @@ class CompatMetaTensor : public phi::MetaTensor { } else if (var->IsType()) { auto* tensor = var->GetMutable()->mutable_value(); phi::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims; + } else if (var->IsType()) { + auto* tensor_array = var->GetMutable(); + // Note: Here I want enforce `tensor_array->size() == 0UL`, because + // inplace using on LoDTensorArray is dangerous, but the unittest + // `test_list` contains this behavior + PADDLE_ENFORCE_EQ(dims.size(), 1UL, + platform::errors::InvalidArgument( + "LoDTensorArray can only have one dimension.")); + // only set the array size for LoDTensorArray input + tensor_array->resize(dims[0]); } else { PADDLE_THROW(platform::errors::Unimplemented( "Currently, only can set dims from DenseTensor or SelectedRows.")); @@ -191,6 +229,9 @@ class CompatMetaTensor : public phi::MetaTensor { } else if (var->IsType()) { auto* tensor = var->GetMutable()->mutable_value(); phi::DenseTensorUtils::GetMutableMeta(tensor)->dtype = dtype; + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported set dtype for LoDTensorArray now } else { PADDLE_THROW(platform::errors::Unimplemented( "Currently, only can set dtype from DenseTensor or SelectedRows.")); @@ -204,10 +245,20 @@ class CompatMetaTensor : public phi::MetaTensor { void set_layout(DataLayout layout) override { if (is_runtime_) { auto* var = BOOST_GET(Variable*, var_); - LoDTensor* tensor = var->GetMutable(); - phi::DenseTensorUtils::GetMutableMeta( - static_cast(tensor)) - ->layout = layout; + if (var->IsType()) { + auto* tensor = var->GetMutable(); + phi::DenseTensorUtils::GetMutableMeta(tensor)->layout = layout; + } else if (var->IsType()) { + auto* tensor = var->GetMutable()->mutable_value(); + phi::DenseTensorUtils::GetMutableMeta(tensor)->layout = layout; + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported set dtype for LoDTensorArray now + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Currently, only can set layout from DenseTensor or " + "SelectedRows.")); + } } else { // NOTE(chenweihang): do nothing // Unsupported set layout for VarDesc now @@ -247,13 +298,11 @@ class CompatMetaTensor : public phi::MetaTensor { } void share_meta(const MetaTensor& meta_tensor) override { + share_dims(meta_tensor); set_dtype(meta_tensor.dtype()); - // VarDesc doesn't contains layout, so we cannot share layout - // set_layout(meta_tensor.layout()); - - // special case 1: share lod of LoDTensor + set_layout(meta_tensor.layout()); + // special case: share lod of LoDTensor share_lod(meta_tensor); - share_dims(meta_tensor); } private: @@ -295,7 +344,8 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, VLOG(3) << "BuildInferMetaContext: op kernel signature - " << signature; // 2. 
build infermeta context - phi::InferMetaContext infer_meta_context(ctx->IsRuntime()); + phi::InferMetaContext infer_meta_context( + {ctx->IsRuntime(), ctx->IsRunMKLDNNKernel()}); auto& input_names = std::get<0>(signature.args); auto& attr_names = std::get<1>(signature.args); @@ -439,6 +489,51 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, attr_name, infershape_input.size())); } } + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + auto& attr = attr_reader.GetAttr(attr_name); + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct InferMetaContext.", + attr_names[i])); + } } else if (ctx->HasAttr(attr_name)) { // Emplace Back Attr according to the type of attr. 
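The four vector branches above differ only in the element type pulled out of the attribute; as an editorial sketch (not part of this patch), a small helper template could collapse them, since phi::Scalar is constructible from each element type:

template <typename T>
std::vector<phi::Scalar> ToScalarList(const std::vector<T>& vec) {
  std::vector<phi::Scalar> scalar_list;
  scalar_list.reserve(vec.size());
  for (const auto& val : vec) {
    scalar_list.emplace_back(val);  // implicit T -> phi::Scalar conversion
  }
  return scalar_list;
}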
} else if (ctx->HasAttr(attr_name)) { // Emplace Back Attr according to the type of attr. auto& attr = attr_reader.GetAttr(attr_name); @@ -497,8 +592,22 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, "Unsupported attribute type is received when call " "InferShapeFunctor.")); } - } else { - // do nothing + } else if (ctx->HasInput(attr_name)) { + // convert the value of an input variable into an attribute + if (attr_defs[i].type_index == std::type_index(typeid(int32_t))) { + if (ctx->IsRuntime()) { + const auto& infershape_inputs = ctx->GetInputVarPtrs(attr_name); + auto var_temp = BOOST_GET_CONST(Variable*, infershape_inputs[i]); + auto val = experimental::MakePhiScalarFromVar(*var_temp); + int32_t val_int = val.template to(); + infer_meta_context.EmplaceBackAttr(val_int); + } else { + infer_meta_context.EmplaceBackAttr(-1); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Getting the value from a variable currently only supports int")); + } } } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index a1f2d6edca6a2..7aaaef712a6e9 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -97,6 +97,7 @@ pass_library(layer_norm_fuse_pass inference) pass_library(add_support_int8_pass inference) pass_library(matmul_scale_fuse_pass inference) pass_library(gpu_cpu_map_matmul_to_mul_pass inference) +pass_library(mixed_precision_configure_pass inference) pass_library(generate_pass DEPS pass_desc_proto) target_link_libraries(generate_pass pass_desc_proto) @@ -126,6 +127,7 @@ if(WITH_MKLDNN) pass_library(interpolate_mkldnn_pass inference DIR mkldnn) pass_library(softplus_activation_mkldnn_fuse_pass inference DIR mkldnn) pass_library(fc_act_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(elt_act_mkldnn_fuse_pass inference DIR mkldnn) pass_library(cpu_quantize_placement_pass base DIR mkldnn) pass_library(cpu_quantize_pass inference DIR mkldnn) pass_library(cpu_quantize_squash_pass inference DIR mkldnn) diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 036fde8fac6d9..f5f6f3ecb855c 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -95,6 +95,7 @@ std::map> Graph::InitFromBlock( std::unordered_map> name_to_desc_block_id; + block_id_ = block.ID(); const BlockDesc *block_var_visible = &block; while (block_var_visible != nullptr) { for (auto *var : block_var_visible->AllVars()) { diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 21e743e3587d8..10645f08dc3ba 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -230,6 +230,7 @@ class Graph { auto *x = AddNode(new ir::Node(var_desc, block_id == -1 ?
block_id_ : block_id)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } @@ -245,6 +246,7 @@ class Graph { "The OpDesc used to create operator node is null.")); auto *x = AddNode(new ir::Node(op_desc)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } @@ -263,6 +265,7 @@ class Graph { num_node_created_); auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable, block_id_)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } @@ -276,6 +279,7 @@ class Graph { } auto *x = AddNode(new ir::Node(name, type, block_id_)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index d7d866fa98bb5..164a13d1560f4 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -918,6 +918,36 @@ PDNode *patterns::ConvActivation::operator()( return activation_out_var; } +PDNode *patterns::ElementwiseActivation::operator()( + paddle::framework::ir::PDNode *elementwise_a, + const std::string &elementwise_type, const std::string &activation_type) { + // Create Operators + elementwise_a->assert_is_op_input(elementwise_type, "X"); + auto *elementwise_op = + pattern->NewNode(elementwise_repr())->assert_is_op(elementwise_type); + auto *activation_op = + pattern->NewNode(activation_repr())->assert_is_op(activation_type); + // Create variables + auto *elementwise_b = pattern->NewNode(elementwise_b_repr()) + ->AsInput() + ->assert_is_op_input(elementwise_type, "Y"); + // intermediate variable, will be removed in the IR after fuse. + auto *elementwise_out_var = + pattern->NewNode(elementwise_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op(elementwise_type) + ->assert_is_op_input(activation_type); + // output + auto *activation_out_var = pattern->NewNode(activation_out_repr()) + ->AsOutput() + ->assert_is_op_output(activation_type); + + elementwise_op->LinksFrom({elementwise_a, elementwise_b}) + .LinksTo({elementwise_out_var}); + activation_op->LinksFrom({elementwise_out_var}).LinksTo({activation_out_var}); + return activation_out_var; +} + PDNode *patterns::SeqConvEltAddRelu::operator()( paddle::framework::ir::PDNode *seqconv_input) { // Create Operators @@ -2022,18 +2052,19 @@ PDNode *patterns::Pool::operator()() { return output_var; } -PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { - auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) - ->assert_is_op("elementwise_add"); +PDNode *patterns::Elementwise::operator()(PDNode *x_var, PDNode *y_var, + const std::string elementwise_type) { + auto elementwise_op = + pattern->NewNode(elementwise_op_repr())->assert_is_op(elementwise_type); - x_var->AsInput()->assert_is_op_input("elementwise_add", "X"); - y_var->AsInput()->assert_is_op_input("elementwise_add", "Y"); - auto out_var = pattern->NewNode(elementwise_add_out_repr()) + x_var->AsInput()->assert_is_op_input(elementwise_type, "X"); + y_var->AsInput()->assert_is_op_input(elementwise_type, "Y"); + auto out_var = pattern->NewNode(elementwise_out_repr()) ->AsOutput() - ->assert_is_op_output("elementwise_add", "Out"); + ->assert_is_op_output(elementwise_type, "Out"); - elementwise_add_op->LinksFrom({x_var, y_var}); - elementwise_add_op->LinksTo({out_var}); + elementwise_op->LinksFrom({x_var, y_var}); + elementwise_op->LinksTo({out_var}); return out_var; } diff --git 
a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 0f21906d08d0e..17c70ace301d3 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -487,6 +487,28 @@ struct ConvActivation : public PatternBase { PATTERN_DECL_NODE(activation_out); }; +// Elementwise with Activation +// op: elementwise + activation +// named nodes: +// elementwise_a, elementwise_b, +// elementwise_out, elementwise, +// activation_out, activation +struct ElementwiseActivation : public PatternBase { + ElementwiseActivation(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elementwise_add_activation") {} + + PDNode* operator()(PDNode* elementwise_a, const std::string& elementwise_type, + const std::string& activation_type); + + // declare operator node's name + PATTERN_DECL_NODE(elementwise); + PATTERN_DECL_NODE(activation); + // declare variable node's name + PATTERN_DECL_NODE(elementwise_b); + PATTERN_DECL_NODE(elementwise_out); + PATTERN_DECL_NODE(activation_out); +}; + // SEQCONV with Elementwise_Add ReLU // op: seqconv + elementwise_add + relu // named nodes: @@ -994,20 +1016,20 @@ struct Pool : public PatternBase { PATTERN_DECL_NODE(pool_output); }; -// ElementwiseAdd used in residual connections. -// y_var is used and convolution output. -// The operator is removed, when residual -// connection fusion is on. -struct ElementwiseAdd : public PatternBase { - ElementwiseAdd(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "elementwise_add") {} +// Elementwise ops +// Forward pass for element-wise operators (add, mul) +// elementwise_out is the result of the operator +struct Elementwise : public PatternBase { + Elementwise(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elementwise") {} - PDNode* operator()(PDNode* x_var, PDNode* y_var); + PDNode* operator()(PDNode* x_var, PDNode* y_var, + const std::string elementwise_type); - PATTERN_DECL_NODE(elementwise_add_op); - PATTERN_DECL_NODE(elementwise_add_x); - PATTERN_DECL_NODE(elementwise_add_y); - PATTERN_DECL_NODE(elementwise_add_out); + PATTERN_DECL_NODE(elementwise_op); + PATTERN_DECL_NODE(elementwise_x); + PATTERN_DECL_NODE(elementwise_y); + PATTERN_DECL_NODE(elementwise_out); }; // Transpose op diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc index 9fe50deaf2d72..7cdb7a8854aad 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc @@ -25,14 +25,14 @@ std::set ignored_ops = { "sum", "clip", "clip_by_norm", - "square", "reduce_sum", "sqrt", "elementwise_max", "elementwise_div", "elementwise_mul", - "scale", // adamax - "assign", // adamw + "scale", // adamax + "assign", // adamw + "squared_l2_norm" // gradient_clip_norm }; const bool startswith(const std::string& str, const std::string& pre) { @@ -62,6 +62,10 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { new_op.SetAttr("with_lr_sched", false); std::set set_ops{}; + // save the weight-decay tensor names and values for Lamb + std::vector weight_decay_vars{}; + std::vector weight_decay_values{}; + // TODO: consider using a map to store these?
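Aside on the TODO just above: a hedged sketch of the map alternative (hypothetical; the new_op attributes set later in this pass expect two parallel vectors, which is presumably why the patch keeps them):

#include <map>
#include <string>

// One container keyed by parameter name instead of two parallel vectors;
// iteration order becomes lexicographic rather than discovery order, which
// matters if the attribute consumers pair entries by index.
std::map<std::string, float> weight_decay_by_var;
// inside the op loop below, the equivalent of the two push_back calls:
//   weight_decay_by_var.emplace(params[0], weight_decay_value);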
for (auto* node : graph->Nodes()) { if (!node->IsOp()) { @@ -75,6 +79,15 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { auto op_role = static_cast(op_role_); if (op_role == OpRole::kOptimize) { + // save the weight decay value from every lamb optimizer op + if (op_type == "lamb" && op->HasAttr("weight_decay")) { + auto weight_decay_value = + BOOST_GET_CONST(float, op->GetAttr("weight_decay")); + auto params = op->Output("ParamOut"); + weight_decay_vars.push_back(params[0]); + weight_decay_values.push_back(weight_decay_value); + } + if (set_ops.count(op_type)) { continue; } @@ -270,7 +283,10 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { // seems with_lr_sched is always true new_op.SetAttr("with_lr_sched", true); - // setup weight deacy + // setup weight decay for Lamb + new_op.SetAttr("weight_decay_vars", weight_decay_vars); + new_op.SetAttr("weight_decay_values", weight_decay_values); + // weight_decay/coeff is the "scale" attr of scale_op if (set_ops.count("scale") && set_ops.count("sum")) { if (set_ops.count("sign")) { diff --git a/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc b/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc index e754ba72ad857..5cd8358dc083e 100644 --- a/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc +++ b/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc @@ -30,7 +30,8 @@ void TransferCastOpPass::ApplyImpl(ir::Graph* graph) const { auto ipu_backend = platform::ipu::IpuBackend::GetInstance(); auto enable_fp16 = ipu_backend->GetIpuStrategy()->enable_fp16; - if (enable_fp16) { + auto transfer_cast_op = ipu_backend->GetIpuStrategy()->transfer_cast_op; + if (enable_fp16 && transfer_cast_op) { for (auto* node : graph->Nodes()) { if (node->IsOp() && node->Op()->Type() == "popart_cast") { if (BOOST_GET_CONST(std::string, node->Op()->GetAttr("to")) == diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc index 1b2a62695fb13..9fc6de3c8c172 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc @@ -73,8 +73,10 @@ static void ShareVarInfoToCinnLaunch( varinfo_maps.at(cinn_launch_op->GetScopeIdx()); // collect all MemOptVarInfos of external variables - // that would be eager deleted after the cinn_launch subgraph executed, - // and store them as attribute of the subgraph + // that would be eagerly deleted after the cinn_launch subgraph executes; + // we delete them in advance among the eager_deletion_ops inside the + // cinn_launch subgraph, so store them as an attribute of the subgraph + // to pass to the inner eager_deletion_ops.
for (const auto& var_name : vars_to_delete) { auto it = src_varinfo_map.find(var_name); PADDLE_ENFORCE_NE(it, src_varinfo_map.end(), @@ -82,6 +84,8 @@ static void ShareVarInfoToCinnLaunch( "MemOptVarInfo of var[%s] not found", var_name)); dst_varinfo_map.emplace(var_name, it->second); } + // skip running of the following eager_deletion_op + followed_eager_deletion_op->SetSkipRunning(true); } static void TakeVarInfoFromMainGraph( diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc index d33dc7f49feb0..636a594a657cb 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc @@ -20,12 +20,15 @@ #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(scale); USE_OP(elementwise_mul); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(elementwise_add_grad); +PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT); + DECLARE_double(eager_delete_tensor_gb); namespace paddle { diff --git a/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc new file mode 100644 index 0000000000000..4aa59d9196b1b --- /dev/null +++ b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc @@ -0,0 +1,149 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
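Aside, before the implementation that follows: the pass brackets every blacklisted (fp16-incapable) op with cast ops, and the update_cast_desc helper below fills in their descs. A hedged sketch of one input-side cast desc, assuming Paddle's proto::VarType numbering where FP16 = 4 and FP32 = 5 (matching the "4 for fp16, 5 for fp32" comments in the code; `block` and the variable names are placeholders):

// Cast a non-persistable fp16 activation back to fp32 so a blacklisted
// op can consume it; the output side uses the reverse pair (5, 4).
framework::OpDesc cast_desc(block);
cast_desc.SetType("cast");
cast_desc.SetInput("X", {"x"});
cast_desc.SetOutput("Out", {"x_cast.tmp_0"});
cast_desc.SetAttr("in_dtype", 4);   // proto::VarType::FP16
cast_desc.SetAttr("out_dtype", 5);  // proto::VarType::FP32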
+ +#include "paddle/fluid/framework/ir/mixed_precision_configure_pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { + +void MixedPrecisionConfigurePass::InsertCastOps( + Graph* graph, const StringSet& blacklist) const { + VLOG(3) << "Insert the cast op before and after the kernel that does not " + "supports fp16 precision"; + + auto update_cast_desc = [&]( + framework::OpDesc& desc, const std::string& x_name, + const std::string& out_name, const int in_dtype, const int out_dtype) { + desc.SetType("cast"); + desc.SetInput("X", {x_name}); + desc.SetOutput("Out", {out_name}); + desc.SetAttr("in_dtype", in_dtype); + desc.SetAttr("out_dtype", out_dtype); + desc.SetAttr("use_mkldnn", false); + desc.SetAttr("with_quant_attr", false); + desc.Flush(); + }; + + auto cast_input = [&](Graph* graph, Node* op_node, + const StringSet& cast_list) { + auto inlinks = op_node->inputs; + for (auto* pre_node : inlinks) { + if (pre_node->IsVar()) { + const auto is_persistable = pre_node->Var()->Persistable(); + const auto is_float = + pre_node->Var()->GetDataType() == proto::VarType::FP16 || + pre_node->Var()->GetDataType() == proto::VarType::FP32 || + pre_node->Var()->GetDataType() == proto::VarType::FP64; + if (!is_persistable && is_float) { + int suffix = 0; + for (auto* pre_node_input : pre_node->inputs) { + if (!pre_node_input->IsOp()) continue; + const auto& type = pre_node_input->Op()->Type(); + if (!cast_list.count(type) && type != "cast") { + std::string old_name = pre_node->Name(); + std::string new_name = + old_name + "_cast.tmp_" + std::to_string(suffix); + suffix++; + + framework::OpDesc new_op_desc(op_node->Op()->Block()); + // 4 for fp16, 5 for fp32 + update_cast_desc(new_op_desc, old_name, new_name, 4, 5); + auto* new_op = graph->CreateOpNode(&new_op_desc); + + VarDesc out_var(new_name); + out_var.SetPersistable(false); + auto* node_var = graph->CreateVarNode(&out_var); + + op_node->Op()->RenameInput(old_name, new_name); + IR_NODE_LINK_TO(pre_node, new_op); + IR_NODE_LINK_TO(new_op, node_var); + IR_NODE_LINK_TO(node_var, op_node); + } + } + } + } + } + }; + + auto cast_output = [&](Graph* graph, Node* op_node, + const StringSet& cast_list) { + auto outlinks = op_node->outputs; + for (auto* next_node : outlinks) { + if (next_node->IsVar()) { + const auto is_persistable = next_node->Var()->Persistable(); + const auto is_float = + next_node->Var()->GetDataType() == proto::VarType::FP16 || + next_node->Var()->GetDataType() == proto::VarType::FP32 || + next_node->Var()->GetDataType() == proto::VarType::FP64; + if (!is_persistable && is_float) { + int suffix = 0; + for (auto* next_node_output : next_node->outputs) { + if (!next_node_output->IsOp()) continue; + + const auto& type = next_node_output->Op()->Type(); + if (!cast_list.count(type) && type != "cast") { + std::string old_name = next_node->Name(); + std::string new_name = + old_name + "_cast.tmp_" + std::to_string(suffix); + suffix++; + + framework::OpDesc new_op_desc(op_node->Op()->Block()); + // 4 for fp16, 5 for fp32 + update_cast_desc(new_op_desc, old_name, new_name, 5, 4); + auto* new_op = graph->CreateOpNode(&new_op_desc); + + VarDesc out_var(new_name); + out_var.SetPersistable(false); + auto* node_var = graph->CreateVarNode(&out_var); + + next_node_output->Op()->RenameInput(old_name, new_name); + IR_NODE_LINK_TO(next_node, new_op); + IR_NODE_LINK_TO(new_op, node_var); + IR_NODE_LINK_TO(node_var, 
next_node_output); + } + } + } + } + } + }; + + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + const auto& type = op_node->Op()->Type(); + if (blacklist.count(type)) { + cast_input(graph, op_node, blacklist); + cast_output(graph, op_node, blacklist); + } + } +} + +void MixedPrecisionConfigurePass::ApplyImpl(Graph* graph) const { + const auto blacklist = + Get>("gpu_fp16_disabled_op_types"); + InsertCastOps(graph, blacklist); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(mixed_precision_configure_pass, + paddle::framework::ir::MixedPrecisionConfigurePass); diff --git a/paddle/fluid/framework/ir/mixed_precision_configure_pass.h b/paddle/fluid/framework/ir/mixed_precision_configure_pass.h new file mode 100644 index 0000000000000..fc5a612ecb833 --- /dev/null +++ b/paddle/fluid/framework/ir/mixed_precision_configure_pass.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +using StringSet = std::unordered_set; + +class MixedPrecisionConfigurePass : public FusePassBase { + public: + MixedPrecisionConfigurePass() = default; + virtual ~MixedPrecisionConfigurePass() {} + + protected: + void ApplyImpl(Graph* graph) const override; + + private: + void InsertCastOps(Graph* graph, const StringSet& blacklist) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index c537d05738529..fc2758c273450 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace framework { @@ -117,7 +118,7 @@ ResidualConnectionMKLDNNFusePass::ResidualConnectionMKLDNNFusePass() { .IsType>() .End() .AddAttr("data_format") - .IsStringIn({"NCHW", "AnyLayout"}) + .IsStringIn({"NHWC", "NCHW", "AnyLayout"}) .End(); AddOpCompat(OpCompat("elementwise_add")) @@ -135,226 +136,138 @@ ResidualConnectionMKLDNNFusePass::ResidualConnectionMKLDNNFusePass() { .End(); } -ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::IdentityFuseHandle( - const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func, - const ResidualConnectionMKLDNNFusePass::IdentityConvFunc& - get_node_from_conv_op, - const ResidualConnectionMKLDNNFusePass::IdentityElementwiseAddFunc& - get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass) - : 
fusion_stats{std::make_shared(0)}, - can_fuse_func{can_fuse_func}, - get_node_from_conv_op{get_node_from_conv_op}, - get_node_from_elementwise_add_op{get_node_from_elementwise_add_op}, - pass_{pass} {} - -void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - Node* conv_op; - Node* conv_input; - Node* conv_filter; - Node* conv_output; - - Node* elementwise_add_op; - Node* elementwise_add_identity; - Node* elementwise_add_out; - - std::tie(conv_op, conv_input, conv_filter, conv_output) = - get_node_from_conv_op(subgraph); - std::tie(elementwise_add_op, elementwise_add_identity, elementwise_add_out) = - get_node_from_elementwise_add_op(subgraph); - - if (!can_fuse_func(conv_op, elementwise_add_op)) return; - - if (!IsReachable(graph, elementwise_add_identity, conv_output)) return; - - if (HasFusedActivation(conv_op)) return; - - if (!pass_->IsCompat(subgraph, graph)) { - LOG(WARNING) - << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; - return; - } +GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( + const std::string& name_scope, + const GraphWithStats& graph_with_stats) const { + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); - conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()}); - conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); - conv_op->Op()->SetAttr("fuse_residual_connection", true); + patterns::Conv conv_pattern{pattern, name_scope}; + auto conv_output = conv_pattern(); - GraphSafeRemoveNodes(graph, {conv_output, elementwise_add_op}); + patterns::Elementwise elementwise_pattern{pattern, name_scope}; + elementwise_pattern( + conv_output, pattern->NewNode(elementwise_pattern.elementwise_y_repr()), + "elementwise_add"); + conv_output->AsIntermediate(); - IR_NODE_LINK_TO(elementwise_add_identity, conv_op); - IR_NODE_LINK_TO(conv_op, elementwise_add_out); + int found_conv_as_x_count = 0; - (*fusion_stats)++; -} + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); -ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::ProjectionFuseHandle( - const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func, - const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc& - get_node_from_conv_x_op, - const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc& - get_node_from_conv_y_op, - const ResidualConnectionMKLDNNFusePass::ProjectionElementwiseAddFunc& - get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass) - : fusion_stats{std::make_shared(0)}, - can_fuse_func{can_fuse_func}, - get_node_from_conv_x_op{get_node_from_conv_x_op}, - get_node_from_conv_y_op{get_node_from_conv_y_op}, - get_node_from_elementwise_add_op{get_node_from_elementwise_add_op}, - pass_{pass} {} - -void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - Node* conv_x_op; - Node* conv_x_input; - Node* conv_x_filter; - Node* conv_x_output; - - Node* conv_y_op; - Node* conv_y_input; - Node* conv_y_filter; - Node* conv_y_output; - - Node* elementwise_add_op; - Node* elementwise_add_out; - - if (!pass_->IsCompat(subgraph, graph)) { - LOG(WARNING) - 
<< "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; - return; - } + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_identity, elementwise_y, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); - std::tie(conv_x_op, conv_x_input, conv_x_filter, conv_x_output) = - get_node_from_conv_x_op(subgraph); - std::tie(conv_y_op, conv_y_input, conv_y_filter, conv_y_output) = - get_node_from_conv_y_op(subgraph); - std::tie(elementwise_add_op, elementwise_add_out) = - get_node_from_elementwise_add_op(subgraph); - - if (!can_fuse_func(conv_x_op, elementwise_add_op)) return; - if (!can_fuse_func(conv_y_op, elementwise_add_op)) return; - - Node* projection_node; - Node* residual_conv_op; - Node* residual_conv_output; - - if (IsReachable(graph, conv_x_input, conv_y_output)) { - projection_node = conv_x_output; - residual_conv_op = conv_y_op; - residual_conv_output = conv_y_output; - } else if (IsReachable(graph, conv_y_input, conv_x_output)) { - projection_node = conv_y_output; - residual_conv_op = conv_x_op; - residual_conv_output = conv_x_output; - } else { - return; - } + if (FindFuseOption(*conv_op, *elementwise_op) != FUSE_MKLDNN) return; - if (HasFusedActivation(residual_conv_op)) return; + if (!IsReachable(g, elementwise_identity, conv_output)) return; - residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()}); - residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + if (HasFusedActivation(conv_op)) return; - residual_conv_op->Op()->SetAttr("fuse_residual_connection", true); + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } - GraphSafeRemoveNodes(graph, {residual_conv_output, elementwise_add_op}); + conv_op->Op()->SetInput("ResidualData", {elementwise_identity->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); + conv_op->Op()->SetAttr("fuse_residual_connection", true); - IR_NODE_LINK_TO(projection_node, residual_conv_op); - IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out); + GraphSafeRemoveNodes(g, {conv_output, elementwise_op}); - (*fusion_stats)++; -} + IR_NODE_LINK_TO(elementwise_identity, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_out); -std::tuple -ResidualConnectionMKLDNNFusePass::GetNodesFromConv( - const patterns::Conv& conv_pattern, - const GraphPatternDetector::subgraph_t& subgraph) const { - GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); + found_conv_as_x_count++; + }; + + gpd(graph_with_stats.first, handler); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_conv_as_x_count + << " conv (as x) + elementwise_add patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } - return std::make_tuple(conv_op, conv_input, conv_filter, conv_output); + return std::make_pair(graph_with_stats.first, + found_conv_as_x_count + graph_with_stats.second); } -GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( +GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( const std::string& name_scope, const GraphWithStats& graph_with_stats) const { - ir::Graph* graph; - int stats; - - std::tie(graph, stats) = 
graph_with_stats; - GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); patterns::Conv conv_pattern{pattern, name_scope}; auto conv_output = conv_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern( - conv_output, - pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr())); + patterns::Elementwise elementwise_pattern{pattern, name_scope}; + elementwise_pattern( + pattern->NewNode(elementwise_pattern.elementwise_x_repr()), conv_output, + "elementwise_add"); conv_output->AsIntermediate(); - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const GraphPatternDetector::subgraph_t& subgraph) - -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_y, - elementwise_add_out); - }; - - return ExecuteHandleOnGraph( - &gpd, graph_with_stats, - [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_pattern, subgraph); - }, - get_node_from_elementwise_add, this); -} + int found_conv_as_y_count = 0; -GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( - const std::string& name_scope, - const GraphWithStats& graph_with_stats) const { - GraphPatternDetector gpd; - auto pattern = gpd.mutable_pattern(); + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); - patterns::Conv conv_pattern{pattern, name_scope}; - auto conv_output = conv_pattern(); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_x, elementwise_x, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern( - pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()), - conv_output); - conv_output->AsIntermediate(); + if (FindFuseOption(*conv_op, *elementwise_op) != FUSE_MKLDNN) return; + + if (!IsReachable(g, elementwise_x, conv_output)) return; + + if (HasFusedActivation(conv_op)) return; + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } + + conv_op->Op()->SetInput("ResidualData", {elementwise_x->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); + conv_op->Op()->SetAttr("fuse_residual_connection", true); + + GraphSafeRemoveNodes(g, {conv_output, elementwise_op}); + + IR_NODE_LINK_TO(elementwise_x, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_out); + + found_conv_as_y_count++; + }; + + gpd(graph_with_stats.first, handler); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_conv_as_y_count + << " conv (as y) + elementwise_add patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const 
GraphPatternDetector::subgraph_t& subgraph) - -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_x, - elementwise_add_out); - }; - - return ExecuteHandleOnGraph( - &gpd, graph_with_stats, - [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_pattern, subgraph); - }, - get_node_from_elementwise_add, this); + return std::make_pair(graph_with_stats.first, + found_conv_as_y_count + graph_with_stats.second); } GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( @@ -369,44 +282,89 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( patterns::Conv conv_y_pattern{pattern, name_scope}; auto conv_y_output = conv_y_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern(conv_x_output, conv_y_output); + patterns::Elementwise elementwise_pattern{pattern, name_scope}; + elementwise_pattern(conv_x_output, conv_y_output, "elementwise_add"); conv_x_output->AsIntermediate(); conv_y_output->AsIntermediate(); - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const GraphPatternDetector::subgraph_t& subgraph) - -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_out); - }; - - return ExecuteHandleOnGraph( - &gpd, graph_with_stats, - [this, - &conv_x_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_x_pattern, subgraph); - }, - [this, - &conv_y_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_y_pattern, subgraph); - }, - get_node_from_elementwise_add, this); + int found_projection_conv_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(conv_x_op, conv_op, conv_x_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_x_input, conv_input, conv_x_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_x_filter, conv_filter, conv_x_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_x_output, conv_output, conv_x_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(conv_y_op, conv_op, conv_y_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_y_input, conv_input, conv_y_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_y_filter, conv_filter, conv_y_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_y_output, conv_output, conv_y_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } + + if (FindFuseOption(*conv_x_op, *elementwise_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_y_op, *elementwise_op) != FUSE_MKLDNN) return; + + Node* projection_node; + Node* residual_conv_op; + Node* residual_conv_output; + if (IsReachable(g, conv_x_input, conv_y_output)) { + projection_node = conv_x_output; + residual_conv_op = conv_y_op; + residual_conv_output = conv_y_output; + } else if 
(IsReachable(g, conv_y_input, conv_x_output)) { + projection_node = conv_y_output; + residual_conv_op = conv_x_op; + residual_conv_output = conv_x_output; + } else { + return; + } + + if (HasFusedActivation(residual_conv_op)) return; + + residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()}); + residual_conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); + + residual_conv_op->Op()->SetAttr("fuse_residual_connection", true); + + GraphSafeRemoveNodes(g, {residual_conv_output, elementwise_op}); + + IR_NODE_LINK_TO(projection_node, residual_conv_op); + IR_NODE_LINK_TO(residual_conv_op, elementwise_out); + + found_projection_conv_count++; + }; + + gpd(graph_with_stats.first, handler); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_projection_conv_count + << " projection conv (as y) + elementwise_add patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } + + return std::make_pair(graph_with_stats.first, + found_projection_conv_count + graph_with_stats.second); } -void ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const { +void ResidualConnectionMKLDNNFusePass::ApplyImpl(ir::Graph* graph) const { FusePassBase::Init(name_scope_, graph); - auto fused_graph_with_stats = FuseConvAsY( - name_scope_, - FuseConvAsX(name_scope_, - FuseProjectionConv(name_scope_, std::make_pair(graph, 0)))); + auto graph_with_stats = + FuseProjectionConv(name_scope_, std::make_pair(graph, 0)); + graph_with_stats = FuseConvAsX(name_scope_, graph_with_stats); + graph_with_stats = FuseConvAsY(name_scope_, graph_with_stats); - LOG(INFO) << "Fused graph " << fused_graph_with_stats.second << "\n"; - AddStatis(fused_graph_with_stats.second); + AddStatis(graph_with_stats.second); } } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h index c83335da2f629..c4351b382187d 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h @@ -28,19 +28,9 @@ namespace paddle { namespace framework { namespace ir { -class Graph; -class GraphPatternDetector; -class Node; -namespace patterns { -struct Conv; -} // namespace patterns - -using graph_ptr = ir::Graph*; using GraphWithStats = std::pair; -void CorrectGraphEdges(Graph* graph, Node* from, Node* to); bool IsReachable(ir::Graph* graph, Node* from, Node* to); -paddle::optional HasBias(const Node& op, const std::string& bias_name); class ResidualConnectionMKLDNNFusePass : public FusePassBase { private: @@ -52,91 +42,13 @@ class ResidualConnectionMKLDNNFusePass : public FusePassBase { const std::string& name_scope, const GraphWithStats& graph_with_stats) const; - template - using GetNodeFunc = - std::function; - using IdentityConvFunc = GetNodeFunc>; - using IdentityElementwiseAddFunc = - GetNodeFunc>; - - using ProjectionConvFunc = IdentityConvFunc; - using ProjectionElementwiseAddFunc = GetNodeFunc>; - - using CanFuseFunc = std::function; - - std::tuple GetNodesFromConv( - const patterns::Conv& conv_pattern, - const GraphPatternDetector::subgraph_t& subgraph) const; - - std::tuple GetNodesFromProjectionConv( - const patterns::Conv& conv_pattern, - const GraphPatternDetector::subgraph_t& subgraph) const; - - template - GraphWithStats ExecuteHandleOnGraph(GraphPatternDetector* gpd, - const GraphWithStats& 
graph_with_stats, - OpFuncs&&... op_funcs) const { - ir::Graph* graph; - int stats; - - std::tie(graph, stats) = graph_with_stats; - - auto can_fuse = [this](Node* op1, Node* op2) -> bool { - return this->FindFuseOption(*op1, *op2) == FUSE_MKLDNN; - }; - auto fuse_handle = HandleType{can_fuse, std::forward(op_funcs)...}; - - (*gpd)(graph, fuse_handle); - - return std::make_pair(graph, stats + fuse_handle.get_stats()); - } - - struct IdentityFuseHandle { - IdentityFuseHandle( - const CanFuseFunc& can_fuse_func, - const IdentityConvFunc& get_node_from_conv_op, - const IdentityElementwiseAddFunc& get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass); - - void operator()(const GraphPatternDetector::subgraph_t& subgraph, - Graph* graph); - int get_stats() const { return *fusion_stats; } - - private: - std::shared_ptr fusion_stats; - CanFuseFunc can_fuse_func; - IdentityConvFunc get_node_from_conv_op; - IdentityElementwiseAddFunc get_node_from_elementwise_add_op; - const ResidualConnectionMKLDNNFusePass* pass_; - }; - - struct ProjectionFuseHandle { - ProjectionFuseHandle( - const CanFuseFunc& can_fuse_func, - const ProjectionConvFunc& get_node_from_conv_x_op, - const ProjectionConvFunc& get_node_from_conv_y_op, - const ProjectionElementwiseAddFunc& get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass); - - void operator()(const GraphPatternDetector::subgraph_t& subgraph, - Graph* graph); - int get_stats() const { return *fusion_stats; } - - private: - std::shared_ptr fusion_stats; - CanFuseFunc can_fuse_func; - ProjectionConvFunc get_node_from_conv_x_op; - ProjectionConvFunc get_node_from_conv_y_op; - ProjectionElementwiseAddFunc get_node_from_elementwise_add_op; - const ResidualConnectionMKLDNNFusePass* pass_; - }; - public: ResidualConnectionMKLDNNFusePass(); virtual ~ResidualConnectionMKLDNNFusePass() {} protected: - void ApplyImpl(graph_ptr graph) const; + void ApplyImpl(ir::Graph* graph) const; + static bool HasFusedActivation(Node* conv_node) { return !(conv_node->Op() ->GetAttrIfExists("fuse_activation") diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 371482b5343d6..f4358fb243f20 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -807,74 +807,74 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph) const { PrettyLogDetail("--- quantized %d matmul ops", quantize_matmul_count); } -void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const { +void CPUQuantizePass::QuantizeElementwise( + Graph* graph, const std::string elementwise_type) const { GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope_}; + patterns::Elementwise elementwise_pattern{pattern, name_scope_}; - elementwise_add_pattern( - pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()), - pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr())); + elementwise_pattern( + pattern->NewNode(elementwise_pattern.elementwise_x_repr()), + pattern->NewNode(elementwise_pattern.elementwise_y_repr()), + elementwise_type); - int quantize_elementwise_add_count = 0; + int quantize_elementwise_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(4) << "Quantize elementwise_add op"; - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); + 
VLOG(4) << "Quantize " + elementwise_type + " op"; + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); // skip if should not be quantized - if (!platform::HasOpINT8DataType(elementwise_add_op->Op())) { - LogQuantizationDisabled(elementwise_add_op); + if (!platform::HasOpINT8DataType(elementwise_op->Op())) { + LogQuantizationDisabled(elementwise_op); return; } - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_x, elementwise_x, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_y, elementwise_y, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); if (!AreScalesPresentForNodes( - {elementwise_add_x, elementwise_add_y, elementwise_add_out})) { - LogCannotQuantizeOp(elementwise_add_op, + {elementwise_x, elementwise_y, elementwise_out})) { + LogCannotQuantizeOp(elementwise_op, "No scale available for the operator"); return; } bool is_x_unsigned{false}, is_y_unsigned{false}; - auto input_x_scale = - GetScaleValueForNode(elementwise_add_x, &is_x_unsigned); - auto input_y_scale = - GetScaleValueForNode(elementwise_add_y, &is_y_unsigned); + auto input_x_scale = GetScaleValueForNode(elementwise_x, &is_x_unsigned); + auto input_y_scale = GetScaleValueForNode(elementwise_y, &is_y_unsigned); // TODO(sfraczek): add support for different signness if (is_x_unsigned != is_y_unsigned) { - LogCannotQuantizeOp(elementwise_add_op, - "ElementwiseAdd inputs must be of the same type."); + LogCannotQuantizeOp(elementwise_op, + "Elementwise inputs must be of the same type."); return; } - QuantizeInput(g, elementwise_add_op, elementwise_add_x, "X", input_x_scale, + QuantizeInput(g, elementwise_op, elementwise_x, "X", input_x_scale, is_x_unsigned, "Scale_x"); - QuantizeInput(g, elementwise_add_op, elementwise_add_y, "Y", input_y_scale, + QuantizeInput(g, elementwise_op, elementwise_y, "Y", input_y_scale, is_y_unsigned, "Scale_y"); bool is_output_unsigned{false}; auto output_scale = - GetScaleValueForNode(elementwise_add_out, &is_output_unsigned); + GetScaleValueForNode(elementwise_out, &is_output_unsigned); - DequantizeOutput(g, elementwise_add_op, elementwise_add_out, "Out", - output_scale, is_output_unsigned, "Scale_out"); + DequantizeOutput(g, elementwise_op, elementwise_out, "Out", output_scale, + is_output_unsigned, "Scale_out"); - ++quantize_elementwise_add_count; + ++quantize_elementwise_count; }; gpd(graph, handler); - AddStatis(quantize_elementwise_add_count); + AddStatis(quantize_elementwise_count); - PrettyLogDetail("--- quantized %d elementwise_add ops", - quantize_elementwise_add_count); + PrettyLogDetail("--- quantized %d %s ops", quantize_elementwise_count, + elementwise_type); } void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const { @@ -1146,7 +1146,8 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeFc(graph); QuantizeReshape(graph); QuantizeMatmul(graph); - QuantizeElementwiseAdd(graph); + QuantizeElementwise(graph, "elementwise_add"); + QuantizeElementwise(graph, "elementwise_mul"); QuantizeFusionGru(graph); QuantizeMultiGru(graph); QuantizeFusionLSTM(graph); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index 
412c4e40a01d5..3a286264e41ff 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -57,7 +57,8 @@ class CPUQuantizePass : public FusePassBase { void QuantizeTranspose(Graph* graph) const; void QuantizeReshape(Graph* graph) const; void QuantizeMatmul(Graph* graph) const; - void QuantizeElementwiseAdd(Graph* graph) const; + void QuantizeElementwise(Graph* graph, + const std::string elementwise_type) const; void QuantizeFusionGru(Graph* graph) const; void QuantizeMultiGru(Graph* graph) const; void QuantizeFusionLSTM(Graph* graph) const; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 889417b78c864..22000865948d6 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -90,7 +90,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetAttr("Scale_x", 1.0f); op->SetAttr("Scale_y", 1.0f); op->SetAttr("Scale_out", 1.0f); - } else if (type == "elementwise_add") { + } else if (type == "elementwise_add" || type == "elementwise_mul") { op->SetInput("X", {inputs[0]}); if (inputs.size() > 1) op->SetInput("Y", {inputs[1]}); op->SetOutput("Out", {outputs[0]}); @@ -167,7 +167,8 @@ void CheckScales(const OpDesc* op, float scale, float shift) { scale); scale_names.push_back("Scale_in"); scale_names.push_back("Scale_out"); - } else if (type == "matmul" || type == "elementwise_add") { + } else if (type == "matmul" || type == "elementwise_add" || + type == "elementwise_mul") { scale_names.push_back("Scale_x"); scale_names.push_back("Scale_y"); scale_names.push_back("Scale_out"); @@ -546,46 +547,77 @@ TEST(CpuQuantizePass, matmul_not_quantized) { expected_operators, added_nodes, 1.0f); } -static const std::initializer_list variable_names_elementwise_add = - {"a", "b", "c", "d", "e", "f"}; +static const std::initializer_list variable_names_elementwise = { + "a", "b", "c", "d", "e", "f"}; -ProgramDesc BuildProgramDescElementwiseAdd() { +ProgramDesc BuildProgramDescElementwise(const std::string elementwise_type, + const std::string elementwise_name) { ProgramDesc prog; - for (auto& v : variable_names_elementwise_add) { + for (auto& v : variable_names_elementwise) { prog.MutableBlock(0)->Var(v); } SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true); SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true); - SetOp(&prog, "elementwise_add", "ElementwiseAdd", {"b", "d"}, {"e"}, true, + SetOp(&prog, elementwise_type, elementwise_name, {"b", "d"}, {"e"}, true, "int8"); SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32"); return prog; } -TEST(CpuQuantizePass, elementwise_add) { +void TestElementwise(const std::string elementwise_type, + const std::string elementwise_name) { // 2 Quant + 2 IN + 1 DeQuant + 1 OUT int added_nodes = 6; std::unordered_map expected_operators = { - {"elementwise_add", 1}, {"quantize", 2}, {"dequantize", 3}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, SCALE * S8_MAX); + {elementwise_type, 1}, {"quantize", 2}, {"dequantize", 3}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, + SCALE * S8_MAX); } -TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { +void TestElementwiseOutputScaleMissing(const std::string 
elementwise_type, + const std::string elementwise_name) { int added_nodes = 0; std::unordered_map expected_operators = { - {"elementwise_add", 1}, {"quantize", 0}, {"dequantize", 2}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, 1.f, 1.f, "e"); + {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, 1.f, + 1.f, "e"); } -TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) { +void TestElementwiseUnsignedAndSignedInput(const std::string elementwise_type, + const std::string elementwise_name) { int added_nodes = 0; std::unordered_map expected_operators = { - {"elementwise_add", 1}, {"quantize", 0}, {"dequantize", 2}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, 1.f, 1.f, "", "b"); + {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, 1.f, + 1.f, "", "b"); +} + +TEST(CpuQuantizePass, elementwise_add) { + TestElementwise("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { + TestElementwiseOutputScaleMissing("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) { + TestElementwiseUnsignedAndSignedInput("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_mul) { + TestElementwise("elementwise_mul", "ElementwiseMul"); +} + +TEST(CpuQuantizePass, elementwise_mul_output_scale_missing) { + TestElementwiseOutputScaleMissing("elementwise_mul", "ElementwiseMul"); +} + +TEST(CpuQuantizePass, elementwise_mul_unsigned_and_signed_input) { + TestElementwiseUnsignedAndSignedInput("elementwise_mul", "ElementwiseMul"); } const std::vector churn_out_vars(ProgramDesc* prog, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc index 5f74b61ee86aa..3b883dac9782a 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -26,10 +26,10 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Marks operators which are to be quantized."; std::unordered_set supported_op_types = std::unordered_set( - {"concat", "conv2d", "depthwise_conv2d", "elementwise_add", "fc", - "matmul", "nearest_interp", "nearest_interp_v2", "pool2d", - "prior_box", "reshape2", "transpose2", "fusion_gru", "fusion_lstm", - "multi_gru", "slice"}); + {"concat", "conv2d", "depthwise_conv2d", "elementwise_add", + "elementwise_mul", "fc", "matmul", "nearest_interp", + "nearest_interp_v2", "pool2d", "prior_box", "reshape2", "transpose2", + "fusion_gru", "fusion_lstm", "multi_gru", "slice"}); const auto& excluded_ids_list = Get>("quantize_excluded_op_ids"); const auto& op_types_list = diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc new file mode 100644 index 0000000000000..b7f7a8071d214 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogDetail; + +void ElementwiseActivationOneDNNPass::ApplyImpl(Graph *graph) const { + std::vector act_types = { + "relu", "tanh", "leaky_relu", "swish", "hardswish", "sqrt", + "abs", "clip", "gelu", "relu6", "sigmoid"}; + std::vector elt_types = {"elementwise_add", "elementwise_sub", + "elementwise_mul"}; + + for (const auto &elt_type : elt_types) + for (const auto &act_type : act_types) { + std::unordered_map attr_map; + + if (act_type == "swish") + attr_map.emplace("beta", "activation_alpha"); + else if (act_type == "relu6") + attr_map.emplace("threshold", "activation_alpha"); + else if (act_type == "clip") { + attr_map.emplace("min", "activation_alpha"); + attr_map.emplace("max", "activation_beta"); + } else { + attr_map.emplace("alpha", "activation_alpha"); + attr_map.emplace("beta", "activation_beta"); + } + FuseElementwiseAct(graph, elt_type, act_type, attr_map); + } +} + +void ElementwiseActivationOneDNNPass::FuseElementwiseAct( + Graph *graph, const std::string &elt_type, const std::string &act_type, + const std::unordered_map &attr_map) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + FusePassBase::Init("elementwise_act", graph); + + GraphPatternDetector gpd; + auto *elementwise_input = gpd.mutable_pattern() + ->NewNode(elt_type + "_act/elementwise_input") + ->AsInput() + ->assert_is_op_input(elt_type, "X"); + patterns::ElementwiseActivation elementwise_act_pattern(gpd.mutable_pattern(), + elt_type + "_act"); + elementwise_act_pattern(elementwise_input, elt_type, act_type); + + int found_elementwise_activation_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "Fuse " << elt_type << " with activation op."; + // Elementwise output + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_act_pattern); + // ACT output + GET_IR_NODE_FROM_SUBGRAPH(activation_out, activation_out, + elementwise_act_pattern); + // ops + GET_IR_NODE_FROM_SUBGRAPH(elementwise, elementwise, + elementwise_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(activation, activation, elementwise_act_pattern); + + auto *elementwise_op = elementwise->Op(); + + if (elementwise_op->HasAttr("use_mkldnn")) { + const std::string wo_elt_type = + "The " + elt_type; // Workaround for PP error message checking. 
+ PADDLE_ENFORCE_EQ( + BOOST_GET_CONST(bool, elementwise_op->GetAttr("use_mkldnn")), true, + platform::errors::PreconditionNotMet( + wo_elt_type + "+Act fusion may happen only when oneDNN library " + "is used.")); + } + + auto *activation_op = activation->Op(); + for (const auto &attr : attr_map) { + if (activation_op->HasAttr(attr.first)) { + elementwise_op->SetAttr(attr.second, + activation_op->GetAttr(attr.first)); + } + } + + if (act_type == "gelu" && activation_op->HasAttr("approximate") && + BOOST_GET_CONST(bool, activation_op->GetAttr("approximate"))) + elementwise_op->SetAttr("activation_type", std::string("gelu_tanh")); + else + elementwise_op->SetAttr("activation_type", act_type); + + elementwise_op->SetOutput("Out", {activation_out->Name()}); + + IR_OP_VAR_LINK(elementwise, activation_out); + GraphSafeRemoveNodes(g, {activation, elementwise_out}); + found_elementwise_activation_count++; + }; + + gpd(graph, handler); + AddStatis(found_elementwise_activation_count); + PrettyLogDetail("--- fused %d %s with %s activation", + found_elementwise_activation_count, elt_type, act_type); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(elt_act_mkldnn_fuse_pass, + paddle::framework::ir::ElementwiseActivationOneDNNPass); +REGISTER_PASS_CAPABILITY(elt_act_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("elementwise_add", 1) + .LE("elementwise_sub", 1) + .LE("elementwise_mul", 1) + .LE("relu", 0) + .LE("tanh", 0) + .LE("leaky_relu", 1) + .LE("swish", 0) + .LE("hard_swish", 0) + .LE("sqrt", 0) + .LE("abs", 0) + .LE("clip", 1) + .LE("gelu", 0) + .LE("relu6", 0) + .LE("sigmoid", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h new file mode 100644 index 0000000000000..b8b7d06a82850 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * \brief Fuse the Elementwise and activation operators into a single + * OneDNN Elementwise operator with an activation post-op.
+ */ +class ElementwiseActivationOneDNNPass : public FusePassBase { + public: + virtual ~ElementwiseActivationOneDNNPass() {} + + protected: + void ApplyImpl(Graph *graph) const override; + + void FuseElementwiseAct( + Graph *graph, const std::string &elt_types, const std::string &act_types, + const std::unordered_map<std::string, std::string> &attr_map) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc index 11190309814e7..17663ecf6baa3 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -28,12 +28,13 @@ USE_OP_ITSELF(batch_norm); USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN); -USE_OP(conv2d_transpose); +USE_OP_ITSELF(conv2d_transpose); USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); -USE_OP(gelu); +USE_OP_ITSELF(gelu); USE_OP_DEVICE_KERNEL(gelu, MKLDNN); +PD_DECLARE_ARG_MAPPING_FN(gelu); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index 796aa4039c9e8..7df957b2c0eca 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -15,8 +15,10 @@ #include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h" #include -#include #include + +#include + #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" @@ -24,12 +26,13 @@ USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); -USE_OP(leaky_relu); +USE_OP_ITSELF(leaky_relu); USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN); -USE_OP(gelu); +USE_OP_ITSELF(gelu); USE_OP_ITSELF(relu); -USE_OP(tanh); +USE_OP_ITSELF(tanh); USE_OP_DEVICE_KERNEL(tanh, MKLDNN); +PD_DECLARE_ARG_MAPPING_FN(gelu); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 7e61d6ae4248b..8c51c278d4872 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -125,6 +125,7 @@ class Node { // Only use this for auto parallel. // A node does not have original desc if the return is zero. uint64_t OriginalDescId() const { return original_desc_id_; } + int GraphId() const { return graph_id_; } bool IsOp() const { return type_ == Type::kOperation; } bool IsVar() const { return type_ == Type::kVariable; } @@ -246,10 +247,12 @@ class Node { // Store the original id of var desc or op desc. // Only use this for auto parallel. uint64_t original_desc_id_{0}; + int graph_id_{-1}; private: // ID can only set by a Graph. void SetId(int id) { id_ = id; } + void SetGraphId(int graph_id) { graph_id_ = graph_id; } // desc_order can only set by a Graph when constructing a Graph from a // BlockDesc.
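A note on the recurring `USE_OP` → `USE_OP_ITSELF` switches in these testers (my reading of the convention, not spelled out in the patch): `USE_OP` pulls in an operator definition together with its fluid kernels, while `USE_OP_ITSELF` declares only the operator. Ops whose kernels have migrated to the phi library therefore switch macros and declare the remaining pieces explicitly:

```cpp
// Hedged sketch of the post-migration tester boilerplate; gelu is the op
// these testers actually touch.
USE_OP_ITSELF(gelu);                 // operator definition only, no fluid kernels
USE_OP_DEVICE_KERNEL(gelu, MKLDNN);  // keep the oneDNN device kernel
PD_DECLARE_ARG_MAPPING_FN(gelu);     // fluid-to-phi argument mapping for dispatch
```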
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index ece4815858640..f30d1ea1b83dd 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -41,6 +41,7 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc, void NaiveExecutor::Run() { #ifdef PADDLE_WITH_MKLDNN platform::AttachPointerHashToMKLDNNKey(this, place_); + platform::RegisterModelLayout(ops_, place_); #endif platform::ScopedFlushDenormal flush; for (auto &op : ops_) { diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 62d87b6917e40..7fe1852f7396c 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -31,14 +31,14 @@ USE_OP(slice); USE_OP(concat); USE_OP(matmul); USE_OP_ITSELF(elementwise_add); -USE_OP(sigmoid); -USE_OP(tanh); +USE_OP_ITSELF(sigmoid); +USE_OP_ITSELF(tanh); USE_OP(elementwise_mul); USE_OP(softmax_with_cross_entropy); USE_OP_ITSELF(reduce_mean); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); -USE_OP(reduce_mean_grad); +USE_OP_ITSELF(reduce_sum_grad); +USE_OP_ITSELF(reduce_mean_grad); USE_OP_ITSELF(reshape2_grad); USE_OP(softmax_with_cross_entropy_grad); USE_OP_ITSELF(elementwise_add_grad); @@ -46,9 +46,9 @@ USE_OP(matmul_grad); USE_OP(square); USE_OP(transpose2_grad); USE_OP(concat_grad); -USE_OP(elementwise_mul_grad); -USE_OP(sigmoid_grad); -USE_OP(tanh_grad); +USE_OP_ITSELF(elementwise_mul_grad); +USE_OP_ITSELF(sigmoid_grad); +USE_OP_ITSELF(tanh_grad); USE_OP(sum); USE_OP(slice_grad); USE_OP(lookup_table_grad); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index f8e30c1ee294e..42fbeb5d29ce4 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -628,10 +628,12 @@ std::vector ExecutionContext::MultiOutput( bool OpSupportGPU(const std::string& op_type) { // check in new Function kernel first + bool has_phi_kernel = false; auto& kernel_factory = phi::KernelFactory::Instance(); auto kernel_key_map = kernel_factory.SelectKernelMap(phi::TransToPhiKernelName(op_type)); for (auto& kernel : kernel_key_map) { + has_phi_kernel = true; if (platform::is_gpu_place(phi::TransToPhiPlace(kernel.first.backend()))) { return true; } @@ -639,12 +641,19 @@ bool OpSupportGPU(const std::string& op_type) { auto& all_kernels = OperatorWithKernel::AllOpKernels(); auto it = all_kernels.find(op_type); - if (it == all_kernels.end()) { - // All control operator must support GPU - return true; - } - for (auto& kern_pair : it->second) { - if (platform::is_gpu_place(kern_pair.first.place_)) { + if (it != all_kernels.end()) { + for (auto& kern_pair : it->second) { + if (platform::is_gpu_place(kern_pair.first.place_)) { + return true; + } + } + } else { + if (has_phi_kernel) { + // if the op has a phi kernel but we find neither a phi GPU kernel + // nor a fluid GPU kernel, this op doesn't support GPU + return false; + } else { + // All control operators must support GPU return true; } } @@ -1456,7 +1465,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { kernel_iter = kernels.find(expected_kernel_key); } #endif -#ifdef PADDLE_WITH_XPU + +#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) if (platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == kernels.end() || !paddle::platform::is_xpu_support_op(type_, expected_kernel_key) || @@
-1470,17 +1480,36 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { #endif #ifdef PADDLE_WITH_XPU_KP - bool use_xpu_kp_kernel_rt = - FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key); - bool use_xpu_kp_kernel_debug = - paddle::platform::is_in_xpu_kpwhite_list(type_); - if (platform::is_xpu_place(expected_kernel_key.place_) && - (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) { - expected_kernel_key.library_type_ = LibraryType::kKP; - kernel_iter = kernels.find(expected_kernel_key); - VLOG(3) << "using XPU KP kernel: " << type_ - << ", using_kernel_key:" << expected_kernel_key; + if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) { + bool use_xpu_kp_kernel_rt = + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_in_xpu_kpwhite_list(type_); + if (use_xpu_kp_kernel_rt) { + VLOG(3) << "xpu_kp using rt mode "; + } + if (use_xpu_kp_kernel_debug) { + VLOG(3) << "xpu_kp using debug mode "; + } + bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); + if (is_xpu_kp_support) { + expected_kernel_key.library_type_ = LibraryType::kKP; + kernel_iter = kernels.find(expected_kernel_key); + VLOG(3) << "using XPU KP kernel: " << type_ + << ", using_kernel_key:" << expected_kernel_key; + } + bool is_xpu_unsupport = + (!paddle::platform::is_xpu_support_op(type_, expected_kernel_key) || + paddle::platform::is_in_xpu_black_list(type_)); + if (!is_xpu_kp_support && + (kernel_iter == kernels.end() || is_xpu_unsupport)) { + VLOG(3) << "missing XPU kernel: " << type_ + << ", expected_kernel_key:" << expected_kernel_key + << ", falling back to CPU!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } } #endif @@ -2083,16 +2112,25 @@ void OperatorWithKernel::BuildPhiKernelContext( auto* var = ins_vector[offset]; if (var->IsType()) { tensor_in = &(var->Get()); + pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); } else if (var->IsType()) { tensor_in = &(var->Get()); + pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); + } else if (var->IsType()) { + paddle::SmallVector tensor_vector; + auto& tensor_array = var->Get(); + for (auto& t : tensor_array) { + tensor_vector.emplace_back(&t); + } + pt_kernel_context->EmplaceBackInputsWithoutSetRange(tensor_vector); + end_idx += tensor_array.size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported input `%s` type when calling pt kernel.", framework::ToTypeName(var->Type()))); } - - pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); } + // Note: here we cannot deal with vector input pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i); } VLOG(4) << "Done inputs"; @@ -2120,22 +2158,33 @@ void OperatorWithKernel::BuildPhiKernelContext( for (size_t offset = 0; offset < outs_vector.size(); ++offset) { phi::TensorBase* tensor_out = nullptr; auto* var = outs_vector[offset]; - if (var) { if (var->template IsType()) { tensor_out = var->template GetMutable(); + pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } else if (var->template IsType()) { tensor_out = var->template GetMutable(); + pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); + } else if (var->template IsType()) { + paddle::SmallVector tensor_vector; + auto* tensor_array = + var->template GetMutable(); + // Note: If the input LoDTensorArray size is 0, the
output + // LoDTensorArray is also 0 + for (auto& t : *tensor_array) { + tensor_vector.emplace_back(&t); + } + pt_kernel_context->EmplaceBackOutputsWithoutSetRange(tensor_vector); + end_idx += tensor_array->size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported output `%s` type when calling pt kernel.", framework::ToTypeName(var->Type()))); } + } else { + pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } - - pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } - pt_kernel_context->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } VLOG(4) << "Done outputs"; @@ -2250,42 +2299,67 @@ void OperatorWithKernel::BuildPhiKernelContext( } } else { // TODO(chenweihang): support other attrs later - auto& attr = Attrs().at(attr_names[i]); + auto attr_it = attrs_.find(attr_names[i]); if (attr_defs[i].type_index == std::type_index(typeid(int))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); + if (attr_it == attrs_.end()) { + auto in_it = ctx.inputs.find(attr_names[i]); + if (in_it != ctx.inputs.end()) { + // get data from input + auto val = experimental::MakePhiScalarFromVar(*(in_it->second[0])); + int32_t val_int = val.template to(); + pt_kernel_context->EmplaceBackAttr(val_int); + } else { + PADDLE_THROW(platform::errors::NotFound( + "cannot find attribute `%s` in either the attributes or the inputs ", + attr_names[i])); + } + } else { + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(int, attr_it->second)); + } } else if (attr_defs[i].type_index == std::type_index(typeid(float))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(float, attr)); + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(float, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(bool, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(int64_t))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr)); + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(int64_t, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(std::string))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::string, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(phi::DataType))) { auto data_type = paddle::framework::TransToPhiDataType( static_cast( - BOOST_GET_CONST(int, attr))); + BOOST_GET_CONST(int, attr_it->second))); pt_kernel_context->EmplaceBackAttr(data_type); } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { - if (std::type_index(attr.type()) == + if (std::type_index(attr_it->second.type()) == std::type_index(typeid(std::vector))) { pt_kernel_context->EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr)); - } else if (std::type_index(attr.type()) == + BOOST_GET_CONST(std::vector, attr_it->second)); + } else if (std::type_index(attr_it->second.type()) == std::type_index(typeid(std::vector))) { // Emplace Back Attr according to the type of Phi_Kernel args.
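The int-attribute branch above now falls back to reading the value from an input variable via `experimental::MakePhiScalarFromVar`. A self-contained analogue of that lookup order (all names and the plain-int stand-ins are illustrative, not the patch's types):

```cpp
#include <stdexcept>
#include <string>
#include <unordered_map>

// attrs holds compile-time attributes; inputs stands in for values that
// arrive as tensors. The kernel wants a plain int either way.
int ResolveIntAttr(const std::string& name,
                   const std::unordered_map<std::string, int>& attrs,
                   const std::unordered_map<std::string, int>& inputs) {
  auto it = attrs.find(name);
  if (it != attrs.end()) return it->second;         // normal attribute path
  auto in_it = inputs.find(name);
  if (in_it != inputs.end()) return in_it->second;  // tensor-provided value
  throw std::runtime_error("cannot find attribute '" + name +
                           "' in either the attributes or the inputs");
}
```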
- const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); + const auto& vector_int_attr = + BOOST_GET_CONST(std::vector, attr_it->second); const std::vector vector_int64_attr(vector_int_attr.begin(), vector_int_attr.end()); pt_kernel_context->EmplaceBackAttr(vector_int64_attr); } } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { - const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); + const auto& vector_int_attr = + BOOST_GET_CONST(std::vector, attr_it->second); pt_kernel_context->EmplaceBackAttr(vector_int_attr); + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr_it->second)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` when construct " diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 1a1171f1dba4d..6f68c261d2b24 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -483,6 +483,10 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { return ctx_.InputVar(name)->IsType(); } + bool IsDenseTensorVectorInput(const std::string& name) const override { + return ctx_.InputVar(name)->IsType(); + } + bool IsDenseTensorOutput(const std::string& name) const override { return ctx_.OutputVar(name)->IsType(); } diff --git a/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc index 23cb653fef22a..7a7a7b2798f59 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc @@ -45,8 +45,8 @@ Program CreateAddProgram() { NetBuilder builder("net_builder"); auto a = builder.CreateInput(Float(32), {M, N}); auto b = builder.CreateInput(Float(32), {M, N}); - auto c = builder.add(a, b); - auto d = builder.add(a, c); + auto c = builder.Add(a, b); + auto d = builder.Add(a, c); auto program = builder.Build(); return program; @@ -116,8 +116,8 @@ TEST(net_build, program_execute_fc) { auto w = builder.CreateInput(Float(32), {N, K}, "W"); // weight auto b = builder.CreateInput(Float(32), {N}, "B"); // bias - auto mul_out = builder.mul(a, w, 2, 1); - auto add_out = builder.add(mul_out, b); + auto mul_out = builder.Mul(a, w, 2, 1); + auto add_out = builder.Add(mul_out, b); auto program = builder.Build(); #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 10ceae62dccbb..e8cd84248ea85 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -79,18 +79,6 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#ifdef PADDLE_WITH_IPU - else if (platform::is_ipu_place(src_place) && // NOLINT - platform::is_cpu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (platform::is_cpu_place(src_place) && - platform::is_ipu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (platform::is_ipu_place(src_place) && - platform::is_ipu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } -#endif #ifdef PADDLE_WITH_CUSTOM_DEVICE else if (platform::is_custom_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { @@ -390,6 +378,29 @@ void 
TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, "Copying from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_IPU + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_ipu_place(dst_place)) { + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_ipu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copying the same data from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copying from %s to %s is not supported.", src_place, dst_place)); + } +#endif } template @@ -447,27 +458,15 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#ifdef PADDLE_WITH_IPU - else if (platform::is_ipu_place(src_place) && // NOLINT - platform::is_cpu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (platform::is_cpu_place(src_place) && // NOLINT - platform::is_ipu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else { // NOLINT - PADDLE_THROW(platform::errors::Unimplemented( - "Copy from %s to %s is not supported.", src_place, dst_place)); - } -#endif #ifdef PADDLE_WITH_CUSTOM_DEVICE else if (platform::is_custom_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { /* custom_device -> cpu*/ memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); - } + } // NOLINT else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_custom_place(dst_place)) { /* cpu -> custom_device*/ memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); - } + } // NOLINT else if (platform::is_custom_place(src_place) && // NOLINT platform::is_custom_place( dst_place)) { /* custom_device -> custom_device*/ @@ -483,11 +482,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } + } // NOLINT else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_xpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } + } // NOLINT else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_xpu_place(dst_place)) { if (src_ptr == dst_ptr) { @@ -502,7 +501,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, auto xpu_ctx = platform::DeviceContextPool::Instance().Get(xpu_dst_place); xpu_ctx->Wait(); } - } + } // NOLINT else { // NOLINT PADDLE_THROW(platform::errors::Unimplemented( "Copy from %s to %s is not supported.", src_place, dst_place)); @@ -601,6 +600,29 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_IPU + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else if (platform::is_cpu_place(src_place) && //
NOLINT + platform::is_ipu_place(dst_place)) { + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_ipu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copying the same data from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); + } +#endif } template @@ -1224,8 +1246,12 @@ void TensorFromStream(std::istream& is, Tensor* tensor, proto::VarType::TensorDesc desc; { // int32_t size // proto buffer - int32_t size; + int32_t size = -1; is.read(reinterpret_cast(&size), sizeof(size)); + PADDLE_ENFORCE_EQ(is.good(), true, platform::errors::Unavailable( + "Cannot read tensor desc size")); + PADDLE_ENFORCE_GE(size, 0, platform::errors::InvalidArgument( + "Tensor desc size should be >= 0")); std::unique_ptr buf(new char[size]); is.read(reinterpret_cast(buf.get()), size); PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 149202468be6c..7d60b7d26f3fb 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -124,7 +124,7 @@ AmpOperators::AmpOperators() OpSupportedInfos("GPU", paddle::framework::proto::VarType::BF16)); unsupported_bf16_ops_->insert(unsupported_ops_gpu_bf16.begin(), unsupported_ops_gpu_bf16.end()); -// NOTE: GPU/NPU/XPU is compiled seperatly. +// NOTE: GPU/NPU/XPU/MLU is compiled separately. #elif defined(PADDLE_WITH_ASCEND_CL) auto unsupported_ops_npu_fp16 = std::get<2>( OpSupportedInfos("NPU", paddle::framework::proto::VarType::FP16)); unsupported_fp16_ops_->insert(unsupported_ops_npu_fp16.begin(), unsupported_ops_npu_fp16.end()); @@ -143,6 +143,15 @@ AmpOperators::AmpOperators() OpSupportedInfos("XPU", paddle::framework::proto::VarType::BF16)); unsupported_bf16_ops_->insert(unsupported_ops_xpu_bf16.begin(), unsupported_ops_xpu_bf16.end()); +#elif defined(PADDLE_WITH_MLU) + auto unsupported_ops_mlu_fp16 = std::get<2>( + OpSupportedInfos("MLU", paddle::framework::proto::VarType::FP16)); + unsupported_fp16_ops_->insert(unsupported_ops_mlu_fp16.begin(), + unsupported_ops_mlu_fp16.end()); + auto unsupported_ops_mlu_bf16 = std::get<2>( + OpSupportedInfos("MLU", paddle::framework::proto::VarType::BF16)); + unsupported_bf16_ops_->insert(unsupported_ops_mlu_bf16.begin(), + unsupported_ops_mlu_bf16.end()); #endif VLOG(4) << allow_ops_->size() << " " << block_ops_->size() << " " << unsupported_fp16_ops_->size() << " " @@ -209,7 +218,10 @@ inline bool NeedCast(const std::shared_ptr& var) { auto data_type = GetDataType(var); if (paddle::platform::is_gpu_place(place) || paddle::platform::is_cuda_pinned_place(place) || - paddle::platform::is_xpu_place(place)) { + paddle::platform::is_xpu_place(place) || + paddle::platform::is_mlu_place(place) || + paddle::platform::is_npu_place(place) || + paddle::platform::is_npu_pinned_place(place)) { // CudaPinnedPlace is added for varbase created by dataloader if (data_type == paddle::framework::proto::VarType::FP32 || data_type == paddle::framework::proto::VarType::FP16 || diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 7416d206fc43e..d7478b18dba06 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -389,6 +389,9 @@ static void PerformBackwardInplace(const std::string& op_type, } void BasicEngine::Execute() { + platform::RecordEvent
backward_record_event( + "backward", platform::TracerEventType::Operator, 1); + if (init_nodes_.empty()) { return; } @@ -412,7 +415,7 @@ void BasicEngine::Execute() { for (auto& cur_op : *shared_cur_node) { platform::RecordEvent op_type_record_event( - cur_op.Type(), platform::TracerEventType::Operator, 1); + cur_op.Type() + " grad_node", platform::TracerEventType::Operator, 1); ++op_num; diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 12aa13bbacc3b..499cf4d8ad6d8 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -423,7 +423,7 @@ void TensorAdd(const VarType& src, VarType* dst) { } if (data_type == framework::proto::VarType::BF16) { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) return TensorAddImpl( src_tensor, dst_tensor, place); #else diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 2317bfdd7c0d5..a427b9b819911 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -234,7 +234,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, auto& kernels = kernels_iter->second; auto kernel_iter = kernels.find(expected_kernel_key); -#ifdef PADDLE_WITH_XPU +#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == kernels.end() || is_xpu_unsupport)) { VLOG(3) << "missing XPU kernel: " << op.Type() @@ -243,28 +243,36 @@ PreparedOp PrepareImpl(const NameVarMap& ins, expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } - #endif #ifdef PADDLE_WITH_XPU_KP - bool use_xpu_kp_kernel_rt = - FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); - bool use_xpu_kp_kernel_debug = - paddle::platform::is_in_xpu_kpwhite_list(op.Type()); - if (use_xpu_kp_kernel_rt) { - VLOG(3) << "xpu_kp using rt mode "; - } - if (use_xpu_kp_kernel_debug) { - VLOG(3) << "xpu_kp using debug mode "; - } - if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && - (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) { - expected_kernel_key.place_ = platform::XPUPlace(); - expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; - kernel_iter = kernels.find(expected_kernel_key); - VLOG(3) << "using XPU KP kernel: " << op.Type() - << ", using_kernel_key:" << expected_kernel_key; + if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) { + bool use_xpu_kp_kernel_rt = + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_in_xpu_kpwhite_list(op.Type()); + if (use_xpu_kp_kernel_rt) { + VLOG(3) << "xpu_kp using rt mode "; + } + if (use_xpu_kp_kernel_debug) { + VLOG(3) << "xpu_kp using debug mode "; + } + bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); + if (is_xpu_kp_support) { + expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; + kernel_iter = kernels.find(expected_kernel_key); + VLOG(3) << "using XPU KP kernel: " << op.Type() + << ", using_kernel_key:" << expected_kernel_key; + } + if (!is_xpu_kp_support && + (kernel_iter == kernels.end() || is_xpu_unsupport)) { + VLOG(3) << "missing XPU kernel: " << op.Type() + << ", expected_kernel_key:" << 
expected_kernel_key + << ", falling back to CPU!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } } #endif diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index d7c0c8cc547e6..9daac181d57de 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -264,14 +264,23 @@ void BuildDygraphPhiKernelContext( size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second); - if ((it == ins.end()) && - (input_defs[i].type_index == - std::type_index(typeid(paddle::optional)))) { - kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); - auto end_idx = start_idx + 1; - kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); - continue; + if (it == ins.end()) { + if (LIKELY(input_defs[i].type_index == + std::type_index( + typeid(paddle::optional)))) { + kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); + auto end_idx = start_idx + 1; + kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); + continue; + } else { + PADDLE_THROW(phi::errors::NotFound( + "Cannot find input variable '%s' for %s OP, please check whether " + "the name setting in OpArgumentMapping is consistent with that in " + "OpMaker.", + input_names[i], pt_kernel_signature.name)); + } } + auto ins_vector = it->second; size_t end_idx = start_idx + ins_vector.size(); @@ -280,14 +289,23 @@ void BuildDygraphPhiKernelContext( auto& var = ins_vector[offset]->Var(); if (var.template IsType()) { tensor_in = &(var.template Get()); + kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); } else if (var.template IsType()) { tensor_in = &(var.template Get()); + kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); + } else if (var.template IsType()) { + paddle::SmallVector tensor_vector; + auto& tensor_array = var.template Get(); + for (auto& t : tensor_array) { + tensor_vector.emplace_back(&t); + } + kernel_ctx->EmplaceBackInputsWithoutSetRange(tensor_vector); + end_idx += tensor_array.size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported input `%s` type when calling pt kernel.", framework::ToTypeName(var.Type()))); } - kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); } kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); } @@ -317,22 +335,32 @@ void BuildDygraphPhiKernelContext( if (var) { if (var->template IsType()) { tensor_out = var->template GetMutable(); + kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } else if (var->template IsType()) { tensor_out = var->template GetMutable(); + kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); + } else if (var->template IsType()) { + paddle::SmallVector tensor_vector; + auto* tensor_array = + var->template GetMutable(); + for (auto& t : *tensor_array) { + tensor_vector.emplace_back(&t); + } + kernel_ctx->EmplaceBackOutputsWithoutSetRange(tensor_vector); + end_idx += tensor_array->size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported output `%s` type when calling pt kernel.", framework::ToTypeName(var->Type()))); } + } else { + kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } - - kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } for (size_t i = 0; i < attr_names.size(); ++i) { - VLOG(1) << "############## attr_name: " << i << " : " << attr_names[i]; if (attr_defs[i].type_index ==
std::type_index(typeid(phi::ScalarArray))) { if (attrs.find(attr_names[i]) != attrs.end()) { // shape is in the attribute @@ -410,6 +438,17 @@ void BuildDygraphPhiKernelContext( experimental::MakePhiScalarFromVar(ins_vector[0]->Var()))); } + } else if (ins.find(attr_names[i]) != ins.end()) { + // deal with tensor attributes here + auto& ins_vector = ins.at(attr_names[i]); + auto tensor_attr = + experimental::MakePhiScalarFromVar(ins_vector[0]->Var()); + if (attr_defs[i].type_index == std::type_index(typeid(int))) { + int val = tensor_attr.template to(); + kernel_ctx->EmplaceBackAttr(val); + } else { + PADDLE_THROW(platform::errors::Unimplemented("only int is supported here")); + } } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); @@ -466,6 +505,7 @@ void BuildDygraphPhiKernelContext( } } else { // TODO(chenweihang): support other attrs later + auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); if (attr_defs[i].type_index == std::type_index(typeid(int))) { kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); @@ -501,6 +541,10 @@ void BuildDygraphPhiKernelContext( } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::vector, attr)); + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + kernel_ctx->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` when construct " diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index fec9afbf3b403..03fa46eab5367 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -1109,8 +1109,9 @@ void Reducer::FinalizeBackward() { if (find_unused_vars_each_step_) { // TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) || \ + defined(PADDLE_WITH_CNCL) ProcessUnusedDenseVars(); #endif // Initialize local used vars diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 3ac2028790608..02a1689c23a3f 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -24,6 +24,10 @@ #include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); namespace platform = paddle::platform; namespace framework = paddle::framework; diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index 17cbe06748234..4cda3f32fdf3f 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -24,6 +24,13 @@ #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/imperative/prepared_operator.h" #include "paddle/fluid/imperative/type_defs.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(split, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(relu,
GPU, ALL_LAYOUT); +#endif namespace imperative = paddle::imperative; namespace platform = paddle::platform; diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index d05036f7a12eb..f754c6fdd0ee7 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -28,6 +28,14 @@ #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); +#endif namespace imperative = paddle::imperative; namespace platform = paddle::platform; @@ -591,5 +599,5 @@ TEST(test_tracer, eager_tracer) { USE_OP(mul); USE_OP(mul_grad); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); +USE_OP_ITSELF(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 01c9d2847e0c8..d18c8e96c49b6 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -177,7 +177,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, paddle::framework::AttributeMap* passed_default_attrs_, bool use_default_attr_map) { platform::RecordEvent op_type_record_event( - type, platform::TracerEventType::Operator, 1); + type + " trace_op", platform::TracerEventType::Operator, 1); platform::ScopedFlushDenormal flush; VLOG(1) << "Trace Op: " << type; if (FLAGS_use_mkldnn) { @@ -297,19 +297,24 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, program_desc_tracer_->InsertOp(type, new_ins, outs, attrs); } - if (ComputeRequiredGrad(new_ins, outs, trace_backward)) { - PADDLE_ENFORCE_EQ( - passed_default_attrs_, nullptr, - paddle::platform::errors::PermissionDenied( - "We expect passed_default_attrs_ is nullptr while " - "use_default_attr_map is true, however we got not null " - "passed_default_attrs_. Please check your usage of trace_op. ")); - CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place, - inplace_map); - } else { - VLOG(3) << "No Grad to track for Op: " << type; + { + platform::RecordEvent node_creation_record_event( + type + " node_creation", platform::TracerEventType::Operator, 1); + + if (ComputeRequiredGrad(new_ins, outs, trace_backward)) { + PADDLE_ENFORCE_EQ( + passed_default_attrs_, nullptr, + paddle::platform::errors::PermissionDenied( + "We expect passed_default_attrs_ is nullptr while " + "use_default_attr_map is true, however we got not null " + "passed_default_attrs_. Please check your usage of trace_op. 
")); + CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place, + inplace_map); + } else { + VLOG(3) << "No Grad to track for Op: " << type; + } + VLOG(6) << "Finish Trace Op: " << type; } - VLOG(6) << "Finish Trace Op: " << type; } template void Tracer::TraceOp( @@ -385,8 +390,8 @@ bool Tracer::ComputeRequiredGrad(const NameTensorMap& ins, } phi::KernelSignature Tracer::GetExpectedKernelSignature( - const std::string& type, const NameVarBaseMap& ins, - const NameVarBaseMap& outs, framework::AttributeMap attrs) const { + const std::string& type, const NameTensorMap& ins, + const NameTensorMap& outs, framework::AttributeMap attrs) const { auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false); framework::RuntimeContext ctx({}, {}); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -401,7 +406,7 @@ phi::KernelSignature Tracer::GetExpectedKernelSignature( attr_checker == nullptr ? empty_attrs_map : attr_checker->GetDefaultAttrMap(); auto dygraph_exe_ctx = - imperative::DygraphExecutionContext( + imperative::DygraphExecutionContext( *op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs, default_attrs); auto* opbase_with_kernel = diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index fd13fce6a6e17..f24961885c9b8 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -156,8 +156,8 @@ class Tracer { } phi::KernelSignature GetExpectedKernelSignature( - const std::string& type, const NameVarBaseMap& ins, - const NameVarBaseMap& outs, framework::AttributeMap attrs) const; + const std::string& type, const NameTensorMap& ins, + const NameTensorMap& outs, framework::AttributeMap attrs) const; paddle::framework::GarbageCollector* MutableGarbageCollectorIfNotExists( const platform::Place& place); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 26b8b9e8e17e0..5d0c3c98d2f61 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -45,6 +45,11 @@ add_subdirectory(api) set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg}) + +if(WITH_ONNXRUNTIME) + set(STATIC_INFERENCE_API ${STATIC_INFERENCE_API} onnxruntime_predictor) +endif() + #TODO(wilber, T8T9): Do we still need to support windows gpu static library? 
if(WIN32 AND WITH_GPU) cc_library(paddle_inference DEPS ${fluid_modules} ${phi_modules} ${STATIC_INFERENCE_API} ${utils_modules}) @@ -91,6 +96,13 @@ if (WITH_PSCORE) set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} fleet ps_service) endif () +if (WITH_ONNXRUNTIME) + set(SHARED_INFERENCE_SRCS ${SHARED_INFERENCE_SRCS} + ${CMAKE_CURRENT_SOURCE_DIR}/api/onnxruntime_predictor.cc + ) + set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} onnxruntime_predictor) +endif (WITH_ONNXRUNTIME) + # Create shared inference library cc_library(paddle_inference_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${SHARED_INFERENCE_DEPS}) diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index a5c32164bf1a2..74e8ca3f229c6 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -188,6 +188,9 @@ struct Argument { DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(use_fc_padding, UseFcPadding, bool); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); + DECL_ARGUMENT_FIELD(use_gpu_fp16, UseGPUFp16, bool); + DECL_ARGUMENT_FIELD(gpu_fp16_disabled_op_types, GpuFp16DisabledOpTypes, + std::unordered_set); // Usually use for trt dynamic shape. // TRT will select the best kernel according to opt shape diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 796c86a3ad1ef..287c896e49bf2 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -189,6 +189,10 @@ void IRPassManager::CreatePasses(Argument *argument, new int(argument->dlnne_min_subgraph_size())); pass->Set("program", new framework::ProgramDesc *(&argument->main_program())); + } else if (pass_name == "mixed_precision_configure_pass") { + pass->Set("gpu_fp16_disabled_op_types", + new std::unordered_set( + argument->gpu_fp16_disabled_op_types())); } if (pass_name == "lite_subgraph_pass") { bool lite_enable_int8 = diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index daa18d8c78bf8..614eea24a0e2e 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" #include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/enforce.h" @@ -65,6 +66,26 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) { #else +void IrParamsSyncAmongDevicesPass::GetVarNameToOpTypeMap( + const framework::ir::Graph &graph, + std::unordered_map *var_name_op_type_map) { + std::vector node_list = + framework::ir::TopologyVarientSort( + graph, static_cast(0)); + for (auto *op_node : node_list) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + for (auto *pre_node : op_node->inputs) { + if (pre_node->IsVar() && pre_node->Var()->Persistable()) { + var_name_op_type_map->insert(std::pair( + pre_node->Var()->Name(), op_node->Op()->Type())); + } + } + } +} + void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { // The parameters are on the cpu, therefore, 
synchronization is not necessary. if (!argument->use_gpu()) return; @@ -102,6 +123,16 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { if (with_dynamic_shape) { reserve_cpu_weights = true; } + + bool mixed_precision_mode = + argument->Has("use_gpu_fp16") && argument->use_gpu_fp16(); + std::unordered_map var_name_op_type_map{}; + std::unordered_set blacklist{}; + if (mixed_precision_mode) { + GetVarNameToOpTypeMap(graph, &var_name_op_type_map); + blacklist = argument->gpu_fp16_disabled_op_types(); + } + for (auto &var_name : all_vars) { if (std::count(repetitive_params.begin(), repetitive_params.end(), var_name)) { @@ -117,18 +148,29 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { var->IsType()) { auto *t = var->GetMutable(); - platform::CPUPlace cpu_place; - framework::LoDTensor temp_tensor; - temp_tensor.Resize(t->dims()); - temp_tensor.mutable_data(cpu_place); - - // Copy the parameter data to a tmp tensor. - paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor); - // Reallocation the space on GPU - t->clear(); - - // Copy parameter data to newly allocated GPU space. - paddle::framework::TensorCopySync(temp_tensor, place, t); + bool is_float = t->dtype() == paddle::experimental::DataType::FLOAT32 || + t->dtype() == paddle::experimental::DataType::FLOAT64; + if (mixed_precision_mode && + !blacklist.count(var_name_op_type_map[var_name]) && is_float) { + framework::Tensor half_tensor; + half_tensor.set_type(paddle::experimental::DataType::FLOAT16); + half_tensor.Resize(t->dims()); + auto *half_data = + half_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < t->numel(); i++) { + auto *data = t->mutable_data(platform::CPUPlace()); + half_data[i] = static_cast(data[i]); + } + t->clear(); + paddle::framework::TensorCopySync(half_tensor, place, t); + } else { + platform::CPUPlace cpu_place; + framework::LoDTensor temp_tensor; + temp_tensor.Resize(t->dims()); + paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor); + t->clear(); + paddle::framework::TensorCopySync(temp_tensor, place, t); + } } } } diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h index d5e98ec886e65..f8209f051d534 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -38,7 +38,12 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass { #ifdef PADDLE_WITH_ASCEND_CL void CopyParamsToNpu(Argument *argument); #else - void CopyParamsToGpu(Argument *argument); + + void GetVarNameToOpTypeMap( + const framework::ir::Graph& graph, + std::unordered_map* var_name_op_type_map); + + void CopyParamsToGpu(Argument* argument); #endif }; diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 1f83e606c3fde..bdc16ef4c7907 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -49,8 +49,15 @@ if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter) endif() -cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} - zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils) +if (WITH_ONNXRUNTIME) + cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} + zero_copy_tensor 
ir_pass_manager op_compatible_info infer_io_utils onnxruntime paddle2onnx) + cc_library(onnxruntime_predictor SRCS onnxruntime_predictor.cc DEPS analysis_predictor) +else (WITH_ONNXRUNTIME) + cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} + zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils) +endif (WITH_ONNXRUNTIME) + cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) @@ -75,6 +82,16 @@ elseif (WIN32) ARGS --dirname=${WORD2VEC_MODEL_DIR}) endif() +if (WITH_ONNXRUNTIME) + if (NOT APPLE AND NOT WIN32) + cc_test(test_onnxruntime_predictor SRCS onnxruntime_predictor_tester.cc DEPS paddle_inference_shared + ARGS --dirname=${MOBILENETV2_MODEL_DIR}) + elseif (WIN32) + cc_test(test_onnxruntime_predictor SRCS onnxruntime_predictor_tester.cc DEPS onnxruntime_predictor benchmark ${inference_deps} + ARGS --dirname=${MOBILENETV2_MODEL_DIR}) + endif() +endif() + if(WITH_TESTING AND WITH_MKLDNN) if (NOT APPLE AND NOT WIN32) cc_test(test_mkldnn_quantizer SRCS mkldnn_quantizer_tester.cc DEPS paddle_inference_shared ARGS --dirname=${WORD2VEC_MODEL_DIR}) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 9c33d70030645..d08d28a3f6233 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -83,6 +83,7 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path, Update(); } + void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -97,12 +98,26 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, Update(); } + void AnalysisConfig::DisableGpu() { use_gpu_ = false; Update(); } +void AnalysisConfig::Exp_EnableUseGpuFp16( + std::unordered_set op_list) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + use_gpu_fp16_ = true; + gpu_fp16_disabled_op_types_.insert(op_list.begin(), op_list.end()); +#else + LOG(ERROR) << "Please compile with gpu to Exp_EnableUseGpuFp16()"; + use_gpu_fp16_ = false; +#endif + + Update(); +} + void AnalysisConfig::DisableFCPadding() { use_fc_padding_ = false; @@ -168,6 +183,33 @@ void AnalysisConfig::SetIpuConfig(bool ipu_enable_fp16, int ipu_replica_num, Update(); } +void AnalysisConfig::EnableONNXRuntime() { +#ifdef PADDLE_WITH_ONNXRUNTIME + use_onnxruntime_ = true; +#else + LOG(ERROR) << "Please compile with onnxruntime to EnableONNXRuntime()"; + use_onnxruntime_ = false; +#endif + + Update(); +} + +void AnalysisConfig::DisableONNXRuntime() { + use_onnxruntime_ = false; + Update(); +} + +void AnalysisConfig::EnableORTOptimization() { +#ifdef PADDLE_WITH_ONNXRUNTIME + enable_ort_optimization_ = true; +#else + LOG(ERROR) << "Please compile with onnxruntime to EnableORTOptimization()"; + enable_ort_optimization_ = false; +#endif + + Update(); +} + AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { #define CP_MEMBER(member__) member__ = other.member__; @@ -186,6 +228,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(use_cudnn_); CP_MEMBER(gpu_device_id_); CP_MEMBER(memory_pool_init_size_mb_); + CP_MEMBER(use_gpu_fp16_); + CP_MEMBER(gpu_fp16_disabled_op_types_); CP_MEMBER(enable_memory_optim_); // TensorRT related. 
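Taken together with the tests added later in this patch, the new switches are used like this (a usage sketch; `Exp_EnableUseGpuFp16` is explicitly experimental, and both ONNXRuntime calls degrade to no-ops with an error log when the corresponding build option is off):

```cpp
paddle_infer::Config config;
config.SetModel(model_dir);      // model_dir is a placeholder
config.EnableONNXRuntime();      // requires -DWITH_ONNXRUNTIME=ON at build time
config.EnableORTOptimization();  // ORT graph-level optimizations
// Experimental GPU fp16 needs IR optimization plus a GPU build:
config.SwitchIrOptim();
config.EnableUseGpu(100 /*MB pool*/, 0 /*device id*/);
config.Exp_EnableUseGpuFp16();
auto predictor = paddle_infer::CreatePredictor(config);
```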
@@ -546,6 +590,20 @@ void AnalysisConfig::Update() { #endif } + if (use_gpu_fp16_) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (!enable_ir_optim_) { + LOG(ERROR) << "Exp_EnableUseGpuFp16() only works when IR optimization is " + "enabled."; + } else if (!use_gpu()) { + LOG(ERROR) + << "Exp_EnableUseGpuFp16() only works when use_gpu is enabled."; + } else { + pass_builder()->Exp_EnableUseGpuFp16(); + } +#endif + } + if (use_mkldnn_) { #ifdef PADDLE_WITH_MKLDNN if (!enable_ir_optim_) { @@ -642,6 +700,8 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << params_file_; ss << use_gpu_; + ss << use_gpu_fp16_; + for (auto &item : gpu_fp16_disabled_op_types_) ss << item; ss << use_fc_padding_; ss << gpu_device_id_; ss << xpu_device_id_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index df61b5103195d..a7caa3e369f80 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -50,8 +50,7 @@ #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/utils/string/split.h" -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" @@ -65,6 +64,10 @@ #include "paddle/fluid/inference/api/mkldnn_quantizer.h" #endif +#ifdef PADDLE_WITH_ONNXRUNTIME +#include "paddle/fluid/inference/api/onnxruntime_predictor.h" +#endif + #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/helper.h" @@ -370,8 +373,7 @@ static void DisablePrepareDataOpt( } bool AnalysisPredictor::PrepareExecutor() { -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { VLOG(3) << "use_dist_model is enabled, will init FleetExecutor."; return PrepareFleetExecutor(); @@ -389,8 +391,7 @@ bool AnalysisPredictor::PrepareExecutor() { return true; } -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) bool AnalysisPredictor::PrepareFleetExecutor() { VLOG(3) << "AnalysisPredictor::PrepareFleetExecutor()"; if (config_.dist_config().nranks() > 1 && !CommInit()) { @@ -868,6 +869,11 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_); } + if (config_.gpu_fp16_enabled()) { + argument_.SetUseGPUFp16(true); + argument_.SetGpuFp16DisabledOpTypes(config_.gpu_fp16_disabled_op_types_); + } + if (config_.lite_engine_enabled()) { argument_.SetCpuMathLibraryNumThreads( config_.cpu_math_library_num_threads()); @@ -1185,8 +1191,7 @@ std::vector AnalysisPredictor::GetOutputNames() { std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { framework::Scope *scope; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { scope = scope_.get(); } else { @@ -1235,8 +1240,7 @@ std::unique_ptr 
AnalysisPredictor::GetInputTensor( std::unique_ptr AnalysisPredictor::GetOutputTensor( const std::string &name) { framework::Scope *scope; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { scope = scope_.get(); } else { @@ -1283,8 +1287,7 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( } bool AnalysisPredictor::ZeroCopyRun() { -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { VLOG(3) << "ZeroCopyRun will use the fleet executor."; inference::Timer timer; @@ -1762,6 +1765,27 @@ namespace paddle_infer { Predictor::Predictor(const Config &config) { const_cast<Config *>(&config)->SwitchUseFeedFetchOps(false); // The second parameter indicates that the discard log is not printed + if (config.use_onnxruntime()) { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (config.use_gpu()) { + LOG(WARNING) << "The current ONNXRuntime backend doesn't support GPU, " + "falling back to Paddle Inference."; + } else if (!paddle::CheckConvertToONNX(config)) { + LOG(WARNING) + << "Paddle2ONNX doesn't support converting this model, falling " + "back to Paddle Inference."; + } else { + predictor_ = paddle::CreatePaddlePredictor< + Config, paddle::PaddleEngineKind::kONNXRuntime>(config); + return; + } +#else + LOG(WARNING) + << "The onnxruntime backend isn't enabled; please re-compile Paddle " + "with the WITH_ONNXRUNTIME option. Falling back to Paddle " + "Inference."; +#endif + } predictor_ = paddle::CreatePaddlePredictor< Config, paddle::PaddleEngineKind::kAnalysis>(config); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 21a7e9658bbee..d9992f3fbef9d 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -18,8 +18,7 @@ #include #include #include -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #endif #include "paddle/fluid/framework/naive_executor.h" @@ -395,8 +394,7 @@ class AnalysisPredictor : public PaddlePredictor { void StatisticShapeRangeInfo(); void CollectShapeRangeInfo(); -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // fleet exe related /// @@ -488,8 +486,7 @@ class AnalysisPredictor : public PaddlePredictor { std::map>> shape_info_; static int clone_num_; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // fleet executor related distributed::FleetExecutorDesc executor_desc_; std::shared_ptr fleet_exe_; diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 9c7e5c6b27e68..ecb5eaf982548 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -357,6 +357,37 @@ TEST(AnalysisPredictor, set_xpu_device_id) { } #endif +TEST(AnalysisPredictor,
enable_onnxruntime) { + AnalysisConfig config; + config.EnableONNXRuntime(); +#ifdef PADDLE_WITH_ONNXRUNTIME + ASSERT_TRUE(config.use_onnxruntime()); +#else + ASSERT_TRUE(!config.use_onnxruntime()); +#endif + config.EnableORTOptimization(); +#ifdef PADDLE_WITH_ONNXRUNTIME + ASSERT_TRUE(config.ort_optimization_enabled()); +#else + ASSERT_TRUE(!config.ort_optimization_enabled()); +#endif + config.DisableONNXRuntime(); + ASSERT_TRUE(!config.use_onnxruntime()); +} + +TEST(AnalysisPredictor, exp_enable_use_gpu_fp16) { + AnalysisConfig config; + config.SwitchIrOptim(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + config.EnableUseGpu(100, 0); + config.Exp_EnableUseGpuFp16(); + ASSERT_TRUE(config.gpu_fp16_enabled()); +#else + config.DisableGpu(); +#endif + LOG(INFO) << config.Summary(); +} + } // namespace paddle namespace paddle_infer { @@ -408,6 +439,27 @@ TEST(Predictor, Run) { predictor->TryShrinkMemory(); } +TEST(Predictor, EnableONNXRuntime) { + Config config; + config.SetModel(FLAGS_dirname); + config.EnableONNXRuntime(); + config.EnableORTOptimization(); + auto predictor = CreatePredictor(config); +} + +TEST(Predictor, Exp_EnableUseGpuFp16) { + Config config; + config.SetModel(FLAGS_dirname); + config.SwitchIrOptim(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + config.EnableUseGpu(100, 0); + config.Exp_EnableUseGpuFp16(); +#else + config.DisableGpu(); +#endif + auto predictor = CreatePredictor(config); +} + TEST(Tensor, CpuShareExternalData) { Config config; config.SetModel(FLAGS_dirname); diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index d03840ada36bc..df98a7b05cf3f 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -4,6 +4,7 @@ option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL. option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON) option(USE_TENSORRT "Compile demo with TensorRT." 
OFF)
+option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF)
 
 if(NOT WITH_STATIC_LIB)
   add_definitions("-DPADDLE_WITH_SHARED_LIB")
@@ -46,6 +47,13 @@ link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib")
 link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib")
 link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib")
 link_directories("${PADDLE_LIB}/paddle/lib")
+if (WITH_ONNXRUNTIME)
+  include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include")
+  include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include")
+
+  link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib")
+  link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib")
+endif()
 
 if (WIN32)
   add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
@@ -151,6 +159,17 @@ else()
   endif()
 endif()
 
+if (WITH_ONNXRUNTIME)
+  if(WIN32)
+    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx)
+  elseif(APPLE)
+    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx)
+  else()
+    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx)
+  endif()
+endif()
+
+
 if (NOT WIN32)
   set(EXTERNAL_LIB "-lrt -ldl -lpthread")
   set(DEPS ${DEPS}
@@ -213,6 +232,14 @@ if(WIN32)
       COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release
     )
   endif()
+  if(WITH_ONNXRUNTIME)
+    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll
+        ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
+      COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll
+        ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
+    )
+  endif()
   if(NOT WITH_STATIC_LIB)
     add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
       COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
diff --git a/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc
new file mode 100644
index 0000000000000..ef5c08cd041eb
--- /dev/null
+++ b/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc
@@ -0,0 +1,64 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file contains a MobileNet demo for the ONNXRuntime backend.
+ */
+
+#include <glog/logging.h>  // use glog instead of CHECK to avoid importing other paddle header files.
+#include <vector>
+#include "gflags/gflags.h"
+#include "utils.h"  // NOLINT
+
+DEFINE_string(modeldir, "", "Directory of the inference model.");
+
+namespace paddle {
+namespace demo {
+
+/*
+ * Run the demo model with the ONNXRuntime engine.
+ */
+void Main() {
+  paddle::AnalysisConfig config;
+  config.EnableONNXRuntime();
+  config.SetModel(FLAGS_modeldir + "/inference.pdmodel",
+                  FLAGS_modeldir + "/inference.pdiparams");
+  auto predictor = paddle_infer::CreatePredictor(config);
+
+  // Inference.
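+  // Zero-copy flow: look up the tensor names, reshape the input handle,
+  // copy host data in, run the session, then copy the 1000-class output
+  // back to the host.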
+  std::vector<int> input_shape = {1, 3, 224, 224};
+  std::vector<float> input_data(1 * 3 * 224 * 224, 1.0);
+  std::vector<float> out_data;
+  out_data.resize(1000);
+  auto input_names = predictor->GetInputNames();
+  auto output_names = predictor->GetOutputNames();
+  auto input_tensor = predictor->GetInputHandle(input_names[0]);
+  input_tensor->Reshape(input_shape);
+  auto output_tensor = predictor->GetOutputHandle(output_names[0]);
+
+  input_tensor->CopyFromCpu(input_data.data());
+  predictor->Run();
+  output_tensor->CopyToCpu(out_data.data());
+
+  VLOG(3) << "output.size " << out_data.size();
+}
+
+}  // namespace demo
+}  // namespace paddle
+
+int main(int argc, char** argv) {
+  ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
+  paddle::demo::Main();
+  return 0;
+}
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index 5f062e8063253..2c0945cd5b386 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -21,7 +21,8 @@ TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode
 DATA_DIR=$4 # dataset
 USE_TENSORRT=$5
 TENSORRT_ROOT_DIR=$6 # TensorRT root dir, default to /usr
-MSVC_STATIC_CRT=$7
+WITH_ONNXRUNTIME=$7
+MSVC_STATIC_CRT=$8
 inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir
 WIN_DETECT=$(echo `uname` | grep "Win") # detect current platform
@@ -38,6 +39,30 @@ else
   use_gpu_list='false'
 fi
 
+mkdir -p $DATA_DIR
+cd $DATA_DIR
+
+if [ "$WITH_ONNXRUNTIME" == ON ]; then
+  ONNXRUNTIME_LIB=${inference_install_dir}/third_party/install/onnxruntime/lib
+  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${ONNXRUNTIME_LIB}
+  PADDLE2ONNX_LIB=${inference_install_dir}/third_party/install/paddle2onnx/lib
+  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${PADDLE2ONNX_LIB}
+  # download model
+  mkdir -p MobileNetV2
+  cd MobileNetV2
+  if [[ -e "MobileNetV2.inference.model.tar.gz" ]]; then
+    echo "MobileNetV2.inference.model.tar.gz has been downloaded."
+  else
+    if [ "$WIN_DETECT" != "" ]; then
+      wget -q -Y off http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz
+    else
+      wget -q --no-proxy http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz
+    fi
+    tar xzf *.tar.gz
+  fi
+  cd ..
+fi
+
 PREFIX=inference-vis-demos%2F
 URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX}
 
@@ -58,8 +83,7 @@ function download() {
   fi
   cd ..
} -mkdir -p $DATA_DIR -cd $DATA_DIR + vis_demo_list='se_resnext50 ocr mobilenet' for vis_demo_name in $vis_demo_list; do download $vis_demo_name @@ -93,7 +117,8 @@ for WITH_STATIC_LIB in ON OFF; do -DDEMO_NAME=simple_on_word2vec \ -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ - -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT + -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln for use_gpu in $use_gpu_list; do Release/simple_on_word2vec.exe \ @@ -112,7 +137,8 @@ for WITH_STATIC_LIB in ON OFF; do -DDEMO_NAME=vis_demo \ -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ - -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT + -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln for use_gpu in $use_gpu_list; do for vis_demo_name in $vis_demo_list; do @@ -138,7 +164,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ -DUSE_TENSORRT=$USE_TENSORRT \ - -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln Release/trt_mobilenet_demo.exe \ --modeldir=$DATA_DIR/mobilenet/model \ @@ -156,7 +183,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=simple_on_word2vec \ -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=$WITH_STATIC_LIB + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) word2vec_model=$DATA_DIR'/word2vec/word2vec.inference.model' if [ -d $word2vec_model ]; then @@ -176,7 +204,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=vis_demo \ -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=$WITH_STATIC_LIB + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) for use_gpu in $use_gpu_list; do for vis_demo_name in $vis_demo_list; do @@ -200,7 +229,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ -DUSE_TENSORRT=$USE_TENSORRT \ - -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) ./trt_mobilenet_demo \ --modeldir=$DATA_DIR/mobilenet/model \ @@ -211,6 +241,26 @@ for WITH_STATIC_LIB in ON OFF; do exit 1 fi fi + + # --------onnxruntime mobilenetv2 on linux/mac------ + if [ $WITH_ONNXRUNTIME == ON ]; then + rm -rf * + cmake .. -DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ + -DDEMO_NAME=onnxruntime_mobilenet_demo \ + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DUSE_TENSORRT=$USE_TENSORRT \ + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME + make -j$(nproc) + ./onnxruntime_mobilenet_demo \ + --modeldir=$DATA_DIR/MobileNetV2/MobileNetV2 + if [ $? -ne 0 ]; then + echo "onnxruntime demo onnxruntime_mobilenet_demo runs fail." 
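+      # a failing demo run aborts the whole test script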
+ exit 1 + fi + fi fi done set +x diff --git a/paddle/fluid/inference/api/details/CMakeLists.txt b/paddle/fluid/inference/api/details/CMakeLists.txt index 4341fb0a9ccd8..b2cfb060dd325 100644 --- a/paddle/fluid/inference/api/details/CMakeLists.txt +++ b/paddle/fluid/inference/api/details/CMakeLists.txt @@ -14,7 +14,11 @@ # cc_library(reset_tensor_array SRCS reset_tensor_array.cc DEPS lod_tensor scope) -cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce) +if (WITH_ONNXRUNTIME) + cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce onnxruntime) +else (WITH_ONNXRUNTIME) + cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce) +endif (WITH_ONNXRUNTIME) cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc) cc_test(zero_copy_tensor_test SRCS zero_copy_tensor_test.cc DEPS paddle_inference_api) diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 18b1d09f0e8a7..66dec0157d98e 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -22,12 +22,22 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" #include "paddle/phi/core/allocator.h" +#ifdef PADDLE_WITH_ONNXRUNTIME +#include "paddle/fluid/inference/api/onnxruntime_predictor.h" +#endif namespace paddle_infer { using float16 = paddle::platform::float16; void Tensor::Reshape(const std::vector &shape) { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + shape_.assign(shape.begin(), shape.end()); + return; + } +#endif + PADDLE_ENFORCE_EQ( name_.empty(), false, paddle::platform::errors::PreconditionNotMet( @@ -123,6 +133,11 @@ T *Tensor::data(PlaceType *place, int *size) const { } DataType Tensor::type() const { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + return dtype_; + } +#endif EAGER_GET_TENSOR(paddle::framework::LoDTensor); auto type = paddle::framework::TransToProtoVarType(tensor->dtype()); if (type == paddle::framework::proto::VarType::FP32) { @@ -145,6 +160,13 @@ PlaceType Tensor::place() const { return place_; } template void Tensor::CopyFromCpu(const T *data) { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + ORTCopyFromCpu(data); + return; + } +#endif + EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_GE(tensor->numel(), 0, paddle::platform::errors::PreconditionNotMet( @@ -382,6 +404,13 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, template void Tensor::CopyToCpu(T *data) const { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + ORTCopyToCpu(data); + return; + } +#endif + CopyToCpuImpl(data, nullptr, nullptr, nullptr); } @@ -489,12 +518,7 @@ template PD_INFER_DECL uint8_t *Tensor::mutable_data(PlaceType place); template PD_INFER_DECL int8_t *Tensor::mutable_data(PlaceType place); template PD_INFER_DECL float16 *Tensor::mutable_data(PlaceType place); -Tensor::Tensor(void *scope) : scope_{scope} { - PADDLE_ENFORCE_NOT_NULL(scope_, - paddle::platform::errors::PreconditionNotMet( - "The `scope` can not be nullptr. 
It should be "
-          "set to the pointer of scope."));
-}
+Tensor::Tensor(void *scope) : scope_{scope} {}
 
 template <typename T>
 void *Tensor::FindTensor() const {
@@ -513,6 +537,26 @@ void *Tensor::FindTensor() const {
 }
 
 std::vector<int> Tensor::shape() const {
+#ifdef PADDLE_WITH_ONNXRUNTIME
+  if (is_ort_tensor_) {
+    std::vector<int> shape;
+    // input handle
+    if (idx_ < 0) {
+      shape.assign(shape_.begin(), shape_.end());
+    } else {  // output handle
+      auto binding = binding_.lock();
+      PADDLE_ENFORCE_NOT_NULL(binding,
+                              paddle::platform::errors::PreconditionNotMet(
+                                  "output tensor [%s] has no binding ptr",
+                                  name_));
+      std::vector<Ort::Value> outputs = binding->GetOutputValues();
+      Ort::Value &value = outputs[idx_];
+      auto info = value.GetTensorTypeAndShapeInfo();
+      auto ort_shape = info.GetShape();
+      shape.assign(ort_shape.begin(), ort_shape.end());
+    }
+    return shape;
+  }
+#endif
   EAGER_GET_TENSOR(paddle::framework::LoDTensor);
   PADDLE_ENFORCE_NOT_NULL(
       tensor_, paddle::platform::errors::PreconditionNotMet(
@@ -573,4 +617,99 @@ void Tensor::SetPlace(PlaceType place, int device) {
   device_ = device;
 }
 
+#ifdef PADDLE_WITH_ONNXRUNTIME
+void Tensor::SetOrtMark(bool is_ort_tensor) { is_ort_tensor_ = is_ort_tensor; }
+
+void Tensor::SetOrtBinding(const std::shared_ptr<Ort::IoBinding> binding) {
+  binding_ = binding;
+}
+
+Ort::Value GetOrtValue(const Ort::MemoryInfo &memory_info, float *data,
+                       size_t size, const int64_t *shape, size_t shape_len) {
+  return Ort::Value::CreateTensor<float>(memory_info, data, size, shape,
+                                         shape_len);
+}
+
+Ort::Value GetOrtValue(const Ort::MemoryInfo &memory_info, int64_t *data,
+                       size_t size, const int64_t *shape, size_t shape_len) {
+  return Ort::Value::CreateTensor<int64_t>(memory_info, data, size, shape,
+                                           shape_len);
+}
+
+Ort::Value GetOrtValue(const Ort::MemoryInfo &memory_info, int32_t *data,
+                       size_t size, const int64_t *shape, size_t shape_len) {
+  return Ort::Value::CreateTensor<int32_t>(memory_info, data, size, shape,
+                                           shape_len);
+}
+
+Ort::Value GetOrtValue(const Ort::MemoryInfo &memory_info, uint8_t *data,
+                       size_t size, const int64_t *shape, size_t shape_len) {
+  return Ort::Value::CreateTensor<uint8_t>(memory_info, data, size, shape,
+                                           shape_len);
+}
+
+Ort::Value GetOrtValue(const Ort::MemoryInfo &memory_info, int8_t *data,
+                       size_t size, const int64_t *shape, size_t shape_len) {
+  return Ort::Value::CreateTensor<int8_t>(memory_info, data, size, shape,
+                                          shape_len);
+}
+
+Ort::Value GetOrtValue(const Ort::MemoryInfo &memory_info, float16 *data,
+                       size_t size, const int64_t *shape, size_t shape_len) {
+  return Ort::Value::CreateTensor(memory_info, static_cast<void *>(data),
+                                  size * sizeof(float16), shape, shape_len,
+                                  ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16);
+}
+
+template <typename T>
+void Tensor::ORTCopyFromCpu(const T *data) {
+  auto binding = binding_.lock();
+  PADDLE_ENFORCE_NOT_NULL(binding,
+                          paddle::platform::errors::PreconditionNotMet(
+                              "input tensor [%s] has no binding ptr", name_));
+  const char *device_name = place_ == PlaceType::kCPU ?
"Cpu" : "Cuda"; + Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, device_, + OrtMemTypeDefault); + size_t size = std::accumulate(begin(shape_), end(shape_), 1UL, + std::multiplies()); + auto ort_value = GetOrtVaule(memory_info, const_cast(data), size, + shape_.data(), shape_.size()); + binding->BindInput(name_.c_str(), ort_value); +} + +template +void Tensor::ORTCopyToCpu(T *data) const { + auto binding = binding_.lock(); + PADDLE_ENFORCE_NOT_NULL(binding, + paddle::platform::errors::PreconditionNotMet( + "output tensor [%s] no binding ptr", name_)); + std::vector outputs = binding->GetOutputValues(); + Ort::Value &value = outputs[idx_]; + auto info = value.GetTensorTypeAndShapeInfo(); + size_t size = info.GetElementCount() * sizeof(T); + + if (place_ == PlaceType::kCPU) { + std::memcpy(static_cast(data), value.GetTensorData(), size); + } else { + paddle::memory::Copy(paddle::platform::CPUPlace(), + static_cast(data), + paddle::platform::CUDAPlace(device_), + value.GetTensorData(), size, nullptr); + } +} + +template void Tensor::ORTCopyFromCpu(const float *data); +template void Tensor::ORTCopyFromCpu(const int64_t *data); +template void Tensor::ORTCopyFromCpu(const int32_t *data); +template void Tensor::ORTCopyFromCpu(const uint8_t *data); +template void Tensor::ORTCopyFromCpu(const int8_t *data); +template void Tensor::ORTCopyFromCpu(const float16 *data); + +template void Tensor::ORTCopyToCpu(float *data) const; +template void Tensor::ORTCopyToCpu(int32_t *data) const; +template void Tensor::ORTCopyToCpu(uint8_t *data) const; +template void Tensor::ORTCopyToCpu(int8_t *data) const; +template void Tensor::ORTCopyToCpu(float16 *data) const; +#endif + } // namespace paddle_infer diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.cc b/paddle/fluid/inference/api/onnxruntime_predictor.cc new file mode 100644 index 0000000000000..bd9de252a0962 --- /dev/null +++ b/paddle/fluid/inference/api/onnxruntime_predictor.cc @@ -0,0 +1,307 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "paddle/fluid/inference/api/onnxruntime_predictor.h"
+
+#include <glog/logging.h>
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "paddle/fluid/platform/device/gpu/gpu_types.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/version.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include "paddle/fluid/inference/utils/io_utils.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/cpu_helper.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+
+paddle_infer::DataType ConvertONNXType(ONNXTensorElementDataType type) {
+  switch (type) {
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT:
+      return paddle_infer::DataType::FLOAT32;
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16:
+      return paddle_infer::DataType::FLOAT16;
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8:
+      return paddle_infer::DataType::INT8;
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32:
+      return paddle_infer::DataType::INT32;
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64:
+      return paddle_infer::DataType::INT64;
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8:
+      return paddle_infer::DataType::UINT8;
+    default:
+      LOG(ERROR) << "unsupported ONNX Tensor Type: " << static_cast<int>(type);
+      return paddle_infer::DataType::FLOAT32;
+  }
+}
+
+bool CheckConvertToONNX(const AnalysisConfig &config) {
+  if (!config.model_dir().empty()) {
+    LOG(ERROR) << "Paddle2ONNX doesn't support the model_dir config";
+    // TODO(heliqi, jiangjiajun): Paddle2ONNX doesn't support
+    // config.model_dir() + "/__model__"
+    // config.model_dir() + var_name
+    return false;
+  } else if (config.prog_file().empty() || config.params_file().empty()) {
+    LOG(ERROR) << string::Sprintf(
+        "Invalid model path '%s', program path '%s' or params path '%s'.",
+        config.model_dir(), config.prog_file(), config.params_file());
+    return false;
+  }
+  return paddle2onnx::IsExportable(config.prog_file(), config.params_file(),
+                                   config.model_from_memory());
+}
+
+bool ONNXRuntimePredictor::Init() {
+  VLOG(3) << "ONNXRuntime Predictor::init()";
+
+  // Now ONNXRuntime only supports CPU
+  const char *device_name = config_.use_gpu() ?
"Cuda" : "Cpu"; + if (config_.use_gpu()) { + place_ = paddle::platform::CUDAPlace(config_.gpu_device_id()); + } else { + place_ = paddle::platform::CPUPlace(); + } + + std::string onnx_proto; + paddle2onnx::Export(config_.prog_file(), config_.params_file(), &onnx_proto, + config_.model_from_memory()); + + Ort::SessionOptions session_options; + if (config_.ort_optimization_enabled()) { + session_options.SetGraphOptimizationLevel( + GraphOptimizationLevel::ORT_ENABLE_ALL); + } + // Turn optimization off first, and then turn it on when it's stable + // session_options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + // session_options.EnableCpuMemArena(); + // session_options.EnableMemPattern(); + // session_options.SetInterOpNumThreads(config_.cpu_math_library_num_threads()); + session_options.SetIntraOpNumThreads(config_.cpu_math_library_num_threads()); + VLOG(2) << "ONNXRuntime threads " << config_.cpu_math_library_num_threads(); + if (config_.profile_enabled()) { + LOG(WARNING) << "ONNXRuntime Profiler is activated, which might affect the " + "performance"; +#if defined(_WIN32) + session_options.EnableProfiling(L"ONNX"); +#else + session_options.EnableProfiling("ONNX"); +#endif + } else { + VLOG(2) << "ONNXRuntime Profiler is deactivated, and no profiling report " + "will be " + "generated."; + } + session_ = {env_, onnx_proto.data(), onnx_proto.size(), session_options}; + binding_ = std::make_shared(session_); + + Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, + place_.GetDeviceId(), OrtMemTypeDefault); + Ort::Allocator allocator(session_, memory_info); + + size_t n_inputs = session_.GetInputCount(); + for (size_t i = 0; i < n_inputs; ++i) { + auto input_name = session_.GetInputName(i, allocator); + auto type_info = session_.GetInputTypeInfo(i); + std::vector shape = + type_info.GetTensorTypeAndShapeInfo().GetShape(); + ONNXTensorElementDataType data_type = + type_info.GetTensorTypeAndShapeInfo().GetElementType(); + input_desc_.emplace_back(ONNXDesc{input_name, shape, data_type}); + allocator.Free(input_name); + } + + size_t n_outputs = session_.GetOutputCount(); + for (size_t i = 0; i < n_outputs; ++i) { + auto output_name = session_.GetOutputName(i, allocator); + auto type_info = session_.GetOutputTypeInfo(i); + std::vector shape = + type_info.GetTensorTypeAndShapeInfo().GetShape(); + ONNXTensorElementDataType data_type = + type_info.GetTensorTypeAndShapeInfo().GetElementType(); + output_desc_.emplace_back(ONNXDesc{output_name, shape, data_type}); + + Ort::MemoryInfo out_memory_info(device_name, OrtDeviceAllocator, + place_.GetDeviceId(), OrtMemTypeDefault); + binding_->BindOutput(output_name, out_memory_info); + + allocator.Free(output_name); + } + return true; +} + +template <> +std::unique_ptr +CreatePaddlePredictor( + const AnalysisConfig &config) { + if (config.glog_info_disabled()) { + FLAGS_logtostderr = 1; + FLAGS_minloglevel = 2; // GLOG_ERROR + } + + PADDLE_ENFORCE_EQ( + config.is_valid(), true, + platform::errors::InvalidArgument( + "Note: Each config can only be used for one predictor.")); + + VLOG(3) << "create ONNXRuntimePredictor"; + + std::unique_ptr predictor(new ONNXRuntimePredictor(config)); + // Each config can only be used for one predictor. 
+  config.SetInValid();
+  auto predictor_p = dynamic_cast<ONNXRuntimePredictor *>(predictor.get());
+
+  if (!predictor_p->Init()) {
+    return nullptr;
+  }
+
+  return predictor;
+}
+
+std::vector<std::string> ONNXRuntimePredictor::GetInputNames() {
+  std::vector<std::string> input_names;
+  for (const auto &input_desc : input_desc_) {
+    input_names.push_back(input_desc.name);
+  }
+  return input_names;
+}
+
+std::map<std::string, std::vector<int64_t>>
+ONNXRuntimePredictor::GetInputTensorShape() {
+  std::map<std::string, std::vector<int64_t>> input_shapes;
+  for (const auto &input_desc : input_desc_) {
+    input_shapes[input_desc.name] = input_desc.shape;
+  }
+  return input_shapes;
+}
+
+std::vector<std::string> ONNXRuntimePredictor::GetOutputNames() {
+  std::vector<std::string> output_names;
+  for (const auto &output_desc : output_desc_) {
+    output_names.push_back(output_desc.name);
+  }
+  return output_names;
+}
+
+bool ONNXRuntimePredictor::FindONNXDesc(const std::string &name,
+                                        bool is_input) {
+  if (is_input) {
+    for (const auto &i : input_desc_)
+      if (i.name == name) return true;
+  } else {
+    for (const auto &i : output_desc_)
+      if (i.name == name) return true;
+  }
+  return false;
+}
+
+std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetInputTensor(
+    const std::string &name) {
+  PADDLE_ENFORCE_EQ(FindONNXDesc(name, true), true,
+                    platform::errors::PreconditionNotMet(
+                        "The input variable named %s is not found in the "
+                        "ONNXPredictor.",
+                        name));
+  std::unique_ptr<ZeroCopyTensor> res(new ZeroCopyTensor(nullptr));
+  res->input_or_output_ = true;
+  res->SetName(name);
+  if (platform::is_cpu_place(place_)) {
+    res->SetPlace(PaddlePlace::kCPU);
+  } else {
+    auto gpu_place = place_;
+    res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
+  }
+  res->SetOrtMark(true);
+  res->SetOrtBinding(binding_);
+  return res;
+}
+
+std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetOutputTensor(
+    const std::string &name) {
+  PADDLE_ENFORCE_EQ(FindONNXDesc(name, false), true,
+                    platform::errors::PreconditionNotMet(
+                        "The output variable named %s is not found in the "
+                        "ONNXPredictor.",
+                        name));
+  std::unique_ptr<ZeroCopyTensor> res(new ZeroCopyTensor(nullptr));
+  res->input_or_output_ = false;
+  res->SetName(name);
+  if (platform::is_cpu_place(place_)) {
+    res->SetPlace(PaddlePlace::kCPU);
+  } else {
+    auto gpu_place = place_;
+    res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
+  }
+  res->SetOrtMark(true);
+  res->SetOrtBinding(binding_);
+  int size = output_desc_.size();
+  for (int i = 0; i < size; ++i)
+    if (output_desc_[i].name == name) {
+      res->idx_ = i;
+      res->dtype_ = ConvertONNXType(output_desc_[i].dtype);
+      break;
+    }
+  return res;
+}
+
+bool ONNXRuntimePredictor::Run(const std::vector<PaddleTensor> &inputs,
+                               std::vector<PaddleTensor> *output_data,
+                               int batch_size) {
+  LOG(ERROR) << "Run() is not supported by ONNXRuntimePredictor; use "
+                "ZeroCopyRun() instead.";
+  return false;
+}
+
+bool ONNXRuntimePredictor::ZeroCopyRun() {
+  try {
+    session_.Run({}, *(binding_.get()));
+  } catch (const std::exception &e) {
+    LOG(ERROR) << e.what();
+    return false;
+  }
+
+  return true;
+}
+
+std::unique_ptr<PaddlePredictor> ONNXRuntimePredictor::Clone() {
+  LOG(ERROR) << "Clone() is not supported; please create a new predictor "
+                "instead.";
+  return nullptr;
+}
+
+uint64_t ONNXRuntimePredictor::TryShrinkMemory() {
+  return paddle::memory::Release(place_);
+}
+
+ONNXRuntimePredictor::~ONNXRuntimePredictor() {
+  binding_->ClearBoundInputs();
+  binding_->ClearBoundOutputs();
+
+  memory::Release(place_);
+}
+
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.h b/paddle/fluid/inference/api/onnxruntime_predictor.h
new file mode 100644
index 0000000000000..d01756e4b96b1
--- /dev/null
+++ b/paddle/fluid/inference/api/onnxruntime_predictor.h
@@ -0,0 +1,211 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include
+#include
+#include
+#include
+#include
+#include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/framework/op_compatible_info.h"
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/api/api_impl.h"
+#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/platform/device/gpu/gpu_types.h"
+#include "paddle/fluid/string/printf.h"
+
+#include "onnxruntime_c_api.h"    // NOLINT
+#include "onnxruntime_cxx_api.h"  // NOLINT
+#include "paddle2onnx/converter.h"
+
+#ifdef PADDLE_WITH_TESTING
+#include <gtest/gtest.h>
+#include <gtest/gtest_prod.h>
+#endif
+
+///
+/// \file onnxruntime_predictor.h
+///
+/// \brief A predictor using ONNXRuntime
+///
+/// \author heliqi@baidu.com
+/// \date 2022-02-14
+/// \since 2.3.0
+///
+
+namespace paddle {
+
+bool CheckConvertToONNX(const AnalysisConfig &config);
+
+struct ONNXDesc {
+  std::string name;
+  std::vector<int64_t> shape;
+  ONNXTensorElementDataType dtype;
+};
+
+///
+/// \class ONNXRuntimePredictor
+///
+/// \brief The ONNXRuntimePredictor using ONNXRuntime for inference
+///
+/// The predictor has the following typical uses:
+///
+/// Get predictor
+/// \code{cpp}
+///   auto predictor = CreatePaddlePredictor(config);
+/// \endcode
+///
+/// Get input or output names
+/// \code{cpp}
+///   auto input_names = predictor->GetInputNames();
+///   auto output_names = predictor->GetOutputNames();
+/// \endcode
+///
+/// Get input or output tensors
+/// \code{cpp}
+///   auto input_t = predictor->GetInputTensor(input_names[0]);
+///   auto output_t = predictor->GetOutputTensor(output_names[0]);
+/// \endcode
+///
+/// Run predictor
+/// \code{cpp}
+///   predictor->ZeroCopyRun();
+/// \endcode
+///
+class ONNXRuntimePredictor : public PaddlePredictor {
+ public:
+  ///
+  /// \brief Construct a new ONNXRuntime Predictor object
+  ///
+  /// \param[in] config AnalysisConfig
+  ///
+  explicit ONNXRuntimePredictor(const AnalysisConfig &config)
+      : config_(config), env_(ORT_LOGGING_LEVEL_WARNING, "onnx") {
+    predictor_id_ = inference::GetUniqueId();
+  }
+  ///
+  /// \brief Destroy the ONNXRuntime Predictor object
+  ///
+  ~ONNXRuntimePredictor();
+
+  ///
+  /// \brief Initialize predictor
+  ///
+  /// \return Whether the init function executed successfully
+  ///
+  bool Init();
+
+  ///
+  /// \brief Get the input names
+  ///
+  /// \return input names
+  ///
+  std::vector<std::string> GetInputNames();
+
+  ///
+  /// \brief Get the output names
+  ///
+  /// \return output names
+  ///
+  std::vector<std::string> GetOutputNames();
+
+  ///
+  /// \brief Get the Input Tensor object
+  ///
+  /// \param[in] name input name
+  /// \return input tensor
+  ///
+  std::unique_ptr<ZeroCopyTensor> GetInputTensor(
+      const std::string &name) override;
+
+  ///
+  /// \brief Get the Output Tensor object
+  ///
+  /// \param[in] name output name
+  /// \return output tensor
+  ///
+  std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
+      const
std::string &name) override;
+  ///
+  /// \brief Get all input names and their corresponding shapes
+  ///
+  /// \return the map of input names and shapes
+  ///
+  std::map<std::string, std::vector<int64_t>> GetInputTensorShape() override;
+
+  /// Not supported; use ZeroCopyRun() instead.
+  bool Run(const std::vector<PaddleTensor> &inputs,
+           std::vector<PaddleTensor> *output_data,
+           int batch_size = -1) override;
+
+  ///
+  /// \brief Run the prediction engine
+  ///
+  /// \return Whether the function executed successfully
+  ///
+  bool ZeroCopyRun() override;
+
+  ///
+  /// \brief Release all temporary tensors to compress the size of the memory
+  /// pool. The memory pool is considered to be composed of a list of chunks;
+  /// if a chunk is not occupied, it can be released.
+  ///
+  /// \return Number of bytes released. It may be smaller than the actual
+  /// released memory, because part of the memory is not managed by the
+  /// MemoryPool.
+  ///
+  uint64_t TryShrinkMemory() override;
+  ///
+  /// \brief Clone to get a new predictor. Thread safe.
+  ///
+  /// \return a new predictor
+  ///
+  std::unique_ptr<PaddlePredictor> Clone() override;
+
+ private:
+  ///
+  /// \brief Whether an input or output with the given name exists.
+  ///
+  /// \param[in] name input or output name
+  ///
+  /// \param[in] is_input input(true) or output(false)
+  ///
+  /// \return Whether the name was found
+  ///
+  bool FindONNXDesc(const std::string &name, bool is_input);
+
+ private:
+  AnalysisConfig config_;
+
+  // ONNXRuntime
+  Ort::Env env_;
+  Ort::Session session_{nullptr};
+  std::shared_ptr<Ort::IoBinding> binding_;
+
+  platform::Place place_;
+  std::vector<ONNXDesc> input_desc_;
+  std::vector<ONNXDesc> output_desc_;
+  int predictor_id_;
+
+// Some more detailed tests are made friends of the predictor so that all
+// the internals can be tested.
+#if PADDLE_WITH_TESTING
+  FRIEND_TEST(ONNXRuntimePredictor, onnxruntime_on);
+#endif
+};
+
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc b/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc
new file mode 100644
index 0000000000000..2be2de9c60bb1
--- /dev/null
+++ b/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc
@@ -0,0 +1,84 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
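+
+// The test below exercises the full predictor round trip: create the
+// predictor, inspect input/output names and shapes, Reshape and CopyFromCpu
+// the input, ZeroCopyRun, CopyToCpu the output, then TryShrinkMemory.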
+
+#include "paddle/fluid/inference/api/onnxruntime_predictor.h"
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+#include "paddle/fluid/inference/utils/io_utils.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+DEFINE_string(dirname, "", "Directory of the test model.");
+
+namespace paddle {
+
+TEST(ONNXRuntimePredictor, onnxruntime_on) {
+  AnalysisConfig config;
+  config.SetModel(FLAGS_dirname + "/inference.pdmodel",
+                  FLAGS_dirname + "/inference.pdiparams");
+  config.EnableONNXRuntime();
+  config.EnableORTOptimization();
+  config.SetCpuMathLibraryNumThreads(2);
+  LOG(INFO) << config.Summary();
+
+  auto _predictor =
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kONNXRuntime>(
+          config);
+  ASSERT_TRUE(_predictor);
+  auto* predictor = static_cast<ONNXRuntimePredictor*>(_predictor.get());
+
+  ASSERT_TRUE(predictor);
+  ASSERT_TRUE(!predictor->Clone());
+  ASSERT_TRUE(predictor->scope_);
+  ASSERT_TRUE(predictor->sub_scope_);
+  ASSERT_EQ(predictor->scope_->parent(), nullptr);
+  ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get());
+  // Dummy Input Data
+  std::vector<int64_t> input_shape = {-1, 3, 224, 224};
+  std::vector<float> input_data(1 * 3 * 224 * 224, 1.0);
+  std::vector<float> out_data;
+  out_data.resize(1000);
+
+  // testing all interfaces
+  auto input_names = predictor->GetInputNames();
+  auto output_names = predictor->GetOutputNames();
+  auto get_input_shape = predictor->GetInputTensorShape();
+
+  ASSERT_EQ(input_names.size(), 1UL);
+  ASSERT_EQ(output_names.size(), 1UL);
+  ASSERT_EQ(input_names[0], "inputs");
+  ASSERT_EQ(output_names[0], "save_infer_model/scale_0.tmp_1");
+  ASSERT_EQ(get_input_shape["inputs"], input_shape);
+
+  auto input_tensor = predictor->GetInputTensor(input_names[0]);
+  input_tensor->Reshape({1, 3, 224, 224});
+  auto output_tensor = predictor->GetOutputTensor(output_names[0]);
+
+  input_tensor->CopyFromCpu(input_data.data());
+  ASSERT_TRUE(predictor->ZeroCopyRun());
+  output_tensor->CopyToCpu(out_data.data());
+
+  predictor->TryShrinkMemory();
+}
+
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index b4a358394404f..bdfe0e46e9ca4 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -253,6 +253,19 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   ///
   void DisableGpu();
+  ///
+  /// \brief Enable GPU fp16 precision computation, in experimental state.
+  ///
+  /// \param op_list The operator type list.
+  ///
+  void Exp_EnableUseGpuFp16(std::unordered_set<std::string> op_list = {});
+  ///
+  /// \brief A boolean state telling whether the GPU fp16 precision is turned
+  /// on.
+  ///
+  /// \return bool Whether the GPU fp16 precision is turned on.
+  ///
+  bool gpu_fp16_enabled() const { return use_gpu_fp16_; }
 
   ///
   /// \brief Turn on XPU.
@@ -319,6 +332,18 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   void EnableNpu(int device_id = 0);
   ///
+  /// \brief Turn on ONNXRuntime.
+  ///
+  void EnableONNXRuntime();
+  ///
+  /// \brief Turn off ONNXRuntime.
+  ///
+  void DisableONNXRuntime();
+  ///
+  /// \brief Turn on ONNXRuntime Optimization.
+  ///
+  void EnableORTOptimization();
+  ///
   /// \brief A boolean state telling whether the GPU is turned on.
/// /// \return bool Whether the GPU is turned on. @@ -342,6 +367,19 @@ struct PD_INFER_DECL AnalysisConfig { /// bool use_ipu() const { return use_ipu_; } /// + /// \brief A boolean state telling whether the ONNXRuntime is turned on. + /// + /// \return bool Whether the ONNXRuntime is turned on. + /// + bool use_onnxruntime() const { return use_onnxruntime_; } + /// + /// \brief A boolean state telling whether the ONNXRuntime Optimization is + /// turned on. + /// + /// \return bool Whether the ONNXRuntime Optimization is turned on. + /// + bool ort_optimization_enabled() const { return enable_ort_optimization_; } + /// /// \brief Get the GPU device id. /// /// \return int The GPU device id. @@ -834,6 +872,9 @@ struct PD_INFER_DECL AnalysisConfig { int gpu_device_id_{0}; uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. bool thread_local_stream_{false}; + bool use_gpu_fp16_{false}; + std::unordered_set gpu_fp16_disabled_op_types_{ + "conv2d_fusion", "conv2d", "roll", "strided_slice"}; bool use_cudnn_{false}; @@ -841,6 +882,10 @@ struct PD_INFER_DECL AnalysisConfig { bool use_npu_{false}; int npu_device_id_{0}; + // ONNXRuntime related + bool use_onnxruntime_{false}; + bool enable_ort_optimization_{false}; + // Padding related bool use_fc_padding_{true}; diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index c129efe494b4f..657dd9b600cce 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -192,6 +192,7 @@ class PD_INFER_DECL ZeroCopyTensor : public paddle_infer::Tensor { private: friend class AnalysisPredictor; + friend class ONNXRuntimePredictor; explicit ZeroCopyTensor(void* scope) : paddle_infer::Tensor{scope} {} }; @@ -381,6 +382,7 @@ enum class PaddleEngineKind { kNative = 0, ///< Use the native Fluid facility. kAutoMixedTensorRT, ///< Automatically mix Fluid with TensorRT. kAnalysis, ///< More optimization. + kONNXRuntime, ///< Use ONNXRuntime }; template @@ -395,6 +397,11 @@ template <> PD_INFER_DECL std::unique_ptr CreatePaddlePredictor< AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config); +template <> +PD_INFER_DECL std::unique_ptr +CreatePaddlePredictor( + const AnalysisConfig& config); + PD_INFER_DECL int PaddleDtypeSize(PaddleDType dtype); PD_INFER_DECL std::string get_version(); diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index f5f36d805b43e..95975d8f2a892 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -172,6 +172,40 @@ void GpuPassStrategy::EnableCUDNN() { use_cudnn_ = true; } +void GpuPassStrategy::Exp_EnableUseGpuFp16() { + passes_.assign({ + "is_test_pass", // + "simplify_with_basic_ops_pass", // + "conv_bn_fuse_pass", // + "conv_eltwiseadd_bn_fuse_pass", // + "embedding_eltwise_layernorm_fuse_pass", // + "multihead_matmul_fuse_pass_v2", // + "gpu_cpu_squeeze2_matmul_fuse_pass", // + "gpu_cpu_reshape2_matmul_fuse_pass", // + "gpu_cpu_flatten2_matmul_fuse_pass", // + "gpu_cpu_map_matmul_v2_to_mul_pass", // + "gpu_cpu_map_matmul_v2_to_matmul_pass", // + "gpu_cpu_map_matmul_to_mul_pass", // + // "fc_fuse_pass", // + "fc_elementwise_layernorm_fuse_pass", // +#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be + // guaranteed at least v7 +// cudnn8.0 has memory leak problem in conv + eltwise + act, so we +// disable the pass. 
+#if !(CUDNN_VERSION >= 8000 && CUDNN_VERSION < 8100) + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // +#endif + "conv_elementwise_add_fuse_pass", // +#endif // + "transpose_flatten_concat_fuse_pass", // + "mixed_precision_configure_pass", // + "runtime_context_cache_pass" // + }); + + use_gpu_fp16_ = true; +} + void GpuPassStrategy::EnableMKLDNN() { LOG(ERROR) << "GPU not support MKLDNN yet"; } @@ -262,6 +296,7 @@ void CpuPassStrategy::EnableMKLDNN() { // "fc_act_mkldnn_fuse_pass", "batch_norm_act_fuse_pass", // "softplus_activation_mkldnn_fuse_pass", // + "elt_act_mkldnn_fuse_pass", // // TODO(intel): Please fix the bug on windows. // https://github.com/PaddlePaddle/Paddle/issues/29710 // "mkldnn_inplace_pass", // This pass should be activated after diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 351cf71e5ca74..02290ed33ff1c 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -125,6 +125,9 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \brief Enable the use of cuDNN kernel. virtual void EnableCUDNN() {} + /// \brief Enable use gpu fp16 kernel. + virtual void Exp_EnableUseGpuFp16() {} + /// \brief Enable the use of MKLDNN. /// The MKLDNN control exists in both CPU and GPU mode, because there can /// still be some CPU kernels running in GPU mode. @@ -140,6 +143,10 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \return A bool variable implying whether we are in gpu mode. bool use_gpu() const { return use_gpu_; } + /// \brief Check if we are using gpu fp16 kernel. + /// \return A bool variable implying whether we are in gpu fp16 mode. + bool use_gpu_fp16() const { return use_gpu_fp16_; } + /// \brief Check if we are using xpu. /// \return A bool variable implying whether we are in xpu mode. bool use_xpu() const { return use_xpu_; } @@ -162,6 +169,7 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { bool use_npu_{false}; bool use_ipu_{false}; bool use_mkldnn_{false}; + bool use_gpu_fp16_{false}; /// \endcond }; @@ -223,6 +231,9 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { /// \brief Enable the use of cuDNN kernel. void EnableCUDNN() override; + /// \brief Enable the use of gpu fp16 kernel. + void Exp_EnableUseGpuFp16() override; + /// \brief Not supported in GPU mode yet. void EnableMKLDNN() override; @@ -238,6 +249,7 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { protected: /// \cond Protected bool use_cudnn_{false}; + bool use_gpu_fp16_{false}; /// \endcond }; diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 5a98d109aed79..2afe2d32e2f60 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -18,6 +18,11 @@ #include "paddle_infer_declare.h" // NOLINT +#ifdef PADDLE_WITH_ONNXRUNTIME +#include "onnxruntime_c_api.h" // NOLINT +#include "onnxruntime_cxx_api.h" // NOLINT +#endif + namespace paddle_infer { /// \brief Experimental. 
@@ -175,6 +180,23 @@ class PD_INFER_DECL Tensor {
   PlaceType place_;
   int device_;
 
+#ifdef PADDLE_WITH_ONNXRUNTIME
+  bool is_ort_tensor_{false};
+  std::vector<int64_t> shape_;
+  std::weak_ptr<Ort::IoBinding> binding_;
+  int idx_{-1};
+
+  void SetOrtMark(bool is_ort_tensor);
+
+  void SetOrtBinding(const std::shared_ptr<Ort::IoBinding> binding);
+
+  template <typename T>
+  void ORTCopyFromCpu(const T* data);
+
+  template <typename T>
+  void ORTCopyToCpu(T* data) const;
+#endif
+
   friend class paddle_infer::contrib::TensorUtils;
 #if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST)
   friend class paddle_infer::InferApiTesterUtils;
diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc
index e342190fda1ac..d7b07652babbd 100644
--- a/paddle/fluid/inference/capi_exp/pd_config.cc
+++ b/paddle/fluid/inference/capi_exp/pd_config.cc
@@ -126,6 +126,26 @@ PD_Bool PD_ConfigUseGpu(__pd_keep PD_Config* pd_config) {
   return config->use_gpu();
 }
 
+void PD_ConfigEnableONNXRuntime(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableONNXRuntime();
+}
+
+void PD_ConfigDisableONNXRuntime(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->DisableONNXRuntime();
+}
+
+PD_Bool PD_ConfigONNXRuntimeEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->use_onnxruntime();
+}
+
+void PD_ConfigEnableORTOptimization(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableORTOptimization();
+}
+
 void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config,
                         int32_t l3_workspace_size, PD_Bool locked,
                         PD_Bool autotune, const char* autotune_file,
diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h
index c314aca918f14..f6b754cad213f 100644
--- a/paddle/fluid/inference/capi_exp/pd_config.h
+++ b/paddle/fluid/inference/capi_exp/pd_config.h
@@ -152,6 +152,34 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigDisableGpu(
 PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseGpu(
     __pd_keep PD_Config* pd_config);
 ///
+/// \brief Turn on ONNXRuntime.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableONNXRuntime(
+    __pd_keep PD_Config* pd_config);
+///
+/// \brief Turn off ONNXRuntime.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigDisableONNXRuntime(
+    __pd_keep PD_Config* pd_config);
+///
+/// \brief A boolean state telling whether the ONNXRuntime is turned on.
+///
+/// \return Whether the ONNXRuntime is turned on.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigONNXRuntimeEnabled(
+    __pd_keep PD_Config* pd_config);
+///
+/// \brief Turn on ONNXRuntime Optimization.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableORTOptimization(
+    __pd_keep PD_Config* pd_config);
+///
 /// \brief Turn on XPU.
 ///
 /// \param[in] pd_config config
diff --git a/paddle/fluid/inference/goapi/config.go b/paddle/fluid/inference/goapi/config.go
index def26913b0a1c..8f9f34c06b476 100644
--- a/paddle/fluid/inference/goapi/config.go
+++ b/paddle/fluid/inference/goapi/config.go
@@ -160,6 +160,36 @@ func (config *Config) EnableUseGpu(memorySize uint64, deviceId int32) {
 	C.PD_ConfigEnableUseGpu(config.c, C.uint64_t(memorySize), C.int32_t(deviceId))
 }
 
+///
+/// \brief Turn on ONNXRuntime.
+///
+func (config *Config) EnableONNXRuntime() {
+	C.PD_ConfigEnableONNXRuntime(config.c)
+}
+
+///
+/// \brief Turn off ONNXRuntime.
+/// +func (config *Config) DisableONNXRuntime() { + C.PD_ConfigDisableONNXRuntime(config.c) +} + +/// +/// \brief A boolean state telling whether the ONNXRuntime is turned on. +/// +/// \return bool Whether the ONNXRuntime is turned on. +/// +func (config *Config) ONNXRuntimeEnabled() bool { + return cvtPDBoolToGo(C.PD_ConfigONNXRuntimeEnabled(config.c)) +} + +/// +/// \brief Turn on ONNXRuntime Optimization. +/// +func (config *Config) EnableORTOptimization() { + C.PD_ConfigEnableORTOptimization(config.c) +} + /// /// \brief Turn on XPU. /// diff --git a/paddle/fluid/inference/goapi/config_test.go b/paddle/fluid/inference/goapi/config_test.go index b82161880839e..297841dcbcf6c 100644 --- a/paddle/fluid/inference/goapi/config_test.go +++ b/paddle/fluid/inference/goapi/config_test.go @@ -122,3 +122,20 @@ func TestMkldnn(t *testing.T) { config.SetBfloat16Op([]string{"fc", "mul"}) } + +func TestONNXRuntime(t *testing.T) { + config := NewConfig() + config.SetModelDir("modelDir") + t.Log(config.ModelDir()) + + config.EnableONNXRuntime() + t.Logf("ONNXRuntimeEnabled:%+v", config.ONNXRuntimeEnabled()) + + config.DisableONNXRuntime() + t.Logf("ONNXRuntimeEnabled:%+v", config.ONNXRuntimeEnabled()) + + config.EnableORTOptimization() + + config.SetCpuMathLibraryNumThreads(4) + t.Logf("CpuMathLibraryNumThreads:%+v", config.CpuMathLibraryNumThreads()) +} \ No newline at end of file diff --git a/paddle/fluid/inference/goapi/predictor_test.go b/paddle/fluid/inference/goapi/predictor_test.go index 40e518304510c..755558f96238d 100644 --- a/paddle/fluid/inference/goapi/predictor_test.go +++ b/paddle/fluid/inference/goapi/predictor_test.go @@ -66,6 +66,42 @@ func TestNewPredictor(t *testing.T) { cloned.ClearIntermediateTensor() } +func TestONNXRuntimePredictor(t *testing.T) { + t.Logf("Version:\n%+v", Version()) + config := NewConfig() + config.SetModel("./mobilenetv1/inference.pdmodel", "./mobilenetv1/inference.pdiparams") + config.EnableONNXRuntime() + config.EnableORTOptimization() + predictor := NewPredictor(config) + inNames := predictor.GetInputNames() + t.Logf("InputNames:%+v", inNames) + outNames := predictor.GetOutputNames() + t.Logf("OutputNames:%+v", outNames) + + inHandle := predictor.GetInputHandle(inNames[0]) + inHandle.Reshape([]int32{1, 3, 224, 224}) + t.Logf("inHandle name:%+v, shape:%+v", inHandle.Name(), inHandle.Shape()) + + data := make([]float32, numElements([]int32{1, 3, 224, 224})) + for i := 0; i < int(numElements([]int32{1, 3, 224, 224})); i++ { + data[i] = float32(i%255) * 0.1 + } + inHandle.CopyFromCpu(data) + t.Logf("inHandle Type:%+v", inHandle.Type()) + + predictor.Run() + + outHandle := predictor.GetOutputHandle(outNames[0]) + t.Logf("outHandle name:%+v", outHandle.Name()) + + outShape := outHandle.Shape() + t.Logf("outHandle Shape:%+v", outShape) + outData := make([]float32, numElements(outShape)) + outHandle.CopyToCpu(outData) + t.Log(outData) +} + + func TestFromBuffer(t *testing.T) { modelFile, err := os.Open("./mobilenetv1/inference.pdmodel") if err != nil { diff --git a/paddle/fluid/inference/goapi/test.sh b/paddle/fluid/inference/goapi/test.sh index edccc2648c012..cff9fd4aa7cea 100644 --- a/paddle/fluid/inference/goapi/test.sh +++ b/paddle/fluid/inference/goapi/test.sh @@ -22,6 +22,7 @@ fi # 2. 
set LD_LIBRARY_PATH export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/mklml/lib/:$PWD/paddle_inference_c/third_party/install/mkldnn/lib/:$PWD/paddle_inference_c/paddle/lib/ +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/onnxruntime/lib/:$PWD/paddle_inference_c/third_party/install/paddle2onnx/lib/ # 3. go test go clean -testcache diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc index 8c61200f7f57c..b69292827aa13 100644 --- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc @@ -89,5 +89,5 @@ class DropoutOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(dropout); +USE_OP_ITSELF(dropout); REGISTER_TRT_OP_CONVERTER(dropout, DropoutOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc index 67e7c78b62e9d..496e8932a690d 100644 --- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc @@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/layer_norm_op.h" + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h" diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index fe04d552e4026..7b65d2d7c97cc 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -328,5 +328,5 @@ class Pool2dOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(pool2d); +USE_OP_ITSELF(pool2d); REGISTER_TRT_OP_CONVERTER(pool2d, Pool2dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc index b8e87a8d94d1f..5a306f622adbe 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc @@ -224,5 +224,5 @@ class Pool3dOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(pool3d); +USE_OP_ITSELF(pool3d); REGISTER_TRT_OP_CONVERTER(pool3d, Pool3dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 7f7313fbcb596..1ad82df41737c 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -53,6 +53,6 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); } } // namespace paddle USE_OP_ITSELF(relu); -USE_OP(sigmoid); -USE_OP(tanh); +USE_OP_ITSELF(sigmoid); +USE_OP_ITSELF(tanh); USE_OP(relu6); diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc index b96992ef8514a..a856d14144469 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" USE_OP_ITSELF(conv2d); -USE_OP(conv2d_transpose); +USE_OP_ITSELF(conv2d_transpose); namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc index 474fd92071fb0..cf37739608763 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc @@ -57,4 +57,4 @@ TEST(DropoutOpConverter, main) { } // namespace inference } // namespace paddle -USE_OP(dropout); +USE_OP_ITSELF(dropout); diff --git a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc index 1725888abc379..f17e00de0eeb7 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc @@ -45,4 +45,4 @@ TEST(leaky_relu_op, test_leaky_relu) { } // namespace paddle // USE_OP(leaky_relu); -USE_OP(leaky_relu); +USE_OP_ITSELF(leaky_relu); diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc index bded833505cd2..36f13262a73d7 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc @@ -71,4 +71,4 @@ TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); } } // namespace inference } // namespace paddle -USE_OP(pool2d); +USE_OP_ITSELF(pool2d); diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu index 861e98e443756..67d44184a76d0 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu @@ -17,7 +17,7 @@ #include #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h" -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/phi/kernels/layer_norm_kernel.h" namespace paddle { namespace inference { @@ -83,7 +83,7 @@ int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs, cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * feature_size, cudaMemcpyHostToDevice, stream); - paddle::operators::LayerNormDirectCUDAFunctor layer_norm; + phi::LayerNormDirectCUDAFunctor layer_norm; layer_norm(stream, input, input_shape, bias_d, scale_d, output, mean_d, variance_d, begin_norm_axis, eps); return cudaGetLastError() != cudaSuccess; @@ -177,7 +177,7 @@ int LayerNormPluginDynamic::enqueue( cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * feature_size, cudaMemcpyHostToDevice, stream); - paddle::operators::LayerNormDirectCUDAFunctor layer_norm; + phi::LayerNormDirectCUDAFunctor layer_norm; layer_norm(stream, input, input_shape, bias_d, scale_d, output, mean_d, variance_d, begin_norm_axis, eps); } else { diff --git a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu index 861a9aa9d000b..5596a89a083fe 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu @@ -13,7 +13,7 @@ // limitations under the License. 
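// NOTE: MaxPool/AvgPool and the Pool2d/Pool3dDirectCUDAFunctor helpers now
// live in phi::funcs (previously paddle::operators::math); the hunks below
// only re-point includes and namespaces, the functor behaviour is unchanged.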
#include "paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h" -#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace inference { @@ -108,16 +108,14 @@ int Pool3DPlugin::enqueue(int batchSize, const void *const *inputs, output_shape.insert(output_shape.begin(), batchSize); if (pool3d_type_ == Pool3DType::max) { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, true, adaptive_, odatas[0], stream, pool_process); } else if (pool3d_type_ == Pool3DType::avg) { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, true, adaptive_, odatas[0], stream, pool_process); @@ -351,16 +349,14 @@ int Pool3DPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, } if (pool3d_type_ == "max") { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings, true, adaptive_, output, stream, pool_process); } else if (pool3d_type_ == "avg") { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings, true, adaptive_, output, stream, pool_process); diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu index 6d711c26adc6f..9bfe98d759d8e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h" -#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace inference { @@ -84,16 +84,14 @@ int PoolPlugin::enqueue(int batchSize, const void *const *inputs, output_shape.insert(output_shape.begin(), batchSize); if (pool_type_ == PoolType::max) { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, true, false, odatas[0], stream, pool_process); } else if (pool_type_ == PoolType::avg) { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, exclusive_, adaptive_, odatas[0], stream, @@ -292,16 +290,14 @@ int PoolPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, } if (pool_type_ == "max") { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings, true, false, output, stream, pool_process); } else if (pool_type_ == "avg") { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings, exclusive_, adaptive_, output, stream, pool_process); diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc index df0eb58c2bd58..a341ffd7a081c 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc @@ -81,6 +81,18 @@ TEST(PD_Config, interface) { PD_ConfigSetBfloat16Op(config, 1, &ops_name); #endif + PD_ConfigEnableONNXRuntime(config); + bool onnxruntime_enabled = PD_ConfigONNXRuntimeEnabled(config); +#ifdef PADDLE_WITH_ONNXRUNTIME + EXPECT_TRUE(onnxruntime_enabled); +#else + EXPECT_FALSE(onnxruntime_enabled); +#endif + PD_ConfigDisableONNXRuntime(config); + bool onnxruntime_disabled = PD_ConfigONNXRuntimeEnabled(config); + EXPECT_FALSE(onnxruntime_disabled); + PD_ConfigEnableORTOptimization(config); + PD_ConfigEnableMemoryOptim(config, true); bool memory_enabled = PD_ConfigMemoryOptimEnabled(config); EXPECT_TRUE(memory_enabled); diff --git a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt index 9d83f8ff8fdc4..f376cbd4fb302 100644 --- a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt +++ b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt @@ -5,6 +5,7 @@ option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." OFF) option(USE_TENSORRT "Compile demo with TensorRT." 
OFF) option(WITH_GTEST "Compile demo with GTEST" OFF) +option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF) if(NOT WITH_STATIC_LIB) add_definitions("-DPADDLE_WITH_SHARED_LIB") @@ -45,6 +46,13 @@ link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") link_directories("${PADDLE_LIB}/paddle/lib") +if (WITH_ONNXRUNTIME) + include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include") + include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include") + + link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib") + link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib") +endif() if (WIN32) add_definitions("/DGOOGLE_GLOG_DLL_DECL=") @@ -172,6 +180,16 @@ else() endif() endif() +if (WITH_ONNXRUNTIME) + if(WIN32) + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx) + elseif(APPLE) + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx) + else() + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx) + endif() +endif() + if (NOT WIN32) set(EXTERNAL_LIB "-lrt -ldl -lpthread") set(DEPS ${DEPS} @@ -248,6 +266,14 @@ if(WIN32) COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release ) endif() + if(WITH_ONNXRUNTIME) + add_custom_command(TARGET ${DEMO_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + ) + endif() if(NOT WITH_STATIC_LIB) add_custom_command(TARGET ${DEMO_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} diff --git a/paddle/fluid/inference/tests/infer_ut/run.sh b/paddle/fluid/inference/tests/infer_ut/run.sh index dd4b64f28d739..8123d37850034 100755 --- a/paddle/fluid/inference/tests/infer_ut/run.sh +++ b/paddle/fluid/inference/tests/infer_ut/run.sh @@ -20,7 +20,8 @@ TURN_ON_MKL=$2 # use MKL or Openblas TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode DATA_DIR=$4 # dataset TENSORRT_ROOT_DIR=$5 # TensorRT ROOT dir, default to /usr/local/TensorRT -MSVC_STATIC_CRT=$6 +WITH_ONNXRUNTIME=$6 +MSVC_STATIC_CRT=$7 inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir EXIT_CODE=0 # init default exit code WIN_DETECT=$(echo `uname` | grep "Win") # detect current platform @@ -144,7 +145,8 @@ function compile_test() { -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ -DWITH_GTEST=ON \ -DCMAKE_CXX_FLAGS='/std:c++17' \ - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release ALL_BUILD.vcxproj else cmake .. 
-DPADDLE_LIB=${inference_install_dir} \ @@ -154,7 +156,8 @@ function compile_test() { -DWITH_STATIC_LIB=OFF \ -DUSE_TENSORRT=$USE_TENSORRT \ -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ - -DWITH_GTEST=ON + -DWITH_GTEST=ON \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) fi; cd - diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index 05c468b798886..6b6c0cd22f03b 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -80,6 +80,14 @@ if(NOT EXISTS ${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inferenc endif() set(IMG_CLS_RESNET_MODEL_DIR "${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model") +if(WITH_ONNXRUNTIME) + set(MOBILENETV2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/MobileNetV2") + if(NOT EXISTS ${MOBILENETV2_INSTALL_DIR}/MobileNetV2.inference.model.tar.gz) + inference_download_and_uncompress_without_verify(${MOBILENETV2_INSTALL_DIR} ${INFERENCE_URL} "MobileNetV2.inference.model.tar.gz") + endif() + set(MOBILENETV2_MODEL_DIR "${MOBILENETV2_INSTALL_DIR}/MobileNetV2") +endif() + function (inference_base_test_build TARGET) set(options "") set(oneValueArgs "") diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index a7a417c29a7bd..f296ce96d4e5f 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -131,4 +131,7 @@ cc_library(virtual_memory_auto_growth_best_fit_allocator SRCS virtual_memory_aut if(NOT WIN32) cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator) cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator) + if (WITH_GPU) + cc_library(cuda_ipc_allocator SRCS cuda_ipc_allocator.cc DEPS allocator) + endif() endif(NOT WIN32) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 61e292a922f0e..4a44448dc84cf 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -219,6 +219,12 @@ class AllocatorFacadePrivate { } InitNaiveBestFitCUDAPinnedAllocator(); #endif +#ifdef PADDLE_WITH_ASCEND_CL + for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { + InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); + } + InitNaiveBestFitNPUPinnedAllocator(); +#endif #ifdef PADDLE_WITH_XPU for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); diff --git a/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc new file mode 100644 index 0000000000000..b2f24d5aed1eb --- /dev/null +++ b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
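Review note: the capi_exp test earlier in this diff exercises the new ONNXRuntime toggles in PD_Config. A hedged usage sketch of that surface — only the four ONNXRuntime calls are taken from this diff; PD_ConfigCreate/PD_ConfigDestroy are assumed from the existing pd_config C API:

PD_Config *config = PD_ConfigCreate();
PD_ConfigEnableONNXRuntime(config);
PD_ConfigEnableORTOptimization(config);
// Reports true only when the library was built with -DWITH_ONNXRUNTIME=ON.
if (!PD_ConfigONNXRuntimeEnabled(config)) {
  // fall back to the native inference engine
}
PD_ConfigDisableONNXRuntime(config);
PD_ConfigDestroy(config);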
+
+#ifndef _WIN32
+
+#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
+
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+
+#include "glog/logging.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+namespace {
+std::mutex ipc_mutex_;
+std::unordered_map<std::string, std::weak_ptr<void>> ipc_handle_to_baseptr_;
+}  // namespace
+
+std::shared_ptr<void> GetIpcBasePtr(std::string handle) {
+  std::lock_guard<std::mutex> lock(ipc_mutex_);
+
+  auto iter = ipc_handle_to_baseptr_.find(handle);
+  if (iter != ipc_handle_to_baseptr_.end()) {
+    auto baseptr = iter->second.lock();
+    if (baseptr) return baseptr;
+  }
+  // An IpcMemHandle can only be opened once per process for the same handle,
+  // so the opened base pointer is cached here.
+  void *baseptr = nullptr;
+  auto ipc_handle =
+      reinterpret_cast<const cudaIpcMemHandle_t *>(handle.c_str());
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcOpenMemHandle(
+      &baseptr, *ipc_handle, cudaIpcMemLazyEnablePeerAccess));
+  // The handle must be closed on the same device that opened it.
+  int device_id = platform::GetCurrentDeviceId();
+  // Add a deleter that closes the ipc handle.
+  auto sp = std::shared_ptr<void>(baseptr, [handle, device_id](void *ptr) {
+    platform::CUDADeviceGuard guard(device_id);
+    std::lock_guard<std::mutex> lock(ipc_mutex_);
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcCloseMemHandle(ptr));
+    ipc_handle_to_baseptr_.erase(handle);
+    VLOG(6) << "cudaIpcCloseMemHandle for ptr:"
+            << "\t" << ptr;
+  });
+  std::weak_ptr<void> wp = sp;
+  ipc_handle_to_baseptr_.insert(iter, {handle, wp});
+
+  return sp;
+}
+
+CudaIpcAllocation::~CudaIpcAllocation() {
+  shared_ptr_.reset();
+  VLOG(6) << "tensor deleted cudaIpcCloseMemHandle for ptr:"
+          << "\t" << this->ptr();
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/memory/allocation/cuda_ipc_allocator.h b/paddle/fluid/memory/allocation/cuda_ipc_allocator.h
new file mode 100644
index 0000000000000..52e3cf10ea73a
--- /dev/null
+++ b/paddle/fluid/memory/allocation/cuda_ipc_allocator.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
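Review note: GetIpcBasePtr above keys a weak_ptr cache by the serialized handle so each CUDA IPC handle is opened at most once per process and closed exactly when the last user releases it. The same idiom reduced to a self-contained sketch, with a generic heap allocation standing in for cudaIpcOpenMemHandle/cudaIpcCloseMemHandle:

#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

static std::mutex cache_mutex;
static std::unordered_map<std::string, std::weak_ptr<void>> cache;

std::shared_ptr<void> GetBasePtr(const std::string &key) {
  std::lock_guard<std::mutex> lock(cache_mutex);
  auto it = cache.find(key);
  if (it != cache.end()) {
    if (auto alive = it->second.lock()) return alive;  // still open: reuse it
  }
  void *resource = ::operator new(64);  // stands in for cudaIpcOpenMemHandle
  std::shared_ptr<void> sp(resource, [key](void *p) {
    std::lock_guard<std::mutex> lock(cache_mutex);
    ::operator delete(p);  // stands in for cudaIpcCloseMemHandle
    cache.erase(key);      // drop the stale cache entry
  });
  cache[key] = sp;  // cache only a non-owning reference
  return sp;
}

int main() {
  auto a = GetBasePtr("h1");
  auto b = GetBasePtr("h1");          // same underlying resource
  std::cout << (a.get() == b.get());  // prints 1
}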
+
+#ifndef _WIN32
+#pragma once
+
+#include <memory>
+#include <mutex>  // NOLINT
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+std::shared_ptr<void> GetIpcBasePtr(std::string handle);
+
+class CudaIpcAllocation : public Allocation {
+ public:
+  explicit CudaIpcAllocation(void *ptr, size_t size, int device_id,
+                             std::shared_ptr<void> shared_ptr)
+      : Allocation(ptr, size, platform::CUDAPlace(device_id)),
+        device_id_(device_id),
+        shared_ptr_(std::move(shared_ptr)) {}
+
+  inline const int &device_id() const { return device_id_; }
+
+  ~CudaIpcAllocation() override;
+
+ private:
+  int device_id_;
+  std::shared_ptr<void> shared_ptr_;
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/memory/allocation/mmap_allocator.cc b/paddle/fluid/memory/allocation/mmap_allocator.cc
index acaf5d548555c..25c2235cce853 100644
--- a/paddle/fluid/memory/allocation/mmap_allocator.cc
+++ b/paddle/fluid/memory/allocation/mmap_allocator.cc
@@ -29,6 +29,155 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+std::string GetIPCName() {
+  static std::random_device rd;
+  std::string handle = "/paddle_";
+#ifdef _WIN32
+  handle += std::to_string(GetCurrentProcessId());
+#else
+  handle += std::to_string(getpid());
+#endif
+  handle += "_";
+  handle += std::to_string(rd());
+  return handle;
+}
+
+struct CountInfo {
+  std::atomic<int> refcount;
+};
+
+void AllocateMemoryMap(std::string filename, int flags, size_t size,
+                       void **map_ptr_, int *fd_) {
+  // TODO(@ZHUI): support win32
+  int file_flags = 0;
+  int fd = -1;
+  if (flags & MAPPED_SHAREDMEM) {
+    file_flags = O_RDWR | O_CREAT;
+  } else {
+    file_flags = O_RDONLY;
+  }
+  if (flags & MAPPED_EXCLUSIVE) {
+    file_flags |= O_EXCL;
+  }
+  if (flags & MAPPED_NOCREATE) {
+    file_flags &= ~O_CREAT;
+  }
+
+  if (!(flags & MAPPED_FROMFD)) {
+    if (flags & MAPPED_SHAREDMEM) {
+      fd = shm_open(filename.c_str(), file_flags, (mode_t)0600);
+      PADDLE_ENFORCE_NE(
+          fd, -1,
+          platform::errors::Unavailable(
+              "File descriptor %s open failed, unable to open it in "
+              "read-write mode",
+              filename.c_str()));
+      VLOG(6) << "shm_open: " << filename;
+    }
+  } else {
+    fd = -1;
+  }
+
+  PADDLE_ENFORCE_EQ(ftruncate(fd, size), 0,
+                    platform::errors::Unavailable(
+                        "Truncating the file to the specified length "
+                        "failed!"));
+
+  if (flags & MAPPED_SHAREDMEM) {
+    *map_ptr_ = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+  } else {
+    *map_ptr_ = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+  }
+
+  PADDLE_ENFORCE_NE(*map_ptr_, MAP_FAILED,
+                    platform::errors::Unavailable(
+                        "Memory map failed when creating shared memory."));
+
+  if (flags & MAPPED_KEEPFD) {
+    *fd_ = fd;
+  } else {
+    PADDLE_ENFORCE_NE(::close(fd), -1,
+                      platform::errors::Unavailable(
+                          "Error closing memory mapped file <", filename,
+                          ">"));
+
+    *fd_ = -1;
+  }
+}
+
+std::shared_ptr<RefcountedMemoryMapAllocation>
+AllocateRefcountedMemoryMapAllocation(std::string filename, int flags,
+                                      size_t size) {
+  int fd = -1;
+  void *base_ptr = nullptr;
+  AllocateMemoryMap(filename, flags, size + mmap_alignment, &base_ptr, &fd);
+  void *aligned_base_ptr =
+      static_cast<void *>(static_cast<char *>(base_ptr) + mmap_alignment);
+  return std::make_shared<RefcountedMemoryMapAllocation>(
+      aligned_base_ptr, size, filename, flags, fd);
+}
+
+RefcountedMemoryMapAllocation::RefcountedMemoryMapAllocation(
+    void *ptr, size_t size,
std::string ipc_name, int flags, int fd)
+    : MemoryMapAllocation(ptr, size, ipc_name, flags, fd) {
+  // must reset the base ptr first.
+  resetBaseptr();
+  initializeRefercount();
+}
+
+void MemoryMapAllocation::close() {
+  if (closed_) {
+    return;
+  }
+  closed_ = true;
+}
+
+MemoryMapAllocation::~MemoryMapAllocation() { close(); }
+
+void RefcountedMemoryMapAllocation::incref() {
+  CountInfo *info = static_cast<CountInfo *>(map_ptr_);
+  ++info->refcount;
+}
+
+int RefcountedMemoryMapAllocation::decref() {
+  CountInfo *info = static_cast<CountInfo *>(map_ptr_);
+  return --info->refcount == 0;
+}
+
+void RefcountedMemoryMapAllocation::resetBaseptr() {
+  map_ptr_ =
+      static_cast<void *>(static_cast<char *>(map_ptr_) - mmap_alignment);
+  map_size_ = map_size_ + mmap_alignment;
+}
+
+void RefcountedMemoryMapAllocation::initializeRefercount() {
+  CountInfo *info = reinterpret_cast<CountInfo *>(map_ptr_);
+
+  if (flags_ & MAPPED_EXCLUSIVE) {
+    new (&info->refcount) std::atomic<int>(1);
+  } else {
+    info->refcount++;
+  }
+}
+
+void RefcountedMemoryMapAllocation::close() {
+  if (closed_) {
+    return;
+  }
+  closed_ = true;
+  void *data = map_ptr_;
+  CountInfo *info = reinterpret_cast<CountInfo *>(data);
+  if (--info->refcount == 0) {
+    PADDLE_ENFORCE_NE(
+        shm_unlink(ipc_name_.c_str()), -1,
+        platform::errors::Unavailable(
+            "could not unlink the shared memory file ", ipc_name_));
+    VLOG(6) << "shm_unlink file: " << ipc_name_;
+  }
+
+  PADDLE_ENFORCE_NE(
+      munmap(map_ptr_, map_size_), -1,
+      platform::errors::Unavailable("could not unmap the shared memory file: ",
+                                    strerror(errno), " (", errno, ")"));
+}
+
 MemoryMapWriterAllocation::~MemoryMapWriterAllocation() {
   PADDLE_ENFORCE_NE(
       munmap(this->ptr(), this->size()), -1,
@@ -44,30 +193,30 @@ MemoryMapReaderAllocation::~MemoryMapReaderAllocation() {
   /* Here we do not pay attention to the result of shm_unlink,
      because the memory mapped file may have been cleared due to the
      MemoryMapFdSet::Clear() */
+
+  // Code of the DataLoader subprocess:
+  //
+  //    core._array_to_share_memory_tensor(b)
+  //    out_queue.put((idx, tensor_list, structure))
+  //    core._remove_tensor_list_mmap_fds(tensor_list)
+
+  /* If the tensor is already in the send queue, it will be
+   * destructed by that function. If the tensor has not been sent yet, it
+   * will be cleared by MemoryMapFdSet::Clear().
+   * If `_remove_tensor_list_mmap_fds` has been interrupted, the
+   * tensor will be cleared by both methods.
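Review note: RefcountedMemoryMapAllocation above keeps a CountInfo atomic in the first mmap_alignment bytes of the segment and hands out the offset pointer — the creator (MAPPED_EXCLUSIVE) placement-news the counter to 1, attachers (MAPPED_NOCREATE) increment it, and the last close() unlinks the file. A self-contained POSIX sketch of the same layout (Linux, link with -lrt; error handling trimmed for brevity):

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

#include <atomic>
#include <new>

constexpr size_t kAlign = 64;  // mirrors mmap_alignment above

struct CountInfo {
  std::atomic<int> refcount;
};

// Creator side: refcount header plus payload in one shared segment.
void *CreateSegment(const char *name, size_t payload) {
  int fd = shm_open(name, O_RDWR | O_CREAT | O_EXCL, 0600);
  ftruncate(fd, kAlign + payload);
  void *base = mmap(nullptr, kAlign + payload, PROT_READ | PROT_WRITE,
                    MAP_SHARED, fd, 0);
  close(fd);
  new (&static_cast<CountInfo *>(base)->refcount) std::atomic<int>(1);
  return static_cast<char *>(base) + kAlign;  // payload pointer, as above
}

// Either side: drop one reference; the last holder unlinks the name.
void Release(const char *name, void *payload_ptr, size_t payload) {
  void *base = static_cast<char *>(payload_ptr) - kAlign;
  auto *info = static_cast<CountInfo *>(base);
  if (--info->refcount == 0) shm_unlink(name);
  munmap(base, kAlign + payload);
}

int main() {
  void *p = CreateSegment("/demo_refcount", 4096);
  Release("/demo_refcount", p, 4096);
}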
+ * */ + shm_unlink(this->ipc_name().c_str()); MemoryMapFdSet::Instance().Remove(this->ipc_name()); VLOG(3) << "~MemoryMapReaderAllocation: " << this->ipc_name(); } -std::string GetIPCName() { - static std::random_device rd; - std::string handle = "/paddle_"; -#ifdef _WIN32 - handle += std::to_string(GetCurrentProcessId()); -#else - handle += std::to_string(getpid()); -#endif - handle += "_"; - handle += std::to_string(rd()); - return handle; -} - std::shared_ptr AllocateMemoryMapWriterAllocation( size_t size) { const std::string &ipc_name = GetIPCName(); int flags = O_RDWR | O_CREAT; - - int fd = shm_open(ipc_name.c_str(), flags, 0644); + int fd = shm_open(ipc_name.c_str(), flags, 0600); PADDLE_ENFORCE_NE( fd, -1, platform::errors::Unavailable("File descriptor %s open failed", ipc_name.c_str())); @@ -86,12 +235,14 @@ std::shared_ptr AllocateMemoryMapWriterAllocation( std::shared_ptr RebuildMemoryMapReaderAllocation( const std::string &ipc_name, size_t size) { - int fd = shm_open(ipc_name.c_str(), O_RDONLY, 0644); + int flags = O_RDWR | O_CREAT; + flags &= ~O_CREAT; + + int fd = shm_open(ipc_name.c_str(), flags, 0600); PADDLE_ENFORCE_NE( fd, -1, platform::errors::Unavailable("File descriptor %s open failed", ipc_name.c_str())); - - void *ptr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); + void *ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); PADDLE_ENFORCE_NE(ptr, MAP_FAILED, platform::errors::Unavailable( "Memory map failed when rebuild shared memory.")); diff --git a/paddle/fluid/memory/allocation/mmap_allocator.h b/paddle/fluid/memory/allocation/mmap_allocator.h index 3f91e5c427808..4f8dbfbb51e66 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator.h +++ b/paddle/fluid/memory/allocation/mmap_allocator.h @@ -16,8 +16,9 @@ #ifndef _WIN32 +#include #include -#include // NOLINT +#include #include #include #include @@ -28,6 +29,72 @@ namespace paddle { namespace memory { namespace allocation { +std::string GetIPCName(); + +static constexpr int64_t mmap_alignment = 64; + +enum MappedModes { + MAPPED_SHAREDMEM = 1, + MAPPED_EXCLUSIVE = 2, + MAPPED_NOCREATE = 4, + MAPPED_KEEPFD = 8, + MAPPED_FROMFD = 16, + MAPPED_UNLINK = 32 +}; + +class MemoryMapAllocation : public Allocation { + public: + explicit MemoryMapAllocation(void *ptr, size_t size, std::string ipc_name) + : Allocation(ptr, size, platform::CPUPlace()), + ipc_name_(std::move(ipc_name)), + map_ptr_(ptr), + map_size_(size) {} + explicit MemoryMapAllocation(void *ptr, size_t size, std::string ipc_name, + int flags, int fd) + : Allocation(ptr, size, platform::CPUPlace()), + ipc_name_(std::move(ipc_name)), + fd_(fd), + flags_(flags), + map_ptr_(ptr), + map_size_(size) {} + + inline const std::string &ipc_name() const { return ipc_name_; } + + virtual void close(); + + ~MemoryMapAllocation() override; + + protected: + std::string ipc_name_; + int fd_ = -1; + int flags_ = 0; + void *map_ptr_ = nullptr; + size_t map_size_ = 0; + bool closed_ = false; +}; + +class RefcountedMemoryMapAllocation : public MemoryMapAllocation { + public: + RefcountedMemoryMapAllocation(void *ptr, size_t size, std::string ipc_name, + int flags, int fd); + + void incref(); + int decref(); + void close() override; + virtual ~RefcountedMemoryMapAllocation() { close(); } + + protected: + void initializeRefercount(); + void resetBaseptr(); +}; + +void AllocateMemoryMap(std::string filename, int flags, size_t size, + void **base_ptr_, int *fd_); + +std::shared_ptr +AllocateRefcountedMemoryMapAllocation(std::string filename, int flags, + 
size_t size); + class MemoryMapWriterAllocation : public Allocation { public: explicit MemoryMapWriterAllocation(void *ptr, size_t size, diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 91a0352e1915e..e77be832c0cc8 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -161,7 +161,7 @@ cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEP set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows_utils lapack_function lod_tensor maxouting unpooling pooling lod_rank_table context_project -sequence_pooling segment_pooling executor device_memory_aligment generator) +sequence_pooling executor device_memory_aligment generator) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse matrix_solve) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 66f1bcc8b6869..845d0ed073b32 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -1482,6 +1482,20 @@ REGISTER_ACTIVATION_OP(cosh, Cosh, CoshFunctor, CoshGradFunctor); REGISTER_ACTIVATION_OP(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); REGISTER_ACTIVATION_OP(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); +REGISTER_ACTIVATION_OP(brelu, BRelu, BReluFunctor, BReluGradFunctor); +REGISTER_ACTIVATION_OP(thresholded_relu, ThresholdedRelu, + ThresholdedReluFunctor, ThresholdedReluGradFunctor); +REGISTER_ACTIVATION_OP(hard_shrink, HardShrink, HardShrinkFunctor, + HardShrinkGradFunctor); +REGISTER_ACTIVATION_OP(softshrink, SoftShrink, SoftShrinkFunctor, + SoftShrinkGradFunctor); +REGISTER_ACTIVATION_OP(tanh_shrink, TanhShrink, TanhShrinkFunctor, + TanhShrinkGradFunctor); +REGISTER_ACTIVATION_OP(silu, Silu, SiluFunctor, SiluGradFunctor); +REGISTER_ACTIVATION_OP(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, + HardSigmoidGradFunctor); +REGISTER_ACTIVATION_OP(logsigmoid, LogSigmoid, LogSigmoidFunctor, + LogSigmoidGradFunctor); /* ========================== sigmoid register ============================= */ @@ -1516,30 +1530,6 @@ REGISTER_OPERATOR(sigmoid_triple_grad, ops::SigmoidTripleGradFunctor::FwdDeps()>, ops::ActivationTripleGradOpInplaceInferer); -// Register Sigmoid/GradSigmoid Kernels -REGISTER_ACTIVATION_CPU_KERNEL(sigmoid, Sigmoid, SigmoidFunctor, - SigmoidGradFunctor); - -// Register DoubleGrad Kernel -REGISTER_OP_CPU_KERNEL( - sigmoid_grad_grad, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>); - -// Register TripleGrad Kernel -REGISTER_OP_CPU_KERNEL( - sigmoid_triple_grad, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>); - /* ========================================================================== */ /* ========================== tanh register ============================= */ @@ -1567,23 +1557,6 @@ REGISTER_OPERATOR( ops::ActivationOpTripleGrad::FwdDeps()>, ops::ActivationTripleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor); -REGISTER_OP_CPU_KERNEL( - tanh_grad_grad, 
ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>); -// Register TripleGrad Kernel -REGISTER_OP_CPU_KERNEL( - tanh_triple_grad, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>); /* ========================================================================== */ /* ========================== relu register ============================= */ @@ -1623,16 +1596,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad2::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor, - LeakyReluGradFunctor); -REGISTER_OP_CPU_KERNEL( - leaky_relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel< - plat::CPUDeviceContext, ops::LeakyReluGradGradFunctor>); /* ========================================================================== */ /* ======================== elu register ============================ */ @@ -1650,22 +1613,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_OP_CPU_KERNEL(elu, - ops::ActivationKernel>, - ops::ActivationKernel>); -REGISTER_OP_CPU_KERNEL( - elu_grad, ops::ELUGradKernel, - ops::ELUGradKernel); -REGISTER_OP_CPU_KERNEL( - elu_grad_grad, ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>); - /* ========================================================================== */ /* ======================== logit register ============================ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 4b79397b6cdf2..f1984af6e15ea 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -238,21 +238,20 @@ struct BaseActivationFunctor { AttrPair GetAttrs() { return AttrPair(); } }; -// sigmoid(x) = 1 / (1 + exp(-x)) -template -struct SigmoidFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = static_cast(1) / (static_cast(1) + (-x).exp()); - } -}; - #define USE_PHI_FUNCTOR(name) \ template \ using name##Functor = phi::funcs::name##Functor; \ template \ using name##GradFunctor = phi::funcs::name##GradFunctor; +#define USE_PHI_DOUBLE_GRAD_FUNCTOR(name) \ + template \ + using name##GradGradFunctor = phi::funcs::name##GradGradFunctor; + +#define USE_PHI_TRIPLE_GRAD_FUNCTOR(name) \ + template \ + using name##TripleGradFunctor = phi::funcs::name##TripleGradFunctor; + USE_PHI_FUNCTOR(Cos) USE_PHI_FUNCTOR(Tan) USE_PHI_FUNCTOR(Acos) @@ -264,181 +263,27 @@ USE_PHI_FUNCTOR(Cosh) USE_PHI_FUNCTOR(Asinh) USE_PHI_FUNCTOR(Acosh) USE_PHI_FUNCTOR(Atanh) - -template -struct SigmoidGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * out * (static_cast(1) - out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -/* - Out - DOut -> SigmoidGradGrad -> DOutNew - DDX DDOut - - DDOut = (1-Out)*Out*DDX - DOutNew = (1-2*Out)*DOut*DDX -*/ -template -struct SigmoidGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - framework::Tensor* dOutNew, framework::Tensor* ddOut) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", 
"DDX", "SigmoidGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad")); - - if (dOutNew) { - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad")); - auto dout_new = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad")); - dout_new.device(*d) = - (static_cast(1) - static_cast(2) * out) * dout * ddx; - } - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad")); - ddout.device(*d) = (static_cast(1) - out) * out * ddx; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -/* - Out - DOut D_Dout - DDx -> SigmoidTripleGrad -> D_DDx - D_DDout d_OutNew - D_Dout_new - - D_Dout = (1-2*Out)*DDx*D_Dout_new - D_DDx = (1-Out)*Out*D_DDout + (1-2*Out)*DOut*D_Dout_new - D_OutNew = (DDx-2*Out*DDx)*D_DDout - 2*DOut*DDx*D_Dout_new - - Out, DDX, DOut, D_DDOut, D_DOut_New // input - D_OutNew, D_DOut, D_DDx // output -*/ -template -struct SigmoidTripleGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - const framework::Tensor* d_DDOut, - const framework::Tensor* d_dOut_New, - framework::Tensor* d_d_Out, framework::Tensor* d_Out_New, - framework::Tensor* d_DDx) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidTripleGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad")); - auto d_ddOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad")); - auto d_dOutNew = framework::EigenVector::Flatten(GET_DATA_SAFELY( - d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad")); - - if (d_Out_New) { - auto d_OutNew = framework::EigenVector::Flatten(GET_DATA_SAFELY( - d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad")); - d_OutNew.device(*d) = (ddx - static_cast(2) * out * ddx) * d_ddOut - - static_cast(2) * dout * ddx * d_dOutNew; - } - if (d_d_Out) { - auto d_dOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "SigmoidTripleGrad")); - d_dOut.device(*d) = - (static_cast(1) - static_cast(2) * out) * ddx * d_dOutNew; - } - if (d_DDx) { - auto d_ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "SigmoidTripleGrad")); - d_ddx.device(*d) = - (static_cast(1) - out) * out * d_ddOut + - (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -// silu(x) = x / (1 + exp(-x)) -template -struct SiluFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - auto temp = static_cast(1) / (static_cast(1) + (-x).exp()); - out.device(d) = x * temp; - } -}; - -// silu'(x) = (1 / (1 + e^{-x})) * (1 + out * e^{-x})) -template -struct SiluGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp1 = static_cast(1) + (-x).exp(); // 1+e^(-x) - auto temp2 = x * (-x).exp(); // x*e^(-x) - dx.device(d) = dout * ((static_cast(1) / temp1) * - (static_cast(1) 
+ (temp2 / temp1))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -// Originally: logsigmoid(x) = -log (1 + exp(-x)) -// For numerical stability, we can use the log-sum-exp trick: -// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ -// We can rewrite the above equation as: -// out = -log( exp(0) + exp(-x)) [since exp(0) = 1] -// = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0))) -// = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) - exp(max(-x, 0)) * exp(-x - -// max(-x, 0))) -// = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) -// = -log( exp(max(-x, 0)) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))) -// -// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0)) -// + exp(-x - max(-x, 0)))) -template -struct LogSigmoidFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) - out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log()); - } -}; - -// Originally: f' = exp(-x) / (1 + exp(-x)) -// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + -// exp(-x - max(-x, 0))) -template -struct LogSigmoidGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) - dx.device(d) = - dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; +USE_PHI_FUNCTOR(Tanh) +USE_PHI_DOUBLE_GRAD_FUNCTOR(Tanh) +USE_PHI_TRIPLE_GRAD_FUNCTOR(Tanh) +USE_PHI_FUNCTOR(BRelu) +USE_PHI_FUNCTOR(ThresholdedRelu) +USE_PHI_FUNCTOR(LeakyRelu) +USE_PHI_DOUBLE_GRAD_FUNCTOR(LeakyRelu) +USE_PHI_FUNCTOR(HardShrink) +USE_PHI_FUNCTOR(SoftShrink) +USE_PHI_FUNCTOR(TanhShrink) +USE_PHI_FUNCTOR(Silu) +USE_PHI_FUNCTOR(ELU) +USE_PHI_DOUBLE_GRAD_FUNCTOR(ELU) +USE_PHI_FUNCTOR(Sigmoid) +USE_PHI_DOUBLE_GRAD_FUNCTOR(Sigmoid) +USE_PHI_TRIPLE_GRAD_FUNCTOR(Sigmoid) +USE_PHI_FUNCTOR(LogSigmoid) +USE_PHI_FUNCTOR(HardSigmoid) + +template +using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor; // exp(x) = e^x template @@ -497,210 +342,6 @@ using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor; template using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor; -// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) -template -struct TanhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.tanh(); - } -}; - -template -struct TanhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * (static_cast(1) - out * out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct TanhGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - framework::Tensor* dOutNew, framework::Tensor* ddOut) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad")); - // tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out - // * ddx) - 
if (dOutNew) { - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad")); - auto dout_new = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "TanhGradGrad")); - dout_new.device(*d) = - static_cast(-1) * dout * static_cast(2) * out * ddx; - } - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "TanhGradGrad")); - ddout.device(*d) = (static_cast(1) - out * out) * ddx; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; -/* - Out - DOut D_Dout - DDx -> TanhTripleGrad -> D_DDx - D_DDout d_OutNew - D_Dout_new - - D_Dout = (-2) * Out * DDx * D_Dout_new - D_DDx = (1-Out^2)*D_DDout + (-2) * Out * DOut * D_Dout_new - D_OutNew = (-2) * Out * DDx * D_DDout + (-2) * DOut * DDx * D_Dout_new - - Out, DDX, DOut, D_DDOut, D_DOut_New // input - D_OutNew, D_DOut, D_DDx // output -*/ -template -struct TanhTripleGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - const framework::Tensor* d_DDOut, - const framework::Tensor* d_dOut_New, - framework::Tensor* d_d_Out, framework::Tensor* d_Out_New, - framework::Tensor* d_DDx) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhTripleGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "TanhTripleGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhTripleGrad")); - auto d_ddOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "TanhTripleGrad")); - auto d_dOutNew = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "TanhTripleGrad")); - - if (d_Out_New) { - auto d_OutNew = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "TanhTripleGrad")); - d_OutNew.device(*d) = (static_cast(-2) * out * ddx * d_ddOut) - - (static_cast(2) * dout * ddx * d_dOutNew); - } - if (d_d_Out) { - auto d_dOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "TanhTripleGrad")); - d_dOut.device(*d) = static_cast(-2) * out * ddx * d_dOutNew; - } - if (d_DDx) { - auto d_ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "TanhTripleGrad")); - d_ddx.device(*d) = (static_cast(1) - (out * out)) * d_ddOut - - static_cast(2) * out * dout * d_dOutNew; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -// tanhshrink(x) = x - tanh(x) -// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) -template -struct TanhShrinkFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x - x.tanh(); - } -}; - -template -struct TanhShrinkGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * (x.tanh() * x.tanh()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -// tanhshrink(x) = x - tanh(x) -// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) -template -struct HardShrinkFunctor : public BaseActivationFunctor { - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", 
&threshold}}; - } - template - void operator()(Device d, X x, Out out) const { - auto temp1 = x < static_cast(threshold * -1.f); - auto temp2 = x > static_cast(threshold); - out.device(d) = x * (temp1 || temp2).template cast(); - } -}; - -template -struct HardShrinkGradFunctor : public BaseActivationFunctor { - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp1 = x < static_cast(threshold * -1.f); - auto temp2 = x > static_cast(threshold); - dx.device(d) = dout * (temp1 || temp2).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0 -// otherwise -template -struct SoftShrinkFunctor : public BaseActivationFunctor { - float lambda; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"lambda", &lambda}}; - } - - template - void operator()(Device d, X x, Out out) const { - auto lambdaT = static_cast(lambda); - auto temp1 = (x > lambdaT).template cast(); - auto temp2 = (x < -lambdaT).template cast(); - out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT); - } -}; - -template -struct SoftShrinkGradFunctor : public BaseActivationFunctor { - float lambda; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"lambda", &lambda}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto lambdaT = static_cast(lambda); - auto temp1 = (x > lambdaT).template cast(); - auto temp2 = (x < -lambdaT).template cast(); - dx.device(d) = dout * (temp1 + temp2).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - // sqrt(x) = x^(1/2) template struct SqrtFunctor : public BaseActivationFunctor { @@ -909,42 +550,6 @@ struct SquareGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct BReluFunctor : public BaseActivationFunctor { - float t_min; - float t_max; - - // NOTE: Explicit hides the `BaseActivationFunctor::GetAttrs` - // not polymorphism for speed. 
- typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - - template - void operator()(Device d, X x, Out out) const { - out.device(d) = - x.cwiseMax(static_cast(t_min)).cwiseMin(static_cast(t_max)); - } -}; - -template -struct BReluGradFunctor : public BaseActivationFunctor { - float t_min; - float t_max; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * - ((x > static_cast(t_min)) * (x < static_cast(t_max))) - .template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - // relu6(x) = min(max(0, x), 6) template struct Relu6Functor : public BaseActivationFunctor { @@ -1168,116 +773,28 @@ struct SoftReluGradFunctor : public BaseActivationFunctor { } }; -template -struct LeakyReluFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } +template +class ELUGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Out = context.Input("Out"); + auto* dOut = + context.Input(framework::GradVarName("Out")); + auto* dX = context.Output(framework::GradVarName("X")); + const float alpha = context.Attr("alpha"); + dX->mutable_data(context.GetPlace()); - template - void operator()(Device d, X x, Out out) const { - if (alpha < 1.f) { - out.device(d) = x.cwiseMax(static_cast(alpha) * x); - } else { - out.device(d) = x.cwiseMin(static_cast(alpha) * x); - } - } -}; - -template -struct LeakyReluGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp1 = - static_cast(alpha) * (x < static_cast(0)).template cast(); - auto temp2 = (x >= static_cast(0)).template cast(); - dx.device(d) = dout * (temp1 + temp2).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct ELUFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - template - void operator()(Device d, X x, Out out) const { - out.device(d) = - (x < static_cast(0)) - .select(static_cast(alpha) * (x.exp() - static_cast(1)), x); - } -}; - -template -struct ELUGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - // case 1: alpha >= 0 - // dx = dout, if out > 0 - // dx = dout * (out + alpha), if out <= 0 - dx.device(d) = (out > static_cast(0)) - .select(dout, dout * (out + static_cast(alpha))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - // case 2: alpha < 0 - // dx = dout, if x > 0 - // dx = dout * (out + alpha), if x <=0 - dx.device(d) = (x > static_cast(0)) - .select(dout, dout * 
static_cast(alpha) * x.exp()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -class ELUGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Out = context.Input("Out"); - auto* dOut = - context.Input(framework::GradVarName("Out")); - auto* dX = context.Output(framework::GradVarName("X")); - const float alpha = context.Attr("alpha"); - dX->mutable_data(context.GetPlace()); - - auto x = framework::EigenVector::Flatten( - GET_DATA_SAFELY(X, "Input", "X", "elu_grad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "elu_grad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "dOut", "elu_grad")); - auto dx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dX, "Output", "dX", "elu_grad")); - auto* place = - context.template device_context().eigen_device(); + auto x = framework::EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "elu_grad")); + auto out = framework::EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "elu_grad")); + auto dout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "dOut", "elu_grad")); + auto dx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dX, "Output", "dX", "elu_grad")); + auto* place = + context.template device_context().eigen_device(); if (alpha > 0) { ELUGradFunctor functor; @@ -1430,74 +947,6 @@ struct STanhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct ThresholdedReluFunctor : public BaseActivationFunctor { - float threshold; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out) const { - auto th = static_cast(threshold); - out.device(d) = (x > th).template cast() * x; - } -}; - -template -struct ThresholdedReluGradFunctor : public BaseActivationFunctor { - float threshold; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto th = static_cast(threshold); - dx.device(d) = dout * (x > th).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct HardSigmoidFunctor : public BaseActivationFunctor { - float slope; - float offset; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - - template - void operator()(Device d, X x, Out out) const { - auto temp = x * static_cast(slope) + static_cast(offset); - out.device(d) = - temp.cwiseMax(static_cast(0)).cwiseMin(static_cast(1)); - } -}; - -template -struct HardSigmoidGradFunctor : public BaseActivationFunctor { - float slope; - float offset; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * - ((out > static_cast(0)) * (out < static_cast(1))) - .template cast() * - static_cast(slope); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - template struct SwishFunctor : public BaseActivationFunctor { float beta; @@ -1531,121 +980,6 @@ struct SwishGradFunctor : public BaseActivationFunctor { 
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -/* - * in arguments: x, out, ddx - * out arguments: ddout, dout, dx - */ -template -inline void ExtractActivationDoubleGradTensor( - const framework::ExecutionContext& ctx, const framework::Tensor** X, - const framework::Tensor** Out, const framework::Tensor** ddX, - framework::Tensor** dX, framework::Tensor** dOut, - framework::Tensor** ddOut) { - auto ddx_var = ctx.InputVar("DDX"); - auto ddo_var = ctx.OutputVar("DDOut"); - PADDLE_ENFORCE_NOT_NULL( - ddx_var, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("DDX"))); - if (CanBeUsedBySelectedRows.count(ctx.Type())) { - *ddX = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*ddx_var); - if (ddo_var) { - *ddOut = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - ddo_var); - } - } else { - *ddX = ctx.Input("DDX"); - if (ddo_var) { - *ddOut = ctx.Output("DDOut"); - } - } - PADDLE_ENFORCE_NOT_NULL( - *ddX, - platform::errors::NotFound( - "Cannot get the tensor from the Variable Output, variable name = %s", - ctx.OutputName("DDX"))); - - if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepX)) { - auto x_var = ctx.InputVar("X"); - PADDLE_ENFORCE_NOT_NULL( - x_var, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("X"))); - auto dx_var = ctx.OutputVar("DX"); - if (CanBeUsedBySelectedRows.count(ctx.Type())) { - *X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var); - if (dx_var) { - *dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - dx_var); - } - } else { - *X = ctx.Input("X"); - if (dx_var) { - *dX = ctx.Output("DX"); - } - } - } else { - VLOG(10) << "Inplace activation of Op: " << ctx.Type(); - *X = *ddX; - } - if (static_cast(kDepValue) & - static_cast(ActBwdOpFwdDeps::kDepOut)) { - auto out_var = ctx.InputVar("Out"); - PADDLE_ENFORCE_NOT_NULL( - out_var, - platform::errors::NotFound( - "Cannot get the tensor from the Variable Out, variable name = %s", - ctx.InputName("Out"))); - auto dout_var = ctx.OutputVar("DOut"); - if (CanBeUsedBySelectedRows.count(ctx.Type())) { - *Out = - paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var); - if (dout_var) { - *dOut = - paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - dout_var); - } - } else { - *Out = ctx.Input("Out"); - if (dout_var) { - *dOut = ctx.Output("DOut"); - } - } - } else { - VLOG(10) << "Inplace activation of Op: " << ctx.Type(); - *Out = *ddX; - } -} - -template -class ActivationDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *X, *Out, *ddX; - X = Out = ddX = nullptr; - framework::Tensor *ddOut, *dOut, *dX; - ddOut = dOut = dX = nullptr; - - ExtractActivationDoubleGradTensor(ctx, &X, &Out, &ddX, - &dX, &dOut, &ddOut); - - if (ddOut) ddOut->mutable_data(ctx.GetPlace()); - if (dOut) dOut->mutable_data(ctx.GetPlace()); - if (dX) dX->mutable_data(Out->dims(), ctx.GetPlace()); - - auto& place = ctx.template device_context(); - - Functor functor; - auto attrs = functor.GetAttrs(); - for (auto& attr : attrs) { - *attr.second = ctx.Attr(attr.first); - } - functor(place, X, Out, ddX, ddOut, dOut, dX); - } -}; - template struct AbsGradGradFunctor : public BaseActivationFunctor { template @@ -1667,73 +1001,6 @@ struct AbsGradGradFunctor : public 
BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
-template <typename T>
-struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* X,
-                  const framework::Tensor* Out, const framework::Tensor* ddX,
-                  framework::Tensor* ddOut, framework::Tensor* dOut,
-                  framework::Tensor* dX) const {
-    if (ddOut) {
-      auto* d = dev.eigen_device();
-      auto ddx = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad"));
-      auto x = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad"));
-      auto ddout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad"));
-      ddout.device(*d) =
-          ddx *
-          ((x > static_cast<T>(0)).template cast<T>() +
-           static_cast<T>(alpha) * (x <= static_cast<T>(0)).template cast<T>())
-              .template cast<T>();
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct ELUGradGradFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* X,
-                  const framework::Tensor* ddX, framework::Tensor* ddOut,
-                  const framework::Tensor* dOut, framework::Tensor* dX) const {
-    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(ddX, "Input", "DDX", "ELUGradGrad"));
-    auto x = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(X, "Input", "X", "ELUGradGrad"));
-
-    if (dX) {
-      auto dx = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dX, "Output", "DX", "ELUGradGrad"));
-      auto dout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dOut, "Output", "DOut", "ELUGradGrad"));
-      dx.device(*d) = ddx * dout * static_cast<T>(alpha) * x.exp() *
-                      (x <= static_cast<T>(0)).template cast<T>();
-    }
-
-    if (ddOut) {
-      auto ddout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ELUGradGrad"));
-      ddout.device(*d) = ddx *
-                         ((x > static_cast<T>(0)).template cast<T>() +
-                          static_cast<T>(alpha) * x.exp() *
-                              (x <= static_cast<T>(0)).template cast<T>())
-                             .template cast<T>();
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename T>
 struct CELUGradGradFunctor : public BaseActivationFunctor<T> {
   float alpha;
@@ -1907,211 +1174,6 @@ inline void ExtractDoubleGradTensorWithInputDOut(
   }
 }
 
-template <typename DeviceContext, typename Functor>
-class SigmoidDoubleGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *Out, *ddX, *dOut;
-    framework::Tensor *dOutNew, *ddOut;
-    Out = ddX = dOut = nullptr;
-    dOutNew = ddOut = nullptr;
-    // extract ddx(input) and out(input)
-    ddX = ctx.Input<framework::Tensor>("DDX");
-    Out = ctx.Input<framework::Tensor>("Out");
-    PADDLE_ENFORCE_NOT_NULL(
-        ddX, platform::errors::NotFound(
-                 "Cannot get input Variable ddX, variable name = %s",
-                 ctx.InputName("DDX")));
-    PADDLE_ENFORCE_NOT_NULL(
-        Out, platform::errors::NotFound(
-                 "Cannot get input Variable Out, variable name = %s",
-                 ctx.InputName("Out")));
-    // set output ddout
-    ddOut = ctx.Output<framework::Tensor>("DDOut");
-    // extract dOut(input)
-    dOut = ctx.Input<framework::Tensor>("DOut");
-    PADDLE_ENFORCE_NOT_NULL(
-        dOut, platform::errors::NotFound(
-                  "Cannot get input Variable dOut, variable name = %s",
-                  ctx.InputName("DOut")));
-    dOutNew = ctx.Output<framework::Tensor>("DOutNew");
-    if (dOutNew) dOutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    auto& place = ctx.template device_context<DeviceContext>();
-    Functor functor;
-    functor(place, Out, ddX, dOut, dOutNew, ddOut);
-  }
-};
-
-// Out, DDX, DOut, D_DDOut, D_DOut_New   // input
-// D_OutNew, D_DOut, D_DDx               // output
-template <typename DeviceContext, typename Functor>
-class SigmoidTripleGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew;
-    framework::Tensor *d_OutNew, *d_dOut, *d_ddx;
-    Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr;
-    d_OutNew = d_dOut = d_ddx = nullptr;
-
-    // extract ddx(input), out(input), dOut(input), d_ddOut(input),
-    // d_dOutNew(input)
-    ddX = ctx.Input<framework::Tensor>("DDX");
-    Out = ctx.Input<framework::Tensor>("Out");
-    dOut = ctx.Input<framework::Tensor>("DOut");
-    d_ddOut = ctx.Input<framework::Tensor>("D_DDOut");
-    d_dOutNew = ctx.Input<framework::Tensor>("D_DOut_New");
-
-    PADDLE_ENFORCE_NOT_NULL(
-        ddX, platform::errors::NotFound(
-                 "Cannot get input Variable ddX, variable name = %s",
-                 ctx.InputName("DDX")));
-    PADDLE_ENFORCE_NOT_NULL(
-        Out, platform::errors::NotFound(
-                 "Cannot get input Variable Out, variable name = %s",
-                 ctx.InputName("Out")));
-    PADDLE_ENFORCE_NOT_NULL(
-        dOut, platform::errors::NotFound(
-                  "Cannot get input Variable dOut, variable name = %s",
-                  ctx.InputName("DOut")));
-    PADDLE_ENFORCE_NOT_NULL(
-        d_ddOut, platform::errors::NotFound(
-                     "Cannot get input Variable d_ddOut, variable name = %s",
-                     ctx.InputName("D_DDOut")));
-    PADDLE_ENFORCE_NOT_NULL(
-        d_dOutNew,
-        platform::errors::NotFound(
-            "Cannot get input Variable d_dOutNew, variable name = %s",
-            ctx.InputName("D_DOutNew")));
-
-    // set output d_OutNew, d_dOut, d_ddx
-    d_dOut = ctx.Output<framework::Tensor>("D_DOut");
-    d_OutNew = ctx.Output<framework::Tensor>("D_OutNew");
-    d_ddx = ctx.Output<framework::Tensor>("D_DDx");
-
-    if (d_dOut) d_dOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (d_OutNew) d_OutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (d_ddx) d_ddx->mutable_data<T>(ddX->dims(), ctx.GetPlace());
-    auto& place = ctx.template device_context<DeviceContext>();
-    Functor functor;
-    functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew,  // input
-            d_dOut, d_OutNew, d_ddx);                   // output
-  }
-};
-
-template <typename DeviceContext, typename Functor>
-class TanhDoubleGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *Out, *ddX, *dOut;
-    framework::Tensor *dOutNew, *ddOut;
-    Out = ddX = dOut = nullptr;
-    dOutNew = ddOut = nullptr;
-
-    // extract ddx(input) and out(input)
-    auto ddx_var = ctx.InputVar("DDX");
-    auto out_var = ctx.InputVar("Out");
-    PADDLE_ENFORCE_NOT_NULL(
-        ddx_var, platform::errors::NotFound(
-                     "Cannot get input Variable ddx, variable name = %s",
-                     ctx.InputName("DDX")));
-    PADDLE_ENFORCE_NOT_NULL(
-        out_var, platform::errors::NotFound(
-                     "Cannot get input Variable out, variable name = %s",
-                     ctx.InputName("Out")));
-    ddX = ctx.Input<framework::Tensor>("DDX");
-    Out = ctx.Input<framework::Tensor>("Out");
-
-    // set output ddout
-    auto ddout_var = ctx.OutputVar("DDOut");
-    if (ddout_var) {
-      ddOut = ctx.Output<framework::Tensor>("DDOut");
-    }
-
-    // extract dOut(input)
-    auto dout_var = ctx.InputVar("DOut");
-    PADDLE_ENFORCE_NOT_NULL(
-        dout_var, platform::errors::NotFound(
-                      "Cannot get input Variable dout_var, variable name = %s",
-                      ctx.InputName("DOut")));
-    dOut = ctx.Input<framework::Tensor>("DOut");
-
-    // set output dout_new
-    auto dout_new_var = ctx.OutputVar("DOutNew");
-    if (dout_new_var) {
-      dOutNew = ctx.Output<framework::Tensor>("DOutNew");
-    }
-
-    if (dOutNew) dOutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    auto& place = ctx.template device_context<DeviceContext>();
-    Functor functor;
-    functor(place, Out, ddX, dOut, dOutNew, ddOut);
-  }
-};
-
-template <typename DeviceContext, typename Functor>
-class TanhTripeGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew;
-    framework::Tensor *d_OutNew, *d_dOut, *d_ddx;
-    Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr;
-    d_OutNew = d_dOut = d_ddx = nullptr;
-
-    // extract ddx(input), out(input), dOut(input), d_ddOut(input),
-    // d_dOutNew(input)
-    ddX = ctx.Input<framework::Tensor>("DDX");
-    Out = ctx.Input<framework::Tensor>("Out");
-    dOut = ctx.Input<framework::Tensor>("DOut");
-    d_ddOut = ctx.Input<framework::Tensor>("D_DDOut");
-    d_dOutNew = ctx.Input<framework::Tensor>("D_DOut_New");
-
-    PADDLE_ENFORCE_NOT_NULL(
-        ddX, platform::errors::NotFound(
-                 "Cannot get input Variable ddX, variable name = %s",
-                 ctx.InputName("DDX")));
-    PADDLE_ENFORCE_NOT_NULL(
-        Out, platform::errors::NotFound(
-                 "Cannot get input Variable Out, variable name = %s",
-                 ctx.InputName("Out")));
-    PADDLE_ENFORCE_NOT_NULL(
-        dOut, platform::errors::NotFound(
-                  "Cannot get input Variable dOut, variable name = %s",
-                  ctx.InputName("DOut")));
-    PADDLE_ENFORCE_NOT_NULL(
-        d_ddOut, platform::errors::NotFound(
-                     "Cannot get input Variable d_ddOut, variable name = %s",
-                     ctx.InputName("D_DDOut")));
-    PADDLE_ENFORCE_NOT_NULL(
-        d_dOutNew,
-        platform::errors::NotFound(
-            "Cannot get input Variable d_dOutNew, variable name = %s",
-            ctx.InputName("D_DOutNew")));
-
-    // set output d_OutNew, d_dOut, d_ddx
-    d_dOut = ctx.Output<framework::Tensor>("D_DOut");
-    d_OutNew = ctx.Output<framework::Tensor>("D_OutNew");
-    d_ddx = ctx.Output<framework::Tensor>("D_DDx");
-
-    if (d_dOut) d_dOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (d_OutNew) d_OutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (d_ddx) d_ddx->mutable_data<T>(ddX->dims(), ctx.GetPlace());
-    auto& place = ctx.template device_context<DeviceContext>();
-    Functor functor;
-    functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew,  // input
-            d_dOut, d_OutNew, d_ddx);                   // output
-  }
-};
-
 template <typename DeviceContext, typename Functor>
 class SquareDoubleGradKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
@@ -2493,29 +1555,19 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
 }  // namespace operators
 }  // namespace paddle
 
-#define FOR_EACH_ACTIVATION_OP(__macro)                                      \
-  __macro(silu, Silu, SiluFunctor, SiluGradFunctor);                         \
-  __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \
-  __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \
-  __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor);                         \
-  __macro(floor, Floor, FloorFunctor, ZeroGradFunctor);                      \
-  __macro(round, Round, RoundFunctor, ZeroGradFunctor);                      \
-  __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
-  __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor);                     \
-  __macro(log2, Log2, Log2Functor, Log2GradFunctor);                         \
-  __macro(log10, Log10, Log10Functor, Log10GradFunctor);                     \
-  __macro(brelu, BRelu, BReluFunctor, BReluGradFunctor);                     \
-  __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor);        \
-  __macro(stanh, STanh, STanhFunctor, STanhGradFunctor);                     \
-  __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor);         \
-  __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor);         \
-  __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor);                     \
-  __macro(tanh_shrink,
TanhShrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \ - __macro(hard_shrink, HardShrink, HardShrinkFunctor, HardShrinkGradFunctor); \ - __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, \ - HardSigmoidGradFunctor); \ - __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ - __macro(thresholded_relu, ThresholdedRelu, ThresholdedReluFunctor, \ - ThresholdedReluGradFunctor); \ - __macro(mish, Mish, MishFunctor, MishGradFunctor); \ +#define FOR_EACH_ACTIVATION_OP(__macro) \ + __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ + __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ + __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ + __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ + __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ + __macro(log2, Log2, Log2Functor, Log2GradFunctor); \ + __macro(log10, Log10, Log10Functor, Log10GradFunctor); \ + __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ + __macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \ + __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \ + __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor); \ + __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor); \ + __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ + __macro(mish, Mish, MishFunctor, MishGradFunctor); \ __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor); diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 208abd0949aa8..7c1b288080162 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -15,170 +15,11 @@ limitations under the License. */ #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" + namespace paddle { namespace operators { -template -struct CudaLeakyReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // leakyrelu(x) = x > 0 ? x : alpha * x - __device__ __forceinline__ T operator()(const T x) const { - return x > zero ? x : static_cast(alpha) * x; - } -}; - -template -struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // dx = dout * (x > 0 ? 1 : alpha) - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return x > zero ? 
dout : static_cast(alpha) * dout; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct CudaSigmoidFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // sigmoid(x) = 1 / (1 + exp(-x)) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(one / (one + exp(-x))); - } -}; - -template -struct CudaSigmoidGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout * out * (1 - out) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return dout * out * (one - out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct CudaSiluFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // silu(x) = x / (1 + exp(-x)) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(x / (one + exp(-x))); - } -}; - -template -struct CudaSiluGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - MPType temp = one / (one + exp(-x)); - return static_cast(dout * (temp * (one + x * (one - temp)))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct CudaLogSigmoidFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - - // logsigmoid(x) = log(1 / (1 + exp(-x))) - // For numerical stability, - // logsigmoid(x) = - // - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - MPType temp = x > zero ? zero : -x; - return static_cast(-temp - log(exp(-temp) + exp(-x - temp))); - } -}; - -template -struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - - // dx = dout * exp(-x) / (1 + exp(-x)) - // For numerical stability: - // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x, - // 0))) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - MPType temp1 = x > zero ? zero : -x; - MPType temp2 = exp(-x - temp1); - return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct CudaSoftShrinkFunctor : public BaseActivationFunctor { - float lambda; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"lambda", &lambda}}; - } - - // softshrink(x) = x - lambda, if x > lambda; - // x + lambda, if x < -lambda; - // 0, otherwise. 
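// An illustrative scalar reference (a sketch, not part of this patch) of the
// same piecewise softshrink rule; the helper name SoftShrinkRef is
// hypothetical. The functor below computes the identical result branch-free
// via 0/1 masks (temp1/temp2), which tends to be cheaper than divergent
// branches on GPU.
template <typename T>
inline T SoftShrinkRef(T x, T lambda) {
  if (x > lambda) return x - lambda;   // right branch: shift down by lambda
  if (x < -lambda) return x + lambda;  // left branch: shift up by lambda
  return static_cast<T>(0);            // dead zone: |x| <= lambda maps to 0
}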
- __device__ __forceinline__ T operator()(const T x) const { - T l = static_cast(lambda); - T temp1 = static_cast(x > l); - T temp2 = static_cast(x < -l); - return temp1 * (x - l) + temp2 * (x + l); - } -}; - -template -struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float lambda; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"lambda", &lambda}}; - } - - // dx = dout, if x > lambda or x < -lambda else 0 - __device__ __forceinline__ T operator()(const T dout, const T x) const { - T l = static_cast(lambda); - return (x >= -l && x <= l) ? zero : dout; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaCeilFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -224,31 +65,6 @@ struct CudaZeroGradFunctor : public BaseActivationFunctor { } }; -template -struct CudaTanhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // tanh(x) = tanh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(tanh(x)); - } -}; - -template -struct CudaTanhGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout * (1 - out^2) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return dout * (one - out * out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - template struct CudaReciprocalFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); @@ -476,45 +292,6 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct CudaBReluFunctor : public BaseActivationFunctor { - float t_min; - float t_max; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - - // brelu(x) = min(max(x, t_min), t_max) - __device__ __forceinline__ T operator()(const T x) const { - T t_min_cast = static_cast(t_min); - T t_max_cast = static_cast(t_max); - T temp_max = x > t_min_cast ? x : t_min_cast; - T temp_min = temp_max < t_max_cast ? temp_max : t_max_cast; - return temp_min; - } -}; - -template -struct CudaBReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float t_min; - float t_max; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - - // dx = (x > t_min && x < t_max) ? dout : 0 - __device__ __forceinline__ T operator()(const T dout, const T x) const { - T t_min_cast = static_cast(t_min); - T t_max_cast = static_cast(t_max); - return (x > t_min_cast && x < t_max_cast) ? 
dout : zero;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename T>
 struct CudaSoftReluFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
@@ -711,109 +488,6 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
-template <typename T>
-struct CudaTanhShrinkFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // tanhshrink(x) = x - tanh(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(x - tanh(x));
-  }
-};
-
-template <typename T>
-struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // dx = dout * tanh(x)^2
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(dout * tanh(x) * tanh(x));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct CudaHardShrinkFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  // hardshrink(x) = (x > -threshold && x < threshold) ? 0 : x
-  __device__ __forceinline__ T operator()(const T x) const {
-    T t = static_cast<T>(threshold);
-    return (x > -t && x < t) ? zero : x;
-  }
-};
-
-template <typename T>
-struct CudaHardShrinkGradFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  // dx = (x > -threshold && x < threshold) ? 0 : dout
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    T t = static_cast<T>(threshold);
-    return (x > -t && x < t) ? zero : dout;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct CudaHardSigmoidFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  T one = static_cast<T>(1.0f);
-  float slope;
-  float offset;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"slope", &slope}, {"offset", &offset}};
-  }
-
-  // hard_sigmoid(x) = 0, when x <= -3
-  //                   1, when x >= 3
-  //                   x * slope + offset, otherwise
-  __device__ __forceinline__ T operator()(const T x) const {
-    T temp = x * static_cast<T>(slope) + static_cast<T>(offset);
-    T temp_max = temp > zero ? temp : zero;
-    T temp_min = temp_max < one ? temp_max : one;
-    return temp_min;
-  }
-};
-
-template <typename T>
-struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  T one = static_cast<T>(1.0f);
-  float slope;
-  float offset;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"slope", &slope}, {"offset", &offset}};
-  }
-
-  // dx = (out > 0 && out < 1) ? dout * slope : 0
-  __device__ __forceinline__ T operator()(const T dout, const T out) const {
-    return (out > zero && out < one) ?
dout * static_cast(slope) : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - template struct CudaSwishFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -907,38 +581,6 @@ struct CudaMishGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct CudaThresholdedReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - // thresholded_relu(x) = x > threshold ? x : 0 - __device__ __forceinline__ T operator()(const T x) const { - return x > static_cast(threshold) ? x : zero; - } -}; - -template -struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - // dx = x > threshold ? dout : 0 - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return x > static_cast(threshold) ? dout : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaHardSwishFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); @@ -991,110 +633,6 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct CudaELUFunctor : public BaseActivationFunctor { - using CT = typename details::MPTypeTrait::Type; - CT zero = static_cast(0.0f); - CT one = static_cast(1.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // elu(x) = x, if x > 0 - // elu(x) = alpha * (e^x - 1), if x <= 0 - __device__ __forceinline__ T operator()(const T arg_x) const { - CT x = static_cast(arg_x); - CT temp = static_cast(alpha) * (exp(x) - one); - CT res = x > zero ? 
x : temp; - return static_cast(res); - } -}; - -template -struct CudaELUGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // case 1: alpha >= 0 - // dx = dout, if out > 0 - // dx = dout * (out + alpha), if out <= 0 - __device__ __forceinline__ T operator()(T arg_dout, T arg_out) const { - MPType dout = static_cast(arg_dout); - MPType out = static_cast(arg_out); - MPType a = static_cast(alpha); - MPType out_pos = static_cast(out > zero); - MPType out_neg = static_cast(out <= zero); - return static_cast(dout * (out_pos + out_neg * (out + a))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // case 2: alpha < 0 - // dx = dout, if x > 0 - // dx = dout * (out + alpha), if x <=0 - __device__ __forceinline__ T operator()(const T arg_dout, const T arg_out, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType out = static_cast(arg_out); - MPType x = static_cast(arg_x); - MPType a = static_cast(alpha); - MPType x_pos = static_cast(x > zero); - MPType x_neg = static_cast(x <= zero); - return static_cast(dout * (x_pos + x_neg * (out + a))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -class ELUGradCudaKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); - auto* x = ctx.Input("X"); - auto* d_x = ctx.Output(framework::GradVarName("X")); - d_x->mutable_data(ctx.GetPlace()); - const float alpha = ctx.Attr("alpha"); - - auto& dev_ctx = ctx.device_context(); - std::vector ins = {d_out, out}; - std::vector outs = {d_x}; - if (alpha > 0) { - CudaELUGradFunctor functor; - functor.alpha = alpha; - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } else { - CudaELUGradNegativeAlphaFunctor functor; - functor.alpha = alpha; - ins.push_back(x); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } - } -}; - template struct CudaCELUFunctor : public BaseActivationFunctor { using CT = typename details::MPTypeTrait::Type; @@ -1212,6 +750,34 @@ class ActivationGradCudaKernel } }; +USE_PHI_FUNCTOR(CudaCos) +USE_PHI_FUNCTOR(CudaTan) +USE_PHI_FUNCTOR(CudaAcos) +USE_PHI_FUNCTOR(CudaSin) +USE_PHI_FUNCTOR(CudaAsin) +USE_PHI_FUNCTOR(CudaAtan) +USE_PHI_FUNCTOR(CudaSinh) +USE_PHI_FUNCTOR(CudaCosh) +USE_PHI_FUNCTOR(CudaAsinh) +USE_PHI_FUNCTOR(CudaAcosh) +USE_PHI_FUNCTOR(CudaAtanh) +USE_PHI_FUNCTOR(CudaTanh) +USE_PHI_FUNCTOR(CudaBRelu) +USE_PHI_FUNCTOR(CudaLeakyRelu) +USE_PHI_FUNCTOR(CudaThresholdedRelu) +USE_PHI_FUNCTOR(CudaHardShrink) +USE_PHI_FUNCTOR(CudaSoftShrink) +USE_PHI_FUNCTOR(CudaTanhShrink) +USE_PHI_FUNCTOR(CudaSilu) +USE_PHI_FUNCTOR(CudaELU) +USE_PHI_FUNCTOR(CudaSigmoid) +USE_PHI_FUNCTOR(CudaLogSigmoid) +USE_PHI_FUNCTOR(CudaHardSigmoid) + +template +using CudaELUGradNegativeAlphaFunctor = + phi::funcs::CudaELUGradNegativeAlphaFunctor; + } // namespace operators } // namespace paddle @@ -1270,40 
+836,6 @@ namespace plat = paddle::platform; ops::ActivationGradCudaKernel>); -/* ======================== leaky relu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, - CudaLeakyReluGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - leaky_relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel< - plat::CUDADeviceContext, ops::LeakyReluGradGradFunctor>); -/* ========================================================================== */ - -/* ======================== elu register ============================ */ -REGISTER_OP_CUDA_KERNEL( - elu, ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - elu_grad, ops::ELUGradCudaKernel, - ops::ELUGradCudaKernel, - ops::ELUGradCudaKernel); - -REGISTER_OP_CUDA_KERNEL( - elu_grad_grad, ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>); /* ========================================================================== */ /* ======================== celu register ============================ */ @@ -1319,58 +851,6 @@ REGISTER_OP_CUDA_KERNEL( ops::CELUGradGradFunctor>); /* ========================================================================== */ -/* =========================== sigmoid register ============================ - */ -REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, - CudaSigmoidGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - sigmoid_grad_grad, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>); - -REGISTER_OP_CUDA_KERNEL( - sigmoid_triple_grad, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel< - plat::CUDADeviceContext, - ops::SigmoidTripleGradFunctor>); -/* ========================================================================== */ - -/* =========================== tanh register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, CudaTanhFunctor, - CudaTanhGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - tanh_grad_grad, - ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>); - -REGISTER_OP_CUDA_KERNEL( - tanh_triple_grad, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>); -/* ========================================================================== */ - /* =========================== sqrt register ============================= */ REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, CudaSqrtGradFunctor); @@ -1508,9 +988,6 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ #define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \ - __macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor); \ - __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \ - CudaLogSigmoidGradFunctor); \ __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor, \ CudaSoftShrinkGradFunctor); \ __macro(ceil, Ceil, CudaCeilFunctor, CudaZeroGradFunctor); \ @@ -1521,7 +998,6 @@ REGISTER_OP_CUDA_KERNEL( __macro(log1p, Log1p, CudaLog1pFunctor, CudaLog1pGradFunctor); \ __macro(log2, Log2, CudaLog2Functor, CudaLog2GradFunctor); \ __macro(log10, Log10, CudaLog10Functor, CudaLog10GradFunctor); \ - __macro(brelu, BRelu, CudaBReluFunctor, CudaBReluGradFunctor); \ __macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \ 
__macro(stanh, STanh, CudaSTanhFunctor, CudaSTanhGradFunctor); \ __macro(softplus, Softplus, CudaSoftplusFunctor, CudaSoftplusGradFunctor); \ @@ -1531,76 +1007,228 @@ REGISTER_OP_CUDA_KERNEL( CudaTanhShrinkGradFunctor); \ __macro(hard_shrink, HardShrink, CudaHardShrinkFunctor, \ CudaHardShrinkGradFunctor); \ - __macro(hard_sigmoid, HardSigmoid, CudaHardSigmoidFunctor, \ - CudaHardSigmoidGradFunctor); \ __macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor); \ __macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor); \ - __macro(thresholded_relu, ThresholdedRelu, CudaThresholdedReluFunctor, \ - CudaThresholdedReluGradFunctor); \ __macro(hard_swish, HardSwish, CudaHardSwishFunctor, \ CudaHardSwishGradFunctor); FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) #ifdef PADDLE_WITH_XPU_KP -#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, op_name, functor, \ - grad_functor) \ - REGISTER_OP_KERNEL( \ - act_type, KP, plat::XPUPlace, \ - ops::ActivationCudaKernel>); \ - REGISTER_OP_KERNEL(act_type##_grad, KP, plat::XPUPlace, \ - ops::ActivationGradCudaKernel>); - -REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, - CudaLeakyReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(relu, Relu, CudaReluFunctor, - CudaReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, - CudaSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(exp, Exp, CudaExpFunctor, CudaExpGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor, - CudaReciprocalGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softplus, Softplus, CudaSoftplusFunctor, - CudaSoftplusGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, HardSwish, CudaHardSwishFunctor, - CudaHardSwishGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(elu, Elu, CudaELUFunctor, CudaELUGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(celu, Celu, CudaCELUFunctor, - CudaCELUGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, - CudaSqrtGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(square, Square, CudaSquareFunctor, - CudaSquareGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(silu, Silu, CudaSiluFunctor, - CudaSiluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, - CudaLogSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softshrink, SoftShrink, CudaSoftShrinkFunctor, - CudaSoftShrinkGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(ceil, Ceil, CudaCeilFunctor, - CudaZeroGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(floor, Floor, CudaFloorFunctor, - CudaZeroGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(log1p, Log1p, CudaLog1pFunctor, - CudaLog1pGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(brelu, BRelu, CudaBReluFunctor, - CudaBReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(soft_relu, SoftRelu, CudaSoftReluFunctor, - CudaSoftReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softsign, Softsign, CudaSoftsignFunctor, - CudaSoftsignGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(relu6, Relu6, CudaRelu6Functor, - CudaRelu6GradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_shrink, HardShrink, CudaHardShrinkFunctor, - CudaHardShrinkGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_sigmoid, HardSigmoid, - CudaHardSigmoidFunctor, - CudaHardSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(swish, Swish, CudaSwishFunctor, - CudaSwishGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(thresholded_relu, ThresholdedRelu, - CudaThresholdedReluFunctor, - 
CudaThresholdedReluGradFunctor); +REGISTER_OP_KERNEL( + brelu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + brelu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(ceil, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + ceil_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(celu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + celu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(elu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + elu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(exp, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + exp_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(floor, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + floor_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + hard_shrink, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_shrink_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + hard_sigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_sigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(hard_swish, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_swish_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + leaky_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + leaky_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(log, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + log_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(log1p, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + log1p_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + logsigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + logsigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + reciprocal, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + reciprocal_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(relu6, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + relu6_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(sigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + sigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(silu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + silu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(soft_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + soft_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(softplus, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softplus_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + softshrink, 
KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softshrink_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(softsign, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softsign_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(sqrt, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + sqrt_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(square, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + square_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(swish, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + swish_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + thresholded_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + thresholded_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); #endif // PADDLE_WITH_XPU_KP diff --git a/paddle/fluid/operators/allclose_op.cc b/paddle/fluid/operators/allclose_op.cc index 8fb9929c39e92..88d7cb7c1f5f4 100644 --- a/paddle/fluid/operators/allclose_op.cc +++ b/paddle/fluid/operators/allclose_op.cc @@ -12,52 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/allclose_op.h" #include #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { -template -struct GetTensorValue { - T operator()(const platform::CPUDeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - return *(tensor.data()); - } -}; - -template -struct AllcloseFunctor { - void operator()(const platform::CPUDeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - auto* in_a = in.data(); - auto* in_b = other.data(); - auto* out_data = output->mutable_data(ctx.GetPlace()); - auto num = in.numel(); - *out_data = true; - for (int i = 0; i < num; i++) { - const T a = in_a[i], b = in_b[i]; - bool val; - if (std::isnan(a) || std::isnan(b)) { - val = equal_nan && std::isnan(a) == std::isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? 
left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - *out_data &= val; - } - } -}; - class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -96,40 +64,6 @@ class AllcloseOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Allclose"); - OP_INOUT_CHECK(ctx->HasInput("Other"), "Input", "Other", "Allclose"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Allclose"); - - auto input_dim = ctx->GetInputDim("Input"); - auto other_dim = ctx->GetInputDim("Other"); - PADDLE_ENFORCE_EQ(input_dim.size(), other_dim.size(), - platform::errors::PreconditionNotMet( - "Input(Input) and Input(Other) must have the same " - "dimension size.")); - int n = input_dim.size(); - bool is_runtime = ctx->IsRuntime(); - for (int i = 0; i < n; i++) { - if (is_runtime) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } else { - if (!(input_dim[i] < 0 || other_dim[i] < 0)) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } - } - } - - ctx->SetOutputDim("Out", phi::make_ddim({1})); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -152,13 +86,13 @@ class AllcloseOpVarTypeInference : public framework::VarTypeInference { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; +DECLARE_INFER_SHAPE_FUNCTOR(allclose, AllcloseInferShapeFunctor, + PD_INFER_META(phi::AllValueCompareInferMeta)); REGISTER_OPERATOR( allclose, ops::AllcloseOp, ops::AllcloseOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, - ops::AllcloseOpVarTypeInference); -REGISTER_OP_CPU_KERNEL(allclose, ops::AllcloseKernel, - ops::AllcloseKernel); + ops::AllcloseOpVarTypeInference, AllcloseInferShapeFunctor); /* ========================== register checkpoint ===========================*/ REGISTER_OP_VERSION(allclose) diff --git a/paddle/fluid/operators/allclose_op.cu b/paddle/fluid/operators/allclose_op.cu deleted file mode 100644 index 32c90ff8fdc10..0000000000000 --- a/paddle/fluid/operators/allclose_op.cu +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
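// A host-side scalar sketch (illustration only, not part of this patch) of the
// element-wise predicate that the removed CPU functor above and the CUDA
// kernel below both implement; the helper name AllcloseElem is hypothetical.
// The removed kernels additionally allow a tiny 1e-15 absolute slack on top of
// this comparison, and the phi allclose kernel that replaces them is expected
// to keep the same semantics.
#include <cmath>

inline bool AllcloseElem(double a, double b, double rtol, double atol,
                         bool equal_nan) {
  if (std::isnan(a) || std::isnan(b)) {
    // NaNs compare equal only when equal_nan is set and both sides are NaN.
    return equal_nan && std::isnan(a) && std::isnan(b);
  }
  // Core tolerance test: |a - b| <= atol + rtol * |b|.
  return std::fabs(a - b) <= atol + rtol * std::fabs(b);
}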
- -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/allclose_op.h" - -namespace paddle { -namespace operators { - -template -struct GetTensorValue { - T operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - const T* data = tensor.data(); - T value; - const auto gpu_place = dev_ctx.GetPlace(); - memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T), - dev_ctx.stream()); - return value; - } -}; - -template -__global__ void AllcloseCUDAKernel(const T* in_data, const T* other_data, - const double rtol, const double atol, - bool equal_nan, int num, bool* out_data) { - unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; - bool val; - for (int i = idx; i < num; i += blockDim.x * gridDim.x) { - const T a = in_data[i], b = other_data[i]; - if (isnan(a) || isnan(b)) { - val = equal_nan && isnan(a) == isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - if (!val) *out_data = false; - } -} - -template -struct AllcloseFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - int num = in.numel(); - const T* in_data = in.data(); - const T* other_data = other.data(); - bool* out_data = output->mutable_data(dev_ctx.GetPlace()); - int block = 1024; - int grid = (block - 1 + num) / block; - grid = (grid > block) ? block : grid; -#ifdef PADDLE_WITH_HIP - hipMemset(out_data, true, sizeof(bool)); -#else - cudaMemset(out_data, true, sizeof(bool)); -#endif - AllcloseCUDAKernel<<>>( - in_data, other_data, rtol, atol, equal_nan, num, out_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(allclose, ops::AllcloseKernel, - ops::AllcloseKernel); diff --git a/paddle/fluid/operators/allclose_op.h b/paddle/fluid/operators/allclose_op.h deleted file mode 100644 index 7a36754194ace..0000000000000 --- a/paddle/fluid/operators/allclose_op.h +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; - -template -struct GetTensorValue { - T operator()(const platform::DeviceContext& ctx, - const framework::Tensor& tensor) const; -}; - -template -struct AllcloseFunctor { - void operator()(const DeviceContext& ctx, const framework::Tensor& in, - const framework::Tensor& other, const float rtol, - const float atol, bool equal_nan, framework::Tensor* output); -}; - -template -class AllcloseKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // get attrs - bool equal_nan = ctx.Attr("equal_nan"); - // get input/output - const auto* input = ctx.Input("Input"); - const auto* other = ctx.Input("Other"); - auto* out = ctx.Output("Out"); - - double rtol_v = std::stod(ctx.Attr("rtol")); - double atol_v = std::stod(ctx.Attr("atol")); - - auto& dev_ctx = ctx.template device_context(); - GetTensorValue get_tensor_value; - if (ctx.HasInput("Rtol")) { - const auto* rtol = ctx.Input("Rtol"); - PADDLE_ENFORCE_EQ( - rtol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Rtol) size must be 1, but get %d.", rtol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(rtol->dtype()), - framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Rtol) type must be double, but get %s.", - framework::DataTypeToString( - framework::TransToProtoVarType(rtol->dtype())))); - rtol_v = get_tensor_value(dev_ctx, *rtol); - } - if (ctx.HasInput("Atol")) { - const auto* atol = ctx.Input("Atol"); - PADDLE_ENFORCE_EQ( - atol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Atol) size must be 1, but get %d", atol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(atol->dtype()), - framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Atol) type must be double, but get %s", - framework::DataTypeToString( - framework::TransToProtoVarType(atol->dtype())))); - atol_v = get_tensor_value(dev_ctx, *atol); - } - - AllcloseFunctor()(dev_ctx, *input, *other, rtol_v, atol_v, - equal_nan, out); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc new file mode 100644 index 0000000000000..237cfcc6f1172 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto& dev_ctx = ctx.template device_context(); + const auto xs = ctx.MultiInput("X"); + const auto* scale = ctx.Input("Scale"); + auto outs = ctx.MultiOutput("Out"); + auto* found_inf = ctx.Output("FoundInfinite"); + + found_inf->mutable_data(dev_ctx.GetPlace()); + + MLUCnnlTensorDesc scale_desc(*scale); + MLUCnnlTensorDesc found_inf_desc(*found_inf, CNNL_LAYOUT_ARRAY, + ToCnnlDataType()); + + for (size_t i = 0; i < xs.size(); ++i) { + const auto* x = xs[i]; + auto* out = outs[i]; + out->mutable_data(ctx.GetPlace()); + + // check is_finite or is_nan + Tensor is_finite(found_inf->type()); + if (i != 0) { + is_finite.Resize(phi::make_ddim({1})); + is_finite.mutable_data(ctx.GetPlace()); + } else { + is_finite.ShareDataWith(*found_inf); + } + + MLUCnnlTensorDesc x_desc(*x); + + MLUCnnl::IsNanInf(ctx, x_desc.get(), GetBasePtr(x), + GetBasePtr(&is_finite)); + + // save is_finite by logical_and op after checking every input + if (i != 0) { + MLUCnnlTensorDesc is_finite_desc(is_finite, CNNL_LAYOUT_ARRAY, + ToCnnlDataType()); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_OR, found_inf_desc.get(), + GetBasePtr(found_inf), is_finite_desc.get(), + GetBasePtr(&is_finite), found_inf_desc.get(), + GetBasePtr(found_inf)); + } + + // The normal logic is : + // out = in, if found_inf = true + // out = in/scale, if found_inf = false + // But when found_inf is true, the data of Out should not be used. + // So, on MLU, we always compute out with in/scale. + MLUCnnlTensorDesc out_desc(*out); + MLUCnnl::Div(ctx, CNNL_COMPUTATION_HIGH_PRECISION, x_desc.get(), + GetBasePtr(x), scale_desc.get(), GetBasePtr(scale), + out_desc.get(), GetBasePtr(out)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_MLU_KERNEL(check_finite_and_unscale, + ops::CheckFiniteAndUnscaleMLUKernel, + ops::CheckFiniteAndUnscaleMLUKernel); diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc index 0f5c048b6be9c..c5e4188ca2d6f 100644 --- a/paddle/fluid/operators/arg_max_op.cc +++ b/paddle/fluid/operators/arg_max_op.cc @@ -15,23 +15,19 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/arg_min_max_op_base.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +DECLARE_INFER_SHAPE_FUNCTOR(arg_max, ArgMaxInferShapeFunctor, + PD_INFER_META(phi::ArgMinMaxInferMeta)); + REGISTER_OPERATOR( arg_max, paddle::operators::ArgMinMaxOp, paddle::operators::ArgMaxOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL( - arg_max, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel); + paddle::framework::EmptyGradOpMaker, + ArgMaxInferShapeFunctor); + REGISTER_OP_VERSION(arg_max) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/arg_min_max_op_base.cu.h b/paddle/fluid/operators/arg_min_max_op_base.cu.h deleted file mode 100644 index b77031f7fb4c9..0000000000000 --- a/paddle/fluid/operators/arg_min_max_op_base.cu.h +++ /dev/null @@ -1,202 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#if defined(__NVCC__) || defined(__HIPCC__) - -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include -#include -#include -#include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -namespace { // NOLINT -template -using KeyValuePair = cub::KeyValuePair; -using Tensor = framework::Tensor; - -} // end namespace - -#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \ - case (1 << (log2_block_dim)): { \ - constexpr auto kBlockDim = (1 << (log2_block_dim)); \ - __VA_ARGS__; \ - } break - -#define FIXED_BLOCK_DIM_CASE(...) 
\ - FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); - -template -__global__ void ArgCUDAKernel(const int64_t height, // n * h - const int64_t width, // c - const int64_t post_size, // h - const Reducer reducer, const T init, const T* in, - IndType* out) { - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - for (int idx = blockIdx.x; idx < height; idx += gridDim.x) { - KeyValuePair kv_pair = {-1, init}; - int h = idx / post_size; - int w = idx % post_size; - for (int k = threadIdx.x; k < width; k += blockDim.x) { - kv_pair = - reducer({k, in[h * width * post_size + k * post_size + w]}, kv_pair); - } - kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer); - if (threadIdx.x == 0) { - out[idx] = static_cast(kv_pair.key); - } - __syncthreads(); - } -} - -template -void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input, - Tensor* indices, const int64_t pre, const int64_t post, - const int64_t n) { - auto cu_stream = ctx.stream(); - auto ComputeBlockSize = [](int64_t col) { - auto block_size = 8; - if (col > 512) - block_size = 1024; - else if (col > 256) - block_size = 512; - else if (col > 128) - block_size = 256; - else if (col > 64) - block_size = 128; - else if (col > 32) - block_size = 64; - else if (col > 16) - block_size = 32; - else if (col > 8) - block_size = 16; -#ifdef __HIPCC__ - block_size = std::min(block_size, 256); -#endif - return block_size; - }; - - int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; - int64_t height = pre * post; - int64_t width = n; - int64_t grid_size = height < max_grid_dimx ? 
height : max_grid_dimx; - - const T* in_data = input.data(); - IndType* out_data = indices->mutable_data(ctx.GetPlace()); - - if (typeid(Reducer) == typeid(cub::ArgMax)) { - switch (ComputeBlockSize(width)) { - FIXED_BLOCK_DIM_CASE( - ArgCUDAKernel<<>>( - height, width, post, Reducer(), std::numeric_limits::lowest(), - in_data, out_data)); - } - } else { - switch (ComputeBlockSize(width)) { - FIXED_BLOCK_DIM_CASE( - ArgCUDAKernel<<>>( - height, width, post, Reducer(), std::numeric_limits::max(), - in_data, out_data)); - } - } -} - -template -struct VisitDataCudaArgMinMaxFunctor { - const framework::ExecutionContext& ctx; - - explicit VisitDataCudaArgMinMaxFunctor(const framework::ExecutionContext& ctx) - : ctx(ctx) {} - template - void apply() const { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - int axis = ctx.Attr("axis"); - const bool& flatten = ctx.Attr("flatten"); - - framework::DDim input_dims; - if (flatten) { - input_dims = phi::make_ddim({input->numel()}); - // if flatten, the axis just as 0 - axis = 0; - } else { - input_dims = input->dims(); - if (axis < 0) axis += input->dims().size(); - } - - int64_t numel = input->numel(); - int64_t groups = numel / input_dims[axis]; - int64_t pre = 1; - int64_t post = 1; - int64_t n = input_dims[axis]; - - for (int i = 0; i < axis; i++) { - pre *= input_dims[i]; - } - - for (int i = axis + 1; i < input_dims.size(); i++) { - post *= input_dims[i]; - } - - const auto& dev_ctx = ctx.cuda_device_context(); - ComputeFullArg(dev_ctx, *input, output, pre, post, n); - } -}; -template -class ArgMinMaxOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dtype = ctx.Attr("dtype"); - if (dtype < 0) { - framework::VisitDataTypeTiny( - static_cast( - framework::proto::VarType::INT64), - VisitDataCudaArgMinMaxFunctor(ctx)); - return; - } - framework::VisitDataTypeTiny( - static_cast(dtype), - VisitDataCudaArgMinMaxFunctor(ctx)); - } -}; - -#endif - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h index d3ce61d183a3d..585341beea12c 100644 --- a/paddle/fluid/operators/arg_min_max_op_base.h +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -27,193 +27,9 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -enum ArgMinMaxType { kArgMin, kArgMax }; - -template -struct ArgMinMaxFunctor {}; - -#define DECLARE_ARG_MIN_MAX_FUNCTOR(eigen_op_type, enum_argminmax_value) \ - template \ - struct ArgMinMaxFunctor { \ - void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \ - framework::LoDTensor* out, framework::DDim x_dims, \ - int64_t axis, bool keepdims) { \ - auto in_eigen = framework::EigenTensor::From(in, x_dims); \ - if (keepdims) { \ - auto out_eigen = framework::EigenTensor::From(*out); \ - out_eigen.device(*(ctx.eigen_device())) = \ - in_eigen.eigen_op_type(axis).template cast(); \ - } else { \ - auto out_eigen = framework::EigenTensor::From(*out); \ - out_eigen.device(*(ctx.eigen_device())) = \ - in_eigen.eigen_op_type(axis).template cast(); \ - } \ - } \ - } - -DECLARE_ARG_MIN_MAX_FUNCTOR(argmin, ArgMinMaxType::kArgMin); -DECLARE_ARG_MIN_MAX_FUNCTOR(argmax, ArgMinMaxType::kArgMax); - -template -struct VisitDataArgMinMaxFunctor { - const framework::ExecutionContext& ctx; - - explicit VisitDataArgMinMaxFunctor(const framework::ExecutionContext& ctx) - : ctx(ctx) {} - template - void apply() const { - auto& x = *(ctx.Input("X")); - auto& out = *(ctx.Output("Out")); - out.template mutable_data(ctx.GetPlace()); - auto axis = ctx.Attr("axis"); - auto keepdims = ctx.Attr("keepdims"); - const bool& flatten = ctx.Attr("flatten"); - // paddle do not have the scalar tensor, just return the shape [1] tensor - if (flatten) keepdims = true; - - // if flatten, will construct the new dims for the cacluate - framework::DDim x_dims; - if (flatten) { - x_dims = phi::make_ddim({x.numel()}); - // if flatten, the axis just as 0 - axis = 0; - } else { - x_dims = x.dims(); - if (axis < 0) axis += x_dims.size(); - } - auto& dev_ctx = ctx.template device_context(); - -#define CALL_ARG_MINMAX_FUNCTOR(rank) \ - ArgMinMaxFunctor \ - functor##rank; \ - functor##rank(dev_ctx, x, &out, x_dims, axis, keepdims) - - switch (x_dims.size()) { - case 1: - CALL_ARG_MINMAX_FUNCTOR(1); - break; - case 2: - CALL_ARG_MINMAX_FUNCTOR(2); - break; - case 3: - CALL_ARG_MINMAX_FUNCTOR(3); - break; - case 4: - CALL_ARG_MINMAX_FUNCTOR(4); - break; - case 5: - CALL_ARG_MINMAX_FUNCTOR(5); - break; - case 6: - CALL_ARG_MINMAX_FUNCTOR(6); - break; - default: - PADDLE_ENFORCE_LE( - x_dims.size(), 6, - platform::errors::InvalidArgument( - "%s operator doesn't supports tensors whose ranks are greater " - "than 6.", - (EnumArgMinMaxValue == kArgMin ? 
"argmin" : "argmax"))); - break; -#undef CALL_ARG_MINMAX_FUNCTOR - } - } -}; - -template -class ArgMinMaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dtype = ctx.Attr("dtype"); - if (dtype < 0) { - framework::VisitDataTypeTiny( - static_cast( - framework::proto::VarType::INT64), - VisitDataArgMinMaxFunctor(ctx)); - return; - } - framework::VisitDataTypeTiny( - static_cast(dtype), - VisitDataArgMinMaxFunctor(ctx)); - } -}; - -template -using ArgMinKernel = ArgMinMaxKernel; - -template -using ArgMaxKernel = ArgMinMaxKernel; - class ArgMinMaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "arg_min_max"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "arg_min_max"); - const auto& x_dims = ctx->GetInputDim("X"); - int64_t axis = ctx->Attrs().Get("axis"); - bool keepdims = ctx->Attrs().Get("keepdims"); - const bool& flatten = ctx->Attrs().Get("flatten"); - - PADDLE_ENFORCE_GE(axis, -x_dims.size(), - platform::errors::InvalidArgument( - "'axis'(%d) must be greater than or equal to" - " -Rank(X)(%d).", - axis, -x_dims.size())); - PADDLE_ENFORCE_LT( - axis, x_dims.size(), - platform::errors::InvalidArgument( - "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", axis, - x_dims.size())); - - const int& dtype = ctx->Attrs().Get("dtype"); - PADDLE_ENFORCE_EQ( - (dtype < 0 || dtype == 2 || dtype == 3), true, - platform::errors::InvalidArgument( - "The attribute of dtype in argmin/argmax must be [%s] or [%s], but " - "received [%s]", - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64), - paddle::framework::DataTypeToString( - static_cast(dtype)))); - - auto x_rank = x_dims.size(); - if (axis < 0) axis += x_rank; - if (ctx->IsRuntime()) { - if (dtype == framework::proto::VarType::INT32) { - int64_t all_element_num = 0; - if (flatten) { - all_element_num = phi::product(x_dims); - - } else { - all_element_num = x_dims[axis]; - } - PADDLE_ENFORCE_LE( - all_element_num, INT_MAX, - platform::errors::InvalidArgument( - "The element num of the argmin/argmax input at axis is " - "%d, is larger than int32 maximum value:%d, you must " - "set the dtype of argmin/argmax to 'int64'.", - all_element_num, INT_MAX)); - } - } - std::vector vec; - if (flatten) { - vec.emplace_back(static_cast(1)); - } else { - for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]); - if (keepdims) { - vec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]); - } - ctx->SetOutputDim("Out", phi::make_ddim(vec)); - } }; class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc index 0a4ba6fb0bfdf..fb3abd01af8c3 100644 --- a/paddle/fluid/operators/arg_min_op.cc +++ b/paddle/fluid/operators/arg_min_op.cc @@ -12,26 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/arg_min_max_op_base.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +DECLARE_INFER_SHAPE_FUNCTOR(arg_min, ArgMinInferShapeFunctor, + PD_INFER_META(phi::ArgMinMaxInferMeta)); REGISTER_OPERATOR( arg_min, paddle::operators::ArgMinMaxOp, paddle::operators::ArgMinOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + ArgMinInferShapeFunctor); -REGISTER_OP_CPU_KERNEL( - arg_min, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel); REGISTER_OP_VERSION(arg_min) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/argsort_op.cc b/paddle/fluid/operators/argsort_op.cc index 9e525c20335d3..1a8aca777370b 100644 --- a/paddle/fluid/operators/argsort_op.cc +++ b/paddle/fluid/operators/argsort_op.cc @@ -12,40 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/argsort_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { class ArgsortOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "argsort"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "argsort"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "argsort"); - - auto in_dims = ctx->GetInputDim("X"); - int axis = ctx->Attrs().Get("axis"); - - auto num_dims = in_dims.size(); - PADDLE_ENFORCE_GE(axis, -num_dims, - platform::errors::InvalidArgument( - "'axis'(%d) must be greater than or equal to" - " -num_dims(%d).", - axis, -num_dims)); - PADDLE_ENFORCE_LT( - axis, num_dims, - platform::errors::InvalidArgument( - "'axis'(%d) must be less than num_dims(%d).", axis, num_dims)); - - ctx->ShareDim("X", "Out"); - ctx->ShareDim("X", "Indices"); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } }; class ArgsortGradOp : public framework::OperatorWithKernel { @@ -122,18 +101,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ArgsortGradNoNeedBufferVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(argsort, ArgsortInferShapeFunctor, + PD_INFER_META(phi::ArgsortInferMeta)); REGISTER_OPERATOR(argsort, ops::ArgsortOp, ops::ArgsortOpMaker, ops::ArgsortGradOpMaker, - ops::ArgsortGradOpMaker); + ops::ArgsortGradOpMaker, + ArgsortInferShapeFunctor); REGISTER_OPERATOR(argsort_grad, ops::ArgsortGradOp, ops::ArgsortGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL(argsort, - ops::ArgsortKernel, - ops::ArgsortKernel, - ops::ArgsortKernel, - ops::ArgsortKernel); -REGISTER_OP_CPU_KERNEL( - argsort_grad, ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel); diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu deleted file mode 100644 index 
8b7a0b3eadb16..0000000000000 --- a/paddle/fluid/operators/argsort_op.cu +++ /dev/null @@ -1,430 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/argsort_op.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -#ifdef __HIPCC__ -namespace rocprim { -namespace detail { -template <> -struct radix_key_codec_base - : radix_key_codec_integral {}; -} // namespace detail -} // namespace rocprim -#else -// set cub base traits in order to handle float16 -namespace cub { -template <> -struct NumericTraits - : BaseTraits {}; -} // namespace cub -#endif - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -// Iter for move to next row -struct SegmentOffsetIter { - EIGEN_DEVICE_FUNC - explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const { - return idx * num_cols_; - } - - int num_cols_; -}; - -template -static __global__ void FillIndex(T* indices, T num_rows, T num_cols) { - int col_id = threadIdx.x; - int row_id = blockIdx.x; - - for (T j = row_id; j < num_rows; j += gridDim.x) { - for (T i = col_id; i < num_cols; i += blockDim.x) { - indices[j * num_cols + i] = i; - } - } -} - -template -static __global__ void FillFlattenGrad(const T* dO, const IndType* indices, - int64_t size, T* dX) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - for (int i = index; i < size; i += stride) { - dX[indices[i]] = dO[i]; - } -} - -template -static __global__ void FillGrad(const T* dO, const IndType* indices, T* dX, - IndType num_rows, IndType num_cols) { - int col_id = threadIdx.x; - int row_id = blockIdx.x; - - for (IndType j = row_id; j < num_rows; j += gridDim.x) { - for (IndType i = col_id; i < num_cols; i += blockDim.x) { - dX[j * num_cols + indices[j * num_cols + i]] = dO[j * num_cols + i]; - } - } -} - -// Sort by flag descending, True: descending. False: Ascending. -// Default is false. 
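FillIndex above seeds the key/value sort: every row of the index tensor starts as 0..num_cols-1, so the segmented sort in ArgFullSort below returns per-row argsort indices directly. A CPU sketch of that initialisation, assuming row-major layout; MakeRowIndices is an illustrative name. The GPU version strides rows by gridDim.x and columns by blockDim.x instead of plain loops.

#include <cstdint>
#include <vector>

// Each row is initialised to 0..num_cols-1; after sorting values and indices
// together row by row, row r of this buffer holds argsort(row r of the input).
std::vector<int64_t> MakeRowIndices(int64_t num_rows, int64_t num_cols) {
  std::vector<int64_t> indices(num_rows * num_cols);
  for (int64_t r = 0; r < num_rows; ++r)
    for (int64_t c = 0; c < num_cols; ++c)
      indices[r * num_cols + c] = c;
  return indices;
}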
-template -void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, - Tensor* output, Tensor* indices, const IndType num_rows, - const IndType num_cols, const bool descending) { - auto cu_stream = ctx.stream(); - - Tensor input_indices; - - const std::vector dims = {num_rows, num_cols}; - auto dim = phi::make_ddim(dims); - input_indices.Resize(dim); - input_indices.mutable_data(ctx.GetPlace()); - - size_t temp_storage_bytes = -1; - - auto ComputeBlockSize = [](IndType col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - - int block_size = ComputeBlockSize(num_cols); - - int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - // actually, int num_rows < max_grid_size - int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX; - // Init a index array - FillIndex<<>>( - input_indices.data(), num_rows, num_cols); - - T* sorted_out_ptr; - IndType* sorted_indices_ptr; - - const T* inp = input->data(); - T* out = output->mutable_data(ctx.GetPlace()); - IndType* ind = indices->mutable_data(ctx.GetPlace()); - - sorted_out_ptr = out; - sorted_indices_ptr = ind; - - // create iter for counting input - cub::CountingInputIterator counting_iter(0); - // segment_offset is used for move to next row - cub::TransformInputIterator> - segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); - - gpuError_t err; - if (descending) { - err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - nullptr, temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } else { - err = cub::DeviceSegmentedRadixSort::SortPairs( - nullptr, temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } - PADDLE_ENFORCE_GPU_SUCCESS(err); - - Tensor temp_storage; - temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); - - if (descending) { - err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - temp_storage.data(), temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } else { - err = cub::DeviceSegmentedRadixSort::SortPairs( - temp_storage.data(), temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } - - PADDLE_ENFORCE_GPU_SUCCESS(err); -} - -template -void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO, - const Tensor* indices, Tensor* dX, const IndType num_rows, - const IndType num_cols) { - auto cu_stream = ctx.stream(); - - auto ComputeBlockSize = [](IndType col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - - int block_size = ComputeBlockSize(num_cols); - - int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - // actually, int num_rows < max_grid_size - int grid_size = num_rows < maxGridDimX ? 
num_rows : maxGridDimX; - FillGrad<<>>( - dO->data(), indices->data(), dX->data(), num_rows, - num_cols); -} - -template -void ArgFlattenAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO, - const Tensor* indices, int64_t size, Tensor* dX) { - auto cu_stream = ctx.stream(); - - const int64_t block_size = - std::min(size, static_cast(ctx.GetMaxThreadsPerBlock())); - int64_t max_threads = ctx.GetMaxPhysicalThreadCount(); - const int64_t max_blocks = - std::max(((max_threads - 1) / block_size + 1), static_cast(1)); - const int64_t grid_size = - std::min(max_blocks, (size + block_size - 1) / block_size); - - FillFlattenGrad<<>>( - dO->data(), indices->data(), size, dX->data()); -} - -template -class ArgsortOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = ctx.Attr("axis"); - bool descending = ctx.Attr("descending"); - - auto in_dims = input->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - - const T* in_data = input->data(); - auto size = input->numel(); - T* out_data = output->mutable_data(ctx.GetPlace()); - int64_t* ids_data = indices->mutable_data(ctx.GetPlace()); - - // Use thrust for parallel acceleration when the input size is equal to the - // length of the ā€˜axisā€™ dimension. - // Compared to the following 'Special case for full sort', ascending sort is - // 34 times faster and descending sort is 31 times faster. - if (size == in_dims[axis]) { - thrust::sequence(thrust::device, ids_data, ids_data + size); - thrust::copy(thrust::device, in_data, in_data + size, out_data); - thrust::sort_by_key(thrust::device, out_data, out_data + size, ids_data); - if (descending) { - thrust::reverse(thrust::device, out_data, out_data + size); - thrust::reverse(thrust::device, ids_data, ids_data + size); - } - return; - } - - // Special case for full sort, speedup ~190x. 
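The thrust fast path above applies only when the whole flattened input forms a single sorted segment. A host-side sketch of the same idea using the standard library; note that std::stable_sort keeps ties in original index order, a property the deleted thrust::sort_by_key call does not promise.

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// Flat argsort: sort indices by their values, then gather the values and,
// for descending order, reverse both outputs as the deleted kernel did.
void ArgsortFlat(const std::vector<float>& in, bool descending,
                 std::vector<float>* out, std::vector<int64_t>* ids) {
  ids->resize(in.size());
  std::iota(ids->begin(), ids->end(), int64_t{0});
  std::stable_sort(ids->begin(), ids->end(),
                   [&in](int64_t a, int64_t b) { return in[a] < in[b]; });
  out->resize(in.size());
  for (std::size_t i = 0; i < in.size(); ++i) (*out)[i] = in[(*ids)[i]];
  if (descending) {
    std::reverse(out->begin(), out->end());
    std::reverse(ids->begin(), ids->end());
  }
}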
- if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - ArgFullSort(dev_ctx, input, output, indices, input_height, - input_width, descending); - } else { - // if not full sort, do transpose first - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_inp; - T* trans_inp_data = trans_inp.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - T* out_data = output->mutable_data(ctx.GetPlace()); - - Tensor tmp_indices; - // temp indices for sorting - tmp_indices.mutable_data(trans_dims, ctx.GetPlace()); - indices->mutable_data(ctx.GetPlace()); - - ArgFullSort(dev_ctx, &trans_inp, &tmp_out, &tmp_indices, - input_height, input_width, descending); - - TransCompute( - ndims, dev_ctx, tmp_indices, indices, trans); - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, - output, trans); - return; - } - } -}; - -template -class ArgsortGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* indices = ctx.Input("Indices"); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dO = ctx.Input(framework::GradVarName("Out")); - int axis = ctx.Attr("axis"); - - dX->mutable_data(ctx.GetPlace()); - if (dO->numel() == 0) return; - - auto in_dims = dX->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - - int64_t size = dX->numel(); - const auto& dev_ctx = ctx.cuda_device_context(); - - // Parallel acceleration when the input size is equal to the length of the - // ā€˜axisā€™ dimension. - // Compared to 'special case for full sort' below, the gradient calculation - // is 10 times faster. - if (size == in_dims[axis]) { - ArgFlattenAssign(dev_ctx, dO, indices, size, dX); - return; - } - - // Special case for full sort, speedup ~190x. 
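Both the forward kernel above and the grad kernel below fall back to the same trick when the sort axis is not the innermost one: permute `axis` to the last position, sort contiguous rows, then apply the identical permutation to transpose the results back. A sketch of that permutation; AxisToLastPerm is an illustrative name. Because it is a single swap of axis with the last dimension, the permutation is its own inverse, which is why the deleted code reuses the same trans vector for the back-transpose.

#include <vector>

// Build the axis permutation used by the deleted kernels.
// Precondition: 0 <= axis < rank - 1 (the last-axis case never transposes).
std::vector<int> AxisToLastPerm(int rank, int axis) {
  std::vector<int> perm;
  perm.reserve(rank);
  for (int i = 0; i < axis; ++i) perm.push_back(i);
  perm.push_back(rank - 1);
  for (int i = axis + 1; i < rank - 1; ++i) perm.push_back(i);
  perm.push_back(axis);
  return perm;
}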
- if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - ArgFullAssign(dev_ctx, dO, indices, dX, input_height, - input_width); - } else { - // if not full sort, do transpose first - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, ctx.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - // Do transpose - TransCompute(ndims, dev_ctx, *dO, - &trans_dO, trans); - TransCompute( - ndims, dev_ctx, *indices, &trans_ind, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - - ArgFullAssign(dev_ctx, &trans_dO, &trans_ind, &tmp_out, - input_height, input_width); - - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, dX, - trans); - return; - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - argsort, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - argsort_grad, paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel); diff --git a/paddle/fluid/operators/argsort_op.h b/paddle/fluid/operators/argsort_op.h deleted file mode 100644 index d850e51a4bf06..0000000000000 --- a/paddle/fluid/operators/argsort_op.h +++ /dev/null @@ -1,243 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
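The deleted grad kernels reduce argsort's backward pass to a scatter: column c of a sorted row came from input column indices[r, c], so the incoming gradient is routed back to that position, exactly as FillGrad above does with one thread per element. A CPU sketch; ArgsortGradRows is an illustrative name.

#include <cstdint>
#include <vector>

// Scatter-based argsort gradient, mirroring the deleted FillGrad indexing
// dX[r * num_cols + indices[r * num_cols + c]] = dO[r * num_cols + c].
void ArgsortGradRows(const std::vector<float>& d_out,
                     const std::vector<int64_t>& indices,
                     int64_t num_rows, int64_t num_cols,
                     std::vector<float>* d_x) {
  d_x->assign(static_cast<std::size_t>(num_rows * num_cols), 0.0f);
  for (int64_t r = 0; r < num_rows; ++r)
    for (int64_t c = 0; c < num_cols; ++c)
      (*d_x)[r * num_cols + indices[r * num_cols + c]] =
          d_out[r * num_cols + c];
}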
*/ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; - -template -using EigenVector = framework::EigenVector; - -using Tensor = framework::Tensor; - -template -static void FullSort(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, Type* t_indices, - bool descending) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.push_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.push_back(std::pair(e_input(i, j), j)); - } - } - std::sort(col_vec.begin(), col_vec.end(), - [&](const std::pair& l, const std::pair& r) { - if (descending) - return l.first > r.first; - else - return l.first < r.first; - }); - - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + j] = col_vec[j].first; - t_indices[i * input_width + j] = col_vec[j].second; - } - } -} - -template -static void FullAssign(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, - const framework::Tensor* indices, T* t_out) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = EigenVector::Flatten(*input); - auto e_indices = EigenVector::Flatten(*indices); - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + e_indices(j)] = e_input(j); - } - } else { - auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + e_indices(i, j)] = e_input(i, j); - } - } - } -} - -template -class ArgsortKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = ctx.Attr("axis"); - bool descending = ctx.Attr("descending"); - - auto in_dims = input->dims(); - axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - - T* out_data = output->mutable_data(ctx.GetPlace()); - - // Do full sort - if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - int64_t* ids_data = indices->mutable_data(ctx.GetPlace()); - FullSort(input_height, input_width, in_dims.size(), input, - out_data, ids_data, descending); - } else { - // If not full sort do transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_inp; - trans_inp.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - auto& dev_ctx = ctx.template device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - output->mutable_data(ctx.GetPlace()); - - Tensor tmp_indices; - - auto* t_ind = - tmp_indices.mutable_data(trans_dims, ctx.GetPlace()); - - FullSort(input_height, input_width, in_dims.size(), - &trans_inp, t_out, t_ind, descending); - - indices->mutable_data(ctx.GetPlace()); - TransCompute( - ndims, dev_ctx, tmp_indices, indices, trans); - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, - output, trans); - } - } -}; - -template -class ArgsortGradientKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* indices = ctx.Input("Indices"); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dO = ctx.Input(framework::GradVarName("Out")); - int axis = ctx.Attr("axis"); - - auto in_dims = indices->dims(); - axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto& place = *ctx.template device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - - // Do full assign - if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - FullAssign(input_height, input_width, in_dims.size(), dO, - indices, dX->data()); - } else { - // If not full assign do transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, ctx.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - auto& dev_ctx = ctx.template device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *dO, - &trans_dO, trans); - TransCompute( - ndims, dev_ctx, *indices, &trans_ind, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - - FullAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out); - - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, dX, - trans); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/argsort_op_npu.cc b/paddle/fluid/operators/argsort_op_npu.cc index 077be715bece0..c927eec00bc8b 100644 --- a/paddle/fluid/operators/argsort_op_npu.cc +++ b/paddle/fluid/operators/argsort_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/argsort_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/argsort_op_xpu.cc b/paddle/fluid/operators/argsort_op_xpu.cc index 18e81936a16c6..359b00fcf87ee 100644 --- a/paddle/fluid/operators/argsort_op_xpu.cc +++ b/paddle/fluid/operators/argsort_op_xpu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/argsort_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc index 684ac5bafd0ef..ea6614cbfbdf8 100644 --- a/paddle/fluid/operators/assign_op.cc +++ b/paddle/fluid/operators/assign_op.cc @@ -16,6 +16,9 @@ limitations under the License. 
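For reference, the CPU FullSort being deleted above boils down to sorting one (value, original column) pair vector per row; rows are independent, which is why the original wrapped the outer loop in #pragma omp parallel for under MKLML. A condensed single-row sketch:

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Sort one row and emit both the sorted values and the original columns.
void FullSortRow(const float* row, int64_t width, bool descending,
                 float* out_vals, int64_t* out_ids) {
  std::vector<std::pair<float, int64_t>> col_vec(width);
  for (int64_t j = 0; j < width; ++j) col_vec[j] = {row[j], j};
  std::sort(col_vec.begin(), col_vec.end(),
            [descending](const std::pair<float, int64_t>& l,
                         const std::pair<float, int64_t>& r) {
              return descending ? l.first > r.first : l.first < r.first;
            });
  for (int64_t j = 0; j < width; ++j) {
    out_vals[j] = col_vec[j].first;
    out_ids[j] = col_vec[j].second;
  }
}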
*/ #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { class OpDesc; @@ -36,26 +39,6 @@ class AssignOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - if (ctx->HasInput("X")) { - auto type = ctx->GetInputsVarType("X")[0]; - if (type == framework::proto::VarType::SELECTED_ROWS || - type == framework::proto::VarType::LOD_TENSOR) { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Out"); - } - } else if (type == framework::proto::VarType::LOD_TENSOR_ARRAY) { - if (ctx->IsRuntime()) { - // The runtime output shape is determined in kernel. - return; - } else { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } - } - } - } - protected: framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, const framework::Tensor &tensor, @@ -91,24 +74,6 @@ class AssignInferVarType : public framework::VarTypeInference { } }; -class AssignKernel { - public: - void operator()(const framework::ExecutionContext &ctx) const { - auto *x = ctx.InputVar("X"); - if (x == nullptr) { - return; - } - PADDLE_ENFORCE_EQ( - ctx.HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of assign_op is not found.")); - auto *out = ctx.OutputVar("Out"); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(ctx.GetPlace()); - - framework::VisitVarType(*x, AssignFunctor(out, dev_ctx)); - } -}; - class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -147,23 +112,11 @@ DECLARE_INPLACE_OP_INFERER(AssignOpInplaceInferer, {"X", "Out"}); namespace ops = paddle::operators; namespace plat = paddle::platform; + +DECLARE_INFER_SHAPE_FUNCTOR(assign, AssignInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(assign, ops::AssignOp, ops::AssignGradMaker, ops::AssignGradMaker, ops::AssignOpProtoMaker, ops::AssignOpInplaceInferer, - ops::AssignInferVarType); - -REGISTER_OP_CPU_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double, - ops::AssignKernel, int, ops::AssignKernel, - int64_t, ops::AssignKernel, uint8_t, - ops::AssignKernel, bool, ops::AssignKernel, - plat::float16, ops::AssignKernel, plat::bfloat16, - ops::AssignKernel); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double, - ops::AssignKernel, int, ops::AssignKernel, - int64_t, ops::AssignKernel, uint8_t, - ops::AssignKernel, bool, ops::AssignKernel, - plat::float16, ops::AssignKernel); -#endif + ops::AssignInferVarType, AssignInferShapeFunctor); diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc index 72488a932d9c3..b91eb50646fec 100644 --- a/paddle/fluid/operators/assign_op_npu_test.cc +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -23,14 +23,13 @@ limitations under the License. 
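assign is the clearest instance of the pattern this PR applies throughout: the hand-written InferShape is deleted and DECLARE_INFER_SHAPE_FUNCTOR binds the operator to phi::UnchangedInferMeta, so static-graph shape inference and the phi kernel share a single rule. A toy model of that layering with simplified, illustrative stand-ins (MetaTensor here is not Paddle's real class, and the real InferMeta signatures also carry attributes such as axis):

#include <cstdint>
#include <vector>

// Abstract metadata the shared rule manipulates; the framework adapts this
// to both compile-time VarDesc and runtime DenseTensor in the real code.
struct MetaTensor {
  std::vector<int64_t> dims;
};

// phi::UnchangedInferMeta-style rule: the output mirrors its input.
void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) {
  out->dims = x.dims;
}

// phi::ArgsortInferMeta-style rule: Out and Indices both share X's shape.
void ArgsortInferMeta(const MetaTensor& x, MetaTensor* out,
                      MetaTensor* indices) {
  out->dims = x.dims;
  indices->dims = x.dims;
}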
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(assign); +USE_OP_ITSELF(assign); USE_OP_DEVICE_KERNEL(assign, NPU); template diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 174207deb08b8..5194c8772e47b 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -21,6 +21,9 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/multiary.h" + namespace paddle { namespace operators { @@ -297,184 +300,6 @@ The required data format for this layer is one of the following: )DOC"); } -template -class BatchNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - float momentum = ctx.Attr("momentum"); - const bool is_test = ctx.Attr("is_test"); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool trainable_stats = ctx.Attr("trainable_statistics"); - bool test_mode = is_test && (!trainable_stats); - - bool global_stats = test_mode || use_global_stats; - - const std::string data_layout_str = ctx.Attr("data_layout"); - DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_GE( - x_dims.size(), 2, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be larger than 1." - "But received: the size of input X's dimensions is [%d]", - x_dims.size())); - PADDLE_ENFORCE_LE( - x_dims.size(), 5, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be less than 6." - "But received: the size of input X's dimensionss is [%d]", - x_dims.size())); - const int N = x_dims[0]; - const int C = - (data_layout == DataLayout::kNCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); - const int sample_size = x->numel() / N / C; - - auto *y = ctx.Output("Y"); - - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - - // alloc memory - y->mutable_data(ctx.GetPlace()); - mean_out->mutable_data(ctx.GetPlace()); - variance_out->mutable_data(ctx.GetPlace()); - saved_mean->mutable_data(ctx.GetPlace()); - saved_variance->mutable_data(ctx.GetPlace()); - - // input dimension is 2 and the format is NCHW. 
The input can be regarded - // as NHWC format - if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { - data_layout = DataLayout::kNHWC; - } - - if (!global_stats) { - // saved_xx is use just in this batch of data - EigenVectorArrayMap saved_mean_e( - saved_mean->mutable_data(ctx.GetPlace()), C); - EigenVectorArrayMap saved_variance_e( - saved_variance->mutable_data(ctx.GetPlace()), C); - saved_mean_e.setZero(); - saved_variance_e.setZero(); - - EigenVectorArrayMap running_mean_arr( - mean_out->mutable_data(ctx.GetPlace()), C); - EigenVectorArrayMap running_var_arr( - variance_out->mutable_data(ctx.GetPlace()), C); - - if ((N * sample_size) == 1) { - // Only 1 element in normalization dimension, - // we skip the batch norm calculation, let y = x. - framework::TensorCopy(*x, ctx.GetPlace(), y); - return; - } - - switch (data_layout) { - case DataLayout::kNCHW: { - ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); - for (int nc = 0; nc < N * C; ++nc) { - saved_mean_e(nc % C) += x_arr.col(nc).sum(); - } - saved_mean_e /= N * sample_size; - for (int nc = 0; nc < N * C; ++nc) { - saved_variance_e(nc % C) += - (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm(); - } - saved_variance_e /= N * sample_size; - break; - } - case DataLayout::kNHWC: { - ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); - for (int i = 0; i < N * sample_size; ++i) { - saved_mean_e += x_arr.col(i); - } - saved_mean_e /= N * sample_size; - for (int i = 0; i < N * sample_size; ++i) { - saved_variance_e += - (x_arr.col(i) - saved_mean_e) * (x_arr.col(i) - saved_mean_e); - } - saved_variance_e /= N * sample_size; - break; - } - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Unknown storage order: %s", data_layout_str)); - } - - // if MomentumTensor is set, use MomentumTensor value, momentum - // is only used in this training branch - if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); - momentum = mom_tensor->data()[0]; - } - - running_mean_arr = - running_mean_arr * momentum + saved_mean_e * (1. - momentum); - running_var_arr = - running_var_arr * momentum + saved_variance_e * (1. - momentum); - } - - // use SavedMean and SavedVariance to do normalize - Eigen::Array inv_std(C); - if (global_stats) { - ConstEigenVectorArrayMap var_arr( - ctx.Input("Variance")->data(), C); - inv_std = (var_arr + epsilon).sqrt().inverse(); - } else { - EigenVectorArrayMap saved_inv_std( - ctx.Output("SavedVariance")->data(), C); - // inverse SavedVariance first, gradient will use it too. - saved_inv_std = (saved_inv_std + epsilon).inverse().sqrt(); - inv_std = saved_inv_std; - } - ConstEigenVectorArrayMap mean_arr( - global_stats ? 
ctx.Input("Mean")->data() - : ctx.Output("SavedMean")->data(), - C); - - // ((x - est_mean) * (inv_var) * scale + bias - // formula transform ====> - // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - ConstEigenVectorArrayMap scale_arr(scale->data(), C); - ConstEigenVectorArrayMap bias_arr(bias->data(), C); - Eigen::Array new_scale = inv_std * scale_arr; - Eigen::Array new_bias = - bias_arr - mean_arr * inv_std * scale_arr; - - switch (data_layout) { - case DataLayout::kNCHW: { - EigenArrayMap y_arr(y->mutable_data(ctx.GetPlace()), sample_size, - N * C); - ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); - for (int nc = 0; nc < N * C; ++nc) { - y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C); - } - break; - } - case DataLayout::kNHWC: { - EigenArrayMap(y->mutable_data(ctx.GetPlace()), C, - N * sample_size) = - (ConstEigenArrayMap(x->data(), C, N * sample_size).colwise() * - new_scale) - .colwise() + - new_bias; - break; - } - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Unknown storage order: %d", data_layout)); - } - } -}; - void BatchNormGradOp::InferShape(framework::InferShapeContext *ctx) const { // check input OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "BatchNormGrad"); @@ -585,261 +410,6 @@ framework::OpKernelType BatchNormGradOp::GetKernelTypeForVar( tensor.place(), tensor.layout()); } -template -class BatchNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *saved_mean = ctx.Input("SavedMean"); - // SavedVariance have been reverted in forward operator - const auto *saved_inv_variance = ctx.Input("SavedVariance"); - const std::string data_layout_str = ctx.Attr("data_layout"); - bool use_global_stats = ctx.Attr("use_global_stats"); - const bool is_test = ctx.Attr("is_test"); - const float epsilon = ctx.Attr("epsilon"); - DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - use_global_stats = is_test || use_global_stats; - - // batch_norm with inplace as false will take X as grad input, which - // is same as cuDNN batch_norm backward calculation, batch_norm - // with inplace as true only take Y as input and X should be calculate - // by inverse operation of batch_norm on Y - const Tensor *x; - bool is_inplace; - if (ctx.HasInput("Y")) { - x = ctx.Input("Y"); - is_inplace = true; - // if the input of batch norm is stop_gradient, d_x is null. - if (d_x) { - PADDLE_ENFORCE_EQ(d_x, d_y, - platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD not inplace in inplace mode")); - } - } else { - x = ctx.Input("X"); - is_inplace = false; - if (d_x) { - PADDLE_ENFORCE_NE( - d_x, d_y, platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD inplaced in non-inplace mode")); - } - } - - // Get the size for each dimension. - // NCHW [batch_size, in_channels, in_height, in_width] - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_GE( - x_dims.size(), 2, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be larger than 1." 
- "But received: the size of input X's dimensions is [%d]", - x_dims.size())); - PADDLE_ENFORCE_LE( - x_dims.size(), 5, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be less than 6." - "But received: the size of input X's dimensions is [%d]", - x_dims.size())); - const int N = x_dims[0]; - const int C = - (data_layout == DataLayout::kNCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); - const int sample_size = x->numel() / N / C; - - // input dimension is 2 and the format is NCHW. The input can be regarded as - // NHWC format - if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { - data_layout = DataLayout::kNHWC; - } - - // init output - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - } - - const T *mean_data = saved_mean->data(); - const T *inv_var_data = saved_inv_variance->data(); - Tensor inv_var_tensor; - if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); - mean_data = running_mean->data(); - inv_var_tensor.Resize({C}); - T *running_inv_var_data = inv_var_tensor.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); - ConstEigenVectorArrayMap var_arr(running_variance->data(), C); - - inv_var_tmp = (var_arr + epsilon).sqrt().inverse(); - inv_var_data = running_inv_var_data; - } - - ConstEigenVectorArrayMap scale_arr(scale->data(), C); - ConstEigenVectorArrayMap bias_arr(bias->data(), C); - ConstEigenVectorArrayMap mean_arr(mean_data, C); - ConstEigenVectorArrayMap inv_var_arr(inv_var_data, C); - - T *d_bias_data = nullptr; - T *d_scale_data = nullptr; - if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); - d_bias_data = d_bias->mutable_data(ctx.GetPlace()); - d_scale_data = d_scale->mutable_data(ctx.GetPlace()); - } - - // d_bias = np.sum(d_y, axis=0) - // d_scale = np.sum((X - mean) / inv_std * dy, axis=0) - // d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0) - // - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0)) - EigenVectorArrayMap d_bias_arr(d_bias_data, C); - EigenVectorArrayMap d_scale_arr(d_scale_data, C); - - if (d_scale && d_bias) { - d_bias_arr.setZero(); - d_scale_arr.setZero(); - } - - if (d_x && (N * sample_size) == 1 && !use_global_stats) { - framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); - return; - } - - int scale_coefff = use_global_stats ? 
1 : N * sample_size; - const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coefff; - - Tensor dy_sum; - dy_sum.Resize({C}); - dy_sum.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap dy_sum_arr(dy_sum.mutable_data(ctx.GetPlace()), - C); - - Tensor dy_mul_x_sub_mean_mul_invstd_sum; - dy_mul_x_sub_mean_mul_invstd_sum.Resize({C}); - dy_mul_x_sub_mean_mul_invstd_sum.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap dy_mul_x_sub_mean_mul_invstd_sum_arr( - dy_mul_x_sub_mean_mul_invstd_sum.mutable_data(ctx.GetPlace()), C); - - dy_sum_arr.setZero(); - dy_mul_x_sub_mean_mul_invstd_sum_arr.setZero(); - - // inplace calculation - // Y: ((x - est_mean) * (inv_var) * scale + bias - // formula transform ====> - // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) - // X: (y - bias) / scale / (inv_var) + est_mean - // formula transform ====> - // (y - bias) / (scale * inv_var) + est_mean - switch (data_layout) { - case DataLayout::kNCHW: { - if (is_inplace) { - auto px = *x; - EigenArrayMap x_data(px.mutable_data(ctx.GetPlace()), - sample_size, N * C); - ConstEigenArrayMap y_data(x->data(), sample_size, N * C); - for (int nc = 0; nc < N * C; ++nc) { - x_data.col(nc) = (y_data.col(nc) - bias_arr(nc % C)) / - scale_inv_var_nhw(nc % C) / scale_coefff + - mean_arr(nc % C); - } - } - ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); - ConstEigenArrayMap d_y_arr(d_y->data(), sample_size, N * C); - - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - dy_sum_arr(c) += d_y_arr.col(nc).sum(); - dy_mul_x_sub_mean_mul_invstd_sum_arr(c) += - ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc)) - .sum(); - } - - if (d_scale && d_bias) { - d_bias_arr = dy_sum_arr; - d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr; - } - - if (d_x) { - EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), - sample_size, N * C); - if (!use_global_stats) { - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - d_x_arr.col(nc) = - scale_inv_var_nhw(c) * - (d_y_arr.col(nc) * N * sample_size - dy_sum_arr(c) - - (x_arr.col(nc) - mean_arr[c]) * - dy_mul_x_sub_mean_mul_invstd_sum_arr(c) * - inv_var_arr(c)); - } - } else { - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - d_x_arr.col(nc) = scale_inv_var_nhw(c) * d_y_arr.col(nc); - } - } - } - break; - } - case DataLayout::kNHWC: { - if (is_inplace) { - auto px = *x; - EigenArrayMap x_data(px.mutable_data(ctx.GetPlace()), C, - N * sample_size); - ConstEigenArrayMap y_data(x->data(), C, N * sample_size); - for (int nhw = 0; nhw < N * sample_size; nhw++) { - x_data.col(nhw) = (y_data.col(nhw) - bias_arr) / scale_inv_var_nhw / - scale_coefff + - mean_arr; - } - } - ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); - ConstEigenArrayMap d_y_arr(d_y->data(), C, N * sample_size); - - for (int nhw = 0; nhw < N * sample_size; ++nhw) { - dy_sum_arr += d_y_arr.col(nhw); - dy_mul_x_sub_mean_mul_invstd_sum_arr += - (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw); - } - - if (d_scale && d_bias) { - d_bias_arr = dy_sum_arr; - d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr; - } - - if (d_x) { - EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), C, - N * sample_size); - if (!use_global_stats) { - for (int nhw = 0; nhw < N * sample_size; ++nhw) { - d_x_arr.col(nhw) = - scale_inv_var_nhw * - (d_y_arr.col(nhw) * N * sample_size - dy_sum_arr - - (x_arr.col(nhw) - mean_arr) * - dy_mul_x_sub_mean_mul_invstd_sum_arr * inv_var_arr); - } - } else { - for (int nhw = 0; nhw < N * sample_size; ++nhw) { - d_x_arr.col(nhw) = 
scale_inv_var_nhw * d_y_arr.col(nhw); - } - } - } - break; - } - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Unknown storage order: %s", data_layout_str)); - } - } -}; - template void BatchNormGradMaker::Apply(GradOpPtr op) const { op->SetType(this->ForwardOpType() + "_grad"); @@ -951,335 +521,16 @@ framework::OpKernelType BatchNormDoubleGradOp::GetExpectedKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } -template -class BatchNormDoubleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *X = ctx.Input("X"); - const auto *Scale = ctx.Input("Scale"); - const auto *dY = ctx.Input("DY"); - const auto *Saved_mean = ctx.Input("SavedMean"); - const auto *Saved_variance = ctx.Input("SavedVariance"); - const float epsilon = ctx.Attr("epsilon"); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool is_test = ctx.Attr("is_test"); - - PADDLE_ENFORCE_EQ( - is_test, false, - platform::errors::InvalidArgument( - "`is_test = True` CANNOT be used in train program. If " - "you want to use global status in pre_train model, " - "please set `use_global_stats = True`")); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - - const auto *ddX = ctx.Input("DDX"); - const auto *ddScale = ctx.Input("DDScale"); - const auto *ddBias = ctx.Input("DDBias"); - - auto *dX = ctx.Output("DX"); - auto *dScale = ctx.Output("DScale"); - auto *ddY = ctx.Output("DDY"); - dX->mutable_data(ctx.GetPlace()); - ddY->mutable_data(ctx.GetPlace()); - - auto &dev_ctx = ctx.template device_context(); - - const auto &x_dims = X->dims(); - const int C = - (data_layout == DataLayout::kNCHW ? 
x_dims[1] - : x_dims[x_dims.size() - 1]); - const int sample_size = X->numel() / C; - phi::funcs::SetConstant set_constant; - - const T *mean_data = Saved_mean->data(); - const T *inv_var_data = Saved_variance->data(); - - Tensor inv_var_tensor; - if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); - mean_data = running_mean->data(); - inv_var_tensor.Resize({C}); - - T *running_inv_var_data = inv_var_tensor.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); - ConstEigenVectorArrayMap var_arr(running_variance->data(), C); - - inv_var_tmp = (var_arr + epsilon).sqrt().inverse(); - inv_var_data = running_inv_var_data; - } - - // transpose NCHW -> NHWC for easy calculate - Tensor transformed_x(X->type()); - Tensor transformed_dy(dY->type()); - Tensor transformed_ddx(ddX->type()); - - Tensor transformed_dx(dX->type()); - Tensor transformed_ddy(ddY->type()); - if (data_layout == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; - // Input Tensor - ResizeToChannelLast(ctx, X, - &transformed_x); - TransToChannelLast(ctx, X, &transformed_x); - ResizeToChannelLast(ctx, dY, - &transformed_dy); - TransToChannelLast(ctx, dY, - &transformed_dy); - ResizeToChannelLast(ctx, ddX, - &transformed_ddx); - TransToChannelLast(ctx, ddX, - &transformed_ddx); - // Output Tensor - ResizeToChannelLast(ctx, dX, - &transformed_dx); - ResizeToChannelLast(ctx, ddY, - &transformed_ddy); - } else { - transformed_x.ShareDataWith(*X); - transformed_dy.ShareDataWith(*dY); - transformed_ddx.ShareDataWith(*ddX); - - transformed_dx.ShareDataWith(*dX); - transformed_ddy.ShareDataWith(*ddY); - } - - ConstEigenArrayMap x_arr(transformed_x.data(), C, sample_size); - ConstEigenVectorArrayMap mean_arr(mean_data, C); - ConstEigenVectorArrayMap inv_var_arr(inv_var_data, C); - - Tensor mean_tile; - mean_tile.Resize({C, sample_size}); - mean_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap mean_tile_data(mean_tile.mutable_data(ctx.GetPlace()), - C, sample_size); - - Tensor inv_var_tile; - inv_var_tile.Resize({C, sample_size}); - inv_var_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap inv_var_tile_data( - inv_var_tile.mutable_data(ctx.GetPlace()), C, sample_size); - - mean_tile_data = mean_arr.replicate(1, sample_size); - inv_var_tile_data = inv_var_arr.replicate(1, sample_size); - - Tensor Scale_data; - if (!Scale) { - Scale_data.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &Scale_data, static_cast(1)); - } - ConstEigenVectorArrayMap scale_arr( - Scale ? 
Scale->data() : Scale_data.data(), C); - - Tensor scale_tile; - scale_tile.Resize({C, sample_size}); - scale_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap scale_tile_data(scale_tile.mutable_data(ctx.GetPlace()), - C, sample_size); - scale_tile_data = scale_arr.replicate(1, sample_size); - - ConstEigenArrayMap dy_arr(transformed_dy.data(), C, sample_size); - ConstEigenArrayMap ddx_arr(transformed_ddx.data(), C, sample_size); - - Tensor x_sub_mean_mul_invstd; - x_sub_mean_mul_invstd.Resize({C, sample_size}); - x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()); - EigenArrayMap x_sub_mean_mul_invstd_arr( - x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()), C, sample_size); - x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data; - - if (dX) { - dX->mutable_data(ctx.GetPlace()); - EigenArrayMap dx_arr(transformed_dx.mutable_data(ctx.GetPlace()), C, - sample_size); - dx_arr.setZero(); - if (use_global_stats) { - // math: dx = (ddscale * dy) * inv_var - if (ddScale) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({C, sample_size}); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddscale_tile_data = ddscale_arr.replicate(1, sample_size); - - dx_arr = dy_arr * ddscale_tile_data * inv_var_tile_data; - } - } else { - // math: dx = scale * ((x - mean) * inv_var / NxHxW * (np.mean(ddx, - // axis=(n,h,w)) * - // np.sum(dy, axis=(n,h,w)) - - // np.sum(dy * ddx, axis=(n,h,w)) + 3 * np.mean(dy * (x - - // mean), - // axis=(n,h,w)) * inv_var.pow(2) * - // np.sum(ddx * (x - mean), axis=(n,h,w))) + inv_var.pow(3) / - // NxHxW * - // np.sum(ddx * (x - mean)) * - // (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW * - // np.sum(dy, - // axis=(n,h,w)) * (x - mean) * - // (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var - - // inv_var - // * - // np.mean(dy, axis=(n,h,w)) - - // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), - // axis=(n,h,w))) - - if (ddX) { - dx_arr += - (x_sub_mean_mul_invstd_arr * inv_var_tile_data * - inv_var_tile_data / sample_size) - .colwise() * - (ddx_arr.rowwise().sum() * dy_arr.rowwise().sum() / sample_size - - (dy_arr * ddx_arr).rowwise().sum() + - 3. 
* (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() * - (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / - sample_size); - - dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() * - (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / - sample_size * - (dy_arr.rowwise().sum() / sample_size - dy_arr); - - dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() * - (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / - sample_size * - (ddx_arr.rowwise().sum() / sample_size - ddx_arr); - - dx_arr = scale_tile_data * dx_arr; - } - if (ddScale) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({C, sample_size}); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddscale_tile_data = ddscale_arr.replicate(1, sample_size); - - dx_arr += (dy_arr * inv_var_tile_data - - (dy_arr.rowwise().sum().replicate(1, sample_size) / - sample_size) * - inv_var_tile_data - - x_sub_mean_mul_invstd_arr * inv_var_tile_data * - (dy_arr * x_sub_mean_mul_invstd_arr) - .rowwise() - .sum() - .replicate(1, sample_size) / - sample_size) * - ddscale_tile_data; - } - } - if (data_layout == DataLayout::kNCHW) { - VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; - TransToChannelFirst( - ctx, &transformed_dx, dX); - } - } - if (dScale) { - dScale->mutable_data(ctx.GetPlace()); - EigenVectorArrayMap dscale_arr(dScale->mutable_data(ctx.GetPlace()), - C); - dscale_arr.setZero(); - if (use_global_stats) { - // math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var - if (ddX) { - dscale_arr = (ddx_arr * dy_arr * inv_var_tile_data).rowwise().sum(); - } - } else { - // math: dscale = inv_var * (dy - np.mean(dy, axis=(n,h,w) - (x-mean) * - // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) * - // ddx - if (ddX) { - Tensor first_grad; - first_grad.Resize({C, sample_size}); - EigenArrayMap first_grad_arr( - first_grad.mutable_data(ctx.GetPlace()), C, sample_size); - first_grad_arr.setZero(); - - first_grad_arr += - inv_var_tile_data * - (dy_arr - - dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size - - x_sub_mean_mul_invstd_arr * - (dy_arr * x_sub_mean_mul_invstd_arr) - .rowwise() - .sum() - .replicate(1, sample_size) / - sample_size); - dscale_arr = (first_grad_arr * ddx_arr).rowwise().sum(); - } - } - } - - if (ddY) { - ddY->mutable_data(ctx.GetPlace()); - EigenArrayMap ddy_arr(transformed_ddy.mutable_data(ctx.GetPlace()), - C, sample_size); - ddy_arr.setZero(); - if (use_global_stats) { - // math: ddy = r * ddx * inv_var + ddbias + - // ddscale * (x - mean) * inv_var - if (ddX) { - ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data; - } - } else { - // math: ddy = (x - mean) * inv_var * ddscale + ddbias + - // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * - // np.mean(ddx * (x - mean), axis=(n,h,w))) - if (ddX) { - ddy_arr += - scale_tile_data * inv_var_tile_data * - (ddx_arr - - ddx_arr.rowwise().sum().replicate(1, sample_size) / sample_size - - x_sub_mean_mul_invstd_arr * - (ddx_arr * x_sub_mean_mul_invstd_arr) - .rowwise() - .sum() - .replicate(1, sample_size) / - sample_size); - } - } - if (ddScale) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({C, sample_size}); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddscale_tile_data = ddscale_arr.replicate(1, sample_size); - - ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; - } - - if (ddBias) 
{ - ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); - Tensor ddbias_tile; - ddbias_tile.Resize({C, sample_size}); - EigenArrayMap ddbias_tile_data( - ddbias_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddbias_tile_data = ddbias_arr.replicate(1, sample_size); - - ddy_arr += ddbias_tile_data; - } - - if (data_layout == DataLayout::kNCHW) { - VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; - TransToChannelFirst( - ctx, &transformed_ddy, ddY); - } - } - } -}; - DECLARE_INPLACE_OP_INFERER(BatchNormDoubleGradOpInplaceInferer, {"DY", "DDY"}); } // namespace operators } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(batch_norm, BatchNormInferShapeFunctor, + PD_INFER_META(phi::BatchNormInferMeta)); + REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, ops::BatchNormOpInferVarType, ops::BatchNormGradMaker, diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index f8d37d685b929..d274e8d2c006d 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -113,23 +113,5 @@ class BatchNormOpInferVarType } }; -template -class BatchNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - -template -class BatchNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - -template -class BatchNormDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/batch_norm_op_mlu.cc b/paddle/fluid/operators/batch_norm_op_mlu.cc index 0e64b461786cc..6507890a8b5dc 100644 --- a/paddle/fluid/operators/batch_norm_op_mlu.cc +++ b/paddle/fluid/operators/batch_norm_op_mlu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { @@ -20,6 +21,8 @@ namespace operators { template class MLUBatchNormOpKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + public: void Compute(const framework::ExecutionContext &ctx) const override { const auto &place = ctx.GetPlace(); @@ -68,10 +71,10 @@ class MLUBatchNormOpKernel : public framework::OpKernel { // alloc memory y->mutable_data(place); - mean_out->mutable_data(place); - variance_out->mutable_data(place); - saved_mean->mutable_data(place); - saved_variance->mutable_data(place); + mean_out->mutable_data(place); + variance_out->mutable_data(place); + saved_mean->mutable_data(place); + saved_variance->mutable_data(place); Tensor transformed_x; Tensor transformed_y; @@ -132,6 +135,8 @@ class MLUBatchNormOpKernel : public framework::OpKernel { template class MLUBatchNormGradOpKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *x = ctx.Input("X"); @@ -154,10 +159,10 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); auto d_x_tmp = ctx.AllocateTmpTensor(x->dims(), dev_ctx); - auto scale_grad_tmp = - ctx.AllocateTmpTensor(scale->dims(), dev_ctx); + auto scale_grad_tmp = ctx.AllocateTmpTensor( + scale->dims(), dev_ctx); auto bias_grad_tmp = - ctx.AllocateTmpTensor(bias->dims(), dev_ctx); + ctx.AllocateTmpTensor(bias->dims(), dev_ctx); if (d_x == nullptr) { d_x = &d_x_tmp; @@ -171,8 +176,8 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel { const auto &place = ctx.GetPlace(); d_x->mutable_data(place); - d_scale->mutable_data(place); - d_bias->mutable_data(place); + d_scale->mutable_data(place); + d_bias->mutable_data(place); use_global_stats = is_test || use_global_stats; diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc index a70b6e991161d..ae03ecbcb16a0 100644 --- a/paddle/fluid/operators/batch_norm_op_npu.cc +++ b/paddle/fluid/operators/batch_norm_op_npu.cc @@ -76,10 +76,10 @@ class NPUBatchNormOpKernel : public framework::OpKernel { auto *variance_out = ctx.Output("VarianceOut"); auto *saved_mean = ctx.Output("SavedMean"); auto *saved_variance = ctx.Output("SavedVariance"); - mean_out->mutable_data(ctx.GetPlace()); - variance_out->mutable_data(ctx.GetPlace()); - saved_mean->mutable_data(ctx.GetPlace()); - saved_variance->mutable_data(ctx.GetPlace()); + mean_out->mutable_data(ctx.GetPlace()); + variance_out->mutable_data(ctx.GetPlace()); + saved_mean->mutable_data(ctx.GetPlace()); + saved_variance->mutable_data(ctx.GetPlace()); // if MomentumTensor is set, use MomentumTensor value, momentum // is only used in this training branch @@ -170,8 +170,8 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel { auto stream = ctx.template device_context().stream(); if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); if (use_global_stats) { const auto *running_mean = ctx.Input("Mean"); const auto *running_variance = ctx.Input("Variance"); diff --git a/paddle/fluid/operators/cholesky_solve_op.cc b/paddle/fluid/operators/cholesky_solve_op.cc index 6b5bae8fc73fe..5403e2440ee58 
100644 --- a/paddle/fluid/operators/cholesky_solve_op.cc +++ b/paddle/fluid/operators/cholesky_solve_op.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/cholesky_solve_op.h" -#include "paddle/fluid/operators/solve_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -39,50 +40,6 @@ class CholeskySolveOpMaker : public framework::OpProtoAndCheckerMaker { class CholeskySolveOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *context) const override { - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "CholeskySolve"); - OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", "CholeskySolve"); - OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "CholeskySolve"); - auto u_dims = context->GetInputDim("Y"); - auto b_dims = context->GetInputDim("X"); - int u_rank = u_dims.size(); - int b_rank = b_dims.size(); - PADDLE_ENFORCE_GE(u_rank, 2, - platform::errors::InvalidArgument( - "the rank of input Y must greater or equal to 2")); - PADDLE_ENFORCE_GE(b_rank, 2, - platform::errors::InvalidArgument( - "the rank of input X must greater or equal to 2")); - PADDLE_ENFORCE_EQ(u_dims[u_rank - 1], u_dims[u_rank - 2], - platform::errors::InvalidArgument( - "input Matrix Y should be square matrix," - "But Got last shape of %ld x %ld", - u_dims[u_rank - 1], u_dims[u_rank - 2])); - PADDLE_ENFORCE_EQ( - b_dims[b_rank - 2], u_dims[u_rank - 2], - platform::errors::InvalidArgument( - "the first dim of input X must equal to the dim of input Y," - "But Got %ld and %ld", - b_dims[b_rank - 2], u_dims[u_rank - 2])); - - std::vector u_dims_vec = phi::vectorize(u_dims); - std::vector b_dims_vec = phi::vectorize(b_dims); - - std::vector u_dims_vec_cut(u_dims_vec.begin(), - u_dims_vec.end() - 2); - std::vector b_dims_vec_cut(b_dims_vec.begin(), - b_dims_vec.end() - 2); - - std::vector expand_batch_portion = - get_broadcast_batch_portion(u_dims_vec_cut, b_dims_vec_cut); - - std::vector b_broadcast_dims({expand_batch_portion}); - b_broadcast_dims.insert(b_broadcast_dims.end(), - {b_dims_vec[b_rank - 2], b_dims_vec[b_rank - 1]}); - - // dim of 'Out' is the same with 'Y' after broadcast - context->SetOutputDim("Out", phi::make_ddim(b_broadcast_dims)); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -151,22 +108,15 @@ class CholeskySolveGradOp : public framework::OperatorWithKernel { } // namespace operators } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(cholesky_solve, CholeskySolveInferShapeFunctor, + PD_INFER_META(phi::CholeskySolveInferMeta)); + REGISTER_OPERATOR(cholesky_solve, ops::CholeskySolveOp, ops::CholeskySolveOpMaker, ops::CholeskySolveOpVarTypeInference, ops::CholeskySolveOpGradMaker, - ops::CholeskySolveOpGradMaker); + ops::CholeskySolveOpGradMaker, + CholeskySolveInferShapeFunctor); REGISTER_OPERATOR(cholesky_solve_grad, ops::CholeskySolveGradOp); - -REGISTER_OP_CPU_KERNEL( - cholesky_solve, - ops::CholeskySolveKernel, - ops::CholeskySolveKernel); - -REGISTER_OP_CPU_KERNEL( - cholesky_solve_grad, - ops::CholeskySolveGradKernel, - ops::CholeskySolveGradKernel); -// Complex<> is not supported because of TensorExpand, which used 
to boardcast -// input Tensor diff --git a/paddle/fluid/operators/cholesky_solve_op.cu b/paddle/fluid/operators/cholesky_solve_op.cu deleted file mode 100644 index 1b551a7cd0343..0000000000000 --- a/paddle/fluid/operators/cholesky_solve_op.cu +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef PADDLE_WITH_HIP -// HIP not support cusolver - -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/cholesky_solve_op.h" -#include "paddle/fluid/platform/dynload/cusolver.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using CUDADeviceContext = paddle::platform::CUDADeviceContext; - -template -void cusolver_potrs(const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, - int n, int nrhs, T *Adata, int lda, T *Bdata, int ldb, - int *devInfo); - -template <> -void cusolver_potrs(const cusolverDnHandle_t &cusolverH, - cublasFillMode_t uplo, int n, int nrhs, float *Adata, - int lda, float *Bdata, int ldb, int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSpotrs( - cusolverH, uplo, n, nrhs, Adata, lda, Bdata, ldb, devInfo)); -} - -template <> -void cusolver_potrs(const cusolverDnHandle_t &cusolverH, - cublasFillMode_t uplo, int n, int nrhs, - double *Adata, int lda, double *Bdata, int ldb, - int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDpotrs( - cusolverH, uplo, n, nrhs, Adata, lda, Bdata, ldb, devInfo)); -} - -template <> -void cusolver_potrs>( - const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, int n, int nrhs, - platform::complex *Adata, int lda, platform::complex *Bdata, - int ldb, int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnCpotrs( - cusolverH, uplo, n, nrhs, reinterpret_cast(Adata), lda, - reinterpret_cast(Bdata), ldb, devInfo)); -} - -template <> -void cusolver_potrs>( - const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, int n, int nrhs, - platform::complex *Adata, int lda, platform::complex *Bdata, - int ldb, int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnZpotrs( - cusolverH, uplo, n, nrhs, - reinterpret_cast(Adata), lda, - reinterpret_cast(Bdata), ldb, devInfo)); -} - -template -class CholeskySolveFunctor { - public: - void operator()(const platform::CUDADeviceContext &dev_ctx, bool upper, int n, - int nrhs, T *Adata, int lda, T *Bdata, int *devInfo) { - cublasFillMode_t uplo = - upper ? 
CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; - - /* step 1: get cusolver handle*/ - auto cusolverH = dev_ctx.cusolver_dn_handle(); - - /* step 2: solve A0*X0 = B0 */ - cusolver_potrs(cusolverH, uplo, n, nrhs, Adata, lda, Bdata, lda, - devInfo); - - PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); - } -}; - -template -class MatrixReduceSumFunctor { - public: - void operator()(const Tensor &in, Tensor *out, - const framework::ExecutionContext &ctx) { - // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] - // out_reduce_dim should be [0, 2] - const std::vector in_dims = phi::vectorize(in.dims()); - auto in_size = in_dims.size(); - const std::vector out_dims = phi::vectorize(out->dims()); - auto out_size = out_dims.size(); - - std::vector out_bst_dims(in_size); - - std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1); - std::copy(out_dims.data(), out_dims.data() + out_size, - out_bst_dims.data() + in_size - out_size); - - std::vector out_reduce_dims; - for (size_t idx = 0; idx <= in_size - 3; idx++) { - if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) { - out_reduce_dims.push_back(idx); - } - } - gpuStream_t stream = ctx.cuda_device_context().stream(); - TensorReduceImpl>( - ctx.cuda_device_context(), in, out, kps::IdentityFunctor(), - out_reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - cholesky_solve, - ops::CholeskySolveKernel, - ops::CholeskySolveKernel); - -REGISTER_OP_CUDA_KERNEL( - cholesky_solve_grad, - ops::CholeskySolveGradKernel, - ops::CholeskySolveGradKernel); - -#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/cholesky_solve_op.h b/paddle/fluid/operators/cholesky_solve_op.h deleted file mode 100644 index 74b961d4e55e8..0000000000000 --- a/paddle/fluid/operators/cholesky_solve_op.h +++ /dev/null @@ -1,252 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/solve_op.h" -#include "paddle/fluid/operators/triangular_solve_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/phi/kernels/funcs/lapack/lapack_function.h" -#include "paddle/phi/kernels/math_kernel.h" -#include "paddle/phi/kernels/transpose_kernel.h" - -namespace paddle { -namespace operators { // namespace operators - -template -class CholeskySolveFunctor { - public: - void operator()(const platform::DeviceContext &dev_ctx, bool upper, int n, - int nrhs, T *Adata, int lda, T *Bdata, int *devInfo); -}; - -template -class CholeskySolveFunctor { - public: - void operator()(const platform::CPUDeviceContext &dev_ctx, bool upper, int n, - int nrhs, T *Adata, int lda, T *Bdata, int *devInfo) { - char uplo = upper ? 
'U' : 'L'; - phi::funcs::lapackCholeskySolve(uplo, n, nrhs, Adata, lda, Bdata, lda, - devInfo); - } -}; - -template -void cholesky_solve_fn(const paddle::framework::ExecutionContext &ctx, - const framework::Tensor &uin, - const framework::Tensor &bin, framework::Tensor *out, - bool upper) { - const auto &dev_ctx = ctx.template device_context(); - // framework::Tensor broadcast - std::vector u_bst_dims_vec; - std::vector b_bst_dims_vec; - std::tie(u_bst_dims_vec, b_bst_dims_vec) = get_broadcast_dims(uin, bin); - framework::Tensor u_bst(uin.type()); - TensorExpand(dev_ctx, uin, &u_bst, u_bst_dims_vec); - - framework::Tensor b_bst(bin.type()); - TensorExpand(dev_ctx, bin, &b_bst, b_bst_dims_vec); - - auto &phi_dev_ctx = static_cast< - const typename framework::ConvertToPhiContext::TYPE &>( - dev_ctx); - - // calculate u's conjugate for complex - framework::Tensor u_conj(u_bst.type()); - platform::ForRange u_for_range(dev_ctx, u_bst.numel()); - phi::funcs::ConjFunctor u_functor( - u_bst.data(), u_bst.numel(), - u_conj.mutable_data(u_bst.dims(), dev_ctx.GetPlace())); - u_for_range(u_functor); - u_conj = phi::TransposeLast2Dim(phi_dev_ctx, u_conj); - - // calculate b's conjugate for complex - framework::Tensor b_conj(b_bst.type()); - platform::ForRange b_for_range(dev_ctx, b_bst.numel()); - phi::funcs::ConjFunctor b_functor( - b_bst.data(), b_bst.numel(), - b_conj.mutable_data(b_bst.dims(), dev_ctx.GetPlace())); - b_for_range(b_functor); - b_conj = phi::TransposeLast2Dim(phi_dev_ctx, b_conj); - - auto ut_data = u_conj.mutable_data(dev_ctx.GetPlace()); - auto uindims = u_bst.dims(); - auto bindims = b_bst.dims(); - int uinrank = uindims.size(); - int binrank = bindims.size(); - - int n = uindims[uinrank - 2]; - int nrhs = bindims[binrank - 1]; - int ldab = std::max(1, n); - - // framework::Tensor out_copy(b_conj.type()); - // out_copy.Resize(b_conj.dims()); - framework::TensorCopy(b_conj, dev_ctx.GetPlace(), out); - T *out_data = out->mutable_data(dev_ctx.GetPlace()); - - auto info_dims = phi::slice_ddim(bindims, 0, binrank - 2); - auto batchsize = product(info_dims); - - framework::Tensor tmp; - std::vector tmpdim(1, batchsize); - tmp.Resize(phi::make_ddim(tmpdim)); - int *info = tmp.mutable_data(dev_ctx.GetPlace()); - - CholeskySolveFunctor functor; - for (int b = 0; b < batchsize; b++) { - auto uin_data_item = &ut_data[b * n * n]; - auto out_data_item = &out_data[b * n * nrhs]; - auto info_item = &info[b]; - functor(dev_ctx, upper, n, nrhs, uin_data_item, ldab, out_data_item, - info_item); - } - - // calculate out's conjugate for complex - platform::ForRange out_for_range(dev_ctx, out->numel()); - phi::funcs::ConjFunctor out_functor( - out->data(), out->numel(), - out->mutable_data(out->dims(), dev_ctx.GetPlace())); - out_for_range(out_functor); - *out = phi::TransposeLast2Dim(phi_dev_ctx, *out); -} - -template -class CholeskySolveKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext &ctx) const override { - auto *uin = ctx.Input("Y"); - auto *bin = ctx.Input("X"); - auto *out = ctx.Output("Out"); - auto upper = ctx.Attr("upper"); - cholesky_solve_fn(ctx, *uin, *bin, out, upper); - } -}; - -template -class CholeskySolveGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *bin = ctx.Input("X"); - auto *uin = ctx.Input("Y"); - auto *out = ctx.Input("Out"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *db = ctx.Output(framework::GradVarName("X")); - 
auto *du = ctx.Output(framework::GradVarName("Y")); - auto upper = ctx.Attr("upper"); - - const auto &dev_ctx = ctx.template device_context(); - auto &phi_dev_ctx = static_cast< - const typename framework::ConvertToPhiContext::TYPE &>( - dev_ctx); - - std::vector u_bst_dims_vec; - std::vector b_bst_dims_vec; - std::tie(u_bst_dims_vec, b_bst_dims_vec) = get_broadcast_dims(*uin, *bin); - framework::Tensor u_bst(uin->type()); - TensorExpand(dev_ctx, *uin, &u_bst, u_bst_dims_vec); - - framework::Tensor db_bst(bin->type()); - TensorExpand(dev_ctx, *bin, &db_bst, b_bst_dims_vec); - - if (dout) { - db->mutable_data(dev_ctx.GetPlace()); - cholesky_solve_fn(ctx, u_bst, *dout, &db_bst, upper); - - if (db_bst.dims() == db->dims()) { - framework::TensorCopy(db_bst, dev_ctx.GetPlace(), dev_ctx, db); - } else { - MatrixReduceSumFunctor functor; - functor(db_bst, db, ctx); - db->Resize(bin->dims()); - } - - auto blas = phi::funcs::GetBlas(ctx); - - // calculate out's conjugate for complex - framework::Tensor out_conj(out->type()); - platform::ForRange out_for_range(dev_ctx, out->numel()); - phi::funcs::ConjFunctor out_functor( - out->data(), out->numel(), - out_conj.mutable_data(out->dims(), dev_ctx.GetPlace())); - out_for_range(out_functor); - out_conj = phi::TransposeLast2Dim(phi_dev_ctx, out_conj); - - framework::Tensor commonterm(out->type()); - auto outdims = out_conj.dims(); - auto dbdims = db_bst.dims(); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(outdims, 0, false); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(dbdims, 0, false); - auto cmtdim = outdims; - cmtdim[cmtdim.size() - 2] = dbdims[dbdims.size() - 2]; - commonterm.Resize(cmtdim); - commonterm.mutable_data(dev_ctx.GetPlace()); - blas.MatMul(db_bst, mat_dim_b, out_conj, mat_dim_a, static_cast(1), - &commonterm, static_cast(0)); - - // calculate commonterm's conjugate for complex - framework::Tensor commonterm_conj(commonterm.type()); - platform::ForRange commonterm_for_range( - dev_ctx, commonterm.numel()); - phi::funcs::ConjFunctor commonterm_functor( - commonterm.data(), commonterm.numel(), - commonterm_conj.mutable_data(commonterm.dims(), - dev_ctx.GetPlace())); - commonterm_for_range(commonterm_functor); - commonterm_conj = phi::TransposeLast2Dim(phi_dev_ctx, commonterm_conj); - - phi::AddRawKernel( - static_cast::TYPE &>(dev_ctx), - commonterm, commonterm_conj, -1, &commonterm); - - auto mat_dim_u = - phi::funcs::CreateMatrixDescriptor(u_bst.dims(), 0, false); - auto mat_dim_c = - phi::funcs::CreateMatrixDescriptor(commonterm.dims(), 0, false); - - Tensor du_bst(uin->type()); - // get upper or lower triangular - du_bst.Resize(u_bst.dims()); - du_bst.mutable_data(dev_ctx.GetPlace()); - if (upper) { - blas.MatMul(u_bst, mat_dim_u, commonterm, mat_dim_c, static_cast(-1), - &du_bst, static_cast(0)); - } else { - blas.MatMul(commonterm, mat_dim_c, u_bst, mat_dim_u, static_cast(-1), - &du_bst, static_cast(0)); - } - - const auto &udims = u_bst.dims(); - const auto H = udims[udims.size() - 2]; - const auto W = udims[udims.size() - 1]; - platform::ForRange x_for_range(dev_ctx, u_bst.numel()); - TrilTriuCompute tril_triu_computer(du_bst.data(), 0, !upper, H, W, - u_bst.data()); - x_for_range(tril_triu_computer); - - du->mutable_data(dev_ctx.GetPlace()); - if (u_bst.dims() == du->dims()) { - framework::TensorCopy(u_bst, dev_ctx.GetPlace(), dev_ctx, du); - } else { - MatrixReduceSumFunctor functor; - functor(u_bst, du, ctx); - du->Resize(uin->dims()); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git 
a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc
index 2afee35112e6f..0edbee534c0b5 100644
--- a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc
+++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc
@@ -22,11 +22,17 @@ limitations under the License. */
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/init.h"
+#include "paddle/phi/core/kernel_registry.h"
 
 USE_OP(cinn_launch);
 USE_OP(cinn_instruction_run);
 USE_OP_ITSELF(elementwise_add);
 
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+#ifdef PADDLE_WITH_CUDA
+PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT);
+#endif
+
 namespace paddle::operators {
 using framework::paddle2cinn::CinnCompiler;
 
diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc
index 460d417e61fd4..585f1caabed05 100644
--- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc
+++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc
@@ -26,12 +26,18 @@ limitations under the License. */
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/kernel_registry.h"
 
 USE_OP(cinn_launch);
 USE_OP(cinn_instruction_run);
 USE_OP_ITSELF(elementwise_add);
 DECLARE_double(eager_delete_tensor_gb);
 
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+#ifdef PADDLE_WITH_CUDA
+PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT);
+#endif
+
 namespace paddle::operators {
 using framework::paddle2cinn::CinnCompiler;
 
diff --git a/paddle/fluid/operators/collective/c_allgather_op_mlu.cc b/paddle/fluid/operators/collective/c_allgather_op_mlu.cc
new file mode 100644
index 0000000000000..f29bc57c9a5f4
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_allgather_op_mlu.cc
@@ -0,0 +1,81 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allgather_op.h"
+
+#if defined(PADDLE_WITH_CNCL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
+#endif
+#include "paddle/fluid/framework/convert_utils.h"
+
+namespace paddle {
+namespace operators {
+
+template
+class CAllGatherOpMLUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_CNCL)
+    auto x = ctx.Input("X");
+    auto out = ctx.Output("Out");
+    cnclDataType_t dtype =
+        platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype()));
+
+    int nranks = ctx.Attr("nranks");
+    int rid = ctx.Attr("ring_id");
+    auto place = ctx.GetPlace();
+    auto comm = platform::CNCLCommContext::Instance().Get(rid, place);
+    PADDLE_ENFORCE_EQ(
+        nranks, comm->nranks(),
+        platform::errors::InvalidArgument("nranks: %s should equal to %s",
+                                          nranks, comm->nranks()));
+
+    framework::DDim out_dims = x->dims();
+    out_dims[0] *= nranks;
+    out->mutable_data(out_dims, place);
+
+    uint32_t send_numel = x->numel();
+    void* send_buff = reinterpret_cast(const_cast(x->data()));
+    void* recv_buff = reinterpret_cast(out->data());
+
+    mluStream stream = nullptr;
+    if (ctx.Attr("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+
+    PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(send_buff, recv_buff, send_numel,
+                                             dtype, comm->comm(), stream));
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with MLU."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_MLU_KERNEL(c_allgather, ops::CAllGatherOpMLUKernel,
+                       ops::CAllGatherOpMLUKernel,
+                       ops::CAllGatherOpMLUKernel,
+                       ops::CAllGatherOpMLUKernel,
+                       ops::CAllGatherOpMLUKernel,
+                       ops::CAllGatherOpMLUKernel);
diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
index c0968581acda9..7206dd01bcaa3 100644
--- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
index 31b00a93f1396..0946ad8aca65e 100644
--- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index 7e5120cd2b392..2c4e85400ca4a 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -413,7 +413,7 @@ class CAllReduceOpMLUKernel : public framework::OpKernel {
     auto place = ctx.GetPlace();
 
     cnclDataType_t dtype =
-        platform::ToCNCLDataType(framework::TransToProtoVarType(in->type()));
+        platform::ToCNCLDataType(framework::TransToProtoVarType(in->dtype()));
     int64_t numel = in->numel();
     const void* sendbuff = in->data();
     out->Resize(in->dims());
diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
index 9c11704704ed4..61e5f27903477 100644
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc
index d315f211709e4..d1e269fb5a4fe 100644
--- a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc
@@ -31,7 +31,7 @@ class CBroadcastOPMLUKernel : public framework::OpKernel {
     auto out = ctx.Output("Out");
     int numel = x->numel();
     cnclDataType_t dtype =
-        platform::ToCNCLDataType(framework::TransToProtoVarType(x->type()));
+        platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype()));
 
     int rid = ctx.Attr("ring_id");
     auto place = ctx.GetPlace();
diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
index 5787090e6a52f..cf4d6a28744b3 100644
--- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
index c79b2f92b69a1..c4e410d04da5f 100644
--- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
index d9a7a4abb08fc..8b498787c69db 100644
--- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
index b8abf458c1c6d..133085ad3f3b0 100644
--- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/checknumeric_npu_test.cc b/paddle/fluid/operators/collective/checknumeric_npu_test.cc
index bb78971734bf0..36c6f4fadd0fc 100644
--- a/paddle/fluid/operators/collective/checknumeric_npu_test.cc
+++ b/paddle/fluid/operators/collective/checknumeric_npu_test.cc
@@ -27,7 +27,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
index 8f7b8c4a9040b..6e02d36215697 100644
--- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
index c40b2c3e76a02..57e3dd53cc774 100644
--- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
@@ -25,7 +25,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/controlflow/compare_op_mlu.cc b/paddle/fluid/operators/controlflow/compare_op_mlu.cc
index 9dc287ab76a67..c39743ef9914c 100644
--- a/paddle/fluid/operators/controlflow/compare_op_mlu.cc
+++ b/paddle/fluid/operators/controlflow/compare_op_mlu.cc
@@ -11,7 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/controlflow/compare_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc
index bc29c92b09426..8a190c1a1e091 100644
--- a/paddle/fluid/operators/controlflow/feed_op.cc
+++ b/paddle/fluid/operators/controlflow/feed_op.cc
@@ -40,6 +40,13 @@ class FeedVariableVisitor : public boost::static_visitor {
     out_var_->GetMutable();
     if (platform::is_same_place(in_tensor.place(), place_)) {
       out_tensor->ShareDataWith(in_tensor);
+#ifdef PADDLE_WITH_IPU
+    } else if (platform::is_ipu_place(place_)) {
+      // For ipu, both in_tensor and out_tensor are allocated on cpu,
+      // PopART will copy tensor from host automatically,
+      // no TensorCopy() is required here.
+      out_tensor->ShareDataWith(in_tensor);
+#endif
     } else {
       platform::DeviceContext *context =
           platform::DeviceContextPool::Instance().Get(place_);
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 8213e877f7224..9be63a85fc0de 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -27,6 +27,9 @@ limitations under the License. */
 #endif
 #include "paddle/fluid/platform/cudnn_workspace_helper.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/infermeta/binary.h"
+
 
 namespace paddle {
 namespace operators {
@@ -841,6 +844,8 @@ framework::OpKernelType ConvOpDoubleGrad::GetExpectedKernelType(
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(conv2d, Conv2dInferShapeFunctor,
+                            PD_INFER_META(phi::ConvInferMeta));
 REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker,
                   ops::ConvOpInferVarType,
                   ops::Conv2DGradMaker,
@@ -851,6 +856,8 @@ REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad,
 REGISTER_OPERATOR(conv2d_grad_grad, ops::ConvOpDoubleGrad);
 
 // depthwise convolution op
+DECLARE_INFER_SHAPE_FUNCTOR(depthwise_conv2d, DepthwiseConv2dInferShapeFunctor,
+                            PD_INFER_META(phi::ConvInferMeta));
 REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
                   ops::ConvOpInferVarType,
                   ops::Conv2DGradMaker,
@@ -860,6 +867,8 @@ REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad,
                   ops::Conv2DDoubleGradMaker);
 REGISTER_OPERATOR(depthwise_conv2d_grad_grad, ops::ConvOpDoubleGrad);
 
+DECLARE_INFER_SHAPE_FUNCTOR(conv3d, Conv3dInferShapeFunctor,
+                            PD_INFER_META(phi::ConvInferMeta));
 REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker,
                   ops::ConvOpInferVarType,
                   ops::Conv3DGradMaker,
diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc
index 8897f7b229c32..fcda16a3e72ac 100644
--- a/paddle/fluid/operators/conv_op_npu.cc
+++ b/paddle/fluid/operators/conv_op_npu.cc
@@ -356,7 +356,7 @@ class NPUConvGradOpKernel : public framework::OpKernel {
     auto stream = ctx.template device_context().stream();
 
     if (filter_grad) {
-      filter_grad->mutable_data(ctx.GetPlace());
+      filter_grad->mutable_data(ctx.GetPlace());
       std::vector filter_shape_vec = phi::vectorize(filter->dims());
 
       const auto& runner = NpuOpRunner(
diff --git a/paddle/fluid/operators/conv_op_xpu.cc b/paddle/fluid/operators/conv_op_xpu.cc
index ddfc6fe862c27..e4751f1f26008 100644
--- a/paddle/fluid/operators/conv_op_xpu.cc
+++ b/paddle/fluid/operators/conv_op_xpu.cc
@@ -19,14 +19,16 @@ namespace operators {
 template
 class GemmConvXPUKernel : public framework::OpKernel {
+  using XPUT = typename XPUTypeTrait::Type;
+
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input("Input");
+  void Compute(const framework::ExecutionContext &context) const override {
+    const Tensor *input = context.Input("Input");
     // The filter will be reshaped in the calculations,
     // so here use an assignment operation,
     // that avoids modifying the variable in the Scope.
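    // A minimal sketch of the XPUTypeTrait pattern this hunk introduces: one
    // kernel template covers float and float16 by mapping T to the device's
    // native type and reinterpret_cast-ing pointers once at the boundary.
    // DeviceTraits / HostFP16 / DeviceFP16 / launch_gemm below are
    // illustrative stand-ins, not Paddle or XPU APIs:
    //
    //   #include <cstdint>
    //   struct HostFP16 { uint16_t bits; };    // framework-side fp16 storage
    //   struct DeviceFP16 { uint16_t bits; };  // device-side fp16 storage
    //   template <typename T> struct DeviceTraits { using Type = T; };
    //   template <> struct DeviceTraits<HostFP16> { using Type = DeviceFP16; };
    //
    //   template <typename DT>
    //   void launch_gemm(const DT* in, DT* out, int n);  // device op, decl only
    //
    //   template <typename T>
    //   void LaunchConv(const T* in, T* out, int n) {
    //     using DT = typename DeviceTraits<T>::Type;
    //     // identical bit layout, so a one-time reinterpret_cast suffices
    //     launch_gemm(reinterpret_cast<const DT*>(in),
    //                 reinterpret_cast<DT*>(out), n);
    //   }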
     Tensor filter = *context.Input("Filter");
-    Tensor* output = context.Output("Output");
+    Tensor *output = context.Output("Output");
     output->mutable_data(context.GetPlace());
     int groups = context.Attr("groups");
     std::vector strides = context.Attr>("strides");
@@ -53,11 +55,16 @@ class GemmConvXPUKernel : public framework::OpKernel {
     const int img_h = static_cast(input->dims()[2]);
     const int img_w = static_cast(input->dims()[3]);
     const int f = static_cast(filter.dims()[0]);
-    auto& dev_ctx = context.template device_context();
-    int r = xpu::conv2d(
-        dev_ctx.x_context(), input->data(), filter.data(),
-        output->data(), batch_size, img_c, img_h, img_w, f, ksize,
-        strides, paddings, dilations, groups, nullptr, nullptr, nullptr, true);
+
+    const XPUT *input_data = reinterpret_cast(input->data());
+    const XPUT *filter_data = reinterpret_cast(filter.data());
+    XPUT *output_data = reinterpret_cast(output->data());
+
+    auto &dev_ctx = context.template device_context();
+    int r = xpu::conv2d(
+        dev_ctx.x_context(), input_data, filter_data, output_data, batch_size,
+        img_c, img_h, img_w, f, ksize, strides, paddings, dilations, groups,
+        nullptr, nullptr, nullptr, true);
     PADDLE_ENFORCE_EQ(
         r, XPU_SUCCESS,
         platform::errors::External("XPU conv kernel return wrong value[%d %s]",
@@ -67,14 +74,16 @@ class GemmConvXPUKernel : public framework::OpKernel {
 
 template
 class GemmConvGradXPUKernel : public framework::OpKernel {
+  using XPUT = typename XPUTypeTrait::Type;
+
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input("Input");
-    const Tensor* output_grad =
+  void Compute(const framework::ExecutionContext &context) const override {
+    const Tensor *input = context.Input("Input");
+    const Tensor *output_grad =
         context.Input(framework::GradVarName("Output"));
-    Tensor* input_grad =
+    Tensor *input_grad =
         context.Output(framework::GradVarName("Input"));
-    Tensor* filter_grad =
+    Tensor *filter_grad =
         context.Output(framework::GradVarName("Filter"));
     // The filter and filter_grad will be reshaped in the calculations,
     // so here use an assignment operation,
@@ -107,19 +116,27 @@ class GemmConvGradXPUKernel : public framework::OpKernel {
     const int img_h = static_cast(input->dims()[2]);
     const int img_w = static_cast(input->dims()[3]);
     const int f = static_cast(filter.dims()[0]);
+
+    const XPUT *input_data = reinterpret_cast(input->data());
+    const XPUT *filter_data = reinterpret_cast(filter.data());
+    const XPUT *output_grad_data =
+        reinterpret_cast(output_grad->data());
+
+    XPUT *input_grad_data = nullptr;
     if (input_grad) {
       input_grad->mutable_data(context.GetPlace());
+      input_grad_data = reinterpret_cast(input_grad->data());
     }
+    XPUT *filter_grad_data = nullptr;
     if (filter_grad) {
       filter_grad->mutable_data(context.GetPlace());
+      filter_grad_data = reinterpret_cast(filter_grad->data());
     }
-    auto& dev_ctx = context.template device_context();
-    int r = xpu::conv2d_grad(
-        dev_ctx.x_context(), input->data(), filter.data(),
-        output_grad->data(), input_grad ? input_grad->data() : nullptr,
-        filter_grad ?
filter_grad->data() : nullptr, batch_size, img_c, - img_h, img_w, f, ksize, strides, paddings, dilations, groups, nullptr, - nullptr, nullptr, nullptr, nullptr, true); + auto &dev_ctx = context.template device_context(); + int r = xpu::conv2d_grad( + dev_ctx.x_context(), input_data, filter_data, output_grad_data, + input_grad_data, filter_grad_data, batch_size, img_c, img_h, img_w, f, + ksize, strides, paddings, dilations, groups, nullptr, nullptr, nullptr, + nullptr, nullptr, true); PADDLE_ENFORCE_EQ( r, XPU_SUCCESS, platform::errors::External("XPU conv kernel return wrong value[%d %s]", @@ -130,14 +147,22 @@ class GemmConvGradXPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( - depthwise_conv2d, - ops::GemmConvXPUKernel); -REGISTER_OP_XPU_KERNEL( - conv2d, ops::GemmConvXPUKernel); + conv2d, ops::GemmConvXPUKernel, + ops::GemmConvXPUKernel); REGISTER_OP_XPU_KERNEL( conv2d_grad, - ops::GemmConvGradXPUKernel); + ops::GemmConvGradXPUKernel, + ops::GemmConvGradXPUKernel); +REGISTER_OP_XPU_KERNEL( + depthwise_conv2d, + ops::GemmConvXPUKernel, + ops::GemmConvXPUKernel); REGISTER_OP_XPU_KERNEL( depthwise_conv2d_grad, - ops::GemmConvGradXPUKernel); + ops::GemmConvGradXPUKernel, + ops::GemmConvGradXPUKernel); #endif diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu deleted file mode 100644 index 1841b78af32dd..0000000000000 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ /dev/null @@ -1,1286 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memory.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/operators/conv_miopen_helper.h" -#else -#include "paddle/fluid/operators/conv_cudnn_helper.h" -#endif -#include "paddle/fluid/operators/conv_transpose_op.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/padding.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -static void DataTranspose(const framework::ExecutionContext& ctx, - const Tensor* input, Tensor* output, - const std::vector& axis, int flag = 0) { - auto& dev_ctx = ctx.template device_context(); - phi::funcs::Transpose transpose; - auto in_dims = input->dims(); - std::vector input_transpose_vec; - for (size_t i = 0; i < axis.size(); ++i) { - if (flag == 0) - input_transpose_vec.push_back(in_dims[axis[i]]); - else - input_transpose_vec.push_back(in_dims[i]); - } - framework::DDim input_transpose_dims(phi::make_ddim(input_transpose_vec)); - output->mutable_data(input_transpose_dims, ctx.GetPlace()); - transpose(dev_ctx, *input, output, axis); -} - -template -class CUDNNConvTransposeOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto* input = ctx.Input("Input"); - auto* filter = ctx.Input("Filter"); - auto* output = ctx.Output("Output"); - - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - - // cudnn v5 does not support dilations - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - const T* filter_data = filter->data(); - const std::string data_layout_str = ctx.Attr("data_format"); - const paddle::platform::DataLayout data_layout = - (data_layout_str != "NHWC" ? 
platform::DataLayout::kNCHW - : platform::DataLayout::kNHWC); - - // if channel_last, transpose to channel_first - Tensor input_transpose; - std::vector input_vec = phi::vectorize(input->dims()); - std::vector output_vec = phi::vectorize(output->dims()); - if (data_layout == platform::DataLayout::kNHWC) { - if (strides.size() == 2U) { - std::vector axis = {0, 3, 1, 2}; - for (size_t i = 0; i < axis.size(); ++i) { - input_vec[i] = input->dims()[axis[i]]; - output_vec[i] = output->dims()[axis[i]]; - } - DataTranspose(ctx, input, &input_transpose, axis); - } else if (strides.size() == 3U) { - std::vector axis = {0, 4, 1, 2, 3}; - for (size_t i = 0; i < axis.size(); ++i) { - input_vec[i] = input->dims()[axis[i]]; - output_vec[i] = output->dims()[axis[i]]; - } - DataTranspose(ctx, input, &input_transpose, axis); - } - } else { - input_transpose = *input; - } - - // update padding and dilation - auto in_dims = input_transpose.dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); - - std::vector input_pad(input_transpose.dims().size() * 2, 0); - Tensor transformed_input; - std::vector padding_common(data_dim, 0); - if (!is_sys_pad) { - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = input_transpose.dims()[0]; - new_input_shape_vec[1] = input_transpose.dims()[1]; - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - new_input_shape_vec[i + 2] = - input_transpose.dims()[i + 2] + padding_diff[i]; - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_input.Resize(new_input_shape); - auto& dev_ctx = - ctx.template device_context(); - - transformed_input = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - const int rank = input_transpose.dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - phi::funcs::PadFunction( - dev_ctx, input_pad, input_transpose, pad_value, - &transformed_input); - } break; - case 5: { - phi::funcs::PadFunction( - dev_ctx, input_pad, input_transpose, pad_value, - &transformed_input); - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Op(ConvTranspose) only supports 4-D or 5-D input Tensor.")); - } - } else { - transformed_input = input_transpose; - if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - std::vector starts(data_dim, 0); - std::vector ends(data_dim, 0); - std::vector axes(data_dim, 0); - for (size_t i = 0; i < data_dim; ++i) { - starts[i] = input_pad[2 * i + 4] * (strides[i] + 1); - ends[i] = starts[i] + output_vec[i + 2]; - axes[i] = i + 2; - } - - const T* input_data = transformed_input.data(); - input_vec = phi::vectorize(transformed_input.dims()); - - 
std::vector transformed_output_vec = output_vec; - for (size_t i = 0; i < data_dim; ++i) { - transformed_output_vec[i + 2] = - output_vec[i + 2] + - (input_pad[2 * i + 4] + input_pad[2 * i + 5]) * strides[i] - - 2 * padding_common[i] + paddings[2 * i] + paddings[2 * i + 1]; - } - - Tensor transformed_output; - if (!is_sys_pad) { - DDim transformed_output_shape(phi::make_ddim(transformed_output_vec)); - transformed_output.mutable_data(transformed_output_shape, - ctx.GetPlace()); - } else { - output->mutable_data(ctx.GetPlace()); - transformed_output.ShareDataWith(*output); - transformed_output.Resize(phi::make_ddim(transformed_output_vec)); - } - T* transformed_output_data = transformed_output.data(); - - platform::DataLayout layout; - - int iwo_groups = groups; - int c_groups = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_groups = 1; - c_groups = groups; - groups = 1; -#endif - - if (strides.size() == 2U) { - layout = platform::DataLayout::kNCHW; - } else { - layout = platform::DataLayout::kNCDHW; - } - - size_t workspace_size = 0; -#ifdef PADDLE_WITH_HIP - miopenConvBwdDataAlgorithm_t algo{}; -#else - cudnnConvolutionBwdDataAlgo_t algo{}; -#endif - // ------------------- cudnn conv algorithm --------------------- - auto& dev_ctx = ctx.template device_context(); - auto handle = dev_ctx.cudnn_handle(); - auto layout_tensor = GetCudnnTensorFormat(layout); - bool deterministic = FLAGS_cudnn_deterministic; - - auto dtype = platform::CudnnDataType::type; - // ------------------- cudnn descriptors --------------------- - ConvArgs args{&transformed_output, - filter, - &transformed_input, - strides, - padding_common, - dilations, - dtype}; - args.handle = handle; - args.idesc.set(transformed_output, iwo_groups); - args.wdesc.set(*filter, layout_tensor, iwo_groups); - args.odesc.set(transformed_input, iwo_groups); - args.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_groups); - -#ifdef PADDLE_WITH_HIP - using search = SearchAlgorithm; - workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); - algo = search::Find( - args, false, deterministic, workspace_size, - ctx.template device_context()); -#else - using search = SearchAlgorithm; - algo = search::Find( - args, false, deterministic, - ctx.template device_context()); - workspace_size = - std::max(workspace_size, search::GetWorkspaceSize(args, algo)); -#endif - - // ------------------- cudnn conv transpose forward --------------------- - int input_offset = - transformed_input.numel() / transformed_input.dims()[0] / groups; - int output_offset = - transformed_output.numel() / transformed_output.dims()[0] / groups; - int filter_offset = filter->numel() / groups; - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - for (int g = 0; g < groups; g++) { -#ifdef PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args.odesc.desc(), - input_data + input_offset * g, args.wdesc.desc(), - filter_data + filter_offset * g, args.cdesc.desc(), algo, &beta, - args.idesc.desc(), transformed_output_data + output_offset * g, - cudnn_workspace, workspace_size)); - }; -#else // PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, args.wdesc.desc(), - filter_data + filter_offset * g, 
args.odesc.desc(), - input_data + input_offset * g, args.cdesc.desc(), algo, - cudnn_workspace, workspace_size, &beta, args.idesc.desc(), - transformed_output_data + output_offset * g)); - }; -#endif // PADDLE_WITH_HIP - workspace_handle.RunFunc(cudnn_func, workspace_size); - } - if (!is_sys_pad && strides.size() == 2U) { - Slice( - ctx, &transformed_output, output, starts, ends, axes); - } else if (!is_sys_pad && strides.size() == 3U) { - Slice( - ctx, &transformed_output, output, starts, ends, axes); - } - - if (data_layout == platform::DataLayout::kNHWC) { - Tensor output_transpose; - Tensor output_nchw; - output_nchw.ShareDataWith(*output); - output_nchw.Resize(phi::make_ddim(output_vec)); - if (strides.size() == 2U) { - std::vector axis = {0, 2, 3, 1}; - DataTranspose(ctx, &output_nchw, &output_transpose, axis); - *output = output_transpose; - } else if (strides.size() == 3U) { - std::vector axis = {0, 2, 3, 4, 1}; - DataTranspose(ctx, &output_nchw, &output_transpose, axis); - *output = output_transpose; - } - } - } -}; - -template -class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto input = ctx.Input("Input"); - auto filter = ctx.Input("Filter"); - auto output_grad = ctx.Input(framework::GradVarName("Output")); - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto filter_grad = ctx.Output(framework::GradVarName("Filter")); - const T* filter_data = filter->data(); - - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - // cudnn v5 does not support dilations - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - int user_workspace_size = ctx.Attr("workspace_size_MB"); - const std::string data_layout_str = ctx.Attr("data_format"); - const paddle::platform::DataLayout data_layout = - (data_layout_str != "NHWC" ? 
platform::DataLayout::kNCHW - : platform::DataLayout::kNHWC); - - // if channel_last, transpose to channel_first - Tensor input_transpose; - Tensor output_grad_transpose; - std::vector input_vec = phi::vectorize(input->dims()); - std::vector output_vec = phi::vectorize(output_grad->dims()); - if (data_layout == platform::DataLayout::kNHWC) { - if (strides.size() == 2U) { - std::vector axis = {0, 3, 1, 2}; - for (size_t i = 0; i < axis.size(); ++i) { - input_vec[i] = input->dims()[axis[i]]; - output_vec[i] = output_grad->dims()[axis[i]]; - } - DataTranspose(ctx, input, &input_transpose, axis); - DataTranspose(ctx, output_grad, &output_grad_transpose, axis); - } else if (strides.size() == 3U) { - std::vector axis = {0, 4, 1, 2, 3}; - for (size_t i = 0; i < axis.size(); ++i) { - input_vec[i] = input->dims()[axis[i]]; - output_vec[i] = output_grad->dims()[axis[i]]; - } - DataTranspose(ctx, input, &input_transpose, axis); - DataTranspose(ctx, output_grad, &output_grad_transpose, axis); - } - } else { - input_transpose = *input; - output_grad_transpose = *output_grad; - } - - // update padding and dilation - auto in_dims = input_transpose.dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); - - std::vector input_pad(input_transpose.dims().size() * 2, 0); - Tensor transformed_output_grad; - std::vector padding_common(data_dim, 0); - if (!is_sys_pad) { - std::vector padding_diff(data_dim); - std::vector new_output_grad_shape_vec(data_dim + 2); - new_output_grad_shape_vec[0] = output_grad_transpose.dims()[0]; - new_output_grad_shape_vec[1] = output_grad_transpose.dims()[1]; - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - new_output_grad_shape_vec[i + 2] = - output_grad_transpose.dims()[i + 2] + padding_diff[i]; - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - framework::DDim new_output_grad_shape( - phi::make_ddim(new_output_grad_shape_vec)); - transformed_output_grad.Resize(new_output_grad_shape); - auto& dev_ctx = - ctx.template device_context(); - - transformed_output_grad = - ctx.AllocateTmpTensor( - new_output_grad_shape, dev_ctx); - const int rank = input_transpose.dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - phi::funcs::PadFunction( - dev_ctx, input_pad, output_grad_transpose, pad_value, - &transformed_output_grad); - } break; - case 5: { - phi::funcs::PadFunction( - dev_ctx, input_pad, output_grad_transpose, pad_value, - &transformed_output_grad); - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Op(ConvTranspose) only supports 4-D or 5-D input Tensor.")); - } - } else { - transformed_output_grad = output_grad_transpose; - if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* input_data = 
input_transpose.data(); - const T* output_grad_data = transformed_output_grad.data(); - output_vec = phi::vectorize(transformed_output_grad.dims()); - - // ------------------- cudnn descriptors --------------------- - platform::DataLayout layout; - - if (strides.size() == 2U) { - layout = platform::DataLayout::kNCHW; - } else { - layout = platform::DataLayout::kNCDHW; - } - - int iwo_groups = groups; - int c_groups = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_groups = 1; - c_groups = groups; - groups = 1; -#endif - - auto dtype = platform::CudnnDataType::type; - - ConvArgs args1{&transformed_output_grad, - filter, - &input_transpose, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args2{&transformed_output_grad, - filter, - &input_transpose, - strides, - padding_common, - dilations, - dtype}; - -#ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t data_algo{}; - miopenConvBwdWeightsAlgorithm_t filter_algo{}; -#else - cudnnConvolutionFwdAlgo_t data_algo{}; - cudnnConvolutionBwdFilterAlgo_t filter_algo{}; -#endif - - auto layout_tensor = GetCudnnTensorFormat(layout); - size_t workspace_size = 0; - auto& dev_ctx = ctx.template device_context(); - auto handle = dev_ctx.cudnn_handle(); - bool deterministic = FLAGS_cudnn_deterministic; - T* input_grad_data = nullptr; - T* filter_grad_data = nullptr; - - if (input_grad) { - input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - args1.handle = handle; - args1.idesc.set(transformed_output_grad, iwo_groups); - args1.wdesc.set(*filter, layout_tensor, iwo_groups); - args1.odesc.set(input_transpose, iwo_groups); - args1.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_groups); -#ifdef PADDLE_WITH_HIP - using search1 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search1::GetWorkspaceSize(args1)); - data_algo = search1::Find( - args1, false, deterministic, workspace_size, - ctx.template device_context()); -#else - using search1 = SearchAlgorithm; - data_algo = search1::Find( - args1, false, deterministic, - ctx.template device_context()); - workspace_size = - std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); -#endif - } - - if (filter_grad) { - filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); - args2.handle = handle; - args2.idesc.set(transformed_output_grad, iwo_groups); - args2.wdesc.set(*filter_grad, layout_tensor, iwo_groups); - args2.odesc.set(input_transpose, iwo_groups); - args2.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_groups); -#ifdef PADDLE_WITH_HIP - using search2 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2)); - filter_algo = search2::Find( - args2, false, deterministic, workspace_size, - ctx.template device_context()); -#else - using search2 = SearchAlgorithm; - filter_algo = search2::Find( - args2, false, deterministic, - ctx.template device_context()); - workspace_size = std::max(workspace_size, - search2::GetWorkspaceSize(args2, filter_algo)); -#endif - } - - // ------------------- cudnn conv backward data --------------------- - // FIXME(typhoonzero): template type T may not be the same as cudnn call. 
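// Note: the grad kernel above sizes one scratch buffer by taking the max of
// each algorithm's workspace requirement, then reuses that buffer for every
// cuDNN/MIOpen call. A minimal, library-neutral sketch of the pattern
// (names here are illustrative, not Paddle or cuDNN APIs):
#include <algorithm>
#include <cstddef>
#include <vector>

size_t MaxWorkspaceBytes(const std::vector<size_t>& per_algo_bytes) {
  size_t workspace_size = 0;
  for (size_t bytes : per_algo_bytes) {
    // Mirrors: workspace_size = std::max(workspace_size,
    //                                    searchN::GetWorkspaceSize(argsN, algo));
    workspace_size = std::max(workspace_size, bytes);
  }
  return workspace_size;  // allocate once; every RunFunc call shares it
}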
- int input_offset = input->numel() / input->dims()[0] / groups; - int output_grad_offset = transformed_output_grad.numel() / - transformed_output_grad.dims()[0] / groups; - int filter_offset = filter->numel() / groups; - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - if (input_grad) { - // Because beta is zero, it is unnecessary to reset input_grad. - for (int g = 0; g < groups; g++) { -#ifdef PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args1.idesc.desc(), - output_grad_data + output_grad_offset * g, args1.wdesc.desc(), - filter_data + filter_offset * g, args1.cdesc.desc(), - data_algo, &beta, args1.odesc.desc(), - input_grad_data + input_offset * g, cudnn_workspace, - workspace_size)); - }; -#else // PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnConvolutionForward( - handle, &alpha, args1.idesc.desc(), - output_grad_data + output_grad_offset * g, args1.wdesc.desc(), - filter_data + filter_offset * g, args1.cdesc.desc(), data_algo, - cudnn_workspace, workspace_size, &beta, args1.odesc.desc(), - input_grad_data + input_offset * g)); - }; -#endif // PADDLE_WITH_HIP - workspace_handle.RunFunc(cudnn_func, workspace_size); - } - - if (data_layout == platform::DataLayout::kNHWC) { - Tensor input_grad_transpose; - Tensor input_grad_nchw; - input_grad_nchw.ShareDataWith(*input_grad); - input_grad_nchw.Resize(phi::make_ddim(input_vec)); - if (strides.size() == 2U) { - std::vector axis = {0, 2, 3, 1}; - DataTranspose(ctx, &input_grad_nchw, &input_grad_transpose, - axis); - *input_grad = input_grad_transpose; - } else if (strides.size() == 3U) { - std::vector axis = {0, 2, 3, 4, 1}; - DataTranspose(ctx, &input_grad_nchw, &input_grad_transpose, - axis); - *input_grad = input_grad_transpose; - } - } - } - - // ------------------- cudnn conv backward filter --------------------- - if (filter_grad) { - // Because beta is zero, it is unnecessary to reset filter_grad. 
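// The group loops above all step through memory with the same offset rule:
// one group's slice of one sample holds numel / batch / groups elements.
// A standalone sketch of that arithmetic, assuming NCHW layout (the filter
// offset is analogous but has no batch axis, so it is just numel / groups):
#include <cstdint>

struct Dims4 { int64_t n, c, h, w; };  // NCHW: channel axis is dims[1]

int64_t GroupOffset(const Dims4& d, int64_t groups) {
  const int64_t numel = d.n * d.c * d.h * d.w;
  return numel / d.n / groups;  // == (c / groups) * h * w
}
// Group g of a sample then starts at base_ptr + g * GroupOffset(d, groups),
// just like input_offset and output_grad_offset above.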
- // Gradient with respect to the filter - for (int g = 0; g < groups; g++) { -#ifdef PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args2.odesc.desc(), - input_data + input_offset * g, args2.idesc.desc(), - output_grad_data + output_grad_offset * g, args2.cdesc.desc(), - filter_algo, &beta, args2.wdesc.desc(), - filter_grad_data + filter_offset * g, cudnn_workspace, - workspace_size)); - }; -#else // PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, args2.idesc.desc(), - output_grad_data + output_grad_offset * g, args2.odesc.desc(), - input_data + input_offset * g, args2.cdesc.desc(), - filter_algo, cudnn_workspace, workspace_size, &beta, - args2.wdesc.desc(), filter_grad_data + filter_offset * g)); - }; -#endif // PADDLE_WITH_HIP - workspace_handle.RunFunc(cudnn_func, workspace_size); - } - } - } -}; - -/* - * Inputs: I, W, dO, ddI, ddW - * Outputs: ddO, dW, dI - * ddo = conv_bp_data(W, ddI) + conv_bp_data(ddW, I) - * dW = conv_bp_filter(dO, ddI) - * dI = conv(dO, ddW) - */ -template -class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto X = ctx.Input("Input"); - auto W = ctx.Input("Filter"); - auto dO = ctx.Input("DOutput"); - auto ddX = ctx.Input("DDInput"); - auto ddW = ctx.Input("DDFilter"); - - auto ddO = ctx.Output("DDOutput"); - auto dW = ctx.Output("DFilter"); - auto dX = ctx.Output("DInput"); - - if (ddO) { - ddO->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, ddO, static_cast(0)); - } - if (dW) { - dW->mutable_data(ctx.GetPlace()); - } - if (dX) { - dX->mutable_data(ctx.GetPlace()); - } - - const T* dy = dO->data(); - const T* w = W->data(); - - const T* ddx = nullptr; - const T* ddw = nullptr; - T *dw, *dx, *ddy; - dw = dx = ddy = nullptr; - T* transformed_dx = nullptr; - const std::vector& strides = ctx.Attr>("strides"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - - bool deterministic = FLAGS_cudnn_deterministic; - - std::vector paddings = ctx.Attr>("paddings"); - - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensors to channel first----------- - Tensor transformed_X_channel(X->type()); - Tensor transformed_dO_channel(dO->type()); - Tensor transformed_ddX_channel(X->type()); - - Tensor transformed_ddO_channel(dO->type()); - Tensor transformed_dX_channel(X->type()); - - if (channel_last) { - ResizeToChannelFirst( - ctx, X, &transformed_X_channel); - TransToChannelFirst( - ctx, X, &transformed_X_channel); - - ResizeToChannelFirst( - ctx, dO, &transformed_dO_channel); - TransToChannelFirst( - ctx, dO, &transformed_dO_channel); - - if (ddX) { - ResizeToChannelFirst( - ctx, ddX, &transformed_ddX_channel); - TransToChannelFirst( - ctx, ddX, &transformed_ddX_channel); - } - - if (ddO) { - ResizeToChannelFirst( - ctx, ddO, &transformed_ddO_channel); - } - if (dX) { - ResizeToChannelFirst( - ctx, dX, 
&transformed_dX_channel); - transformed_dX_channel.mutable_data(ctx.GetPlace()); - } - - } else { - transformed_X_channel = *X; - transformed_dO_channel = *dO; - if (ddX) { - transformed_ddX_channel = *ddX; - } - if (dX) { - transformed_dX_channel = *dX; - } - } - std::vector output_vec = - phi::vectorize(transformed_dO_channel.dims()); - - auto in_dims = transformed_X_channel.dims(); - auto filter_dims = W->dims(); - framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); - Tensor transformed_X(X->type()); - Tensor transformed_ddX(X->type()); - - Tensor transformed_dO(dO->type()); - - std::vector padding_common(data_dim, 0); - std::vector input_pad(X->dims().size() * 2, 0); - - if (!is_sys_pad) { - // get pad - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - std::vector new_output_grad_shape_vec(data_dim + 2); - - new_input_shape_vec[0] = transformed_X_channel.dims()[0]; - new_input_shape_vec[1] = transformed_X_channel.dims()[1]; - - new_output_grad_shape_vec[0] = transformed_dO_channel.dims()[0]; - new_output_grad_shape_vec[1] = transformed_dO_channel.dims()[1]; - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - new_input_shape_vec[i + 2] = - transformed_X_channel.dims()[i + 2] + padding_diff[i]; - - new_output_grad_shape_vec[i + 2] = - transformed_dO_channel.dims()[i + 2] + padding_diff[i]; - - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_X.Resize(new_input_shape); - transformed_ddX.Resize(new_input_shape); - - framework::DDim new_output_grad_shape( - phi::make_ddim(new_output_grad_shape_vec)); - transformed_dO.Resize(new_output_grad_shape); - - transformed_dO = - ctx.AllocateTmpTensor( - new_output_grad_shape, dev_ctx); - - transformed_X = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - if (ddX) { - transformed_ddX = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - } - - // pad for input - const int rank = X->dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_X_channel, pad_value, - &transformed_X); - if (dO) { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_dO_channel, pad_value, - &transformed_dO); - } - - if (ddX) { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_ddX_channel, pad_value, - &transformed_ddX); - } - } break; - case 5: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_X_channel, pad_value, - &transformed_X); - if (ddX) { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_ddX_channel, pad_value, - &transformed_ddX); - } - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - - } else { - transformed_X = transformed_X_channel; - transformed_dO = transformed_dO_channel; - if (ddX) { - transformed_ddX = transformed_ddX_channel; - } - - 
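// Both the grad and double-grad paths above rewrite asymmetric padding the
// same way: cuDNN/MIOpen accept a single padding value per spatial dim, so
// the kernel keeps min(before, after) as the "common" padding and
// materialises the remainder by explicitly zero-padding the tensor via
// PadFunction. A hypothetical standalone version of that split (not the
// Paddle helper):
#include <algorithm>
#include <cstddef>
#include <vector>

// paddings = {before_0, after_0, before_1, after_1, ...}
void SplitPadding(const std::vector<int>& paddings, std::vector<int>* common,
                  std::vector<int>* extra) {
  const size_t data_dim = paddings.size() / 2;
  common->assign(data_dim, 0);
  extra->assign(2 * data_dim, 0);
  for (size_t i = 0; i < data_dim; ++i) {
    (*common)[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
    (*extra)[2 * i] = paddings[2 * i] - (*common)[i];          // pad before
    (*extra)[2 * i + 1] = paddings[2 * i + 1] - (*common)[i];  // pad after
  }
}
// If the padding is already symmetric, extra is all zeros and the tensor is
// used as-is -- that is the is_sys_pad fast path above.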
if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - std::vector starts(data_dim, 0); - std::vector ends(data_dim, 0); - std::vector axes(data_dim, 0); - for (size_t i = 0; i < data_dim; ++i) { - starts[i] = input_pad[2 * i + 4] * (strides[i] + 1); - ends[i] = starts[i] + output_vec[i + 2]; - axes[i] = i + 2; - } - - std::vector transformed_output_vec = output_vec; - for (size_t i = 0; i < data_dim; ++i) { - transformed_output_vec[i + 2] = - output_vec[i + 2] + - (input_pad[2 * i + 4] + input_pad[2 * i + 5]) * strides[i] - - 2 * padding_common[i] + paddings[2 * i] + paddings[2 * i + 1]; - } - - if (!is_sys_pad) { - DDim transformed_output_shape(phi::make_ddim(transformed_output_vec)); - transformed_ddO_channel.mutable_data(transformed_output_shape, - ctx.GetPlace()); - } else { - ddO->mutable_data(ctx.GetPlace()); - transformed_ddO_channel = *ddO; - transformed_ddO_channel.Resize(phi::make_ddim(transformed_output_vec)); - } - - const T* x = transformed_X.data(); - - int iwo_group = groups; - int c_group = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_group = 1; - c_group = groups; - groups = 1; -#endif - auto dtype = platform::CudnnDataType::type; - - auto handle = dev_ctx.cudnn_handle(); - - ConvArgs args1{&transformed_ddO_channel, - W, - &transformed_ddX, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args2{&transformed_ddO_channel, ddW, &transformed_X, strides, - padding_common, dilations, dtype}; - - ConvArgs args3{&transformed_dO, - dW, - &transformed_ddX_channel, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args4{ - &transformed_dO, ddW, &transformed_dX_channel, strides, padding_common, - dilations, dtype}; -#ifdef PADDLE_WITH_HIP - miopenConvBwdDataAlgorithm_t bwd_algo1 = - static_cast(0); - miopenConvBwdDataAlgorithm_t bwd_algo2 = - static_cast(0); - miopenConvFwdAlgorithm_t data_algo = - static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); -#else - cudnnConvolutionBwdDataAlgo_t bwd_algo1 = - static_cast(0); - cudnnConvolutionBwdDataAlgo_t bwd_algo2 = - static_cast(0); - cudnnConvolutionFwdAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); -#endif - - auto layout = GetCudnnTensorFormat(platform::DataLayout::kNCHW); - - // ddo = conv(ddI, W) + conv(I, ddW) - size_t workspace_size = 0; - - T* transformed_ddy_channel = nullptr; - - if (ddO) { - ddy = ddO->data(); - transformed_ddy_channel = transformed_ddO_channel.data(); - if (ddX) { - args1.handle = handle; - args1.idesc.set(transformed_ddO_channel, iwo_group); - args1.wdesc.set(*W, layout, iwo_group); - args1.odesc.set(transformed_ddX, iwo_group); - args1.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); -#ifdef PADDLE_WITH_HIP - using search1 = SearchAlgorithm; - workspace_size = search1::GetWorkspaceSize(args1); - bwd_algo1 = search1::Find( - args1, false, deterministic, workspace_size, - ctx.template device_context()); -#else - using search1 = SearchAlgorithm; - bwd_algo1 = search1::Find( - args1, false, deterministic, - ctx.template device_context()); - workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1); -#endif - } - - if (ddW) { - ddw = ddW->data(); - args2.handle = handle; - args2.idesc.set(transformed_ddO_channel, iwo_group); - args2.wdesc.set(*ddW, layout, iwo_group); 
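// The args1/args2 pair above implements the identity from the double-grad
// comment: ddO = convT(ddX, W) + convT(X, ddW). Transposed convolution is
// bilinear in (input, filter), so this is exactly the first-order term of
// perturbing both arguments. A toy 1-D check of that identity (illustrative
// code only, not part of the patch):
#include <cassert>
#include <cmath>
#include <vector>

std::vector<double> ConvT1D(const std::vector<double>& x,
                            const std::vector<double>& w) {
  std::vector<double> y(x.size() + w.size() - 1, 0.0);  // stride 1, no pad
  for (size_t i = 0; i < x.size(); ++i)
    for (size_t k = 0; k < w.size(); ++k) y[i + k] += x[i] * w[k];
  return y;
}

int main() {
  std::vector<double> x{1, 2, 3}, w{1, -1}, ddx{0.5, 0, 0.5}, ddw{0.25, 0.25};
  std::vector<double> xp = x, wp = w;
  for (size_t i = 0; i < x.size(); ++i) xp[i] += ddx[i];
  for (size_t k = 0; k < w.size(); ++k) wp[k] += ddw[k];
  auto full = ConvT1D(xp, wp), base = ConvT1D(x, w);
  auto t1 = ConvT1D(ddx, w), t2 = ConvT1D(x, ddw), t3 = ConvT1D(ddx, ddw);
  for (size_t i = 0; i < full.size(); ++i)  // exact, since convT is bilinear
    assert(std::fabs(full[i] - (base[i] + t1[i] + t2[i] + t3[i])) < 1e-12);
  return 0;
}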
- args2.odesc.set(transformed_X, iwo_group); - args2.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); -#ifdef PADDLE_WITH_HIP - using search2 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2)); - bwd_algo2 = search2::Find( - args2, false, deterministic, workspace_size, - ctx.template device_context()); -#else - using search2 = SearchAlgorithm; - bwd_algo2 = search2::Find( - args2, false, deterministic, - ctx.template device_context()); - workspace_size = std::max(workspace_size, - search2::GetWorkspaceSize(args2, bwd_algo2)); -#endif - } - } - - if (dW && ddX) { - dw = dW->data(); - args3.handle = handle; - args3.idesc.set(transformed_dO, iwo_group); - args3.wdesc.set(*dW, layout, iwo_group); - - args3.odesc.set(transformed_ddX_channel, iwo_group); - - args3.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); -#ifdef PADDLE_WITH_HIP - using search3 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search3::GetWorkspaceSize(args3)); - filter_algo = search3::Find( - args3, false, deterministic, workspace_size, - ctx.template device_context()); -#else - using search3 = SearchAlgorithm; - filter_algo = search3::Find( - args3, false, deterministic, - ctx.template device_context()); - workspace_size = std::max(workspace_size, - search3::GetWorkspaceSize(args3, filter_algo)); -#endif - } - - if (ddW && dX) { - transformed_dx = transformed_dX_channel.data(); - - args4.handle = handle; - args4.idesc.set(transformed_dO, iwo_group); - args4.wdesc.set(*ddW, layout, iwo_group); - args4.odesc.set(transformed_dX_channel, iwo_group); - args4.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); -#ifdef PADDLE_WITH_HIP - using search4 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_algo = search4::Find( - args4, false, deterministic, workspace_size, - ctx.template device_context()); -#else - using search4 = SearchAlgorithm; - data_algo = search4::Find( - args4, false, deterministic, - ctx.template device_context()); - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); -#endif - } - - int i_n, i_c, i_d, i_h, i_w; - GetNCDHW(transformed_X.dims(), platform::DataLayout::kNCHW, &i_n, &i_c, - &i_d, &i_h, &i_w); - - int o_n, o_c, o_d, o_h, o_w; - GetNCDHW(transformed_dO.dims(), platform::DataLayout::kNCHW, &o_n, &o_c, - &o_d, &o_h, &o_w); - - int group_offset_in = - transformed_X.numel() / transformed_X.dims()[0] / groups; - int group_offset_out = - transformed_dO.numel() / transformed_dO.dims()[0] / groups; - int group_offset_filter = W->numel() / groups; - - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - - auto wkspace_handle = dev_ctx.cudnn_workspace_handle(); - - if (ddO) { - if (ddX) { - ddx = transformed_ddX.data(); - for (int i = 0; i < groups; i++) { -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args1.odesc.desc(), - ddx + i * group_offset_in, args1.wdesc.desc(), - w + i * group_offset_filter, args1.cdesc.desc(), - bwd_algo1, &beta, args1.idesc.desc(), - transformed_ddy_channel + i * group_offset_out, - workspace_ptr, workspace_size)); - }, - workspace_size); -#else // PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - 
platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, args1.wdesc.desc(), - w + i * group_offset_filter, args1.odesc.desc(), - ddx + i * group_offset_in, args1.cdesc.desc(), - bwd_algo1, workspace_ptr, workspace_size, &beta, - args1.idesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); -#endif // PADDLE_WITH_HIP - } - } - if (ddW) { - for (int i = 0; i < groups; i++) { -#ifdef PADDLE_WITH_HIP - // MIOPEN ONLY support beta to be 0.0f - Tensor conv_x_ddw(dO->type()); - conv_x_ddw.Resize(transformed_ddO_channel.dims()); - T* conv_x_ddw_data = conv_x_ddw.mutable_data(ctx.GetPlace()); - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args2.odesc.desc(), - x + i * group_offset_in, args2.wdesc.desc(), - ddw + i * group_offset_filter, args2.cdesc.desc(), - bwd_algo2, &beta, args2.idesc.desc(), - conv_x_ddw_data + i * group_offset_out, workspace_ptr, - workspace_size)); - }, - workspace_size); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor( - handle, miopenTensorOpAdd, &alpha, args2.idesc.desc(), - transformed_ddy_channel + i * group_offset_out, &alpha, - args2.idesc.desc(), conv_x_ddw_data + i * group_offset_out, &beta, - args2.idesc.desc(), - transformed_ddy_channel + i * group_offset_out)); -#else // PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, args2.wdesc.desc(), - ddw + i * group_offset_filter, args2.odesc.desc(), - x + i * group_offset_in, args2.cdesc.desc(), bwd_algo2, - workspace_ptr, workspace_size, &alpha, - args2.idesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); -#endif // PADDLE_WITH_HIP - } - } - if ((!is_sys_pad) && (!channel_last)) { - if (strides.size() == 2U) { - Slice( - ctx, &transformed_ddO_channel, ddO, starts, ends, axes); - } else if (!is_sys_pad && strides.size() == 3U) { - Slice( - ctx, &transformed_ddO_channel, ddO, starts, ends, axes); - } - } else if ((!is_sys_pad) && (channel_last)) { - if (strides.size() == 2U) { - Slice( - ctx, &transformed_ddO_channel, &transformed_ddO_channel, starts, - ends, axes); - } else if (!is_sys_pad && strides.size() == 3U) { - Slice( - ctx, &transformed_ddO_channel, &transformed_ddO_channel, starts, - ends, axes); - } - - TransToChannelLast( - ctx, &transformed_ddO_channel, ddO); - } - } - - T* transformed_dy_channel = transformed_dO.data(); - if (dW && ddX) { - ddx = transformed_ddX_channel.data(); - for (int i = 0; i < groups; i++) { -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args3.odesc.desc(), - ddx + i * group_offset_in, args3.idesc.desc(), - transformed_dy_channel + i * group_offset_out, - args3.cdesc.desc(), filter_algo, &beta, - args3.wdesc.desc(), dw + i * group_offset_filter, - workspace_ptr, workspace_size)); - }, - workspace_size); -#else // PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, args3.idesc.desc(), - transformed_dy_channel + i * group_offset_out, - args3.odesc.desc(), ddx + i * group_offset_in, - args3.cdesc.desc(), filter_algo, workspace_ptr, - workspace_size, &beta, args3.wdesc.desc(), - dw + i * group_offset_filter)); - }, - 
workspace_size); -#endif // PADDLE_WITH_HIP - } - } - - if (dX && ddW) { - ddw = ddW->data(); - for (int i = 0; i < groups; i++) { -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args4.idesc.desc(), - transformed_dy_channel + i * group_offset_out, - args4.wdesc.desc(), ddw + i * group_offset_filter, - args4.cdesc.desc(), data_algo, &beta, args4.odesc.desc(), - transformed_dx + i * group_offset_in, workspace_ptr, - workspace_size)); - }, - workspace_size); -#else // PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args4.idesc.desc(), - transformed_dy_channel + i * group_offset_out, - args4.wdesc.desc(), ddw + i * group_offset_filter, - args4.cdesc.desc(), data_algo, workspace_ptr, - workspace_size, &beta, args4.odesc.desc(), - transformed_dx + i * group_offset_in)); - }, - workspace_size); -#endif // PADDLE_WITH_HIP - } - if (channel_last) { - TransToChannelLast( - ctx, &transformed_dX_channel, dX); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_KERNEL(conv2d_transpose, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeOpKernel, - ops::CUDNNConvTransposeOpKernel); -REGISTER_OP_KERNEL(conv2d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeGradOpKernel, - ops::CUDNNConvTransposeGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_transpose_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvTransposeDoubleGradOpKernel, - paddle::operators::CUDNNConvTransposeDoubleGradOpKernel); - -REGISTER_OP_KERNEL(conv3d_transpose, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeOpKernel, - ops::CUDNNConvTransposeOpKernel); -REGISTER_OP_KERNEL(conv3d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeGradOpKernel, - ops::CUDNNConvTransposeGradOpKernel); -#else -REGISTER_OP_KERNEL(conv2d_transpose, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeOpKernel, - ops::CUDNNConvTransposeOpKernel, - ops::CUDNNConvTransposeOpKernel); -REGISTER_OP_KERNEL(conv2d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeGradOpKernel, - ops::CUDNNConvTransposeGradOpKernel, - ops::CUDNNConvTransposeGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_transpose_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvTransposeDoubleGradOpKernel, - paddle::operators::CUDNNConvTransposeDoubleGradOpKernel, - paddle::operators::CUDNNConvTransposeDoubleGradOpKernel); - -REGISTER_OP_KERNEL(conv3d_transpose, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeOpKernel, - ops::CUDNNConvTransposeOpKernel, - ops::CUDNNConvTransposeOpKernel); -REGISTER_OP_KERNEL(conv3d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeGradOpKernel, - ops::CUDNNConvTransposeGradOpKernel, - ops::CUDNNConvTransposeGradOpKernel); -#endif diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 86532664985b4..fe76fc3aebbc1 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -13,13 +13,17 @@ See the License for the specific language governing permissions and limitations under the 
License. */ #include "paddle/fluid/operators/conv_transpose_op.h" -#include <memory> + #include <string> #include <vector> #include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h" - +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -29,165 +33,6 @@ namespace operators { using DataLayout = framework::DataLayout; -void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "ConvTranspose"); - OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter", "ConvTranspose"); - OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "ConvTranspose"); - - auto in_dims = ctx->GetInputDim("Input"); - auto filter_dims = ctx->GetInputDim("Filter"); - std::vector<int> output_size = - ctx->Attrs().Get<std::vector<int>>("output_size"); - std::vector<int> output_padding = - ctx->Attrs().Get<std::vector<int>>("output_padding"); - std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides"); - std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings"); - std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations"); - int groups = ctx->Attrs().Get<int>("groups"); - std::string padding_algorithm = - ctx->Attrs().Get<std::string>("padding_algorithm"); - const std::string data_layout_str = - ctx->Attrs().Get<std::string>("data_format"); - const DataLayout data_layout = - ctx->IsRunMKLDNNKernel() ? DataLayout::kNCHW - : framework::StringToDataLayout(data_layout_str); - - PADDLE_ENFORCE_EQ(in_dims.size() == 4 || in_dims.size() == 5, true, - platform::errors::InvalidArgument( - "Input of Op(conv_transpose) should be 4-D or " - "5-D Tensor. But received: %u-D Tensor, " - "the shape of input is [%s]", - in_dims.size(), in_dims)); - PADDLE_ENFORCE_EQ( - in_dims.size(), filter_dims.size(), - platform::errors::InvalidArgument( - "The input's dimension size and filter's dimension size of " - "Op (conv_transpose) should be equal. But received: the shape of " - "input is [%s], the dimension size of input is [%d], the shape " - "of filter is [%s], the dimension size of filter is [%d]. ", - in_dims, in_dims.size(), filter_dims, filter_dims.size())); - - int stride_size = strides.size(); - for (int i = 0; i < stride_size; ++i) { - PADDLE_ENFORCE_GT( - strides[i], 0, - platform::errors::InvalidArgument( - "The stride of Op(Conv) should be larger than 0, but received " - "stride is %d.", - strides[i])); - } - - int in_sub_stride_size = in_dims.size() - stride_size; - - PADDLE_ENFORCE_EQ( - in_dims.size() - strides.size(), 2U, - platform::errors::InvalidArgument( - "The input's dimension size minus Attr(stride)'s size must " - "be equal to 2 for Op(conv_transpose). 
But received: [%d], the " - "input's dimension size is [%d], the shape of input " - "is [%s], the Attr(stride)'s size is [%d].", - in_sub_stride_size, in_dims.size(), in_dims, strides.size())); - if (output_size.size()) - PADDLE_ENFORCE_EQ( - output_size.size(), strides.size(), - platform::errors::InvalidArgument( - "The Attr(output_size) and Attr(stride) of Op(conv_transpose) " - "should be the same.")); - if (output_padding.size()) - PADDLE_ENFORCE_EQ( - output_padding.size(), strides.size(), - platform::errors::InvalidArgument( - "The Attr(output_padding) and Attr(stride) of Op(conv_transpose) " - "should be the same.")); - - const int64_t C = - (data_layout != DataLayout::kNHWC ? in_dims[1] - : in_dims[in_dims.size() - 1]); - PADDLE_ENFORCE_EQ( - C, filter_dims[0], - platform::errors::InvalidArgument( - "The number of input channels should be equal to filter channels " - "for Op(conv_transpose). But received: the input's channels is " - "[%d], the shape of input is [%s], the filter's channels is [%d], " - "the shape of filter is [%s]. The data_format is %s." - "The error may come from wrong data_format setting.", - C, in_dims, filter_dims[0], filter_dims, data_layout_str)); - - framework::DDim in_data_dims; - if (data_layout != DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector<int> ksize = phi::vectorize<int>(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - std::vector<int64_t> output_shape({in_dims[0]}); - if (data_layout != DataLayout::kNHWC) { - output_shape.push_back(filter_dims[1] * groups); - } - const int offset = (data_layout != DataLayout::kNHWC ? 2 : 1); - for (size_t i = 0; i < strides.size(); ++i) { - auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; - auto infer_shape = (ctx->IsRuntime() || in_dims[i + offset] > 0) - ? (in_dims[i + offset] - 1) * strides[i] - - paddings[2 * i] - paddings[2 * i + 1] + - filter_extent - : -1; - if (output_size.size()) { - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_GE( - output_size[i], infer_shape, - platform::errors::InvalidArgument( - "output_size of Op(ConvTransposeOp) should not be " - "less than the inferred output size. But received output_size = " - "[%s], whose dim %d is less than the inferred output size [%s]", - phi::make_ddim(output_size).to_str(), i, infer_shape)); - PADDLE_ENFORCE_LT( - output_size[i], infer_shape + strides[i], - platform::errors::InvalidArgument( - "output_size of Op(ConvTransposeOp) should be less " - "than inferred size + stride. But received output_size = [%s], " - "whose dim %d is not less than the inferred output size (%d) + " - "stride (%d) = %d", - phi::make_ddim(output_size).to_str(), i, infer_shape, - strides[i], infer_shape + strides[i])); - } - output_shape.push_back(output_size[i]); - } else if (output_padding.size()) { - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_GE( - output_padding[i], 0, - platform::errors::InvalidArgument( - "output_padding of Op(ConvTransposeOp) should not be " - "less than 0. 
But received output_padding = " - "[%s], whose dim %d is less than 0", - phi::make_ddim(output_padding).to_str(), i)); - PADDLE_ENFORCE_LT( - output_padding[i], std::max(strides[i], dilations[i]), - platform::errors::InvalidArgument( - "output_padding of Op(ConvTransposeOp) should be less " - "than either stride or dilation. But received output_size = " - "[%s], " - "whose dim %d is not less than either stride (%d) or " - "dilation (%d)", - phi::make_ddim(output_size).to_str(), i, strides[i], - dilations[i])); - } - output_shape.push_back((infer_shape + output_padding[i])); - } else { - output_shape.push_back(infer_shape); - } - } - if (data_layout == DataLayout::kNHWC) { - output_shape.push_back(filter_dims[1] * groups); - } - ctx->SetOutputDim("Output", phi::make_ddim(output_shape)); -} - framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; @@ -217,7 +62,7 @@ framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( } framework::OpKernelType ConvTransposeOp::GetKernelTypeForVar( - const std::string& var_name, const Tensor& tensor, + const std::string& var_name, const framework::Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const { #ifdef PADDLE_WITH_MKLDNN // Only input require reshaping, weights and @@ -493,17 +338,6 @@ The input(X) size and output(Out) size may be different. )DOC"); } -void ConvTransposeOpGrad::InferShape(framework::InferShapeContext* ctx) const { - auto in_dims = ctx->GetInputDim("Input"); - auto filter_dims = ctx->GetInputDim("Filter"); - if (ctx->HasOutput(framework::GradVarName("Input"))) { - ctx->SetOutputDim(framework::GradVarName("Input"), in_dims); - } - if (ctx->HasOutput(framework::GradVarName("Filter"))) { - ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims); - } -} - framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { bool use_cudnn = @@ -587,24 +421,6 @@ class ConvTransposeDoubleGradMaker : public framework::SingleGradOpMaker { } }; -void ConvTransposeOpDoubleGrad::InferShape( - framework::InferShapeContext* ctx) const { - auto x_dims = ctx->GetInputDim("Input"); - auto w_dims = ctx->GetInputDim("Filter"); - auto do_dims = ctx->GetInputDim("DOutput"); - - if (ctx->HasOutput("DDOutput") && - (ctx->HasInput("DDInput") || (ctx->HasInput("DDFilter")))) { - ctx->SetOutputDim("DDOutput", do_dims); - } - if (ctx->HasOutput("DFilter") && ctx->HasInput("DDInput")) { - ctx->SetOutputDim("DFilter", w_dims); - } - if (ctx->HasOutput("DInput") && ctx->HasInput("DDFilter")) { - ctx->SetOutputDim("DInput", x_dims); - } -} - framework::OpKernelType ConvTransposeOpDoubleGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { bool use_cudnn = @@ -635,59 +451,57 @@ framework::OpKernelType ConvTransposeOpDoubleGrad::GetExpectedKernelType( namespace ops = paddle::operators; // conv2d_transpose +DECLARE_INFER_SHAPE_FUNCTOR(conv2d_transpose, Conv2dTranposeInferShapeFunctor, + PD_INFER_META(phi::ConvTransposeInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(conv2d_transpose_grad, + Conv2dTranposeGradInferShapeFunctor, + PD_INFER_META(phi::ConvTransposeGradInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR( + conv2d_transpose_grad_grad, Conv2dTranposeDoubleGradInferShapeFunctor, + PD_INFER_META(phi::Conv2dTransposeDoubleGradInferMeta)); + REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, 
ops::ConvTransposeGradOpMaker, - ops::ConvTransposeGradOpMaker); -REGISTER_OPERATOR( - conv2d_transpose_grad, ops::ConvTransposeOpGrad, - ops::ConvTransposeDoubleGradMaker, - ops::ConvTransposeDoubleGradMaker); -REGISTER_OPERATOR(conv2d_transpose_grad_grad, ops::ConvTransposeOpDoubleGrad); - -REGISTER_OP_CPU_KERNEL( - conv2d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CPU_KERNEL( - conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::ConvTransposeGradOpMaker, + Conv2dTranposeInferShapeFunctor); +REGISTER_OPERATOR(conv2d_transpose_grad, ops::ConvTransposeOpGrad, + ops::ConvTransposeDoubleGradMaker, + ops::ConvTransposeDoubleGradMaker, + Conv2dTranposeGradInferShapeFunctor); +REGISTER_OPERATOR(conv2d_transpose_grad_grad, ops::ConvTransposeOpDoubleGrad, + Conv2dTranposeDoubleGradInferShapeFunctor); // conv3d_transpose +DECLARE_INFER_SHAPE_FUNCTOR(conv3d_transpose, Conv3dTranposeInferShapeFunctor, + PD_INFER_META(phi::ConvTransposeInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(conv3d_transpose_grad, + Conv3dTranposeGradInferShapeFunctor, + PD_INFER_META(phi::ConvTransposeGradInferMeta)); + REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker, ops::ConvTransposeGradOpMaker, - ops::ConvTransposeGradOpMaker); -REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad); - -REGISTER_OP_CPU_KERNEL( - conv3d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CPU_KERNEL( - conv3d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::ConvTransposeGradOpMaker, + Conv3dTranposeInferShapeFunctor); +REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad, + Conv3dTranposeGradInferShapeFunctor); // depthwise conv2d_transpose +DECLARE_INFER_SHAPE_FUNCTOR(depthwise_conv2d_transpose, + DepthWiseConv2dTranposeInferShapeFunctor, + PD_INFER_META(phi::ConvTransposeInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(depthwise_conv2d_transpose_grad, + DepthWiseConv2dTranposeGradInferShapeFunctor, + PD_INFER_META(phi::ConvTransposeGradInferMeta)); + REGISTER_OPERATOR(depthwise_conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, ops::ConvTransposeGradOpMaker, - ops::ConvTransposeGradOpMaker); -REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad); - -REGISTER_OP_CPU_KERNEL( - depthwise_conv2d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CPU_KERNEL( - depthwise_conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::ConvTransposeGradOpMaker, + DepthWiseConv2dTranposeInferShapeFunctor); +REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad, + DepthWiseConv2dTranposeGradInferShapeFunctor); REGISTER_OP_VERSION(conv_transpose) .AddCheckpoint( diff --git a/paddle/fluid/operators/conv_transpose_op.cu b/paddle/fluid/operators/conv_transpose_op.cu deleted file mode 100644 index 054cb4b33895b..0000000000000 --- a/paddle/fluid/operators/conv_transpose_op.cu +++ /dev/null @@ -1,185 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/conv_transpose_op.h" -#include "paddle/phi/kernels/gpu/depthwise_conv.h" - -namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; -using DDim = framework::DDim; - -template <typename DeviceContext, typename T> -class DepthwiseConvTransposeKernel : public framework::OpKernel<T> { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr<std::string>("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input<Tensor>("Input"); - Tensor filter = *context.Input<Tensor>("Filter"); - Tensor* output = context.Output<Tensor>("Output"); - output->mutable_data<T>(context.GetPlace()); - - int groups = context.Attr<int>("groups"); - PADDLE_ENFORCE_EQ( - groups, filter.dims()[0], - platform::errors::InvalidArgument( - "groups should be equal to the 1st dimension of filter. But " - "received groups is %d and filter dimension[0] is %d", - groups, filter.dims()[0])); - - std::vector<int> strides = context.Attr<std::vector<int>>("strides"); - std::vector<int> paddings = context.Attr<std::vector<int>>("paddings"); - std::vector<int> dilations = context.Attr<std::vector<int>>("dilations"); - std::string padding_algorithm = - context.Attr<std::string>("padding_algorithm"); - for (auto v : dilations) { - PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument( - "dilations should be 1 in depthwise conv. 
" - "But received dilations is %d", - v)); - } - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - output->mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, output, static_cast(0)); - - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - depthwiseConvInputGrad( - static_cast::TYPE&>(dev_ctx), - *output, filter, *input, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, output, data_layout); - } -}; - -template -class DepthwiseConvTransposeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - Tensor filter = *context.Input("Filter"); - - if (!input_grad && !filter_grad) return; - - auto& dev_ctx = context.template device_context(); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - if (input_grad) { - math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv( - static_cast::TYPE&>(dev_ctx), - *output_grad, filter, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, input_grad, data_layout); - } - - if (filter_grad) { - phi::funcs::SetConstant set_zero; - filter_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, filter_grad, static_cast(0)); - - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - depthwiseConvFilterGrad( - static_cast::TYPE&>(dev_ctx), - *output_grad, *input, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, filter_grad, data_layout); - } - } -}; - -} // namespace operators -} // namespace paddle -// conv2d -REGISTER_OP_CUDA_KERNEL(conv2d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CUDA_KERNEL(conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel, - 
ops::GemmConvTransposeGradKernel); -REGISTER_OP_CUDA_KERNEL(conv2d_transpose_grad_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); - -// conv3d -REGISTER_OP_CUDA_KERNEL(conv3d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CUDA_KERNEL(conv3d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); - -// depthwise conv2d -REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose, - ops::DepthwiseConvTransposeKernel, - ops::DepthwiseConvTransposeKernel); -REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose_grad, - ops::DepthwiseConvTransposeGradKernel, - ops::DepthwiseConvTransposeGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index ee0fb7ab36833..ac95dceb8280c 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -13,72 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/conv_op.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/operators/math/vol2col.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" + +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using DDim = framework::DDim; - -template -static void Slice(const framework::ExecutionContext& context, - const Tensor* input, Tensor* out, - const std::vector& begin_vec, - const std::vector& end_vec, - const std::vector& axes_vec) { - auto& place = - *context.template device_context().eigen_device(); - auto in_dims = input->dims(); - auto offsets = Eigen::DSizes(); - auto extents = Eigen::DSizes(); - for (size_t i = 0; i < D; ++i) { - offsets[i] = 0; - extents[i] = in_dims[i]; - } - - std::vector out_shape_vec = phi::vectorize(in_dims); - for (size_t i = 0; i < axes_vec.size(); ++i) { - offsets[axes_vec[i]] = begin_vec[i]; - extents[axes_vec[i]] = end_vec[i] - begin_vec[i]; - out_shape_vec[axes_vec[i]] = end_vec[i] - begin_vec[i]; - } - - framework::DDim out_dims(phi::make_ddim(out_shape_vec)); - out->mutable_data(out_dims, context.GetPlace()); - - auto in_t = - framework::EigenTensor::From( - *input); - auto out_t = - framework::EigenTensor::From( - *out, out_dims); - - EigenSlice, T, D>::Eval(place, out_t, in_t, - offsets, extents); - out->Resize(out_dims); -} - -template -static void Slice(const framework::ExecutionContext& context, - const Tensor* input, Tensor* out, int64_t begin_idx, - int64_t end_idx, int64_t axes) { - std::vector begin_vec = {begin_idx}; - std::vector end_vec = {end_idx}; - std::vector axes_vec = {axes}; - Slice(context, input, out, begin_vec, end_vec, axes_vec); -} - // Define Op classes in .h file so that other conv transpose // operator implementations can reuse the code. 
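// The Slice<DeviceContext, T, D> helper deleted above is an Eigen
// offsets/extents copy: zero offsets and full extents everywhere except the
// sliced axes. Its contract, reduced to a CPU-only 2-D sketch for reference
// (a hypothetical helper, not the Eigen implementation):
#include <cstddef>
#include <vector>

// Copy indices [begin, end) along `axis` of a contiguous row-major buffer.
std::vector<float> Slice2D(const std::vector<float>& in, int rows, int cols,
                           int axis, int begin, int end) {
  const int out_rows = (axis == 0) ? end - begin : rows;
  const int out_cols = (axis == 1) ? end - begin : cols;
  std::vector<float> out(static_cast<size_t>(out_rows) * out_cols);
  for (int r = 0; r < out_rows; ++r)
    for (int c = 0; c < out_cols; ++c) {
      const int sr = (axis == 0) ? r + begin : r;
      const int sc = (axis == 1) ? c + begin : c;
      out[static_cast<size_t>(r) * out_cols + c] =
          in[static_cast<size_t>(sr) * cols + sc];
    }
  return out;
}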
class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { @@ -94,21 +36,19 @@ class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { class ConvTransposeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override; framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const Tensor& tensor, + const std::string& var_name, const framework::Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const override; }; class ConvTransposeOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; protected: framework::OpKernelType GetExpectedKernelType( @@ -118,464 +58,11 @@ class ConvTransposeOpGrad : public framework::OperatorWithKernel { class ConvTransposeOpDoubleGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override; }; -template -class GemmConvTransposeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input("Input"); - // The filter will be reshaped, so it should not be constant pointer - Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - int groups = context.Attr("groups"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - auto out_dims = output->dims(); - const int batch_size = static_cast(input->dims()[0]); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - // input_shape_vec: {n, c, h, w} or {n, c, d, h, w} for channel_first - // input_shape_vec: {n, h, w, c} or {n, d, h, w, c} for channel_last - std::vector input_shape_vec = phi::vectorize(input->dims()); - // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} - std::vector filter_shape_vec = phi::vectorize(filter.dims()); - - // use col_shape in the im2col and col2im (or vol2col and col2vol) - // calculation - // col_shape_vec: {o_c/g, k_h, k_w, h, w} or {o_c/g, k_d, k_h, k_w, d, h, w} - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - if (data_layout != framework::DataLayout::kNHWC) { - col_shape_vec[0] = out_dims[1] / groups; - for (size_t j = 0; j < 
data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2]; - } - } else { - col_shape_vec[0] = out_dims[out_dims.size() - 1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 1]; - } - } - DDim col_shape(phi::make_ddim(col_shape_vec)); - - // use col_matrix_shape in the gemm calculation - // size: (o_c/g * k_h * k_w, h * w) or (o_c/g * k_d * k_h * k_w, d * h * w) - DDim col_matrix_shape = phi::flatten_to_2d(col_shape, data_dim + 1); - - Tensor col; - col.mutable_data(col_shape, context.GetPlace()); - // col_matrix shares the same piece of data with col, - // but will be reshaped into a two-dimensional matrix shape - // to call the matrix multiplication interface. - Tensor col_matrix; - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - - // output size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first - // output size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last - DDim output_shape = - phi::slice_ddim(output->dims(), 1, output->dims().size()); - - // input matrix size: (i_c, h * w) or (i_c, d * h * w) for channel_first - // input matrix size: (h * w, i_c) or (d * h * w, i_c) for channel_last - DDim input_matrix_shape; - if (data_layout != framework::DataLayout::kNHWC) { - input_matrix_shape = {in_dims[1], col_matrix_shape[1]}; - } else { - input_matrix_shape = {col_matrix_shape[1], in_dims[in_dims.size() - 1]}; - } - - // filter size: (i_c, o_c/g * k_h * k_w) or (i_c, o_c/g * k_d * k_h * k_w) - DDim filter_matrix_shape; - if (data_layout != framework::DataLayout::kNHWC) { - filter_matrix_shape = {in_dims[1], col_matrix_shape[0]}; - } else { - filter_matrix_shape = {in_dims[in_dims.size() - 1], col_matrix_shape[0]}; - } - filter.Resize(filter_matrix_shape); - - output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - set_zero(dev_ctx, output, static_cast(0)); - - int in_step = - (data_layout != framework::DataLayout::kNHWC - ? static_cast(in_dims[1]) / groups - : static_cast(in_dims[in_dims.size() - 1]) / groups); - - int out_step = - (data_layout != framework::DataLayout::kNHWC - ? static_cast(out_dims[1]) / groups - : static_cast(out_dims[out_dims.size() - 1]) / groups); - math::Col2ImFunctor col2im; - math::Col2VolFunctor col2vol; - math::ConcatFunctor concat_functor; - - // convolution transpose: gemm + col2im or col2vol (similar to conv-backward - // on input) - size_t D = input->dims().size(); - for (int i = 0; i < batch_size; i++) { - // batch with size (i_c, h * w) or (i_c, d * h * w) for channel_first - // batch with size (h * w, i_c) or (d * h * w, i_c) for channel_last - Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape); - - // output size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first - // output size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last - Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape); - - std::vector output_batch_vec; - for (int g = 0; g < groups; g++) { - int64_t start = g * in_step; - int64_t end = (g + 1) * in_step; - int axes = (data_layout != framework::DataLayout::kNHWC ? 
0 : 1); - Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step); - Tensor in_slice, out_slice; - - // col_matrix = filter_slice * input_slice - // of shape (o_c/g * k_h * k_w, h * w) - // or (o_c/g * k_d * k_h * k_w, d * h * w) - if (data_layout != framework::DataLayout::kNHWC) { - in_slice = input_batch.Slice(g * in_step, (g + 1) * in_step); - out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(filter_slice, true, in_slice, false, static_cast(1.0), - &col_matrix, static_cast(0.0)); - } else { - Slice(context, &input_batch, &in_slice, start, - end, axes); - start = g * out_step; - end = (g + 1) * out_step; - axes = D - 2; - if (D == 4U) { - Slice(context, &output_batch, &out_slice, - start, end, axes); - } else if (D == 5U) { - Slice(context, &output_batch, &out_slice, - start, end, axes); - } - blas.MatMul(filter_slice, true, in_slice, true, static_cast(1.0), - &col_matrix, static_cast(0.0)); - } - - if (data_dim == 2U) { - // col2im: col_matrix -> dy - // from (o_c/g * k_h * k_w, h * w) to (o_c/g, o_h, o_w) or (o_h, o_w, - // o_c/g) - col2im(dev_ctx, col, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &out_slice, data_layout); - } else if (data_dim == 3U) { - // col2vol: col_matrix -> dy - // from (o_c/g * k_d * k_h * k_w, d * h * w) to (o_c/g, o_d, o_h, o_w) - // or (o_d, o_h, o_w, o_c/g) - col2vol(dev_ctx, col, dilations, strides, paddings, &out_slice, - data_layout); - } - if (data_layout == framework::DataLayout::kNHWC) { - output_batch_vec.push_back(out_slice); - } - } - if (data_layout == framework::DataLayout::kNHWC) { - concat_functor(dev_ctx, output_batch_vec, static_cast(D - 2), - &output_batch); - } - } - } -}; - -template -class GemmConvTransposeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - // For filter, we do not use const pointer b/c we will do reshape, - // but we should avoid modifying its value. 
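// The GEMM kernels being deleted here realise transposed convolution as
// "gemm + col2im": a matrix product writes a columns buffer, and col2im
// scatter-adds it into the output (the reverse of im2col). The same idea in
// 1-D as a self-contained sketch (illustrative only; no padding or groups):
#include <cstddef>
#include <vector>

std::vector<float> ConvTransposeGemm1D(const std::vector<float>& x,  // in_w
                                       const std::vector<float>& w,  // k
                                       int stride) {
  const int in_w = static_cast<int>(x.size());
  const int k = static_cast<int>(w.size());
  // "GEMM": col(j, i) = w[j] * x[i] -- with channels this is filter^T * input.
  std::vector<float> col(static_cast<size_t>(k) * in_w);
  for (int j = 0; j < k; ++j)
    for (int i = 0; i < in_w; ++i)
      col[static_cast<size_t>(j) * in_w + i] = w[j] * x[i];
  // col2im: scatter-add column entry (j, i) into out[i * stride + j].
  std::vector<float> out(static_cast<size_t>((in_w - 1) * stride + k), 0.f);
  for (int j = 0; j < k; ++j)
    for (int i = 0; i < in_w; ++i)
      out[i * stride + j] += col[static_cast<size_t>(j) * in_w + i];
  return out;
}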
- Tensor filter = *context.Input("Filter"); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - - if ((!input_grad) && (!filter_grad)) return; - - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - int groups = context.Attr("groups"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - auto out_grad_dims = output_grad->dims(); - const int batch_size = static_cast(input->dims()[0]); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - // input_shape_vec: {n, c, h, w} or {n, c, d, h, w} for channel_first - // input_shape_vec: {n, h, w, c} or {n, d, h, w, c} for channel_last - std::vector input_shape_vec = phi::vectorize(input->dims()); - // filter_shape_vec: {i_c, o_c, k_h, k_w} or {i_c, o_c, k_d, k_h, k_w} - std::vector filter_shape_vec = phi::vectorize(filter.dims()); - - // use col_shape in the im2col and col2im (or vol2col and col2vol) - // calculation - // col_shape_vec: {o_c, k_h, k_w, h, w} or {o_c, k_d, k_h, k_w, d, h, w} for - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - if (data_layout != framework::DataLayout::kNHWC) { - col_shape_vec[0] = out_grad_dims[1]; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2]; - } - } else { - col_shape_vec[0] = out_grad_dims[out_grad_dims.size() - 1]; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 1]; - } - } - DDim col_shape(phi::make_ddim(col_shape_vec)); - - // use col_matrix_shape in the gemm calculation - // size: (o_c * k_h * k_w, h * w) or (o_c * k_d * k_h * k_w, d * h * w) - DDim col_matrix_shape = phi::flatten_to_2d(col_shape, data_dim + 1); - - // output size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first - // output size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last - DDim output_shape = - phi::slice_ddim(output_grad->dims(), 1, output_grad->dims().size()); - - // input matrix size: (i_c, h * w) or (i_c, d * h * w) for channel_first - // input matrix size: (h * w, i_c) or (d * h * w, i_c) for channel_last - DDim input_matrix_shape; - if (data_layout != framework::DataLayout::kNHWC) { - input_matrix_shape = {in_dims[1], col_matrix_shape[1]}; - } else { - input_matrix_shape = {col_matrix_shape[1], in_dims[in_dims.size() - 1]}; - } - - // filter size: (i_c, o_c/g * k_h * k_w) or (i_c, o_c/g * k_d * k_h * k_w) - DDim filter_matrix_shape; - if (data_layout != framework::DataLayout::kNHWC) { - filter_matrix_shape = {in_dims[1], col_matrix_shape[0] / groups}; - } else { - filter_matrix_shape = {in_dims[in_dims.size() - 1], - col_matrix_shape[0] / groups}; - } - filter.Resize(filter_matrix_shape); - - int in_step = - (data_layout != framework::DataLayout::kNHWC 
- ? static_cast(in_dims[1]) / groups - : static_cast(in_dims[in_dims.size() - 1]) / groups); - int col_step = static_cast(col_matrix_shape[0]) / groups; - - // convolution transpose grad on input: - // im2col + gemm (similar to conv-forward) - // input need to compute gradient - auto& dev_ctx = context.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - if (input_grad || filter_grad) { - Tensor col; - col.mutable_data(col_shape, context.GetPlace()); - // col_matrix shares the same piece of data with col, - // but will be reshaped into a two-dimensional matrix shape - // to call the matrix multiplication interface. - Tensor col_matrix; - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - - Tensor filter_grad_; - phi::funcs::SetConstant set_zero; - - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; - math::ConcatFunctor concat_functor; - - if (input_grad) { - input_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, input_grad, static_cast(0)); - } - if (filter_grad) { // filter_grad_ size (i_c, o_c/g, k_h, k_w) - filter_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, filter_grad, static_cast(0)); - filter_grad_ = *filter_grad; - filter_grad_.Resize(filter_matrix_shape); - } - - size_t D = input->dims().size(); - for (int i = 0; i < batch_size; i++) { - // batch with size (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for - // channel_first - // batch with size (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for - // channel_last - Tensor output_grad_batch = - output_grad->Slice(i, i + 1).Resize(output_shape); - - if (data_dim == 2U) { - // im2col: dy -> col matrix - // from (o_c, o_h, o_w) to (o_c * k_h * k_w, i_h * i_w) for - // channel_first - // from (o_h, o_w, o_c) to (o_c * k_h * k_w, i_h * i_w) for - // channel_last - im2col(dev_ctx, output_grad_batch, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col, data_layout); - } else if (data_dim == 3U) { - // vol2col: dy -> col_matrix - // from (o_c, o_d, o_h, o_w) to (o_c * k_d * k_h * k_w, i_d * i_h * - // i_w) for channel_first - // from (o_d, o_h, o_w, o_c) to (i_d * i_h * i_w, o_c * k_d * k_h * - // k_w) for channel_last - vol2col(dev_ctx, output_grad_batch, dilations, strides, paddings, - &col, data_layout); - } - - if (input_grad) { - // batch with size (i_c, i_h, i_w) or (i_h, i_w, i_c) - Tensor input_grad_batch = - input_grad->Slice(i, i + 1).Resize(input_matrix_shape); - - // gemm: dx = filter * dy - // (i_c, o_c * k_h * k_w) * (o_c * k_h * k_w, i_h * i_w) -> (i_c, i_h - // * i_w) - // or - // (i_c, o_c * k_d * k_h * k_w) * (o_c * k_d * k_h * k_w, i_d * i_h * - // i_w) -> (i_c, - // i_d, i_h, i_w) - // gemm: dx = dy^T * filter^T for channel_last - - std::vector input_grad_batch_vec; - for (int g = 0; g < groups; g++) { - // input_grad_slice: (i_c/g, i_h * i_w) or (i_c/g, i_d * i_h * i_w) - // for channel_first - // input_grad_slice: (i_h * i_w, i_c/g) or (i_d * i_h * i_w, i_c/g) - // for channel_last - // filter_slice: (i_c/g, o_c/g * k_h * k_w) - Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step); - // col_matrix_slice: (o_c/g * k_h * k_w, h * w) or (o_c/g * k_d * - // k_h * k_w, d * h * w) - Tensor col_matrix_slice = - col_matrix.Slice(g * col_step, (g + 1) * col_step); - if (data_layout != framework::DataLayout::kNHWC) { - Tensor input_grad_slice = - input_grad_batch.Slice(g * in_step, (g + 1) * in_step); - blas.MatMul(filter_slice, false, col_matrix_slice, false, - static_cast(1.0), &input_grad_slice, - 
static_cast(0.0)); - } else { - Tensor input_grad_slice; - Slice(context, &input_grad_batch, - &input_grad_slice, g * in_step, - (g + 1) * in_step, 1); - blas.MatMul(col_matrix_slice, true, filter_slice, true, - static_cast(1.0), &input_grad_slice, - static_cast(0.0)); - DDim input_grad_slice_shape; - if (data_dim == 2U) { - input_grad_slice_shape = {in_dims[1], in_dims[2], in_step}; - } else { - input_grad_slice_shape = {in_dims[1], in_dims[2], in_dims[3], - in_step}; - } - input_grad_slice = - input_grad_slice.Resize(input_grad_slice_shape); - input_grad_batch_vec.push_back(input_grad_slice); - } - } - if (data_layout == framework::DataLayout::kNHWC) { - concat_functor(dev_ctx, input_grad_batch_vec, - static_cast(D - 2), &input_grad_batch); - } - } - if (filter_grad) { - // input batch: (i_c, i_h * i_w) or (i_h, i_w * i_c) - Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape); - // gemm: d_filter = x * dy^T - // (i_c, i_h * i_w) * (i_h * i_w, o_c * k_h * k_w) -> (i_c, o_c * k_h - // * k_w) - // or - // (i_c, i_d * i_h * i_w) * (i_d * i_h * i_w, o_c * k_d * k_h * k_w) - // -> (i_c, o_c * k_d * - // k_h * k_w) - // gemm: d_filter = x^T * dy^T for channel_last - - for (int g = 0; g < groups; g++) { - Tensor filter_grad_slice = - filter_grad_.Slice(g * in_step, (g + 1) * in_step); - Tensor col_matrix_slice = - col_matrix.Slice(g * col_step, (g + 1) * col_step); - if (data_layout != framework::DataLayout::kNHWC) { - Tensor in_batch_slice = - in_batch.Slice(g * in_step, (g + 1) * in_step); - blas.MatMul(in_batch_slice, false, col_matrix_slice, true, - static_cast(1.0), &filter_grad_slice, - static_cast(1.0)); - } else { - Tensor in_batch_slice; - Slice(context, &in_batch, &in_batch_slice, - g * in_step, (g + 1) * in_step, 1); - blas.MatMul(in_batch_slice, true, col_matrix_slice, true, - static_cast(1.0), &filter_grad_slice, - static_cast(1.0)); - } - } - } - } - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/conv_transpose_op_npu.cc b/paddle/fluid/operators/conv_transpose_op_npu.cc index 7d0ebf21829c2..050ede78f72cf 100644 --- a/paddle/fluid/operators/conv_transpose_op_npu.cc +++ b/paddle/fluid/operators/conv_transpose_op_npu.cc @@ -13,11 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/conv_transpose_op.h" + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/cpu/conv_util.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; using NPUDeviceContext = platform::NPUDeviceContext; template @@ -55,8 +59,8 @@ class Conv2DTransposeNPUKernel : public framework::OpKernel { filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&padding, &dilation, padding_algorithm, - in_data_dims, stride, ksize); + phi::UpdatePaddingAndDilation(&padding, &dilation, padding_algorithm, + in_data_dims, stride, ksize); // construct NPU attr std::vector strides(4, 1); @@ -137,8 +141,8 @@ class Conv2DTransposeGradNPUKernel : public framework::OpKernel { framework::DDim filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); + phi::UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); std::vector strides_vec(4, 1); std::vector dilations_vec(4, 1); diff --git a/paddle/fluid/operators/conv_transpose_op_xpu.cc b/paddle/fluid/operators/conv_transpose_op_xpu.cc index 12e1739f2a267..b8bd3c4f00608 100644 --- a/paddle/fluid/operators/conv_transpose_op_xpu.cc +++ b/paddle/fluid/operators/conv_transpose_op_xpu.cc @@ -8,15 +8,22 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
 */
+
 #include "paddle/fluid/operators/conv_transpose_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
 
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+
 #ifdef PADDLE_WITH_XPU
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+
 // target_len == 2 || target_len == 4
 inline std::vector<int> vector_extend(const std::vector<int>& src,
                                       int target_len) {
@@ -61,8 +68,8 @@ class Conv2DTransposeXPUKernel : public framework::OpKernel<T> {
     framework::DDim filter_data_dims =
         phi::slice_ddim(filter.dims(), 2, filter.dims().size());
     std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
+    phi::UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
+                                  in_data_dims, strides, ksize);
 
     const int batch_size = static_cast<int>(input->dims()[0]);
     const int img_yc = static_cast<int>(input->dims()[1]);
@@ -135,8 +142,8 @@ class Conv2DTransposeGradXPUKernel : public framework::OpKernel<T> {
     framework::DDim filter_data_dims =
         phi::slice_ddim(filter.dims(), 2, filter.dims().size());
     std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
+    phi::UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
+                                  in_data_dims, strides, ksize);
 
     const int batch_size = static_cast<int>(input->dims()[0]);
     const int img_yc = static_cast<int>(input->dims()[1]);
diff --git a/paddle/fluid/operators/cumprod_op.cc b/paddle/fluid/operators/cumprod_op.cc
index bff6673429d9a..889cdac8f6882 100644
--- a/paddle/fluid/operators/cumprod_op.cc
+++ b/paddle/fluid/operators/cumprod_op.cc
@@ -12,7 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
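Both the NPU and XPU kernels above now delegate padding normalization to phi::UpdatePaddingAndDilation. A self-contained sketch of what such a helper does for the "SAME" algorithm; the real phi version also handles "VALID", explicit pads, and dilation updates, so the details below are assumptions, not phi code:

// Hedged sketch: resolve per-dimension "SAME" padding so out = ceil(in / s).
#include <algorithm>
#include <vector>

void UpdateSamePadding(std::vector<int>* paddings, const std::vector<int>& in,
                       const std::vector<int>& strides,
                       const std::vector<int>& ksize) {
  paddings->assign(2 * in.size(), 0);
  for (size_t d = 0; d < in.size(); ++d) {
    int out = (in[d] + strides[d] - 1) / strides[d];  // ceil division
    int pad = std::max((out - 1) * strides[d] + ksize[d] - in[d], 0);
    (*paddings)[2 * d] = pad / 2;            // pad_before
    (*paddings)[2 * d + 1] = pad - pad / 2;  // pad_after gets the remainder
  }
}

For example, in = 5, stride = 2, ksize = 3 yields out = 3 and pads (1, 1), which matches the usual conv size formula floor((5 + 2 - 3) / 2) + 1 = 3.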
-#include "paddle/fluid/operators/cumprod_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,14 +23,6 @@ namespace operators { class CumprodOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Cumprod"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Cumprod"); - - ctx->ShareDim("X", "Out"); - ctx->ShareLoD("X", "Out"); - } }; class CumprodOpMaker : public framework::OpProtoAndCheckerMaker { @@ -81,22 +76,12 @@ class CumprodGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(cumprod, CumprodInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(cumprod, ops::CumprodOp, ops::CumprodOpMaker, ops::CumprodGradOpMaker, - ops::CumprodGradOpMaker); + ops::CumprodGradOpMaker, + CumprodInferShapeFunctor); REGISTER_OPERATOR(cumprod_grad, ops::CumprodGradOp); - -REGISTER_OP_CPU_KERNEL( - cumprod, ops::CumprodOpCPUKernel, ops::CumprodOpCPUKernel, - ops::CumprodOpCPUKernel, ops::CumprodOpCPUKernel, - ops::CumprodOpCPUKernel>, - ops::CumprodOpCPUKernel>); - -REGISTER_OP_CPU_KERNEL( - cumprod_grad, ops::CumprodGradOpCPUKernel, - ops::CumprodGradOpCPUKernel, ops::CumprodGradOpCPUKernel, - ops::CumprodGradOpCPUKernel, - ops::CumprodGradOpCPUKernel>, - ops::CumprodGradOpCPUKernel>); diff --git a/paddle/fluid/operators/cumprod_op.cu b/paddle/fluid/operators/cumprod_op.cu deleted file mode 100644 index f792d6832917f..0000000000000 --- a/paddle/fluid/operators/cumprod_op.cu +++ /dev/null @@ -1,369 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include "paddle/fluid/operators/cumprod_op.h" -#include "paddle/fluid/operators/math/inclusive_scan.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { - -template -struct MultiplyFunctor { - HOSTDEVICE T operator()(T a, T b) const { return a * b; } -}; - -template -class CumprodOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - auto *y = ctx.Output("Out"); - auto dim = ctx.Attr("dim"); - size_t outer_dim, mid_dim, inner_dim; - GetCumprodDimInfo(x->dims(), dim, &outer_dim, &mid_dim, &inner_dim); - - const auto *x_data = x->data(); - auto *y_data = y->mutable_data(ctx.GetPlace()); - const auto &dev_ctx = - ctx.template device_context(); - math::InclusiveScan>( - x_data, y_data, outer_dim, mid_dim, inner_dim, static_cast(1), - MultiplyFunctor(), /*reverse=*/false, dev_ctx); - } -}; - -template -struct IsZeroFunctor { - HOSTDEVICE bool operator()(T x) const { return x == static_cast(0); } -}; - -template -struct CumprodGradFunctorExceptFirstZero { - HOSTDEVICE CumprodGradFunctorExceptFirstZero( - const T *x, const T *y, const T *dy_mul_y_reversed_cumsum, - const uint8_t *zero_mask, size_t mid_dim, size_t inner_dim, T *dx, - int64_t *first_zero_idx, T *x_filled_one) - : x_(x), - y_(y), - dy_mul_y_reversed_cumsum_(dy_mul_y_reversed_cumsum), - zero_mask_(zero_mask), - mid_dim_(mid_dim), - inner_dim_(inner_dim), - dx_(dx), - first_zero_idx_(first_zero_idx), - x_filled_one_(x_filled_one) {} - - HOSTDEVICE void operator()(size_t idx) const { - auto inner_idx = idx % inner_dim_; - auto outer_idx = idx / (mid_dim_ * inner_dim_); - auto mid_idx = (idx - inner_idx) / inner_dim_ % mid_dim_; - auto mask = zero_mask_[idx]; - bool should_fill_one = true; - - if (mask == 0) { - dx_[idx] = dy_mul_y_reversed_cumsum_[idx] / x_[idx]; - if (mid_idx == mid_dim_ - 1) { - // record first zero position as -1, i.e., no zero - first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = -1; - } - } else if (mid_idx > 0) { // mask > 0 - if (zero_mask_[idx - inner_dim_] > 0) { // not first zero - dx_[idx] = 0; - should_fill_one = false; - } else { - // idx is the first zero position, it should be recorded - dx_[idx] = y_[idx - inner_dim_]; - first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = mid_idx; - } - } else { // the first zero position is index 0 - dx_[idx] = 1; - first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = 0; - } - - x_filled_one_[idx] = should_fill_one ? 
1 : x_[idx]; - } - - private: - const T *x_; - const T *y_; - const T *dy_mul_y_reversed_cumsum_; - const uint8_t *zero_mask_; - size_t mid_dim_; - size_t inner_dim_; - T *dx_; - int64_t *first_zero_idx_; - T *x_filled_one_; -}; - -template -struct FillFirstZeroPositionGradFunctor { - HOSTDEVICE FillFirstZeroPositionGradFunctor(const int64_t *first_zero_idx, - const T *grad_value, - size_t mid_dim, size_t inner_dim, - T *dx) - : first_zero_idx_(first_zero_idx), - grad_value_(grad_value), - mid_dim_(mid_dim), - inner_dim_(inner_dim), - dx_(dx) {} - - HOSTDEVICE void operator()(size_t idx) const { - auto outer_idx = idx / inner_dim_; - auto inner_idx = idx % inner_dim_; - auto mid_idx = first_zero_idx_[idx]; - if (mid_idx >= 0) { - auto full_idx = - outer_idx * mid_dim_ * inner_dim_ + mid_idx * inner_dim_ + inner_idx; - dx_[full_idx] *= grad_value_[full_idx]; - } - } - - private: - const int64_t *first_zero_idx_; - const T *grad_value_; - size_t mid_dim_; - size_t inner_dim_; - T *dx_; -}; - -/* -Reference to -https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/ReduceOps.cpp -input: x, y, dL/dy -output: dL/dx -dL/dx[i] = sum{0<=j k, dL/dx[i] = 0; -i < k, dL/dx[i] = 1/x[i]*sum{i<=j k - dx[i] = 0; - x_filled_one[i] = x[i]; - } - } - } -} -T = reversed_cumsum(dy[j]*cumprod(x_filled_one[j])); -if (zero_index != -1) { - dx[zero_index] *= T[zero_index]; -} -*/ - -template -class CumprodGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - const auto *y = ctx.Input("Out"); - const auto *dy = - ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - auto dim = ctx.Attr("dim"); - - size_t outer_dim, mid_dim, inner_dim; - GetCumprodDimInfo(x->dims(), dim, &outer_dim, &mid_dim, &inner_dim); - if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return; - - size_t numel = outer_dim * mid_dim * inner_dim; - - const auto *x_data = x->data(); - const auto *y_data = y->data(); - const auto *dy_data = dy->data(); - - auto place = ctx.GetPlace(); - const auto &dev_ctx = - ctx.template device_context(); - auto *dx_data = dx->mutable_data(place); - - // deal with complex - const T *x_data_deal; - const T *y_data_deal; - memory::AllocationPtr x_conj; - memory::AllocationPtr y_conj; - if (framework::IsComplex::value) { - x_conj = memory::Alloc(place, numel * sizeof(T)); - auto *x_data_conj = reinterpret_cast(x_conj->ptr()); - y_conj = memory::Alloc(place, numel * sizeof(T)); - auto *y_data_conj = reinterpret_cast(y_conj->ptr()); - - platform::ForRange for_range_x(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_x(x_data, numel, x_data_conj); - for_range_x(functor_x); - - platform::ForRange for_range_y(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_y(y_data, numel, y_data_conj); - for_range_y(functor_y); - x_data_deal = x_data_conj; - y_data_deal = y_data_conj; - } else { - x_data_deal = x_data; - y_data_deal = y_data; - } - -// Step 1: find cummax-ed zero mask of x -#ifdef PADDLE_WITH_CUDA - const auto &exec_policy = thrust::cuda::par.on(dev_ctx.stream()); -#else - const auto &exec_policy = thrust::hip::par.on(dev_ctx.stream()); -#endif - auto zero_mask_without_cummax = - memory::Alloc(place, numel * sizeof(uint8_t)); - auto *zero_mask_without_cummax_data = - reinterpret_cast(zero_mask_without_cummax->ptr()); - thrust::transform( - exec_policy, thrust::device_pointer_cast(x_data_deal), - thrust::device_pointer_cast(x_data_deal) + 
numel, - thrust::device_pointer_cast(zero_mask_without_cummax_data), - IsZeroFunctor()); - - auto zero_mask = memory::Alloc(place, numel * sizeof(uint8_t)); - auto *zero_mask_data = reinterpret_cast(zero_mask->ptr()); - math::InclusiveScan( - zero_mask_without_cummax_data, zero_mask_data, outer_dim, mid_dim, - inner_dim, static_cast(0), cub::Max(), /*reverse=*/false, - dev_ctx); - zero_mask_without_cummax = nullptr; - - // Step 2: calculate reversed cumsum(dy * y) - auto dy_mul_y = memory::Alloc(place, numel * sizeof(T)); - auto *dy_mul_y_data = reinterpret_cast(dy_mul_y->ptr()); - thrust::transform(exec_policy, thrust::device_pointer_cast(dy_data), - thrust::device_pointer_cast(dy_data) + numel, - thrust::device_pointer_cast(y_data_deal), - thrust::device_pointer_cast(dy_mul_y_data), - MultiplyFunctor()); - - auto dy_mul_y_reversed_cumsum = memory::Alloc(place, numel * sizeof(T)); - auto *dy_mul_y_reversed_cumsum_data = - reinterpret_cast(dy_mul_y_reversed_cumsum->ptr()); - math::InclusiveScan( - dy_mul_y_data, dy_mul_y_reversed_cumsum_data, outer_dim, mid_dim, - inner_dim, static_cast(0), cub::Sum(), /*reverse=*/true, dev_ctx); - - // Step 3: calculate the gradient value except the first zero position. - // The gradient value of the first zero position is filled with out[idx-1], - // while the gradient value of the other positions are calculated out - // completely. This functor also: - // (1) find the first zero index, i.e., first_zero_idx_data. - // (2) fill x_filled_one, which satifies - // x_filled_one[i] = x[i], i > pos - // x_filled_one[i] = 1, i <= pos - auto first_zero_idx = - memory::Alloc(place, outer_dim * inner_dim * sizeof(int64_t)); - auto *first_zero_idx_data = - reinterpret_cast(first_zero_idx->ptr()); - auto *x_filled_one_data = dy_mul_y_data; // reuse former allocated memory - platform::ForRange for_range(dev_ctx, numel); - CumprodGradFunctorExceptFirstZero functor_except_first_zero( - x_data_deal, y_data_deal, dy_mul_y_reversed_cumsum_data, zero_mask_data, - mid_dim, inner_dim, dx_data, first_zero_idx_data, x_filled_one_data); - for_range(functor_except_first_zero); - - // Step 4: calculate cumprod of x_filled_one - auto *x_filled_one_cumprod_data = - dy_mul_y_reversed_cumsum_data; // reuse former allocated memory - math::InclusiveScan>( - x_filled_one_data, x_filled_one_cumprod_data, outer_dim, mid_dim, - inner_dim, static_cast(1), MultiplyFunctor(), /*reverse=*/false, - dev_ctx); - - // Step 5: calculate reversed cumsum(dy * x_filled_one_cumprod) - auto *dy_mul_x_filled_one_cumprod = - dy_mul_y_data; // reuse former allocated memory - thrust::transform(exec_policy, thrust::device_pointer_cast(dy_data), - thrust::device_pointer_cast(dy_data) + numel, - thrust::device_pointer_cast(x_filled_one_cumprod_data), - thrust::device_pointer_cast(dy_mul_x_filled_one_cumprod), - MultiplyFunctor()); - auto *dy_mul_x_filled_one_cumprod_reversed_cumsum = - dy_mul_y_reversed_cumsum_data; // reuse former allocated memory - math::InclusiveScan( - dy_mul_x_filled_one_cumprod, - dy_mul_x_filled_one_cumprod_reversed_cumsum, outer_dim, mid_dim, - inner_dim, static_cast(0), cub::Sum(), - /*reverse=*/true, dev_ctx); - - // Step 6: fill zero pos gradient value - platform::ForRange - for_range_fill_zero_pos_grad(dev_ctx, outer_dim * inner_dim); - FillFirstZeroPositionGradFunctor fill_first_zero_pos_grad_functor( - first_zero_idx_data, dy_mul_x_filled_one_cumprod_reversed_cumsum, - mid_dim, inner_dim, dx_data); - for_range_fill_zero_pos_grad(fill_first_zero_pos_grad_functor); - } -}; - -} // 
namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - cumprod, ops::CumprodOpCUDAKernel, ops::CumprodOpCUDAKernel, - ops::CumprodOpCUDAKernel, ops::CumprodOpCUDAKernel, - ops::CumprodOpCUDAKernel>, - ops::CumprodOpCUDAKernel>); - -REGISTER_OP_CUDA_KERNEL( - cumprod_grad, ops::CumprodGradOpCUDAKernel, - ops::CumprodGradOpCUDAKernel, ops::CumprodGradOpCUDAKernel, - ops::CumprodGradOpCUDAKernel, - ops::CumprodGradOpCUDAKernel>, - ops::CumprodGradOpCUDAKernel>); diff --git a/paddle/fluid/operators/cumprod_op.h b/paddle/fluid/operators/cumprod_op.h deleted file mode 100644 index 74ed2008ae983..0000000000000 --- a/paddle/fluid/operators/cumprod_op.h +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; - -static void GetCumprodDimInfo(const framework::DDim& dim, int cumprod_dim, - size_t* outer_dim, size_t* mid_dim, - size_t* inner_dim) { - PADDLE_ENFORCE_GE( - cumprod_dim, -dim.size(), - platform::errors::InvalidArgument( - "The input dim of CumprodOp should be larger than the opposite " - "rank of input x which is %d.But received dim=%d", - -dim.size(), cumprod_dim)); - PADDLE_ENFORCE_LT(cumprod_dim, dim.size(), - platform::errors::InvalidArgument( - "The input dim of CumprodOp should be smaller than the " - "rank of input x which is %d.But received dim=%d", - dim.size(), cumprod_dim)); - if (cumprod_dim < 0) cumprod_dim += dim.size(); - - *outer_dim = 1; - for (int i = 0; i < cumprod_dim; ++i) { - *outer_dim *= dim[i]; - } - *mid_dim = dim[cumprod_dim]; - *inner_dim = 1; - for (int i = cumprod_dim + 1; i < dim.size(); ++i) { - *inner_dim *= dim[i]; - } -} - -template -class CumprodOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - int dim = context.Attr("dim"); - - auto* x_data = x->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - framework::DDim shape = x->dims(); - - size_t outer_dim = 1; - size_t mid_dim = 1; - size_t inner_dim = 1; - GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim); - - for (size_t i = 0; i < outer_dim; i++) { - for (size_t j = 0; j < mid_dim; j++) { - for (size_t k = 0; k < inner_dim; k++) { - size_t pos = i * mid_dim * inner_dim + j * inner_dim + k; - if (j == 0) { - out_data[pos] = x_data[pos]; - } else { - out_data[pos] = out_data[pos - inner_dim] * x_data[pos]; - } - } - } - } - } -}; - -template -class CumprodGradOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& 
context) const { - const Tensor* d_out = context.Input(framework::GradVarName("Out")); - const Tensor* x = context.Input("X"); - const Tensor* out = context.Input("Out"); - - int dim = context.Attr("dim"); - framework::DDim shape = x->dims(); - Tensor* d_x = context.Output(framework::GradVarName("X")); - - auto* d_out_data = d_out->data(); - auto* x_data = x->data(); - auto* out_data = out->data(); - auto* d_x_data = d_x->mutable_data(context.GetPlace()); - - auto place = context.GetPlace(); - const auto& dev_ctx = - context.template device_context(); - - size_t outer_dim = 1; - size_t mid_dim = 1; - size_t inner_dim = 1; - GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim); - size_t numel = outer_dim * mid_dim * inner_dim; - - // deal with complex - const T* x_data_deal; - const T* out_data_deal; - memory::AllocationPtr x_conj; - memory::AllocationPtr out_conj; - if (framework::IsComplex::value) { - x_conj = memory::Alloc(place, numel * sizeof(T)); - auto* x_data_conj = reinterpret_cast(x_conj->ptr()); - out_conj = memory::Alloc(place, numel * sizeof(T)); - auto* out_data_conj = reinterpret_cast(out_conj->ptr()); - - platform::ForRange for_range_x(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_x(x_data, numel, x_data_conj); - for_range_x(functor_x); - - platform::ForRange for_range_out(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_out(out_data, numel, out_data_conj); - for_range_out(functor_out); - - x_data_deal = x_data_conj; - out_data_deal = out_data_conj; - } else { - x_data_deal = x_data; - out_data_deal = out_data; - } - - for (size_t i = 0; i < outer_dim; i++) { - for (size_t k = 0; k < inner_dim; k++) { - for (size_t j = 0; j < mid_dim; j++) { - size_t index = i * mid_dim * inner_dim + j * inner_dim + k; - d_x_data[index] = 0; - for (size_t n = 0; n < mid_dim; n++) { - size_t pos = i * mid_dim * inner_dim + n * inner_dim + k; - T elem; - if (j == 0) { - elem = d_out_data[pos]; - } else { - elem = d_out_data[pos] * out_data_deal[index - inner_dim]; - } - if (pos > index) { - for (size_t m = index + inner_dim; m <= pos; m += inner_dim) { - elem *= x_data_deal[m]; - } - } else if (pos < index) { - elem = static_cast(0); - } - d_x_data[index] += elem; - } - } - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/deformable_conv_op.cc b/paddle/fluid/operators/deformable_conv_op.cc index b15efc5f84bdd..6e15fd090b8c4 100644 --- a/paddle/fluid/operators/deformable_conv_op.cc +++ b/paddle/fluid/operators/deformable_conv_op.cc @@ -338,8 +338,6 @@ REGISTER_OPERATOR(deformable_conv, ops::DeformableConvOp, REGISTER_OPERATOR(deformable_conv_grad, ops::DeformableConvGradOp); -REGISTER_OP_CPU_KERNEL(deformable_conv, ops::DeformableConvCPUKernel, - ops::DeformableConvCPUKernel); REGISTER_OP_CPU_KERNEL(deformable_conv_grad, ops::DeformableConvGradCPUKernel, ops::DeformableConvGradCPUKernel); diff --git a/paddle/fluid/operators/deformable_conv_op.cu b/paddle/fluid/operators/deformable_conv_op.cu index 2c7d905c79b37..ad10abf9c647b 100644 --- a/paddle/fluid/operators/deformable_conv_op.cu +++ b/paddle/fluid/operators/deformable_conv_op.cu @@ -446,108 +446,6 @@ __global__ void FilterGradAddupGpuKernel(const int nthreads, const int n, } } -template -class DeformableConvCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("Input"); - const Tensor offset = *ctx.Input("Offset"); - const Tensor mask = *ctx.Input("Mask"); - Tensor 
filter = *ctx.Input("Filter"); - Tensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.cuda_device_context(); - - const int groups = ctx.Attr("groups"); - const int deformable_groups = ctx.Attr("deformable_groups"); - const int im2col_step = ctx.Attr("im2col_step"); - const std::vector strides = ctx.Attr>("strides"); - const std::vector paddings = ctx.Attr>("paddings"); - const std::vector dilations = ctx.Attr>("dilations"); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - std::vector output_shape_vec(phi::vectorize(output->dims())); - - // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w} - std::vector col_buffer_shape_vec(filter_shape_vec.size()); - col_buffer_shape_vec[0] = - input->dims()[1] * filter.dims()[2] * filter.dims()[3]; - col_buffer_shape_vec[1] = im2col_step; - for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { - col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec)); - std::vector output_buffer_shape_vec(1); - output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] * - output_shape_vec[2] * output_shape_vec[3]; - framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec)); - Tensor col_buffer; - Tensor output_buffer; - col_buffer = ctx.AllocateTmpTensor(col_shape, dev_ctx); - output_buffer = - ctx.AllocateTmpTensor(output_shape, dev_ctx); - - int64_t M = output_shape_vec[1] / groups; - int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; - int64_t K = - input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups; - - Tensor weight_3d; - weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K})); - Tensor col_buffer_3d; - col_buffer_3d.ShareDataWith(col_buffer) - .Resize(phi::make_ddim({groups, K, N})); - Tensor output_4d; - output_4d.ShareDataWith(output_buffer) - .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N})); - output_4d.mutable_data(ctx.GetPlace()); - framework::DDim input_shape = - phi::slice_ddim(input->dims(), 1, input->dims().size()); - std::vector input_shape_vec = phi::vectorize(input_shape); - - int input_dim = input->numel() / input->dims()[0]; - int input_offset_dim = offset.numel() / offset.dims()[0]; - int input_mask_dim = mask.numel() / mask.dims()[0]; - - auto blas = phi::funcs::GetBlas(dev_ctx); - - const T* input_ptr = input->data(); - const T* offset_ptr = offset.data(); - const T* mask_ptr = mask.data(); - col_buffer.mutable_data(ctx.GetPlace()); - T* col_buffer_ptr = col_buffer.data(); - - for (int i = 0; i < batch_size / im2col_step; ++i) { - ModulatedDeformableIm2col( - ctx.device_context(), input_ptr + i * im2col_step * input_dim, - offset_ptr + i * im2col_step * input_offset_dim, - mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec, - col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations, - deformable_groups, col_buffer_ptr); - - Tensor output_3d = output_4d.Slice(i, i + 1).Resize( - phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size())); - for (int g = 0; g < groups; ++g) { - Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size())); - Tensor col_buffer_3d_slice = - col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim( - col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); - Tensor output_3d_slice = output_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(output_3d.dims(), 1, 
output_3d.dims().size())); - - blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0), - &output_3d_slice, T(0.0)); - } - } - output->ShareDataWith(output_buffer) - .Resize(phi::make_ddim(output_shape_vec)); - } -}; - template class DeformableConvGradCUDAKernel : public framework::OpKernel { public: @@ -740,9 +638,6 @@ class DeformableConvGradCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(deformable_conv, - ops::DeformableConvCUDAKernel, - ops::DeformableConvCUDAKernel); REGISTER_OP_CUDA_KERNEL(deformable_conv_grad, ops::DeformableConvGradCUDAKernel, ops::DeformableConvGradCUDAKernel); diff --git a/paddle/fluid/operators/deformable_conv_op.h b/paddle/fluid/operators/deformable_conv_op.h index 66961655ee6ff..1176b96987ed6 100644 --- a/paddle/fluid/operators/deformable_conv_op.h +++ b/paddle/fluid/operators/deformable_conv_op.h @@ -318,102 +318,6 @@ void FilterGradAddupCPUKernel(const int nthreads, const int n, const int height, } } -template -class DeformableConvCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* offset = ctx.Input("Offset"); - auto* mask = ctx.Input("Mask"); - Tensor filter = *ctx.Input("Filter"); - Tensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - - const int groups = ctx.Attr("groups"); - const int deformable_groups = ctx.Attr("deformable_groups"); - const int im2col_step = ctx.Attr("im2col_step"); - const std::vector strides = ctx.Attr>("strides"); - const std::vector paddings = ctx.Attr>("paddings"); - const std::vector dilations = ctx.Attr>("dilations"); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - std::vector output_shape_vec(phi::vectorize(output->dims())); - - // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w} - std::vector col_buffer_shape_vec(filter_shape_vec.size()); - col_buffer_shape_vec[0] = - input->dims()[1] * filter.dims()[2] * filter.dims()[3]; - col_buffer_shape_vec[1] = im2col_step; - for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { - col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec)); - std::vector output_buffer_shape_vec(1); - output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] * - output_shape_vec[2] * output_shape_vec[3]; - framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec)); - Tensor col_buffer; - Tensor output_buffer; - col_buffer = ctx.AllocateTmpTensor(col_shape, dev_ctx); - output_buffer = - ctx.AllocateTmpTensor(output_shape, dev_ctx); - int64_t M = output_shape_vec[1] / groups; - int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; - int64_t K = - input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups; - - Tensor weight_3d; - weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K})); - Tensor col_buffer_3d; - col_buffer_3d.ShareDataWith(col_buffer) - .Resize(phi::make_ddim({groups, K, N})); - Tensor output_4d; - output_4d.ShareDataWith(output_buffer) - .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N})); - output_4d.mutable_data(ctx.GetPlace()); - framework::DDim input_shape = - phi::slice_ddim(input->dims(), 1, input->dims().size()); - std::vector input_shape_vec = 
phi::vectorize(input_shape); - int input_dim = input->numel() / input->dims()[0]; - int input_offset_dim = offset->numel() / offset->dims()[0]; - int input_mask_dim = mask->numel() / mask->dims()[0]; - auto blas = phi::funcs::GetBlas(dev_ctx); - const T* input_ptr = input->data(); - const T* offset_ptr = offset->data(); - const T* mask_ptr = mask->data(); - col_buffer.mutable_data(ctx.GetPlace()); - T* col_buffer_ptr = col_buffer.data(); - for (int i = 0; i < batch_size / im2col_step; ++i) { - ModulatedDeformableIm2colCPU( - dev_ctx, input_ptr + i * im2col_step * input_dim, - offset_ptr + i * im2col_step * input_offset_dim, - mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec, - col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations, - deformable_groups, col_buffer_ptr); - Tensor output_3d = output_4d.Slice(i, i + 1).Resize( - phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size())); - // get the product of pixel and weight - for (int g = 0; g < groups; ++g) { - Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size())); - Tensor col_buffer_3d_slice = - col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim( - col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); - Tensor output_3d_slice = output_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size())); - blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0), - &output_3d_slice, T(0.0)); - } - } - output->ShareDataWith(output_buffer) - .Resize(phi::make_ddim(output_shape_vec)); - } -}; - template class DeformableConvGradCPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index 0d9fbf612f73c..35e389090175f 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -9,8 +9,10 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -235,10 +237,13 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(yolo_box, YoloBoxInferShapeFunctor, + PD_INFER_META(phi::YoloBoxInferMeta)); REGISTER_OPERATOR( yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + YoloBoxInferShapeFunctor); REGISTER_OP_VERSION(yolo_box) .AddCheckpoint( diff --git a/paddle/fluid/operators/determinant_op.cc b/paddle/fluid/operators/determinant_op.cc index 98247fbc862bb..6959b5cf81106 100644 --- a/paddle/fluid/operators/determinant_op.cc +++ b/paddle/fluid/operators/determinant_op.cc @@ -13,6 +13,10 @@ // limitations under the License. 
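The deformable-conv kernels deleted above stage data as weight_3d {groups, M, K}, col_buffer_3d {groups, K, N}, and an output view {batch_size / im2col_step, groups, M, N}, then run one MatMul per group. A standalone shape check with illustrative numbers (not taken from this PR):

#include <cassert>
#include <cstdint>

int main() {
  const int64_t i_c = 16, o_c = 32, k_h = 3, k_w = 3;
  const int64_t groups = 4, im2col_step = 2, o_h = 8, o_w = 8;
  const int64_t M = o_c / groups;              // output channels per group
  const int64_t N = im2col_step * o_h * o_w;   // spatial positions per step
  const int64_t K = i_c * k_h * k_w / groups;  // unfolded input patch size
  // weight_3d {groups, M, K} x col_buffer_3d {groups, K, N}
  //   -> one {M, N} output slice per group, as in the per-group MatMul loop.
  assert(M * groups == o_c);
  assert(M * K * groups == o_c * i_c * k_h * k_w / groups);
  (void)N;
  return 0;
}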
#include "paddle/fluid/operators/determinant_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,11 +24,6 @@ namespace operators { class DeterminantOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "determinant"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "determinant"); - } }; class DeterminantOpMaker : public framework::OpProtoAndCheckerMaker { @@ -44,19 +43,6 @@ class DeterminantGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", - "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", - framework::GradVarName("Out"), "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Input")), "Output", - framework::GradVarName("Input"), "DeterminantGradOp"); - - ctx->SetOutputDim(framework::GradVarName("Input"), - ctx->GetInputDim("Input")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -162,19 +148,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(SlogDeterminantGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(determinant, DeterminantInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(determinant, ops::DeterminantOp, ops::DeterminantOpMaker, ops::DeterminantGradOpMaker, - ops::DeterminantGradOpMaker); - -REGISTER_OPERATOR(determinant_grad, ops::DeterminantGradOp) + ops::DeterminantGradOpMaker, + DeterminantInferShapeFunctor); -REGISTER_OP_CPU_KERNEL(determinant, - ops::DeterminantKernel, - ops::DeterminantKernel); - -REGISTER_OP_CPU_KERNEL( - determinant_grad, ops::DeterminantGradKernel, - ops::DeterminantGradKernel); +DECLARE_INFER_SHAPE_FUNCTOR(determinant_grad, DeterminantGradInferShapeFunctor, + PD_INFER_META(phi::GeneralUnaryGradInferMeta)); +REGISTER_OPERATOR(determinant_grad, ops::DeterminantGradOp, + DeterminantGradInferShapeFunctor); REGISTER_OPERATOR(slogdeterminant, ops::SlogDeterminantOp, ops::SlogDeterminantOpMaker, diff --git a/paddle/fluid/operators/determinant_op.cu b/paddle/fluid/operators/determinant_op.cu index d19d4c3d09386..d8237fa3004e6 100644 --- a/paddle/fluid/operators/determinant_op.cu +++ b/paddle/fluid/operators/determinant_op.cu @@ -17,14 +17,6 @@ limitations under the License. 
*/ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - determinant, ops::DeterminantKernel, - ops::DeterminantKernel); - -REGISTER_OP_CUDA_KERNEL( - determinant_grad, - ops::DeterminantGradKernel, - ops::DeterminantGradKernel); REGISTER_OP_CUDA_KERNEL( slogdeterminant, ops::SlogDeterminantKernel, diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index f89ecd3722287..a1fe8a25665ec 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -22,12 +22,15 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/impl/determinant_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/determinant_kernel_impl.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" @@ -40,232 +43,6 @@ T sign(T val) { return static_cast(T(0) < val) - (val < T(0)); } -template -class EigenMatrix {}; - -template <> -class EigenMatrix { - public: - using MatrixType = Eigen::MatrixXf; -}; - -template <> -class EigenMatrix { - public: - using MatrixType = Eigen::MatrixXd; -}; - -inline int64_t GetBatchCount(const framework::DDim dims) { - int64_t batch_count = 1; - auto dim_size = dims.size(); - PADDLE_ENFORCE_GE( - dim_size, 2, - platform::errors::InvalidArgument( - "the input matrix dimension size should greater than 2.")); - - // Cumulative multiplying each dimension until the last 2 to get the batch - // count, - // for example a tensor with shape [3,3,3,3], the batch count of matrices is - // 9. 
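// [illustrative note, not in the original source] For dims = {5, 2, 4, 4}
// the loop below computes batch_count = 5 * 2 = 10, i.e. ten 4x4 matrices;
// the trailing two dims are always treated as the matrix itself.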
- for (int64_t i = 0; i < dims.size() - 2; i++) { - batch_count *= dims[i]; - } - - return batch_count; -} - -template -struct DeterminantFunctor { - void operator()(const Tensor& input, const framework::ExecutionContext ctx, - int64_t rank, int64_t batch_count, Tensor* output) { - std::vector input_vec; - std::vector output_vec; - framework::TensorToVector(input, ctx.device_context(), &input_vec); - for (int64_t i = 0; i < batch_count; ++i) { // maybe can be parallel - auto begin_iter = input_vec.begin() + i * rank * rank; - auto end_iter = input_vec.begin() + (i + 1) * rank * rank; - std::vector sub_vec(begin_iter, - end_iter); // get every square matrix data - typename EigenMatrix::MatrixType matrix(rank, rank); - for (int64_t i = 0; i < rank; ++i) { - for (int64_t j = 0; j < rank; ++j) { - matrix(i, j) = sub_vec[rank * i + j]; - } - } - output_vec.push_back(matrix.determinant()); - } - framework::TensorFromVector(output_vec, output); - } -}; -template -class DeterminantKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("Input"); - auto input_dim = vectorize(input->dims()); - auto input_dim_size = input_dim.size(); - auto* output = context.Output("Out"); - - auto batch_count = GetBatchCount(input->dims()); - VLOG(2) << "input dim:" << input->dims(); - PADDLE_ENFORCE_GE( - input_dim_size, 2, - platform::errors::InvalidArgument( - "the input matrix dimension size should greater than 2.")); - PADDLE_ENFORCE_EQ(input_dim[input_dim_size - 1], - input_dim[input_dim_size - 2], - platform::errors::InvalidArgument( - "the input matrix should be square matrix.")); - auto rank = input_dim[input_dim_size - 1]; // square matrix length - DeterminantFunctor()(*input, context, rank, batch_count, output); - auto output_dims = phi::slice_ddim(input->dims(), 0, input_dim_size - 2); - if (input_dim_size > 2) { - output->Resize(output_dims); - } else { - // when input is a two-dimension matrix, The det value is a number. 
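// [illustrative note] e.g. an input of shape {3, 4, 4} gives Out of shape
// {3} (one determinant per 4x4 slice), while a plain {4, 4} input is stored
// as a one-element tensor of shape {1} by the Resize below.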
- output->Resize({1}); - } - VLOG(2) << "output dim:" << output->dims(); - } -}; - -template -struct FoundZeroFunctor { - FoundZeroFunctor(const T* x, int64_t numel, bool* res) - : x_(x), numel_(numel), res_(res) {} - HOSTDEVICE void operator()(size_t idx) const { - if (*res_ || idx >= static_cast(numel_)) { - // founded zero number - return; - } - *res_ = (x_[idx] == static_cast(0)); - } - const T* x_; - int64_t numel_; - bool* res_; -}; - -template -inline bool CheckMatrixInvertible(const framework::ExecutionContext& ctx, - const framework::Tensor* det) { - auto& dev_ctx = ctx.template device_context(); - auto numel = det->numel(); - - framework::Tensor dev_tensor; - auto* data = dev_tensor.mutable_data({1}, ctx.GetPlace()); - - // set false - phi::funcs::SetConstant zero; - zero(dev_ctx, &dev_tensor, false); - - // find whether zero - platform::ForRange for_range(dev_ctx, numel); - FoundZeroFunctor functor(det->data(), numel, data); - for_range(functor); - - // copy to host - dev_ctx.Wait(); - framework::Tensor cpu_tensor; - framework::TensorCopy(dev_tensor, platform::CPUPlace(), &cpu_tensor); - - // if founded zero, the matrix is not invertible - // else the matrix is invertible - auto* res = cpu_tensor.data(); - return !(*res); -} - -template -class DeterminantGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto& orig_dev_ctx = context.template device_context(); - const auto* input = context.Input("Input"); - const auto* det = context.Input("Out"); - const auto* grad = - context.Input(framework::GradVarName("Out")); - auto* ddet = - context.Output(framework::GradVarName("Input")); - - auto input_dims_size = input->dims().size(); - if (input_dims_size > 2) { - PADDLE_ENFORCE_EQ( - grad->dims().size() + 2, input_dims_size, - platform::errors::InvalidArgument( - "The grad tensor of det dims size should 2 less than" - " input tensor's, but here differ %d", - input_dims_size - grad->dims().size())); - } else if (input_dims_size == 2) { - // input dims size 2 and grad dims size 1 is possible - PADDLE_ENFORCE_EQ( - grad->dims().size(), 1, - platform::errors::InvalidArgument( - "The grad tensor of det dims size should 2 less than" - " input tensor's, but here differ %d", - input_dims_size - grad->dims().size())); - } else { - // checked in forward, pass - } - - auto& dev_ctx = static_cast< - const typename framework::ConvertToPhiContext::TYPE&>( - orig_dev_ctx); - - // Check Whether the matrix is invertible - // (matrix A not invertible) == (det(A)=0) - if (!CheckMatrixInvertible(context, det)) { - // The matrix is not invertible - VLOG(3) << "The input matrix not invertible!"; - ddet->Resize(input->dims()); - phi::Full(dev_ctx, phi::vectorize(input->dims()), static_cast(0.0f), - ddet); - return; - } - - // The matrix is invertible - // let |A| = Determinant(A) - // Ref to https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf - // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2, - // -1) - - // First: inverse(A) - framework::Tensor inverse_A; - // A must be square matrices! 
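// [illustrative note] The identity implemented below is Jacobi's formula,
// per the Giles reference cited above: for invertible A,
//   d det(A) / dA = det(A) * inverse(A)^T
// so dL/dA = (dL/d det(A)) * det(A) * inverse(A)^T, computed here as
// unsqueeze(grad * det, [-1, -2]) broadcast-multiplied with the
// last-two-dims transpose of inverse(A).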
- inverse_A.Resize(input->dims()); - inverse_A.mutable_data(context.GetPlace()); - - phi::funcs::MatrixInverseFunctor mat_inv; - mat_inv(orig_dev_ctx, *input, &inverse_A); - - VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); - - // Second: inverse(A).transpose(-2, -1) - framework::Tensor transpose_inverse_A = - phi::TransposeLast2Dim(dev_ctx, inverse_A); - - VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: " - << transpose_inverse_A.dims(); - - // Third: dA * |A| - auto mul_dA_detA = phi::Multiply(dev_ctx, *grad, *det); - VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims(); - - // Fourth: unsqueeze(dA * |A|, [-1, -2]) - auto unsqueeze1 = phi::funcs::Unsqueeze(mul_dA_detA, -1); - auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); - VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims(); - - // Finally: unsqueeze(dA * |A|) * inverse(A) - auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); - - VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims(); - - framework::TensorCopy(res, context.GetPlace(), ddet); - - ddet->Resize(input->dims()); - VLOG(3) << "d|A| dims: " << ddet->dims(); - } -}; - template struct SlogDeterminantFunctor { void operator()(const Tensor& input, const framework::ExecutionContext ctx, @@ -280,7 +57,7 @@ struct SlogDeterminantFunctor { auto end_iter = input_vec.begin() + (i + 1) * rank * rank; std::vector sub_vec(begin_iter, end_iter); // get every square matrix data - typename EigenMatrix::MatrixType matrix(rank, rank); + typename phi::detail::EigenMatrix::MatrixType matrix(rank, rank); for (int64_t i = 0; i < rank; ++i) { for (int64_t j = 0; j < rank; ++j) { matrix(i, j) = sub_vec[rank * i + j]; @@ -311,7 +88,7 @@ class SlogDeterminantKernel : public framework::OpKernel { auto input_dim_size = input_dim.size(); auto* output = context.Output("Out"); - auto batch_count = GetBatchCount(input->dims()); + auto batch_count = phi::detail::GetBatchCount(input->dims()); VLOG(2) << "input dim:" << input->dims(); PADDLE_ENFORCE_GE( input_dim_size, 2, @@ -370,7 +147,9 @@ class SlogDeterminantGradKernel : public framework::OpKernel { // (matrix A not invertible) == (absslogdet(A)=0) auto slogdet_vec = slogdet->Split(1, 0); auto absslogdet_val = slogdet_vec[0]; - if (!CheckMatrixInvertible(context, &absslogdet_val)) { + if (!phi::detail::CheckMatrixInvertible< + T, typename framework::ConvertToPhiContext::TYPE>( + dev_ctx, &absslogdet_val)) { // The matrix is not invertible VLOG(3) << "The input matrix not invertible!"; dslogdet->Resize(input->dims()); diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc index 93fbff67e220b..ac8c12bcd7eba 100644 --- a/paddle/fluid/operators/diag_v2_op.cc +++ b/paddle/fluid/operators/diag_v2_op.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include - #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/unary.h" @@ -58,15 +56,56 @@ class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker { } }; +class DiagV2GradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "X", "X", "DiagV2Grad"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + framework::GradVarName("X"), "DiagV2Grad"); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.GetPlace()); + } +}; + +template +class DiagV2GradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("diag_v2_grad"); + grad_op->SetInput("X", this->Input("X")); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERER(DiagGradV2NoNeedBufferVarsInferer, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; + DECLARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor, PD_INFER_META(phi::DiagInferMeta)); -REGISTER_OPERATOR( - diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - DiagInferShapeFunctor); +REGISTER_OPERATOR(diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker, + ops::DiagV2GradOpMaker, + ops::DiagV2GradOpMaker, + DiagInferShapeFunctor); + +REGISTER_OPERATOR(diag_v2_grad, ops::DiagV2GradOp, + ops::DiagGradV2NoNeedBufferVarsInferer); diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 17665ad67e40e..144198367d538 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -32,10 +32,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/dropout_impl_util.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/fluid/platform/aligned_vector.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/functors.h" namespace paddle { @@ -177,12 +176,13 @@ __global__ void DropoutGradCUDAKernel( } template -void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, - bool is_test, +void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, const std::string dropout_implementation, float dropout_prob, bool upscale_in_train, - bool is_fix_seed, int seed_val, const Tensor& x, - const Tensor* seed, Tensor* mask, Tensor* y) { + bool is_fix_seed, int seed_val, + const framework::Tensor& x, + const framework::Tensor* seed, + framework::Tensor* mask, framework::Tensor* y) { auto& place = *dev_ctx.eigen_device(); int64_t x_numel = x.numel(); auto stream = dev_ctx.stream(); @@ -220,7 +220,8 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, // VectorizedRandomGenerator use curand_uniform4, so we only support // vec_size is 4; int vec_size = (phi::GetVectorizedSize(x_data) == 4) ? 4 : 1; - auto gpu_config = GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size); + auto gpu_config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size); auto offset = ((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size; @@ -278,11 +279,13 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, } template -void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, +void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx, const std::string dropout_implementation, - float dropout_prob, const Tensor& grad_y, - const Tensor& mask, int64_t size, - Tensor* grad_x, bool is_test = false) { + float dropout_prob, + const framework::Tensor& grad_y, + const framework::Tensor& mask, int64_t size, + framework::Tensor* grad_x, + bool is_test = false) { using MT = typename details::MPTypeTrait::Type; auto stream = dev_ctx.stream(); MT factor; diff --git a/paddle/fluid/operators/dropout_impl_util.h b/paddle/fluid/operators/dropout_impl_util.h index d7db7dddce388..c62d45570ba29 100644 --- a/paddle/fluid/operators/dropout_impl_util.h +++ b/paddle/fluid/operators/dropout_impl_util.h @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -inline void GetSeedDataAndIncrement(const platform::CUDADeviceContext& dev_ctx, +inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx, const framework::Tensor* seed, const bool is_fix_seed, const int seed_val, const int offset, uint64_t* seed_data, diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 7613b04bccfdc..3d9950902acfe 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/dropout_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -25,17 +27,6 @@ class DropoutOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Dropout"); - - auto x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", x_dims); - if (ctx->Attrs().Get("is_test") == false) { - ctx->SetOutputDim("Mask", x_dims); - } - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -173,18 +164,11 @@ class DropoutGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(dropout, DropoutInferShapeFunctor, + PD_INFER_META(phi::DropoutInferMeta)); + REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker, ops::DropoutGradOpMaker, - ops::DropoutGradOpMaker); + ops::DropoutGradOpMaker, + DropoutInferShapeFunctor); REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad); -REGISTER_OP_CPU_KERNEL( - dropout, ops::CPUDropoutKernel, - ops::CPUDropoutKernel, - ops::CPUDropoutKernel); -REGISTER_OP_CPU_KERNEL( - dropout_grad, - ops::DropoutGradKernel, - ops::DropoutGradKernel, - ops::DropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu deleted file mode 100644 index f6ddff1d0327d..0000000000000 --- a/paddle/fluid/operators/dropout_op.cu +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/dropout_impl.cu.h" -#include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -// It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT. -// Use std::random and thrust::random(thrust is a std library in CUDA) to -// implement uniform random. -template -class GPUDropoutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* seed = - context.HasInput("Seed") ? 
context.Input("Seed") : nullptr; - auto* y = context.Output("Out"); - y->mutable_data(context.GetPlace()); - float dropout_prob = context.Attr("dropout_prob"); - - auto& dropout_implementation = - context.Attr("dropout_implementation"); - bool upscale_in_train = (dropout_implementation == "upscale_in_train"); - - bool is_test = context.Attr("is_test"); - - auto& dev_ctx = context.cuda_device_context(); - auto* mask = context.Output("Mask"); - mask->mutable_data(context.GetPlace()); - - bool is_fix_seed = context.Attr("fix_seed"); - int seed_val = context.Attr("seed"); - DropoutFwGPUKernelDriver(dev_ctx, is_test, dropout_implementation, - dropout_prob, upscale_in_train, is_fix_seed, - seed_val, *x, seed, mask, y); - } -}; - -template -class GPUDropoutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* grad_x = context.Output(framework::GradVarName("X")); - auto* grad_y = context.Input(framework::GradVarName("Out")); - auto* mask = context.Input("Mask"); - grad_x->mutable_data(context.GetPlace()); - auto size = grad_x->numel(); - auto& dropout_implementation = - context.Attr("dropout_implementation"); - float dropout_prob = context.Attr("dropout_prob"); - - bool is_test = context.Attr("is_test"); - - auto& dev_ctx = - context.template device_context(); - DropoutGradGPUKernelDriver(dev_ctx, dropout_implementation, dropout_prob, - *grad_y, *mask, size, grad_x, is_test); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - dropout, ops::GPUDropoutKernel, - ops::GPUDropoutKernel, - ops::GPUDropoutKernel, - ops::GPUDropoutKernel); -REGISTER_OP_CUDA_KERNEL( - dropout_grad, ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h deleted file mode 100644 index ea6ed0e619474..0000000000000 --- a/paddle/fluid/operators/dropout_op.h +++ /dev/null @@ -1,151 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include -#include -#include - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenMatrix = framework::EigenMatrix; - -template -using EigenVector = framework::EigenVector; - -template -class CPUDropoutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* seed = - context.HasInput("Seed") ? 
context.Input("Seed") : nullptr; - auto* y = context.Output("Out"); - const auto* x_data = x->data(); - auto* y_data = y->mutable_data(context.GetPlace()); - float dropout_prob = context.Attr("dropout_prob"); - - auto& dropout_implementation = - context.Attr("dropout_implementation"); - bool upscale_in_train = (dropout_implementation == "upscale_in_train"); - if (!context.Attr("is_test")) { - auto* mask = context.Output("Mask"); - auto* mask_data = mask->mutable_data(context.GetPlace()); - size_t size = phi::product(mask->dims()); - - // Special case when dropout_prob is 1.0 - if (dropout_prob == 1.0f) { - std::memset(y_data, 0, size * sizeof(*y_data)); // NOLINT - std::memset(mask_data, 0, size * sizeof(*mask_data)); // NOLINT - return; - } - // std::minstd_rand engine; - // NOTE: fixed seed should only be used in unittest or for debug. - // Guarantee to use random seed in training. - int seed_data = 0; - if (seed) { - seed_data = *(seed->data()); - } else { - seed_data = - context.Attr("fix_seed") ? context.Attr("seed") : 0; - } - auto engine = framework::GetCPURandomEngine(seed_data); - - std::uniform_real_distribution dist(0, 1); - - for (size_t i = 0; i < size; ++i) { - if (dist(*engine) < dropout_prob) { - mask_data[i] = 0; - y_data[i] = 0; - } else { - mask_data[i] = 1; - if (upscale_in_train) { - y_data[i] = x_data[i] / static_cast(1.0f - dropout_prob); - } else { - y_data[i] = x_data[i]; - } - } - } - } else { - if (upscale_in_train) { - const auto* X_data = x->data(); - auto* Y_data = y->mutable_data(context.GetPlace()); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int i = 0; i < x->numel(); i++) { - Y_data[i] = X_data[i]; - } - } else { - auto X = EigenMatrix::Reshape(*x, 1); - auto Y = EigenMatrix::Reshape(*y, 1); - auto& place = - *context.template device_context().eigen_device(); - Y.device(place) = X * static_cast(1.0f - dropout_prob); - } - } - } -}; -template -class DropoutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* grad_x = context.Output(framework::GradVarName("X")); - auto* grad_y = context.Input(framework::GradVarName("Out")); - auto* mask = context.Input("Mask"); - grad_x->mutable_data(context.GetPlace()); - - auto dX = EigenVector::Flatten(*grad_x); - auto dY = EigenVector::Flatten(*grad_y); - - auto& place = - *context.template device_context().eigen_device(); - auto& dropout_implementation = - context.Attr("dropout_implementation"); - if (context.Attr("is_test") == true) { - if (dropout_implementation == "upscale_in_train") { - dX.device(place) = static_cast(1) * dY; - } else { - float dropout_prob = context.Attr("dropout_prob"); - dX.device(place) = dY * static_cast(1.0f - dropout_prob); - } - } else { - auto M = EigenVector::Flatten(*mask); - if (dropout_implementation == "upscale_in_train") { - float dropout_prob = context.Attr("dropout_prob"); - if (dropout_prob == 1.0f) { - dX.device(place) = static_cast(0) * dY; - } else { - dX.device(place) = - dY * M.cast() / static_cast(1.0f - dropout_prob); - } - } else { - dX.device(place) = dY * M.cast(); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc index 6aae566760623..07b3b53811625 100644 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ b/paddle/fluid/operators/dropout_op_npu.cc @@ -15,8 +15,8 @@ limitations under the License. 
 */
 #include <memory>
 #include <string>
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 #include "paddle/phi/core/ddim.h"
diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc
index 206d9a6c5e9c9..bdf08646f1d8b 100644
--- a/paddle/fluid/operators/dropout_op_test.cc
+++ b/paddle/fluid/operators/dropout_op_test.cc
@@ -24,14 +24,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

 namespace f = paddle::framework;
 namespace p = paddle::platform;

-USE_OP(dropout);
+USE_OP_ITSELF(dropout);

 void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
   // init
diff --git a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc
index 07b7e2cc7c09b..7d8660f238abc 100644
--- a/paddle/fluid/operators/dropout_op_xpu.cc
+++ b/paddle/fluid/operators/dropout_op_xpu.cc
@@ -8,15 +8,17 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/dropout_op.h"
+
 #include <memory>
 #include <string>
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"

 namespace paddle {
 namespace operators {

 #ifdef PADDLE_WITH_XPU
+using Tensor = framework::Tensor;
 template <typename DeviceContext, typename T>
 class DropoutXPUKernel : public framework::OpKernel<T> {
   using XPUTyp = typename XPUTypeTrait<T>::Type;
diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h
index 5e4c83e1a45eb..6daf05a9d778d 100644
--- a/paddle/fluid/operators/eig_op.h
+++ b/paddle/fluid/operators/eig_op.h
@@ -21,13 +21,13 @@
 #include "paddle/fluid/operators/transpose_op.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/kernels/complex_kernel.h"
+#include "paddle/phi/kernels/elementwise_kernel.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
 #include "paddle/phi/kernels/funcs/diag_functor.h"
 #include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/slice.h"
 #include "paddle/phi/kernels/funcs/unsqueeze.h"
-#include "paddle/phi/kernels/math_kernel.h"
 #include "paddle/phi/kernels/matmul_kernel.h"
 #include "paddle/phi/kernels/transpose_kernel.h"
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h
index a995877778e47..c28abb916b7a7 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h
@@ -27,7 +27,7 @@ limitations under the License.
*/ // only can include the headers in paddle/phi/include dirs #include "paddle/phi/kernels/elementwise_grad_kernel.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #endif namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 8e0bf78e9b7f9..54931d99292f9 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -1,11 +1,8 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -90,86 +87,6 @@ struct MinFunctor { template using Complex = paddle::platform::complex; -// Fmax -template -struct FMaxFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { - return std::fmax(a, b); - } -}; - -template <> -struct FMaxFunctor { - inline HOSTDEVICE paddle::platform::float16 operator()( - const paddle::platform::float16 a, - const paddle::platform::float16 b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmax(float_a, float_b); - return static_cast(result); - } -}; - -template <> -struct FMaxFunctor { - inline HOSTDEVICE int operator()(const int a, const int b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmax(float_a, float_b); - return std::lrint(result); - } -}; - -template <> -struct FMaxFunctor { - inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const { - double double_a = static_cast(a); - double double_b = static_cast(b); - auto result = std::fmax(double_a, double_b); - return std::llrint(result); - } -}; - -// Fmin -template -struct FMinFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { - return std::fmin(a, b); - } -}; - -template <> -struct FMinFunctor { - inline HOSTDEVICE paddle::platform::float16 operator()( - const paddle::platform::float16 a, - const paddle::platform::float16 b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmin(float_a, float_b); - return static_cast(result); - } -}; - -template <> -struct FMinFunctor { - inline HOSTDEVICE int operator()(const int a, const int b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmin(float_a, float_b); - return std::lrint(result); - } -}; - -template <> -struct FMinFunctor { - inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const { - double double_a = static_cast(a); - double double_b = static_cast(b); - auto result = std::fmin(double_a, double_b); - return std::llrint(result); - } -}; - template struct MinGradXFunctor { inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { @@ -196,47 +113,6 @@ struct MinGradXYFunctor { } }; -template -struct MulGradFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { return a * b; } -}; -template -struct MulGradFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b) const { - Complex 
b_conj(b.real, -b.imag); - return a * b_conj; - } -}; - -template -struct MulGradXYFunctor { - inline HOSTDEVICE phi::Array operator()(const InT a, const InT b, - const InT c) { - phi::Array outs; - // dx = dout * y - outs[0] = a * b; - // dy = dout * x - outs[1] = a * c; - return outs; - } -}; - -template -struct MulGradXYFunctor, Complex> { - inline HOSTDEVICE phi::Array, 2> operator()( - const Complex a, const Complex b, const Complex c) { - phi::Array, 2> outs; - // dx = dout * y - Complex b_conj(b.real, -b.imag); - outs[0] = a * b_conj; - // dy = dout * x - Complex c_conj(c.real, -c.imag); - outs[1] = a * c_conj; - return outs; - } -}; - // Ternary compare template struct MaxGradXFunctor { diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cc b/paddle/fluid/operators/elementwise/elementwise_max_op.cc index 91da732ef0d3d..d91315cc511aa 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cc @@ -151,21 +151,3 @@ REGISTER_OPERATOR(elementwise_fmax, ops::ElementwiseOp, ops::ElementwiseFMaxGradOpMaker); REGISTER_OPERATOR(elementwise_fmax_grad, ops::ElementwiseOpGrad); - -REGISTER_OP_CPU_KERNEL( - elementwise_fmax, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel); -REGISTER_OP_CPU_KERNEL( - elementwise_fmax_grad, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cu b/paddle/fluid/operators/elementwise/elementwise_max_op.cu index 123332a4a23de..0d5f56fda1732 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cu @@ -86,21 +86,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMaxGradKernel, ops::ElementwiseMaxGradKernel); - -REGISTER_OP_CUDA_KERNEL( - elementwise_fmax, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel); -REGISTER_OP_CUDA_KERNEL( - elementwise_fmax_grad, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.h b/paddle/fluid/operators/elementwise/elementwise_max_op.h index cff30be50a3d1..afe1073d89a06 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.h @@ -35,21 +35,6 @@ class ElementwiseMaxKernel : public framework::OpKernel { } }; -template -class ElementwiseFMaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - - z->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - FMaxFunctor(), z); - } -}; - template struct MaxGradDx { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { @@ -104,88 +89,5 @@ class ElementwiseMaxGradKernel : public ElemwiseGradKernel { } }; -template -struct FMaxGradDx { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast((x >= y) || isnan(y)); - } -}; - -template <> -struct FMaxGradDx { - 
HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - (x >= y) || paddle::platform::isnan(y)); - } -}; - -template <> -struct FMaxGradDx { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast((x >= y)); - } -}; - -template <> -struct FMaxGradDx { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast((x >= y)); - } -}; - -template -struct FMaxGradDy { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast(!((x >= y) || isnan(y))); - } -}; - -template <> -struct FMaxGradDy { - HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - !((x >= y) || paddle::platform::isnan(y))); - } -}; - -template <> -struct FMaxGradDy { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast(!((x >= y))); - } -}; - -template <> -struct FMaxGradDy { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast(!((x >= y))); - } -}; - -template -class ElementwiseFMaxGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* out = dout; // Fake out, not used - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, FMaxGradDy>( - ctx, *x, *y, *out, *dout, axis, dx, dy, FMaxGradDx(), - FMaxGradDy()); - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cc b/paddle/fluid/operators/elementwise/elementwise_min_op.cc index 3a1951999546e..dad80a2c33f3a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cc @@ -147,21 +147,3 @@ REGISTER_OPERATOR(elementwise_fmin, ops::ElementwiseOp, ops::ElementwiseFMinGradOpMaker); REGISTER_OPERATOR(elementwise_fmin_grad, ops::ElementwiseOpGrad); - -REGISTER_OP_CPU_KERNEL( - elementwise_fmin, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel); -REGISTER_OP_CPU_KERNEL( - elementwise_fmin_grad, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cu b/paddle/fluid/operators/elementwise/elementwise_min_op.cu index 5af985567d898..fb8bc9ac7f83c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cu @@ -82,21 +82,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMinGradKernel, ops::ElementwiseMinGradKernel); - -REGISTER_OP_CUDA_KERNEL( - elementwise_fmin, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - 
ops::ElementwiseFMinKernel); -REGISTER_OP_CUDA_KERNEL( - elementwise_fmin_grad, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.h b/paddle/fluid/operators/elementwise/elementwise_min_op.h index 88fb044d42206..283ad2adde978 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.h @@ -35,21 +35,6 @@ class ElementwiseMinKernel : public framework::OpKernel { } }; -template -class ElementwiseFMinKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - - z->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - FMinFunctor(), z); - } -}; - template struct MinGradDx { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { @@ -124,89 +109,5 @@ class ElementwiseMinGradKernel : public ElemwiseGradKernel { ElementwiseMinGrad(ctx, x, y, out, dout, dx, dy); } }; - -template -struct FMinGradDx { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast((x <= y) || isnan(y)); - } -}; - -template <> -struct FMinGradDx { - HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - (x <= y) || paddle::platform::isnan(y)); - } -}; - -template <> -struct FMinGradDx { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast((x <= y)); - } -}; - -template <> -struct FMinGradDx { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast((x <= y)); - } -}; - -template -struct FMinGradDy { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast(!((x <= y) || isnan(y))); - } -}; - -template <> -struct FMinGradDy { - HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - !((x <= y) || paddle::platform::isnan(y))); - } -}; - -template <> -struct FMinGradDy { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast(!((x <= y))); - } -}; - -template <> -struct FMinGradDy { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast(!((x <= y))); - } -}; - -template -class ElementwiseFMinGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* out = dout; // Fake out, not used - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, FMinGradDy>( - ctx, *x, *y, *out, *dout, axis, dx, dy, FMinGradDx(), - FMinGradDy()); - } -}; } // namespace operators } // namespace paddle diff --git 
a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index e172279145e28..830e09eeae481 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -173,55 +173,6 @@ REGISTER_OP_CPU_KERNEL( paddle::platform::complex>, ops::ElementwiseMulKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel>, - ops::ElementwiseMulGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_grad_grad, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel>, - ops::ElementwiseMulDoubleGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_triple_grad, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel>, - ops::ElementwiseMulTripleGradKernel>); REGISTER_OP_VERSION(elementwise_mul) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 45c87a27a180a..f7b9fd1e265f5 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -63,33 +63,6 @@ class ElementwiseMulKernel } }; -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - const auto& dev_ctx = - ctx.template device_context(); - const auto place = ctx.GetPlace(); - - if (dx != nullptr && dy != nullptr) { - std::vector ins = {dout, y, x}; - GetGradXAndYOut( - dev_ctx, place, axis, ins, dout, dx, dy, MulGradXYFunctor()); - } else if (dx != nullptr && dy == nullptr) { - std::vector ins = {dout, y}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dx, MulGradFunctor()); - } else if (dx == nullptr && dy != nullptr) { - std::vector ins = {dout, x}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dy, MulGradFunctor()); - } -} - } // namespace operators } // namespace paddle @@ -103,44 +76,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel>, ops::ElementwiseMulKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel>, - ops::ElementwiseMulGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_grad_grad, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, 
- ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel>, - ops::ElementwiseMulDoubleGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_triple_grad, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel>, - ops::ElementwiseMulTripleGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index c81266d584468..6f4aba93d56e2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/platform/cpu_info.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" namespace paddle { namespace operators { @@ -137,244 +137,6 @@ class ElementwiseMulKernel : public framework::OpKernel { } } }; -template -struct MulGradDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; } -}; - -template -struct MulGradDX> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex y_conj(y.real, -y.imag); - return dout * y_conj; - } -}; - -template -struct MulGradDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; } -}; - -template -struct MulGradDY> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex x_conj(x.real, -x.imag); - return dout * x_conj; - } -}; -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, MulGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX(), MulGradDY()); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy); -#endif - -template -class ElementwiseMulGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = dout; // out is not necessary - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - ElementwiseMulGrad(ctx, x, y, out, dout, dx, dy); - } -}; - -template -class ElementwiseMulDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const 
override { - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* ddout = ctx.Output("DDOut"); - - if (ddout) ddout->mutable_data(ctx.GetPlace()); - - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, x, ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - - // dx = dout * ddy - // dy = dout * ddx - // ddout = ddx * y + x * ddy - // change computation sequence to save memory, so ddout can inplace ddx and - // dx can be used as 'tmp' tensor - // (1) dx = x * ddy - // (2) dy = dout * ddx - // (3) ddout = ddx * y - // (4) ddout = ddout + dx - // (5) dx = dout * ddy - if (ddout) { - int axis = ctx.Attr("axis"); - auto& place = - *ctx.template device_context().eigen_device(); - // size(ddout) > size(ddx), ddout can't use memory of ddx using inplace - if (ddout->numel() > ddx->numel()) { - ElemwiseGradCompute, MulGradDY>( - ctx, ddx_safe, ddy_safe, *dout, *dout, axis, dx, dy, MulGradDX(), - MulGradDY()); - - Tensor ddout_tmp; - ddout_tmp.mutable_data(ddout->dims(), ctx.GetPlace()); - - default_elementwise_mul(ctx, y, &ddx_safe, ddout); - default_elementwise_mul(ctx, &ddy_safe, x, - &ddout_tmp); - - auto ddout_t = framework::EigenVector::Flatten(*ddout); - auto ddout_tmp_t = framework::EigenVector::Flatten(ddout_tmp); - ddout_t.device(place) = ddout_t + ddout_tmp_t; - } else { - // use dx to save memory, other than alloc tmp tensor - Tensor* ddout_tmp = dx; - - default_elementwise_mul(ctx, x, &ddy_safe, ddout_tmp); - // NOTE: in the following ElemwiseGradCompute, for the - // first output tensor is nullptr, the branch to calculate first - // output tensor will not be activated, DivGradDx function will not - // be called and can be ignored, the first branch has little effect - // on running speed. 
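The five-step reordering in the comment above is purely a peak-memory optimization: ddout = ddx * y + x * ddy needs one temporary, and the kernel borrows the dx output as that scratch buffer instead of allocating a fresh tensor (the in-place variant, where ddout reuses ddx, is only legal when ddout is no larger than ddx, i.e. no broadcast on ddx). A minimal sketch of the same ordering on plain float vectors, broadcasting ignored and with a hypothetical mul() standing in for default_elementwise_mul; step (2), dy = dout * ddx, is omitted since it is independent of the trick:

#include <vector>

// Hypothetical stand-in for default_elementwise_mul: c = a * b, elementwise.
static void mul(const std::vector<float>& a, const std::vector<float>& b,
                std::vector<float>* c) {
  c->resize(a.size());
  for (size_t i = 0; i < a.size(); ++i) (*c)[i] = a[i] * b[i];
}

// ddout = ddx * y + x * ddy computed with dx as the only scratch buffer,
// mirroring steps (1)-(5) above: dx temporarily holds x * ddy, ddout holds
// ddx * y, the two are summed into ddout, and only then is dx overwritten
// with its real value dout * ddy.
void DoubleGradNoExtraTemp(const std::vector<float>& x,
                           const std::vector<float>& y,
                           const std::vector<float>& dout,
                           const std::vector<float>& ddx,
                           const std::vector<float>& ddy,
                           std::vector<float>* dx,
                           std::vector<float>* ddout) {
  mul(x, ddy, dx);     // (1) dx used as scratch: x * ddy
  mul(ddx, y, ddout);  // (3) ddout = ddx * y
  for (size_t i = 0; i < ddout->size(); ++i)
    (*ddout)[i] += (*dx)[i];  // (4) ddout += scratch
  mul(dout, ddy, dx);  // (5) dx gets its real value: dout * ddy
}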
- ElemwiseGradCompute, MulGradDY>( - ctx, ddx_safe, ddy_safe, *dout, *dout, axis, nullptr, dy, - MulGradDX(), MulGradDY()); - default_elementwise_mul(ctx, &ddx_safe, y, ddout); - - auto ddout_t = framework::EigenVector::Flatten(*ddout); - auto ddout_tmp_t = framework::EigenVector::Flatten(*ddout_tmp); - ddout_t.device(place) = ddout_t + ddout_tmp_t; - default_elementwise_mul(ctx, dout, &ddy_safe, dx); - } - } - } -}; - -template -class ElementwiseMulTripleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - // get input - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* d_dx = ctx.Input("D_DX"); - auto* d_dy = ctx.Input("D_DY"); - auto* d_ddout = ctx.Input("D_DDOut"); - - // get output - auto* out_d_x = ctx.Output("D_X"); - auto* out_d_y = ctx.Output("D_Y"); - auto* out_d_dout = ctx.Output("D_DOut"); - - auto* out_d_ddx = ctx.Output("D_DDX"); - auto* out_d_ddy = ctx.Output("D_DDY"); - - if (out_d_x) out_d_x->mutable_data(x->dims(), ctx.GetPlace()); - if (out_d_y) out_d_y->mutable_data(y->dims(), ctx.GetPlace()); - if (out_d_dout) out_d_dout->mutable_data(dout->dims(), ctx.GetPlace()); - if (out_d_ddx) out_d_ddx->mutable_data(x->dims(), ctx.GetPlace()); - if (out_d_ddy) out_d_ddy->mutable_data(y->dims(), ctx.GetPlace()); - - auto& place = *ctx.template device_context().eigen_device(); - - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, x, ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - - if (d_ddout) { - if (out_d_x) { - // out_d_x = ddy * d_ddout - default_elementwise_mul(ctx, &ddy_safe, d_ddout, - out_d_x); - } - if (out_d_y) { - // out_d_y = ddx * d_ddout - default_elementwise_mul(ctx, &ddx_safe, d_ddout, - out_d_y); - } - } - - if (out_d_dout) { - // get out_d_dout - // out_d_dout = ddy * d_dx + d_dy * ddx - Tensor out_d_dout_tmp; - out_d_dout_tmp.mutable_data(dout->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, d_dy, &ddx_safe, - out_d_dout); - default_elementwise_mul(ctx, &ddy_safe, d_dx, - &out_d_dout_tmp); - auto out_d_dout_t = framework::EigenVector::Flatten(*out_d_dout); - auto out_d_dout_tmp_t = - framework::EigenVector::Flatten(out_d_dout_tmp); - out_d_dout_t.device(place) = out_d_dout_t + out_d_dout_tmp_t; - } - - if (out_d_ddx) { - // get out_d_ddx - // out_d_ddx = dout * d_dy + y * d_ddout - Tensor out_d_ddx_tmp; - out_d_ddx_tmp.mutable_data(ddx->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, dout, d_dy, out_d_ddx); - default_elementwise_mul(ctx, y, d_ddout, - &out_d_ddx_tmp); - auto out_d_ddx_t = framework::EigenVector::Flatten(*out_d_ddx); - auto out_d_ddx_tmp_t = framework::EigenVector::Flatten(out_d_ddx_tmp); - out_d_ddx_t.device(place) = out_d_ddx_t + out_d_ddx_tmp_t; - } - - if (out_d_ddy) { - // get out_d_ddy - // out_d_ddy = dout * d_dx + x * d_ddout - Tensor out_d_ddy_tmp; - out_d_ddy_tmp.mutable_data(ddy->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, dout, d_dx, out_d_ddy); - default_elementwise_mul(ctx, x, d_ddout, - &out_d_ddy_tmp); - auto out_d_ddy_t = framework::EigenVector::Flatten(*out_d_ddy); - auto out_d_ddy_tmp_t = framework::EigenVector::Flatten(out_d_ddy_tmp); - out_d_ddy_t.device(place) = out_d_ddy_t + out_d_ddy_tmp_t; - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc 
b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc index fc128a88f2096..3e9263fe93acd 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 838df2e162591..f9347d281043e 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,100 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto& dev_ctx = - ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto tz = phi::vectorize(dout->dims()); - memory::data_type dout_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(dout->dtype())); - platform::ReorderMKLDNNHandler handler( - tz, framework::TransToProtoVarType(dout->dtype()), dout_type, - onednn_engine); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - auto reorder_src_memory_p = handler.AcquireSrcMemory( - dout->format(), platform::to_void_cast(dout->data())); - - if (dx) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); - astream.wait(); - - dx->set_layout(DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } - - if (dy) { - // Direct copy - if (dout->dims() == dy->dims()) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, 
- platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, - *reorder_dst_memory_p); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } else { - // Broadcasting - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - reduction_p->execute(astream, {{DNNL_ARG_SRC, *reorder_src_memory_p}, - {DNNL_ARG_DST, *dy_memory_p}}); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - } - } - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; REGISTER_OP_KERNEL( @@ -116,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) -REGISTER_OP_KERNEL(elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseAddMKLDNNGradKernel, - ops::EltwiseAddMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc index 367d602f5902e..c68aa8d3d1b46 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc @@ -1,146 +1,28 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" - -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseDivMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - if (dx) { - // dx = dout / y - - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(), - dout, y, dx, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - const auto dst_dx_memory = handler.AcquireDstMemory(dx); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_y_memory}, - {DNNL_ARG_DST, *dst_dx_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory)); - } - - if (dy) { - // dy = -dout * out / y - - platform::BinaryMKLDNNHandler y_handler( - dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(), y, - y, nullptr, 1.0f, 1.0f, 1.0f); - - const auto y_memory = y_handler.AcquireSrcMemory(y); - - dnnl::post_ops po; - po.append_binary(dnnl::algorithm::binary_div, y_memory->get_desc()); - - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, out, nullptr, -1.0f, 1.0f, 1.0f, po); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_out_memory = handler.AcquireSecondSrcMemory(out); - - // If broadcasting is in use then let's write to temporary - // buffer allocated by oneDNN - const auto dst_dy_memory = (dout->dims() == dy->dims()) - ? 
handler.AcquireDstMemory(dy)
-                                 : handler.AcquireDstMemory();
-
-      const auto binary_prim = handler.AcquireForwardPrimitive();
-
-      const std::unordered_map<int, dnnl::memory> args = {
-          {DNNL_ARG_SRC_0, *src_dout_memory},
-          {DNNL_ARG_SRC_1, *src_out_memory},
-          {DNNL_ARG_DST, *dst_dy_memory},
-          {DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, *y_memory}};
-
-      binary_prim->execute(astream, args);
-      astream.wait();
-
-      dy->set_layout(framework::DataLayout::kMKLDNN);
-
-      // Reduction is needed for broadcasting scenario
-      if (dout->dims() != dy->dims()) {
-        platform::ReductionMKLDNNHandler<T> handler_sum(
-            dnnl::algorithm::reduction_sum, 0.0f, 0.0f, mkldnn_engine,
-            ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy));
-        auto dy_memory_p = handler_sum.AcquireDstMemory(dy);
-        auto reduction_p = handler_sum.AcquireForwardPrimitive();
-
-        // As source we use mem object with results from binary operation
-        reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory},
-                                       {DNNL_ARG_DST, *dy_memory_p}});
-        astream.wait();
-        dy->set_format(
-            platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape(
-                phi::vectorize<int64_t>(dy->dims()))));
-
-      } else {
-        dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory));
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-// TODO(piotrekobi) add int8, uint8 support
-REGISTER_OP_KERNEL(elementwise_div, MKLDNN, paddle::platform::CPUPlace,
-                   ops::EltwiseMKLDNNKernel<float, dnnl::algorithm::binary_div>,
-                   ops::EltwiseMKLDNNKernel<paddle::platform::bfloat16,
-                                            dnnl::algorithm::binary_div>)
-
-REGISTER_OP_KERNEL(elementwise_div_grad, MKLDNN, paddle::platform::CPUPlace,
-                   ops::EltwiseDivMKLDNNGradKernel<float>,
-                   ops::EltwiseDivMKLDNNGradKernel<paddle::platform::bfloat16>)
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
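Both the per-op grad kernel deleted above and the unified EltwiseMKLDNNGradKernel that replaces it (added in elementwise_mkldnn_op.h below) follow the same pattern: compute dy at dout's shape, then apply a reduction_sum over the broadcast axes whenever dout->dims() != dy->dims() (for elementwise_div the calculus is dx = dout / y and dy = -dout * out / y). The reduced target shape comes from CalculateBroadcastedDims; a minimal standalone sketch of that shape logic, using std::vector<int64_t> in place of tensor dims (function and variable names here are illustrative, not Paddle API):

#include <cstdint>
#include <cstdio>
#include <vector>

// For dout with shape src and dy with shape dst, build a shape of the same
// rank as src that keeps a dimension where the two shapes match and collapses
// it to 1 where dy was broadcast -- the shape the reduction primitive
// reduces dout down to.
std::vector<int64_t> BroadcastedDims(const std::vector<int64_t>& src,
                                     const std::vector<int64_t>& dst) {
  size_t j = 0;
  std::vector<int64_t> out(src.size(), 1);
  for (size_t i = 0; i < src.size(); ++i) {
    out[i] = (src[i] != dst[j]) ? 1 : dst[j++];
    if (j == dst.size()) break;
  }
  return out;
}

int main() {
  // dout: [16, 3, 8, 8], dy: [3] -> reduce dout to shape [1, 3, 1, 1].
  auto dims = BroadcastedDims({16, 3, 8, 8}, {3});
  for (auto d : dims) std::printf("%lld ", static_cast<long long>(d));
}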
+ +#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(elementwise_div, MKLDNN, paddle::platform::CPUPlace, + ops::EltwiseMKLDNNKernel, + ops::EltwiseMKLDNNKernel) + +REGISTER_OP_KERNEL( + elementwise_div_grad, MKLDNN, paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index 763fc5f267410..d1a1aa3008c8b 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -15,23 +15,77 @@ #pragma once #include #include -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/framework/data_layout_transform.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace operators { -using framework::DataLayout; -using framework::Tensor; using dnnl::memory; using dnnl::primitive; using dnnl::stream; +using framework::DataLayout; +using framework::Tensor; + +inline std::vector CalculateBroadcastedDims(const Tensor* x, + const Tensor* y) { + const auto src_tz = phi::vectorize(x->dims()); + const auto dst_tz = phi::vectorize(y->dims()); + + size_t j = 0; + std::vector dst_tz_ex(src_tz.size(), 1); + for (size_t i = 0; i < src_tz.size(); ++i) { + dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++]; + if (j == dst_tz.size()) break; + } + + return dst_tz_ex; +} template class EltwiseMKLDNNKernel : public framework::OpKernel { + private: + dnnl::post_ops get_post_ops(const framework::ExecutionContext& ctx) const { + dnnl::post_ops post_operations; + if (ctx.HasAttr("activation_type")) { + const float scale = ctx.HasAttr("activation_scale") + ? ctx.Attr("activation_scale") + : 1.0f; + const float alpha = ctx.HasAttr("activation_alpha") + ? ctx.Attr("activation_alpha") + : 0.0f; + const float beta = ctx.HasAttr("activation_beta") + ? 
ctx.Attr("activation_beta") + : 0.0f; + + static std::unordered_map algo_map = { + {"relu", dnnl::algorithm::eltwise_relu}, + {"tanh", dnnl::algorithm::eltwise_tanh}, + {"leaky_relu", dnnl::algorithm::eltwise_relu}, + {"swish", dnnl::algorithm::eltwise_swish}, + {"hardswish", dnnl::algorithm::eltwise_hardswish}, + {"sqrt", dnnl::algorithm::eltwise_sqrt}, + {"abs", dnnl::algorithm::eltwise_abs}, + {"clip", dnnl::algorithm::eltwise_clip}, + {"gelu", dnnl::algorithm::eltwise_gelu_erf}, + {"gelu_tanh", dnnl::algorithm::eltwise_gelu_tanh}, + {"relu6", dnnl::algorithm::eltwise_bounded_relu}, + {"sigmoid", dnnl::algorithm::eltwise_logistic}}; + + const auto& activation_type = + algo_map.find(ctx.Attr("activation_type")); + + if (activation_type != algo_map.end()) { + post_operations.append_eltwise(scale, activation_type->second, alpha, + beta); + } + } + return post_operations; + } + public: void Compute(const framework::ExecutionContext& ctx) const override { const auto& dev_ctx = @@ -47,9 +101,9 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { float scale_o = ctx.Attr("Scale_out"); int axis = ctx.Attr("axis"); - platform::BinaryMKLDNNHandler handler(BINARY_OP, axis, mkldnn_engine, - ctx.GetPlace(), x, y, z, scale_x, - scale_y, scale_o); + platform::BinaryMKLDNNHandler handler( + BINARY_OP, axis, mkldnn_engine, ctx.GetPlace(), x, y, z, scale_x, + scale_y, scale_o, get_post_ops(ctx)); const auto src_x_memory = handler.AcquireSrcMemory(x); const auto src_y_memory = handler.AcquireSecondSrcMemory(y); @@ -64,7 +118,7 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { // operation. const bool reuse_x_memopry = x->numel() == z->numel() && x->IsSharedBufferWith(*z); - std::shared_ptr dst_memory = nullptr; + std::shared_ptr dst_memory; if (reuse_x_memopry) { dst_memory = src_x_memory; // NOTE(chenfeiyu): when the output reuses memory from other tensor rather @@ -96,19 +150,187 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { } }; -inline std::vector CalculateBroadcastedDims(const Tensor* x, - const Tensor* y) { - const auto src_tz = phi::vectorize(x->dims()); - const auto dst_tz = phi::vectorize(y->dims()); +template +class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); + using Tensor = framework::Tensor; - size_t j = 0; - std::vector dst_tz_ex(src_tz.size(), 1); - for (size_t i = 0; i < src_tz.size(); ++i) { - dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 
1 : dst_tz[j++]; - if (j == dst_tz.size()) break; - } + auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); - return dst_tz_ex; -} + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* dout = ctx.Input(framework::GradVarName("Out")); + + int axis = ctx.Attr("axis"); + + auto tz = phi::vectorize(dout->dims()); + auto proto_type_dout = framework::TransToProtoVarType(dout->dtype()); + + platform::ReorderMKLDNNHandler reorder_handler( + tz, proto_type_dout, framework::ToMKLDNNDataType(proto_type_dout), + onednn_engine); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + dout->format(), platform::to_void_cast(dout->data())); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + + if (dx) { + std::shared_ptr dst_memory; + + // elementwise_add & elementwise_sub + if (BINARY_OP == dnnl::algorithm::binary_add || + BINARY_OP == dnnl::algorithm::binary_sub) { + dst_memory = reorder_handler.AcquireDstMemory(dx, dout->format(), + ctx.GetPlace()); + auto reorder_p = + reorder_handler.AcquireReorder(dst_memory, reorder_src_memory_p); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); + + reorder_p->execute(astream, *reorder_src_memory_p, *dst_memory); + } else { // elementwise_mul & elementwise_div + platform::BinaryMKLDNNHandler binary_handler( + BINARY_OP, axis, onednn_engine, ctx.GetPlace(), dout, y, dx, 1.0f, + 1.0f, 1.0f); + + const auto src_dout_memory = binary_handler.AcquireSrcMemory(dout); + const auto src_y_memory = binary_handler.AcquireSecondSrcMemory(y); + dst_memory = binary_handler.AcquireDstMemory(dx); + + const auto binary_prim = binary_handler.AcquireForwardPrimitive(); + + const std::unordered_map args = { + {DNNL_ARG_SRC_0, *src_dout_memory}, + {DNNL_ARG_SRC_1, *src_y_memory}, + {DNNL_ARG_DST, *dst_memory}}; + + binary_prim->execute(astream, args); + } + astream.wait(); + + dx->set_layout(framework::DataLayout::kMKLDNN); + dx->set_format(platform::GetMKLDNNFormat(*dst_memory)); + } + + if (dy) { + dnnl::primitive_attr broadcast_reduction_attr; + std::shared_ptr broadcast_src_memory; + std::shared_ptr dst_memory; + + // elementwise_add & elementwise_sub + if (BINARY_OP == dnnl::algorithm::binary_add || + BINARY_OP == dnnl::algorithm::binary_sub) { + if (dout->dims() == dy->dims()) { + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + dy, dout->format(), ctx.GetPlace()); + + dnnl::primitive_attr reorder_attr; + std::vector scales(1); + scales[0] = (BINARY_OP == dnnl::algorithm::binary_add) ? 
1 : -1; + reorder_attr.set_output_scales(0, scales); + auto reorder_p = std::make_shared( + *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); + reorder_p->execute(astream, *reorder_src_memory_p, + *reorder_dst_memory_p); + + dst_memory = reorder_dst_memory_p; + } else { + broadcast_src_memory = reorder_src_memory_p; + } + } else { // elementwise_mul & elementwise_div + std::unordered_map args; + std::shared_ptr binary_prim; + std::shared_ptr post_op_memory; + std::shared_ptr src_0_memory; + std::shared_ptr src_1_memory; + + platform::BinaryMKLDNNHandler binary_handler( + dnnl::algorithm::binary_mul, axis, onednn_engine, ctx.GetPlace(), + dout, x, nullptr, 1.0f, 1.0f, 1.0f); + + src_1_memory = binary_handler.AcquireSecondSrcMemory(x); + + if (BINARY_OP == dnnl::algorithm::binary_div) { + platform::BinaryMKLDNNHandler post_op_binary_handler( + dnnl::algorithm::binary_div, axis, onednn_engine, ctx.GetPlace(), + y, y, nullptr, 1.0f, 1.0f, 1.0f); + + post_op_memory = post_op_binary_handler.AcquireSrcMemory(y); + + dnnl::post_ops po; + po.append_binary(dnnl::algorithm::binary_div, + post_op_memory->get_desc()); + + binary_handler = platform::BinaryMKLDNNHandler( + dnnl::algorithm::binary_mul, axis, onednn_engine, ctx.GetPlace(), + dout, out, nullptr, -1.0f, 1.0f, 1.0f, po); + + src_1_memory = binary_handler.AcquireSecondSrcMemory(out); + } + + src_0_memory = binary_handler.AcquireSrcMemory(dout); + + const auto dst_dy_memory = (dout->dims() == dy->dims()) + ? binary_handler.AcquireDstMemory(dy) + : binary_handler.AcquireDstMemory(); + + binary_prim = binary_handler.AcquireForwardPrimitive(); + args = {{DNNL_ARG_SRC_0, *src_0_memory}, + {DNNL_ARG_SRC_1, *src_1_memory}, + {DNNL_ARG_DST, *dst_dy_memory}}; + + if (BINARY_OP == dnnl::algorithm::binary_div) + args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, + *post_op_memory}); + + binary_prim->execute(astream, args); + broadcast_src_memory = dst_dy_memory; + dst_memory = dst_dy_memory; + } + astream.wait(); + dy->set_layout(DataLayout::kMKLDNN); + + if (dout->dims() != dy->dims()) { + // Broadcasting + if (BINARY_OP == dnnl::algorithm::binary_sub) { + dnnl::post_ops po; + po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0); + broadcast_reduction_attr.set_post_ops(po); + } + + platform::ReductionMKLDNNHandler reduction_handler( + dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, + ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy), + broadcast_reduction_attr); + dst_memory = reduction_handler.AcquireDstMemory(dy); + + auto reduction_p = reduction_handler.AcquireForwardPrimitive(); + + reduction_p->execute(astream, { + {DNNL_ARG_SRC, *broadcast_src_memory}, + {DNNL_ARG_DST, *dst_memory}, + }); + astream.wait(); + dy->set_format(platform::GetMKLDNNFormat(dst_memory->get_desc().reshape( + phi::vectorize(dy->dims())))); + } else { + dy->set_format(platform::GetMKLDNNFormat(*dst_memory)); + } + } + } +}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index c03794012ff3b..0ef5c5e628ce6 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -1,127 +1,19 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - if (dx) { - // dx = dout*y - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, y, dx, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - const auto dst_dx_memory = handler.AcquireDstMemory(dx); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_y_memory}, - {DNNL_ARG_DST, *dst_dx_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory)); - } - - if (dy) { - // dy = dout*x - // Handler is having nullptr passed instead of output tensor as - // we want Dst buffer to be allocated by oneDNN not to use Tensor - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, x, nullptr, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_x_memory = handler.AcquireSecondSrcMemory(x); - - // If broadcasting is in use then let's write to temporary - // buffer allocated by oneDNN - const auto dst_dy_memory = (dout->dims() == dy->dims()) - ? 
handler.AcquireDstMemory(dy) - : handler.AcquireDstMemory(); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_x_memory}, - {DNNL_ARG_DST, *dst_dy_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dy->set_layout(framework::DataLayout::kMKLDNN); - - // Reduction is needed for broadcasting scenario - if (dout->dims() != dy->dims()) { - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, mkldnn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - // As source we use mem object with results from binary operation - reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory}, - {DNNL_ARG_DST, *dy_memory_p}}); - astream.wait(); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - - } else { - dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory)); - } - } - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; REGISTER_OP_KERNEL( @@ -132,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) -REGISTER_OP_KERNEL(elementwise_mul_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseMulMKLDNNGradKernel, - ops::EltwiseMulMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_mul_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc index 3c799008a2abc..510373831eb6d 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc @@ -1,5 +1,4 @@ - -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,113 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
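The sub-specific grad kernel deleted below duplicated broadcast handling that now lives in the shared EltwiseMKLDNNGradKernel, where CalculateBroadcastedDims aligns dy's shape to dout's rank so that a reduction_sum over the inserted 1-axes recovers dy. A minimal standalone restatement of that dimension-matching step (plain C++; the free-function name and the example shapes are illustrative, not part of the patch):

#include <cstdint>
#include <iostream>
#include <vector>

// Align dst dims against src dims: wherever the sizes disagree, emit 1
// (a broadcast axis to be reduced); otherwise consume the next dst dim.
std::vector<int64_t> BroadcastedDims(const std::vector<int64_t>& src_tz,
                                     const std::vector<int64_t>& dst_tz) {
  size_t j = 0;
  std::vector<int64_t> dst_tz_ex(src_tz.size(), 1);
  for (size_t i = 0; i < src_tz.size(); ++i) {
    dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++];
    if (j == dst_tz.size()) break;
  }
  return dst_tz_ex;
}

int main() {
  // dout: [2, 3, 4] broadcast against dy: [3, 4] -> reduce dout over axis 0
  for (int64_t d : BroadcastedDims({2, 3, 4}, {3, 4})) std::cout << d << ' ';
  std::cout << '\n';  // prints: 1 3 4
}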
-#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseSubMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto& dev_ctx = - ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto tz = phi::vectorize(dout->dims()); - memory::data_type dout_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(dout->dtype())); - platform::ReorderMKLDNNHandler handler( - tz, framework::TransToProtoVarType(dout->dtype()), dout_type, - onednn_engine); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - auto reorder_src_memory_p = handler.AcquireSrcMemory( - dout->format(), platform::to_void_cast(dout->data())); - - if (dx) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); - astream.wait(); - - dx->set_layout(DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } - - if (dy) { - // Direct copy - if (dout->dims() == dy->dims()) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace()); - - dnnl::primitive_attr reorder_attr; - std::vector scales = {-1}; - reorder_attr.set_output_scales(0, scales); - auto reorder_p = std::make_shared( - *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, - *reorder_dst_memory_p); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } else { - // Broadcasting - - dnnl::post_ops po; - po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0); - dnnl::primitive_attr attr; - attr.set_post_ops(po); - - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy), attr); - - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - - reduction_p->execute(astream, { - {DNNL_ARG_SRC, *reorder_src_memory_p}, - {DNNL_ARG_DST, *dy_memory_p}, - }); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - } - } - } -}; - -} // namespace operators -} // namespace paddle namespace ops = paddle::operators; @@ -131,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) 
-REGISTER_OP_KERNEL(elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseSubMKLDNNGradKernel, - ops::EltwiseSubMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc index 5222103256d61..ea009a38056f0 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc @@ -17,8 +17,13 @@ #include "paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); +PD_DECLARE_KERNEL(add_double_grad, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(add_double_grad, GPU, ALL_LAYOUT); +#endif namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc index 9d4d11609ac20..ce5c6b701d958 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc @@ -21,9 +21,12 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc index 7890d634e9941..3cecc52a3c481 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc @@ -27,9 +27,15 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_div); +PD_DECLARE_KERNEL(divide_double_grad, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(divide_double_grad, GPU, ALL_LAYOUT); +#endif + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/erf_op.cc b/paddle/fluid/operators/erf_op.cc index f68f670394871..64274d098c058 100644 --- a/paddle/fluid/operators/erf_op.cc +++ b/paddle/fluid/operators/erf_op.cc @@ -16,8 +16,10 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/operators/erf_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -29,18 +31,6 @@ class ErfOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(%s) of ErfOp should not be null.", "X")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(%s) of ErfOp should not be null.", "Out")); - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -116,28 +106,10 @@ class ErfGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(erf, ErfInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(erf, ops::ErfOp, ops::ErfOpMaker, ops::ErfGradOpMaker, - ops::ErfGradOpMaker); + ops::ErfGradOpMaker, + ErfInferShapeFunctor); REGISTER_OPERATOR(erf_grad, ops::ErfGradOp); -REGISTER_OP_CPU_KERNEL( - erf, ops::ErfKernel, - ops::ErfKernel, - ops::ErfKernel); -REGISTER_OP_CPU_KERNEL( - erf_grad, ops::ErfGradKernel, - ops::ErfGradKernel, - ops::ErfGradKernel); - -REGISTER_OP_CUDA_KERNEL( - erf, ops::ErfKernel, - ops::ErfKernel, - ops::ErfKernel); -REGISTER_OP_CUDA_KERNEL( - erf_grad, ops::ErfGradKernel, - ops::ErfGradKernel, - ops::ErfGradKernel); diff --git a/paddle/fluid/operators/erf_op.h b/paddle/fluid/operators/erf_op.h deleted file mode 100644 index 4780b2e7f5b28..0000000000000 --- a/paddle/fluid/operators/erf_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#ifndef _USE_MATH_DEFINES -#define _USE_MATH_DEFINES -#endif -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace paddle { -namespace operators { - -template -class ErfKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& context) const { - auto* out = context.Output("Out"); - auto* in = context.Input("X"); - out->mutable_data(in->place()); - - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto& place = - *context.template device_context().eigen_device(); - EigenErf, T>::Eval(place, eigen_out, - eigen_in); - } -}; - -template -class ErfGradKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& context) const { - auto* x = context.Input("X"); - auto* dout = - context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - - dx->mutable_data(dout->place()); - - auto eigen_x = framework::EigenVector::Flatten(*x); - auto eigen_dout = framework::EigenVector::Flatten(*dout); - auto eigen_dx = framework::EigenVector::Flatten(*dx); - auto& place = - *context.template device_context().eigen_device(); - EigenErfGrad, T>::Eval(place, eigen_dx, - eigen_x, eigen_dout); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc index 119e514a49e28..9361edd43bf15 100755 --- a/paddle/fluid/operators/expand_as_v2_op.cc +++ b/paddle/fluid/operators/expand_as_v2_op.cc @@ -12,7 +12,9 @@ limitations under the License. */ #include "paddle/fluid/operators/expand_as_v2_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -22,27 +24,6 @@ using framework::Tensor; class ExpandAsV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandAsV2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ExpandAsV2"); - auto x_dims = ctx->GetInputDim("X"); - auto target_shape = ctx->Attrs().Get>("target_shape"); - PADDLE_ENFORCE_GE( - target_shape.size(), static_cast(x_dims.size()), - platform::errors::InvalidArgument( - "The rank of target_shape must be greater than or equal " - "to the rank of Input(X). But received Input(X): input " - "rank %u; received target_shape: rank %u.", - x_dims.size(), target_shape.size())); - PADDLE_ENFORCE_LE(target_shape.size(), MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of target_shape must be less than or equal " - "to %d. 
But received: rank %u.", - MAX_RANK_SUPPORTED, target_shape.size())); - ctx->SetOutputDim("Out", phi::make_ddim(target_shape)); - } }; class ExpandAsV2OpMaker : public framework::OpProtoAndCheckerMaker { @@ -116,42 +97,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandAsV2GradNoNeedBufVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(expand_as_v2, ExpandAsInferShapeFunctor, + PD_INFER_META(phi::ExpandAsInferMeta)); REGISTER_OPERATOR(expand_as_v2, ops::ExpandAsV2Op, ops::ExpandAsV2OpMaker, ops::ExpandAsV2GradOpMaker, - ops::ExpandAsV2GradOpMaker); + ops::ExpandAsV2GradOpMaker, + ExpandAsInferShapeFunctor); REGISTER_OPERATOR(expand_as_v2_grad, ops::ExpandAsV2GradOp, ops::ExpandAsV2GradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - expand_as_v2, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel); -REGISTER_OP_CPU_KERNEL( - expand_as_v2_grad, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL( - expand_as_v2, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel); -REGISTER_OP_CUDA_KERNEL( - expand_as_v2_grad, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel); -#endif REGISTER_OP_VERSION(expand_as_v2) .AddCheckpoint( R"ROC(fix expand_as_v2 and add new input [Y])ROC", paddle::framework::compatible::OpVersionDesc().NewInput( - "Y", "Expand X according to the shape of Y")); \ No newline at end of file + "Y", "Expand X according to the shape of Y")); diff --git a/paddle/fluid/operators/expand_as_v2_op.h b/paddle/fluid/operators/expand_as_v2_op.h index d7560efc5c1f1..f09e7764eed39 100755 --- a/paddle/fluid/operators/expand_as_v2_op.h +++ b/paddle/fluid/operators/expand_as_v2_op.h @@ -32,219 +32,5 @@ template using EigenTensor = framework::EigenTensor; -template -class ExpandAsV2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - auto target_shape = context.Attr>("target_shape"); - auto target_rank = target_shape.size(); - PADDLE_ENFORCE_GE(target_rank, rank, - platform::errors::InvalidArgument( - "The rank (%d) of the input 'target_tensor' for " - "expand_as_v2 op must be greater than or equal to " - "the rank (%d) of the input 'x'.", - target_rank, rank)); - PADDLE_ENFORCE_GE(rank, 1, platform::errors::InvalidArgument( - "The rank (%d) of the input 'x' for " - "expand_as_v2 op must be positive.", - rank)); - PADDLE_ENFORCE_LE(target_rank, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank (%d) of the input 'target_tensor' for " - "expand_as_v2 op must be less than or equal to %d.", - target_rank, MAX_RANK_SUPPORTED)); - - switch (target_rank) { - case 1: - ExpandAs<1>(context); - break; - case 2: - ExpandAs<2>(context); - break; - case 3: - ExpandAs<3>(context); - break; - case 4: - ExpandAs<4>(context); - break; - case 5: - ExpandAs<5>(context); - break; - case 6: - ExpandAs<6>(context); - break; - } - } - - protected: - template - void ExpandAs(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - auto in_dims = in0->dims(); - auto target_shape = context.Attr>("target_shape"); - auto vec_in_dims = 
phi::vectorize(in_dims); - auto diff = target_shape.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - std::vector repeat_times(vec_in_dims.size()); - for (size_t i = 0; i < vec_in_dims.size(); ++i) { - PADDLE_ENFORCE_NE(target_shape[i], 0, - platform::errors::InvalidArgument( - "The value of target shape cannot be zero.")); - if (i < diff) { - PADDLE_ENFORCE_GT( - target_shape[i], 0, - platform::errors::InvalidArgument( - "The expanded size (%d) for non-existing dimensions must be " - "positive for expand_as_v2 op.", - target_shape[i])); - repeat_times[i] = target_shape[i]; - } else if (target_shape[i] > 0) { - if (vec_in_dims[i] != 1) { - PADDLE_ENFORCE_EQ( - vec_in_dims[i], target_shape[i], - platform::errors::InvalidArgument( - "The value (%d) of the non-singleton dimension does not match" - " the corresponding value (%d) in shape for expand_as_v2 op.", - vec_in_dims[i], target_shape[i])); - repeat_times[i] = 1; - } else { - repeat_times[i] = target_shape[i]; - } - } else { - PADDLE_ENFORCE_EQ( - target_shape[i], -1, - platform::errors::InvalidArgument( - "When the value in shape is negative for expand_as_v2 op, " - "only -1 is supported, but the value received is %d.", - target_shape[i])); - repeat_times[i] = 1; - } - } - auto* out0 = context.Output("Out"); - Eigen::DSizes bcast_dims; - for (size_t i = 0; i < repeat_times.size(); ++i) { - bcast_dims[i] = repeat_times[i]; - } - - framework::DDim new_in_dims = phi::make_ddim(vec_in_dims); - framework::DDim out_dims = phi::make_ddim(target_shape); - - out0->Resize(out_dims); - auto x = EigenTensor::From(*in0, new_in_dims); - out0->mutable_data(context.GetPlace()); - auto y = EigenTensor::From(*out0, out_dims); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcast, T, Rank>::Eval(place, y, x, - bcast_dims); - } -}; - -template -class ExpandAsV2GradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("X"); - auto target_shape = context.Attr>("target_shape"); - auto x_dims = in0->dims(); - auto vec_in_dims = phi::vectorize(x_dims); - auto diff = target_shape.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - std::vector repeat_times(vec_in_dims.size()); - for (size_t i = 0; i < vec_in_dims.size(); ++i) { - repeat_times[i] = target_shape[i] / vec_in_dims[i]; - } - std::vector reshape_dims_vec; - std::vector reduce_dims_vec; - for (size_t i = 0; i < repeat_times.size(); ++i) { - reduce_dims_vec.push_back(reshape_dims_vec.size()); - reshape_dims_vec.push_back(repeat_times[i]); - reshape_dims_vec.push_back(vec_in_dims[i]); - } - - int dims = reduce_dims_vec.size(); - bool just_copy = true; - for (size_t i = 0; i < repeat_times.size(); i++) { - if (repeat_times[i] != 1) { - just_copy = false; - break; - } - } - // no need reduce, just copy - if (just_copy) { - auto* in0 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - out0->mutable_data(context.GetPlace()); - framework::TensorCopy(*in0, context.GetPlace(), context.device_context(), - out0); - } else { - PADDLE_ENFORCE_GE(dims, 1, - platform::errors::InvalidArgument( - "The rank of the input 'Out@GRAD' for " - "expand_as_v2_grad op must be greater than or " - "equal to 1, but the value received is %d.", - dims)); - PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'Out@GRAD' for " - 
"expand_as_v2_grad op must be less than or equal " - "to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, dims)); - switch (dims) { - case 1: - ExpandAsBackward<1>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 2: - ExpandAsBackward<2>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 3: - ExpandAsBackward<3>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 4: - ExpandAsBackward<4>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 5: - ExpandAsBackward<5>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 6: - ExpandAsBackward<6>(context, reshape_dims_vec, reduce_dims_vec); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Only support tensor with rank being between 1 and 6. But " - "received tensor's rank = %d.", - dims)); - } - } - } - - protected: - template - void ExpandAsBackward(const framework::ExecutionContext& context, - const std::vector& reshape_dims_vec, - const std::vector& reduce_dims_vec) const { - size_t reshape_size = reshape_dims_vec.size(); - size_t reduce_size = reduce_dims_vec.size(); - auto* in0 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - out0->mutable_data(context.GetPlace()); - auto x_grad = EigenVector::Flatten(*out0); - Eigen::DSizes reshape_dims; - for (size_t i = 0; i < reshape_size; ++i) { - reshape_dims[i] = reshape_dims_vec[i]; - } - Eigen::DSizes reduce_dims; - for (size_t i = 0; i < reduce_size; ++i) { - reduce_dims[i] = reduce_dims_vec[i]; - } - auto out_grad = EigenVector::Flatten(*in0); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcastGrad, T, Dims>::Eval( - place, x_grad, out_grad, reduce_dims, reshape_dims); - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/expand_op_npu_test.cc b/paddle/fluid/operators/expand_op_npu_test.cc index cdd4e1dbaae6a..df00ae54c1036 100644 --- a/paddle/fluid/operators/expand_op_npu_test.cc +++ b/paddle/fluid/operators/expand_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 9f7e4fb8d5749..70597be393c35 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -28,13 +28,14 @@ __global__ void FindAbsMaxKernel(const T* in, const int n, T* out) { extern __shared__ char* shared_max_data_tmp[]; auto shared_max_data = reinterpret_cast(shared_max_data_tmp); if (gridDim.x > 1) { - shared_max_data[tid] = T(0); + T local_max_data = T(0); for (int i = bid; i < n; i += blockDim.x * gridDim.x) { T tmp = abs(in[i]); - if (tmp > shared_max_data[tid]) { - shared_max_data[tid] = tmp; + if (tmp > local_max_data) { + local_max_data = tmp; } } + shared_max_data[tid] = local_max_data; } else { if (bid < n) { shared_max_data[tid] = abs(in[bid]); @@ -83,13 +84,14 @@ __global__ void FindChannelAbsMaxKernelQuantAxis0(const T* in, const int n, int channel_size = n / c; const T* in_c = in + blockIdx.x * channel_size; extern __shared__ T shared_max_data[]; - shared_max_data[tid] = T(0); + T local_max_data = T(0); for (int i = tid; i < channel_size; i += blockDim.x) { T tmp = fabs(in_c[i]); - if (tmp > shared_max_data[tid]) { - shared_max_data[tid] = tmp; + if (tmp > local_max_data) { + local_max_data = tmp; } } + shared_max_data[tid] = local_max_data; __syncthreads(); for (int i = blockDim.x / 2; i > 0; i >>= 1) { if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) { @@ -113,13 +115,14 @@ __global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n, int tid = threadIdx.x; int bid = blockIdx.x; const T* in_current = in + tid * cout_wh_size + bid * wh_size; - shared_max_data[tid] = T(0); + T local_max_data = T(0); for (int i = 0; i < wh_size; i++) { T tmp = fabs(in_current[i]); - if (tmp > shared_max_data[tid]) { - shared_max_data[tid] = tmp; + if (tmp > local_max_data) { + local_max_data = tmp; } } + shared_max_data[tid] = local_max_data; __syncthreads(); int len = blockDim.x; @@ -404,6 +407,19 @@ struct FindRangeAbsMaxFunctor { } }; +template +__global__ void FindMovingAverageAbsMaxKernel(const T* in_state, + const T* in_accum, + const T* cur_scale, const T rate, + T* out_state, T* out_accum, + T* out_scale) { + T state = rate * (*in_state) + T(1.0f); + T accum = rate * (*in_accum) + (*cur_scale); + *out_state = state; + *out_accum = accum; + *out_scale = accum / state; +} + template struct FindRangeAbsMaxFunctor; template @@ -415,29 +431,14 @@ struct FindMovingAverageAbsMaxFunctor { framework::Tensor* out_accum, framework::Tensor* out_scale) { const auto gpu_place = ctx.GetPlace(); - T accum; - T state; - T scale; - memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data(), - sizeof(T), ctx.stream()); - memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data(), - sizeof(T), ctx.stream()); - memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T), - ctx.stream()); - ctx.Wait(); - T rate_t = static_cast(rate); - state = rate_t * state + static_cast(1.0); - accum = rate_t * accum + scale; - scale = accum / state; - - memory::Copy(gpu_place, out_accum->mutable_data(gpu_place), - platform::CPUPlace(), &accum, sizeof(T), ctx.stream()); - memory::Copy(gpu_place, 
out_state->mutable_data<T>(gpu_place),
-                 platform::CPUPlace(), &state, sizeof(T), ctx.stream());
-    memory::Copy(gpu_place, out_scale->mutable_data<T>(gpu_place),
-                 platform::CPUPlace(), &scale, sizeof(T), ctx.stream());
-    ctx.Wait();
+    T* out_state_data = out_state->mutable_data<T>(gpu_place);
+    T* out_accum_data = out_accum->mutable_data<T>(gpu_place);
+    T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
+
+    FindMovingAverageAbsMaxKernel<T><<<1, 1, 0, ctx.stream()>>>(
+        in_state.data<T>(), in_accum.data<T>(), cur_scale, rate_t,
+        out_state_data, out_accum_data, out_scale_data);
  }
};
diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu
index 0eb84f18f25f0..27a235765227f 100644
--- a/paddle/fluid/operators/feed_forward_test.cu
+++ b/paddle/fluid/operators/feed_forward_test.cu
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/fused/attn_feed_forward.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

 namespace framework = paddle::framework;
@@ -29,6 +30,11 @@ namespace platform = paddle::platform;
 USE_OP(matmul);
 USE_OP_ITSELF(elementwise_add);

+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
+#endif
+
 // get paddle matmul op results as baseline
 template <typename T>
 void GetLinearOp(const std::vector<T> &x, const std::vector<T> &y,
diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc
index 79018f2a97448..cb03add314327 100644
--- a/paddle/fluid/operators/fill_constant_op_npu.cc
+++ b/paddle/fluid/operators/fill_constant_op_npu.cc
@@ -65,7 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel<T> {
     tensor_value.mutable_data<T>({1}, ctx.GetPlace());
     FillNpuTensorWithConstant<T>(&tensor_value, value);
     NpuOpRunner runner;
-#if (CANN_VERSION_CODE >= 503003)
+#if (CANN_VERSION_CODE >= 503003 && CANN_VERSION_CODE < 504001)
     runner.SetType("FillD")
         .AddInput(tensor_value)
         .AddOutput(*out_var)
diff --git a/paddle/fluid/operators/filter_by_instag_op.cu b/paddle/fluid/operators/filter_by_instag_op.cu
new file mode 100644
index 0000000000000..7870efba4e7a1
--- /dev/null
+++ b/paddle/fluid/operators/filter_by_instag_op.cu
@@ -0,0 +1,597 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
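The new file that follows builds its output offsets with warp-level inclusive prefix sums over cooperative-groups tiles. A self-contained sketch of that scan pattern (illustrative kernel and names, not part of the patch; assumes a single 32-thread warp):

#include <cstdio>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

// Warp-level inclusive prefix sum using the same shfl_up pattern as
// filter_copy_fuse_kernel below.
__global__ void warp_inclusive_scan(int* data) {
  cg::thread_block b = cg::this_thread_block();
  cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
  int v = data[g.thread_rank()];
  for (int i = 1; i < 32; i *= 2) {
    int up = g.shfl_up(v, i);            // value held by lane (rank - i)
    if (g.thread_rank() >= i) v += up;   // lanes below i keep their value
  }
  data[g.thread_rank()] = v;             // data[k] = sum of inputs 0..k
}

int main() {
  int h[32], *d;
  for (int i = 0; i < 32; ++i) h[i] = 1;
  cudaMalloc(&d, sizeof(h));
  cudaMemcpy(d, h, sizeof(h), cudaMemcpyHostToDevice);
  warp_inclusive_scan<<<1, 32>>>(d);
  cudaMemcpy(h, d, sizeof(h), cudaMemcpyDeviceToHost);
  printf("%d %d %d\n", h[0], h[15], h[31]);  // prints: 1 16 32
  cudaFree(d);
  return 0;
}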
+ +// #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11000 + +#if defined(PADDLE_WITH_CUDA) +#include +#endif + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/enforce.h" + +#include "paddle/fluid/operators/filter_by_instag_op.h" + +#if defined(PADDLE_WITH_CUDA) +namespace cg = cooperative_groups; +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using SelectedRows = phi::SelectedRows; +using LoDTensor = framework::LoDTensor; + +template +using Vector = framework::Vector; + +#define WARP_SIZE 32 +#define MAX_WARP_NUM 32 + +#if defined(PADDLE_WITH_CUDA) + +template +__global__ void filter_copy_fuse_kernel( + const size_t N, const int ins_per_thread, size_t* x1_lods_data, + size_t* x2_lods_data, const int64_t* x2_data, const int64_t* x3_data, + int64_t filter_tag_size, T* out_data, int64_t* map_data, + size_t* map_lods_data, size_t* out_lods_data, size_t* out_idx_data, + const T* x1_data, int x1_embed_size, float* loss_weight_data, + float fill_value) { + // N is instance num + // one threads for ins_per_thread instances + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + cg::thread_block b = cg::this_thread_block(); + cg::thread_block_tile g = cg::tiled_partition(b); + + int gid = idx / WARP_SIZE; + + // general use + int thread_num = + (N + (ins_per_thread - 1)) / ins_per_thread; // real thread num + int total_warp_num = thread_num / WARP_SIZE; // 30 + int remain_thread_num = thread_num % WARP_SIZE; // 16 + + int warp_thread_num = -1; + if (gid < total_warp_num) { + warp_thread_num = WARP_SIZE; + } else { + warp_thread_num = remain_thread_num; + } + + int group_num = total_warp_num; + if (remain_thread_num > 0) { + group_num = total_warp_num + 1; + } + + if (gid >= group_num) return; + + int ins_start = idx * ins_per_thread; + int ins_end = (idx + 1) * ins_per_thread; + + if (N < ins_end) ins_end = N; + + int flag_data[5]; + int prefix_sum_data[5]; + int prefix_sum_data2[5]; + + __shared__ int shr[MAX_WARP_NUM]; + __shared__ int shr2[MAX_WARP_NUM]; + __shared__ int shr3[MAX_WARP_NUM]; + + for (int p = ins_start; p < ins_end; p++) { + int ins_tag_start = x2_lods_data[p]; + int ins_tag_end = x2_lods_data[p + 1]; + flag_data[p - ins_start] = 0; + // filter logic + int i = ins_tag_start; + for (; i < ins_tag_end; i++) { + int64_t ins_tag = x2_data[i]; + int j = 0; + for (; j < filter_tag_size; j++) { + if (x3_data[j] == ins_tag) break; + } + // if ins_tag in filter tag + if (j < filter_tag_size) { + flag_data[p - ins_start] = 1; + break; + } + } + } + + int sum_addr = 0; + int sum_flag = 0; + int sum_out_lods = 0; + + int local_addr = 0; + int local_flag = 0; + int local_out_lods = 0; + + if (ins_start < ins_end) { + for (int p = ins_start; p < ins_end; p++) { + int previous = -1; + if (p == ins_start) { + previous = 0; + } else { + previous = prefix_sum_data[p - ins_start - 1]; + } + + prefix_sum_data[p - ins_start] = + previous + + flag_data[p - ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]); + } + + local_addr = prefix_sum_data[ins_end - 1 - ins_start]; + sum_addr = local_addr; + + for (int p = ins_start; p < ins_end; p++) { + local_flag += flag_data[p - ins_start]; + } + sum_flag = local_flag; + + for (int p = 
ins_start; p < ins_end; p++) { + local_out_lods += + flag_data[p - ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]); + } + + sum_out_lods = local_out_lods; + } + + for (int i = 1; i < warp_thread_num; i *= 2) { + int temp_addr = g.shfl_up(sum_addr, i); + int temp_flag = g.shfl_up(sum_flag, i); + int temp_out_lods = g.shfl_up(sum_out_lods, i); + + if (g.thread_rank() >= i) { + sum_addr += temp_addr; + sum_flag += temp_flag; + sum_out_lods += temp_out_lods; + } + } + + if (g.thread_rank() == warp_thread_num - 1) { + shr[gid] = sum_addr; + shr2[gid] = sum_flag; + shr3[gid] = sum_out_lods; + } + + b.sync(); + + int sum_addr2 = 0; + int sum_flag2 = 0; + int sum_out_lods2 = 0; + + // communicate between warp + if (g.thread_rank() < group_num) { + sum_addr2 = shr[g.thread_rank()]; + sum_flag2 = shr2[g.thread_rank()]; + sum_out_lods2 = shr3[g.thread_rank()]; + } + + for (int i = 1; i < group_num; i *= 2) { + int temp_addr2 = g.shfl_up(sum_addr2, i); + int temp_flag2 = g.shfl_up(sum_flag2, i); + int temp_out_lods2 = g.shfl_up(sum_out_lods2, i); + + if (g.thread_rank() >= i) { + sum_addr2 += temp_addr2; + sum_flag2 += temp_flag2; + sum_out_lods2 += temp_out_lods2; + } + } + + int sum_addr3 = g.shfl(sum_addr2, gid); + int sum_flag3 = g.shfl(sum_flag2, gid); + int sum_out_lods3 = g.shfl(sum_out_lods2, gid); + + int p_flag; + int p_addr; + int p_out_lods; + + if (ins_start < ins_end) { + p_addr = sum_addr3 - shr[gid] + sum_addr - local_addr; + p_flag = sum_flag3 - shr2[gid] + sum_flag - local_flag; + p_out_lods = sum_out_lods3 - shr3[gid] + sum_out_lods - local_out_lods; + + for (int p = ins_start; p < ins_end; p++) { + if (ins_start == p) { + prefix_sum_data2[p - ins_start] = p_addr; + } else { + prefix_sum_data2[p - ins_start] = + prefix_sum_data2[p - ins_start - 1] + + flag_data[p - ins_start - 1] * + (x1_lods_data[p] - x1_lods_data[p - 1]); + } + } + + if (gid == 0 && g.thread_rank() == group_num - 1) { + *out_idx_data = (sum_flag2 + 1); + map_lods_data[sum_flag2] = sum_flag2; + } + } + + int sum_out_lods4 = g.shfl(sum_out_lods2 + 1, group_num - 1); + + if (ins_start < ins_end) { + int out_lods_idx = p_flag + 1; + for (int p = ins_start; p < ins_end; p++) { + if (flag_data[p - ins_start] == 1) { + size_t batch_len = x1_lods_data[p + 1] - x1_lods_data[p]; + int t = out_lods_idx - 1; + int previous; + if (out_lods_idx == p_flag + 1) { + previous = p_out_lods; + } else { + previous = out_lods_data[t]; + } + map_data[t * 3] = (int64_t)previous; + map_data[t * 3 + 1] = x1_lods_data[p]; + map_lods_data[t] = t; + out_lods_data[out_lods_idx] = previous + batch_len; + map_data[t * 3 + 2] = batch_len; + out_lods_idx++; + } + } + + // fill loss_weight_data + if (sum_out_lods4 > 1) { + int out_data_num = sum_out_lods4 - 1; + int out_start = ins_start; + if (out_start < out_data_num) { + int out_end = ins_end >= out_data_num ? 
out_data_num : ins_end; + for (int p = out_start; p < out_end; p++) { + loss_weight_data[p] = fill_value; + } + } + } + + for (int p = ins_start; p < ins_end; p++) { + // copy logic + if (flag_data[p - ins_start] == 1) { + auto output_start_idx = prefix_sum_data2[p - ins_start]; + T* dst = out_data + output_start_idx * x1_embed_size; + const T* src_start = x1_data + x1_lods_data[p] * x1_embed_size; + const T* src_end = x1_data + x1_lods_data[p + 1] * x1_embed_size; + for (const T *j = src_start; j != src_end; dst++, j++) { + *dst = *j; + } + } + } + } + + b.sync(); +} + +template +__global__ void copy_grad_kernel(const size_t N, const int ins_per_thread, + const T* out_grad_data, T* x1_grad_data, + const int64_t* map_data, int x1_embed_size) { + // N is instance num + // one threads for one instance + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int ins_start = idx * ins_per_thread; + int ins_end = (idx + 1) * ins_per_thread; + if (ins_start >= N) { + return; + } + if (ins_end > N) ins_end = N; + for (int p = ins_start; p < ins_end; p++) { + T* dst = x1_grad_data + map_data[p * 3 + 1] * x1_embed_size; + const T* src_start = out_grad_data + map_data[p * 3] * x1_embed_size; + const T* src_end = + out_grad_data + (map_data[p * 3] + map_data[p * 3 + 2]) * x1_embed_size; + + for (const T *j = src_start; j != src_end; dst++, j++) { + *dst = *j; + } + } +} + +#endif + +template +class FilterByInstagGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { +#if defined(PADDLE_WITH_CUDA) + + auto gpu_place = context.GetPlace(); + + gpuStream_t current_stream = context.cuda_device_context().stream(); + + int max_thread_num_per_block = 1024; + // context.cuda_device_context().GetMaxThreadsPerBlock(); + // X1 is global FC output + // Dim [batch size, embedding size] + const LoDTensor* x1 = context.Input("Ins"); + bool is_lod = context.Attr("is_lod"); + + int is_x1_lod = -1; + if (is_lod) + is_x1_lod = 1; + else + is_x1_lod = 0; + + int64_t out_val_if_empty = context.Attr("out_val_if_empty"); + size_t x1_embed_size = x1->dims()[1]; + // X2 is ins tag list + // LoD [[0, Sum(ins1), Sum(ins1, ins2), ... 
]] + const LoDTensor* x2 = context.Input("Ins_tag"); + // expected auto = const int64_t + const int64_t* x2_data = x2->data(); + + // X3 is local fc tag list + // LoD [[0, Sum(fc1), Sum(fc1, fc2) ...]] + const Tensor* x3 = context.Input("Filter_tag"); + const int64_t* x3_data = x3->data(); + + Vector x2_lods; + if (x2->lod().size() != 0) { // lod_level = 1 + x2_lods = x2->lod()[0]; + } else { // lod_level = 0 + const size_t x2_lods_size = x2->dims()[0]; + const size_t instag_per_num = x2->dims()[1]; + // x2_lods.resize(x2->dims()[0] + 1); + // move to cuda + x2_lods.push_back(0); + for (size_t i = 0; i < x2_lods_size; i++) { + x2_lods.push_back(x2_lods.back() + instag_per_num); + } + } + + const size_t x2_lods_size = x2_lods.size() - 1; + paddle::framework::MixVector mixv_x2_lods(&x2_lods); + + size_t* x2_lods_data = mixv_x2_lods.CUDAMutableData(gpu_place); + + Vector x1_lods; + if (!is_x1_lod) { + x1_lods.push_back(0); + for (int i = 0; i < x1->dims()[0]; i++) { + x1_lods.push_back(i + 1); + } + } else { + // x1_lods = context.Input("Ins")->lod()[0]; + // new: lod_level=0 => lod() return {} + if (x1->lod().size() != 0) { // lod_level = 1 + x1_lods = x1->lod()[0]; + } else { // lod_level = 0 + // x1_lods.resize(x1->dims()[0] + 1); + // move to cuda + x1_lods.push_back(0); + for (int i = 0; i < x1->dims()[0]; i++) { + x1_lods.push_back(i + 1); + } + } + } + + paddle::framework::MixVector mixv_x1_lods(&x1_lods); + + size_t* x1_lods_data = mixv_x1_lods.CUDAMutableData(gpu_place); + auto* x1_data = x1->data(); + + // set output value + // for those whose ins been dropout, set 0 for whole lines. + // otherwise, copy whole line + // Dim [local fc count, batch size, embedding size] + LoDTensor* out = context.Output("Out"); + LoDTensor* map = context.Output("IndexMap"); + LoDTensor* loss_weight = context.Output("LossWeight"); + + int out_first = x1_lods.back(); + + out->Resize(phi::make_ddim({(int64_t)out_first, (int64_t)x1_embed_size})); + map->Resize(phi::make_ddim({(int64_t)x2_lods_size, 3})); + loss_weight->Resize(phi::make_ddim({(int64_t)x2_lods_size, 1})); + + T* out_data = out->mutable_data(gpu_place); + int64_t* map_data = map->mutable_data(gpu_place); + float* loss_weight_data = loss_weight->mutable_data(gpu_place); + + int block_size = max_thread_num_per_block; + int ins_per_thread = (x2_lods_size + block_size - 1) / block_size; + dim3 block_dim(block_size); + dim3 grid_dim(1); + + Vector out_lods(x2_lods_size + 1, 0); + Vector map_lods(x2_lods_size + 1, 0); + + paddle::framework::MixVector mixv_out_lods(&out_lods); + paddle::framework::MixVector mixv_map_lods(&map_lods); + + // thrust::device_vector out_idx(1); + Vector out_idx(1, 0); + paddle::framework::MixVector mixv_out_idx(&out_idx); + + size_t* out_idx_data = mixv_out_idx.CUDAMutableData(gpu_place); + size_t* out_lods_data = mixv_out_lods.CUDAMutableData(gpu_place); + size_t* map_lods_data = mixv_map_lods.CUDAMutableData(gpu_place); + + float fill_value = 1.0; + + filter_copy_fuse_kernel<<>>( + x2_lods_size, ins_per_thread, x1_lods_data, x2_lods_data, x2_data, + x3_data, x3->numel(), out_data, map_data, map_lods_data, out_lods_data, + out_idx_data, x1_data, x1_embed_size, loss_weight_data, fill_value); + + platform::GpuStreamSync(current_stream); + + mixv_out_lods.resize(mixv_out_idx[0]); + + if (mixv_out_lods.size() - 1 > 0) { + out->Resize(phi::make_ddim( + {(int64_t)mixv_out_lods.back(), (int64_t)x1_embed_size})); + + map->Resize(phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 3})); + loss_weight->Resize( + 
phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 1})); + + } else { + out->Resize(phi::make_ddim({1, (int64_t)x1_embed_size})); + map->Resize(phi::make_ddim({1, 3})); + loss_weight->Resize(phi::make_ddim({1, 1})); + } + + if (mixv_out_lods.size() - 1 > 0) { + map_lods.resize(mixv_out_lods.size()); + + mixv_map_lods.CopyToCPU(); + + std::vector> map_lod_info; + map_lod_info.emplace_back(map_lods); + + map->set_lod(map_lod_info); + loss_weight->set_lod(map_lod_info); + + mixv_out_lods.CopyToCPU(); + std::vector> out_lod_info; + out_lod_info.emplace_back(out_lods); + out->set_lod(out_lod_info); + + } else { + Vector map_lods(2, 0); + paddle::framework::MixVector mixv_map_lods(&map_lods); + thrust::device_ptr map_data_ptr(map_data); + + map_data_ptr[0] = 0; + map_data_ptr[1] = 1; + map_data_ptr[2] = 1; + + mixv_map_lods[0] = 0; + mixv_map_lods[1] = 1; + mixv_out_lods.push_back(1); + + mixv_map_lods.CopyToCPU(); + mixv_out_lods.CopyToCPU(); + + std::vector> map_lod_info; + map_lod_info.emplace_back(map_lods); + map->set_lod(map_lod_info); + + loss_weight->set_lod(map_lod_info); + + std::vector> out_lod_info; + out_lod_info.emplace_back(out_lods); + out->set_lod(out_lod_info); + + thrust::device_ptr out_data_ptr(out_data); + + // gpu kernel + if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } + + thrust::device_ptr loss_weight_data_ptr(loss_weight_data); + loss_weight_data_ptr[0] = 0; + } + +#endif + } +}; + +template +class FilterByInstagGradGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { +#if defined(PADDLE_WITH_CUDA) + + auto gpu_place = context.GetPlace(); + gpuStream_t current_stream = context.cuda_device_context().stream(); + auto max_thread_num_per_block = 1024; + auto* output_grad = context.Input(framework::GradVarName("Out")); + auto* x1_grad = context.Output(framework::GradVarName("Ins")); + auto* loss_weight = context.Input("LossWeight"); + auto* mmap = context.Input("IndexMap"); + auto* x1 = context.Input("Ins"); + + x1_grad->set_lod(context.Input("Ins")->lod()); + x1_grad->Resize(x1->dims()); + + auto* mmap_data = mmap->data(); + // expected auto = T + auto* output_grad_data = output_grad->data(); + auto* loss_weight_data = loss_weight->data(); + + // expected auto = T + auto* x1_grad_data = x1_grad->mutable_data(gpu_place); + thrust::device_ptr x1_grad_data_ptr(x1_grad_data); + thrust::device_ptr loss_weight_data_ptr(loss_weight_data); + + thrust::fill(x1_grad_data_ptr, + x1_grad_data_ptr + x1->dims()[0] * x1->dims()[1], 0); + + if (loss_weight->numel() != 1 || loss_weight_data_ptr[0] != 0) { + auto output_dims = output_grad->dims(); + int x1_embed_size = output_dims[1]; + + // one thread for multi-instances + int block_size = max_thread_num_per_block; + + size_t N = mmap->dims()[0]; + dim3 block_dim(block_size); + + dim3 grid_dim((N + block_size - 1) / block_size); + + const int ins_per_thread = 1; + + copy_grad_kernel<<>>( + N, ins_per_thread, output_grad_data, x1_grad_data, mmap_data, + x1_embed_size); + + cudaStreamSynchronize(current_stream); + } + +#endif + } +}; + +} // 
namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL(filter_by_instag, ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel); + +REGISTER_OP_CUDA_KERNEL(filter_by_instag_grad, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel); diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h index deb2aa96b539e..3abc980ceaafc 100644 --- a/paddle/fluid/operators/filter_by_instag_op.h +++ b/paddle/fluid/operators/filter_by_instag_op.h @@ -61,7 +61,20 @@ class FilterByInstagKernel : public framework::OpKernel { // expected auto = const int64_t auto* x2_data = x2->data(); // e.g get [0, 1, 2, 3, ...] - size_t x2_lods_size = x2->dims()[0]; + // size_t x2_lods_size = x2->dims()[0]; + // size_t instag_num_per_ins = x2->dims()[1]; + + Vector x2_lods(1, 0); + if (x2->lod().size() != 0) { // lod_level = 1 + x2_lods = x2->lod()[0]; + } else { // lod_level = 0 + const size_t x2_lods_size = x2->dims()[0]; + const size_t instag_num_per_ins = x2->dims()[1]; + for (size_t i = 0; i < x2_lods_size; i++) { + x2_lods.push_back(x2_lods.back() + instag_num_per_ins); + } + } + Vector x1_lods(1, 0); if (!is_x1_lod) { for (int i = 0; i < x1->dims()[0]; i++) { @@ -79,8 +92,8 @@ class FilterByInstagKernel : public framework::OpKernel { } std::unordered_map mmap_aux; Vector out_lods(1, 0); - for (size_t i = 0; i < x2_lods_size; i++) { - for (size_t j = i; j < i + 1; j++) { + for (size_t i = 0; i < x2_lods.size() - 1; i++) { + for (size_t j = x2_lods[i]; j < x2_lods[i + 1]; j++) { if (filter_tag.find(x2_data[j]) != filter_tag.end()) { size_t batch_len = x1_lods[i + 1] - x1_lods[i]; mmap_aux[out_lods.back()] = x1_lods[i]; @@ -165,8 +178,10 @@ class FilterByInstagKernel : public framework::OpKernel { out_data[oi] = (int32_t)out_val_if_empty; } else if (std::is_same::value) { out_data[oi] = (int64_t)out_val_if_empty; - } else { + } else if (std::is_same::value) { out_data[oi] = static_cast(out_val_if_empty); + } else { + out_data[oi] = static_cast(out_val_if_empty); } } loss_weight_data[0] = 0; diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index 5ef13b38c8a86..feae954e355b8 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -16,7 +16,6 @@ limitations under the License. 
diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h
index 5ef13b38c8a86..feae954e355b8 100644
--- a/paddle/fluid/operators/flatten_op.h
+++ b/paddle/fluid/operators/flatten_op.h
@@ -16,7 +16,6 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/phi_utils.h"
-#include "paddle/fluid/operators/math/pooling.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/flatten_grad_kernel.h"
diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cc b/paddle/fluid/operators/fused/conv_fusion_op.cc
index c445a28c084f6..e60fc44e9a6ff 100644
--- a/paddle/fluid/operators/fused/conv_fusion_op.cc
+++ b/paddle/fluid/operators/fused/conv_fusion_op.cc
@@ -120,6 +120,142 @@ class Conv2DFusionOp : public operators::ConvOp {
       ctx->SetOutputsDim("Outputs", output_shapes);
     }
   }
+
+  std::vector<int64_t> ComputeOutputShape(
+      framework::InferShapeContext* ctx) const {
+    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Conv");
+    OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter", "Conv");
+
+    auto in_dims = ctx->GetInputDim("Input");
+    auto filter_dims = ctx->GetInputDim("Filter");
+
+    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+    std::string padding_algorithm =
+        ctx->Attrs().Get<std::string>("padding_algorithm");
+    int groups = ctx->Attrs().Get<int>("groups");
+    std::vector<int> dilations =
+        ctx->Attrs().Get<std::vector<int>>("dilations");
+    int dilation_size = dilations.size();
+    for (int i = 0; i < dilation_size; ++i) {
+      PADDLE_ENFORCE_GT(
+          dilations[i], 0,
+          platform::errors::InvalidArgument(
+              "The dilation of Op(Conv) should be larger than 0, but received "
+              "dilation is %d.",
+              dilations[i]));
+    }
+    const std::string data_format =
+        ctx->Attrs().Get<std::string>("data_format");
+
+    // MKL-DNN Kernels are using NCHW order of dims description
+    // so we ignore data_format consideration for MKL-DNN kernel
+    const bool channel_last = (ctx->IsRunMKLDNNKernel() == false) &&
+                              (data_format == "NHWC" || data_format == "NDHWC");
+
+    PADDLE_ENFORCE_EQ(
+        in_dims.size() == 4 || in_dims.size() == 5, true,
+        platform::errors::InvalidArgument(
+            "The input of Op(Conv) should be a 4-D or 5-D Tensor. But "
+            "received: input's dimension is %u, input's shape is [%s].",
+            in_dims.size(), in_dims));
+
+    PADDLE_ENFORCE_EQ(
+        in_dims.size(), filter_dims.size(),
+        platform::errors::InvalidArgument(
+            "The input's dimension and filter's dimension of "
+            "Op(Conv) should be equal. But received: the input's shape is "
+            "[%s], the input's dimension is %d; the filter's shape is [%s], "
+            "the filter's dimension is %d.",
+            in_dims, in_dims.size(), filter_dims, filter_dims.size()));
+
+    int stride_size = strides.size();
+    for (int i = 0; i < stride_size; ++i) {
+      PADDLE_ENFORCE_GT(
+          strides[i], 0,
+          platform::errors::InvalidArgument(
+              "The stride of Op(Conv) should be larger than 0, but received "
+              "stride is %d.",
+              strides[i]));
+    }
+
+    int in_sub_stride_size = in_dims.size() - stride_size;
+    PADDLE_ENFORCE_EQ(
+        in_dims.size(), strides.size() + 2U,
+        platform::errors::InvalidArgument(
+            "The difference of input's dimension and Attr(strides)'s "
+            "length must be equal to 2 for Op(Conv). "
+            "But received: input's dimension is %d, input's shape is [%s]; "
+            "Attr(stride)'s length is %d, Attr(stride) is [%s]; "
+            "difference of input's dimension and Attr(strides)'s length = %u.",
+            in_dims.size(), in_dims, strides.size(), phi::make_ddim(strides),
+            in_sub_stride_size));
+
+    const auto input_channels =
+        channel_last ? in_dims[in_dims.size() - 1] : in_dims[1];
+
+    PADDLE_ENFORCE_EQ(
+        input_channels, filter_dims[1] * groups,
+        platform::errors::InvalidArgument(
+            "The number of input's channels should be equal to filter's "
+            "channels * groups for Op(Conv). But received: the input's "
+            "channels is %d, the input's shape is [%s]; the filter's "
+            "channels is %d, the filter's shape is [%s]; the groups is %d, "
+            "the data_format is %s. The error may come from wrong "
+            "data_format setting.",
+            input_channels, in_dims, filter_dims[1], filter_dims, groups,
+            data_format));
+    PADDLE_ENFORCE_EQ(
+        filter_dims[0] % groups, 0,
+        platform::errors::InvalidArgument(
+            "The number of output's channels (filter's first dimension) of "
+            "Op(Conv) should be divisible by groups. But received: "
+            "the output channels is %d, the filter's shape is [%s], "
+            "the groups is %d.",
+            filter_dims[0], filter_dims, groups));
+
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_GT(
+          filter_dims[0], 0,
+          platform::errors::InvalidArgument(
+              "the size of filter at axis 0 should be greater than 0"));
+    }
+
+    framework::DDim in_data_dims;
+    if (channel_last) {
+      in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
+    } else {
+      in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
+    }
+
+    framework::DDim filter_data_dims =
+        phi::slice_ddim(filter_dims, 2, filter_dims.size());
+
+    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
+    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
+                             in_data_dims, strides, ksize);
+
+    std::vector<int64_t> output_shape({in_dims[0]});
+    if (!channel_last) {
+      output_shape.push_back(filter_dims[0]);
+    }
+    for (int i = 0; i < in_data_dims.size(); ++i) {
+      if ((!ctx->IsRuntime()) &&
+          (in_data_dims[i] <= 0 || filter_dims[i + 2] <= 0)) {
+        output_shape.push_back(-1);
+      } else {
+        output_shape.push_back(
+            ConvOutputSize(in_data_dims[i], filter_data_dims[i], dilations[i],
+                           paddings[2 * i], paddings[2 * i + 1], strides[i]));
+      }
+    }
+    if (channel_last) {
+      output_shape.push_back(filter_dims[0]);
+    }
+
+    return output_shape;
+  }
 };
 
 // TODO(qingqing): add gradient operator for conv2d_fusion
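`ComputeOutputShape` above defers the per-dimension arithmetic to `ConvOutputSize`, which follows the standard convolution rule `out = (in + pad_left + pad_right - (dilation * (k - 1) + 1)) / stride + 1`. A hedged re-derivation with two sanity checks (this mirrors the formula, not Paddle's exact helper):

```cpp
#include <cassert>
#include <cstdio>

// Standard conv output-size arithmetic (integer floor division).
static int ConvOut(int in, int k, int dilation, int pad_l, int pad_r,
                   int stride) {
  const int dkernel = dilation * (k - 1) + 1;  // effective kernel extent
  return (in + pad_l + pad_r - dkernel) / stride + 1;
}

int main() {
  // 224 input, 7x7 filter, stride 2, pad 3 -> 112 (classic ResNet stem).
  assert(ConvOut(224, 7, 1, 3, 3, 2) == 112);
  // 32 input, 3x3 filter, dilation 2, stride 1, pad 2 -> 32 (shape-preserving).
  assert(ConvOut(32, 3, 2, 2, 2, 1) == 32);
  std::printf("ok\n");
  return 0;
}
```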
diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h
index 0202776757973..54e4cbdc16249 100644
--- a/paddle/fluid/operators/fused/fmha_ref.h
+++ b/paddle/fluid/operators/fused/fmha_ref.h
@@ -15,6 +15,7 @@ limitations under the License.
*/ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/transpose_op.cu.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" namespace paddle { @@ -69,20 +70,21 @@ class FMHARef { ~FMHARef() {} void ComputeForward(const Tensor& qkv_input_tensor, + const Tensor* cache_kv_tensor, const Tensor* src_mask_tensor, - Tensor* transpose_2_out_tensor, Tensor* qk_out_tensor, + Tensor* transpose_2_out_tensor, + Tensor* cache_kv_out_tensor, Tensor* qk_out_tensor, Tensor* src_mask_out_tensor, Tensor* softmax_out_tensor, Tensor* dropout_mask_out_tensor, Tensor* dropout_out_tensor, Tensor* qktv_out_tensor, Tensor* fmha_out_tensor) { // input shape: [bs, seq_len, 3, num_head, head_dim] - // transpose with perm [2, 0, 1, 3, 4], + // transpose with perm [2, 0, 3, 1, 4], // output_shape: [3, bs, num_head, seq_len, head_dim] int ndims = 5; std::vector perm_1 = {2, 0, 3, 1, 4}; TransposeGPUKernelDriver(dev_ctx_, ndims, qkv_input_tensor, perm_1, transpose_2_out_tensor); - T* qkv_data = transpose_2_out_tensor->data(); T* qk_out_data = qk_out_tensor->data(); T* qktv_out_data = qktv_out_tensor->data(); @@ -90,11 +92,30 @@ class FMHARef { T* dropout_out_data = dropout_out_tensor->data(); T* fmha_out_data = fmha_out_tensor->data(); - int q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; - int k_size = q_size; + auto out_seq_len = seq_len_; + if (cache_kv_tensor) { + // kv [2, bs, num_head, seq_len, head_dim] + auto kv_tensor = transpose_2_out_tensor->Slice(1, 3); + phi::funcs::ConcatFunctor concat; + // out [2, bs, num_head, cache_seq_len + seq_len, head_dim] + concat(dev_ctx_, {*cache_kv_tensor, kv_tensor}, 3, cache_kv_out_tensor); + out_seq_len = cache_kv_out_tensor->dims()[3]; + } + + int64_t q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; T* q_ptr = qkv_data; - T* k_ptr = q_ptr + q_size; - T* v_ptr = k_ptr + k_size; + T* k_ptr = nullptr; + T* v_ptr = nullptr; + + if (cache_kv_tensor) { + int64_t k_size = cache_kv_out_tensor->numel() / 2; + k_ptr = cache_kv_out_tensor->data(); + v_ptr = k_ptr + k_size; + } else { + int64_t k_size = q_size; + k_ptr = q_ptr + q_size; + v_ptr = k_ptr + k_size; + } // q*k^t, batched_gemm CBLAS_TRANSPOSE transA = CblasNoTrans; @@ -102,7 +123,7 @@ class FMHARef { auto blas = phi::funcs::GetBlas(dev_ctx_); int gemm_batch_size = batch_size_ * num_head_; int gemm_m = seq_len_; - int gemm_n = seq_len_; + int gemm_n = out_seq_len; int gemm_k = head_dim_; T alpha = static_cast(1.0 / sqrt(head_dim_)); T beta = static_cast(0.0); @@ -133,16 +154,16 @@ class FMHARef { transB = CblasNoTrans; gemm_m = seq_len_; gemm_n = head_dim_; - gemm_k = seq_len_; + gemm_k = out_seq_len; alpha = static_cast(1.0); stride_a = gemm_m * gemm_k; stride_b = gemm_k * gemm_n; if (dropout_param_.dropout_prob_) { DropoutFwGPUKernelDriver( - dev_ctx_, dropout_param_.is_test_, - static_cast( - dropout_param_.dropout_implementation_), + static_cast(dev_ctx_), + dropout_param_.is_test_, static_cast( + dropout_param_.dropout_implementation_), dropout_param_.dropout_prob_, dropout_param_.is_upscale_in_train_, dropout_param_.is_fix_seed_, dropout_param_.seed_val_, static_cast(*softmax_out_tensor), dropout_param_.seed_, @@ -242,8 +263,9 @@ class FMHARef { // dropout bw if (dropout_param_.dropout_prob_) { DropoutGradGPUKernelDriver( - dev_ctx_, static_cast( - dropout_param_.dropout_implementation_), + static_cast(dev_ctx_), + 
static_cast( + dropout_param_.dropout_implementation_), dropout_param_.dropout_prob_, static_cast(*dropout_out_grad_tensor), dropout_mask_out_tensor, softmax_out_grad_tensor->numel(), diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index d141800d61c0e..e473f8ff0662c 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -61,6 +61,10 @@ class FusedAttentionOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasOutput("QKTVOut"), "Output", "QKTVOut", "FusedAttentionOp"); + if (ctx->HasInput("CacheKV")) { + OP_INOUT_CHECK(ctx->HasOutput("CacheKVOut"), "Output", "CacheKVOut", + "FusedAttentionOp"); + } if (ctx->HasInput("SrcMask")) { OP_INOUT_CHECK(ctx->HasOutput("SrcMaskOut"), "Output", "SrcMaskOut", "FusedAttentionOp"); @@ -105,12 +109,14 @@ class FusedAttentionOp : public framework::OperatorWithKernel { "input qkv_weight = [%s]", x_dim, y_dim)); - PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], y_dim[3], - platform::errors::InvalidArgument( - "The dimensions of qkv_weight must be 4" - "(3, num_head, dim_head, dim_embed)," - "and must satisfy the limitations: " - "(num_head * dim_head == dim_embed)")); + if (ctx->Attrs().Get("ring_id") == -1) { + PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], y_dim[3], + platform::errors::InvalidArgument( + "The dimensions of qkv_weight must be 4" + "(3, num_head, dim_head, dim_embed)," + "and must satisfy the limitations: " + "(num_head * dim_head == dim_embed)")); + } if (ctx->Attrs().Get("pre_layer_norm") == true) { ctx->SetOutputDim("LnMean", {x_dim[0] * x_dim[1]}); @@ -132,20 +138,64 @@ class FusedAttentionOp : public framework::OperatorWithKernel { // [3, batch_size, num_head, seq_len, head_size] ctx->SetOutputDim("TransposeOut2", {y_dim[0], x_dim[0], y_dim[1], x_dim[1], y_dim[2]}); - // [batch, num_head, seq_len, seq_len] - ctx->SetOutputDim("QKOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + + // cache_seq_len + seq_len if cache else seq_len + auto out_seq_len = x_dim[1]; + if (ctx->HasInput("CacheKV")) { + // [2, batch_size, num_head, cache_seq_len, head_size] + auto c_dim = ctx->GetInputDim("CacheKV"); + + PADDLE_ENFORCE_EQ( + c_dim.size(), 5, + paddle::platform::errors::InvalidArgument( + "The CacheKV must be 5 dims, but got %d", c_dim.size())); + PADDLE_ENFORCE_EQ(c_dim[0], 2, + paddle::platform::errors::InvalidArgument( + "The first dim of CacheKV must be 2, but got %d", + c_dim[0])); // 2 + PADDLE_ENFORCE_EQ(c_dim[1], x_dim[0], + paddle::platform::errors::InvalidArgument( + "The second dim of CacheKV must be equal with " + "batch size %d, but got %d", + x_dim[0], c_dim[1])); // batch_size + PADDLE_ENFORCE_EQ(c_dim[2], y_dim[1], + paddle::platform::errors::InvalidArgument( + "The third dim of CacheKV must be equal with num " + "head %d, but got %d", + y_dim[1], c_dim[2])); // num_head + PADDLE_ENFORCE_GE( + c_dim[3], 0, + paddle::platform::errors::InvalidArgument( + "The forth dim of CacheKV must be greater than 0, but got %d", + c_dim[3])); // cache_seq_len + PADDLE_ENFORCE_EQ(c_dim[4], y_dim[2], + paddle::platform::errors::InvalidArgument( + "The fifth dim of CacheKV must be equal with head " + "size %d, but got %d", + y_dim[2], c_dim[4])); // head_size + + out_seq_len += c_dim[3]; + // [3, batch_size, num_head, cache_seq_len + seq_len, head_size] + ctx->SetOutputDim("CacheKVOut", + {c_dim[0], c_dim[1], c_dim[2], out_seq_len, c_dim[4]}); + } + + // [batch, num_head, seq_len, out_seq_len] + 
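The checks just added pin `CacheKV` to `[2, batch, num_head, cache_seq_len, head_size]` and grow the effective sequence length, which then flows into the QKOut shape set below. A worked shape example with made-up decoding sizes:

```cpp
#include <cstdio>

int main() {
  // x: [batch, seq_len, embed]; qkv weight: [3, num_head, head_size, embed]
  const int batch = 8, seq_len = 1, num_head = 16, head_size = 64;
  const int cache_seq_len = 127;  // CacheKV: [2, 8, 16, 127, 64]

  const int out_seq_len = cache_seq_len + seq_len;  // 128

  // CacheKVOut: [2, batch, num_head, out_seq_len, head_size]
  std::printf("CacheKVOut = [2, %d, %d, %d, %d]\n", batch, num_head,
              out_seq_len, head_size);
  // QKOut / SoftmaxOut: [batch, num_head, seq_len, out_seq_len]
  std::printf("QKOut      = [%d, %d, %d, %d]\n", batch, num_head, seq_len,
              out_seq_len);
  return 0;
}
```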
ctx->SetOutputDim("QKOut", {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); if (ctx->HasInput("SrcMask")) { - ctx->SetOutputDim("SrcMaskOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + ctx->SetOutputDim("SrcMaskOut", + {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); } // the same as QKOut's shape. ctx->SetOutputDim("AttnDropoutOut", - {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); if (ctx->Attrs().Get("attn_dropout_is_test") == false) { ctx->SetOutputDim("AttnDropoutMaskOut", - {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); } - ctx->SetOutputDim("SoftmaxOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + ctx->SetOutputDim("SoftmaxOut", + {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); // [batch_size, num_heads, seq_len, head_dim] ctx->SetOutputDim("QKTVOut", {x_dim[0], y_dim[1], x_dim[1], y_dim[2]}); // [batch_size, seq_len, number of heads*head size] @@ -182,6 +232,8 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { .AsDispensable(); AddInput("QKVW", "The qkv weight tensor."); AddInput("QKVBias", "The qkv bias tensor.").AsDispensable(); + AddInput("CacheKV", "(optional) The cached KV for generation inference.") + .AsDispensable(); AddInput("SrcMask", "(optional) The attention mask tensor in fmha.") .AsDispensable(); AddInput("OutLinearW", "The out_linear weight tensor."); @@ -217,6 +269,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("BiasDropoutResidualOut", "Result of residual + dropout(src + bias).") .AsIntermediate(); + AddOutput("CacheKVOut", "The udpated cache KV."); AddOutput("Y", "Result after attention."); AddAttr("pre_layer_norm", @@ -324,6 +377,10 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { "0.0 and 0.001, But received [%s].", ln_epsilon)); }); + AddAttr( + "ring_id", + "ring id for tensor model parallel. distributed training and inference") + .SetDefault(-1); AddComment(R"DOC( Add fused attention op whose logic is as follows: diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index 03f51fc585798..d26577f06fe68 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -27,11 +27,39 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fmha_ref.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h" +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#endif + namespace paddle { namespace operators { using Tensor = framework::Tensor; +template +static void AllReduce(framework::Tensor &tensor, // NOLINT + const int ring_id, + const platform::CUDADeviceContext &ctx) { + if (ring_id == -1) return; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto dtype = + platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype())); + int64_t numel = tensor.numel(); + const void *sendbuff = tensor.data(); + auto place = ctx.GetPlace(); + void *recvbuff = tensor.mutable_data(place); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto stream = ctx.stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "PaddlePaddle should compile with NCCL or RCCL when used tensor model " + "parallel op.")); +#endif +} + template class FusedAttentionOpKernel : public framework::OpKernel { public: @@ -56,6 +84,8 @@ class FusedAttentionOpKernel : public framework::OpKernel { auto *src_mask = ctx.Input("SrcMask"); auto *transpose_out_2 = ctx.Output("TransposeOut2"); + auto *cache_kv = ctx.Input("CacheKV"); + auto *cache_kv_out = ctx.Output("CacheKVOut"); auto *qk_out = ctx.Output("QKOut"); auto *qktv_out = ctx.Output("QKTVOut"); auto *softmax_out = ctx.Output("SoftmaxOut"); @@ -86,6 +116,7 @@ class FusedAttentionOpKernel : public framework::OpKernel { auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); int seed_val_1 = ctx.Attr("attn_dropout_seed"); + int ring_id = ctx.Attr("ring_id"); // final output. auto *out = ctx.Output("Y"); @@ -105,6 +136,10 @@ class FusedAttentionOpKernel : public framework::OpKernel { // get data ptr for FMHA. auto *transpose_out_2_data = transpose_out_2->mutable_data(ctx.GetPlace()); + auto *cache_kv_out_data = + (cache_kv_out == nullptr) + ? nullptr + : cache_kv_out->mutable_data(ctx.GetPlace()); auto *qk_out_data = qk_out->mutable_data(ctx.GetPlace()); auto *qktv_out_data = qktv_out->mutable_data(ctx.GetPlace()); auto *src_mask_out_data = @@ -161,9 +196,14 @@ class FusedAttentionOpKernel : public framework::OpKernel { output_size = hidden_size; // (transA, transB, compute_bias) = (false, false, false) + // NOTE(Yuang Liu): For general input size == output size, change the + // position won't have effects. For mp, the output size is mp_head * dkey + // which is actually the input size. While the input size is hidden size, + // which is actually the output size. So for out linear, switch the + // input size and output size. 
auto out_linear_compute = AttnMatMul(ctx.cuda_device_context(), false, false, bsz_seq, - output_size, input_size, false); + input_size, output_size, false); DropoutParam dropout_param2(ctx, 0); FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param2, @@ -186,15 +226,15 @@ class FusedAttentionOpKernel : public framework::OpKernel { qkv_bias_out); } if (qkv_bias == nullptr) { - fmha_ref_compute.ComputeForward(*qkv_out, src_mask, transpose_out_2, - qk_out, src_mask_out, softmax_out, - attn_dropout_mask_out, attn_dropout_out, - qktv_out, fmha_out); + fmha_ref_compute.ComputeForward( + *qkv_out, cache_kv, src_mask, transpose_out_2, cache_kv_out, qk_out, + src_mask_out, softmax_out, attn_dropout_mask_out, attn_dropout_out, + qktv_out, fmha_out); } else { - fmha_ref_compute.ComputeForward(*qkv_bias_out, src_mask, transpose_out_2, - qk_out, src_mask_out, softmax_out, - attn_dropout_mask_out, attn_dropout_out, - qktv_out, fmha_out); + fmha_ref_compute.ComputeForward( + *qkv_bias_out, cache_kv, src_mask, transpose_out_2, cache_kv_out, + qk_out, src_mask_out, softmax_out, attn_dropout_mask_out, + attn_dropout_out, qktv_out, fmha_out); } // fmha_out: [batch_size, seq_len, num_head, head_dim] @@ -202,6 +242,9 @@ class FusedAttentionOpKernel : public framework::OpKernel { // out_linear_out: [batch_size, seq_len, embed_dim] out_linear_compute.ComputeForward(out_linear_weight, fmha_out, nullptr, out_linear_out, nullptr); + // tensor model parallel + AllReduce(*out_linear_out, ring_id, ctx.cuda_device_context()); + if (pre_layer_norm) { // output = (residual + dropout(input + bias)) fused_dropout_layernorm_helper.ResidualDropoutBias( @@ -244,6 +287,7 @@ class FusedAttentionGradKernel : public framework::OpKernel { auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); int seed_val_1 = ctx.Attr("attn_dropout_seed"); + int ring_id = ctx.Attr("ring_id"); // get inputs. 
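`ring_id` defaults to -1, so the `AllReduce` inserted after the out-linear GEMM above is a no-op on single-card graphs; with a ring configured it sums each rank's partial `[bsz_seq, hidden]` result. A reduced CPU sketch of that control flow (no NCCL; `AllReduceSketch` is a stand-in, not the patch's helper):

```cpp
#include <cstdio>
#include <vector>

// Stand-in for the kernel's AllReduce: identity unless a ring id is set.
// (The real helper dispatches to ncclAllReduce on the rank's stream.)
void AllReduceSketch(std::vector<float>* tensor, int ring_id,
                     const std::vector<std::vector<float>>& peers) {
  if (ring_id == -1) return;  // tensor model parallelism disabled
  for (const auto& peer : peers) {
    for (size_t i = 0; i < tensor->size(); ++i) (*tensor)[i] += peer[i];
  }
}

int main() {
  std::vector<float> partial = {1.f, 2.f};               // this rank's output
  std::vector<std::vector<float>> peers = {{3.f, 4.f}};  // other rank(s)

  AllReduceSketch(&partial, /*ring_id=*/0, peers);
  std::printf("%.0f %.0f\n", partial[0], partial[1]);  // 4 6
  return 0;
}
```

The grad path mirrors this: the backward all-reduce lands on `d_ln_out` (pre-layer-norm) or `d_x`, as the following hunks show.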
auto *d_y = ctx.Input(framework::GradVarName("Y")); @@ -399,9 +443,10 @@ class FusedAttentionGradKernel : public framework::OpKernel { transA = false; transB = false; bool compute_bias = false; + // (b*s, num_head * dim_head) * (num_head * dim_head, dim_embed) auto out_linear_compute = AttnMatMul(ctx.cuda_device_context(), transA, transB, bsz_seq, - output_size, input_size, compute_bias); + input_size, output_size, compute_bias); DropoutParam dropout_param2(ctx, 0); FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param2, @@ -475,6 +520,8 @@ class FusedAttentionGradKernel : public framework::OpKernel { qkv_compute.ComputeBackward(ln_out, qkv_weight, d_qkv_out, d_ln_out, d_qkv_weight, d_qkv_bias); } + // tensor model parallel + AllReduce(*d_ln_out, ring_id, ctx.cuda_device_context()); layer_norm_compute.ComputeBackward(x_data, d_ln_out_data, ln_scale_data, ln_mean_data, ln_var_data, d_x_data, d_ln_scale_data, d_ln_bias_data); @@ -486,6 +533,8 @@ class FusedAttentionGradKernel : public framework::OpKernel { qkv_compute.ComputeBackward(input_x, qkv_weight, d_qkv_out, d_x, d_qkv_weight, d_qkv_bias); } + // tensor model parallel + AllReduce(*d_x, ring_id, ctx.cuda_device_context()); } // gradient accumulation std::vector ins; diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu index 2381b5b7fdfb8..717c1732b7b3a 100644 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu @@ -20,8 +20,14 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/fused/fused_dropout_act_bias.h" #include "paddle/fluid/operators/fused/fused_dropout_test.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/functors.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(dropout, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dropout_grad, GPU, ALL_LAYOUT); +#endif + namespace framework = paddle::framework; namespace platform = paddle::platform; namespace details = paddle::operators::details; diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index d7952df470d81..a9b72a9cdf397 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -25,14 +25,16 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" #include "paddle/fluid/string/printf.h" +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/layer_norm_kernel.h" namespace framework = paddle::framework; namespace platform = paddle::platform; namespace memory = paddle::memory; -USE_OP(dropout); -USE_OP(layer_norm); +USE_OP_ITSELF(dropout); +USE_OP_ITSELF(layer_norm); template using CudnnDataType = platform::CudnnDataType; @@ -136,18 +138,23 @@ void LayerNorm(const std::vector> &scale, const platform::CUDADeviceContext &ctx) { framework::Scope scope; auto place = ctx.GetPlace(); + paddle::optional scale_opt = paddle::none; if (scale.size() > 0) { auto var_scale = scope.Var("Scale"); auto tensor_scale = var_scale->GetMutable(); framework::TensorFromVector(scale, ctx, tensor_scale); tensor_scale->Resize({cols}); + scale_opt = *tensor_scale; } + paddle::optional bias_opt = paddle::none; if (bias.size() > 0) { auto var_bias = scope.Var("Bias"); auto tensor_bias = var_bias->GetMutable(); framework::TensorFromVector(bias, ctx, tensor_bias); tensor_bias->Resize({cols}); + + bias_opt = *tensor_bias; } auto var_x = scope.Var("X"); @@ -157,20 +164,19 @@ void LayerNorm(const std::vector> &scale, auto var_y = scope.Var("Y"); auto tensor_y = var_y->GetMutable(); + tensor_y->Resize({rows, cols}); auto var_mean = scope.Var("Mean"); auto tensor_mean = var_mean->GetMutable(); + tensor_mean->Resize({rows}); auto var_variance = scope.Var("Variance"); auto tensor_variance = var_variance->GetMutable(); - - framework::AttributeMap attrs; - attrs.insert({"epsilon", epsilon}); - - auto op = framework::OpRegistry::CreateOp( - "layer_norm", {{"X", {"X"}}, {"Scale", {"Scale"}}, {"Bias", {"Bias"}}}, - {{"Y", {"Y"}}, {"Mean", {"Mean"}}, {"Variance", {"Variance"}}}, attrs); - op->Run(scope, place); + tensor_variance->Resize({rows}); + ctx.Wait(); + phi::LayerNormKernel(static_cast(ctx), *tensor_x, + scale_opt, bias_opt, 1e-5, 1, false, tensor_y, + tensor_mean, tensor_variance); framework::TensorToVector(*tensor_y, ctx, y); framework::TensorToVector(*tensor_mean, ctx, means); framework::TensorToVector(*tensor_variance, ctx, vars); diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu index cc14d0680d381..c7e1f4a5463fe 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu @@ -19,6 +19,12 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fused_dropout_test.h" #include "paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h" +#include "paddle/phi/core/kernel_registry.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(dropout, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dropout_grad, GPU, ALL_LAYOUT); +#endif /** * @brief The unit test of fused_layernorm_residual_dropout_bias @@ -192,7 +198,6 @@ struct TestFusedLayernormResidualDropoutBias { residual_vec[i * cols + j] + out2[i * cols + j]; } } - LayerNorm(scale_vec, layernorm_bias_vec, correct_out, &correct_means, &correct_vars, &correct_layernorm_out, epsilon, rows, cols, *ctx); diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index 1a12e6b565f02..5dff5e2225f4f 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -19,6 +19,12 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fused_dropout_test.h" #include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h" +#include "paddle/phi/core/kernel_registry.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(dropout, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dropout_grad, GPU, ALL_LAYOUT); +#endif namespace framework = paddle::framework; namespace platform = paddle::platform; diff --git a/paddle/fluid/operators/gather_nd_op.cc b/paddle/fluid/operators/gather_nd_op.cc index e5ca15a39ef51..7d7d6ae81a093 100644 --- a/paddle/fluid/operators/gather_nd_op.cc +++ b/paddle/fluid/operators/gather_nd_op.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/backward.h" #include "paddle/phi/infermeta/binary.h" -#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 8a405cc6fc1ba..9f2b48a24b447 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -12,12 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/gather_op.h" #include #include #include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -26,58 +31,6 @@ class GatherOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of GatherOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Input(Index) of GatherOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of GatherOp should not be null.")); - - auto index_dims = ctx->GetInputDim("Index"); - - if (index_dims.size() == 2) { - PADDLE_ENFORCE_EQ( - index_dims[1], 1, - platform::errors::InvalidArgument( - "The last dim of index should be 1 when it is 2D, but we get %d", - index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - index_dims.size(), 1, - platform::errors::InvalidArgument( - "The index should be 1D, when it is not 2D, but we get %d", - index_dims.size())); - } - - auto axis = ctx->Attrs().Get("axis"); - auto input_dim = ctx->GetInputDim("X"); - if (ctx->HasInput("Axis") || axis == 0) { - // if HasInput("Axis"), we can not obtain correct shape of output - int batch_size = index_dims[0]; - framework::DDim output_dims(input_dim); - output_dims[0] = batch_size; - ctx->SetOutputDim("Out", output_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } else { - int index_size = index_dims[0]; - std::vector out_dim_vec; - for (int i = 0; i < axis; i++) { - out_dim_vec.push_back(input_dim[i]); - } - out_dim_vec.push_back(index_size); - for (int i = axis + 1; i < input_dim.size(); i++) { - out_dim_vec.push_back(input_dim[i]); - } - auto output_dims = phi::make_ddim(out_dim_vec); - ctx->SetOutputDim("Out", output_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -100,11 +53,6 @@ class GatherGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -193,22 +141,18 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(GatherGradNoNeedBufferVarInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(gather, GatherInferShapeFunctor, + PD_INFER_META(phi::GatherInferMeta)); REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker, ops::GatherGradOpMaker, - ops::GatherGradOpMaker); + ops::GatherGradOpMaker, + GatherInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR(gather_grad, GatherGradInferShapeFunctor, + PD_INFER_META(phi::GeneralUnaryGradInferMeta)); REGISTER_OPERATOR(gather_grad, ops::GatherGradOp, - ops::GatherGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL(gather, 
ops::GatherOpKernel, - ops::GatherOpKernel, ops::GatherOpKernel, - ops::GatherOpKernel, - ops::GatherOpKernel, - ops::GatherOpKernel); -REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel); + ops::GatherGradNoNeedBufferVarInferer, + GatherGradInferShapeFunctor); + REGISTER_OP_VERSION(gather) .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC", paddle::framework::compatible::OpVersionDesc().NewInput( diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu deleted file mode 100644 index e0db2f26d3e05..0000000000000 --- a/paddle/fluid/operators/gather_op.cu +++ /dev/null @@ -1,152 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/gather_op.h" -#include "paddle/phi/kernels/funcs/gather.cu.h" -#include "paddle/phi/kernels/funcs/scatter.cu.h" - -namespace paddle { -namespace operators { - -template -class GatherOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *output = ctx.Output("Out"); - - int axis = ctx.Attr("axis"); - - // get axis from tensor - if (ctx.HasInput("Axis")) { - Tensor cpu_axis; - const Tensor *axis_tensor = ctx.Input("Axis"); - framework::TensorCopy(*axis_tensor, platform::CPUPlace(), &cpu_axis); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { - axis = static_cast(cpu_axis.data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { - axis = static_cast(cpu_axis.data()[0]); - } else if (axis_type == framework::proto::VarType::INT16) { - axis = static_cast(cpu_axis.data()[0]); - } - } - const auto &place = ctx.GetPlace(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - const auto &dev_ctx = ctx.cuda_device_context(); - if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GatherV2CUDAFunction(x, index, axis, output, - dev_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - phi::funcs::GatherV2CUDAFunction(x, index, axis, output, - dev_ctx); - } else if (index_type == framework::proto::VarType::INT16) { - phi::funcs::GatherV2CUDAFunction(x, index, axis, output, - dev_ctx); - } - return; - } - - output->mutable_data(ctx.GetPlace()); - if (x->numel() == 0) return; - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GPUGather(dev_ctx, *x, *index, output); - } else if (index_type == 
framework::proto::VarType::INT64) { - phi::funcs::GPUGather(dev_ctx, *x, *index, output); - } else if (index_type == framework::proto::VarType::INT16) { - phi::funcs::GPUGather(dev_ctx, *x, *index, output); - } - } -}; - -template -class GatherGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *index = ctx.Input("Index"); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dO = ctx.Input(framework::GradVarName("Out")); - - int axis = ctx.Attr("axis"); - if (ctx.HasInput("Axis")) { - const Tensor *axis_tensor = ctx.Input("Axis"); - Tensor cpu_axis; - framework::TensorCopy(*axis_tensor, platform::CPUPlace(), &cpu_axis); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { - axis = static_cast(cpu_axis.data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { - axis = static_cast(cpu_axis.data()[0]); - } - } - - const auto &dev_ctx = ctx.cuda_device_context(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, - dev_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, - dev_ctx); - } - return; - } - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = *ctx.template device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, - ctx.Attr("overwrite")); - } else if (index_type == framework::proto::VarType::INT64) { - phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, - ctx.Attr("overwrite")); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h deleted file mode 100644 index 94de694b2f9bc..0000000000000 --- a/paddle/fluid/operators/gather_op.h +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/gather.h" -#include "paddle/phi/kernels/funcs/scatter.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class GatherOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *output = ctx.Output("Out"); - - int axis = ctx.Attr("axis"); - // get axis from tensor - if (ctx.HasInput("Axis")) { - const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = axis_tensor->dtype(); - if (axis_type == phi::DataType::INT32) { - axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == phi::DataType::INT64) { - axis = static_cast(axis_tensor->data()[0]); - } - } - const auto &index_type = index->dtype(); - auto &dev_ctx = ctx.template device_context(); - if (axis != 0) { - if (index_type == phi::DataType::INT32) { - phi::funcs::GatherV2Function(dev_ctx, x, index, axis, - output); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::GatherV2Function(dev_ctx, x, index, axis, - output); - } - return; - } - - output->mutable_data(ctx.GetPlace()); - if (x->numel() == 0) return; - if (index_type == phi::DataType::INT32) { - phi::funcs::CPUGather(dev_ctx, *x, *index, output); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::CPUGather(dev_ctx, *x, *index, output); - } - } -}; - -template -class GatherGradientOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto *index = ctx.Input("Index"); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dO = ctx.Input(framework::GradVarName("Out")); - - int axis = ctx.Attr("axis"); - if (ctx.HasInput("Axis")) { - const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = axis_tensor->dtype(); - if (axis_type == phi::DataType::INT32) { - axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == phi::DataType::INT64) { - axis = static_cast(axis_tensor->data()[0]); - } - } - const auto &index_type = index->dtype(); - auto &dev_ctx = ctx.template device_context(); - - if (axis != 0) { - if (index_type == phi::DataType::INT32) { - phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, - dX); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, - dX); - } - return; - } - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = *dev_ctx.eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - bool overwrite = ctx.Attr("overwrite"); - - if (index_type == phi::DataType::INT32) { - if (overwrite) { - phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); - } else { - phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, dX); - } - } else if (index_type == phi::DataType::INT64) { - if (overwrite) { - phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); - } else { - phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, 
dX); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc index a83abb245224b..f996b1ede2f0f 100644 --- a/paddle/fluid/operators/gather_op_npu.cc +++ b/paddle/fluid/operators/gather_op_npu.cc @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/kron_op.h" #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc index 3dce380360815..b42050eabe300 100644 --- a/paddle/fluid/operators/gather_op_npu_test.cc +++ b/paddle/fluid/operators/gather_op_npu_test.cc @@ -24,16 +24,15 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/gather_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(gather); +USE_OP_ITSELF(gather); USE_OP_DEVICE_KERNEL(gather, NPU); -USE_OP(gather_grad); +USE_OP_ITSELF(gather_grad); USE_OP_DEVICE_KERNEL(gather_grad, NPU); template diff --git a/paddle/fluid/operators/gather_op_xpu.cc b/paddle/fluid/operators/gather_op_xpu.cc index 28f2f7d473bef..6c691aa14ae77 100644 --- a/paddle/fluid/operators/gather_op_xpu.cc +++ b/paddle/fluid/operators/gather_op_xpu.cc @@ -13,15 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/gather_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class GatherOpXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; diff --git a/paddle/fluid/operators/gelu_op.cc b/paddle/fluid/operators/gelu_op.cc index 3d338f00d4fcb..3be2606bfc939 100644 --- a/paddle/fluid/operators/gelu_op.cc +++ b/paddle/fluid/operators/gelu_op.cc @@ -14,10 +14,11 @@ limitations under the License. 
*/ #include #include -#include - -#include "paddle/fluid/operators/gelu_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -29,18 +30,6 @@ class GeluOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(%s) of GeluOp should not be null.", "X")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(%s) of GeluOp should not be null.", "Out")); - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -156,13 +145,10 @@ class GeluGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(gelu, GeluInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(gelu, ops::GeluOp, ops::GeluOpMaker, ops::GeluGradOpMaker, - ops::GeluGradOpMaker); + ops::GeluGradOpMaker, + GeluInferShapeFunctor); REGISTER_OPERATOR(gelu_grad, ops::GeluGradOp); -REGISTER_OP_CPU_KERNEL( - gelu, ops::GeluKernel, - ops::GeluKernel); -REGISTER_OP_CPU_KERNEL( - gelu_grad, ops::GeluGradKernel, - ops::GeluGradKernel); diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu deleted file mode 100644 index ef836ab72f001..0000000000000 --- a/paddle/fluid/operators/gelu_op.cu +++ /dev/null @@ -1,320 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
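gelu's hand-written `InferShape` (a `ShareDim` plus `ShareLoD` pair) is replaced above by registering `phi::UnchangedInferMeta` through `DECLARE_INFER_SHAPE_FUNCTOR`, the same migration applied to gather earlier. "Unchanged" is just shape pass-through; a toy model of it (the types here are invented for illustration):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// What "UnchangedInferMeta" amounts to for an elementwise op like gelu:
// the output tensor keeps the input's dims untouched.
struct MetaTensor {
  std::vector<int64_t> dims;
};

void UnchangedInferMetaSketch(const MetaTensor& x, MetaTensor* out) {
  out->dims = x.dims;  // shape (and, in the real framework, LoD) pass through
}

int main() {
  MetaTensor x{{8, 128, 1024}}, out;
  UnchangedInferMetaSketch(x, &out);
  assert(out.dims == x.dims);
  return 0;
}
```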
*/ - -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/gelu_op.h" - -DECLARE_bool(use_fast_math); - -namespace paddle { -namespace operators { - -#ifdef __NVCC__ -template -static __device__ __forceinline__ float FP32FastTanh(float x) { -#if __CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000 - if (FastMode) { - float y; - asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(y) : "f"(x)); - return y; - } -#endif - return tanhf(x); -} - -template -static __device__ __forceinline__ float FP32GeluFwd(float x) { - auto tanh_out = - FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); - return x * 0.5f * (1.0f + tanh_out); -} - -template -static __device__ __forceinline__ float FP32GeluBwd(float x, float y_g) { - auto tanh_out = - FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); - auto tmp = 0.5f * x * ((1.0f - tanh_out * tanh_out) * - (0.79788456f + 0.1070322243f * x * x)) + - 0.5f * (1.0f + tanh_out); - return tmp * y_g; -} - -template -static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, __half* y, - size_t n) { - size_t offset = - static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; - size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; - for (; offset < n; offset += stride) { - using ArrT = phi::AlignedVector<__half, VecSize>; - ArrT in_arr = *reinterpret_cast(x + offset); -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - float tmp = __half2float(in_arr[i]); - in_arr[i] = __float2half(FP32GeluFwd(tmp)); - } - *reinterpret_cast(y + offset) = in_arr; - } -} - -template -static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x, - const __half* y_g, __half* x_g, - size_t n) { - size_t offset = - static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; - size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; - for (; offset < n; offset += stride) { - using ArrT = phi::AlignedVector<__half, VecSize>; - ArrT x_in_arr = *reinterpret_cast(x + offset); - ArrT y_g_in_arr = *reinterpret_cast(y_g + offset); -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - __half2 tmp_fp16_2; - tmp_fp16_2.x = x_in_arr[i]; - tmp_fp16_2.y = y_g_in_arr[i]; - float2 tmp_fp32_2 = __half22float2(tmp_fp16_2); - x_in_arr[i] = - __float2half(FP32GeluBwd(tmp_fp32_2.x, tmp_fp32_2.y)); - } - *reinterpret_cast(x_g + offset) = x_in_arr; - } -} - -static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( - const platform::CUDADeviceContext& dev_ctx, const __half* x, __half* y, - size_t n) { - auto is_aligned = [](const void* p, size_t alignment) { - return reinterpret_cast(p) % alignment == 0; - }; - -#define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math) \ - do { \ - constexpr auto kAlignment = \ - alignof(phi::AlignedVector<__half, __vec_size>); \ - if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ - is_aligned(y, kAlignment)) { \ - size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ - size_t block = (n / __vec_size + thread - 1) / thread; \ - block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ - VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block \ - << " , thread = " << thread; \ - FP16FastGeluFwdCUDAKernel< \ - __vec_size, \ - __use_fast_math><<>>(x, y, n); \ - return true; \ - } \ - } while (0) - - if (FLAGS_use_fast_math) { - PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, true); - } else { - PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, false); - } - -#undef PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL - 
return false; -} - -static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( - const platform::CUDADeviceContext& dev_ctx, const __half* x, - const __half* y_g, __half* x_g, size_t n) { - auto is_aligned = [](const void* p, size_t alignment) { - return reinterpret_cast(p) % alignment == 0; - }; - -#define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math) \ - do { \ - constexpr auto kAlignment = \ - alignof(phi::AlignedVector<__half, __vec_size>); \ - if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ - is_aligned(x, kAlignment) && is_aligned(y_g, kAlignment) && \ - is_aligned(x_g, kAlignment)) { \ - size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ - size_t block = (n / __vec_size + thread - 1) / thread; \ - block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ - VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block \ - << " , thread = " << thread; \ - FP16FastGeluBwdCUDAKernel< \ - __vec_size, \ - __use_fast_math><<>>(x, y_g, \ - x_g, n); \ - return true; \ - } \ - } while (0) - - if (FLAGS_use_fast_math) { - PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, true); - } else { - PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, false); - } - -#undef PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL - return false; -} -#endif - -template -struct GeluWithApproximateFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x) { - // this function is tanh approximation of gelu - MPType x = static_cast(arg_x); - MPType one = static_cast(1); - MPType half = static_cast(0.5); - MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - auto tanh_out = - tanh(kAlpha * x * (one + static_cast(GELU_CONSTANT) * x * x)); - MPType out = x * half * (one + tanh_out); - return static_cast(out); - } -}; - -template -struct GeluWithoutApproximateFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x) { - // actual gelu with approximation = false - MPType x = static_cast(arg_x); - return static_cast(x * normcdf(x)); - } -}; - -template -class GeluKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Out"); - auto* in = context.Input("X"); - auto approximate = context.Attr("approximate"); - out->mutable_data(in->place()); - - std::vector ins = {in}; - std::vector outs = {out}; - const auto& dev_ctx = - context.template device_context(); - - if (approximate) { -#ifdef __NVCC__ - if (std::is_same::value) { - size_t n = in->numel(); - const auto* in_ptr = reinterpret_cast(in->data()); - auto* out_ptr = reinterpret_cast<__half*>(out->data()); - if (TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(dev_ctx, in_ptr, - out_ptr, n)) { - return; - } - } -#endif - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor()); - } else { - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithoutApproximateFunctor()); - } - } -}; - -template -struct GeluWithApproximateGradFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { - MPType x = static_cast(arg_x); - MPType dout = static_cast(arg_dout); - MPType one = static_cast(1); - MPType half = static_cast(0.5); - MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - MPType kBeta = - kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); - auto cube_x = x * x * x; - auto tanh_out = - tanh(kAlpha * ((static_cast(GELU_CONSTANT) * cube_x) + 
x)); - auto ans = - half * (one + tanh_out + - (one - tanh_out * tanh_out) * (x * kAlpha + kBeta * cube_x)); - return static_cast(ans * dout); - } -}; - -template -struct GeluWithoutApproximateGradFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { - MPType x = static_cast(arg_x); - MPType dout = static_cast(arg_dout); - constexpr MPType kBeta = M_2_SQRTPI * M_SQRT1_2 * static_cast(0.5); - const MPType cdf = normcdf(x); - const MPType pdf = exp(static_cast(-0.5) * x * x) * kBeta; - return static_cast(dout * (cdf + x * pdf)); - } -}; - -template -class GeluGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* dout = - context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - auto approximate = context.Attr("approximate"); - dx->mutable_data(dout->place()); - - std::vector ins = {x, dout}; - std::vector outs = {dx}; - const auto& dev_ctx = - context.template device_context(); - if (approximate) { -#ifdef __NVCC__ - if (std::is_same::value) { - size_t n = x->numel(); - const auto* x_ptr = reinterpret_cast(x->data()); - const auto* y_g_ptr = reinterpret_cast(dout->data()); - auto* x_g_ptr = reinterpret_cast<__half*>(dx->data()); - if (TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(dev_ctx, x_ptr, y_g_ptr, - x_g_ptr, n)) { - return; - } - } -#endif - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithApproximateGradFunctor()); - } else { - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithoutApproximateGradFunctor()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - gelu, ops::GeluKernel, - ops::GeluKernel, - ops::GeluKernel); -REGISTER_OP_CUDA_KERNEL( - gelu_grad, ops::GeluGradKernel, - ops::GeluGradKernel, - ops::GeluGradKernel); diff --git a/paddle/fluid/operators/gelu_op.h b/paddle/fluid/operators/gelu_op.h deleted file mode 100644 index d4fed8a868ff9..0000000000000 --- a/paddle/fluid/operators/gelu_op.h +++ /dev/null @@ -1,233 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
diff --git a/paddle/fluid/operators/gelu_op.h b/paddle/fluid/operators/gelu_op.h
deleted file mode 100644
index d4fed8a868ff9..0000000000000
--- a/paddle/fluid/operators/gelu_op.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#ifndef _USE_MATH_DEFINES
-#define _USE_MATH_DEFINES
-#endif
-#include <algorithm>
-#include <cmath>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-#define GELU_CONSTANT 0.044715
-
-template <typename T>
-struct GeluFunctor {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out, bool approximate) const {
-    if (approximate) {
-      // gelu(x) = 0.5 * x * (1 + tanh(sqrt(2 / \pi) * (x + 0.044715 * x^{3})))
-      if (std::is_same<T, platform::float16>::value) {
-        VLOG(4) << "cast from float16 to float before computing";
-        auto casted_x = x.template cast<float>();
-        auto temp =
-            (static_cast<float>(M_2_SQRTPI * M_SQRT1_2) *
-             (casted_x + static_cast<float>(GELU_CONSTANT) * casted_x.cube()))
-                .tanh();
-        out.device(d) = (casted_x * static_cast<float>(0.5) *
-                         (static_cast<float>(1) + temp))
-                            .template cast<T>();
-      } else {
-        auto temp = (static_cast<T>(M_2_SQRTPI * M_SQRT1_2) *
-                     (x + static_cast<T>(GELU_CONSTANT) * x.cube()))
-                        .tanh();
-        out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
-      }
-    } else {
-#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
-    !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) &&                       \
-    !defined(PADDLE_WITH_HIP)
-      auto x_data = x.data();
-      auto out_data = out.data();
-      int n = std::min(x.size(), out.size());
-
-      std::memset(out_data, 0, n * sizeof(T));
-      phi::funcs::CBlas<T>::AXPY(n, static_cast<T>(M_SQRT1_2), x_data, 1,
-                                 out_data, 1);
-      phi::funcs::CBlas<T>::VMERF(n, out_data, out_data, VML_LA);
-      for (int i = 0; i < n; i++) {
-        out_data[i] += static_cast<T>(1);
-      }
-      phi::funcs::CBlas<T>::VMUL(n, x_data, out_data, out_data);
-      for (int i = 0; i < n; i++) {
-        out_data[i] *= static_cast<T>(0.5);
-      }
-#else
-      // gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
-      if (std::is_same<T, platform::float16>::value) {
-        VLOG(4) << "cast from float16 to float before computing";
-        auto casted_x = x.template cast<float>();
-        auto temp = (casted_x * static_cast<float>(M_SQRT1_2)).erf();
-        out.device(d) = (casted_x * static_cast<float>(0.5) *
-                         (static_cast<float>(1) + temp))
-                            .template cast<T>();
-      } else {
-        auto temp = (x * static_cast<T>(M_SQRT1_2)).erf();
-        out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
-      }
-#endif
-    }
-  }
-};
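The MKL branch above assembles out = 0.5 * x * (1 + erf(x / sqrt(2))) from
vector primitives. A scalar restatement of the same chain (assumes VMERF
applies erf elementwise, matching MKL's v?Erf; standalone sketch, not part of
the patch):

#include <cmath>
#include <cstdio>
#include <vector>

// Step-by-step scalar equivalent of AXPY -> VMERF -> (+1) -> VMUL -> (*0.5).
std::vector<float> gelu_erf(const std::vector<float>& x) {
  std::vector<float> out(x.size(), 0.0f);
  for (size_t i = 0; i < x.size(); ++i) {
    out[i] += static_cast<float>(M_SQRT1_2) * x[i];  // AXPY: out = x / sqrt(2)
    out[i] = std::erf(out[i]);                       // VMERF: out = erf(out)
    out[i] += 1.0f;                                  // out += 1
    out[i] *= x[i];                                  // VMUL: out *= x
    out[i] *= 0.5f;                                  // out *= 0.5
  }
  return out;
}

int main() {
  for (float v : gelu_erf({-1.0f, 0.0f, 1.0f})) std::printf("%.6f\n", v);
  return 0;
}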
-
-template <typename T>
-struct GeluGradFunctor {
-  template <typename Device, typename X, typename dOut, typename dX>
-  void operator()(Device d, X x, dOut dout, dX dx, bool approximate) const {
-    if (approximate) {
-      if (std::is_same<T, platform::float16>::value) {
-        VLOG(4) << "cast from float16 to float before computing";
-        auto casted_x = x.template cast<float>();
-        auto casted_dout = dout.template cast<float>();
-
-        const float kAlpha = static_cast<float>(M_2_SQRTPI * M_SQRT1_2);
-        const float kBeta =
-            kAlpha * static_cast<float>(GELU_CONSTANT) * static_cast<float>(3);
-        const auto y =
-            (kAlpha *
-             ((static_cast<float>(GELU_CONSTANT) * casted_x.cube()) + casted_x))
-                .tanh();
-        dx.device(d) = (static_cast<float>(0.5) * casted_dout *
-                        (static_cast<float>(1) + y +
-                         (casted_x - casted_x * y.square()) *
-                             (kAlpha + kBeta * casted_x.square())))
-                           .template cast<T>();
-      } else {
-        const T kAlpha = static_cast<T>(M_2_SQRTPI * M_SQRT1_2);
-        const T kBeta =
-            kAlpha * static_cast<T>(GELU_CONSTANT) * static_cast<T>(3);
-        const auto y =
-            (kAlpha * ((static_cast<T>(GELU_CONSTANT) * x.cube()) + x)).tanh();
-        dx.device(d) = static_cast<T>(0.5) * dout *
-                       (static_cast<T>(1) + y +
-                        (x - x * y.square()) * (kAlpha + kBeta * x.square()));
-      }
-    } else {
-#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
-    !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) &&                       \
-    !defined(PADDLE_WITH_HIP)
-      auto x_data = x.data();
-      auto dx_data = dx.data();
-      auto dout_data = dout.data();
-      int n = std::min(x.size(), dx.size());
-
-      auto first = static_cast<T*>(std::malloc(n * sizeof(T)));
-      std::memset(first, 0, n * sizeof(T));
-      auto second = static_cast<T*>(std::malloc(n * sizeof(T)));
-      std::memset(second, 0, n * sizeof(T));
-
-      // first = (0.5 * (1 + erf(x / sqrt(2))))
-      phi::funcs::CBlas<T>::AXPY(n, static_cast<T>(M_SQRT1_2), x_data, 1, first,
-                                 1);
-      phi::funcs::CBlas<T>::VMERF(n, first, first, VML_LA);
-      for (int i = 0; i < n; i++) {
-        first[i] += static_cast<T>(1);
-      }
-      phi::funcs::CBlas<T>::SCAL(n, static_cast<T>(0.5), first, 1);
-
-      // second = (0.5 * 2/sqrt(pi) * 1/sqrt(2) * x * exp(-0.5 * x^2))
-      phi::funcs::CBlas<T>::VSQUARE(n, x_data, second);
-      phi::funcs::CBlas<T>::SCAL(n, -static_cast<T>(0.5), second, 1);
-      phi::funcs::CBlas<T>::VEXP(n, second, second);
-      phi::funcs::CBlas<T>::VMUL(n, x_data, second, second);
-      phi::funcs::CBlas<T>::SCAL(
-          n, static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2), second, 1);
-
-      // dx = dout * (first + second);
-      phi::funcs::CBlas<T>::VADD(n, first, second, first);
-      phi::funcs::CBlas<T>::VMUL(n, dout_data, first, dx_data);
-
-      std::free(first);
-      std::free(second);
-#else
-      // gelu_grad(x) = dout * 0.5 * (1 + erf(x / sqrt(2)) + x * sqrt(2 / pi) *
-      //                exp(- x^2 / 2))
-      if (std::is_same<T, platform::float16>::value) {
-        VLOG(4) << "cast from float16 to float before computing";
-        auto casted_x = x.template cast<float>();
-        auto casted_dout = dout.template cast<float>();
-        auto first = static_cast<float>(0.5) *
-                     (static_cast<float>(1) +
-                      ((casted_x * static_cast<float>(M_SQRT1_2)).erf()));
-        auto second = static_cast<float>(0.5 * M_2_SQRTPI * M_SQRT1_2) *
-                      casted_x *
-                      (-static_cast<float>(0.5) * casted_x.square()).exp();
-        dx.device(d) = (casted_dout * (first + second)).template cast<T>();
-      } else {
-        auto first =
-            static_cast<T>(0.5) *
-            (static_cast<T>(1) + ((x * static_cast<T>(M_SQRT1_2)).erf()));
-
-        auto second = static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2) * x *
-                      (-static_cast<T>(0.5) * x.square()).exp();
-        dx.device(d) = dout * (first + second);
-      }
-#endif
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GeluKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto* in = context.Input<framework::Tensor>("X");
-    auto approximate = context.Attr<bool>("approximate");
-    out->mutable_data<T>(in->place());
-
-    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
-    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    GeluFunctor<T> functor;
-    functor(place, eigen_in, eigen_out, approximate);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GeluGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* dout =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* dx = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto approximate = context.Attr<bool>("approximate");
-    dx->mutable_data<T>(dout->place());
-
-    auto eigen_x = framework::EigenVector<T>::Flatten(*x);
-    auto eigen_dout = framework::EigenVector<T>::Flatten(*dout);
-    auto eigen_dx = framework::EigenVector<T>::Flatten(*dx);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    GeluGradFunctor<T> functor;
-    functor(place, eigen_x, eigen_dout, eigen_dx, approximate);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
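The approximate branch's closed-form derivative
0.5 * (1 + y + (x - x*y^2) * (kAlpha + kBeta*x^2)), with y = tanh(kAlpha*(x +
0.044715*x^3)), can be checked the same way (standalone sketch, not part of the
patch):

#include <cmath>
#include <cstdio>

double gelu_tanh(double x) {
  const double kAlpha = M_2_SQRTPI * M_SQRT1_2;  // sqrt(2/pi)
  return 0.5 * x * (1.0 + std::tanh(kAlpha * (x + 0.044715 * x * x * x)));
}

// Derivative as written in GeluGradFunctor's approximate branch.
double gelu_tanh_grad(double x) {
  const double kAlpha = M_2_SQRTPI * M_SQRT1_2;
  const double kBeta = kAlpha * 0.044715 * 3.0;
  const double y = std::tanh(kAlpha * (0.044715 * x * x * x + x));
  return 0.5 * (1.0 + y + (x - x * y * y) * (kAlpha + kBeta * x * x));
}

int main() {
  const double h = 1e-6;
  for (double x : {-2.0, 0.0, 0.7, 2.5}) {
    double fd = (gelu_tanh(x + h) - gelu_tanh(x - h)) / (2.0 * h);
    std::printf("x=%5.2f analytic=%.8f fd=%.8f\n", x, gelu_tanh_grad(x), fd);
  }
  return 0;
}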
diff --git a/paddle/fluid/operators/gelu_op_npu.cc b/paddle/fluid/operators/gelu_op_npu.cc
index 18bbc7f4929c6..c5297dd9cd404 100644
--- a/paddle/fluid/operators/gelu_op_npu.cc
+++ b/paddle/fluid/operators/gelu_op_npu.cc
@@ -15,7 +15,9 @@ limitations under the License. */
 #include <memory>
 #include <string>
 
-#include "paddle/fluid/operators/gelu_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc
index 00ff7ad2166dc..b132b3170756d 100644
--- a/paddle/fluid/operators/gelu_op_npu_test.cc
+++ b/paddle/fluid/operators/gelu_op_npu_test.cc
@@ -24,14 +24,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 
-USE_OP(gelu);
+USE_OP_ITSELF(gelu);
 USE_OP_DEVICE_KERNEL(gelu, NPU);
 
 template <typename T>
diff --git a/paddle/fluid/operators/gelu_op_xpu.cc b/paddle/fluid/operators/gelu_op_xpu.cc
index b8c2e9becf295..559d2448ad945 100644
--- a/paddle/fluid/operators/gelu_op_xpu.cc
+++ b/paddle/fluid/operators/gelu_op_xpu.cc
@@ -14,9 +14,9 @@ limitations under the License. */
 #include <memory>
 #include <string>
-
-#include "paddle/fluid/operators/gelu_op.h"
-
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor.h"
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/graph_send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc
index b759345eda565..f7c006dbcb1a9 100644
--- a/paddle/fluid/operators/graph_send_recv_op.cc
+++ b/paddle/fluid/operators/graph_send_recv_op.cc
@@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/ternary.h"
 
 namespace paddle {
 namespace operators {
@@ -21,59 +24,6 @@ class GraphSendRecvOP : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GraphSendRecv");
-    OP_INOUT_CHECK(ctx->HasInput("Src_index"), "Input", "Src_index",
-                   "GraphSendRecv");
-    OP_INOUT_CHECK(ctx->HasInput("Dst_index"), "Input", "Dst_index",
-                   "GraphSendRecv");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GraphSendRecv");
-
-    auto src_index_dims = ctx->GetInputDim("Src_index");
-    if (src_index_dims.size() == 2) {
-      PADDLE_ENFORCE_EQ(src_index_dims[1], 1,
-                        platform::errors::InvalidArgument(
-                            "The last dim of Src_index should be 1 when it "
-                            "is 2D, but we get %d",
-                            src_index_dims[1]));
-    } else {
-      PADDLE_ENFORCE_EQ(
-          src_index_dims.size(), 1,
-          platform::errors::InvalidArgument(
-              "The Src_index should be 1D, when it is not 2D, but we get %d",
-              src_index_dims.size()));
-    }
-
-    auto dst_index_dims = ctx->GetInputDim("Dst_index");
-    if (dst_index_dims.size() == 2) {
-      PADDLE_ENFORCE_EQ(dst_index_dims[1], 1,
-                        platform::errors::InvalidArgument(
-                            "The last dim of Dst_index should be 1 when it "
-                            "is 2D, but we get %d",
-                            dst_index_dims[1]));
-    } else {
-      PADDLE_ENFORCE_EQ(
-          dst_index_dims.size(), 1,
-          platform::errors::InvalidArgument("The Dst_index should be 1D, "
-                                            "when it is not 2D, but we get %d",
-                                            dst_index_dims.size()));
-    }
-
-    PADDLE_ENFORCE_EQ(
-        src_index_dims[0], dst_index_dims[0],
-        platform::errors::InvalidArgument(
-            "Src_index and Dst_index should have the same shape."));
-
-    auto dims = ctx->GetInputDim("X");
-    ctx->SetOutputDim("Out", dims);
-
-    if (ctx->Attrs().Get<std::string>("pool_type") == "MEAN") {
-      OP_INOUT_CHECK(ctx->HasOutput("Dst_count"), "Output", "Dst_count",
-                     "GraphSendRecv");
-      ctx->SetOutputDim("Dst_count", {dims[0]});
-    }
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -164,10 +114,12 @@ class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-using CPU = paddle::platform::CPUDeviceContext;
+DECLARE_INFER_SHAPE_FUNCTOR(graph_send_recv, GraphSendRecvInferShapeFunctor,
+                            PD_INFER_META(phi::GraphSendRecvInferMeta));
 REGISTER_OPERATOR(graph_send_recv, ops::GraphSendRecvOP,
                   ops::GraphSendRecvOpMaker,
                   ops::GraphSendRecvGradOpMaker<paddle::framework::OpDesc>,
-                  ops::GraphSendRecvGradOpMaker<paddle::imperative::OpBase>);
+                  ops::GraphSendRecvGradOpMaker<paddle::imperative::OpBase>,
+                  GraphSendRecvInferShapeFunctor);
 REGISTER_OPERATOR(graph_send_recv_grad, ops::GraphSendRecvGradOp);
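The pattern here replaces a hand-written InferShape override with a functor
generated from a phi InferMeta function and passed to REGISTER_OPERATOR. A
schematic of the same wiring for a hypothetical unary op (op and class names
are illustrative placeholders, not from this patch; phi::UnaryInferMeta is an
existing phi meta function):

// Schematic only; mirrors the graph_send_recv registration above.
DECLARE_INFER_SHAPE_FUNCTOR(my_op, MyOpInferShapeFunctor,
                            PD_INFER_META(phi::UnaryInferMeta));
REGISTER_OPERATOR(my_op, ops::MyOp, ops::MyOpMaker,
                  MyOpInferShapeFunctor);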
diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc
index 04aa6a3e10f6e..f6d3fd8984691 100644
--- a/paddle/fluid/operators/grid_sampler_op.cc
+++ b/paddle/fluid/operators/grid_sampler_op.cc
@@ -12,12 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/grid_sampler_op.h"
 #include <memory>
 #include <string>
+
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/backward.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
@@ -27,43 +31,6 @@ using Tensor = framework::Tensor;
 class GridSampleOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GridSampler");
-    OP_INOUT_CHECK(ctx->HasInput("Grid"), "Input", "Grid", "GridSampler");
-    OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "GridSampler");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto grid_dims = ctx->GetInputDim("Grid");
-    PADDLE_ENFORCE_EQ(x_dims.size(), 4,
-                      platform::errors::InvalidArgument(
-                          "Input(X) of GridSampleOp should be 4-D Tensor, but "
-                          "received X dimension size(%d)",
-                          x_dims.size()));
-    PADDLE_ENFORCE_EQ(grid_dims.size(), 4,
-                      platform::errors::InvalidArgument(
-                          "Input(Grid) of GridSampleOp should be 4-D Tensor, "
-                          "but received X dimension size(%d)",
-                          grid_dims.size()));
-    if (ctx->IsRuntime() || grid_dims[3] > 0) {
-      PADDLE_ENFORCE_EQ(
-          grid_dims[3], 2,
-          platform::errors::InvalidArgument(
-              "Input(Grid) dimension[3] should be 2, but received %d",
-              grid_dims[3]));
-    }
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(
-          grid_dims[0], x_dims[0],
-          platform::errors::InvalidArgument(
-              "Input(X) and Input(Grid) dimension[0] should be equal, but "
-              "received X dimension[0](%d) != Grid dimension[0](%d)",
-              x_dims[0], grid_dims[0]));
-    }
-
-    ctx->SetOutputDim("Output",
-                      {x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]});
-    ctx->ShareLoD("X", "Output");
-  }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
@@ -173,18 +140,6 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker {
 class GridSampleOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output",
-                   framework::GradVarName("X"), "grid_sampler");
-    auto input_dims = ctx->GetInputDim("X");
-    auto grid_dims = ctx->GetInputDim("Grid");
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"), input_dims);
-    }
-    if (ctx->HasOutput(framework::GradVarName("Grid"))) {
-      ctx->SetOutputDim(framework::GradVarName("Grid"), grid_dims);
-    }
-  }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
@@ -224,19 +179,16 @@ class GridSampleGradMaker : public framework::SingleGradOpMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(grid_sampler, GridSamplerInferShapeFunctor,
+                            PD_INFER_META(phi::GridSampleBaseInferMeta));
 REGISTER_OPERATOR(grid_sampler, ops::GridSampleOp, ops::GridSampleOpMaker,
                   ops::GridSampleGradMaker<paddle::framework::OpDesc>,
-                  ops::GridSampleGradMaker<paddle::imperative::OpBase>);
-REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    grid_sampler,
-    ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    grid_sampler_grad,
-    ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, double>);
+                  ops::GridSampleGradMaker<paddle::imperative::OpBase>,
+                  
GridSamplerInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR(grid_sampler_grad, GridSamplerGradInferShapeFunctor, + PD_INFER_META(phi::GeneralBinaryGradInferMeta)); +REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad, + GridSamplerGradInferShapeFunctor); REGISTER_OP_VERSION(grid_sampler) .AddCheckpoint( diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu deleted file mode 100644 index a227a8e312765..0000000000000 --- a/paddle/fluid/operators/grid_sampler_op.cu +++ /dev/null @@ -1,492 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/grid_sampler_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -static __forceinline__ __device__ bool in_bounds(int h, int w, int H, int W) { - return h >= 0 && h < H && w >= 0 && w < W; -} - -template -static __forceinline__ __device__ void atomic_add(T* data, int h, int w, int sH, - int sW, int H, int W, - T delta) { - if (in_bounds(h, w, H, W)) { - platform::CudaAtomicAdd(data + h * sH + w * sW, delta); - } -} - -template -static __forceinline__ __device__ T _unnormalize(T coord, int size, - bool align_corners) { - if (align_corners) { - return ((coord + 1.f) / 2) * (size - 1); - } else { - return ((coord + 1.f) * size - 1) / 2; - } -} - -template -static __forceinline__ __device__ T clip_indexes(T in, int max_value) { - return min(static_cast(max_value), max(in, static_cast(0))); -} - -template -static __forceinline__ __device__ T reflect_indexes(T in, int twice_low, - int twice_high) { - if (twice_low == twice_high) { - return static_cast(0); - } - T min = static_cast(twice_low) / 2; - T span = static_cast(twice_high - twice_low) / 2; - in = fabs(in - min); - T extra = fmod(in, span); - int flips = static_cast(floor(in / span)); - if (flips % 2 == 0) { - return extra + min; - } else { - return span - extra + min; - } -} - -template -static __forceinline__ __device__ T compute_positions(T coord, int size, - PaddingMode padding_mode, - bool align_corners) { - coord = _unnormalize(coord, size, align_corners); - if (padding_mode == PaddingMode::border) { - coord = clip_indexes(coord, size - 1); - } else if (padding_mode == PaddingMode::reflect) { - if (align_corners) { - coord = reflect_indexes(coord, 0, 2 * (size - 1)); - } else { - coord = reflect_indexes(coord, -1, 2 * size - 1); - } - coord = clip_indexes(coord, size - 1); - } - return coord; -} - -template -static __forceinline__ __device__ T _unnormalize_with_mask(T coord, int size, - bool align_corners, - T* grad_in) { - if (align_corners) { - *grad_in = static_cast(size - 1) / 2; - return ((coord + 1.f) / 2) * (size - 1); - } else { - *grad_in = 
static_cast(size) / 2; - return ((coord + 1.f) * size - 1) / 2; - } -} - -template -static __forceinline__ __device__ T clip_indexes_with_mask(T in, int clip_limit, - T* grad_in) { - if (in <= static_cast(0)) { - *grad_in = static_cast(0); - return static_cast(0); - } else { - T max = static_cast(clip_limit - 1); - if (in >= max) { - *grad_in = static_cast(0); - return max; - } else { - *grad_in = static_cast(1); - return in; - } - } -} - -template -static __forceinline__ __device__ T -reflect_indexes_with_mask(T in, int twice_low, int twice_high, T* grad_in) { - if (twice_low == twice_high) { - *grad_in = static_cast(0); - return static_cast(0); - } - int grad_in_mult_; - T min = static_cast(twice_low) / 2; - T span = static_cast(twice_high - twice_low) / 2; - in = in - min; - if (in < static_cast(0)) { - grad_in_mult_ = -1; - in = -in; - } else { - grad_in_mult_ = 1; - } - T extra = fmod(in, span); - int flips = static_cast(floor(in / span)); - if (flips % 2 == 0) { - *grad_in = static_cast(grad_in_mult_); - return extra + min; - } else { - *grad_in = static_cast(-grad_in_mult_); - return span - extra + min; - } -} - -template -static __forceinline__ __device__ T -compute_positions_with_mask(T coord, int size, PaddingMode padding_mode, - bool align_corners, T* grad_in) { - T grad_clip, grad_refl; - coord = _unnormalize_with_mask(coord, size, align_corners, grad_in); - if (padding_mode == PaddingMode::border) { - coord = clip_indexes_with_mask(coord, size, &grad_clip); - *grad_in = (*grad_in) * grad_clip; - } else if (padding_mode == PaddingMode::reflect) { - if (align_corners) { - coord = reflect_indexes_with_mask(coord, 0, 2 * (size - 1), &grad_refl); - } else { - coord = reflect_indexes_with_mask(coord, -1, 2 * size - 1, &grad_refl); - } - coord = clip_indexes_with_mask(coord, size, &grad_clip); - *grad_in = (*grad_in) * grad_refl * grad_clip; - } - - return coord; -} - -template -__global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c, - int out_h, int out_w, int in_h, - int in_w, const T* input, const T* grid, - T* output, const Mode mode, - const PaddingMode padding_mode, - bool align_corners) { - int inp_sN = out_c * in_h * in_w; - - int inp_sC = in_h * in_w; - int inp_sH = in_w; - int inp_sW = 1; - int grid_sN = out_h * out_w * 2; - int grid_sH = out_w * 2; - int grid_sW = 2; - int grid_sCoor = 1; - int out_sN = out_c * out_h * out_w; - int out_sC = out_h * out_w; - int out_sH = out_w; - int out_sW = 1; - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_w; - const int h = (index / out_w) % out_h; - const int n = index / (out_h * out_w); - const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; - - T ix = grid[grid_offset]; - T iy = grid[grid_offset + grid_sCoor]; - - ix = compute_positions(ix, in_w, padding_mode, align_corners); - iy = compute_positions(iy, in_h, padding_mode, align_corners); - if (mode == Mode::bilinear) { - int ix_nw = static_cast(floor(ix)); - int iy_nw = static_cast(floor(iy)); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - T nw = (ix_se - ix) * (iy_se - iy); - T ne = (ix - ix_sw) * (iy_sw - iy); - T sw = (ix_ne - ix) * (iy - iy_ne); - T se = (ix - ix_nw) * (iy - iy_nw); - - auto inp_offset_NC = n * inp_sN; - - auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; - for (int c = 0; c < out_c; - ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { - *out_ptr_NCHW = static_cast(0); - if (in_bounds(iy_nw, 
ix_nw, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw; - } - if (in_bounds(iy_ne, ix_ne, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne; - } - if (in_bounds(iy_sw, ix_sw, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw; - } - if (in_bounds(iy_se, ix_se, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se; - } - } - } else if (mode == Mode::nearest) { - int ix_nearest = static_cast(std::nearbyint(ix)); - int iy_nearest = static_cast(std::nearbyint(iy)); - auto inp_offset_NC = n * inp_sN; - auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; - for (int c = 0; c < out_c; - ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { - if (in_bounds(iy_nearest, ix_nearest, in_h, in_w)) { - *out_ptr_NCHW = - input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW]; - } else { - *out_ptr_NCHW = static_cast(0); - } - } - } - } -} - -template -class GridSampleOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.cuda_device_context(); - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode_s = ctx.Attr("padding_mode"); - auto mode_s = ctx.Attr("mode"); - PaddingMode padding_mode; - Mode mode; - if (padding_mode_s == "border") { - padding_mode = PaddingMode::border; - } else if (padding_mode_s == "reflection") { - padding_mode = PaddingMode::reflect; - } else { - padding_mode = PaddingMode::zeros; - } - - if (mode_s == "nearest") { - mode = Mode::nearest; - } else { - mode = Mode::bilinear; - } - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h - << "; out_w: " << out_w; - auto* output = ctx.Output("Output"); - auto* output_data = output->mutable_data(ctx.GetPlace()); - VLOG(3) << "out dims: " << output->dims()[0] << "; " << output->dims()[1] - << "; " << output->dims()[2] << "; " << output->dims()[3]; - int count = static_cast(n * out_h * out_w); - auto cu_stream = dev_ctx.stream(); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(dev_ctx, count); - grid_sample_cuda_kernel< - T><<>>( - count, n, c, out_h, out_w, in_h, in_w, input->data(), - grid->data(), output_data, mode, padding_mode, align_corners); - } -}; - -template -__global__ void grid_sampler_cuda_backward_kernel( - const int nthreads, const T* grad_output, const T* input, const T* grid, - int n, int out_c, int out_h, int out_w, int in_h, int in_w, T* grad_input, - T* grad_grid, const Mode mode, const PaddingMode padding_mode, - bool align_corners) { - int inp_sN = out_c * in_h * in_w; - int inp_sC = in_h * in_w; - int inp_sH = in_w; - int inp_sW = 1; - int grid_sN = out_h * out_w * 2; - int grid_sH = out_w * 2; - int grid_sW = 2; - int grid_sCoor = 1; - - int gOut_sN = out_c * out_h * out_w; - int gOut_sC = out_h * out_w; - int gOut_sH = out_w; - int gOut_sW = 1; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_w; - const int h = (index / out_w) % out_h; - const int n = index / (out_h * out_w); - const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; - - T ix = grid[grid_offset]; - T iy = 
grid[grid_offset + grid_sCoor]; - - T gix_mult, giy_mult; - ix = compute_positions_with_mask(ix, in_w, padding_mode, align_corners, - &gix_mult); - iy = compute_positions_with_mask(iy, in_h, padding_mode, align_corners, - &giy_mult); - - if (mode == Mode::bilinear) { - int ix_nw = static_cast(floor(ix)); - int iy_nw = static_cast(floor(iy)); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - T nw = (ix_se - ix) * (iy_se - iy); - T ne = (ix - ix_sw) * (iy_sw - iy); - T sw = (ix_ne - ix) * (iy - iy_ne); - T se = (ix - ix_nw) * (iy - iy_nw); - - T gix = static_cast(0), giy = static_cast(0); - int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; - T* gInp_ptr_NC = grad_input + n * inp_sN; - int inp_offset_NC = n * inp_sN; - for (int c = 0; c < out_c; ++c, inp_offset_NC += inp_sC, - gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) { - T gOut = grad_output[gOut_offset]; - - atomic_add(gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w, - nw * gOut); - atomic_add(gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w, - ne * gOut); - atomic_add(gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w, - sw * gOut); - atomic_add(gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w, - se * gOut); - - if (in_bounds(iy_nw, ix_nw, in_h, in_w)) { - T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW]; - gix -= nw_val * (iy_se - iy) * gOut; - giy -= nw_val * (ix_se - ix) * gOut; - } - if (in_bounds(iy_ne, ix_ne, in_h, in_w)) { - T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW]; - gix += ne_val * (iy_sw - iy) * gOut; - giy -= ne_val * (ix - ix_sw) * gOut; - } - if (in_bounds(iy_sw, ix_sw, in_h, in_w)) { - T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW]; - gix -= sw_val * (iy - iy_ne) * gOut; - giy += sw_val * (ix_ne - ix) * gOut; - } - if (in_bounds(iy_se, ix_se, in_h, in_w)) { - T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW]; - gix += se_val * (iy - iy_nw) * gOut; - giy += se_val * (ix - ix_nw) * gOut; - } - } - - if (grad_grid != nullptr) { - T* gGrid_ptr_NHW = grad_grid + index * grid_sW; - gGrid_ptr_NHW[0] = gix_mult * gix; - gGrid_ptr_NHW[1] = giy_mult * giy; - } - } else if (mode == Mode::nearest) { - int ix_nearest = static_cast(std::nearbyint(ix)); - int iy_nearest = static_cast(std::nearbyint(iy)); - - int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; - T* gInp_ptr_NC = grad_input + n * inp_sN; - for (int c = 0; c < out_c; - ++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) { - atomic_add(gInp_ptr_NC, iy_nearest, ix_nearest, inp_sH, inp_sW, in_h, - in_w, grad_output[gOut_offset]); - } - - if (grad_grid != nullptr) { - T* gGrid_ptr_NHW = grad_grid + index * grid_sW; - gGrid_ptr_NHW[0] = static_cast(0); - gGrid_ptr_NHW[1] = static_cast(0); - } - } - } -} - -template -class GridSampleGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.cuda_device_context(); - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode_s = ctx.Attr("padding_mode"); - auto mode_s = ctx.Attr("mode"); - - PaddingMode padding_mode; - Mode mode; - if (padding_mode_s == "border") { - padding_mode = PaddingMode::border; - } else if (padding_mode_s == "reflection") { - padding_mode = PaddingMode::reflect; - } else { - padding_mode = PaddingMode::zeros; - } - - if (mode_s == "nearest") { - mode = Mode::nearest; - } else { - mode = 
Mode::bilinear; - } - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - auto* output_grad = ctx.Input(framework::GradVarName("Output")); - - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - - auto* input_grad = ctx.Output(framework::GradVarName("X")); - input_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), - input_grad, static_cast(0)); - - T* grid_grad_data = nullptr; - if (ctx.HasOutput(framework::GradVarName("Grid"))) { - auto* grid_grad = ctx.Output(framework::GradVarName("Grid")); - grid_grad_data = grid_grad->mutable_data(ctx.GetPlace()); - } - - int count = static_cast(n * out_h * out_w); - auto cu_stream = dev_ctx.stream(); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(dev_ctx, count); - grid_sampler_cuda_backward_kernel< - T><<>>( - count, output_grad->data(), input->data(), grid->data(), n, c, - out_h, out_w, in_h, in_w, input_grad->data(), grid_grad_data, mode, - padding_mode, align_corners); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(grid_sampler, ops::GridSampleOpCUDAKernel, - ops::GridSampleOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(grid_sampler_grad, - ops::GridSampleGradOpCUDAKernel, - ops::GridSampleGradOpCUDAKernel); diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h deleted file mode 100644 index 93e96694270a4..0000000000000 --- a/paddle/fluid/operators/grid_sampler_op.h +++ /dev/null @@ -1,600 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -enum class Mode { - bilinear, - nearest, -}; - -enum class PaddingMode { zeros, border, reflect }; - -using Tensor = framework::Tensor; -template -using EigenTensor = framework::EigenTensor; - -using Array3 = Eigen::DSizes; -using Array4 = Eigen::DSizes; - -template -static inline bool isInBound(T x, T y, T x_max, T y_max) { - if (x < 0 || x > x_max || y < 0 || y > y_max) { - return false; - } - return true; -} - -template -static inline void unnormalize(const platform::CPUDeviceContext& ctx, - Tensor* grid_slice, - const int max_val, // height-1 or width-1 - bool align_corners) { - auto& place = *ctx.eigen_device(); - auto grid_slice_t = EigenTensor::From(*grid_slice); - - if (!align_corners) { - auto factor = static_cast((max_val + 1) * 0.5); - grid_slice_t.device(place) = - (grid_slice_t + static_cast(1)) * factor - static_cast(0.5); - } else { - auto factor = static_cast(max_val * 0.5); - grid_slice_t.device(place) = (grid_slice_t + static_cast(1)) * factor; - } -} - -template -static inline void clip(const platform::CPUDeviceContext& ctx, - Tensor* grid_slice, - const int max_val, // height-1 or width-1 - bool align_corners, std::string padding_mode) { - auto& place = *ctx.eigen_device(); - auto grid_slice_t = EigenTensor::From(*grid_slice); - if (padding_mode == "border") { - grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - } else if (padding_mode == "reflection") { - if (align_corners) { - auto double_range = static_cast(max_val * 2); - auto grid_abs = grid_slice_t.abs(); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); - if (max_val == 0) { - grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); - } - } else { - auto double_range = static_cast((max_val + 1) * 2); - auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - grid_slice_t.device(place) = - extra.cwiseMin(double_range - extra) - static_cast(0.5); - grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - } - } -} - -template -static inline void clipWithMask(const platform::CPUDeviceContext& ctx, - const int max_val, // height-1 or width-1 - bool align_corners, std::string padding_mode, - Tensor* grid_slice, Tensor* grid_scale) { - auto& place = *ctx.eigen_device(); - grid_scale->mutable_data(grid_slice->dims(), ctx.GetPlace()); - - auto grid_slice_t = EigenTensor::From(*grid_slice); - auto factor = static_cast(max_val * 0.5); - if (!align_corners) { - factor = static_cast((max_val + 1) * 0.5); - } - auto grid_scale_t = EigenTensor::From(*grid_scale).setConstant(factor); - - if (padding_mode == "border") { - // auto bounded_lo = grid_slice_t.cwiseMax(static_cast(0)); - auto res = grid_slice_t.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - - auto in_bound = (res == grid_slice_t); - grid_scale_t.device(place) = grid_scale_t * in_bound.template cast(); - grid_slice_t.device(place) = res; - } else if (padding_mode == "reflection") { - if (align_corners) { - auto double_range = static_cast(max_val * 2); - auto is_neg = (grid_slice_t < 
static_cast(0)); - auto grid_abs = grid_slice_t.abs(); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - auto one_more_flip = (extra > (double_range - extra)); - grid_scale_t.device(place) = - grid_scale_t * ((is_neg == one_more_flip).template cast() - - (is_neg != one_more_flip).template cast()); - grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); - if (max_val == 0) { - grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); - } - } else { - auto double_range = static_cast((max_val + 1) * 2); - auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); - auto is_neg = ((grid_slice_t + static_cast(0.5)) < static_cast(0)); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - auto one_more_flip = (extra > (double_range - extra)); - auto reflected = - extra.cwiseMin(double_range - extra) - static_cast(0.5); - auto clipped = reflected.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - auto in_bound = (clipped == reflected).template cast(); - grid_scale_t.device(place) = - grid_scale_t * ((is_neg == one_more_flip).template cast() - - (is_neg != one_more_flip).template cast()) * - in_bound; - grid_slice_t.device(place) = clipped; - } - } -} - -template -static void calcGridLocations(const platform::CPUDeviceContext& ctx, - const Tensor& grid, const int in_h, - const int in_w, bool align_corners, - std::string padding_mode, Tensor* grid_x, - Tensor* grid_y) { - const int n = grid.dims()[0]; - const int out_h = grid.dims()[1]; - const int out_w = grid.dims()[2]; - - // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim - T* grid_x_data = grid_x->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - T* grid_y_data = grid_y->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - const T* grid_data = grid.data(); - for (int i = 0; i < n * out_h * out_w; i++) { - grid_x_data[i] = grid_data[2 * i]; - grid_y_data[i] = grid_data[(2 * i) + 1]; - } - - unnormalize(ctx, grid_x, in_w - 1, align_corners); - unnormalize(ctx, grid_y, in_h - 1, align_corners); - - clip(ctx, grid_x, in_w - 1, align_corners, padding_mode); - clip(ctx, grid_y, in_h - 1, align_corners, padding_mode); -} - -template -static void calcGridLocationsWithGrad(const platform::CPUDeviceContext& ctx, - const Tensor& grid, const int in_h, - const int in_w, bool align_corners, - std::string padding_mode, Tensor* grid_x, - Tensor* grid_y, Tensor* grid_x_scale, - Tensor* grid_y_scale) { - const int n = grid.dims()[0]; - const int out_h = grid.dims()[1]; - const int out_w = grid.dims()[2]; - - // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim - T* grid_x_data = grid_x->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - T* grid_y_data = grid_y->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - - const T* grid_data = grid.data(); - for (int i = 0; i < n * out_h * out_w; i++) { - grid_x_data[i] = grid_data[2 * i]; - grid_y_data[i] = grid_data[(2 * i) + 1]; - } - - unnormalize(ctx, grid_x, in_w - 1, align_corners); - unnormalize(ctx, grid_y, in_h - 1, align_corners); - - clipWithMask(ctx, in_w - 1, align_corners, padding_mode, grid_x, - grid_x_scale); - clipWithMask(ctx, in_h - 1, align_corners, padding_mode, grid_y, - grid_y_scale); -} - -template -static void getGridPointValue(const Tensor& input, Tensor* output, - const Tensor& x, const Tensor& y) { - const int n = input.dims()[0]; - const int c = input.dims()[1]; - const int in_h = input.dims()[2]; - const int in_w = input.dims()[3]; - const int out_h = x.dims()[1]; - const int out_w = 
x.dims()[2]; - auto x_t = EigenTensor::From(x); - auto y_t = EigenTensor::From(y); - auto output_t = EigenTensor::From(*output).setConstant((T)0); - auto input_t = EigenTensor::From(input); - - for (int i = 0; i < n; i++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), - (T)(in_h - 1))) { - for (int j = 0; j < c; j++) { - output_t(i, j, k, l) = - input_t(i, j, static_cast(round(y_t(i, k, l))), - static_cast(round(x_t(i, k, l)))); - } - } - } - } - } -} - -template -static void allNeigbors(const platform::CPUDeviceContext& ctx, - const Tensor& input, Tensor* grid_x, Tensor* grid_y, - Tensor* x_w, Tensor* x_e, Tensor* y_n, - Tensor* y_s, // positions - Tensor* d_w, Tensor* d_e, Tensor* d_n, - Tensor* d_s, // distance - Tensor* v_wn, Tensor* v_en, Tensor* v_ws, - Tensor* v_es) { // values - auto& place = *ctx.eigen_device(); - - const int c = input.dims()[1]; - const int n = grid_x->dims()[0]; - const int out_h = grid_x->dims()[1]; - const int out_w = grid_x->dims()[2]; - // calculate coords of 4 corner points - x_w->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - x_e->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - y_n->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - y_s->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - auto x_w_t = EigenTensor::From(*x_w); - auto x_e_t = EigenTensor::From(*x_e); - auto y_n_t = EigenTensor::From(*y_n); - auto y_s_t = EigenTensor::From(*y_s); - - auto grid_x_t = EigenTensor::From(*grid_x); - auto grid_y_t = EigenTensor::From(*grid_y); - - x_w_t.device(place) = grid_x_t.floor(); - x_e_t.device(place) = x_w_t + static_cast(1); - y_n_t.device(place) = grid_y_t.floor(); - y_s_t.device(place) = y_n_t + static_cast(1); - - // calculate distances to 4 sides - d_w->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - d_e->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - d_n->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - d_s->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - auto d_w_t = EigenTensor::From(*d_w); - auto d_e_t = EigenTensor::From(*d_e); - auto d_n_t = EigenTensor::From(*d_n); - auto d_s_t = EigenTensor::From(*d_s); - d_w_t.device(place) = grid_x_t - x_w_t; - d_e_t.device(place) = x_e_t - grid_x_t; - d_n_t.device(place) = grid_y_t - y_n_t; - d_s_t.device(place) = y_s_t - grid_y_t; - - // calc 4 corner points value - v_wn->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - v_en->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - v_ws->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - v_es->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - getGridPointValue(input, v_wn, *x_w, *y_n); - getGridPointValue(input, v_en, *x_e, *y_n); - getGridPointValue(input, v_ws, *x_w, *y_s); - getGridPointValue(input, v_es, *x_e, *y_s); -} - -template -static void bilinearInter(const platform::CPUDeviceContext& ctx, - const Tensor& input, Tensor* grid_x, Tensor* grid_y, - Tensor* out) { - auto& place = *ctx.eigen_device(); - const int n = grid_x->dims()[0]; - const int out_h = grid_x->dims()[1]; - const int out_w = grid_x->dims()[2]; - const int c = input.dims()[1]; - - Tensor x_w, x_e, y_n, y_s; - Tensor d_w, d_e, d_n, d_s; - Tensor v_wn, v_en, v_ws, v_es; - - allNeigbors(ctx, input, grid_x, grid_y, &x_w, &x_e, &y_n, &y_s, &d_w, &d_e, - &d_n, &d_s, &v_wn, &v_en, &v_ws, &v_es); - - auto d_w_t = EigenTensor::From(d_w); - auto d_e_t = EigenTensor::From(d_e); - auto d_n_t = EigenTensor::From(d_n); - auto d_s_t = EigenTensor::From(d_s); - - auto d_w_scaled_t = - 
d_w_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto d_e_scaled_t = - d_e_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto d_n_scaled_t = - d_n_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto d_s_scaled_t = - d_s_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto v_wn_t = EigenTensor::From(v_wn); - auto v_en_t = EigenTensor::From(v_en); - auto v_ws_t = EigenTensor::From(v_ws); - auto v_es_t = EigenTensor::From(v_es); - auto output_t = EigenTensor::From(*out); - // bilinear interpolaetion by 4 corner points - output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t + - v_en_t * d_w_scaled_t * d_s_scaled_t + - v_ws_t * d_e_scaled_t * d_n_scaled_t + - v_es_t * d_w_scaled_t * d_n_scaled_t; -} - -template -static void nearestInter(const platform::CPUDeviceContext& ctx, - const Tensor& input, Tensor* grid_x, Tensor* grid_y, - Tensor* out) { - auto& place = *ctx.eigen_device(); - - auto grid_x_t = EigenTensor::From(*grid_x); - auto grid_y_t = EigenTensor::From(*grid_y); - grid_x_t = grid_x_t.round(); - grid_y_t = grid_y_t.round(); - getGridPointValue(input, out, *grid_x, *grid_y); -} - -template -static void gatherOutputGradToInputGrad(const Tensor& output_grad, - Tensor* input_grad, const Tensor& x, - const Tensor& y, const Tensor& d1, - const Tensor& d2) { - const int n = output_grad.dims()[0]; - const int c = output_grad.dims()[1]; - const int out_h = output_grad.dims()[2]; - const int out_w = output_grad.dims()[3]; - const int in_h = input_grad->dims()[2]; - const int in_w = input_grad->dims()[3]; - auto x_t = EigenTensor::From(x); - auto y_t = EigenTensor::From(y); - auto d1_t = EigenTensor::From(d1); - auto d2_t = EigenTensor::From(d2); - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - - for (int i = 0; i < n; i++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), - (T)(in_h - 1))) { - for (int j = 0; j < c; j++) { - input_grad_t(i, j, static_cast(round(y_t(i, k, l))), - static_cast(round(x_t(i, k, l)))) += - output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l); - } - } - } - } - } -} - -template -static void gatherOutputGradToInputGrad(const Tensor& output_grad, - Tensor* input_grad, const Tensor& x, - const Tensor& y) { - const int n = output_grad.dims()[0]; - const int c = output_grad.dims()[1]; - const int out_h = output_grad.dims()[2]; - const int out_w = output_grad.dims()[3]; - const int in_h = input_grad->dims()[2]; - const int in_w = input_grad->dims()[3]; - auto x_t = EigenTensor::From(x); - auto y_t = EigenTensor::From(y); - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - for (int i = 0; i < n; i++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), - (T)(in_h - 1))) { - for (int j = 0; j < c; j++) { - input_grad_t(i, j, static_cast(round(y_t(i, k, l))), - static_cast(round(x_t(i, k, l)))) += - output_grad_t(i, j, k, l); - } - } - } - } - } -} - -template -static void gatherBilinearGrad(const platform::CPUDeviceContext& ctx, - const Tensor& input, const Tensor& output_grad, - Tensor* grid_x, Tensor* grid_y, - Tensor* grid_x_scale, Tensor* grid_y_scale, - Tensor* input_grad, Tensor* grid_grad) { - const int n = grid_x->dims()[0]; - const int out_h = grid_x->dims()[1]; - const int 
out_w = grid_x->dims()[2]; - const int c = input.dims()[1]; - - Tensor x_w, x_e, y_n, y_s; - Tensor d_w, d_e, d_n, d_s; - Tensor v_wn, v_en, v_ws, v_es; - - allNeigbors(ctx, input, - grid_x, // grid_x - grid_y, // grid_y - &x_w, &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s, &v_wn, &v_en, - &v_ws, &v_es); - - // gather output grad value to input grad by corner point coords and weight - gatherOutputGradToInputGrad(output_grad, input_grad, x_w, y_n, d_e, d_s); - gatherOutputGradToInputGrad(output_grad, input_grad, x_w, y_s, d_e, d_n); - gatherOutputGradToInputGrad(output_grad, input_grad, x_e, y_n, d_w, d_s); - gatherOutputGradToInputGrad(output_grad, input_grad, x_e, y_s, d_w, d_n); - - auto v_wn_t = EigenTensor::From(v_wn); - auto v_en_t = EigenTensor::From(v_en); - auto v_ws_t = EigenTensor::From(v_ws); - auto v_es_t = EigenTensor::From(v_es); - - auto d_w_t = EigenTensor::From(d_w); - auto d_e_t = EigenTensor::From(d_e); - auto d_n_t = EigenTensor::From(d_n); - auto d_s_t = EigenTensor::From(d_s); - - auto output_grad_t = EigenTensor::From(output_grad); - - if (grid_grad != nullptr) { - Tensor grid_grad_x, grid_grad_y; - grid_grad_x.mutable_data({n, out_h, out_w}, ctx.GetPlace()); - grid_grad_y.mutable_data({n, out_h, out_w}, ctx.GetPlace()); - auto grid_grad_x_t = - EigenTensor::From(grid_grad_x).setConstant(static_cast(0.0)); - auto grid_grad_y_t = - EigenTensor::From(grid_grad_y).setConstant(static_cast(0.0)); - for (int i = 0; i < n; i++) { - for (int j = 0; j < c; j++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - grid_grad_x_t(i, k, l) += - ((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) + - (v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) * - output_grad_t(i, j, k, l); - grid_grad_y_t(i, k, l) += - ((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) + - (v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) * - output_grad_t(i, j, k, l); - } - } - } - } - - // const T x_max = static_cast(in_w - 1); - // const T y_max = static_cast(in_h - 1); - - auto grid_x_scale_t = EigenTensor::From(*grid_x_scale); - auto grid_y_scale_t = EigenTensor::From(*grid_y_scale); - grid_grad_x_t = grid_grad_x_t * grid_x_scale_t; - grid_grad_y_t = grid_grad_y_t * grid_y_scale_t; - - // gather grid_grad [x, y] in 3rd Dim - T* grid_grad_data = grid_grad->data(); - T* grid_grad_x_data = grid_grad_x.data(); - T* grid_grad_y_data = grid_grad_y.data(); - for (int i = 0; i < n * out_h * out_w; i++) { - grid_grad_data[2 * i] = grid_grad_x_data[i]; - grid_grad_data[2 * i + 1] = grid_grad_y_data[i]; - } - } -} - -template -class GridSampleOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode = ctx.Attr("padding_mode"); - auto mode = ctx.Attr("mode"); - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - - auto* output = ctx.Output("Output"); - output->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), output, - static_cast(0)); - - Tensor grid_x, grid_y; - calcGridLocations( - ctx.template device_context(), *grid, in_h, - in_w, align_corners, padding_mode, &grid_x, &grid_y); - if (mode == "bilinear") { - bilinearInter( - 
ctx.template device_context(), *input, - &grid_x, &grid_y, output); - } else if (mode == "nearest") { - auto grid_x_t = EigenTensor::From(grid_x); - auto grid_y_t = EigenTensor::From(grid_y); - grid_x_t = grid_x_t.round(); - grid_y_t = grid_y_t.round(); - getGridPointValue(*input, output, grid_x, grid_y); - } - } -}; - -template -class GridSampleGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode = ctx.Attr("padding_mode"); - auto mode = ctx.Attr("mode"); - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - auto* output_grad = ctx.Input(framework::GradVarName("Output")); - - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - - auto* input_grad = ctx.Output(framework::GradVarName("X")); - input_grad->mutable_data({n, c, in_h, in_w}, ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), input_grad, - static_cast(0)); - - Tensor* grid_grad = nullptr; - if (ctx.HasOutput(framework::GradVarName("Grid"))) { - grid_grad = ctx.Output(framework::GradVarName("Grid")); - grid_grad->mutable_data({n, out_h, out_w, 2}, ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), grid_grad, - static_cast(0)); - } - - Tensor grid_x, grid_y; - Tensor grid_x_scale, grid_y_scale; - calcGridLocationsWithGrad( - ctx.template device_context(), *grid, in_h, - in_w, align_corners, padding_mode, &grid_x, &grid_y, &grid_x_scale, - &grid_y_scale); - if (mode == "bilinear") { - gatherBilinearGrad(ctx.template device_context(), - *input, *output_grad, &grid_x, &grid_y, - &grid_x_scale, &grid_y_scale, input_grad, - grid_grad); - } else { - auto grid_x_t = EigenTensor::From(grid_x); - auto grid_y_t = EigenTensor::From(grid_y); - grid_x_t = grid_x_t.round(); - grid_y_t = grid_y_t.round(); - gatherOutputGradToInputGrad(*output_grad, input_grad, grid_x, grid_y); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 2d284fb516e62..4331523d26edc 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -167,9 +167,11 @@ class GroupNormGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { // check input + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GroupNormGrad"); OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "GroupNormGrad"); OP_INOUT_CHECK(ctx->HasInput("Variance"), "Input", "Variance", "GroupNormGrad"); + OP_INOUT_CHECK(ctx->HasInput("Mean"), "Input", "Mean", "GroupNormGrad"); OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input", framework::GradVarName("Y"), "GroupNormGrad"); @@ -216,10 +218,12 @@ class GroupNormGradMaker : public framework::SingleGradOpMaker { void Apply(GradOpPtr op) const override { op->SetType("group_norm_grad"); + op->SetInput("X", this->Input("X")); op->SetInput("Scale", this->Input("Scale")); op->SetInput("Bias", this->Input("Bias")); op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); op->SetInput("Y", this->Output("Y")); + op->SetInput("Mean", this->Output("Mean")); op->SetInput("Variance", this->Output("Variance")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); diff 
--git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu
index b376334f1e93c..ab8c50d90b8ec 100644
--- a/paddle/fluid/operators/group_norm_op.cu
+++ b/paddle/fluid/operators/group_norm_op.cu
@@ -81,46 +81,74 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W,
   CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var);
 }
 
-template <typename T, typename AccT, int VecSize>
-__device__ __forceinline__ void ThreadReduce(const T* input, int size,
-                                             const int offset, AccT* mean,
-                                             AccT* var) {
+template <typename T, typename AccT, int VecSize, int Num>
+__device__ __forceinline__ void ThreadReduce(phi::Array<const T*, Num> arrs,
+                                             int size, const int offset,
+                                             AccT* out_mean, AccT* out_var) {
+  const T* x = arrs[0];
+  const T* y;
+  if (Num == 2) {
+    y = arrs[1];
+  }
   using VecT = kps::details::VectorType<T, VecSize>;
   int tid = threadIdx.x;
   if (offset > 0) {
-    input -= offset;
+    x -= offset;
+    if (Num == 2) {
+      y -= offset;
+    }
     size += offset;
     if (tid >= offset) {
-      AccT temp = input[tid];
-      *mean += temp;
-      *var += temp * temp;
+      if (Num == 1) {
+        *out_mean += x[tid];
+        *out_var += x[tid] * x[tid];
+      } else if (Num == 2) {
+        *out_mean += y[tid];
+        *out_var += y[tid] * x[tid];
+      }
     }
     size -= blockDim.x;
-    input += blockDim.x;
+    x += blockDim.x;
+    if (Num == 2) {
+      y += blockDim.x;
+    }
  }
   int remain = size % (VecSize * blockDim.x);
 
-  T ins[VecSize];
-  VecT* ins_vec = reinterpret_cast<VecT*>(&ins);
+  T ins_x[VecSize];
+  T ins_y[VecSize];
+  VecT* ins_vec_x = reinterpret_cast<VecT*>(&ins_x);
+  VecT* ins_vec_y = reinterpret_cast<VecT*>(&ins_y);
 
   // vector part
   for (; VecSize * tid < (size - remain); tid += blockDim.x) {
-    *ins_vec = reinterpret_cast<const VecT*>(input)[tid];
+    *ins_vec_x = reinterpret_cast<const VecT*>(x)[tid];
+    if (Num == 2) {
+      *ins_vec_y = reinterpret_cast<const VecT*>(y)[tid];
+    }
 
 #pragma unroll
     for (int i = 0; i < VecSize; ++i) {
-      AccT temp = ins[i];
-      *mean += temp;
-      *var += temp * temp;
+      if (Num == 1) {
+        *out_mean += ins_x[i];
+        *out_var += ins_x[i] * ins_x[i];
+      } else if (Num == 2) {
+        *out_mean += ins_y[i];
+        *out_var += ins_y[i] * ins_x[i];
+      }
    }
  }

  // scalar part
  tid = size - remain + threadIdx.x;
  for (; tid < size; tid += blockDim.x) {
-    AccT temp = input[tid];
-    *mean += temp;
-    *var += temp * temp;
+    if (Num == 1) {
+      *out_mean += x[tid];
+      *out_var += x[tid] * x[tid];
+    } else if (Num == 2) {
+      *out_mean += y[tid];
+      *out_var += y[tid] * x[tid];
+    }
  }
}

@@ -148,7 +176,10 @@ __global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var,
   AccT x_var = static_cast<AccT>(0);
   const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
   x += i * size;
-  ThreadReduce<T, AccT, VecSize>(x, size, input_offset, &x_mean, &x_var);
+  phi::Array<const T*, 1> ins;
+  ins[0] = x;
+  ThreadReduce<T, AccT, VecSize, 1>(ins, size, input_offset, &x_mean, &x_var);
+
   x_mean = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
       x_mean, kps::AddFunctor<AccT>());
   x_var = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
@@ -310,10 +341,12 @@ class GroupNormKernel
 };
 
 template <typename T>
-__global__ void GroupNormBackwardGetMeanAndVar(
-    const T* x, const T* scale, const T* bias, const T* d_y, int N, int C,
-    int W, int imsize, int groups, int group_size, T epsilon, T* d_mean,
-    T* d_var, T* d_scale, T* d_bias, const DataLayout data_layout) {
+__global__ void GroupNormBackwardGetMeanAndVar(const T* x, const T* scale,
+                                               const T* bias, const T* d_y,
+                                               int N, int C, int W, int imsize,
+                                               int groups, int group_size,
+                                               T epsilon, T* d_mean, T* d_var,
+                                               T* d_scale, T* d_bias) {
   int gid = blockIdx.y;
   int cid = blockIdx.x;
   int bid = blockIdx.z;
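ThreadReduce now serves two reductions: with Num == 1 it accumulates sum(x)
and sum(x*x) for the mean/variance pass; with Num == 2 it accumulates sum(dy)
and sum(dy*x), i.e. the db/ds terms of the backward pass. A scalar reference
of those semantics (standalone sketch, not part of the patch):

#include <cstddef>

// CPU reference for what the two ThreadReduce instantiations accumulate.
// has_dy == false: out_mean = sum(x),  out_var = sum(x*x)   (forward pass)
// has_dy == true:  out_mean = sum(dy), out_var = sum(dy*x)  (db/ds backward)
void thread_reduce_ref(const float* x, const float* dy, size_t n, bool has_dy,
                       float* out_mean, float* out_var) {
  float m = 0.f, v = 0.f;
  for (size_t i = 0; i < n; ++i) {
    if (!has_dy) {
      m += x[i];
      v += x[i] * x[i];
    } else {
      m += dy[i];
      v += dy[i] * x[i];
    }
  }
  *out_mean = m;
  *out_var = v;
}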
DataLayout::kNCHW) { - val = x[(bid * C + ccid) * imsize + imid] - x_bias; - dval = d_y[(bid * C + ccid) * imsize + imid]; - } else { - int hid = imid / W; - int wid = imid % W; - val = x[(bid * H + hid) * W * C + wid * C + ccid] - x_bias; - dval = d_y[(bid * H + hid) * W * C + wid * C + ccid]; - } + + int hid = imid / W; + int wid = imid % W; + val = x[(bid * H + hid) * W * C + wid * C + ccid] - x_bias; + dval = d_y[(bid * H + hid) * W * C + wid * C + ccid]; d_var_data += val * dval; d_mean_data += dval * x_scale; @@ -357,8 +386,7 @@ __global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale, const T* bias, const T* var, const T* d_mean, const T* d_var, int N, int C, int W, int imsize, int groups, int group_size, - T epsilon, T* d_x, - const DataLayout data_layout) { + T epsilon, T* d_x) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; @@ -379,26 +407,142 @@ __global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale, if (x_scale != 0) x_scale_inv = 1.0 / x_scale; for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { - if (data_layout == DataLayout::kNCHW) { - T tmp = x[(bid * C + ccid) * imsize + imid]; - T v_y = (tmp - x_bias) * x_scale_inv; - T dly = d_y[(bid * C + ccid) * imsize + imid]; - d_x[(bid * C + ccid) * imsize + imid] = - x_var_inv * - (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); - } else { - int hid = imid / W; - int wid = imid % W; - T tmp = x[(bid * H + hid) * W * C + wid * C + ccid]; - T v_y = (tmp - x_bias) * x_scale_inv; - T dly = d_y[(bid * H + hid) * W * C + wid * C + ccid]; - d_x[(bid * H + hid) * W * C + wid * C + ccid] = - x_var_inv * - (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); + int hid = imid / W; + int wid = imid % W; + T tmp = x[(bid * H + hid) * W * C + wid * C + ccid]; + T v_y = (tmp - x_bias) * x_scale_inv; + T dly = d_y[(bid * H + hid) * W * C + wid * C + ccid]; + d_x[(bid * H + hid) * W * C + wid * C + ccid] = + x_var_inv * + (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); + } +} + +template +__global__ void VectorizedGetDsDbCUDAKernel(int imsize, const T* x, const T* dy, + T* ds, T* db) { + int i = blockIdx.x; + AccT ds_sum = static_cast(0); + AccT db_sum = static_cast(0); + const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); + x += i * imsize; + + phi::Array ins; + ins[0] = x; + ins[1] = dy; + ThreadReduce(ins, imsize, input_offset, &db_sum, + &ds_sum); + + ds_sum = kps::details::BlockXReduce>( + ds_sum, kps::AddFunctor()); + db_sum = kps::details::BlockXReduce>( + db_sum, kps::AddFunctor()); + __syncthreads(); + if (threadIdx.x == 0) { + ds[i] = ds_sum; + db[i] = db_sum; + } +} + +template +__global__ void ScalarGetDsDbCUDAKernel(int imsize, const T* x, const T* dy, + T* ds, T* db) { + const int nc = blockIdx.x; + T ds_sum = 0; + T db_sum = 0; + for (int i = threadIdx.x; i < imsize; i += blockDim.x) { + const int index = nc * imsize + i; + ds_sum += dy[index] * x[index]; + db_sum += dy[index]; + } + CudaAtomicAddWithWarp(&ds[nc], ds_sum); + CudaAtomicAddWithWarp(&db[nc], db_sum); +} + +template +__global__ void GetScaleBiasGradientCUDAKernel(int N, int C, int group, + T epsilon, const T* mean, + const T* var, const T* ds, + const T* db, T* d_scale, + T* d_bias) { + const int c = blockIdx.x * blockDim.x + threadIdx.x; + if (c < C) { + const int G = group; + const int D = C / G; + T sum1 = 0; + T sum2 = 0; + for (int n = 0; n < N; ++n) { + const int nc = n * C + c; + const int ng = n * G + c / D; + sum1 += 
(d_scale == nullptr) + ? T(0) + : ((ds[nc] - db[nc] * static_cast(mean[ng])) * + static_cast(rsqrt(var[ng] + epsilon))); + sum2 += (d_bias == nullptr) ? T(0) : db[nc]; + } + if (d_scale != nullptr) { + d_scale[c] = sum1; + } + if (d_bias != nullptr) { + d_bias[c] = sum2; } } } +template +__global__ void GetBackwardParamsCUDAKernel(int imsize, int groups, + int group_size, T epsilon, + const T* mean, const T* var, + const T* scale, const T* ds, + const T* db, T* p1, T* p2, T* p3) { + const int n = blockIdx.x; + const int g = blockIdx.y; + const int ng = n * groups + g; + T sum1 = 0; + T sum2 = 0; + T var_inv = rsqrt(var[ng] + epsilon); + for (int64_t i = threadIdx.x; i < group_size; i += blockDim.x) { + const int64_t index = ng * group_size + i; + const int64_t c = g * group_size + i; + const T scale_v = scale == nullptr ? T(1) : static_cast(scale[c]); + sum1 += ds[index] * scale_v; + sum2 += db[index] * scale_v; + const T scale_c = scale == nullptr ? T(0) : static_cast(scale[c]); + p1[index] = scale_c * var_inv; + } + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + sum1 = BlockReduce(ds_storage).Reduce(sum1, cub::Sum()); + sum2 = BlockReduce(db_storage).Reduce(sum2, cub::Sum()); + + if (threadIdx.x == 0) { + const T s = T(1) / static_cast(group_size * imsize); + const T x = (sum2 * static_cast(mean[ng]) - sum1) * + static_cast(var_inv) * static_cast(var_inv) * + static_cast(var_inv) * s; + p2[ng] = x; + p3[ng] = -x * static_cast(mean[ng]) - sum2 * static_cast(var_inv) * s; + } +} + +template +__global__ void GetXGradientCUDAKernel(int imsize, int C, int group_size, + int groups, T* p1, T* p2, T* p3, + const T* x, const T* dy, T* dx) { + int cid = blockIdx.x; + int gid = blockIdx.y; + int bid = blockIdx.z; + int ccid = bid * C + gid * group_size + cid; + int ng = bid * groups + gid; + int nc = gid * group_size + cid; + for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { + int index = (bid * C + nc) * imsize + imid; + dx[index] = p1[ccid] * dy[index] + p2[ng] * x[index] + p3[ng]; + } +} + template class GroupNormGradKernel : public framework::OpKernel { @@ -408,7 +552,9 @@ class GroupNormGradKernel const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); const float epsilon = ctx.Attr("epsilon"); - auto* x = ctx.Input("Y"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* mean = ctx.Input("Mean"); auto* var = ctx.Input("Variance"); auto* scale = ctx.Input("Scale"); auto* bias = ctx.Input("Bias"); @@ -433,31 +579,27 @@ class GroupNormGradKernel phi::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); - Tensor temp_var; - temp_var.mutable_data(var->dims(), ctx.GetPlace()); - set_zero(dev_ctx, &temp_var, static_cast(0)); - T* temp_var_data = temp_var.data(); - - Tensor temp_mean; - temp_mean.mutable_data(var->dims(), ctx.GetPlace()); - set_zero(dev_ctx, &temp_mean, static_cast(0)); - T* temp_mean_data = temp_mean.data(); + Tensor ds, db; + ds.mutable_data({x_dims[0], C}, ctx.GetPlace()); + db.mutable_data({x_dims[0], C}, ctx.GetPlace()); + T* ds_data = ds.data(); + T* db_data = db.data(); + auto* y_data = y->data(); auto* x_data = x->data(); T* d_x_data = nullptr; if (d_x) d_x_data = d_x->data(); - auto* y_data = d_y->data(); + auto* dy_data = d_y->data(); auto* var_data = var->data(); + auto* mean_data = mean->data(); T* d_scale_data = nullptr; if (d_scale) { d_scale->mutable_data(ctx.GetPlace()); - 
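GetScaleBiasGradientCUDAKernel above reduces the per-(n, c) statistics ds and db over the batch to obtain the scale and bias gradients. A plain CPU restatement of the same formulas (same ds/db/mean/var layout assumed; float and the function name are illustrative):

#include <cmath>

void ScaleBiasGrad(int N, int C, int G, float eps, const float* mean,
                   const float* var, const float* ds, const float* db,
                   float* d_scale, float* d_bias) {
  const int D = C / G;  // channels per group
  for (int c = 0; c < C; ++c) {
    float sum1 = 0.0f, sum2 = 0.0f;
    for (int n = 0; n < N; ++n) {
      const int nc = n * C + c;
      const int ng = n * G + c / D;
      // d_scale term: (ds - db * mean) * rsqrt(var + eps), summed over batch
      sum1 += (ds[nc] - db[nc] * mean[ng]) / std::sqrt(var[ng] + eps);
      sum2 += db[nc];  // d_bias term: plain sum of db
    }
    if (d_scale != nullptr) d_scale[c] = sum1;
    if (d_bias != nullptr) d_bias[c] = sum2;
  }
}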
set_zero(dev_ctx, d_scale, static_cast(0)); d_scale_data = d_scale->data(); } T* d_bias_data = nullptr; if (d_bias) { d_bias->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, d_bias, static_cast(0)); d_bias_data = d_bias->data(); } @@ -479,22 +621,103 @@ class GroupNormGradKernel #ifdef __HIPCC__ int block_size = std::max(std::min(256, imsize), 64); + const int block_dims = 256; #else int block_size = std::min(1024, imsize); + const int block_dims = 1024; #endif dim3 grid(group_size, groups, x_dims[0]); dim3 threads(block_size, 1, 1); int flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; - UNROLL_ALL_CASES(flags, GroupNormBackwardGetMeanAndVar, x_data, scale_data, - bias_data, y_data, x_dims[0], C, W, imsize, groups, - group_size, epsilon, temp_mean_data, temp_var_data, - d_scale_data, d_bias_data, data_layout); - if (d_x_data != nullptr) { - UNROLL_ALL_CASES(flags, GroupNormBackward, x_data, y_data, scale_data, - bias_data, var_data, temp_mean_data, temp_var_data, - x_dims[0], C, W, imsize, groups, group_size, epsilon, - d_x_data, data_layout); + if (data_layout == DataLayout::kNCHW) { + using AccT = typename details::MPTypeTrait::Type; + constexpr int vec_size = sizeof(float4) / sizeof(T); + const int max_num_threads = 1024; + int max_block_size = std::min(imsize / vec_size, max_num_threads); + int block_size_nchw = 1; + while (block_size_nchw < max_block_size) { + block_size_nchw *= 2; + } + block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); + dim3 blocks(block_size_nchw); + if (imsize < vec_size) { + if (d_scale) { + set_zero(dev_ctx, d_scale, static_cast(0)); + } + if (d_bias) { + set_zero(dev_ctx, d_bias, static_cast(0)); + } + ScalarGetDsDbCUDAKernel< + T><<>>( + imsize, x_data, dy_data, ds_data, db_data); + } else { + VectorizedGetDsDbCUDAKernel< + T, AccT, vec_size><<>>( + imsize, x_data, dy_data, ds_data, db_data); + } + + if (d_scale || d_bias) { + const int block = 256; + GetScaleBiasGradientCUDAKernel< + T><<<(C + block - 1) / block, block, 0, dev_ctx.stream()>>>( + x_dims[0], C, groups, epsilon, mean_data, var_data, ds_data, + db_data, d_scale_data, d_bias_data); + } + + if (d_x_data != nullptr) { + // p1 * dy + p2 * x + p3, + // p1, p2, p3 represent the reverse calculation of temporary variables + // p1 = scale * var_inv + // p2 = (db * scale * mean - ds * scale) * pow(var_inv, 3) * (1/n) + // p3 = -p2 * mean[ng] - db * scale * var_inv * (1/n); + Tensor p1, p2, p3; + p1.mutable_data({x_dims[0] * C}, ctx.GetPlace()); + p2.mutable_data({x_dims[0], groups}, ctx.GetPlace()); + p3.mutable_data({x_dims[0], groups}, ctx.GetPlace()); + T* p1_data = p1.data(); + T* p2_data = p2.data(); + T* p3_data = p3.data(); + + GetBackwardParamsCUDAKernel<<< + dim3(x_dims[0], groups), block_dims, 0, dev_ctx.stream()>>>( + imsize, groups, group_size, epsilon, mean_data, var_data, + scale_data, ds_data, db_data, p1_data, p2_data, p3_data); + GetXGradientCUDAKernel<<>>( + imsize, C, group_size, groups, p1_data, p2_data, p3_data, x_data, + dy_data, d_x_data); + } + + } else { + if (d_scale) { + set_zero(dev_ctx, d_scale, static_cast(0)); + } + if (d_bias) { + set_zero(dev_ctx, d_bias, static_cast(0)); + } + + Tensor temp_var; + temp_var.mutable_data(var->dims(), ctx.GetPlace()); + set_zero(dev_ctx, &temp_var, static_cast(0)); + T* temp_var_data = temp_var.data(); + + Tensor temp_mean; + temp_mean.mutable_data(var->dims(), ctx.GetPlace()); + set_zero(dev_ctx, &temp_mean, static_cast(0)); + T* temp_mean_data = temp_mean.data(); + + int flags = 
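For reference, the p1/p2/p3 decomposition described in the comment above can be written as one CPU function over a single (batch, group) slice. This is a sketch that assumes a non-null per-channel scale; names and the float specialization are illustrative, not the Paddle API:

#include <cmath>

void GroupNormDxSlice(int hw, int group_size, float eps, float mean,
                      float var, const float* scale, const float* x,
                      const float* dy, float ds_sum, float db_sum,
                      float* dx) {
  // ds_sum / db_sum are the scale-weighted sums of ds / db over the group.
  const float var_inv = 1.0f / std::sqrt(var + eps);
  const float s = 1.0f / static_cast<float>(group_size * hw);
  const float p2 = (db_sum * mean - ds_sum) * var_inv * var_inv * var_inv * s;
  const float p3 = -p2 * mean - db_sum * var_inv * s;
  for (int c = 0; c < group_size; ++c) {
    const float p1 = scale[c] * var_inv;  // per-channel coefficient
    for (int i = 0; i < hw; ++i) {
      dx[c * hw + i] = p1 * dy[c * hw + i] + p2 * x[c * hw + i] + p3;
    }
  }
}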
(scale_data != nullptr) * kHasScale + + (bias_data != nullptr) * kHasBias; + UNROLL_ALL_CASES(flags, GroupNormBackwardGetMeanAndVar, y_data, + scale_data, bias_data, dy_data, x_dims[0], C, W, imsize, + groups, group_size, epsilon, temp_mean_data, + temp_var_data, d_scale_data, d_bias_data); + if (d_x_data != nullptr) { + UNROLL_ALL_CASES(flags, GroupNormBackward, y_data, dy_data, scale_data, + bias_data, var_data, temp_mean_data, temp_var_data, + x_dims[0], C, W, imsize, groups, group_size, epsilon, + d_x_data); + } } } }; diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 9575ab54b32bd..93f0d3d334f27 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/hierarchical_sigmoid_op.h" #include #include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/multiary.h" + namespace paddle { namespace operators { @@ -60,31 +64,6 @@ namespace operators { class HierarchicalSigmoidOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "hsigmoid"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "hsigmoid"); - OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "hsigmoid"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "hsigmoid"); - OP_INOUT_CHECK(ctx->HasOutput("PreOut"), "Output", "PreOut", "hsigmoid"); - - auto with_prefetch = ctx->Attrs().Get("remote_prefetch"); - if (with_prefetch) { - OP_INOUT_CHECK(ctx->HasOutput("W_Out"), "Output", "W_Out", "hsigmoid"); - } - const int64_t input_dims = ctx->GetInputDim("X")[0]; - const int64_t label_dims = ctx->GetInputDim("Label")[0]; - PADDLE_ENFORCE_EQ(input_dims, label_dims, - platform::errors::InvalidArgument( - "The first dimension of " - "input and label is expected to be the same. 
" - "But received input's first dimension is %d; " - "label's first dimension is %d.", - input_dims, label_dims)); - - std::vector output_shape({input_dims, 1}); - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); - ctx->ShareLoD("X", /*->*/ "Out"); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -272,22 +251,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER( } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR( - hierarchical_sigmoid, ops::HierarchicalSigmoidOp, - ops::HierarchicalSigmoidOpMaker, - ops::HierarchicalSigmoidGradMaker, - ops::HierarchicalSigmoidGradMaker); +DECLARE_INFER_SHAPE_FUNCTOR(hierarchical_sigmoid, + HierarchicalSigmoidInferShapeFunctor, + PD_INFER_META(phi::HierarchicalSigmoidInferMeta)); +REGISTER_OPERATOR(hierarchical_sigmoid, ops::HierarchicalSigmoidOp, + ops::HierarchicalSigmoidOpMaker, + ops::HierarchicalSigmoidGradMaker, + ops::HierarchicalSigmoidGradMaker, + HierarchicalSigmoidInferShapeFunctor); REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp, ops::HierarchicalSigmoidGradOpGradVarTypeInference, ops::HierarchicalSigmoidGradOpNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - hierarchical_sigmoid, - ops::HierarchicalSigmoidOpKernel, - ops::HierarchicalSigmoidOpKernel); -REGISTER_OP_CPU_KERNEL( - hierarchical_sigmoid_grad, - ops::HierarchicalSigmoidGradOpKernel, - ops::HierarchicalSigmoidGradOpKernel); diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h deleted file mode 100644 index f11b28cfefb07..0000000000000 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ /dev/null @@ -1,222 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/mixed_vector.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/clip_op.h" -#include "paddle/fluid/operators/math/matrix_bit_code.h" -#include "paddle/fluid/platform/transform.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; -using platform::Transform; -using framework::LoDTensor; - -static std::vector PathToRows(const LoDTensor& path) { - std::set rows; - const int64_t* paths = path.data(); - for (int64_t i = 0; i < path.numel(); ++i) { - int64_t row = paths[i]; - if (row < 0) { - continue; - } - rows.emplace(row); - } - return std::vector(rows.begin(), rows.end()); -} -template -class HierarchicalSigmoidOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& in = GET_DATA_SAFELY(ctx.Input("X"), "Input", "X", - "HierarchicalSigmoid"); - auto& w = GET_DATA_SAFELY(ctx.Input("W"), "Input", "W", - "HierarchicalSigmoid"); - auto* path = ctx.Input("PathTable"); - auto* code = ctx.Input("PathCode"); - auto& label = GET_DATA_SAFELY(ctx.Input("Label"), "Input", - "Label", "HierarchicalSigmoid"); - auto* bias = ctx.Input("Bias"); - auto* out = ctx.Output("Out"); - auto* pre_out = ctx.Output("PreOut"); - size_t num_classes = static_cast(ctx.Attr("num_classes")); - // for remote prefetch - - bool is_custom = false; - if (path) { - is_custom = true; - } - int64_t code_length = - path ? path->dims()[1] : math::FindLastSet(num_classes - 1); - int64_t batch_size = in.dims()[0]; - LoDTensor sum; - auto& dev_ctx = ctx.template device_context(); - auto* pre_out_data = pre_out->mutable_data( - phi::make_ddim({batch_size, code_length}), ctx.GetPlace()); - auto pre_out_mat = EigenMatrix::From(*pre_out); - // Not all class(leaf) nodes' path lengths equal code_length, thus init as - // 0s can avoid out of path's loss. - phi::funcs::SetConstant zero; - zero(dev_ctx, pre_out, static_cast(0.0)); - auto& place = *ctx.template device_context().eigen_device(); - phi::funcs::RowwiseSum row_sum; - - std::unique_ptr> bit_code; - if (!is_custom) { - bit_code.reset(new math::MatrixBitCodeFunctor( - num_classes, label.template data())); - } else { - bit_code.reset(new math::MatrixBitCodeFunctor( - *path, *code, label.template data())); - } - - std::vector sum_dims({batch_size, 1UL}); - sum.mutable_data(phi::make_ddim(sum_dims), ctx.GetPlace()); - auto sum_mat = EigenMatrix::From(sum); - out->mutable_data(ctx.GetPlace()); - auto out_mat = framework::EigenMatrix::From(*out); - if (bias) { - bit_code->Add(*bias, pre_out); - } - bit_code->Mul(pre_out, w, in); - // clip to [-40, 40] - Transform trans; - trans(ctx.template device_context(), pre_out_data, - pre_out_data + pre_out->numel(), pre_out_data, - ClipFunctor(static_cast(-40.0), static_cast(40.0))); - bit_code->Sum(*pre_out, out, static_cast(-1)); - // use softrelu to calculate cross entropy - pre_out_mat.device(place) = (static_cast(1.0) + pre_out_mat.exp()).log(); - row_sum(dev_ctx, *pre_out, &sum); - // TODO(guosheng): Subtract the out of path's loss, since not all - // class(leaf) nodes' path lengths equal code_length. But it won't break the - // gradient check since both have the out of path's loss and will cancel out - // each other. 
- out_mat.device(place) = sum_mat + out_mat; - } -}; - -template -class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& in = GET_DATA_SAFELY(ctx.Input("X"), "Input", "X", - "HierarchicalSigmoidGrad"); - auto& w = GET_DATA_SAFELY(ctx.Input("W"), "Input", "W", - "HierarchicalSigmoidGrad"); - auto* path = ctx.Input("PathTable"); - auto* code = ctx.Input("PathCode"); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - bool is_sparse = ctx.Attr("is_sparse"); - auto& dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; - auto& label = GET_DATA_SAFELY(ctx.Input("Label"), "Input", - "Label", "HierarchicalSigmoidGrad"); - auto& pre_out = GET_DATA_SAFELY(ctx.Input("PreOut"), "Input", - "PreOut", "HierarchicalSigmoidGrad"); - auto& out_grad = GET_DATA_SAFELY( - ctx.Input(framework::GradVarName("Out")), "Input", - framework::GradVarName("Out"), "HierarchicalSigmoidGrad"); - LoDTensor pre_out_grad; - - pre_out_grad.mutable_data(pre_out.dims(), ctx.GetPlace()); - in_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, in_grad, static_cast(0.0)); - - size_t num_classes = static_cast(ctx.Attr("num_classes")); - - bool is_custom = false; - if (path) { - is_custom = true; - } - - std::unique_ptr> bit_code; - if (!is_custom) { - bit_code.reset(new math::MatrixBitCodeFunctor( - num_classes, label.template data())); - } else { - bit_code.reset(new math::MatrixBitCodeFunctor( - *path, *code, label.template data())); - } - - // softrelu derivative - - auto blas = phi::funcs::GetBlas(ctx); - - auto* pre_out_grad_data = pre_out_grad.data(); - auto* pre_out_data = pre_out.template data(); - auto n = pre_out.numel(); - blas.VEXP(n, pre_out_data, pre_out_grad_data); - blas.VINV(n, pre_out_grad_data, pre_out_grad_data); - for (int64_t i = 0; i < n; ++i) { - pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i]; - } - bit_code->Sub(&pre_out_grad); // the gradient of clip(w * x + b) - auto* out_grad_data = out_grad.template data(); - - int64_t dim0 = pre_out_grad.dims()[0]; - int64_t dim1 = pre_out_grad.dims()[1]; - for (int64_t i = 0; i < dim0; ++i) { - T tmp = out_grad_data[i]; - blas.SCAL(dim1, tmp, pre_out_grad_data + i * dim1); - } - // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to - // be consistent with the clipping in forward. 
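The VEXP/VINV/1-x sequence in the deleted backward kernel above relies on an identity of the softrelu values stored in PreOut: if p = log(1 + exp(z)), then 1 - exp(-p) = sigmoid(z), which is exactly dp/dz. A self-contained numeric check of that identity:

#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  for (double z : {-3.0, -0.5, 0.0, 0.5, 3.0}) {
    const double p = std::log1p(std::exp(z));       // softrelu(z), as in PreOut
    const double lhs = 1.0 - std::exp(-p);          // what VEXP, VINV, 1 - x compute
    const double rhs = 1.0 / (1.0 + std::exp(-z));  // sigmoid(z) = d softrelu / dz
    assert(std::fabs(lhs - rhs) < 1e-12);
  }
  std::puts("softrelu derivative identity holds");
  return 0;
}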
- auto* bias_grad = ctx.Output(framework::GradVarName("Bias")); - if (bias_grad) { - bias_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, bias_grad, static_cast(0.0)); - bit_code->AddGrad(pre_out_grad, bias_grad); - } - if (!is_sparse) { - auto* w_grad = ctx.Output(framework::GradVarName("W")); - w_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, w_grad, static_cast(0.0)); - bit_code->MulGradWeight(pre_out_grad, w_grad, in); - } else { - PADDLE_ENFORCE_NOT_NULL(path, - platform::errors::NotFound( - "Custom tree must be set for sparse mode!")); - framework::Vector real_rows = PathToRows(*path); - auto* w_grad = ctx.Output(framework::GradVarName("W")); - w_grad->set_rows(real_rows); - // Build a map of id -> row_index to speed up finding the index of one id - w_grad->set_height(w.dims()[0]); - auto* w_grad_value = w_grad->mutable_value(); - framework::DDim temp_dim(w.dims()); - temp_dim[0] = real_rows.size(); - w_grad_value->mutable_data(temp_dim, ctx.GetPlace()); - zero(dev_ctx, w_grad_value, static_cast(0.0)); - bit_code->MulGradWeight(pre_out_grad, w_grad, in); - } - bit_code->MulGradError(pre_out_grad, w, in_grad); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/histogram_op.cc b/paddle/fluid/operators/histogram_op.cc index 92cc6077defcd..c9fd75651b589 100644 --- a/paddle/fluid/operators/histogram_op.cc +++ b/paddle/fluid/operators/histogram_op.cc @@ -16,7 +16,9 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -28,27 +30,6 @@ class HistogramOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "histogram"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "histogram"); - const auto &nbins = ctx->Attrs().Get("bins"); - const auto &minval = ctx->Attrs().Get("min"); - const auto &maxval = ctx->Attrs().Get("max"); - - PADDLE_ENFORCE_GE(nbins, 1, - platform::errors::InvalidArgument( - "The bins should be greater than or equal to 1." - "But received nbins is %d", - nbins)); - PADDLE_ENFORCE_GE(maxval, minval, platform::errors::InvalidArgument( - "max must be larger or equal to min." - "But received max is %d, min is %d", - maxval, minval)); - - ctx->SetOutputDim("Out", phi::make_ddim({nbins})); - ctx->ShareLoD("X", /*->*/ "Out"); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); @@ -81,7 +62,12 @@ class HistogramOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(histogram, HistogramInferShapeFunctor, + PD_INFER_META(phi::HistogramInferMeta)); + REGISTER_OPERATOR( histogram, ops::HistogramOp, ops::HistogramOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + HistogramInferShapeFunctor); diff --git a/paddle/fluid/operators/increment_op_npu_test.cc b/paddle/fluid/operators/increment_op_npu_test.cc index 09f4e63943ad3..8324a6215bca8 100644 --- a/paddle/fluid/operators/increment_op_npu_test.cc +++ b/paddle/fluid/operators/increment_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. 
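The histogram InferShape deleted above (its job now done by phi::HistogramInferMeta) enforces two preconditions and a fixed 1-D output shape. Restated in plain C++ (illustrative names):

#include <cstdint>
#include <stdexcept>
#include <vector>

std::vector<int64_t> HistogramOutShape(int bins, int minval, int maxval) {
  if (bins < 1) throw std::invalid_argument("bins must be >= 1");
  if (maxval < minval) throw std::invalid_argument("max must be >= min");
  return {bins};  // Out is a 1-D tensor with `bins` buckets
}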
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/index_select_op.cc b/paddle/fluid/operators/index_select_op.cc index fea71edf41313..069cc9416a620 100644 --- a/paddle/fluid/operators/index_select_op.cc +++ b/paddle/fluid/operators/index_select_op.cc @@ -13,8 +13,13 @@ // limitations under the License. #include "paddle/fluid/operators/index_select_op.h" + #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -24,52 +29,6 @@ class IndexSelectOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of IndexSelectOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Input(Index) of IndexSelectOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of IndexSelectOp should not be null.")); - - auto input_dim = ctx->GetInputDim("X"); - auto index_dim = ctx->GetInputDim("Index"); - auto dim = ctx->Attrs().Get("dim"); - - PADDLE_ENFORCE_EQ( - dim < input_dim.size() && dim >= (0 - input_dim.size()), true, - platform::errors::OutOfRange( - "Attr(dim) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(dim) = %d.", - input_dim.size(), input_dim.size() - 1, dim)); - - PADDLE_ENFORCE_EQ( - index_dim.size() == 1 || (index_dim.size() == 2 && index_dim[1] == 1), - true, platform::errors::InvalidArgument( - "The 'shape' of Input(Index) must be 1-D tensor. 
" - "But received: the 'shape' of Input(Index) is [%s], " - "the dimension of Input(Index) is [%d].", - index_dim, index_dim.size())); - - PADDLE_ENFORCE_EQ(index_dim[0] != 0, true, - platform::errors::InvalidArgument( - "The length of Input(Index) can't be 0.")); - - auto output_dim = phi::vectorize(input_dim); - if (dim < 0) { - dim += input_dim.size(); - } - output_dim[dim] = index_dim[0]; - ctx->SetOutputDim("Out", phi::make_ddim(output_dim)); - auto type = ctx->GetInputsVarType("X")[0]; - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -148,20 +107,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSelectGradNoNeedBufferVarsInferer, } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(index_select, IndexSelectInferShapeFunctor, + PD_INFER_META(phi::IndexSelectInferMeta)); REGISTER_OPERATOR(index_select, ops::IndexSelectOp, ops::IndexSelectOpMaker, ops::IndexSelectGradMaker, - ops::IndexSelectGradMaker); + ops::IndexSelectGradMaker, + IndexSelectInferShapeFunctor); REGISTER_OPERATOR(index_select_grad, ops::IndexSelectGradOp, ops::IndexSelectGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - index_select, - ops::IndexSelectKernel, - ops::IndexSelectKernel, - ops::IndexSelectKernel, - ops::IndexSelectKernel); -REGISTER_OP_CPU_KERNEL( - index_select_grad, - ops::IndexSelectGradKernel, - ops::IndexSelectGradKernel, - ops::IndexSelectGradKernel, - ops::IndexSelectGradKernel); diff --git a/paddle/fluid/operators/index_select_op.cu b/paddle/fluid/operators/index_select_op.cu deleted file mode 100644 index f810aee2adea5..0000000000000 --- a/paddle/fluid/operators/index_select_op.cu +++ /dev/null @@ -1,209 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/index_select_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -__global__ void index_select_cuda_kernel(const T* input, T* output, - const IndexT* index, int64_t N, - int64_t stride, int64_t size, - int64_t delta) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - - int64_t pre_idx = idx / (stride * size); - int64_t dim_idx = idx % (stride * size) / stride; - IndexT src_dim_idx = index[dim_idx]; - int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; - output[idx] = input[input_idx]; -} - -template -__global__ void index_select_grad_cuda_kernel(const T* output_grad, - T* input_grad, - const IndexT* index, int64_t nums, - int64_t N, int64_t stride, - int64_t size, int64_t delta) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - - int64_t pre_idx = idx / (stride * size); - int64_t dim_idx = idx % (stride * size) / stride; - IndexT src_dim_idx = index[dim_idx]; - int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; - paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]); -} - -template -__global__ void index_select_grad_init(T* input_grad, int64_t N) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - input_grad[idx] = 0.0; -} - -template -class IndexSelectCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* index = context.Input("Index"); - auto* out = context.Output("Out"); - int dim = context.Attr("dim"); - auto input_dim = in->dims(); - auto output_dim = out->dims(); - dim = dim >= 0 ? 
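The index arithmetic in index_select_cuda_kernel above maps each flat output element to its source element in one expression. A CPU mirror of that mapping (same formula; the function name is illustrative):

#include <cstdint>

// Example: input [2, 5, 3], dim = 1, index = {4, 0} -> output [2, 2, 3];
// then stride = 3, out_dim_size = 2, delta = 5 - 2 = 3, and output element
// 11 (n=1, d=1, k=2) reads input element 17 (n=1, d=0, k=2).
int64_t IndexSelectSrc(int64_t out_idx, int64_t stride, int64_t out_dim_size,
                       int64_t delta,  // in_dim_size - out_dim_size
                       const int64_t* index) {
  const int64_t pre_idx = out_idx / (stride * out_dim_size);
  const int64_t dim_idx = out_idx % (stride * out_dim_size) / stride;
  const int64_t src_dim_idx = index[dim_idx];
  return out_idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride;
}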
dim : dim + input_dim.size(); - auto stride_dim = phi::stride(input_dim); - int64_t stride = stride_dim[dim]; - int64_t size = output_dim[dim]; - int64_t delta = input_dim[dim] - size; - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = out->numel(); - - auto stream = - context.template device_context().stream(); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - index_select_cuda_kernel<<< - (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, index_data, - numel, stride, size, delta); - platform::GpuStreamSync(stream); - } else { - const int* index_data = index->data(); - index_select_cuda_kernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - in_data, out_data, index_data, numel, stride, size, delta); - platform::GpuStreamSync(stream); - } - } -}; - -template -class IndexSelectGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* output_grad = context.Input(framework::GradVarName("Out")); - auto* in_grad = context.Output(framework::GradVarName("X")); - auto* index = context.Input("Index"); - - auto* output_grad_data = output_grad->data(); - auto* in_grad_data = in_grad->mutable_data(context.GetPlace()); - - int dim = context.Attr("dim"); - auto input_dim = in_grad->dims(); - auto output_dim = output_grad->dims(); - dim = dim >= 0 ? 
dim : dim + input_dim.size(); - auto stride_dim = phi::stride(input_dim); - int64_t stride = stride_dim[dim]; - int64_t size = output_dim[dim]; - int64_t delta = input_dim[dim] - size; - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - int64_t numel = in_grad->numel(); - int64_t index_nums = index->numel(); - int64_t out_nums = output_grad->numel(); - - auto stream = - context.template device_context().stream(); - - index_select_grad_init< - T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_grad_data, numel); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - index_select_grad_cuda_kernel<<< - (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data, - index_data, index_nums, - out_nums, stride, size, delta); - platform::GpuStreamSync(stream); - } else { - const int* index_data = index->data(); - index_select_grad_cuda_kernel<<< - (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data, - index_data, index_nums, - out_nums, stride, size, delta); - platform::GpuStreamSync(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - index_select, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - index_select_grad, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel); diff --git a/paddle/fluid/operators/index_select_op.h b/paddle/fluid/operators/index_select_op.h index 04b4f69add785..684829be2697c 100644 --- a/paddle/fluid/operators/index_select_op.h +++ b/paddle/fluid/operators/index_select_op.h @@ -91,41 +91,6 @@ void IndexSelectInner(const framework::ExecutionContext& context, output->Resize(output_dim); } -template -class IndexSelectKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto inputs = *context.Input("X"); - auto* index = context.Input("Index"); - auto* output = context.Output("Out"); - - int dim = context.Attr("dim"); - if (dim < 0) { - dim += inputs.dims().size(); - } - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - 
framework::proto::VarType::INT64))); - - if (index_type == framework::proto::VarType::INT32) { - IndexSelectInner(context, &inputs, *index, output, - dim); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSelectInner(context, &inputs, *index, - output, dim); - } - } -}; - template struct IndexSelectAdd { void operator()(const framework::ExecutionContext& ctx, int slice_size, @@ -197,43 +162,5 @@ void IndexSelectGradInner(const framework::ExecutionContext& context, x_grad->Resize(output_dim); } -template -class IndexSelectGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x_grad = - context.Output(framework::GradVarName("X")); - auto* index = context.Input("Index"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - - int dim = context.Attr("dim"); - if (dim < 0) { - dim += out_grad->dims().size(); - } - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - if (index_type == framework::proto::VarType::INT32) { - IndexSelectGradInner(context, *out_grad, *index, - x_grad, dim); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSelectGradInner(context, *out_grad, - *index, x_grad, dim); - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/index_select_op_npu.cc b/paddle/fluid/operators/index_select_op_npu.cc index bce7a3c1caae3..a232fba7e28d6 100644 --- a/paddle/fluid/operators/index_select_op_npu.cc +++ b/paddle/fluid/operators/index_select_op_npu.cc @@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/index_select_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class IndexSelectNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index 7f5136969980b..77951ff394e74 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -323,6 +323,7 @@ class InplaceABNGradKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; + REGISTER_OPERATOR(inplace_abn, ops::InplaceABNOp, ops::InplaceABNOpMaker, ops::BatchNormOpInferVarType, ops::InplaceABNOpGradMaker, diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index d61eb46d97e98..cd297c53f89a0 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -61,13 +61,13 @@ inline platform::GpuLaunchConfig GetGpuLaunchConfig3D( template __forceinline__ __device__ void PreCalculatorForLinearInterpInputIndex( - int* in_img_idx, int* w_id, T* w1lambda, T* w2lambda, T src_w, - const int in_img_w) { - src_w = (src_w > 0) ? src_w : 0.f; - *in_img_idx = static_cast(src_w); - *w_id = (*in_img_idx < in_img_w - 1) ? 1 : 0; - *w1lambda = src_w - *in_img_idx; - *w2lambda = 1.f - *w1lambda; + int* in_img_idx, int* x_id, T* lambda1, T* lambda2, T src_x, + const int in_img_x) { + src_x = (src_x > 0) ? src_x : 0.f; + *in_img_idx = static_cast(src_x); + *x_id = (*in_img_idx < in_img_x - 1) ? 1 : 0; + *lambda1 = src_x - *in_img_idx; + *lambda2 = 1.f - *lambda1; } struct FastDivModForInterpolate { @@ -670,83 +670,102 @@ __global__ void KeBilinearInterpBwShareMemory( } } +__device__ __forceinline__ int GetInputIndex(const size_t nc, const int height, + const int width, const int h, + const int w) { + return (nc * height + h) * width + w; +} + +template +__global__ void KeBilinearInterpNCHWBw(T* in, const int in_h, const int in_w, + const int out_h, const int out_w, + const int n, const int num_channels, + float ratio_h, float ratio_w, + const T* __restrict__ out, + const T align_type_value) { + int index = threadIdx.x + blockDim.x * blockIdx.x; + int stride = blockDim.x * gridDim.x; + int num_out = n * num_channels * out_h * out_w; + int num_in = n * num_channels * in_h * in_w; + + for (; index < num_out; index += stride) { + int index_tmp = index; + int w2 = index_tmp % out_w; + index_tmp /= out_w; + int h2 = index_tmp % out_h; + int nc = index_tmp / out_h; + + int h1, y_id; + T h1lambda, h0lambda; + T src_y = ratio_h * (h2 + align_type_value) - align_type_value; + + PreCalculatorForLinearInterpInputIndex(&h1, &y_id, &h1lambda, &h0lambda, + src_y, in_h); + int w1, x_id; + T w1lambda, w0lambda; + T src_x = ratio_w * (w2 + align_type_value) - align_type_value; + PreCalculatorForLinearInterpInputIndex(&w1, &x_id, &w1lambda, &w0lambda, + src_x, in_w); + + T d2val = out[index]; + + platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1), + h0lambda * w0lambda * d2val); + platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1 + x_id), + h0lambda * w1lambda * d2val); + platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1), + h1lambda * w0lambda * d2val); + platform::CudaAtomicAdd( + in + GetInputIndex(nc, in_h, in_w, h1 + y_id, 
w1 + x_id), + h1lambda * w1lambda * d2val); + } +} + template __global__ void KeBilinearInterpBw(T* in, const int in_h, const int in_w, const T* __restrict__ out, const int out_h, const int out_w, const int n, - const int num_channels, float ratio_h, - float ratio_w, const T align_type_value, - bool is_nchw) { + const int out_chw, const int num_channels, + float ratio_h, float ratio_w, + const T align_type_value, + FastDivModForInterpolate divmods) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; int in_chw = in_h * in_w * num_channels; - int out_chw = num_channels * out_h * out_w; int nthreads = n * out_chw; - if (is_nchw) { - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / out_chw; - int out_id_w = tid % out_chw; - const int in_img_size = in_h * in_w; - const int out_img_size = out_h * out_w; - T value = out[out_id_h * out_chw + out_id_w]; - - int channel_id = out_id_w / out_img_size; - int out_img_idy = (out_id_w % out_img_size) / out_w; - int out_img_idx = tid % out_w; - int in_img_idx, in_img_idy, w_id, h_id; - T w1lambda, h1lambda, w2lambda, h2lambda; - - T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; - T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; - - PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, - &w2lambda, src_w, in_w); - PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, - &h2lambda, src_h, in_h); - - T* in_pos = &in[out_id_h * in_chw + channel_id * in_img_size + - in_img_idy * in_w + in_img_idx]; - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); - platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * value); - platform::CudaAtomicAdd(&in_pos[h_id * in_w], - h1lambda * w2lambda * value); - platform::CudaAtomicAdd(&in_pos[h_id * in_w + w_id], - h1lambda * w1lambda * value); - } - } else { - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / out_chw; - int out_id_w = tid % out_chw; - const int in_img_size = in_h * in_w; - const int out_img_size = out_h * out_w; - T value = out[out_id_h * out_chw + out_id_w]; - - int out_img_idy = out_id_w / (out_w * num_channels); - int out_img_idx = out_id_w % (out_w * num_channels) / num_channels; - int channel_id = tid % num_channels; - - int in_img_idx, in_img_idy, w_id, h_id; - T w1lambda, h1lambda, w2lambda, h2lambda; - T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; - T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; - - PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, - &w2lambda, src_w, in_w); - PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, - &h2lambda, src_h, in_h); - - T* in_pos = &in[out_id_h * in_chw + in_img_idy * in_w * num_channels + - in_img_idx * num_channels + channel_id]; - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); - platform::CudaAtomicAdd(&in_pos[w_id * num_channels], - h2lambda * w1lambda * value); - platform::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels], - h1lambda * w2lambda * value); - platform::CudaAtomicAdd( - &in_pos[h_id * in_w * num_channels + w_id * num_channels], - h1lambda * w1lambda * value); - } + for (; tid < nthreads; tid += stride) { + auto out_id_divmod = divmods.output_w_div.Divmod(tid); + int out_id_h = out_id_divmod.val[0]; + int out_id_w = out_id_divmod.val[1]; + + int channel_id = divmods.channels_div.Divmod(tid).val[1]; + auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); + int 
out_img_idy = outimg_id_divmod.val[0]; + int out_img_idx = + divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; + + int in_img_idx, in_img_idy, w_id, h_id; + T w1lambda, h1lambda, w2lambda, h2lambda; + T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; + T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; + + PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, + &w2lambda, src_w, in_w); + PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, + &h2lambda, src_h, in_h); + + T value = out[tid]; + T* in_pos = &in[out_id_h * in_chw + in_img_idy * in_w * num_channels + + in_img_idx * num_channels + channel_id]; + platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); + platform::CudaAtomicAdd(&in_pos[w_id * num_channels], + h2lambda * w1lambda * value); + platform::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels], + h1lambda * w2lambda * value); + platform::CudaAtomicAdd( + &in_pos[h_id * in_w * num_channels + w_id * num_channels], + h1lambda * w1lambda * value); } } @@ -1907,11 +1926,23 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, ctx.cuda_device_context().stream()>>>( input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, c, ratio_h, ratio_w, align_type_value, is_nchw); + } else if (!optimize_flag & is_nchw) { + // + const int num_kernels = n * c * out_h * out_w; + const int num_threads = + std::min(ctx.cuda_device_context().GetMaxThreadsPerBlock(), 1024); + KeBilinearInterpNCHWBw< + T><<>>( + input_grad_data, in_h, in_w, out_h, out_w, n, c, ratio_h, ratio_w, + output_grad_data, align_type_value); } else { + int64_t cw = c * out_w; + auto interp_divmods = FastDivModForInterpolate(c, out_chw, cw); KeBilinearInterpBw<<>>( - input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, c, - ratio_h, ratio_w, align_type_value, is_nchw); + input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, + out_chw, c, ratio_h, ratio_w, align_type_value, interp_divmods); } } else if ("bicubic" == interp_method) { #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/isclose_op.cc b/paddle/fluid/operators/isclose_op.cc index 0ae7a9fa02f1f..1c79213757fdf 100644 --- a/paddle/fluid/operators/isclose_op.cc +++ b/paddle/fluid/operators/isclose_op.cc @@ -12,56 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
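The new KeBilinearInterpNCHWBw above scatters each output gradient into the four neighboring input cells with bilinear weights, using atomics. The per-pixel weight computation, without the CUDA plumbing (single channel, illustrative names):

#include <algorithm>

// din: one [in_h, in_w] channel of the input gradient; (src_y, src_x) is the
// back-projected location of the output pixel whose gradient is d2val.
void ScatterBilinearGrad(float* din, int in_h, int in_w, float src_y,
                         float src_x, float d2val) {
  src_y = std::max(src_y, 0.0f);
  src_x = std::max(src_x, 0.0f);
  const int h1 = static_cast<int>(src_y);
  const int w1 = static_cast<int>(src_x);
  const int y_id = h1 < in_h - 1 ? 1 : 0;  // clamp at the bottom/right edge
  const int x_id = w1 < in_w - 1 ? 1 : 0;
  const float h1l = src_y - h1, h0l = 1.0f - h1l;  // lambda pairs
  const float w1l = src_x - w1, w0l = 1.0f - w1l;
  din[h1 * in_w + w1] += h0l * w0l * d2val;
  din[h1 * in_w + w1 + x_id] += h0l * w1l * d2val;
  din[(h1 + y_id) * in_w + w1] += h1l * w0l * d2val;
  din[(h1 + y_id) * in_w + w1 + x_id] += h1l * w1l * d2val;
}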
-#include "paddle/fluid/operators/isclose_op.h" #include #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { -template -struct GetTensorValue { - T operator()(const platform::CPUDeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - return *(tensor.data()); - } -}; - -template -struct IscloseFunctor { - void operator()(const platform::CPUDeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - auto* in_a = in.data(); - auto* in_b = other.data(); - auto* out_data = output->mutable_data(ctx.GetPlace()); - auto num = in.numel(); - // *out_data = true; - for (int i = 0; i < num; i++) { - out_data[i] = true; - } - for (int i = 0; i < num; i++) { - const T a = in_a[i], b = in_b[i]; - bool val; - if (std::isnan(a) || std::isnan(b)) { - val = equal_nan && std::isnan(a) == std::isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - // *out_data &= val; - out_data[i] = val; - } - } -}; - class IscloseOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -100,40 +63,6 @@ class IscloseOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Isclose"); - OP_INOUT_CHECK(ctx->HasInput("Other"), "Input", "Other", "Isclose"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Isclose"); - - auto input_dim = ctx->GetInputDim("Input"); - auto other_dim = ctx->GetInputDim("Other"); - PADDLE_ENFORCE_EQ(input_dim.size(), other_dim.size(), - platform::errors::PreconditionNotMet( - "Input(Input) and Input(Other) must have the same " - "dimension size.")); - int n = input_dim.size(); - bool is_runtime = ctx->IsRuntime(); - for (int i = 0; i < n; i++) { - if (is_runtime) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } else { - if (!(input_dim[i] < 0 || other_dim[i] < 0)) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } - } - } - - ctx->SetOutputDim("Out", input_dim); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -154,12 +83,11 @@ class IscloseOpVarTypeInference : public framework::VarTypeInference { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +DECLARE_INFER_SHAPE_FUNCTOR(isclose, IscloseInferShapeFunctor, + PD_INFER_META(phi::ValueCompareInferMeta)); REGISTER_OPERATOR( isclose, ops::IscloseOp, ops::IscloseOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, - ops::IscloseOpVarTypeInference); 
-REGISTER_OP_CPU_KERNEL(isclose, ops::IscloseKernel, - ops::IscloseKernel); + ops::IscloseOpVarTypeInference, IscloseInferShapeFunctor); diff --git a/paddle/fluid/operators/isclose_op.cu b/paddle/fluid/operators/isclose_op.cu deleted file mode 100644 index 09710ba0c6957..0000000000000 --- a/paddle/fluid/operators/isclose_op.cu +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/isclose_op.h" - -namespace paddle { -namespace operators { - -template -struct GetTensorValue { - T operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - const T* data = tensor.data(); - T value; - const auto gpu_place = dev_ctx.GetPlace(); - memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T), - dev_ctx.stream()); - return value; - } -}; - -template -__global__ void IscloseCUDAKernel(const T* in_data, const T* other_data, - const double rtol, const double atol, - bool equal_nan, int num, bool* out_data) { - unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; - bool val; - for (int i = idx; i < num; i += blockDim.x * gridDim.x) { - const T a = in_data[i], b = other_data[i]; - if (isnan(a) || isnan(b)) { - val = equal_nan && isnan(a) == isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - out_data[i] = val; - // if (!val) *out_data = false; - } -} - -template -struct IscloseFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - int num = in.numel(); - const T* in_data = in.data(); - const T* other_data = other.data(); - bool* out_data = output->mutable_data(dev_ctx.GetPlace()); - int block = 1024; - int grid = (block - 1 + num) / block; - grid = (grid > block) ? block : grid; -#ifdef PADDLE_WITH_HIP - hipMemset(out_data, true, num * sizeof(bool)); -#else - cudaMemset(out_data, true, num * sizeof(bool)); -#endif - IscloseCUDAKernel<<>>( - in_data, other_data, rtol, atol, equal_nan, num, out_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(isclose, ops::IscloseKernel, - ops::IscloseKernel); diff --git a/paddle/fluid/operators/isclose_op.h b/paddle/fluid/operators/isclose_op.h deleted file mode 100644 index cde5d2afbf009..0000000000000 --- a/paddle/fluid/operators/isclose_op.h +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
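Both deleted functors (the CPU one in isclose_op.cc above and the CUDA one here) implement the same predicate: |a - b| <= atol + rtol * |b|, with optional NaN equality. A self-contained restatement:

#include <cmath>

bool IsClose(double a, double b, double rtol, double atol, bool equal_nan) {
  if (std::isnan(a) || std::isnan(b)) {
    return equal_nan && std::isnan(a) && std::isnan(b);
  }
  const double left = std::fabs(a - b);
  const double right = atol + rtol * std::fabs(b);
  // a == b catches equal infinities; the 1e-15 term matches the round-off
  // escape in the deleted functors.
  return a == b || left <= right || std::fabs(left - right) <= 1e-15;
}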
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; - -template -struct GetTensorValue { - T operator()(const platform::DeviceContext& ctx, - const framework::Tensor& tensor) const; -}; - -template -struct IscloseFunctor { - void operator()(const DeviceContext& ctx, const framework::Tensor& in, - const framework::Tensor& other, const float rtol, - const float atol, bool equal_nan, framework::Tensor* output); -}; - -template -class IscloseKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // get attrs - bool equal_nan = ctx.Attr("equal_nan"); - // get input/output - const auto* input = ctx.Input("Input"); - const auto* other = ctx.Input("Other"); - auto* out = ctx.Output("Out"); - - double rtol_v = std::stod(ctx.Attr("rtol")); - double atol_v = std::stod(ctx.Attr("atol")); - - auto& dev_ctx = ctx.template device_context(); - GetTensorValue get_tensor_value; - if (ctx.HasInput("Rtol")) { - const auto* rtol = ctx.Input("Rtol"); - PADDLE_ENFORCE_EQ( - rtol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Rtol) size must be 1, but get %d.", rtol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(rtol->dtype()), - framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Rtol) type must be double, but get %s.", - framework::DataTypeToString( - framework::TransToProtoVarType(rtol->dtype())))); - rtol_v = get_tensor_value(dev_ctx, *rtol); - } - if (ctx.HasInput("Atol")) { - const auto* atol = ctx.Input("Atol"); - PADDLE_ENFORCE_EQ( - atol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Atol) size must be 1, but get %d", atol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(atol->dtype()), - framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Atol) type must be double, but get %s", - framework::DataTypeToString( - framework::TransToProtoVarType(atol->dtype())))); - atol_v = get_tensor_value(dev_ctx, *atol); - } - - IscloseFunctor()(dev_ctx, *input, *other, rtol_v, atol_v, - equal_nan, out); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/kldiv_loss_op.cc b/paddle/fluid/operators/kldiv_loss_op.cc index a78d8ec10149d..67c1942ea0b41 100644 --- a/paddle/fluid/operators/kldiv_loss_op.cc +++ b/paddle/fluid/operators/kldiv_loss_op.cc @@ -9,10 +9,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/kldiv_loss_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -22,44 +23,6 @@ using framework::Tensor; class KLDivLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "KLDivLoss"); - OP_INOUT_CHECK(ctx->HasInput("Target"), "Input", "Target", "KLDivLoss"); - OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss", "KLDivLoss"); - - auto dim_x = ctx->GetInputDim("X"); - auto dim_target = ctx->GetInputDim("Target"); - PADDLE_ENFORCE_EQ(dim_x.size(), dim_target.size(), - platform::errors::InvalidArgument( - "Input(X) rank and Input(Target) rank should be " - "same, but received X rank(%d) != Target rank(%d)", - dim_x.size(), dim_target.size())); - for (int i = 0; i < dim_x.size(); i++) { - if (ctx->IsRuntime() || (dim_x[i] > 0 && dim_target[i] > 0)) { - PADDLE_ENFORCE_EQ( - dim_x[i], dim_target[i], - platform::errors::InvalidArgument( - "Input(X) and Input(Target) should in same shape. but received " - "X dimension[%d](%d) != Target dimension[%d](%d)", - i, dim_x[i], i, dim_target[i])); - } - } - - auto reduction = ctx->Attrs().Get("reduction"); - - auto reduction_valid = "mean" == reduction || "sum" == reduction || - "batchmean" == reduction || "none" == reduction; - PADDLE_ENFORCE_EQ( - reduction_valid, true, - platform::errors::InvalidArgument( - "Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'.")); - - if ("none" == reduction) { - ctx->SetOutputDim("Loss", dim_x); - } else { - ctx->SetOutputDim("Loss", {1}); - } - } protected: framework::OpKernelType GetExpectedKernelType( @@ -172,15 +135,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(KLDivLossGradNoNeedBufferVarInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(kldiv_loss, KLDivInferShapeFunctor, + PD_INFER_META(phi::KLDivInferMeta)); + REGISTER_OPERATOR(kldiv_loss, ops::KLDivLossOp, ops::KLDivLossOpMaker, ops::KLDivLossOpGradMaker, - ops::KLDivLossOpGradMaker); + ops::KLDivLossOpGradMaker, + KLDivInferShapeFunctor); REGISTER_OPERATOR(kldiv_loss_grad, ops::KLDivLossOpGrad, ops::KLDivLossGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - kldiv_loss, ops::KLDivLossKernel, - ops::KLDivLossKernel); -REGISTER_OP_CPU_KERNEL( - kldiv_loss_grad, - ops::KLDivLossGradKernel, - ops::KLDivLossGradKernel); diff --git a/paddle/fluid/operators/kldiv_loss_op.cu b/paddle/fluid/operators/kldiv_loss_op.cu deleted file mode 100644 index 5226cb8c08e3d..0000000000000 --- a/paddle/fluid/operators/kldiv_loss_op.cu +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include "paddle/fluid/operators/kldiv_loss_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - kldiv_loss, - ops::KLDivLossKernel, - ops::KLDivLossKernel); -REGISTER_OP_CUDA_KERNEL( - kldiv_loss_grad, - ops::KLDivLossGradKernel, - ops::KLDivLossGradKernel); diff --git a/paddle/fluid/operators/kldiv_loss_op.h b/paddle/fluid/operators/kldiv_loss_op.h deleted file mode 100644 index 5a6ef06f5eb1e..0000000000000 --- a/paddle/fluid/operators/kldiv_loss_op.h +++ /dev/null @@ -1,119 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using Array1 = Eigen::DSizes; - -template -struct KLDivLossForward { - HOSTDEVICE KLDivLossForward() {} - - HOSTDEVICE T operator()(const T& target, const T& input) const { - if (target <= 0) { - return 0; - } else { - return target * (std::log(target) - input); - } - } -}; - -template -struct KLDivLossBackward { - HOSTDEVICE KLDivLossBackward() {} - - HOSTDEVICE T operator()(const T& target, const T& grad) const { - if (target <= 0) { - return 0; - } else { - return static_cast(-1.) 
* grad; - } - } -}; - -template -class KLDivLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& place = *ctx.template device_context().eigen_device(); - auto* input = ctx.Input("X"); - auto* target = ctx.Input("Target"); - auto* loss = ctx.Output("Loss"); - auto reduction = ctx.Attr("reduction"); - - const int n = input->dims()[0]; - - loss->mutable_data(ctx.GetPlace()); - auto input_t = framework::EigenVector::Flatten(*input); - auto target_t = framework::EigenVector::Flatten(*target); - auto loss_t = framework::EigenVector::Flatten(*loss); - auto output = target_t.binaryExpr(input_t, KLDivLossForward()); - if ("none" == reduction) { - loss_t.device(place) = output; - } else if ("batchmean" == reduction) { - auto output_sum = output.sum(); - if (n > 0) { - loss_t.device(place) = output_sum / output_sum.constant(n); - } else { - loss_t.device(place) = output_sum; - } - } else if ("mean" == reduction) { - loss_t.device(place) = output.mean(); - } else if ("sum" == reduction) { - loss_t.device(place) = output.sum(); - } - } -}; - -template -class KLDivLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& place = *ctx.template device_context().eigen_device(); - auto* target = ctx.Input("Target"); - auto reduction = ctx.Attr("reduction"); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* loss_grad = ctx.Input(framework::GradVarName("Loss")); - - const int n = input_grad->dims()[0]; - const int numel = input_grad->numel(); - const int expand = numel / loss_grad->numel(); - - input_grad->mutable_data(ctx.GetPlace()); - - auto target_t = framework::EigenVector::Flatten(*target); - - auto input_grad_t = framework::EigenVector::Flatten(*input_grad); - auto loss_grad_t = framework::EigenVector::Flatten(*loss_grad); - - auto loss_grad_expand = loss_grad_t.broadcast(Array1(expand)); - auto grad_t = target_t * loss_grad_expand; - input_grad_t.device(place) = - target_t.binaryExpr(grad_t, KLDivLossBackward()); - - if ("mean" == reduction) { - input_grad_t.device(place) = input_grad_t / static_cast(numel); - } else if ("batchmean" == reduction) { - input_grad_t.device(place) = input_grad_t / static_cast(n); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/kldiv_loss_op_npu.cc b/paddle/fluid/operators/kldiv_loss_op_npu.cc index 322ae5df4cb87..eac181489aa9d 100644 --- a/paddle/fluid/operators/kldiv_loss_op_npu.cc +++ b/paddle/fluid/operators/kldiv_loss_op_npu.cc @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the Licnse. */ -#include "paddle/fluid/operators/kldiv_loss_op.h" +#include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/kron_op.cc b/paddle/fluid/operators/kron_op.cc index 58d51ab1c723f..60390016d66e3 100644 --- a/paddle/fluid/operators/kron_op.cc +++ b/paddle/fluid/operators/kron_op.cc @@ -17,9 +17,9 @@ limitations under the License. 
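// ---- Reviewer note (not part of the diff) ----------------------------
// Per-element math of the KLDivLossForward/KLDivLossBackward functors
// deleted with kldiv_loss_op.h above ("input" is a log-probability, as
// the op expects). A host-side sketch; names mirror the functors.
#include <cmath>

double KLDivForward(double target, double input) {
  return target <= 0 ? 0.0 : target * (std::log(target) - input);
}

// In the deleted kernel the second argument arrives pre-multiplied by
// target (grad_t = target * dLoss), so backward only flips its sign;
// the mean/batchmean reductions then divide by numel or batch size.
double KLDivBackward(double target, double grad) {
  return target <= 0 ? 0.0 : -grad;
}
// -----------------------------------------------------------------------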
*/ #include #include -#include "paddle/fluid/operators/kron_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -28,27 +28,6 @@ class KronOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "kron"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "kron"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "kron"); - - auto dim_x = ctx->GetInputDim("X"); - auto dim_y = ctx->GetInputDim("Y"); - auto rank_x = dim_x.size(); - auto rank_y = dim_y.size(); - auto rank = (rank_x > rank_y) ? rank_x : rank_y; - - std::vector dim_out; - dim_out.reserve(rank); - for (int i = 0; i < rank; i++) { - int64_t dim_xi = (i < rank - rank_x) ? 1 : dim_x.at(i - (rank - rank_x)); - int64_t dim_yi = (i < rank - rank_y) ? 1 : dim_y.at(i - (rank - rank_y)); - dim_out.push_back(dim_xi == -1 || dim_yi == -1 ? -1 : dim_xi * dim_yi); - } - ctx->SetOutputDim("Out", phi::make_ddim(dim_out)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -175,30 +154,10 @@ class KronGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(kron, KronInferShapeFunctor, + PD_INFER_META(phi::KronInferMeta)); REGISTER_OPERATOR(kron, ops::KronOp, ops::KronOpMaker, ops::KronGradOpMaker, - ops::KronGradOpMaker); -REGISTER_OP_CPU_KERNEL( - kron, ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel>, - ops::KronKernel>); - + ops::KronGradOpMaker, + KronInferShapeFunctor); REGISTER_OPERATOR(kron_grad, ops::KronGradOp); -REGISTER_OP_CPU_KERNEL( - kron_grad, ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel>, - ops::KronGradKernel>); diff --git a/paddle/fluid/operators/kron_op.cu b/paddle/fluid/operators/kron_op.cu deleted file mode 100644 index e5124e6500750..0000000000000 --- a/paddle/fluid/operators/kron_op.cu +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
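// ---- Reviewer note (not part of the diff) ----------------------------
// The output-shape rule that moves from the removed KronOp::InferShape
// into phi::KronInferMeta, as a standalone sketch: shapes are
// right-aligned, missing leading dims count as 1, and a -1 (unknown)
// dim propagates to the output.
#include <cstdint>
#include <vector>

std::vector<int64_t> KronOutShape(const std::vector<int64_t>& dim_x,
                                  const std::vector<int64_t>& dim_y) {
  const int rank_x = static_cast<int>(dim_x.size());
  const int rank_y = static_cast<int>(dim_y.size());
  const int rank = rank_x > rank_y ? rank_x : rank_y;
  std::vector<int64_t> dim_out;
  dim_out.reserve(rank);
  for (int i = 0; i < rank; ++i) {
    int64_t dim_xi = i < rank - rank_x ? 1 : dim_x[i - (rank - rank_x)];
    int64_t dim_yi = i < rank - rank_y ? 1 : dim_y[i - (rank - rank_y)];
    dim_out.push_back(dim_xi == -1 || dim_yi == -1 ? -1 : dim_xi * dim_yi);
  }
  return dim_out;
}
// -----------------------------------------------------------------------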
*/ - -#include "paddle/fluid/operators/kron_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - kron, ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel>, - ops::KronKernel>); - -REGISTER_OP_CUDA_KERNEL( - kron_grad, ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel>, - ops::KronGradKernel>); diff --git a/paddle/fluid/operators/kron_op.h b/paddle/fluid/operators/kron_op.h deleted file mode 100644 index 274b47c03a4d3..0000000000000 --- a/paddle/fluid/operators/kron_op.h +++ /dev/null @@ -1,415 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" -#if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "thrust/device_vector.h" -#endif - -namespace paddle { -namespace operators { - -// Process an element in the output, used with a parallel-for -template -struct KronElemFunctor { - KronElemFunctor(const T* a, const T* b, T* out, const int64_t* shape_b, - const int64_t* stride_a, const int64_t* stride_b, - const int64_t* stride_out, int ndims) - : a_(a), - b_(b), - out_(out), - shape_b_(shape_b), - stride_a_(stride_a), - stride_b_(stride_b), - stride_out_(stride_out), - ndims_(ndims) {} - - HOSTDEVICE void operator()(int64_t idx) const { - // it computes 1 element in the output - int64_t index = idx; - int64_t index_a = 0; - int64_t index_b = 0; - for (int i = 0; i < ndims_; i++) { - auto pos_i = index / stride_out_[i]; - index = index % stride_out_[i]; - auto pos_ai = pos_i / shape_b_[i]; - auto pos_bi = pos_i % shape_b_[i]; - index_a += stride_a_[i] * pos_ai; - index_b += stride_b_[i] * pos_bi; - } - out_[idx] = a_[index_a] * b_[index_b]; - } - - private: - const T* a_; - const T* b_; - T* out_; - const int64_t* shape_b_; - const int64_t* stride_a_; - const int64_t* stride_b_; - const int64_t* stride_out_; - const int ndims_; -}; - -template -struct KronOpFunctor { - void operator()(const DeviceContext& dev_ctx, const framework::Tensor& x, - const framework::Tensor& y, framework::Tensor* out) { - int ndims = out->dims().size(); - int64_t numel = out->numel(); - - const framework::DDim& dim_x = x.dims(); - const framework::DDim& dim_y = y.dims(); - const framework::DDim& dim_out = out->dims(); - const framework::DDim stride_x = phi::stride(dim_x); - const framework::DDim stride_y = phi::stride(dim_y); - const framework::DDim stride_out = phi::stride(dim_out); - - const int64_t *p_stride_x = nullptr, *p_stride_y = nullptr, - *p_stride_out = nullptr, *p_shape_y = nullptr; -#if defined(__NVCC__) || defined(__HIPCC__) - thrust::device_vector d_stride_x(ndims); - 
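// ---- Reviewer note (not part of the diff) ----------------------------
// The index arithmetic inside the deleted KronElemFunctor, as a
// host-side sketch: each flat output index is peeled dim by dim through
// the output strides, and the per-dim position splits into an X
// coordinate (quotient by Y's extent) and a Y coordinate (remainder).
#include <cstdint>

void KronSourceIndices(int64_t idx, int ndims, const int64_t* shape_b,
                       const int64_t* stride_a, const int64_t* stride_b,
                       const int64_t* stride_out, int64_t* index_a,
                       int64_t* index_b) {
  int64_t index = idx;
  *index_a = 0;
  *index_b = 0;
  for (int i = 0; i < ndims; ++i) {
    const int64_t pos_i = index / stride_out[i];  // coordinate along dim i
    index %= stride_out[i];
    *index_a += stride_a[i] * (pos_i / shape_b[i]);  // block index in X
    *index_b += stride_b[i] * (pos_i % shape_b[i]);  // offset inside block
  }
  // The functor then writes out[idx] = a[index_a] * b[index_b].
}
// -----------------------------------------------------------------------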
thrust::device_vector d_stride_y(ndims); - thrust::device_vector d_stride_out(ndims); - thrust::device_vector d_shape_y(ndims); - thrust::copy(stride_x.Get(), stride_x.Get() + ndims, d_stride_x.begin()); - thrust::copy(stride_y.Get(), stride_y.Get() + ndims, d_stride_y.begin()); - thrust::copy(stride_out.Get(), stride_out.Get() + ndims, - d_stride_out.begin()); - thrust::copy(dim_y.Get(), dim_y.Get() + ndims, d_shape_y.begin()); - - p_stride_x = thrust::raw_pointer_cast(d_stride_x.data()); - p_stride_y = thrust::raw_pointer_cast(d_stride_y.data()); - p_stride_out = thrust::raw_pointer_cast(d_stride_out.data()); - p_shape_y = thrust::raw_pointer_cast(d_shape_y.data()); -#else - p_stride_x = stride_x.Get(); - p_stride_y = stride_y.Get(); - p_stride_out = stride_out.Get(); - p_shape_y = dim_y.Get(); -#endif - - platform::ForRange for_range(dev_ctx, numel); - KronElemFunctor functor(x.data(), y.data(), out->data(), - p_shape_y, p_stride_x, p_stride_y, p_stride_out, - ndims); - for_range(functor); - } -}; - -template -struct KronGradElemFunctor { - KronGradElemFunctor(const T* dout, const T* A, const T* B, T* dout_a, - T* dout_b, const int64_t* stride_dout, - const int64_t* stride_a, const int64_t* stride_b, - const int64_t* shape_b, const int64_t numel_a, - const int64_t numel_b, const int ndims) - : dout_(dout), - A_(A), - B_(B), - dout_a_(dout_a), - dout_b_(dout_b), - stride_dout_(stride_dout), - stride_a_(stride_a), - stride_b_(stride_b), - shape_b_(shape_b), - numel_a_(numel_a), - numel_b_(numel_b), - ndims_(ndims) {} - - HOSTDEVICE void operator()(int64_t idx) { - int64_t index = idx; - int64_t index_a = 0; - int64_t index_b = 0; - for (int i = 0; i < ndims_; i++) { - auto pos_i = index / stride_dout_[i]; - index = index % stride_dout_[i]; - auto pos_ai = pos_i / shape_b_[i]; - auto pos_bi = pos_i % shape_b_[i]; - index_a += stride_a_[i] * pos_ai; - index_b += stride_b_[i] * pos_bi; - } - - if (dout_a_) { - size_t index_out_a = index_a * numel_b_ + index_b; - dout_a_[index_out_a] = dout_[idx] * B_[index_b]; - } - if (dout_b_) { - size_t index_out_b = index_b * numel_a_ + index_a; - dout_b_[index_out_b] = dout_[idx] * A_[index_a]; - } - } - - private: - const T* dout_; - const T* A_; - const T* B_; - T* dout_a_; - T* dout_b_; - const int64_t* stride_dout_; - const int64_t* stride_a_; - const int64_t* stride_b_; - const int64_t* shape_b_; - const int64_t numel_a_; - const int64_t numel_b_; - const int ndims_; -}; - -template -struct KronGradElemFunctor> { - KronGradElemFunctor(const platform::complex* dout, - const platform::complex* A, - const platform::complex* B, - platform::complex* dout_a, - platform::complex* dout_b, const int64_t* stride_dout, - const int64_t* stride_a, const int64_t* stride_b, - const int64_t* shape_b, const int64_t numel_a, - const int64_t numel_b, const int ndims) - : dout_(dout), - A_(A), - B_(B), - dout_a_(dout_a), - dout_b_(dout_b), - stride_dout_(stride_dout), - stride_a_(stride_a), - stride_b_(stride_b), - shape_b_(shape_b), - numel_a_(numel_a), - numel_b_(numel_b), - ndims_(ndims) {} - - HOSTDEVICE void operator()(int64_t idx) { - int64_t index = idx; - int64_t index_a = 0; - int64_t index_b = 0; - for (int i = 0; i < ndims_; i++) { - auto pos_i = index / stride_dout_[i]; - index = index % stride_dout_[i]; - auto pos_ai = pos_i / shape_b_[i]; - auto pos_bi = pos_i % shape_b_[i]; - index_a += stride_a_[i] * pos_ai; - index_b += stride_b_[i] * pos_bi; - } - - if (dout_a_) { - size_t index_out_a = index_a * numel_b_ + index_b; - dout_a_[index_out_a] = - 
dout_[idx] * - platform::complex(B_[index_b].real, -B_[index_b].imag); - } - if (dout_b_) { - size_t index_out_b = index_b * numel_a_ + index_a; - dout_b_[index_out_b] = - dout_[idx] * - platform::complex(A_[index_a].real, -A_[index_a].imag); - } - } - - private: - const platform::complex* dout_; - const platform::complex* A_; - const platform::complex* B_; - platform::complex* dout_a_; - platform::complex* dout_b_; - const int64_t* stride_dout_; - const int64_t* stride_a_; - const int64_t* stride_b_; - const int64_t* shape_b_; - const int64_t numel_a_; - const int64_t numel_b_; - const int ndims_; -}; - -template -struct KronGradOpFunctor { - void operator()(const DeviceContext& dev_ctx, const framework::Tensor& dout, - const framework::Tensor& x, const framework::Tensor& y, - framework::Tensor* dx, framework::Tensor* dy) { - int ndims = dout.dims().size(); - int64_t numel = dout.numel(); - int64_t numel_x = x.numel(); - int64_t numel_y = y.numel(); - - const framework::DDim& dim_x = x.dims(); - const framework::DDim& dim_y = y.dims(); - const framework::DDim& dim_dout = dout.dims(); - - const framework::DDim stride_x = phi::stride(dim_x); - const framework::DDim stride_y = phi::stride(dim_y); - const framework::DDim stride_dout = phi::stride(dim_dout); - - const int64_t* p_stride_x = nullptr; - const int64_t* p_stride_y = nullptr; - const int64_t* p_stride_dout = nullptr; - const int64_t* p_shape_y = nullptr; -#if defined(__NVCC__) || defined(__HIPCC__) - thrust::device_vector d_stride_x(ndims); - thrust::device_vector d_stride_y(ndims); - thrust::device_vector d_stride_dout(ndims); - thrust::device_vector d_shape_y(ndims); - thrust::copy(stride_x.Get(), stride_x.Get() + ndims, d_stride_x.begin()); - thrust::copy(stride_y.Get(), stride_y.Get() + ndims, d_stride_y.begin()); - thrust::copy(stride_dout.Get(), stride_dout.Get() + ndims, - d_stride_dout.begin()); - thrust::copy(dim_y.Get(), dim_y.Get() + ndims, d_shape_y.begin()); - - p_stride_x = thrust::raw_pointer_cast(d_stride_x.data()); - p_stride_y = thrust::raw_pointer_cast(d_stride_y.data()); - p_stride_dout = thrust::raw_pointer_cast(d_stride_dout.data()); - p_shape_y = thrust::raw_pointer_cast(d_shape_y.data()); -#else - p_stride_x = stride_x.Get(); - p_stride_y = stride_y.Get(); - p_stride_dout = stride_dout.Get(); - p_shape_y = dim_y.Get(); -#endif - // dout_x: dout * kron(ones(X), Y) re-aranged in shape (numel_x, numel_y) - // dout_y: dout * kron(X, ones(Y)) re-aranged in shaoe (numel_y, numel_x) - framework::Tensor dout_x; - T* p_dout_x = nullptr; - if (dx) { - dout_x.mutable_data({numel_x, numel_y}, dev_ctx.GetPlace()); - p_dout_x = dout_x.data(); - } - framework::Tensor dout_y; - T* p_dout_y = nullptr; - if (dy) { - dout_y.mutable_data({numel_y, numel_x}, dev_ctx.GetPlace()); - p_dout_y = dout_y.data(); - } - - platform::ForRange for_range(dev_ctx, numel); - KronGradElemFunctor func(dout.data(), x.data(), y.data(), - p_dout_x, p_dout_y, p_stride_dout, p_stride_x, - p_stride_y, p_shape_y, numel_x, numel_y, ndims); - for_range(func); - -// reduce_sum along aixs 1 -#if defined(__NVCC__) || defined(__HIPCC__) - auto stream = dev_ctx.stream(); // it is a cuda device_context - if (dx) { - TensorReduceImpl>( - dev_ctx, dout_x, dx, kps::IdentityFunctor(), {1}, stream); - } - if (dy) { - TensorReduceImpl>( - dev_ctx, dout_y, dy, kps::IdentityFunctor(), {1}, stream); - } -#else - auto* place = dev_ctx.eigen_device(); - Eigen::array reduce_dim = {1}; - if (dx) { - auto eigen_dout_x = framework::EigenMatrix::Reshape(dout_x, 1); - auto 
eigen_vec_dx = framework::EigenVector::Flatten(*dx); - eigen_vec_dx.device(*place) = eigen_dout_x.sum(reduce_dim); - } - if (dy) { - auto eigen_dout_y = framework::EigenMatrix::Reshape(dout_y, 1); - auto eigen_vec_dy = framework::EigenVector::Flatten(*dy); - eigen_vec_dy.device(*place) = eigen_dout_y.sum(reduce_dim); - } -#endif - } -}; - -inline framework::Tensor UnsqueezeTo(const framework::Tensor& src, int ndims) { - const framework::DDim& shape = src.dims(); - int rank = shape.size(); - framework::Tensor res; - res.ShareDataWith(src); - PADDLE_ENFORCE_LE( - rank, ndims, - platform::errors::InvalidArgument( - "The input Tensor's rank should be less than or equal to ndims" - "Received input Tensor's rank = %d, ndims = %d", - rank, ndims)); - if (rank < ndims) { - std::vector new_dim(ndims, 1); - for (int i = ndims - rank; i < ndims; i++) { - new_dim[i] = shape[i - ndims + rank]; - } - res.Resize(phi::make_ddim(new_dim)); - } - return res; -} - -template -class KronKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - int ndims = out->dims().size(); - framework::Tensor xx = UnsqueezeTo(*x, ndims); - framework::Tensor yy = UnsqueezeTo(*y, ndims); - - KronOpFunctor func; - func(dev_ctx, xx, yy, out); - } -}; - -template -class KronGradKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - if (dx) { - dx->mutable_data(ctx.GetPlace()); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - } - - int ndims = dout->dims().size(); - framework::Tensor xx = UnsqueezeTo(*x, ndims); - framework::Tensor yy = UnsqueezeTo(*y, ndims); - - framework::Tensor* pdxx = nullptr; - framework::Tensor* pdyy = nullptr; - framework::Tensor dxx; - framework::Tensor dyy; - if (dx) { - dxx = UnsqueezeTo(*dx, ndims); - pdxx = &dxx; - } - - if (dy) { - dyy = UnsqueezeTo(*dy, ndims); - pdyy = &dyy; - } - - KronGradOpFunctor func; - func(dev_ctx, *dout, xx, yy, pdxx, pdyy); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/kthvalue_op.cc b/paddle/fluid/operators/kthvalue_op.cc index 2a79cee27814e..4c679d3026386 100644 --- a/paddle/fluid/operators/kthvalue_op.cc +++ b/paddle/fluid/operators/kthvalue_op.cc @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
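// ---- Reviewer note (not part of the diff) ----------------------------
// Shape math of the UnsqueezeTo helper deleted with kron_op.h above:
// pad a rank-r shape to ndims by prepending 1s. The helper requires
// rank <= ndims and only changes the viewed dims (storage is shared).
#include <cstdint>
#include <vector>

std::vector<int64_t> UnsqueezeShape(const std::vector<int64_t>& shape,
                                    int ndims) {
  const int rank = static_cast<int>(shape.size());
  std::vector<int64_t> new_dim(ndims, 1);
  for (int i = ndims - rank; i < ndims; ++i) {
    new_dim[i] = shape[i - ndims + rank];
  }
  return new_dim;
}
// -----------------------------------------------------------------------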
*/ -#include "paddle/fluid/operators/kthvalue_op.h" #include #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -25,54 +26,6 @@ class KthvalueOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "kthvalue"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "kthvalue"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "kthvalue"); - auto input_dims = ctx->GetInputDim("X"); - const int& dim_size = input_dims.size(); - int axis = static_cast(ctx->Attrs().Get("axis")); - PADDLE_ENFORCE_LT(axis, dim_size, - paddle::platform::errors::InvalidArgument( - "the axis must be [-%d, %d), but received %d .", - dim_size, dim_size, axis)); - PADDLE_ENFORCE_GE(axis, -dim_size, - paddle::platform::errors::InvalidArgument( - "the axis must be [-%d, %d), but received %d .", - dim_size, dim_size, axis)); - if (axis < 0) axis += dim_size; - int k = static_cast(ctx->Attrs().Get("k")); - PADDLE_ENFORCE_GE( - k, 1, paddle::platform::errors::InvalidArgument( - "the k in the kthvalue must >= 1, but received %d .", k)); - PADDLE_ENFORCE_GE(input_dims.size(), 1, - paddle::platform::errors::InvalidArgument( - "input of kthvalue must have >= 1d shape")); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_GE( - input_dims[axis], k, - paddle::platform::errors::InvalidArgument( - "input of kthvalue must have >= %d columns in axis of %d", k, - axis)); - } - bool keepdim = ctx->Attrs().Get("keepdim"); - std::vector dimvec; - for (int64_t i = 0; i < axis; i++) { - dimvec.emplace_back(input_dims[i]); - } - if (keepdim) { - dimvec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < dim_size; i++) { - dimvec.emplace_back(input_dims[i]); - } - framework::DDim dims = phi::make_ddim(dimvec); - ctx->SetOutputDim("Out", dims); - ctx->SetOutputDim("Indices", dims); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -155,20 +108,13 @@ class KthvalueGradOpMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(kthvalue, KthvalueInferShapeFunctor, + PD_INFER_META(phi::KthvalueInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(kthvalue, ops::KthvalueOp, ops::KthvalueOpMaker, ops::KthvalueGradOpMaker, - ops::KthvalueGradOpMaker); -REGISTER_OP_CPU_KERNEL( - kthvalue, ops::KthvalueCPUKernel, - ops::KthvalueCPUKernel, - ops::KthvalueCPUKernel, - ops::KthvalueCPUKernel); + ops::KthvalueGradOpMaker, + KthvalueInferShapeFunctor); REGISTER_OPERATOR(kthvalue_grad, ops::KthvalueOpGrad); -REGISTER_OP_CPU_KERNEL( - kthvalue_grad, - ops::KthvalueGradCPUKernel, - ops::KthvalueGradCPUKernel, - ops::KthvalueGradCPUKernel, - ops::KthvalueGradCPUKernel); diff --git a/paddle/fluid/operators/kthvalue_op.cu b/paddle/fluid/operators/kthvalue_op.cu deleted file mode 100644 index 4f30c58d37500..0000000000000 --- a/paddle/fluid/operators/kthvalue_op.cu +++ /dev/null @@ -1,279 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/kthvalue_op.h" -#include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -#endif - -namespace paddle { -namespace operators { - -int getBlockSize(int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; -} - -template -bool SortKthvalue(const platform::CUDADeviceContext& ctx, - const framework::Tensor* input_tensor, const int64_t num_cols, - const int64_t num_rows, const int k, - framework::Tensor* out_tensor, - framework::Tensor* indices_tensor) { - auto cu_stream = ctx.stream(); - framework::Tensor input_indices; - const std::vector dims = {num_rows, num_cols}; - auto dim = phi::make_ddim(dims); - input_indices.Resize(dim); - input_indices.mutable_data(ctx.GetPlace()); - size_t temp_storage_bytes = -1; - int block_size = getBlockSize(num_cols); - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - unsigned int grid_size = num_rows < maxGridDimX - ? 
static_cast(num_rows) - : maxGridDimX; - InitIndex<<>>( - input_indices.data(), num_rows, num_cols); - cub::CountingInputIterator counting_iter(0); - cub::TransformInputIterator> - segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); - T* sorted_values_ptr; - int64_t* sorted_indices_ptr; - framework::Tensor temp_values, temp_indices; - const T* input = input_tensor->data(); - T* values = out_tensor->data(); - int64_t* indices = indices_tensor->mutable_data(ctx.GetPlace()); - temp_values.Resize(dim); - temp_indices.Resize(dim); - sorted_values_ptr = temp_values.mutable_data(ctx.GetPlace()); - sorted_indices_ptr = temp_indices.mutable_data(ctx.GetPlace()); - auto err = cub::DeviceSegmentedRadixSort::SortPairs( - nullptr, temp_storage_bytes, input, sorted_values_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); -#ifdef __HIPCC__ - if (err != hipSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "hipcub::DeviceSegmentedRadixSort::SortPairs, status: " - << hipGetErrorString(err); - return false; - } -#else - if (err != cudaSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "cub::DeviceSegmentedRadixSort::SortPairs, status: " - << cudaGetErrorString(err); - return false; - } -#endif - framework::Tensor temp_storage; - temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); - - err = cub::DeviceSegmentedRadixSort::SortPairs( - temp_storage.data(), temp_storage_bytes, input, - sorted_values_ptr, input_indices.data(), sorted_indices_ptr, - num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1, - 0, sizeof(T) * 8, cu_stream); -#ifdef __HIPCC__ - if (err != hipSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "hipcub::DeviceSegmentedRadixSort::SortPairs, " - << temp_storage_bytes << ", status: " << hipGetErrorString(err); - return false; - } -#else - if (err != cudaSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "cub::DeviceSegmentedRadixSort::SortPairs, " - << temp_storage_bytes << ", status: " << cudaGetErrorString(err); - return false; - } -#endif - auto& dev = *ctx.eigen_device(); - const Eigen::DSizes slice_indices{0, k - 1}; - const Eigen::DSizes slice_sizes{num_rows, 1}; - auto e_indices = framework::EigenMatrix::From(*indices_tensor, dim); - auto e_tmp_indices = framework::EigenMatrix::From( - static_cast(temp_indices)); - std::vector odims = {static_cast(num_rows), static_cast(1)}; - dim = phi::make_ddim(odims); - auto e_values = framework::EigenMatrix::From(*out_tensor, dim); - auto e_tmp_values = framework::EigenMatrix::From( - static_cast(temp_values)); - - EigenSlice, int64_t, 2>::Eval( - dev, e_indices, e_tmp_indices, slice_indices, slice_sizes); - EigenSlice, T, 2>::Eval( - dev, e_values, e_tmp_values, slice_indices, slice_sizes); - return true; -} - -template -class KthvalueOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int k = static_cast(ctx.Attr("k")); - int axis = static_cast(ctx.Attr("axis")); - bool keepdim = static_cast(ctx.Attr("keepdim")); - const auto& in_dims = input->dims(); - if (axis < 0) axis += 
in_dims.size(); - auto out_dims = output->dims(); - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - PADDLE_ENFORCE_EQ(SortKthvalue(dev_ctx, input, input_width, - input_height, k, output, indices), - true, platform::errors::External( - "KthvalueOP: Error when use cub sorting")); - return; - } else { - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dims); - indices->Resize(tmp_out_dims); - } - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(in_dims); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - trans_out_dims[i] = in_dims[trans[i]]; - } - trans_out_dims[in_dims.size() - 1] = 1; - framework::Tensor trans_input; - trans_input.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - TransCompute(ndims, dev_ctx, *input, - &trans_input, trans); - framework::Tensor trans_ind, trans_out; - trans_ind.mutable_data(trans_out_dims, ctx.GetPlace()); - trans_out.mutable_data(trans_out_dims, ctx.GetPlace()); - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - PADDLE_ENFORCE_EQ( - SortKthvalue(dev_ctx, &trans_input, input_width, input_height, k, - &trans_out, &trans_ind), - true, - platform::errors::External("KthvalueOP: Error when use cub sorting")); - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans); - TransCompute(ndims, dev_ctx, trans_out, - output, trans); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class KthvalueOpGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = context.Attr("axis"); - int k = static_cast(context.Attr("k")); - const auto& in_dims = x->dims(); - auto out_dims = indices->dims(); - if (axis < 0) axis += in_dims.size(); - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - const T* out_grad_data = out_grad->data(); - const int64_t* indices_data = indices->data(); - int pre, n, post; - GetDims(in_dims, axis, &pre, &n, &post); - auto& dev_ctx = context.cuda_device_context(); - int block_size = getBlockSize(post * k); - int 
max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); - int grid_size = std::min(max_blocks, pre); - AssignGradWithAxis<<>>( - out_grad_data, indices_data, x_grad_data, pre, post, n, 1); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - kthvalue, - ops::KthvalueOpCUDAKernel, - ops::KthvalueOpCUDAKernel, - ops::KthvalueOpCUDAKernel, - ops::KthvalueOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - kthvalue_grad, - ops::KthvalueOpGradCUDAKernel, - ops::KthvalueOpGradCUDAKernel, - ops::KthvalueOpGradCUDAKernel, - ops::KthvalueOpGradCUDAKernel); diff --git a/paddle/fluid/operators/kthvalue_op.h b/paddle/fluid/operators/kthvalue_op.h deleted file mode 100644 index 15df0a10c6992..0000000000000 --- a/paddle/fluid/operators/kthvalue_op.h +++ /dev/null @@ -1,281 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { -template -static void getKthvalue(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, - Type* t_indices, const int& k) { - bool partial_sort_flag = (k * 64) < input_width; -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(i, j), j)); - } - } - if (partial_sort_flag) { - std::partial_sort( - col_vec.begin(), col_vec.begin() + k, col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - } else { - std::nth_element( - col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - } - t_out[i] = col_vec[k - 1].first; - t_indices[i] = col_vec[k - 1].second; - } -} - -template -static void kthvalueAssign(const Type& input_height, const Type& input_width, - const int& input_dim, const framework::Tensor* input, - const framework::Tensor* indices, T* output_data) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - auto e_indices = framework::EigenVector::Flatten(*indices); 
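// ---- Reviewer note (not part of the diff) ----------------------------
// The per-row selection in the deleted getKthvalue, reduced to its
// core: pair each value with its column, then pick entry k-1 with
// std::nth_element. (The deleted code switches to std::partial_sort
// when k*64 < row width, and its comparator additionally orders NaNs
// last; both are omitted here for brevity.)
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

std::pair<double, int64_t> KthOfRow(
    std::vector<std::pair<double, int64_t>> row, int k /* 1-based */) {
  std::nth_element(row.begin(), row.begin() + k - 1, row.end());
  return row[k - 1];  // (value, original column index)
}
// -----------------------------------------------------------------------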
- output_data[i * input_width + e_indices(0)] = e_input(0); - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = - framework::EigenMatrix::Reshape(*indices, input_dim - 1); - output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); - } - } -} - -template -class KthvalueCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto* indices = context.Output("Indices"); - const auto& in_dims = input->dims(); - int k = static_cast(context.Attr("k")); - bool keepdim = static_cast(context.Attr("keepdim")); - int axis = static_cast(context.Attr("axis")); - if (axis < 0) axis += in_dims.size(); - T* output_data = output->mutable_data(context.GetPlace()); - int64_t* indices_data = indices->mutable_data(context.GetPlace()); - auto out_dims = output->dims(); - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - getKthvalue(input_height, input_width, in_dims.size(), input, - output_data, indices_data, k); - } else { - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dims); - indices->Resize(tmp_out_dims); - } - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(in_dims); - - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - trans_out_dims[i] = in_dims[trans[i]]; - } - trans_out_dims[in_dims.size() - 1] = 1; - framework::Tensor trans_inp; - trans_inp.mutable_data(trans_dims, context.GetPlace()); - int ndims = trans.size(); - auto& dev_context = - context.template device_context(); - TransCompute(ndims, dev_context, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - framework::Tensor tmp_out, tmp_indices; - T* t_out = tmp_out.mutable_data(trans_out_dims, context.GetPlace()); - auto* t_ind = - tmp_indices.mutable_data(trans_out_dims, context.GetPlace()); - - getKthvalue(input_height, input_width, in_dims.size(), - &trans_inp, t_out, t_ind, k); - TransCompute( - ndims, dev_context, tmp_indices, indices, trans); - TransCompute(ndims, dev_context, tmp_out, - output, trans); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class KthvalueGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = static_cast(context.Attr("axis")); - bool keepdim = static_cast(context.Attr("keepdim")); - auto in_dims = 
x->dims(); - auto out_dims = indices->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(out_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(out_dims[i - 1]); - } - out_dims = phi::make_ddim(tmp_out_shape); - } - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - if (axis == in_dims.size() - 1) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); - if (keepdim) { - kthvalueAssign(input_height, input_width, in_dims.size(), out_grad, - indices, x_grad_data); - } else { - auto& dev_context = - context.template device_context(); - framework::Tensor out_grad_tmp, indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - kthvalueAssign(input_height, input_width, in_dims.size(), &out_grad_tmp, - &indices_tmp, x_grad_data); - } - } else { - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(out_dims.size() - 1); - for (int i = axis + 1; i < out_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - framework::DDim trans_dims(out_dims); - framework::DDim trans_in_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = out_dims[trans[i]]; - trans_in_dims[i] = in_dims[trans[i]]; - } - framework::Tensor trans_dO, trans_ind; - trans_dO.mutable_data(trans_dims, context.GetPlace()); - trans_ind.mutable_data(trans_dims, context.GetPlace()); - int ndims = trans.size(); - auto& dev_context = - context.template device_context(); - if (keepdim) { - TransCompute( - ndims, dev_context, *out_grad, &trans_dO, trans); - TransCompute( - ndims, dev_context, *indices, &trans_ind, trans); - } else { - framework::Tensor out_grad_tmp, indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - TransCompute( - ndims, dev_context, out_grad_tmp, &trans_dO, trans); - TransCompute( - ndims, dev_context, indices_tmp, &trans_ind, trans); - } - const int64_t input_height = phi::product( - phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); - const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; - framework::Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_in_dims, context.GetPlace()); - memset(t_out, 0, x_grad->numel() * sizeof(T)); - kthvalueAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out); - TransCompute(ndims, dev_context, tmp_out, - x_grad, trans); - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 
412ae3c49b5f3..c0a4b88fc76fd 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -758,12 +758,14 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( */ template -void ln_bwd_1024_kernel_driver( - const platform::CUDADeviceContext &dev_ctx, const int rows, const int cols, - float epsilon, const T *x_ptr, const ScaleT *scale_ptr, const U *mean_ptr, - const U *var_ptr, const T *dout_ptr, T *dx_ptr, ScaleT *dscale_ptr, - ScaleT *dbias_ptr, const MaskType *mask_ptr = nullptr, - T factor = static_cast(0), T *d_dropout_src_ptr = nullptr) { +void ln_bwd_1024_kernel_driver(const phi::GPUContext &dev_ctx, const int rows, + const int cols, float epsilon, const T *x_ptr, + const ScaleT *scale_ptr, const U *mean_ptr, + const U *var_ptr, const T *dout_ptr, T *dx_ptr, + ScaleT *dscale_ptr, ScaleT *dbias_ptr, + const MaskType *mask_ptr = nullptr, + T factor = static_cast(0), + T *d_dropout_src_ptr = nullptr) { auto stream = dev_ctx.stream(); if (cols == 1024) { // step-1: compute dx and reduced part results of dscale and dbias. @@ -1334,8 +1336,7 @@ static void LayerNormBackward( const U *mean, const U *var, T *d_x, LayerNormScaleBiasT *d_scale, LayerNormScaleBiasT *d_bias, float epsilon, - int64_t batch_size, int64_t feature_size, - const platform::CUDADeviceContext &dev_ctx) { + int64_t batch_size, int64_t feature_size, const phi::GPUContext &dev_ctx) { auto stream = dev_ctx.stream(); #ifdef __HIPCC__ const int kMaxBlockDim = 256; diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc index e7d676479be0c..224ab748dab6c 100644 --- a/paddle/fluid/operators/layer_norm_op.cc +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/layer_norm_op.h" - #include #include +#include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -278,10 +277,3 @@ REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker, ops::LayerNormGradOpMaker); REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp, ops::LayerNormGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - layer_norm, ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CPU_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu deleted file mode 100644 index dfe73d3727132..0000000000000 --- a/paddle/fluid/operators/layer_norm_op.cu +++ /dev/null @@ -1,289 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
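// ---- Reviewer note (not part of the diff) ----------------------------
// How the LayerNorm code reduces X to a 2D view before normalizing
// (the phi::flatten_to_2d call used throughout the deleted kernels):
// dims before begin_norm_axis form the batch, the rest the feature.
// A minimal host-side sketch of that product split.
#include <cstdint>
#include <vector>

void FlattenTo2D(const std::vector<int64_t>& x_dims, int begin_norm_axis,
                 int64_t* batch_size, int64_t* feature_size) {
  *batch_size = 1;
  *feature_size = 1;
  for (int i = 0; i < static_cast<int>(x_dims.size()); ++i) {
    (i < begin_norm_axis ? *batch_size : *feature_size) *= x_dims[i];
  }
}
// -----------------------------------------------------------------------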
*/ - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/operators/layer_norm_kernel.cu.h" -#include "paddle/fluid/operators/layer_norm_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -void LayerNormDirectCUDAFunctor::operator()(gpuStream_t stream, - const T *input, - std::vector input_shape, - const T *bias, const T *scale, - T *output, T *mean, T *variance, - int begin_norm_axis, float eps) { - const auto x_dims = phi::make_ddim(input_shape); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int64_t batch_size = static_cast(matrix_dim[0]); - int64_t feature_size = static_cast(matrix_dim[1]); - switch (GetDesiredBlockDim(feature_size)) { - FIXED_BLOCK_DIM_CASE( - LayerNormForward<<>>( - input, scale, bias, output, mean, variance, eps, feature_size)); - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Product from begin_norm_axis to end in layer_norm must be larger " - "than 1")); - break; - } -} - -template -class LayerNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - using U = LayerNormParamType; - const float epsilon = ctx.Attr("epsilon"); - auto *scale = ctx.Input("Scale"); - auto *bias = ctx.Input("Bias"); - auto *x = ctx.Input("X"); - - auto *y = ctx.Output("Y"); - auto *mean = ctx.Output("Mean"); - auto *var = ctx.Output("Variance"); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - - const auto x_dims = x->dims(); - auto *x_data = x->data(); - auto *y_data = y->mutable_data(ctx.GetPlace()); - auto *mean_data = mean->mutable_data(ctx.GetPlace()); - auto *var_data = var->mutable_data(ctx.GetPlace()); - - auto *void_scale_data = (scale == nullptr ? nullptr : scale->data()); - auto *void_bias_data = (bias == nullptr ? nullptr : bias->data()); - - framework::proto::VarType::Type x_dtype = - framework::TransToProtoVarType(x->dtype()); - framework::proto::VarType::Type scale_bias_dtype; - if (void_scale_data != nullptr) { - scale_bias_dtype = framework::TransToProtoVarType(scale->dtype()); - if (void_bias_data != nullptr) { - PADDLE_ENFORCE_EQ(scale_bias_dtype, - framework::TransToProtoVarType(bias->dtype()), - platform::errors::InvalidArgument( - "Thie Scale and Bias of layer_norm op " - "should have the same data type.")); - } - } else { - scale_bias_dtype = (void_bias_data != nullptr - ? 
framework::TransToProtoVarType(bias->dtype()) - : x_dtype); - } - - bool is_scale_bias_same_dtype_with_x = x_dtype == scale_bias_dtype; - if (!is_scale_bias_same_dtype_with_x) { - PADDLE_ENFORCE_EQ(scale_bias_dtype, - framework::DataTypeTrait::DataType(), - platform::errors::InvalidArgument( - "Unsupported data type of Scale and Bias: %s", - framework::DataTypeToString(scale_bias_dtype))); - } - - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int64_t batch_size = static_cast(matrix_dim[0]); - int64_t feature_size = static_cast(matrix_dim[1]); - - auto stream = ctx.cuda_device_context().stream(); - -#define PADDLE_LAUNCH_LAYERNORM_FWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ - do { \ - switch (GetDesiredBlockDim(feature_size)) { \ - FIXED_BLOCK_DIM_CASE( \ - LayerNormForward<<< \ - batch_size, kBlockDim, 0, stream>>>( \ - x_data, static_cast(void_scale_data), \ - static_cast(void_bias_data), y_data, \ - mean_data, var_data, epsilon, feature_size)); \ - default: \ - PADDLE_THROW(platform::errors::InvalidArgument( \ - "Product from begin_norm_axis to end must be larger than 1")); \ - break; \ - } \ - } while (0) - -#ifdef PADDLE_WITH_CUDA - bool can_call_1024_kernel = false; - if (feature_size == 1024 && scale != nullptr && bias != nullptr) { - can_call_1024_kernel = true; - } - if (can_call_1024_kernel) { - const int WARPS_M = 4; - const int WARPS_N = 1; - const int THREADS_PER_WARP = 32; - const int BYTES_PER_LDG = 16; - const int VecSize = BYTES_PER_LDG / sizeof(T); - - const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; - const int ROWS_PER_CTA = WARPS_M; - - const int grid = static_cast( - std::ceil(batch_size / static_cast(ROWS_PER_CTA))); - if (is_scale_bias_same_dtype_with_x) { - ln_fwd_1024_kernel<<>>( - batch_size, feature_size, epsilon, x_data, - static_cast(void_scale_data), - static_cast(void_bias_data), mean_data, var_data, - y_data); - } else { - ln_fwd_1024_kernel<<>>( - batch_size, feature_size, epsilon, x_data, - static_cast(void_scale_data), - static_cast(void_bias_data), mean_data, var_data, - y_data); - } - } else { -#endif - if (is_scale_bias_same_dtype_with_x) { - PADDLE_LAUNCH_LAYERNORM_FWD(T, true); - } else { - PADDLE_LAUNCH_LAYERNORM_FWD(U, false); - } -#ifdef PADDLE_WITH_CUDA - } -#endif - -#undef PADDLE_LAUNCH_LAYERNORM_FWD - } -}; - -template -class LayerNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - using U = LayerNormParamType; - const float epsilon = ctx.Attr("epsilon"); - // d_x, d_scale, d_bias may be nullptr - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - auto *x = ctx.Input("X"); - auto *mean = ctx.Input("Mean"); - auto *var = ctx.Input("Variance"); - auto *scale = ctx.Input("Scale"); - auto *bias = ctx.Input("Bias"); - auto *d_y = ctx.Input(framework::GradVarName("Y")); - - const auto &x_dims = x->dims(); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int64_t batch_size = static_cast(matrix_dim[0]); - int64_t feature_size = static_cast(matrix_dim[1]); - - auto *x_data = x->data(); - auto *d_y_data = d_y->data(); - - auto *mean_data = mean->data(); - auto *var_data = var->data(); - - auto *d_x_data = - (d_x == nullptr ? 
nullptr : d_x->mutable_data(ctx.GetPlace())); - - framework::proto::VarType::Type x_dtype = - framework::TransToProtoVarType(x->dtype()); - framework::proto::VarType::Type scale_bias_dtype; - if (scale != nullptr) { - scale_bias_dtype = framework::TransToProtoVarType(scale->dtype()); - } else { - // FIXME(zengjinle): do not find a better way to get the right - // data type of the d_scale and d_bias if scale == nullptr. - auto *bias = ctx.Input("Bias"); - if (bias != nullptr) { - scale_bias_dtype = framework::TransToProtoVarType(bias->dtype()); - } else { - scale_bias_dtype = x_dtype; - } - } - -#define PADDLE_LAUNCH_LAYERNORM_BWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ - do { \ - auto *scale_data = \ - (scale == nullptr ? nullptr : scale->data()); \ - auto *d_scale_data = \ - (d_scale == nullptr ? nullptr : d_scale->mutable_data( \ - ctx.GetPlace())); \ - auto *d_bias_data = \ - (d_bias == nullptr ? nullptr : d_bias->mutable_data( \ - ctx.GetPlace())); \ - auto *d_x_data = \ - (d_x == nullptr ? nullptr : d_x->mutable_data(ctx.GetPlace())); \ - LayerNormBackward( \ - x_data, d_y_data, scale_data, mean_data, var_data, d_x_data, \ - d_scale_data, d_bias_data, epsilon, batch_size, feature_size, \ - ctx.cuda_device_context()); \ - } while (0) - - if (scale_bias_dtype == x_dtype) { - PADDLE_LAUNCH_LAYERNORM_BWD(T, true); - } else { - PADDLE_LAUNCH_LAYERNORM_BWD(U, false); - } - -#undef PADDLE_LAUNCH_LAYERNORM_BWD - } -}; - -template class LayerNormDirectCUDAFunctor; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_CUDA_KERNEL( - layer_norm, - ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CUDA_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); -#elif CUDNN_VERSION_MIN(8, 1, 0) -REGISTER_OP_CUDA_KERNEL( - layer_norm, - ops::LayerNormKernel, - ops::LayerNormKernel, - ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CUDA_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); -#else -REGISTER_OP_CUDA_KERNEL( - layer_norm, - ops::LayerNormKernel, - ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CUDA_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); -#endif diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h deleted file mode 100644 index 9d70b7cf70743..0000000000000 --- a/paddle/fluid/operators/layer_norm_op.h +++ /dev/null @@ -1,374 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
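Review note: everything deleted above is the fluid-namespace CUDA layer_norm path. It flattens the input to a [batch_size, feature_size] matrix, reduces each row to a mean and variance, then normalizes with the optional scale and bias. A minimal standalone sketch of that row-wise pattern, assuming a power-of-two block size and plain float (illustrative names, not the deleted Paddle kernel):

```cpp
// One block per row: shared-memory tree reduction for sum and sum of
// squares, then normalize. Assumes blockDim.x is a power of two and
// shared memory of 2 * blockDim.x floats is provided at launch.
__global__ void NaiveLayerNormFwd(const float* x, const float* gamma,
                                  const float* beta, float* y, float* mean,
                                  float* var, int cols, float eps) {
  extern __shared__ float buf[];
  float* s_sum = buf;
  float* s_sq = buf + blockDim.x;
  const float* row = x + static_cast<int64_t>(blockIdx.x) * cols;

  float sum = 0.f, sq = 0.f;
  for (int i = threadIdx.x; i < cols; i += blockDim.x) {
    float v = row[i];
    sum += v;
    sq += v * v;
  }
  s_sum[threadIdx.x] = sum;
  s_sq[threadIdx.x] = sq;
  __syncthreads();
  for (int off = blockDim.x / 2; off > 0; off >>= 1) {
    if (threadIdx.x < off) {
      s_sum[threadIdx.x] += s_sum[threadIdx.x + off];
      s_sq[threadIdx.x] += s_sq[threadIdx.x + off];
    }
    __syncthreads();
  }
  float mu = s_sum[0] / cols;
  float sigma2 = s_sq[0] / cols - mu * mu;  // Var[x] = E[x^2] - E[x]^2
  if (threadIdx.x == 0) {
    mean[blockIdx.x] = mu;
    var[blockIdx.x] = sigma2;
  }
  float inv = rsqrtf(sigma2 + eps);
  for (int i = threadIdx.x; i < cols; i += blockDim.x) {
    float g = gamma ? gamma[i] : 1.f;
    float b = beta ? beta[i] : 0.f;
    y[static_cast<int64_t>(blockIdx.x) * cols + i] = (row[i] - mu) * inv * g + b;
  }
}
```

A launch like `NaiveLayerNormFwd<<<batch_size, 256, 2 * 256 * sizeof(float)>>>(...)` covers one row per block; the deleted kernel instead picks the block size through FIXED_BLOCK_DIM_CASE and adds a fused fast path for feature_size == 1024.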
*/ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ - !defined(__OSX__) -#include "paddle/fluid/operators/jit/kernels.h" -#endif -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace platform { -class CPUDeviceContext; -class CUDADeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { - -// Wrap RowwiseMean and ColwiseMean. -// Reuse the cpu codes and replace the gpu codes with cublas_gemv, which is -// significantly faster. Unlike the RowwiseMean and ColwiseMean, the -// implementation only considers 2D. -template -struct RowwiseMean2D { - RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx); - - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* vec); -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class RowwiseMean2D { - public: - RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) - : left_(left), right_(right) { - framework::DDim ones_dim({right_}); - divisor_.mutable_data(ones_dim, dev_ctx.GetPlace()); - phi::funcs::set_constant(dev_ctx, &divisor_, 1.0 / right); - } - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - phi::funcs::GetBlas(context).GEMV( - false, left_, right_, 1., input.data(), divisor_.data(), 0., - out->data()); - } - - private: - int left_; - int right_; - framework::Tensor divisor_; -}; -#endif - -template -class RowwiseMean2D { - public: - RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) {} - - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - row_mean_(context, input, out); - } - - private: - phi::funcs::RowwiseMean row_mean_; -}; - -template -struct ColwiseSum2D { - ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx); - - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* vec); -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class ColwiseSum2D { - public: - ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx) - : left_(left), right_(right) { - framework::DDim ones_dim({left_}); - divisor_.mutable_data(ones_dim, dev_ctx.GetPlace()); - phi::funcs::set_constant(dev_ctx, &divisor_, 1.0); - } - - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - phi::funcs::GetBlas(context).GEMV( - true, left_, right_, 1., input.data(), divisor_.data(), 0., - out->data()); - } - - private: - int left_; - int right_; - framework::Tensor divisor_; -}; -#endif - -template -class ColwiseSum2D { - public: - ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx) {} - - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - col_wise_(context, input, out); - } - - private: - phi::funcs::ColwiseSum col_wise_; -}; - -template -struct SubAndSquareFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); } -}; - -template -struct 
DivAndSqrtFunctor { - explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; } - inline HOSTDEVICE T operator()(T a, T b) const { - return a / (sqrt(b + epsilon_)); - } - - private: - T epsilon_; -}; - -template -struct MulInvVarFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { - return a * std::sqrt(1.0 / b); - } -}; - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DataLayout = framework::DataLayout; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class LayerNormDirectCUDAFunctor { - public: - void operator()(gpuStream_t stream, const T* input, - std::vector input_shape, const T* bias, const T* scale, - T* output, T* mean, T* variance, int begin_norm_axis, - float eps); -}; -#endif - -template -class LayerNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto x = *ctx.Input("X"); - - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("Mean"); - auto* var = ctx.Output("Variance"); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - - const auto x_dims = x.dims(); - - y->mutable_data(ctx.GetPlace()); - mean->mutable_data(ctx.GetPlace()); - var->mutable_data(ctx.GetPlace()); - - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int left = static_cast(matrix_dim[0]); - int right = static_cast(matrix_dim[1]); - framework::DDim matrix_shape({left, right}); - - x.Resize(matrix_shape); - Tensor out; - out.ShareDataWith(*y); - out.Resize(matrix_shape); - -#if defined(PADDLE_WITH_CUDA) || defined(_WIN32) || defined(__APPLE__) || \ - defined(__OSX__) - auto& dev_ctx = ctx.template device_context(); - RowwiseMean2D row_mean(left, right, ctx.device_context()); - - // get mean - row_mean(dev_ctx, x, mean); - - // get variance - ElementwiseComputeEx, DeviceContext, T>( - ctx, &x, mean, /*axis*/ 0, SubAndSquareFunctor(), &out); - row_mean(dev_ctx, out, var); - - // get x_norm - ElementwiseComputeEx, DeviceContext, T>( - ctx, &x, mean, /*axis*/ 0, SubFunctor(), &out); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &out, var, /*axis*/ 0, - DivAndSqrtFunctor(static_cast(epsilon)), &out); - - if (scale) { - ElementwiseComputeEx, DeviceContext, T>( - ctx, &out, scale, /*axis*/ 1, MulFunctor(), &out); - } - if (bias) { - ElementwiseComputeEx, DeviceContext, T>( - ctx, &out, bias, /*axis*/ 1, AddFunctor(), &out); - } -#else - PADDLE_ENFORCE_EQ(mean->numel(), left, - platform::errors::InvalidArgument( - "mean's length (%d) is not equal with expected (%d).", - mean->numel(), left)); - PADDLE_ENFORCE_EQ(var->numel(), left, - platform::errors::InvalidArgument( - "var's length (%d) is not equal with expected (%d).", - var->numel(), left)); - if (scale) { - PADDLE_ENFORCE_EQ( - scale->numel(), right, - platform::errors::InvalidArgument( - "scale's length (%d) is not equal with expected (%d).", - scale->numel(), right)); - } - if (bias) { - PADDLE_ENFORCE_EQ( - bias->numel(), right, - platform::errors::InvalidArgument( - "bias's length (%d) is not equal with expected (%d).", - bias->numel(), right)); - } - - auto ker = - jit::KernelFuncs, platform::CPUPlace>::Cache() - .At(right); - ker(x.data(), out.data(), mean->data(), var->data(), - scale ? scale->data() : nullptr, bias ? 
bias->data() : nullptr, - static_cast(left), static_cast(epsilon), right); -#endif - } -}; - -template -class LayerNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - auto x = *ctx.Input("X"); - auto* mean = ctx.Input("Mean"); - auto* var = ctx.Input("Variance"); - auto* scale = ctx.Input("Scale"); - auto d_y = *ctx.Input(framework::GradVarName("Y")); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - - // init output - auto* d_x = ctx.Output(framework::GradVarName("X")); - auto* d_scale = ctx.Output(framework::GradVarName("Scale")); - auto* d_bias = ctx.Output(framework::GradVarName("Bias")); - - const auto& x_dims = x.dims(); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int left = static_cast(matrix_dim[0]); - int right = static_cast(matrix_dim[1]); - framework::DDim matrix_shape({left, right}); - - d_y.Resize(matrix_shape); - auto& dev_ctx = ctx.template device_context(); - ColwiseSum2D colwise_sum(left, right, - ctx.device_context()); - - Tensor temp; - Tensor temp_norm; - if (d_scale || d_x) { - x.Resize(matrix_shape); - temp.mutable_data(matrix_shape, ctx.GetPlace()); - - temp_norm.mutable_data(matrix_shape, ctx.GetPlace()); - // get x_norm - ElementwiseComputeEx, DeviceContext, T>( - ctx, &x, mean, /*axis*/ 0, SubFunctor(), &temp_norm); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp_norm, var, /*axis*/ 0, - DivAndSqrtFunctor(static_cast(epsilon)), &temp_norm); - } - - if (d_bias) { - d_bias->mutable_data(ctx.GetPlace()); - colwise_sum(dev_ctx, d_y, d_bias); - } - if (d_scale) { - d_scale->mutable_data(ctx.GetPlace()); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp_norm, &d_y, /*axis*/ 0, MulFunctor(), &temp); - colwise_sum(dev_ctx, temp, d_scale); - } - - if (d_x) { - framework::DDim vec_shape({left}); - d_x->mutable_data(ctx.GetPlace()); - auto dx_dim = d_x->dims(); - Tensor temp_vec; - temp_vec.mutable_data(vec_shape, ctx.GetPlace()); - - RowwiseMean2D row_mean(left, right, - ctx.device_context()); - - if (d_scale) { - // dy_dx - ElementwiseComputeEx, DeviceContext, T>( - ctx, &d_y, scale, /*axis*/ 1, MulFunctor(), &temp); - framework::TensorCopy(temp, ctx.GetPlace(), ctx.device_context(), d_x); - - // dy_dmean_dx - row_mean(dev_ctx, temp, &temp_vec); - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); - - // dy_var_dx - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); - } else { - // dy_dx - framework::TensorCopy(d_y, ctx.GetPlace(), ctx.device_context(), d_x); - - // dy_dmean_dx - row_mean(dev_ctx, d_y, &temp_vec); - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); - - // dy_var_dx - ElementwiseComputeEx, DeviceContext, T>( - ctx, &d_y, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); - } - // dy_var_dx - row_mean(dev_ctx, temp, &temp_vec); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp_norm, &temp_vec, /*axis*/ 0, MulFunctor(), &temp); - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, &temp, /*axis*/ 0, SubFunctor(), d_x); - - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, var, /*axis*/ 0, - DivAndSqrtFunctor(static_cast(epsilon)), d_x); - d_x->Resize(dx_dim); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/layer_norm_op_npu.cc b/paddle/fluid/operators/layer_norm_op_npu.cc index 
c88880b43fff9..3c7e5bf9593e0 100644 --- a/paddle/fluid/operators/layer_norm_op_npu.cc +++ b/paddle/fluid/operators/layer_norm_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/layer_norm_op_xpu.cc b/paddle/fluid/operators/layer_norm_op_xpu.cc index 0480a354c8bd8..3b21a55f8df0d 100644 --- a/paddle/fluid/operators/layer_norm_op_xpu.cc +++ b/paddle/fluid/operators/layer_norm_op_xpu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lgamma_op.cc b/paddle/fluid/operators/lgamma_op.cc index 148fb05afcfd9..72c6b41efa989 100644 --- a/paddle/fluid/operators/lgamma_op.cc +++ b/paddle/fluid/operators/lgamma_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/lgamma_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -35,16 +38,6 @@ This operator performs elementwise lgamma for input $X$. class LgammaOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Lgamma"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Lgamma"); - - auto in_dims = ctx->GetInputDim("X"); - - ctx->SetOutputDim("Out", in_dims); - ctx->ShareLoD("X", "Out"); - } }; template @@ -83,17 +76,12 @@ class LgammaGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(lgamma, LgammaInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); + REGISTER_OPERATOR(lgamma, ops::LgammaOp, ops::LgammaOpMaker, ops::LgammaGradMaker, - ops::LgammaGradMaker); + ops::LgammaGradMaker, + LgammaInferShapeFunctor); REGISTER_OPERATOR(lgamma_grad, ops::LgammaGradOp); - -REGISTER_OP_CPU_KERNEL( - lgamma, ops::LgammaKernel, - ops::LgammaKernel) - -REGISTER_OP_CPU_KERNEL( - lgamma_grad, - ops::LgammaGradKernel, - ops::LgammaGradKernel); diff --git a/paddle/fluid/operators/lgamma_op.cu b/paddle/fluid/operators/lgamma_op.cu deleted file mode 100644 index b9f273727b00b..0000000000000 --- a/paddle/fluid/operators/lgamma_op.cu +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/lgamma_op.h" - -namespace paddle { -namespace operators { - -template -struct CudaLgammaFunctor { - __device__ __forceinline__ T operator()(const T x) const { - return Eigen::numext::lgamma(x); - } -}; - -template -class LgammaKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - auto& dev_ctx = context.device_context(); - std::vector ins = {x}; - std::vector outs = {out}; - auto functor = CudaLgammaFunctor(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - lgamma, ops::LgammaKernel, - ops::LgammaKernel); - -REGISTER_OP_CUDA_KERNEL( - lgamma_grad, - ops::LgammaGradKernel, - ops::LgammaGradKernel); diff --git a/paddle/fluid/operators/lgamma_op.h b/paddle/fluid/operators/lgamma_op.h deleted file mode 100644 index 674054e745732..0000000000000 --- a/paddle/fluid/operators/lgamma_op.h +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
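Review note: the CUDA file removed above boils down to a single elementwise transform. A self-contained grid-stride sketch of the same computation, assuming plain float (the real path routes through Paddle's vectorized LaunchSameDimsElementwiseCudaKernel):

```cpp
// Grid-stride elementwise lgamma, mirroring the deleted CudaLgammaFunctor.
// lgammaf is the CUDA math library's single-precision log-gamma.
__global__ void LgammaElementwise(const float* x, float* out, int n) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    out[i] = lgammaf(x[i]);
  }
}
```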
- -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -struct LgammaFunctor { - LgammaFunctor(const T* input, T* output, int64_t numel) - : input_(input), output_(output), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - output_[idx] = Eigen::numext::lgamma(input_[idx]); - } - - private: - const T* input_; - T* output_; - int64_t numel_; -}; - -template -struct LgammaGradFunctor { - LgammaGradFunctor(const T* dout, const T* x, T* output, int64_t numel) - : dout_(dout), x_(x), output_(output), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - output_[idx] = dout_[idx] * Eigen::numext::digamma(x_[idx]); - } - - private: - const T* dout_; - const T* x_; - T* output_; - int64_t numel_; -}; - -using Tensor = framework::Tensor; - -template -class LgammaKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - - auto numel = x->numel(); - auto* x_data = x->data(); - auto* out_data = out->mutable_data(context.GetPlace(), - size_t(x->numel() * sizeof(T))); - - auto& dev_ctx = context.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - LgammaFunctor functor(x_data, out_data, numel); - for_range(functor); - } -}; - -template -class LgammaGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const framework::Tensor* d_out = - ctx.Input(framework::GradVarName("Out")); - const framework::Tensor* x = ctx.Input("X"); - framework::Tensor* d_x = - ctx.Output(framework::GradVarName("X")); - - auto numel = d_out->numel(); - auto* dout_data = d_out->data(); - auto* x_data = x->data(); - auto* dx_data = d_x->mutable_data( - ctx.GetPlace(), static_cast(numel * sizeof(T))); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - LgammaGradFunctor functor(dout_data, x_data, dx_data, numel); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_softmax_op.cc b/paddle/fluid/operators/log_softmax_op.cc index 0e69b397e04c7..da38f906b9bd3 100644 --- a/paddle/fluid/operators/log_softmax_op.cc +++ b/paddle/fluid/operators/log_softmax_op.cc @@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
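Review note: the lgamma header deleted just above rests on the classical identity for the log-gamma derivative,

\[ \frac{d}{dx}\ln\Gamma(x) = \psi(x), \]

so LgammaGradFunctor's body is exactly \( dx_i = dout_i \cdot \psi(x_i) \), with Eigen::numext::digamma supplying \( \psi \).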
*/ -#include "paddle/fluid/operators/log_softmax_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -24,10 +27,6 @@ class LogSoftmaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - return UnaryOpUnchangedInferShapeCheckAxis(ctx); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -123,18 +122,11 @@ class LogSoftmaxGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; - +DECLARE_INFER_SHAPE_FUNCTOR(log_softmax, LogSoftmaxInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMetaCheckAxis)); REGISTER_OPERATOR(log_softmax, ops::LogSoftmaxOp, ops::LogSoftmaxOpMaker, ops::LogSoftmaxOpInferVarType, ops::LogSoftmaxGradOpMaker, - ops::LogSoftmaxGradOpMaker); + ops::LogSoftmaxGradOpMaker, + LogSoftmaxInferShapeFunctor); REGISTER_OPERATOR(log_softmax_grad, ops::LogSoftmaxGradOp); - -REGISTER_OP_CPU_KERNEL( - log_softmax, - ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel); -REGISTER_OP_CPU_KERNEL( - log_softmax_grad, - ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel); diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu deleted file mode 100644 index 8770abdac838f..0000000000000 --- a/paddle/fluid/operators/log_softmax_op.cu +++ /dev/null @@ -1,485 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
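Review note: the log_softmax_op.cc hunk above repeats the central pattern of this PR, also applied to lgamma and masked_select: the per-op InferShape override is deleted and a shared phi meta function is bound at registration time. The skeleton of the pattern, with a hypothetical op name (my_op, MyOp and MyOpInferShapeFunctor are placeholders; the two macros are the real ones used throughout this diff):

```cpp
// Before: the operator overrode InferShape by hand in C++.
// After: shape inference is declared once and attached at registration.
DECLARE_INFER_SHAPE_FUNCTOR(my_op, MyOpInferShapeFunctor,
                            PD_INFER_META(phi::UnchangedInferMeta));

REGISTER_OPERATOR(my_op, ops::MyOp, ops::MyOpMaker,
                  ops::MyOpGradMaker<paddle::framework::OpDesc>,
                  ops::MyOpGradMaker<paddle::imperative::OpBase>,
                  MyOpInferShapeFunctor);
```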
- -#include -#include "paddle/fluid/operators/log_softmax_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/funcs/functors.h" - -namespace paddle { -namespace operators { - -#define LAUNCH_WARP_FORWAR_COMPUTE(near_greater_power_of_two) \ - case near_greater_power_of_two: \ - ComputeLogSoftmaxForwardInWarp< \ - T, AccT, near_greater_power_of_two><<>>( \ - dst, src, outer_size, dim_size); \ - break; - -template -__device__ __forceinline__ T WarpReduceSum(T value) { -#pragma unroll - for (int offset = KernelWarpSize / 2; offset > 0; offset /= 2) { - T sum_val = platform::CudaShuffleXorSync(0xFFFFFFFF, value, offset); - value = value + sum_val; - } - return value; -} - -template -__device__ __forceinline__ T WarpReduceMax(T value) { -#pragma unroll - for (int offset = KernelWarpSize / 2; offset > 0; offset /= 2) { - T max_val = platform::CudaShuffleXorSync(0xFFFFFFFF, value, offset); - value = max(value, max_val); - } - return value; -} - -int GetNearGreaterPowerOfTwo(int value) { - int log2_value = 0; - while ((1 << log2_value) < value) { - ++log2_value; - } - return 1 << log2_value; -} - -template -__global__ void ComputeLogSoftmaxForwardInWarp(T *dst, const T *src, - int batch_size, - int element_count) { - constexpr int near_greater_power_of_two = NearGreaterPowerOfTwo; - constexpr int kernel_warp_size = - (near_greater_power_of_two < 32) ? near_greater_power_of_two : 32; - constexpr int warp_iter = near_greater_power_of_two / kernel_warp_size; - int batch_id = blockDim.y * blockIdx.x + threadIdx.y; - - int thread_in_warp_idx = threadIdx.x; - - // 1.read data from global memory to registers - AccT elements[warp_iter]; - // set effective_element_count as the num of elements when warps do effective - // work - // set effective_element_count as 0, when warps do ineffective work - int effective_element_count = (batch_id < batch_size) ? element_count : 0; - for (int it = 0; it < warp_iter; ++it) { - int element_index = thread_in_warp_idx + it * kernel_warp_size; - if (element_index < effective_element_count) { - elements[it] = - static_cast(src[batch_id * element_count + element_index]); - } else { - elements[it] = -std::numeric_limits::infinity(); - } - } - - // 2.compute max_value. For each thread, loop all registers to find max - AccT max_value = elements[0]; -#pragma unroll - for (int it = 1; it < warp_iter; ++it) { - max_value = (max_value > elements[it]) ? max_value : elements[it]; - } - max_value = WarpReduceMax(max_value); - - // 3.For each warp, accumulate all thread registers - AccT sum = 0.0f; -#pragma unroll - for (int it = 0; it < warp_iter; ++it) { - sum += std::exp(elements[it] - max_value); - } - sum = WarpReduceSum(sum); - - // 4.store result. - sum = std::log(sum); -#pragma unroll - for (int it = 0; it < warp_iter; ++it) { - int element_index = thread_in_warp_idx + it * kernel_warp_size; - if (element_index < effective_element_count) { - dst[batch_id * element_count + element_index] = - static_cast(elements[it] - max_value - sum); - } else { - break; - } - } -} - -template -void LaunchSoftmaxForwardForLastAxis(T *dst, const T *src, int dim_size, - int outer_size, gpuStream_t stream) { - int threads_per_block = 128; - int near_greater_power_of_two = GetNearGreaterPowerOfTwo(dim_size); - int kernel_warp_size = - (near_greater_power_of_two < 32) ? 
near_greater_power_of_two : 32; - int warps_per_block = (threads_per_block / kernel_warp_size); - int blocks = (outer_size + warps_per_block - 1) / warps_per_block; - dim3 threads(kernel_warp_size, warps_per_block, 1); - - switch (near_greater_power_of_two) { - LAUNCH_WARP_FORWAR_COMPUTE(1); - LAUNCH_WARP_FORWAR_COMPUTE(2); - LAUNCH_WARP_FORWAR_COMPUTE(4); // dim_size: 3~4 - LAUNCH_WARP_FORWAR_COMPUTE(8); // dim_size: 5~8 - LAUNCH_WARP_FORWAR_COMPUTE(16); // dim_size: 9~16 - LAUNCH_WARP_FORWAR_COMPUTE(32); // dim_size: 17~32 - LAUNCH_WARP_FORWAR_COMPUTE(64); // dim_size: 33~64 - LAUNCH_WARP_FORWAR_COMPUTE(128); // dim_size 65~128 - LAUNCH_WARP_FORWAR_COMPUTE(256); // dim_size 129~256 - LAUNCH_WARP_FORWAR_COMPUTE(512); // dim_size 257~512 - LAUNCH_WARP_FORWAR_COMPUTE(1024); // dim_size 513~1024 - - default: - break; - } -} - -// Returns the final item after reduce operation along block.x. -// Firstly, get shared memory(smem) offset, find the starting position for every -// y. -// Secondly, initialise every smem position with value 'val' of thread itself. -// Thirdly, apply standard reduction along x direction as below: -// -// -> x direction -// [o o o o o o o o] time 0 -// | |/ / -// | /| / -// | / | / -// |/ |/ -// [o o o o x x x x] time 1 -// | |/ / -// |/|/ -// [o o x x x x x x] time 2 -// |/ -// [o x x x x x x x] time 3 -// -// Finally, return the first item. -// Imaging multiple reductions executed in paralell along y axis, -// Note that when blockDim.x is not 1, it's a EVEN number in all cases, -// and the size of shared memory is even as well. -template class Functor> -__forceinline__ __device__ T BlockReduceAlongDimX(T *shared, T val) { - Functor func; - // This reduction is not Block-wise reduction, only reduce along block.x. - // therefore the shared mem has offsets for different block.y. - shared += threadIdx.y * blockDim.x; - shared[threadIdx.x] = val; - int offset = blockDim.x / 2; - - while (offset > 0) { - __syncthreads(); - if (threadIdx.x < offset) { - shared[threadIdx.x] = - func(shared[threadIdx.x], shared[threadIdx.x + offset]); - } - offset /= 2; - } - __syncthreads(); - return shared[0]; -} - -template -__global__ void LogSoftmaxForwardCUDAKernelNotLastAxis( - T *output, const T *input, int outer_size, int dim_size, int inner_size) { - extern __shared__ unsigned char smem[]; - auto sdata = reinterpret_cast(smem); - - const int outer_stride = inner_size * dim_size; - const int dim_stride = inner_size; - - for (int x_id = blockIdx.x; x_id < outer_size; x_id += gridDim.x) { - for (int y_id = blockIdx.y * blockDim.y + threadIdx.y; y_id < inner_size; - y_id += blockDim.y * gridDim.y) { - const int data_offset = x_id * outer_stride + y_id; - // When blockDim.x==1, no block.x-reduction opetaions are needed. - // And threadIdx.x is 0 all the time, so the for-loops below are literally - // loops (No parallel executions). Loop all elements along axis and - // calculate the Max, Sum and (input[id]-Max-log(Sum)) to get the final - // log_softmax values along that axis. - // 1. reduce max - AccT max_value = -std::numeric_limits::infinity(); - // For one thread, iterate all items it responsable for, and get - // max_value. - // If there are N threads, N max_value will be returned. 
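Review note: WarpReduceSum and WarpReduceMax used by these kernels are butterfly reductions; platform::CudaShuffleXorSync is a thin wrapper over the __shfl_xor_sync intrinsic. A standalone sketch for a full 32-lane warp, assuming float for brevity:

```cpp
// XOR-butterfly all-reduce: after log2(32) = 5 rounds, every lane in the
// warp holds the sum of all 32 lanes, so no extra broadcast is needed.
__device__ __forceinline__ float WarpAllReduceSum(float v) {
  for (int offset = 16; offset > 0; offset >>= 1) {
    v += __shfl_xor_sync(0xffffffffu, v, offset);
  }
  return v;
}
```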
- for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { - const AccT value = - static_cast(input[data_offset + d * dim_stride]); - max_value = phi::funcs::MaxFunctor()(max_value, value); - } - // If there are more than 1 threads along block x, reduce all max_values - // and get the global max_value, which is the max value along "axis". - // If there is only one thread along block x, no need to reduce, as the - // 'max_value' is the global max_value. - if (blockDim.x > 1) { - max_value = BlockReduceAlongDimX( - sdata, max_value); - } - - // 2. reduce sum - AccT sum = 0; - // Below is the same execution as '1. reduce max' - for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { - sum += std::exp(static_cast(input[data_offset + d * dim_stride]) - - max_value); - } - if (blockDim.x > 1) { - sum = BlockReduceAlongDimX(sdata, sum); - } - - // 3. input-max-log_sum and write to output - for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { - output[data_offset + d * dim_stride] = static_cast( - static_cast(input[data_offset + d * dim_stride]) - max_value - - std::log(sum)); - } - } - } -} - -// block.y covers inner_size. Threads along the x axis process dim_size -// elements, and make sure not to exceed the 1024 threads per block. -// Note that dim_threads namely blockDim.x is either 1 or a even number. -inline dim3 GetBlockSize(int dim_size, int inner_size) { - int inner_threads = inner_size; - inner_threads = std::min(inner_threads, 1024); - int dim_threads = 1; - - while (dim_threads * inner_threads <= 1024 && dim_threads <= dim_size) { - dim_threads *= 2; - } - dim_threads /= 2; - return dim3(dim_threads, inner_threads); -} - -// First cover the y axis as many blocks as possible. -// Then cover the x axis as many blocks as possible, -// and make sure not to exceed the max_active_blocks. -inline dim3 GetGridSize(dim3 block, int max_active_blocks, int outer_size, - int dim_size, int inner_size) { - int inner_blocks = (inner_size + block.y - 1) / block.y; - if (inner_blocks > max_active_blocks) inner_blocks = max_active_blocks; - - int outer_blocks = (max_active_blocks + inner_blocks - 1) / inner_blocks; - if (outer_blocks > outer_size) outer_blocks = outer_size; - return dim3(outer_blocks, inner_blocks); -} - -// When designing grid size and block size, priority is given to block size, -// and grid will be determined according to the maximum number of active blocks, -// which is set by as a experience value. -template -void ComputeLaunchConfigure(Kernel k, int outer_size, int dim_size, - int inner_size, dim3 &grid, dim3 &block, - int &shared_mem, int num_sm) { - block = GetBlockSize(dim_size, inner_size); - int block_threads = block.x * block.y; - shared_mem = block.x == 1 ? 
0 : block_threads * sizeof(T); - int max_active_blocks = num_sm * 2; - grid = - GetGridSize(block, max_active_blocks, outer_size, dim_size, inner_size); -} - -template -void LaunchLogSoftmaxForwardCUDAKernelNotLastAxis(T *output_data, - const T *input_data, - int outer_size, int dim_size, - int inner_size, int num_sm, - gpuStream_t stream) { - int shared_mem; - dim3 grid; - dim3 block; - - ComputeLaunchConfigure( - &LogSoftmaxForwardCUDAKernelNotLastAxis, outer_size, dim_size, - inner_size, grid, block, shared_mem, num_sm); - - LogSoftmaxForwardCUDAKernelNotLastAxis< - T, MPDType><<>>( - output_data, input_data, outer_size, dim_size, inner_size); -} - -template -class LogSoftmaxKernel - : public framework::OpKernel { - using MPDType = typename phi::dtype::MPTypeTrait::Type; - - public: - void Compute(const framework::ExecutionContext &context) const override { - const auto *x = context.Input("X"); - auto *out = context.Output("Out"); - const auto *input_data = x->data(); - auto *output_data = out->mutable_data(context.GetPlace()); - - const int rank = x->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - - int dim_size = x->dims()[axis]; - int inner_size = 1; - for (int i = axis + 1; i < x->dims().size(); ++i) { - inner_size *= x->dims()[i]; - } - int outer_size = SizeToAxis(axis, x->dims()); - gpuStream_t stream = context.cuda_device_context().stream(); - int num_sm = context.cuda_device_context().GetSMCount(); - - if (inner_size == 1 && dim_size <= 1024 && dim_size * sizeof(T) <= 4096) { - LaunchSoftmaxForwardForLastAxis(output_data, input_data, - dim_size, outer_size, stream); - } else { - LaunchLogSoftmaxForwardCUDAKernelNotLastAxis( - output_data, input_data, outer_size, dim_size, inner_size, num_sm, - stream); - } - } -}; - -// Backward below -#define LAUNCH_WARP_BACKWARD_COMPUTE(near_greater_power_of_two) \ - case near_greater_power_of_two: \ - ComputeLogSoftmaxBackwardInWarp< \ - T, AccT, near_greater_power_of_two><<>>( \ - output, grad_output, grad_input, outer_size, dim_size); \ - break; - -template -__global__ void ComputeLogSoftmaxBackwardInWarp(const T *output, - const T *grad_output, - T *grad_input, int batch_size, - int element_count) { - constexpr int near_greater_power_of_two = NearGreaterPowerOfTwo; - constexpr int kernel_warp_size = - (near_greater_power_of_two < 32) ? near_greater_power_of_two : 32; - constexpr int warp_iter = near_greater_power_of_two / kernel_warp_size; - int batch_id = blockDim.y * blockIdx.x + threadIdx.y; - - int thread_in_warp_idx = threadIdx.x; - - // 1.read data from global memory to registers - AccT output_register[warp_iter]; - AccT grad_output_register[warp_iter]; - int effective_element_count = (batch_id < batch_size) ? element_count : 0; - for (int iter = 0; iter < warp_iter; ++iter) { - int element_index = thread_in_warp_idx + iter * kernel_warp_size; - if (element_index < effective_element_count) { - output_register[iter] = - static_cast(output[batch_id * element_count + element_index]); - grad_output_register[iter] = static_cast( - grad_output[batch_id * element_count + element_index]); - } else { - output_register[iter] = static_cast(0); - grad_output_register[iter] = static_cast(0); - } - } - - // 2. For each warp, accumulate all thread registers - AccT sum = grad_output_register[0]; -#pragma unroll - for (int iter = 1; iter < warp_iter; ++iter) { - sum += grad_output_register[iter]; - } - sum = WarpReduceSum(sum); - -// 3. 
write result in grad_input -#pragma unroll - for (int iter = 0; iter < warp_iter; ++iter) { - int element_index = thread_in_warp_idx + iter * kernel_warp_size; - if (element_index < effective_element_count) { - grad_input[batch_id * element_count + element_index] = static_cast( - (grad_output_register[iter] - std::exp(output_register[iter]) * sum)); - } - } -} - -template -void LaunchSoftmaxBackwardForLastAxis(T *grad_input, const T *grad_output, - const T *output, int dim_size, - int outer_size, gpuStream_t stream) { - int threads_per_block = 128; - int near_greater_power_of_two = GetNearGreaterPowerOfTwo(dim_size); - int kernel_warp_size = - (near_greater_power_of_two < 32) ? near_greater_power_of_two : 32; - int warps_per_block = (threads_per_block / kernel_warp_size); - int blocks = (outer_size + warps_per_block - 1) / warps_per_block; - dim3 threads(kernel_warp_size, warps_per_block, 1); - - switch (near_greater_power_of_two) { - LAUNCH_WARP_BACKWARD_COMPUTE(1); // dim_size: 1 - LAUNCH_WARP_BACKWARD_COMPUTE(2); // dim_size: 2 - LAUNCH_WARP_BACKWARD_COMPUTE(4); // dim_size: 3~4 - LAUNCH_WARP_BACKWARD_COMPUTE(8); // dim_size: 5~8 - LAUNCH_WARP_BACKWARD_COMPUTE(16); // dim_size: 9~16 - LAUNCH_WARP_BACKWARD_COMPUTE(32); // dim_size: 17~32 - LAUNCH_WARP_BACKWARD_COMPUTE(64); // dim_size: 33~64 - LAUNCH_WARP_BACKWARD_COMPUTE(128); // dim_size: 65~128 - LAUNCH_WARP_BACKWARD_COMPUTE(256); // dim_size: 129~256 - LAUNCH_WARP_BACKWARD_COMPUTE(512); // dim_size: 257~512 - LAUNCH_WARP_BACKWARD_COMPUTE(1024); // dim_size: 513~1024 - - default: - break; - } -} - -template -class LogSoftmaxGradKernel - : public framework::OpKernel { - using MPDType = typename phi::dtype::MPTypeTrait::Type; - - public: - void Compute(const framework::ExecutionContext &context) const override { - const auto *out = context.Input("Out"); - const auto *d_out = - context.Input(framework::GradVarName("Out")); - auto *d_x = context.Output(framework::GradVarName("X")); - - const auto *out_data = out->data(); - const auto *d_out_data = d_out->data(); - auto *d_x_data = d_x->mutable_data(context.GetPlace()); - - const int rank = out->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - - int dim_size = out->dims()[axis]; - int inner_size = 1; - for (int i = axis + 1; i < out->dims().size(); ++i) { - inner_size *= out->dims()[i]; - } - int outer_size = SizeToAxis(axis, out->dims()); - gpuStream_t stream = context.cuda_device_context().stream(); - - if (inner_size == 1 && dim_size <= 1024 && dim_size * sizeof(T) <= 4096) { - LaunchSoftmaxBackwardForLastAxis( - d_x_data, d_out_data, out_data, dim_size, outer_size, stream); - } else { - LogSoftmaxGradFunctor()( - context.template device_context(), out, - d_out, d_x, axis); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - log_softmax, ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel); -REGISTER_OP_CUDA_KERNEL( - log_softmax_grad, ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel); diff --git a/paddle/fluid/operators/log_softmax_op.h b/paddle/fluid/operators/log_softmax_op.h deleted file mode 100644 index 162087a75662d..0000000000000 --- a/paddle/fluid/operators/log_softmax_op.h +++ /dev/null @@ -1,197 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
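Review note: the backward kernels deleted above implement the standard log-softmax gradient. With \( y = \mathrm{log\_softmax}(x) \) and upstream gradient \( g \), and using \( \mathrm{softmax}(x)_i = e^{y_i} \),

\[ \frac{\partial L}{\partial x_i} = g_i - e^{y_i} \sum_j g_j, \]

which is the per-row `grad_output_register[iter] - std::exp(output_register[iter]) * sum` computed in ComputeLogSoftmaxBackwardInWarp.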
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; - -static inline int CanonicalAxis(const int axis, const int rank) { - if (axis < 0) { - return axis + rank; - } - return axis; -} - -static inline size_t SizeToAxis(const int axis, const framework::DDim dims) { - size_t size = 1; - for (int i = 0; i < axis; i++) { - size *= dims[i]; - } - return size; -} - -static inline size_t SizeFromAxis(const int axis, const framework::DDim dims) { - size_t size = 1; - for (int i = axis; i < dims.size(); i++) { - size *= dims[i]; - } - return size; -} - -template -struct ValueClip { - HOSTDEVICE T operator()(const T& x) const { - const T kThreshold = static_cast(-64.); - return x < kThreshold ? kThreshold : x; - } -}; - -template -struct LogSoftmaxFunctor { - void operator()(const DeviceContext& context, const framework::Tensor* X, - framework::Tensor* Y, const int axis) { - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - constexpr int kAxisDim = 1; - - int axis_dim = X->dims()[axis]; - const int n = SizeToAxis(axis, X->dims()); - const int d = SizeFromAxis(axis, X->dims()); - framework::DDim dim_2d{n, d}; - - auto logits = EigenMatrix::From(*X, dim_2d); - auto log_softmax = EigenMatrix::From(*Y, dim_2d); - - const int batch_size = logits.dimension(kBatchDim); - const int num_classes = logits.dimension(kClassDim); - const int num_remain = num_classes / axis_dim; - - Eigen::DSizes along_axis(kAxisDim); - Eigen::DSizes batch_classes(batch_size, num_classes); - Eigen::DSizes batch_by_one(batch_size, 1); - Eigen::DSizes one_by_class(1, num_classes); - Eigen::DSizes batch_one_remain(batch_size, 1, num_remain); - Eigen::DSizes one_axis_one(1, axis_dim, 1); - Eigen::DSizes one_axis(1, axis_dim); - Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); - - // For numerical stability, logits should be shifted by maximum number along - // axis, calculate shifted_logits into log_softmax tensor for memory reuse. 
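Review note: CanonicalAxis, SizeToAxis and SizeFromAxis above collapse the N-D input into a 2-D view around the softmax axis. A worked example for a hypothetical shape {2, 3, 4, 5} with axis = -2:

```cpp
// dims = {2, 3, 4, 5}, rank = 4, axis = -2:
//   CanonicalAxis(-2, 4)  == 2            // negative axes wrap around
//   SizeToAxis(2, dims)   == 2 * 3 == 6   // n: rows of the flattened view
//   SizeFromAxis(2, dims) == 4 * 5 == 20  // d: columns of the view
//   axis_dim == dims[2] == 4, num_remain == d / axis_dim == 5,
// so max and sum reduce over the length-4 axis for each of 6 * 5 slices.
```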
- if (num_remain == 1) { - // axis == -1, axis and class in same dimension, calculate along - // class dimension directly for higher performance - log_softmax.device(*context.eigen_device()) = - (logits - - logits.maximum(along_axis) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)) - .unaryExpr(ValueClip()); - } else { - // axis != -1, class dimension split into (axis, remain), max and sum - // should be calculated along axis dimension - log_softmax.device(*context.eigen_device()) = - (logits.reshape(batch_axis_remain) - - logits.reshape(batch_axis_remain) - .maximum(along_axis) - .eval() - .reshape(batch_one_remain) - .broadcast(one_axis_one) - .reshape(batch_classes)) - .unaryExpr(ValueClip()); - } - - log_softmax.device(*context.eigen_device()) = - log_softmax - - log_softmax.exp() - .eval() - .reshape(batch_axis_remain) - .sum(along_axis) - .log() - .broadcast(one_axis); - } -}; - -template -class LogSoftmaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Out = context.Output("Out"); - const int rank = X->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - - // allocate memory on device. - Out->mutable_data(context.GetPlace()); - - if (X->numel() != 0) { - LogSoftmaxFunctor()( - context.template device_context(), X, Out, axis); - } - } -}; - -template -struct LogSoftmaxGradFunctor { - void operator()(const DeviceContext& context, const framework::Tensor* Y, - const framework::Tensor* dY, framework::Tensor* dX, - const int axis) { - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - - const int n = SizeToAxis(axis, Y->dims()); - const int d = SizeFromAxis(axis, Y->dims()); - framework::DDim dim_2d{n, d}; - - auto y = EigenMatrix::From(*Y, dim_2d); - auto dy = EigenMatrix::From(*dY, dim_2d); - auto dx = EigenMatrix::From(*dX, dim_2d); - - const int axis_dim = Y->dims()[axis]; - const int batch_size = y.dimension(kBatchDim); - const int num_classes = y.dimension(kClassDim); - const int num_remain = num_classes / axis_dim; - - Eigen::DSizes along_class(kClassDim); - Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); - Eigen::DSizes one_axis(1, axis_dim); - - dx.device(*context.eigen_device()) = - dy - - (y.exp()) * (dy.reshape(batch_axis_remain) - .sum(along_class) - .broadcast(one_axis)); - } -}; - -template -class LogSoftmaxGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* Out = context.Input("Out"); - auto* dOut = - context.Input(framework::GradVarName("Out")); - auto* dX = context.Output(framework::GradVarName("X")); - const int rank = Out->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - - // allocate memory on device. - dX->mutable_data(context.GetPlace()); - - if (Out->numel() != 0) { - LogSoftmaxGradFunctor()( - context.template device_context(), Out, dOut, dX, - axis); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_softmax_op_npu.cc b/paddle/fluid/operators/log_softmax_op_npu.cc index 5795f1dffac78..6ce21aec9215a 100644 --- a/paddle/fluid/operators/log_softmax_op_npu.cc +++ b/paddle/fluid/operators/log_softmax_op_npu.cc @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
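Review note: the Eigen functor just deleted evaluates the numerically stable form

\[ \mathrm{log\_softmax}(x_i) = x_i - m - \log\sum_j e^{x_j - m}, \qquad m = \max_j x_j, \]

shifting by the row max so no exponential can overflow; ValueClip additionally floors the shifted logits at -64 so the later exp() stays comfortably inside float range. The warp and block CUDA kernels deleted before it compute the same three terms (max, sum, log of sum) per row.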
-#include "paddle/fluid/operators/log_softmax_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { @@ -27,7 +28,7 @@ class LogSoftmaxNPUKernel : public framework::OpKernel { auto* X = ctx.Input("X"); auto* Out = ctx.Output("Out"); const int rank = X->dims().size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); Out->mutable_data(ctx.GetPlace()); if (X->numel() != 0) { @@ -47,7 +48,7 @@ class LogSoftmaxGradNPUKernel : public framework::OpKernel { auto* dOut = ctx.Input(framework::GradVarName("Out")); auto* dX = ctx.Output(framework::GradVarName("X")); const int rank = dOut->dims().size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); // allocate memory on device. dX->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 65297abe3e49b..88d70d9bb7dae 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -221,7 +221,7 @@ class LRNOp : public framework::OperatorWithKernel { auto ar = paddle::framework::AttrReader(attrs); const std::string data_format = ar.Get("data_format"); auto dl = framework::StringToDataLayout(data_format); - // Some models may have intentionally set "AnyLayout" for pool + // Some models may have intentionally set "AnyLayout" for lrn // op. Treat this as NCHW (default data_format value) if (dl != framework::DataLayout::kAnyLayout) { return framework::OpKernelType(expected_kernel_type.data_type_, diff --git a/paddle/fluid/operators/lstsq_op.cu b/paddle/fluid/operators/lstsq_op.cu index 92c9857f0b942..10e2867bf2953 100644 --- a/paddle/fluid/operators/lstsq_op.cu +++ b/paddle/fluid/operators/lstsq_op.cu @@ -17,9 +17,11 @@ #include #include +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/lstsq_op.h" #include "paddle/fluid/operators/qr_op.h" #include "paddle/fluid/platform/dynload/cusolver.h" +#include "paddle/phi/kernels/triangular_solve_kernel.h" namespace paddle { namespace operators { @@ -70,6 +72,10 @@ class LstsqCUDAKernel : public framework::OpKernel { Tensor tau = dito.Fill(tau_dims_vec, 0); auto tau_data = tau.mutable_data(context.GetPlace()); + using Context = + typename framework::ConvertToPhiContext::TYPE; + auto& phi_dev_ctx = static_cast(dev_ctx); + if (m >= n) { Tensor tmp_x = dito.Transpose(new_x); Tensor tmp_y = dito.Transpose(new_y); @@ -93,8 +99,9 @@ class LstsqCUDAKernel : public framework::OpKernel { Tensor slice_y = dito.Slice(trans_y, {-2}, {0}, {min_mn}); // Step 3, solve R X = Y - triangular_solve(dev_ctx, res_r, slice_y, solution, - true, false, false); + phi::TriangularSolveKernel(phi_dev_ctx, res_r, slice_y, true, + false, false, solution); + } else { auto x_data = new_x.mutable_data(context.GetPlace()); auto y_data = new_y.mutable_data(context.GetPlace()); @@ -105,8 +112,8 @@ class LstsqCUDAKernel : public framework::OpKernel { // Step 2, solve R^H Z = Y Tensor trans_r = dito.Transpose(new_x); - triangular_solve(dev_ctx, trans_r, new_y, solution, - true, true, false); + phi::TriangularSolveKernel(phi_dev_ctx, trans_r, new_y, true, + true, false, solution); // Step 3, X <- Q Z BatchedOrgqr(dev_ctx, batch_count, n, n, min_mn, x_data, diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h 
index 3cbbc62e7bec9..520722dafcbea 100644 --- a/paddle/fluid/operators/lstsq_op.h +++ b/paddle/fluid/operators/lstsq_op.h @@ -22,7 +22,6 @@ #include "paddle/fluid/operators/math/matrix_solve.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/operators/triangular_solve_op.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index f323e2e041d99..2414ae68438fd 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -15,12 +15,13 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/set_value_op.h" #include "paddle/fluid/operators/svd_helper.h" -#include "paddle/fluid/operators/triangular_solve_op.h" -#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" +#include "paddle/phi/kernels/triangular_solve_kernel.h" namespace paddle { namespace operators { @@ -403,11 +404,12 @@ void LU_Unpack(const DeviceContext& dev_ctx, const framework::Tensor* LU, const auto W = udims[udims.size() - 1]; auto L_dataptr = L->mutable_data(dev_ctx.GetPlace()); platform::ForRange x_for_range(dev_ctx, LU->numel()); - TrilTriuCompute tril_computer(LU->data(), -1, true, H, W, L_dataptr); + phi::funcs::TrilTriuCompute tril_computer(LU->data(), -1, true, H, W, + L_dataptr); x_for_range(tril_computer); - TrilTriuCompute triu_computer(LU->data(), 0, false, H, W, - U->mutable_data(dev_ctx.GetPlace())); + phi::funcs::TrilTriuCompute triu_computer( + LU->data(), 0, false, H, W, U->mutable_data(dev_ctx.GetPlace())); x_for_range(triu_computer); // set L's diagonal 1 @@ -531,15 +533,15 @@ class LUGradKernel : public framework::OpKernel { auto phil_rank = LmHdims.size(); auto phiu_rank = UmHdims.size(); platform::ForRange l_for_range(dev_ctx, phi_L.numel()); - TrilTriuCompute tril_computer(phi_L.data(), -1, true, - LmHdims[phil_rank - 2], - LmHdims[phil_rank - 1], phi_L.data()); + phi::funcs::TrilTriuCompute tril_computer( + phi_L.data(), -1, true, LmHdims[phil_rank - 2], + LmHdims[phil_rank - 1], phi_L.data()); l_for_range(tril_computer); platform::ForRange u_for_range(dev_ctx, phi_U.numel()); - TrilTriuCompute triu_computer(phi_U.data(), 0, false, - UmHdims[phiu_rank - 2], - UmHdims[phiu_rank - 1], phi_U.data()); + phi::funcs::TrilTriuCompute triu_computer( + phi_U.data(), 0, false, UmHdims[phiu_rank - 2], + UmHdims[phiu_rank - 1], phi_U.data()); u_for_range(triu_computer); Tensor_Add(dev_ctx, phi_L, phi_U, &phi); @@ -555,6 +557,11 @@ class LUGradKernel : public framework::OpKernel { framework::Tensor Pmat; Unpack_Pivot(dev_ctx, *P, &Pmat, m, k); + + using Context = + typename framework::ConvertToPhiContext::TYPE; + auto& phi_dev_ctx = static_cast(dev_ctx); + if (m <= n) { if (k < n) { framework::Tensor U_complement, U_grad_complement, phi_complement, @@ -585,8 +592,9 @@ class LUGradKernel : public framework::OpKernel { const auto W = phidims[phidims.size() - 1]; platform::ForRange x_for_range(dev_ctx, phi_complement.numel()); - TrilTriuCompute tril_computer(phi_complement.data(), -1, true, H, - W, 
phi_complement_l.data()); + phi::funcs::TrilTriuCompute tril_computer( + phi_complement.data(), -1, true, H, W, + phi_complement_l.data()); x_for_range(tril_computer); Tensor_Sub(dev_ctx, phi, phi_complement_l, &phi); @@ -605,8 +613,9 @@ class LUGradKernel : public framework::OpKernel { framework::Tensor psi_principal, phi_mH, psi_tmp; Tensor_Conj(dev_ctx, phi, &phi_mH); phi_mH = helper.Transpose(phi_mH); - triangular_solve(dev_ctx, U_narrow, phi_mH, - &psi_principal, true, false, false); + + phi::TriangularSolveKernel( + phi_dev_ctx, U_narrow, phi_mH, true, false, false, &psi_principal); Tensor_Conj(dev_ctx, psi_principal, &psi_principal); psi_principal = helper.Transpose(psi_principal); @@ -620,8 +629,9 @@ class LUGradKernel : public framework::OpKernel { SetValueCompute_dispatch(ctx, &psi, &psi_principal, &psi, axes, &slice_starts, &slice_ends, valuedims, xrank); - triangular_solve(dev_ctx, L_narrow_mH, psi, &psi_tmp, - true, false, true); + + phi::TriangularSolveKernel(phi_dev_ctx, L_narrow_mH, psi, + true, false, true, &psi_tmp); auto mat_dim_p = phi::funcs::CreateMatrixDescriptor(Pmat.dims(), 0, false); @@ -656,8 +666,8 @@ class LUGradKernel : public framework::OpKernel { const auto W = phidims[phidims.size() - 1]; platform::ForRange x_for_range(dev_ctx, phi_complement.numel()); - TrilTriuCompute triu_computer(phi_complement.data(), 0, false, H, W, - phi_complement_u.data()); + phi::funcs::TrilTriuCompute triu_computer( + phi_complement.data(), 0, false, H, W, phi_complement_u.data()); x_for_range(triu_computer); Tensor_Sub(dev_ctx, phi, phi_complement_u, &phi); @@ -672,8 +682,10 @@ class LUGradKernel : public framework::OpKernel { &psi, axes, &slice_starts, &slice_ends, valuedims, xrank); framework::Tensor psi_principal, phi_mH, psi_tmp, U_narrow_mH; - triangular_solve(dev_ctx, L_narrow_mH, phi, - &psi_principal, true, false, true); + + phi::TriangularSolveKernel(phi_dev_ctx, L_narrow_mH, phi, + true, false, true, &psi_principal); + slice_starts[0] = 0; slice_starts[1] = 0; slice_ends[0] = k; @@ -695,8 +707,8 @@ class LUGradKernel : public framework::OpKernel { psi_tmp = helper.Transpose(psi_tmp); Tensor_Conj(dev_ctx, U_narrow, &U_narrow_mH); - triangular_solve(dev_ctx, U_narrow_mH, psi_tmp, &psi, - true, false, false); + phi::TriangularSolveKernel(phi_dev_ctx, U_narrow_mH, psi_tmp, + true, false, false, &psi); *dx = helper.Transpose(psi); } } diff --git a/paddle/fluid/operators/lu_unpack_op.h b/paddle/fluid/operators/lu_unpack_op.h index d2303f2c08da8..e4100867dc685 100644 --- a/paddle/fluid/operators/lu_unpack_op.h +++ b/paddle/fluid/operators/lu_unpack_op.h @@ -16,7 +16,8 @@ limitations under the License. 
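Review note: all the TrilTriuCompute call sites in the LU hunks above use the same flat-index masking, now shared via phi::funcs. A minimal sketch of that functor shape (illustrative name; the real phi::funcs::TrilTriuCompute carries the same row/column test):

```cpp
#include <cstdint>

// Keep the lower (tril) or upper (triu) triangle relative to `diagonal`,
// zero the rest; meant to be driven by a ForRange over all elements.
template <typename T>
struct TrilTriuSketch {
  const T* in;
  T* out;
  int diagonal;  // 0: main diagonal, -1: strictly below it, ...
  bool lower;    // true: tril, false: triu
  int64_t H, W;  // each batch item is an H x W matrix

  void operator()(int64_t idx) const {
    const int64_t row = (idx / W) % H;
    const int64_t col = idx % W;
    const bool keep = lower ? (col <= row + diagonal)
                            : (col >= row + diagonal);
    out[idx] = keep ? in[idx] : static_cast<T>(0);
  }
};
```

With diagonal = -1, lower = true this yields the strictly-lower L factor, and with diagonal = 0, lower = false the U factor, matching the two calls in LU_Unpack.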
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/lu_op.h" -#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" namespace paddle { namespace operators { @@ -87,7 +88,8 @@ class LU_UnpackGradKernel : public framework::OpKernel { auto W = ldims[ldims.size() - 1]; auto L_dataptr = dl_tril.mutable_data(dev_ctx.GetPlace()); platform::ForRange l_for_range(dev_ctx, dl->numel()); - TrilTriuCompute tril_computer(dl->data(), -1, true, H, W, L_dataptr); + phi::funcs::TrilTriuCompute tril_computer(dl->data(), -1, true, H, W, + L_dataptr); l_for_range(tril_computer); const auto udims = du->dims(); @@ -96,7 +98,8 @@ class LU_UnpackGradKernel : public framework::OpKernel { W = udims[udims.size() - 1]; auto U_dataptr = du_triu.mutable_data(dev_ctx.GetPlace()); platform::ForRange u_for_range(dev_ctx, du->numel()); - TrilTriuCompute triu_computer(du->data(), 0, false, H, W, U_dataptr); + phi::funcs::TrilTriuCompute triu_computer(du->data(), 0, false, H, W, + U_dataptr); u_for_range(triu_computer); auto xdims = dx->dims(); diff --git a/paddle/fluid/operators/masked_select_op.cc b/paddle/fluid/operators/masked_select_op.cc index a6eb535c693b8..1887bbcfb7efd 100644 --- a/paddle/fluid/operators/masked_select_op.cc +++ b/paddle/fluid/operators/masked_select_op.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,16 +23,6 @@ class MaskedSelectOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "Input", "MaskedSelect"); - OP_INOUT_CHECK(ctx->HasInput("Mask"), "Input", "Mask", "MaskedSelect"); - OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Out", "MaskedSelect"); - - // output will only be a 1-D Tensor - ctx->SetOutputDim("Y", phi::make_ddim({-1})); - ctx->ShareLoD("X", /*->*/ "Y"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -100,8 +92,13 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(MaskedSelectedGradNoNeedBufferVarsInferer, } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(masked_select, MaksedSelectInferShapeFunctor, + PD_INFER_META(phi::MaskedSelectInferMeta)); + REGISTER_OPERATOR(masked_select, ops::MaskedSelectOp, ops::MaskedSelectOpMaker, ops::MaskedSelectGradOpMaker, - ops::MaskedSelectGradOpMaker); + ops::MaskedSelectGradOpMaker, + MaksedSelectInferShapeFunctor); REGISTER_OPERATOR(masked_select_grad, ops::MaskedSelectOpGrad, ops::MaskedSelectedGradNoNeedBufferVarsInferer); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index d5a86d62b417c..af1069cb86799 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -20,7 +20,6 @@ math_library(sampler DEPS generator) # math_library(math_function DEPS blas dense_tensor tensor) math_library(maxouting) -math_library(pooling) if(WITH_MKLDNN) math_library(selected_rows_functor DEPS selected_rows_utils math_function blas 
mkldnn_axpy_handler) @@ -46,7 +45,6 @@ math_library(vol2col) math_library(prelu) math_library(bert_encoder_functor) math_library(tree2col DEPS math_function) -math_library(segment_pooling) math_library(matrix_solve) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index c9308d27c0a34..e1861b2f7c5ea 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -243,8 +243,6 @@ class ConcatFunctor { const int axis_t = axis; const int ins_size_t = ins_size; - auto place = context.GetPlace(); - output->mutable_data(place); // mlu should do sth // init ins tensors @@ -295,7 +293,6 @@ class SplitFunctor { std::vector desc_vector; for (size_t i = 0; i < out_size; i++) { (*outputs)[i]->Resize(outs_dims[i]); - (*outputs)[i]->mutable_data(context.GetPlace()); output_descs.emplace_back( MLUCnnlTensorDesc(*(*outputs)[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType((*outputs)[i]->dtype()))); diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 9994ccc10cb13..b77e23450360c 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -34,10 +34,10 @@ namespace paddle { namespace operators { namespace math { -template +template static void CubInclusiveScan(InputIterator x_iter, OutputIterator y_iter, - size_t n, BinaryOp op, - const platform::CUDADeviceContext &dev_ctx) { + size_t n, BinaryOp op, const Context &dev_ctx) { memory::AllocationPtr allocation; void *temp_storage = nullptr; size_t temp_storage_bytes = 0; @@ -185,11 +185,10 @@ static __global__ void InclusiveScanInnerDimCUDAKernel(const T *x, T *y, } } -template +template static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim, size_t inner_dim, T init, BinaryOp op, - bool reverse, - const platform::CUDADeviceContext &dev_ctx) { + bool reverse, const Context &dev_ctx) { constexpr size_t kThreadNumX = 16; constexpr size_t kThreadNumY = 32; @@ -209,10 +208,10 @@ static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim, } } -template +template void InclusiveScan(const T *x, T *y, size_t outer_dim, size_t mid_dim, size_t inner_dim, T init, BinaryOp op, bool reverse, - const platform::CUDADeviceContext &dev_ctx) { + const Context &dev_ctx) { if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return; if (outer_dim == 1 && inner_dim == 1) { @@ -224,8 +223,7 @@ void InclusiveScan(const T *x, T *y, size_t outer_dim, size_t mid_dim, CubInclusiveScan(x, y, mid_dim, op, dev_ctx); } } else if (inner_dim != 1) { - platform::ForRange for_range( - dev_ctx, outer_dim * inner_dim); + platform::ForRange for_range(dev_ctx, outer_dim * inner_dim); if (reverse) { for_range( InclusiveScanOuterOrMidDimFunctor( diff --git a/paddle/fluid/operators/math/matrix_solve.cc b/paddle/fluid/operators/math/matrix_solve.cc index 883ee9b148654..7b239b8166644 100644 --- a/paddle/fluid/operators/math/matrix_solve.cc +++ b/paddle/fluid/operators/math/matrix_solve.cc @@ -34,45 +34,6 @@ class MatrixSolveFunctor { template class MatrixSolveFunctor; template class MatrixSolveFunctor; -template -class TriangularSolveFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor* a, framework::Tensor* b, bool left, - bool upper, bool transpose, bool unitriangular) { - CBLAS_SIDE side = left ? 
CblasLeft : CblasRight; - CBLAS_UPLO uplo = upper ? CblasUpper : CblasLower; - CBLAS_TRANSPOSE transA = transpose ? CblasTrans : CblasNoTrans; - CBLAS_DIAG diag = unitriangular ? CblasUnit : CblasNonUnit; - - const T* a_data = a->data(); - T* b_data = b->mutable_data(context.GetPlace()); - - int a_dim_size = a->dims().size(); - int b_dim_size = b->dims().size(); - - int M = static_cast(b->dims()[b_dim_size - 2]); - int N = static_cast(b->dims()[b_dim_size - 1]); - auto lda = left ? std::max(1, M) : std::max(1, N); - auto ldb = std::max(1, N); - - int batch_size = 1; - auto& a_dim = a->dims(); - for (int i = 0; i < a_dim_size - 2; i++) { - batch_size *= a_dim[i]; - } - - auto blas = phi::funcs::GetBlas(context); - for (int i = 0; i < batch_size; i++) { - blas.TRSM(side, uplo, transA, diag, M, N, T(1), a_data + i * M * M, lda, - b_data + i * N * M, ldb); - } - } -}; - -template class TriangularSolveFunctor; -template class TriangularSolveFunctor; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_solve.cu.cc b/paddle/fluid/operators/math/matrix_solve.cu.cc index d3490ead21273..737196dde1dfc 100644 --- a/paddle/fluid/operators/math/matrix_solve.cu.cc +++ b/paddle/fluid/operators/math/matrix_solve.cu.cc @@ -161,67 +161,6 @@ class MatrixSolveFunctor { template class MatrixSolveFunctor; template class MatrixSolveFunctor; -template -class TriangularSolveFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, const Tensor* a, - Tensor* b, bool left, bool upper, bool transpose, - bool unitriangular) { - CBLAS_SIDE side = left ? CblasLeft : CblasRight; - CBLAS_UPLO uplo = upper ? CblasUpper : CblasLower; - CBLAS_TRANSPOSE transA = transpose ? CblasTrans : CblasNoTrans; - CBLAS_DIAG diag = unitriangular ? CblasUnit : CblasNonUnit; - - const T* a_data = a->data(); - T* b_data = b->mutable_data(context.GetPlace()); - - int a_dim_size = a->dims().size(); - int b_dim_size = b->dims().size(); - - int M = static_cast(b->dims()[b_dim_size - 2]); - int N = static_cast(b->dims()[b_dim_size - 1]); - auto lda = left ? std::max(1, M) : std::max(1, N); - auto ldb = std::max(1, N); - - int batch_size = 1; - auto& a_dim = a->dims(); - for (int i = 0; i < a_dim_size - 2; i++) { - batch_size *= a_dim[i]; - } - - auto blas = phi::funcs::GetBlas(context); - if (batch_size <= 8 && M >= 64) { - for (auto i = 0; i < batch_size; i++) { - blas.TRSM(side, uplo, transA, diag, M, N, static_cast(1.0), - a_data + i * M * M, lda, b_data + i * N * M, ldb); - } - } else { - std::vector cpu_ptrs(batch_size * 2); - for (int i = 0; i < batch_size; ++i) { - cpu_ptrs[i] = a_data + i * M * M; - cpu_ptrs[i + batch_size] = b_data + i * M * N; - } - - // Copy the addresses of A and tmp_b from host to device. 
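Note: the CUDA TriangularSolveFunctor being deleted here (continued below) batches many small TRSMs by handing cuBLAS a device-resident array of per-matrix pointers. A condensed sketch of that pointer-marshalling step, simplified to raw CUDA runtime calls (the deleted code uses the device context's allocator and an async stream copy instead; error checking omitted):

    #include <cuda_runtime.h>
    #include <vector>

    // Pack the addresses of A_i (m x m) and B_i (m x n) for each batch entry
    // into one host array, then mirror it on the device; batched BLAS entry
    // points consume the device pointer arrays rather than the matrices.
    void MarshalBatchPointers(const float* a, float* b, int batch, int m, int n,
                              const float*** d_a_ptrs, float*** d_b_ptrs) {
      std::vector<const float*> host_ptrs(2 * batch);
      for (int i = 0; i < batch; ++i) {
        host_ptrs[i] = a + static_cast<size_t>(i) * m * m;
        host_ptrs[batch + i] = b + static_cast<size_t>(i) * m * n;
      }
      void* dev = nullptr;
      cudaMalloc(&dev, host_ptrs.size() * sizeof(float*));
      cudaMemcpy(dev, host_ptrs.data(), host_ptrs.size() * sizeof(float*),
                 cudaMemcpyHostToDevice);
      *d_a_ptrs = reinterpret_cast<const float**>(dev);
      *d_b_ptrs = reinterpret_cast<float**>(dev) + batch;
    }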
- memory::allocation::AllocationPtr tmp_gpu_ptrs_data = - memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); - memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(), - platform::CPUPlace(), static_cast(cpu_ptrs.data()), - cpu_ptrs.size() * sizeof(T*), context.stream()); - - const T** gpu_a_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()); - T** gpu_b_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; - blas.BatchedTRSM(side, uplo, transA, diag, M, N, static_cast(1.0), - gpu_a_ptrs, lda, gpu_b_ptrs, ldb, batch_size); - } - } -}; - -template class TriangularSolveFunctor; -template class TriangularSolveFunctor; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_solve.h b/paddle/fluid/operators/math/matrix_solve.h index 1dc43205592f6..415d0c6dd8e0c 100644 --- a/paddle/fluid/operators/math/matrix_solve.h +++ b/paddle/fluid/operators/math/matrix_solve.h @@ -117,14 +117,6 @@ class MatrixSolveFunctor { const framework::Tensor& b, framework::Tensor* out); }; -template -class TriangularSolveFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor* a, - framework::Tensor* b, bool left, bool upper, bool transpose, - bool unitriangular); -}; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h deleted file mode 100644 index dfd3dad38644b..0000000000000 --- a/paddle/fluid/operators/math/pooling.h +++ /dev/null @@ -1,315 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { -namespace math { - -/* - * \brief Extracting simple operations from pooling. - * Both MaxPool and AvgPool need "initial", "compute" and "finalize" - * operation. - * MaxPool initializes temp variable to the negative maximum to find the - * maximum value in the pooling field. - * AvgPool initializes temp variable to the zero to accumulate all values - * in pool pooling, and finally takes the average. - * MaxPoolGrad and AvgPoolGrad are gradient operations respectively. - */ -template -class MaxPool { - public: - DEVICE inline T initial() { return static_cast(-FLT_MAX); } - DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? 
*y : x; } - DEVICE inline void finalize(const T& pool_field, T* y) {} -}; - -template -class AvgPool { - using MT = typename details::MPTypeTrait::Type; - MT intermediate_res; - - public: - DEVICE inline T initial() { - intermediate_res = static_cast(0.0f); - return static_cast(0); - } - - DEVICE inline void compute(const T& x, T* y) { - intermediate_res += static_cast(x); - } - - DEVICE inline void finalize(const T& pool_field, T* y) { - *y = static_cast(intermediate_res / (static_cast(pool_field))); - } -}; - -template -class MaxPoolGrad { - public: - static constexpr bool use_x = true; - HOSTDEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, - T* dx) { - *dx += dy * static_cast(x == y); - } -}; - -template -class AvgPoolGrad { - public: - static constexpr bool use_x = false; - HOSTDEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, - T* dx) { - *dx += (scale * dy); - } -}; - -/* used for adaptive pool to calculate start and end index of each divided grid - */ -HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) { - return static_cast( - floor(static_cast(ph * input_size) / output_size)); -} - -HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) { - return static_cast( - ceil(static_cast((ph + 1) * input_size) / output_size)); -} - -/* - * \brief Getting pooling results, and calculating gradient. - * - * In pool2d, all Tensors are in NCHW or NHWC format. Where N is batch size, C - * is the number of channels, H and W is the height and width of feature. - * In pool3d, all Tensors are in NCDHW or NDHWC format. Where N is batch size, C - * is the number of channels, D, H and W is the depth, height and width of - * feature. - * - * In max pooling, it is possible that the pooling region has multiple maximum - * elements. In this case, we should compute the gradient of the first maximum - * element. - * This is different from average pooling. So we rewrite the max_pool_grad: - * MaxPool2dGradFunctor, MaxPool3dGradFunctor. 
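Note: the header being deleted here documents the pooling-process protocol its functors share: initial() yields the reduction identity, compute() folds in one element, and finalize() post-processes (the averaging division for AvgPool, a no-op for MaxPool). A standalone host-only sketch of the protocol in use (the real classes are templated and device-aware):

    #include <cfloat>
    #include <vector>

    struct MaxPoolSketch {
      float initial() const { return -FLT_MAX; }  // identity element for max
      void compute(float x, float* y) const { *y = *y > x ? *y : x; }
      void finalize(float pool_field, float* y) const {}  // nothing to do
    };

    // Reduce one pooling window with the three-phase protocol.
    float PoolWindow(const std::vector<float>& window) {
      MaxPoolSketch op;
      float y = op.initial();
      for (float x : window) op.compute(x, &y);
      op.finalize(static_cast<float>(window.size()), &y);
      return y;
    }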
- */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class Pool2dDirectCUDAFunctor { - public: - void operator()(const T* input, const std::vector& input_shape, - const std::vector& output_shape, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, T* output, gpuStream_t stream, - PoolProcess pool_compute); -}; -#endif - -template -class Pool2dFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, - PoolProcess pool_compute); - - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_compute); -}; - -template -class Pool2dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* input_grad, - PoolProcess pool_compute); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_compute); -}; - -template -class MaxPool2dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - framework::Tensor* input_grad); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, framework::Tensor* input_grad); -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class Pool3dDirectCUDAFunctor { - public: - void operator()(const T* input, const std::vector& input_shape, - const std::vector& output_shape, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, T* output, gpuStream_t stream, - PoolProcess pool_compute); -}; -#endif - -template -class Pool3dFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, - PoolProcess pool_compute); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, 
bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_compute); -}; - -template -class Pool3dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* input_grad, - PoolProcess pool_compute); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_compute); -}; - -template -class MaxPool3dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - framework::Tensor* input_grad); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, framework::Tensor* input_grad); -}; - -/* - * \brief Getting max pooling results and corresponding max index, and - * calculating gradient. - * In up-sampling-pooling, it is necessary to know max element index. - * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in - * NCDHW format. 
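Note: among the helpers in this deleted header, AdaptStartIndex and AdaptEndIndex split an input axis into output_size nearly equal bins for adaptive pooling: bin ph covers [floor(ph*in/out), ceil((ph+1)*in/out)). A worked standalone copy: for in = 10, out = 4 the bins are [0,3), [2,5), [5,8), [7,10), so neighboring bins may overlap by design:

    #include <cmath>

    inline int AdaptStart(int ph, int in, int out) {
      return static_cast<int>(std::floor(static_cast<double>(ph * in) / out));
    }
    inline int AdaptEnd(int ph, int in, int out) {
      return static_cast<int>(std::ceil(static_cast<double>((ph + 1) * in) / out));
    }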
- */ -template -class MaxPool2dWithIndexFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask); -}; - -template -class MaxPool2dWithIndexGradFunctor { - public: - void operator()(const DeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad); -}; - -template -class MaxPool3dWithIndexFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask); -}; - -template -class MaxPool3dWithIndexGradFunctor { - public: - void operator()(const DeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad); -}; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/matmul_v2_op_xpu.cc b/paddle/fluid/operators/matmul_v2_op_xpu.cc index 1524a50f1ac6d..87df75ac46504 100644 --- a/paddle/fluid/operators/matmul_v2_op_xpu.cc +++ b/paddle/fluid/operators/matmul_v2_op_xpu.cc @@ -38,7 +38,7 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out, auto mat_dim_b = phi::funcs::CreateMatrixDescriptor( ColumnMatrixFromVector(y_dims), 0, trans_y); - if (x_dims.size() == 3 && y_dims.size() <= 2) { + if (x_dims.size() >= 3 && y_dims.size() <= 2) { // if transpose_X is true, the transpose cost much time if (!trans_x) { mat_dim_a.height_ *= mat_dim_a.batch_size_; diff --git a/paddle/fluid/operators/matrix_power_op.cc b/paddle/fluid/operators/matrix_power_op.cc index c65af3129f364..56f65340ea999 100644 --- a/paddle/fluid/operators/matrix_power_op.cc +++ b/paddle/fluid/operators/matrix_power_op.cc @@ -12,7 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/matrix_power_op.h" +#include +#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,26 +26,6 @@ namespace operators { class MatrixPowerOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "matrix_power"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "matrix_power"); - auto dims = ctx->GetInputDim("X"); - auto n_dim = dims.size(); - PADDLE_ENFORCE_GE(n_dim, 2, - platform::errors::InvalidArgument( - "The Input(X) should have at least 2 dimensions. 
But " - "received a %d dimension tensor.", - n_dim)); - PADDLE_ENFORCE_EQ(dims[n_dim - 2], dims[n_dim - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) all should " - "be square matrices " - "But received X's shape[-2] = %d and shape[-1] = %d.", - dims[n_dim - 2], dims[n_dim - 1])); - ctx->SetOutputDim("Out", dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class MatrixPowerOpMaker : public framework::OpProtoAndCheckerMaker { @@ -113,19 +99,14 @@ class MatrixPowerGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(matrix_power, MatrixPowerInferShapeFunctor, + PD_INFER_META(phi::MatrixPowerInferMeta)); + REGISTER_OPERATOR(matrix_power, ops::MatrixPowerOp, ops::MatrixPowerOpMaker, ops::MatrixPowerOpInferVarType, ops::MatrixPowerGradOpMaker, - ops::MatrixPowerGradOpMaker); + ops::MatrixPowerGradOpMaker, + MatrixPowerInferShapeFunctor); REGISTER_OPERATOR(matrix_power_grad, ops::MatrixPowerGradOp); - -REGISTER_OP_CPU_KERNEL( - matrix_power, - ops::MatrixPowerKernel, - ops::MatrixPowerKernel); - -REGISTER_OP_CPU_KERNEL( - matrix_power_grad, - ops::MatrixPowerGradKernel, - ops::MatrixPowerGradKernel); diff --git a/paddle/fluid/operators/matrix_power_op.cu b/paddle/fluid/operators/matrix_power_op.cu deleted file mode 100644 index d972e9499dc88..0000000000000 --- a/paddle/fluid/operators/matrix_power_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/matrix_power_op.h" - -namespace ops = paddle::operators; -namespace plf = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(matrix_power, - ops::MatrixPowerKernel, - ops::MatrixPowerKernel); - -REGISTER_OP_CUDA_KERNEL( - matrix_power_grad, - ops::MatrixPowerGradKernel, - ops::MatrixPowerGradKernel); diff --git a/paddle/fluid/operators/matrix_power_op.h b/paddle/fluid/operators/matrix_power_op.h deleted file mode 100644 index 8eb9c58513df6..0000000000000 --- a/paddle/fluid/operators/matrix_power_op.h +++ /dev/null @@ -1,277 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/matrix_inverse.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct IdentityMatrixFunctor { - IdentityMatrixFunctor(const int m, T* output) : m_(m), output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int row = index / m_ % m_; - const int col = index % m_; - output_[index] = col == row ? static_cast(1) : static_cast(0); - } - - const int m_; - T* output_; -}; - -template -void MatrixPowerFunction(const Tensor* X, const int n, Tensor* Out, - const paddle::framework::ExecutionContext& ctx) { - const auto& x_dims = X->dims(); - const int x_ndim = x_dims.size(); - T* out_data = Out->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, X->numel()); - - if (n == 0) { - // Out = Identity Matrix - IdentityMatrixFunctor functor(x_dims[x_ndim - 1], out_data); - for_range(functor); - return; - } - - auto blas = phi::funcs::GetBlas(dev_ctx); - - Tensor new_x = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - int new_n = n; - if (n > 0) { - // newX = X - framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x); - } else { - // newX = X^{-1}, n = -n - phi::funcs::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *X, &new_x); - new_n = -n; - } - - if (new_n == 1) { - framework::TensorCopy(new_x, ctx.GetPlace(), dev_ctx, Out); - return; - } - - auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, false); - - if (new_n == 2) { - // Out = newX * newX - Out->mutable_data(ctx.GetPlace()); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } else if (new_n == 3) { - // Out = (newX * newX) * newX - // Note: C[i] matrices in MatMul must not overlap, i.e. the individual - // gemm operations must be computable independently; otherwise, - // undefined behavior is expected. 
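Note: MatrixPowerFunction above special-cases |n| <= 4; the continuation below handles larger exponents by exponentiation-by-squaring in O(log n) matrix multiplies, tracking whether the accumulator has been initialized rather than starting from an identity matrix. A self-contained sketch of the same scheme on a single 2x2 matrix (MatMul stands in for blas.MatMul on batched tensors):

    #include <array>

    using Mat2 = std::array<std::array<double, 2>, 2>;

    Mat2 MatMul(const Mat2& a, const Mat2& b) {
      Mat2 c{};  // zero-initialized accumulator
      for (int i = 0; i < 2; ++i)
        for (int j = 0; j < 2; ++j)
          for (int k = 0; k < 2; ++k) c[i][j] += a[i][k] * b[k][j];
      return c;
    }

    Mat2 MatPow(Mat2 base, int n) {  // requires n >= 1
      Mat2 result{};
      bool inited = false;
      while (n > 0) {
        if (n & 1) {  // this bit of n contributes base^(2^i) to the product
          result = inited ? MatMul(result, base) : base;
          inited = true;
        }
        base = MatMul(base, base);  // square for the next bit
        n >>= 1;
      }
      return result;
    }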
- Tensor temp = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - &temp, static_cast(0)); - blas.MatMul(temp, no_trans_desc, new_x, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } else if (new_n == 4) { - // Out = (newX * newX) * (newX * newX) - Tensor temp = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - &temp, static_cast(0)); - blas.MatMul(temp, no_trans_desc, temp, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } - - // Calculate Out = newX^{n} for abs(n) > 4 with time complexity as O(logN) - int bit = 0; - Tensor z = Tensor(X->dtype()); - bool out_inited = false; - Tensor temp_out = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - Tensor temp_z = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - while (new_n > 0) { - bit = new_n & 0x1; - new_n >>= 1; - if (z.IsInitialized()) { - blas.MatMul(z, no_trans_desc, z, no_trans_desc, static_cast(1), - &temp_z, static_cast(0)); - framework::TensorCopy(temp_z, ctx.GetPlace(), dev_ctx, &z); - } else { - z = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - framework::TensorCopy(new_x, ctx.GetPlace(), dev_ctx, &z); - } - if (bit == 1) { - if (out_inited == true) { - blas.MatMul(*Out, no_trans_desc, z, no_trans_desc, static_cast(1), - &temp_out, static_cast(0)); - framework::TensorCopy(temp_out, ctx.GetPlace(), dev_ctx, Out); - } else { - framework::TensorCopy(z, ctx.GetPlace(), dev_ctx, Out); - out_inited = true; - } - } - } - return; -} - -template -class MatrixPowerKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - const Tensor* X = ctx.Input("X"); - Tensor* Out = ctx.Output("Out"); - int n = ctx.Attr("n"); - - const auto& x_dims = X->dims(); - const int x_ndim = x_dims.size(); - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 2], x_dims[x_ndim - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) should be equal." 
- "X's shape[-2] = %d and shape[-1] = %d.", - x_dims[x_ndim - 2], x_dims[x_ndim - 1])); - - MatrixPowerFunction(X, n, Out, ctx); - } -}; - -template -void MatrixPowerGradFunction(const Tensor* X, const Tensor* Out, - const Tensor* dOut, const int n, Tensor* dX, - const paddle::framework::ExecutionContext& ctx) { - dX->mutable_data(ctx.GetPlace()); - const auto& x_dims = X->dims(); - - auto& dev_ctx = ctx.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - if (n == 0) { - // \nabla X = O - phi::funcs::SetConstant zero; - zero(dev_ctx, dX, static_cast(0)); - return; - } else if (n == 1) { - // \nabla X = \nabla Out - framework::TensorCopy(*dOut, ctx.GetPlace(), dev_ctx, dX); - return; - } - - auto trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, true); - auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, false); - - if (n == -1) { - // \nabla X = Out^{T} * \nabla Out * Out^{T} - Tensor temp_dx = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*Out, trans_desc, *dOut, no_trans_desc, static_cast(-1), - &temp_dx, static_cast(0)); - blas.MatMul(temp_dx, no_trans_desc, *Out, trans_desc, static_cast(1), dX, - static_cast(0)); - return; - } - - Tensor new_x = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - int new_n = n; - if (n > 0) { - // newX = X - framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x); - } else { - // newX = X^{-1}, n = -n - phi::funcs::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *X, &new_x); - new_n = -n; - } - - // Use chain rule blow to compute \nabla newX^{n} - // First, Get newX^{0}, newX^{1}, ..., newX^{n - 1}, - // Note that newX^{0} can be omitted - std::vector> tensor_list(new_n - 1); - tensor_list[0] = std::make_shared(new_x); - int index = 1; - while (index < new_n - 1) { - tensor_list[index] = std::make_shared( - ctx.AllocateTmpTensor(X->dims(), dev_ctx)); - blas.MatMul(*tensor_list[index - 1], no_trans_desc, new_x, no_trans_desc, - static_cast(1), tensor_list[index].get(), static_cast(0)); - index++; - } - - // Second, \nabla newX = \sum_{i = 0}^{n - 1} (newX^{T}^{i} - // * \nabla Out - // * (newX^{T}^{n - i - 1}) - Tensor dx_new = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*tensor_list[new_n - 2], trans_desc, *dOut, no_trans_desc, - static_cast(1), &dx_new, static_cast(0)); - Tensor da_an_minus1 = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*dOut, no_trans_desc, *tensor_list[new_n - 2], trans_desc, - static_cast(1), &da_an_minus1, static_cast(0)); - blas.AXPY(X->numel(), static_cast(1), da_an_minus1.data(), - dx_new.data()); - int start = 0; - while (start < new_n - 2) { - Tensor a_da = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - Tensor a_da_a = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*tensor_list[start], trans_desc, *dOut, no_trans_desc, - static_cast(1), &a_da, static_cast(0)); - blas.MatMul(a_da, no_trans_desc, *tensor_list[new_n - 3 - start], - trans_desc, static_cast(1), &a_da_a, static_cast(0)); - blas.AXPY(X->numel(), static_cast(1), a_da_a.data(), - dx_new.data()); - start++; - } - - if (n > 0) { - // \nabla X = \nabla newX - framework::TensorCopy(dx_new, ctx.GetPlace(), dev_ctx, dX); - } else { - // \nabla X = newX^{T} * \nabla newX * newX^{T} - Tensor temp_dx = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, trans_desc, dx_new, no_trans_desc, static_cast(-1), - &temp_dx, static_cast(0)); - blas.MatMul(temp_dx, no_trans_desc, new_x, trans_desc, static_cast(1), - dX, static_cast(0)); - } - return; -} - -template -class 
MatrixPowerGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* X = ctx.Input("X"); - const Tensor* Out = ctx.Input("Out"); - const Tensor* dOut = ctx.Input(framework::GradVarName("Out")); - const int n = ctx.Attr("n"); - Tensor* dX = ctx.Output(framework::GradVarName("X")); - - MatrixPowerGradFunction(X, Out, dOut, n, dX, ctx); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/matrix_rank_op.cc b/paddle/fluid/operators/matrix_rank_op.cc index 1f04875c2203b..e7d08b6597360 100644 --- a/paddle/fluid/operators/matrix_rank_op.cc +++ b/paddle/fluid/operators/matrix_rank_op.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/matrix_rank_op.h" #include #include #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" @@ -70,9 +69,9 @@ class MatrixRankeOp : public framework::OperatorWithKernel { std::vector x_batch_dims_array(max_dim); std::vector tol_dims_array(max_dim); std::vector out_dims_array(max_dim); - GetBroadcastDimsArrays(dim_x_batch, dim_tol, x_batch_dims_array.data(), - tol_dims_array.data(), out_dims_array.data(), - max_dim, axis); + phi::funcs::GetBroadcastDimsArrays( + dim_x_batch, dim_tol, x_batch_dims_array.data(), + tol_dims_array.data(), out_dims_array.data(), max_dim, axis); ctx->SetOutputDim("Out", phi::make_ddim(out_dims_array)); } } else { @@ -115,141 +114,9 @@ class MatrixRankeOpMaker : public framework::OpProtoAndCheckerMaker { } }; -template -void BatchEigenvalues(const T* x_data, T* eigenvalues_data, int batches, - int rows, int cols, int k) { - // Eigen::Matrix API need non-const pointer. - T* input = const_cast(x_data); - int stride = rows * cols; - for (int i = 0; i < batches; i++) { - auto m = Eigen::Map< - Eigen::Matrix>( - input + i * stride, rows, rows); - Eigen::SelfAdjointEigenSolver< - Eigen::Matrix> - eigen_solver(m); - auto eigenvalues = eigen_solver.eigenvalues().cwiseAbs(); - for (int j = 0; j < k; j++) { - *(eigenvalues_data + i * k + j) = eigenvalues[j]; - } - } -} - -template -void BatchSVD(const T* x_data, T* eigenvalues_data, int batches, int rows, - int cols, int k) { - // Eigen::Matrix API need non-const pointer. 
- T* input = const_cast(x_data); - int stride = rows * cols; - Eigen::BDCSVD< - Eigen::Matrix> - svd; - for (int i = 0; i < batches; i++) { - auto m = Eigen::Map< - Eigen::Matrix>( - input + i * stride, rows, cols); - svd.compute(m); - auto res_s = svd.singularValues(); - for (int j = 0; j < k; j++) { - eigenvalues_data[i * k + j] = res_s[j]; - } - } -} - -template -class MatrixRankCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - auto* x_data = x->data(); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - bool hermitian = context.Attr("hermitian"); - - auto dim_x = x->dims(); - auto dim_out = out->dims(); - int rows = dim_x[dim_x.size() - 2]; - int cols = dim_x[dim_x.size() - 1]; - int k = std::min(rows, cols); - auto numel = x->numel(); - int batches = numel / (rows * cols); - - bool use_default_tol = context.Attr("use_default_tol"); - const Tensor* atol_tensor = nullptr; - Tensor temp_tensor; - T rtol_T = 0; - if (use_default_tol) { - framework::TensorFromVector(std::vector{0}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - rtol_T = std::numeric_limits::epsilon() * std::max(rows, cols); - } else if (context.HasInput("TolTensor")) { - atol_tensor = context.Input("TolTensor"); - } else { - framework::TensorFromVector(std::vector{context.Attr("tol")}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - } - - Tensor eigenvalue_tensor; - auto* eigenvalue_data = eigenvalue_tensor.mutable_data( - detail::GetEigenvalueDim(dim_x, k), context.GetPlace()); - if (hermitian) { - BatchEigenvalues(x_data, eigenvalue_data, batches, rows, cols, k); - } else { - BatchSVD(x_data, eigenvalue_data, batches, rows, cols, k); - } - - auto dito_T = - math::DeviceIndependenceTensorOperations( - context); - std::vector max_eigenvalue_shape = - phi::vectorize(detail::RemoveLastDim(eigenvalue_tensor.dims())); - Tensor max_eigenvalue_tensor = - dito_T.ReduceMax(eigenvalue_tensor, max_eigenvalue_shape); - - Tensor temp_rtol_tensor; - framework::TensorFromVector(std::vector{rtol_T}, &temp_rtol_tensor); - Tensor rtol_tensor = dito_T.Mul(temp_rtol_tensor, max_eigenvalue_tensor); - Tensor tol_tensor; - tol_tensor.mutable_data(dim_out, context.GetPlace()); - ElementwiseComputeEx, platform::CPUDeviceContext, - T, T>(context, atol_tensor, &rtol_tensor, -1, - GreaterElementFunctor(), &tol_tensor); - - tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1)); - - Tensor compare_result; - compare_result.mutable_data(detail::NewAxisDim(dim_out, k), - context.GetPlace()); - - int axis = -1; - if (eigenvalue_tensor.dims().size() >= tol_tensor.dims().size()) { - ElementwiseComputeEx, - platform::CPUDeviceContext, T, int>( - context, &eigenvalue_tensor, &tol_tensor, axis, - phi::funcs::GreaterThanFunctor(), &compare_result); - } else { - ElementwiseComputeEx, - platform::CPUDeviceContext, T, int>( - context, &eigenvalue_tensor, &tol_tensor, axis, - phi::funcs::LessThanFunctor(), &compare_result); - } - auto dito_int = - math::DeviceIndependenceTensorOperations(context); - std::vector result_shape = phi::vectorize(dim_out); - Tensor result = dito_int.ReduceSum(compare_result, result_shape); - out->ShareDataWith(result); - } -}; - } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(matrix_rank, ops::MatrixRankeOp, ops::MatrixRankeOpMaker); - -REGISTER_OP_CPU_KERNEL(matrix_rank, 
ops::MatrixRankCPUKernel, - ops::MatrixRankCPUKernel); diff --git a/paddle/fluid/operators/matrix_rank_op.cu b/paddle/fluid/operators/matrix_rank_op.cu deleted file mode 100644 index dccd716022d2a..0000000000000 --- a/paddle/fluid/operators/matrix_rank_op.cu +++ /dev/null @@ -1,316 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef PADDLE_WITH_HIP -// HIP not support cusolver -#include -#include -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/matrix_rank_op.h" -#include "paddle/fluid/operators/svd_helper.h" -#include "paddle/fluid/platform/dynload/cusolver.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/compare_functors.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { -namespace detail { -DDim GetUDDim(const DDim& x_dim, int k) { - auto x_vec = phi::vectorize(x_dim); - x_vec[x_vec.size() - 1] = k; - return phi::make_ddim(x_vec); -} - -DDim GetVHDDim(const DDim& x_dim, int k) { - auto x_vec = phi::vectorize(x_dim); - x_vec[x_vec.size() - 2] = k; - return phi::make_ddim(x_vec); -} -} // namespace detail - -template -class MatrixRankGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = - context.template device_context(); - - const Tensor* x = context.Input("X"); - auto* x_data = x->data(); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - bool hermitian = context.Attr("hermitian"); - - auto dim_x = x->dims(); - auto dim_out = out->dims(); - int rows = dim_x[dim_x.size() - 2]; - int cols = dim_x[dim_x.size() - 1]; - int k = std::min(rows, cols); - auto numel = x->numel(); - int batches = numel / (rows * cols); - - bool use_default_tol = context.Attr("use_default_tol"); - const Tensor* atol_tensor = nullptr; - Tensor temp_tensor; - T rtol_T = 0; - if (use_default_tol) { - framework::TensorFromVector(std::vector{0}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - rtol_T = std::numeric_limits::epsilon() * std::max(rows, cols); - } else if (context.HasInput("TolTensor")) { - atol_tensor = context.Input("TolTensor"); - } else { - framework::TensorFromVector(std::vector{context.Attr("tol")}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - } - - // Must Copy X once, because the gesvdj will destory the content when exit. 
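Note: the CPU kernel deleted above and this GPU kernel share one tolerance rule, visible in both bodies: with default tolerances, atol = 0 and rtol = eps * max(rows, cols); the threshold is max(atol, rtol * sigma_max), and the rank is the count of singular values (absolute eigenvalues when hermitian = true) above it. A standalone sketch of that final counting step:

    #include <algorithm>
    #include <vector>

    // sigma: singular values of one matrix in the batch (assumed non-empty).
    int RankFromSingularValues(const std::vector<double>& sigma, double atol,
                               double rtol) {
      const double sigma_max = *std::max_element(sigma.begin(), sigma.end());
      const double tol = std::max(atol, rtol * sigma_max);
      int rank = 0;
      for (double s : sigma) rank += (s > tol) ? 1 : 0;
      return rank;
    }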
- Tensor x_tmp; - paddle::framework::TensorCopy(*x, context.GetPlace(), &x_tmp); - auto info = memory::Alloc(dev_ctx, sizeof(int) * batches); - int* info_ptr = reinterpret_cast(info->ptr()); - - Tensor eigenvalue_tensor; - auto* eigenvalue_data = eigenvalue_tensor.mutable_data( - detail::GetEigenvalueDim(dim_x, k), context.GetPlace()); - if (hermitian) { - SyevjBatched(dev_ctx, batches, rows, x_tmp.data(), eigenvalue_data, - info_ptr); - platform::ForRange for_range( - dev_ctx, eigenvalue_tensor.numel()); - phi::funcs::AbsFunctor functor(eigenvalue_data, eigenvalue_data, - eigenvalue_tensor.numel()); - for_range(functor); - } else { - Tensor U, VH; - auto* u_data = - U.mutable_data(detail::GetUDDim(dim_x, k), context.GetPlace()); - auto* vh_data = - VH.mutable_data(detail::GetVHDDim(dim_x, k), context.GetPlace()); - GesvdjBatched(dev_ctx, batches, cols, rows, k, x_tmp.data(), vh_data, - u_data, eigenvalue_data, info_ptr, 1); - } - - auto dito_T = - math::DeviceIndependenceTensorOperations(context); - std::vector max_eigenvalue_shape = - phi::vectorize(detail::RemoveLastDim(eigenvalue_tensor.dims())); - Tensor max_eigenvalue_tensor = - dito_T.ReduceMax(eigenvalue_tensor, max_eigenvalue_shape); - Tensor temp_rtol_tensor; - framework::TensorFromVector(std::vector{rtol_T}, - context.device_context(), &temp_rtol_tensor); - Tensor rtol_tensor = dito_T.Mul(temp_rtol_tensor, max_eigenvalue_tensor); - Tensor tol_tensor; - tol_tensor.mutable_data(dim_out, context.GetPlace()); - ElementwiseComputeEx, platform::CUDADeviceContext, - T, T>(context, atol_tensor, &rtol_tensor, -1, - GreaterElementFunctor(), &tol_tensor); - - tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1)); - - Tensor compare_result; - compare_result.mutable_data(detail::NewAxisDim(dim_out, k), - context.GetPlace()); - int axis = -1; - ElementwiseComputeEx, - platform::CUDADeviceContext, T, int64_t>( - context, &eigenvalue_tensor, &tol_tensor, axis, - phi::funcs::GreaterThanFunctor(), &compare_result); - auto dito_int = - math::DeviceIndependenceTensorOperations(context); - std::vector result_shape = phi::vectorize(dim_out); - Tensor result = dito_int.ReduceSum(compare_result, result_shape); - out->ShareDataWith(result); - } - - void GesvdjBatched(const platform::CUDADeviceContext& dev_ctx, int batchSize, - int m, int n, int k, T* A, T* U, T* V, T* S, int* info, - int thin_UV = 1) const; - - void SyevjBatched(const platform::CUDADeviceContext& dev_ctx, int batchSize, - int n, T* A, T* W, int* info) const; -}; - -template <> -void MatrixRankGPUKernel::GesvdjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int m, int n, - int k, float* A, float* U, float* V, float* S, int* info, - int thin_UV) const { - // do not compute singular vectors - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - gesvdjInfo_t gesvdj_params = NULL; - int lda = m; - int ldu = m; - int ldt = n; - int lwork = 0; - auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj_bufferSize( - handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, - gesvdj_params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); - float* workspace_ptr = reinterpret_cast(workspace->ptr()); - int stride_A = lda * n; - int stride_U = ldu * (thin_UV ? k : m); - int stride_V = ldt * (thin_UV ? 
k : n); - for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj( - handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, - U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, - info, gesvdj_params)); - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); -} - -template <> -void MatrixRankGPUKernel::GesvdjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int m, int n, - int k, double* A, double* U, double* V, double* S, int* info, - int thin_UV) const { - // do not compute singular vectors - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - gesvdjInfo_t gesvdj_params = NULL; - int lda = m; - int ldu = m; - int ldt = n; - int lwork = 0; - auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj_bufferSize( - handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, - gesvdj_params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); - double* workspace_ptr = reinterpret_cast(workspace->ptr()); - int stride_A = lda * n; - int stride_U = ldu * (thin_UV ? k : m); - int stride_V = ldt * (thin_UV ? k : n); - for (int i = 0; i < batchSize; ++i) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj( - handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, - U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, - info, gesvdj_params)); - // check the error info - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); -} - -template <> -void MatrixRankGPUKernel::SyevjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int n, float* A, - float* W, int* info) const { - auto handle = dev_ctx.cusolver_dn_handle(); - // Compute eigenvalues only - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - // matrix is saved as column-major in cusolver. 
- // numpy and torch use lower triangle to compute eigenvalues, so here use - // upper triangle - cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; - int lda = n; - int stride_A = lda * n; - int lwork = 0; - syevjInfo_t params = NULL; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateSyevjInfo(¶ms)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj_bufferSize( - handle, jobz, uplo, n, A, lda, W, &lwork, params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); - float* workspace_ptr = reinterpret_cast(workspace->ptr()); - for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj( - handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr, - lwork, info, params)); - - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver eigenvalues is not zero. [%d]", i, - error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroySyevjInfo(params)); -} - -template <> -void MatrixRankGPUKernel::SyevjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int n, double* A, - double* W, int* info) const { - auto handle = dev_ctx.cusolver_dn_handle(); - // Compute eigenvalues only - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - // upper triangle of A is stored - cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; - int lda = n; - int stride_A = lda * n; - int lwork = 0; - syevjInfo_t params = NULL; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateSyevjInfo(¶ms)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDsyevj_bufferSize( - handle, jobz, uplo, n, A, lda, W, &lwork, params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); - double* workspace_ptr = reinterpret_cast(workspace->ptr()); - - for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDsyevj( - handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr, - lwork, info, params)); - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver eigenvalues is not zero. [%d]", i, - error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroySyevjInfo(params)); -} - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(matrix_rank, ops::MatrixRankGPUKernel, - ops::MatrixRankGPUKernel); -#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc index 056620db5b966..32ef052119883 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cc +++ b/paddle/fluid/operators/metrics/accuracy_op.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -21,69 +23,6 @@ class AccuracyOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Out"), true, - platform::errors::NotFound("Input (Out) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Indices"), true, - platform::errors::NotFound( - "Input (Indices) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Label"), true, - platform::errors::NotFound( - "Input (Label) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Accuracy"), true, - platform::errors::NotFound( - "Output (Accuracy) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Correct"), true, - platform::errors::NotFound( - "Output (Correct) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Total"), true, - platform::errors::NotFound( - "Output (Total) of AccuracyOp is not found.")); - - OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "Accuracy"); - OP_INOUT_CHECK(ctx->HasInput("Indices"), "Input", "Indices", "Accuracy"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "Accuracy"); - OP_INOUT_CHECK(ctx->HasOutput("Accuracy"), "Output", "Accuracy", - "Accuracy"); - OP_INOUT_CHECK(ctx->HasOutput("Correct"), "Output", "Correct", "Accuracy"); - OP_INOUT_CHECK(ctx->HasOutput("Total"), "Output", "Total", "Accuracy"); - - auto inference_dim = ctx->GetInputDim("Out"); - auto label_dim = ctx->GetInputDim("Label"); - // Assume indices has same shape as inference, because - // it's the output of topk. - - PADDLE_ENFORCE_EQ( - label_dim.size(), 2, - platform::errors::InvalidArgument( - "ShapeError: label's dimensions of AccuracyOp must be 2. " - "But received label's dimensions = %d, label's shape = [%s]", - label_dim.size(), label_dim)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(label_dim[1], 1, - platform::errors::InvalidArgument( - "ShapeError: label's second dimension of " - "AccuracyOp must be 1. But received label's " - "second dimension is = %d, label's shape = [%s]", - label_dim[1], label_dim)); - PADDLE_ENFORCE_EQ( - inference_dim[0], label_dim[0], - platform::errors::InvalidArgument( - "ShapeError: the output's num_rows of AccuracyOp must be" - " the same as label's num_rows. But received output's " - "shape = [%s], label's shape = [%s], output's num_rows = %d, " - "label's " - "num_rows = %d", - inference_dim, label_dim, inference_dim[0], label_dim[0])); - } - - ctx->SetOutputDim("Accuracy", {1}); - ctx->SetOutputDim("Correct", {1}); - ctx->SetOutputDim("Total", {1}); - ctx->ShareLoD("Out", /*->*/ "Accuracy"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -125,8 +64,11 @@ with the input Out(Inference). // FIXME(typhoonzero): types of T is for infernece data. // label data is always int. 
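Note: the DECLARE_INFER_SHAPE_FUNCTOR registration that follows mirrors the masked_select migration earlier in this diff. A hypothetical sketch of the bound meta function, reduced to the shape contract of the deleted InferShape (the real phi::AccuracyInferMeta is declared in paddle/phi/infermeta/ternary.h and may additionally carry the label-shape checks):

    #include "paddle/phi/core/ddim.h"
    #include "paddle/phi/core/meta_tensor.h"

    // Three scalar (shape {1}) outputs; LoD is shared from the inference
    // input "Out", exactly as the deleted fluid InferShape did.
    void AccuracyInferMetaSketch(const phi::MetaTensor& out,
                                 const phi::MetaTensor& indices,
                                 const phi::MetaTensor& label,
                                 phi::MetaTensor* accuracy,
                                 phi::MetaTensor* correct,
                                 phi::MetaTensor* total) {
      accuracy->set_dims(phi::make_ddim({1}));
      correct->set_dims(phi::make_ddim({1}));
      total->set_dims(phi::make_ddim({1}));
      accuracy->share_lod(out);
    }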
+DECLARE_INFER_SHAPE_FUNCTOR(accuracy, AccuracyInferShapeFunctor, + PD_INFER_META(phi::AccuracyInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR( accuracy, ops::AccuracyOp, ops::AccuracyOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + AccuracyInferShapeFunctor); diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc index 90e6a36220ab0..2e82b47e8da1c 100644 --- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/common/data_type.h" namespace paddle { namespace operators { @@ -139,7 +140,7 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { layer_norm_p->execute(astream, args); astream.wait(); - y->set_layout(DataLayout::kMKLDNN); + y->set_layout(phi::DataLayout::kMKLDNN); y->set_format(platform::GetMKLDNNFormat(*dst_memory)); } }; @@ -150,4 +151,5 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { // TODO(jczaja): Enable FP32 when performance is good namespace ops = paddle::operators; REGISTER_OP_KERNEL(layer_norm, MKLDNN, ::paddle::platform::CPUPlace, + ops::LayerNormMKLDNNOpKernel, ops::LayerNormMKLDNNOpKernel); diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index ab02d4cfed9d5..1078b451c55ba 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/pool_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { using framework::DataLayout; +using framework::Tensor; using dnnl::memory; using dnnl::pooling_backward; using dnnl::pooling_forward; @@ -83,11 +85,11 @@ class PoolingMKLDNNHandler phi::slice_ddim(input_dims, 2, input_dims.size()); if (global_pooling) { - operators::UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } - operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, + data_dims, strides, ksize); const auto src_tz = phi::vectorize(input->dims()); const auto dst_tz = phi::vectorize(output->dims()); @@ -173,11 +175,11 @@ class PoolingMKLDNNHandler framework::DDim data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); if (global_pooling) { - operators::UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } - operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, + data_dims, strides, ksize); auto src_tz = phi::vectorize(in_x->dims()); auto diff_src_tz = phi::vectorize(in_x_grad->dims()); diff --git a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc index bdb4fe1198a8e..86ecb01c89af7 100644 --- a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc @@ -50,13 +50,8 @@ class PReluMKLDNNHandler if (weights->dims().size() != x->dims().size()) { auto new_weights_dims = std::vector(x->dims().size(), 1); if (mode == "channel") { - if (data_format == "NHWC") { - new_weights_dims[x->dims().size() - 1] = - *std::max_element(weights_dims.begin(), weights_dims.end()); - } else { - new_weights_dims[1] = - *std::max_element(weights_dims.begin(), weights_dims.end()); - } + new_weights_dims[1] = + *std::max_element(weights_dims.begin(), weights_dims.end()); } weights_dims = std::move(new_weights_dims); } diff --git a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc index 780c6e7f153e7..a3b764b0e1c46 100644 --- a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc @@ -13,19 +13,32 @@ See the License for the specific language governing permissions and limitations under the License. 
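Note on the pool_mkldnn hunks above: UpdateKsize and UpdatePadding move to phi::funcs as UpdateKernelSize and UpdatePadding with an unchanged call shape. A sketch of the assumed behavior behind UpdateKernelSize, inferred from its call sites (it only runs when global_pooling is set, so the effective kernel spans the whole spatial extent while UpdatePadding zeroes the paddings):

    #include <cstdint>
    #include <vector>

    void UpdateKernelSizeSketch(std::vector<int>* ksize,
                                const std::vector<int64_t>& spatial_dims) {
      for (size_t i = 0; i < ksize->size(); ++i) {
        (*ksize)[i] = static_cast<int>(spatial_dims[i]);  // kernel == input dim
      }
    }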
*/ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/shape_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" namespace paddle { namespace operators { -using paddle::framework::Tensor; +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = phi::SelectedRows; template -class ShapeMKLDNNKernel : public ShapeKernel { +class ShapeMKLDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ShapeKernel::Compute(ctx); + auto* in_var = ctx.InputVar("Input"); + framework::DDim in_dims; + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); + } else { + in_dims = in_var->Get().dims(); + } + auto* out_t = ctx.Output("Out"); + out_t->Resize({in_dims.size()}); + auto out_data = out_t->mutable_data(platform::CPUPlace()); + for (int i = 0; i < in_dims.size(); ++i) { + out_data[i] = in_dims[i]; + } auto* out = ctx.Output("Out"); out->set_layout(framework::DataLayout::kMKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index e9dadd5ec937c..4090d5ffca801 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -24,6 +24,7 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); @@ -32,6 +33,8 @@ USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); +PD_DECLARE_KERNEL(softmax, CPU, ALL_LAYOUT); + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 916f02179b364..0e988557df626 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -24,14 +24,18 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" -USE_OP(pool2d); +USE_OP_ITSELF(pool2d); USE_OP_DEVICE_KERNEL(pool2d, MKLDNN); USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); +PD_DECLARE_KERNEL(pool2d, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT); + namespace paddle { namespace operators { @@ -94,7 +98,7 @@ TEST(test_pool2d_transpose_nhwc, cpu_place) { TEST(test_pool2d_relu_relu_nhwc, cpu_place) { framework::DDim dims({1, 4, 8, 512}); // NHWC shape - framework::DDim expected_dims({1, 512, 3, 7}); // NHWC expected shape + framework::DDim expected_dims({1, 512, 3, 7}); // NCHW expected shape platform::CPUPlace p; framework::Scope scope; diff --git a/paddle/fluid/operators/mode_op.cc b/paddle/fluid/operators/mode_op.cc index c7fb92cd5107c..9c16ccb138f7d 100644 --- a/paddle/fluid/operators/mode_op.cc +++ b/paddle/fluid/operators/mode_op.cc @@ -12,10 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/mode_op.h" #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { @@ -23,43 +27,6 @@ class ModeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "mode"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "mode"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "mode"); - - auto input_dims = ctx->GetInputDim("X"); - const int& dim_size = input_dims.size(); - int axis = static_cast(ctx->Attrs().Get("axis")); - PADDLE_ENFORCE_EQ( - (axis < dim_size) && (axis >= (-1 * dim_size)), true, - paddle::platform::errors::InvalidArgument( - "the axis of ModeOp must be [-%d, %d), but you set axis is %d", - dim_size, dim_size, axis)); - PADDLE_ENFORCE_GE(input_dims.size(), 1, - paddle::platform::errors::InvalidArgument( - "input of ModeOp must have >= 1d shape")); - if (axis < 0) axis += dim_size; - bool keepdim = ctx->Attrs().Get("keepdim"); - std::vector dimvec; - for (int64_t i = 0; i < axis; i++) { - dimvec.emplace_back(input_dims[i]); - } - if (keepdim) { - dimvec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < dim_size; i++) { - dimvec.emplace_back(input_dims[i]); - } - framework::DDim dims = phi::make_ddim(dimvec); - PADDLE_ENFORCE_GE(input_dims.size(), 1, platform::errors::InvalidArgument( - "input shape should >= 1d")); - ctx->SetOutputDim("Out", dims); - ctx->SetOutputDim("Indices", dims); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -138,18 +105,11 @@ class ModeGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(mode, ModeInferShapeFunctor, + PD_INFER_META(phi::ModeInferMeta)); REGISTER_OPERATOR(mode, ops::ModeOp, ops::ModeOpMaker, ops::ModeGradOpMaker, - ops::ModeGradOpMaker); -REGISTER_OP_CPU_KERNEL(mode, - ops::ModeCPUKernel, - ops::ModeCPUKernel, - ops::ModeCPUKernel, - ops::ModeCPUKernel); - + ops::ModeGradOpMaker, + ModeInferShapeFunctor); REGISTER_OPERATOR(mode_grad, ops::ModeOpGrad); -REGISTER_OP_CPU_KERNEL( - mode_grad, ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel); diff --git a/paddle/fluid/operators/mode_op.cu b/paddle/fluid/operators/mode_op.cu deleted file mode 100644 index afb949d3374c6..0000000000000 --- a/paddle/fluid/operators/mode_op.cu +++ /dev/null @@ -1,233 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/mode_op.h" -#include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" - -namespace paddle { -namespace operators { - -int ComputeBlockSize(int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; -} - -template -void getModebySort(const platform::CUDADeviceContext& ctx, - const framework::Tensor* input_tensor, - const int64_t num_cols, const int64_t num_rows, - T* out_tensor, int64_t* indices_tensor) { - framework::Tensor input_tmp; - framework::TensorCopy(*input_tensor, ctx.GetPlace(), &input_tmp); - T* input_tmp_data = input_tmp.mutable_data(ctx.GetPlace()); - input_tmp.Resize(phi::make_ddim({num_rows, num_cols})); - thrust::device_ptr out_tensor_ptr(out_tensor); - thrust::device_ptr indices_tensor_ptr(indices_tensor); - - for (int64_t i = 0; i < num_rows; ++i) { - T* begin = input_tmp_data + num_cols * i; - T* end = input_tmp_data + num_cols * (i + 1); - thrust::device_vector indices_data(num_cols); - thrust::sequence(thrust::device, indices_data.begin(), - indices_data.begin() + num_cols); - thrust::sort_by_key(thrust::device, begin, end, indices_data.begin()); - int unique = 1 + thrust::inner_product(thrust::device, begin, end - 1, - begin + 1, 0, thrust::plus(), - thrust::not_equal_to()); - thrust::device_vector keys_data(unique); - thrust::device_vector cnts_data(unique); - thrust::reduce_by_key(thrust::device, begin, end, - thrust::constant_iterator(1), keys_data.begin(), - cnts_data.begin()); - auto it = thrust::max_element(thrust::device, cnts_data.begin(), - cnts_data.begin() + unique); - T mode = keys_data[it - cnts_data.begin()]; - int64_t counts = cnts_data[it - cnts_data.begin()]; - auto pos = thrust::find(thrust::device, begin, end, mode); - int64_t index = indices_data[pos - begin + counts - 1]; - out_tensor_ptr[i] = static_cast(mode); - indices_tensor_ptr[i] = static_cast(index); - } -} - -template -class ModeOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = static_cast(ctx.Attr("axis")); - bool keepdim = static_cast(ctx.Attr("keepdim")); - - // get the input dims - const auto& in_dims = input->dims(); - // calcluate the real axis - if (axis < 0) axis += in_dims.size(); - - auto out_dims = output->dims(); - - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - getModebySort(dev_ctx, input, input_width, input_height, output_data, - indices_data); - } else { - 
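Note: getModebySort in the deleted CUDA file finds each row's mode by sorting the row together with its original indices and then locating the longest run of equal keys (thrust::reduce_by_key plus max_element). A CPU analogue of the same idea, using a stable sort so the reported index is deterministic (ModeOfRow is an illustrative reimplementation, not Paddle code):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <utility>
#include <vector>

template <typename T>
std::pair<T, int64_t> ModeOfRow(std::vector<T> row) {
  std::vector<int64_t> idx(row.size());
  std::iota(idx.begin(), idx.end(), 0);
  // Stable sort by value; ties keep original order, so the last element
  // of the winning run carries the largest original index in that run.
  std::stable_sort(idx.begin(), idx.end(),
                   [&](int64_t a, int64_t b) { return row[a] < row[b]; });
  T mode = row[idx[0]];
  int64_t mode_idx = idx[0];
  int64_t best = 0, run = 0;
  for (size_t i = 0; i < idx.size(); ++i) {
    run = (i > 0 && row[idx[i]] == row[idx[i - 1]]) ? run + 1 : 1;
    if (run > best) { best = run; mode = row[idx[i]]; mode_idx = idx[i]; }
  }
  return {mode, mode_idx};
}

int main() {
  auto result = ModeOfRow<int>({7, 3, 3, 9, 3, 7});
  std::printf("mode=%d index=%lld\n", result.first, (long long)result.second);
  // prints: mode=3 index=4
  return 0;
}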
std::vector trans_axis; - for (int i = 0; i < axis; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dim); - indices->Resize(tmp_out_dim); - } - - framework::DDim trans_shape(in_dims); - framework::DDim trans_out_shape(in_dims); - for (int i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = in_dims[trans_axis[i]]; - trans_out_shape[i] = in_dims[trans_axis[i]]; - } - trans_out_shape[in_dims.size() - 1] = 1; - - // second step, tranpose the input - framework::Tensor trans_input; - trans_input.mutable_data(trans_shape, ctx.GetPlace()); - int ndims = trans_axis.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - TransCompute(ndims, dev_ctx, *input, - &trans_input, trans_axis); - framework::Tensor trans_ind; - int64_t* trans_ind_data = - trans_ind.mutable_data(trans_out_shape, ctx.GetPlace()); - framework::Tensor trans_out; - T* trans_out_data = - trans_out.mutable_data(trans_out_shape, ctx.GetPlace()); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); - const int64_t input_width = trans_shape[trans_shape.size() - 1]; - getModebySort(dev_ctx, &trans_input, input_width, input_height, - trans_out_data, trans_ind_data); - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans_axis); - TransCompute(ndims, dev_ctx, trans_out, - output, trans_axis); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class ModeOpGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = context.Attr("axis"); - - const auto& in_dims = x->dims(); - auto out_dims = indices->dims(); - - if (axis < 0) axis += in_dims.size(); - // allocate the cuda memory for the x_grad - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - const T* out_grad_data = out_grad->data(); - const int64_t* indices_data = indices->data(); - - int pre, n, post; - GetDims(in_dims, axis, &pre, &n, &post); - - // calcluate the block and grid num - auto& dev_ctx = context.cuda_device_context(); - int block_size = ComputeBlockSize(post); - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); - int grid_size = std::min(max_blocks, pre); - AssignGradWithAxis<<>>( - out_grad_data, indices_data, x_grad_data, pre, post, n, 1); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - mode, ops::ModeOpCUDAKernel, - ops::ModeOpCUDAKernel, - ops::ModeOpCUDAKernel, - 
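Note: when axis is not the last dimension, the deleted kernels first build a permutation that swaps axis with the last dimension, compute the mode row-wise, and transpose the result back. The permutation they construct, as a standalone function (AxisToLastPerm is an illustrative name):

#include <cstdio>
#include <vector>

std::vector<int> AxisToLastPerm(int rank, int axis) {
  std::vector<int> perm;
  for (int i = 0; i < axis; ++i) perm.push_back(i);  // dims before axis
  perm.push_back(rank - 1);                  // old last dim fills axis' slot
  for (int i = axis + 1; i < rank - 1; ++i) perm.push_back(i);
  perm.push_back(axis);                      // reduced axis goes last
  return perm;
}

int main() {
  for (int p : AxisToLastPerm(4, 1)) std::printf("%d ", p);  // prints: 0 3 2 1
  return 0;
}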
ops::ModeOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - mode_grad, - ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel); diff --git a/paddle/fluid/operators/mode_op.h b/paddle/fluid/operators/mode_op.h deleted file mode 100644 index 76d356ed16eb3..0000000000000 --- a/paddle/fluid/operators/mode_op.h +++ /dev/null @@ -1,317 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { - -template -static void getMode(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, Type* t_indices) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(i, j), j)); - } - } - std::sort(col_vec.begin(), col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - T mode = 0; - int64_t indice = 0; - int64_t cur_freq = 0; - int64_t max_freq = 0; - for (int64_t i = 0; i < input_width; ++i) { - ++cur_freq; - if (i == input_width - 1 || (col_vec[i + 1].first != col_vec[i].first)) { - if (cur_freq > max_freq) { - max_freq = cur_freq; - mode = col_vec[i].first; - indice = col_vec[i].second; - } - cur_freq = 0; - } - } - t_out[i] = mode; - t_indices[i] = indice; - } -} - -template -static void ModeAssign(const Type& input_height, const Type& input_width, - const int& input_dim, const framework::Tensor* input, - const framework::Tensor* indices, T* output_data) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - auto e_indices = framework::EigenVector::Flatten(*indices); - output_data[i * input_width + e_indices(0)] = e_input(0); - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = - framework::EigenMatrix::Reshape(*indices, input_dim - 1); - output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); - } - } -} - -template -class ModeCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto* indices = context.Output("Indices"); - const auto& 
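Note: the deleted CPU getMode sorts each row with a NaN-aware comparator: any non-NaN value orders before NaN, so NaN entries cluster at the end of the row instead of corrupting the ordering. The comparator in isolation:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Non-NaN values order before NaN; two NaNs compare equivalent.
bool NanLastLess(float l, float r) {
  return (!std::isnan(l) && std::isnan(r)) || l < r;
}

int main() {
  std::vector<float> v = {2.0f, NAN, 1.0f, NAN, 3.0f};
  std::sort(v.begin(), v.end(), NanLastLess);
  for (float x : v) std::printf("%g ", x);  // prints: 1 2 3 nan nan
  return 0;
}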
in_dims = input->dims(); - bool keepdim = static_cast(context.Attr("keepdim")); - - // axis < 0, cacluate the real axis - int axis = static_cast(context.Attr("axis")); - if (axis < 0) axis += in_dims.size(); - - T* output_data = output->mutable_data(context.GetPlace()); - int64_t* indices_data = indices->mutable_data(context.GetPlace()); - auto out_dims = output->dims(); - // if axis is not the last dim, transpose it to the last dim, do the - // calculation, - // then tranpose it back to orginal axis. - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - getMode(input_height, input_width, in_dims.size(), input, - output_data, indices_data); - } else { - std::vector trans_axis; - for (int i = 0; i < axis; i++) { - trans_axis.emplace_back(i); - } - trans_axis.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dim); - indices->Resize(tmp_out_dim); - } - - // get the trans input_dims, out_dims - framework::DDim trans_shape(in_dims); - framework::DDim trans_out_shape(in_dims); - - for (size_t i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = in_dims[trans_axis[i]]; - trans_out_shape[i] = in_dims[trans_axis[i]]; - } - trans_out_shape[in_dims.size() - 1] = 1; - - framework::Tensor trans_input; - trans_input.mutable_data(trans_shape, context.GetPlace()); - int ndims = trans_axis.size(); - auto& dev_context = - context.template device_context(); - - // transpose the input value - TransCompute(ndims, dev_context, *input, - &trans_input, trans_axis); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); - const int64_t input_width = trans_shape[trans_shape.size() - 1]; - framework::Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_out_shape, context.GetPlace()); - framework::Tensor tmp_indices; - auto* t_ind = tmp_indices.mutable_data(trans_out_shape, - context.GetPlace()); - - getMode(input_height, input_width, in_dims.size(), - &trans_input, t_out, t_ind); - // transpose back - TransCompute( - ndims, dev_context, tmp_indices, indices, trans_axis); - TransCompute(ndims, dev_context, tmp_out, - output, trans_axis); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class ModeGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = static_cast(context.Attr("axis")); - bool keepdim = static_cast(context.Attr("keepdim")); - - auto in_dims = x->dims(); - auto out_dims = indices->dims(); - - // axis < 0, get the real axis - axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(out_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(out_dims[i - 1]); - } - out_dims = phi::make_ddim(tmp_out_shape); - } - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - if (axis == in_dims.size() - 1) { - // allocate the memory for the input_grad - // assign the out_grad to input_grad directly - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - // init the output grad with 0, because some input elements has no grad - memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); - // Assign the output_grad to input_grad - if (keepdim) { - ModeAssign(input_height, input_width, in_dims.size(), out_grad, indices, - x_grad_data); - } else { - auto& dev_context = - context.template device_context(); - framework::Tensor out_grad_tmp; - framework::Tensor indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - ModeAssign(input_height, input_width, in_dims.size(), &out_grad_tmp, - &indices_tmp, x_grad_data); - } - } else { - // can not assign grad to input_grad, must do the transpose - std::vector trans_axis; - for (int i = 0; i < axis; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(out_dims.size() - 1); - for (int i = axis + 1; i < out_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - framework::DDim trans_shape(out_dims); - framework::DDim trans_in_shape(in_dims); - for (size_t i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = out_dims[trans_axis[i]]; - trans_in_shape[i] = in_dims[trans_axis[i]]; - } - // transpose the out_grad, indices - framework::Tensor trans_dO; - trans_dO.mutable_data(trans_shape, context.GetPlace()); - framework::Tensor trans_ind; - trans_ind.mutable_data(trans_shape, context.GetPlace()); - int ndims = trans_axis.size(); - auto& dev_context = - context.template device_context(); - - if (keepdim) { - // Do transpose - TransCompute( - ndims, dev_context, *out_grad, &trans_dO, trans_axis); - TransCompute( - ndims, dev_context, *indices, &trans_ind, trans_axis); - } else { - framework::Tensor out_grad_tmp; - framework::Tensor indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - // Do transpose - TransCompute( - ndims, dev_context, out_grad_tmp, &trans_dO, trans_axis); - TransCompute( - ndims, dev_context, indices_tmp, &trans_ind, trans_axis); - } - const int64_t input_height = phi::product( - phi::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1)); - const int64_t input_width = trans_in_shape[trans_in_shape.size() - 1]; - - // Assign the out_grad to tranpose input_grad - framework::Tensor 
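Note: the deleted grad kernels implement a scatter: the output gradient of each row flows only to the element that was selected as the mode (via the saved Indices tensor), and every other input gradient stays zero, hence the memset above. Reduced to plain C++ (ModeGradRowwise is an illustrative name):

#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<float> ModeGradRowwise(const std::vector<float>& dout,
                                   const std::vector<int64_t>& indices,
                                   int64_t rows, int64_t cols) {
  std::vector<float> dx(rows * cols, 0.0f);  // the memset in the kernel
  for (int64_t i = 0; i < rows; ++i)
    dx[i * cols + indices[i]] = dout[i];     // the ModeAssign scatter
  return dx;
}

int main() {
  std::vector<float> dx = ModeGradRowwise({1.0f, 2.0f}, {2, 0}, 2, 3);
  for (float g : dx) std::printf("%g ", g);  // prints: 0 0 1 2 0 0
  return 0;
}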
tmp_out; - T* t_out = tmp_out.mutable_data(trans_in_shape, context.GetPlace()); - memset(t_out, 0, x_grad->numel() * sizeof(T)); - - ModeAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out); - - // Transpose back - TransCompute(ndims, dev_context, tmp_out, - x_grad, trans_axis); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc index b309e1b87ef90..5b107ce643df3 100644 --- a/paddle/fluid/operators/multi_dot_op.cc +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -16,77 +16,19 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/kernels/funcs/blas/blas.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -/** - * @brief compute the output shape and check the input shape valid or not - */ -inline framework::DDim ComputeAndCheckShape( - const bool is_runtime, const std::vector& inputs_dims) { - const size_t n = inputs_dims.size(); - auto first_dim = inputs_dims[0]; - - bool is_vector = false; - framework::DDim out_dim; - - PADDLE_ENFORCE_LT( - first_dim.size(), static_cast(3), - platform::errors::InvalidArgument( - "multi_dot: the first input tensor must be 1D or 2D but got[%d]!", - static_cast(first_dim.size()))); - - // If the first tensor is 1D of size n view it as a row vector (1, n) - if (first_dim.size() == 1) { - first_dim = phi::make_ddim({1, static_cast(first_dim[0])}); - is_vector = true; - } - - auto last_dim = inputs_dims[n - 1]; - PADDLE_ENFORCE_LT( - last_dim.size(), static_cast(3), - platform::errors::InvalidArgument( - "the last input tensor of multi_dot must be 1D or 2D but got[%d]!", - static_cast(first_dim.size()))); - - // If the last tensor is 1D of size n view it as a column vector (n, 1) - if (last_dim.size() == 1) { - last_dim = phi::make_ddim({static_cast(last_dim[0]), 1}); - out_dim = is_vector ? phi::make_ddim({1}) : phi::make_ddim({first_dim[0]}); - } else { - out_dim = is_vector ? phi::make_ddim({last_dim[1]}) - : phi::make_ddim({first_dim[0], last_dim[1]}); - } - - auto width = first_dim[1]; - for (size_t i = 1; i < n - 1; i++) { - PADDLE_ENFORCE_EQ(inputs_dims[i].size(), static_cast(2), - platform::errors::InvalidArgument( - "the input tensor of multi_dot op must be 2D.")); - - const auto& tmp_dim = inputs_dims[i]; - PADDLE_ENFORCE_EQ( - tmp_dim[0], width, - platform::errors::InvalidArgument( - "the input matrix does not meet the multiplication requirements.")); - width = tmp_dim[1]; - } - - PADDLE_ENFORCE_EQ( - last_dim[0], width, - platform::errors::InvalidArgument( - "the input matrix does not meet the multiplication requirements.")); - - return out_dim; -} - class MultiDotOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -105,22 +47,6 @@ If the first argument is 1-D it is treated as a row vector. 
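Note: the deleted ComputeAndCheckShape helper above is replaced by phi::MultiDotInferMeta, keeping the NumPy multi_dot conventions: a 1-D first argument acts as a row vector, a 1-D last argument as a column vector, and adjacent operands must chain like a matmul. A runnable restatement of the shape rule (MultiDotOutDims is an illustrative name):

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

using DDim = std::vector<int64_t>;

DDim MultiDotOutDims(const std::vector<DDim>& ins) {
  assert(ins.size() > 1);  // multi_dot needs more than one operand
  DDim first = ins.front(), last = ins.back();
  const bool vec_first = first.size() == 1, vec_last = last.size() == 1;
  if (vec_first) first = {1, first[0]};  // 1-D first arg -> row vector
  if (vec_last) last = {last[0], 1};     // 1-D last arg -> column vector
  int64_t width = first[1];
  for (size_t i = 1; i + 1 < ins.size(); ++i) {
    assert(ins[i].size() == 2 && ins[i][0] == width);  // must chain
    width = ins[i][1];
  }
  assert(last[0] == width);
  if (vec_first && vec_last) return {1};
  if (vec_first) return {last[1]};
  if (vec_last) return {first[0]};
  return {first[0], last[1]};
}

int main() {
  for (int64_t d : MultiDotOutDims({{3, 4}, {4, 5}, {5}}))
    std::printf("%lld ", (long long)d);  // prints: 3
  return 0;
}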
If the last argument class MultiDotOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "multi_dot"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "multi_dot"); - - auto inputs_dims = ctx->GetInputsDim("X"); - - const size_t inputs_num = inputs_dims.size(); - PADDLE_ENFORCE_GT( - inputs_num, static_cast(1), - platform::errors::InvalidArgument( - "The number of input tensors in multi_dot op should > 1.")); - auto out_dims = ComputeAndCheckShape(ctx->IsRuntime(), inputs_dims); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", "Out"); - } }; class MultiDotOpGrad : public framework::OperatorWithKernel { @@ -171,9 +97,15 @@ class MultiDotOpDoubleGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(multi_dot, MultiDotInferShapeFunctor, + PD_INFER_META(phi::MultiDotInferMeta)); + REGISTER_OPERATOR(multi_dot, ops::MultiDotOp, ops::MultiDotOpMaker, ops::MultiDotOpGradMaker, - ops::MultiDotOpGradMaker); + ops::MultiDotOpGradMaker, + MultiDotInferShapeFunctor); + REGISTER_OPERATOR(multi_dot_grad, ops::MultiDotOpGrad, ops::MultiDotOpDoubleGradMaker, ops::MultiDotOpDoubleGradMaker); diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc index 313a479ea301b..8771a6573cba0 100644 --- a/paddle/fluid/operators/multiplex_op.cc +++ b/paddle/fluid/operators/multiplex_op.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/multiplex_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -169,15 +169,3 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, ops::MultiplexGradMaker, ops::MultiplexGradMaker); REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); -REGISTER_OP_CPU_KERNEL( - multiplex, - ops::MultiplexCPUKernel, - ops::MultiplexCPUKernel, - ops::MultiplexCPUKernel, - ops::MultiplexCPUKernel); -REGISTER_OP_CPU_KERNEL( - multiplex_grad, - ops::MultiplexGradCPUKernel, - ops::MultiplexGradCPUKernel, - ops::MultiplexGradCPUKernel, - ops::MultiplexGradCPUKernel); diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu deleted file mode 100644 index 0a32ee96fb693..0000000000000 --- a/paddle/fluid/operators/multiplex_op.cu +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
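Note: the multiplex kernels deleted here compute a row-wise gather: output row i is copied from candidate tensor ins[ids[i]], after checking 0 <= ids[i] < ins.size(). The semantics in a few lines of host code (Multiplex here is an illustrative reimplementation, not the phi kernel):

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

using Mat = std::vector<std::vector<float>>;

Mat Multiplex(const std::vector<Mat>& ins, const std::vector<int32_t>& ids) {
  Mat out(ids.size());
  for (size_t i = 0; i < ids.size(); ++i) {
    assert(ids[i] >= 0 && static_cast<size_t>(ids[i]) < ins.size());
    out[i] = ins[ids[i]][i];  // row i comes from the chosen candidate
  }
  return out;
}

int main() {
  Mat a = {{1, 1}, {1, 1}}, b = {{2, 2}, {2, 2}};
  Mat out = Multiplex({a, b}, {1, 0});
  std::printf("%g %g\n", out[0][0], out[1][0]);  // prints: 2 1
  return 0;
}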
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/multiplex_op.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MultiplexGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto ins = ctx.MultiInput("X"); - auto* ids = ctx.Input("Ids"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - for (size_t i = 0; i < ins.size(); ++i) { - PADDLE_ENFORCE_GT( - ins[i]->numel(), 0, - platform::errors::OutOfRange( - "indexing will be out of bounds with size 0 for the %d-th input.", - i)); - } - - auto rows = ins[0]->dims()[0]; - auto cols = ins[0]->numel() / rows; - // copy index to cpu - Tensor index_t_cpu; - paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); - auto* index = index_t_cpu.data(); - auto stream = ctx.cuda_device_context().stream(); - platform::CUDAPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - int32_t k = index[i]; - PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet( - "index must be nonnegative.")); - PADDLE_ENFORCE_LT(static_cast(k), ins.size(), - platform::errors::PreconditionNotMet( - "index exceeds the number of candidate tensors.")); - memory::Copy(place, out->data() + i * cols, place, - ins[k]->data() + i * cols, cols * sizeof(T), stream); - } - } -}; - -template -class MultiplexGradGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* ids = ctx.Input("Ids"); - auto d_ins = ctx.MultiOutput(framework::GradVarName("X")); - - size_t idx = -1UL; - for (size_t i = 0; i < d_ins.size(); i++) { - if (d_ins[i]) { - d_ins[i]->mutable_data(ctx.GetPlace()); - auto t = framework::EigenVector::Flatten(*d_ins[i]); - t.device(*ctx.template device_context().eigen_device()) = - t.constant(static_cast(0)); - - idx = i; - } - } - - if (idx == -1UL) return; - - auto rows = d_ins[idx]->dims()[0]; - auto cols = d_ins[idx]->numel() / rows; - // copy index to cpu - Tensor index_t_cpu; - paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); - auto* index = index_t_cpu.data(); - - auto stream = ctx.cuda_device_context().stream(); - platform::CUDAPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - size_t k = static_cast(index[i]); - if (d_ins[k]) { - memory::Copy(place, d_ins[k]->data() + i * cols, place, - d_out->data() + i * cols, cols * sizeof(T), stream); - } - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - multiplex, - ops::MultiplexGPUKernel, - ops::MultiplexGPUKernel, - ops::MultiplexGPUKernel, - ops::MultiplexGPUKernel); -REGISTER_OP_CUDA_KERNEL( - multiplex_grad, - ops::MultiplexGradGPUKernel, - ops::MultiplexGradGPUKernel, - ops::MultiplexGradGPUKernel, - ops::MultiplexGradGPUKernel); diff --git a/paddle/fluid/operators/multiplex_op.h b/paddle/fluid/operators/multiplex_op.h deleted file mode 100644 index 1d0a009edeedc..0000000000000 --- a/paddle/fluid/operators/multiplex_op.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" - -namespace paddle { -namespace operators { - -template -class MultiplexCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto ins = ctx.MultiInput("X"); - auto ids = ctx.Input("Ids"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - for (size_t i = 0; i < ins.size(); ++i) { - PADDLE_ENFORCE_GT( - ins[i]->numel(), 0, - platform::errors::OutOfRange( - "indexing will be out of bounds with size 0 for the %d-th input.", - i)); - } - - auto rows = ins[0]->dims()[0]; - auto cols = ins[0]->numel() / rows; - auto index = ids->data(); - platform::CPUPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - int32_t k = index[i]; - PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet( - "index must be nonnegative.")); - PADDLE_ENFORCE_LT(static_cast(k), ins.size(), - platform::errors::PreconditionNotMet( - "index exceeds the number of candidate tensors.")); - memory::Copy(place, out->data() + i * cols, place, - ins[k]->data() + i * cols, cols * sizeof(T)); - } - } -}; - -template -class MultiplexGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* ids = ctx.Input("Ids"); - auto d_ins = - ctx.MultiOutput(framework::GradVarName("X")); - - size_t idx = -1UL; - for (size_t i = 0; i < d_ins.size(); i++) { - if (d_ins[i]) { - d_ins[i]->mutable_data(ctx.GetPlace()); - auto t = framework::EigenVector::Flatten(*d_ins[i]); - t.device(*ctx.template device_context().eigen_device()) = - t.constant(static_cast(0)); - - idx = i; - } - } - - if (idx == -1UL) return; - - auto rows = d_ins[idx]->dims()[0]; - auto cols = d_ins[idx]->numel() / rows; - auto* index = ids->data(); - platform::CPUPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - size_t k = static_cast(index[i]); - if (d_ins[k]) { - memory::Copy(place, d_ins[k]->data() + i * cols, place, - d_out->data() + i * cols, cols * sizeof(T)); - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/nll_loss_op.cc b/paddle/fluid/operators/nll_loss_op.cc index 6c35ad29e9749..a4e1f7b3091a9 100644 --- a/paddle/fluid/operators/nll_loss_op.cc +++ b/paddle/fluid/operators/nll_loss_op.cc @@ -14,7 +14,9 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,77 +25,6 @@ class NLLLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NLLLoss"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "NLLLoss"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NLLLoss"); - OP_INOUT_CHECK(ctx->HasOutput("Total_weight"), "Output", "Total_weight", - "NLLLoss"); - - auto x_dims = ctx->GetInputDim("X"); - auto label_dims = ctx->GetInputDim("Label"); - auto reduction = ctx->Attrs().Get("reduction"); - - PADDLE_ENFORCE_EQ(x_dims.size() == 2 || x_dims.size() == 4, true, - platform::errors::InvalidArgument( - "The tensor rank of Input(X) must be 2 or 4.")); - bool contain_unknown_dim = phi::contain_unknown_dim(x_dims) || - phi::contain_unknown_dim(label_dims); - bool check = ctx->IsRuntime() || !contain_unknown_dim; - if (check) { - PADDLE_ENFORCE_EQ( - x_dims[0], label_dims[0], - platform::errors::InvalidArgument( - "ShapeError: Expected input batch_size to match label batch_size," - "But received: the Input(x) batch_size is [%s], the Input(label) " - " batch_size is [%s].", - x_dims[0], label_dims[0])); - if (ctx->HasInput("Weight")) { - auto w_dims = ctx->GetInputDim("Weight"); - PADDLE_ENFORCE_EQ(w_dims.size(), 1, - platform::errors::InvalidArgument( - "Input(Weight) should be a 1D tensor.")); - PADDLE_ENFORCE_EQ( - x_dims[1], w_dims[0], - platform::errors::InvalidArgument( - "Expected input tensor Weight's size should equal " - "to the first dimension of the input tensor X. 
But received " - "Weight's " - "size is %d, the first dimension of input X is %d", - w_dims[0], x_dims[1])); - } - } - if (x_dims.size() == 2) { - if (reduction == "none") { - ctx->SetOutputDim("Out", {x_dims[0]}); - } else { - ctx->SetOutputDim("Out", {1}); - } - } else if (x_dims.size() == 4) { - PADDLE_ENFORCE_EQ(label_dims.size(), 3, - platform::errors::InvalidArgument( - "Expected Input(Lable) dimensions=3, received %d.", - label_dims.size())); - auto input0 = x_dims[0]; - auto input2 = x_dims[2]; - auto input3 = x_dims[3]; - auto label0 = label_dims[0]; - auto label1 = label_dims[1]; - auto label2 = label_dims[2]; - PADDLE_ENFORCE_EQ( - input0 == label0 && input2 == label1 && input3 == label2, true, - platform::errors::InvalidArgument("Input(X) tensor shape should " - "match to Input(Label) tensor " - "shape.")); - if (reduction == "none") { - ctx->SetOutputDim("Out", {x_dims[0], x_dims[2], x_dims[3]}); - } else { - ctx->SetOutputDim("Out", {1}); - } - } - ctx->SetOutputDim("Total_weight", {1}); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -259,8 +190,11 @@ class NLLLossGradMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(nll_loss, NllLossRawInferShapeFunctor, + PD_INFER_META(phi::NllLossRawInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR(nll_loss, ops::NLLLossOp, ops::NLLLossOpMaker, ops::NLLLossGradMaker, - ops::NLLLossGradMaker); + ops::NLLLossGradMaker, + NllLossRawInferShapeFunctor); REGISTER_OPERATOR(nll_loss_grad, ops::NLLLossGradOp); diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc index 5d394424d54f5..51daccce0e882 100644 --- a/paddle/fluid/operators/norm_op.cc +++ b/paddle/fluid/operators/norm_op.cc @@ -15,7 +15,9 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -57,21 +59,7 @@ where, $\sum {x^2}$ is calculated along the `axis` dimension. 
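Note: the deleted NLLLossOp::InferShape above (now phi::NllLossRawInferMeta) produces per-sample losses only for reduction == "none": shape {N} for 2-D {N, C} inputs and {N, H, W} for 4-D {N, C, H, W} inputs, and a single-element tensor otherwise; Total_weight is always {1}. The Out rule as a function (NllLossOutDims is an illustrative name, validation omitted):

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

std::vector<int64_t> NllLossOutDims(const std::vector<int64_t>& x_dims,
                                    const std::string& reduction) {
  if (reduction != "none") return {1};           // "mean" or "sum"
  if (x_dims.size() == 2) return {x_dims[0]};    // {N, C} input
  return {x_dims[0], x_dims[2], x_dims[3]};      // {N, C, H, W} input
}

int main() {
  std::vector<int64_t> d = NllLossOutDims({8, 10, 4, 4}, "none");
  std::printf("%lld %lld %lld\n", (long long)d[0], (long long)d[1],
              (long long)d[2]);  // prints: 8 4 4
  return 0;
}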
}; class NormOp : public framework::OperatorWithKernel { - public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NormOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NormOp"); - auto xdim = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", xdim); - - if (ctx->Attrs().Get("is_test") == false) { - int axis = ctx->Attrs().Get("axis"); - if (axis < 0) axis = xdim.size() + axis; - xdim[axis] = 1; - ctx->SetOutputDim("Norm", xdim); - } - } }; class NormOpGrad : public framework::OperatorWithKernel { @@ -111,7 +99,11 @@ class NormOpGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; +DECLARE_INFER_SHAPE_FUNCTOR(norm, NormInferShapeFunctor, + PD_INFER_META(phi::NormInferMeta)); + REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker, ops::NormOpGradOpMaker, - ops::NormOpGradOpMaker); + ops::NormOpGradOpMaker, + NormInferShapeFunctor); REGISTER_OPERATOR(norm_grad, ops::NormOpGrad); diff --git a/paddle/fluid/operators/number_count_op.cc b/paddle/fluid/operators/number_count_op.cc new file mode 100644 index 0000000000000..8f7a3b82acf19 --- /dev/null +++ b/paddle/fluid/operators/number_count_op.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
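Note: the deleted NormOp::InferShape above is similarly mechanical: Out mirrors X, and when is_test is false the auxiliary Norm output keeps X's shape with the normalized axis collapsed to 1 (negative axes wrap). The Norm shape rule in isolation (NormNormDims is an illustrative name):

#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<int64_t> NormNormDims(std::vector<int64_t> x, int axis) {
  if (axis < 0) axis += static_cast<int>(x.size());  // wrap negative axis
  x[axis] = 1;  // the reduced axis keeps a size-1 slot
  return x;
}

int main() {
  std::vector<int64_t> d = NormNormDims({2, 16, 8}, -1);
  std::printf("%lld %lld %lld\n", (long long)d[0], (long long)d[1],
              (long long)d[2]);  // prints: 2 16 1
  return 0;
}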
+ +#include "paddle/fluid/operators/number_count_op.h" + +namespace paddle { +namespace operators { + +class NumberCountOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("gate_idx"), "Input", "gate_idx", + "NumberCount"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "number_count", + "NumberCount"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + // the dtype of the gate_idx should be same as int64 + auto gate_idx_dtype = + OperatorWithKernel::IndicateVarDataType(ctx, "gate_idx"); + + PADDLE_ENFORCE_EQ(gate_idx_dtype, framework::proto::VarType::INT64, + platform::errors::InvalidArgument( + "The dtype of the gate_idx_dtype should be int64")); + return framework::OpKernelType(gate_idx_dtype, ctx.GetPlace()); + } +}; + +class NumberCountOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("gate_idx", "(Tensor) The input gate index tensor."); + AddOutput("Out", "(Tensor) The output expert count tensor."); + AddAttr("upper_range", "ļ¼ˆint), The number of experts."); + + AddComment(R"DOC(number_count Operator.count gate indices.)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CPU_KERNEL(number_count, ops::NumberCountOpCPUKernel, + ops::NumberCountOpCPUKernel); + +REGISTER_OP_WITHOUT_GRADIENT(number_count, ops::NumberCountOp, + ops::NumberCountOpMaker); diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu new file mode 100644 index 0000000000000..97e4b4f2845ae --- /dev/null +++ b/paddle/fluid/operators/number_count_op.cu @@ -0,0 +1,108 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/number_count_op.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +#define CEIL(_x_, _y_) (((_x_)-1) / (_y_) + 1) +#define PERTHREAD_EXPERTS 256 +#define WARP_SIZE 32 + +const int CUDA_NUM_THREADS = 512; +static inline int GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +template +__global__ void initialize_zero_kernel(T* data, const int length) { + CUDA_KERNEL_LOOP(idx, length) { data[idx] = static_cast(0); } +} + +template +__global__ void NumberCount(const T* gate_idx, T* number_count, + int64_t batch_size, int upper_range) { + int res_tmp[PERTHREAD_EXPERTS] = {0}; + int expert_min = blockIdx.x * PERTHREAD_EXPERTS; + int expert_max = expert_min + PERTHREAD_EXPERTS; + if (expert_max > upper_range) { + expert_max = upper_range; + } + for (int i = threadIdx.x; i < batch_size; i += blockDim.x) { + T idx = gate_idx[i]; + if (idx == -1) { + continue; + } + if (idx < expert_min || idx >= expert_max) { + continue; + } + res_tmp[idx - expert_min] += 1; + } + for (int i = expert_min; i < expert_max; ++i) { + int x = res_tmp[i - expert_min]; +#pragma unroll + for (int j = 1; j < WARP_SIZE; j <<= 1) { +#ifdef __HIPCC__ + x = x + __shfl_down(x, j); +#else + x = x + __shfl_down_sync(-1u, x, j); +#endif + } + if (threadIdx.x % WARP_SIZE == 0) { + platform::CudaAtomicAdd(number_count + i, x); + } + } +} + +template +class NumberCountOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto gate_idx = context.Input("gate_idx"); + auto upper_range = context.Attr("upper_range"); + auto number_count = context.Output("Out"); + + int64_t batch_size = gate_idx->numel(); + auto place = context.GetPlace(); + const auto& dev_ctx = + context.template device_context(); + + framework::DDim out_dims = phi::make_ddim({upper_range}); + auto out_data = number_count->mutable_data(out_dims, place); + const T* gate_data = gate_idx->data(); + + initialize_zero_kernel< + T><<>>( + out_data, upper_range); + + NumberCount< + T><<>>( + gate_data, out_data, batch_size, upper_range); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(number_count, ops::NumberCountOpCUDAKernel); diff --git a/paddle/fluid/operators/number_count_op.h b/paddle/fluid/operators/number_count_op.h new file mode 100644 index 0000000000000..95e64946fb8a2 --- /dev/null +++ b/paddle/fluid/operators/number_count_op.h @@ -0,0 +1,37 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +#if defined(PADDLE_WITH_GLOO) +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + +namespace paddle { +namespace operators { + +template +class NumberCountOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support expert count op for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/one_hot_v2_op.cc b/paddle/fluid/operators/one_hot_v2_op.cc index e212f4e7e2b7d..122b6a8a80aac 100644 --- a/paddle/fluid/operators/one_hot_v2_op.cc +++ b/paddle/fluid/operators/one_hot_v2_op.cc @@ -12,9 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/one_hot_v2_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -22,26 +26,6 @@ namespace operators { class OneHotV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "one_hot_v2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "one_hot_v2"); - - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE(x_dims.size(), 1, - platform::errors::InvalidArgument( - "Rank of Input(X) should be at least 1.")); - - int depth = ctx->Attrs().Get("depth"); - if (ctx->HasInput("depth_tensor")) { - depth = -1; - } - - auto out_dims_vec = phi::vectorize(x_dims); - out_dims_vec.push_back(depth); - auto out_dims = phi::make_ddim(out_dims_vec); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /* --> */ "Out"); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -52,7 +36,7 @@ class OneHotV2Op : public framework::OperatorWithKernel { } framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const Tensor& tensor, + const std::string& var_name, const framework::Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "depth_tensor") { return expected_kernel_type; @@ -114,10 +98,12 @@ Out is a LoDTensor: } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(one_hot_v2, OneHotInferShapeFunctor, + PD_INFER_META(phi::OneHotRawInferMeta)); + REGISTER_OPERATOR( one_hot_v2, ops::OneHotV2Op, ops::OneHotV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - one_hot_v2, ops::OneHotV2Kernel, - ops::OneHotV2Kernel); + paddle::framework::EmptyGradOpMaker, + OneHotInferShapeFunctor); diff --git a/paddle/fluid/operators/one_hot_v2_op.cu b/paddle/fluid/operators/one_hot_v2_op.cu deleted file mode 100644 index 77e2a931e50de..0000000000000 --- a/paddle/fluid/operators/one_hot_v2_op.cu +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/one_hot_v2_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void FillOutputKernel(const InT* p_in_data, OutT* p_out_data, - const int64_t numel, const int depth) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < numel && p_in_data[idx] >= 0 && p_in_data[idx] < depth) { - *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0; - } -} - -template -struct OneHotV2OpCUDAFunctor { - const framework::LoDTensor* in_; - framework::LoDTensor* out_; - const DeviceContext& ctx_; - int depth_; - - OneHotV2OpCUDAFunctor(const framework::LoDTensor* in, - framework::LoDTensor* out, int depth, - const DeviceContext& ctx) - : in_(in), out_(out), depth_(depth), ctx_(ctx) {} - - template - void apply() const { - auto* p_in_data = in_->data(); - auto numel = in_->numel(); - auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); - auto stream = ctx_.stream(); - phi::funcs::set_constant(ctx_, out_, 0.0); - - FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - p_in_data, p_out_data, numel, depth_); - } -}; - -using LoDTensor = framework::LoDTensor; -template -class OneHotV2CUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - - int depth = -1; - if (context.HasInput("depth_tensor")) { - auto* depth_tensor = context.Input("depth_tensor"); - if (platform::is_gpu_place(depth_tensor->place())) { - framework::Tensor temp; - paddle::framework::TensorCopySync(*depth_tensor, platform::CPUPlace(), - &temp); - depth = *temp.data(); - } else { - depth = *depth_tensor->data(); - } - - auto out_dims = out->dims(); - out_dims[out_dims.size() - 1] = depth; - out->Resize(out_dims); - } else { - depth = context.Attr("depth"); - } - framework::VisitDataType( - static_cast( - context.Attr("dtype")), - OneHotV2OpCUDAFunctor( - in, out, depth, context.template device_context())); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - one_hot_v2, - ops::OneHotV2CUDAKernel, - ops::OneHotV2CUDAKernel); diff --git a/paddle/fluid/operators/one_hot_v2_op_npu.cc b/paddle/fluid/operators/one_hot_v2_op_npu.cc index acf6baf50b418..e5702a37bb2b4 100644 --- a/paddle/fluid/operators/one_hot_v2_op_npu.cc +++ b/paddle/fluid/operators/one_hot_v2_op_npu.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
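Note: the deleted FillOutputKernel is the core of one_hot: the output has shape in_dims + [depth], starts zeroed, and position (i, in[i]) is set to 1, with negative or out-of-range indices silently skipped. A host-side equivalent (OneHot is an illustrative reimplementation):

#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<float> OneHot(const std::vector<int64_t>& in, int depth) {
  std::vector<float> out(in.size() * depth, 0.0f);  // the set_constant zero
  for (size_t i = 0; i < in.size(); ++i) {
    if (in[i] >= 0 && in[i] < depth) out[i * depth + in[i]] = 1.0f;
  }
  return out;
}

int main() {
  std::vector<float> out = OneHot({1, 0, 3}, 4);
  for (size_t i = 0; i < out.size(); ++i)
    std::printf("%g%s", out[i], (i + 1) % 4 ? " " : "\n");
  // prints: 0 1 0 0 / 1 0 0 0 / 0 0 0 1
  return 0;
}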
*/ -#include "paddle/fluid/operators/one_hot_v2_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; template class OneHotV2NPUKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/op_debug_string_test.cc b/paddle/fluid/operators/op_debug_string_test.cc index b96fcaa486cce..372a71706ab5e 100644 --- a/paddle/fluid/operators/op_debug_string_test.cc +++ b/paddle/fluid/operators/op_debug_string_test.cc @@ -17,8 +17,10 @@ #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add_grad); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc new file mode 100644 index 0000000000000..e5399ee36ba7f --- /dev/null +++ b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "paddle/fluid/operators/optimizers/merged_momentum_op.h"
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class MLUMergedMomentumOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto params = ctx.MultiInput<framework::Tensor>("Param");
+    auto params_out = ctx.MultiOutput<framework::Tensor>("ParamOut");
+    size_t n = params.size();
+    PADDLE_ENFORCE_EQ(n, params_out.size(),
+                      platform::errors::InvalidArgument(
+                          "The size of Output(ParamOut) must be equal to "
+                          "Input(Param), but got the size of Output(ParamOut) "
+                          "is %d, the size of Input(Param) is %d.",
+                          params_out.size(), n));
+    for (size_t i = 0; i < n; ++i) {
+      PADDLE_ENFORCE_EQ(params[i], params_out[i],
+                        platform::errors::InvalidArgument(
+                            "Input(Param) and Output(ParamOut) "
+                            "must be the same Tensors."));
+    }
+
+    auto grads = ctx.MultiInput<framework::Tensor>("Grad");
+    PADDLE_ENFORCE_EQ(
+        n, grads.size(),
+        platform::errors::InvalidArgument(
+            "The size of Input(Grad) must be equal to Input(Param), but got "
+            "the size of Input(Grad) is %d, the size of Input(Param) is %d.",
+            grads.size(), n));
+
+    auto velocitys = ctx.MultiInput<framework::Tensor>("Velocity");
+    PADDLE_ENFORCE_EQ(n, velocitys.size(),
+                      platform::errors::InvalidArgument(
+                          "The size of Input(Velocity) must be equal to "
+                          "Input(Param), but got the size of Input(Velocity) "
+                          "is %d, the size of Input(Param) is %d.",
+                          velocitys.size(), n));
+
+    auto velocitys_out = ctx.MultiOutput<framework::Tensor>("VelocityOut");
+    PADDLE_ENFORCE_EQ(
+        n, velocitys_out.size(),
+        platform::errors::InvalidArgument(
+            "The size of Output(VelocityOut) must be "
+            "equal to Input(Param), but got the size of Output(VelocityOut) is "
+            "%d, the size of Input(Param) is %d.",
+            velocitys_out.size(), n));
+    for (size_t i = 0; i < n; ++i) {
+      PADDLE_ENFORCE_EQ(velocitys[i], velocitys_out[i],
+                        platform::errors::InvalidArgument(
+                            "Input(Velocity) and Output(VelocityOut) must be "
+                            "the same Tensors."));
+    }
+
+    auto mu = ctx.Attr<float>("mu");
+    auto lrs = ctx.MultiInput<framework::Tensor>("LearningRate");
+    if (lrs.size() != 1) {
+      PADDLE_ENFORCE_EQ(
+          n, lrs.size(),
+          platform::errors::InvalidArgument(
+              "If the size of Input(LearningRate) is not 1, the size of "
+              "Input(LearningRate) must be "
+              "equal to Input(Param), but got the size of Input(LearningRate) "
+              "is %d, the size of Input(Param) is %d.",
+              lrs.size(), n));
+    }
+    auto use_nesterov = ctx.Attr<bool>("use_nesterov");
+    auto regularization_methods =
+        ctx.Attr<std::vector<std::string>>("regularization_method");
+    auto regularization_coeffs =
+        ctx.Attr<std::vector<float>>("regularization_coeff");
+    if (regularization_methods.size() != 0) {
+      PADDLE_ENFORCE_EQ(
+          n, regularization_methods.size(),
+          platform::errors::InvalidArgument(
+              "The size of Attr(regularization_method) must be equal "
+              "to Input(Param), but got the size of "
+              "Attr(regularization_method) is %d, the size of Input(Param) is "
+              "%d.",
+              regularization_methods.size(), n));
+      PADDLE_ENFORCE_EQ(
+          n, regularization_coeffs.size(),
+          platform::errors::InvalidArgument(
+              "The size of Attr(regularization_coeff) must be equal "
+              "to Input(Param), but got the size of Attr(regularization_coeff) "
+              "is %d, the size of Input(Param) is %d.",
+              regularization_coeffs.size(), n));
+    }
+
+    VLOG(5) << "use_nesterov: " << use_nesterov
+            << ", regularization_methods.size(): "
+            << regularization_methods.size()
+            << ", regularization_coeffs.size(): "
+            << regularization_coeffs.size();
+
+    auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
+
+    Tensor mu_tensor =
+        ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
+    MLUCnnlTensorDesc mu_tensor_desc(mu_tensor);
+    MLUCnnl::Fill(ctx, mu, mu_tensor_desc.get(), GetBasePtr(&mu_tensor));
+
+    for (size_t idx = 0; idx < n; ++idx) {
+      RegularizationType regularization_flag =
+          regularization_methods.size() > 0 &&
+                  regularization_methods[idx] == "l2_decay"
+              ? RegularizationType::kL2DECAY
+              : RegularizationType::kNONE;
+      T regularization_coeff = static_cast<T>(0.0);
+      if (regularization_coeffs.size() != 0) {
+        regularization_coeff = static_cast<T>(regularization_coeffs[idx]);
+      }
+
+      auto learning_rate = lrs.size() > 1 ? lrs[idx] : lrs[0];
+      auto param_out = params_out[idx];
+      auto velocity_out = velocitys_out[idx];
+
+      auto grad = grads[idx];
+      Tensor regularized_grad;
+      MLUCnnlTensorDesc param_desc(*param_out);
+      if (regularization_flag == RegularizationType::kL2DECAY) {
+        regularized_grad = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
+            param_out->dims(), dev_ctx);
+        MLUCnnlOpTensorDesc op_tensor_desc(
+            CNNL_OP_TENSOR_ADD, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
+        MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), param_desc.get(),
+                          GetBasePtr(param_out), param_desc.get(),
+                          GetBasePtr(grad), param_desc.get(),
+                          GetBasePtr(&regularized_grad), ToCnnlDataType<T>(),
+                          regularization_coeff);
+      } else {
+        regularized_grad = *grad;
+      }
+      MLUCnnl::ApplyMomentum(ctx, param_desc.get(),
+                             GetBasePtr(&regularized_grad), use_nesterov,
+                             GetBasePtr(learning_rate), GetBasePtr(&mu_tensor),
+                             GetBasePtr(param_out), GetBasePtr(velocity_out));
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_MLU_KERNEL(merged_momentum,
+                       ops::MLUMergedMomentumOpKernel<float>,
+                       ops::MLUMergedMomentumOpKernel<plat::float16>);
diff --git a/paddle/fluid/operators/pad3d_op.cc b/paddle/fluid/operators/pad3d_op.cc
index 7b9a4ab1557bf..e4952a243262b 100644
--- a/paddle/fluid/operators/pad3d_op.cc
+++ b/paddle/fluid/operators/pad3d_op.cc
@@ -16,7 +16,9 @@ limitations under the License. */
 #include <algorithm>
 #include <memory>
 #include <vector>
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/unary.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
@@ -24,734 +26,10 @@ namespace operators {
 
 using framework::Tensor;
 
-template <typename T>
-void ConstPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth,
-                         const int in_height, const int in_width,
-                         const int out_depth, const int out_height,
-                         const int out_width, const int pad_front,
-                         const int pad_top, const int pad_left, const int out_d,
-                         const int out_h, const int out_w, const T value) {
-  int in_d = out_d - pad_front;
-  int in_h = out_h - pad_top;
-  int in_w = out_w - pad_left;
-  out_data[out_d * out_height * out_width + out_h * out_width + out_w] =
-      (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
-       in_h >= in_height || in_w >= in_width)
-          ?
value - : in_data[in_d * in_height * in_width + in_h * in_width + in_w]; -} - -template -void ConstPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, const int out_h, - const int out_w, const T value) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - if (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || - in_h >= in_height || in_w >= in_width) { - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = value; - } - } else { - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = in_data[in_index + c]; - } - } -} - -template -void ReflectPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, const int out_w, - const T value) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = std::max(in_d, -in_d); // reflect by 0 - in_d = std::min(in_d, 2 * in_depth - in_d - 2); // reflect by in_depth - in_h = std::max(in_h, -in_h); // reflect by 0 - in_h = std::min(in_h, 2 * in_height - in_h - 2); // reflect by in_height - in_w = std::max(in_w, -in_w); // reflect by 0 - in_w = std::min(in_w, 2 * in_width - in_w - 2); // reflect by in_width - - out_data[out_d * out_height * out_width + out_h * out_width + out_w] = - in_data[in_d * in_height * in_width + in_h * in_width + in_w]; -} - -template -void ReflectPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, const int out_h, - const int out_w, const T value) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = std::max(in_d, -in_d); - in_d = std::min(in_d, 2 * in_depth - in_d - 2); - in_h = std::max(in_h, -in_h); - in_h = std::min(in_h, 2 * in_height - in_h - 2); - in_w = std::max(in_w, -in_w); - in_w = std::min(in_w, 2 * in_width - in_w - 2); - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = in_data[in_index + c]; - } -} - -template -void ReplicatePad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, const int out_w, - const T value) { - int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); - int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); - int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); - - out_data[out_d * out_height * out_width + out_h * out_width + out_w] = 
- in_data[in_d * in_height * in_width + in_h * in_width + in_w]; -} - -template -void ReplicatePad3DFuncNDHWC(const T* in_data, T* out_data, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, - const int out_h, const int out_w, const T value) { - int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); - int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); - int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = in_data[in_index + c]; - } -} - -template -void CircularPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, const int out_w, - const T value) { - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - out_data[out_d * out_height * out_width + out_h * out_width + out_w] = - in_data[in_d * in_height * in_width + in_h * in_width + in_w]; -} - -template -void CircularPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, - const int out_h, const int out_w, const T value) { - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = in_data[in_index + c]; - } -} - -template -void Pad3DNCDHW(const T* in_data, const int num, const int channels, - const int in_depth, const int in_height, const int in_width, - const int out_depth, const int out_height, const int out_width, - const int pad_front, const int pad_top, const int pad_left, - T value, T* out_data, - void (*pad_func)(const T*, T*, const int, const int, const int, - const int, const int, const int, const int, - const int, const int, const int, const int, - const int, const T)) { - for (int n = 0; n < num; ++n) { - for (int c = 0; c < channels; ++c) { - for (int out_d = 0; out_d < out_depth; ++out_d) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - pad_func(in_data, out_data, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, - pad_left, out_d, out_h, out_w, value); - } - } - } - in_data += in_depth * in_height * in_width; - out_data += out_depth * out_height * out_width; - } - } -} - -template -void Pad3DNDHWC(const T* in_data, const int num, const int channels, - const int in_depth, 
const int in_height, const int in_width, - const int out_depth, const int out_height, const int out_width, - const int pad_front, const int pad_top, const int pad_left, - T value, T* out_data, - void (*pad_func)(const T*, T*, const int, const int, const int, - const int, const int, const int, const int, - const int, const int, const int, const int, - const int, const int, const T)) { - for (int n = 0; n < num; ++n) { - for (int out_d = 0; out_d < out_depth; ++out_d) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - pad_func(in_data, out_data, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, - pad_left, out_d, out_h, out_w, value); - } - } - } - in_data += in_depth * in_height * in_width * channels; - out_data += out_depth * out_height * out_width * channels; - } -} - -template -void ConstPad3DGradNCDHW(T* d_in_data, const T* d_out_data, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, const int out_d, - const int out_h, const int out_w) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - if (!(in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || - in_h >= in_height || in_w >= in_width)) { - d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] = - d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; - } -} - -template -void ConstPad3DGradNDHWC(T* d_in_data, const T* d_out_data, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, const int out_h, - const int out_w) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - if (!(in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || - in_h >= in_height || in_w >= in_width)) { - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - d_in_data[in_index + c] = d_out_data[out_index + c]; - } - } -} - -template -void ReflectPad3DGradNCDHW(T* d_in_data, const T* d_out_data, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, const int out_h, - const int out_w) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = std::max(in_d, -in_d); // reflect by 0 - in_d = std::min(in_d, 2 * in_depth - in_d - 2); // reflect by in_depth - in_h = std::max(in_h, -in_h); // reflect by 0 - in_h = std::min(in_h, 2 * in_height - in_h - 2); // reflect by in_height - in_w = std::max(in_w, -in_w); // reflect by 0 - in_w = std::min(in_w, 2 * in_width - in_w - 2); // reflect by in_width - - d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] += - d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; -} - -template -void ReflectPad3DGradNDHWC(T* d_in_data, const T* d_out_data, - const int channels, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int 
out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, const int out_w) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = std::max(in_d, -in_d); - in_d = std::min(in_d, 2 * in_depth - in_d - 2); - in_h = std::max(in_h, -in_h); - in_h = std::min(in_h, 2 * in_height - in_h - 2); - in_w = std::max(in_w, -in_w); - in_w = std::min(in_w, 2 * in_width - in_w - 2); - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - d_in_data[in_index + c] += d_out_data[out_index + c]; - } -} - -template -void ReplicatePad3DGradNCDHW(T* d_in_data, const T* d_out_data, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, - const int out_h, const int out_w) { - int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); - int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); - int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); - - d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] += - d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; -} - -template -void ReplicatePad3DGradNDHWC(T* d_in_data, const T* d_out_data, - const int channels, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, - const int out_w) { - int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); - int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); - int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - d_in_data[in_index + c] += d_out_data[out_index + c]; - } -} - -template -void CircularPad3DGradNCDHW(T* d_in_data, const T* d_out_data, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, - const int out_h, const int out_w) { - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] += - d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; -} - -template -void CircularPad3DGradNDHWC(T* d_in_data, const T* d_out_data, - const int channels, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, const int out_w) { - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - const int out_index = - 
(out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - d_in_data[in_index + c] += d_out_data[out_index + c]; - } -} - -template -void Pad3DGradNCDHW(T* d_in_data, const int num, const int channels, - const int in_depth, const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data, - void (*pad_func)(T*, const T*, const int, const int, - const int, const int, const int, const int, - const int, const int, const int, const int, - const int, const int)) { - for (int n = 0; n < num; ++n) { - for (int c = 0; c < channels; ++c) { - for (int out_d = 0; out_d < out_depth; ++out_d) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - pad_func(d_in_data, d_out_data, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, - pad_left, out_d, out_h, out_w); - } - } - } - d_in_data += in_depth * in_height * in_width; - d_out_data += out_depth * out_height * out_width; - } - } -} - -template -void Pad3DGradNDHWC(T* d_in_data, const int num, const int channels, - const int in_depth, const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data, - void (*pad_func)(T*, const T*, const int, const int, - const int, const int, const int, const int, - const int, const int, const int, const int, - const int, const int, const int)) { - for (int n = 0; n < num; ++n) { - for (int out_d = 0; out_d < out_depth; ++out_d) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - pad_func(d_in_data, d_out_data, channels, in_depth, in_height, - in_width, out_depth, out_height, out_width, pad_front, - pad_top, pad_left, out_d, out_h, out_w); - } - } - } - d_in_data += in_depth * in_height * in_width * channels; - d_out_data += out_depth * out_height * out_width * channels; - } -} - -static inline std::vector GetPaddings( - const framework::ExecutionContext& context) { - std::vector paddings(6); - auto* paddings_t = context.Input("Paddings"); - if (paddings_t) { - auto paddings_data = paddings_t->data(); - std::memcpy(paddings.data(), paddings_data, paddings.size() * sizeof(int)); - } else { - auto pads = context.Attr>("paddings"); - std::copy(pads.begin(), pads.end(), paddings.data()); - } - return paddings; -} - -template -class Pad3dCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - T value = static_cast(context.Attr("value")); - - auto* x = context.Input("X"); - auto in_dims = x->dims(); - const T* in_data = x->data(); - - auto* out = context.Output("Out"); - if (data_format == "NCDHW") { - out->Resize({in_dims[0], in_dims[1], in_dims[2] + pads[4] + pads[5], - in_dims[3] + pads[2] + pads[3], - in_dims[4] + pads[0] + pads[1]}); - } else { - out->Resize({in_dims[0], in_dims[1] + pads[4] + pads[5], - in_dims[2] + pads[2] + pads[3], - in_dims[3] + pads[0] + pads[1], in_dims[4]}); - } - auto out_dims = out->dims(); - T* out_data = out->mutable_data(context.GetPlace()); - - int 
channels = in_dims[1]; - int in_depth = in_dims[2]; - int in_height = in_dims[3]; - int in_width = in_dims[4]; - int out_depth = out_dims[2]; - int out_height = out_dims[3]; - int out_width = out_dims[4]; - if (data_format == "NDHWC") { - channels = in_dims[4]; - in_depth = in_dims[1]; - in_height = in_dims[2]; - in_width = in_dims[3]; - out_depth = out_dims[1]; - out_height = out_dims[2]; - out_width = out_dims[3]; - } - - if (mode == "reflect") { - PADDLE_ENFORCE_GT(in_depth, pads[4], - platform::errors::InvalidArgument( - "The depth of Input(X)'s dimension should be " - "greater than pad_front" - " in reflect mode" - ", but received depth(%d) and pad_front(%d).", - in_depth, pads[4])); - PADDLE_ENFORCE_GT(in_depth, pads[5], - platform::errors::InvalidArgument( - "The depth of Input(X)'s dimension should be " - "greater than pad_back" - " in reflect mode" - ", but received depth(%d) and pad_back(%d).", - in_depth, pads[5])); - - PADDLE_ENFORCE_GT(in_height, pads[2], - platform::errors::InvalidArgument( - "The height of Input(X)'s dimension should be " - "greater than pad_top" - " in reflect mode" - ", but received depth(%d) and pad_top(%d).", - in_height, pads[2])); - PADDLE_ENFORCE_GT(in_height, pads[3], - platform::errors::InvalidArgument( - "The height of Input(X)'s dimension should be " - "greater than pad_bottom" - " in reflect mode" - ", but received depth(%d) and pad_bottom(%d).", - in_height, pads[3])); - - PADDLE_ENFORCE_GT(in_width, pads[0], - platform::errors::InvalidArgument( - "The width of Input(X)'s dimension should be " - "greater than pad_left" - " in reflect mode" - ", but received depth(%d) and pad_left(%d).", - in_width, pads[0])); - PADDLE_ENFORCE_GT(in_width, pads[1], - platform::errors::InvalidArgument( - "The width of Input(X)'s dimension should be " - "greater than pad_right" - " in reflect mode" - ", but received depth(%d) and pad_right(%d).", - in_width, pads[1])); - } else if (mode == "circular" || mode == "replicate") { - PADDLE_ENFORCE_NE(in_depth * in_height * in_width, 0, - platform::errors::InvalidArgument( - "The input tensor size can not be 0 for circular " - "or replicate padding mode.")); - } - - const int pad_left = pads[0]; - const int pad_top = pads[2]; - const int pad_front = pads[4]; - const int num = in_dims[0]; - if (data_format == "NCDHW") { - std::map - func_map; - - func_map["reflect"] = ReflectPad3DFuncNCDHW; - func_map["replicate"] = ReplicatePad3DFuncNCDHW; - func_map["circular"] = CircularPad3DFuncNCDHW; - func_map["constant"] = ConstPad3DFuncNCDHW; - Pad3DNCDHW(in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - value, out_data, func_map[mode]); - } else { - std::map - func_map; - - func_map["reflect"] = ReflectPad3DFuncNDHWC; - func_map["replicate"] = ReplicatePad3DFuncNDHWC; - func_map["circular"] = CircularPad3DFuncNDHWC; - func_map["constant"] = ConstPad3DFuncNDHWC; - Pad3DNDHWC(in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - value, out_data, func_map[mode]); - } - } -}; - -template -class Pad3dGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_in = context.Output(framework::GradVarName("X")); - auto d_in_dims = 
d_in->dims(); - auto d_out_dims = d_out->dims(); - const T* d_out_data = d_out->data(); - T* d_in_data = d_in->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(context.template device_context(), - d_in, static_cast(0)); - const int pad_left = pads[0]; - const int pad_top = pads[2]; - const int pad_front = pads[4]; - const int num = d_in_dims[0]; - if (data_format == "NCDHW") { - const int channels = d_in_dims[1]; - const int in_depth = d_in_dims[2]; - const int in_height = d_in_dims[3]; - const int in_width = d_in_dims[4]; - const int out_depth = d_out_dims[2]; - const int out_height = d_out_dims[3]; - const int out_width = d_out_dims[4]; - - std::map - func_map; - - func_map["reflect"] = ReflectPad3DGradNCDHW; - func_map["replicate"] = ReplicatePad3DGradNCDHW; - func_map["circular"] = CircularPad3DGradNCDHW; - func_map["constant"] = ConstPad3DGradNCDHW; - - Pad3DGradNCDHW(d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, - pad_left, d_out_data, func_map[mode]); - } else { - const int channels = d_in_dims[4]; - const int in_depth = d_in_dims[1]; - const int in_height = d_in_dims[2]; - const int in_width = d_in_dims[3]; - const int out_depth = d_out_dims[1]; - const int out_height = d_out_dims[2]; - const int out_width = d_out_dims[3]; - - std::map - func_map; - - func_map["reflect"] = ReflectPad3DGradNDHWC; - func_map["replicate"] = ReplicatePad3DGradNDHWC; - func_map["circular"] = CircularPad3DGradNDHWC; - func_map["constant"] = ConstPad3DGradNDHWC; - - Pad3DGradNDHWC(d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, - pad_left, d_out_data, func_map[mode]); - } - } -}; - class Pad3dOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Pad3d"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Pad3d"); - - auto x_dim = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ(x_dim.size(), 5, - platform::errors::InvalidArgument( - "The size of Input(X)'s dimension should be equal to " - "5, but received %d. ", - x_dim.size())); - - std::vector out_dims(x_dim.size()); - auto data_format = ctx->Attrs().Get("data_format"); - out_dims[0] = x_dim[0]; - if (ctx->HasInput("Paddings")) { - auto paddings_dim = ctx->GetInputDim("Paddings"); - PADDLE_ENFORCE_EQ(paddings_dim.size(), 1, - platform::errors::InvalidArgument( - "Size of Input(Paddings)'s dimension should be " - "equal to 1, but received %d.", - paddings_dim.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(paddings_dim[0], 6, - platform::errors::InvalidArgument( - "Shape of Input(Paddings) should be equal to " - "[6], but received [%d].", - paddings_dim[0])); - } - out_dims[1] = x_dim[1]; - out_dims[2] = x_dim[2]; - out_dims[3] = x_dim[3]; - } else { - auto paddings = ctx->Attrs().Get>("paddings"); - PADDLE_ENFORCE_EQ( - paddings.size(), 6, - platform::errors::InvalidArgument( - "Size of paddings should be equal to 4, but received %d.", - static_cast(paddings.size()))); - if (data_format == "NCDHW") { - out_dims[1] = x_dim[1]; // channel - out_dims[2] = ((!ctx->IsRuntime()) && (x_dim[2] < 0)) - ? x_dim[2] - : (x_dim[2] + paddings[4] + paddings[5]); // depth - - out_dims[3] = ((!ctx->IsRuntime()) && (x_dim[3] < 0)) - ? 
x_dim[3] - : (x_dim[3] + paddings[2] + paddings[3]); // height - - out_dims[4] = ((!ctx->IsRuntime()) && (x_dim[4] < 0)) - ? x_dim[4] - : (x_dim[4] + paddings[0] + paddings[1]); // width - } else { // NDHWC - out_dims[4] = x_dim[4]; // channel - - out_dims[1] = ((!ctx->IsRuntime()) && (x_dim[1] < 0)) - ? x_dim[1] - : (x_dim[1] + paddings[4] + paddings[5]); // depth - out_dims[2] = ((!ctx->IsRuntime()) && (x_dim[2] < 0)) - ? x_dim[2] - : (x_dim[2] + paddings[2] + paddings[3]); // height - out_dims[3] = ((!ctx->IsRuntime()) && (x_dim[3] < 0)) - ? x_dim[3] - : (x_dim[3] + paddings[0] + paddings[1]); // width - } - } - - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -921,15 +199,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(Pad3dOpGradNoNeedBufferVarsInferer, "X"); namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(pad3d, Pad3dInferShapeFunctor, + PD_INFER_META(phi::Pad3dInferMeta)); + REGISTER_OPERATOR(pad3d, ops::Pad3dOp, ops::Pad3dOpMaker, ops::Pad3dOpGradMaker, - ops::Pad3dOpGradMaker); + ops::Pad3dOpGradMaker, + Pad3dInferShapeFunctor); REGISTER_OPERATOR(pad3d_grad, ops::Pad3dOpGrad, ops::Pad3dOpDoubleGradMaker, ops::Pad3dOpDoubleGradMaker, ops::Pad3dOpGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL(pad3d, ops::Pad3dCPUKernel, - ops::Pad3dCPUKernel, ops::Pad3dCPUKernel, - ops::Pad3dCPUKernel); -REGISTER_OP_CPU_KERNEL(pad3d_grad, ops::Pad3dGradCPUKernel, - ops::Pad3dGradCPUKernel); diff --git a/paddle/fluid/operators/pad3d_op.cu b/paddle/fluid/operators/pad3d_op.cu deleted file mode 100644 index 9ab0eb9d445da..0000000000000 --- a/paddle/fluid/operators/pad3d_op.cu +++ /dev/null @@ -1,793 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; - -using framework::Tensor; - -template -__global__ void Pad3DConstNCDHW(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T value, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int nc = index / out_width; - - const int out_w = index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - out_data[index] = - (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || - in_h >= in_height || in_w >= in_width) - ? 
value - : in_data[nc * in_depth * in_height * in_width + - in_d * in_height * in_width + in_h * in_width + in_w]; - } -} - -template -__global__ void Pad3DConstNDHWC(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T value, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int n = index / channels; - const int c = index % channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - const int in_d = out_d - pad_front; - const int in_h = out_h - pad_top; - const int in_w = out_w - pad_left; - - out_data[index] = - (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || - in_h >= in_height || in_w >= in_width) - ? value - : in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c]; - } -} - -template -__global__ void Pad3DReflectNCDHW(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int nc = index / out_width; - - const int out_w = index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = max(in_d, -in_d); // reflect by 0 - in_d = min(in_d, 2 * in_depth - in_d - 2); // reflect by in_depth - in_h = max(in_h, -in_h); // reflect by 0 - in_h = min(in_h, 2 * in_height - in_h - 2); // reflect by in_height - in_w = max(in_w, -in_w); // reflect by 0 - in_w = min(in_w, 2 * in_width - in_w - 2); // reflect by in_width - out_data[index] = - in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * - in_width + - in_w]; - } -} - -template -__global__ void Pad3DReflectNDHWC(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int n = index / channels; - const int c = index % channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = max(in_d, -in_d); - in_d = min(in_d, 2 * in_depth - in_d - 2); - in_h = max(in_h, -in_h); - in_h = min(in_h, 2 * in_height - in_h - 2); - in_w = max(in_w, -in_w); - in_w = min(in_w, 2 * in_width - in_w - 2); - - out_data[index] = in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c]; - } -} - -template -__global__ void Pad3DReplicateNCDHW(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const 
int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int nc = index / out_width; - - const int out_w = index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); - int in_h = min(in_height - 1, max(out_h - pad_top, 0)); - int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - - out_data[index] = - in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * - in_width + - in_w]; - } -} - -template -__global__ void Pad3DReplicateNDHWC(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int n = index / channels; - const int c = index % channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - - int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); - int in_h = min(in_height - 1, max(out_h - pad_top, 0)); - int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - - out_data[index] = in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c]; - } -} - -template -__global__ void Pad3DCircularNCDHW(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int nc = index / out_width; - - const int out_w = index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - out_data[index] = - in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * - in_width + - in_w]; - } -} - -template -__global__ void Pad3DCircularNDHWC(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int n = index / channels; - const int c = index % channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - out_data[index] = in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c]; - } -} - -template -__global__ void Pad3DGradConstNCDHW(const int in_size, T* d_in_data, - const int num, const int channels, - const 
int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data) { - CUDA_KERNEL_LOOP(in_index, in_size) { - const int in_w = in_index % in_width; - - int nc = in_index / in_width; - const int in_h = nc % in_height; - - nc /= in_height; - const int in_d = nc % in_depth; - - nc /= in_depth; - - const int out_d = in_d + pad_front; - const int out_h = in_h + pad_top; - const int out_w = in_w + pad_left; - d_in_data[in_index] = - d_out_data[nc * out_depth * out_height * out_width + - out_d * out_height * out_width + out_h * out_width + out_w]; - } -} - -template -__global__ void Pad3DGradConstNDHWC(const int in_size, T* d_in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data) { - CUDA_KERNEL_LOOP(in_index, in_size) { - const int c = in_index % channels; - int n = in_index / channels; - - const int in_w = n % in_width; - n /= in_width; - - const int in_h = n % in_height; - n /= in_height; - - const int in_d = n % in_depth; - n /= in_depth; - - const int out_d = in_d + pad_front; - const int out_h = in_h + pad_top; - const int out_w = in_w + pad_left; - - d_in_data[in_index] = - d_out_data[n * out_depth * out_height * out_width * channels + - out_d * out_height * out_width * channels + - out_h * out_width * channels + out_w * channels + c]; - } -} - -template -__global__ void Pad3DGradReflectNCDHW(const int out_size, T* d_in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - int nc = out_index / out_width; - const int out_w = out_index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = max(in_d, -in_d); - in_h = max(in_h, -in_h); - in_w = max(in_w, -in_w); - - in_d = min(in_d, 2 * in_depth - in_d - 2); - in_h = min(in_h, 2 * in_height - in_h - 2); - in_w = min(in_w, 2 * in_width - in_w - 2); - - platform::CudaAtomicAdd( - &d_in_data[nc * in_depth * in_height * in_width + - in_d * in_height * in_width + in_h * in_width + in_w], - d_out_data[out_index]); - } -} - -template -__global__ void Pad3DGradReflectNDHWC(const int out_size, T* d_in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - const int c = out_index % channels; - int n = out_index / channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = max(in_d, -in_d); - in_h = max(in_h, -in_h); - in_w = max(in_w, -in_w); - - in_d = min(in_d, in_depth * 2 - in_d - 2); - in_h = min(in_h, in_height * 2 - in_h - 2); 
- in_w = min(in_w, in_width * 2 - in_w - 2); - platform::CudaAtomicAdd( - &d_in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c], - d_out_data[out_index]); - } -} - -template -__global__ void Pad3DGradReplicateNCDHW( - const int out_size, T* d_in_data, const int num, const int channels, - const int in_depth, const int in_height, const int in_width, - const int out_depth, const int out_height, const int out_width, - const int pad_front, const int pad_top, const int pad_left, - const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - int nc = out_index / out_width; - const int out_w = out_index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - const int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); - const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); - const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - - platform::CudaAtomicAdd( - &d_in_data[nc * in_depth * in_height * in_width + - in_d * in_height * in_width + in_h * in_width + in_w], - d_out_data[out_index]); - } -} - -template -__global__ void Pad3DGradReplicateNDHWC( - const int out_size, T* d_in_data, const int num, const int channels, - const int in_depth, const int in_height, const int in_width, - const int out_depth, const int out_height, const int out_width, - const int pad_front, const int pad_top, const int pad_left, - const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - const int c = out_index % channels; - int n = out_index / channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - - const int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); - const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); - const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - - platform::CudaAtomicAdd( - &d_in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c], - d_out_data[out_index]); - } -} - -template -__global__ void Pad3DGradCircularNCDHW(const int out_size, T* d_in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - int nc = out_index / out_width; - const int out_w = out_index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - platform::CudaAtomicAdd( - &d_in_data[nc * in_depth * in_height * in_width + - in_d * in_height * in_width + in_h * in_width + in_w], - d_out_data[out_index]); - } -} - -template -__global__ void Pad3DGradCircularNDHWC(const int out_size, T* d_in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const T* d_out_data) { - 
CUDA_KERNEL_LOOP(out_index, out_size) { - const int c = out_index % channels; - int n = out_index / channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - platform::CudaAtomicAdd( - &d_in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c], - d_out_data[out_index]); - } -} - -static inline std::vector GetPaddings( - const framework::ExecutionContext& context) { - std::vector paddings(6); - auto* paddings_data = context.Input("Paddings"); - if (paddings_data) { - Tensor pads; - framework::TensorCopySync(*paddings_data, platform::CPUPlace(), &pads); - auto pads_data = pads.data(); - std::memcpy(paddings.data(), pads_data, paddings.size() * sizeof(int)); - } else { - auto pads = context.Attr>("paddings"); - std::copy(pads.begin(), pads.end(), paddings.data()); - } - return paddings; -} - -template -class Pad3dCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - T value = static_cast(context.Attr("value")); - - auto* x = context.Input("X"); - auto in_dims = x->dims(); - const T* in_data = x->data(); - auto* out = context.Output("Out"); - auto out_dims = out->dims(); - if (data_format == "NCDHW") { - out_dims[0] = in_dims[0]; - out_dims[1] = in_dims[1]; - out_dims[2] = in_dims[2] + pads[4] + pads[5]; - out_dims[3] = in_dims[3] + pads[2] + pads[3]; - out_dims[4] = in_dims[4] + pads[0] + pads[1]; - } else { - out_dims[0] = in_dims[0]; - out_dims[1] = in_dims[1] + pads[4] + pads[5]; - out_dims[2] = in_dims[2] + pads[2] + pads[3]; - out_dims[3] = in_dims[3] + pads[0] + pads[1]; - out_dims[4] = in_dims[4]; - } - T* out_data = out->mutable_data(out_dims, context.GetPlace()); - - int channels = in_dims[1]; - int in_depth = in_dims[2]; - int in_height = in_dims[3]; - int in_width = in_dims[4]; - int out_depth = out_dims[2]; - int out_height = out_dims[3]; - int out_width = out_dims[4]; - if (data_format == "NDHWC") { - channels = in_dims[4]; - in_depth = in_dims[1]; - in_height = in_dims[2]; - in_width = in_dims[3]; - out_depth = out_dims[1]; - out_height = out_dims[2]; - out_width = out_dims[3]; - } - - if (mode == "reflect") { - PADDLE_ENFORCE_GT(in_depth, pads[4], - platform::errors::InvalidArgument( - "The depth of Input(X)'s dimension should be " - "greater than pad_front" - " in reflect mode" - ", but received depth(%d) and pad_front(%d).", - in_depth, pads[4])); - PADDLE_ENFORCE_GT(in_depth, pads[5], - platform::errors::InvalidArgument( - "The depth of Input(X)'s dimension should be " - "greater than pad_back" - " in reflect mode" - ", but received depth(%d) and pad_back(%d).", - in_depth, pads[5])); - - PADDLE_ENFORCE_GT(in_height, pads[2], - platform::errors::InvalidArgument( - "The height of Input(X)'s dimension should be " - "greater than pad_top" - " in reflect mode" - ", but received depth(%d) and pad_top(%d).", - in_height, pads[2])); - PADDLE_ENFORCE_GT(in_height, pads[3], - platform::errors::InvalidArgument( - "The height of Input(X)'s dimension should be " - 
"greater than pad_bottom" - " in reflect mode" - ", but received depth(%d) and pad_bottom(%d).", - in_height, pads[3])); - - PADDLE_ENFORCE_GT(in_width, pads[0], - platform::errors::InvalidArgument( - "The width of Input(X)'s dimension should be " - "greater than pad_left" - " in reflect mode" - ", but received depth(%d) and pad_left(%d).", - in_width, pads[0])); - PADDLE_ENFORCE_GT(in_width, pads[1], - platform::errors::InvalidArgument( - "The width of Input(X)'s dimension should be " - "greater than pad_right" - " in reflect mode" - ", but received depth(%d) and pad_right(%d).", - in_width, pads[1])); - } else if (mode == "circular" || mode == "replicate") { - PADDLE_ENFORCE_NE(in_depth * in_height * in_width, 0, - platform::errors::InvalidArgument( - "The input tensor size can not be 0 for circular " - "or replicate padding mode.")); - } - - const int pad_left = pads[0]; - const int pad_top = pads[2]; - const int pad_front = pads[4]; - const int num = in_dims[0]; - - auto stream = context.cuda_device_context().stream(); - int block = PADDLE_CUDA_NUM_THREADS; - const int out_size = out->numel(); - int grid = (out_size + block - 1) / block; - - if (data_format == "NCDHW") { - if (mode == "reflect") { - Pad3DReflectNCDHW<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else if (mode == "replicate") { - Pad3DReplicateNCDHW<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else if (mode == "circular") { - Pad3DCircularNCDHW<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else { - Pad3DConstNCDHW<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - value, out_data); - } - } else { - if (mode == "reflect") { - Pad3DReflectNDHWC<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else if (mode == "replicate") { - Pad3DReplicateNDHWC<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else if (mode == "circular") { - Pad3DCircularNDHWC<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else { - Pad3DConstNDHWC<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - value, out_data); - } - } - } -}; - -template -class Pad3dGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_in = context.Output(framework::GradVarName("X")); - auto d_in_dims = d_in->dims(); - auto d_out_dims = d_out->dims(); - const T* d_out_data = d_out->data(); - T* d_in_data = d_in->mutable_data(context.GetPlace()); - - phi::funcs::SetConstant set_zero; - set_zero(context.template device_context(), - d_in, static_cast(0)); - - const int pad_left = pads[0]; - const int 
pad_top = pads[2]; - const int pad_front = pads[4]; - - const int num = d_in_dims[0]; - - auto stream = context.cuda_device_context().stream(); - int block = PADDLE_CUDA_NUM_THREADS; - const int out_size = d_out->numel(); - const int in_size = d_in->numel(); - int grid = (out_size + block - 1) / block; - - if (data_format == "NCDHW") { - const int channels = d_in_dims[1]; - const int in_depth = d_in_dims[2]; - const int in_height = d_in_dims[3]; - const int in_width = d_in_dims[4]; - const int out_depth = d_out_dims[2]; - const int out_height = d_out_dims[3]; - const int out_width = d_out_dims[4]; - - if (mode == "reflect") { - Pad3DGradReflectNCDHW<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else if (mode == "replicate") { - Pad3DGradReplicateNCDHW<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else if (mode == "circular") { - Pad3DGradCircularNCDHW<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else { - grid = (in_size + block - 1) / block; - Pad3DGradConstNCDHW<<>>( - in_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } - } else { - const int channels = d_in_dims[4]; - const int in_depth = d_in_dims[1]; - const int in_height = d_in_dims[2]; - const int in_width = d_in_dims[3]; - const int out_depth = d_out_dims[1]; - const int out_height = d_out_dims[2]; - const int out_width = d_out_dims[3]; - if (mode == "reflect") { - Pad3DGradReflectNDHWC<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else if (mode == "replicate") { - Pad3DGradReplicateNDHWC<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else if (mode == "circular") { - Pad3DGradCircularNDHWC<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else { - grid = (in_size + block - 1) / block; - Pad3DGradConstNDHWC<<>>( - in_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(pad3d, ops::Pad3dCUDAKernel, - ops::Pad3dCUDAKernel, - ops::Pad3dCUDAKernel, ops::Pad3dCUDAKernel, - ops::Pad3dCUDAKernel); -REGISTER_OP_CUDA_KERNEL(pad3d_grad, ops::Pad3dGradCUDAKernel, - ops::Pad3dGradCUDAKernel, - ops::Pad3dGradCUDAKernel); diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc index 229e61ac9fe79..dc162ae5782f2 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
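
Note: the CPU and CUDA pad3d kernels deleted above share one piece of arithmetic: mapping an output coordinate back to an input coordinate on each axis, with only the border rule differing between modes. A condensed sketch of that per-axis mapping (the MapIndex helper is hypothetical; the live implementations are now phi kernels):

#include <algorithm>
#include <string>

// Map an output index back to an input index along one axis.
// `pad` is the low-side padding, `size` the input extent on that axis.
int MapIndex(int out_idx, int pad, int size, const std::string& mode) {
  int in_idx = out_idx - pad;
  if (mode == "reflect") {           // mirror without repeating the border
    in_idx = std::max(in_idx, -in_idx);
    in_idx = std::min(in_idx, 2 * size - in_idx - 2);
  } else if (mode == "replicate") {  // clamp to the border
    in_idx = std::min(size - 1, std::max(in_idx, 0));
  } else if (mode == "circular") {   // wrap around
    in_idx = ((in_idx % size) + size) % size;
  }
  // "constant" mode: an out-of-range in_idx stays out of range and the
  // output element is filled with `value` instead of an input read.
  return in_idx;
}
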
diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc
index 229e61ac9fe79..dc162ae5782f2 100644
--- a/paddle/fluid/operators/pad_op.cc
+++ b/paddle/fluid/operators/pad_op.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <memory>
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/complex.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -28,37 +30,6 @@ class PadOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Pad");
     OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Pad");
-
-    auto x_dim = ctx->GetInputDim("X");
-    auto& paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-    PADDLE_ENFORCE_EQ(
-        static_cast<int>(paddings.size()), x_dim.size() * 2,
-        platform::errors::InvalidArgument(
-            "Size of 'paddings' dimension should be equal to 2 * size of "
-            "Input(X)'s dimension, but received (size of 'paddings' dimension "
-            "is) %d vs (2 * size of Input(X)'s dimension is) %d.",
-            static_cast<int>(paddings.size()), x_dim.size() * 2));
-    for (size_t i = 0; i < paddings.size(); ++i) {
-      PADDLE_ENFORCE_GE(paddings[i], 0,
-                        platform::errors::InvalidArgument(
-                            "The element of 'paddings' should >= 0, but "
-                            "received %d for index %d.",
-                            paddings[i], static_cast<int>(i)));
-    }
-    std::vector<int64_t> out_dims(x_dim.size());
-    for (int i = 0; i < x_dim.size(); ++i) {
-      if ((!ctx->IsRuntime()) && (x_dim[i] == -1)) {
-        out_dims[i] = -1;
-      } else {
-        out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1];
-      }
-    }
-    ctx->SetOutputDim("Out", phi::make_ddim(out_dims));
-    if (out_dims[0] == x_dim[0]) {
-      // Only pass LoD when the first dimension is equal between
-      // output and input.
-      ctx->ShareLoD("X", /*->*/ "Out");
-    }
   }
 };
 
@@ -160,10 +131,13 @@ class PadOpDoubleGradMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(pad, PadInferShapeFunctor,
+                            PD_INFER_META(phi::PadInferMeta));
 
 REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker,
                   ops::PadOpGradMaker<paddle::framework::OpDesc>,
-                  ops::PadOpGradMaker<paddle::imperative::OpBase>);
+                  ops::PadOpGradMaker<paddle::imperative::OpBase>,
+                  PadInferShapeFunctor);
 REGISTER_OPERATOR(pad_grad, ops::PadOpGrad,
                   ops::PadOpDoubleGradMaker<paddle::framework::OpDesc>,
                   ops::PadOpDoubleGradMaker<paddle::imperative::OpBase>);
diff --git a/paddle/fluid/operators/pad_op_npu.cc b/paddle/fluid/operators/pad_op_npu.cc
index d0cb674b4049f..adc4a2ffaf8c5 100644
--- a/paddle/fluid/operators/pad_op_npu.cc
+++ b/paddle/fluid/operators/pad_op_npu.cc
@@ -90,5 +90,5 @@ namespace plat = paddle::platform;
 REGISTER_OP_NPU_KERNEL(pad, ops::PadNPUKernel<float>,
                        ops::PadNPUKernel<int>,
                        ops::PadNPUKernel<plat::float16>);
-REGISTER_OP_NPU_KERNEL(pad_grad, ops::PadNPUKernel<float>,
+REGISTER_OP_NPU_KERNEL(pad_grad, ops::PadGradNPUKernel<float>,
                        ops::PadGradNPUKernel<plat::float16>);
diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc
deleted file mode 100644
index 6335004e69a37..0000000000000
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ /dev/null
@@ -1,567 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/pool_op.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/operator.h" -#endif -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; -using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor; -using DataLayout = platform::DataLayout; -using PoolingMode = platform::PoolingMode; -template -using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; - -DataLayout getLayoutFromStr(std::string data_format) { - if (data_format == "NHWC") { - return DataLayout::kNHWC; - } else if (data_format == "NCHW") { - return DataLayout::kNCHW; - } else if (data_format == "NCDHW") { - return DataLayout::kNCDHW; - } else { - return DataLayout::kNCDHW; - } -} - -template -class PoolCUDNNOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("Pool operator CUDA kernel must use " - "CUDAPlace rather than CPUPlace.")); - - const Tensor *input = ctx.Input("X"); - Tensor *output = ctx.Output("Out"); - output->mutable_data(ctx.GetPlace()); - std::string pooling_type = ctx.Attr("pooling_type"); - bool exclusive = ctx.Attr("exclusive"); - bool adaptive = ctx.Attr("adaptive"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string data_format = ctx.Attr("data_format"); - bool global_pooling = ctx.Attr("global_pooling"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings - auto in_x_dims = input->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; - const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; - - // -----------------transformed tensor ------------------------ - - Tensor transformed_input(input->type()); - Tensor transformed_output(output->type()); - DataLayout layout; - - if (data_format == str_NDHWC) { - layout = DataLayout::kNCDHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 4, 1, 2, 3}; - - // input - transformed_input.Resize(input->dims()); - - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[4]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - in_dims_vec[4] = input->dims()[3]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans5; - trans5(dev_ctx, *input, &transformed_input, axis); - - // output - 
transformed_output.Resize(output->dims()); - - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[4]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - out_dims_vec[4] = output->dims()[3]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - } else if (data_format == str_NHWC) { - layout = DataLayout::kNCHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 3, 1, 2}; - - transformed_input.Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[3]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans; - trans(dev_ctx, *input, &transformed_input, axis); - - transformed_output.Resize(output->dims()); - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[3]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); -#endif - } else { - layout = getLayoutFromStr(data_format); - transformed_input = *input; - transformed_output = *output; - } - - const T *tranformed_input_data = transformed_input.data(); - T *tranformed_output_data = transformed_output.mutable_data( - transformed_output.dims(), ctx.GetPlace()); - - // ------------------- cudnn descriptors --------------------- - ScopedTensorDescriptor input_desc; - ScopedTensorDescriptor output_desc; - ScopedPoolingDescriptor pool_desc; - -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#else - cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#endif - PoolingMode pooling_mode; - if (pooling_type == "max") { - pooling_mode = PoolingMode::kMaximum; - } else { - pooling_mode = exclusive ? 
PoolingMode::kAverageExclusive - : PoolingMode::kAverageInclusive; - } - -#ifdef PADDLE_WITH_HIP - miopenPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#else - cudnnPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#endif - - // ------------------- cudnn pool algorithm --------------------- - auto handle = ctx.cuda_device_context().cudnn_handle(); - ScalingParamType alpha = 1.0f, beta = 0.0f; - -#ifdef PADDLE_WITH_HIP - char *pool_workspace; - size_t pool_worksize = 0; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenPoolingGetWorkSpaceSizeV2( - cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenPoolingForward( - handle, cudnn_pool_desc, &alpha, cudnn_input_desc, - tranformed_input_data, &beta, cudnn_output_desc, tranformed_output_data, - false, pool_workspace, pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingForward( - handle, cudnn_pool_desc, &alpha, cudnn_input_desc, - tranformed_input_data, &beta, cudnn_output_desc, - tranformed_output_data)); -#endif - // add - if (data_format == str_NDHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 4, 1}; - phi::funcs::Transpose - trans5_v2; - trans5_v2(dev_ctx, transformed_output, output, axis); - } -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - if (data_format == str_NHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 1}; - phi::funcs::Transpose trans; - trans(dev_ctx, transformed_output, output, axis); - } -#endif - } -}; - -template -class PoolCUDNNGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("Pool operator CUDA kernel must use " - "CUDAPlace rather than CPUPlace.")); - - const Tensor *input = ctx.Input("X"); - const Tensor *output = ctx.Input("Out"); - const Tensor *output_grad = - ctx.Input(framework::GradVarName("Out")); - Tensor *input_grad = ctx.Output(framework::GradVarName("X")); - - std::string pooling_type = ctx.Attr("pooling_type"); - bool exclusive = ctx.Attr("exclusive"); - bool adaptive = ctx.Attr("adaptive"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string data_format = ctx.Attr("data_format"); - bool global_pooling = ctx.Attr("global_pooling"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - -#ifdef PADDLE_WITH_HIP - if (pooling_type == "max") { - using OpKernelMap = paddle::framework::OperatorWithKernel::OpKernelMap; - using OpKernelFunc = paddle::framework::OperatorWithKernel::OpKernelFunc; - auto &all_op_kernels = - paddle::framework::OperatorWithKernel::AllOpKernels(); - std::string op_type = "pool2d_grad"; - auto kernels_iter = all_op_kernels.find(op_type); - PADDLE_ENFORCE_NE( - kernels_iter, all_op_kernels.end(), - platform::errors::Unavailable( - "There are no kernels which are registered in the %s operator.", - op_type)); - OpKernelMap &kernels = kernels_iter->second; - paddle::framework::OpKernelType expected_kernel_key( 
- paddle::framework::ToDataType(typeid(T)), ctx.GetPlace()); - auto kernel_iter = kernels.find(expected_kernel_key); - PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), - platform::errors::NotFound( - "Operator (%s) does not have kernel for %s.", - op_type, KernelTypeToString(expected_kernel_key))); - std::unique_ptr kernel_func_( - new OpKernelFunc(kernel_iter->second)); - (*kernel_func_)(ctx); - return; - } -#endif - - // update paddings - auto in_x_dims = input->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - // ------- tensor grad -------------- - Tensor transformed_input(input->type()); - Tensor transformed_output(output->type()); - Tensor transformed_output_grad(output_grad->type()); - - input_grad->mutable_data(ctx.GetPlace()); - Tensor transformed_input_grad(input_grad->type()); - DataLayout layout; - const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; - const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; - if (data_format == str_NDHWC) { - layout = DataLayout::kNCDHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 4, 1, 2, 3}; - - // input - transformed_input.Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[4]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - in_dims_vec[4] = input->dims()[3]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans5; - trans5(dev_ctx, *input, &transformed_input, axis); - - // output - transformed_output.Resize(output->dims()); - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[4]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - out_dims_vec[4] = output->dims()[3]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); - - transformed_output.mutable_data(ctx.GetPlace(), output->type()); - - phi::funcs::Transpose - trans5_v2; - trans5_v2(dev_ctx, *output, &transformed_output, axis); - - // output grad - transformed_output_grad.Resize(phi::make_ddim(out_dims_vec)); - transformed_output_grad.mutable_data(ctx.GetPlace(), output_grad->type()); - - phi::funcs::Transpose - trans5_v3; - trans5_v3(dev_ctx, *output_grad, &transformed_output_grad, axis); - - // input grad - transformed_input_grad.Resize(phi::make_ddim(in_dims_vec)); - -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - } else if (data_format == str_NHWC) { - layout = DataLayout::kNCHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 3, 1, 2}; - - // input - transformed_input.Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[3]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans4; - trans4(dev_ctx, *input, &transformed_input, axis); - - // output - 
transformed_output.Resize(output->dims()); - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[3]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); - - transformed_output.mutable_data(ctx.GetPlace(), output->type()); - - phi::funcs::Transpose - trans4_v2; - trans4_v2(dev_ctx, *output, &transformed_output, axis); - - // output grad - transformed_output_grad.Resize(phi::make_ddim(out_dims_vec)); - transformed_output_grad.mutable_data(ctx.GetPlace(), output_grad->type()); - - phi::funcs::Transpose - trans4_v3; - trans4_v3(dev_ctx, *output_grad, &transformed_output_grad, axis); - - // input grad - transformed_input_grad.Resize(phi::make_ddim(in_dims_vec)); -#endif - } else { - layout = getLayoutFromStr(data_format); - transformed_input = *input; - transformed_output = *output; - transformed_output_grad = *output_grad; - transformed_input_grad = *input_grad; - } - - const T *input_data = transformed_input.data(); - const T *output_data = transformed_output.data(); - const T *output_grad_data = transformed_output_grad.data(); - - // ------------------- cudnn descriptors --------------------- - ScopedTensorDescriptor input_desc; - ScopedTensorDescriptor output_desc; - ScopedPoolingDescriptor pool_desc; - -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#else - cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#endif - PoolingMode pooling_mode; - if (pooling_type == "max") { - if (FLAGS_cudnn_deterministic) { - pooling_mode = PoolingMode::kMaximumDeterministic; - } else { - pooling_mode = PoolingMode::kMaximum; - } - } else { - pooling_mode = exclusive ? PoolingMode::kAverageExclusive - : PoolingMode::kAverageInclusive; - } - -#ifdef PADDLE_WITH_HIP - miopenPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#else - cudnnPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#endif - - // ------------------- cudnn pool algorithm --------------------- - auto handle = ctx.cuda_device_context().cudnn_handle(); - ScalingParamType alpha = 1.0f, beta = 0.0f; - if (input_grad) { - T *input_grad_data = transformed_input_grad.mutable_data( - transformed_input_grad.dims(), ctx.GetPlace()); -// Because beta is zero, it is unnecessary to reset input_grad. 
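Review note on the "Because beta is zero" comment above: cuDNN's pooling entry points follow the BLAS-like blending convention dst = alpha * op(src) + beta * dst, so beta == 0 means the destination is purely overwritten and never read, which is why no memset of input_grad is needed. A plain-C++ illustration of the convention (no cuDNN involved; values are arbitrary):

    #include <cstdio>
    #include <vector>

    // Mimics cuDNN's alpha/beta blending: dst = alpha * op_out + beta * dst.
    void scale_blend(float alpha, float beta, const std::vector<float>& op_out,
                     std::vector<float>* dst) {
      for (size_t i = 0; i < dst->size(); ++i)
        (*dst)[i] = alpha * op_out[i] + beta * (*dst)[i];
    }

    int main() {
      std::vector<float> dst = {42.f, 42.f};  // stale, uninitialized-style contents
      scale_blend(/*alpha=*/1.f, /*beta=*/0.f, {1.f, 2.f}, &dst);
      std::printf("%g %g\n", dst[0], dst[1]);  // prints "1 2": stale values never read
      return 0;
    }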
-#ifdef PADDLE_WITH_HIP - char *pool_workspace; - size_t pool_worksize = 0; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenPoolingGetWorkSpaceSizeV2( - cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenPoolingBackward( - handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, - cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, - &beta, cudnn_input_desc, input_grad_data, pool_workspace)); - PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingBackward( - handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, - cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, - &beta, cudnn_input_desc, input_grad_data)); -#endif - - if (data_format == str_NDHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 4, 1}; - phi::funcs::Transpose - trans5_v4; - trans5_v4(dev_ctx, transformed_input_grad, input_grad, axis); - } -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - if (data_format == str_NHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 1}; - phi::funcs::Transpose - trans4_v4; - trans4_v4(dev_ctx, transformed_input_grad, input_grad, axis); - } -#endif - } - } -}; - -template -class PoolCUDNNGradGradOpKernel : public PoolCUDNNOpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - std::string pooling_type = ctx.Attr("pooling_type"); - if (pooling_type == "max") { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op grad grad only supports avgpool.")); - } else { - PoolCUDNNOpKernel::Compute(ctx); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); - -REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel); -#else -REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); -REGISTER_OP_KERNEL(pool2d_grad_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradGradOpKernel, - ops::PoolCUDNNGradGradOpKernel, - ops::PoolCUDNNGradGradOpKernel); - -REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); -#endif diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index ae095c2fa7aaa..44f3d8090e565 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -15,6 +15,12 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/pool_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" + #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -23,125 +29,6 @@ limitations under the License. */ namespace paddle { namespace operators { -int PoolOutputSize(int input_size, int filter_size, int padding_1, - int padding_2, int stride, bool ceil_mode) { - int output_size; - if (!ceil_mode) { - output_size = - (input_size - filter_size + padding_1 + padding_2) / stride + 1; - } else { - output_size = - (input_size - filter_size + padding_1 + padding_2 + stride - 1) / - stride + - 1; - } - PADDLE_ENFORCE_GT( - output_size, 0, - platform::errors::InvalidArgument( - "the output size must be greater than 0. But received: " - "output_size = %d due to the settings of input_size(%d), " - "padding(%d,%d), " - "k_size(%d) and stride(%d). Please check again!", - output_size, input_size, padding_1, padding_2, filter_size, stride)); - return output_size; -} - -void PoolOp::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of Pool operator is not found.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of Pool operator is not found.")); - - std::string pooling_type = ctx->Attrs().Get("pooling_type"); - std::vector ksize = ctx->Attrs().Get>("ksize"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - bool ceil_mode = ctx->Attrs().Get("ceil_mode"); - bool adaptive = ctx->Attrs().Get("adaptive"); - bool global_pooling = ctx->Attrs().Get("global_pooling"); - std::string data_format = ctx->Attrs().Get("data_format"); - std::string padding_algorithm = - ctx->Attrs().Get("padding_algorithm"); - - auto in_x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - in_x_dims.size() == 4 || in_x_dims.size() == 5, true, - platform::errors::InvalidArgument( - "the input of Op(pool) should be 4-D or 5-D Tensor. But " - "received: %u-D Tensor and it's shape is [%s].", - in_x_dims.size(), in_x_dims)); - - PADDLE_ENFORCE_EQ( - in_x_dims.size() - ksize.size(), 2U, - platform::errors::InvalidArgument( - "the dimension of input minus the size of " - "Attr(ksize) must be euqal to 2 in Op(pool). " - "But received: the dimension of input minus the size " - "of Attr(ksize) is %d, the " - "input's dimension is %d, the shape of input " - "is [%s], the Attr(ksize)'s size is %d, the Attr(ksize) is [%s].", - in_x_dims.size() - ksize.size(), in_x_dims.size(), in_x_dims, - ksize.size(), phi::make_ddim(ksize))); - - PADDLE_ENFORCE_EQ( - ksize.size(), strides.size(), - platform::errors::InvalidArgument( - "the size of Attr(ksize) and Attr(strides) in " - "Op(pool) must be equal. 
" - "But received: Attr(ksize)'s size is %d, Attr(strides)'s " - "size is %d, Attr(ksize) is [%s], Attr(strides)is [%s].", - ksize.size(), strides.size(), phi::make_ddim(ksize), - phi::make_ddim(strides))); - - // MKL-DNN Kernels are using NCHW order of dims description - // so we ignore data_format consideration for MKL-DNN kernel - const bool channel_last = (ctx->IsRunMKLDNNKernel() == false) && - (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings if "SAME" or global_pooling - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - std::vector output_shape; - if (adaptive) { - output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); - } else { - for (int i = 0; i < data_dims.size(); ++i) { - if ((!ctx->IsRuntime()) && (data_dims[i] < 0)) { - output_shape.push_back(data_dims[i]); - } else { - output_shape.push_back( - PoolOutputSize(data_dims[i], ksize[i], paddings[2 * i], - paddings[2 * i + 1], strides[i], ceil_mode)); - } - } - } - - // output_N = input_N - output_shape.insert(output_shape.begin(), in_x_dims[0]); - // output_C = input_C - if (channel_last) { - output_shape.push_back(in_x_dims[in_x_dims.size() - 1]); - } else { - output_shape.insert(output_shape.begin() + 1, in_x_dims[1]); - } - - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); - ctx->ShareLoD("X", "Out"); -} - bool CanMKLDNNSupportPool(const framework::ExecutionContext& ctx) { if (ctx.Attr("adaptive") == false) return true; // (jczaja): oneDNN is supporting only unchangable in size pool window @@ -216,16 +103,6 @@ framework::OpKernelType PoolOp::GetKernelTypeForVar( tensor.place(), tensor.layout()); } -void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::NotFound( - "Input(X) of Pool Gradoperator is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::NotFound( - "Input(X@GRAD) of Pool Gradoperator is not found.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); -} - framework::OpKernelType PoolOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; @@ -471,7 +348,7 @@ class Pool2dOpGradGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("pool2d_grad_grad"); + grad_op->SetType("pool2d_double_grad"); grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); grad_op->SetAttrMap(this->Attrs()); @@ -692,35 +569,34 @@ width, respectively. The input(X) size and output(Out) size may be different. 
 namespace ops = paddle::operators;
 
+DECLARE_INFER_SHAPE_FUNCTOR(pool2d, Pool2dInferShapeFunctor,
+                            PD_INFER_META(phi::PoolInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(pool2d_grad, Pool2dGradInferShapeFunctor,
+                            PD_INFER_META(phi::PoolGradInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(pool2d_double_grad,
+                            Pool2dDoubleGradInferShapeFunctor,
+                            PD_INFER_META(phi::PoolInferMeta));
+
 REGISTER_OPERATOR(
     pool2d, ops::PoolOp, ops::Pool2dOpMaker, ops::PoolOpInferVarType,
     paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>,
-    paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>);
+    paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>,
+    Pool2dInferShapeFunctor);
 REGISTER_OPERATOR(pool2d_grad, ops::PoolOpGrad,
                   ops::Pool2dOpGradGradMaker<paddle::framework::OpDesc>,
-                  ops::Pool2dOpGradGradMaker<paddle::imperative::OpBase>);
-REGISTER_OPERATOR(pool2d_grad_grad, ops::PoolOp);
-
-REGISTER_OP_CPU_KERNEL(
-    pool2d, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PoolKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    pool2d_grad, ops::PoolGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    pool2d_grad_grad,
-    ops::PoolGradGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PoolGradGradKernel<paddle::platform::CPUDeviceContext, double>);
+                  ops::Pool2dOpGradGradMaker<paddle::imperative::OpBase>,
+                  Pool2dGradInferShapeFunctor);
+REGISTER_OPERATOR(pool2d_double_grad, ops::PoolOp,
+                  Pool2dDoubleGradInferShapeFunctor);
+
+DECLARE_INFER_SHAPE_FUNCTOR(pool3d, Pool3dInferShapeFunctor,
+                            PD_INFER_META(phi::PoolInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(pool3d_grad, Pool3dGradInferShapeFunctor,
+                            PD_INFER_META(phi::PoolGradInferMeta));
 
 REGISTER_OPERATOR(
     pool3d, ops::PoolOp, ops::Pool3dOpMaker, ops::PoolOpInferVarType,
     paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>,
-    paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>);
-REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    pool3d, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PoolKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    pool3d_grad, ops::PoolGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>);
+    paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>,
+    Pool3dInferShapeFunctor);
+REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad, Pool3dGradInferShapeFunctor);
diff --git a/paddle/fluid/operators/pool_op.cu b/paddle/fluid/operators/pool_op.cu
deleted file mode 100644
index 069ce0c1fda85..0000000000000
--- a/paddle/fluid/operators/pool_op.cu
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include "paddle/fluid/operators/pool_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - pool2d, ops::PoolKernel, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CUDA_KERNEL( - pool2d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel, - ops::PoolGradKernel); - -REGISTER_OP_CUDA_KERNEL( - pool2d_grad_grad, - ops::PoolGradGradKernel, - ops::PoolGradGradKernel, - ops::PoolGradGradKernel); - -REGISTER_OP_CUDA_KERNEL( - pool3d, ops::PoolKernel, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CUDA_KERNEL( - pool3d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel, - ops::PoolGradKernel); diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index bea6506ee86db..d48ac3bd358ef 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -12,19 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once +// NOTE(Ruibiao): Difficult to remove code from this header file because too +// many files rely on it through "mkldnn_reuse.h" -#include -#include -#include +#pragma once -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__HIPCC__) || defined(__NVCC__) -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#endif namespace paddle { namespace operators { @@ -35,8 +28,6 @@ class PoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override; @@ -50,8 +41,6 @@ class PoolOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override; @@ -71,292 +60,5 @@ class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override; }; -template -inline void UpdatePadding(std::vector* paddings, const bool global_pooling, - const bool adaptive, - const std::string padding_algorithm, - const framework::DDim data_dims, - const std::vector& strides, - const std::vector& ksize) { - // set padding size == data_dims.size() * 2 - auto data_shape = phi::vectorize(data_dims); - if (static_cast(paddings->size()) == data_dims.size()) { - for (int i = 0; i < data_dims.size(); ++i) { - T copy_pad = *(paddings->begin() + 2 * i); - paddings->insert(paddings->begin() + 2 * i + 1, copy_pad); - } - } else { - PADDLE_ENFORCE_EQ(data_dims.size() * 2, paddings->size(), - platform::errors::InvalidArgument( - "Paddings size %d should be the same or twice as the " - "pooling size %d.", - paddings->size(), data_dims.size() * 2)); - } - - // when padding_algorithm is "VALID" or "SAME" - if (padding_algorithm == "SAME") { - for (int i = 0; i < data_dims.size(); ++i) { - T out_size = (data_dims[i] + strides[i] - 1) / strides[i]; - T pad_sum = - std::max((out_size - 1) * strides[i] + ksize[i] - data_shape[i], - static_cast(0)); - T pad_0 = pad_sum / 2; - T pad_1 = pad_sum - pad_0; - *(paddings->begin() + i * 2) = pad_0; - *(paddings->begin() + i * 2 + 1) = pad_1; - } - } else 
if (padding_algorithm == "VALID") { - for (auto it = paddings->begin(); it != paddings->end(); it++) { - *it = 0; - } - } - - // if global_pooling == true or adaptive == true, padding will be ignore - if (global_pooling || adaptive) { - for (auto it = paddings->begin(); it != paddings->end(); it++) { - *it = 0; - } - } -} - -template -inline void UpdateKsize(std::vector* ksize, - const framework::DDim data_dims) { - ksize->resize(static_cast(data_dims.size())); - for (size_t i = 0; i < ksize->size(); ++i) { - *(ksize->begin() + i) = static_cast(data_dims[i]); - } -} - -inline int getReduceNum(const framework::Tensor& input, - const framework::Tensor* output, - const std::string data_format, - std::vector* reduce_dim) { - // data_format only can be NCHW - bool channel_last = (data_format == "NHWC"); - if (channel_last) { - return 0; - } - int reduce_num = 0; - const int output_height = output->dims()[2]; - const int output_width = output->dims()[3]; - if ((output_height == 1) && (output_width == 1)) { - reduce_dim->push_back(2); - reduce_dim->push_back(3); - reduce_num = input.dims()[2] * input.dims()[3]; - } - return reduce_num; -} - -template -class PoolKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - Tensor* out = context.Output("Out"); - - std::string pooling_type = context.Attr("pooling_type"); - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::string data_format = context.Attr("data_format"); - bool exclusive = context.Attr("exclusive"); - bool adaptive = context.Attr("adaptive"); - bool global_pooling = context.Attr("global_pooling"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings - auto in_x_dims = in_x->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - auto& dev_ctx = context.template device_context(); - switch (ksize.size()) { - case 2: { - if (pooling_type == "max") { - paddle::operators::math::Pool2dFunctor< - DeviceContext, paddle::operators::math::MaxPool, T> - pool2d_forward; - paddle::operators::math::MaxPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, - true, false, out, pool_process); - - } else if (pooling_type == "avg") { - std::vector reduce_dim; - int reduce_num = getReduceNum(*in_x, out, data_format, &reduce_dim); - if (reduce_num > 0 && - adaptive) { // for adaptive_avg_pool2d && output_size == 1 -#if defined(__HIPCC__) || defined(__NVCC__) - auto stream = dev_ctx.stream(); - TensorReduceImpl>( - dev_ctx, *in_x, out, kps::DivideFunctor(reduce_num), - reduce_dim, stream); -#else // for cpu - paddle::operators::math::Pool2dFunctor< - DeviceContext, paddle::operators::math::AvgPool, T> - pool2d_forward; - paddle::operators::math::AvgPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, 
paddings, - data_format, exclusive, adaptive, out, pool_process); -#endif - } else { // avgpool_2d or adaptive_avg_pool2d && output_size != 1 - paddle::operators::math::Pool2dFunctor< - DeviceContext, paddle::operators::math::AvgPool, T> - pool2d_forward; - paddle::operators::math::AvgPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, - data_format, exclusive, adaptive, out, pool_process); - } - } - } break; - case 3: { - if (pooling_type == "max") { - paddle::operators::math::Pool3dFunctor< - DeviceContext, paddle::operators::math::MaxPool, T> - pool3d_forward; - paddle::operators::math::MaxPool pool_process; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, - true, false, out, pool_process); - - } else if (pooling_type == "avg") { - paddle::operators::math::Pool3dFunctor< - DeviceContext, paddle::operators::math::AvgPool, T> - pool3d_forward; - paddle::operators::math::AvgPool pool_process; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, - exclusive, adaptive, out, pool_process); - } - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } -}; - -template -class PoolGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - const Tensor* out = context.Input("Out"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - - std::string pooling_type = context.Attr("pooling_type"); - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - bool exclusive = context.Attr("exclusive"); - bool adaptive = context.Attr("adaptive"); - std::string data_format = context.Attr("data_format"); - bool global_pooling = context.Attr("global_pooling"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings - auto in_x_dims = in_x->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - auto& dev_ctx = context.template device_context(); - if (in_x_grad) { - in_x_grad->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_constant; - set_constant(dev_ctx, in_x_grad, static_cast(0.0)); - - switch (ksize.size()) { - case 2: { - if (pooling_type == "max") { - paddle::operators::math::MaxPool2dGradFunctor - pool2d_backward; - pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, in_x_grad); - } else if (pooling_type == "avg") { - paddle::operators::math::Pool2dGradFunctor< - DeviceContext, paddle::operators::math::AvgPoolGrad, T> - pool2d_backward; - paddle::operators::math::AvgPoolGrad pool_process; - pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, exclusive, adaptive, - in_x_grad, pool_process); - } - } 
break; - case 3: { - if (pooling_type == "max") { - paddle::operators::math::MaxPool3dGradFunctor - pool3d_backward; - pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, in_x_grad); - } else if (pooling_type == "avg") { - paddle::operators::math::Pool3dGradFunctor< - DeviceContext, paddle::operators::math::AvgPoolGrad, T> - pool3d_backward; - paddle::operators::math::AvgPoolGrad pool_process; - pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, exclusive, adaptive, - in_x_grad, pool_process); - } - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } - } -}; - -template -class PoolGradGradKernel : public PoolKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::string pooling_type = context.Attr("pooling_type"); - if (pooling_type == "max") { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op grad grad only supports avgpool.")); - } else { - PoolKernel::Compute(context); - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/pool_op_mlu.cc b/paddle/fluid/operators/pool_op_mlu.cc index 08656e64231b6..fa88d128a9a1d 100644 --- a/paddle/fluid/operators/pool_op_mlu.cc +++ b/paddle/fluid/operators/pool_op_mlu.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/pool_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { @@ -80,10 +81,10 @@ class MLUPoolOpKernel : public framework::OpKernel { data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); if (global_pooling) { - UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } MLUCnnlTensorDesc in_x_desc(*in_x, cnnl_layout, ToCnnlDataType()); @@ -191,10 +192,10 @@ class MLUPoolGradOpKernel : public framework::OpKernel { data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); if (global_pooling) { - UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } // inputs need with NHWC layout diff --git a/paddle/fluid/operators/pool_op_npu.cc b/paddle/fluid/operators/pool_op_npu.cc index bd26d6350d9c3..0efcb8b7981c3 100644 --- a/paddle/fluid/operators/pool_op_npu.cc +++ b/paddle/fluid/operators/pool_op_npu.cc @@ -11,8 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
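Review note: the MLU and NPU hunks in this area swap the deleted fluid helpers for phi::funcs::UpdatePadding / UpdateKernelSize (the latter renamed from UpdateKsize). The SAME-padding rule the deleted fluid helper implemented, visible earlier in this diff, is easy to sanity-check in isolation; the numbers below are arbitrary:

    #include <algorithm>
    #include <cstdio>

    int main() {
      const int in = 13, stride = 2, k = 3;
      // SAME: out = ceil(in / stride), then pad just enough to make it fit,
      // split as evenly as possible with the extra unit on the bottom/right.
      const int out = (in + stride - 1) / stride;                    // 7
      const int pad_sum = std::max((out - 1) * stride + k - in, 0);  // 2
      const int pad_0 = pad_sum / 2;                                 // 1
      const int pad_1 = pad_sum - pad_0;                             // 1
      std::printf("out=%d pad=(%d,%d)\n", out, pad_0, pad_1);
      return 0;
    }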
*/ -#include "paddle/fluid/operators/pool_op.h" + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { @@ -68,8 +70,8 @@ class NPUPoolOpKernel : public framework::OpKernel { strides_vec[2] = strides[0]; strides_vec[3] = strides[1]; } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); PADDLE_ENFORCE_LT( std::max(paddings[0], paddings[1]), ksize[0], platform::errors::InvalidArgument( @@ -201,8 +203,8 @@ class NPUPoolGradOpKernel : public framework::OpKernel { strides_vec[2] = strides[0]; strides_vec[3] = strides[1]; } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); PADDLE_ENFORCE_LT( std::max(paddings[0], paddings[1]), ksize[0], diff --git a/paddle/fluid/operators/pool_op_xpu.cc b/paddle/fluid/operators/pool_op_xpu.cc index 402dd6c108039..87c437d8a78e0 100644 --- a/paddle/fluid/operators/pool_op_xpu.cc +++ b/paddle/fluid/operators/pool_op_xpu.cc @@ -8,13 +8,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/pool_op.h" + #include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #ifdef PADDLE_WITH_XPU namespace paddle { namespace operators { +using framework::Tensor; + xpu::Pooling_t XPUPoolingType(const std::string& pooltype, bool exclusive, bool is_test) { if (pooltype == "max") { diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index d061f9ae05613..e0341f4a4b471 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -12,8 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/pool_with_index_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -28,71 +32,6 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of Pooling should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of Pooling should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Mask"), true, - platform::errors::InvalidArgument( - "Output(Mask) of Pooling should not be null.")); - - auto in_x_dims = ctx->GetInputDim("X"); - - std::vector ksize = ctx->Attrs().Get>("ksize"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - bool adaptive = ctx->Attrs().Get("adaptive"); - - PADDLE_ENFORCE( - in_x_dims.size() == 4 || in_x_dims.size() == 5, - platform::errors::InvalidArgument("Pooling intput should be 4-D or 5-D " - "tensor but received %dD-Tensor", - in_x_dims.size())); - - if (ctx->Attrs().Get("global_pooling")) { - ksize.resize(static_cast(in_x_dims.size()) - 2); - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x_dims[i + 2]); - } - } - - PADDLE_ENFORCE_EQ( - in_x_dims.size() - ksize.size(), 2U, - platform::errors::InvalidArgument( - "The input size %d minus the kernel size %d should equal to 2.", - in_x_dims.size(), ksize.size())); - PADDLE_ENFORCE_EQ( - ksize.size(), strides.size(), - platform::errors::InvalidArgument( - "Strides size %d and pooling size %d should be the same.", - strides.size(), ksize.size())); - PADDLE_ENFORCE_EQ( - ksize.size(), paddings.size(), - platform::errors::InvalidArgument( - "Paddings size %d and pooling size %d should be the same.", - paddings.size(), ksize.size())); - - std::vector output_shape({in_x_dims[0], in_x_dims[1]}); - if (adaptive) { - output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); - } else { - for (size_t i = 0; i < ksize.size(); ++i) { - if ((!ctx->IsRuntime()) && (in_x_dims[i + 2] < 0)) { - output_shape.push_back(in_x_dims[i + 2]); - } else { - output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], - paddings[i], strides[i])); - } - } - } - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); - ctx->SetOutputDim("Mask", phi::make_ddim(output_shape)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -106,22 +45,6 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Mask"), true, - platform::errors::InvalidArgument("Input(Mask) must not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::InvalidArgument("Input(X) must not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::InvalidArgument( - "Input(Out@GRAD) should not be null.")); - 
PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::InvalidArgument( - "Output(X@GRAD) should not be null.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -335,40 +258,34 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER( namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(max_pool2d_with_index, + MaxPool2dWithIndexInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(max_pool2d_with_index_grad, + MaxPool2dWithIndexGradInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexGradInferMeta)); + REGISTER_OPERATOR(max_pool2d_with_index, ops::MaxPoolWithIndexOp, ops::MaxPool2dWithIndexOpMaker, ops::MaxPoolWithIndexGradOpMaker, - ops::MaxPoolWithIndexGradOpMaker); + ops::MaxPoolWithIndexGradOpMaker, + MaxPool2dWithIndexInferShapeFunctor); REGISTER_OPERATOR(max_pool2d_with_index_grad, ops::MaxPoolWithIndexOpGrad, - ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer); + ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer, + MaxPool2dWithIndexGradInferShapeFunctor); -REGISTER_OP_CPU_KERNEL( - max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CPU_KERNEL( - max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); +DECLARE_INFER_SHAPE_FUNCTOR(max_pool3d_with_index, + MaxPool3dWithIndexInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(max_pool3d_with_index_grad, + MaxPool3dWithIndexGradInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexGradInferMeta)); REGISTER_OPERATOR(max_pool3d_with_index, ops::MaxPoolWithIndexOp, ops::MaxPool3dWithIndexOpMaker, ops::MaxPoolWithIndexGradOpMaker, - ops::MaxPoolWithIndexGradOpMaker); + ops::MaxPoolWithIndexGradOpMaker, + MaxPool3dWithIndexInferShapeFunctor); REGISTER_OPERATOR(max_pool3d_with_index_grad, ops::MaxPoolWithIndexOpGrad, - ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer); - -REGISTER_OP_CPU_KERNEL( - max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CPU_KERNEL( - max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); + ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer, + MaxPool3dWithIndexGradInferShapeFunctor); diff --git a/paddle/fluid/operators/pool_with_index_op.cu.cc b/paddle/fluid/operators/pool_with_index_op.cu.cc deleted file mode 100644 index 5497dcbd9ce25..0000000000000 --- a/paddle/fluid/operators/pool_with_index_op.cu.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/pool_with_index_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CUDA_KERNEL( - max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); - -REGISTER_OP_CUDA_KERNEL( - max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CUDA_KERNEL( - max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h deleted file mode 100644 index 6e51a833f5c89..0000000000000 --- a/paddle/fluid/operators/pool_with_index_op.h +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MaxPoolWithIndexKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - Tensor* out = context.Output("Out"); - Tensor* mask = context.Output("Mask"); - - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - bool adaptive = context.Attr("adaptive"); - - auto& dev_ctx = context.template device_context(); - if (context.Attr("global_pooling")) { - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x->dims()[i + 2]); - } - } - - switch (ksize.size()) { - case 2: { - paddle::operators::math::MaxPool2dWithIndexFunctor - pool2d_forward; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, - mask); - } break; - case 3: { - paddle::operators::math::MaxPool3dWithIndexFunctor - pool3d_forward; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, - mask); - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } -}; - -template -class MaxPoolWithIndexGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* mask = context.Input("Mask"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - bool adaptive = context.Attr("adaptive"); - if (context.Attr("global_pooling")) { - for (size_t i = 0; i < 
ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x_grad->dims()[i + 2]); - } - } - - if (in_x_grad) { - in_x_grad->mutable_data(context.GetPlace()); - auto& device_ctx = context.template device_context(); - phi::funcs::set_constant(device_ctx, in_x_grad, 0); - - switch (ksize.size()) { - case 2: { - paddle::operators::math::MaxPool2dWithIndexGradFunctor - pool2d_backward; - pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides, - paddings, adaptive, in_x_grad); - } break; - case 3: { - paddle::operators::math::MaxPool3dWithIndexGradFunctor - pool3d_backward; - pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides, - paddings, adaptive, in_x_grad); - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 9bd6ae8bab829..de35f67405810 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -9,14 +9,39 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/prelu_op.h" - #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + +framework::OpKernelType innerGetKernelTypeForVar( + const Tensor &tensor, const framework::OpKernelType &expected_kernel_type) { +#ifdef PADDLE_WITH_MKLDNN + auto isOneDNNKernelChosen = + (expected_kernel_type.data_layout_ == framework::DataLayout::kMKLDNN); + auto isNotOneDNNTensor = (tensor.layout() != framework::DataLayout::kMKLDNN); + auto isModelNHWC = + (paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout() == framework::DataLayout::kNHWC); + // All inputs (including alpha) need shape rotating + if (isOneDNNKernelChosen && isNotOneDNNTensor && isModelNHWC) { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), + framework::DataLayout::kNHWC); + } +#endif + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); +} + class PReluOp : public framework::OperatorWithKernel { public: PReluOp(const std::string &type, const framework::VariableNameMap &inputs, @@ -24,95 +49,6 @@ class PReluOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "prelu"); - OP_INOUT_CHECK(ctx->HasInput("Alpha"), "Input", "Alpha", "prelu"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "prelu"); - - auto x_dim = ctx->GetInputDim("X"); - std::string mode = ctx->Attrs().Get("mode"); - if (mode == "all") { - PADDLE_ENFORCE_EQ(phi::product(ctx->GetInputDim("Alpha")), 1, - platform::errors::InvalidArgument( - "For mode 'all', size of weight Alpha must be one. 
" - "But recevied alpha's size: %d.", - product(ctx->GetInputDim("Alpha")))); - } else if (mode == "channel") { - auto x_rank = x_dim.size(); - PADDLE_ENFORCE_GE(x_rank, 2, - platform::errors::InvalidArgument( - "For mode 'channel', rank of input X must be " - "equal or larger than 2. But recevied X's " - "rank: %d", - x_rank)); - const std::string data_format_str = - ctx->Attrs().Get("data_format"); - PADDLE_ENFORCE_EQ(data_format_str == "NCHW" || data_format_str == "NHWC", - true, - platform::errors::InvalidArgument( - "For mode 'channel', data_format must be one of " - "NCHW and NHWC. But recevied data_format: %s", - data_format_str)); - if (data_format_str == "NCHW") { - PADDLE_ENFORCE_EQ( - product(ctx->GetInputDim("Alpha")) == x_dim[1], true, - platform::errors::InvalidArgument( - "For mode 'channel', size of weight Alpha must be " - "equal to the number of channels of input(x). But " - "recevied alpha's size: %d, x_dim[1]: %d", - product(ctx->GetInputDim("Alpha")), x_dim[1])); - } else { - PADDLE_ENFORCE_EQ( - product(ctx->GetInputDim("Alpha")) == x_dim[x_rank - 1], true, - platform::errors::InvalidArgument( - "For mode 'channel', size of weight Alpha must be " - "equal to the number of channels of input(x). But " - "recevied alpha's size: %d, x_dim[%d]: %d", - product(ctx->GetInputDim("Alpha")), x_rank - 1, - x_dim[x_rank - 1])); - } - - } else if (mode == "element") { - auto alpha_dim = ctx->GetInputDim("Alpha"); - auto alpha_rank = alpha_dim.size(); - auto x_rank = x_dim.size(); - PADDLE_ENFORCE_GE(x_rank, 1, - platform::errors::InvalidArgument( - "For mode 'element', rank of input X must be " - "equal or larger than 2. But recevied X's " - "rank: %d", - x_rank)); - PADDLE_ENFORCE_EQ( - alpha_rank, x_rank, - platform::errors::InvalidArgument( - "For mode 'element', rank of weight Alpha must be ", - "equal to the rank of input(x). But recevied alpha's rank: %d, " - "x's rank: %d.", - alpha_rank, x_rank)); - size_t x_product = 1; - size_t alpha_product = 1; - for (int64_t i = x_rank - 1; i > 0; i--) { - x_product *= x_dim[i]; - alpha_product *= alpha_dim[i]; - } - PADDLE_ENFORCE_EQ( - alpha_product, x_product, - platform::errors::InvalidArgument( - "For mode 'element', the size of weight Alpha must be " - "equal to the size of input(x). But recevied alpha's size: %d, " - "x's size: %d.", - alpha_product, x_product)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Attr(mode) of prelu must be one of 'all', 'channel', or 'element'. 
" - "But recevied " - "mode: '%s'.", - mode)); - } - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -128,6 +64,12 @@ class PReluOp : public framework::OperatorWithKernel { #endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + return innerGetKernelTypeForVar(tensor, expected_kernel_type); + } }; class PReluOpMaker : public framework::OpProtoAndCheckerMaker { @@ -212,6 +154,12 @@ class PReluGradOp : public framework::OperatorWithKernel { #endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + return innerGetKernelTypeForVar(tensor, expected_kernel_type); + } }; template @@ -236,13 +184,10 @@ class PReluGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(prelu, PReluInferShapeFunctor, + PD_INFER_META(phi::PReluInferMeta)); REGISTER_OPERATOR(prelu, ops::PReluOp, ops::PReluOpMaker, ops::PReluGradOpMaker, - ops::PReluGradOpMaker); + ops::PReluGradOpMaker, + PReluInferShapeFunctor); REGISTER_OPERATOR(prelu_grad, ops::PReluGradOp); -REGISTER_OP_CPU_KERNEL( - prelu, ops::PReluKernel, - ops::PReluKernel); -REGISTER_OP_CPU_KERNEL( - prelu_grad, ops::PReluGradKernel, - ops::PReluGradKernel); diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu deleted file mode 100644 index 12e55d042d703..0000000000000 --- a/paddle/fluid/operators/prelu_op.cu +++ /dev/null @@ -1,208 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/prelu.h" -#include "paddle/fluid/operators/prelu_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -#define CUDA_NUM_THREADS 1024 - -inline static int PADDLE_GET_BLOCKS(const int N) { - return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; -} - -template -class CUDAPReluKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* alpha = context.Input("Alpha"); - auto* out = context.Output("Out"); - - const T* x_ptr = x->data(); - T* o_ptr = out->mutable_data(context.GetPlace()); - - const T* alpha_ptr = alpha->data(); - auto& mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - - int numel = x->numel(); - auto dim = x->dims(); - auto x_rank = dim.size(); - - VLOG(4) << "dim[0]:" << dim[0] << ", dim[1]:" << dim[1] << ", dim[" - << x_rank - 1 << "]:" << dim[x_rank - 1] << ", numel:" << numel; - - if (mode == "channel") { - bool channel_last = data_format == "NHWC"; - size_t channel = channel_last ? dim[x_rank - 1] : dim[1]; - math::PreluChannelWiseDirectCUDAFunctor prelu_channel_wise; - prelu_channel_wise(context.cuda_device_context().stream(), x_ptr, - alpha_ptr, o_ptr, dim[0], channel, channel_last, - numel); - } else if (mode == "element") { - math::PreluElementWiseDirectCUDAFunctor prelu_element_wise; - prelu_element_wise(context.cuda_device_context().stream(), x_ptr, - alpha_ptr, o_ptr, dim[0], numel); - } else { - math::PreluScalarDirectCUDAFunctor prelu_scalar; - prelu_scalar(context.cuda_device_context().stream(), x_ptr, alpha_ptr, - o_ptr, numel); - } - } -}; - -enum PRELU_MODE { Element, ChannelFirst, ChannelLast, Scalar }; - -template -__global__ void PReluOpGradKernel(const T* x_ptr, const T* alpha_ptr, - const T* dy_ptr, T* dx_ptr, T* dalpha_ptr, - size_t channel_num, size_t plane_size, - size_t spatial_size, size_t numel, - PRELU_MODE mode) { - CUDA_KERNEL_LOOP(index, numel) { - T scale; - if (mode == Element) { - size_t element_index = index % spatial_size; - scale = alpha_ptr[element_index]; - } else if (mode == ChannelFirst) { - size_t temp = index / plane_size; - size_t channel_index = temp % channel_num; - scale = alpha_ptr[channel_index]; - } else if (mode == ChannelLast) { - size_t channel_index = index % channel_num; - scale = alpha_ptr[channel_index]; - } else { - scale = alpha_ptr[0]; - } - T x = x_ptr[index]; - T dy = dy_ptr[index]; - T zero = static_cast(0); - if (dx_ptr != nullptr) dx_ptr[index] = (x > zero) ? dy : scale * dy; - if (dalpha_ptr != nullptr) dalpha_ptr[index] = (x > zero) ? zero : x * dy; - } -} - -template -class PreluOpGradFunctor { - public: - void operator()(gpuStream_t stream, const T* x, const T* alpha, const T* dy, - T* dx, T* dalpha, const framework::DDim& input_dims, - PRELU_MODE mode) { - size_t numel = 1; - for (size_t i = 0; i < input_dims.size(); ++i) { - numel *= input_dims[i]; - } - size_t plane_size = numel / input_dims[0] / input_dims[1]; - size_t spatial_size = numel / input_dims[0]; - size_t channel = - mode == ChannelLast ? 
input_dims[input_dims.size() - 1] : input_dims[1]; - - PReluOpGradKernel< - T><<>>( - x, alpha, dy, dx, dalpha, channel, plane_size, spatial_size, numel, - mode); - } -}; - -template -class CUDAPReluGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* alpha = context.Input("Alpha"); - auto* dx = context.Output(framework::GradVarName("X")); - auto* dy = context.Input(framework::GradVarName("Out")); - auto* dalpha = context.Output(framework::GradVarName("Alpha")); - - const T* x_ptr = x->data(); - const T* alpha_ptr = alpha->data(); - const T* dy_ptr = dy->data(); - T* dx_ptr = dx ? dx->mutable_data(context.GetPlace()) : nullptr; - T* dalpha_ptr = - dalpha ? dalpha->mutable_data(context.GetPlace()) : nullptr; - - if (!dx && !dalpha) return; - - auto& mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - - int numel = x->numel(); - auto dim = x->dims(); - auto x_rank = dim.size(); - std::vector input_shape = phi::vectorize(dim); - auto stream = context.cuda_device_context().stream(); - - T* dalpha_tmp_ptr; - Tensor dalpha_tmp; - if (dalpha_ptr == nullptr) { - dalpha_tmp_ptr = dalpha_ptr; - } else { - auto& dev_ctx = context.template device_context(); - dalpha_tmp = context.AllocateTmpTensor(dim, dev_ctx); - dalpha_tmp_ptr = dalpha_tmp.mutable_data(context.GetPlace()); - } - - PRELU_MODE m; - bool channel_last = false; - if (mode == "element") { - m = Element; - } else if (mode == "channel") { - channel_last = data_format == "NHWC"; - m = channel_last ? ChannelLast : ChannelFirst; - } else { - m = Scalar; - } - PreluOpGradFunctor prelu_grad; - prelu_grad(stream, x_ptr, alpha_ptr, dy_ptr, dx_ptr, dalpha_tmp_ptr, dim, - m); - - if (dalpha_tmp_ptr == nullptr) return; - - std::vector reduce_dims; - for (size_t i = 0; i < dim.size(); i++) { - if (mode == "channel" && !channel_last && i == 1) continue; - if (mode == "channel" && channel_last && i == dim.size() - 1) continue; - if (mode == "element" && i != 0) continue; - reduce_dims.push_back(i); - } - - TensorReduceImpl>( - context.cuda_device_context(), dalpha_tmp, dalpha, - kps::IdentityFunctor(), reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - prelu, ops::CUDAPReluKernel, - ops::CUDAPReluKernel, - ops::CUDAPReluKernel); -REGISTER_OP_CUDA_KERNEL( - prelu_grad, - ops::CUDAPReluGradKernel, - ops::CUDAPReluGradKernel, - ops::CUDAPReluGradKernel); diff --git a/paddle/fluid/operators/prelu_op.h b/paddle/fluid/operators/prelu_op.h deleted file mode 100644 index 384994eb37c2a..0000000000000 --- a/paddle/fluid/operators/prelu_op.h +++ /dev/null @@ -1,172 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/transform.h" -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using platform::Transform; - -template -class PReluKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* alpha = context.Input("Alpha"); - auto* out = context.Output("Out"); - - const T* x_ptr = x->data(); - T* o_ptr = out->mutable_data(context.GetPlace()); - - const T* alpha_ptr = alpha->data(); - auto& mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - - int numel = x->numel(); - auto dim = x->dims(); - int index = 0; - int i = 0; - if (mode == "channel") { - if (data_format == "NCHW") { - int temp = 1; - for (int j = 2; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = (i / temp) % dim[1]; - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - index = i % dim[dim.size() - 1]; - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; - } - } - } else if (mode == "element") { - int temp = 1; - for (int j = 1; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = i % temp; - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[0] * x_ptr[i]; - } - } - } -}; - -template -class PReluGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* dx = context.Output(framework::GradVarName("X")); - auto* dout = context.Input(framework::GradVarName("Out")); - auto* dalpha = context.Output(framework::GradVarName("Alpha")); - auto* alpha = context.Input("Alpha"); - const T* alpha_ptr = alpha->data(); - const T* x_ptr = x->data(); - const T* dout_ptr = dout->data(); - std::string mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - int numel = x->numel(); - auto dim = x->dims(); - int index = 0; - int i = 0; - if (dx) { - T* dx_ptr = dx->mutable_data(context.GetPlace()); - if (mode == "channel") { - if (data_format == "NCHW") { - int temp = 1; - for (int j = 2; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = (i / temp) % dim[1]; - dx_ptr[i] = - x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - index = i % dim[dim.size() - 1]; - dx_ptr[i] = - x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; - } - } - } else if (mode == "element") { - int temp = 1; - for (int j = 1; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = i % temp; - dx_ptr[i] = - x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - dx_ptr[i] = x_ptr[i] > 0 ? 
dout_ptr[i] : alpha_ptr[0] * dout_ptr[i]; - } - } - } - - index = 0; - if (dalpha) { - T* dalpha_ptr = dalpha->mutable_data(context.GetPlace()); - memset(dalpha_ptr, 0, sizeof(T) * dalpha->numel()); - - if (mode == "channel") { - if (data_format == "NCHW") { - int temp = 1; - for (int j = 2; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = (i / temp) % dim[1]; - dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - index = i % dim[dim.size() - 1]; - dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } - } else if (mode == "element") { - int temp = 1; - for (int j = 1; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = i % temp; - dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - dalpha_ptr[0] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } - } - - // TODO(Guanzhong): add GPU kernels - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc index da637dfeb237d..cfacffff23410 100644 --- a/paddle/fluid/operators/psroi_pool_op.cc +++ b/paddle/fluid/operators/psroi_pool_op.cc @@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/psroi_pool_op.h" -#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -82,75 +82,6 @@ class PSROIPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of PSROIPoolOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true, - platform::errors::InvalidArgument( - "Input(ROIs) of PSROIPoolOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of PSROIPoolOp should not be null.")); - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - PADDLE_ENFORCE_EQ(input_dims.size(), 4, - platform::errors::InvalidArgument( - "The format of input tensor is NCHW")); - PADDLE_ENFORCE_EQ( - rois_dims.size(), 2, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]")); - PADDLE_ENFORCE_EQ( - rois_dims[1], 4, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]")); - if (ctx->HasInput("RoisNum")) { - auto rois_num_dims = ctx->GetInputDim("RoisNum"); - PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1, - platform::errors::InvalidArgument( - "The second dimension of RoisNum should " - "be 1, but received dimension is %d", - rois_num_dims.size())); - } - int pooled_height = ctx->Attrs().Get("pooled_height"); - int 
pooled_width = ctx->Attrs().Get<int>("pooled_width"); - int output_channels = ctx->Attrs().Get<int>("output_channels"); - float spatial_scale = ctx->Attrs().Get<float>("spatial_scale"); - - PADDLE_ENFORCE_EQ( - input_dims[1], output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "the channel of X(%d) " - "should be equal to the product of " - "output_channels(%d), pooled_height(%d) and pooled_width(%d)", - input_dims[1], output_channels, pooled_height, pooled_width)); - - PADDLE_ENFORCE_GT(pooled_height, 0, - platform::errors::InvalidArgument( - "The pooled output height must be greater than 0")); - PADDLE_ENFORCE_GT(pooled_width, 0, - platform::errors::InvalidArgument( - "The pooled output width must be greater than 0")); - PADDLE_ENFORCE_GT(output_channels, 1, - platform::errors::InvalidArgument( - "The pooled output channels must be greater than 1")); - PADDLE_ENFORCE_GT(spatial_scale, 0.0f, - platform::errors::InvalidArgument( - "The spatial scale must be greater than 0.")); - - auto out_dims = input_dims; - out_dims[0] = rois_dims[0]; - out_dims[1] = - output_channels; // input_dims[1] / (pooled_height * pooled_width); - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - ctx->SetOutputDim("Out", out_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -164,16 +95,6 @@ class PSROIPoolGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::InvalidArgument( - "The gradient of Out should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::InvalidArgument( - "The gradient of X should not be null.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -204,15 +125,13 @@ class PSROIPoolGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(psroi_pool, PsroiPoolInferShapeFunctor, + PD_INFER_META(phi::PsroiPoolInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(psroi_pool_grad, PsroiPoolGradInferShapeFunctor, + PD_INFER_META(phi::PsroiPoolGradInferMeta)); REGISTER_OPERATOR(psroi_pool, ops::PSROIPoolOp, ops::PSROIPoolOpMaker, ops::PSROIPoolGradMaker<paddle::framework::OpDesc>, - ops::PSROIPoolGradMaker<paddle::imperative::OpBase>); -REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp); -REGISTER_OP_CPU_KERNEL( - psroi_pool, - ops::CPUPSROIPoolOpKernel<paddle::platform::CPUDeviceContext, float>, - ops::CPUPSROIPoolOpKernel<paddle::platform::CPUDeviceContext, double>); -REGISTER_OP_CPU_KERNEL( - psroi_pool_grad, - ops::CPUPSROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, float>, - ops::CPUPSROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, double>); + ops::PSROIPoolGradMaker<paddle::imperative::OpBase>, + PsroiPoolInferShapeFunctor); +REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp, + PsroiPoolGradInferShapeFunctor); diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu deleted file mode 100644 index c1917501db8b5..0000000000000 --- a/paddle/fluid/operators/psroi_pool_op.cu +++ /dev/null @@ -1,350 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License.
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/psroi_pool_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaximumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaximumNumBlocks); -} - -template -__global__ void GPUPSROIPoolForward( - const int nthreads, const T* input_data, const T* input_rois, - const float spatial_scale, const int input_channels, const int height, - const int width, const int output_channels, const int pooled_height, - const int pooled_width, const int* rois_batch_id_data, T* output_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (size_t i = index; i < nthreads; i += offset) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = max(roi_end_w - roi_start_w, (T)0.1); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); - int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); - int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); - int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = min(max(hstart, 0), height); - hend = min(max(hend, 0), height); - wstart = min(max(wstart, 0), width); - wend = min(max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - const T* offset_input_data = - input_data + - (roi_batch_id * input_channels + input_channel) * height * width; - T outsum = 0; - - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; - outsum += offset_input_data[input_index]; - } - } - - T bin_area = static_cast((hend - hstart) * (wend - wstart)); - output_data[i] = is_empty ? 0. 
: outsum / bin_area; - } -} - -template -__global__ void GPUPSROIPoolBackward( - const int nthreads, const T* input_rois, const T* output_grad_data, - const float spatial_scale, const int input_channels, const int height, - const int width, const int output_channels, const int pooled_height, - const int pooled_width, const int* rois_batch_id_data, T* input_grad_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_offset = - (roi_batch_id * input_channels + input_channel) * height * width; - T* offset_input_grad_data = input_grad_data + input_offset; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = max(roi_end_w - roi_start_w, (T)0.1); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); - int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); - int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); - int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = min(max(hstart, 0), height); - hend = min(max(hend, 0), height); - wstart = min(max(wstart, 0), width); - wend = min(max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - // Accumulate diff_val into input data - T bin_area = static_cast((hend - hstart) * (wend - wstart)); - T diff_val = is_empty ? 0. 
: output_grad_data[i] / bin_area; - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; - platform::CudaAtomicAdd(offset_input_grad_data + input_index, diff_val); - } - } - } -} - -template -class GPUPSROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto output_channels = ctx.Attr("output_channels"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - - PADDLE_ENFORCE_EQ( - input_channels, output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "The channels %d of input X should equal the product of " - "output_channels %d x pooled_height %d x pooled_width %d.", - input_channels, output_channels, pooled_height, pooled_width)); - - int rois_num = rois->dims()[0]; - if (rois_num == 0) return; - int rois_batch_size; - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(platform::CPUPlace()); - - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be " - "the same but received batch size of input(ROIs) and " - "input(X) is %d and %d respectively.", - rois_batch_size, batch_size)); - std::vector rois_num_list(rois_batch_size); - memory::Copy(platform::CPUPlace(), rois_num_list.data(), ctx.GetPlace(), - rois_num_data, sizeof(int) * rois_batch_size, 0); - int rois_num_count = 0; - for (int i = 0; i < rois_batch_size; ++i) { - rois_num_count += rois_num_list[i]; - } - PADDLE_ENFORCE_EQ( - rois_num_count, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and RoisNum must be the same")); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be " - "the same but received batch size of input(ROIs) and " - "input(X) is %d and %d respectively.", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The number of rois from input(ROIs) and its LOD " - "must be the same. 
Received rois %d of input(ROIs) " - "but the number of rois %d from its LOD is %d", - rois_num, rois_num_with_lod)); - - // set rois batch id - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - framework::Tensor rois_batch_id_list_gpu; - framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), - ctx.device_context(), &rois_batch_id_list_gpu); - - int output_size = out->numel(); - int blocks = NumBlocks(output_size); - int threads = kNumCUDAThreads; - - // call cuda kernel function - GPUPSROIPoolForward< - T><<>>( - output_size, in->data(), rois->data(), spatial_scale, - input_channels, height, width, output_channels, pooled_height, - pooled_width, rois_batch_id_list_gpu.data(), - out->mutable_data(ctx.GetPlace())); - } -}; - -template -class GPUPSROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - - auto* output_grad = ctx.Input(framework::GradVarName("Out")); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto output_channels = ctx.Attr("output_channels"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - int rois_num = rois->dims()[0]; - int input_channels = in->dims()[1]; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (input_grad) { - // set roi batch id - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(platform::CPUPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - std::vector rois_num_list(rois_batch_size); - memory::Copy(platform::CPUPlace(), rois_num_list.data(), ctx.GetPlace(), - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - framework::Tensor rois_batch_id_list_gpu; - framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), - ctx.device_context(), &rois_batch_id_list_gpu); - - input_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(ctx.cuda_device_context(), input_grad, static_cast(0)); - - int output_grad_size = output_grad->numel(); - int blocks = NumBlocks(output_grad_size); - int threads = kNumCUDAThreads; - - if (output_grad_size > 0) { - GPUPSROIPoolBackward< - T><<>>( - output_grad_size, rois->data(), output_grad->data(), - spatial_scale, input_channels, height, width, output_channels, - pooled_height, pooled_width, rois_batch_id_list_gpu.data(), - input_grad->mutable_data(ctx.GetPlace())); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - psroi_pool, - ops::GPUPSROIPoolOpKernel, - ops::GPUPSROIPoolOpKernel); -REGISTER_OP_CUDA_KERNEL( - psroi_pool_grad, - ops::GPUPSROIPoolGradOpKernel, - ops::GPUPSROIPoolGradOpKernel); diff --git 
a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h deleted file mode 100644 index 3f020d93391b0..0000000000000 --- a/paddle/fluid/operators/psroi_pool_op.h +++ /dev/null @@ -1,295 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class CPUPSROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto output_channels = ctx.Attr("output_channels"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - PADDLE_ENFORCE_EQ(input_channels, - output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "the channels of input " - "X should equal the product of " - "output_channels x pooled_height x pooled_width")); - - auto in_stride = phi::stride(in_dims); - auto out_stride = phi::stride(out->dims()); - - const T* input_data = in->data(); - - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(ctx.GetPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of rois and the batch size of images " - " must be the same. 
But received the batch size of rois is %d, " - "and the batch size of images is %d", - rois_batch_size, batch_size)); - int rois_num_count = 0; - for (int i = 0; i < rois_batch_size; ++i) { - rois_num_count += rois_num_data[i]; - } - PADDLE_ENFORCE_EQ( - rois_num_count, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and RoisNum must be the same")); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("the rois_batch_size and input(X) " - "batch_size should be the same.")); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num_with_lod, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and lod must be the same")); - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - T* output_data = out->mutable_data(ctx.GetPlace()); - const T* input_rois = rois->data(); - - // calculate psroipooling, parallel processing can be implemented per ROI - for (int n = 0; n < rois_num; ++n) { - // set roi batch id - int roi_batch_id = rois_batch_id_data[n]; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = - static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = - static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - // Force too small rois to be 1 x 1 - T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); - - // Compute bin size w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - // calculate each pixel of the output feature map. 
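// For one output cell (ph, pw) of a given ROI, the pooled window on the
// input map is the half-open box
//   rows [floor(ph * bin_size_h + roi_start_h), ceil((ph + 1) * bin_size_h + roi_start_h))
//   cols [floor(pw * bin_size_w + roi_start_w), ceil((pw + 1) * bin_size_w + roi_start_w))
// clipped to the feature map; an empty window yields 0, otherwise the
// window sum divided by the window area, as the loops below compute.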
- int out_roi_offset = n * out_stride[0]; - for (int c = 0; c < output_channels; ++c) { - // per category - int out_plane_offset = out_roi_offset + c * out_stride[1]; - for (int ph = 0; ph < pooled_height; ++ph) { - int out_row_offset = out_plane_offset + ph * out_stride[2]; - for (int pw = 0; pw < pooled_width; ++pw) { - // calculate w and h at input feature map - int hstart = floor(static_cast(ph) * bin_size_h + roi_start_h); - int wstart = floor(static_cast(pw) * bin_size_w + roi_start_w); - int hend = ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); - int wend = ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); - // Add roi offsets and clip to input boundaries - hstart = std::min(std::max(hstart, 0), height); - wstart = std::min(std::max(wstart, 0), width); - hend = std::min(std::max(hend, 0), height); - wend = std::min(std::max(wend, 0), width); - - int output_index = out_row_offset + pw; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_plane_offset = - roi_batch_id * in_stride[0] + input_channel * in_stride[1]; - const T* offset_input_data = input_data + input_plane_offset; - T out_sum = 0.; - bool is_empty = (hend <= hstart) || (wend <= wstart); - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * in_stride[2] + iw; - out_sum += offset_input_data[input_index]; - } - } - T bin_area = (hend - hstart) * (wend - wstart); - output_data[output_index] = is_empty ? 0. : out_sum / bin_area; - } - } - } - } - return; - } -}; - -template -class CPUPSROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto output_channels = ctx.Attr("output_channels"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - if (input_grad) { - auto in_dims = in->dims(); - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - // set roi batch id - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(ctx.GetPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - const T* input_rois = rois->data(); - const T* output_grad_data = output_grad->data(); - T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - - // set gradient of X to be 0. before backpropagate. 
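// ROIs can overlap on the input map, so the loop below scatters each
// output-pixel gradient back with "+=" into X@GRAD; the buffer must be
// zero-filled first or the accumulation would start from stale values.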
- phi::funcs::SetConstant set_zero; - set_zero(ctx.template device_context(), input_grad, - static_cast(0)); - - // backpropagate gradient per output pixel - int output_grad_size = output_grad->numel(); - for (int i = 0; i < output_grad_size; ++i) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_offset = - (roi_batch_id * input_channels + input_channel) * height * width; - T* offset_input_grad_data = input_grad_data + input_offset; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = - static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = - static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); - int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); - int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); - int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = std::min(std::max(hstart, 0), height); - hend = std::min(std::max(hend, 0), height); - wstart = std::min(std::max(wstart, 0), width); - wend = std::min(std::max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - // Accumulate diff_val into input data - T bin_area = static_cast((hend - hstart) * (wend - wstart)); - T diff_val = is_empty ? 0. 
: output_grad_data[i] / bin_area; - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; - offset_input_grad_data[input_index] += diff_val; - } - } - } - } - return; - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/qr_op.cc b/paddle/fluid/operators/qr_op.cc index 40e3cbde3b009..82fc9ef1b7858 100644 --- a/paddle/fluid/operators/qr_op.cc +++ b/paddle/fluid/operators/qr_op.cc @@ -145,8 +145,6 @@ REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker, REGISTER_OPERATOR(qr_grad, ops::QrGradOp); -REGISTER_OP_CPU_KERNEL(qr, ops::QrCPUKernel, ops::QrCPUKernel); - REGISTER_OP_CPU_KERNEL( qr_grad, ops::QrGradKernel, ops::QrGradKernel); diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h index f09a07e96cd34..5ef02d8942797 100644 --- a/paddle/fluid/operators/qr_op.h +++ b/paddle/fluid/operators/qr_op.h @@ -48,85 +48,6 @@ static inline std::tuple _parse_qr_mode(std::string mode) { return std::make_tuple(compute_q, reduced); } -template -class QrCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool compute_q; - bool reduced_mode; - const Tensor& x = *context.Input("X"); - Tensor& q = *context.Output("Q"); - Tensor& r = *context.Output("R"); - std::string mode = context.Attr("mode"); - std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode); - - auto numel = x.numel(); - PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet( - "The input of QR is empty.")); - auto x_dims = x.dims(); - int x_rank = x_dims.size(); - int m = x_dims[x_rank - 2]; - int n = x_dims[x_rank - 1]; - int min_mn = std::min(m, n); - int k = reduced_mode ? min_mn : m; - int batch_size = numel / (m * n); - int x_stride = m * n; - int q_stride = m * k; - int r_stride = k * n; - - auto* x_data = x.data>(); - T* q_data = nullptr; - if (compute_q) { - q_data = q.mutable_data>( - context.GetPlace(), - size_t(batch_size * m * k * sizeof(phi::dtype::Real))); - memset(q_data, 0, - size_t(batch_size * m * k * sizeof(phi::dtype::Real))); - } - auto* r_data = r.mutable_data>( - context.GetPlace(), - size_t(batch_size * k * n * sizeof(phi::dtype::Real))); - memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::dtype::Real))); - - // Implement QR by calling Eigen - for (int i = 0; i < batch_size; ++i) { - const T* x_matrix_ptr = x_data + i * x_stride; - T* r_matrix_ptr = r_data + i * r_stride; - using EigenDynamicMatrix = - Eigen::Matrix; - auto x_matrix = Eigen::Map(x_matrix_ptr, m, n); - Eigen::HouseholderQR qr(x_matrix); - if (reduced_mode) { - auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n); - auto r_matrix_view = - qr_top_matrix.template triangularView(); - auto r_matrix = EigenDynamicMatrix(r_matrix_view); - memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); - } else { - auto r_matrix_view = - qr.matrixQR().template triangularView(); - auto r_matrix = EigenDynamicMatrix(r_matrix_view); - memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); - } - - if (compute_q) { - T* q_matrix_ptr = q_data + i * q_stride; - if (reduced_mode) { - auto q_matrix = - qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn); - q_matrix.transposeInPlace(); - memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); - } else { - auto q_matrix = - qr.householderQ() * EigenDynamicMatrix::Identity(m, m); - q_matrix.transposeInPlace(); - memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() 
* sizeof(T)); - } - } - } - } -}; - template class QrGradKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc index 24741efe426b1..c7e91ba35dee1 100644 --- a/paddle/fluid/operators/range_op_npu_test.cc +++ b/paddle/fluid/operators/range_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc index 0a5d54e72c845..83a21a919dcaa 100644 --- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc +++ b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc @@ -12,9 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/frobenius_norm_op.h" - #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { @@ -56,22 +59,12 @@ class FrobeniusNormOpMaker : public ops::ReduceOpMaker { virtual std::string GetOpType() const { return "Reduce frobenius_norm"; } }; +DECLARE_INFER_SHAPE_FUNCTOR(frobenius_norm, FrobeniusNormInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + REGISTER_OPERATOR(frobenius_norm, ops::ReduceOp, FrobeniusNormOpMaker, ops::FrobeniusNormOpGradMaker, - ops::FrobeniusNormOpGradMaker); + ops::FrobeniusNormOpGradMaker, + FrobeniusNormInferShapeFunctor); REGISTER_OPERATOR(frobenius_norm_grad, ops::ReduceGradOp); - -REGISTER_OP_CPU_KERNEL(frobenius_norm, - ops::ReduceKernel, - ops::ReduceKernel); - -template -using CPUFrobeniusNormGradKernel = - ops::FrobeniusNormGradKernel; - -REGISTER_OP_CPU_KERNEL(frobenius_norm_grad, CPUFrobeniusNormGradKernel, - CPUFrobeniusNormGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cu b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cu deleted file mode 100644 index b2cef09df9436..0000000000000 --- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cu +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
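The two Eigen functors deleted just below reduce to simple math: the forward pass is y = sqrt(sum(x^2)) over the reduced dimensions, and the backward pass broadcasts y and dy back to x's shape and computes dx = x / (y + 1e-12) * dy. A minimal scalar sketch of the same computation over a flat array (a free-standing illustration, not Paddle API):

#include <cmath>
#include <cstddef>

// Forward: Frobenius norm ||x||_F of n elements.
double frobenius_norm(const double* x, std::size_t n) {
  double sum_sq = 0.0;
  for (std::size_t i = 0; i < n; ++i) sum_sq += x[i] * x[i];
  return std::sqrt(sum_sq);
}

// Backward: dx_i = x_i / (||x|| + eps) * dy, with the same 1e-12 guard
// against division by zero that FrobeniusNormGradFunctor adds.
void frobenius_norm_grad(const double* x, std::size_t n, double y, double dy,
                         double* dx) {
  const double eps = 1e-12;
  for (std::size_t i = 0; i < n; ++i) dx[i] = x[i] / (y + eps) * dy;
}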
- -#include "paddle/fluid/operators/reduce_ops/frobenius_norm_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" - -template -using CUDAFrobeniusNormKernel = - ops::ReduceKernel; - -REGISTER_OP_CUDA_KERNEL(frobenius_norm, CUDAFrobeniusNormKernel, - CUDAFrobeniusNormKernel); - -template -using CUDAFrobeniusNormGradKernel = - ops::ReduceGradKernel; - -REGISTER_OP_CUDA_KERNEL(frobenius_norm_grad, CUDAFrobeniusNormGradKernel, - CUDAFrobeniusNormGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.h b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.h deleted file mode 100644 index 0b6b87d99ecd9..0000000000000 --- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.h +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" - -namespace paddle { -namespace operators { - -// \partial \| X \|_F = \frac{X}{ \| X \|_F } -template -class FrobeniusNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // default use Eigen broadcast - ReduceGradKernel kernel; - kernel.Compute(context); - } -}; - -struct FrobeniusNormFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = ((x->square()).sum(dim)).sqrt(); - } -}; - -struct FrobeniusNormGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - dx->device(place) = y->broadcast(dim); - dx->device(place) = *dx + dx->constant(1e-12f); - dx->device(place) = (*x / *dx) * (dy->broadcast(dim)); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc index 955cf8d4448c1..9115d21b195e1 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc @@ -14,6 +14,10 @@ #include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace framework { class OpDesc; @@ -28,9 +32,17 @@ class CPUDeviceContext; } // namespace platform } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(reduce_all, ReduceAllInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); +class ReduceAllOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_all"; } + virtual std::string GetOpType() const { return "Reduce reduce_all"; } +}; // kernel's device type is decided by input tensor place, to be consistent with // compare and logical ops -REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_all, UseInputPlace); -REGISTER_OP_CPU_KERNEL(reduce_all, - ops::BoolReduceKernel); 
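This is the same mechanical swap applied to every operator in this diff: the hand-written InferShape (or the REGISTER_REDUCE_OP_WITHOUT_GRAD macro that generated one) is dropped, and DECLARE_INFER_SHAPE_FUNCTOR binds the operator's shape inference to a phi InferMeta function through PD_INFER_META, so static shape inference and the phi kernel share one implementation. A toy analogue of the idea with made-up names (MetaTensor here is a stand-in, not phi::MetaTensor):

#include <cstdint>
#include <cstdio>
#include <vector>

struct MetaTensor {            // stand-in: only carries dims
  std::vector<int64_t> dims;
};

// One shared shape function, callable from any runtime that can build
// MetaTensors; reduce_all collapses every dimension of its input.
void ReduceAllInferMeta(const MetaTensor& x, bool keep_dim, MetaTensor* out) {
  out->dims = keep_dim ? std::vector<int64_t>(x.dims.size(), 1)
                       : std::vector<int64_t>{};  // rank-0 result
}

int main() {
  MetaTensor x{{2, 3, 4}}, out;
  ReduceAllInferMeta(x, /*keep_dim=*/false, &out);
  std::printf("output rank = %zu\n", out.dims.size());  // prints 0
  return 0;
}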
+REGISTER_OPERATOR( + reduce_all, ops::ReduceOpUseInputPlace, ReduceAllOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ReduceAllInferShapeFunctor); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc index fa3800dd3c9e4..69561b9349888 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc @@ -14,6 +14,9 @@ #include "paddle/fluid/operators/reduce_ops/reduce_any_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { class OpDesc; @@ -28,9 +31,18 @@ class CPUDeviceContext; } // namespace platform } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(reduce_any, ReduceAnyInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + +class ReduceAnyOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_any"; } + virtual std::string GetOpType() const { return "Reduce reduce_any"; } +}; // kernel's device type is decided by input tensor place, to be consistent with // compare and logical ops -REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_any, UseInputPlace); -REGISTER_OP_CPU_KERNEL(reduce_any, - ops::BoolReduceKernel); +REGISTER_OPERATOR( + reduce_any, ops::ReduceOpUseInputPlace, ReduceAnyOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ReduceAnyInferShapeFunctor); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc index d057ee8f5d798..e327d19ab3be8 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc @@ -35,7 +35,7 @@ namespace p = paddle::platform; using Tensor = paddle::framework::Tensor; -USE_OP(reduce_any); +USE_OP_ITSELF(reduce_any); USE_OP_DEVICE_KERNEL(reduce_any, NPU); template diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc index 41df8e4a15f09..15812778e0023 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc @@ -35,13 +35,3 @@ REGISTER_OPERATOR( paddle::framework::DefaultGradOpMaker, ReduceMaxInferShapeFunctor); REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp) - -REGISTER_OP_CPU_KERNEL( - reduce_max_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu deleted file mode 100644 index 5ee38b8fa4629..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
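One consequence of dropping the fluid kernels is visible in the reduce_any NPU test change above: USE_OP(reduce_any) both declared the operator and force-linked its fluid CPU kernel, which this PR deletes, so the macro is downgraded. A sketch of the distinction, as I read the Paddle macros (the NPU kernel itself is untouched by this PR):

    // USE_OP(reduce_any);               // old: op definition + fluid kernel
    USE_OP_ITSELF(reduce_any);           // new: op definition only; the compute
                                         // kernel is now resolved from phi
    USE_OP_DEVICE_KERNEL(reduce_any, NPU);  // device kernel still fluid-side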
- -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -REGISTER_OP_CUDA_KERNEL( - reduce_max_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index 4a18330913803..dc41979defb93 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -107,12 +107,3 @@ REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp, ops::ReduceMeanDoubleGradDescMaker, ops::ReduceMeanDoubleGradOpBaseMaker, ops::ReduceMeanGradNoNeedBufferVarInferer); - -template -using CPUReduceMeanGradKernel = - ops::ReduceGradKernel; - -REGISTER_OP_CPU_KERNEL(reduce_mean_grad, CPUReduceMeanGradKernel, - CPUReduceMeanGradKernel, - CPUReduceMeanGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc index 11aa78382e319..5e5b04d57b002 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc @@ -14,21 +14,24 @@ #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" -REGISTER_REDUCE_OP(reduce_min); -REGISTER_OP_CPU_KERNEL( - reduce_min, ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); -REGISTER_OP_CPU_KERNEL( - reduce_min_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +namespace ops = paddle::operators; + +class ReduceMinOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_min"; } + virtual std::string GetOpType() const { return "Reduce reduce_min"; } +}; + +DECLARE_INFER_SHAPE_FUNCTOR(reduce_min, ReduceMinInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + +REGISTER_OPERATOR( + reduce_min, ops::ReduceOp, ReduceMinOpMaker, + paddle::framework::DefaultGradOpMaker, + paddle::framework::DefaultGradOpMaker, + ReduceMinInferShapeFunctor); +REGISTER_OPERATOR(reduce_min_grad, ops::ReduceGradOp) diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu deleted file mode 100644 index 44548b8d2e778..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
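The same four-step replacement recurs for reduce_min here and for reduce_prod just below: delete the REGISTER_OP_CPU_KERNEL / REGISTER_OP_CUDA_KERNEL blocks, add an op maker, declare an InferShapeFunctor backed by phi::ReduceInferMetaBase, and register the operator with that functor. Schematically, for a hypothetical op reduce_foo, with the template arguments written out as I believe the real registrations read (the diff rendering here strips angle-bracket contents):

    class ReduceFooOpMaker : public ops::ReduceOpMaker {
     protected:
      virtual std::string GetName() const { return "reduce_foo"; }
      virtual std::string GetOpType() const { return "Reduce reduce_foo"; }
    };

    DECLARE_INFER_SHAPE_FUNCTOR(reduce_foo, ReduceFooInferShapeFunctor,
                                PD_INFER_META(phi::ReduceInferMetaBase));

    REGISTER_OPERATOR(
        reduce_foo, ops::ReduceOp, ReduceFooOpMaker,
        paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>,
        paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>,
        ReduceFooInferShapeFunctor);
    REGISTER_OPERATOR(reduce_foo_grad, ops::ReduceGradOp);
    // No REGISTER_OP_*_KERNEL calls remain: the device kernels now live in the
    // phi kernel library and are looked up by kernel name at runtime.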
-#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" - -// reduce_min -REGISTER_OP_CUDA_KERNEL( - reduce_min, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu deleted file mode 100644 index bf886063786a8..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -REGISTER_OP_CUDA_KERNEL( - reduce_min_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc index eb745ab9c56c5..b1abdf9e8a758 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc @@ -14,6 +14,10 @@ #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace framework { class OpDesc; @@ -26,14 +30,20 @@ class CPUDeviceContext; } // namespace platform } // namespace paddle -REGISTER_REDUCE_OP(reduce_prod); +namespace ops = paddle::operators; + +class ReduceProdOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_prod"; } + virtual std::string GetOpType() const { return "Reduce reduce_prod"; } +}; + +DECLARE_INFER_SHAPE_FUNCTOR(reduce_prod, ReduceProdInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); -REGISTER_OP_CPU_KERNEL(reduce_prod_grad, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); +REGISTER_OPERATOR( + reduce_prod, ops::ReduceOp, ReduceProdOpMaker, + paddle::framework::DefaultGradOpMaker, + paddle::framework::DefaultGradOpMaker, + ReduceProdInferShapeFunctor); +REGISTER_OPERATOR(reduce_prod_grad, ops::ReduceGradOp); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu deleted file mode 100644 index 0610cdd94f89c..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" - -REGISTER_OP_CUDA_KERNEL( - reduce_prod_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 6441d53239e95..2a78774f3706e 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -114,16 +114,3 @@ REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp, ops::ReduceSumDoubleOpGradMaker, ops::ReduceSumDoubleOpGradMaker, ops::ReduceSumGradNoNeedBufferVarInferer); - -template -using CPUReduceSumGradKernel = - ops::ReduceSumGradKernel; - -REGISTER_OP_CPU_KERNEL( - reduce_sum_grad, CPUReduceSumGradKernel, - CPUReduceSumGradKernel, CPUReduceSumGradKernel, - CPUReduceSumGradKernel, - CPUReduceSumGradKernel, CPUReduceSumGradKernel, - CPUReduceSumGradKernel>, - CPUReduceSumGradKernel>); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu deleted file mode 100644 index 2f6bf12751809..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" - -template -using CUDAReduceSumGradKernel = - ops::ReduceCudaGradKernel; - -REGISTER_OP_CUDA_KERNEL( - reduce_sum_grad, CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel>, - CUDAReduceSumGradKernel>); diff --git a/paddle/fluid/operators/rnn_op.h b/paddle/fluid/operators/rnn_op.h index b636184ae457e..a473b54c1f855 100644 --- a/paddle/fluid/operators/rnn_op.h +++ b/paddle/fluid/operators/rnn_op.h @@ -16,9 +16,9 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/unique_op.h" @@ -36,6 +36,14 @@ using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; using TensorList = std::vector; +template +using EigenMatrix = framework::EigenMatrix; + +template +using EigenVector = framework::EigenVector; + #define DEFINE_MODE_DETECTOR(MODE_NAME, MODE_STR) \ inline bool is_##MODE_NAME(const framework::ExecutionContext& ctx) { \ const std::string& mode = ctx.Attr("mode"); \ diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index 5627b4f229e10..bf78b6a696559 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -9,9 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/roi_align_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,79 +26,6 @@ class ROIAlignOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of ROIAlignOp " - "is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true, - platform::errors::NotFound("Input(ROIs) of ROIAlignOp " - "is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of ROIAlignOp " - "is not found.")); - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - if (ctx->HasInput("RoisNum")) { - auto rois_num_dims = ctx->GetInputDim("RoisNum"); - PADDLE_ENFORCE_EQ( - rois_num_dims.size(), 1, - platform::errors::InvalidArgument("The size of RoisNum should be 1" - ", but received size = %d", - rois_num_dims.size())); - } - PADDLE_ENFORCE_EQ( - input_dims.size(), 4, - platform::errors::InvalidArgument( - "The format of Input(X) in" - "RoIAlignOp is NCHW. And the rank of input must be 4. " - "But received rank = %d", - input_dims.size())); - PADDLE_ENFORCE_EQ(rois_dims.size(), 2, platform::errors::InvalidArgument( - "The rank of Input(ROIs) " - "in RoIAlignOp should be 2. " - "But the rank of RoIs is %d", - rois_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(rois_dims[1], 4, - platform::errors::InvalidArgument( - "The second dimension " - "of Input(ROIs) should be 4. But received the " - "dimension = %d", - rois_dims[1])); - } - int pooled_height = ctx->Attrs().Get("pooled_height"); - int pooled_width = ctx->Attrs().Get("pooled_width"); - float spatial_scale = ctx->Attrs().Get("spatial_scale"); - - PADDLE_ENFORCE_GT(pooled_height, 0, - platform::errors::InvalidArgument( - "The 'pooled_height' attribute in RoIAlignOp is " - "invalid. The height must be greater than 0. 
But " - "received 'pooled_height' = %d", - pooled_height)); - PADDLE_ENFORCE_GT(pooled_width, 0, - platform::errors::InvalidArgument( - "The 'pooled_width' attribute in RoIAlignOp is " - "invalid. The width must be greater than 0. But " - "received 'pooled_width' = %d", - pooled_width)); - PADDLE_ENFORCE_GT(spatial_scale, 0.0f, - platform::errors::InvalidArgument( - "The 'spatial_scale' attribute in RoIAlignOp is " - "invalid. The scale must be greater than 0. But " - "received 'spatial_scale' = %f", - spatial_scale)); - - auto out_dims = input_dims; - out_dims[0] = rois_dims[0]; - out_dims[1] = input_dims[1]; - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - - ctx->SetOutputDim("Out", out_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -221,21 +151,16 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RoiAlignGradNoNeedBufVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(roi_align, RoiAlignInferShapeFunctor, + PD_INFER_META(phi::RoiAlignInferMeta)); + REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker, ops::ROIAlignGradMaker, - ops::ROIAlignGradMaker); + ops::ROIAlignGradMaker, + RoiAlignInferShapeFunctor); REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp, ops::RoiAlignGradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - roi_align, - ops::CPUROIAlignOpKernel, - ops::CPUROIAlignOpKernel, - ops::CPUROIAlignOpKernel); -REGISTER_OP_CPU_KERNEL( - roi_align_grad, - ops::CPUROIAlignGradOpKernel, - ops::CPUROIAlignGradOpKernel, - ops::CPUROIAlignGradOpKernel); + REGISTER_OP_VERSION(roi_align) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu deleted file mode 100644 index 18941d10e937d..0000000000000 --- a/paddle/fluid/operators/roi_align_op.cu +++ /dev/null @@ -1,426 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/roi_align_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaxinumNumBlocks = 4096; -static constexpr int kROISize = 4; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaxinumNumBlocks); -} - -template -__device__ T BilinearInterpolate(const T* input_data, const int height, - const int width, T y, T x) { - if (y < -1.0 || y > height || x < -1.0 || x > width) { - return 0; - } - y = y <= 0 ? 0 : y; - x = x <= 0 ? 
0 : x; - int y_low = static_cast(y); - int x_low = static_cast(x); - int y_high; - int x_high; - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = static_cast(y_low); - } else { - y_high = y_low + 1; - } - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = static_cast(x_low); - } else { - x_high = x_low + 1; - } - T ly = y - y_low, lx = x - x_low; - T hy = 1. - ly, hx = 1. - lx; - - T v1 = input_data[y_low * width + x_low]; - T v2 = input_data[y_low * width + x_high]; - T v3 = input_data[y_high * width + x_low]; - T v4 = input_data[y_high * width + x_high]; - T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - return val; -} - -template -__device__ void BilinearInterpolateGradient(const int height, const int width, - T y, T x, T* w1, T* w2, T* w3, - T* w4, int* x_low, int* x_high, - int* y_low, int* y_high) { - if (y < -1.0 || y > height || x < -1.0 || x > width) { - return; - } - - y = y <= 0 ? 0 : y; - x = x <= 0 ? 0 : x; - *y_low = static_cast(y); - *x_low = static_cast(x); - if (*y_low >= height - 1) { - *y_high = *y_low = height - 1; - y = static_cast(*y_low); - } else { - *y_high = *y_low + 1; - } - if (*x_low >= width - 1) { - *x_high = *x_low = width - 1; - x = static_cast(*x_low); - } else { - *x_high = *x_low + 1; - } - T ly = y - *y_low, lx = x - *x_low; - T hy = 1. - ly, hx = 1. - lx; - *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx; - - return; -} - -template -__global__ void GPUROIAlignForward( - const int nthreads, const T* input_data, const T* input_rois, - const float spatial_scale, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int sampling_ratio, int* roi_batch_id_data, T* output_data, - const bool continuous_coordinate) { - CUDA_KERNEL_LOOP(i, nthreads) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - - const T* offset_input_rois = input_rois + n * kROISize; - int roi_batch_ind = roi_batch_id_data[n]; - - T roi_offset = continuous_coordinate ? static_cast(0.5) : 0; - T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset; - T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset; - T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset; - T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - if (!continuous_coordinate) { - roi_width = max(roi_width, static_cast(1.)); - roi_height = max(roi_height, static_cast(1.)); - } - - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - const T* offset_input_data = - input_data + (roi_batch_ind * channels + c) * height * width; - - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = - (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_width / pooled_width); - const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); - T output_val = 0; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_ymin + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_xmin + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - T val = BilinearInterpolate(offset_input_data, height, width, y, x); - output_val += val; - } - } - output_val /= count; - output_data[i] = output_val; - } -} - -template -__global__ void GPUROIAlignBackward( - const int nthreads, const T* input_rois, const T* out_grad, - const int num_rois, const float spatial_scale, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int sampling_ratio, int* roi_batch_id_data, - T* input_grad, const bool continuous_coordinate) { - CUDA_KERNEL_LOOP(i, nthreads) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - const T* offset_input_rois = input_rois + n * kROISize; - int roi_batch_ind = roi_batch_id_data[n]; - - T roi_offset = continuous_coordinate ? T(0.5) : 0; - T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset; - T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset; - T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset; - T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - if (!continuous_coordinate) { - roi_width = max(roi_width, static_cast(1.)); - roi_height = max(roi_height, static_cast(1.)); - } - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - T* offset_input_grad = - input_grad + (roi_batch_ind * channels + c) * height * width; - - const T* offset_out_grad = - out_grad + (n * channels + c) * pooled_height * pooled_width; - const T out_grad_this_bin = offset_out_grad[ph * pooled_width + pw]; - - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = - (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_width / pooled_width); - - const T count = roi_bin_grid_h * roi_bin_grid_w; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_ymin + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_xmin + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - T w1 = 0, w2 = 0, w3 = 0, w4 = 0; - int x_low = -1, x_high = -1, y_low = -1, y_high = -1; - BilinearInterpolateGradient(height, width, y, x, &w1, &w2, &w3, &w4, - &x_low, &x_high, &y_low, &y_high); - T diff1 = out_grad_this_bin * w1 / count; - T diff2 = out_grad_this_bin * w2 / count; - T diff3 = out_grad_this_bin * w3 / count; - T diff4 = out_grad_this_bin * w4 / count; - if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { - platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_low, - diff1); - platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_high, - diff2); - platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_low, - diff3); - platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_high, - diff4); - } - } - } - } -} - -template -class GPUROIAlignOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - - int rois_num = rois->dims()[0]; - - if (rois_num == 0) return; - - int output_size = out->numel(); - int blocks = NumBlocks(output_size); - int threads = kNumCUDAThreads; -#ifdef WITH_NV_JETSON - platform::ChangeThreadNum(ctx.cuda_device_context(), &threads, 256); -#endif - Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The rois_batch_size and imgs " - "batch_size must be the same. But received rois_batch_size = %d, " - "batch_size = %d", - rois_batch_size, batch_size)); - - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto lod = rois->lod(); - PADDLE_ENFORCE_EQ( - lod.empty(), false, - platform::errors::InvalidArgument("Input(ROIs) in ROIAlignOp does " - "not contain LoD information.")); - auto rois_lod = lod.back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of rois and batch size " - "of images must be the same. 
But received rois batch size = %d, " - "and images batch size = %d", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The actual number of rois and the number of rois " - "provided from Input(RoIsLoD) in RoIAlign must be the same." - " But received actual number of rois is %d, and the number " - "of rois from RoIsLoD is %d", - rois_num, rois_num_with_lod)); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - GPUROIAlignForward<<>>( - output_size, in->data(), rois->data(), spatial_scale, channels, - height, width, pooled_height, pooled_width, sampling_ratio, roi_id_data, - out->mutable_data(ctx.GetPlace()), aligned); - } -}; - -template -class GPUROIAlignGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - - int rois_num = rois->dims()[0]; - int channels = in->dims()[1]; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (!in_grad) { - return; - } - Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - auto roi_ptr = - memory::Alloc(dev_ctx, roi_batch_id_list.numel() * sizeof(int)); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - int bytes = roi_batch_id_list.numel() * sizeof(int); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - in_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, in_grad, static_cast(0)); - - int output_grad_size = out_grad->numel(); - int blocks = NumBlocks(output_grad_size); - int threads = kNumCUDAThreads; - - if (output_grad_size > 0) { - GPUROIAlignBackward<<>>( - output_grad_size, rois->data(), out_grad->data(), rois_num, - spatial_scale, channels, height, width, pooled_height, pooled_width, - sampling_ratio, 
roi_id_data, in_grad->mutable_data(ctx.GetPlace()), - aligned); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - roi_align, - ops::GPUROIAlignOpKernel, - ops::GPUROIAlignOpKernel); -REGISTER_OP_CUDA_KERNEL( - roi_align_grad, - ops::GPUROIAlignGradOpKernel, - ops::GPUROIAlignGradOpKernel); diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h deleted file mode 100644 index e71099ed99f00..0000000000000 --- a/paddle/fluid/operators/roi_align_op.h +++ /dev/null @@ -1,465 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -namespace { // NOLINT -constexpr size_t get_offset(size_t x, size_t y, size_t width) { - return y * width + x; -} - -template -struct offsets_and_ratios { - offsets_and_ratios() = default; - offsets_and_ratios(std::size_t xy, std::size_t xY, std::size_t Xy, - std::size_t XY, T xy_ratio, T xY_ratio, T Xy_ratio, - T XY_ratio) - : xy(xy), - xY(xY), - Xy(Xy), - XY(XY), - xy_ratio(xy_ratio), - xY_ratio(xY_ratio), - Xy_ratio(Xy_ratio), - XY_ratio(XY_ratio) {} - - std::size_t xy = 0; - std::size_t xY = 0; - std::size_t Xy = 0; - std::size_t XY = 0; - T xy_ratio = 0.0f; - T xY_ratio = 0.0f; - T Xy_ratio = 0.0f; - T XY_ratio = 0.0f; -}; - -template -std::vector> get_indexes_and_ratios( - std::size_t width, std::size_t height, const T roi_width, - const T roi_height, const T roi_xmin, const T roi_ymin, - std::size_t pooled_width, std::size_t roi_bin_grid_w, - std::size_t pooled_height, std::size_t roi_bin_grid_h) { - const auto ind_num = - pooled_width * roi_bin_grid_w * pooled_height * roi_bin_grid_h; - - std::vector> interpolation_cords; - interpolation_cords.reserve(ind_num); - - const auto bin_w = roi_width / pooled_width; - const auto bin_h = roi_height / pooled_height; - - for (std::size_t py = 0; py < pooled_height; py++) { - for (std::size_t px = 0; px < pooled_width; px++) { - for (std::size_t iy = 0; iy < roi_bin_grid_h; iy++) { - // calculate x of sample points - auto y = - roi_ymin + - bin_h * (py + - static_cast(iy + .5f) / static_cast(roi_bin_grid_h)); - for (std::size_t ix = 0; ix < roi_bin_grid_w; ix++) { - // calculate x of sample points - auto x = roi_xmin + - bin_w * (px + - static_cast(ix + .5f) / - static_cast(roi_bin_grid_w)); - - // deal with elements out of map - if (y < -1.0 || y > height || x < -1.0 || x > width) { - interpolation_cords.emplace_back(); - continue; - } - y = y <= 0 ? 0 : y; - x = x <= 0 ? 
0 : x; - - std::size_t x_low_index = static_cast(x); - std::size_t x_high_index; - if (x_low_index >= width - 1) { - x_high_index = x_low_index = width - 1; - x = static_cast(x_low_index); - } else { - x_high_index = x_low_index + 1; - } - T x_ratio = x_high_index - x; - - std::size_t y_low_index = static_cast(y); - std::size_t y_high_index; - if (y_low_index >= height - 1) { - y_high_index = y_low_index = height - 1; - y = static_cast(y_low_index); - } else { - y_high_index = y_low_index + 1; - } - T y_ratio = y_high_index - y; - - auto xy = get_offset(x_low_index, y_low_index, width); - auto xY = get_offset(x_low_index, y_high_index, width); - auto Xy = get_offset(x_high_index, y_low_index, width); - auto XY = get_offset(x_high_index, y_high_index, width); - - auto xy_ratio = x_ratio * y_ratio; - auto xY_ratio = x_ratio * (1 - y_ratio); - auto Xy_ratio = (1 - x_ratio) * y_ratio; - auto XY_ratio = (1 - x_ratio) * (1 - y_ratio); - - interpolation_cords.emplace_back(xy, xY, Xy, XY, xy_ratio, xY_ratio, - Xy_ratio, XY_ratio); - } - } - } - } - return interpolation_cords; -} // namespace - -template -void interpolate(std::vector& interpolated_values, // NOLINT - const std::vector>& interpolation_cords, - const T* data) { - for (auto& ic : interpolation_cords) { - auto xlyl_offset = ic.xy; - auto xhyl_offset = ic.Xy; - auto xlyh_offset = ic.xY; - auto xhyh_offset = ic.XY; - - auto xlyl_ratio = ic.xy_ratio; - auto xhyl_ratio = ic.Xy_ratio; - auto xlyh_ratio = ic.xY_ratio; - auto xhyh_ratio = ic.XY_ratio; - - interpolated_values.emplace_back( - xlyl_ratio * data[xlyl_offset] + xhyl_ratio * data[xhyl_offset] + - xlyh_ratio * data[xlyh_offset] + xhyh_ratio * data[xhyh_offset]); - } -} - -template -void avg_pool(const std::vector& interpolated_values, T* output_data, - int roi_bin_grid_w, int roi_bin_grid_h, int pooled_width, - int pooled_height) { - const auto data_amount = pooled_width * pooled_height; - const auto grid_points = roi_bin_grid_w * roi_bin_grid_h; - const T count = 1.0 / grid_points; - auto val_begin = interpolated_values.cbegin(); - for (auto i = 0; i < data_amount; ++i) { - T sum = 0.0; - auto val_end = val_begin + grid_points; - sum = std::accumulate(val_begin, val_end, sum); - val_begin = val_end; - output_data[i] = sum * count; - } -} -} // NOLINT - -template -void bilinear_interpolate_gradient(const int height, const int width, T y, T x, - const T out_grad_this_bin, const T count, - T* batch_grad_data) { - int x_low, y_low, x_high, y_high; - T w1, w2, w3, w4; - if (y < -1.0 || y > height || x < -1.0 || x > width) { - w1 = w2 = w3 = w4 = 0; - x_low = x_high = y_low = y_high = -1; - return; - } - y = y <= 0 ? 0 : y; - x = x <= 0 ? 0 : x; - y_low = static_cast(y); - x_low = static_cast(x); - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = static_cast(y_low); - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = static_cast(x_low); - } else { - x_high = x_low + 1; - } - - T ly = y - y_low, lx = x - x_low; - T hy = 1. - ly, hx = 1. 
- lx; - w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - T diff1 = out_grad_this_bin * w1 / count; - T diff2 = out_grad_this_bin * w2 / count; - T diff3 = out_grad_this_bin * w3 / count; - T diff4 = out_grad_this_bin * w4 / count; - if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { - *(batch_grad_data + y_low * width + x_low) += diff1; - *(batch_grad_data + y_low * width + x_high) += diff2; - *(batch_grad_data + y_high * width + x_low) += diff3; - *(batch_grad_data + y_high * width + x_high) += diff4; - } -} - -template -class CPUROIAlignOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - auto in_stride = phi::stride(in_dims); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = phi::stride(out->dims()); - - const T* input_data = in->data(); - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of rois and the batch size of images " - " must be the same. But received the batch size of rois is %d, " - "and the batch size of images is %d", - rois_batch_size, batch_size)); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto lod = rois->lod(); - PADDLE_ENFORCE_EQ(lod.empty(), false, - platform::errors::InvalidArgument( - "Input(ROIs) Tensor of ROIAlignOp " - "does not contain LoD information.")); - auto rois_lod = lod.back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The rois_batch_size and imgs " - "batch_size must be the same. But received rois_batch_size = %d, " - "batch_size = %d", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The actual number of rois and the number of rois " - "provided from Input(RoIsLoD) in RoIAlign must be the same." - " But received actual number of rois is %d, and the number " - "of rois from RoIsLoD is %d", - rois_num, rois_num_with_lod)); - for (int n = 0; n < rois_batch_size; ++n) { - for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - T* output_data = out->mutable_data(ctx.GetPlace()); - const T* rois_data = rois->data(); - T roi_offset = aligned ? 
T(0.5) : 0; - for (int n = 0; n < rois_num; ++n) { - int roi_batch_id = roi_batch_id_data[n]; - T roi_xmin = rois_data[0] * spatial_scale - roi_offset; - T roi_ymin = rois_data[1] * spatial_scale - roi_offset; - T roi_xmax = rois_data[2] * spatial_scale - roi_offset; - T roi_ymax = rois_data[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - if (!aligned) { - roi_width = std::max(roi_width, static_cast(1.)); - roi_height = std::max(roi_height, static_cast(1.)); - } - - const T* batch_data = input_data + roi_batch_id * in_stride[0]; - - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_width / pooled_width); - - auto interpolation_cords = get_indexes_and_ratios( - width, height, roi_width, roi_height, roi_xmin, roi_ymin, - pooled_width, roi_bin_grid_w, pooled_height, roi_bin_grid_h); - - std::vector interpolated_values; - interpolated_values.reserve(interpolation_cords.size()); - for (auto channel = 0; channel < channels; ++channel) { - interpolate(interpolated_values, interpolation_cords, batch_data); - avg_pool(interpolated_values, output_data, roi_bin_grid_w, - roi_bin_grid_h, pooled_width, pooled_height); - batch_data += in_stride[1]; - output_data += out_stride[1]; - interpolated_values.clear(); - } - rois_data += roi_stride[0]; - } - } -}; - -template -class CPUROIAlignGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto in_dims = in->dims(); - auto aligned = ctx.Attr("aligned"); - - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - if (!in_grad) { - return; - } - Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - in_grad->mutable_data(ctx.GetPlace()); - auto& dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, in_grad, static_cast(0)); - - int output_grad_size = out_grad->numel(); - - if ((!out_grad->IsInitialized()) || (output_grad_size <= 0)) { - return; - } - - const T* rois_data = rois->data(); - const T* out_grad_data = out_grad->data(); - T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - - auto in_stride = phi::stride(in->dims()); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = 
phi::stride(out_grad->dims()); - - T roi_offset = aligned ? T(0.5) : 0; - for (int n = 0; n < rois_num; ++n) { - int roi_batch_idx = roi_batch_id_data[n]; - T roi_xmin = rois_data[0] * spatial_scale - roi_offset; - T roi_ymin = rois_data[1] * spatial_scale - roi_offset; - T roi_xmax = rois_data[2] * spatial_scale - roi_offset; - T roi_ymax = rois_data[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - roi_width = std::max(roi_width, static_cast(1.)); - roi_height = std::max(roi_height, static_cast(1.)); - if (!aligned) { - roi_width = std::max(roi_width, static_cast(1.)); - roi_height = std::max(roi_height, static_cast(1.)); - } - - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - for (int c = 0; c < channels; ++c) { - T* batch_grad_data = - in_grad_data + roi_batch_idx * in_stride[0] + c * in_stride[1]; - const T* batch_out_grad_data = - out_grad_data + n * out_stride[0] + c * out_stride[1]; - for (int ph = 0; ph < pooled_height; ++ph) { - for (int pw = 0; pw < pooled_width; ++pw) { - int pool_index = ph * pooled_width + pw; - T out_grad_this_bin = batch_out_grad_data[pool_index]; - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_width / pooled_width); - T count = roi_bin_grid_h * roi_bin_grid_w; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_ymin + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_xmin + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - bilinear_interpolate_gradient(height, width, y, x, - out_grad_this_bin, count, - batch_grad_data); - } - } - } - } - } - rois_data += roi_stride[0]; - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/roi_align_op_npu.cc b/paddle/fluid/operators/roi_align_op_npu.cc index d5b63854d9905..78509e4299b80 100644 --- a/paddle/fluid/operators/roi_align_op_npu.cc +++ b/paddle/fluid/operators/roi_align_op_npu.cc @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/roi_align_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc index 09d2d906653e8..13490d6fcde3a 100644 --- a/paddle/fluid/operators/roi_align_op_xpu.cc +++ b/paddle/fluid/operators/roi_align_op_xpu.cc @@ -13,13 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/roi_align_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + template class XPUROIAlignOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index a512e7dcd682b..12e33d56c0020 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/roi_pool_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -26,74 +29,6 @@ class ROIPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "roi_pool"); - OP_INOUT_CHECK(ctx->HasInput("ROIs"), "Input", "ROIs", "roi_pool"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "roi_pool"); - OP_INOUT_CHECK(ctx->HasOutput("Argmax"), "Output", "Argmax", "roi_pool"); - - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - if (ctx->HasInput("RoisNum")) { - auto rois_num_dims = ctx->GetInputDim("RoisNum"); - PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1, - platform::errors::InvalidArgument( - "The second dimension of RoisNum should " - "be 1, but received dimension is %d", - rois_num_dims.size())); - } - PADDLE_ENFORCE_EQ(input_dims.size(), 4, - platform::errors::InvalidArgument( - "The input data should be a four-dimensional " - "tensor with [N,C,H,W], but received input data with " - " %d dimension", - input_dims.size())); - PADDLE_ENFORCE_EQ( - rois_dims.size(), 2, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor with shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], ...], but received ROIs is " - "%d-dimensional LoDTensor", - rois_dims.size())); - PADDLE_ENFORCE_EQ( - rois_dims[1], kROISize, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor with shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], ...]. 
But the second dimension of " - "the received data is %d", - rois_dims[1])); - - int pooled_height = ctx->Attrs().Get("pooled_height"); - int pooled_width = ctx->Attrs().Get("pooled_width"); - float spatial_scale = ctx->Attrs().Get("spatial_scale"); - - PADDLE_ENFORCE_GT(pooled_height, 0, - platform::errors::OutOfRange( - "The pooled output height must be greater than 0" - "but received height is %d", - pooled_height)); - PADDLE_ENFORCE_GT(pooled_width, 0, - platform::errors::OutOfRange( - "The pooled output width must be greater than 0" - "but received width is %d", - pooled_width)); - PADDLE_ENFORCE_GT(spatial_scale, 0.0f, - platform::errors::OutOfRange( - "The spatial scale must be greater than 0, " - "but received spatial scale is %f", - spatial_scale)); - - auto out_dims = input_dims; - out_dims[0] = rois_dims[0]; - out_dims[1] = input_dims[1]; - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - - ctx->SetOutputDim("Out", out_dims); - ctx->SetOutputDim("Argmax", out_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -212,20 +147,15 @@ class ROIPoolGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(roi_pool, RoiPoolInferShapeFunctor, + PD_INFER_META(phi::RoiPoolInferMeta)); + REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, ops::ROIPoolGradMaker, - ops::ROIPoolGradMaker); + ops::ROIPoolGradMaker, + RoiPoolInferShapeFunctor); REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp); -REGISTER_OP_CPU_KERNEL( - roi_pool, - ops::CPUROIPoolOpKernel, - ops::CPUROIPoolOpKernel, - ops::CPUROIPoolOpKernel); -REGISTER_OP_CPU_KERNEL( - roi_pool_grad, - ops::CPUROIPoolGradOpKernel, - ops::CPUROIPoolGradOpKernel, - ops::CPUROIPoolGradOpKernel); + REGISTER_OP_VERSION(roi_pool) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu deleted file mode 100644 index b907b1114bbc0..0000000000000 --- a/paddle/fluid/operators/roi_pool_op.cu +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/roi_pool_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaxinumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaxinumNumBlocks); -} - -template -__global__ void GPUROIPoolForward( - const int nthreads, const T* input_data, const T* input_rois, - const float spatial_scale, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - int* roi_batch_id_data, T* output_data, int64_t* argmax_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (size_t i = index; i < nthreads; i += offset) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - - const T* offset_input_rois = input_rois + n * kROISize; - int roi_batch_ind = roi_batch_id_data[n]; - int roi_start_w = round(offset_input_rois[0] * spatial_scale); - int roi_start_h = round(offset_input_rois[1] * spatial_scale); - int roi_end_w = round(offset_input_rois[2] * spatial_scale); - int roi_end_h = round(offset_input_rois[3] * spatial_scale); - - int roi_width = max(roi_end_w - roi_start_w + 1, 1); - int roi_height = max(roi_end_h - roi_start_h + 1, 1); - - int hstart = static_cast(floor(static_cast(ph) * - static_cast(roi_height) / - static_cast(pooled_height))); - int wstart = static_cast(floor(static_cast(pw) * - static_cast(roi_width) / - static_cast(pooled_width))); - int hend = static_cast(ceil(static_cast(ph + 1) * - static_cast(roi_height) / - static_cast(pooled_height))); - int wend = static_cast(ceil(static_cast(pw + 1) * - static_cast(roi_width) / - static_cast(pooled_width))); - hstart = min(max(hstart + roi_start_h, 0), height); - hend = min(max(hend + roi_start_h, 0), height); - wstart = min(max(wstart + roi_start_w, 0), width); - wend = min(max(wend + roi_start_w, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - T maxval = is_empty ? 
0 : -std::numeric_limits::max(); - int maxidx = -1; - const T* offset_input_data = - input_data + (roi_batch_ind * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - int input_data_index = h * width + w; - if (offset_input_data[input_data_index] > maxval) { - maxval = offset_input_data[input_data_index]; - maxidx = input_data_index; - } - } - } - output_data[i] = maxval; - if (argmax_data) { - argmax_data[i] = maxidx; - } - } -} - -template -__global__ void GPUROIPoolBackward( - const int nthreads, const T* input_rois, const T* output_grad, - const int64_t* argmax_data, const int num_rois, const float spatial_scale, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, int* roi_batch_id_data, - T* input_grad) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - - int roi_batch_ind = roi_batch_id_data[n]; - int input_offset = (roi_batch_ind * channels + c) * height * width; - int output_offset = (n * channels + c) * pooled_height * pooled_width; - const T* offset_output_grad = output_grad + output_offset; - T* offset_input_grad = input_grad + input_offset; - const int64_t* offset_argmax_data = argmax_data + output_offset; - - int argmax = offset_argmax_data[ph * pooled_width + pw]; - if (argmax != -1) { - platform::CudaAtomicAdd( - offset_input_grad + argmax, - static_cast(offset_output_grad[ph * pooled_width + pw])); - } - } -} - -template -class GPUROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto* argmax = ctx.Output("Argmax"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - auto in_stride = phi::stride(in_dims); - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - - int rois_num = rois->dims()[0]; - - if (rois_num == 0) return; - - int output_size = out->numel(); - int blocks = NumBlocks(output_size); - int threads = kNumCUDAThreads; - - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be the same but " - "received batch size of input(ROIs) and input(X) is %d and %d " - "respectively.", - rois_batch_size, batch_size)); - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } 
else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be the same but " - "received batch size of input(ROIs) and input(X) is %d and %d " - "respectively.", - rois_batch_size, batch_size)); - - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The number of rois from input(ROIs) and its LOD " - "must be the same. Received rois %d of input(ROIs) " - "but the number of rois %d from its LOD is %d", - rois_num, rois_num_with_lod)); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - - GPUROIPoolForward<<>>( - output_size, in->data(), rois->data(), spatial_scale, channels, - height, width, pooled_height, pooled_width, roi_id_data, - out->mutable_data(ctx.GetPlace()), - argmax->mutable_data(ctx.GetPlace())); - } -}; - -template -class GPUROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* rois_lod = ctx.Input("RoisNum"); - auto* argmax = ctx.Input("Argmax"); - - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* x_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - int rois_num = rois->dims()[0]; - int channels = in->dims()[1]; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (x_grad) { - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - - x_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, x_grad, static_cast(0)); - - int output_grad_size = out_grad->numel(); - int blocks = NumBlocks(output_grad_size); - int threads = kNumCUDAThreads; - - if (output_grad_size > 0) { - 
GPUROIPoolBackward<<>>( - output_grad_size, rois->data(), out_grad->data(), - argmax->data(), rois_num, spatial_scale, channels, height, - width, pooled_height, pooled_width, roi_id_data, - x_grad->mutable_data(ctx.GetPlace())); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - roi_pool, - ops::GPUROIPoolOpKernel, - ops::GPUROIPoolOpKernel); -REGISTER_OP_CUDA_KERNEL( - roi_pool_grad, - ops::GPUROIPoolGradOpKernel, - ops::GPUROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h deleted file mode 100644 index a104fd49eb3e0..0000000000000 --- a/paddle/fluid/operators/roi_pool_op.h +++ /dev/null @@ -1,250 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -static constexpr int kROISize = 4; - -template -class CPUROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto* argmax = ctx.Output("Argmax"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - auto in_stride = phi::stride(in_dims); - auto argmax_stride = phi::stride(argmax->dims()); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = phi::stride(out->dims()); - - const T* input_data = in->data(); - - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("The rois_batch_size and imgs " - "batch_size must be the same.")); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("The rois_batch_size and imgs " - "batch_size must be the same.")); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num, rois_num_with_lod, - platform::errors::InvalidArgument("The 
rois_num from input " - "and lod must be the same.")); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - - T* output_data = out->mutable_data(ctx.GetPlace()); - int64_t* argmax_data = argmax->mutable_data(ctx.GetPlace()); - - const T* rois_data = rois->data(); - for (int n = 0; n < rois_num; ++n) { - int roi_batch_id = roi_batch_id_data[n]; - int roi_start_w = round(rois_data[0] * spatial_scale); - int roi_start_h = round(rois_data[1] * spatial_scale); - int roi_end_w = round(rois_data[2] * spatial_scale); - int roi_end_h = round(rois_data[3] * spatial_scale); - - // Force malformed ROIs to be 1x1 - int roi_height = std::max(roi_end_h - roi_start_h + 1, 1); - int roi_width = std::max(roi_end_w - roi_start_w + 1, 1); - - const float bin_size_h = - static_cast(roi_height) / static_cast(pooled_height); - const float bin_size_w = - static_cast(roi_width) / static_cast(pooled_width); - - const T* batch_data = input_data + roi_batch_id * in_stride[0]; - - for (int c = 0; c < channels; ++c) { - for (int ph = 0; ph < pooled_height; ++ph) { - for (int pw = 0; pw < pooled_width; ++pw) { - // Compute pooling region for this output unit: - // start (included) = floor(ph * roi_height / pooled_height_) - // end (excluded) = ceil((ph + 1) * roi_height / pooled_height_) - int hstart = - static_cast(floor(static_cast(ph) * bin_size_h)); - int wstart = - static_cast(floor(static_cast(pw) * bin_size_w)); - int hend = - static_cast(ceil(static_cast(ph + 1) * bin_size_h)); - int wend = - static_cast(ceil(static_cast(pw + 1) * bin_size_w)); - - hstart = std::min(std::max(hstart + roi_start_h, 0), height); - hend = std::min(std::max(hend + roi_start_h, 0), height); - wstart = std::min(std::max(wstart + roi_start_w, 0), width); - wend = std::min(std::max(wend + roi_start_w, 0), width); - - const int pool_index = ph * pooled_width + pw; - - // Define an empty pooling region to be zero - bool is_empty = (hend <= hstart) || (wend <= wstart); - output_data[pool_index] = - is_empty ? 
0 : -std::numeric_limits::max(); - argmax_data[pool_index] = -1; - - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - const int index = h * width + w; - if (batch_data[index] > output_data[pool_index]) { - output_data[pool_index] = batch_data[index]; - argmax_data[pool_index] = index; - } - } - } - } - } - - batch_data += in_stride[1]; - output_data += out_stride[1]; - argmax_data += argmax_stride[1]; - } - // Increment ROI data pointer - rois_data += roi_stride[0]; - } - return; - } -}; - -template -class CPUROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* argmax = ctx.Input("Argmax"); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - - if (in_grad) { - int rois_num = rois->dims()[0]; - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - - const T* rois_data = rois->data(); - const T* out_grad_data = out_grad->data(); - const int64_t* argmax_data = argmax->data(); - T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(ctx.template device_context(), in_grad, - static_cast(0)); - - auto in_stride = phi::stride(in->dims()); - auto argmax_stride = phi::stride(argmax->dims()); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = phi::stride(out_grad->dims()); - - int channels = in->dims()[1]; - - for (int n = 0; n < rois_num; ++n) { - int roi_batch_idx = roi_batch_id_data[n]; - T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0]; - for (int c = 0; c < channels; ++c) { - for (int ph = 0; ph < pooled_height; ++ph) { - for (int pw = 0; pw < pooled_width; ++pw) { - int pool_index = ph * pooled_width + pw; - if (argmax_data[pool_index] >= 0) { - auto index = argmax_data[pool_index]; - batch_grad_data[index] += out_grad_data[pool_index]; - } - } - } - batch_grad_data += in_stride[1]; - out_grad_data += out_stride[1]; - argmax_data += argmax_stride[1]; - } - rois_data += roi_stride[0]; - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/roll_op.cc b/paddle/fluid/operators/roll_op.cc index f82510556fde8..898db4c22fed9 100644 --- a/paddle/fluid/operators/roll_op.cc +++ b/paddle/fluid/operators/roll_op.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
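[editor's note] The CPU kernel deleted above computes each output bin by projecting the ROI onto the feature map and max-pooling it, recording an argmax per bin. A minimal standalone sketch of that bin arithmetic, assuming a single-channel feature map and one ROI already scaled to feature-map coordinates (names are illustrative, not the Paddle API):

#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

// Max-pool one ROI over an H x W single-channel feature map into a
// pooled_h x pooled_w grid, mirroring the bin math of the removed kernel.
void RoiMaxPool(const std::vector<float>& feat, int H, int W,
                int roi_start_h, int roi_start_w, int roi_end_h, int roi_end_w,
                int pooled_h, int pooled_w,
                std::vector<float>* out, std::vector<int>* argmax) {
  // Malformed ROIs are forced to be at least 1x1, as in the kernel.
  int roi_h = std::max(roi_end_h - roi_start_h + 1, 1);
  int roi_w = std::max(roi_end_w - roi_start_w + 1, 1);
  float bin_h = static_cast<float>(roi_h) / pooled_h;
  float bin_w = static_cast<float>(roi_w) / pooled_w;
  out->assign(pooled_h * pooled_w, 0.f);
  argmax->assign(pooled_h * pooled_w, -1);
  for (int ph = 0; ph < pooled_h; ++ph) {
    for (int pw = 0; pw < pooled_w; ++pw) {
      // start (included) = floor(ph * bin_h), end (excluded) = ceil((ph+1) * bin_h),
      // shifted by the ROI origin and clamped to the feature map.
      int hs = std::min(std::max(static_cast<int>(std::floor(ph * bin_h)) + roi_start_h, 0), H);
      int he = std::min(std::max(static_cast<int>(std::ceil((ph + 1) * bin_h)) + roi_start_h, 0), H);
      int ws = std::min(std::max(static_cast<int>(std::floor(pw * bin_w)) + roi_start_w, 0), W);
      int we = std::min(std::max(static_cast<int>(std::ceil((pw + 1) * bin_w)) + roi_start_w, 0), W);
      bool empty = (he <= hs) || (we <= ws);
      float best = empty ? 0.f : -std::numeric_limits<float>::max();
      int best_idx = -1;
      for (int h = hs; h < he; ++h)
        for (int w = ws; w < we; ++w)
          if (feat[h * W + w] > best) { best = feat[h * W + w]; best_idx = h * W + w; }
      (*out)[ph * pooled_w + pw] = best;
      (*argmax)[ph * pooled_w + pw] = best_idx;  // -1 marks an empty bin
    }
  }
}

The recorded argmax is exactly what the backward kernels above consume: each output gradient is routed (atomically, in the CUDA version) to the single input element that won the max, and empty bins (argmax == -1) are skipped.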
-#include "paddle/fluid/operators/roll_op.h" - #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -29,43 +32,6 @@ class RollOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of RollOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of RollOp should not be null.")); - - auto dims = ctx->Attrs().Get>("axis"); - auto shifts = ctx->Attrs().Get>("shifts"); - - if (!ctx->HasInput("ShiftsTensor")) { - if (dims.size() != 0) { - PADDLE_ENFORCE_EQ(dims.size(), shifts.size(), - platform::errors::InvalidArgument( - "When dims.size() != 0, dims.size() " - "should be equal to " - "shifts.size(). But received " - "dims.size() = %d, shifts.size() = %d", - dims.size(), shifts.size())); - } else { - PADDLE_ENFORCE_EQ(shifts.size(), 1, - platform::errors::InvalidArgument( - "When dims.size() == 0, shifts.size() " - "should be equal to 1, But received " - "shifts.size() = %d", - shifts.size())); - } - } - - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - auto type = ctx->GetInputsVarType("X")[0]; - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -149,29 +115,15 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RollGradNoNeedBufferVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(roll, RollInferShapeFunctor, + PD_INFER_META(phi::RollInferMeta)); + REGISTER_OPERATOR(roll, ops::RollOp, ops::RollOpMaker, ops::RollGradMaker, - ops::RollGradMaker); + ops::RollGradMaker, + RollInferShapeFunctor); REGISTER_OPERATOR(roll_grad, ops::RollGradOp, ops::RollGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - roll, ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel>, - ops::RollKernel>); -REGISTER_OP_CPU_KERNEL( - roll_grad, ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel>, - ops::RollGradKernel>); REGISTER_OP_VERSION(roll) .AddCheckpoint( diff --git a/paddle/fluid/operators/roll_op.cu b/paddle/fluid/operators/roll_op.cu deleted file mode 100644 index b9064c5450f9f..0000000000000 --- a/paddle/fluid/operators/roll_op.cu +++ /dev/null @@ -1,225 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/roll_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/core/utils/array.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -__global__ void RollCudaKernel(const T* input, T* output, int64_t N, - phi::Array shifts, - phi::Array strides, - phi::Array sizes) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - - int64_t output_idx = idx; - int64_t new_dim_idx = 0; - -#pragma unroll - for (size_t i = 0; i < Rank; i++) { - new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i]; - if (new_dim_idx >= sizes[i]) { - output_idx += (shifts[i] - sizes[i]) * strides[i]; - } else { - output_idx += shifts[i] * strides[i]; - } - } - output[output_idx] = input[idx]; -} - -template -class RollKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - PADDLE_ENFORCE_EQ( - shifts_tensor->dims().size(), 1, - platform::errors::InvalidArgument( - "The rank of ShiftsTensor is expected to be 1, got %s", - shifts_tensor->dims().size())); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - auto stream = - context.template device_context().stream(); - - size_t nums = shifts.size(); - auto input_dim = in->dims(); - auto stride_dim = phi::stride(input_dim); - - std::vector strides(nums), sizes(nums); - if (dims.size() == 0) { - strides[0] = 1; - sizes[0] = numel; - shifts[0] = (shifts[0] % numel + numel) % numel; - } else { - for (size_t i = 0; i < nums; i++) { - int dim = dims[i] >= 0 ? 
dims[i] : dims[i] + input_dim.size(); - int64_t size = input_dim[dim]; - - if (size != 0) { - shifts[i] = (shifts[i] % size + size) % size; - strides[i] = stride_dim[dim]; - sizes[i] = size; - } - } - } - -#define CALL_ROLL_CUDA_KERNEL(N) \ - case N: { \ - phi::Array _strides; \ - phi::Array _shifts; \ - phi::Array _sizes; \ - for (size_t idx = 0; idx < N; ++idx) { \ - _strides[idx] = strides[idx]; \ - _shifts[idx] = shifts[idx]; \ - _sizes[idx] = sizes[idx]; \ - } \ - RollCudaKernel< \ - T, \ - N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \ - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, numel, \ - _shifts, _strides, _sizes); \ - break; \ - } - - switch (nums) { - CALL_ROLL_CUDA_KERNEL(1); - CALL_ROLL_CUDA_KERNEL(2); - CALL_ROLL_CUDA_KERNEL(3); - CALL_ROLL_CUDA_KERNEL(4); - CALL_ROLL_CUDA_KERNEL(5); - CALL_ROLL_CUDA_KERNEL(6); - CALL_ROLL_CUDA_KERNEL(7); - CALL_ROLL_CUDA_KERNEL(8); - CALL_ROLL_CUDA_KERNEL(9); - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "shifts.size() should be less than 10, But received shifts.size() " - "= %d", - shifts.size())); - } - } -}; - -template -class RollGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input(framework::GradVarName("Out")); - auto* out = context.Output(framework::GradVarName("X")); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - PADDLE_ENFORCE_EQ( - shifts_tensor->dims().size(), 1, - platform::errors::InvalidArgument( - "The rank of ShiftsTensor is expected to be 1, got %s", - shifts_tensor->dims().size())); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - auto stream = - context.template device_context().stream(); - size_t nums = shifts.size(); - auto input_dim = in->dims(); - auto stride_dim = phi::stride(input_dim); - - std::vector strides(nums), sizes(nums); - if (dims.size() == 0) { - strides[0] = 1; - sizes[0] = numel; - shifts[0] = ((-shifts[0]) % numel + numel) % numel; - } else { - for (size_t i = 0; i < nums; i++) { - int dim = dims[i] >= 0 ? 
dims[i] : dims[i] + input_dim.size(); - int64_t size = input_dim[dim]; - if (size != 0) { - shifts[i] = ((-shifts[i]) % size + size) % size; - strides[i] = stride_dim[dim]; - sizes[i] = size; - } - } - } - - switch (nums) { - CALL_ROLL_CUDA_KERNEL(1); - CALL_ROLL_CUDA_KERNEL(2); - CALL_ROLL_CUDA_KERNEL(3); - CALL_ROLL_CUDA_KERNEL(4); - CALL_ROLL_CUDA_KERNEL(5); - CALL_ROLL_CUDA_KERNEL(6); - CALL_ROLL_CUDA_KERNEL(7); - CALL_ROLL_CUDA_KERNEL(8); - CALL_ROLL_CUDA_KERNEL(9); - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "shifts.size() should be less than 10, But received shifts.size() " - "= %d", - shifts.size())); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - roll, ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel>, - ops::RollKernel>); -REGISTER_OP_CUDA_KERNEL( - roll_grad, ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel>, - ops::RollGradKernel>); diff --git a/paddle/fluid/operators/roll_op.h b/paddle/fluid/operators/roll_op.h deleted file mode 100644 index 413c7bcfc15eb..0000000000000 --- a/paddle/fluid/operators/roll_op.h +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
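[editor's note] The deleted RollCudaKernel maps each flat input index to its destination by walking the rolled dimensions. The per-thread index math can be checked on the host; a sketch under the kernel's own conventions (shifts already normalized into [0, size) per axis, strides measured in elements):

#include <cstdint>
#include <vector>

// Host-side replica of the index remapping in the removed kernel: for each
// rolled axis i, the coordinate (idx / strides[i]) % sizes[i] advances by
// shifts[i], wrapping around sizes[i].
int64_t RollDestIndex(int64_t idx, const std::vector<int64_t>& shifts,
                      const std::vector<int64_t>& strides,
                      const std::vector<int64_t>& sizes) {
  int64_t out = idx;
  for (size_t i = 0; i < shifts.size(); ++i) {
    int64_t new_coord = (idx / strides[i]) % sizes[i] + shifts[i];
    out += (new_coord >= sizes[i]) ? (shifts[i] - sizes[i]) * strides[i]
                                   : shifts[i] * strides[i];
  }
  return out;  // output[out] = input[idx]
}

The backward kernel reuses the same formula with every shift negated and re-normalized as ((-s) % size + size) % size, which is why the grad specialization differs only in that preprocessing step.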
- -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DDim = framework::DDim; - -template -inline void shift_along_dim(T* data, const DDim& input_dim, int64_t dim, - int64_t shift) { - if (dim < 0) { - dim += input_dim.size(); - } - if (input_dim[dim] == 0) { - return; - } - shift = shift % input_dim[dim]; - if (shift < 0) { - shift += input_dim[dim]; - } - - auto outer_loops = 1; - for (auto i = 0; i < dim; i++) { - outer_loops *= input_dim[i]; - } - auto slice_width = 1; - for (auto i = dim + 1; i < input_dim.size(); i++) { - slice_width *= input_dim[i]; - } - - VLOG(3) << "shift_along_dim_debug: input_dim: " << input_dim - << "; dim: " << dim << "; shift: " << shift - << "; outer_loops: " << outer_loops - << "; slice_width: " << slice_width; - if (shift == 0) { - return; - } - - std::vector head; - auto head_size = slice_width * (input_dim[dim] - shift); - head.resize(head_size); - - for (auto i = 0; i < outer_loops; i++) { - for (auto j = 0; j < head_size; j++) { - head[j] = data[i * input_dim[dim] * slice_width + j]; - } - for (auto j = input_dim[dim] - shift; j < input_dim[dim]; j++) { - auto dst_pos = j - input_dim[dim] + shift; - for (auto k = 0; k < slice_width; k++) { - data[(i * input_dim[dim] + dst_pos) * slice_width + k] = - data[(i * input_dim[dim] + j) * slice_width + k]; - } - } - for (auto j = 0; j < head_size; j++) { - data[(i * input_dim[dim] + shift) * slice_width + j] = head[j]; - } - } -} - -template -class RollKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input_var = context.InputVar("X"); - auto* output_var = context.OutputVar("Out"); - auto& input = input_var->Get(); - auto* output = output_var->GetMutable(); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - PADDLE_ENFORCE_EQ( - shifts_tensor->dims().size(), 1, - platform::errors::InvalidArgument( - "The rank of ShiftsTensor is expected to be 1, got %s", - shifts_tensor->dims().size())); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - std::vector out_vec; - paddle::framework::TensorToVector(input, context.device_context(), - &out_vec); - - size_t nums = shifts.size(); - DDim input_dim = input.dims(); - - // axis = none, reshape to 1-D tensor - if (dims.size() == 0) { - dims.push_back(0l); - input_dim = framework::Dim<1>(out_vec.size()); - } - - for (size_t i = 0; i < nums; i++) { - PADDLE_ENFORCE_EQ( - dims[i] < input_dim.size() && dims[i] >= (0 - input_dim.size()), true, - platform::errors::OutOfRange( - "Attr(axis[%d]) is out of range, It's expected " - "to be in range of [-%d, %d]. 
But received Attr(axis[%d]) = %d.", - i, input_dim.size(), input_dim.size() - 1, i, dims[i])); - shift_along_dim(out_vec.data(), input_dim, dims[i], shifts[i]); - } - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(out_vec, context.device_context(), output); - output->Resize(input.dims()); - } -}; - -template -class RollGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input_var = context.InputVar(framework::GradVarName("Out")); - auto* output_var = context.OutputVar(framework::GradVarName("X")); - auto& input = input_var->Get(); - auto* output = output_var->GetMutable(); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - std::vector out_vec; - paddle::framework::TensorToVector(input, context.device_context(), - &out_vec); - - size_t nums = shifts.size(); - DDim input_dim = input.dims(); - - // axis = none, reshape to 1-D tensor - if (dims.size() == 0) { - dims.push_back(0l); - input_dim = framework::Dim<1>(out_vec.size()); - } - - for (size_t i = 0; i < nums; i++) { - shift_along_dim(out_vec.data(), input_dim, dims[i], 0 - shifts[i]); - } - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(out_vec, context.device_context(), output); - output->Resize(input.dims()); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc index 815984ac307fd..d5ef95269b48a 100644 --- a/paddle/fluid/operators/scatter_op_npu.cc +++ b/paddle/fluid/operators/scatter_op_npu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/kron_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/searchsorted_op.cc b/paddle/fluid/operators/searchsorted_op.cc index bbd5b9c4e7db9..3a6fdbaa2613d 100644 --- a/paddle/fluid/operators/searchsorted_op.cc +++ b/paddle/fluid/operators/searchsorted_op.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
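[editor's note] The removed CPU path copies the tensor into a host vector and rotates it slice by slice via shift_along_dim. In the 1-D case (axis = none) the whole routine collapses to std::rotate; a sketch, assuming the shift is reduced modulo the length exactly as the kernel does:

#include <algorithm>
#include <vector>

// 1-D equivalent of the removed shift_along_dim: roll right by `shift`.
void Roll1D(std::vector<float>* data, int64_t shift) {
  int64_t n = static_cast<int64_t>(data->size());
  if (n == 0) return;
  shift = ((shift % n) + n) % n;  // same normalization as the kernel
  // Rolling right by `shift` == rotating left by n - shift.
  std::rotate(data->begin(), data->begin() + (n - shift), data->end());
}

For example, Roll1D on {1,2,3,4,5} with shift 2 yields {4,5,1,2,3}; the grad kernel simply rolls by -shift to undo the permutation.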
-#include "paddle/fluid/operators/searchsorted_op.h" - +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -22,60 +23,6 @@ namespace operators { class SearchSortedOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - static bool SearchsortedDimsMatchedBeforeLastDim( - const framework::DDim& sequences_dims, - const framework::DDim& values_dims) { - if (sequences_dims.size() != values_dims.size()) { - return false; - } - const auto& sequences_dims_size = sequences_dims.size(); - for (int64_t dim = 0; dim < sequences_dims_size - 1; ++dim) { - if (sequences_dims[dim] != values_dims[dim]) { - return false; - } - } - return true; - } - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("SortedSequence"), "Input", "SortedSequence", - "searchsorted"); - OP_INOUT_CHECK(ctx->HasInput("Values"), "Input", "Values", "searchsorted"); - - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "searchsorted"); - - auto sequences_dims = ctx->GetInputDim("SortedSequence"); - auto values_dims = ctx->GetInputDim("Values"); - auto out_int32 = ctx->Attrs().Get("out_int32"); - - if (sequences_dims.size() != 1) { - PADDLE_ENFORCE_EQ( - SearchsortedDimsMatchedBeforeLastDim(sequences_dims, values_dims), - true, - platform::errors::Unavailable( - "The dimensions of sorted_sequence tensor ( %s ) and values " - "tensor ( %s ) can not match. Because the input sorted_sequence " - "tensor must be 1 dimension or the first N-1 dimensions of " - "sorted_sequence tensor and input values tensor must match. " - "Please input appropriate sorted_sequence and values again! ", - sequences_dims, values_dims)); - } - - if (out_int32) { - PADDLE_ENFORCE_LT( - sequences_dims[sequences_dims.size() - 1], - std::numeric_limits::max(), - platform::errors::Unavailable( - "The size of sorted_sequence %d exceed the maximum limit d%. " - "Because the size of sorted_sequence should be less than the " - "output maximum value for int32 bit. Please set appropriate " - "sorted_sequence to meet this requirement! ", - sequences_dims[sequences_dims.size() - 1], - std::numeric_limits::max())); - } - - ctx->SetOutputDim("Out", values_dims); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -116,11 +63,7 @@ class SearchSortedOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; -REGISTER_OPERATOR(searchsorted, ops::SearchSortedOp, ops::SearchSortedOpMaker); - -REGISTER_OP_CPU_KERNEL( - searchsorted, - ops::SearchSortedKernel, - ops::SearchSortedKernel, - ops::SearchSortedKernel, - ops::SearchSortedKernel); +DECLARE_INFER_SHAPE_FUNCTOR(searchsorted, SearchsortedInferShapeFunctor, + PD_INFER_META(phi::SearchsortedInferMeta)); +REGISTER_OPERATOR(searchsorted, ops::SearchSortedOp, ops::SearchSortedOpMaker, + SearchsortedInferShapeFunctor); diff --git a/paddle/fluid/operators/segment_pool_op.cc b/paddle/fluid/operators/segment_pool_op.cc index 322cd97f01c3a..9d4c8532a82c0 100644 --- a/paddle/fluid/operators/segment_pool_op.cc +++ b/paddle/fluid/operators/segment_pool_op.cc @@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/segment_pool_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -23,22 +26,6 @@ class SegmentPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SegmentPool"); - OP_INOUT_CHECK(ctx->HasInput("SegmentIds"), "Input", "SegmentIds", - "SegmentPool"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SegmentPool"); - auto dims = ctx->GetInputDim("X"); - dims[0] = -1; - ctx->SetOutputDim("Out", dims); - - if (ctx->Attrs().Get("pooltype") == "MEAN") { - OP_INOUT_CHECK(ctx->HasOutput("SummedIds"), "Output", "SummedIds", - "SegmentPool"); - ctx->SetOutputDim("SummedIds", {-1, 1}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -150,17 +137,11 @@ class SegmentPoolGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(segment_pool, SegmentPoolInferShapeFunctor, + PD_INFER_META(phi::SegmentPoolInferMeta)); + REGISTER_OPERATOR(segment_pool, ops::SegmentPoolOp, ops::SegmentPoolOpMaker, ops::SegmentPoolGradOpMaker, - ops::SegmentPoolGradOpMaker); + ops::SegmentPoolGradOpMaker, + SegmentPoolInferShapeFunctor); REGISTER_OPERATOR(segment_pool_grad, ops::SegmentPoolGradOp); - -REGISTER_OP_CPU_KERNEL( - segment_pool, - ops::SegmentPoolKernel, - ops::SegmentPoolKernel); - -REGISTER_OP_CPU_KERNEL( - segment_pool_grad, - ops::SegmentPoolGradKernel, - ops::SegmentPoolGradKernel); diff --git a/paddle/fluid/operators/segment_pool_op.cu b/paddle/fluid/operators/segment_pool_op.cu deleted file mode 100644 index e147e62a98354..0000000000000 --- a/paddle/fluid/operators/segment_pool_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/segment_pool_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - segment_pool, - ops::SegmentPoolKernel, - ops::SegmentPoolKernel); -REGISTER_OP_CUDA_KERNEL( - segment_pool_grad, - ops::SegmentPoolGradKernel, - ops::SegmentPoolGradKernel); diff --git a/paddle/fluid/operators/segment_pool_op.h b/paddle/fluid/operators/segment_pool_op.h deleted file mode 100644 index 2f5ef7f54f988..0000000000000 --- a/paddle/fluid/operators/segment_pool_op.h +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/segment_pooling.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) { - auto* input = context.Input("X"); - auto* segment = context.Input("SegmentIds"); - auto* output = context.Output("Out"); - std::string pooltype = context.Attr("pooltype"); - Tensor* summed_ids = nullptr; - - int64_t num_indices = segment->numel(); - PADDLE_ENFORCE_EQ( - num_indices, input->dims()[0], - platform::errors::InvalidArgument( - "Segment_ids should be the same size as dimension 0 of input X.")); - PADDLE_ENFORCE_EQ(num_indices, segment->dims()[0], - platform::errors::InvalidArgument( - "Segment_ids should be 1-D tensor, or it's other " - "dimension size is 1. Segment_ids's shape is: [%s].", - segment->dims())); - - if (input->numel() == 0 || segment->numel() == 0) { - return; - } - - bool cpu_place = context.GetPlace().GetType() == phi::AllocationType::CPU; - if (cpu_place) { - auto dims = input->dims(); - auto* segment_ids = segment->data(); - dims[0] = static_cast(segment_ids[segment->numel() - 1] + 1); - PADDLE_ENFORCE_GT( - dims[0], 0, - platform::errors::InvalidArgument( - "Segment ids must be >= 0, but got last id %d", dims[0])); - output->Resize({dims}); - output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, output, static_cast(0)); - } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (!cpu_place) { - Tensor length; - length.mutable_data(phi::make_ddim({1}), platform::CPUPlace()); - IndexT* length_data = length.data(); - const IndexT* segment_ids = segment->data(); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - hipMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), - hipMemcpyDeviceToHost)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), - cudaMemcpyDeviceToHost)); -#endif - - IndexT length_host = length_data[0]; - length_host++; - PADDLE_ENFORCE_GT( - length_host, 0, - platform::errors::InvalidArgument( - "Segment ids must be >= 0, but got last id %d", length_data[0])); - auto dims = input->dims(); - dims[0] = static_cast(length_host); - output->Resize({dims}); - output->mutable_data(context.GetPlace()); - T init_value = 0; - if (pooltype == "MAX") { - init_value = static_cast(-FLT_MAX); - } else if (pooltype == "MIN") { - init_value = static_cast(FLT_MAX); - } - phi::funcs::SetConstant setconst; - auto& dev_ctx = context.template device_context(); - setconst(dev_ctx, output, static_cast(init_value)); - // the gpu kernel of mean pool record the counts of segment_ids - if 
(pooltype == "MEAN") { - summed_ids = context.Output("SummedIds"); - summed_ids->Resize({dims[0], 1}); - summed_ids->mutable_data(context.GetPlace()); - setconst(dev_ctx, summed_ids, static_cast(1e-12)); - } - } -#endif - - SegmentPoolFunctor pool; - - pool(context.template device_context(), *input, *segment, - output, summed_ids, pooltype); -} - -template -class SegmentPoolKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* segment = context.Input("SegmentIds"); - auto index_type = framework::TransToProtoVarType(segment->dtype()); - if (index_type == framework::proto::VarType::INT32) { - SegmentKernelLaunchHelper(context); - } else if (index_type == framework::proto::VarType::INT64) { - SegmentKernelLaunchHelper(context); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported index type, Expected int, int64, but got %s.", - index_type)); - } - } -}; - -template -class SegmentPoolGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Input("Out"); - auto* segment = context.Input("SegmentIds"); - auto* out_g = context.Input(framework::GradVarName("Out")); - auto* in_g = context.Output(framework::GradVarName("X")); - std::string pooltype = context.Attr("pooltype"); - - const Tensor* summed_ids = nullptr; - if (pooltype == "MEAN") { - summed_ids = context.Input("SummedIds"); - } - - in_g->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, in_g, static_cast(0)); - - auto index_type = framework::TransToProtoVarType(segment->dtype()); - if (index_type == framework::proto::VarType::INT32) { - SegmentPoolGradFunctor pool; - pool(context.template device_context(), *input, *output, - *out_g, *segment, in_g, summed_ids, pooltype); - } else if (index_type == framework::proto::VarType::INT64) { - SegmentPoolGradFunctor pool; - pool(context.template device_context(), *input, *output, - *out_g, *segment, in_g, summed_ids, pooltype); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported index type, Expected int, int64, but got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 7d0d782b837c4..73655bcb18500 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -13,9 +13,15 @@ // limitations under the License. 
#include "paddle/fluid/operators/set_value_op.h" + #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace framework { class InferShapeContext; @@ -34,6 +40,8 @@ class CPUDeviceContext; namespace paddle { namespace operators { +using Tensor = framework::Tensor; + class SetValue : public framework::OperatorWithKernel { public: SetValue(const std::string &type, const framework::VariableNameMap &inputs, @@ -41,17 +49,6 @@ class SetValue : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "SetValue"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SetValue"); - auto in_dims = ctx->GetInputDim("Input"); - PADDLE_ENFORCE_LT( - in_dims.size(), 7, - platform::errors::InvalidArgument( - "The rank of input should be less than 7, but received %d.", - in_dims.size())); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -236,21 +233,16 @@ DECLARE_INPLACE_OP_INFERER(SetValueOpInplaceInferer, {"Input", "Out"}); namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(set_value, SetValueInferShapeFunctor, + PD_INFER_META(phi::SetValueInferMeta)); + REGISTER_OPERATOR(set_value, ops::SetValue, ops::SetValueMaker, ops::SetValueGradMaker, ops::SetValueGradMaker, - ops::SetValueOpInplaceInferer); + ops::SetValueOpInplaceInferer, SetValueInferShapeFunctor); REGISTER_OPERATOR(set_value_grad, ops::SetValueGrad); -REGISTER_OP_CPU_KERNEL( - set_value_grad, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel); - REGISTER_OP_VERSION(set_value) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/set_value_op.cu b/paddle/fluid/operators/set_value_op.cu deleted file mode 100644 index 9f291a863c067..0000000000000 --- a/paddle/fluid/operators/set_value_op.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/set_value_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - set_value_grad, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel); diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index 4d459f8c01b15..4696907f32e6d 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -19,14 +19,10 @@ #include #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/assign_value_op.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/slice_utils.h" -#include "paddle/fluid/operators/strided_slice_op.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/enforce.h" @@ -36,23 +32,6 @@ namespace operators { using Tensor = framework::Tensor; using DDim = framework::DDim; -inline void GetOffsets(const DDim& big_dim, const DDim& small_dim, - DDim start_offset, int cur_dim, - std::vector* offsets) { - if (cur_dim == big_dim.size()) { - offsets->push_back(start_offset); - return; - } - if (small_dim[cur_dim] == big_dim[cur_dim]) { - GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets); - } else { - for (int i = 0; i < big_dim[cur_dim]; i++) { - GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets); - start_offset[cur_dim] += 1; - } - } -} - inline std::string GetValueName(framework::proto::VarType::Type data_type) { std::string value_name; switch (data_type) { @@ -121,253 +100,6 @@ inline void CheckIsDimsMatch(const framework::DDim first, "of target shape: %d, but now shape is %d.", second.to_str(), first.to_str())); } -template -class SetValueGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - int rank = ctx.Input(framework::GradVarName("Out"))->dims().size(); - - switch (rank) { - case 1: - SetValueGradCompute<1>(ctx); - break; - case 2: - SetValueGradCompute<2>(ctx); - break; - case 3: - SetValueGradCompute<3>(ctx); - break; - case 4: - SetValueGradCompute<4>(ctx); - break; - case 5: - SetValueGradCompute<5>(ctx); - break; - case 6: - SetValueGradCompute<6>(ctx); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "The rank of set_value_grad's input should be less than 7, but " - "received %d.", - rank)); - } - } - - private: - template - void SetValueGradCompute(const framework::ExecutionContext& context) const { - auto starts = context.Attr>("starts"); - auto ends = context.Attr>("ends"); - auto steps = context.Attr>("steps"); - - auto axes_int64 = context.Attr>("axes"); - std::vector axes(axes_int64.begin(), axes_int64.end()); - - auto starts_indices = Eigen::DSizes(); - auto ends_indices = Eigen::DSizes(); - auto steps_indices = Eigen::DSizes(); - auto reverse_axis = Eigen::array(); - - auto list_new_ends_tensor = - context.MultiInput("EndsTensorList"); - auto list_new_starts_tensor = - context.MultiInput("StartsTensorList"); - auto list_new_steps_tensor = - context.MultiInput("StepsTensorList"); - - if (list_new_starts_tensor.size() > 0) { - starts = GetDataFromTensorList(list_new_starts_tensor); - } - - if (list_new_ends_tensor.size() > 0) { - ends = 
GetDataFromTensorList(list_new_ends_tensor); - } - - if (list_new_steps_tensor.size() > 0) { - steps = GetDataFromTensorList(list_new_steps_tensor); - } - - auto in = context.Input(framework::GradVarName("Out")); - PADDLE_ENFORCE_EQ( - in->IsInitialized(), true, - platform::errors::PermissionDenied( - "The input of `set_value_grad`(%s) has not been initialized", - framework::GradVarName("Out"))); - auto grad_value = context.Output( - framework::GradVarName("ValueTensor")); - auto grad_input = - context.Output(framework::GradVarName("Input")); - auto in_dims = in->dims(); - - auto decrease_axis_int64 = - context.Attr>("decrease_axes"); - std::vector decrease_axis(decrease_axis_int64.begin(), - decrease_axis_int64.end()); - std::vector infer_flags(axes.size(), 1); - std::vector out_dims_vector(in_dims.size(), -1); - StridedSliceOutDims(starts, ends, steps, axes, infer_flags, in_dims, - decrease_axis, out_dims_vector.data(), axes.size(), - false); - - framework::DDim out_dims(phi::make_ddim(out_dims_vector)); - - std::vector reverse_vector(starts.size(), 0); - StridedSliceFunctor(starts.data(), ends.data(), steps.data(), axes.data(), - reverse_vector.data(), in_dims, infer_flags, - decrease_axis, starts.size()); - - for (size_t axis = 0; axis < D; axis++) { - starts_indices[axis] = 0; - ends_indices[axis] = out_dims[axis]; - steps_indices[axis] = 1; - reverse_axis[axis] = false; - } - - for (size_t axis = 0; axis < axes.size(); axis++) { - int axis_index = axes[axis]; - starts_indices[axis_index] = starts[axis]; - ends_indices[axis_index] = ends[axis]; - steps_indices[axis_index] = steps[axis]; - reverse_axis[axis_index] = (reverse_vector[axis] == 1) ? true : false; - } - - bool need_reverse = false; - for (size_t axis = 0; axis < axes.size(); axis++) { - if (reverse_vector[axis] == 1) { - need_reverse = true; - break; - } - } - - auto& dev_ctx = context.template device_context(); - auto& place = - *context.template device_context().eigen_device(); - phi::funcs::SetConstant set_zero; - - if (grad_input) { - // Set gradient of `Input` - paddle::framework::TensorCopy(*in, context.GetPlace(), grad_input); - - auto grad_input_t = - framework::EigenTensor::From(*grad_input); - - framework::Tensor tmp(grad_input->dtype()); - tmp.mutable_data(out_dims, context.GetPlace()); - set_zero(dev_ctx, &tmp, static_cast(0)); - auto tmp_t = framework::EigenTensor::From(tmp); - - grad_input_t.stridedSlice(starts_indices, ends_indices, steps_indices) - .device(place) = tmp_t; - } - if (grad_value) { - grad_value->mutable_data(context.GetPlace()); - set_zero(dev_ctx, grad_value, static_cast(0)); - - auto in_t = framework::EigenTensor::From(*in); - - if (grad_value->dims() == out_dims) { - auto grad_value_t = - framework::EigenTensor::From(*grad_value); - if (need_reverse) { - framework::Tensor tmp(grad_value->dtype()); - tmp.mutable_data(out_dims, context.GetPlace()); - set_zero(dev_ctx, &tmp, static_cast(0)); - auto tmp_t = framework::EigenTensor::From(tmp); - - tmp_t.device(place) = - in_t.stridedSlice(starts_indices, ends_indices, steps_indices); - grad_value_t.device(place) = tmp_t.reverse(reverse_axis); - } else { - grad_value_t.device(place) = - in_t.stridedSlice(starts_indices, ends_indices, steps_indices); - } - } else { - int out_dims_size = out_dims.size(); - auto grad_value_dims = grad_value->dims(); - auto fake_grad_value_dims = out_dims; - - // Create an extented shape according to the rules of broadcast. 
- auto grad_value_dims_size = grad_value_dims.size(); - - int num_decrease = 0; - - int decrease_axis_size = decrease_axis.size(); - for (int i = 0; i < out_dims_size; i++) { - if (decrease_axis.end() != - std::find(decrease_axis.begin(), decrease_axis.end(), i)) { - fake_grad_value_dims[i] = 1; - num_decrease++; - } else if (i < out_dims_size - (grad_value_dims_size + - decrease_axis_size - num_decrease)) { - fake_grad_value_dims[i] = 1; - } else { - auto index_grad = - i - (out_dims_size - (grad_value_dims_size + - decrease_axis_size - num_decrease)); - fake_grad_value_dims[i] = grad_value_dims[index_grad]; - - PADDLE_ENFORCE_EQ((out_dims[i] == grad_value_dims[index_grad]) || - (grad_value_dims[index_grad] == 1), - true, - platform::errors::InvalidArgument( - "An error occurred while calculating %s: " - "[%s] can not be accumulated into [%s].", - framework::GradVarName("ValueTensor"), - out_dims, grad_value_dims)); - } - } - - VLOG(3) << "Dimensions of " << framework::GradVarName("ValueTensor") - << "([" << grad_value_dims << "])is broadcasted into [" - << fake_grad_value_dims << "]."; - - auto extent = Eigen::DSizes(); - auto offset = out_dims; - for (int i = 0; i < out_dims_size; i++) { - offset[i] = 0; - extent[i] = fake_grad_value_dims[i]; - } - std::vector offsets; - GetOffsets(out_dims, fake_grad_value_dims, offset, 0, &offsets); - - auto grad_value_t = - framework::EigenTensor:: - From(*grad_value, fake_grad_value_dims); - - framework::Tensor tmp(grad_value->dtype()); - tmp.mutable_data(out_dims, context.GetPlace()); - set_zero(dev_ctx, &tmp, static_cast(0)); - auto tmp_t = framework::EigenTensor::From(tmp); - - tmp_t.device(place) = - in_t.stridedSlice(starts_indices, ends_indices, steps_indices); - - // accumulate gradient - for (auto offset : offsets) { - grad_value_t.device(place) = - grad_value_t + - tmp_t.slice(framework::EigenDim::From(offset), extent); - } - if (need_reverse) { - framework::Tensor tmp_value(grad_value->dtype()); - tmp_value.mutable_data(fake_grad_value_dims, context.GetPlace()); - auto tmp_value_t = - framework::EigenTensor::From(tmp_value); - tmp_value_t.device(place) = grad_value_t.reverse(reverse_axis); - grad_value_t.device(place) = tmp_value_t; - } - } - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc index 599697059c4dc..46d64333b608b 100644 --- a/paddle/fluid/operators/set_value_op_npu.cc +++ b/paddle/fluid/operators/set_value_op_npu.cc @@ -174,6 +174,9 @@ class SetValueNPUKernel : public framework::OpKernel { .AddInput(std::move(index_indices)) .AddInput(val_temp) .AddOutput(out_temp) +#if (CANN_VERSION_CODE >= 504001) + .AddAttrs({{"use_locking", false}}) +#endif .Run(stream); } }; diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index 5b7ccdde81097..9001ce5d51dec 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
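[editor's note] The ~250 lines removed here implement the calculus of out = x; out[starts:ends:steps] = v. The gradient w.r.t. Input is grad(Out) with the assigned window zeroed out, and the gradient w.r.t. ValueTensor is that window gathered from grad(Out), then accumulated over broadcast axes (the GetOffsets walk) and reversed for negative steps. A 1-D sketch of the two rules, ignoring broadcasting and negative steps:

#include <cstdint>
#include <vector>

// Gradients of: out = x; out[start:end:step] = v   (1-D, step > 0)
void SetValueGrad1D(const std::vector<float>& grad_out, int64_t start,
                    int64_t end, int64_t step,
                    std::vector<float>* grad_x, std::vector<float>* grad_v) {
  *grad_x = grad_out;                // gradient passes through everywhere...
  grad_v->clear();
  for (int64_t i = start; i < end; i += step) {
    grad_v->push_back(grad_out[i]);  // ...the window's gradient flows to v
    (*grad_x)[i] = 0.f;              // ...and is cut off from x
  }
}

The removed Eigen code expresses exactly this with stridedSlice: copy grad_out into grad_input, overwrite the sliced view with zeros, and slice grad_out to fill grad_value; when v was broadcast, each broadcast copy is accumulated back into the smaller grad_value via the enumerated offsets.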
*/ -#include "paddle/fluid/operators/shape_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -24,17 +25,6 @@ class ShapeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, - platform::errors::InvalidArgument( - "Input (Input) of get_shape op should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output (Out) of get_shape op should not be null.")); - auto in_dim = ctx->GetInputDim("Input"); - ctx->SetOutputDim("Out", {in_dim.size()}); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto input_data_type = @@ -91,13 +81,12 @@ Return the shape of the input. namespace ops = paddle::operators; namespace plat = paddle::platform; + +DECLARE_INFER_SHAPE_FUNCTOR(shape, ShapeInferShapeFunctor, + PD_INFER_META(phi::ShapeInferMeta)); + REGISTER_OPERATOR( shape, ops::ShapeOp, ops::ShapeOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, - ops::ShapeKernel>, - ops::ShapeKernel>); + paddle::framework::EmptyGradOpMaker, + ShapeInferShapeFunctor); diff --git a/paddle/fluid/operators/shape_op.cu b/paddle/fluid/operators/shape_op.cu deleted file mode 100644 index c6e380a94f84d..0000000000000 --- a/paddle/fluid/operators/shape_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/shape_op.h" -#include "paddle/fluid/platform/complex.h" - -REGISTER_OP_CUDA_KERNEL( - shape, paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel>, - paddle::operators::ShapeKernel>); diff --git a/paddle/fluid/operators/shape_op.h b/paddle/fluid/operators/shape_op.h deleted file mode 100644 index 39ebcca46a710..0000000000000 --- a/paddle/fluid/operators/shape_op.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = phi::SelectedRows; - -template -class ShapeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_var = ctx.InputVar("Input"); - framework::DDim in_dims; - if (in_var->IsType()) { - in_dims = in_var->Get().value().dims(); - } else { - in_dims = in_var->Get().dims(); - } - auto* out_t = ctx.Output("Out"); - out_t->Resize({in_dims.size()}); - auto out_data = out_t->mutable_data(platform::CPUPlace()); - for (int i = 0; i < in_dims.size(); ++i) { - out_data[i] = in_dims[i]; - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/shape_op_npu.cc b/paddle/fluid/operators/shape_op_npu.cc index 7bff7b2d66834..f751ab41014c2 100644 --- a/paddle/fluid/operators/shape_op_npu.cc +++ b/paddle/fluid/operators/shape_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/shape_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/shape_op_xpu.cc b/paddle/fluid/operators/shape_op_xpu.cc index 2e9092a643253..a62d1b434e764 100644 --- a/paddle/fluid/operators/shape_op_xpu.cc +++ b/paddle/fluid/operators/shape_op_xpu.cc @@ -10,12 +10,41 @@ * limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU +#include +#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/shape_op.h" +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = phi::SelectedRows; + +template +class ShapeXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_var = ctx.InputVar("Input"); + framework::DDim in_dims; + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); + } else { + in_dims = in_var->Get().dims(); + } + auto* out_t = ctx.Output("Out"); + out_t->Resize({in_dims.size()}); + auto out_data = out_t->mutable_data(platform::CPUPlace()); + for (int i = 0; i < in_dims.size(); ++i) { + out_data[i] = in_dims[i]; + } + } +}; +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel); +REGISTER_OP_XPU_KERNEL(shape, ops::ShapeXPUKernel, + ops::ShapeXPUKernel, ops::ShapeXPUKernel, + ops::ShapeXPUKernel, ops::ShapeXPUKernel); #endif diff --git a/paddle/fluid/operators/shard_index_op.cc b/paddle/fluid/operators/shard_index_op.cc index 54555e494ffe5..053a90f2fc9fa 100644 --- a/paddle/fluid/operators/shard_index_op.cc +++ b/paddle/fluid/operators/shard_index_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/shard_index_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,27 +23,6 @@ namespace operators { class ShardIndexOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ShardIndex"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ShardIndex"); - - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE(x_dims.size(), 2, - platform::errors::InvalidArgument( - "Rank of Input(X) should be at least 2, " - "but the value given is %d.", - x_dims.size())); - if (ctx->IsRuntime() || x_dims[x_dims.size() - 1] > 0) { - PADDLE_ENFORCE_EQ(x_dims[x_dims.size() - 1], 1U, - platform::errors::InvalidArgument( - "The last dimension of Input(X) should be 1, " - "but the value given is %d.", - x_dims[x_dims.size() - 1])); - } - - ctx->SetOutputDim("Out", x_dims); - ctx->ShareLoD("X", /* --> */ "Out"); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -114,7 +96,10 @@ the original index should be recalculated (i.e. sharded) before. 
} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(shard_index, ops::ShardIndexOp, - ops::ShardIndexOpMaker); -REGISTER_OP_CPU_KERNEL(shard_index, ops::ShardIndexCPUKernel, - ops::ShardIndexCPUKernel); +DECLARE_INFER_SHAPE_FUNCTOR(shard_index, ShardIndexInferShapeFunctor, + PD_INFER_META(phi::ShardIndexInferMeta)); +REGISTER_OPERATOR( + shard_index, ops::ShardIndexOp, ops::ShardIndexOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ShardIndexInferShapeFunctor); diff --git a/paddle/fluid/operators/shard_index_op.cu b/paddle/fluid/operators/shard_index_op.cu deleted file mode 100644 index 115b3f47d664b..0000000000000 --- a/paddle/fluid/operators/shard_index_op.cu +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/shard_index_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void ShardIndexInner(const T* in_data, T* out_data, - const int64_t numel, const int index_num, - const int nshards, const int shard_id, - const int ignore_value) { - int shard_size = (index_num + nshards - 1) / nshards; - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < numel) { - assert(in_data[idx] >= 0 && in_data[idx] < index_num); - if (in_data[idx] / shard_size == shard_id) { - out_data[idx] = in_data[idx] % shard_size; - } else { - out_data[idx] = ignore_value; - } - } -} - -using LoDTensor = framework::LoDTensor; - -template -class ShardIndexCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - int index_num = context.Attr("index_num"); - int nshards = context.Attr("nshards"); - int shard_id = context.Attr("shard_id"); - int ignore_value = context.Attr("ignore_value"); - PADDLE_ENFORCE_GT( - index_num, 0, - platform::errors::InvalidArgument( - "The value 'index_num' for Op(shard_index) must be greater than 0, " - "but the value given is %d.", - index_num)); - PADDLE_ENFORCE_GT(nshards, 0, - platform::errors::InvalidArgument( - "The value 'nshard' for Op(shard_index) must be " - "greater than 0, but the value given is %d.", - nshards)); - PADDLE_ENFORCE_GE( - shard_id, 0, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be greater or " - "equal to 0, but the value given is %d.", - shard_id)); - PADDLE_ENFORCE_LT( - shard_id, nshards, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be less than " - "nshards (%d), but the value given is %d.", - nshards, shard_id)); - - out->Resize(in->dims()); - out->set_lod(in->lod()); - auto* in_data = in->data(); - auto* out_data = 
out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - auto stream = - context.template device_context().stream(); - ShardIndexInner<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - in_data, out_data, numel, index_num, nshards, shard_id, ignore_value); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(shard_index, ops::ShardIndexCUDAKernel, - ops::ShardIndexCUDAKernel); diff --git a/paddle/fluid/operators/shard_index_op.h b/paddle/fluid/operators/shard_index_op.h deleted file mode 100644 index c2fe3711686d4..0000000000000 --- a/paddle/fluid/operators/shard_index_op.h +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using LoDTensor = framework::LoDTensor; -template -class ShardIndexCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - int index_num = context.Attr("index_num"); - int nshards = context.Attr("nshards"); - int shard_id = context.Attr("shard_id"); - int ignore_value = context.Attr("ignore_value"); - PADDLE_ENFORCE_GT( - index_num, 0, - platform::errors::InvalidArgument( - "The value 'index_num' for Op(shard_index) must be greater than 0, " - "but the value given is %d.", - index_num)); - PADDLE_ENFORCE_GT(nshards, 0, - platform::errors::InvalidArgument( - "The value 'nshard' for Op(shard_index) must be " - "greater than 0, but the value given is %d.", - nshards)); - PADDLE_ENFORCE_GE( - shard_id, 0, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be greater or " - "equal to 0, but the value given is %d.", - shard_id)); - PADDLE_ENFORCE_LT( - shard_id, nshards, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be less than " - "nshards (%d), but the value given is %d.", - nshards, shard_id)); - - int shard_size = (index_num + nshards - 1) / nshards; - - out->Resize(in->dims()); - out->set_lod(in->lod()); - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - for (int64_t i = 0; i < numel; ++i) { - PADDLE_ENFORCE_GE(in_data[i], 0, - platform::errors::InvalidArgument( - "The input_index for Op(shard_index) must be " - "greater or equal to 0, but the value given is %d.", - in_data[i])); - PADDLE_ENFORCE_LT(in_data[i], index_num, - platform::errors::InvalidArgument( - "The input_index for Op(shard_index) must be less " - "than index_num (%d), but the value given is %d.", - index_num, in_data[i])); - if (in_data[i] / shard_size == shard_id) { - out_data[i] = in_data[i] % shard_size; - } else { - out_data[i] = ignore_value; - } - } - } -}; - -} 
// namespace operators
-} // namespace paddle
diff --git a/paddle/fluid/operators/shard_index_op_npu.cc b/paddle/fluid/operators/shard_index_op_npu.cc
index dc2e8ad58f31c..c875448424a24 100644
--- a/paddle/fluid/operators/shard_index_op_npu.cc
+++ b/paddle/fluid/operators/shard_index_op_npu.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/operators/shard_index_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"

 namespace paddle {
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index 374992096605b..3840b99dd176d 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -16,6 +16,7 @@ limitations under the License. */

 #include
 #include

+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"

@@ -23,6 +24,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif

+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/backward.h"
+#include "paddle/phi/infermeta/unary.h"
+
 namespace paddle {
 namespace operators {

@@ -30,30 +35,6 @@ class SoftmaxOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;

-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("X"), true,
-        platform::errors::NotFound("Input(X) of SoftmaxOp is not found."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("Out"), true,
-        platform::errors::NotFound("Output(Out) of SoftmaxOp is not found."));
-
-    auto dim_x = ctx->GetInputDim("X");
-    auto rank_x = dim_x.size();
-    auto axis = ctx->Attrs().Get("axis");
-    PADDLE_ENFORCE_GE(axis, -rank_x,
-                      platform::errors::InvalidArgument(
-                          "Attr(axis) value should be in range [-R, R-1], "
-                          "R is the rank of Input(X)."));
-    PADDLE_ENFORCE_LT(axis, rank_x,
-                      platform::errors::InvalidArgument(
-                          "Attr(axis) value should be in range [-R, R-1], "
-                          "R is the rank of Input(X)."));
-
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -168,23 +149,6 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;

-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("Out"), true,
-        platform::errors::InvalidArgument("Input(Out) is not found."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput(framework::GradVarName("Out")), true,
-        platform::errors::InvalidArgument("Input(Out@GRAD) is not found."));
-    PADDLE_ENFORCE_EQ(
-        ctx->GetInputDim("Out"),
-        ctx->GetInputDim(framework::GradVarName("Out")),
-        platform::errors::InvalidArgument("Input(Out) and its gradients "
-                                          "should have a same shape."));
-
-    ctx->SetOutputDim(framework::GradVarName("X"),
-                      ctx->GetInputDim(framework::GradVarName("Out")));
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -244,9 +208,14 @@ DECLARE_INPLACE_OP_INFERER(SoftmaxInplaceInferer, {"X", "Out"});

 namespace ops = paddle::operators;

+DECLARE_INFER_SHAPE_FUNCTOR(softmax, SoftmaxInferShapeFunctor,
+                            PD_INFER_META(phi::SoftmaxInferMeta));
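[Editor's note] The hunk above shows the migration pattern this patch applies to shape, shard_index, softmax, tile, and top_k_v2 alike: the hand-written InferShape override is deleted from the fluid operator and replaced by a phi InferMeta function bound through DECLARE_INFER_SHAPE_FUNCTOR / PD_INFER_META and passed to REGISTER_OPERATOR. As a minimal sketch of what such an InferMeta amounts to for softmax: the type MetaTensorSketch and the function SoftmaxInferMetaSketch below are illustrative stand-ins, not the real phi API, which works on phi::MetaTensor.

    // Illustrative sketch only. A MetaTensor carries dims/dtype but no
    // data, so the same shape function can serve static graphs, dygraph,
    // and inference without a fluid InferShapeContext.
    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    struct MetaTensorSketch {  // hypothetical stand-in for phi::MetaTensor
      std::vector<int64_t> dims;
      void set_dims(const std::vector<int64_t>& d) { dims = d; }
    };

    void SoftmaxInferMetaSketch(const MetaTensorSketch& x, int axis,
                                MetaTensorSketch* out) {
      const int rank = static_cast<int>(x.dims.size());
      // Same validation the deleted fluid InferShape performed:
      // Attr(axis) must lie in [-R, R) for an input of rank R.
      if (axis < -rank || axis >= rank) {
        throw std::invalid_argument("Attr(axis) should be in range [-R, R-1]");
      }
      // Softmax is shape-preserving: Out gets exactly X's dims.
      out->set_dims(x.dims);
    }

Once such a function exists in phi, the functor declared above runs the same logic in every execution mode, which is why the per-op InferShape bodies can be deleted wholesale.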
REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, ops::SoftmaxOpInferVarType, ops::SoftmaxOpGradMaker, ops::SoftmaxOpGradMaker, - ops::SoftmaxInplaceInferer); -REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad); + ops::SoftmaxInplaceInferer, SoftmaxInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR(softmax_grad, SoftmaxGradInferShapeFunctor, + PD_INFER_META(phi::GeneralUnaryGradInferMeta)); +REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad, + SoftmaxGradInferShapeFunctor); diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc index 3bc55fafd81e1..3148b31a8322e 100644 --- a/paddle/fluid/operators/softmax_op_npu_test.cc +++ b/paddle/fluid/operators/softmax_op_npu_test.cc @@ -22,7 +22,6 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 19a395e72314d..41545a1ca20b2 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -760,8 +760,9 @@ static void SoftmaxWithCrossEntropyHardLabel( */ template __global__ void SoftmaxWithCrossEntropyGradHardLabel( - T* logits_grad, const T* loss_grad, const LabelT* labels, const int64_t n, - const int64_t dim, const int64_t d, const int ignore_index) { + T* logits_grad, const T* loss_grad, const T* softmax, const LabelT* labels, + const int64_t n, const int64_t dim, const int64_t d, + const int ignore_index) { int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; int64_t idx_n = idx / (d * dim); int64_t idx_dim = (idx / d) % dim; @@ -773,10 +774,9 @@ __global__ void SoftmaxWithCrossEntropyGradHardLabel( if (lbl == ignore_index) { logits_grad[idx] = static_cast(0.0); } else if (lbl == idx_dim) { - logits_grad[idx] = - (logits_grad[idx] - static_cast(1.0)) * loss_grad[ids]; + logits_grad[idx] = (softmax[idx] - static_cast(1.0)) * loss_grad[ids]; } else { - logits_grad[idx] *= loss_grad[ids]; + logits_grad[idx] = softmax[idx] * loss_grad[ids]; } } } @@ -1395,11 +1395,20 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { Tensor* logit_grad = context.Output(framework::GradVarName("Logits")); const Tensor* softmax = context.Input("Softmax"); - if (logit_grad != softmax) { + auto stream = context.cuda_device_context().stream(); + auto ignore_index = context.Attr("ignore_index"); + auto use_softmax = context.Attr("use_softmax"); + + T* logit_grad_data = nullptr; + bool copy_flag = (logit_grad != softmax && (!use_softmax || soft_label)); + if (copy_flag) { framework::TensorCopy(*softmax, context.GetPlace(), context.device_context(), logit_grad); + logit_grad_data = logit_grad->template data(); + } else { + logit_grad_data = + logit_grad->template mutable_data(context.GetPlace()); } - T* logit_grad_data = logit_grad->template data(); const int rank = logit_grad->dims().size(); const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); @@ -1414,9 +1423,6 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { #else int block = 512; #endif - auto stream = context.cuda_device_context().stream(); - auto ignore_index = context.Attr("ignore_index"); - auto use_softmax = 
context.Attr("use_softmax"); // do not with softmax op, and input is softmax if (!use_softmax) { @@ -1451,11 +1457,12 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { SoftCrossEntropyGradientKernel<<>>( logit_grad_data, loss_grad_data, label_data, n, d, remain); } else { + const T* softmax_data = softmax->template data(); const auto* label_data = labels.template data(); int grid = (n * d + block - 1) / block; SoftmaxWithCrossEntropyGradHardLabel<<>>( - logit_grad_data, loss_grad_data, label_data, n, d / remain, remain, - ignore_index); + logit_grad_data, loss_grad_data, softmax_data, label_data, n, + d / remain, remain, ignore_index); } } }; diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h index bff8061814ae6..aa944cfcfbb17 100644 --- a/paddle/fluid/operators/spp_op.h +++ b/paddle/fluid/operators/spp_op.h @@ -16,9 +16,10 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { @@ -53,14 +54,20 @@ class SppKernel : public framework::OpKernel { out_level.mutable_data(output_shape, context.GetPlace()); // pooling if (pooling_type == "max") { - math::Pool2dFunctor, T> pool_forward; - math::MaxPool max_process; + phi::funcs::Pool2dFunctor< + typename framework::ConvertToPhiContext::TYPE, + phi::funcs::MaxPool, T> + pool_forward; + phi::funcs::MaxPool max_process; pool_forward(context.template device_context(), *in_x, kernel_size, strides, paddings, true, false, &out_level, max_process); } else if (pooling_type == "avg") { - math::Pool2dFunctor, T> pool_forward; - math::AvgPool avg_process; + phi::funcs::Pool2dFunctor< + typename framework::ConvertToPhiContext::TYPE, + phi::funcs::AvgPool, T> + pool_forward; + phi::funcs::AvgPool avg_process; pool_forward(context.template device_context(), *in_x, kernel_size, strides, paddings, true, false, &out_level, avg_process); @@ -95,7 +102,9 @@ class SppGradKernel : public framework::OpKernel { std::string pooling_type = context.template Attr("pooling_type"); auto& device_ctx = context.template device_context(); - phi::funcs::SetConstant zero; + phi::funcs::SetConstant< + typename framework::ConvertToPhiContext::TYPE, T> + zero; in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0)); auto out_stride = phi::stride(out->dims()); @@ -145,14 +154,18 @@ class SppGradKernel : public framework::OpKernel { outgrad_level.Resize(out_shape); // pooling backward if (pooling_type == "max") { - math::MaxPool2dGradFunctor pool2d_backward; + phi::funcs::MaxPool2dGradFunctor< + typename framework::ConvertToPhiContext::TYPE, T> + pool2d_backward; pool2d_backward(context.template device_context(), *in_x, *&out_level, *&outgrad_level, kernel_size, strides, paddings, in_x_grad); } else if (pooling_type == "avg") { - math::Pool2dGradFunctor, T> + phi::funcs::Pool2dGradFunctor< + typename framework::ConvertToPhiContext::TYPE, + phi::funcs::AvgPoolGrad, T> pool_backward; - math::AvgPoolGrad avg_process; + phi::funcs::AvgPoolGrad avg_process; pool_backward(context.template device_context(), *in_x, *&out_level, *&outgrad_level, kernel_size, strides, paddings, true, false, in_x_grad, avg_process); diff --git a/paddle/fluid/operators/squeeze_op.h 
b/paddle/fluid/operators/squeeze_op.h index 58e5440689926..a776a78616b8d 100644 --- a/paddle/fluid/operators/squeeze_op.h +++ b/paddle/fluid/operators/squeeze_op.h @@ -17,7 +17,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/squeeze_op_npu_test.cc b/paddle/fluid/operators/squeeze_op_npu_test.cc index 956544c53609e..d61f5aa3f634c 100644 --- a/paddle/fluid/operators/squeeze_op_npu_test.cc +++ b/paddle/fluid/operators/squeeze_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/sync_batch_norm_op.cc b/paddle/fluid/operators/sync_batch_norm_op.cc index d198992abde7d..0c178b02d0309 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cc +++ b/paddle/fluid/operators/sync_batch_norm_op.cc @@ -50,6 +50,7 @@ class SyncBatchNormGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + REGISTER_OPERATOR(sync_batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, ops::BatchNormOpInferVarType, ops::SyncBatchNormGradMaker, diff --git a/paddle/fluid/operators/tile_op.cc b/paddle/fluid/operators/tile_op.cc index dc12f8e8892a0..e179149c5bb77 100644 --- a/paddle/fluid/operators/tile_op.cc +++ b/paddle/fluid/operators/tile_op.cc @@ -12,11 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/tile_op.h" #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { @@ -26,66 +30,6 @@ class TileOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Tile"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Tile"); - auto x_dims = ctx->GetInputDim("X"); - auto repeat_times = ctx->Attrs().Get>("repeat_times"); - if (repeat_times.size() == 0) { - repeat_times = std::vector(x_dims.size(), -1); - } - - PADDLE_ENFORCE_LE( - x_dims.size(), MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op " - "must not be greater than %d, but the value received is %d.", - MAX_RANK_SUPPORTED, x_dims.size())); - PADDLE_ENFORCE_LE( - repeat_times.size(), MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The size of the shape of input 'repeat_times' for tile op " - "must not be greater than %d, but the value received is %d.", - MAX_RANK_SUPPORTED, repeat_times.size())); - PADDLE_ENFORCE_GE( - repeat_times.size(), 1, - platform::errors::InvalidArgument( - "The size of the shape of input 'repeat_times' for tile op " - "must be positive integers, but the value received is %d.", - repeat_times.size())); - - auto out_rank = - std::max(static_cast(x_dims.size()), repeat_times.size()); - std::vector out_shape(out_rank); - auto x_dim_vec = phi::vectorize(x_dims); - if (x_dim_vec.size() > repeat_times.size()) { - auto diff = x_dim_vec.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, -1); - } else { - auto diff = repeat_times.size() - x_dim_vec.size(); - x_dim_vec.insert(x_dim_vec.begin(), diff, -1); - } - for (size_t i = 0; i < repeat_times.size(); ++i) { - if (x_dim_vec[i] == -1 || repeat_times[i] == -1) { - out_shape[i] = -1; - } else { - PADDLE_ENFORCE_GT( - repeat_times[i], 0, - platform::errors::InvalidArgument( - "Every element of the input 'repeat_times' for tile op must be " - "greater than 0, but the value given is %d.", - repeat_times[i])); - out_shape[i] = x_dim_vec[i] * repeat_times[i]; - } - } - - ctx->SetOutputDim("Out", phi::make_ddim(out_shape)); - if (out_shape[0] == x_dims[0]) { - ctx->ShareLoD("X", "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -268,38 +212,15 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(TileGradNoNeedBufVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(tile, TileInferMetaFunctor, + PD_INFER_META(phi::TileInferMeta)); + REGISTER_OPERATOR(tile, ops::TileOp, ops::TileOpMaker, ops::TileGradOpMaker, - ops::TileGradOpMaker); + ops::TileGradOpMaker, + TileInferMetaFunctor); REGISTER_OPERATOR(tile_grad, ops::TileGradOp, ops::TileDoubleGradOpMaker, ops::TileDoubleGradOpMaker, ops::TileGradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - tile, ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel); -REGISTER_OP_CPU_KERNEL( - tile_grad, ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) 
-REGISTER_OP_CUDA_KERNEL( - tile, ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel); -REGISTER_OP_CUDA_KERNEL( - tile_grad, ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel); -#endif diff --git a/paddle/fluid/operators/tile_op.h b/paddle/fluid/operators/tile_op.h deleted file mode 100644 index 1698b5e3c6322..0000000000000 --- a/paddle/fluid/operators/tile_op.h +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -#define MAX_RANK_SUPPORTED 6 - -namespace paddle { -namespace operators { -inline std::vector get_repeat_times( - const framework::ExecutionContext& ctx) { - if (ctx.HasInput("RepeatTimes")) { - auto* repeat_tensor = ctx.Input("RepeatTimes"); - auto* repeat_data = repeat_tensor->data(); - framework::Tensor cpu_repeat_tensor; - if (platform::is_gpu_place(repeat_tensor->place()) || - platform::is_xpu_place(repeat_tensor->place()) || - platform::is_npu_place(repeat_tensor->place())) { - paddle::framework::TensorCopySync(*repeat_tensor, platform::CPUPlace(), - &cpu_repeat_tensor); - repeat_data = cpu_repeat_tensor.data(); - } - auto vec_repeat_times = - std::vector(repeat_data, repeat_data + repeat_tensor->numel()); - return vec_repeat_times; - } - - auto list_repeat_times_tensor = - ctx.MultiInput("repeat_times_tensor"); - if (list_repeat_times_tensor.size() > 0) { - // get tensor from - std::vector vec_repeat_times; - for (size_t i = 0; i < list_repeat_times_tensor.size(); ++i) { - auto tensor = list_repeat_times_tensor[i]; - if (platform::is_gpu_place(tensor->place()) || - platform::is_xpu_place(tensor->place()) || - platform::is_npu_place(tensor->place())) { - framework::Tensor temp; - paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); - vec_repeat_times.push_back(*temp.data()); - } else { - vec_repeat_times.push_back(*tensor->data()); - } - } - return vec_repeat_times; - } else { - return ctx.Attr>("repeat_times"); - } -} - -using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; -template -using EigenTensor = framework::EigenTensor; -using framework::To32BitIndex; - -template -class TileKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - PADDLE_ENFORCE_GE( - rank, 1, platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op must be a positive " - "integer, but the value received is %d.", - rank)); - PADDLE_ENFORCE_LE( - rank, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op " - "must be less than or equal to %d, 
but the value received is %d.", - MAX_RANK_SUPPORTED, rank)); - auto repeat_times = get_repeat_times(context); - int repeat_times_size = repeat_times.size(); - PADDLE_ENFORCE_GE( - repeat_times_size, 1, - platform::errors::InvalidArgument( - "The number of elements of the input 'repeat_times' for tile " - "op must be positive, but the value received is %d.", - repeat_times_size)); - PADDLE_ENFORCE_LE( - repeat_times_size, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The number of elements of the input 'repeat_times' for tile op " - "must be less than or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, repeat_times_size)); - rank = std::max(rank, repeat_times_size); - switch (rank) { - case 1: - Tile<1>(context); - break; - case 2: - Tile<2>(context); - break; - case 3: - Tile<3>(context); - break; - case 4: - Tile<4>(context); - break; - case 5: - Tile<5>(context); - break; - case 6: - Tile<6>(context); - break; - } - } - - protected: - template - void Tile(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - - auto in_dims = in0->dims(); - auto repeat_times = get_repeat_times(context); - for (size_t i = 0; i < repeat_times.size(); ++i) { - PADDLE_ENFORCE_GT( - repeat_times[i], 0, - platform::errors::InvalidArgument( - "All elements of the input 'repeat_times' for tile op must " - "be positive integers, but the value received is %d.", - repeat_times[i])); - } - auto vec_in_dims = phi::vectorize(in_dims); - if (repeat_times.size() < vec_in_dims.size()) { - int diff = vec_in_dims.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, 1); - } else { - int diff = repeat_times.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - } - PADDLE_ENFORCE_EQ( - repeat_times.size(), vec_in_dims.size(), - platform::errors::InvalidArgument( - "The rank (%d) of the input 'x' and the rank (%d) of the input " - "'repeat_times' for tile op must match after promotion.", - vec_in_dims.size(), repeat_times.size())); - auto* out0 = context.Output("Out"); - Eigen::DSizes bcast_dims; - for (size_t i = 0; i < repeat_times.size(); ++i) { - bcast_dims[i] = repeat_times[i]; - } - - framework::DDim new_in_dims = phi::make_ddim(vec_in_dims); - framework::DDim out_dims(new_in_dims); - for (size_t i = 0; i < repeat_times.size(); ++i) { - out_dims[i] *= repeat_times[i]; - } - - out0->Resize(out_dims); - auto x = EigenTensor::From(*in0, new_in_dims); - out0->mutable_data(context.GetPlace()); - auto y = EigenTensor::From(*out0, out_dims); - auto& place = - *context.template device_context().eigen_device(); - // use 32-bit index to speed up - bool use_32bit_index = y.size() < Eigen::NumTraits::highest(); - if (use_32bit_index) { - EigenBroadcast, T, Rank>::Eval( - place, To32BitIndex(y), To32BitIndex(x), bcast_dims); - } else { - EigenBroadcast, T, Rank>::Eval(place, y, x, - bcast_dims); - } - } -}; - -template -class TileGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto repeat_times = get_repeat_times(context); - auto x_dims = x->dims(); - auto vec_in_dims = phi::vectorize(x_dims); - if (repeat_times.size() < vec_in_dims.size()) { - int diff = vec_in_dims.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, 1); - } else { - int diff = repeat_times.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - } - // 1. 
reshape_dims_vec is the broadcast parameter. - // 2. reduce_dims_vec is the dimension parameter to compute gradients. For - // each dimension expanded, the gradients should be summed to original - // size. - std::vector reshape_dims_vec; - std::vector reduce_dims_vec; - for (size_t i = 0; i < repeat_times.size(); ++i) { - reduce_dims_vec.push_back(reshape_dims_vec.size()); - reshape_dims_vec.push_back(repeat_times[i]); - reshape_dims_vec.push_back(vec_in_dims[i]); - } - - int dims = reduce_dims_vec.size(); - - bool just_copy = true; - for (size_t i = 0; i < repeat_times.size(); i++) { - if (repeat_times[i] != 1) { - just_copy = false; - break; - } - } - // no need reduce, just copy - if (just_copy) { - auto* dout = context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - dx->mutable_data(context.GetPlace()); - framework::TensorCopy(*dout, context.GetPlace(), context.device_context(), - dx); - // TensorCopy may change the dims of dx - dx->Resize(x_dims); - } else { - PADDLE_ENFORCE_GE(dims, 1, - platform::errors::InvalidArgument( - "Th rank of the input 'Out@GRAD' for tile_grad op " - " must be greater than or equal to 1, but " - "the value received is %d.", - dims)); - PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'Out@GRAD' for tile_grad op " - "must be less than or equal " - "to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, dims)); - switch (dims) { - case 1: - TileBackward<1>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 2: - TileBackward<2>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 3: - TileBackward<3>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 4: - TileBackward<4>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 5: - TileBackward<5>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 6: - TileBackward<6>(context, reshape_dims_vec, reduce_dims_vec); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Only support tensor with rank being between 1 and 6. But " - "received tensor's rank = %d.", - dims)); - } - } - } - - protected: - template - void TileBackward(const framework::ExecutionContext& context, - const std::vector& reshape_dims_vec, - const std::vector& reduce_dims_vec) const { - size_t reshape_size = reshape_dims_vec.size(); - size_t reduce_size = reduce_dims_vec.size(); - auto* in0 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - out0->mutable_data(context.GetPlace()); - auto x_grad = EigenVector::Flatten(*out0); - Eigen::DSizes reshape_dims; - for (size_t i = 0; i < reshape_size; ++i) { - reshape_dims[i] = reshape_dims_vec[i]; - } - Eigen::DSizes reduce_dims; - for (size_t i = 0; i < reduce_size; ++i) { - reduce_dims[i] = reduce_dims_vec[i]; - } - - auto out_grad = EigenVector::Flatten(*in0); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcastGrad, T, Dims>::Eval( - place, x_grad, out_grad, reduce_dims, reshape_dims); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/tile_op_functor.h b/paddle/fluid/operators/tile_op_functor.h new file mode 100644 index 0000000000000..95bfb9f4e1a9d --- /dev/null +++ b/paddle/fluid/operators/tile_op_functor.h @@ -0,0 +1,67 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "paddle/fluid/framework/operator.h" + +#define MAX_RANK_SUPPORTED 6 + +namespace paddle { +namespace operators { + +inline std::vector get_repeat_times( + const framework::ExecutionContext& ctx) { + if (ctx.HasInput("RepeatTimes")) { + auto* repeat_tensor = ctx.Input("RepeatTimes"); + auto* repeat_data = repeat_tensor->data(); + framework::Tensor cpu_repeat_tensor; + if (platform::is_gpu_place(repeat_tensor->place()) || + platform::is_xpu_place(repeat_tensor->place()) || + platform::is_npu_place(repeat_tensor->place())) { + paddle::framework::TensorCopySync(*repeat_tensor, platform::CPUPlace(), + &cpu_repeat_tensor); + repeat_data = cpu_repeat_tensor.data(); + } + auto vec_repeat_times = + std::vector(repeat_data, repeat_data + repeat_tensor->numel()); + return vec_repeat_times; + } + + auto list_repeat_times_tensor = + ctx.MultiInput("repeat_times_tensor"); + if (list_repeat_times_tensor.size() > 0) { + // get tensor from + std::vector vec_repeat_times; + for (size_t i = 0; i < list_repeat_times_tensor.size(); ++i) { + auto tensor = list_repeat_times_tensor[i]; + if (platform::is_gpu_place(tensor->place()) || + platform::is_xpu_place(tensor->place()) || + platform::is_npu_place(tensor->place())) { + framework::Tensor temp; + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); + vec_repeat_times.push_back(*temp.data()); + } else { + vec_repeat_times.push_back(*tensor->data()); + } + } + return vec_repeat_times; + } else { + return ctx.Attr>("repeat_times"); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc index 9e306c7be537b..cea6b458aec78 100644 --- a/paddle/fluid/operators/tile_op_npu.cc +++ b/paddle/fluid/operators/tile_op_npu.cc @@ -11,7 +11,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/tile_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/tile_op_functor.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/tile_op_xpu.cc b/paddle/fluid/operators/tile_op_xpu.cc index 6b60b167a2465..598377587d6f7 100644 --- a/paddle/fluid/operators/tile_op_xpu.cc +++ b/paddle/fluid/operators/tile_op_xpu.cc @@ -11,11 +11,14 @@ limitations under the License. 
 */

 #ifdef PADDLE_WITH_XPU

-#include "paddle/fluid/operators/tile_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/tile_op_functor.h"

 namespace paddle {
 namespace operators {

+using Tensor = framework::Tensor;
+
 template
 class TileXPUKernel : public framework::OpKernel {
  public:
diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h
index d60976928e00c..80c9935057cb5 100644
--- a/paddle/fluid/operators/top_k_function_cuda.h
+++ b/paddle/fluid/operators/top_k_function_cuda.h
@@ -51,6 +51,19 @@ namespace operators {

 using Tensor = framework::Tensor;

+inline void GetDims(const phi::DDim& dim, int axis, int* pre, int* n,
+                    int* post) {
+  *pre = 1;
+  *post = 1;
+  *n = dim[axis];
+  for (int i = 0; i < axis; ++i) {
+    (*pre) *= dim[i];
+  }
+  for (int i = axis + 1; i < dim.size(); ++i) {
+    (*post) *= dim[i];
+  }
+}
+
 struct SegmentOffsetIter {
   EIGEN_DEVICE_FUNC
   explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {}
diff --git a/paddle/fluid/operators/top_k_v2_op.cc b/paddle/fluid/operators/top_k_v2_op.cc
index 810afc901df57..0a9ae789b01ee 100644
--- a/paddle/fluid/operators/top_k_v2_op.cc
+++ b/paddle/fluid/operators/top_k_v2_op.cc
@@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/operators/top_k_v2_op.h"
 #include

+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/unary.h"
+
 namespace paddle {
 namespace operators {

@@ -22,56 +25,6 @@ class TopkV2Op : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;

-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "topk_v2");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "topk_v2");
-    OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "topk_v2");
-
-    auto input_dims = ctx->GetInputDim("X");
-    const int& dim_size = input_dims.size();
-    int axis = static_cast(ctx->Attrs().Get("axis"));
-    PADDLE_ENFORCE_EQ(
-        (axis < dim_size) && (axis >= (-1 * dim_size)), true,
-        paddle::platform::errors::InvalidArgument(
-            "the axis of topk must be [-%d, %d), but you set axis is %d",
-            dim_size, dim_size, axis));
-
-    if (axis < 0) axis += dim_size;
-
-    int k;
-    auto k_is_tensor = ctx->HasInput("K");
-    if (k_is_tensor) {
-      k = -1;
-    } else {
-      k = static_cast(ctx->Attrs().Get("k"));
-      PADDLE_ENFORCE_EQ(k >= 1, true,
-                        paddle::platform::errors::InvalidArgument(
-                            "the attribute of k in the topk must >= 1 or be a "
-                            "Tensor, but received %d .",
-                            k));
-    }
-
-    PADDLE_ENFORCE_GE(input_dims.size(), 1,
-                      paddle::platform::errors::InvalidArgument(
-                          "input of topk must have >= 1d shape"));
-
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_GE(
-          input_dims[axis], k,
-          paddle::platform::errors::InvalidArgument(
-              "input of topk op must have >= %d columns in axis of %d", k,
-              axis));
-    }
-
-    framework::DDim dims = input_dims;
-
-    dims[axis] = k;
-    ctx->SetOutputDim("Out", dims);
-    ctx->SetOutputDim("Indices", dims);
-    ctx->ShareLoD("X", "Out");
-    ctx->ShareLoD("X", "Indices");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -168,20 +121,11 @@ class TopkV2GradOpMaker : public framework::SingleGradOpMaker {

 } // namespace paddle
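[Editor's note] The GetDims helper added to top_k_function_cuda.h above factors a shape into a (pre, n, post) triple around the reduction axis; the top_k grad kernel then treats a flat index as (pre_idx, axis_idx, post_idx). A self-contained sketch of the same computation, using std::vector<int64_t> in place of phi::DDim purely for illustration:

    // Same decomposition as the GetDims helper in the hunk above: pre is
    // the product of extents before `axis`, n the extent at `axis`, and
    // post the product after it.
    #include <cassert>
    #include <cstdint>
    #include <vector>

    void GetDimsSketch(const std::vector<int64_t>& dim, int axis, int* pre,
                       int* n, int* post) {
      *pre = 1;
      *post = 1;
      *n = static_cast<int>(dim[axis]);
      for (int i = 0; i < axis; ++i) *pre *= static_cast<int>(dim[i]);
      for (int i = axis + 1; i < static_cast<int>(dim.size()); ++i)
        *post *= static_cast<int>(dim[i]);
    }

    int main() {
      int pre = 0, n = 0, post = 0;
      GetDimsSketch({2, 3, 4, 5}, /*axis=*/2, &pre, &n, &post);
      assert(pre == 2 * 3 && n == 4 && post == 5);  // rows before, axis, cols after
      return 0;
    }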
namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(top_k_v2, TopKInferShapeFunctor, + PD_INFER_META(phi::TopKInferMeta)); REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker, ops::TopkV2GradOpMaker, - ops::TopkV2GradOpMaker); + ops::TopkV2GradOpMaker, + TopKInferShapeFunctor); REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad); - -REGISTER_OP_CPU_KERNEL(top_k_v2, - ops::TopkV2Kernel, - ops::TopkV2Kernel, - ops::TopkV2Kernel, - ops::TopkV2Kernel) - -REGISTER_OP_CPU_KERNEL( - top_k_v2_grad, ops::TopkV2GradKernel, - ops::TopkV2GradKernel, - ops::TopkV2GradKernel, - ops::TopkV2GradKernel) diff --git a/paddle/fluid/operators/top_k_v2_op.cu b/paddle/fluid/operators/top_k_v2_op.cu deleted file mode 100644 index 84d8eef53bf72..0000000000000 --- a/paddle/fluid/operators/top_k_v2_op.cu +++ /dev/null @@ -1,296 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -#define FIXED_BLOCK_DIM_BASE(dim, ...) \ - case (dim): { \ - constexpr auto kBlockDim = (dim); \ - __VA_ARGS__; \ - } break - -#define FIXED_BLOCK_DIM(...) 
\ - FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__) - -template -class TopkV2OpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - - // get the attributes - int k = static_cast(ctx.Attr("k")); - int axis = static_cast(ctx.Attr("axis")); - const bool& sorted = static_cast(ctx.Attr("sorted")); - const bool& largest = static_cast(ctx.Attr("largest")); - - // get the input dims - const auto& in_dims = input->dims(); - // calcluate the real axis - if (axis < 0) axis += in_dims.size(); - - auto* k_t = ctx.Input("K"); - if (k_t) { - Tensor k_host; - framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host); - k = k_host.data()[0]; - framework::DDim output_dims = output->dims(); - output_dims[axis] = k; - output->Resize(output_dims); - indices->Resize(output_dims); - } - - const auto& out_dims = output->dims(); - - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - - if (axis == in_dims.size() - 1) { - // if get the topK from the last axis - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - - if (k > input_width) k = input_width; - - // The conclusion is drawn from the data through multiple sets of - // statistics - if (input_width >= 128 && k >= input_width * 0.75) { - if (SortTopk(dev_ctx, input, input_width, input_height, k, output, - indices, largest)) { - // Successed, return. - return; - } else { - LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use " - "default topk kernel."; - } - } - - // NOTE: pass lds and dim same to input width. - // NOTE: old matrix implementation of stride is different to eigen. - const int kMaxHeight = 2048; - int gridx = input_height < kMaxHeight ? 
input_height : kMaxHeight; - switch (GetDesiredBlockDim(input_width)) { -#ifdef PADDLE_WITH_HIP - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - output_data, k, indices_data, input_data, input_width, - input_width, static_cast(k), gridx, input_height, - largest)); -#else - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - output_data, k, indices_data, input_data, input_width, - input_width, static_cast(k), gridx, input_height, - largest)); -#endif - default: - PADDLE_THROW(platform::errors::Fatal( - "the input data shape has error in the topk cuda kernel.")); - } - } else { - // if get topK not from the last axis, will tranpose the tensor and get - // TopK - - // first step, prepare the trans args for the tranpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(output->dims()); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - trans_out_dims[i] = out_dims[trans[i]]; - } - // second step, tranpose the input - Tensor trans_input; - trans_input.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - TransCompute(ndims, dev_ctx, *input, - &trans_input, trans); - // third step, calcluate the topk - // allocate the tmp cuda memory for the tmp result - Tensor trans_ind; - trans_ind.mutable_data(trans_out_dims, ctx.GetPlace()); - Tensor trans_out; - trans_out.mutable_data(trans_out_dims, ctx.GetPlace()); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - if (k > input_width) k = input_width; - - // The conclusion is drawn from the data through multiple sets of - // statistics - if (input_width >= 128 && k >= input_width * 0.75) { - if (SortTopk(dev_ctx, &trans_input, input_width, input_height, k, - &trans_out, &trans_ind, largest)) { - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans); - TransCompute( - ndims, dev_ctx, trans_out, output, trans); - return; - } else { - LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use " - "default topk kernel."; - } - } - - const int kMaxHeight = 2048; - int gridx = input_height < kMaxHeight ? 
input_height : kMaxHeight; - switch (GetDesiredBlockDim(input_width)) { -#ifdef PADDLE_WITH_HIP - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - trans_out.data(), k, trans_ind.data(), - trans_input.data(), input_width, input_width, - static_cast(k), gridx, input_height, largest)); -#else - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - trans_out.data(), k, trans_ind.data(), - trans_input.data(), input_width, input_width, - static_cast(k), gridx, input_height, largest)); -#endif - default: - PADDLE_THROW(platform::errors::Fatal( - "the input data shape has error in the topk cuda kernel.")); - } - - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans); - TransCompute(ndims, dev_ctx, trans_out, - output, trans); - } - } -}; - -#undef FIXED_BLOCK_DIM_BASE -#undef FIXED_BLOCK_DIM -template -class TopkV2OpGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* x = context.Input("X"); - auto* out_grad = context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = context.Output(framework::GradVarName("X")); - int axis = context.Attr("axis"); - - const auto& in_dims = x->dims(); - const auto& out_dims = indices->dims(); - - // get the real the axis and the k - if (axis < 0) axis += in_dims.size(); - const int& k = out_dims[axis]; - const int& raw_height = in_dims[axis]; - - // allocate the cuda memory for the x_grad - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - const T* out_grad_data = out_grad->data(); - const int64_t* indices_data = indices->data(); - - int pre, n, post; - GetDims(in_dims, axis, &pre, &n, &post); - - // calcluate the block and grid num - auto& dev_ctx = context.cuda_device_context(); - auto ComputeBlockSize = [](int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - int block_size = ComputeBlockSize(post * k); - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); - int grid_size = std::min(max_blocks, pre); - - // lanuch the cuda kernel to assign the grad - AssignGradWithAxis<<>>( - out_grad_data, indices_data, x_grad_data, pre, post, n, k); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - top_k_v2, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL( - top_k_v2_grad, paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, float>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, double>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, int>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, int64_t>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, paddle::platform::float16>); diff --git a/paddle/fluid/operators/top_k_v2_op.h b/paddle/fluid/operators/top_k_v2_op.h deleted file mode 100644 index 
a808207476f3b..0000000000000
--- a/paddle/fluid/operators/top_k_v2_op.h
+++ /dev/null
@@ -1,335 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
-  The reason we need topk v2 is compatibility: it redefines NaN as the
-  maximum value during comparison. Without topk v2, the inference results
-  of models trained by older versions of PaddlePaddle would be affected.
-*/
-
-#pragma once
-#include <algorithm>
-#include <iostream>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/top_k_op.h"
-#include "paddle/fluid/operators/transpose_op.h"
-
-namespace paddle {
-namespace operators {
-
-inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n,
-                    int* post) {
-  *pre = 1;
-  *post = 1;
-  *n = dim[axis];
-  for (int i = 0; i < axis; ++i) {
-    (*pre) *= dim[i];
-  }
-  for (int i = axis + 1; i < dim.size(); ++i) {
-    (*post) *= dim[i];
-  }
-}
-
-template <typename T, typename Type>
-static void FullTopK(Type input_height, Type input_width, int input_dim,
-                     const framework::Tensor* input, T* t_out, Type* t_indices,
-                     const int& k, const bool& largest, const bool& sorted) {
-  // when k is small, use a partial sort
-  bool partial_sort_flag = (k * 64) < input_width;
-
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  // Eigen::DSizes<int, 2> flat2dims(input_height, input_width);
-  for (Type i = 0; i < input_height; ++i) {
-    std::vector<std::pair<T, Type>> col_vec;
-    col_vec.reserve(input_width);
-    if (input_dim == 1) {
-      auto e_input = framework::EigenVector<T>::Flatten(*input);
-      for (Type j = 0; j < input_width; ++j) {
-        col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
-      }
-    } else {
-      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
-      for (Type j = 0; j < input_width; ++j) {
-        col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
-      }
-    }
-    if (partial_sort_flag) {
-      std::partial_sort(
-          col_vec.begin(), col_vec.begin() + k, col_vec.end(),
-          [&largest](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
-            if (largest) {
-              return (std::isnan(static_cast<double>(l.first)) &&
-                      !std::isnan(static_cast<double>(r.first))) ||
-                     (l.first > r.first);
-            } else {
-              return (!std::isnan(static_cast<double>(l.first)) &&
-                      std::isnan(static_cast<double>(r.first))) ||
-                     (l.first < r.first);
-            }
-          });
-    } else {
-      // use nth_element to get the k-largest or k-smallest elements
-      if (largest) {
-        std::nth_element(
-            col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
-            [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
-              return (std::isnan(static_cast<double>(l.first)) &&
-                      !std::isnan(static_cast<double>(r.first))) ||
-                     (l.first > r.first);
-            });
-        // nth_element leaves the selected elements unordered, so sort them
-        if (sorted) {
-          std::sort(col_vec.begin(), col_vec.begin() + k - 1,
-                    [&largest](const std::pair<T, Type>& l,
-                               const std::pair<T, Type>& r) {
-                      return (std::isnan(static_cast<double>(l.first)) &&
-                              !std::isnan(static_cast<double>(r.first))) ||
-                             (l.first > r.first);
-                    });
-        }
-      } else {
-        std::nth_element(
-            col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
-            [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
-              return (!std::isnan(static_cast<double>(l.first)) &&
-                      std::isnan(static_cast<double>(r.first))) ||
-                     (l.first < r.first);
-            });
-        // nth_element leaves the selected elements unordered, so sort them
-        if (sorted) {
-          std::sort(
-              col_vec.begin(), col_vec.begin() + k - 1,
-              [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
-                return (!std::isnan(static_cast<double>(l.first)) &&
-                        std::isnan(static_cast<double>(r.first))) ||
-                       (l.first < r.first);
-              });
-        }
-      }
-    }
-    for (Type j = 0; j < k; ++j) {
-      t_out[i * k + j] = col_vec[j].first;
-      t_indices[i * k + j] = col_vec[j].second;
-    }
-  }
-}
-
-template <typename T, typename Type>
-static void FullTopKAssign(const Type& input_height, const Type& input_width,
-                           const int& input_dim,
-                           const framework::Tensor* input,
-                           const framework::Tensor* indices, T* output_data,
-                           const int& k) {
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (Type i = 0; i < input_height; ++i) {
-    if (input_dim == 1) {
-      auto e_input = framework::EigenVector<T>::Flatten(*input);
-      auto e_indices = framework::EigenVector<Type>::Flatten(*indices);
-      for (Type j = 0; j < k; ++j) {
-        output_data[i * input_width + e_indices(j)] = e_input(j);
-      }
-    } else {
-      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
-      auto e_indices =
-          framework::EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
-      for (Type j = 0; j < k; ++j) {
-        output_data[i * input_width + e_indices(i, j)] = e_input(i, j);
-      }
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class TopkV2Kernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    // Get the top k elements of each row of input tensor
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-    auto* indices = context.Output<Tensor>("Indices");
-    const auto& in_dims = input->dims();
-    int k = static_cast<int>(context.Attr<int>("k"));
-    const auto& sorted = static_cast<bool>(context.Attr<bool>("sorted"));
-    const auto& largest = static_cast<bool>(context.Attr<bool>("largest"));
-
-    // axis < 0, calculate the real axis
-    int axis = static_cast<int>(context.Attr<int>("axis"));
-    if (axis < 0) axis += in_dims.size();
-
-    // if the K tensor is not null, use the K tensor as k
-    auto* k_t = context.Input<Tensor>("K");
-    if (k_t) {
-      k = k_t->data<int>()[0];
-      framework::DDim output_dims = output->dims();
-      // according to axis, set the K value in the dims
-      output_dims[axis] = k;
-      output->Resize(output_dims);
-      indices->Resize(output_dims);
-    }
-
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-    int64_t* indices_data = indices->mutable_data<int64_t>(context.GetPlace());
-    const auto& out_dims = output->dims();
-    if (axis + 1 == in_dims.size()) {
-      const int64_t& input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t& input_width = in_dims[in_dims.size() - 1];
-      FullTopK<T, int64_t>(input_height, input_width, in_dims.size(), input,
-                           output_data, indices_data, k, largest, sorted);
-    } else {
-      // if the topk dim is not the last dim, transpose first, then do topk
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.emplace_back(i);
-      }
-      trans.push_back(in_dims.size() - 1);
-      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(axis);
-
-      // get the transposed input_dims and out_dims
-      framework::DDim trans_dims(in_dims);
-      framework::DDim trans_out_dims(output->dims());
-      for (size_t i = 0; i < trans.size(); i++) {
-        trans_dims[i] = in_dims[trans[i]];
-      }
-      for (size_t i = 0; i < trans.size(); i++) {
-        trans_out_dims[i] = out_dims[trans[i]];
-      }
-
-      Tensor trans_inp;
-      trans_inp.mutable_data<T>(trans_dims, context.GetPlace());
-      int ndims = trans.size();
-      auto& dev_context =
-          context.template device_context<platform::CPUDeviceContext>();
-
-      // transpose the input value
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, *input,
-                                                  &trans_inp, trans);
-
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
-      const int64_t input_width = trans_dims[trans_dims.size() - 1];
-
-      // allocate temp tensors to save the topk indices and values
-      Tensor tmp_out;
-      T* t_out = tmp_out.mutable_data<T>(trans_out_dims, context.GetPlace());
-      Tensor tmp_indices;
-      auto* t_ind =
-          tmp_indices.mutable_data<int64_t>(trans_out_dims, context.GetPlace());
-
-      // get the TopK value
-      FullTopK<T, int64_t>(input_height, input_width, in_dims.size(),
-                           &trans_inp, t_out, t_ind, k, largest, sorted);
-      // transpose back
-      TransCompute<platform::CPUDeviceContext, int64_t>(
-          ndims, dev_context, tmp_indices, indices, trans);
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
-                                                  output, trans);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class TopkV2GradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* indices = context.Input<Tensor>("Indices");
-    auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
-    int axis = static_cast<int>(context.Attr<int>("axis"));
-
-    const auto& in_dims = x->dims();
-    const auto& out_dims = indices->dims();
-
-    // axis < 0, get the real axis
-    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
-    const size_t& k = out_dims[axis];
-
-    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
-    if (axis + 1 == in_dims.size()) {
-      // allocate the memory for the input_grad
-
-      // assign the out_grad to input_grad directly
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t input_width = in_dims[in_dims.size() - 1];
-
-      // init the output grad with 0, because some input elements have no grad
-      memset(x_grad_data, 0, x_grad->numel() * sizeof(T));
-      // assign the output_grad to input_grad
-      FullTopKAssign<T, int64_t>(input_height, input_width, in_dims.size(),
-                                 out_grad, indices, x_grad_data, k);
-    } else {
-      // cannot assign grad to input_grad directly, must transpose first
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(out_dims.size() - 1);
-      for (int i = axis + 1; i < out_dims.size() - 1; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(axis);
-      framework::DDim trans_dims(out_dims);
-      framework::DDim trans_in_dims(in_dims);
-      for (size_t i = 0; i < trans.size(); i++) {
-        trans_dims[i] = out_dims[trans[i]];
-        trans_in_dims[i] = in_dims[trans[i]];
-      }
-      // transpose the out_grad and indices
-      Tensor trans_dO;
-      trans_dO.mutable_data<T>(trans_dims, context.GetPlace());
-      Tensor trans_ind;
-      trans_ind.mutable_data<int64_t>(trans_dims, context.GetPlace());
-      int ndims = trans.size();
-      auto& dev_context =
-          context.template device_context<platform::CPUDeviceContext>();
-
-      // do the transpose
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context,
-                                                  *out_grad, &trans_dO, trans);
-      TransCompute<platform::CPUDeviceContext, int64_t>(
-          ndims, dev_context, *indices, &trans_ind, trans);
-      const int64_t input_height = phi::product(
-          phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1));
-      const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1];
-
-      // assign the out_grad to the transposed input_grad
-      Tensor tmp_out;
-      T* t_out = tmp_out.mutable_data<T>(trans_in_dims, context.GetPlace());
-      memset(t_out, 0, x_grad->numel() * sizeof(T));
-
-      FullTopKAssign<T, int64_t>(input_height, input_width, in_dims.size(),
-                                 &trans_dO, &trans_ind, t_out, k);
-
-      // transpose back
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
-                                                  x_grad, trans);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
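Worth pausing on the comparator the deleted FullTopK carried, since it is exactly the compatibility guarantee named in the file's header comment: NaN outranks every ordinary value when largest is true. A minimal standalone sketch of that ordering, assuming nothing from Paddle (plain C++, illustrative only):

// Standalone sketch of FullTopK's NaN-first ordering (not Paddle code).
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  std::vector<std::pair<float, int>> col = {
      {1.0f, 0}, {NAN, 1}, {3.0f, 2}, {2.0f, 3}};
  const int k = 2;
  // Same predicate as the deleted largest branch: NaN outranks every
  // non-NaN value; otherwise compare normally.
  auto largest_cmp = [](const std::pair<float, int>& l,
                        const std::pair<float, int>& r) {
    return (std::isnan(static_cast<double>(l.first)) &&
            !std::isnan(static_cast<double>(r.first))) ||
           (l.first > r.first);
  };
  std::partial_sort(col.begin(), col.begin() + k, col.end(), largest_cmp);
  for (int j = 0; j < k; ++j) {
    std::printf("value=%f index=%d\n", col[j].first, col[j].second);
  }
  // Prints the NaN entry first, then 3.0, matching topk v2 semantics.
  return 0;
}

With this predicate, models trained under the old NaN handling keep their inference results even though the CPU kernel itself now lives in phi.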
diff --git a/paddle/fluid/operators/top_k_v2_op_mlu.cc b/paddle/fluid/operators/top_k_v2_op_mlu.cc
index 5b8a6b3e75449..caaae02124c92 100644
--- a/paddle/fluid/operators/top_k_v2_op_mlu.cc
+++ b/paddle/fluid/operators/top_k_v2_op_mlu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/top_k_v2_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc
index e11070638834c..dff5c2d3f3937 100644
--- a/paddle/fluid/operators/top_k_v2_op_npu.cc
+++ b/paddle/fluid/operators/top_k_v2_op_npu.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/top_k_v2_op.h"
 #include <memory>
 #include <string>
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/top_k_v2_op_xpu.cc b/paddle/fluid/operators/top_k_v2_op_xpu.cc
index 49daac2ff0da6..4d9c39be92eff 100644
--- a/paddle/fluid/operators/top_k_v2_op_xpu.cc
+++ b/paddle/fluid/operators/top_k_v2_op_xpu.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include <memory>
 
-#include "paddle/fluid/operators/top_k_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/transpose_op.h"
 #include "xpu/refactor/math.h"
 
diff --git a/paddle/fluid/operators/trace_op.cc b/paddle/fluid/operators/trace_op.cc
index 0590b66f6f868..c6c0fa3c0019e 100644
--- a/paddle/fluid/operators/trace_op.cc
+++ b/paddle/fluid/operators/trace_op.cc
@@ -61,7 +61,7 @@ the 2-D planes specified by dim1 and dim2.
 )DOC");
   }
 };
 
-class TraceOpGrad : public framework::OperatorWithKernel {
+class TraceGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
@@ -114,7 +114,7 @@ REGISTER_OPERATOR(trace, ops::TraceOp, ops::TraceOpMaker,
                   ops::TraceGradOpMaker<paddle::framework::OpDesc>,
                   ops::TraceGradOpMaker<paddle::imperative::OpBase>,
                   TraceInferShapeFunctor);
-REGISTER_OPERATOR(trace_grad, ops::TraceOpGrad,
+REGISTER_OPERATOR(trace_grad, ops::TraceGradOp,
                   ops::TraceGradNoNeedBufferVarsInferer);
 
 /* ========================== register checkpoint ===========================*/
diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc
index 5617d728a51dc..fb39034c8e92c 100644
--- a/paddle/fluid/operators/transpose_op_npu_test.cc
+++ b/paddle/fluid/operators/transpose_op_npu_test.cc
@@ -24,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/triangular_solve_op.cc b/paddle/fluid/operators/triangular_solve_op.cc
index df84659a00f4c..35b925ca172b7 100644
--- a/paddle/fluid/operators/triangular_solve_op.cc
+++ b/paddle/fluid/operators/triangular_solve_op.cc
@@ -12,10 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/triangular_solve_op.h"
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/solve_op.h"
 #include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/triangular_solve_op.h b/paddle/fluid/operators/triangular_solve_op.h
deleted file mode 100644
index 315847b4d800e..0000000000000
--- a/paddle/fluid/operators/triangular_solve_op.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "glog/logging.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
-#include "paddle/fluid/operators/solve_op.h"
-#include "paddle/fluid/operators/tril_triu_op.h"
-#include "paddle/phi/core/ddim.h"
-#include "paddle/phi/kernels/funcs/complex_functors.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-static void triangular_solve(const DeviceContext& context, const Tensor& x,
-                             const Tensor& y, Tensor* out, bool upper,
-                             bool transpose, bool unitriangular) {
-  // Tensor broadcast using the Eigen library
-  std::vector<int64_t> x_bst_dims_vec;
-  std::vector<int64_t> y_bst_dims_vec;
-  std::tie(x_bst_dims_vec, y_bst_dims_vec) = get_broadcast_dims(x, y);
-
-  Tensor x_bst(x.type());
-  TensorExpand<T, DeviceContext>(context, x, &x_bst, x_bst_dims_vec);
-
-  Tensor y_bst(y.type());
-  TensorExpand<T, DeviceContext>(context, y, &y_bst, y_bst_dims_vec);
-
-  // TriangularSolveFunctor performs calculations in-place
-  // x_clone should be a copy of 'x' after broadcast
-  // out should be a copy of 'y' after broadcast
-  Tensor x_clone(x.type());
-  x_clone.Resize(phi::make_ddim(x_bst_dims_vec));
-  x_clone.mutable_data<T>(context.GetPlace());
-  framework::TensorCopy(x_bst, context.GetPlace(), context, &x_clone);
-
-  out->Resize(phi::make_ddim(y_bst_dims_vec));
-  out->mutable_data<T>(context.GetPlace());
-  framework::TensorCopy(y_bst, context.GetPlace(), context, out);
-
-  math::TriangularSolveFunctor<DeviceContext, T> functor;
-  functor(context, &x_clone, out, /*left=*/true, upper, transpose,
-          unitriangular);
-}
-
-template <typename DeviceContext, typename T>
-class MatrixReduceSumFunctor {
- public:
-  void operator()(const Tensor& input, Tensor* output,
-                  const framework::ExecutionContext& ctx);
-};
-
-template <typename T>
-class MatrixReduceSumFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const Tensor& in, Tensor* out,
-                  const framework::ExecutionContext& ctx) {
-    // For example: in's dim = [5, 3, 2, 7, 3]; out's dim = [3, 1, 7, 3]
-    // out_reduce_dim should be [0, 2]
-    const std::vector<std::int64_t> in_dims = phi::vectorize(in.dims());
-    auto in_size = in_dims.size();
-    const std::vector<std::int64_t> out_dims = phi::vectorize(out->dims());
-    auto out_size = out_dims.size();
-
-    std::vector<std::int64_t> out_bst_dims(in_size);
-
-    std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1);
-    std::copy(out_dims.data(), out_dims.data() + out_size,
-              out_bst_dims.data() + in_size - out_size);
-    out->Resize(phi::make_ddim(out_bst_dims));
-
-    std::vector<int> out_reduce_dims;
-    for (size_t idx = 0; idx <= in_size - 3; idx++) {
-      if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) {
-        out_reduce_dims.push_back(idx);
-      }
-    }
-
-    ReduceKernelFunctor<platform::CPUDeviceContext, T, SumFunctor>(
-        &in, out, out_reduce_dims, true, false, ctx)
-        .template apply<T>();
-    out->Resize(phi::make_ddim(out_dims));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
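The deleted MatrixReduceSumFunctor exists because triangular_solve broadcasts its batch dimensions: in the backward pass, every axis that was expanded from size 1 has to be summed back. A small self-contained sketch of just that axis bookkeeping, using the example from the deleted comment (hypothetical standalone code, not the Paddle functor):

// Sketch of how the deleted functor picks the axes to sum over
// (standalone, hypothetical code; not the Paddle implementation).
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Example from the deleted comment: in dims [5, 3, 2, 7, 3],
  // out dims [3, 1, 7, 3] -> out_reduce_dim should be [0, 2].
  std::vector<int64_t> in_dims = {5, 3, 2, 7, 3};
  std::vector<int64_t> out_dims = {3, 1, 7, 3};

  // Left-pad out_dims with 1s so both shapes have the same rank.
  std::vector<int64_t> out_bst(in_dims.size(), 1);
  std::copy(out_dims.begin(), out_dims.end(),
            out_bst.begin() + (in_dims.size() - out_dims.size()));

  // Any batch axis where the gradient was broadcast (out is 1 but in is
  // not) must be summed; the last two axes are the matrix itself.
  std::vector<int> reduce_dims;
  for (size_t i = 0; i + 2 < in_dims.size(); ++i) {
    if (in_dims[i] != 1 && out_bst[i] == 1) {
      reduce_dims.push_back(static_cast<int>(i));
    }
  }
  for (int d : reduce_dims) {
    std::printf("reduce axis %d\n", d);  // prints 0 and 2
  }
  return 0;
}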
diff --git a/paddle/fluid/operators/tril_triu_op.cc b/paddle/fluid/operators/tril_triu_op.cc
index 3e943c62e1ce1..c8010e8a128e0 100644
--- a/paddle/fluid/operators/tril_triu_op.cc
+++ b/paddle/fluid/operators/tril_triu_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/tril_triu_op.h"
 #include <memory>
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -104,19 +104,3 @@ REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker,
                   ops::TrilTriuGradOpMaker<paddle::framework::OpDesc>,
                   ops::TrilTriuGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp);
-REGISTER_OP_CPU_KERNEL(
-    tril_triu, ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, bool>,
-    ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext,
-                          paddle::platform::float16>);
-REGISTER_OP_CPU_KERNEL(
-    tril_triu_grad,
-    ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, bool>,
-    ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext,
-                              paddle::platform::float16>);
diff --git a/paddle/fluid/operators/tril_triu_op.cu b/paddle/fluid/operators/tril_triu_op.cu
deleted file mode 100644
index 9cbbdeeb2ce28..0000000000000
--- a/paddle/fluid/operators/tril_triu_op.cu
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/tril_triu_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(
-    tril_triu, ops::TrilTriuOpKernel<plat::CUDADeviceContext, bool>,
-    ops::TrilTriuOpKernel<plat::CUDADeviceContext, float>,
-    ops::TrilTriuOpKernel<plat::CUDADeviceContext, double>,
-    ops::TrilTriuOpKernel<plat::CUDADeviceContext, int>,
-    ops::TrilTriuOpKernel<plat::CUDADeviceContext, int64_t>,
-    ops::TrilTriuOpKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    tril_triu_grad,
-    ops::TrilTriuGradOpKernel<plat::CUDADeviceContext, bool>,
-    ops::TrilTriuGradOpKernel<plat::CUDADeviceContext, float>,
-    ops::TrilTriuGradOpKernel<plat::CUDADeviceContext, double>,
-    ops::TrilTriuGradOpKernel<plat::CUDADeviceContext, int>,
-    ops::TrilTriuGradOpKernel<plat::CUDADeviceContext, int64_t>,
-    ops::TrilTriuGradOpKernel<plat::CUDADeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/tril_triu_op.h b/paddle/fluid/operators/tril_triu_op.h
deleted file mode 100644
index 3150b7617d10a..0000000000000
--- a/paddle/fluid/operators/tril_triu_op.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class TrilTriuCompute {
- public:
-  HOSTDEVICE TrilTriuCompute(const T* in, const int diagonal, const bool lower,
-                             const int64_t H, const int64_t W, T* out)
-      : in_(in), diagonal_(diagonal), lower_(lower), H_(H), W_(W), out_(out) {}
-
-  HOSTDEVICE void operator()(int64_t idx) {
-    const int64_t row = (idx / W_) % H_;
-    const int64_t col = idx % W_;
-    const bool mask =
-        lower_ ? (col - row > diagonal_) : (col - row < diagonal_);
-    out_[idx] = mask ? static_cast<T>(0) : in_[idx];
-  }
-
- private:
-  const T* in_;
-  const int diagonal_;
-  const bool lower_;
-  const int64_t H_;
-  const int64_t W_;
-  T* out_;
-};
-
-template <typename DeviceContext, typename T>
-class TrilTriuOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const auto* x = context.Input<framework::Tensor>("X");
-    const auto* x_data = x->data<T>();
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto* out_data = out->mutable_data<T>(context.GetPlace());
-
-    const int diagonal = context.Attr<int>("diagonal");
-    const bool lower = context.Attr<bool>("lower");
-
-    const auto& dims = x->dims();
-    const auto H = dims[dims.size() - 2];
-    const auto W = dims[dims.size() - 1];
-
-    platform::ForRange<DeviceContext> for_range(
-        context.template device_context<DeviceContext>(),
-        static_cast<size_t>(x->numel()));
-
-    paddle::operators::TrilTriuCompute<T> tril_triu_computer(
-        x_data, diagonal, lower, H, W, out_data);
-    for_range(tril_triu_computer);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class TrilTriuGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const auto* d_out =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    const auto* dout_data = d_out->data<T>();
-    auto* d_x = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto* dx_data = d_x->mutable_data<T>(context.GetPlace());
-
-    const int diagonal = context.Attr<int>("diagonal");
-    const bool lower = context.Attr<bool>("lower");
-
-    const auto& dims = d_out->dims();
-    const auto H = dims[dims.size() - 2];
-    const auto W = dims[dims.size() - 1];
-
-    platform::ForRange<DeviceContext> for_range(
-        context.template device_context<DeviceContext>(),
-        static_cast<size_t>(d_out->numel()));
-
-    paddle::operators::TrilTriuCompute<T> tril_triu_grad_computer(
-        dout_data, diagonal, lower, H, W, dx_data);
-    for_range(tril_triu_grad_computer);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/tril_triu_op_npu.cc b/paddle/fluid/operators/tril_triu_op_npu.cc
index ad1c1814c05cd..4145730357d60 100644
--- a/paddle/fluid/operators/tril_triu_op_npu.cc
+++ b/paddle/fluid/operators/tril_triu_op_npu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/tril_triu_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
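Before the XPU port below, note the predicate the deleted TrilTriuCompute used: an element at (row, col) is zeroed when col - row > diagonal for tril, and when col - row < diagonal for triu. A standalone check of that rule on a 3x3 matrix (illustrative only):

// Standalone demo of TrilTriuCompute's masking rule on a 3x3 matrix.
#include <cstdio>

int main() {
  const int H = 3, W = 3, diagonal = 0;
  const bool lower = true;  // tril; set false for triu
  for (int idx = 0; idx < H * W; ++idx) {
    const int row = (idx / W) % H;
    const int col = idx % W;
    const bool mask = lower ? (col - row > diagonal) : (col - row < diagonal);
    std::printf("%d ", mask ? 0 : 1);  // 1 = element kept
    if (col == W - 1) std::printf("\n");
  }
  // Prints the lower-triangular keep pattern:
  // 1 0 0
  // 1 1 0
  // 1 1 1
  return 0;
}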
diff --git a/paddle/fluid/operators/tril_triu_op_xpu.cc b/paddle/fluid/operators/tril_triu_op_xpu.cc
new file mode 100644
index 0000000000000..a44ea8ff689b8
--- /dev/null
+++ b/paddle/fluid/operators/tril_triu_op_xpu.cc
@@ -0,0 +1,53 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under
+the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device/device_wrapper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class TrilTriuXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const auto* x = context.Input<Tensor>("X");
+    const auto* x_data = x->data<T>();
+    auto* out = context.Output<Tensor>("Out");
+    auto* out_data = out->mutable_data<T>(context.GetPlace());
+
+    const int diagonal = context.Attr<int>("diagonal");
+    const bool lower = context.Attr<bool>("lower");
+    auto xshape = phi::vectorize<int>(x->dims());
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    int r = 0;
+    if (lower) {
+      r = xpu::tril(dev_ctx.x_context(), x_data, out_data, xshape, diagonal);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "tril_op");
+    } else {
+      r = xpu::triu(dev_ctx.x_context(), x_data, out_data, xshape, diagonal);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "triu_op");
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(
+    tril_triu, ops::TrilTriuXPUKernel<paddle::platform::XPUDeviceContext, int>,
+    ops::TrilTriuXPUKernel<paddle::platform::XPUDeviceContext, float>);
+#endif
diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake
index 5ab2004617810..1be8f3387dbad 100644
--- a/paddle/fluid/operators/unity_build_rule.cmake
+++ b/paddle/fluid/operators/unity_build_rule.cmake
@@ -236,7 +236,6 @@ register_unity_group(cc
     scatter_nd_add_op.cc
     scatter_op.cc
     seed_op.cc
-    segment_pool_op.cc
     select_input_op.cc
     select_output_op.cc)
 register_unity_group(cc
@@ -496,8 +495,7 @@ register_unity_group(cu
     scale_op.cu
     scatter_nd_add_op.cu
     scatter_op.cu
-    seed_op.cu
-    segment_pool_op.cu)
+    seed_op.cu)
 register_unity_group(cu
     roi_pool_op.cu
     selu_op.cu
diff --git a/paddle/fluid/operators/unsqueeze_op.h b/paddle/fluid/operators/unsqueeze_op.h
index 7f676cbb65ee4..f6112fb59c122 100644
--- a/paddle/fluid/operators/unsqueeze_op.h
+++ b/paddle/fluid/operators/unsqueeze_op.h
@@ -16,7 +16,6 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/pooling.h"
 #include "paddle/fluid/operators/utils.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
diff --git a/paddle/fluid/operators/unsqueeze_op_npu_test.cc b/paddle/fluid/operators/unsqueeze_op_npu_test.cc
index 3e11c952d15f3..a8ced783744a9 100644
--- a/paddle/fluid/operators/unsqueeze_op_npu_test.cc
+++ b/paddle/fluid/operators/unsqueeze_op_npu_test.cc
@@ -24,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/viterbi_decode_op.cc b/paddle/fluid/operators/viterbi_decode_op.cc
index bf1cdeed65a84..602376d54e0d2 100644
--- a/paddle/fluid/operators/viterbi_decode_op.cc
+++ b/paddle/fluid/operators/viterbi_decode_op.cc
@@ -9,8 +9,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/viterbi_decode_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/ternary.h"
 
 namespace paddle {
 namespace operators {
@@ -19,47 +21,6 @@ class ViterbiDecodeOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasInput("Transition"), "Input", "Transition",
-                   "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasInput("Length"), "Input", "Length", "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasOutput("Scores"), "Output", "Scores",
-                   "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasOutput("Path"), "Output", "Path", "ViterbiDecode");
-    auto in_dims = ctx->GetInputDim("Input");
-    PADDLE_ENFORCE_EQ(in_dims.size(), 3,
-                      platform::errors::InvalidArgument(
-                          "The rank of Input in ViterbiDecode must be 3. But "
-                          "received Input's rank is %d.",
-                          in_dims.size()));
-    auto length_dims = ctx->GetInputDim("Length");
-    PADDLE_ENFORCE_EQ(length_dims.size(), 1,
-                      platform::errors::InvalidArgument(
-                          "The rank of Length in ViterbiDecode must be 1. But "
-                          "received Length's rank is %d.",
-                          length_dims.size()));
-    auto transition_dims = ctx->GetInputDim("Transition");
-    PADDLE_ENFORCE_EQ(
-        transition_dims.size(), 2,
-        platform::errors::InvalidArgument(
-            "The rank of Transition in ViterbiDecode must be 2. But "
-            "received Transition's rank is %d.",
-            transition_dims.size()));
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(
-          in_dims[0], length_dims[0],
-          platform::errors::InvalidArgument(
-              "The batch size of Input and Length should be equal."));
-      PADDLE_ENFORCE_EQ(in_dims[2], transition_dims[0],
-                        platform::errors::InvalidArgument(
-                            "The number of tags of Input (%d) and Transition "
-                            "(%d) should be equal.",
-                            transition_dims[0], in_dims[2]));
-    }
-    ctx->SetOutputDim("Scores", length_dims);
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -102,8 +63,8 @@ class ViterbiDecodeOpMaker : public framework::OpProtoAndCheckerMaker {
 
 namespace ops = paddle::operators;
 namespace platform = paddle::platform;
+DECLARE_INFER_SHAPE_FUNCTOR(viterbi_decode, ViterbiDecodeInferShapeFunctor,
+                            PD_INFER_META(phi::ViterbiDecodeInferMeta));
 REGISTER_OP_WITHOUT_GRADIENT(viterbi_decode, ops::ViterbiDecodeOp,
-                             ops::ViterbiDecodeOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    viterbi_decode, ops::ViterbiDecodeKernel<platform::CPUDeviceContext, float>,
-    ops::ViterbiDecodeKernel<platform::CPUDeviceContext, double>);
+                             ops::ViterbiDecodeOpMaker,
+                             ViterbiDecodeInferShapeFunctor);
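The viterbi_decode change repeats the PR's recurring pattern: the hand-written InferShape override is deleted and DECLARE_INFER_SHAPE_FUNCTOR wires the op to phi::ViterbiDecodeInferMeta at registration time. The shape contract itself does not change; restated below as a runnable standalone sketch of the checks the deleted code performed (illustrative, not the phi implementation):

// Standalone restatement of the shape contract that moved from the deleted
// fluid InferShape into phi::ViterbiDecodeInferMeta (illustrative only).
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int64_t> input = {8, 20, 5};   // [batch, seq_len, n_tags]
  std::vector<int64_t> transition = {5, 5};  // [n_tags, n_tags]
  std::vector<int64_t> length = {8};         // [batch]

  assert(input.size() == 3);          // Input must be rank 3
  assert(length.size() == 1);         // Length must be rank 1
  assert(transition.size() == 2);     // Transition must be rank 2
  assert(input[0] == length[0]);      // batch sizes agree (runtime check)
  assert(input[2] == transition[0]);  // tag counts agree (runtime check)

  // Scores inherits Length's dims, as in ctx->SetOutputDim("Scores", ...).
  std::vector<int64_t> scores_dims = length;
  std::printf("Scores dims: [%lld]\n", static_cast<long long>(scores_dims[0]));
  return 0;
}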
diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu
deleted file mode 100644
index 68628fb2748c4..0000000000000
--- a/paddle/fluid/operators/viterbi_decode_op.cu
+++ /dev/null
@@ -1,206 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/elementwise_functor.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
-#include "paddle/fluid/operators/viterbi_decode_op.h"
-#include "paddle/phi/kernels/funcs/gather.cu.h"
-
-#ifdef __NVCC__
-#include "cub/cub.cuh"
-#endif
-#ifdef __HIPCC__
-#include <hipcub/hipcub.hpp>
-namespace cub = hipcub;
-#endif
-
-namespace paddle {
-namespace operators {
-
-#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...)  \
-  case (1 << (log2_block_dim)): {                       \
-    constexpr auto kBlockDim = (1 << (log2_block_dim)); \
-    __VA_ARGS__;                                        \
-  } break
-
-#define FIXED_BLOCK_DIM_CASE(...)               \
-  FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__);
-
-int64_t ComputeBlockSize(int64_t col) {
-  if (col > 512)
-    return 1024;
-  else if (col > 256)
-    return 512;
-  else if (col > 128)
-    return 256;
-  else if (col > 64)
-    return 128;
-  else if (col > 32)
-    return 64;
-  else if (col > 16)
-    return 32;
-  else if (col > 8)
-    return 16;
-  else
-    return 8;
-}
-
-template
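ComputeBlockSize in the deleted CUDA file picks the launch block size by rounding the column count up to the next power of two, clamped to [8, 1024], which is exactly the set of cases the FIXED_BLOCK_DIM_CASE dispatch above can instantiate. A quick standalone check (illustrative only):

// Quick check of the deleted ComputeBlockSize: round the column count up
// to the next power of two, clamped to [8, 1024] (standalone sketch).
#include <cstdint>
#include <cstdio>

int64_t ComputeBlockSize(int64_t col) {
  if (col > 512) return 1024;
  if (col > 256) return 512;
  if (col > 128) return 256;
  if (col > 64) return 128;
  if (col > 32) return 64;
  if (col > 16) return 32;
  if (col > 8) return 16;
  return 8;
}

int main() {
  const int64_t cols[] = {5, 8, 9, 100, 600};
  for (int64_t c : cols) {
    std::printf("col=%lld -> block=%lld\n", static_cast<long long>(c),
                static_cast<long long>(ComputeBlockSize(c)));
  }
  // 5 -> 8, 8 -> 8, 9 -> 16, 100 -> 128, 600 -> 1024
  return 0;
}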