diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 90cb686700ef2..d453e9d2a2acd 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -7,46 +7,62 @@ SET(XPU_PROJECT "extern_xpu") SET(XPU_API_LIB_NAME "libxpuapi.so") SET(XPU_RT_LIB_NAME "libxpurt.so") +if(NOT DEFINED XPU_BASE_URL) + SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220411") +else() + SET(XPU_BASE_URL "${XPU_BASE_URL}") +endif() + +# ubuntu and centos: use output by XDNN API team +if(NOT DEFINED XPU_XDNN_BASE_URL) + SET(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev") + SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220412") +else() + SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") +endif() + IF(WITH_AARCH64) SET(XPU_XRE_DIR_NAME "xre-kylin_aarch64") SET(XPU_XDNN_DIR_NAME "xdnn-kylin_aarch64") SET(XPU_XCCL_DIR_NAME "xccl-kylin_aarch64") + SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) ELSEIF(WITH_SUNWAY) SET(XPU_XRE_DIR_NAME "xre-deepin_sw6_64") SET(XPU_XDNN_DIR_NAME "xdnn-deepin_sw6_64") SET(XPU_XCCL_DIR_NAME "xccl-deepin_sw6_64") + SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) ELSEIF(WITH_BDCENTOS) SET(XPU_XRE_DIR_NAME "xre-bdcentos_x86_64") - SET(XPU_XDNN_DIR_NAME "xdnn-bdcentos_x86_64") + SET(XPU_XDNN_DIR_NAME "XDNN-bdcentos_x86_64") SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") + # ubuntu and centos: use output by XDNN API team + SET(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) ELSEIF(WITH_UBUNTU) SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") - SET(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64") + SET(XPU_XDNN_DIR_NAME "XDNN-ubuntu_x86_64") SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") + # ubuntu and centos: use output by XDNN API team + SET(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) ELSEIF(WITH_CENTOS) SET(XPU_XRE_DIR_NAME "xre-centos7_x86_64") - SET(XPU_XDNN_DIR_NAME "xdnn-centos7_x86_64") + SET(XPU_XDNN_DIR_NAME "XDNN-bdcentos_x86_64") SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") - -ELSE () + # ubuntu and centos: use output by XDNN API team + SET(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) +ELSE() SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") - SET(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64") + SET(XPU_XDNN_DIR_NAME "XDNN-bdcentos_x86_64") SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") + # default: use output by XDNN API team + SET(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) ENDIF() -if(NOT DEFINED XPU_BASE_URL) - SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220411") -else() - SET(XPU_BASE_URL "${XPU_BASE_URL}") -endif() - SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) -SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220411/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_PACK_DEPENCE_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh" CACHE STRING "" FORCE) -SET(SNAPPY_PREFIX_DIR "${THIRD_PARTY_PATH}/xpu") +SET(SNAPPY_PREFIX_DIR "${THIRD_PARTY_PATH}/xpu") SET(XPU_DOWNLOAD_DIR "${SNAPPY_PREFIX_DIR}/src/${XPU_PROJECT}") SET(XPU_INSTALL_DIR 
"${THIRD_PARTY_PATH}/install/xpu") SET(XPU_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") diff --git a/cmake/xpu_kp.cmake b/cmake/xpu_kp.cmake index 9047e6a9261ec..9cddbe1496478 100644 --- a/cmake/xpu_kp.cmake +++ b/cmake/xpu_kp.cmake @@ -128,7 +128,7 @@ macro(compile_kernel COMPILE_ARGS) COMMAND ${CMAKE_COMMAND} -E make_directory kernel_build COMMAND - cp ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu -rf + ${CMAKE_COMMAND} -E copy ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu COMMAND ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} -I. -o kernel_build/${kernel_name}.bin.o.sec kernel_build/${kernel_name}.xpu @@ -151,7 +151,7 @@ macro(compile_kernel COMPILE_ARGS) COMMAND ${CMAKE_COMMAND} -E make_directory kernel_build COMMAND - cp ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu -rf + ${CMAKE_COMMAND} -E copy ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu COMMAND ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} -I. -o kernel_build/${kernel_name}.host.o kernel_build/${kernel_name}.xpu diff --git a/paddle/fluid/distributed/collective/Common.cc b/paddle/fluid/distributed/collective/Common.cc index 02eab58478ccc..4a883f8196389 100644 --- a/paddle/fluid/distributed/collective/Common.cc +++ b/paddle/fluid/distributed/collective/Common.cc @@ -41,13 +41,14 @@ std::string GetKeyFromPlaces(const std::vector& places) { } static bool CheckTensorsInPlace(const std::vector& tensors, - const PlaceType type) { - return std::all_of(tensors.cbegin(), tensors.cend(), - [&](const Tensor& t) { return t.place() == type; }); + phi::AllocationType type) { + return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) { + return t.place().GetType() == type; + }); } bool CheckTensorsInCudaPlace(const std::vector& tensors) { - return CheckTensorsInPlace(tensors, PlaceType::kGPU); + return CheckTensorsInPlace(tensors, phi::AllocationType::GPU); } } // namespace distributed diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 71741515c90d5..02f7f25636410 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -414,20 +414,13 @@ void EagerReducer::InitializeDenseGroups( p_group->dense_tensors_.push_back(phi::DenseTensor()); const auto &dtype = tensor.dtype(); - const auto &place = tensor.place(); const auto &inner_place = tensor.impl()->place(); if (index > 0) { PADDLE_ENFORCE_EQ(dtype, p_group->dtype_, platform::errors::PreconditionNotMet( "Tensor %s has unexpected dtype.", tensor_name)); - PADDLE_ENFORCE_EQ(place, place_, - platform::errors::PreconditionNotMet( - "Tensor %s has different place. 
Expected place is " - "%s, but actual place is %s", - tensor_name, inner_place_, inner_place)); } else { p_group->dtype_ = dtype; - place_ = place; inner_place_ = inner_place; } } diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h index 12c02509884e9..424bae0e5acd1 100644 --- a/paddle/fluid/distributed/collective/reducer.h +++ b/paddle/fluid/distributed/collective/reducer.h @@ -26,7 +26,6 @@ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/phi/api/include/api.h" #include "paddle/phi/api/include/tensor.h" -#include "paddle/phi/api/lib/ext_compat_utils.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/utils/string/string_helper.h" @@ -121,7 +120,6 @@ class EagerReducer { std::vector groups_; std::vector variable_locators_; - PlaceType place_; platform::Place inner_place_; size_t next_group_ = 0; int64_t nranks_ = -1; diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index df4879735bb82..fbcd920905c9d 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -36,7 +36,6 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/phi/api/all.h" -#include "paddle/phi/api/lib/ext_compat_utils.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/utils/any.h" @@ -627,8 +626,8 @@ class CustomGradOpMaker static void RegisterOperatorKernelWithPlace( const std::string& name, const OperatorWithKernel::OpKernelFunc& op_kernel_func, - const proto::VarType::Type type, const PlaceType& place) { - OpKernelType key(type, experimental::ConvertExtPlaceToInnerPlace(place)); + const proto::VarType::Type type, const platform::Place& place) { + OpKernelType key(type, place); VLOG(3) << "Custom Operator: op kernel key: " << key; OperatorWithKernel::AllOpKernels()[name][key] = op_kernel_func; } @@ -666,10 +665,10 @@ static void RegisterOperatorKernel(const std::string& name, op_kernel_func = func; } RegisterOperatorKernelWithPlace(name, op_kernel_func, proto::VarType::RAW, - PlaceType::kCPU); + platform::CPUPlace()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) RegisterOperatorKernelWithPlace(name, op_kernel_func, proto::VarType::RAW, - PlaceType::kGPU); + platform::CUDAPlace()); #endif } diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h old mode 100755 new mode 100644 index 8e51f0e2405bf..6d3a4c5d9c0b9 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_HETERPS +#include #include #include #include @@ -38,7 +39,7 @@ namespace framework { class HeterContext { public: - ~HeterContext() { + virtual ~HeterContext() { if (!multi_mf_dim_) { for (size_t i = 0; i < mutex_.size(); ++i) { delete mutex_[i]; @@ -56,9 +57,12 @@ class HeterContext { Scope* scope_{nullptr}; std::vector> feature_keys_; std::vector>> feature_dim_keys_; + std::vector>> device_task_keys_; #ifdef PADDLE_WITH_PSLIB std::vector> value_ptr_; + std::vector>> + device_task_ptr_; std::vector>> value_dim_ptr_; std::vector>> @@ -68,6 +72,8 @@ class HeterContext { std::vector> value_ptr_; std::vector>> value_dim_ptr_; + std::vector>> + device_task_ptr_; std::vector>> device_dim_ptr_; #endif @@ -93,6 +99,12 @@ class HeterContext { shard_num_ = shard_num; feature_keys_.resize(shard_num_); value_ptr_.resize(shard_num_); + device_task_ptr_.resize(shard_num_); + device_task_keys_.resize(shard_num_); + for (size_t i = 0; i < device_task_ptr_.size(); i++) { + device_task_ptr_[i].resize(device_num); + device_task_keys_[i].resize(device_num); + } device_values_.resize(device_num); device_keys_.resize(device_num); @@ -108,6 +120,12 @@ class HeterContext { feature_dim_keys_.resize(shard_num_); value_ptr_.resize(shard_num_); value_dim_ptr_.resize(shard_num_); + device_task_ptr_.resize(shard_num_); + device_task_keys_.resize(shard_num_); + for (size_t i = 0; i < device_task_ptr_.size(); i++) { + device_task_ptr_[i].resize(device_num); + device_task_keys_[i].resize(device_num); + } for (size_t i = 0; i < feature_dim_keys_.size(); i++) { feature_dim_keys_[i].resize(dim_num); value_dim_ptr_[i].resize(dim_num); @@ -151,6 +169,12 @@ class HeterContext { for (size_t i = 0; i < device_keys_.size(); ++i) { device_keys_[i].clear(); } + for (size_t i = 0; i < device_task_ptr_.size(); ++i) { + for (size_t j = 0; j < device_task_ptr_[i].size(); ++j) { + device_task_ptr_[i][j].clear(); + device_task_keys_[i][j].clear(); + } + } } else { VLOG(3) << "Reset gpu task with dynamic mf dimention"; for (size_t i = 0; i < feature_dim_keys_.size(); i++) { diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc old mode 100755 new mode 100644 index e167a39caa526..115ec4d0102cc --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -298,6 +298,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { platform::Timer timeline; + std::vector> task_futures; int device_num = heter_devices_.size(); auto& local_keys = gpu_task->feature_keys_; auto& local_ptr = gpu_task->value_ptr_; @@ -316,7 +317,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { device_dim_ptr[dev].resize(multi_mf_dim_); } } - auto& device_mutex = gpu_task->mutex_; + // auto& device_mutex = gpu_task->mutex_; std::vector threads(thread_keys_shard_num_); #ifdef PADDLE_WITH_PSLIB @@ -502,6 +503,8 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { table_id_, pass_id, pass_values); } #endif + auto& device_task_keys = gpu_task->device_task_keys_; + auto& device_task_ptrs = gpu_task->device_task_ptr_; auto build_dynamic_mf_func = [this, device_num, &local_dim_keys, &local_dim_ptr, &device_dim_keys, &device_dim_ptr, @@ -534,17 +537,14 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { #endif }; auto build_func = [device_num, record_status, &pass_values, &local_keys, - &local_ptr, &device_keys, &device_vals, - &device_mutex](int i) { - std::vector> 
task_keys(device_num); + &local_ptr, &device_task_keys, &device_task_ptrs](int i) { + auto& task_keys = device_task_keys[i]; #ifdef PADDLE_WITH_PSLIB - std::vector> task_ptrs( - device_num); + auto& task_ptrs = device_task_ptrs[i]; #endif #ifdef PADDLE_WITH_PSCORE - std::vector> task_ptrs( - device_num); + auto& task_ptrs = device_task_ptrs[i]; #endif for (size_t j = 0; j < local_keys[i].size(); j++) { @@ -569,88 +569,139 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } } #endif - for (int dev = 0; dev < device_num; dev++) { - device_mutex[dev]->lock(); + }; + if (!multi_mf_dim_) { + for (int i = 0; i < thread_keys_shard_num_; i++) { + task_futures.emplace_back(hbm_thread_pool_[i]->enqueue(build_func, i)); + } + for (auto& f : task_futures) { + f.wait(); + } + task_futures.clear(); + VLOG(0) << "GpuPs build hbmps done"; + } + std::vector> prefix_sum; + prefix_sum.resize(device_num); + for (int i = 0; i < device_num; i++) { + prefix_sum[i].resize(thread_keys_shard_num_ + 1); + prefix_sum[i][0] = 0; + } + auto calc_prefix_func = [this, &prefix_sum, &device_keys, &device_vals, + &device_task_keys](int device_num) { + for (int j = 0; j < thread_keys_shard_num_; j++) { + prefix_sum[device_num][j + 1] = + prefix_sum[device_num][j] + device_task_keys[j][device_num].size(); + } + device_keys[device_num].resize( + prefix_sum[device_num][thread_keys_shard_num_]); + device_vals[device_num].resize( + prefix_sum[device_num][thread_keys_shard_num_]); + }; + if (!multi_mf_dim_) { + for (int i = 0; i < device_num; i++) { + task_futures.emplace_back( + hbm_thread_pool_[i]->enqueue(calc_prefix_func, i)); + } + for (auto& f : task_futures) { + f.wait(); + } + task_futures.clear(); + } + VLOG(0) << "prefix done"; + auto prepare_dev_value_func = [device_num, &prefix_sum, &device_keys, + &device_vals, &device_task_keys, + &device_task_ptrs](int dev, int shard_id) { + auto& task_keys = device_task_keys[shard_id]; +#ifdef PADDLE_WITH_PSLIB + auto& task_ptrs = device_task_ptrs[shard_id]; +#endif + +#ifdef PADDLE_WITH_PSCORE + auto& task_ptrs = device_task_ptrs[dev]; +#endif - int len = task_keys[dev].size(); - int cur = device_keys[dev].size(); - device_keys[dev].resize(device_keys[dev].size() + len); - device_vals[dev].resize(device_vals[dev].size() + len); + int len = prefix_sum[dev][shard_id + 1] - prefix_sum[dev][shard_id]; + int cur = prefix_sum[dev][shard_id]; #ifdef PADDLE_WITH_PSLIB - for (int j = 0; j < len; ++j) { - device_keys[dev][cur + j] = task_keys[dev][j]; - float* ptr_val = task_ptrs[dev][j]->data(); - FeatureValue& val = device_vals[dev][cur + j]; - size_t dim = task_ptrs[dev][j]->size(); - - val.delta_score = ptr_val[1]; - val.show = ptr_val[2]; - val.clk = ptr_val[3]; - val.slot = ptr_val[6]; - val.lr = ptr_val[4]; - val.lr_g2sum = ptr_val[5]; - val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); - - if (dim > 7) { - val.mf_size = MF_DIM + 1; - for (int x = 0; x < val.mf_size; x++) { - val.mf[x] = ptr_val[x + 7]; - } - } else { - val.mf_size = 0; - for (int x = 0; x < MF_DIM + 1; x++) { - val.mf[x] = 0; - } + for (int j = 0; j < len; ++j) { + device_keys[dev][cur + j] = task_keys[dev][j]; + float* ptr_val = task_ptrs[dev][j]->data(); + FeatureValue& val = device_vals[dev][cur + j]; + size_t dim = task_ptrs[dev][j]->size(); + + val.delta_score = ptr_val[1]; + val.show = ptr_val[2]; + val.clk = ptr_val[3]; + val.slot = ptr_val[6]; + val.lr = ptr_val[4]; + val.lr_g2sum = ptr_val[5]; + val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); + + if (dim > 7) { + val.mf_size = MF_DIM + 1; + for (int x = 
0; x < val.mf_size; x++) { + val.mf[x] = ptr_val[x + 7]; + } + } else { + val.mf_size = 0; + for (int x = 0; x < MF_DIM + 1; x++) { + val.mf[x] = 0; } } + } #endif #ifdef PADDLE_WITH_PSCORE - for (int j = 0; j < len; ++j) { - device_keys[dev][cur + j] = task_keys[dev][j]; - float* ptr_val = task_ptrs[dev][j]->data(); - FeatureValue& val = device_vals[dev][cur + j]; - size_t dim = task_ptrs[dev][j]->size(); - val.delta_score = ptr_val[2]; - val.show = ptr_val[3]; - val.clk = ptr_val[4]; - val.slot = ptr_val[0]; - val.lr = ptr_val[5]; - val.lr_g2sum = ptr_val[6]; - val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); - - if (dim > 7) { - val.mf_size = MF_DIM + 1; - for (int x = 0; x < val.mf_size; x++) { - val.mf[x] = ptr_val[x + 7]; - } - } else { - val.mf_size = 0; - for (int x = 0; x < MF_DIM + 1; x++) { - val.mf[x] = 0; - } + for (int j = 0; j < len; ++j) { + device_keys[dev][cur + j] = task_keys[dev][j]; + float* ptr_val = task_ptrs[dev][j]->data(); + FeatureValue& val = device_vals[dev][cur + j]; + size_t dim = task_ptrs[dev][j]->size(); + val.delta_score = ptr_val[2]; + val.show = ptr_val[3]; + val.clk = ptr_val[4]; + val.slot = ptr_val[0]; + val.lr = ptr_val[5]; + val.lr_g2sum = ptr_val[6]; + val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); + + if (dim > 7) { + val.mf_size = MF_DIM + 1; + for (int x = 0; x < val.mf_size; x++) { + val.mf[x] = ptr_val[x + 7]; + } + } else { + val.mf_size = 0; + for (int x = 0; x < MF_DIM + 1; x++) { + val.mf[x] = 0; } } + } #endif - VLOG(3) << "GpuPs build hbmps done"; + VLOG(3) << "GpuPs build hbmps done"; - device_mutex[dev]->unlock(); - } }; - if (!multi_mf_dim_) { - for (size_t i = 0; i < threads.size(); i++) { - threads[i] = std::thread(build_func, i); - } - } else { + if (multi_mf_dim_) { for (int i = 0; i < thread_keys_shard_num_; i++) { for (int j = 0; j < multi_mf_dim_; j++) { threads[i * multi_mf_dim_ + j] = std::thread(build_dynamic_mf_func, i, j); } } - } - for (std::thread& t : threads) { - t.join(); + for (std::thread& t : threads) { + t.join(); + } + } else { + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < device_num; j++) { + task_futures.emplace_back( + hbm_thread_pool_[i]->enqueue(prepare_dev_value_func, j, i)); + } + } + for (auto& f : task_futures) { + f.wait(); + } + task_futures.clear(); } timeline.Pause(); VLOG(0) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() @@ -750,7 +801,7 @@ void PSGPUWrapper::pre_build_thread() { PreBuildTask(gpu_task); timer.Pause(); VLOG(0) << "thread PreBuildTask end, cost time: " << timer.ElapsedSec() - << "s"; + << " s"; buildcpu_ready_channel_->Put(gpu_task); } VLOG(3) << "build cpu thread end"; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 9b7d6de082d1c..9551e49b6b77b 100755 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -83,6 +83,10 @@ class PSGPUWrapper { PSGPUWrapper() { HeterPs_ = NULL; sleep_seconds_before_fail_exit_ = 300; + hbm_thread_pool_.resize(thread_keys_shard_num_); + for (size_t i = 0; i < hbm_thread_pool_.size(); i++) { + hbm_thread_pool_[i].reset(new ::ThreadPool(1)); + } } void PullSparse(const paddle::platform::Place& place, const int table_id, @@ -399,6 +403,7 @@ class PSGPUWrapper { std::shared_ptr current_task_ = nullptr; std::thread pre_build_threads_; bool running_ = false; + std::vector> hbm_thread_pool_; protected: static bool is_initialized_; diff --git a/paddle/fluid/framework/infershape_utils.cc 
b/paddle/fluid/framework/infershape_utils.cc index ecc5fbdcf945d..17acbde2a09e7 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -597,7 +597,7 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, } for (auto& out_name : output_names) { - if (ctx->HasOutputs(out_name)) { + if (ctx->HasOutputs(out_name, true)) { auto output_var = ctx->GetOutputVarPtrs(out_name); if (output_var.size() == 1) { infer_meta_context.EmplaceBackOutput(std::make_shared( @@ -606,8 +606,18 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, paddle::SmallVector> outputs; outputs.reserve(output_var.size()); for (const auto& out : output_var) { - outputs.emplace_back( - std::make_shared(out, ctx->IsRuntime())); + if (ctx->IsRuntime()) { + if (BOOST_GET_CONST(Variable*, out)) { + outputs.emplace_back( + std::make_shared(out, ctx->IsRuntime())); + continue; + } + } else if (BOOST_GET_CONST(VarDesc*, out)) { + outputs.emplace_back( + std::make_shared(out, ctx->IsRuntime())); + continue; + } + outputs.emplace_back(nullptr); } infer_meta_context.EmplaceBackOutputs(std::move(outputs)); } diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index ccdd9dc9d50ce..089e68fe48c52 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -93,19 +93,24 @@ bool InterpretercoreInferShapeContext::HasInputs( return true; } -bool InterpretercoreInferShapeContext::HasOutputs( - const std::string& name) const { +bool InterpretercoreInferShapeContext::HasOutputs(const std::string& name, + bool allow_null) const { const auto& outs = ctx_.outputs; auto it = outs.find(name); if (it == outs.end() || it->second.empty()) { return false; } - for (auto& output : it->second) { - if (output == nullptr) { - return false; + if (allow_null) { + for (auto& output : it->second) { + if (output != nullptr) return true; + } + return false; + } else { + for (auto& output : it->second) { + if (output == nullptr) return false; } + return true; } - return true; } AttrReader InterpretercoreInferShapeContext::Attrs() const { diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 5704fa414bbb2..aab32cfa06d40 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -58,7 +58,8 @@ class InterpretercoreInferShapeContext : public InferShapeContext { bool HasInputs(const std::string& name) const override; - bool HasOutputs(const std::string& name) const override; + bool HasOutputs(const std::string& name, + bool allow_null = false) const override; AttrReader Attrs() const override; diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index f31fefcfade89..15b979086d1eb 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -39,7 +39,8 @@ class CompileTimeInferShapeContext : public InferShapeContext { bool HasInputs(const std::string &name) const override; - bool HasOutputs(const std::string &name) const override; + bool HasOutputs(const std::string &name, + bool allow_null = false) const override; AttrReader Attrs() const override; @@ -882,7 +883,8 @@ bool CompileTimeInferShapeContext::HasInputs(const std::string &name) const { return true; } -bool CompileTimeInferShapeContext::HasOutputs(const std::string 
&name) const { +bool CompileTimeInferShapeContext::HasOutputs(const std::string &name, + bool allow_null) const { if (op_.Outputs().find(name) == op_.Outputs().end()) { return false; } @@ -890,10 +892,17 @@ bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const { if (output_names.empty()) { return false; } - for (auto &output : output_names) { - if (!block_.HasVarRecursive(output)) return false; + if (allow_null) { + for (auto &output : output_names) { + if (block_.HasVarRecursive(output)) return true; + } + return false; + } else { + for (auto &output : output_names) { + if (!block_.HasVarRecursive(output)) return false; + } + return true; } - return true; } AttrReader CompileTimeInferShapeContext::Attrs() const { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index e6577f662ae7b..d9704d70b45ec 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -718,18 +718,24 @@ class RuntimeInferShapeContext : public InferShapeContext { return true; } - bool HasOutputs(const std::string& name) const override { + bool HasOutputs(const std::string& name, + bool allow_null = false) const override { const auto& outs = ctx_.outputs; auto it = outs.find(name); if (it == outs.end() || it->second.empty()) { return false; } - for (auto& output : it->second) { - if (output == nullptr) { - return false; + if (allow_null) { + for (auto& output : it->second) { + if (output != nullptr) return true; + } + return false; + } else { + for (auto& output : it->second) { + if (output == nullptr) return false; } + return true; } - return true; } AttrReader Attrs() const override { return AttrReader(op_.Attrs()); } diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h index 31e3929362a04..6ba60590cf8f3 100644 --- a/paddle/fluid/framework/shape_inference.h +++ b/paddle/fluid/framework/shape_inference.h @@ -69,7 +69,8 @@ class InferShapeContext { const std::string &name) const = 0; virtual bool HasInputs(const std::string &name) const = 0; - virtual bool HasOutputs(const std::string &name) const = 0; + virtual bool HasOutputs(const std::string &name, + bool allow_null = false) const = 0; virtual DDim GetInputDim(const std::string &name) const = 0; virtual std::vector GetInputsDim(const std::string &name) const = 0; diff --git a/paddle/fluid/imperative/infer_shape_context.h b/paddle/fluid/imperative/infer_shape_context.h index f871e77fdf6e2..1e5b112ece21f 100644 --- a/paddle/fluid/imperative/infer_shape_context.h +++ b/paddle/fluid/imperative/infer_shape_context.h @@ -95,17 +95,27 @@ class DygraphInferShapeContext : public framework::InferShapeContext { return true; } - bool HasOutputs(const std::string& name) const override { + bool HasOutputs(const std::string& name, + bool allow_null = false) const override { auto it = var_map_out_->find(name); if (it == var_map_out_->end() || it->second.empty()) { return false; } - for (auto& output : it->second) { - if (output == nullptr) { - return false; + if (allow_null) { + for (auto& output : it->second) { + if (output != nullptr) { + return true; + } } + return false; + } else { + for (auto& output : it->second) { + if (output == nullptr) { + return false; + } + } + return true; } - return true; } framework::AttrReader Attrs() const override { diff --git a/paddle/fluid/operators/activation_op_xpu.cc b/paddle/fluid/operators/activation_op_xpu.cc index 4c2d3fc162f83..e950f952c24e6 100644 --- a/paddle/fluid/operators/activation_op_xpu.cc +++ 
b/paddle/fluid/operators/activation_op_xpu.cc @@ -490,7 +490,6 @@ REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, XPULeakyReluFunctor, XPULeakyReluGradFunctor) REGISTER_ACTIVATION_XPU_KERNEL(reciprocal, XPUReciprocalFunctor, XPUReciprocalGradFunctor) -REGISTER_ACTIVATION_XPU_KERNEL(relu, XPUReluFunctor, XPUReluGradFunctor) REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, XPUSigmoidFunctor, XPUSigmoidGradFunctor) REGISTER_ACTIVATION_XPU_KERNEL(sqrt, XPUSqrtFunctor, XPUSqrtGradFunctor) @@ -500,6 +499,13 @@ REGISTER_ACTIVATION_XPU_KERNEL(softplus, XPUSoftPlusFunctor, REGISTER_ACTIVATION_XPU_KERNEL(swish, XPUSwishFunctor, XPUSwishGradFunctor) REGISTER_ACTIVATION_XPU_KERNEL(pow, XPUPowFunctor, XPUPowGradFunctor) +REGISTER_OP_XPU_KERNEL( + relu, ops::XPUActivationKernel>, + ops::XPUActivationKernel>); +REGISTER_OP_XPU_KERNEL( + relu_grad, ops::XPUActivationGradKernel>, + ops::XPUActivationGradKernel< + ops::XPUReluGradFunctor>); REGISTER_OP_XPU_KERNEL( tanh, ops::XPUActivationKernel>, ops::XPUActivationKernel>); diff --git a/paddle/fluid/operators/bce_loss_op_xpu.cc b/paddle/fluid/operators/bce_loss_op_xpu.cc new file mode 100644 index 0000000000000..8ec80efceb9ec --- /dev/null +++ b/paddle/fluid/operators/bce_loss_op_xpu.cc @@ -0,0 +1,70 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device/device_wrapper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class XPUBCELossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* labels = context.Input("Label"); + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + auto x_numel = x->numel(); + auto& dev_ctx = context.template device_context(); + int r = xpu::bce_loss(dev_ctx.x_context(), x->data(), + labels->data(), out->data(), x_numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "bce_loss"); + } +}; + +template +class XPUBCELossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* labels = context.Input("Label"); + auto* dout = context.Input(framework::GradVarName("Out")); + auto* dx = context.Output(framework::GradVarName("X")); + dx->mutable_data(context.GetPlace()); + + auto x_numel = x->numel(); + auto& dev_ctx = context.template device_context(); + int r = xpu::bce_loss_grad(dev_ctx.x_context(), x->data(), + labels->data(), dout->data(), + dx->data(), x_numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "bce_loss_grad"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + bce_loss, ops::XPUBCELossKernel); +REGISTER_OP_XPU_KERNEL( + bce_loss_grad, + ops::XPUBCELossGradKernel); + +#endif // PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/concat_op_xpu.cc b/paddle/fluid/operators/concat_op_xpu.cc index e4b0b0ee2e3b2..ba35098bbac10 100644 --- a/paddle/fluid/operators/concat_op_xpu.cc +++ b/paddle/fluid/operators/concat_op_xpu.cc @@ -26,6 +26,8 @@ using Tensor = framework::Tensor; template class ConcatXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& ctx) const override { auto ins = ctx.MultiInput("X"); @@ -79,10 +81,10 @@ class ConcatXPUKernel : public framework::OpKernel { auto place = ctx.GetPlace(); out->mutable_data(place); std::vector> xdims_list; - std::vector ptrs; + std::vector ptrs; for (unsigned int i = 0; i < ins.size(); ++i) { if (ins[i] && ins[i]->numel() > 0) { - ptrs.push_back(ins[i]->data()); + ptrs.push_back(reinterpret_cast(ins[i]->data())); int size = ins[i]->dims().size(); std::vector tmp_dims(size); for (int j = 0; j < size; ++j) { @@ -96,8 +98,9 @@ class ConcatXPUKernel : public framework::OpKernel { "No tensor need concat")); auto& dev_ctx = ctx.template device_context(); - int r = xpu::concat(dev_ctx.x_context(), ptrs, out->data(), - xdims_list, axis); + int r = xpu::concat(dev_ctx.x_context(), ptrs, + reinterpret_cast(out->data()), + xdims_list, axis); PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( "XPU concat kernel return wrong value[%d %s]", r, @@ -107,6 +110,8 @@ class ConcatXPUKernel : public framework::OpKernel { template class ConcatGradXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& ctx) const { auto* out_grad = @@ -134,12 +139,12 @@ class ConcatGradXPUKernel : public framework::OpKernel { axis = ComputeAxis(static_cast(axis), static_cast(ins[0]->dims().size())); // get output tensor that the name is not kEmptyVarName - 
std::vector ptrs(outs.size()); + std::vector ptrs(outs.size()); for (size_t j = 0; j < outs.size(); ++j) { if (out_var_names[j] != framework::kEmptyVarName && outs[j]->numel() != 0UL) { outs[j]->mutable_data(ctx.GetPlace()); - ptrs[j] = outs[j]->data(); + ptrs[j] = reinterpret_cast(outs[j]->data()); } else { ptrs[j] = nullptr; } @@ -173,8 +178,10 @@ class ConcatGradXPUKernel : public framework::OpKernel { xdims_list[axis] = total_length; auto& dev_ctx = ctx.template device_context(); - int r = xpu::split(dev_ctx.x_context(), out_grad->data(), ptrs, - xdims_list, split_list, axis); + int r = xpu::split( + dev_ctx.x_context(), + reinterpret_cast(out_grad->data()), ptrs, xdims_list, + split_list, axis); PADDLE_ENFORCE_EQ( r, XPU_SUCCESS, platform::errors::External( @@ -189,9 +196,13 @@ class ConcatGradXPUKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( - concat, ops::ConcatXPUKernel); + concat, ops::ConcatXPUKernel, + ops::ConcatXPUKernel); REGISTER_OP_XPU_KERNEL( concat_grad, - ops::ConcatGradXPUKernel); + ops::ConcatGradXPUKernel, + ops::ConcatGradXPUKernel); #endif diff --git a/paddle/fluid/operators/rnn_op_xpu.cc b/paddle/fluid/operators/rnn_op_xpu.cc index c75c24ab0abc2..a18d0ebfca946 100644 --- a/paddle/fluid/operators/rnn_op_xpu.cc +++ b/paddle/fluid/operators/rnn_op_xpu.cc @@ -51,41 +51,6 @@ void reset_parameter_vector(const std::vector& raw_params_vec, } } -template -void RunLSTMLayer(const framework::ExecutionContext& ctx, int seq_len, - int batch_size, int xdim, int hidden_size, const T* x, T* y, - const T* init_h, const T* init_c, T* last_h, T* last_c, - int state_offset, const std::vector& seq_len_tensor, - const std::vector& param_list, T* i_f_g_o, T* c, - bool is_bidirect, int layer_idx, int offset) { - bool is_reverse = false; - if (is_bidirect) { - layer_idx = 2 * layer_idx + offset; - if (offset > 0) { - is_reverse = true; - } - } - auto w_x = param_list[0 + offset * 4]; - auto w_h = param_list[1 + offset * 4]; - auto b_x = param_list[2 + offset * 4]; - auto b_h = param_list[3 + offset * 4]; - - auto h_0 = init_h + layer_idx * state_offset; - auto c_0 = init_c + layer_idx * state_offset; - auto last_h_ptr = last_h + layer_idx * state_offset; - auto last_c_ptr = last_c + layer_idx * state_offset; - auto& dev_ctx = ctx.template device_context(); - int r = xpu::lstm_train( - dev_ctx.x_context(), (const T*)x, (const T*)h_0, (const T*)c_0, - (const T*)w_x, (const T*)w_h, (const T*)b_x, (const T*)b_h, - reinterpret_cast(y), reinterpret_cast(last_h_ptr), - reinterpret_cast(last_c_ptr), batch_size, xdim, hidden_size, seq_len, - seq_len_tensor, is_reverse, nullptr, nullptr, nullptr, nullptr, - reinterpret_cast(i_f_g_o), reinterpret_cast(c), - xpu::Activation_t::TANH, xpu::Activation_t::SIGMOID); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "lstm_train"); -} - template class RnnXPUKernel : public framework::OpKernel { public: @@ -184,9 +149,9 @@ class RnnXPUKernel : public framework::OpKernel { auto y = output->data(); auto last_h_ptr = last_h->data(); auto last_c_ptr = last_c->data(); - auto i_f_g_o = reserve_data->data(); - auto c = - i_f_g_o + + auto i_f_g_o_ptr = reserve_data->data(); + auto c_ptr = + i_f_g_o_ptr + num_layers * direction_num * seq_len * batch_size * hidden_size * 4; std::vector seq_len_tensor(batch_size, seq_len); @@ -197,11 +162,12 @@ class RnnXPUKernel : public framework::OpKernel { int state_offset = pre_state[0]->dims()[1] * pre_state[0]->dims()[2]; for (int i = 0; i < num_layers; i++) { + auto i_f_g_o = i_f_g_o_ptr + + i * 
direction_num * seq_len * batch_size * hidden_size * 4; + auto c = c_ptr + i * direction_num * seq_len * batch_size * hidden_size; + const T* cur_input_ptr = nullptr; int cur_xdim = -1; - i_f_g_o += i * direction_num * seq_len * batch_size * hidden_size * 4; - c += i * direction_num * seq_len * batch_size * hidden_size; - if (i == 0) { cur_input_ptr = x; cur_xdim = input_dim; @@ -222,41 +188,44 @@ class RnnXPUKernel : public framework::OpKernel { cur_output_ptr = internal_output_1_ptr; } + auto h_0 = init_h_ptr + direction_num * i * state_offset; + auto c_0 = init_c_ptr + direction_num * i * state_offset; + auto last_h = last_h_ptr + direction_num * i * state_offset; + auto last_c = last_c_ptr + direction_num * i * state_offset; + + auto w_x = parameter_lists[i][0]; + auto w_h = parameter_lists[i][1]; + auto b_x = parameter_lists[i][2]; + auto b_h = parameter_lists[i][3]; if (is_bidirec) { - std::vector output_vec(2); - std::vector output_ptr_vec(2); - for (int k = 0; k < 2; ++k) { - output_vec[k].Resize({seq_len, batch_size, output->dims()[2] / 2}); - output_ptr_vec[k] = output_vec[k].mutable_data(ctx.GetPlace()); - } - RunLSTMLayer( - ctx, seq_len, batch_size, cur_xdim, hidden_size, cur_input_ptr, - output_ptr_vec[0], init_h_ptr, init_c_ptr, last_h_ptr, last_c_ptr, - state_offset, seq_len_tensor, parameter_lists[i], i_f_g_o, c, - is_bidirec, i, 0); - - T* bw_i_f_g_o = i_f_g_o + seq_len * batch_size * hidden_size * 4; - T* bw_c = c + seq_len * batch_size * hidden_size; - RunLSTMLayer( - ctx, seq_len, batch_size, cur_xdim, hidden_size, cur_input_ptr, - output_ptr_vec[1], init_h_ptr, init_c_ptr, last_h_ptr, last_c_ptr, - state_offset, seq_len_tensor, parameter_lists[i], bw_i_f_g_o, bw_c, - is_bidirec, i, 1); - - // concat - int r = xpu::concat( - dev_ctx.x_context(), {output_ptr_vec[0], output_ptr_vec[1]}, - cur_output_ptr, {{seq_len, batch_size, hidden_size}, - {seq_len, batch_size, hidden_size}}, - 2); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "concat"); - xpu_wait(dev_ctx.x_context()->xpu_stream); + auto bw_x = parameter_lists[i][4]; + auto bw_h = parameter_lists[i][5]; + auto bb_x = parameter_lists[i][6]; + auto bb_h = parameter_lists[i][7]; + + int r = xpu::bilstm_train( + dev_ctx.x_context(), (const T*)cur_input_ptr, (const T*)h_0, + (const T*)c_0, (const T*)w_x, (const T*)w_h, (const T*)b_x, + (const T*)b_h, (const T*)bw_x, (const T*)bw_h, (const T*)bb_x, + (const T*)bb_h, reinterpret_cast(cur_output_ptr), + reinterpret_cast(last_h), reinterpret_cast(last_c), + batch_size, cur_xdim, hidden_size, seq_len, seq_len_tensor, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, + reinterpret_cast(i_f_g_o), reinterpret_cast(c)); + + PADDLE_ENFORCE_XDNN_SUCCESS(r, "bilstm_train"); } else { - RunLSTMLayer( - ctx, seq_len, batch_size, cur_xdim, hidden_size, cur_input_ptr, - cur_output_ptr, init_h_ptr, init_c_ptr, last_h_ptr, last_c_ptr, - state_offset, seq_len_tensor, parameter_lists[i], i_f_g_o, c, - is_bidirec, i, 0); + int r = xpu::lstm_train( + dev_ctx.x_context(), (const T*)cur_input_ptr, (const T*)h_0, + (const T*)c_0, (const T*)w_x, (const T*)w_h, (const T*)b_x, + (const T*)b_h, reinterpret_cast(cur_output_ptr), + reinterpret_cast(last_h), reinterpret_cast(last_c), + batch_size, cur_xdim, hidden_size, seq_len, seq_len_tensor, nullptr, + nullptr, nullptr, nullptr, reinterpret_cast(i_f_g_o), + reinterpret_cast(c), xpu::Activation_t::TANH, + xpu::Activation_t::SIGMOID); + + PADDLE_ENFORCE_XDNN_SUCCESS(r, "lstm_train"); } } } diff --git a/paddle/fluid/operators/sign_op_xpu.cc 
b/paddle/fluid/operators/sign_op_xpu.cc index 22934cf482159..a00aa4bb7ce51 100644 --- a/paddle/fluid/operators/sign_op_xpu.cc +++ b/paddle/fluid/operators/sign_op_xpu.cc @@ -14,6 +14,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { namespace operators { @@ -26,10 +27,9 @@ class SignXPUKernel : public framework::OpKernel { auto* in = context.Input("X"); out->mutable_data(in->place()); auto xpu_context = context.device_context().x_context(); - int r = xpu::activation_forward(xpu_context, xpu::Activation_t::SIGN, - in->numel(), in->data(), out->data()); - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Fatal("XPU sign kernel error!")); + // int sign(Context* ctx, const T* x , T* y, int len); + int r = xpu::sign(xpu_context, in->data(), out->data(), in->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "sign"); } }; diff --git a/paddle/fluid/operators/stack_op.cc b/paddle/fluid/operators/stack_op.cc index a9fa78c4e4943..6fc80ca379f3f 100644 --- a/paddle/fluid/operators/stack_op.cc +++ b/paddle/fluid/operators/stack_op.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" #include "paddle/phi/infermeta/multiary.h" namespace plat = paddle::platform; @@ -68,44 +69,6 @@ Stack all of the Inputs(X) into one tensor along Attr(axis). The dims of all Inp class StackOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput(framework::GradVarName("Y")), true, - platform::errors::InvalidArgument("Input(Y@Grad) not exist.")); - - int axis = ctx->Attrs().Get("axis"); - auto dy_dim = ctx->GetInputDim(framework::GradVarName("Y")); - int rank = dy_dim.size(); - PADDLE_ENFORCE_GE( - axis, -rank, - platform::errors::InvalidArgument( - "Attr(axis) must be inside [-rank, rank), where rank = %d, " - "but received axis is:%d.", - rank, axis)); - PADDLE_ENFORCE_LT( - axis, rank, - platform::errors::InvalidArgument( - "Attr(axis) must be inside [-rank, rank), where rank = %d, " - "but received axis is:%d.", - rank, axis)); - - if (axis < 0) axis += rank; - PADDLE_ENFORCE_EQ( - ctx->Outputs(framework::GradVarName("X")).size(), - static_cast(dy_dim[axis]), - platform::errors::InvalidArgument( - "Number of Outputs(X@Grad) is equal to dy dim at axis, but" - " received outputs size is:%d, dy dims is:%d.", - ctx->Outputs(framework::GradVarName("X")).size(), - static_cast(dy_dim[axis]))); - - auto vec = phi::vectorize(dy_dim); - vec.erase(vec.begin() + axis); - ctx->SetOutputsDim( - framework::GradVarName("X"), - std::vector(dy_dim[axis], phi::make_ddim(vec))); - } }; template @@ -127,8 +90,10 @@ class StackGradOpMaker : public framework::SingleGradOpMaker { DECLARE_INFER_SHAPE_FUNCTOR(stack, StackInferMetaFunctor, PD_INFER_META(phi::StackInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(stack_grad, StackGradInferMetaFunctor, + PD_INFER_META(phi::StackGradInferMeta)); REGISTER_OPERATOR(stack, ops::StackOp, ops::StackOpMaker, ops::StackGradOpMaker, ops::StackGradOpMaker, StackInferMetaFunctor); -REGISTER_OPERATOR(stack_grad, ops::StackOpGrad); +REGISTER_OPERATOR(stack_grad, ops::StackOpGrad, StackGradInferMetaFunctor); 
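Note (not part of the patch): the stack_grad change above, which drops the hand-written InferShape in favor of StackGradInferMetaFunctor, depends on the HasOutputs(name, allow_null) overload introduced earlier in this diff (infershape_utils.cc, op_desc.cc, operator.cc, new_executor_defs.cc, imperative/infer_shape_context.h). Every override implements the same predicate: with allow_null == false all bound output slots must be non-null, with allow_null == true a single non-null slot is enough, so grad ops with optional (skipped) outputs still pass shape inference. A minimal standalone sketch of that shared predicate follows; the helper name and vector-of-pointers signature are illustrative only.

    #include <vector>

    // allow_null == false: every bound output must exist.
    // allow_null == true : at least one bound output must exist,
    //                      so optional (skipped) grad outputs do not fail the check.
    template <typename VarPtr>
    bool HasOutputsImpl(const std::vector<VarPtr>& outputs, bool allow_null) {
      if (outputs.empty()) return false;
      if (allow_null) {
        for (const auto& out : outputs) {
          if (out != nullptr) return true;
        }
        return false;
      }
      for (const auto& out : outputs) {
        if (out == nullptr) return false;
      }
      return true;
    }
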
diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index af29aac6b9052..90cf4128aae94 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -134,7 +134,7 @@ inline void StridedMemcpyWithAxis0( for (size_t i = 0; i < outputs->size(); ++i) { auto out_stride = stride_numel(shape_refer[i]->dims()); auto out = outputs->at(i); - if (out != nullptr) { + if (out != nullptr && out->initialized()) { StridedNumelCopyWithAxis(dev_ctx, axis, out->data(), out_stride, input.data() + input_offset, in_stride, out_stride[axis]); diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 46059100b3802..f29546c5210d9 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -192,13 +192,13 @@ add_subdirectory(profiler) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) if(WITH_GPU) - nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda new_profiler) + nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda new_profiler stats) nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) elseif(WITH_ROCM) - hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce new_profiler) + hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce new_profiler stats) hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) else() - cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce new_profiler) + cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce new_profiler stats) cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place) endif() diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 08a7f08006957..9915b4d8d34f8 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -43,6 +43,9 @@ XPUOpMap& get_kl2_ops() { {"batch_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"bce_loss_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"bce_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"bilinear_interp_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"bilinear_interp_v2_grad", @@ -53,8 +56,10 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, {"clip", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"concat_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"concat", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"concat_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"concat", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), @@ -285,8 +290,10 @@ XPUOpMap& get_kl2_ops() { {"reduce_sum_grad", 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"reshape2_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), @@ -316,6 +323,7 @@ XPUOpMap& get_kl2_ops() { {"sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"sigmoid_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 021899c5f3782..6dbed97a55f40 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -473,7 +473,7 @@ static PyObject* tensor__share_buffer_to(TensorObject* self, PyObject* args, } auto dst_tensor = static_cast(dst_ptr->impl().get()); - dst_tensor->ShareDataWith(*src_tensor); + dst_tensor->ShareBufferWith(*src_tensor); dst_tensor->ShareDataTypeWith(*src_tensor); Py_INCREF(Py_None); return Py_None; @@ -921,7 +921,7 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, "please check the type of tensor.")); } - if (value_tensor_tmp.place() == paddle::PlaceType::kUNK) { + if (!value_tensor_tmp.initialized()) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) SetTensorFromPyArray( static_cast(value_tensor_tmp.impl().get()), @@ -1009,7 +1009,7 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, VLOG(4) << "index is not tensor"; self_numpy[_index] = py::object(py::handle(value_obj), true); } - if (self->tensor.place() == paddle::PlaceType::kUNK) { + if (!self->tensor.initialized()) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) SetTensorFromPyArray(self_tensor, self_numpy, platform::Place(platform::CUDAPlace(0)), false); diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 724b1ba556d4b..d43e327393f25 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -23,7 +23,7 @@ add_subdirectory(tools) add_subdirectory(tests) # make an unity target for compile deps -set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor string_tensor) +set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor string_tensor api_scalar) get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) set(PHI_DEPS ${PHI_DEPS} ${phi_kernels}) diff --git a/paddle/phi/api/all.h b/paddle/phi/api/all.h index ac8607597a436..5838e7b2eaab7 100644 --- a/paddle/phi/api/all.h +++ b/paddle/phi/api/all.h @@ -41,5 +41,4 @@ limitations under the License. 
*/ #include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/api/ext/exception.h" #include "paddle/phi/api/ext/op_meta_info.h" -#include "paddle/phi/api/ext/place.h" #include "paddle/phi/api/ext/tensor_compat.h" diff --git a/paddle/phi/api/ext/tensor_compat.h b/paddle/phi/api/ext/tensor_compat.h index 530275de50ec7..e63390db06e82 100644 --- a/paddle/phi/api/ext/tensor_compat.h +++ b/paddle/phi/api/ext/tensor_compat.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "paddle/phi/api/include/api.h" #include "paddle/phi/api/include/tensor.h" // Note(chenweihang): In order to be compatible with the original custom @@ -21,5 +22,8 @@ limitations under the License. */ // cannot be includeed in paddle namespace paddle { -using Tensor = paddle::experimental::Tensor; +using Tensor = experimental::Tensor; +// using several Tensor initialize functions in paddle namespace +using experimental::empty; +using experimental::full; } // namespace paddle diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index ad3933e2b2b53..d3efb7ca1c21e 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -29,7 +29,6 @@ using gpuStream_t = cudaStream_t; using gpuStream_t = hipStream_t; #endif -#include "paddle/phi/api/ext/place.h" #include "paddle/phi/api/include/dll_decl.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" @@ -109,21 +108,23 @@ class PADDLE_API Tensor final { /** * @brief Construct a new Tensor object on the target place. - * This is a deprecated method and may be removed in the future! + * + * This is a deprecated method and may be removed in the future!!! * * @param place */ - explicit Tensor(const PlaceType& place); + explicit Tensor(const Place& place); /** * @brief Construct a new Tensor object on the target place * with specified shape. - * This is a deprecated method and may be removed in the future! + * + * This is a deprecated method and may be removed in the future!!! * * @param place * @param shape */ - Tensor(const PlaceType& place, const std::vector& shape); + Tensor(const Place& place, const std::vector& shape); /** * @brief Construct a new Tensor object by a TensorBase pointer and name @@ -135,8 +136,9 @@ class PADDLE_API Tensor final { /** * @brief Construct a new Tensor object with name * - * @note Used to adapt original execution mechanism and debug analysis - * in the development of new dygraph. It may be removed in the future. + * @note Internal method, used to adapt original execution mechanism and + * debug analysis in the development of new dygraph. It may be removed in + * the future. * */ explicit Tensor(const std::string& name) : name_(name) {} @@ -151,6 +153,7 @@ class PADDLE_API Tensor final { /** * @brief Get the size of current tensor. + * * The compatible method of `Tensor::numel()`. * This is a deprecated method and may be removed in the future! * @@ -167,6 +170,7 @@ class PADDLE_API Tensor final { /** * @brief Return the shape (dimensions) of Tensor. + * * The compatible method of `Tensor::dims()`. * This is a deprecated method and may be removed in the future! * @@ -178,7 +182,7 @@ class PADDLE_API Tensor final { * @brief Reset the shape of the tensor. 
* @note: This method means Reset the shape of the tensor, * and must be called before calling mutable_data() or - * copy_to(const PlaceType& place), this is not a standard definition of + * copy_to(const Place& place), this is not a standard definition of * reshape behavior, so we will deprecated this feature in the future. * * @param shape @@ -194,6 +198,7 @@ class PADDLE_API Tensor final { /** * @brief Return the data type of Tensor. + * * The compatible method of `Tensor::dtype()`. * This is a deprecated method and may be removed in the future! * @@ -246,18 +251,18 @@ class PADDLE_API Tensor final { * @brief Return the place (device) of Tensor. * This is a deprecated method and may be removed in the future! * - * @return PlaceType + * @return Place */ - PlaceType place() const; + Place place() const; /** * @brief Return the place (device) of Tensor. - * Because the `place` method already exists, so we need to use a new name, - * here we temporarily use `inner_place`. * - * @return paddle::platform::Place + * This is a deprecated method and may be removed in the future!!! + * + * @return Place */ - phi::Place inner_place() const; + Place inner_place() const; /** * @brief Determine whether the tensor device is CPU @@ -287,7 +292,7 @@ class PADDLE_API Tensor final { /** * @brief Get the memory pointer in CPU or GPU with specific data type. - * It's usually used to get the output data pointer. + * It's usually used to get the output data pointer, same as the T* data(). * * @tparam T * @return T* @@ -297,6 +302,7 @@ class PADDLE_API Tensor final { /** * @brief Get the memory pointer in CPU or GPU with specific data type. + * * It's usually used to get the output data pointer. * This is a deprecated method and may be removed in the future! * @@ -305,7 +311,7 @@ class PADDLE_API Tensor final { * @return T* */ template - T* mutable_data(const PlaceType& place); + T* mutable_data(const Place& place); /** * @brief Get the const memory pointer directly. @@ -319,8 +325,7 @@ class PADDLE_API Tensor final { /** * @brief Get the memory pointer directly. - * It's usually used to get the output data pointer. - * This is a deprecated method and may be removed in the future! + * It's usually used to get the mutable output data pointer. * * @tparam T * @return T* @@ -409,7 +414,7 @@ class PADDLE_API Tensor final { * @return Tensor */ template - Tensor copy_to(const PlaceType& target_place) const; + Tensor copy_to(const Place& target_place) const; /** * @brief Transfer the current Tensor to the specified device and return. @@ -427,7 +432,8 @@ class PADDLE_API Tensor final { * @param blocking, Should we copy this in sync way. 
* @return void */ - void copy_(const Tensor& src, const phi::Place& target_place, bool blocking); + void copy_(const Tensor& src, const Place& target_place, bool blocking); + /** * @brief Cast datatype from one to another * @@ -489,11 +495,17 @@ class PADDLE_API Tensor final { /* Part 8: Autograd methods */ /** - * @brief Get the autograd meta object + * @brief Get the autograd meta object pointer * * @return AbstractAutogradMeta* */ AbstractAutogradMeta* get_autograd_meta() const; + + /** + * @brief Get the shared pointer of autograd meta object + * + * @return std::shared_ptr& + */ const std::shared_ptr& mutable_autograd_meta() const; /** @@ -524,7 +536,7 @@ class PADDLE_API Tensor final { /* Part 10: Auto generated Tensor methods */ - /* Part 11: Methods of converting SparseTensor and DenseTensor to each other + /* Part 11: Methods of converting underlying TensorType to each other */ /** * @brief Convert DenseTensor or SparseCsrTensor to SparseCooTensor @@ -587,12 +599,6 @@ class PADDLE_API Tensor final { * in the development of new dygraph. It may be removed in the future. */ std::string name_{""}; - - /** - * Place type: Return the expected memory location if the Tensor is - * uninitialized. - */ - PlaceType place_{PlaceType::kUNK}; }; } // namespace experimental diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 7dfe7d8cf4d20..e10ae8254a79e 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -1,13 +1,11 @@ add_subdirectory(utils) -cc_library(ext_compat_utils SRCS ext_compat_utils.cc DEPS place) - if (WITH_GPU) - nv_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils ext_compat_utils phi_enforce) + nv_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils phi_enforce) elseif (WITH_ROCM) - hip_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils ext_compat_utils phi_enforce) + hip_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils phi_enforce) else() - cc_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils ext_compat_utils phi_enforce) + cc_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils phi_enforce) endif() set(api_gen_base ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_base.py) @@ -166,7 +164,7 @@ cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_conte cc_library(api_gen_utils SRCS api_gen_utils.cc DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor) cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform) cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform) -cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform) +cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform tensor_copy) cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform api_custom_impl) cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform phi_function_api api_custom_impl global_utils) @@ -175,3 +173,5 @@ cc_library(sparse_bw_api 
SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw p cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform phi_function_api sparse_api) cc_library(strings_api SRCS ${strings_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils) cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta sparse_api strings_api) +cc_library(tensor_copy SRCS tensor_copy.cc DEPS phi_tensor_raw copy_kernel kernel_dispatch api_gen_utils) +cc_library(api_scalar SRCS scalar.cc DEPS tensor_copy) diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 0f1cbc3f1910e..81e7faeb87015 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" +#include "paddle/phi/api/lib/tensor_copy.h" #include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/compat/convert_utils.h" @@ -33,6 +34,187 @@ limitations under the License. */ namespace paddle { namespace experimental { +std::tuple adam_impl( + const Tensor& param, + const Tensor& grad, + const Tensor& learning_rate, + const Tensor& moment1, + const Tensor& moment2, + const Tensor& beta1_pow, + const Tensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow) { + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; + if (kernel_backend == Backend::UNDEFINED || + kernel_layout == DataLayout::UNDEFINED || + kernel_data_type == DataType::UNDEFINED) { + auto kernel_key_set = ParseKernelKeyByInputArgs(param); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + if (kernel_backend == Backend::UNDEFINED) { + kernel_backend = kernel_key.backend(); + } + if (kernel_layout == DataLayout::UNDEFINED) { + kernel_layout = kernel_key.layout(); + } + if (kernel_data_type == DataType::UNDEFINED) { + kernel_data_type = kernel_key.dtype(); + } + } + std::string kernel_name = "adam"; + const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + kernel_name, {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << kernel_name << " API kernel key: [" << kernel_backend << ", " + << kernel_layout << ", " << kernel_data_type << "]"; + VLOG(6) << kernel_name << " API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); + + auto input_param = PrepareData(param, kernel.InputAt(0), {}); + auto input_grad = PrepareData(grad, kernel.InputAt(1), {}); + auto input_lr = PrepareData(learning_rate, kernel.InputAt(2), {}); + auto input_moment1 = PrepareData(moment1, kernel.InputAt(3), {}); + auto input_moment2 = PrepareData(moment2, kernel.InputAt(4), {}); + auto input_beta1_pow = PrepareData(beta1_pow, kernel.InputAt(5), {}); + auto input_beta2_pow = PrepareData(beta2_pow, kernel.InputAt(6), {}); + paddle::optional input_master_param(paddle::none); + auto input_master_param_ptr = + PrepareData(master_param, kernel.InputAt(7), {}); + paddle::optional 
input_skip_update(paddle::none); + auto input_skip_update_ptr = PrepareData(skip_update, kernel.InputAt(8), {}); + + std::tuple api_output; + auto kernel_out_0 = input_param.get(); + auto kernel_out_1 = input_moment1.get(); + auto kernel_out_2 = input_moment2.get(); + auto kernel_out_3 = input_beta1_pow.get(); + auto kernel_out_4 = input_beta2_pow.get(); + phi::DenseTensor* kernel_out_5 = nullptr; + if (input_master_param_ptr) { + input_master_param = + paddle::make_optional(*input_master_param_ptr); + kernel_out_5 = + paddle::make_optional(*input_master_param_ptr) + .get_ptr(); + } + + if (input_skip_update_ptr) { + input_skip_update = + paddle::make_optional(*input_skip_update_ptr); + } + + paddle::optional input_meta_ref_master_param( + paddle::none); + phi::DenseTensor dt; + phi::MetaTensor input_meta_tmp_master_param(dt); + if (input_master_param_ptr) { + input_meta_tmp_master_param.set_dtype(input_master_param_ptr->dtype()); + input_meta_tmp_master_param.set_dims(input_master_param_ptr->dims()); + input_meta_tmp_master_param.set_layout(input_master_param_ptr->layout()); + input_meta_ref_master_param = input_meta_tmp_master_param; + } + + paddle::optional input_meta_ref_skip_update( + paddle::none); + phi::DenseTensor dt1; + phi::MetaTensor input_meta_tmp_skip_update(dt1); + if (input_skip_update_ptr) { + input_meta_tmp_skip_update.set_dtype(input_skip_update_ptr->dtype()); + input_meta_tmp_skip_update.set_dims(input_skip_update_ptr->dims()); + input_meta_tmp_skip_update.set_layout(input_skip_update_ptr->layout()); + input_meta_ref_skip_update = input_meta_tmp_skip_update; + } + + phi::MetaTensor meta_out_0(kernel_out_0); + phi::MetaTensor meta_out_1(kernel_out_1); + phi::MetaTensor meta_out_2(kernel_out_2); + phi::MetaTensor meta_out_3(kernel_out_3); + phi::MetaTensor meta_out_4(kernel_out_4); + phi::MetaTensor meta_out_5(kernel_out_5); + + phi::AdamInferMeta(MakeMetaTensor(*input_param), + MakeMetaTensor(*input_grad), + MakeMetaTensor(*input_lr), + MakeMetaTensor(*input_moment1), + MakeMetaTensor(*input_moment2), + MakeMetaTensor(*input_beta1_pow), + MakeMetaTensor(*input_beta2_pow), + input_meta_ref_master_param, + input_meta_ref_skip_update, + beta1, + beta2, + epsilon, + lazy_mode, + min_row_size_to_use_multithread, + multi_precision, + use_global_beta_pow, + &meta_out_0, + &meta_out_1, + &meta_out_2, + &meta_out_3, + &meta_out_4, + &meta_out_5); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + paddle::optional, + paddle::optional, + const Scalar&, + const Scalar&, + const Scalar&, + bool, + int64_t, + bool, + bool, + phi::DenseTensor*, + phi::DenseTensor*, + phi::DenseTensor*, + phi::DenseTensor*, + phi::DenseTensor*, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + + (*kernel_fn)(*dev_ctx, + *input_param, + *input_grad, + *input_lr, + *input_moment1, + *input_moment2, + *input_beta1_pow, + *input_beta2_pow, + input_master_param, + input_skip_update, + beta1, + beta2, + epsilon, + lazy_mode, + min_row_size_to_use_multithread, + multi_precision, + use_global_beta_pow, + kernel_out_0, + kernel_out_1, + kernel_out_2, + kernel_out_3, + kernel_out_4, + kernel_out_5); + + return api_output; +} + ////////////////// Forward api impls ////////////////////// Tensor conv2d_impl(const Tensor& input, @@ -243,35 +425,8 @@ std::vector> conv2d_grad_impl( } 
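Note: a minimal caller-side sketch (not part of the patch) of the adam_impl added earlier in this hunk. The helper function name and hyperparameter values are illustrative, and the six-Tensor tuple layout is assumed to follow the kernel output order (param_out, moment1_out, moment2_out, beta1_pow_out, beta2_pow_out, master_param_out).

    #include <tuple>
    #include "paddle/phi/api/lib/api_custom_impl.h"

    namespace exp = paddle::experimental;

    // Hypothetical helper, only to show parameter order and tuple unpacking.
    void AdamImplUsageSketch(const exp::Tensor& param, const exp::Tensor& grad,
                             const exp::Tensor& lr, const exp::Tensor& m1,
                             const exp::Tensor& m2, const exp::Tensor& beta1_pow,
                             const exp::Tensor& beta2_pow) {
      auto outs = exp::adam_impl(param, grad, lr, m1, m2, beta1_pow, beta2_pow,
                                 paddle::none,         // master_param (optional)
                                 paddle::none,         // skip_update (optional)
                                 exp::Scalar(0.9f),    // beta1
                                 exp::Scalar(0.999f),  // beta2
                                 exp::Scalar(1e-8f),   // epsilon
                                 /*lazy_mode=*/false,
                                 /*min_row_size_to_use_multithread=*/1000,
                                 /*multi_precision=*/false,
                                 /*use_global_beta_pow=*/false);
      exp::Tensor param_out = std::get<0>(outs);  // assumed first tuple element
      (void)param_out;
    }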
Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) { - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - kernel_key_set.backend_set = - kernel_key_set.backend_set | BackendSet(phi::TransToPhiBackend(place)); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); - auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( - "copy", kernel_key); - - VLOG(6) << "copy API kernel key: " << kernel_key; - VLOG(6) << "copy API kernel: " << kernel; - - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - - auto dense_x = TensorToDenseTensor(x); - Tensor out; - auto kernel_out = SetKernelOutput(kernel_key.backend(), &out); - phi::MetaTensor meta_out(kernel_out); - phi::UnchangedInferMeta(*dense_x, &meta_out); - - using kernel_signature = void (*)(const platform::DeviceContext&, - const phi::DenseTensor&, - phi::Place, - bool, - phi::DenseTensor*); - - auto* kernel_fn = kernel.GetVariadicKernelFn(); - - (*kernel_fn)(*dev_ctx, *dense_x, place, blocking, kernel_out); - + copy(x, place, blocking, &out); return out; } diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h index 0d1ba3e98e53e..5d46ed691816b 100644 --- a/paddle/phi/api/lib/api_custom_impl.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -30,6 +30,24 @@ namespace experimental { ////////////////// Forward api impls ////////////////////// +std::tuple adam_impl( + const Tensor& param, + const Tensor& grad, + const Tensor& learning_rate, + const Tensor& moment1, + const Tensor& moment2, + const Tensor& beta1_pow, + const Tensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow); + std::tuple batch_norm_impl( const Tensor& x, const Tensor& scale, diff --git a/paddle/phi/api/lib/ext_compat_utils.cc b/paddle/phi/api/lib/ext_compat_utils.cc deleted file mode 100644 index 1d0f52b5f0b65..0000000000000 --- a/paddle/phi/api/lib/ext_compat_utils.cc +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/phi/api/lib/ext_compat_utils.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" - -namespace paddle { -namespace experimental { - -platform::Place ConvertExtPlaceToInnerPlace(PlaceType p) { - if (p == PlaceType::kCPU) { - return platform::Place(platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - } else if (p == PlaceType::kGPU) { - return platform::Place(platform::CUDAPlace(platform::GetCurrentDeviceId())); -#endif - } else { - PADDLE_THROW( - platform::errors::Unimplemented("Unsupported place type code(%d) when " - "casting enum place to paddle place.", - static_cast(p))); - } - return platform::Place(); -} - -PlaceType ConvertInnerPlaceToExtPlace(const platform::Place& p) { - if (platform::is_cpu_place(p)) { - return PlaceType::kCPU; - } else if (platform::is_gpu_place(p)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return PlaceType::kGPU; -#endif - } else { - PADDLE_THROW( - platform::errors::Unimplemented("Unsupported place type `%s` when " - "casting paddle place to enum place.", - p)); - } - return PlaceType::kUNK; -} - -Backend ConvertExtPlaceToBackend(PlaceType p) { - switch (p) { - case PlaceType::kCPU: - return Backend::CPU; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - case PlaceType::kGPU: - return Backend::GPU; -#endif - default: - PADDLE_THROW( - platform::errors::Unimplemented("Unsupported place type `%s` when " - "casting enum place to backend.", - static_cast(p))); - } -} - -} // namespace experimental -} // namespace paddle diff --git a/paddle/phi/api/lib/scalar.cc b/paddle/phi/api/lib/scalar.cc new file mode 100644 index 0000000000000..981487df86be4 --- /dev/null +++ b/paddle/phi/api/lib/scalar.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/common/scalar.h" + +#include "paddle/phi/api/lib/tensor_copy.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/enforce.h" + +namespace paddle { +namespace experimental { + +template <> +ScalarBase::ScalarBase(const Tensor& tensor_in) + : dtype_(tensor_in.dtype()) { // NOLINT + PADDLE_ENFORCE_EQ(tensor_in.numel(), + 1, + phi::errors::InvalidArgument( + "The Scalar only supports Tensor with 1 element, but " + "now Tensor has `%d` elements", + tensor_in.numel())); + if (tensor_in.place() == PlaceType::kGPU) { + Tensor dst_tensor; + copy(tensor_in, phi::CPUPlace(), true, &dst_tensor); + GetDataFromTensor(dst_tensor); + } else if (tensor_in.place() == PlaceType::kCPU) { + GetDataFromTensor(tensor_in); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Now, it is not supported to construct Scalar using tensor that its " + "PlaceType is (%d)", + static_cast(tensor_in.place()))); + } +} + +} // namespace experimental +} // namespace paddle diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index ffc754feaed98..07204b7ffcf61 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -19,46 +19,41 @@ limitations under the License. */ #include #include "glog/logging.h" -#include "paddle/phi/api/lib/ext_compat_utils.h" + #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/api/lib/utils/storage.h" -#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/selected_rows.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/tensor_utils.h" -/** - * [ Why still include the fluid headers? ] - * - * We hope to organize the basic implementation of Tensor and the logic related - * to Tensor computation into an independent library, which we call - * [Tensor Operation Library, phi], so we extract or rewrite the original - * Kernels. - * - * In the future, the training library, inference library and custom operators - * will link to this Tensor Operation library. - * - * However, if we directly split the link relation, we need to make too many - * changes, which will affect the stability of the framework, so here we still - * rely on the implementation of the framework, which is a intermediate state. - * - * In the future, the necessary components will be moved to the this library, - * or the corresponding components will be re-implemented. 
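Note: a minimal sketch (not part of the patch) of how the new ScalarBase<Tensor> constructor above is meant to be used from the C++ API. The function name is hypothetical; it only assumes the Tensor and Scalar types declared in the headers included here.

    #include "paddle/phi/api/include/tensor.h"
    #include "paddle/phi/common/scalar.h"

    // Hypothetical helper: t must hold exactly one element, otherwise the
    // InvalidArgument check added above fires. A GPU tensor is first copied
    // to CPU by the constructor via the new copy() helper.
    float ScalarFromTensorSketch(const paddle::experimental::Tensor& t) {
      paddle::experimental::Scalar s(t);
      return s.to<float>();  // reads the single stored value
    }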
- */ - -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/platform/place.h" + #include "paddle/fluid/platform/stream/cuda_stream.h" -#include "paddle/phi/common/complex.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/core/enforce.h" namespace paddle { namespace experimental { +namespace detail { +static Place GetCorrectPlaceByPlaceType(const Place &place_type) { + auto alloc_type = place_type.GetType(); + switch (alloc_type) { + case AllocationType::CPU: + return place_type; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + case AllocationType::GPU: + return phi::Place(AllocationType::GPU, + phi::backends::gpu::GetCurrentDeviceId()); +#endif + default: + PADDLE_THROW(phi::errors::Unavailable( + "The PlaceType is a legacy design, only supports CPU and GPU, " + "and will not support other place types in the future.")); + } +} +} // namespace detail /////// Tensor Methods //////// @@ -71,27 +66,41 @@ Tensor::Tensor(std::shared_ptr tensor_impl) phi::errors::InvalidArgument("TensorImpl with nullptr is not supported")); } -Tensor::Tensor(const PlaceType &place) - : impl_(std::move(std::make_shared( - std::move(phi::make_intrusive( - ConvertExtPlaceToInnerPlace(place))), - std::move(phi::DenseTensorMeta(phi::DataType::UNDEFINED, - phi::make_ddim({}), - phi::DataLayout::NCHW))))), - place_{place} {} - -Tensor::Tensor(const PlaceType &place, const std::vector &shape) - : impl_(std::move(std::make_shared( - std::move(phi::make_intrusive( - ConvertExtPlaceToInnerPlace(place))), - std::move(phi::DenseTensorMeta(phi::DataType::UNDEFINED, - phi::make_ddim(shape), - phi::DataLayout::NCHW))))), - place_{place} {} +Tensor::Tensor(const Place &place) { + LOG(WARNING) << "The Tensor(place) constructor is deprecated since version " + "2.3, and will be removed in version 2.4! Please use " + "`paddle::empty/full` method to create a new " + "Tensor instead. " + "Reason: A legal tensor cannot be constructed only based on " + "the `place`, and datatype, shape, layout, etc. is also " + "required."; + DefaultAllocator alloc(detail::GetCorrectPlaceByPlaceType(place)); + impl_ = std::move(std::make_shared( + &alloc, + std::move(phi::DenseTensorMeta( + phi::DataType::FLOAT32, phi::make_ddim({}), phi::DataLayout::NCHW)))); +} + +Tensor::Tensor(const Place &place, const std::vector &shape) { + LOG(WARNING) << "The Tensor(place, shape) constructor is deprecated since " + "version 2.3, and will be removed in version 2.4! Please use " + "`paddle::empty/full` method to create a new " + "Tensor instead. " + "Reason: A legal tensor cannot be constructed only based on " + "the `place` and `shape`, and datatype, layout, etc. 
is also " + "required."; + DefaultAllocator alloc(detail::GetCorrectPlaceByPlaceType(place)); + impl_ = std::move(std::make_shared( + &alloc, + std::move(phi::DenseTensorMeta(phi::DataType::FLOAT32, + phi::make_ddim({shape}), + phi::DataLayout::NCHW)))); +} Tensor::Tensor(std::shared_ptr tensor_impl, const std::string &name) : impl_(std::move(tensor_impl)), name_(std::move(name)) {} + /* Part 2: Dimension, DataType and DataLayout methods */ int64_t Tensor::numel() const { return impl_->numel(); } @@ -112,14 +121,13 @@ void Tensor::reshape(const std::vector &shape) { LOG(WARNING) << "The function of resetting the shape of the uninitialized " "Tensor of the `reshape` method is deprecated since version " "2.3, and will be removed in version 2.4, please use " - "`paddle::experimental::full` method to create a new Tensor " + "`paddle::empty/full` method to create a new Tensor " "instead. " "reason: `reshape` means changing the tensor shape without " "touching underlying data, this requires the total size of " "the tensor to remain constant."; if (is_dense_tensor()) { - std::dynamic_pointer_cast(impl_)->Resize( - phi::make_ddim(shape)); + static_cast(impl_.get())->Resize(phi::make_ddim(shape)); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support reshape operation on DenseTensor now.")); @@ -146,15 +154,16 @@ bool Tensor::is_sparse_csr_tensor() const { } /* Part 3: Device and Backend methods */ -PlaceType Tensor::place() const { - if (!impl_->initialized()) { - return place_; - } else { - return ConvertInnerPlaceToExtPlace(impl_->place()); - } +Place Tensor::place() const { + PADDLE_ENFORCE_NOT_NULL( + impl_, + phi::errors::PermissionDenied( + "Null pointer error, the impl_ of Tensor should not be " + "Null when calling Tensor::place().")); + return impl_->place(); } -paddle::platform::Place Tensor::inner_place() const { +Place Tensor::inner_place() const { PADDLE_ENFORCE_NOT_NULL( impl_, phi::errors::PermissionDenied( @@ -179,9 +188,18 @@ bool Tensor::is_gpu_pinned() const { template T *Tensor::mutable_data() { + LOG(WARNING) << "Allocating memory through `mutable_data` method is " + "deprecated since version 2.3, and `mutable_data` method " + "will be removed in version 2.4! Please use " + "`paddle::empty/full` method to create a new " + "Tensor with allocated memory, and use data() method " + "to get the memory pointer of tensor instead. " + "Reason: When calling `mutable_data` to allocate memory, " + "the place, datatype, and data layout of tensor may be in " + "an illegal state."; if (is_dense_tensor()) { - return std::dynamic_pointer_cast(impl_)->mutable_data( - ConvertExtPlaceToInnerPlace(place())); + return static_cast(impl_.get()) + ->mutable_data(place()); } return nullptr; } @@ -202,51 +220,43 @@ template PADDLE_API phi::dtype::float16 * Tensor::mutable_data(); template -T *Tensor::mutable_data(const PlaceType &place) { - auto inner_place = ConvertExtPlaceToInnerPlace(place); - if (impl_->initialized()) { - PADDLE_ENFORCE_EQ( - platform::is_same_place(inner_place, impl_->place()), - true, - phi::errors::Unimplemented("Modification of tensor place through " - "mutable_data is not supported now")); - } +T *Tensor::mutable_data(const Place &place) { + LOG(WARNING) << "Allocating memory through `mutable_data` method is " + "deprecated since version 2.3, and `mutable_data` method " + "will be removed in version 2.4! 
Please use " + "`paddle::empty/full` method to create a new " + "Tensor with allocated memory, and use data() method " + "to get the memory pointer of tensor instead. " + "Reason: When calling `mutable_data` to allocate memory, " + "the datatype, and data layout of tensor may be in " + "an illegal state."; if (is_dense_tensor()) { - return std::dynamic_pointer_cast(impl_)->mutable_data( - inner_place); + return static_cast(impl_.get())->mutable_data(place); } return nullptr; } -template PADDLE_API float *Tensor::mutable_data(const PlaceType &place); -template PADDLE_API double *Tensor::mutable_data( - const PlaceType &place); -template PADDLE_API int64_t *Tensor::mutable_data( - const PlaceType &place); -template PADDLE_API int32_t *Tensor::mutable_data( - const PlaceType &place); -template PADDLE_API uint8_t *Tensor::mutable_data( - const PlaceType &place); -template PADDLE_API int8_t *Tensor::mutable_data( - const PlaceType &place); -template PADDLE_API int16_t *Tensor::mutable_data( - const PlaceType &place); -template PADDLE_API bool *Tensor::mutable_data(const PlaceType &place); +template PADDLE_API float *Tensor::mutable_data(const Place &place); +template PADDLE_API double *Tensor::mutable_data(const Place &place); +template PADDLE_API int64_t *Tensor::mutable_data(const Place &place); +template PADDLE_API int32_t *Tensor::mutable_data(const Place &place); +template PADDLE_API uint8_t *Tensor::mutable_data(const Place &place); +template PADDLE_API int8_t *Tensor::mutable_data(const Place &place); +template PADDLE_API int16_t *Tensor::mutable_data(const Place &place); +template PADDLE_API bool *Tensor::mutable_data(const Place &place); template PADDLE_API phi::dtype::complex - *Tensor::mutable_data>(const PlaceType &place); + *Tensor::mutable_data>(const Place &place); template PADDLE_API phi::dtype::complex - *Tensor::mutable_data>(const PlaceType &place); + *Tensor::mutable_data>(const Place &place); template PADDLE_API phi::dtype::float16 * -Tensor::mutable_data(const PlaceType &place); +Tensor::mutable_data(const Place &place); template const T *Tensor::data() const { if (is_dense_tensor()) { - return std::dynamic_pointer_cast(impl_)->data(); - } else if (phi::SelectedRows::classof(impl_.get())) { - return std::dynamic_pointer_cast(impl_) - ->value() - .data(); + return static_cast(impl_.get())->data(); + } else if (is_selected_rows()) { + return static_cast(impl_.get())->value().data(); } return nullptr; } @@ -271,9 +281,9 @@ Tensor::data() const; template T *Tensor::data() { if (is_dense_tensor()) { - return std::dynamic_pointer_cast(impl_)->data(); - } else if (phi::SelectedRows::classof(impl_.get())) { - return std::dynamic_pointer_cast(impl_) + return static_cast(impl_.get())->data(); + } else if (is_selected_rows()) { + return static_cast(impl_.get()) ->mutable_value() ->data(); } @@ -299,7 +309,7 @@ Tensor Tensor::slice(int64_t begin_idx, int64_t end_idx) const { if (is_dense_tensor()) { return Tensor(std::make_shared( std::move(phi::DenseTensorUtils::Slice( - *(std::dynamic_pointer_cast(impl_).get()), + *(static_cast(impl_.get())), begin_idx, end_idx)))); } else { @@ -331,6 +341,9 @@ bool Tensor::defined() const { return impl_ != nullptr; } bool Tensor::initialized() const { return defined() && impl_->initialized(); } bool Tensor::is_initialized() const { + LOG(WARNING) << "The `is_initialized` method is deprecated since version " + "2.3, and will be removed in version 2.4! 
" + "Please use `initialized` method instead."; return defined() && impl_->initialized(); } @@ -342,7 +355,6 @@ Tensor &Tensor::operator=(const Tensor &x) & { impl_ = x.impl_; autograd_meta_ = x.autograd_meta_; name_ = x.name_; - place_ = x.place_; return *this; } @@ -350,7 +362,6 @@ Tensor &Tensor::operator=(Tensor &&x) & { impl_ = std::move(x.impl_); autograd_meta_ = std::move(x.autograd_meta_); name_ = std::move(x.name_); - place_ = std::move(x.place_); return *this; } @@ -371,8 +382,7 @@ void Tensor::set_autograd_meta( void Tensor::bump_inplace_version() { if (is_dense_tensor()) { auto &inplace_version_counter = - std::dynamic_pointer_cast(impl_) - ->InplaceVersionCounter(); + static_cast(impl_.get())->InplaceVersionCounter(); inplace_version_counter.Bump(); } else { PADDLE_THROW(phi::errors::Unimplemented( @@ -383,8 +393,7 @@ void Tensor::bump_inplace_version() { uint32_t Tensor::current_inplace_version() { if (is_dense_tensor()) { auto &inplace_version_counter = - std::dynamic_pointer_cast(impl_) - ->InplaceVersionCounter(); + static_cast(impl_.get())->InplaceVersionCounter(); return inplace_version_counter.CurrentVersion(); } else { PADDLE_THROW(phi::errors::Unimplemented( @@ -397,8 +406,7 @@ void Tensor::reset_inplace_version(bool set_to_zero) { if (set_to_zero) { if (is_dense_tensor()) { auto &inplace_version_counter = - std::dynamic_pointer_cast(impl_) - ->InplaceVersionCounter(); + static_cast(impl_.get())->InplaceVersionCounter(); inplace_version_counter.SetInplaceVersionToZero(); } } diff --git a/paddle/phi/api/lib/tensor_copy.cc b/paddle/phi/api/lib/tensor_copy.cc new file mode 100644 index 0000000000000..57e3c28d8cb1f --- /dev/null +++ b/paddle/phi/api/lib/tensor_copy.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/api/lib/tensor_copy.h" +#include "paddle/phi/api/lib/api_gen_utils.h" +#include "paddle/phi/api/lib/kernel_dispatch.h" +#include "paddle/phi/api/lib/utils/storage.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/infermeta/unary.h" + +namespace paddle { +namespace experimental { + +void copy(const Tensor& src, Place place, bool blocking, Tensor* dst) { + auto kernel_key_set = ParseKernelKeyByInputArgs(src); + kernel_key_set.backend_set = + kernel_key_set.backend_set | BackendSet(phi::TransToPhiBackend(place)); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "copy", kernel_key); + + VLOG(6) << "copy API kernel key: " << kernel_key; + VLOG(6) << "copy API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + + auto dense_x = TensorToDenseTensor(src); + + auto kernel_out = SetKernelOutput(kernel_key.backend(), dst); + phi::MetaTensor meta_out(kernel_out); + phi::UnchangedInferMeta(*dense_x, &meta_out); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + phi::Place, + bool, + phi::DenseTensor*); + + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, *dense_x, place, blocking, kernel_out); +} + +} // namespace experimental +} // namespace paddle diff --git a/paddle/phi/api/ext/place.h b/paddle/phi/api/lib/tensor_copy.h similarity index 71% rename from paddle/phi/api/ext/place.h rename to paddle/phi/api/lib/tensor_copy.h index 91d4f41c21351..3ce45853319ec 100644 --- a/paddle/phi/api/ext/place.h +++ b/paddle/phi/api/lib/tensor_copy.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,9 +14,12 @@ limitations under the License. */ #pragma once +#include "paddle/phi/api/include/tensor.h" + namespace paddle { +namespace experimental { -// TODO(yangjiabin): Add other place support in next PR -enum class PlaceType { kUNK = -1, kCPU, kGPU }; +void copy(const Tensor& src, Place place, bool blocking, Tensor* dst); +} // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index c4c77ab93790d..46ca457b2c10a 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/phi/api/include/tensor.h" -#include "paddle/phi/api/lib/ext_compat_utils.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/tensor_base.h" @@ -39,37 +38,37 @@ Tensor Tensor::copy_to(Place place, bool blocking) const { } template -Tensor Tensor::copy_to(const PlaceType &target_place) const { +Tensor Tensor::copy_to(const Place &target_place) const { LOG(WARNING) << "The Tensor's `copy_to` method is deprecated since version " "2.3, and will be removed in version 2.4, please use " "`copy_to` method without template argument instead. 
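Note: a short usage sketch (not part of the patch) for the new paddle::experimental::copy helper declared in tensor_copy.h, which copy_to_impl above now delegates to. The function name is hypothetical.

    #include "paddle/phi/api/include/tensor.h"
    #include "paddle/phi/api/lib/tensor_copy.h"

    // Hypothetical helper: blocking copy of src onto CPU. The kernel key is
    // derived from src plus the target place, then the "copy" kernel runs.
    paddle::experimental::Tensor CopyToCpuSketch(
        const paddle::experimental::Tensor& src) {
      paddle::experimental::Tensor dst;
      paddle::experimental::copy(src, phi::CPUPlace(), /*blocking=*/true, &dst);
      return dst;
    }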
" "reason: copying a Tensor to another device does not need " "to specify the data type template argument."; - return copy_to(ConvertExtPlaceToInnerPlace(target_place), /*blocking=*/false); + return copy_to(target_place, /*blocking=*/false); } template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; +Tensor::copy_to(const Place &target_place) const; template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; +Tensor::copy_to(const Place &target_place) const; template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; +Tensor::copy_to(const Place &target_place) const; template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; +Tensor::copy_to(const Place &target_place) const; template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; +Tensor::copy_to(const Place &target_place) const; template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; +Tensor::copy_to(const Place &target_place) const; template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; +Tensor::copy_to(const Place &target_place) const; template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor Tensor::copy_to>( - const PlaceType &target_place) const; -template PADDLE_API Tensor Tensor::copy_to>( - const PlaceType &target_place) const; +Tensor::copy_to(const Place &target_place) const; template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; +Tensor::copy_to>(const Place &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to>(const Place &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const Place &target_place) const; void Tensor::copy_(const Tensor &src, const phi::Place &target_place, diff --git a/paddle/phi/api/lib/utils/CMakeLists.txt b/paddle/phi/api/lib/utils/CMakeLists.txt index 94a16da2b7720..de97e7516f619 100644 --- a/paddle/phi/api/lib/utils/CMakeLists.txt +++ b/paddle/phi/api/lib/utils/CMakeLists.txt @@ -1,2 +1,2 @@ cc_library(phi_api_utils SRCS storage.cc tensor_utils.cc DEPS -tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits scalar string_tensor) +tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits string_tensor scalar) diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt index 9bf692703860f..aa839eab587cb 100644 --- a/paddle/phi/common/CMakeLists.txt +++ b/paddle/phi/common/CMakeLists.txt @@ -1,2 +1,2 @@ cc_library(phi_place SRCS place.cc) -cc_library(scalar SRCS scalar.cc DEPS phi_enforce) +cc_library(scalar SRCS scalar.cc DEPS phi_enforce tensor) diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc index 2b5254d3d5f14..a77042757c7ba 100644 --- a/paddle/phi/common/place.cc +++ b/paddle/phi/common/place.cc @@ -109,3 +109,16 @@ uint32_t Place::Hash::operator()(const Place &place) const { } } // namespace phi + +namespace paddle { + +phi::Place PlaceType::kUNK = phi::Place(); +phi::Place PlaceType::kCPU = phi::Place(phi::AllocationType::CPU); +// GPU Place contains device id, here we use default value 0, so it cannot +// use for multi-casd cases, but because it is static variable, it is difficult +// to get the exact device id at all time. +// NOTE: Please DO NOT use this place in the framework!!! 
+// It only for external compatibility +phi::Place PlaceType::kGPU = phi::Place(phi::AllocationType::GPU); + +} // namespace paddle diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h index 390684366db71..d43fc497277c5 100644 --- a/paddle/phi/common/place.h +++ b/paddle/phi/common/place.h @@ -213,4 +213,30 @@ using GPUPinnedPlace = phi::GPUPinnedPlace; using XPUPlace = phi::XPUPlace; using NPUPlace = phi::NPUPlace; } // namespace experimental + +/* NOTE: In order to remove and be compatible with the enumeration type +`PlaceType` of custom operator, we define a temporary type. + +This type cannot add any new type!!! It is only used for compatibility with +historical writing and we will remove this temporary type in the future. +This Type cannot be used in framework! only used for custom operator! + +The historical PlaceType define: + +- enum class PlaceType { kUNK = -1, kCPU, kGPU }; + +The historical PlaceType using: + +- PD_CHECK(x.place() == paddle::PlaceType::kCPU) +- auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + +The new type cannot be used as int value! If you use as int, please modify +the implementation. +*/ +struct PADDLE_API PlaceType { + static phi::Place kUNK; + static phi::Place kCPU; + static phi::Place kGPU; +}; + } // namespace paddle diff --git a/paddle/phi/common/scalar.cc b/paddle/phi/common/scalar.cc index 5cd55c1e88bed..41f1c9541823d 100644 --- a/paddle/phi/common/scalar.cc +++ b/paddle/phi/common/scalar.cc @@ -14,21 +14,32 @@ limitations under the License. */ #include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/place.h" #include "paddle/phi/core/enforce.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace experimental { -// NOTE(xiongkun): why we put definition here? -// test_custom_op can't include enforce.h, because enforce.h includes gflags. -// so we decouple the include dependence of enforce.h by link. -void ThrowTensorConvertError(int num) { - PADDLE_ENFORCE_EQ(num, +// The Tensor must have one dim +template <> +ScalarBase::ScalarBase(const phi::DenseTensor& tensor_in) + : dtype_(tensor_in.dtype()) { // NOLINT + PADDLE_ENFORCE_EQ(tensor_in.numel(), 1, phi::errors::InvalidArgument( "The Scalar only supports Tensor with 1 element, but " "now Tensor has `%d` elements", - num)); + tensor_in.numel())); + auto cpu_place = phi::CPUPlace(); + if (!paddle::platform::is_same_place(tensor_in.place(), cpu_place)) { + phi::DenseTensor tensor; + framework::TensorCopySync(tensor_in, cpu_place, &tensor); + GetDataFromTensor(tensor); + } else { + GetDataFromTensor(tensor_in); + } } } // namespace experimental diff --git a/paddle/phi/common/scalar.h b/paddle/phi/common/scalar.h index 5134f4eb72639..c28f6185a556a 100644 --- a/paddle/phi/common/scalar.h +++ b/paddle/phi/common/scalar.h @@ -23,8 +23,6 @@ limitations under the License. 
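Note: a compatibility sketch (not part of the patch) showing how the legacy custom-operator patterns quoted in the NOTE above keep compiling now that PlaceType's members are static phi::Place objects. The function name is hypothetical.

    #include "paddle/phi/api/include/tensor.h"
    #include "paddle/phi/common/place.h"

    // Hypothetical helper mirroring old custom-op code.
    void LegacyPlaceTypeSketch(const paddle::experimental::Tensor& x) {
      // Tensor::place() now returns a phi::Place, and PlaceType::kCPU is a
      // static phi::Place, so the old comparison still works.
      if (x.place() == paddle::PlaceType::kCPU) {
        // The deprecated Tensor(place, shape) constructor also still accepts
        // it; it now only logs the deprecation warning added in tensor.cc.
        paddle::experimental::Tensor out(paddle::PlaceType::kCPU, x.shape());
        (void)out;
      }
    }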
*/ namespace paddle { namespace experimental { -void ThrowTensorConvertError(int); - template class ScalarBase { public: @@ -105,50 +103,7 @@ class ScalarBase { } // The Tensor must have one dim - ScalarBase(const T& tensor) : dtype_(tensor.dtype()) { // NOLINT - is_from_tensor_ = true; - ThrowTensorConvertError(tensor.numel()); - switch (dtype_) { - case DataType::FLOAT32: - data_.f32 = tensor.template data()[0]; - break; - case DataType::FLOAT64: - data_.f64 = tensor.template data()[0]; - break; - case DataType::FLOAT16: - data_.f16 = tensor.template data()[0]; - break; - case DataType::BFLOAT16: - data_.bf16 = tensor.template data()[0]; - break; - case DataType::INT32: - data_.i32 = tensor.template data()[0]; - break; - case DataType::INT64: - data_.i64 = tensor.template data()[0]; - break; - case DataType::INT16: - data_.i16 = tensor.template data()[0]; - break; - case DataType::INT8: - data_.i8 = tensor.template data()[0]; - break; - case DataType::UINT8: - data_.ui8 = tensor.template data()[0]; - break; - case DataType::BOOL: - data_.b = tensor.template data()[0]; - break; - case DataType::COMPLEX64: - data_.c64 = tensor.template data()[0]; - break; - case DataType::COMPLEX128: - data_.c128 = tensor.template data()[0]; - break; - default: - PD_THROW("Invalid tensor data type `", dtype_, "`."); - } - } + ScalarBase(const T& tensor_in); // NOLINT template ScalarBase(const ScalarBase& other) { @@ -200,6 +155,49 @@ class ScalarBase { private: template friend void CopyScalar(const ScalarBase& src, ScalarBase* dst); + void GetDataFromTensor(const T& tensor) { + is_from_tensor_ = true; + switch (dtype_) { + case DataType::FLOAT32: + data_.f32 = tensor.template data()[0]; + break; + case DataType::FLOAT64: + data_.f64 = tensor.template data()[0]; + break; + case DataType::FLOAT16: + data_.f16 = tensor.template data()[0]; + break; + case DataType::BFLOAT16: + data_.bf16 = tensor.template data()[0]; + break; + case DataType::INT32: + data_.i32 = tensor.template data()[0]; + break; + case DataType::INT64: + data_.i64 = tensor.template data()[0]; + break; + case DataType::INT16: + data_.i16 = tensor.template data()[0]; + break; + case DataType::INT8: + data_.i8 = tensor.template data()[0]; + break; + case DataType::UINT8: + data_.ui8 = tensor.template data()[0]; + break; + case DataType::BOOL: + data_.b = tensor.template data()[0]; + break; + case DataType::COMPLEX64: + data_.c64 = tensor.template data()[0]; + break; + case DataType::COMPLEX128: + data_.c128 = tensor.template data()[0]; + break; + default: + PD_THROW("Invalid tensor data type `", dtype_, "`."); + } + } private: bool is_from_tensor_{false}; diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index b42b4388c2ce1..23574e98fbf17 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -23,7 +23,7 @@ cc_library(string_tensor SRCS string_tensor.cc DEPS convert_utils tensor_meta te cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor) cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) -cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy) +cc_library(selected_rows SRCS selected_rows_impl.cc selected_rows.cc DEPS tensor_base dense_tensor phi_enforce ddim memcpy) cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows) cc_library(custom_kernel SRCS custom_kernel.cc DEPS kernel_factory) diff --git a/paddle/phi/core/dense_tensor.cc b/paddle/phi/core/dense_tensor.cc index 
8acdd8b34f7d1..1bfe29bc9d3ba 100644 --- a/paddle/phi/core/dense_tensor.cc +++ b/paddle/phi/core/dense_tensor.cc @@ -19,7 +19,24 @@ limitations under the License. */ #include "paddle/phi/common/float16.h" #include "paddle/phi/core/compat/convert_utils.h" -// See Note [ Why still include the fluid headers? ] +/** + * [ Why still include the fluid headers? ] + * + * We hope to organize the basic implementation of Tensor and the logic related + * to Tensor computation into an independent library, which we call + * [Tensor Operation Library, phi], so we extract or rewrite the original + * Kernels. + * + * In the future, the training library, inference library and custom operators + * will link to this Tensor Operation library. + * + * However, if we directly split the link relation, we need to make too many + * changes, which will affect the stability of the framework, so here we still + * rely on the implementation of the framework, which is a intermediate state. + * + * In the future, the necessary components will be moved to the this library, + * or the corresponding components will be re-implemented. + */ #include "paddle/fluid/memory/malloc.h" namespace phi { diff --git a/paddle/phi/core/dense_tensor.h b/paddle/phi/core/dense_tensor.h index a740a9a5725d9..ef91319e1c961 100644 --- a/paddle/phi/core/dense_tensor.h +++ b/paddle/phi/core/dense_tensor.h @@ -190,6 +190,25 @@ class DenseTensor : public TensorBase, std::shared_ptr inplace_version_counter_{ std::make_shared()}; +/* @jim19930609: This is a hack +In general, it is badly designed to fuse MKLDNN-specific objects into a +generic Tensor. +We temporarily leave them here to unblock Tensor Unification progress. +In the final state, we should come up with a MKLDNN_Tensor and move the +following codes there. +*/ +#ifdef PADDLE_WITH_MKLDNN + /** + * @brief the detail format of memory block which have layout as kMKLDNN + * + * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C, + * nChw16c, etc. For a MKLDNN memory block, layout will be set as + * DataLayout::kMKLDNN meanwhile detail memory format will be kept in + * this field. + */ + dnnl::memory::format_tag format_ = dnnl::memory::format_tag::undef; +#endif + #ifndef PADDLE_WITH_CUSTOM_KERNEL #include "paddle/phi/core/dense_tensor.inl" #endif diff --git a/paddle/phi/core/dense_tensor.inl b/paddle/phi/core/dense_tensor.inl index 804360ea941ba..c6ca3c00cb558 100644 --- a/paddle/phi/core/dense_tensor.inl +++ b/paddle/phi/core/dense_tensor.inl @@ -133,17 +133,6 @@ inline void set_format(const dnnl::memory::format_tag format) { format_ = format; } -protected: -/** - * @brief the detail format of memory block which have layout as kMKLDNN - * - * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C, - * nChw16c, etc. For a MKLDNN memory block, layout will be set as - * DataLayout::kMKLDNN meanwhile detail memory format will be kept in - * this field. - */ - -dnnl::memory::format_tag format_ = dnnl::memory::format_tag::undef; #endif /* ------------------------------ */ diff --git a/paddle/phi/api/lib/ext_compat_utils.h b/paddle/phi/core/selected_rows.cc similarity index 53% rename from paddle/phi/api/lib/ext_compat_utils.h rename to paddle/phi/core/selected_rows.cc index 89f6f15b70ff2..dcf9c4182157a 100644 --- a/paddle/phi/api/lib/ext_compat_utils.h +++ b/paddle/phi/core/selected_rows.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,20 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once +#include "paddle/phi/core/selected_rows.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/phi/api/ext/place.h" -#include "paddle/phi/common/backend.h" +namespace phi { -namespace paddle { -namespace experimental { +SelectedRows::SelectedRows(const std::vector& rows, + const int64_t& height) + : impl_(std::make_shared(rows, height)) {} -platform::Place ConvertExtPlaceToInnerPlace(PlaceType p); +SelectedRows::SelectedRows() + : impl_(std::make_shared()) {} -PlaceType ConvertInnerPlaceToExtPlace(const platform::Place& p); - -Backend ConvertExtPlaceToBackend(PlaceType p); - -} // namespace experimental -} // namespace paddle +} // namespace phi diff --git a/paddle/phi/core/selected_rows.h b/paddle/phi/core/selected_rows.h index 7ee475b4d5d9e..a71c0471cc431 100644 --- a/paddle/phi/core/selected_rows.h +++ b/paddle/phi/core/selected_rows.h @@ -42,10 +42,9 @@ class SelectedRows : public TensorBase, * */ public: - SelectedRows(const std::vector& rows, const int64_t& height) - : impl_(std::make_shared(rows, height)) {} + SelectedRows(const std::vector& rows, const int64_t& height); - SelectedRows() : impl_(std::make_shared()) {} + SelectedRows(); const DenseTensor& value() const { return impl_->value(); } diff --git a/paddle/phi/core/utils/type_registry.h b/paddle/phi/core/utils/type_registry.h index 8d9f9167242c8..f27c3db2275c3 100644 --- a/paddle/phi/core/utils/type_registry.h +++ b/paddle/phi/core/utils/type_registry.h @@ -51,7 +51,7 @@ TypeInfo TypeRegistry::RegisterType(const std::string& type) { std::lock_guard guard(mutex_); assert(name_to_id_.find(type) == name_to_id_.end()); assert(names_.size() < std::numeric_limits::max()); - int8_t id = names_.size(); + int8_t id = static_cast(names_.size()); names_.emplace_back(type); name_to_id_[type] = id; return TypeInfo(id); diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 81d3cb9ddf0f4..84db67978fc23 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -169,6 +169,27 @@ void CrossEntropyWithSoftmaxGradInferMeta(const MetaTensor& label, logits_grad->set_dtype(softmax.dtype()); } +void DeformableConvGradInferMeta(const MetaTensor& x, + const MetaTensor& offset, + const MetaTensor& filter, + paddle::optional mask, + const MetaTensor& out_grad, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + int deformable_groups, + int groups, + int im2col_step, + MetaTensor* dx, + MetaTensor* offset_grad, + MetaTensor* filter_grad, + MetaTensor* mask_grad) { + GeneralTernaryGradInferMeta(x, offset, filter, dx, offset_grad, filter_grad); + if (mask) { + UnchangedInferMeta(mask.get(), mask_grad); + } +} + void GatherNdGradInferMeta(const MetaTensor& x, const MetaTensor& index, const MetaTensor& out_grad, @@ -520,8 +541,10 @@ void StackGradInferMeta(const MetaTensor& out_grad, vec.erase(vec.begin() + axis); for (size_t i = 0; i < x_grad.size(); ++i) { - x_grad[i]->set_dims(phi::make_ddim(vec)); - x_grad[i]->set_dtype(out_grad.dtype()); + if (x_grad[i]) { + x_grad[i]->set_dims(phi::make_ddim(vec)); + x_grad[i]->set_dtype(out_grad.dtype()); + } } } diff --git a/paddle/phi/infermeta/backward.h 
b/paddle/phi/infermeta/backward.h index 058ff7541cd8b..6e730c83d1d50 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -79,6 +79,22 @@ void CrossEntropyWithSoftmaxGradInferMeta(const MetaTensor& label, MetaTensor* logits_grad, MetaConfig config = MetaConfig()); +void DeformableConvGradInferMeta(const MetaTensor& x, + const MetaTensor& offset, + const MetaTensor& filter, + paddle::optional mask, + const MetaTensor& out_grad, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + int deformable_groups, + int groups, + int im2col_step, + MetaTensor* dx, + MetaTensor* offset_grad, + MetaTensor* filter_grad, + MetaTensor* mask_grad); + void GatherNdGradInferMeta(const MetaTensor& x, const MetaTensor& index, const MetaTensor& out_grad, diff --git a/paddle/phi/kernels/funcs/sparse/common_shape.h b/paddle/phi/kernels/funcs/sparse/common_shape.h index 3617e3cd2f406..e4c836d116252 100644 --- a/paddle/phi/kernels/funcs/sparse/common_shape.h +++ b/paddle/phi/kernels/funcs/sparse/common_shape.h @@ -40,6 +40,45 @@ inline const DDim InferDenseDims(const DDim& x_dims, return values_dims; } +template +inline const IntT HOSTDEVICE IndicesToIndex(const IntT* indices, + const IntT* sparse_offsets, + const int64_t non_zero_num, + const int64_t sparse_dim, + const int i) { + IntT index = 0; + for (IntT j = 0; j < sparse_dim; j++) { + index += indices[j * non_zero_num + i] * sparse_offsets[j]; + } + return index; +} + +template +inline void HOSTDEVICE FlattenIndices(const IntT* indices, + const IntT* sparse_offsets, + const int64_t non_zero_num, + const int64_t sparse_dim, + const int start, + const int stride, + IntT* out) { + for (int i = start; i < non_zero_num; i += stride) { + out[i] = + IndicesToIndex(indices, sparse_offsets, non_zero_num, sparse_dim, i); + } +} + +// 1. indices.dims().size() == 2 +template +inline void CalcOffsetsPerDim(const DDim& dims, + const int64_t sparse_dim, + std::vector* offsets) { + IntT offset = 1; + for (IntT i = sparse_dim - 1; i >= 0; i--) { + (*offsets)[i] = offset; + offset *= dims[i]; + } +} + } // namespace sparse } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc index 0a5e145312e0e..a07a7fb2ecf44 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc @@ -19,6 +19,7 @@ limitations under the License. 
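Note: a worked example (not part of the patch) of the row-major offset arithmetic behind CalcOffsetsPerDim and IndicesToIndex above, rewritten with plain integers so the recurrence is easy to check by hand. For dims = {4, 5, 6} the per-dim offsets are {30, 6, 1}, so the COO index (1, 2, 3) flattens to 1*30 + 2*6 + 3*1 = 45.

    #include <cstdint>
    #include <vector>

    // Hypothetical stand-alone version of the index flattening used by the
    // sparse helpers; it mirrors their loops without the DDim/template plumbing.
    int64_t FlattenOneIndexSketch() {
      const std::vector<int64_t> dims = {4, 5, 6};
      std::vector<int64_t> offsets(dims.size());
      int64_t offset = 1;
      for (int64_t i = static_cast<int64_t>(dims.size()) - 1; i >= 0; --i) {
        offsets[i] = offset;  // same recurrence as CalcOffsetsPerDim
        offset *= dims[i];
      }
      const std::vector<int64_t> index = {1, 2, 3};
      int64_t flat = 0;
      for (size_t j = 0; j < index.size(); ++j) {
        flat += index[j] * offsets[j];  // same reduction as IndicesToIndex
      }
      return flat;  // 45
    }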
*/ #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/sparse/common_shape.h" #include "paddle/phi/api/ext/dispatch.h" @@ -38,12 +39,6 @@ void SparseMaskCPUKernel(const CPUContext& dev_ctx, const DenseTensor& indices = mask.non_zero_indices(); const DenseTensor& values = mask.non_zero_elements(); int sparse_dim = indices.dims().size(); - std::vector sparse_offsets(sparse_dim); - int64_t offset = 1; - for (int i = sparse_dim - 1; i >= 0; i--) { - sparse_offsets[i] = offset; - offset *= dims[i]; - } DenseTensor out_indices = phi::EmptyLike(dev_ctx, indices); DenseTensor out_values = phi::EmptyLike(dev_ctx, values); @@ -51,21 +46,25 @@ void SparseMaskCPUKernel(const CPUContext& dev_ctx, // the out_indices is same as indices of mask phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &out_indices); - const IntT* indices_ptr = indices.data(); T* out_values_ptr = out_values.data(); const T* x_ptr = x.data(); const int64_t non_zero_num = mask.nnz(); auto dims_2d = flatten_to_2d(dims, sparse_dim); const int cols = dims_2d[1]; + const IntT* indices_ptr = indices.data(); + + std::vector out_indexs(non_zero_num), sparse_offsets(sparse_dim); + + phi::funcs::sparse::CalcOffsetsPerDim( + dims, sparse_dim, &sparse_offsets); for (int64_t i = 0; i < non_zero_num; i++) { - int64_t index = 0; - for (int j = 0; j < sparse_dim; j++) { - index += indices_ptr[j * non_zero_num + i] * sparse_offsets[j]; - } + int64_t index = phi::funcs::sparse::IndicesToIndex( + indices_ptr, sparse_offsets.data(), non_zero_num, sparse_dim, i); memcpy(out_values_ptr + i * cols, x_ptr + index * cols, cols * sizeof(T)); } + out->SetMember(out_indices, out_values, dims, true); } @@ -85,6 +84,73 @@ void SparseMaskKernel(const Context& dev_ctx, })); } +template +void SparseMaskHelperCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + mask_indices.dims().size(), + 2, + phi::errors::InvalidArgument("the mask_indices must be 2-D tensor")); + + const int64_t sparse_dim = x.non_zero_indices().dims()[0]; + + std::vector sparse_offsets(sparse_dim), x_indexs(x.nnz()), + mask_indexs(mask_indices.dims()[1]); + phi::funcs::sparse::CalcOffsetsPerDim( + x.dims(), sparse_dim, &sparse_offsets); + + phi::funcs::sparse::FlattenIndices(x.non_zero_indices().data(), + sparse_offsets.data(), + x.nnz(), + sparse_dim, + 0, + 1, + x_indexs.data()); + phi::funcs::sparse::FlattenIndices(mask_indices.data(), + sparse_offsets.data(), + x.nnz(), + sparse_dim, + 0, + 1, + mask_indexs.data()); + + std::unordered_map x_indexs_map; + for (uint64_t i = 0; i < x_indexs.size(); i++) { + x_indexs_map[x_indexs[i]] = i; + } + *out = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + T* out_ptr = out->data(); + memset(out_ptr, static_cast(0), out->numel() * sizeof(T)); + const int64_t stride = + x.dims().size() == sparse_dim ? 
1 : x.dims().size() - sparse_dim; + const T* in_ptr = x.non_zero_elements().data(); + // TODO(zhangkaihuo): multithreading can be used for acceleration + for (uint64_t i = 0; i < mask_indexs.size(); i++) { + auto iter = x_indexs_map.find(mask_indexs[i]); + if (iter != x_indexs_map.end()) { + memcpy(out_ptr + i * stride, + in_ptr + iter->second * stride, + stride * sizeof(T)); + } + } +} + +/** + * @brief filter values from x.values() using mask_indices + */ +template +void SparseMaskHelperKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out) { + PD_DISPATCH_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "SparseMaskHelperCPUKernel", ([&] { + SparseMaskHelperCPUKernel(dev_ctx, x, mask_indices, out); + })); +} + } // namespace sparse } // namespace phi @@ -101,3 +167,16 @@ PD_REGISTER_KERNEL(sparse_mask, int64_t) { kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); } + +PD_REGISTER_KERNEL(sparse_mask_helper, + CPU, + ALL_LAYOUT, + phi::sparse::SparseMaskHelperKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc index acc834269663d..0499371a4dd17 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc @@ -394,3 +394,15 @@ PD_REGISTER_KERNEL(csr_values, int64_t) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); } + +PD_REGISTER_KERNEL(sparse_coo_tensor, + CPU, + ALL_LAYOUT, + phi::sparse::SparseCooTensorKernel, + float, + double, + phi::dtype::float16, + uint8_t, + int16_t, + int, + int64_t) {} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu index d206d6bbc195c..96ab56697b9b0 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include + #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/ddim.h" @@ -20,6 +22,7 @@ limitations under the License. 
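Note: a plain-C++ sketch (not part of the patch) of the matching rule implemented by SparseMaskHelperCPUKernel above, assuming one value per non-zero (stride 1) and already-flattened indices. Values of x are gathered into the slots of the mask; mask entries with no matching x index stay zero. The GPU variant later in this diff obtains the same mapping with thrust::lower_bound instead of a hash map.

    #include <cstdint>
    #include <unordered_map>
    #include <vector>

    // Hypothetical helper with plain containers in place of DenseTensors.
    std::vector<float> MaskHelperSketch(const std::vector<int64_t>& x_indexs,
                                        const std::vector<float>& x_values,
                                        const std::vector<int64_t>& mask_indexs) {
      std::unordered_map<int64_t, uint64_t> x_map;
      for (uint64_t i = 0; i < x_indexs.size(); ++i) x_map[x_indexs[i]] = i;
      std::vector<float> out(mask_indexs.size(), 0.0f);  // unmatched slots stay 0
      for (uint64_t i = 0; i < mask_indexs.size(); ++i) {
        auto it = x_map.find(mask_indexs[i]);
        if (it != x_map.end()) out[i] = x_values[it->second];
      }
      return out;
    }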
*/ #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/sparse/common_shape.h" #include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" #include "paddle/phi/api/ext/dispatch.h" @@ -59,7 +62,7 @@ void SparseMaskGPUKernel(const GPUContext& dev_ctx, const DenseTensor& indices = mask.non_zero_indices(); const DenseTensor& values = mask.non_zero_elements(); int sparse_dim = indices.dims().size(); - DenseTensor sparse_offsets = phi::Empty( + DenseTensor sparse_offsets = phi::Empty( dev_ctx, DenseTensorMeta(DataType::INT64, {sparse_dim}, DataLayout::NCHW)); std::vector h_sparse_offsets(sparse_dim); @@ -121,6 +124,153 @@ void SparseMaskKernel(const Context& dev_ctx, })); } +// TODO(zhangkaihuo): Use an op to realize the function of FlattenIndices +template +__global__ void FlattenIndicesKernel(const IntT* indices, + const IntT* sparse_offsets, + const int64_t non_zero_num, + const int64_t sparse_dim, + IntT* out) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + phi::funcs::sparse::FlattenIndices(indices, + sparse_offsets, + non_zero_num, + sparse_dim, + tid, + gridDim.x * blockDim.x, + out); +} + +template +__global__ void SparseMaskCopyKernel(const IntT* x_indexs, + const IntT* mask_indexs, + const IntT* bound_out, + const T* x_values, + const int64_t n, + const int64_t stride, + T* out_values) { + CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + const IntT j = bound_out[i]; + if (j >= 0 && j < n && mask_indexs[i] == x_indexs[j]) { + for (int k = 0; k < stride; k++) { + out_values[i * stride + k] = x_values[j * stride + k]; + } + } + } +} + +template +void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + mask_indices.dims().size(), + 2, + phi::errors::InvalidArgument("the mask_indices must be 2-D tensor")); + + const int64_t sparse_dim = x.non_zero_indices().dims()[0]; + auto indices_dtype = paddle::experimental::CppTypeToDataType::Type(); + + std::vector sparse_offsets(sparse_dim); + + DenseTensorMeta x_indexs_meta(indices_dtype, {x.nnz()}, DataLayout::NCHW); + DenseTensorMeta mask_indexs_meta( + indices_dtype, {mask_indices.dims()[1]}, DataLayout::NCHW); + DenseTensorMeta sparse_offset_meta( + indices_dtype, {sparse_dim}, DataLayout::NCHW); + + DenseTensor x_indexs = + phi::Empty(dev_ctx, std::move(x_indexs_meta)); + DenseTensor mask_indexs = + phi::Empty(dev_ctx, std::move(mask_indexs_meta)); + DenseTensor bound_out = + phi::Empty(dev_ctx, std::move(mask_indexs_meta)); + DenseTensor d_sparse_offsets = + phi::Empty(dev_ctx, std::move(sparse_offset_meta)); + IntT* x_indexs_ptr = x_indexs.data(); + IntT* mask_indexs_ptr = mask_indexs.data(); + IntT* bound_out_ptr = bound_out.data(); + + // 1. calc the offsets of per dim + phi::funcs::sparse::CalcOffsetsPerDim(x.dims(), sparse_dim, &sparse_offsets); + // 2. copy sparse_offsets to device + phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), + sparse_offsets.data(), + sizeof(IntT) * sparse_dim, +#ifdef PADDLE_WITH_HIP + hipMemcpyHostToDevice, +#else + cudaMemcpyHostToDevice, +#endif + dev_ctx.stream()); + + // 3. 
flatten x indices and mask indices + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indexs.numel(), 1); + FlattenIndicesKernel<<>>(x.non_zero_indices().data(), + d_sparse_offsets.data(), + x_indexs.numel(), + sparse_dim, + x_indexs_ptr); + + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1); + FlattenIndicesKernel<<>>(mask_indices.data(), + d_sparse_offsets.data(), + mask_indexs.numel(), + sparse_dim, + mask_indexs_ptr); +// 4. call thrust::lower_bound +#ifdef PADDLE_WITH_HIP + thrust::lower_bound(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::lower_bound(thrust::cuda::par.on(dev_ctx.stream()), +#endif + x_indexs_ptr, + x_indexs_ptr + x_indexs.numel(), + mask_indexs_ptr, + mask_indexs_ptr + mask_indexs.numel(), + bound_out_ptr); + + // 5. copy value to out + *out = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, out, static_cast(0)); + T* out_ptr = out->data(); + + const int64_t stride = + x.dims().size() == sparse_dim ? 1 : x.dims().size() - sparse_dim; + + SparseMaskCopyKernel<<>>(x_indexs_ptr, + mask_indexs_ptr, + bound_out_ptr, + x.non_zero_elements().data(), + mask_indexs.numel(), + stride, + out_ptr); +} + +template +void SparseMaskHelperKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out) { + PD_DISPATCH_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "SparseMaskHelperGPUKernel", ([&] { + SparseMaskHelperGPUKernel(dev_ctx, x, mask_indices, out); + })); +} + } // namespace sparse } // namespace phi @@ -138,3 +288,17 @@ PD_REGISTER_KERNEL(sparse_mask, int64_t) { kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); } + +PD_REGISTER_KERNEL(sparse_mask_helper, + GPU, + ALL_LAYOUT, + phi::sparse::SparseMaskHelperKernel, + float, + double, + phi::dtype::float16, + uint8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index 1109baf92e302..0b6ac1aed0147 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -665,3 +665,15 @@ PD_REGISTER_KERNEL(csr_values, int64_t) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); } + +PD_REGISTER_KERNEL(sparse_coo_tensor, + GPU, + ALL_LAYOUT, + phi::sparse::SparseCooTensorKernel, + float, + double, + phi::dtype::float16, + uint8_t, + int16_t, + int, + int64_t) {} diff --git a/paddle/phi/kernels/sparse/sparse_mask_kernel.h b/paddle/phi/kernels/sparse/sparse_mask_kernel.h index 210412abd8620..88899e3dc672e 100644 --- a/paddle/phi/kernels/sparse/sparse_mask_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_mask_kernel.h @@ -26,5 +26,11 @@ void SparseMaskKernel(const Context& dev_ctx, const SparseCooTensor& mask, SparseCooTensor* out); +template +void SparseMaskHelperKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out); + } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc index 35329807e7798..15d78692f4f35 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc @@ -66,6 +66,19 @@ PD_REGISTER_KERNEL(sparse_coo_to_dense_grad, 
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); } +PD_REGISTER_KERNEL(sparse_coo_tensor_grad, + CPU, + ALL_LAYOUT, + phi::sparse::SparseCooTensorGradKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(coo_values_grad, GPU, @@ -95,4 +108,16 @@ PD_REGISTER_KERNEL(sparse_coo_to_dense_grad, int64_t) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); } +PD_REGISTER_KERNEL(sparse_coo_tensor_grad, + GPU, + ALL_LAYOUT, + phi::sparse::SparseCooTensorGradKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); +} #endif diff --git a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h index 0775582bf1fb8..a00b9c275c292 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" namespace phi { namespace sparse { @@ -32,5 +33,13 @@ void SparseCooToDenseGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, SparseCooTensor* x_grad); +template +void SparseCooTensorGradKernel(const Context& dev_ctx, + const DenseTensor& indices, + const SparseCooTensor& out_grad, + DenseTensor* values_grad) { + SparseMaskHelperKernel(dev_ctx, out_grad, indices, values_grad); +} + } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index 961cd9f829eb2..8cf9c0a28648a 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -15,6 +15,7 @@ limitations under the License. 
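SparseCooTensorGradKernel above simply delegates to SparseMaskHelperKernel: the gradient of values is the set of out_grad rows addressed by the original indices, with zeros where an index is absent. A NumPy sketch of that matching step, using np.searchsorted where the GPU kernel calls thrust::lower_bound; the function name and the assumption that out_grad's flattened indices are sorted are illustrative, not guarantees from this patch.

import numpy as np

def sparse_coo_tensor_grad(indices, out_grad_indices, out_grad_values, dims):
    # values_grad[i] is the out_grad row whose COO index equals indices[:, i],
    # or zero if that index is absent from out_grad.
    sparse_dim = indices.shape[0]
    x_keys = np.ravel_multi_index(tuple(out_grad_indices), dims[:sparse_dim])
    mask_keys = np.ravel_multi_index(tuple(indices), dims[:sparse_dim])

    values_grad = np.zeros((len(mask_keys),) + out_grad_values.shape[1:],
                           dtype=out_grad_values.dtype)
    if len(x_keys) == 0:
        return values_grad

    pos = np.searchsorted(x_keys, mask_keys)                # lower_bound analogue
    hit = (pos < len(x_keys)) & \
          (x_keys[np.minimum(pos, len(x_keys) - 1)] == mask_keys)
    values_grad[hit] = out_grad_values[pos[hit]]
    return values_grad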
*/ #pragma once #include "paddle/phi/api/lib/utils/storage.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" @@ -147,5 +148,16 @@ void CsrValuesKernel(const Context& dev_ctx, *out = x.non_zero_elements(); } +template +void SparseCooTensorKernel(const Context& dev_ctx, + const DenseTensor& values, + const DenseTensor& indices, + const IntArray& dense_shape, + SparseCooTensor* out) { + *out = + SparseCooTensor(indices, values, phi::make_ddim(dense_shape.GetData())); + // TODO(zhangkaihuo): sort and merge the dumplicate indices +} + } // namespace sparse } // namespace phi diff --git a/paddle/phi/tests/api/CMakeLists.txt b/paddle/phi/tests/api/CMakeLists.txt index 94378aceff58c..dd4b7e62ec52f 100644 --- a/paddle/phi/tests/api/CMakeLists.txt +++ b/paddle/phi/tests/api/CMakeLists.txt @@ -11,14 +11,14 @@ cc_test(test_mean_api SRCS test_mean_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_dot_api SRCS test_dot_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_empty_api SRCS test_empty_api.cc DEPS ${COMMON_API_TEST_DEPS}) -cc_test(test_fill_api SRCS test_fill_api.cc DEPS ${COMMON_API_TEST_DEPS}) +cc_test(test_fill_api SRCS test_fill_api.cc DEPS ${COMMON_API_TEST_DEPS} api_scalar) cc_test(test_elementwise_api SRCS test_elementwise_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_cast_api SRCS test_cast_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_reshape_api SRCS test_reshape_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_to_api SRCS test_to_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_slice_api SRCS test_slice_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_sum_api SRCS test_sum_api.cc DEPS ${COMMON_API_TEST_DEPS}) -cc_test(test_scale_api SRCS test_scale_api.cc DEPS ${COMMON_API_TEST_DEPS}) +cc_test(test_scale_api SRCS test_scale_api.cc DEPS ${COMMON_API_TEST_DEPS} api_scalar) cc_test(test_scale_benchmark SRCS test_scale_benchmark.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_conj_api SRCS test_conj_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_concat_api SRCS test_concat_api.cc DEPS ${COMMON_API_TEST_DEPS}) diff --git a/paddle/phi/tests/api/test_pten_tensor.cc b/paddle/phi/tests/api/test_pten_tensor.cc index 74ed648f3ee6e..590717b8d7b77 100644 --- a/paddle/phi/tests/api/test_pten_tensor.cc +++ b/paddle/phi/tests/api/test_pten_tensor.cc @@ -15,7 +15,6 @@ #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/phi/api/include/tensor.h" -#include "paddle/phi/api/lib/ext_compat_utils.h" #include "paddle/phi/core/kernel_registry.h" PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); @@ -201,7 +200,7 @@ void GroupTestDtype() { void TestInitilized() { experimental::Tensor test_tensor(paddle::PlaceType::kCPU, {1, 1}); - CHECK(test_tensor.is_initialized() == false); + CHECK(test_tensor.is_initialized() == true); test_tensor.mutable_data(paddle::PlaceType::kCPU); CHECK(test_tensor.is_initialized() == true); float* tensor_data = test_tensor.mutable_data(); diff --git a/paddle/phi/tests/common/CMakeLists.txt b/paddle/phi/tests/common/CMakeLists.txt index 710ea3c066472..ca6d20045d171 100644 --- a/paddle/phi/tests/common/CMakeLists.txt +++ b/paddle/phi/tests/common/CMakeLists.txt @@ -2,3 +2,9 @@ cc_test(phi_test_backend SRCS test_backend.cc DEPS gtest) cc_test(phi_test_data_layout SRCS test_data_layout.cc DEPS gtest) cc_test(phi_test_data_type SRCS test_data_type.cc DEPS gtest) 
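SparseCooTensorKernel above wraps indices and values as-is and leaves de-duplication to a TODO. For readers unfamiliar with what that coalescing step would involve, here is a minimal NumPy sketch (flatten the index tuples, segment-sum rows that share a key, rebuild the index matrix); it illustrates the TODO only and is not code from this patch.

import numpy as np

def coalesce(indices, values, dims):
    # Merge COO entries that share the same index tuple by summing their rows.
    sparse_dim = indices.shape[0]
    keys = np.ravel_multi_index(tuple(indices), dims[:sparse_dim])
    uniq, inverse = np.unique(keys, return_inverse=True)

    merged = np.zeros((len(uniq),) + values.shape[1:], dtype=values.dtype)
    np.add.at(merged, inverse, values)        # segment-sum duplicate rows

    out_indices = np.stack(np.unravel_index(uniq, dims[:sparse_dim]))
    return out_indices, merged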
cc_test(phi_test_place SRCS test_place.cc DEPS phi_place) +if (WITH_GPU) + nv_test(phi_test_scalar SRCS test_scalar.cu DEPS scalar api_scalar) +endif() +if(WITH_ROCM) + hip_test(phi_test_scalar SRCS test_scalar.cu DEPS scalar api_scalar) +endif() diff --git a/paddle/phi/tests/common/test_scalar.cu b/paddle/phi/tests/common/test_scalar.cu new file mode 100644 index 0000000000000..6b0caa175dc04 --- /dev/null +++ b/paddle/phi/tests/common/test_scalar.cu @@ -0,0 +1,205 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include // NOLINT +#include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); + +namespace phi { +namespace tests { + +using DDim = phi::DDim; +using float16 = phi::dtype::float16; +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +__global__ void FillTensor(float* data) { data[0] = 1; } + +TEST(Scalar, ConstructFromDenseTensor1) { + // 1. create tensor + const auto alloc = + std::make_unique(phi::CPUPlace()); + phi::DenseTensor dense_x( + alloc.get(), + phi::DenseTensorMeta( + phi::DataType::FLOAT16, phi::make_ddim({1}), phi::DataLayout::NCHW)); + phi::CPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx.Init(); + + auto* dense_x_data = dev_ctx.Alloc(&dense_x); + dense_x_data[0] = 1; + phi::Scalar scalar_test(dense_x); + ASSERT_NEAR(1, scalar_test.to(), 1e-6); +} + +TEST(Scalar, ConstructFromDenseTensor2) { + // 1. create tensor + const auto alloc = + std::make_unique(phi::CPUPlace()); + phi::DenseTensor dense_x( + alloc.get(), + phi::DenseTensorMeta( + phi::DataType::INT16, phi::make_ddim({1}), phi::DataLayout::NCHW)); + phi::CPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx.Init(); + + auto* dense_x_data = dev_ctx.Alloc(&dense_x); + dense_x_data[0] = 1; + phi::Scalar scalar_test(dense_x); + ASSERT_EQ(1, scalar_test.to()); +} + +TEST(Scalar, ConstructFromDenseTensor3) { + // 1. 
create tensor + const auto alloc = + std::make_unique(phi::CPUPlace()); + phi::DenseTensor dense_x( + alloc.get(), + phi::DenseTensorMeta( + phi::DataType::INT8, phi::make_ddim({1}), phi::DataLayout::NCHW)); + phi::CPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx.Init(); + + auto* dense_x_data = dev_ctx.Alloc(&dense_x); + dense_x_data[0] = 1; + phi::Scalar scalar_test(dense_x); + ASSERT_EQ(1, scalar_test.to()); +} + +TEST(Scalar, ConstructFromDenseTensor4) { + // 1. create tensor + const auto alloc = + std::make_unique(phi::CPUPlace()); + phi::DenseTensor dense_x( + alloc.get(), + phi::DenseTensorMeta( + phi::DataType::BOOL, phi::make_ddim({1}), phi::DataLayout::NCHW)); + phi::CPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx.Init(); + + auto* dense_x_data = dev_ctx.Alloc(&dense_x); + dense_x_data[0] = true; + phi::Scalar scalar_test(dense_x); + ASSERT_EQ(true, scalar_test.to()); +} + +TEST(Scalar, ConstructFromDenseTensor5) { + // 1. create tensor + const auto alloc = + std::make_unique(phi::CPUPlace()); + phi::DenseTensor dense_x(alloc.get(), + phi::DenseTensorMeta(phi::DataType::COMPLEX64, + phi::make_ddim({1}), + phi::DataLayout::NCHW)); + phi::CPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx.Init(); + + auto* dense_x_data = dev_ctx.Alloc(&dense_x); + dense_x_data[0] = 1; + phi::Scalar scalar_test(dense_x); + complex64 expected_value(1, 0); + EXPECT_TRUE(expected_value == scalar_test.to()); +} + +TEST(Scalar, ConstructFromDenseTensor6) { + // 1. create tensor + const auto alloc = + std::make_unique(phi::CPUPlace()); + phi::DenseTensor dense_x(alloc.get(), + phi::DenseTensorMeta(phi::DataType::COMPLEX128, + phi::make_ddim({1}), + phi::DataLayout::NCHW)); + phi::CPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx.Init(); + + auto* dense_x_data = dev_ctx.Alloc(&dense_x); + dense_x_data[0] = 1; + phi::Scalar scalar_test(dense_x); + complex128 expected_value(1, 0); + EXPECT_TRUE(expected_value == scalar_test.to()); +} + +TEST(Scalar, ConstructFromDenseTensor7) { + // 1. create tensor + const auto alloc = + std::make_unique(phi::GPUPlace()); + phi::DenseTensor dense_x( + alloc.get(), + phi::DenseTensorMeta( + phi::DataType::FLOAT32, phi::make_ddim({1}), phi::DataLayout::NCHW)); + phi::GPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::GPUPlace()) + .get()); + dev_ctx.Init(); + + auto* dense_x_data = dev_ctx.Alloc(&dense_x); + FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data); + dev_ctx.Wait(); + phi::Scalar scalar_test(dense_x); + ASSERT_NEAR(1, scalar_test.to(), 1e-6); +} + +TEST(Scalar, ConstructFromTensor) { + // 1. 
create tensor + const auto alloc = + std::make_unique(phi::GPUPlace()); + auto dense_x = std::make_shared( + alloc.get(), + phi::DenseTensorMeta( + phi::DataType::FLOAT32, phi::make_ddim({1}), phi::DataLayout::NCHW)); + + phi::GPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::GPUPlace()) + .get()); + dev_ctx.Init(); + auto* dense_x_data = dev_ctx.Alloc(dense_x.get()); + FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data); + dev_ctx.Wait(); + paddle::experimental::Tensor x(dense_x); + paddle::experimental::Scalar scalar_test(x); + ASSERT_NEAR(1, scalar_test.to(), 1e-6); +} + +} // namespace tests +} // namespace phi diff --git a/paddle/phi/tests/core/CMakeLists.txt b/paddle/phi/tests/core/CMakeLists.txt index 824d188457815..7d2fd90e6bb7b 100644 --- a/paddle/phi/tests/core/CMakeLists.txt +++ b/paddle/phi/tests/core/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS custom_kernel) +cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS custom_kernel scalar) cc_test(test_dense_tensor SRCS test_dense_tensor.cc DEPS dense_tensor) cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc) cc_test(test_type_info SRCS test_type_info.cc) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 13c7adb5872f2..3578b9a1aaeea 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -268,6 +268,7 @@ from .tensor.math import fmin # noqa: F401 from .tensor.math import inner # noqa: F401 from .tensor.math import outer # noqa: F401 +from .tensor.math import frac # noqa: F401 from .tensor.random import bernoulli # noqa: F401 from .tensor.random import poisson # noqa: F401 @@ -608,6 +609,7 @@ 'concat', 'check_shape', 'trunc', + 'frac', 'digamma', 'standard_normal', 'diagonal', diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index fbad470cb3f13..d2bed171aa27a 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -849,7 +849,9 @@ def all_gather(tensor_list, tensor, group=None, use_calc_stream=True): if in_dygraph_mode(): group = _get_default_group() if group is None else group - out = paddle.concat(tensor_list) + tensor_shape = list(tensor.shape) + tensor_shape[0] *= group.nranks + out = paddle.empty(tensor_shape, tensor.dtype) task = group.process_group.all_gather(tensor, out) task.wait() tensor_list.clear() diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py new file mode 100644 index 0000000000000..9df68dc419efa --- /dev/null +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py @@ -0,0 +1,410 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
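The all_gather change above pre-allocates one fused output whose leading dimension is group.nranks times that of the local tensor, instead of concatenating tensor_list. The sketch below shows the resulting buffer layout and one way such a buffer can be split back into per-rank tensors; the helper and the paddle.split usage are illustrative, not the patch's follow-up code.

import paddle

def split_fused_all_gather_output(out, nranks):
    # The fused buffer stacks each rank's tensor along dim 0, so a local
    # tensor of shape [n, ...] gathered over `nranks` ranks arrives as
    # [nranks * n, ...] and splits back into equal chunks.
    return list(paddle.split(out, nranks, axis=0))

# e.g. a [4, 8] tensor gathered across 2 ranks fills an [8, 8] buffer;
# splitting it yields two [4, 8] tensors, one per rank.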
+#Taken and modified for fairscale from: +# https://github.com/facebookresearch/fairscale/blob/main/fairscale/optim/oss.py +#Commit: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e + +import copy +import logging +import numpy as np +from collections import OrderedDict + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.optimizer import Optimizer +from paddle.fluid.clip import ClipGradByGlobalNorm +from paddle.distributed.collective import _get_global_group, new_group, broadcast, wait + +from .group_sharded_storage import ParamStorage, GradStorage +from .group_sharded_utils import Type, device_guard, GroupShardedClipGrad + +# CUDA alignment 256 bytes, cpu alignment 4096 bytes +alignment = {"gpu": 256, "cpu": 4096} +align = { + Type.fp16.value: 2, + Type.fp32.value: 4, +} + + +class GroupShardedOptimizerStage2(Optimizer): + """ + A wrapper for Sharding Stage2 Optimizer in Dygraph. + + .. warning: ShardingOptimizer encapsulates the optimization strategy and integrates it into the optimizer. + + .. ZeRO: 1.https://arxiv.org/pdf/1910.02054.pdf 2.https://arxiv.org/pdf/1910.02054.pdf. + + """ + + # TODO (Baibaifan) + # Feature Notes: + # 1. Unified memory for parameters and parameters.grad to InternalStorage. + # 2. Support the segmentation of optimizer parameters and partial updating of parameters. + # 3. Dynamically adjust training parameters and models. + # 4. Support offload function. + # 5. Support the establishment of independent communication groups. + # 6. Broadcast_fp16 is not supported now. + def __init__(self, + params, + optim, + group=None, + offload=False, + device="gpu", + pertrain_sync_models=True, + **kw): + + super().__init__(learning_rate=optim._learning_rate, parameters=params) + assert core.is_compiled_with_cuda(), "Only GPU is supported now" + + # Segmentation information + self._dtype_rank_params = OrderedDict( + ) # {dtype:[param1,param2]} device, rank, params + self._param2rank = {} + self.__segment_params = [] + self._rank_buffer_size = {} # {dtype: {rank: numel+alignment}} + self._param2align = {} # {param.name: align} + + # Default information + self._optim = optim + + assert hasattr(self._optim, "_master_weights" + ), "Must use optimizer with _master_weights attribute" + + # Support parameter group and parameter list + self._local_params = [] + if isinstance(params[0], dict): + for param_group in params: + self._local_params.extend(list(param_group["params"])) + else: + self._local_params.extend(list(params)) + + self._default_device = device + self._pfp16 = len( + list( + filter(lambda x: x.trainable and x.dtype == Type.fp16.value, + self._local_params))) > 0 + + self._group = new_group(_get_global_group() + .ranks) if group is None else group + + self.world_size = self._group.nranks + self._rank = self._group.rank + self._global_root_rank = self._group.ranks[0] + + # Synchronous all ranks models + if pertrain_sync_models: + self._sync_params_and_buffers() + + self.param_storages = {} # {dtype: {rank: InternalStorage}} + + if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm): + logging.warning( + "While using ClipGradByGlobalNorm in GroupShardedOptimizerStage2, the grad clip of original optimizer will be changed." 
+ ) + + self._optim._grad_clip = GroupShardedClipGrad( + self._optim._grad_clip, paddle.get_device(), self._group) + if self._optim._parameter_list and isinstance( + self._optim._parameter_list[0], dict): + for item in self._optim._param_groups: + if "grad_clip" in item.keys(): + item["grad_clip"] = self._optim._grad_clip + + if offload: + assert self._pfp16, "Only support offload strategy while using \'Adam\', \'AdamW\' and \'Momentum\' optimizer with AMP/Pure FP16" + + self.offload = offload # Using for offload + self.offload_device = "cpu" + self.offload_buffer_size = 0 + self.offload_param2align = {} + self.offload_params = None + self.offload_grads = None + self.dev_id = int(paddle.get_device().split(":")[1]) + + self._master_params = {} + + # Update optimizer parameters and adjust parameter storage and use according to rank. + self._update_opt_status() + + @paddle.autograd.no_grad() + def _sync_params_and_buffers(self): + """ + Sync all model states for all ranks + """ + + for p in self._local_params: + broadcast( + p, + src=self._global_root_rank, + group=self._group, + use_calc_stream=True) + + def _generate_master_params(self, trainable_params): + if self.offload: + for param in trainable_params: + if param.name not in self._master_params.keys(): + self._master_params[param.name] = core.eager.Tensor( + name=param.name, + value=param.cast(dtype=Type.fp32.value).numpy(), + place=core.CPUPlace(), + stop_gradient=param.stop_gradient) + else: + for param in trainable_params: + if param.dtype == Type.fp16.value: + master_tensor = paddle.cast(param, Type.fp32.value) + master_tensor.name = param.name + self._optim._master_weights[param.name] = master_tensor + + def _update_opt_status(self): + """Update optimizer status and parameter storage information, and special functions to be developed. + """ + # func 1 + self._integration_params() + + # Segement helpers + + def _segment_params(self): + """ + Divide all optimizer parameters equally into rank. + """ + if len(self.__segment_params) == 0: + self.__segment_params, param_lists = [ + [] for _ in range(self.world_size) + ], [[] for _ in range(self.world_size)] + sizes = [0] * self.world_size + for param in self._local_params: + # Add this param to rank with smallest size. + rank = sizes.index(min(sizes)) + param_lists[rank].append(param) + + # Statistical real numels + sizes[rank] += param._numel() if param.trainable else 0 + + for rank, params in enumerate(param_lists): + self.__segment_params[rank].extend(params) + return self.__segment_params + + @property + def local_params(self): + return self._local_params + + @property + def param2rank(self): + """Map the params to the rank which owns them""" + if len(self._param2rank) == 0: + for rank, params in enumerate(self._segment_params()): + for param in params: + self._param2rank[param.name] = rank + return self._param2rank + + @property + def dtype_rank_params(self): + """ + Divide the parameters into groups according to rank and dtype. 
+ """ + if len(self._dtype_rank_params) == 0: + # Assign the parameters of each rank according to the type + for param in self._local_params: + if param.dtype not in self._dtype_rank_params.keys(): + self._dtype_rank_params[ + param.dtype] = [[] for _ in range(self.world_size)] + self._dtype_rank_params[param.dtype][self.param2rank[ + param.name]].append(param) + + # Sort per rank params by size + for dtype in self._dtype_rank_params.keys(): + for rank_params in self._dtype_rank_params[dtype]: + rank_params.sort(key=lambda x: x._numel()) + + return self._dtype_rank_params + + @property + def rank_buffer_size(self): + """ + Count the memory size of the parameters corresponding to rank under the corresponding dtype. + """ + # CUDA alignment 256 bytes + if len(self._rank_buffer_size) == 0: + for dtype in self.dtype_rank_params.keys(): + if dtype not in self._rank_buffer_size.keys(): + self._rank_buffer_size[dtype] = {} + for dst_rank, per_rank_params in enumerate( + self.dtype_rank_params[dtype]): + if dst_rank not in self._rank_buffer_size[dtype].keys(): + self._rank_buffer_size[dtype][dst_rank] = 0 + for param in per_rank_params: + if not param.trainable: + continue + size = param._numel() * align[dtype] + remaining = size % alignment[self._default_device] + ali = 0 if remaining == 0 else alignment[ + self._default_device] - remaining + align_ = ali // align[dtype] + self._rank_buffer_size[dtype][dst_rank] += param._numel( + ) + align_ + self._param2align[param.name] = align_ + + return self._rank_buffer_size + + def _integration_params(self): + """ + Integrate the parameters into a continuous memory according to rank, and support the update of training parameters. + """ + + for dtype, per_rank_params in self.dtype_rank_params.items(): + if dtype not in self.param_storages.keys(): + self.param_storages[dtype] = {} + + for dst_rank, params in enumerate(per_rank_params): + if len(params) > 0: + + # Merge all the trainable params in a single InternalStorage + trainable_params = list( + filter(lambda x: x.trainable, params)) + if self._pfp16 and dst_rank == self._rank: + self._generate_master_params(trainable_params) + if trainable_params: + param_storage = ParamStorage( + size=self.rank_buffer_size[dtype][dst_rank], + dtype=dtype, + device=self._default_device) + + param_storage.add_rank_params(trainable_params, + self._param2align) + self.param_storages[dtype][dst_rank] = param_storage + + # Clear the InternalStorage keys which are not in use anymore + dtype_in_use = list(self.dtype_rank_params.keys()) + dtype_to_pop = list( + filter(lambda x: x not in dtype_in_use, self.param_storages.keys())) + for d in dtype_to_pop: + self.param_storages.pop(d) + + if self.offload: + self._optim._master_weights = self._master_params + cpu_master_params = [p for p in self._master_params.values()] + for param in cpu_master_params: + size = param._numel() * align[Type.fp32.value] + remaining = size % alignment[self.offload_device] + ali = 0 if remaining == 0 else alignment[ + self.offload_device] - remaining + align_ = ali // align[Type.fp32.value] + self.offload_buffer_size += param._numel() + align_ + self.offload_param2align[param.name] = align_ + + if cpu_master_params: + with device_guard(self._rank, self.offload_device): + self.offload_params = ParamStorage( + size=self.offload_buffer_size, + dtype=Type.fp32.value, + device=self.offload_device) + self.offload_params.buffer.name = "offload_buffer" + self.offload_params.add_rank_params( + cpu_master_params, self.offload_param2align, False) + 
self.offload_params.buffer.stop_gradient = False + + self.offload_grads = GradStorage( + size=self.offload_buffer_size, + dtype=Type.fp32.value, + device=self.offload_device, + destination=self._rank, + parm2align=self.offload_param2align, + convert_cpu=True) + for p in cpu_master_params: + self.offload_grads.add_grad( + p, self.offload_param2align[p.name]) + + self._optim._master_weights[ + self.offload_params.buffer. + name] = self.offload_params.buffer + + def _offload_acc_grad(self, param_name, grad_fp32_cpu): + """accumulate grads with offload strategy""" + with device_guard(self._rank, self.offload_device): + if param_name in self._master_params.keys(): + if self._master_params[param_name].grad is None: + self._master_params[param_name]._copy_gradient_from( + grad_fp32_cpu) + else: + self._master_params[param_name].grad.add_(grad_fp32_cpu) + + self.offload_params.buffer._copy_gradient_from( + self.offload_grads.buffer) + + def _offload_scale_grad(self, scale_size): + """scale grads with offload strategy""" + with device_guard(self._rank, self.offload_device): + self.offload_grads.buffer.scale_(scale=scale_size) + + def _offload_clear_grad(self): + """clear grads with offload strategy""" + with device_guard(self._rank, self.offload_device): + self.offload_grads.buffer.zero_() + + def step(self): + """ + A wrapper for Optimizer's step function to finish the update operation of the optimizer. + """ + + if self.offload: + params_list = [self.offload_params.buffer] + + #TODO(Baibaifan): Offload will support param_groups later + if not isinstance(self._optim._param_groups[0], dict): + self._optim._parameter_list = params_list + self._optim._param_groups = params_list + + # Run the optimizer of the current rank step + if self.offload: + with device_guard(device=self.offload_device): + self._optim.step() + + for param in self._local_params: + if param.name in self._master_params.keys(): + param.set_value(self._master_params[param.name].cuda( + self.dev_id).cast(dtype=param.dtype)) + else: + self._optim.step() + + # Synchronize all the updated shards in between the ranks + self._broadcast_params() + + def minimize(self): + raise RuntimeError( + "optimizer.minimize() not support now, please use optimizer.step()") + + def set_state_dict(self, state_dict): + self._optim.set_state_dict(state_dict) + + def state_dict(self): + return self._optim.state_dict() + + def _clear_cache(self): + self.__segment_params.clear() + self._dtype_rank_params.clear() + self._param2rank.clear() + + @paddle.autograd.no_grad() + def _broadcast_params(self): + """Broadcast the parameters of the current rank to each rank""" + + # Exchange all the shards with the other ranks + for dtype_per_rank in self.param_storages.values(): + for dst_rank, internal_storage in dtype_per_rank.items(): + broadcast( + tensor=internal_storage.buffer, + src=self._group.ranks[dst_rank], + group=self._group, + use_calc_stream=True) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py new file mode 100644 index 0000000000000..5f39ea0fd900f --- /dev/null +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py @@ -0,0 +1,536 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
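The offload branch of step() above keeps the optimizer state and FP32 master weights on CPU and only copies the cast result back into the GPU parameters. A condensed sketch of that flow, reusing the device_guard helper this file imports from group_sharded_utils; the standalone function, its arguments, and the absolute module path are illustrative stand-ins for the optimizer's own attributes.

def offload_step(optim, master_params, local_params, dev_id, offload_device="cpu"):
    # Assumed module path, implied by the file location of this patch.
    from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import (
        device_guard)

    with device_guard(device=offload_device):
        optim.step()                                    # update FP32 masters on CPU
    for param in local_params:
        if param.name in master_params:
            param.set_value(master_params[param.name]
                            .cuda(dev_id)
                            .cast(dtype=param.dtype))   # refresh the GPU weights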
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#Taken and modified for fairscale from: +# https://github.com/facebookresearch/fairscale/blob/main/fairscale/nn/data_parallel/sharded_ddp.py +#Commit: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e + +import logging +import time +import functools +import numpy as np +from functools import reduce +from collections import deque +from types import MethodType + +import paddle +from paddle import nn +from paddle.distributed import collective +from paddle.distributed.utils import get_logger + +from .group_sharded_storage import GradStorage +from .group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2 +from .group_sharded_utils import Taskflow, Type, device_guard + +logger_ = get_logger(logging.INFO) + + +def _trainable(param): + return param.trainable + + +class GroupShardedStage2(nn.Layer): + """ + A wrapper for Sharding Stage2 Layer in Dygraph. + .. warning: GroupShardedStage2 encapsulates the layer strategy and integrates it into the nn.Layer. + .. ZeRO: https://arxiv.org/pdf/1910.02054.pdf. + """ + + # TODO (Baibaifan) + # Feature Notes:: + # 1. Unified memory for param and param.grad to InternalStorage. + # 2. Divide param.grad according to rank to centrally apply for and release GPU memory. + # 3. Dynamically adjust training parameters and models. + # 4. Support offload function. + # 5. Support the establishment of independent communication groups. + + def __init__( + self, + layer, + sharding_optimizer, + group=None, + sync_buffers=False, + buffer_max_size=2**23, #8MB + auto_refresh_trainable=True, + device="gpu"): + super().__init__() + + # training options + self._layer = layer + self._sharding_optimizers = [sharding_optimizer] if not isinstance( + sharding_optimizer, list) else sharding_optimizer + assert all( + list( + map(lambda opt: isinstance(opt, GroupShardedOptimizerStage2), + self._sharding_optimizers)) + ), "Please use GroupShardedOptimizerStage2 optimizer" + self._sync_buffers = sync_buffers + self._auto_refresh_trainable = auto_refresh_trainable + + # Communication related attributes + self._group = collective.new_group(collective._get_global_group() + .ranks) if group is None else group + self._world_size_scaling = 1.0 / self._group.nranks + assert self._group.nranks > 1, "Training must be distributed, ranks must be greater than 1" + self._rank = self._group.rank + self._global_root_rank = self._group.ranks[ + 0] # picking ranks index 0 as the reference + self._default_device = device + + # Global statistical parameters + self._all_params = [] + for optim in self._sharding_optimizers: + self._all_params.extend(list(optim.local_params)) + + self._trainable_params = [] + self._grad_reduced = [] + self._trainable_param2rank = {} + self._trainable_param2align = {} + self._trainable_mask = list(map(_trainable, self._all_params)) + self._param_grads = [] + + # Set grad storage size & Display param sizes and model sizes + model_size = sum([p._numel() for p in self._layer.parameters()]) + assert buffer_max_size >= 0, "buffer_max_size must be GE than 0." 
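Putting the two stage-2 wrappers together, a minimal usage sketch based on the constructor signatures above. It assumes a multi-rank launch (e.g. via paddle.distributed.launch) with CUDA available, since both constructors assert that; the layer sizes, hyperparameters, and import paths (taken from the new file locations in this patch) are placeholders.

import paddle
import paddle.distributed as dist
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import (
    GroupShardedOptimizerStage2)
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import (
    GroupShardedStage2)

dist.init_parallel_env()                      # multi-rank eager environment

model = paddle.nn.Linear(1024, 1024)
optim = paddle.optimizer.AdamW(learning_rate=1e-3, parameters=model.parameters())

# Shard the optimizer state first, then wrap the layer so its backward hooks
# reduce each gradient to the rank that owns the matching parameter shard.
optim = GroupShardedOptimizerStage2(params=model.parameters(), optim=optim)
model = GroupShardedStage2(model, optim, buffer_max_size=2**23)

loss = model(paddle.rand([8, 1024])).mean()
loss.backward()
optim.step()                                  # wrapped to scale grads before the update
optim.clear_grad()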
+ self._buffer_max_size = self._rank_buffer_size(buffer_max_size, + model_size) + self._use_grad_storage = buffer_max_size > 0 + self._grad_storages = {} # {dtype: {rank: GradStorage}} + self._has_grad_storage = [] + self._grad_storage_list = [] + + # Offload + # TODO(haohongxiang): Now it's not be supported for multi-optimizers using Offload strategy + self._offload_optims = list( + filter(lambda optim: optim.offload, self._sharding_optimizers)) + if len(self._offload_optims) > 0: + assert len( + self._sharding_optimizers + ) == 1, "Only support offload strategy for single optimizer" + + self._offload = len(self._offload_optims) > 0 + self._offload_device = "cpu" + + # Set backward pass hooks + self._bw_hooks = [] + + # TODO (Baibaifan) Set tasks flow support asynchronous communicate + # self._tasks_flow = deque() + + # Define optimizer step and clear_grad + self._redefine_opt_step() + self._redefine_opt_clear() + + def forward(self, *inputs, **kwargs): + """ + A wrapper for Sharding Stage2 layer. + - Fresh trainable params or rebuild grad storage + - Sync layer's buffer params + - Clear all flags states + - Forward for origin layers + """ + + # Whether to need to reset trainable parameters + needs_fresh = len(self._bw_hooks) == 0 and self.training + + if self._auto_refresh_trainable: + needs_fresh |= self._detect_train_change() + + # Front hook + self._init_internal_storage(needs_fresh) + + # Sync layer's buffers state + if self._sync_buffers: + self.__sync_buffers() + + # Normal FW on the base model + fw = self._layer(*inputs, **kwargs) + + return fw + + def set_state_dict(self, state_dict, use_structured_name=True): + self._layer.set_state_dict( + state_dict, use_structured_name=use_structured_name) + + def state_dict(self, + destination=None, + include_sublayers=True, + structured_name_prefix=""): + return self._layer.state_dict( + destination=destination, + include_sublayers=include_sublayers, + structured_name_prefix=structured_name_prefix) + + def _clear_gradients(self): + """ + Set zero to the gradient of the optimizer's current rank trainable parameters. + """ + # Release grad storages + for dtype in self._grad_storages.keys(): + if not self._offload and self._rank in self._grad_storages[ + dtype].keys(): + self._grad_storages[dtype][self._rank].buffer.zero_() + + # Release grads of params + for param in self._trainable_params: + if param.name in self._param_grads and param.grad is not None: + param._zero_grads() + + # Release grads of master params with offload strategy + if self._offload: + self._sharding_optimizers[0]._offload_clear_grad() + + def _grad_scale(self): + """ + Before the gradient accumulation, scale the gradient. + """ + # Scale grad storages + for dtype in self._grad_storages.keys(): + if not self._offload and self._rank in self._grad_storages[ + dtype].keys(): + self._grad_storages[dtype][self._rank].buffer.scale_( + scale=self._world_size_scaling) + + # Scale grads of params + for param in self._trainable_params: + if param.name in self._param_grads and param.grad is not None: + param.grad.scale_(scale=self._world_size_scaling) + # param._reset_grad_inplace_version(True) + + # Scale grads of master params with offload strategy + if self._offload: + self._sharding_optimizers[0]._offload_scale_grad( + self._world_size_scaling) + + def _init_internal_storage(self, needs_fresh): + """ + Judge Fresh trainable params or rebuild grad storage. 
+ """ + if needs_fresh: + self._fresh_trainable() + else: + self._build_grad_storages() + + # Clear all flags state + self._clear_counters() + + def to(self, device=None, dtype=None, blocking=True): + """ + Synchronously or asynchronously convert the data type of the layer, the device is not supported now. + """ + assert isinstance(device, str), "Device must be type str" + assert device == self._default_device, "New devices are not supported, because of the optimizer state is not sync" + + self._layer.to(device=device, dtype=dtype, blocking=blocking) + + # Re-build the buckets, hooks, etc.. + self._fresh_trainable() + + def _fresh_trainable(self): + """ Whether to update training parameters. """ + + # Make sure that this is not done while gradients are waiting to be reduced (if no_sync context for instance) + if reduce(lambda x, y: x or y, self._grad_reduced, False): + logging.warning("Grads waiting to be reduced.") + + self._trainable_params = list( + filter(lambda x: x.trainable, self._all_params)) + self._trainable_params.sort(key=lambda x: x._numel()) + + self._trainable_param2rank = {} + for optim in self._sharding_optimizers: + # Need to be wrappered for Sharding Stage2 Optimizer + if len(optim.param_storages.keys()) == 0: + optim._update_opt_status() + + # Get the parameters split by the optimizer according to rank + for per_rank_params in optim.dtype_rank_params.values( + ): # all the params from all ranks + for params in per_rank_params: + for param in filter(lambda x: x.trainable, params): + self._trainable_param2rank[ + param.name] = optim.param2rank[param.name] + self._trainable_param2align[ + param.name] = optim._param2align[param.name] + + # Create grad_storage + self._setup_use_grad_storage() + # setup backward hooks + self._setup_backward_hooks() + + @paddle.autograd.no_grad() + def __sync_buffers(self): + """ + Sync all the param buffers from all ranks (exp: batch norm statistics). + """ + + for buffer in self._layer.buffers(include_sublayers=True): + collective.broadcast( + buffer, + self._global_root_rank, + self._group, + use_calc_stream=True) + + def __getattr__(self, name): + """Forward missing attributes to wrapped layer.""" + try: + return super().__getattr__(name) + except AttributeError: + return getattr(self._layer, name) + + @paddle.autograd.no_grad() + def _clear_counters(self): + """Reset all the grad reduce and call counters.""" + if self.training: + self._grad_reduced = [True for _ in self._trainable_params] + + if self._use_grad_storage: + for grad_storage in self._grad_storage_list: + grad_storage.reset_checked_in() + + def _get_reduce_fn(self, index, param, dst_rank): + """ + There are two ways to reduce gradient. + - 1. Do not use self._use_grad_storage or exceeded buffer_max_size will be reduced separately. + - 2. Use grad_storage Reduce the storage to get the full gradient from different ranks. 
+ """ + + if not self._use_grad_storage or not self._has_grad_storage[index]: + # Direct reduction + @paddle.autograd.no_grad() + def reduce(*_): + # Skip gradient reduction, do not change status information + if self._grad_reduced[index]: + assert param.grad is not None, "Parameter gradient cannot be None" + + # Change reduce information + self._grad_reduced[index] = False + + # Clear the gradient that does not belong to the current rank through the callback function + def cleanup(): + if dst_rank != self._rank: + param.clear_gradient(False) + elif self._offload: + tmp_grad = param.grad.cast( + dtype=Type.fp32.value).cpu() + + self._sharding_optimizers[0]._offload_acc_grad( + param.name, tmp_grad) + del tmp_grad + param.clear_gradient(False) + + # Synchronize the reduce parameter gradient + collective.reduce( + tensor=param.grad, + dst=self._group.ranks[dst_rank], + group=self._group) + # TODO (Baibaifan) Asynchronous the reduce parameter gradient + + # Clear the task flow and trigger callback to clear the redundant gradient + # self._clear_task_flow() + + cleanup() + + else: + # Buffer reduction + @paddle.autograd.no_grad() + def reduce(*_): + # Skip gradient reduction, do not change status information + if self._grad_reduced[index]: + assert param.grad is not None, "Parameter gradient cannot be None" + + # Change reduce information + self._grad_reduced[index] = False + grad_storage = self._grad_storages[param.dtype][dst_rank] + grad_storage.params_checked_in += 1 + + if grad_storage.all_checked_in: + assert grad_storage.buffer is not None + + # Clearing up the grad_storage buffer + def cleanup(): + if dst_rank != self._rank: + for p in grad_storage._params: + p.clear_gradient(False) + + grad_storage.buffer._clear_data() + elif self._offload: + grad_storage.to(device=self._offload_device) + for p in grad_storage._params: + with device_guard(): + tmp_grad = p.grad.cast( + dtype=Type.fp32.value) + self._sharding_optimizers[ + 0]._offload_acc_grad(p.name, tmp_grad) + p.clear_gradient(False) + grad_storage._device = self._default_device + grad_storage.buffer._clear_data() + + # Reduce the bucket + grad_storage.sent = True + # Synchronize the reduce parameter gradient + collective.reduce( + tensor=grad_storage.buffer, + dst=self._group.ranks[grad_storage.destination], + group=self._group) + # TODO (Baibaifan) Asynchronous the reduce parameter gradient + + cleanup() + + # Clear the task flow and trigger callback to clear the redundant gradient + # self._clear_task_flow() + + return reduce + + def _setup_backward_hooks(self): + """ + Set the backward hook to synchronize the gradients of all rank by reduce group ranks. + """ + + # Remove previous backward hooks + while len(self._bw_hooks) > 0: + self._bw_hooks.pop().remove() + + # Go through the parameters, attach the hook + if not self.training: + return + + for index, param in enumerate(self._trainable_params): + dst_rank = self._trainable_param2rank[param.name] + + reduce_function = self._get_reduce_fn(index, param, dst_rank) + + self._bw_hooks.append( + param._register_backward_hook(reduce_function)) + + def _setup_use_grad_storage(self): + """ + Integrate the parameters gradient into a continuous memory according to rank, and support the update of training parameters. 
+ """ + + # According to parameters's numel sort, allocate memory of parameter gradient to continuous memory according to rank + self._grad_storages = {} + self._has_grad_storage = [False for _ in self._trainable_params] + + for index, param in enumerate(self._trainable_params): + dst_rank = self._trainable_param2rank[param.name] + + if param.dtype not in self._grad_storages.keys(): + self._grad_storages[param.dtype] = {} + + if dst_rank not in self._grad_storages[param.dtype].keys(): + self._grad_storages[param.dtype][dst_rank] = GradStorage( + self._buffer_max_size[param.dtype], + dtype=param.dtype, + device=self._default_device, + destination=dst_rank, + parm2align=self._trainable_param2align) + + # Criteria to decide whether this parameter is to be put in GradStorage + if self._grad_storages[param.dtype][dst_rank].can_add_grad_view( + param, self._trainable_param2align[param.name]): + self._grad_storages[param.dtype][dst_rank].add_grad( + param, self._trainable_param2align[param.name]) + self._has_grad_storage[index] = True + else: + self._param_grads.append(param.name) + print( + "Can not add param: {}, param's shape: {}, param align: {}, grad_storages fill: {}, ". + format(param.name, param.shape, self._trainable_param2align[ + param.name], self._grad_storages[param.dtype][dst_rank] + ._fill)) + + for dtype in self._grad_storages.keys(): + self._grad_storage_list.extend( + list(self._grad_storages[dtype].values())) + + # def _clear_task_flow(self): + # """Try to consume the previous tasks.""" + # while len(self._tasks_flow) > 0: + # task = self._tasks_flow.popleft() + # task.wait() + # if task.callback is not None: + # task.callback() + + def _detect_train_change(self): + # Current trainable parameters + trainable_mask = list(map(_trainable, self._all_params)) + + # Whether parameters trainability changed + trainability_changed = trainable_mask != self._trainable_mask + + if trainability_changed: + logging.warning( + "Trainable params changed, because of eval/train mode or parameter freezing/unfreeze." + ) + self._trainable_mask = trainable_mask + + return trainability_changed + + def _build_grad_storages(self): + """ + Rebuild grad storages. + """ + # Rebuild fp16/fp32 grad storages + for dtype in self._grad_storages.keys(): + for dst_rank, grad_storage in self._grad_storages[dtype].items(): + if self._offload or dst_rank != self._rank: + grad_storage.manumal_relase() + grad_storage.rebuild() + + def _rank_buffer_size(self, buffer_max_size, model_size): + """ + Generate the minimum buffer size for each rank & Display param sizes and model sizes. + """ + + # Initialize buffer size + rank_buffer_size = {} + for shard_opt in self._sharding_optimizers: + if shard_opt.rank_buffer_size: + for dtype in shard_opt.rank_buffer_size.keys(): + sizes = max(shard_opt.rank_buffer_size[dtype].values()) + rank_buffer_size[dtype] = min(sizes, buffer_max_size) + + if Type.fp16.value in rank_buffer_size.keys(): + # FP16 GradStorage and model size + logger_.info( + "====== FP16 GradStorage size: {:.2f}M parameters, Model size {:.2f}M parameters ======". + format(rank_buffer_size[Type.fp16.value] / 2**19, model_size / 2 + **19)) + if Type.fp32.value in rank_buffer_size.keys(): + # FP32 GradStorage and model size + logger_.info( + "====== FP32 GradStorage size: {:.2f}M parameters, Model size {:.2f}M parameters ======". 
+ format(rank_buffer_size[Type.fp32.value] / 2**18, model_size / 2 + **18)) + return rank_buffer_size + + def _redefine_opt_step(self): + grad_func = self._grad_scale + for opt in self._sharding_optimizers: + opt_step = opt.step + + def _opt_step(self): + grad_func() + opt_step() + + opt.step = MethodType(_opt_step, opt) + + def _redefine_opt_clear(self): + clear_func = self._clear_gradients + + def _opt_clear(self): + clear_func() + + for opt in self._sharding_optimizers: + opt.clear_grad = MethodType(_opt_clear, opt) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py new file mode 100644 index 0000000000000..049d3ffa3694f --- /dev/null +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py @@ -0,0 +1,912 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import logging +import numpy as np +from types import MethodType +from collections import OrderedDict + +import paddle +from paddle import nn +from paddle.autograd import EagerPyLayer +import paddle.fluid.core as core +import paddle.fluid.framework as framework +from paddle.fluid.framework import EagerParamBase +from paddle.fluid.clip import ClipGradByGlobalNorm +from paddle.distributed import collective + +from .group_sharded_storage import GradStorage +from .group_sharded_utils import Type, GroupShardedClipGrad, device_guard + + +def _all_gather(tensor, buffer_size, group): + """ + The main difference with paddle.distributed.all_gather: + no need to pass in tensor_list, the returned tensor is spliced + """ + + assert group is not None + if framework.in_dygraph_mode(): + out = paddle.zeros([buffer_size], dtype=tensor.dtype) + task = group.process_group.all_gather(tensor, out) + return out, task + + +# CUDA alignment 256 bytes +alignment = {"gpu": 256, } +align = { + Type.fp16.value: 2, + Type.fp32.value: 4, +} + +global CHECK_LAYER +CHECK_LAYER = dict() # Help to check layer's id -> layer's name + + +class GroupShardedStage3(nn.Layer): + """ + A wrapper for Sharding Stage3 Layer in Dygraph. + + .. warning: GroupShardedStage3 encapsulates the layer strategy and integrates it into the nn.Layer. + + .. ZeRO: https://arxiv.org/pdf/1910.02054.pdf. + """ + + # TODO (Baibaifan) + # Feature Notes:: + # 1. The model supports the segmentation of parameters by global ranks in layers. + # 2. Support communication flow and computing flow. + # 3. Support offload function. + # 4. Support the establishment of independent communication groups. + + def __init__(self, + layer, + optimizer, + group=None, + sync_buffers=False, + device="gpu", + segment_size=2**20, + pertrain_sync_models=True, + offload=False, + sync_comm=False): + super().__init__() + + # Default configs + assert core.is_compiled_with_cuda(), "Only support CUDA." 
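_all_gather above returns a single spliced buffer holding every rank's padded parameter slice, plus the async task. The small NumPy sketch below shows how such a buffer maps back onto the original parameter once the alignment padding is dropped; the helper name and sizes are illustrative only.

import numpy as np

def reassemble(param_slices, numel, shape):
    # Simulate splicing per-rank slices back into the full parameter.
    full = np.concatenate(param_slices)       # what _all_gather writes into `out`
    return full[:numel].reshape(shape)        # drop padding, restore the dims

# Two ranks each holding 6 padded elements of a 10-element [2, 5] parameter:
slices = [np.arange(6, dtype=np.float32), np.arange(6, 12, dtype=np.float32)]
print(reassemble(slices, numel=10, shape=(2, 5)))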
+ self._layer = layer + self._default_device = device + self.__sync_buffers = sync_buffers + self._offload = offload + self._sync_comm = sync_comm + # segmentation size + assert segment_size >= 0, "segment_size must be GE than 0." + self._segment_size = segment_size + + global DEV + DEV = "cpu" if paddle.get_device() == "cpu" else paddle.get_device( + ).split(":")[0] + global DEV_ID + DEV_ID = 0 if paddle.get_device() == "cpu" else int(paddle.get_device() + .split(":")[1]) + global param2dtype + param2dtype = dict() + + # Communication group establishment + self._group = collective.new_group(collective._get_global_group() + .ranks) if group is None else group + self._world_size_scaling = 1.0 / self._group.nranks + assert self._group.nranks > 1, "Training must be distributed, ranks must be greater than 1." + self._rank = self._group.rank + self._global_root_rank = self._group.ranks[ + 0] # picking ranks index 0 as the reference + + # Parameter segmentation for global ranks + # After flatten -> self._param2buffer_size, self._param2buffer, self._trainable_params + self._param2buffer_size = dict() # {param.name: size} + self._param2buffer = dict( + ) # {param.name: [(start0, end0),(start1, end1), ...]} + self._trainable_params = dict() # {id(layer): [trainable_params]} + self._unslice_params = set() # param's numel <= segment_size + self._unslice_params2align = dict() # {param.name: param's align} + self._grad_storages = dict() # {param.dtype: GradStorage} + + assert not isinstance( + optimizer, list), "Multiple optimizers are not supported now." + self._optim = _OptimizerWrapper(optimizer, self._offload, self._group, + self._update_params_slice) + self._ori_parameter_list = self._optim._parameter_list + self._ori_param_groups = self._optim._param_groups + + # Replace optimizer's _grad_clip + if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm): + logging.warning( + "While using ClipGradByGlobalNorm in GroupShardedStage3, the grad clip of original optimizer will be changed." 
+ ) + self._optim._grad_clip = GroupShardedClipGrad( + self._optim._grad_clip, paddle.get_device(), self._group) + if self._optim._parameter_list and isinstance( + self._optim._parameter_list[0], dict): + for item in self._optim._param_groups: + if "grad_clip" in item.keys(): + item["grad_clip"] = self._optim._grad_clip + + # Synchronous all ranks models + if pertrain_sync_models: + self._sync_params_and_buffers() + + self._segment_rank_params(self._layer) + + # Add unslice params to master_weight in fp16 + self._handle_unslice_params() + + # In the first step, record the execution order of the layer + self._order_tracer = OrderedDict() + self._order_tracer["order"] = 0 + self._order_tracer["layer"] = list() + + # Register task flow + self._task_flow = TaskFlow() + + # Register forward hooks + self._register_forward_hooks(self._layer) + + # Register backward parameter hooks + self._register_backward_hooks() + + # Redefine optimizer step and clear function + self._redefine_opt_step() + self._redefine_opt_clear() + + @paddle.autograd.no_grad() + def _sync_params_and_buffers(self): + """ + Sync all model states for all ranks + """ + + for p in self._layer.parameters(): + collective.broadcast( + p, + src=self._global_root_rank, + group=self._group, + use_calc_stream=True) + + def _clear_gradients(self): + assert len(self._trainable_params.keys()) > 0 + current_layer_params = self._layer.parameters(include_sublayers=True) + # 1.Handle param's slice + trainable_params = list( + filter(lambda p: p.trainable and p not in self._unslice_params, + current_layer_params)) + for param in trainable_params: + assert hasattr( + param, "fw_storage" + ), "Find {} don't have fw_storage attribute.".format(param.name) + + param.fw_storage.clear_gradient(False) + param.bw_storage._clear() + param.bw_storage = None + # 2.Handle unslice param + if not self._offload: + for grad_storage in self._grad_storages.values(): + grad_storage.buffer.zero_() + else: + for param in list(self._unslice_params): + param.clear_gradient(False) + tmp_var = param.cuda(DEV_ID) + param._clear_data() + if tmp_var.dtype == Type.fp32.value and param2dtype[ + param.name] == Type.fp16.value: + tmp_var = paddle.cast(tmp_var, Type.fp16.value) + tmp_var._share_buffer_to(param) + del tmp_var + for grad_storage in self._grad_storages.values(): + grad_storage.manumal_relase() + grad_storage.rebuild() + + # Update param memery slice + def _update_params_slice(self): + update_list = self._update_params() + + if not isinstance(self._optim._param_groups[0], dict): + slice_params = [param.fw_storage for param in update_list] + self._optim._parameter_list = slice_params + list( + self._unslice_params) + self._optim._param_groups = slice_params + list( + self._unslice_params) + else: + for param_group in self._optim._param_groups: + p_group = [] + for p in param_group['params']: + if hasattr(p, "fw_storage"): + p_group.append(p.fw_storage) + else: + p_group.append(p) + + param_group['params'] = p_group + + def forward(self, *inputs, **kwargs): + """ + A wrapper for Sharding Stage3 layer. 
+ """ + # 1.Sync layer's buffers state + if self.__sync_buffers: + self._sync_buffers() + + # 2.Normal FW on the base model + fw = self._layer(*inputs, **kwargs) + + return fw + + def set_state_dict(self, state_dict, use_structured_name=True): + self._layer.set_state_dict( + state_dict, use_structured_name=use_structured_name) + + def state_dict(self, + destination=None, + include_sublayers=True, + structured_name_prefix=""): + return self._layer.state_dict( + destination=destination, + include_sublayers=include_sublayers, + structured_name_prefix=structured_name_prefix) + + def _handle_unslice_params(self): + buffer_size = dict() + buffer_size[Type.fp32.value] = 0 + buffer_size[Type.fp16.value] = 0 + for param in self._unslice_params: + # Updata optimizer master weights + if param.dtype == Type.fp16.value and not self._offload: + master_tensor = paddle.cast(param, Type.fp32.value) + master_tensor.name = param.name + self._optim._master_weights[param.name] = master_tensor + param2dtype[param.name] = param.dtype + p_align = self._param2align(param) + self._unslice_params2align[param.name] = p_align + buffer_size[param.dtype] += param._numel() + p_align + + # Create unslice_params'grad + for param in sorted(list(self._unslice_params), key=lambda p: p.name): + if param.dtype not in self._grad_storages.keys(): + self._grad_storages[param.dtype] = GradStorage( + buffer_size[param.dtype], + dtype=param.dtype, + device=self._default_device, + destination=self._rank, + parm2align=self._unslice_params2align) + self._grad_storages[param.dtype].add_grad( + param, self._unslice_params2align[param.name]) + + def _segment_rank_params(self, layer, name="last_layer"): + """ + Flatten parameters according to layer. + """ + current_layer_params = _current_layer_params(layer) + if current_layer_params: + CHECK_LAYER[id(layer)] = name + self._flatten_layer_params(layer, current_layer_params) + + for name, sub_layer in layer.named_children(): + self._segment_rank_params(sub_layer, name) + + def _flatten_layer_params(self, layer, current_layer_params): + """ + Parameter segmentation and memory integration. + """ + + def _add_manage_info(trainable_param): + return _PartitionParam(trainable_param) + + current_params = list() + for p in current_layer_params: + if p.trainable and p._numel() > self._segment_size: + current_params.append(_add_manage_info(p)) + elif p.trainable: + self._unslice_params.add(_UnsliceParam(p)) + + assert id(layer) not in self._trainable_params.keys() + self._trainable_params[id(layer)] = current_params + + for param in self._trainable_params[id(layer)]: + if param.name in self._param2buffer.keys(): + continue + self._param2buffer[param.name] = [] + # 1.Params alignment + align_ = self._param2align(param) + + offset = align_ + param._numel() + buffer_size = offset if offset % self._group.nranks == 0 else offset + self._group.nranks - ( + offset % self._group.nranks) + self._param2buffer_size[param.name] = buffer_size + + # 2.Combination param buffer + assert buffer_size % self._group.nranks == 0 + pre_buffer = buffer_size // self._group.nranks + + for rank_ in range(self._group.nranks): + self._param2buffer[param.name].append( + (rank_ * pre_buffer, (rank_ + 1) * pre_buffer)) + + # Record param's dtype + param2dtype[param.name] = param.dtype + # 3.Flatten layer params and release other rank buffer + self._param_storage(param, buffer_size) + + def _param_storage(self, param, buffer_size): + """ + This is a function to simplify the handling of parameter InternalStorages. 
+ """ + assert isinstance(buffer_size, int) + value = np.zeros( + buffer_size, + dtype=np.float16) if Type.fp16.value == param.dtype else np.zeros( + buffer_size, dtype=np.float32) + buffer = core.eager.Tensor(value=value, place=core.CPUPlace()) + + param_shape = param.shape + origin_state = param.stop_gradient + param.stop_gradient = True + param.flatten_() + param.stop_gradient = origin_state + start, end = self._param2buffer[param.name][self._rank] + + # Copy the current param value + with device_guard(): + tmp_var = buffer._slice(0, param._numel()) + param_cpu = param.cpu() + tmp_var.get_tensor().set(param_cpu.get_tensor(), core.CPUPlace()) + del tmp_var + param.get_tensor()._set_dims(param_shape) + param._clear_data() + + # Current rank param_storage + if self._offload: + with device_guard(): + tmp_tensor = buffer._slice(start, end) + param.fw_storage = core.eager.Tensor( + value=tmp_tensor, + place=core.CPUPlace(), + name="slice@" + param.name) + else: + param.fw_storage = core.eager.Tensor( + value=buffer._slice(start, end), name="slice@" + param.name) + param.status = "part" + + # Updata optimizer master weights + if param.dtype == Type.fp16.value and not self._offload: + master_tensor = paddle.cast(param.fw_storage, Type.fp32.value) + master_tensor.name = param.name + self._optim._master_weights[param.fw_storage.name] = master_tensor + + def _register_forward_hooks(self, layer): + """ + Register EagerPyLayer to manage memory slices. + There are four stages: + FW + 1. Before the forward layers, synchronize the full parameters. + 2. After the forward layers, release the full parameter and keep the parameter slice. + BW + 3. Before the backward layers, synchronize the full parameters and create param's grad. + 4. After the gradient accumulation, release the full parameter and keep the parameter slice. + """ + current_layer_params = _current_layer_params(layer) + if current_layer_params: + self._register_forward_all_hooks(layer, self._task_flow) + + for _, sub_layer in layer.named_children(): + self._register_forward_hooks(sub_layer) + + def _register_forward_all_hooks(self, sub_layer, task_flow): + def _forward_pre_hook(layer, inputs): + return ForwardPreHooks(layer, self._order_tracer, + self._trainable_params, + self._param2buffer_size, self._group, + self._sync_comm, self._offload, task_flow) + + def _forward_post_hook(layer, inputs, outputs): + return ForwardPostHooks.apply( + outputs, layer, self._order_tracer, self._trainable_params, + self._param2buffer, self._param2buffer_size, self._rank, + self._group, self._sync_comm, self._offload, task_flow) + + # register previous forward hooks + sub_layer.register_forward_pre_hook(_forward_pre_hook) + + # register post forward hooks + sub_layer.register_forward_post_hook(_forward_post_hook) + + @paddle.autograd.no_grad() + def _sync_buffers(self): + """ + Sync all the param buffers from all ranks (exp: batch norm statistics). + """ + + for buffer in self._layer.buffers(include_sublayers=True): + collective.broadcast( + buffer, + self._global_root_rank, + self._group, + use_calc_stream=True) + + def __getattr__(self, name): + """Forward missing attributes to wrapped layer.""" + try: + return super().__getattr__(name) + except AttributeError: + return getattr(self._layer, name) + + def _update_params(self): + """ + Update parameters to optimizer memory slice. 
+ """ + update_list = [] + assert len(self._trainable_params.keys()) > 0 + current_layer_params = self._layer.parameters(include_sublayers=True) + trainable_params = list( + filter(lambda p: p.trainable and p not in self._unslice_params, + current_layer_params)) + # 1.Handle param's slice + for param in trainable_params: + assert hasattr( + param, + "fw_storage"), "Find {} don't have fw_storage attribute".format( + param.name) + # Gradient average + if self._offload: + with device_guard(): + param.bw_storage.scale_(scale=self._world_size_scaling) + else: + param.bw_storage.scale_(scale=self._world_size_scaling) + param.fw_storage = _VarBaseWrapper(param) + assert param.fw_storage.grad is None + param.fw_storage._copy_gradient_from(param.bw_storage) + update_list.append(param) + + # 2.Handle unslice param + for grad_storage in self._grad_storages.values(): + grad_storage.buffer.scale_(scale=self._world_size_scaling) + collective.all_reduce(tensor=grad_storage.buffer, group=self._group) + if self._offload: + for param in list(self._unslice_params): + tmp_var = _device2cpu(param, convert_dtype=True) + tmp_var._share_buffer_to(param) + del tmp_var + + for grad_storage in self._grad_storages.values(): + for p in grad_storage._params: + tmp_g = _device2cpu(p.grad, convert_dtype=True) + p.clear_gradient(False) + p._copy_gradient_from(tmp_g) + del tmp_g + grad_storage.buffer._clear() + + return update_list + + def get_all_parameters(self, convert2cpu=False): + """ + Get the full parameters and return the corresponding task flows. + """ + assert len(self._trainable_params.keys()) > 0 + current_layer_params = self._layer.parameters(include_sublayers=True) + trainable_params = list( + filter(lambda p: p.trainable and p not in self._unslice_params, + current_layer_params)) + t_flow = _allgather_buffer( + trainable_params, + self._group, + param2buffer_size=self._param2buffer_size, + use_calc_stream=True, + task_flow=TaskFlow(), + sync_wait=True, + offload=self._offload, + convert2cpu=convert2cpu) + if convert2cpu: + for param in trainable_params: + t_flow.full_param[param.name][0]._share_buffer_to(param) + + self._optim._parameter_list = self._ori_parameter_list + self._optim._param_groups = self._ori_param_groups + + def _register_backward_hooks(self): + current_layer_params = self._layer.parameters(include_sublayers=True) + trainable_params = list( + filter(lambda p: p.trainable and p not in self._unslice_params, + current_layer_params)) + + for param in trainable_params: + allreduce_function = self._get_allreduce_fn(param) + param._register_backward_hook(allreduce_function) + + def _get_allreduce_fn(self, param): + @paddle.autograd.no_grad() + def allreduce_(*_): + if param.name in self._task_flow.full_grad.keys(): + full_grad = self._task_flow.full_grad[param.name] + # Only support sync allreduce current rank's layer now + collective.all_reduce(tensor=full_grad, group=self._group) + + start, end = self._param2buffer[param.name][self._rank] + if param.bw_storage is None: + param.bw_storage = full_grad._slice(start, + end).detach().clone() + if self._offload: + param.bw_storage = _device2cpu(param.bw_storage, True) + else: + if self._offload: + cpu_grad = _device2cpu( + full_grad._slice(start, end).detach().clone(), True) + with device_guard(): + param.bw_storage = paddle.add(param.bw_storage, + cpu_grad) + else: + param.bw_storage = paddle.add( + param.bw_storage, + full_grad._slice(start, end).detach().clone()) + param.clear_gradient(False) + del self._task_flow.full_grad[param.name] + + if 
param.name in self._task_flow.full_param.keys(): + if param.status == "all": + param.use_count = 0 + param._clear_data() + start, end = self._param2buffer[param.name][self._rank] + param.fw_storage = self._task_flow.full_param[param.name][ + 0]._slice(start, end).detach().clone() + param.status = "part" + del self._task_flow.full_param[param.name] + + if self._offload: + param.fw_storage = _device2cpu(param.fw_storage, True) + + return allreduce_ + + def _param2align(self, param): + # CUDA alignment 256 bytes + size = param._numel() * align[param.dtype] + remaining = size % alignment[self._default_device] + ali = 0 if remaining == 0 else alignment[ + self._default_device] - remaining + align_ = ali // align[param.dtype] + return align_ + + def _redefine_opt_step(self): + params_slice_func = self._update_params_slice + opt_step = self._optim.step + + def _opt_step(self): + if not self.update_scaler: + params_slice_func() + if self.offload: + with device_guard(): + opt_step() + else: + opt_step() + + def _opt_minimize(self): + raise RuntimeError( + "optimizer.minimize() not support now, please use optimizer.step()" + ) + + self._optim.step = MethodType(_opt_step, self._optim) + self._optim.minimize = MethodType(_opt_minimize, self._optim) + + def _redefine_opt_clear(self): + clear_func = self._clear_gradients + + def _opt_clear(self): + clear_func() + + self._optim.clear_grad = MethodType(_opt_clear, self._optim) + + +def ForwardPreHooks(layer, order_tracer, trainable_params, param2buffer_size, + group, sync_comm, offload, task_flow): + + # Record layer's id + layer_id = id(layer) + use_calc, sync_wait = False, False + + if layer_id not in order_tracer.keys() or sync_comm: + use_calc, sync_wait = True, True + + # Whether to use calc stream + task_flow.use_calc[layer_id] = use_calc + else: + # Whether to use calc stream + task_flow.use_calc[layer_id] = use_calc + # wait current layer params + _wait_layer(trainable_params[layer_id], task_flow, group, + param2buffer_size, use_calc, offload) + + if layer_id == order_tracer["layer"][-1]: return + order_ = order_tracer[layer_id] + layer_id = order_tracer["layer"][order_ + 1] + + _allgather_buffer( + trainable_params[layer_id], + group, + param2buffer_size=param2buffer_size, + use_calc_stream=use_calc, + task_flow=task_flow, + sync_wait=sync_wait, + offload=offload) + + return + + +class ForwardPostHooks(EagerPyLayer): + @staticmethod + def forward(ctx, inputs, layer, order_tracer, trainable_params, + param2buffer, param2buffer_size, rank, group, sync_comm, + offload, task_flow): + + layer_id = id(layer) + # release current layer full params + _release_param(trainable_params[layer_id], param2buffer, rank, + task_flow, offload) + + if layer_id not in order_tracer.keys(): + order_ = order_tracer["order"] + order_tracer[layer_id] = order_ + order_tracer["order"] += 1 + order_tracer["layer"].append(layer_id) + + #Record fw info + ctx.order_tracer = order_tracer + ctx.task_flow = task_flow + ctx.group = group + ctx.layer_id = layer_id + ctx.sync_comm = sync_comm + ctx.trainable_params = trainable_params + ctx.param2buffer_size = param2buffer_size + ctx.offload = offload + + return inputs + + @staticmethod + def backward(ctx, *args): + # Load context value + order_tracer = ctx.order_tracer + task_flow = ctx.task_flow + group = ctx.group + layer_id = ctx.layer_id + trainable_params = ctx.trainable_params + param2buffer_size = ctx.param2buffer_size + sync_comm = ctx.sync_comm + offload = ctx.offload + use_calc, sync_wait = False, False + + # Allgather 
params synchronization + if sync_comm: + use_calc, sync_wait = True, True + _allgather_buffer( + trainable_params[layer_id], + group, + param2buffer_size=param2buffer_size, + use_calc_stream=use_calc, + task_flow=task_flow, + sync_wait=sync_wait, + offload=offload) + else: + _wait_layer(trainable_params[layer_id], task_flow, group, + param2buffer_size, use_calc, offload) + + # Create params's grad + _create_params_grad(trainable_params[layer_id], param2buffer_size, + task_flow) + + # Whether to use calc stream + task_flow.use_calc[layer_id] = use_calc + if layer_id != order_tracer["layer"][0] and not sync_comm: + layer_next_id = order_tracer["layer"][order_tracer[layer_id] - 1] + _allgather_buffer( + trainable_params[layer_next_id], + group, + param2buffer_size=param2buffer_size, + use_calc_stream=use_calc, + task_flow=task_flow, + sync_wait=sync_wait, + offload=offload) + + return args + + +class TaskFlow: + """ + Task flows, one way linked list for task acquisition. + """ + + def __init__(self, + full_param=dict(), + full_grad=dict(), + use_calc=dict(), + callback=None): + self.full_param = full_param + self.full_grad = full_grad + self.use_calc = use_calc + self.callback = callback + + +def _release_param(trainable_params, + param2buffer, + rank, + task_flow, + offload=False): + for param in trainable_params: + # async communicate share weight not clear + param.use_count -= 1 + if param.use_count == 0: + param._clear_data() + if param.name in task_flow.full_param.keys(): + start, end = param2buffer[param.name][rank] + with paddle.amp.auto_cast(enable=False): + param.fw_storage = task_flow.full_param[param.name][ + 0]._slice(start, end).detach().clone() + param.status = "part" + del task_flow.full_param[param.name] + + if offload: + param.fw_storage = _device2cpu(param.fw_storage) + return + + +def _wait_layer(trainable_params, + task_flow, + group, + param2buffer_size, + use_calc_stream, + offload=False): + + for param in trainable_params: + if param.status == "all": + param.use_count += 1 + continue + if param.name in task_flow.full_param.keys(): + full_param, task = task_flow.full_param[param.name] + task.wait() + full_param._slice(0, param._numel())._share_buffer_to(param) + param.fw_storage._clear() + param.fw_storage = None + param.status = "all" + param.use_count += 1 + else: + _allgather_buffer( + trainable_params, + group, + param2buffer_size=param2buffer_size, + use_calc_stream=True, + task_flow=task_flow, + sync_wait=True, + offload=offload) + break + return task_flow + + +def _allgather_buffer(trainable_params, + group, + param2buffer_size, + use_calc_stream, + task_flow, + sync_wait=False, + offload=False, + convert2cpu=False): + + for param in trainable_params: + if param.status == "all": + param.use_count += 1 + continue + + if offload: + param.fw_storage = _cpu2device(param) + + buffer_size = param2buffer_size[param.name] + with paddle.amp.auto_cast(enable=False): + full_param, task = _all_gather(param.fw_storage, buffer_size, group) + + # Allgather current layer in the 1st step synchronously + if sync_wait: + with paddle.amp.auto_cast(enable=False): + task.wait() + full_param._slice(0, param._numel())._share_buffer_to(param) + param.fw_storage._clear() + param.fw_storage = None + param.status = "all" + param.use_count += 1 + task_flow.full_param[param.name] = (full_param, task) + + # parameter converts to cpu + if convert2cpu: + p_name = param.name + param = _device2cpu(param) + del task_flow.full_param[p_name] + task_flow.full_param[p_name] = (param, None) + + return 
task_flow + + +@paddle.autograd.no_grad() +def _create_params_grad(trainable_params, param2buffer_size, task_flow): + for param in trainable_params: + if param.name in task_flow.full_grad.keys(): + continue + assert isinstance(param2buffer_size[param.name], int) + temp_grad = paddle.zeros( + [param2buffer_size[param.name]], dtype=param.dtype) + temp_tensor = temp_grad._slice(0, param._numel()) + temp_tensor.get_tensor()._set_dims(param.shape) + param._copy_gradient_from(temp_tensor) + del temp_tensor + task_flow.full_grad[param.name] = temp_grad + return task_flow + + +def _PartitionParam(param): + if not hasattr(param, "fw_storage"): + setattr(param, "fw_storage", None) + setattr(param, "bw_storage", None) + setattr(param, "status", "all") + setattr(param, "use_count", 0) + return param + + +def _UnsliceParam(param): + if not hasattr(param, "unslice"): + setattr(param, "unslice", True) + return param + + +def _VarBaseWrapper(param): + varbase = param.fw_storage + tmp_param = EagerParamBase( + shape=varbase.shape, dtype=varbase.dtype, name="slice@" + param.name) + varbase._share_buffer_to(tmp_param) + tmp_param.regularizer = param.regularizer + tmp_param.optimize_attr['learning_rate'] = param.optimize_attr[ + 'learning_rate'] + varbase._clear() + return tmp_param + + +def _OptimizerWrapper(optimizer, offload, group, update_params_slice): + if not hasattr(optimizer, "_optim"): + setattr(optimizer, "_optim", optimizer) + setattr(optimizer, "offload", offload) + setattr(optimizer, "_group", group) + setattr(optimizer, "update_scaler", None) + setattr(optimizer, "update_slice", update_params_slice) + return optimizer + + +def _device2cpu(trans_param, convert_dtype=False): + if convert_dtype: + trans_param = paddle.cast(trans_param, Type.fp32.value) + tmp_p = trans_param.cpu() + trans_param._clear_data() + return tmp_p + + +def _cpu2device(param): + tmp_p = param.fw_storage.cuda(DEV_ID) + if tmp_p.dtype == Type.fp32.value and param2dtype[ + param.name] == Type.fp16.value: + tmp_p = paddle.cast(tmp_p, Type.fp16.value) + return tmp_p + + +def _current_layer_params(layer): + return layer.parameters( + include_sublayers=False) + list(layer.extra_parameters) if hasattr( + layer, "extra_parameters") else layer.parameters( + include_sublayers=False) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py new file mode 100644 index 0000000000000..7a57fb29b9472 --- /dev/null +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py @@ -0,0 +1,313 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
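The stage-3 wrapper above never keeps a sliced parameter as-is: _param2align pads every tensor to an aligned boundary (256 bytes on CUDA) and _flatten_layer_params then rounds the padded length up to a multiple of the group size, so each rank owns an equal, aligned slice of one flat buffer. Below is a minimal, framework-free sketch of that bookkeeping, assuming the same per-dtype element sizes and 256-byte alignment as the align/alignment tables consulted by _param2align; the function name is illustrative, not part of the patch.

def slice_plan(numel, element_size, nranks, device_alignment=256):
    # Padding (in elements) so the parameter ends on an aligned boundary.
    remaining = (numel * element_size) % device_alignment
    pad = 0 if remaining == 0 else (device_alignment - remaining) // element_size
    # Round the padded length up to a multiple of nranks so the flat buffer
    # splits into equally sized per-rank slices.
    padded = numel + pad
    buffer_size = padded if padded % nranks == 0 else padded + nranks - padded % nranks
    per_rank = buffer_size // nranks
    return [(r * per_rank, (r + 1) * per_rank) for r in range(nranks)]

# Example: a float16 parameter with 1000 elements sharded across 2 ranks.
# 1000 * 2 bytes = 2000 bytes, padded by 24 elements to reach 2048 bytes;
# the 1024-element buffer splits into [(0, 512), (512, 1024)].
print(slice_plan(1000, 2, 2))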
+#Taken and modified for fairscale from: +# https://github.com/facebookresearch/fairscale/blob/main/fairscale/nn/misc/param_bucket.py +#Commit: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e + +import os +import time +import numpy as np + +import paddle +from paddle.fluid import core +from .group_sharded_utils import Type, device_guard + + +class InternalStorage: + """ + This is a basic class, which is responsible for consolidating the basic storage tensor. + + """ + + # Support integration parameter tensor + def __init__(self, size, dtype, device, convert_cpu=False): + self._params = [] + self._param_ids = [] + self._fill = 0 + self._device = device + self._dtype = dtype + + # The flatten tensor + size = [size] if isinstance(size, int) else size + if convert_cpu: + value = np.zeros( + size, + dtype=np.float16) if Type.fp16.value == dtype else np.zeros( + size, dtype=np.float32) + self.buffer = core.eager.Tensor(value=value, place=core.CPUPlace()) + else: + self.buffer = paddle.zeros(size, dtype=dtype) + + self.dev_id = 0 if paddle.get_device() == "cpu" else int( + paddle.get_device().split(":")[1]) + + def to(self, device, dtype=None, keep_alignment=True): + """ + Move the underlying buffer + """ + assert self.buffer is not None, "Cannot move a collapsed bucket, please rebuild it" + assert (dtype == Type.fp32.value or + Type.fp16.value), "Conversion type is not supported now" + + if self._device != device: + tmp_buffer = self.buffer.cuda( + self.dev_id) if device == "gpu" else self.buffer.cpu() + for param in self._params: + param.clear_gradient(False) + + del self.buffer + self.buffer = tmp_buffer + self._device = device + + if dtype is not None: + self.buffer = self.buffer.cast(dtype=dtype) + self._dtype = dtype + + +class ParamStorage(InternalStorage): + """ + This is a basic class to simplify the handling of parameter InternalStorages. + """ + + def __init__(self, size, dtype, device): + super().__init__(size, dtype, device, convert_cpu=True) + self.param2align = None + + def to(self, device, dtype=None, keep_alignment=True): + """ + Move the underlying buffer + """ + + super().to(device, dtype) + + if keep_alignment: + self._array_params() + + @paddle.autograd.no_grad() + def add_rank_params(self, trainable_params, param2align, convert_gpu=True): + """ + Add new parameters to the InternalStorage. Params becomes a view of this InternalStorage buffer. 
+ """ + + assert all([ + id(param) not in self._param_ids for param in trainable_params + ]), "The same param cannot be checked in twice" + assert self.buffer is not None + + self.param2align = param2align + + cpu_param_shape = list() + for param in trainable_params: + p_shape = self._add_param_as_view(param, param2align[param.name], + convert_gpu) + cpu_param_shape.append(p_shape) + + if convert_gpu: + # buffer convert from cpu to cuda + self.buffer = self.buffer.cuda(self.dev_id) + + self._fill = 0 + + for idx, param in enumerate(trainable_params): + self._convert_buffer(param, cpu_param_shape[idx], + param2align[param.name]) + self._params.append(param) + self._param_ids.append(id(param)) + + @paddle.autograd.no_grad() + def _add_param_as_view(self, param, align, convert_gpu=True): + + assert ( + param.dtype == self.buffer.dtype + ), "Different types for the InternalStorage and the param, cannot proceed: {} - {}".format( + param.dtype, self.buffer.dtype) + + var_end = self._fill + param._numel() + offset = var_end + align + assert offset <= self.buffer._numel() + + p_shape = param.shape + + origin_state = param.stop_gradient + param.stop_gradient = True + param.flatten_() + param.stop_gradient = origin_state + + # Copy the current param value + + with device_guard(self.dev_id, "cpu"): + tmp_var = self.buffer._slice(self._fill, var_end) + if convert_gpu: + param_cpu = param.cpu() + param._clear_data() + tmp_var.set_value(param_cpu) + else: + tmp_var.set_value(param) + del tmp_var + + self._fill = offset + return p_shape + + @paddle.autograd.no_grad() + def _convert_buffer(self, param, p_shape, align): + + var_end = self._fill + np.prod(p_shape).tolist() + offset = var_end + align + assert offset <= self.buffer._numel() + + # Convert the param value + with device_guard(self.dev_id, self._device): + tmp_tensor = self.buffer._slice(self._fill, var_end) + tmp_tensor._share_buffer_to(param) + param.get_tensor()._set_dims(p_shape) + + self._fill = offset + + @paddle.autograd.no_grad() + def _array_params(self): + """ + Given the parameters which have been registered previously, rebuild the whole InternalStorage. + """ + assert len(self._params) > 0 + assert self.param2align is not None + + self._fill = 0 + for p in self._params: + self._convert_buffer(p, p.shape, self.param2align[p.name]) # modify + + +class GradStorage(InternalStorage): + """ + This is a basic class to simplify the handling of gradient InternalStorages + """ + + def __init__(self, + size, + dtype, + device, + destination, + parm2align, + convert_cpu=False): + if isinstance(size, np.int64): + size = size.tolist() + super().__init__(size, dtype, device, convert_cpu) + + self._max_size = size + self._release = False + + self.params_checked_in = 0 + self.destination = destination + self._parm2align = parm2align + self.sent = False + + def reset_checked_in(self): + """ Reset the counter of the parameter grads which have been checked in + """ + self.params_checked_in = 0 + self.sent = False + + @property + def all_checked_in(self): + """ Judge all the expected gradient check-in happened """ + return len(self._params) == self.params_checked_in + + def can_add_grad_view(self, param, align): + """ Is there enough InternalStorage to add this parameter gradient, and whether this param have already checked in. 
+ """ + return self._fill + param._numel() + align <= self._max_size and id( + param) not in self._param_ids + + def to(self, device, dtype=None, keep_alignment=True): + """ + Move the underlying buffer + """ + if self._release: + self.rebuild() + + super().to(device, dtype) + + if keep_alignment: + self._array_grads() + + @paddle.autograd.no_grad() + def add_grad(self, param, align): + """ + Add a new parameter gradient to the InternalStorage. Param.grad becomes a view of this InternalStorage buffer. + """ + + assert id( + param + ) not in self._param_ids, "The same gradients cannot be checked in twice" + + self._add_grad_as_view(param, align) + self._params.append(param) + self._param_ids.append(id(param)) + + @paddle.autograd.no_grad() + def manumal_relase(self): + """ + Release the buffer from InternalStorage. The InternalStorage will need to be rebuilt before use. + """ + if not self._release: + for p in self._params: + if p.grad is not None: + p.clear_gradient(False) + + self.buffer = None + self._fill = 0 + self.params_checked_in = 0 + self._release = True + + @paddle.autograd.no_grad() + def rebuild(self): + """ + Given the parameter gradients which have been registered previously, rebuild the whole InternalStorage. + """ + + if self._release: + self.buffer = paddle.zeros([self._max_size], dtype=self._dtype) + + for p in self._params: + self._add_grad_as_view(p, self._parm2align[p.name]) + + self._release = False + + @paddle.autograd.no_grad() + def _array_grads(self): + """ + Given the parameters gradients which have been registered previously, rebuild the whole InternalStorage. + """ + if len(self._params) > 0: + self._fill = 0 + for p in self._params: + self._add_grad_as_view(p, self._parm2align[p.name]) + + @paddle.autograd.no_grad() + def _add_grad_as_view(self, param, align): + assert param._numel( + ) > 0, "Cannot add a gradient to a released InternalStorage, please rebuild" + assert param.dtype == self.buffer.dtype + + grad_end = self._fill + param._numel() + offset = grad_end + align + assert offset <= self.buffer._numel() + + # Copy the current grad value to InternalStorage + with device_guard(self.dev_id, self._device): + tmp_var = self.buffer._slice(self._fill, grad_end) + tmp_var.get_tensor()._set_dims(param.shape) + param._copy_gradient_from(tmp_var) + del tmp_var + + self._fill = offset diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py new file mode 100644 index 0000000000000..eae8f87b01420 --- /dev/null +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py @@ -0,0 +1,227 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import contextlib +from enum import Enum +import numpy as np +from types import MethodType + +import paddle +from paddle import _C_ops +from paddle.fluid import core +from paddle.fluid import layers +from paddle.fluid.dygraph import to_variable +from paddle.fluid.framework import dygraph_only + + +class Taskflow: + """ + Task flows, one way linked list for task acquisition. + """ + + def __init__(self, task, callback): + self.task = task + self.callback = callback + + +class Type(Enum): + """ + Type of trainable parameters + """ + fp16 = paddle.float16 + fp32 = paddle.float32 + + +class GroupShardedClipGrad: + def __init__(self, clip, device, group): + self._clip = clip + self._device = device + self._group = group + + @paddle.autograd.no_grad() + def _dygraph_clip(self, params_grads): + sum_square_fp32, sum_square_fp16 = [], [] + unslice_params_fp32, unslice_params_fp16 = [], [] + + for p, g in params_grads: + p_slice = True # using for slice parameter in sharding stage3 + if g is None or getattr(p, 'need_clip', True) is False: + continue + if hasattr(p, "unslice"): + p_slice = False + + merge_grad = g + if g.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = layers.get_tensor_from_selected_rows( + layers.merge_selected_rows(g)) + square = layers.square(merge_grad) + sum_square = layers.reduce_sum(square) + + if p.dtype == paddle.float16: + if p_slice: sum_square_fp16.append(sum_square) + else: unslice_params_fp16.append(sum_square) + elif p.dtype == paddle.float32: + if p_slice: sum_square_fp32.append(sum_square) + else: unslice_params_fp32.append(sum_square) + + # global norm of non-distributed FP16 params_and_grads + if len(sum_square_fp16) == 0: + global_norm_fp16 = paddle.to_tensor([0.], dtype=paddle.float32) + else: + global_norm_fp16 = layers.concat(sum_square_fp16) + global_norm_fp16 = layers.reduce_sum(global_norm_fp16) + global_norm_fp16 = paddle.cast( + global_norm_fp16, dtype=paddle.float32) + + # global norm of non-distributed FP16 params_and_grads for unslice parameters + if len(unslice_params_fp16) == 0: + global_unslice_fp16 = paddle.to_tensor([0.], dtype=paddle.float32) + else: + global_unslice_fp16 = layers.concat(unslice_params_fp16) + global_unslice_fp16 = layers.reduce_sum(global_unslice_fp16) + global_unslice_fp16 = paddle.cast( + global_unslice_fp16, dtype=paddle.float32) + + # global norm of non-distributed FP32 params_and_grads + global_norm_fp32 = layers.concat(sum_square_fp32) if len( + sum_square_fp32) != 0 else paddle.to_tensor( + [0.], dtype=paddle.float32) + global_norm_fp32 = layers.reduce_sum(global_norm_fp32) + + # global norm of non-distributed FP32 params_and_grads for unslice parameters + global_unslice_fp32 = layers.concat(unslice_params_fp32) if len( + unslice_params_fp32) != 0 else paddle.to_tensor( + [0.], dtype=paddle.float32) + global_unslice_fp32 = layers.reduce_sum(global_unslice_fp32) + global_unslice_var = global_unslice_fp16 + global_unslice_fp32 + + global_norm_var = global_norm_fp16 + global_norm_fp32 + 1.0 / self._group.nranks * global_unslice_var + + # add all reduce to get global norm of distributed params_and_grads + dev_id = int(self._device.split(":")[1]) + if paddle.device.get_device() == "cpu": + global_norm_var = global_norm_var.cuda(dev_id) + + with device_guard(dev_id, "gpu"): + paddle.distributed.all_reduce(global_norm_var, group=self._group) + + global_norm_var = layers.sqrt(global_norm_var) + max_global_norm = layers.fill_constant( + shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) + + 
clip_var = layers.elementwise_div( + x=max_global_norm, + y=layers.elementwise_max( + x=global_norm_var, y=max_global_norm)) + clip_var_fp16 = paddle.cast(clip_var, paddle.float16) + + for p, g in params_grads: + if getattr(p, 'need_clip', True) is False or g is None: + continue + origin_state = g.stop_gradient + g.stop_gradient = True + if p.dtype == paddle.float16: + g.scale_(clip_var_fp16.item()) + else: + g.scale_(clip_var.item()) + g.stop_gradient = origin_state + # p._reset_grad_inplace_version(True) + + return params_grads + + def __getattr__(self, item): + return getattr(self._clip, item) + + def __call__(self, params_grads): + return self._dygraph_clip(params_grads) + + +@contextlib.contextmanager +def device_guard(dev_id=0, device="cpu"): + origin_device = paddle.device.get_device() + if device == "cpu": + paddle.set_device(device) + elif device == "gpu": + paddle.set_device("gpu:{}".format(dev_id)) + try: + yield + finally: + paddle.set_device(origin_device) + + +@dygraph_only +def GroupShardedScaler(scaler): + def unscale_method(self, optimizer): + if not self._enable: + return + param_grads = [] + param_grads_fp16 = [] + param_grads_fp32 = [] + if hasattr(optimizer, "update_slice"): + optimizer.update_slice() + optimizer.update_scaler = True + + if getattr(optimizer._optim, '_param_groups', None) and isinstance( + optimizer._optim._param_groups[0], dict): + + for group in optimizer._optim._param_groups: + for param in group['params']: + if param.grad is not None: + param_grads.append(param.grad) + if param.grad.dtype in [ + core.VarDesc.VarType.FP16, paddle.float16 + ]: + param_grads_fp16.append(param.grad) + else: + param_grads_fp32.append(param.grad) + else: + for param in optimizer._optim._parameter_list: + if param.grad is not None: + param_grads.append(param.grad) + if param.grad.dtype in [ + core.VarDesc.VarType.FP16, paddle.float16 + ]: + param_grads_fp16.append(param.grad) + else: + param_grads_fp32.append(param.grad) + + temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool)) + temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool)) + + device = "cpu" if optimizer.offload else "gpu" + dev_id = 0 if device == "cpu" else int(paddle.get_device().split(":")[ + 1]) + + with device_guard(dev_id, device): + if len(param_grads_fp16): + _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, + param_grads_fp16, + temp_found_inf_fp16) + if len(param_grads_fp32): + _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, + param_grads_fp32, + temp_found_inf_fp32) + + self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0 + is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32") + + paddle.distributed.all_reduce( + is_found_inf, + op=paddle.distributed.ReduceOp.MAX, + group=optimizer._group) + self._found_inf = is_found_inf.numpy()[0] + + scaler._unscale = MethodType(unscale_method, scaler) + return scaler diff --git a/python/paddle/distributed/sharding/group_sharded.py b/python/paddle/distributed/sharding/group_sharded.py index 6fd4caa7b4a5c..4c22028b2304c 100644 --- a/python/paddle/distributed/sharding/group_sharded.py +++ b/python/paddle/distributed/sharding/group_sharded.py @@ -20,11 +20,20 @@ from paddle.optimizer import Optimizer from paddle.distributed.utils import get_logger +from paddle.fluid.framework import in_dygraph_mode + +# Old version from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 from 
paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import ShardingStage3 from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler +# New version +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2 +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import GroupShardedStage2 +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import GroupShardedStage3 +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import GroupShardedScaler + logger_ = get_logger(logging.INFO) @@ -110,30 +119,56 @@ def check_dtype(param): logger_.info("*" * 30) logger_.info("Sharded level os uses sharded level os_g achieved now.") logger_.info("*" * 30) - optimizer = ShardingOptimizerStage2( - params=model.parameters(), - optim=optimizer, - group=group, - offload=offload) - model = ShardingStage2( - model, - optimizer, - group=group, - sync_buffers=sync_buffers, - buffer_max_size=buffer_max_size) + if in_dygraph_mode(): + optimizer = GroupShardedOptimizerStage2( + params=optimizer._parameter_list, + optim=optimizer, + group=group, + offload=offload) + model = GroupShardedStage2( + model, + optimizer, + group=group, + sync_buffers=sync_buffers, + buffer_max_size=buffer_max_size) + else: + optimizer = ShardingOptimizerStage2( + params=model.parameters(), + optim=optimizer, + group=group, + offload=offload) + model = ShardingStage2( + model, + optimizer, + group=group, + sync_buffers=sync_buffers, + buffer_max_size=buffer_max_size) elif level == 'p_g_os': - model = ShardingStage3( - model, - optimizer=optimizer, - group=group, - sync_buffers=sync_buffers, - segment_size=segment_size, - offload=offload, - sync_comm=sync_comm) + if in_dygraph_mode(): + model = GroupShardedStage3( + model, + optimizer=optimizer, + group=group, + sync_buffers=sync_buffers, + segment_size=segment_size, + offload=offload, + sync_comm=sync_comm) + else: + model = ShardingStage3( + model, + optimizer=optimizer, + group=group, + sync_buffers=sync_buffers, + segment_size=segment_size, + offload=offload, + sync_comm=sync_comm) else: raise ValueError("Please enter the correct level.") if params_fp16 and isinstance(scaler, paddle.amp.GradScaler): - scaler = ShardingScaler(scaler) + if in_dygraph_mode(): + scaler = GroupShardedScaler(scaler) + else: + scaler = ShardingScaler(scaler) logger_.info("*" * 30) logger_.info( "If there is a communication hang using group sharded, please check whether the communication operations of each process are unified." 
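With the dispatch above, eager-mode (dygraph) runs now receive the new GroupSharded* implementations while the legacy path keeps the old Sharding* classes. The sketch below mirrors how the unit tests later in this PR drive the public entry points; the model, optimizer, and output path are illustrative, and the script is assumed to be launched on multiple GPUs (for example via paddle.distributed.launch).

import paddle
from paddle.distributed.sharding import group_sharded_parallel, save_group_sharded_model

paddle.distributed.init_parallel_env()

model = paddle.nn.Linear(1000, 10)   # any dygraph Layer
optimizer = paddle.optimizer.AdamW(
    learning_rate=0.001, parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=32768)

# level="os_g": shard optimizer states + gradients (stage 2)
# level="p_g_os": shard parameters + gradients + optimizer states (stage 3)
model, optimizer, scaler = group_sharded_parallel(
    model=model, optimizer=optimizer, level="os_g", scaler=scaler)

# ... usual training loop: scaler.scale(loss).backward(); scaler.step(optimizer) ...

save_group_sharded_model(model, output="./ckpt_dir", optimizer=optimizer)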
@@ -195,9 +230,9 @@ def save_group_sharded_model(model, output, optimizer=None): ), "Saving directory ({}) should be a directory, not a file".format(output) os.makedirs(output, exist_ok=True) output_model = os.path.join(output, "model.pdmodel") - if isinstance(model, ShardingStage2): + if isinstance(model, (ShardingStage2, GroupShardedStage2)): paddle.save(model._layer.state_dict(), output_model) - elif isinstance(model, ShardingStage3): + elif isinstance(model, (ShardingStage3, GroupShardedStage3)): convert2cpu = True if model._offload else False model.get_all_parameters(convert2cpu=convert2cpu) paddle.save(model._layer.state_dict(), output_model) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 9bf245ff388b4..b2441e90fc9fb 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -819,6 +819,10 @@ def _slice(self, begin_idx, end_idx): def _numel(self): return self.get_tensor()._numel() + @framework.dygraph_only + def _clear_data(self): + self.get_tensor()._clear() + @framework.dygraph_only def _uva(self, device_id=0): ''' @@ -934,6 +938,7 @@ def to_sparse_coo(self, sparse_dim): setattr(core.eager.Tensor, "_slice", _slice) setattr(core.eager.Tensor, "_numel", _numel) setattr(core.eager.Tensor, "_uva", _uva) + setattr(core.eager.Tensor, "_clear_data", _clear_data) else: setattr(core.VarBase, "__name__", "Tensor") setattr(core.VarBase, "grad", grad) diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index b3baedc401504..ba5e51c11dd65 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -331,22 +331,56 @@ def __call__(self, var, block=None): ["uint16", "float16", "float32", "float64"], "guassian_random") + # to be compatible of fp16 initalizers + if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate(".".join( + ['normal_init', var.name, 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + if self._seed == 0: self._seed = block.program.random_seed - if framework._non_static_mode(): + if in_dygraph_mode(): + place = _current_expected_place() + out_var = _C_ops.final_state_gaussian_random( + var.shape, self._mean, self._std_dev, self._seed, out_dtype, + place) + out_var._share_underline_tensor_to(var) + + if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: + var_tmp = _C_ops.final_state_cast(out_var, var.dtype) + var_tmp._share_underline_tensor_to(var) + else: + out_var._share_underline_tensor_to(var) + return None + + if _in_legacy_dygraph(): out_var = _C_ops.gaussian_random( - 'shape', var.shape, 'dtype', var.dtype, 'mean', self._mean, + 'shape', var.shape, 'dtype', out_dtype, 'mean', self._mean, 'std', self._std_dev, 'seed', self._seed, 'use_mkldnn', False) - out_var._share_underline_tensor_to(var) + + if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: + var_tmp = _C_ops.cast(out_var, 'in_dtype', out_var.dtype, + 'out_dtype', var.dtype) + var_tmp._share_underline_tensor_to(var) + else: + out_var._share_underline_tensor_to(var) return None else: op = block.append_op( type="gaussian_random", - outputs={"Out": var}, + outputs={"Out": out_var}, attrs={ "shape": var.shape, - "dtype": var.dtype, + "dtype": out_dtype, "mean": self._mean, "std": self._std_dev, 
"seed": self._seed, @@ -354,6 +388,13 @@ def __call__(self, var, block=None): }, stop_gradient=True) + if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) var.op = op return op @@ -567,9 +608,15 @@ def __call__(self, var, block=None): self._seed, 'dtype', out_dtype) else: std = math.sqrt(2.0 / float(fan_in + fan_out)) - out_var = _C_ops.gaussian_random( - 'shape', out_var.shape, 'dtype', out_dtype, 'mean', 0.0, - 'std', std, 'seed', self._seed) + + if in_dygraph_mode(): + place = _current_expected_place() + out_var = _C_ops.final_state_gaussian_random( + out_var.shape, 0.0, std, self._seed, out_dtype, place) + else: + out_var = _C_ops.gaussian_random( + 'shape', out_var.shape, 'dtype', out_dtype, 'mean', 0.0, + 'std', std, 'seed', self._seed) if var.dtype == VarDesc.VarType.FP16 or ( var.dtype == VarDesc.VarType.BF16 and not self._uniform): @@ -720,9 +767,15 @@ def __call__(self, var, block=None): int(out_dtype)) else: std = math.sqrt(2.0 / float(fan_in)) - out_var = _C_ops.gaussian_random( - 'shape', out_var.shape, 'dtype', - int(out_dtype), 'mean', 0.0, 'std', std, 'seed', self._seed) + if in_dygraph_mode(): + place = _current_expected_place() + out_var = _C_ops.final_state_gaussian_random( + out_var.shape, 0.0, std, self._seed, out_dtype, place) + else: + out_var = _C_ops.gaussian_random( + 'shape', out_var.shape, 'dtype', + int(out_dtype), 'mean', 0.0, 'std', std, 'seed', + self._seed) if var.dtype == VarDesc.VarType.FP16 or ( var.dtype == VarDesc.VarType.BF16 and not self._uniform): diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index a405bf829fd48..47f40a2e6a5af 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -28,6 +28,7 @@ from paddle.fluid.framework import _in_legacy_dygraph from ..initializer import Normal, Constant, NumpyArrayInitializer from ..framework import Variable, OpProtoHolder, _non_static_mode, dygraph_only, _dygraph_tracer, default_main_program, _varbase_creator, static_only, _global_flags, _in_legacy_dygraph, in_dygraph_mode +from ..framework import _current_expected_place from .. 
import dygraph_utils from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ @@ -10970,7 +10971,15 @@ def gaussian_random(shape, if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - if _non_static_mode(): + if in_dygraph_mode(): + shape = utils.convert_shape_to_list(shape) + place = _current_expected_place() + return _C_ops.final_state_gaussian_random(shape, + float(mean), + float(std), seed, dtype, + place) + + if _in_legacy_dygraph(): shape = utils.convert_shape_to_list(shape) return _C_ops.gaussian_random('shape', shape, 'mean', float(mean), 'std', diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index d0126013dcf82..f7f88ab76f227 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -968,7 +968,7 @@ set_tests_properties(test_lstm_op PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_star_gan_with_gradient_penalty PROPERTIES TIMEOUT 120) set_tests_properties(test_bicubic_interp_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_deformable_conv_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_deformable_conv_op PROPERTIES TIMEOUT 200) set_tests_properties(test_nearest_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_profiler PROPERTIES TIMEOUT 120) set_tests_properties(test_inplace_softmax_with_cross_entropy PROPERTIES TIMEOUT 120) @@ -1045,7 +1045,7 @@ set_tests_properties(test_distributed_fused_lamb_op_with_clip PROPERTIES TIMEOUT set_tests_properties(test_distributed_fused_lamb_op_without_clip PROPERTIES TIMEOUT 120) set_tests_properties(test_elementwise_min_op PROPERTIES TIMEOUT 120) set_tests_properties(test_nan_inf PROPERTIES TIMEOUT 120) -set_tests_properties(test_deformable_conv_v1_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_deformable_conv_v1_op PROPERTIES TIMEOUT 300) set_tests_properties(test_parallel_executor_transformer_auto_growth PROPERTIES TIMEOUT 120) set_tests_properties(test_py_reader_using_executor PROPERTIES TIMEOUT 120) set_tests_properties(test_elementwise_add_op PROPERTIES TIMEOUT 120) @@ -1142,7 +1142,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 120) - set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 120) + set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 200) set_tests_properties(test_dygraph_group_sharded_api PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py index d4832782c329a..574a222ba18c9 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py @@ -22,6 +22,7 @@ from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.sharding import group_sharded_parallel, save_group_sharded_model epoch = 10 @@ -144,4 +145,6 @@ 
def test_sharding_api(): if __name__ == '__main__': + with _test_eager_guard(): + pass test_sharding_api() diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api_eager.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api_eager.py new file mode 100644 index 0000000000000..85a5446cb6447 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api_eager.py @@ -0,0 +1,147 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import shutil +import tempfile +import numpy as np + +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.distributed import fleet +from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard +from paddle.distributed.sharding import group_sharded_parallel, save_group_sharded_model + +epoch = 10 +paddle.seed(2022) +np.random.seed(2022) +base_lr = 0.1 +momentum_rate = 0.9 +l2_decay = 1e-4 +batch_size = 100 + + +class MLP(fluid.Layer): + def __init__(self, linear_size=1000, param_attr=None, bias_attr=None): + super(MLP, self).__init__() + + self._linear1 = Linear(linear_size, linear_size) + self._linear2 = Linear(linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) + + def forward(self, inputs): + y = self._linear1(inputs) + y = self._linear2(y) + y = self._linear3(y) + return y + + +def reader_decorator(linear_size=1000): + def __reader__(): + for _ in range(100): + img = np.random.rand(linear_size).astype('float32') + label = np.ones(1).astype('int64') + yield img, label + + return __reader__ + + +def optimizer_setting(model, use_pure_fp16, opt_group=False): + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + optimizer = paddle.optimizer.Momentum( + parameters=[{ + "params": list(model.parameters()) + }] if opt_group else list(model.parameters()), + learning_rate=0.001, + weight_decay=0.00001, + grad_clip=clip, + multi_precision=use_pure_fp16) + + return optimizer + + +def train_mlp(model, shard_level, use_pure_fp16, output_dir): + optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16) + model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32') + scaler = paddle.amp.GradScaler(init_loss_scaling=32768) + + model, optimizer, scaler = group_sharded_parallel( + model=model, optimizer=optimizer, level=shard_level, scaler=scaler) + + train_reader = paddle.batch( + reader_decorator(), batch_size=batch_size, drop_last=True) + + train_loader = paddle.io.DataLoader.from_generator( + capacity=32, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader) + + for eop in range(epoch): + model.train() + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + img.stop_gradient = True + with paddle.amp.auto_cast(True, level='O2'): + out = model(img) + loss = paddle.nn.functional.cross_entropy( + input=out, 
label=label) + avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + + if not use_pure_fp16: + avg_loss.backward() + optimizer.step() + else: + scaler.scale(avg_loss).backward() + scaler.step(optimizer) + scaler.update() + + optimizer.clear_grad() + + save_group_sharded_model(model, output=output_dir, optimizer=optimizer) + return model.parameters() + + +def test_sharding_api(): + paddle.distributed.init_parallel_env() + mlp, mlp1, mlp2 = MLP(), MLP(), MLP() + state_dict = mlp.state_dict() + mlp1.set_state_dict(state_dict) + mlp2.set_state_dict(state_dict) + + output_dir = tempfile.mkdtemp() + + # fp16 + stage2_params = train_mlp( + mlp1, shard_level="os_g", use_pure_fp16=True, output_dir=output_dir) + stage3_params = train_mlp( + mlp2, shard_level="p_g_os", use_pure_fp16=True, output_dir=output_dir) + + for i in range(len(stage3_params)): + np.testing.assert_allclose( + stage2_params[i].numpy(), + stage3_params[i].numpy(), + rtol=1e-4, + atol=1e-3) + shutil.rmtree(output_dir) + + +if __name__ == '__main__': + with _test_eager_guard(): + test_sharding_api() diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py new file mode 100644 index 0000000000000..8c07734d513c4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py @@ -0,0 +1,229 @@ +# -*- coding: UTF-8 -*- + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import shutil +import numpy as np +import argparse +import tempfile +import ast +import time +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.distributed import fleet +from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard + +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2 +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import GroupShardedStage2 + +seed = 2022 +epoch = 2 +linear_size = 1000 + +np.random.seed(seed) +paddle.seed(seed) + + +class MLP(fluid.Layer): + def __init__(self, linear_size=1000, param_attr=None, bias_attr=None): + super(MLP, self).__init__() + + self._linear1 = Linear(linear_size, linear_size) + self._linear2 = Linear(linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) + + def forward(self, inputs): + y = self._linear1(inputs) + y = self._linear2(y) + y = self._linear3(y) + return y + + +def reader_decorator(linear_size=1000): + def __reader__(): + for _ in range(100): + img = np.random.rand(linear_size).astype('float32') + label = np.ones(1).astype('int64') + yield img, label + + return __reader__ + + +def optimizer_setting(model, use_pure_fp16, opt_group=False): + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + optimizer = paddle.optimizer.AdamW( + parameters=[{ + "params": model.parameters(), + }] if opt_group else model.parameters(), + learning_rate=0.001, + weight_decay=0.00001, + grad_clip=clip, + multi_precision=use_pure_fp16) + + return optimizer + + +def train_mlp(model, + sharding_stage, + batch_size=100, + use_pure_fp16=False, + accumulate_grad=False, + opt_group=False, + save_model=False, + test_minimize=False): + if sharding_stage != "dp": + group = paddle.distributed.new_group([0, 1], backend="nccl") + if opt_group: + optimizer = optimizer_setting( + model=model, use_pure_fp16=use_pure_fp16, opt_group=opt_group) + else: + optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16) + + if sharding_stage == 2: + optimizer = GroupShardedOptimizerStage2( + params=optimizer._parameter_list, optim=optimizer, group=group) + + model = GroupShardedStage2( + model, optimizer, group=group, buffer_max_size=2**21) + else: + model = paddle.DataParallel(model) + + # check optimizer.minimize() error + if test_minimize: + try: + optimizer.minimize() + except: + print( + "====== Find sharding_stage2_optimizer.minimize() error ======") + return + + train_reader = paddle.batch( + reader_decorator(), batch_size=batch_size, drop_last=True) + + train_loader = paddle.io.DataLoader.from_generator( + capacity=32, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader) + + if sharding_stage == 2: + model.to(device="gpu") + + for eop in range(epoch): + model.train() + + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + img.stop_gradient = True + + out = model(img) + loss = paddle.nn.functional.cross_entropy(input=out, label=label) + + avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + if batch_size == 20: + avg_loss = avg_loss / 5 + avg_loss.backward() + + if not accumulate_grad: + optimizer.step() + optimizer.clear_grad() + + if accumulate_grad: + optimizer.step() + optimizer.clear_grad() + + if save_model: + return model, optimizer + return model.parameters() + + +def test_dp_stage2(): + 
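+    # This test exercises the GroupSharded stage-2 path end to end: pure-DP vs.
+    # stage-2 parameter parity, gradient accumulation, parameter groups vs. a
+    # flat parameter list, state_dict save/load, and the expected failure of
+    # optimizer.minimize().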
paddle.distributed.init_parallel_env() + mlp = MLP() + state_dict = mlp.state_dict() + mlp1 = MLP() + mlp2 = MLP() + mlp3 = MLP() + mlp4 = MLP() + mlp5 = MLP() + mlp6 = MLP() + mlp7 = MLP() + mlp1.set_state_dict(state_dict) + mlp2.set_state_dict(state_dict) + mlp3.set_state_dict(state_dict) + mlp4.set_state_dict(state_dict) + mlp5.set_state_dict(state_dict) + mlp6.set_state_dict(state_dict) + mlp7.set_state_dict(state_dict) + + # DP VS stage2 + dp_params = train_mlp( + mlp1, sharding_stage="dp", use_pure_fp16=False, opt_group=False) + stage2_params = train_mlp( + mlp2, sharding_stage=2, use_pure_fp16=False, opt_group=False) + for i in range(len(dp_params)): + np.testing.assert_allclose( + dp_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6) + + # stage2 accumulate grad + stage2_params = train_mlp(mlp3, sharding_stage=2, accumulate_grad=True) + stage2_accumulate_grad = train_mlp( + mlp4, sharding_stage=2, batch_size=20, accumulate_grad=True) + for i in range(len(stage2_params)): + np.testing.assert_allclose( + stage2_params[i].numpy(), + stage2_accumulate_grad[i].numpy(), + rtol=1e-5, + atol=1e-5) + + # stage2 param list VS param group + stage2_params = train_mlp( + mlp5, sharding_stage=2, use_pure_fp16=False, opt_group=True) + for i in range(len(dp_params)): + np.testing.assert_allclose( + dp_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6) + + # save/load model + output_dir = tempfile.mkdtemp() + model_file = os.path.join(output_dir, "model.pdmodel") + optimizer_file = os.path.join(output_dir, "model.pdopt") + model_stage2, optimizer_stage2 = train_mlp( + mlp6, + sharding_stage=2, + use_pure_fp16=False, + opt_group=False, + save_model=True) + paddle.save(model_stage2.state_dict(), model_file) + paddle.save(optimizer_stage2.state_dict(), optimizer_file) + m_state_dict = paddle.load(model_file) + opt_state_dict = paddle.load(optimizer_file) + model_stage2.set_state_dict(m_state_dict) + optimizer_stage2.set_state_dict(opt_state_dict) + shutil.rmtree(output_dir) + + # check optimizer.minimize() error + train_mlp(mlp7, sharding_stage=2, test_minimize=True) + return + + +if __name__ == '__main__': + with _test_eager_guard(): + test_dp_stage2() diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py new file mode 100644 index 0000000000000..b09314ae9e31c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py @@ -0,0 +1,112 @@ +# -*- coding: UTF-8 -*- + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
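The offload variant tested below keeps optimizer-side storage on the CPU; internally the implementation relies on the device_guard context manager defined in group_sharded_utils.py to switch devices temporarily and always restore the original one. A minimal sketch of that helper's behaviour; the tensor and device id here are illustrative and assume a CUDA build.

import paddle
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import device_guard

paddle.set_device("gpu:0")               # illustrative; assumes a CUDA build
with device_guard(dev_id=0, device="cpu"):
    cpu_tensor = paddle.zeros([4])       # created on the CPU inside the guard
print(paddle.device.get_device())        # restored to "gpu:0" afterwards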
+ +import numpy as np +import argparse +import ast +import time +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.distributed import fleet +from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard + +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2 +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import GroupShardedStage2 +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import GroupShardedScaler + +from dygraph_group_sharded_stage2 import MLP, reader_decorator, optimizer_setting + +seed = 2021 +epoch = 2 +batch_size = 32 +linear_size = 1000 + +np.random.seed(seed) +paddle.seed(seed) + + +def train_mlp(model, offload=False): + optimizer = optimizer_setting(model=model, use_pure_fp16=True) + + model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + scaler = GroupShardedScaler(scaler) + + optimizer = GroupShardedOptimizerStage2( + params=optimizer._parameter_list, optim=optimizer, offload=offload) + model = GroupShardedStage2(model, optimizer, buffer_max_size=2**21) + + train_reader = paddle.batch( + reader_decorator(linear_size), batch_size=batch_size, drop_last=True) + + train_loader = paddle.io.DataLoader.from_generator( + capacity=32, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader) + + for eop in range(epoch): + model.train() + + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + img.stop_gradient = True + + with paddle.amp.auto_cast(True, level='O2'): + out = model(img) + loss = paddle.nn.functional.cross_entropy( + input=out, label=label) + + avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + scaler.scale(avg_loss).backward() + + scaler.step(optimizer) + scaler.update() + optimizer.clear_grad() + + for dtype in optimizer.param_storages: + for dst_rank, param_storage in optimizer.param_storages[dtype].items(): + param_storage.to(device="gpu", dtype=dtype) + + return model.parameters() + + +def test_sharding_stage2_offload(): + paddle.distributed.init_parallel_env() + mlp = MLP(linear_size) + mlp_offload = MLP(linear_size) + mlp_offload.set_state_dict(mlp.state_dict()) + + mlp_params = train_mlp(mlp, offload=False) + mlp_offload_params = train_mlp(mlp_offload, offload=True) + + for i in range(len(mlp_params)): + np.testing.assert_allclose( + mlp_params[i].numpy(), + mlp_offload_params[i].numpy(), + rtol=5e-3, + atol=5e-3) + return + + +if __name__ == '__main__': + with _test_eager_guard(): + test_sharding_stage2_offload() diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3.py new file mode 100644 index 0000000000000..6c350e63f444c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3.py @@ -0,0 +1,283 @@ +# -*- coding: UTF-8 -*- + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import tempfile +import numpy as np +import argparse +import ast +import time +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.distributed import fleet +from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard + +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2 +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import GroupShardedStage2 +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import GroupShardedStage3 +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import GroupShardedScaler + +epoch = 10 +paddle.seed(2022) +np.random.seed(2022) +base_lr = 0.1 +momentum_rate = 0.9 +l2_decay = 1e-4 + + +class MLP(fluid.Layer): + def __init__(self, linear_size=1000, param_attr=None, bias_attr=None): + super(MLP, self).__init__() + + self._linear1 = Linear(linear_size, linear_size) + self._linear2 = Linear(linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) + + def forward(self, inputs): + y = self._linear1(inputs) + y = self._linear2(y) + y = self._linear3(y) + return y + + +def reader_decorator(linear_size=1000): + def __reader__(): + for _ in range(100): + img = np.random.rand(linear_size).astype('float32') + label = np.ones(1).astype('int64') + yield img, label + + return __reader__ + + +def optimizer_setting(model, use_pure_fp16, opt_group=False): + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + optimizer = paddle.optimizer.Momentum( + parameters=[{ + "params": list(model.parameters()) + }] if opt_group else list(model.parameters()), + learning_rate=0.001, + weight_decay=0.00001, + grad_clip=clip, + multi_precision=use_pure_fp16) + + return optimizer + + +def train_mlp(model, + sharding_stage, + use_pure_fp16=False, + accumulate_grad=False, + batch_size=100, + opt_group=False, + sync_comm=False, + test_minimize=False, + save_model=False): + group = paddle.distributed.new_group([0, 1]) + if opt_group: + optimizer = optimizer_setting( + model=model, use_pure_fp16=use_pure_fp16, opt_group=opt_group) + else: + optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16) + + if use_pure_fp16: + model = paddle.amp.decorate( + models=model, level='O2', save_dtype='float32') + scaler = paddle.amp.GradScaler(init_loss_scaling=32768) + scaler = GroupShardedScaler(scaler) + if sharding_stage == 2: + optimizer = GroupShardedOptimizerStage2( + params=optimizer._parameter_list, optim=optimizer, group=group) + model = GroupShardedStage2( + model, optimizer, group=group, buffer_max_size=2**21) + elif sharding_stage == 3: + model = GroupShardedStage3( + model, + optimizer=optimizer, + group=group, + sync_comm=sync_comm, + segment_size=2**15) + + # check optimizer.minimize() error + if test_minimize: + try: + optimizer.minimize() + except: + print( + "====== Find sharding_stage3_optimizer.minimize() error ======") + return + + train_reader = paddle.batch( + reader_decorator(), batch_size=batch_size, drop_last=True) 
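+    # feed the synthetic reader through a generator-backed DataLoader; the seeds
+    # set above are identical on every rank, so all ranks iterate the same batches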
+ + train_loader = paddle.io.DataLoader.from_generator( + capacity=32, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader) + + for eop in range(epoch): + model.train() + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + img.stop_gradient = True + with paddle.amp.auto_cast(True, level='O2'): + out = model(img) + loss = paddle.nn.functional.cross_entropy( + input=out, label=label) + avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + + if batch_size == 20: + avg_loss = avg_loss / 5 + + if not use_pure_fp16: + avg_loss.backward() + else: + scaler.scale(avg_loss).backward() + + if not accumulate_grad: + if not use_pure_fp16: + optimizer.step() + else: + scaler.step(optimizer) + scaler.update() + optimizer.clear_grad() + if accumulate_grad: + if not use_pure_fp16: + optimizer.step() + else: + scaler.step(optimizer) + scaler.update() + optimizer.clear_grad() + if sharding_stage == 3: + model.get_all_parameters() + + if save_model: + return model, optimizer + return model.parameters() + + +def test_stage2_stage3(): + paddle.distributed.init_parallel_env() + mlp, mlp1, mlp2, mlp3, mlp4, mlp5, mlp6, mlp7, mlp8, mlp9, mlp10 = MLP( + ), MLP(), MLP(), MLP(), MLP(), MLP(), MLP(), MLP(), MLP(), MLP(), MLP() + state_dict = mlp.state_dict() + mlp1.set_state_dict(state_dict) + mlp2.set_state_dict(state_dict) + mlp3.set_state_dict(state_dict) + mlp4.set_state_dict(state_dict) + mlp5.set_state_dict(state_dict) + mlp6.set_state_dict(state_dict) + mlp7.set_state_dict(state_dict) + mlp8.set_state_dict(state_dict) + mlp9.set_state_dict(state_dict) + mlp10.set_state_dict(state_dict) + + # fp32 + stage2_params = train_mlp( + mlp1, sharding_stage=2, use_pure_fp16=False, opt_group=False) + stage3_params = train_mlp( + mlp2, sharding_stage=3, use_pure_fp16=False, opt_group=False) + + for i in range(len(stage2_params)): + np.testing.assert_allclose( + stage2_params[i].numpy(), + stage3_params[i].numpy(), + rtol=1e-6, + atol=1e-6) + + # fp32 accumulate grad + stage3_params = train_mlp( + mlp3, + sharding_stage=3, + use_pure_fp16=False, + accumulate_grad=True, + opt_group=True) + stage3_params_add = train_mlp( + mlp4, + sharding_stage=3, + use_pure_fp16=False, + accumulate_grad=True, + batch_size=20, + opt_group=True) + for i in range(len(stage3_params)): + np.testing.assert_allclose( + stage3_params[i].numpy(), + stage3_params_add[i].numpy(), + rtol=1e-6, + atol=1e-4) + + # fp16 + stage2_params = train_mlp( + mlp5, sharding_stage=2, use_pure_fp16=True, opt_group=False) + stage3_params = train_mlp( + mlp6, sharding_stage=3, use_pure_fp16=True, opt_group=False) + for i in range(len(stage2_params)): + np.testing.assert_allclose( + stage2_params[i].numpy(), + stage3_params[i].numpy(), + rtol=1e-4, + atol=1e-3) + + # fp16 sync_comm + stage3_params = train_mlp( + mlp7, sharding_stage=3, use_pure_fp16=True, opt_group=False) + stage3_params_re = train_mlp( + mlp8, + sharding_stage=3, + use_pure_fp16=True, + opt_group=False, + sync_comm=True) + for i in range(len(stage3_params)): + np.testing.assert_allclose( + stage3_params[i].numpy(), stage3_params_re[i].numpy(), rtol=1e-6) + + # save/load model + output_dir = tempfile.mkdtemp() + model_file = os.path.join(output_dir, "model.pdmodel") + optimizer_file = os.path.join(output_dir, "model.pdopt") + model_stage3, optimizer_stage3 = train_mlp( + mlp9, + sharding_stage=3, + use_pure_fp16=False, + opt_group=False, + save_model=True) + 
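+    # both the stage-3 wrapper and its optimizer expose state_dict/set_state_dict,
+    # so the standard paddle.save / paddle.load round-trip below applies unchanged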
paddle.save(model_stage3.state_dict(), model_file) + paddle.save(optimizer_stage3.state_dict(), optimizer_file) + m_state_dict = paddle.load(model_file) + opt_state_dict = paddle.load(optimizer_file) + model_stage3.set_state_dict(m_state_dict) + optimizer_stage3.set_state_dict(opt_state_dict) + shutil.rmtree(output_dir) + + # check optimizer.minimize() error + train_mlp( + mlp10, + sharding_stage=3, + use_pure_fp16=False, + opt_group=False, + test_minimize=True) + + +if __name__ == '__main__': + with _test_eager_guard(): + test_stage2_stage3() diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3_offload.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3_offload.py new file mode 100644 index 0000000000000..5f9ec5c6e708e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3_offload.py @@ -0,0 +1,205 @@ +# -*- coding: UTF-8 -*- + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import argparse +import ast +import time +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.distributed import fleet +from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard + +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import GroupShardedStage3 +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import GroupShardedScaler + +epoch = 10 +paddle.seed(2022) +np.random.seed(2022) +base_lr = 0.1 +momentum_rate = 0.9 +l2_decay = 1e-4 + + +class MLP(fluid.Layer): + def __init__(self, linear_size=1000, param_attr=None, bias_attr=None): + super(MLP, self).__init__() + + self._linear1 = Linear(linear_size, linear_size) + self._linear2 = Linear(linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) + + def forward(self, inputs): + y = self._linear1(inputs) + y = self._linear2(y) + y = self._linear3(y) + return y + + +def reader_decorator(linear_size=1000): + def __reader__(): + for _ in range(100): + img = np.random.rand(linear_size).astype('float32') + label = np.ones(1).astype('int64') + yield img, label + + return __reader__ + + +def optimizer_setting(model, use_pure_fp16, opt_group=False): + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + optimizer = paddle.optimizer.AdamW( + parameters=[{ + "params": model.parameters() + }] if opt_group else model.parameters(), + learning_rate=0.001, + weight_decay=0.00001, + grad_clip=clip, + multi_precision=use_pure_fp16) + + return optimizer + + +def train_mlp(model, + use_pure_fp16=False, + accumulate_grad=False, + offload=False, + batch_size=100, + convert2cpu=False): + group = paddle.distributed.new_group([0, 1]) + optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16) + + if use_pure_fp16: + model = paddle.amp.decorate( + models=model, level='O2', save_dtype='float32') + scaler = paddle.amp.GradScaler(init_loss_scaling=32768) + scaler = 
GroupShardedScaler(scaler) + + model = GroupShardedStage3( + model, + optimizer=optimizer, + group=group, + offload=offload, + segment_size=2**15) + + train_reader = paddle.batch( + reader_decorator(), batch_size=batch_size, drop_last=True) + + train_loader = paddle.io.DataLoader.from_generator( + capacity=32, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader) + + for eop in range(epoch): + model.train() + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + img.stop_gradient = True + with paddle.amp.auto_cast(True, level='O2'): + out = model(img) + loss = paddle.nn.functional.cross_entropy( + input=out, label=label) + avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + + if accumulate_grad: + avg_loss = avg_loss / 5 + + if not use_pure_fp16: + avg_loss.backward() + else: + scaler.scale(avg_loss).backward() + + if not accumulate_grad: + if not use_pure_fp16: + optimizer.step() + else: + scaler.step(optimizer) + scaler.update() + optimizer.clear_grad() + if accumulate_grad: + if not use_pure_fp16: + optimizer.step() + else: + scaler.step(optimizer) + scaler.update() + optimizer.clear_grad() + if not convert2cpu: + model.get_all_parameters() + else: + model.get_all_parameters(convert2cpu) + return model.parameters() + + +def test_stage3_offload(): + paddle.distributed.init_parallel_env() + mlp, mlp1, mlp2, mlp3, mlp4, mlp5, mlp6 = MLP(), MLP(), MLP(), MLP(), MLP( + ), MLP(), MLP() + state_dict = mlp.state_dict() + mlp1.set_state_dict(state_dict) + mlp2.set_state_dict(state_dict) + mlp3.set_state_dict(state_dict) + mlp4.set_state_dict(state_dict) + mlp5.set_state_dict(state_dict) + mlp6.set_state_dict(state_dict) + + # fp32 offload + stage3_params = train_mlp(mlp1, use_pure_fp16=False) + stage3_params_offload = train_mlp(mlp2, use_pure_fp16=False, offload=True) + for i in range(len(stage3_params)): + np.testing.assert_allclose( + stage3_params[i].numpy(), + stage3_params_offload[i].numpy(), + rtol=1e-6, + atol=1e-8) + + # fp16 offload + stage3_params = train_mlp(mlp3, use_pure_fp16=True) + stage3_params_offload = train_mlp(mlp4, use_pure_fp16=True, offload=True) + for i in range(len(stage3_params)): + np.testing.assert_allclose( + stage3_params[i].numpy(), + stage3_params_offload[i].numpy(), + rtol=1e-2, + atol=1e-2) + + # fp32 accumulate grad offload + stage3_params = train_mlp( + mlp5, use_pure_fp16=False, batch_size=20, accumulate_grad=True) + stage3_params_offload = train_mlp( + mlp6, + use_pure_fp16=False, + accumulate_grad=True, + offload=True, + batch_size=20, + convert2cpu=True) + for i in range(len(stage3_params)): + np.testing.assert_allclose( + stage3_params[i].numpy(), + stage3_params_offload[i].numpy(), + rtol=1e-6, + atol=1e-8) + return + + +if __name__ == '__main__': + with _test_eager_guard(): + test_stage3_offload() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py index 705831d50f171..0ed9b681fdcf5 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py @@ -23,6 +23,7 @@ import paddle.fluid as fluid from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.utils.internal_storage import GradStorage from 
paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 @@ -138,4 +139,6 @@ def train_mlp(): if __name__ == '__main__': + with _test_eager_guard(): + pass train_mlp() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py index fb01fd46c0d28..82edd1c17a541 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py @@ -26,6 +26,7 @@ from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 @@ -222,4 +223,6 @@ def test_dp_stage2(): if __name__ == '__main__': + with _test_eager_guard(): + pass test_dp_stage2() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py index 39ba44815d940..a7b16bbb75977 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py @@ -23,6 +23,7 @@ from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 @@ -106,4 +107,6 @@ def test_sharding_stage2_offload(): if __name__ == '__main__': + with _test_eager_guard(): + pass test_sharding_stage2_offload() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py index 82821cd7ee644..cdb1de020f56e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py @@ -26,6 +26,7 @@ from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 @@ -274,4 +275,6 @@ def test_stage2_stage3(): if __name__ == '__main__': + with _test_eager_guard(): + pass test_stage2_stage3() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py index df7ba78d345a3..2cb327a29a3da 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py @@ -23,6 +23,7 @@ from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import ShardingStage3 from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler @@ -196,4 
+197,6 @@ def test_stage3_offload(): if __name__ == '__main__': + with _test_eager_guard(): + pass test_stage3_offload() diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index d05c9a3c313bb..d254cd286e666 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -21,6 +21,7 @@ from paddle.fluid.op import Operator import paddle.fluid as fluid import paddle +from paddle.fluid.framework import _test_eager_guard class TestAdamOp1(OpTest): @@ -189,6 +190,10 @@ def test_check_output(self): self.inputs['Grad'] = np.random.uniform( -1, 1, (102, 105)).astype("float32") + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_check_output() + def adam_step(inputs, attributes): ''' @@ -732,6 +737,14 @@ def test_adam_op_with_sparse_input_and_weight_decay(self): adam.step() paddle.enable_static() + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_adam_op_dygraph() + self.test_adam_op_with_state_dict() + self.test_adam_with_grad_clip() + self.test_adam_op_with_set_lr() + self.test_adam_op_with_sparse_input_and_weight_decay() + class TestAdamOptimizer(unittest.TestCase): def _test(self, diff --git a/python/paddle/fluid/tests/unittests/test_deform_conv2d.py b/python/paddle/fluid/tests/unittests/test_deform_conv2d.py index 508fc1705218a..f5f1479d07d2f 100644 --- a/python/paddle/fluid/tests/unittests/test_deform_conv2d.py +++ b/python/paddle/fluid/tests/unittests/test_deform_conv2d.py @@ -17,6 +17,7 @@ import paddle.nn.initializer as I import numpy as np import unittest +from paddle.fluid.framework import _test_eager_guard from unittest import TestCase @@ -183,6 +184,10 @@ def test_identity(self): self.place = paddle.CUDAPlace(0) self._test_identity() + def test_identity_with_eager_guard(self): + with _test_eager_guard(): + self.test_identity() + class TestDeformConv2DFunctional(TestCase): batch_size = 4 @@ -418,6 +423,10 @@ def test_identity(self): self.place = paddle.CUDAPlace(0) self._test_identity() + def test_identity_with_eager_guard(self): + with _test_eager_guard(): + self.test_identity() + # testcases for DeformConv2D class TestDeformConv2DWithPadding(TestDeformConv2D): diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py index 45a23231945ec..5fc849575b659 100644 --- a/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py @@ -14,13 +14,15 @@ from __future__ import print_function +import paddle import unittest import numpy as np - -import paddle import paddle.fluid.core as core import paddle.fluid as fluid from op_test import OpTest +from paddle.fluid.framework import _test_eager_guard + +paddle.enable_static() def dmc_bilinear(data_im, height, width, h, w): @@ -108,8 +110,24 @@ def dconv_im2col_gemm(input, offset, mask, filter, group, conv_param): return out +def deform_conv2d_wrapper(x, + offset, + weight, + mask=None, + stride=1, + padding=0, + dilation=1, + deformable_groups=1, + groups=1, + im2col_step=1): + return paddle.vision.ops.deform_conv2d(x, offset, weight, None, stride, + padding, dilation, deformable_groups, + groups, mask) + + class TestModulatedDeformableConvOp(OpTest): def setUp(self): + self.python_api = deform_conv2d_wrapper self.op_type = "deformable_conv" self.init_type() self.init_group() @@ -148,13 +166,14 @@ def setUp(self): 
self.outputs = {'Output': output} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): self.check_grad( {'Input', 'Offset', 'Mask', 'Filter'}, 'Output', - max_relative_error=0.05) + max_relative_error=0.05, + check_eager=True) def init_test_case(self): self.pad = [1, 1] @@ -327,6 +346,10 @@ def test_invalid_filter(): self.assertRaises(ValueError, test_invalid_filter) + def test_error_with_eager_guard(self): + with _test_eager_guard(): + self.test_error() + class TestDeformConv2DAPI(unittest.TestCase): def test_api(self): @@ -358,6 +381,10 @@ def test_deform_conv2d_v2(): test_deform_conv2d_v2() + def test_api_with_eager_guard(self): + with _test_eager_guard(): + self.test_api() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py index e8b18d601afae..304a151c4d3bf 100644 --- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py +++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py @@ -14,12 +14,13 @@ from __future__ import print_function +import paddle import unittest import numpy as np - -import paddle.fluid.core as core import paddle.fluid as fluid +import paddle.fluid.core as core from op_test import OpTest +from paddle.fluid.framework import _test_eager_guard def dmc_bilinear(data_im, height, width, h, w): @@ -105,8 +106,24 @@ def dconv_im2col_gemm(input, offset, filter, group, conv_param): return out +def deform_conv2d_wrapper(x, + offset, + weight, + mask=None, + stride=1, + padding=0, + dilation=1, + deformable_groups=1, + groups=1, + im2col_step=1): + return paddle.vision.ops.deform_conv2d(x, offset, weight, None, stride, + padding, dilation, deformable_groups, + groups, mask) + + class TestModulatedDeformableConvOp(OpTest): def setUp(self): + self.python_api = deform_conv2d_wrapper self.op_type = "deformable_conv_v1" self.init_type() self.init_group() @@ -142,18 +159,22 @@ def setUp(self): self.outputs = {'Output': output} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): self.check_grad( - ['Input', 'Offset', 'Filter'], 'Output', max_relative_error=0.05) + ['Input', 'Offset', 'Filter'], + 'Output', + max_relative_error=0.05, + check_eager=True) def test_check_grad_no_filter(self): self.check_grad( ['Input', 'Offset'], 'Output', max_relative_error=0.1, - no_grad_set=set(['Filter'])) + no_grad_set=set(['Filter']), + check_eager=True) def init_test_case(self): self.pad = [1, 1] @@ -292,6 +313,10 @@ def test_invalid_offset(): self.assertRaises(TypeError, test_invalid_offset) + def test_error_with_eager_guard(self): + with _test_eager_guard(): + self.test_error() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py index 7c296c7e40e98..e664face0483a 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py @@ -25,6 +25,7 @@ class TestDygraphGroupSharded(TestMultipleGpus): # check group sharded logic as well as the accuracy with single mode def test_dygraph_group_sharded(self): self.run_mnist_2gpu('dygraph_group_sharded_api.py') + self.run_mnist_2gpu('dygraph_group_sharded_api_eager.py') if __name__ == "__main__": diff --git 
a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py index 669ab7d8f7f34..b7a5f9c9701c1 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py @@ -24,9 +24,11 @@ class TestDygraphShardingStage2(TestMultipleGpus): # check sharding logic as well as the accuracy with single mode def test_dygraph_sharding_stage2(self): + self.run_mnist_2gpu('dygraph_group_sharded_stage2.py') self.run_mnist_2gpu('dygraph_sharding_stage2.py') def test_dygraph_sharding_stage2_offload(self): + self.run_mnist_2gpu('dygraph_group_sharded_stage2_offload.py') self.run_mnist_2gpu('dygraph_sharding_stage2_offload.py') diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py index c7da5d1e941b4..f69b52cae528a 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py @@ -24,9 +24,11 @@ class TestDygraphShardingStage3(TestMultipleGpus): # check sharding logic as well as the accuracy with single mode def test_dygraph_sharding_stage3(self): + self.run_mnist_2gpu('dygraph_group_sharded_stage3.py') self.run_mnist_2gpu('dygraph_sharding_stage3.py') def test_dygraph_sharding_stage3_offload(self): + self.run_mnist_2gpu('dygraph_group_sharded_stage3_offload.py') self.run_mnist_2gpu('dygraph_sharding_stage3_offload.py') diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index e7abed0964679..7e78b223b3f6a 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -251,9 +251,6 @@ def constructor(self, place): self.assertTrue(egr_tensor12.place._equals(paddle.fluid.CPUPlace())) self.assertTrue(np.array_equal(egr_tensor12.numpy(), x)) - egr_tensor13 = paddle.randn([2, 2]) - self.assertTrue("eager_tmp" in egr_tensor13.name) - with self.assertRaisesRegexp( ValueError, "The shape of Parameter should not be None"): eager_param = EagerParamBase(shape=None, dtype="float32") @@ -680,7 +677,7 @@ def test_share_buffer_to(self): tensor2 = None tensor = paddle.to_tensor(arr, core.VarDesc.VarType.FP32, core.CPUPlace()) - tensor3 = core.eager.Tensor() + tensor3 = core.eager.Tensor(value=tensor, place=core.CPUPlace()) if core.is_compiled_with_cuda(): tensor2 = paddle.to_tensor(arr2, core.VarDesc.VarType.FP32, core.CUDAPlace(0)) diff --git a/python/paddle/fluid/tests/unittests/test_frac_api.py b/python/paddle/fluid/tests/unittests/test_frac_api.py new file mode 100644 index 0000000000000..4ee3096cde78f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_frac_api.py @@ -0,0 +1,118 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
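The paddle.frac API covered below is the elementwise fractional part, defined as x - trunc(x); a tiny illustrative check of that identity (numpy is used only as the reference):

import numpy as np
import paddle

x = paddle.to_tensor([-1.5, -0.25, 0.75, 2.5])
out = paddle.frac(x)  # sign-preserving: frac(-1.5) == -0.5
ref = x.numpy() - np.trunc(x.numpy())
assert np.allclose(out.numpy(), ref)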
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import Program, program_guard +from paddle.fluid.framework import _test_eager_guard + + +def ref_frac(x): + return x - np.trunc(x) + + +class TestFracAPI(unittest.TestCase): + """Test Frac API""" + + def set_dtype(self): + self.dtype = 'float64' + + def setUp(self): + self.set_dtype() + self.x_np = np.random.uniform(-3, 3, [2, 3]).astype(self.dtype) + self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_api_static(self): + paddle.enable_static() + with program_guard(Program()): + input = fluid.data('X', self.x_np.shape, self.x_np.dtype) + out = paddle.frac(input) + place = fluid.CPUPlace() + if fluid.core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = ref_frac(self.x_np) + self.assertTrue(np.allclose(out_ref, res)) + + def test_api_dygraph(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out = paddle.frac(x) + out_ref = ref_frac(self.x_np) + self.assertTrue(np.allclose(out_ref, out.numpy())) + + def test_api_eager(self): + paddle.disable_static(self.place) + with _test_eager_guard(): + x_tensor = paddle.to_tensor(self.x_np) + out = paddle.frac(x_tensor) + out_ref = ref_frac(self.x_np) + self.assertTrue(np.allclose(out_ref, out.numpy())) + paddle.enable_static() + + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_api_dygraph() + + +class TestFracInt32(TestFracAPI): + """Test Frac API with data type int32""" + + def set_dtype(self): + self.dtype = 'int32' + + +class TestFracInt64(TestFracAPI): + """Test Frac API with data type int64""" + + def set_dtype(self): + self.dtype = 'int64' + + +class TestFracFloat32(TestFracAPI): + """Test Frac API with data type float32""" + + def set_dtype(self): + self.dtype = 'float32' + + +class TestFracError(unittest.TestCase): + """Test Frac Error""" + + def setUp(self): + self.x_np = np.random.uniform(-3, 3, [2, 3]).astype('int16') + self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_error(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('X', [5, 5], 'bool') + self.assertRaises(TypeError, paddle.frac, x) + + def test_dygraph_error(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np, dtype='int16') + self.assertRaises(TypeError, paddle.frac, x) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py index 4fca8b9f2a118..4140ce44648fa 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py @@ -23,12 +23,14 @@ from paddle.fluid.op import Operator from paddle.fluid.executor import Executor from paddle.fluid.tests.unittests.op_test import OpTest, convert_uint16_to_float +from paddle.fluid.framework import _test_eager_guard import paddle class TestGaussianRandomOp(OpTest): def setUp(self): self.op_type = "gaussian_random" + self.python_api = paddle.normal self.set_attrs() self.inputs = {} self.use_mkldnn = False @@ -50,6 +52,10 @@ def set_attrs(self): def test_check_output(self): 
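        # gaussian_random produces random values, so verify_output checks the
        # histogram of the samples instead of comparing exact outputs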
self.check_output_customized(self.verify_output) + def test_eager(self): + with _test_eager_guard(): + self.test_check_output() + def verify_output(self, outs): self.assertEqual(outs[0].shape, (123, 92)) hist, _ = np.histogram(outs[0], range=(-3, 5)) @@ -70,6 +76,7 @@ def verify_output(self, outs): class TestGaussianRandomBF16Op(OpTest): def setUp(self): self.op_type = "gaussian_random" + self.python_api = paddle.normal self.set_attrs() self.inputs = {} self.use_mkldnn = False @@ -93,6 +100,10 @@ def test_check_output(self): self.check_output_with_place_customized( self.verify_output, place=core.CUDAPlace(0)) + def test_eager(self): + with _test_eager_guard(): + self.test_check_output() + def verify_output(self, outs): outs = convert_uint16_to_float(outs) self.assertEqual(outs[0].shape, (123, 92)) diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index 91c2800836c9d..3a9387082e680 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -244,7 +244,7 @@ def test_normal_initializer(self, dtype="float32"): lod_level=0, name="param", initializer=initializer.NormalInitializer(2.3, 1.9, 123)) - num_ops = 1 + num_ops = 2 if (dtype == "float16" or dtype == "uint16") else 1 self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -685,6 +685,68 @@ def test_uniform_initializer(self, dtype="float32"): self.func_uniform_initializer() +class TestXavierInitializerDygraph(unittest.TestCase): + def func_xvarier_initializer(self, dtype="float32"): + """ + In dygraph mode, we can use initializer directly to initialize a tensor. + """ + paddle.disable_static() + + tensor = paddle.zeros([1024, 1024, 16]) + tensor.stop_gradient = False + + xavier_ = paddle.fluid.initializer.XavierInitializer( + uniform=False, fan_in=3, fan_out=5) + xavier_(tensor) + + hist, _ = output_hist(tensor.numpy()) + + hist2, _ = output_hist( + np.random.normal(0, np.sqrt(2.0 / (3 + 5)), [1024, 1024, 16])) + + self.assertTrue( + np.allclose( + hist, hist2, rtol=0, atol=0.01), + "hist: " + str(hist) + " hist2: " + str(hist2)) + paddle.enable_static() + + def test_xavier_initializer(self, dtype="float32"): + with framework._test_eager_guard(): + self.func_xvarier_initializer() + self.func_xvarier_initializer() + + +class TestMSRAInitializerDygraph(unittest.TestCase): + def func_msra_initializer(self, dtype="float32"): + """ + In dygraph mode, we can use initializer directly to initialize a tensor. 
+ """ + paddle.disable_static() + + tensor = paddle.zeros([1024, 1024, 16]) + tensor.stop_gradient = False + + msra_ = paddle.fluid.initializer.MSRAInitializer( + uniform=False, fan_in=4) + msra_(tensor) + + hist, _ = output_hist(tensor.numpy()) + + hist2, _ = output_hist( + np.random.normal(0, np.sqrt(2.0 / (4)), [1024, 1024, 16])) + + self.assertTrue( + np.allclose( + hist, hist2, rtol=0, atol=0.01), + "hist: " + str(hist) + " hist2: " + str(hist2)) + paddle.enable_static() + + def test_msra_initializer(self, dtype="float32"): + with framework._test_eager_guard(): + self.func_msra_initializer() + self.func_msra_initializer() + + class TesetconsistencyOfDynamicAndStaticGraph(unittest.TestCase): def func_order(self): paddle.set_device('cpu') diff --git a/python/paddle/fluid/tests/unittests/test_initializer_nn.py b/python/paddle/fluid/tests/unittests/test_initializer_nn.py index 74686652044ec..9953681e0f5bd 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer_nn.py +++ b/python/paddle/fluid/tests/unittests/test_initializer_nn.py @@ -400,7 +400,7 @@ def test_normal_initializer(self, dtype="float32"): lod_level=0, name="param", initializer=initializer.Normal(2.3, 1.9)) - num_ops = 1 + num_ops = 2 if dtype in ["float16", "uint16"] else 1 self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py index e8820d5a8708e..ba1e9be815de6 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -24,6 +24,7 @@ import numpy as np from paddle.fluid.backward import append_backward from paddle.fluid.framework import Program, program_guard, convert_np_dtype_to_dtype_ +from paddle.fluid.framework import _test_eager_guard import paddle from paddle.io import Dataset import numpy @@ -1114,6 +1115,11 @@ def test_float64(self): def test_float32(self): self.check_with_dtype('float32') + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_float64() + self.test_float32() + class TestMasterWeightSaveForFP16(unittest.TestCase): ''' diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index 04488ac58c5fb..89cfc711910ce 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -134,9 +134,11 @@ def test_to_sparse_coo(self): #test to_sparse_coo_grad backward out_grad_indices = [[0, 1], [0, 1]] out_grad_values = [2.0, 3.0] - out_grad = core.eager.sparse_coo_tensor( + out_grad = paddle.sparse.sparse_coo_tensor( paddle.to_tensor(out_grad_indices), - paddle.to_tensor(out_grad_values), out.shape, True) + paddle.to_tensor(out_grad_values), + shape=out.shape, + stop_gradient=True) out.backward(out_grad) assert np.array_equal(dense_x.grad.numpy(), out_grad.to_dense().numpy()) @@ -145,9 +147,11 @@ def test_coo_to_dense(self): with _test_eager_guard(): indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] values = [1.0, 2.0, 3.0, 4.0, 5.0] - sparse_x = core.eager.sparse_coo_tensor( + sparse_x = paddle.sparse.sparse_coo_tensor( paddle.to_tensor(indices), - paddle.to_tensor(values), [3, 4], False) + paddle.to_tensor(values), + shape=[3, 4], + stop_gradient=False) dense_tensor = sparse_x.to_dense() #test to_dense_grad backward out_grad = [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], @@ -158,6 +162,17 
@@ def test_coo_to_dense(self): assert np.array_equal(correct_x_grad, sparse_x.grad.values().numpy()) + paddle.device.set_device("cpu") + sparse_x_cpu = paddle.sparse.sparse_coo_tensor( + paddle.to_tensor(indices), + paddle.to_tensor(values), + shape=[3, 4], + stop_gradient=False) + dense_tensor_cpu = sparse_x_cpu.to_dense() + dense_tensor_cpu.backward(paddle.to_tensor(out_grad)) + assert np.array_equal(correct_x_grad, + sparse_x_cpu.grad.values().numpy()) + def test_to_sparse_csr(self): with _test_eager_guard(): x = [[0, 1, 0, 2], [0, 0, 3, 0], [4, 5, 0, 0]] @@ -177,15 +192,52 @@ def test_coo_values_grad(self): with _test_eager_guard(): indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] values = [1.0, 2.0, 3.0, 4.0, 5.0] - sparse_x = core.eager.sparse_coo_tensor( + sparse_x = paddle.sparse.sparse_coo_tensor( paddle.to_tensor(indices), - paddle.to_tensor(values), [3, 4], False) + paddle.to_tensor(values), + shape=[3, 4], + stop_gradient=False) values_tensor = sparse_x.values() out_grad = [2.0, 3.0, 5.0, 8.0, 9.0] # test coo_values_grad values_tensor.backward(paddle.to_tensor(out_grad)) assert np.array_equal(out_grad, sparse_x.grad.values().numpy()) + def test_sparse_coo_tensor_grad(self): + with _test_eager_guard(): + indices = [[0, 1], [0, 1]] + values = [1, 2] + indices = paddle.to_tensor(indices, dtype='int32') + values = paddle.to_tensor( + values, dtype='float32', stop_gradient=False) + sparse_x = paddle.sparse.sparse_coo_tensor( + indices, values, shape=[2, 2], stop_gradient=False) + grad_indices = [[0, 1], [1, 1]] + grad_values = [2, 3] + grad_indices = paddle.to_tensor(grad_indices, dtype='int32') + grad_values = paddle.to_tensor(grad_values, dtype='float32') + sparse_out_grad = paddle.sparse.sparse_coo_tensor( + grad_indices, grad_values, shape=[2, 2]) + sparse_x.backward(sparse_out_grad) + correct_values_grad = [0, 3] + assert np.array_equal(correct_values_grad, values.grad.numpy()) + + place = core.CPUPlace() + indices_cpu = paddle.to_tensor(indices, dtype='int32', place=place) + values_cpu = paddle.to_tensor( + values, dtype='float32', place=place, stop_gradient=False) + sparse_x_cpu = paddle.sparse.sparse_coo_tensor( + indices_cpu, + values_cpu, + shape=[2, 2], + place=place, + stop_gradient=False) + + sparse_out_grad_cpu = paddle.sparse.sparse_coo_tensor( + grad_indices, grad_values, shape=[2, 2], place=place) + sparse_x_cpu.backward(sparse_out_grad_cpu) + assert np.array_equal(correct_values_grad, values_cpu.grad.numpy()) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py index aac904dc2e15d..c826a0e1030f4 100644 --- a/python/paddle/fluid/tests/unittests/test_split_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_op.py @@ -459,5 +459,24 @@ def test_axis_tensor_input(self): self.assertTrue(np.allclose(ex_x2, x2_out)) +class API_TestEmptySplit(unittest.TestCase): + def test_axis_input_empty_section(self): + with fluid.dygraph.guard(): + input_1 = np.random.random([8, 6, 6]).astype("float32") + # input is a variable which shape is [8, 6, 6] + input = paddle.to_tensor(input_1) + x0, x1, x2 = paddle.split(input, num_or_sections=[5, 0, 3]) + x0_out = x0.numpy() + x1_out = x1.numpy() + x2_out = x2.numpy() + ex_x0, ex_x1, ex_x2 = np.split(input_1, [ + 5, + 5, + ]) + self.assertTrue(np.allclose(ex_x0, x0_out)) + self.assertTrue(np.allclose(ex_x1, x1_out)) + self.assertTrue(np.allclose(ex_x2, x2_out)) + + if __name__ == '__main__': unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/xpu/test_bce_loss_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_bce_loss_op_xpu.py new file mode 100644 index 0000000000000..a8173f054a133 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_bce_loss_op_xpu.py @@ -0,0 +1,76 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys +sys.path.append("..") +import paddle +import paddle.fluid as fluid +import numpy as np +import unittest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +def bce_loss(input, label): + return -1 * (label * np.log(input) + (1. - label) * np.log(1. - input)) + + +class XPUTestBceLossOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'bce_loss' + self.use_dynamic_create_class = False + + class TestBceLossOp(XPUOpTest): + def setUp(self): + self.op_type = "bce_loss" + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.init_test_case() + input_np = np.random.uniform(0.1, 0.8, + self.shape).astype(self.dtype) + label_np = np.random.randint(0, 2, self.shape).astype(self.dtype) + output_np = bce_loss(input_np, label_np) + + self.inputs = {'X': input_np, 'Label': label_np} + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def init_test_case(self): + self.shape = [10, 10] + + class TestBceLossOpCase1(TestBceLossOp): + def init_test_cast(self): + self.shape = [2, 3, 4, 5] + + class TestBceLossOpCase2(TestBceLossOp): + def init_test_cast(self): + self.shape = [2, 3, 20] + + +support_types = get_xpu_op_support_types('bce_loss') +for stype in support_types: + create_test_class(globals(), XPUTestBceLossOp, stype) + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py index ab07221a07071..9254a84ec4217 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py @@ -18,37 +18,69 @@ import numpy as np import sys sys.path.append("..") -from op_test import OpTest -import paddle -import paddle.fluid as fluid -from paddle.fluid import Program, program_guard + import paddle +from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + paddle.enable_static() -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPUSignOp(OpTest): - def setUp(self): - self.op_type = "sign" - self.dtype = np.float32 - self.inputs = { - 'X': np.random.uniform(-10, 10, (10, 10)).astype(self.dtype) - } - self.outputs = {'Out': 
np.sign(self.inputs['X'])} - self.attrs = {'use_xpu': True} - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], 'Out') +class XPUTestSignOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'sign' + self.use_dynamic_create_class = False + + class TestSignOPBase(XPUOpTest): + def setUp(self): + self.place = paddle.XPUPlace(0) + self.init_dtype() + self.set_case() + + def set_case(self): + self.op_type = 'sign' + self.dtype = self.in_type + self.init_config() + self.x = np.random.uniform(-10, 10, + self.input_shape).astype(self.dtype) + self.inputs = {'X': self.x} + self.outputs = {'Out': np.sign(self.x)} + self.attrs = {'use_xpu': True} + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def init_config(self): + self.input_shape = [864] + + class XPUTestSign1(TestSignOPBase): + def init_config(self): + self.input_shape = [2, 768] + + class XPUTestSign2(TestSignOPBase): + def init_config(self): + self.input_shape = [3, 8, 4096] + + class XPUTestSign3(TestSignOPBase): + def init_config(self): + self.input_shape = [1024] + + class XPUTestSign4(TestSignOPBase): + def init_config(self): + self.input_shape = [2, 2, 255] + +support_types = get_xpu_op_support_types('sign') +for stype in support_types: + create_test_class(globals(), XPUTestSignOP, stype) if __name__ == "__main__": unittest.main() diff --git a/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py b/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py index b1c0cd4214dbb..3ab3cf6901402 100644 --- a/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py +++ b/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py @@ -62,6 +62,6 @@ def forward(self, x): if self.random_routing: rand_routing_prob = paddle.rand( shape=[gate_score.shape[0]], dtype="float32") - topk_idx = paddle.distributed.utils.random_routing( + topk_idx = paddle.distributed.models.moe.utils._random_routing( topk_idx, topk_val, rand_routing_prob) return topk_val, topk_idx diff --git a/python/paddle/incubate/distributed/models/moe/utils.py b/python/paddle/incubate/distributed/models/moe/utils.py index 99e31a16273bf..0e87fe3e31360 100644 --- a/python/paddle/incubate/distributed/models/moe/utils.py +++ b/python/paddle/incubate/distributed/models/moe/utils.py @@ -11,7 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from paddle.distributed.models.moe.utils import * +from paddle.distributed.models.moe.utils import _number_count, _limit_by_capacity, _prune_gate_by_capacity, _assign_pos +import paddle def _alltoall(in_tensor_list, group=None, use_calc_stream=True): diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index aae3d97a79521..de09193ac798e 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -336,7 +336,23 @@ def _append_optimize_op(self, block, param_and_grad): lr = self._create_param_lr(param_and_grad) # create the adam optimize op - if framework._non_static_mode(): + if framework.in_dygraph_mode(): + found_inf = self._get_auxiliary_var('found_inf') + + _beta1 = self._beta1 if not isinstance( + self._beta1, Variable) else self._beta1.numpy().item(0) + _beta2 = self._beta2 if not isinstance( + self._beta2, Variable) else self._beta2.numpy().item(0) + + _, _, _, _, _, _ = _C_ops.final_state_adam( + param_and_grad[0], param_and_grad[1], lr, moment1, moment2, + beta1_pow_acc, beta2_pow_acc, master_weight, found_inf, _beta1, + _beta2, self._epsilon, self._lazy_mode, 1000, find_master, + False) + + return None + + if framework._in_legacy_dygraph(): _beta1 = self._beta1 if not isinstance( self._beta1, Variable) else self._beta1.numpy().item(0) diff --git a/python/paddle/sparse/creation.py b/python/paddle/sparse/creation.py index e29351e3d179c..ac9276f3142c0 100644 --- a/python/paddle/sparse/creation.py +++ b/python/paddle/sparse/creation.py @@ -14,6 +14,7 @@ from paddle import _C_ops from ..framework import core, dygraph_only +from ..framework import _current_expected_place, _get_paddle_place from ..tensor import to_tensor from ..tensor import max from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype @@ -38,6 +39,18 @@ def _infer_dense_shape(indices): return list(lens.numpy()) +def _get_place(place): + place = _get_paddle_place(place) + if place is None: + place = _current_expected_place() + elif not isinstance(place, (core.Place, core.CPUPlace, core.CUDAPinnedPlace, + core.CUDAPlace)): + raise ValueError( + "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace" + ) + return place + + @dygraph_only def sparse_coo_tensor(indices, values, @@ -94,6 +107,8 @@ def sparse_coo_tensor(indices, # values=[1., 2., 3.]) """ + place = _get_place(place) + if not isinstance(indices, core.eager.Tensor): indices = to_tensor( indices, dtype=None, place=place, stop_gradient=True) @@ -101,13 +116,20 @@ def sparse_coo_tensor(indices, values = to_tensor(values, dtype, place, stop_gradient) if len(indices.shape) != 2: raise ValueError("'indices' must be 2-D.") - if place is not None: + + if not indices.place._equals(place): indices = indices._copy_to(place, False) + + if not values.place._equals(place): values = values._copy_to(place, False) values = _handle_dtype(values, dtype) + values.stop_gradient = stop_gradient + if shape is None: shape = _infer_dense_shape(indices) - return core.eager.sparse_coo_tensor(indices, values, shape, stop_gradient) + + return _C_ops.final_state_sparse_create_sparse_coo_tensor(values, indices, + shape) #TODO: need to support shape is None @@ -171,6 +193,9 @@ def sparse_csr_tensor(crows, # cols=[1, 3, 2, 0, 1], # values=[1, 2, 3, 4, 5]) """ + + place = _get_place(place) + if not isinstance(crows, core.eager.Tensor): crows = to_tensor(crows, dtype=None, place=place, stop_gradient=True) if not isinstance(cols, core.eager.Tensor): @@ -182,10 +207,15 
@@ def sparse_csr_tensor(crows, "SparseCsrTensor only support 2-D or 3-D matrix. The 'crows', 'cols' and 'values' must be 1-D." ) - if place is not None: + if not crows.place._equals(place): crows = crows._copy_to(place, False) + + if not cols.place._equals(place): cols = cols._copy_to(place, False) + + if not values.place._equals(place): values = values._copy_to(place, False) values = _handle_dtype(values, dtype) + values.stop_gradient = stop_gradient return core.eager.sparse_csr_tensor(crows, cols, values, shape, stop_gradient) diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 1b9fd679acee3..54cbaa9be860e 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -228,6 +228,7 @@ from .math import fmin # noqa: F401 from .math import inner # noqa: F401 from .math import outer # noqa: F401 +from .math import frac # noqa: F401 from .random import multinomial # noqa: F401 from .random import standard_normal # noqa: F401 @@ -456,6 +457,7 @@ 'digamma', 'diagonal', 'trunc', + 'frac', 'bitwise_and', 'bitwise_or', 'bitwise_xor', diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 2fcf9ff4213d4..b315e3e9673fc 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -347,7 +347,8 @@ def frobenius_norm(input, dim=None, keepdim=False, name=None): if in_dygraph_mode(): if dim is None: - return _C_ops.final_state_frobenius_norm(input, keepdim, True) + return _C_ops.final_state_frobenius_norm(input, [], keepdim, + True) return _C_ops.final_state_frobenius_norm(input, dim, keepdim, False) if _in_legacy_dygraph(): if dim is None: diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 3a2d08af88ff8..cfc9abb86984d 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -4385,3 +4385,57 @@ def angle(x, name=None): outputs = {"Out": out} helper.append_op(type=op_type, inputs=inputs, outputs=outputs) return out + +def frac(x, name=None): + """ + This API is used to return the fractional portion of each element in input. + + Args: + x (Tensor): The input tensor, which data type should be int32, int64, float32, float64. + name: (str, optional): Name for operation (optional, default is None). For more + + Returns: + Tensor: The output Tensor of frac. + + Examples: + .. 
code-block:: python + + import paddle + + input = paddle.to_tensor([[ 1.2203873, -1.0035421, -0.35193074], + [-0.00928353, 0.58917075, -0.8407828 ], + [-1.5131804, 0.5850153, -0.17597814]], 'float32') + print(input.numpy()) + # [[ 1.2203873 -1.0035421 -0.35193074] + # [-0.00928353 0.58917075 -0.8407828 ] + # [-1.5131804 0.5850153 -0.17597814]] + + output = paddle.frac(input) + print(output.numpy()) + # [[ 0.22038734 -0.00354207 -0.35193074] + # [-0.00928353 0.58917075 -0.8407828 ] + # [-0.5131804 0.5850153 -0.17597814]] + """ + op_type = 'elementwise_sub' + axis = -1 + act = None + if x.dtype not in [paddle.int32, paddle.int64, paddle.float32, paddle.float64]: + raise TypeError( + "The data type of input must be one of ['int32', 'int64', 'float32', 'float64'], but got {}".format(x.dtype)) + if in_dygraph_mode(): + y = _C_ops.final_state_trunc(x) + return _C_ops.final_state_subtract(x, y) + else: + if _in_legacy_dygraph(): + y = _C_ops.trunc(x) + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name=op_type) + else: + inputs = {"X": x} + attrs = {} + + helper = LayerHelper("trunc", **locals()) + check_variable_and_dtype(x, "X", ['int32', 'int64', 'float32', 'float64'], 'trunc') + y = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type="trunc", inputs=inputs, attrs=attrs, outputs={"Out": y}) + return _elementwise_op(LayerHelper(op_type, **locals())) diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 82818d50510c9..3d0617e40d6b6 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -239,7 +239,15 @@ def gaussian(shape, mean=0.0, std=1.0, dtype=None, name=None): if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + shape = utils.convert_shape_to_list(shape) + place = _current_expected_place() + return _C_ops.final_state_gaussian_random(shape, + float(mean), + float(std), seed, dtype, + place) + + if _in_legacy_dygraph(): shape = utils.convert_shape_to_list(shape) return _C_ops.gaussian_random('shape', shape, 'mean', float(mean), 'std', diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index e3d8e8f5f47a5..6387525fa26f1 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -45,6 +45,12 @@ kernel : func : adadelta +- api : adam + args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1, Scalar beta2, Scalar epsilon, bool lazy_mode, int64_t min_row_size_to_use_multithread, bool multi_precision, bool use_global_beta_pow) + output : Tensor(param_out), Tensor(moment1_out), Tensor(moment2_out), Tensor(beta1_pow_out), Tensor(beta2_pow_out), Tensor(master_param_outs) + optional : master_param, skip_update + invoke : adam_impl(param, grad, learning_rate, moment1, moment2, beta1_pow, beta2_pow, master_param, skip_update, beta1, beta2, epsilon, lazy_mode, min_row_size_to_use_multithread, multi_precision, use_global_beta_pow) + - api : adamax args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment, Tensor inf_norm, Tensor beta1_pow, float beta1, float beta2, float epsilon) output : Tensor(param_out), Tensor(avg_squared_grad_out), Tensor(avg_squared_update_out) @@ -445,6 +451,16 @@ func : cumsum backward : cumsum_grad +- api : deformable_conv + args : (Tensor x, Tensor offset, Tensor filter, Tensor mask, int[] strides, int[] paddings, int[] dilations, int deformable_groups, int groups,
int im2col_step) + output : Tensor(out) + infer_meta : + func : DeformableConvInferMeta + kernel : + func : deformable_conv + optional : mask + backward : deformable_conv_grad + - api : depthwise_conv2d_transpose args : (Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) output : Tensor(out) @@ -782,6 +798,18 @@ kernel : func : gather_tree +- api : gaussian_random + args : (IntArray shape, float mean, float std, int seed, DataType dtype, Place place={}) + output: Tensor + infer_meta : + func : GaussianRandomInferMeta + param : [shape, mean, std, seed, dtype] + kernel : + func : gaussian_random + param : [shape, mean, std, seed, dtype] + data_type : dtype + backend : place + - api : gelu args : (Tensor x, bool approximate) output : Tensor(out) diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index f8366744bdbe6..d243b4d160d57 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -339,6 +339,16 @@ output : Tensor(x_grad) invoke : cumsum(out_grad, axis, flatten, exclusive, !reverse) +- backward_api : deformable_conv_grad + forward : deformable_conv(Tensor x, Tensor offset, Tensor filter, Tensor mask, int[] strides, int[] paddings, int[] dilations, int deformable_groups, int groups, int im2col_step) -> Tensor(out) + args : (Tensor x, Tensor offset, Tensor filter, Tensor mask, Tensor out_grad, int[] strides, int[] paddings, int[] dilations, int deformable_groups, int groups, int im2col_step) + output : Tensor(x_grad), Tensor(offset_grad), Tensor(filter_grad), Tensor(mask_grad) + infer_meta : + func : DeformableConvGradInferMeta + kernel : + func : deformable_conv_grad + optional : mask + - backward_api : depthwise_conv2d_transpose_grad forward : depthwise_conv2d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) args : (Tensor x, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml index 7bdd77e27bcef..2187d4abb2d63 100644 --- a/python/paddle/utils/code_gen/sparse_api.yaml +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -21,6 +21,14 @@ layout : x backward : coo_values_grad +- api : create_sparse_coo_tensor + args : (Tensor values, Tensor indices, IntArray dense_shape) + output : Tensor(out@SparseCooTensor) + kernel : + func : sparse_coo_tensor + layout : values + backward : create_sparse_coo_tensor_grad + - api : csr_values args : (Tensor x) output : Tensor(out@DenseTensor) diff --git a/python/paddle/utils/code_gen/sparse_bw_api.yaml b/python/paddle/utils/code_gen/sparse_bw_api.yaml index 800145b06e0b6..e3946cbf72bc2 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api.yaml +++ b/python/paddle/utils/code_gen/sparse_bw_api.yaml @@ -19,6 +19,13 @@ kernel : func : coo_values_grad +- backward_api : create_sparse_coo_tensor_grad + forward : create_sparse_coo_tensor(Tensor values, Tensor indices, IntArray dense_shape) -> Tensor(out@SparseCooTensor) + args : (Tensor indices, Tensor out_grad) + output : Tensor(values_grad@DenseTensor) + kernel : + func : sparse_coo_tensor_grad + - backward_api : dense_to_coo_grad 
forward : dense_to_coo(Tensor x, int64_t sparse_dim) -> Tensor(out@SparseCooTensor) args : (Tensor out_grad) diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 2ed01d42cfb8c..8fa51df9ac10d 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -558,7 +558,15 @@ def deform_conv2d(x, use_deform_conv2d_v1 = True if mask is None else False - if _non_static_mode(): + if in_dygraph_mode(): + pre_bias = _C_ops.final_state_deformable_conv( + x, offset, weight, mask, stride, padding, dilation, + deformable_groups, groups, 1) + if bias is not None: + out = nn.elementwise_add(pre_bias, bias, axis=1) + else: + out = pre_bias + elif _in_legacy_dygraph(): attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation, 'deformable_groups', deformable_groups, 'groups', groups, 'im2col_step', 1) diff --git a/tools/infrt/skipped_phi_api.json b/tools/infrt/skipped_phi_api.json index b352240c6dcc5..2502e248c5c48 100644 --- a/tools/infrt/skipped_phi_api.json +++ b/tools/infrt/skipped_phi_api.json @@ -1,4 +1,4 @@ { -"phi_apis":["conj", "dropout", "expand_as", "nll_loss", "psroi_pool", "roi_align", "roi_pool", "label_smooth", "layer_norm"], +"phi_apis":["conj", "deformable_conv", "dropout", "expand_as", "nll_loss", "psroi_pool", "roi_align", "roi_pool", "label_smooth", "layer_norm"], "phi_kernels":["equal_all"] }
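For reference, a minimal usage sketch of the new frac API (not part of the patch): the dygraph branch above composes trunc with an elementwise subtract, so paddle.frac(x) should agree with computing x - paddle.trunc(x) directly. The snippet assumes the paddle.frac binding used in the docstring example is exposed once the patch is applied; paddle.to_tensor, paddle.trunc and paddle.allclose are pre-existing APIs.

    import paddle

    # Fixed input so the expected fractional parts are easy to verify by hand.
    x = paddle.to_tensor([[1.2203873, -1.0035421, -0.35193074],
                          [-1.5131804, 0.5850153, -0.17597814]])

    expected = x - paddle.trunc(x)   # frac is implemented above as x - trunc(x)
    result = paddle.frac(x)          # assumes paddle.frac is available post-patch

    print(result.numpy())
    print(bool(paddle.allclose(result, expected)))  # expected to print True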