From 17d8098689c2b07ce55521bc277ccfcd19fcd21a Mon Sep 17 00:00:00 2001
From: jack603047588 <603047588@qq.com>
Date: Thu, 2 Nov 2023 16:19:51 +0800
Subject: [PATCH 01/20] abacus-aibox-878 support xpu gcc10

---
 cmake/xpu_kp.cmake                               | 3 ++-
 paddle/fluid/operators/optimizers/lamb_op_xpu.cc | 2 --
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/cmake/xpu_kp.cmake b/cmake/xpu_kp.cmake
index 67364de4436a7..d44acfd4574f9 100644
--- a/cmake/xpu_kp.cmake
+++ b/cmake/xpu_kp.cmake
@@ -51,7 +51,8 @@ message(STATUS "Build with XPU_CLANG=" ${XPU_CLANG})
 
 # The host sysroot of XPU compiler is gcc-8.2
 if(NOT HOST_SYSROOT)
-  set(HOST_SYSROOT /opt/compiler/gcc-8.2)
+  #set(HOST_SYSROOT /opt/compiler/gcc-8.2)
+  set(HOST_SYSROOT /opt/compiler/gcc-10)
 endif()
 
 if(NOT IS_DIRECTORY ${HOST_SYSROOT})
diff --git a/paddle/fluid/operators/optimizers/lamb_op_xpu.cc b/paddle/fluid/operators/optimizers/lamb_op_xpu.cc
index e0233fadb8858..65dd1300c0dd4 100644
--- a/paddle/fluid/operators/optimizers/lamb_op_xpu.cc
+++ b/paddle/fluid/operators/optimizers/lamb_op_xpu.cc
@@ -36,8 +36,6 @@ class LambOpXPUKernel : public framework::OpKernel<T> {
             ctx.InputNames("Param").front(),
             framework::ToTypeName(param_var->Type())));
 
-    using paddle::framework::LoDTensor;
-
     // inputs
     T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
     T weight_decay = static_cast<T>(ctx.Attr<float>("weight_decay"));

From 70f26a2a34f4abe70ce4b62a56bad0a40eb4afef Mon Sep 17 00:00:00 2001
From: tanzhipeng
Date: Mon, 13 Nov 2023 12:17:29 +0000
Subject: [PATCH 02/20] update bkcl to version v1.1.6.1.

---
 cmake/external/xpu.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index 8a47a80cb6e69..e63b185802623 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -97,12 +97,12 @@ set(XPU_PACK_DEPENCE_URL
 if (WITH_BOX_PS OR WITH_XPU_KP)
   set(XPU_XRE_DIR_NAME "xre-bdcentos_x86_64")
   set(XPU_XDNN_DIR_NAME "xdnn-bdcentos_x86_64")
-  set(XPU_XCCL_DIR_NAME "xccl_socket-bdcentos_x86_64")
+  set(XPU_XCCL_DIR_NAME "xccl_rdma-bdcentos_x86_64")
   set(XPU_XRE_URL
      "https://klx-sdk-release-public.su.bcebos.com/xre/release/4.0.28.1/${XPU_XRE_DIR_NAME}.tar.gz"
      CACHE STRING "" FORCE)
   set(XPU_XCCL_URL
-     "https://klx-sdk-release-public.su.bcebos.com/xccl/release/1.0.62.1/${XPU_XCCL_DIR_NAME}.tar.gz"
+     "https://klx-sdk-release-public.su.bcebos.com/xccl/release/1.1.6.1/${XPU_XCCL_DIR_NAME}.tar.gz"
      CACHE STRING "" FORCE)
   #"https://klx-sdk-release-public.su.bcebos.com/xdnn/release/2.6.0.1/${XPU_XDNN_DIR_NAME}.tar.gz"
   set(XPU_XDNN_URL

From 14a9f4341fbe7bb3b20a9db7ff29648470b5c0f9 Mon Sep 17 00:00:00 2001
From: xiayanming
Date: Wed, 15 Nov 2023 16:17:29 +0800
Subject: [PATCH 03/20] update xdnn version

---
 cmake/external/xpu.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index e63b185802623..7649af9a0ccfd 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -106,7 +106,7 @@ if (WITH_BOX_PS OR WITH_XPU_KP)
      CACHE STRING "" FORCE)
   #"https://klx-sdk-release-public.su.bcebos.com/xdnn/release/2.6.0.1/${XPU_XDNN_DIR_NAME}.tar.gz"
   set(XPU_XDNN_URL
-     "https://klx-sdk-release-public.su.bcebos.com/xdnn_train/dev/paddlebox/20231031/${XPU_XDNN_DIR_NAME}.tar.gz"
+     "https://klx-sdk-release-public.su.bcebos.com/xdnn_train/dev/paddlebox/20231115/${XPU_XDNN_DIR_NAME}.tar.gz"
      CACHE STRING "" FORCE)
   set(SCALOPUS_URL
      "https://klx-sdk-release-public.su.bcebos.com/xdnn_train/dev/paddlebox/20230306/scalopus.tar.gz"

From
076c0c87c113d3cdecf2453d27a4417258a5b079 Mon Sep 17 00:00:00 2001 From: HuangShiqing Date: Tue, 7 Nov 2023 14:02:14 +0800 Subject: [PATCH 04/20] abacus-aibox-842 add the xpu expand embedding pool support --- paddle/fluid/framework/boxps_worker.cc | 8 + paddle/fluid/framework/data_feed.cc | 184 +++++ paddle/fluid/framework/data_feed.kps | 3 +- paddle/fluid/framework/data_set.cc | 11 +- .../fluid/framework/fleet/box_wrapper_impl.h | 233 ++++++- .../framework/fleet/box_wrapper_kernel.h | 8 +- .../framework/fleet/box_wrapper_kernel.kps | 631 +++++++++++++++++- .../fluid/inference/api/analysis_predictor.cc | 1 + .../operators/collective/c_mixallgather_op.cc | 33 + .../fused/fused_seqpool_cvm_op_xpu.cc | 21 +- .../operators/pull_box_extended_sparse_op.h | 272 +++++++- .../fluid/operators/rank_attention_op_xpu.cc | 19 + .../fluid/platform/device/xpu/xpu2_op_list.h | 4 + 13 files changed, 1391 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/framework/boxps_worker.cc b/paddle/fluid/framework/boxps_worker.cc index 17fffb84077a2..63ca8931f19f9 100644 --- a/paddle/fluid/framework/boxps_worker.cc +++ b/paddle/fluid/framework/boxps_worker.cc @@ -600,6 +600,14 @@ void BoxPSWorker::CreateDeviceResource(const ProgramDesc& main_prog) { var_num += 1; } } + // printf("[hsq] name: %s in BoxPSWorker::CreateDeviceResource\n", name.c_str()); + // printf("[hsq] sync_mode_:%d, dense_table_:%p\n", sync_mode_, dense_table_); + // printf("[hsq] root_tensor.numel(): %d\n", (int)root_tensor.numel()); + // const void* p = root_tensor.data(); + // printf("[hsq] root_tensor.data(): %p\n",p); + // std::cout<<"[hsq] place_: "<data(); + // printf("[hsq] gpu_tensor->data(): %p\n",p1); if (!gpu_tensor->initialized() && place_ == root_tensor.place()) { auto dim = root_tensor.dims(); gpu_tensor->ShareDataWith(root_tensor).Resize(dim); diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 23cb676589927..7257f2ef238c1 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -3595,6 +3595,8 @@ int SlotPaddleBoxDataFeed::GetCurrentPhase() { return box_ptr->Phase(); } } + +#include void SlotPaddleBoxDataFeed::GetRankOffsetGPU(const int pv_num, const int ins_num) { #if defined(PADDLE_WITH_CUDA) && defined(_LINUX) || defined(PADDLE_WITH_XPU_KP) && !defined(CPU_DATA_FEED) @@ -3609,9 +3611,191 @@ void SlotPaddleBoxDataFeed::GetRankOffsetGPU(const int pv_num, value.d_ad_offset.data(), col); #elif defined(PADDLE_WITH_XPU_KP) + // if(this->place_.GetDeviceId()==0) + // { + // std::vector h_mat(rank_offset_->numel()); + // std::vector h_rank(value.d_rank.numel()); + // std::vector h_cmatch(value.d_cmatch.numel()); + // std::vector h_ad_offset(value.d_ad_offset.numel()); + // xpu_memcpy(h_mat.data(), tensor_ptr, rank_offset_->numel() * sizeof(int), XPU_DEVICE_TO_HOST); + // xpu_memcpy(h_rank.data(), value.d_rank.data(), value.d_rank.numel() * sizeof(int), XPU_DEVICE_TO_HOST); + // xpu_memcpy(h_cmatch.data(), value.d_cmatch.data(), value.d_cmatch.numel() * sizeof(int), XPU_DEVICE_TO_HOST); + // xpu_memcpy(h_ad_offset.data(), value.d_ad_offset.data(), value.d_ad_offset.numel() * sizeof(int), XPU_DEVICE_TO_HOST); + + // printf("[hsq] ins_num:%d, pv_num:%d, max_rank:%d, col:%d\n", ins_num, pv_num, max_rank, col); + + // std::cout<<"[hsq] h_ad_offset: ["; + // for (int i = 0; i < (int)h_ad_offset.size(); i++) { + // std::cout<place_.GetDeviceId(); + // std::ofstream fo; + // fo.open("h_num_"+std::to_string(id)+".txt"); + // fo << ins_num << " " << pv_num << " " << 
max_rank << " " << col << " "; + // fo.close(); + + // fo.open("h_mat_"+std::to_string(id)+".txt"); + // fo << (int)h_mat.size() << " "; + // for (int i = 0; i < (int)h_mat.size(); i++) { + // fo << h_mat[i] << " ";//-1.0~1.0 + // } + // fo.close(); + + // fo.open("h_rank_"+std::to_string(id)+".txt"); + // fo << (int)h_rank.size() << " "; + // for (int i = 0; i < (int)h_rank.size(); i++) { + // fo << h_rank[i] << " ";//-1.0~1.0 + // } + // fo.close(); + + // fo.open("h_cmatch_"+std::to_string(id)+".txt"); + // fo << (int)h_cmatch.size() << " "; + // for (int i = 0; i < (int)h_cmatch.size(); i++) { + // fo << h_cmatch[i] << " ";//-1.0~1.0 + // } + // fo.close(); + + // fo.open("h_ad_offset_"+std::to_string(id)+".txt"); + // fo << (int)h_ad_offset.size() << " "; + // for (int i = 0; i < (int)h_ad_offset.size(); i++) { + // fo << h_ad_offset[i] << " ";//-1.0~1.0 + // } + // fo.close(); + // } + +// if(this->place_.GetDeviceId()==0) { +// int pv_num_2; +// int ins_num_2; +// int max_rank_2; +// int cols_2; +// std::vector h_mat; +// std::vector h_ad_rank; +// std::vector h_cmatch; +// std::vector h_pv_offset; + +// std::ifstream fi; +// int size; + +// std::string id = std::to_string(this->place_.GetDeviceId()); +// fi.open("h_num_"+id+".txt"); +// fi >> ins_num_2 >> pv_num_2 >> max_rank_2 >> cols_2; +// fi.close(); +// printf("[hsq] ins_num: %d, pv_num: %d, max_rank:%d, col:%d\n", ins_num, pv_num, max_rank, col); +// printf("[hsq] ins_num_2: %d, pv_num_2: %d, max_rank_2:%d, cols_2:%d\n", ins_num_2, pv_num_2, max_rank_2, cols_2); + +// fi.open("h_mat_"+id+".txt"); +// fi >> size; +// if(size!=ins_num_2*cols_2) { +// printf("[hsq] error in h_mat size\n"); +// } +// for (int i = 0; i < size; i++) { +// int val; +// fi >> val; +// h_mat.push_back(val); +// } +// fi.close(); + +// fi.open("h_rank_"+id+".txt"); +// fi >> size; +// if(size!=ins_num_2) { +// printf("[hsq] error in h_ad_rank size, which %d should be %d\n", size, ins_num); +// } +// for (int i = 0; i < size; i++) { +// int val; +// fi >> val; +// h_ad_rank.push_back(val); +// } +// fi.close(); + +// fi.open("h_cmatch_"+id+".txt"); +// fi >> size; +// if(size!=ins_num_2) { +// printf("[hsq] error in h_cmatch size\n"); +// } +// for (int i = 0; i < size; i++) { +// int val; +// fi >> val; +// h_cmatch.push_back(val); +// } +// fi.close(); + +// fi.open("h_ad_offset_"+id+".txt"); +// fi >> size; +// if(size!=pv_num_2+1) { +// printf("[hsq] error in h_pv_offset size\n"); +// } +// for (int i = 0; i < size; i++) { +// int val; +// fi >> val; +// h_pv_offset.push_back(val); +// } +// fi.close(); +// // prepare buffer on xpu +// void *d_mat = nullptr; +// void *d_ad_rank = nullptr; +// void *d_cmatch = nullptr; +// void *d_pv_offset = nullptr; + +// xpu_malloc((void **)&d_mat, h_mat.size() * sizeof(int)); +// xpu_malloc((void **)&d_ad_rank, h_ad_rank.size() * sizeof(int)); +// xpu_malloc((void **)&d_cmatch, h_cmatch.size() * sizeof(int)); +// xpu_malloc((void **)&d_pv_offset, h_pv_offset.size() * sizeof(int)); + +// // copy input to xpu +// xpu_memcpy(d_mat, h_mat.data(), h_mat.size() * sizeof(int), XPUMemcpyKind::XPU_HOST_TO_DEVICE); +// xpu_memcpy(tensor_ptr, h_mat.data(), h_mat.size() * sizeof(int), XPUMemcpyKind::XPU_HOST_TO_DEVICE); +// xpu_memcpy(d_ad_rank, h_ad_rank.data(), h_ad_rank.size() * sizeof(int), XPUMemcpyKind::XPU_HOST_TO_DEVICE); +// xpu_memcpy(d_cmatch, h_cmatch.data(), h_cmatch.size() * sizeof(int), XPUMemcpyKind::XPU_HOST_TO_DEVICE); +// xpu_memcpy(d_pv_offset, h_pv_offset.data(), h_pv_offset.size() * sizeof(int), 
XPUMemcpyKind::XPU_HOST_TO_DEVICE); + +// std::cout<<"[hsq] place: "<place_<place_, tensor_ptr, ins_num_2, pv_num_2, max_rank_2, + // (int*)d_ad_rank, (int*)d_cmatch, + // (int*)d_pv_offset, cols_2); + +// // std::vector h_mat_out(ins_num_2*cols_2); +// // xpu_memcpy(h_mat_out.data(), d_mat, h_mat_out.size() * sizeof(int), XPUMemcpyKind::XPU_DEVICE_TO_HOST); + +// // std::cout<<"[hsq] mat_out: ["; +// // for (int i = 0; i < ins_num_2*cols_2; i++) { +// // std::cout<place_); + auto ctx = static_cast(dev_ctx)->x_context(); + int r = xpu::constant(ctx, tensor_ptr, rank_offset_->numel(), 0); + PADDLE_ENFORCE_EQ(r, + XPU_SUCCESS, + platform::errors::External( + "XPU constant kernel return wrong value[%d %s]", + r, + XPUAPIErrorMsg[r])); DataFeedPdboxXpuKernelHelper::CopyRankOffset(this->place_, tensor_ptr, ins_num, pv_num, max_rank, value.d_rank.data(), value.d_cmatch.data(), value.d_ad_offset.data(), col); + // std::vector h_mat_out(ins_num*col); + // xpu_memcpy(h_mat_out.data(), tensor_ptr, h_mat_out.size() * sizeof(int), XPUMemcpyKind::XPU_DEVICE_TO_HOST); + + // if(this->place_.GetDeviceId()==0) { + // std::cout<<"[hsq] mat_out: ["; + // for (int i = 0; i < ins_num; i++) { + // for( int j = 0; j < col; j++ ) { + // std::cout<(dev_ctx) ->x_context() ->xpu_stream; - CopyRankOffsetKernel<<<4, 64, stream>>>(dest, ranks, cmatchs, ad_offsets, ins_num, pv_num, max_rank, cols); + CopyRankOffsetKernel<<<8, 64, stream>>>(dest, ranks, cmatchs, ad_offsets, pv_num, ins_num, max_rank, cols); xpu_wait(stream); } diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index 3aaef2faddefc..d0129cd9151dd 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -2114,8 +2114,11 @@ void PadBoxSlotDataset::PreLoadIntoDisk(const std::string& path, } CHECK(slot_pool_ != nullptr) << "slotrecord pool nullptr"; read_ins_ref_ = thread_num_; + if (disable_shuffle_) { + read_ins_ref_ = 1; + } CHECK(down_pool_ != nullptr) << "down_pool nullptr"; - for (int64_t i = 0; i < thread_num_; ++i) { + for (int64_t i = 0; i < read_ins_ref_; ++i) { wait_futures_.emplace_back(down_pool_->Run([this, i]() { platform::Timer timer; timer.Start(); @@ -2785,8 +2788,10 @@ void PadBoxSlotDataset::PrepareTrain(void) { // join or aucrunner mode enable pv if (enable_pv_merge_ && (box_ptr->Phase() & 0x01 == 1 || box_ptr->Mode() == 1)) { - std::shuffle(input_pv_ins_.begin(), input_pv_ins_.end(), - BoxWrapper::LocalRandomEngine()); + if (!disable_random_update_) { + std::shuffle(input_pv_ins_.begin(), input_pv_ins_.end(), + BoxWrapper::LocalRandomEngine()); + } // 分数据到各线程里面 int batchsize = reinterpret_cast(readers_[0].get()) ->GetPvBatchSize(); diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h index 13f8bf1155c81..291bb0036af0f 100644 --- a/paddle/fluid/framework/fleet/box_wrapper_impl.h +++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h @@ -352,6 +352,56 @@ void BoxWrapper::PullSparseCaseCPU(const paddle::platform::Place& place, all_timer.Pause(); } +template +void hsq_dump(void* d_ptr, + int len, + std::string path, + bool need_print, + int oneline_count, + int print_len, + std::string print_name, + std::string mode = "") { + std::vector h_buf(len); + xpu_memcpy(h_buf.data(), d_ptr, h_buf.size() * sizeof(T), XPU_DEVICE_TO_HOST); + + std::ofstream fo; + if(mode=="app") { + fo.open(path, std::ofstream::app); + } else { + fo.open(path); + } + if(oneline_count) { + for (int i = 0; i < (int)h_buf.size()/oneline_count; i++) 
{ + for(int j=0; j& keys, const std::vector& values, @@ -437,6 +487,14 @@ void BoxWrapper::PullSparseCaseXPU(const paddle::platform::Place& place, TRACE_SCOPE_START("PullSparseXPU", xpu_wait(ctx_xpu->xpu_stream)); #endif pull_boxps_timer.Start(); + static int target_id = std::getenv("HSQ_XPURT_TARGET_DEVICE")!=NULL ? + std::stoi(std::string(std::getenv("HSQ_XPURT_TARGET_DEVICE"))) : + 0; + int dev_id = place.GetDeviceId();//xpu_ctx->dev().id(); +// if(dev_id==target_id) { +// printf("[hsq] dev_id:%d, 2.going to call boxps_ptr_->PullSparseXPU\n", dev_id); +// printf("[hsq] total_length: %d, feature_pull_size_: %d, total_bytes: %d\n", (int)total_length, (int)feature_pull_size_, (int)total_bytes); +// } boxps_ptr_->PullSparseXPU(total_keys, total_values_xpu, static_cast(total_length), device_id); pull_boxps_timer.Pause(); @@ -458,6 +516,9 @@ void BoxWrapper::PullSparseCaseXPU(const paddle::platform::Place& place, } else { pull_offset = dev.pull_offset.data(); } +// if(dev_id==target_id) { +// printf("[hsq] pull_offset.expand_size: %d, pull_offset.expand: %d\n", pull_info_.expand_size, pull_info_.expand); +// } float** xpu_values = dev.values_ptr_tensor.mutable_data( static_cast(values.size() * sizeof(float*)), place); @@ -467,10 +528,64 @@ void BoxWrapper::PullSparseCaseXPU(const paddle::platform::Place& place, #ifdef TRACE_PROFILE TRACE_SCOPE_START("CopyForPull", xpu_wait(ctx_xpu->xpu_stream)); #endif +// if(dev_id==target_id) { +// printf("[hsq] dev_id:%d, 3.going to call box_wrapper_kernel_->CopyForPull\n", dev_id); + +// // std::vector h_key2slot(total_length); +// // xpu_memcpy(h_key2slot.data(), key2slot, h_key2slot.size() * sizeof(int), XPU_DEVICE_TO_HOST); +// // std::cout<<"[hsq] box_wrapper_kernel_->CopyForPull's key2slot: ["; +// // for(int i =0;i<300;i++) { +// // std::cout<CopyForPull(place, xpu_keys, (float**)values.data(), total_values_xpu, pull_offset, slot_lengths_lod.data(), slot_num, key2slot, hidden_size, expand_embed_dim, total_length, total_dims, skip_offset, expand_only); + static int target_count = std::getenv("HSQ_BOXPS_TARGET_COUNT")!=NULL ? 
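// hsq_dump<T> (defined above) is a debug helper: it copies `len` elements of
// type T from an XPU device buffer to the host with xpu_memcpy and writes
// them to `path` as text, `oneline_count` values per row (appending when
// mode == "app"). An illustrative call, with the buffer name and sizes
// assumed for illustration rather than taken from this patch:
//   hsq_dump<float>(values[0], slot_lengths[0] * hidden_size,
//                   "dev0_pull_emb0.txt", /*need_print=*/false,
//                   /*oneline_count=*/hidden_size, /*print_len=*/100,
//                   /*print_name=*/"pull_emb0");
// Dumping an in-flight buffer is only meaningful after an xpu_wait on the
// producing stream, since the copy reads device memory directly.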
+ std::stoi(std::string(std::getenv("HSQ_BOXPS_TARGET_COUNT"))) : + 0; + static int count = 0; + if(dev_id==target_id && count==target_count) { + for(int i=0;i(values[i], + slot_lengths[i]*hidden_size, + file_path, + false, // need_print + hidden_size, // oneline_count + 100, + file_path); // print_name + } + + for(int i=slot_num;i<2*slot_num;i++) { + if(values[i]==nullptr) + continue; + std::string file_path = "dev"+std::to_string(dev_id)+"_count"+std::to_string(count)+"_pull_copy_output_expand"+std::to_string(i)+".txt"; + hsq_dump(values[i], + slot_lengths[i-slot_num]*expand_embed_dim, + file_path, + false, // need_print + expand_embed_dim, // oneline_count + 100, + file_path); // print_name + } + } + if(dev_id==target_id) { + count ++; + } #ifdef TRACE_PROFILE TRACE_SCOPE_END("CopyForPull", xpu_wait(ctx_xpu->xpu_stream)); TRACE_SCOPE_END("pull copy", xpu_wait(ctx_xpu->xpu_stream)); @@ -731,16 +846,110 @@ void BoxWrapper::PushSparseGradCaseXPU(const paddle::platform::Place& place, TRACE_SCOPE_START("CopyForPush", xpu_wait(ctx_xpu->xpu_stream)); #endif - float* real_grad_values; - for (int i = 0; i < slot_num; i++) { - if(grad_values[i] != nullptr) { - real_grad_values = const_cast(grad_values[i]); - break; - } +// float* real_grad_values; +// for (int i = 0; i < slot_num; i++) { +// if(grad_values[i] != nullptr) { +// real_grad_values = const_cast(grad_values[i]); +// break; +// } +// } + std::vector slot_inner_offset(total_length); + int out_count = 0; + for(int i=0;idev().id(); + + static int target_count = std::getenv("HSQ_BOXPS_TARGET_COUNT")!=NULL ? + std::stoi(std::string(std::getenv("HSQ_BOXPS_TARGET_COUNT"))) : + 0; + static int count = 18; + if(dev_id==target_id && count==target_count) { + for(int i=0;i((void*)grad_values[i], + slot_lengths[i]*hidden_size, + file_path, + false, // need_print + hidden_size, // oneline_count + 100, + file_path); // print_name + } + + for(int i=slot_num;i<2*slot_num;i++) { + if(grad_values[i]==nullptr) + continue; + std::string file_path = "dev"+std::to_string(dev_id)+"_count"+std::to_string(count)+"_push_copy_input_expand"+std::to_string(i)+".txt"; + hsq_dump((void*)grad_values[i], + slot_lengths[i-slot_num]*expand_embed_dim, + file_path, + false, // need_print + expand_embed_dim, // oneline_count + 100, + file_path); // print_name + } + std::string file_path = "dev"+std::to_string(dev_id)+"_count"+std::to_string(count)+"_push_copy_input_key2slot.txt"; + hsq_dump((void*)key2slot, + (int)total_length, + file_path, + false, // need_print + expand_embed_dim, // oneline_count + 100, + file_path); + file_path = "dev"+std::to_string(dev_id)+"_count"+std::to_string(count)+"_push_copy_input_slot.txt"; + hsq_dump((void*)slot_vector, + (int)slot_num, + file_path, + false, // need_print + 1, // oneline_count + 100, + file_path); + + file_path = "dev"+std::to_string(dev_id)+"_count"+std::to_string(count)+"_push_copy_input_slot_inner_offset.txt"; + hsq_dump((void*)d_slot_inner_offset, + (int)total_length, + file_path, + false, // need_print + 1, // oneline_count + 100, + file_path); } - box_wrapper_kernel_->CopyForPush(place, real_grad_values, total_grad_values_xpu, - push_offset, total_length, slot_vector, slot_lens, slot_num, - hidden_size, batch_size, total_dims, skip_offset, key2slot); + + box_wrapper_kernel_->CopyForPush(place, xpu_values, total_grad_values_xpu, + push_offset, total_length, slot_vector, (int*)d_slot_inner_offset, slot_lens, slot_num, + hidden_size, batch_size, total_dims, skip_offset, key2slot, + expand_embed_dim, + push_float_num_, + 
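// Note on the widened CopyForPush interface used below: it now receives the
// per-slot gradient pointers themselves (float** xpu_values) plus
// d_slot_inner_offset, one int per key that appears to give the key's row
// index inside its own slot tensor (PushCopyNNCross uses it to gather the
// expand-gradient row), while expand_embed_dim and push_float_num_ describe
// the expand row width and the packed destination record width.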
expand_only); + + if(dev_id==target_id && count==target_count) { + std::string file_path = "dev"+std::to_string(dev_id)+"_count"+std::to_string(count)+"_push_copy_output.txt"; + hsq_dump(total_grad_values_xpu, + total_length*push_float_num_, + file_path, + false, // need_print + push_float_num_, // oneline_count + 100, + file_path); // print_name + } + if(dev_id==target_id) { + count ++; + } + push_boxps_timer.Resume(); #ifdef TRACE_PROFILE @@ -749,9 +958,15 @@ void BoxWrapper::PushSparseGradCaseXPU(const paddle::platform::Place& place, TRACE_SCOPE_START("PushSparseXPU", xpu_wait(ctx_xpu->xpu_stream)); #endif +// if(dev_id==target_id){ +// printf("[hsq] going to call boxps_ptr_->PushSparseXPU\n"); +// } int ret = boxps_ptr_->PushSparseXPU(total_keys, reinterpret_cast(total_grad_values_xpu), static_cast(total_length), device_id); + // int ret = 0; + // total_keys = total_keys; + PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( "PushSparseXPU failed in BoxPS.")); push_boxps_timer.Pause(); diff --git a/paddle/fluid/framework/fleet/box_wrapper_kernel.h b/paddle/fluid/framework/fleet/box_wrapper_kernel.h index 9b7066c3c4115..88adcb870b4b5 100644 --- a/paddle/fluid/framework/fleet/box_wrapper_kernel.h +++ b/paddle/fluid/framework/fleet/box_wrapper_kernel.h @@ -48,18 +48,22 @@ void CopyForPull( void CopyForPush( const paddle::platform::Place& place, - float* gm_src_ptr, + float** gm_src_ptr, void* total_grad_values_xpu, boxps::FeaturePushOffset* push_offset, const int64_t total_length, const int* slots, + const int* slot_inner_offset, const int64_t* slot_lens, const int slot_num, const int hidden_size, const int batch_size, const int* total_dims, const int skip_offset, - const int* key2slot); + const int* key2slot, + const int expand_embed_dim, + const int push_float_num, + bool expand_only); public: const static int MAX_SLOT_SIZE = 10240; diff --git a/paddle/fluid/framework/fleet/box_wrapper_kernel.kps b/paddle/fluid/framework/fleet/box_wrapper_kernel.kps index 6958248a30dcf..68160fd1f492b 100644 --- a/paddle/fluid/framework/fleet/box_wrapper_kernel.kps +++ b/paddle/fluid/framework/fleet/box_wrapper_kernel.kps @@ -30,6 +30,9 @@ limitations under the License. */ #include "xpu/kernel/xtdk_simd.h" #ifdef TRACE_PROFILE +#include "xpu/kernel/xtdk_io.h" +#include + // The producer side. 
#include #include @@ -58,6 +61,15 @@ struct EmbedxNormalOp { } }; +struct ExpandPushGetOp { + __device__ float get(float* expand, const int& row, + const int& expand_id, + const int& /**hidden*/, + const int& expand_dim) const { + return expand[row * expand_dim + expand_id]; + } +}; + template __device__ void set_byfloat(float* dest, const T& val) { (*reinterpret_cast(dest)) = val; @@ -205,6 +217,245 @@ void BoxWrapperKernel::CopyKeys(const paddle::platform::Place& place, xpu_wait(stream); } +template +__global__ void PullCopyNNCross(const TEmbedxOp* op, + const float scale, + const boxps::FeaturePullOffset* info, + int* total_dims, + unsigned long long* dst_vals, + const int* key2slot, + float* total_values, + const uint32_t* restore_idx, + const int total_length, + const int max_cols_num, + const int hidden_size, + const int expand_embed_dim, + const int pull_float_num, + const int skip_offset, + const int cvm_offset, + const int slot_num){ + int cid = core_id(); + int ncores = core_num(); + if (cid >= ncores) { + return; + } + int thread_id = cluster_id() * ncores + cid; + int nthreads = cluster_num() * ncores; + + const int buf_length = 5; + int per_thread_len = roundup_div(total_length, nthreads); + int per_thread_loop_count = roundup_div(per_thread_len, buf_length); + int per_thread_per_loop_len = roundup_div(per_thread_len, per_thread_loop_count); + + __local__ float lm_total_values[buf_length * pull_float_num]; + __local__ float lm_dst_vals[buf_length * hidden_size]; + __local__ float lm_dst_expand_vals[buf_length * expand_embed_dim]; + __local__ int lm_key2slot[buf_length]; + __local__ int lm_total_dims[buf_length]; + __local__ uint32_t lm_restore_idx[buf_length]; + __local__ boxps::FeaturePullOffset lm_info[1]; + __local__ TEmbedxOp lm_op[1]; + + const int max_slot_num = 1000; + int sm_slot_len = min(max_slot_num, slot_num); + __shared__ uint64_t sm_dst_vals_ptr[max_slot_num]; + __shared__ uint64_t sm_dst_expand_vals_ptr[max_slot_num]; + for (int i = cid; i < sm_slot_len; i += ncores) { + GM2SM(dst_vals + i, sm_dst_vals_ptr + i, sizeof(uint64_t)); + GM2SM(dst_vals + slot_num + i, sm_dst_expand_vals_ptr + i, sizeof(uint64_t)); + } + mfence(); + xpu_sync_all(); + + // if(thread_id==0) { + // printf("[hsq] max_slot_num:%d, slot_num:%d\n", max_slot_num, slot_num); + // printf("[hsq] sm_slot_len:%d\n", sm_slot_len); + // //TODO: why this line will error? + // // printf("[hsq] max_slot_num:%d, slot_num:%d, sm_slot_len:%d\n", max_slot_num, slot_num, sm_slot_len); + // for(int i=0;i= total_length) { + return; + } + int len = min(per_thread_per_loop_len, total_length - gm_offset); + // TODO: why disable restore_idx? + if(restore_idx != nullptr) { + GM2LM(restore_idx + gm_offset, lm_restore_idx, len * sizeof(uint32_t)); + } + int pos = (restore_idx != nullptr) ? lm_restore_idx[gm_offset] : gm_offset; + GM2LM(total_values + pos * pull_float_num, lm_total_values, len * pull_float_num * sizeof(float)); + GM2LM(total_dims + gm_offset, lm_total_dims, len * sizeof(int)); + GM2LM(key2slot + gm_offset, lm_key2slot, len * sizeof(int)); + // int len = min(per_thread_per_loop_len, total_length - gm_offset); + // GM2LM(total_values + gm_offset * pull_float_num, lm_total_values, len * pull_float_num * sizeof(float)); + // GM2LM(total_dims + gm_offset, lm_total_dims, len * sizeof(int)); + + for (int j = 0; j < len; j++) { + for (int k = 0; k < cvm_offset; ++k) { + //TODO:consider xpu_value[slot_id]==nullptr? 
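// Tiling scheme of this kernel, shared with the other KPS kernels in this
// file: rows are split across cluster_num() * core_num() hardware threads,
// and each thread walks its share in tiles of buf_length rows, staging
// inputs into __local__ buffers with GM2LM and writing results back with
// LM2GM. The index math, restated from the code:
//   per_thread_len = roundup_div(total_length, nthreads);
//   loop_count     = roundup_div(per_thread_len, buf_length);
//   per_loop_len   = roundup_div(per_thread_len, loop_count);
//   tile_start(i)  = thread_id * per_thread_len + i * per_loop_len;
//   tile_len(i)    = min(per_loop_len, total_length - tile_start(i));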
+ lm_dst_vals[j * hidden_size + k] = lm_total_values[j * pull_float_num + lm_info[0].show + skip_offset + k]; + } + // embedx + // embedx flags + expand flags && *(keys[x] + y) != 0 && *(keys[x] + y) + int embedx_size = *((int *)&(lm_total_values[j * pull_float_num + lm_info[0].embedx_size])); + // int embedx_size = 0; + // TODO: expand_size = expand_embed_dim? + int expand_size = *((int *)&(lm_total_values[j * pull_float_num + lm_info[0].expand_size])); + // int expand_size = 0; + lm_total_dims[j] = static_cast(embedx_size > 0) | static_cast((expand_size > 0) << 1); + + if (sm_dst_vals_ptr[lm_key2slot[j]] != 0) { + for (int k = cvm_offset; k < cvm_offset + embedx_size; ++k) { + lm_op[0].copy(lm_dst_vals + j * hidden_size + k, + lm_total_values + j * pull_float_num + lm_info[0].embedx, + k - cvm_offset, + scale); + } + for (int k = cvm_offset + embedx_size; k < hidden_size; ++k) { + lm_dst_vals[j * hidden_size + k] = 0; + } + } + + if (sm_dst_expand_vals_ptr[lm_key2slot[j]] == 0) { + continue; + } + for (int k = hidden_size; k < hidden_size + expand_size; ++k) { + // op.copy(&dest_ptr[expand_id], &src_val[info->expand], expand_id, scale); + lm_op[0].copy(lm_dst_expand_vals + j * expand_embed_dim + k-hidden_size, + lm_total_values + j * pull_float_num + lm_info[0].expand, + k - hidden_size, + scale); + } + for (int k = hidden_size + expand_size; k < max_cols_num; ++k) { + lm_dst_expand_vals[j * expand_embed_dim + k-hidden_size] = 0; + } + } + mfence(); + + // if(i == 0) { + // printf("[hsq] lm_dst_vals[0]:%lp\n", lm_dst_vals[0]); + // } + LM2GM(lm_total_dims, total_dims + gm_offset, len * sizeof(int)); + // LM2GM(lm_dst_vals, dst_vals + gm_offset * hidden_size, len * hidden_size * sizeof(float)); + // LM2GM(lm_dst_expand_vals, dst_vals + total_length * hidden_size+ gm_offset * expand_embed_dim, len * expand_embed_dim * sizeof(float)); + LM2GM(lm_dst_vals, ((__global_ptr__ float*)lm_dst_vals_ptr[0] + gm_offset * hidden_size), len * hidden_size * sizeof(float)); + LM2GM(lm_dst_expand_vals, ((__global_ptr__ float*)lm_dst_vals_ptr[0] + total_length * hidden_size + gm_offset * expand_embed_dim), len * expand_embed_dim * sizeof(float)); + } +} + +template +inline void FeaturePullCopyNNCross( + const paddle::platform::Place& place, + const TEmbedxOp* op, + const float scale, + const boxps::FeaturePullOffset* info, + int* total_dims, + float** xpu_values, // const std::vector& values, + const int* key2slot, + // uint64_t* total_keys_xpu,//useless + float* total_values_xpu, + const uint32_t* xpu_restore_idx, + const int64_t* slot_lens, + const int slot_num, + const int total_length, + const int hidden_size, + const int expand_embed_dim, + const int pull_float_num, + const int skip_offset, + const int cvm_offset, + bool expand_only) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + auto ctx_xpu = static_cast(dev_ctx)->x_context(); + auto stream = ctx_xpu->xpu_stream; + + auto d_op_tmp = memory::Alloc(place, sizeof(TEmbedxOp)); + TEmbedxOp* d_op = reinterpret_cast(d_op_tmp->ptr()); + memory::Copy(place, + d_op, + platform::CPUPlace(), + op, + sizeof(TEmbedxOp)); +#ifdef TRACE_PROFILE + TRACE_SCOPE_START("PullCopyNNCross", xpu_wait(stream)); +#endif + // float* real_dst_vals; + // for (int i = 0; i < slot_num; i++) { + // if(xpu_values[i] != nullptr) { + // real_dst_vals = xpu_values[i]; + // break; + // } + // } + void *d_xpu_values = nullptr; + xpu_malloc((void **)&d_xpu_values, slot_num * 2 * sizeof(float*)); + xpu_memcpy(d_xpu_values, xpu_values, slot_num * 2 * 
sizeof(float*), XPU_HOST_TO_DEVICE); + + // total_values_xpu->(xpu_values[slot_id], total_dims[slot_id]) + if (expand_only) { + static int target_id = std::getenv("HSQ_XPURT_TARGET_DEVICE")!=NULL ? + std::stoi(std::string(std::getenv("HSQ_XPURT_TARGET_DEVICE"))) : + 0; + // if(place.GetDeviceId()==target_id) { + // printf("[hsq] total_values_xpu ptr: %p, total_length:%d, hidden_size:%d, expand_embed_dim:%d, pull_float_num:%d, skip_offset:%d, cvm_offset:%d, expand_only:%d, slot_num:%d\n", total_values_xpu, total_length, hidden_size, expand_embed_dim, pull_float_num, skip_offset, cvm_offset, expand_only, slot_num); + // // std::vector h_total_values(total_length*pull_float_num); + // // xpu_memcpy(h_total_values.data(), total_values_xpu, sizeof(float) * total_length*pull_float_num, XPUMemcpyKind::XPU_DEVICE_TO_HOST); + // // for (int i = 0; i < 100; i++) { + // // printf("[hsq] i:%d, slot:%d, show:%f, clk:%f, embedding_size:%d, expand_size:%d\n", i, h_total_values[i*pull_float_num], h_total_values[i*pull_float_num+1], h_total_values[i*pull_float_num+2], h_total_values[i*pull_float_num+4], h_total_values[i*pull_float_num+13]); + // // } + // } + PullCopyNNCross<<<8, 64, stream>>>(d_op, + scale, + info, + total_dims, + reinterpret_cast(d_xpu_values), + key2slot, + total_values_xpu, + xpu_restore_idx, + total_length, + (hidden_size + expand_embed_dim), + hidden_size, + expand_embed_dim, + pull_float_num, + skip_offset, + cvm_offset, + slot_num); + xpu_free(d_xpu_values); + } else { + // PullCopyNNCrossWithEmb + // TODO: + ; + } + xpu_wait(stream); +#ifdef TRACE_PROFILE + TRACE_SCOPE_END("PullCopyNNCross", ); + + TRACE_SCOPE_START("PullCopyNNCross's xpu::copy", xpu_wait(stream)); + xpu_wait(stream); + TRACE_SCOPE_END("PullCopyNNCross's xpu::copy",); +#endif +} + template __global__ void PullCopy(const TEmbedxOp* op, const float scale, @@ -257,6 +508,7 @@ __global__ void PullCopy(const TEmbedxOp* op, for (int j = 0; j < len; j++) { for (int k = 0; k < cvm_offset; ++k) { + // dont's need consider xpu_val[slot_id] == nullptr lm_dst_vals[j * hidden_size + k] = lm_total_values[j * pull_float_num + lm_info[0].show + skip_offset + k]; } // embedx @@ -318,6 +570,7 @@ inline void FeaturePullCopy(const paddle::platform::Place& place, break; } } + // total_values_xpu->(xpu_values[slot_id], total_dims[slot_id]) PullCopy<<<8, 64, stream>>>(d_op, scale, info, @@ -356,6 +609,42 @@ void BoxWrapperKernel::CopyForPull( const int cvm_offset = cvm_offset_ - skip_offset; if (pull_info_.is_quant) { EmbedxQuantOp op; + if(expand_embed_dim > 0 && pull_info_.expand_size > 0) {//nncross + FeaturePullCopyNNCross(place, + &op, + pull_embedx_scale_, + pull_offset, + total_dims, + xpu_values, + key2slot, + // total_keys_xpu,//useless + (float*)total_values_xpu, + xpu_restore_idx, + slot_lens, + slot_num, + (int)total_length, + hidden_size, + expand_embed_dim_, + (int)pull_float_num_, + skip_offset, + cvm_offset, + expand_only + // embedx_dim_,//useless + // gpu_keys,//useless + // //useless + ); + } else if (pull_info_.expand_size < 0 && + expand_embed_dim == cvm_offset + expand_embed_dim_ && + hidden_size == cvm_offset + embedx_dim_) { // var + // TODO: + // FeaturePullCopyVariable( + // op, pull_offset, pull_float_num_, stream, gpu_keys, gpu_values, + // total_values_gpu, hidden_size, embedx_dim_, expand_embed_dim_, + // total_length, total_dims, slot_lens, slot_num, key2slot, + // pull_embedx_scale_, cvm_offset, gpu_restore_idx, skip_offset); + } else { + // normal and adam + // total_values_xpu->(xpu_values[slot_id], 
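// CopyForPull dispatch, summarized: the nncross branch above runs when
// expand_embed_dim > 0 and pull_info_.expand_size > 0; the "var" branch
// (pull_info_.expand_size < 0) is left as a TODO; every other quantized
// case falls through to the plain FeaturePullCopy below, which scatters
// total_values_xpu into the per-slot xpu_values buffers and fills
// total_dims.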
total_dims[slot_id]) FeaturePullCopy(place, &op, pull_embedx_scale_, @@ -372,6 +661,7 @@ void BoxWrapperKernel::CopyForPull( (int)pull_float_num_, skip_offset, cvm_offset); + } } else { EmbedxNormalOp op; FeaturePullCopy(place, @@ -393,6 +683,281 @@ void BoxWrapperKernel::CopyForPull( } } +template +__global__ void PushCopyNNCross(const TExpandPushGetOp* op, + const boxps::FeaturePushOffset* info, + unsigned long long* total_values,//src, float ptr[2*slot_num] + const int* total_dims, + const int* key2slot, + const int* slot_vector, + const int* slot_inner_offset, + float* dst_vals, //dst, PushValueType ptr[slot_num] + const int total_length, + const int hidden_size,//src sizeof + const int expand_embed_dim,//src sizeof + const int slot_num, + const int push_float_num,//dst sizeof + const int cvm_offset, + const int skip_offset, + const int bs) { + int cid = core_id(); + int ncores = core_num(); + if (cid >= ncores) { + return; + } + int thread_id = cluster_id() * ncores + cid; + int nthreads = cluster_num() * ncores; + + const int buf_length = 5; + int per_thread_len = roundup_div(total_length, nthreads); + int per_thread_loop_count = roundup_div(per_thread_len, buf_length); + int per_thread_per_loop_len = roundup_div(per_thread_len, per_thread_loop_count); + + __local__ float lm_src_vals[buf_length * hidden_size]; + __local__ float lm_src_expand_vals[buf_length * expand_embed_dim]; + __local__ float lm_dst_vals[buf_length * push_float_num]; + __local__ int lm_total_dims[buf_length]; + __local__ int lm_key2slot[buf_length]; + __local__ int lm_slot_inner_offset[buf_length]; + __local__ boxps::FeaturePushOffset lm_info[1]; + __local__ TExpandPushGetOp lm_op[1]; + + // shared memory max 256 KB per cluster + const int max_slot_num = 1000; + __shared__ int sm_slots[max_slot_num]; + int sm_slot_len = min(max_slot_num, slot_num); + int lm_slot = -1; + for (int i = cid; i < sm_slot_len; i += ncores) { + mfence(); + GM2LM(slot_vector + i, &lm_slot, sizeof(int)); + sm_slots[i] = lm_slot; + } + + __shared__ uint64_t sm_src_vals_ptr[max_slot_num]; + __shared__ uint64_t sm_src_expand_vals_ptr[max_slot_num]; + for (int i = cid; i < sm_slot_len; i += ncores) { + GM2SM(total_values + i, sm_src_vals_ptr + i, sizeof(uint64_t)); + GM2SM(total_values + slot_num + i, sm_src_expand_vals_ptr + i, sizeof(uint64_t)); + } + mfence(); + xpu_sync_all(); + + // if(thread_id==0) { + // printf("[hsq] sm_slots:["); + // for(int i=0;i= total_length) { + return; + } + int len = min(per_thread_per_loop_len, total_length - gm_offset); + // if(i == 0) { + // printf("[hsq] lm_src_vals_ptr[0]:%lp, lm_src_expand_vals_ptr[0]:%lp\n", lm_src_vals_ptr[0], lm_src_expand_vals_ptr[0]); + // } + // GM2LM((__global_ptr__ void*)(lm_src_vals_ptr[0] + gm_offset * hidden_size), lm_src_vals, len * hidden_size * sizeof(float)); + // GM2LM((__global_ptr__ void*)(lm_src_expand_vals_ptr[0] + gm_offset * expand_embed_dim), lm_src_expand_vals, len * expand_embed_dim * sizeof(float)); + GM2LM(((__global_ptr__ float*)lm_src_vals_ptr[0] + gm_offset * hidden_size), lm_src_vals, len * hidden_size * sizeof(float)); + // GM2LM(((__global_ptr__ float*)lm_src_expand_vals_ptr[0] + gm_offset * expand_embed_dim), lm_src_expand_vals, len * expand_embed_dim * sizeof(float)); + GM2LM(total_dims + gm_offset, lm_total_dims, len * sizeof(int)); + GM2LM(key2slot + gm_offset, lm_key2slot, len * sizeof(int)); + GM2LM(slot_inner_offset+ gm_offset, lm_slot_inner_offset, len * sizeof(int)); + for (int j = 0; j < len; j++) { + //slot, k==0 + // lm_dst_vals[j * 
push_float_num] = sm_slots[lm_key2slot[j]]; + lm_slot = sm_slots[lm_key2slot[j]]; + set_byfloat(lm_dst_vals + j * push_float_num, lm_slot); + // skip + for (int k = 1; k < skip_offset + 1; ++k) { + lm_dst_vals[j * push_float_num + k] = 1.0; + } + // cvm + for (int k = skip_offset + 1; k < cvm_offset + 1 + skip_offset; ++k) { + if(sm_src_vals_ptr[lm_key2slot[j]] !=0){//src[x] != 0 + if (k < lm_info->embed_g) { // cvm + lm_dst_vals[j * push_float_num + k] = lm_src_vals[j * hidden_size + k-skip_offset - 1]; + } else { + lm_dst_vals[j * push_float_num + k] = lm_src_vals[j * hidden_size + k-skip_offset - 1] * -1 * bs; + } + } else { + if (k == lm_info->show) { // show + lm_dst_vals[j * push_float_num + k] = 1; + } else { // other + lm_dst_vals[j * push_float_num + k] = 0; + } + } + } + // hidden_size + // for size not equal hidden_size + for (int k = cvm_offset + 1 + skip_offset; k < 1 + skip_offset + hidden_size; ++k) { + if((lm_total_dims[j] & 0x01) && sm_src_vals_ptr[lm_key2slot[j]] !=0) { + lm_dst_vals[j * push_float_num + k] = lm_src_vals[j * hidden_size + k-skip_offset - 1] * -1 * bs; + } else { + lm_dst_vals[j * push_float_num + k] = 0; + } + } + // if(thread_id==0 && i == 0) { + // printf("[hsq] j:%d, push_float_num:%d, skip_offset:%d, hidden_size:%d\n", j, push_float_num, skip_offset, hidden_size); + // } + // if(lm_key2slot[j]==36){ + // printf("[hsq] j:%d, lm_total_dims[j]:%d, expand ptr: %lp, lm_slot_inner_offset[j]:%d\n", j, lm_total_dims[j], sm_src_expand_vals_ptr[lm_key2slot[j]], lm_slot_inner_offset[j]); + // } + if((lm_total_dims[j] & 0x02) && sm_src_expand_vals_ptr[lm_key2slot[j]] !=0) { + lm_src_expand_vals_ptr[0] = sm_src_expand_vals_ptr[lm_key2slot[j]]; + mfence(); + GM2LM(((__global_ptr__ float*)lm_src_expand_vals_ptr[0] + lm_slot_inner_offset[j] * expand_embed_dim), lm_src_expand_vals, expand_embed_dim * sizeof(float)); + mfence(); + for (int k = 1 + skip_offset + hidden_size; k < push_float_num; ++k) { + // __device__ float get(float* expand, const int& row, + // const int& expand_id, + // const int& /**hidden*/, + // const int& expand_dim) const { + // return expand[row * expand_dim + expand_id]; + // } + lm_dst_vals[j * push_float_num + k] = lm_op[0].get(lm_src_expand_vals, + 0, + k - skip_offset - 1 - hidden_size, + hidden_size, // hidden_size is useless + expand_embed_dim) * -1 * bs; + } + mfence(); + // TODO: why print lm_src_expand_vals[0] is -0.000000, actual is + // if(lm_key2slot[j]==36&&(lm_slot_inner_offset[j]==0||lm_slot_inner_offset[j]==1)){ + // printf("[hsq] lm_slot_inner_offset[j]:%d, expand_ptr:%lp, lm_expand[0]: %f, lm_expand[1]: %f, lm_expand_dst[0]:%f, lm_expand_dst[1]:%f\n", lm_slot_inner_offset[j], ((__global_ptr__ float*)lm_src_expand_vals_ptr[0] + lm_slot_inner_offset[j] * expand_embed_dim), lm_src_expand_vals[0], lm_src_expand_vals[1], lm_dst_vals[j * push_float_num+1 + skip_offset + hidden_size], lm_dst_vals[j * push_float_num+1 + skip_offset + hidden_size+1]); + // } + } else { + for (int k = 1 + skip_offset + hidden_size; k < push_float_num; ++k) { + lm_dst_vals[j * push_float_num + k] = 0; + } + } + // for (int k = 1 + skip_offset + hidden_size; k < push_float_num; ++k) { + // if((lm_total_dims[j] & 0x02) && sm_src_expand_vals_ptr[lm_key2slot[j]] !=0) {//TODO: exchange the for + // lm_dst_vals[j * push_float_num + k] = lm_op[0].get(lm_src_expand_vals, + // j, + // k - skip_offset - 1 - hidden_size, + // hidden_size, // hidden_size is useless + // expand_embed_dim) * -1 * bs; + // } else { + // lm_dst_vals[j * push_float_num + k] = 0; + // } + 
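// Expand-gradient path, restated: when total_dims bit 0x02 is set and the
// slot's expand pointer is non-null, the block above reloads that key's
// expand row (indexed by slot_inner_offset) and packs
//   lm_op.get(expand_row, /*row=*/0, k, hidden_size, expand_embed_dim) * -1 * bs
// into the push record, mirroring the -1 * bs scaling applied to the embed
// gradients earlier; all other keys push zeros for the expand columns.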
// } + } + LM2GM(lm_dst_vals, dst_vals + gm_offset * push_float_num,len * push_float_num * sizeof(float)); + } +} + +template +inline void FeaturePushCopyNNCross( + const paddle::platform::Place& place, + const TExpandPushGetOp* op, + const boxps::FeaturePushOffset* info, + float** gm_src, + const int* total_dims, + const int* key2slot, + const int* slot_vector, + const int* slot_inner_offset, + float* push_grad_values, + // uint64_t* total_keys_xpu,//useless + const int total_length, + const int hidden_size, + const int expand_embed_dim, + const int slot_num, + const int push_float_num, // dst sizeof + const int cvm_offset, + const int skip_offset, + const int bs, + bool expand_only) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + auto ctx_xpu = static_cast(dev_ctx)->x_context(); + auto stream = ctx_xpu->xpu_stream; + + auto d_op_tmp = memory::Alloc(place, sizeof(TExpandPushGetOp)); + TExpandPushGetOp* d_op = reinterpret_cast(d_op_tmp->ptr()); + memory::Copy(place, + d_op, + platform::CPUPlace(), + op, + sizeof(TExpandPushGetOp)); + +#ifdef TRACE_PROFILE + TRACE_SCOPE_START("PushCopyNNCross", xpu_wait(stream)); +#endif + if (expand_only) { + // TODO: + // if (d_sort_idx != nullptr){ + // } + // static int target_id = std::getenv("HSQ_XPURT_TARGET_DEVICE")!=NULL ? + // std::stoi(std::string(std::getenv("HSQ_XPURT_TARGET_DEVICE"))) : + // 0; + // if(place.GetDeviceId()==target_id) { + // printf("[hsq] dev_id:%d, total_length:%d, hidden_size:%d, expand_embed_dim:%d, slot_num:%d, push_float_num:%d, cvm_offset:%d, skip_offset:%d, bs:%d\n", place.GetDeviceId(), total_length, hidden_size, expand_embed_dim, slot_num, push_float_num, cvm_offset, skip_offset, bs); + // // [hsq] total_length:1491834, hidden_size:19, expand_embed_dim:64, slot_num:371, push_float_num:84, cvm_offset:3, skip_offset:0, bs:2048 + // } + PushCopyNNCross<<<8, 64, stream>>>(d_op, + info, + reinterpret_cast(gm_src),//src + total_dims, + key2slot, + slot_vector, + slot_inner_offset, + push_grad_values,//dst + total_length, + hidden_size, + expand_embed_dim, + slot_num, + push_float_num, + cvm_offset, + skip_offset, + bs); + } else { + // PullCopyNNCrossWithEmb + // TODO: + ; + } +#ifdef TRACE_PROFILE + xpu_wait(stream); + TRACE_SCOPE_END("PushCopyNNCross", ); +#endif +} + __global__ void PushCopy(float* src_vals, float* dest_vals, boxps::FeaturePushOffset* push_offset, @@ -437,6 +1002,21 @@ __global__ void PushCopy(float* src_vals, sm_slots[i] = lm_slot; } + // __shared__ float* sm_src_vals_ptr[max_slot_num]; + // for (int i = cid; i < sm_slot_len; i += ncores) { + // GM2SM(src_vals + i, sm_src_vals_ptr + i, sizeof(float*)); + // } + // mfence(); + // xpu_sync_all(); + + // __local__ float* lm_src_vals_ptr[1]; + // for(int i=0;i(dev_ctx) ->x_context() @@ -527,9 +1111,46 @@ void BoxWrapperKernel::CopyForPush( const int c_total_length = static_cast(total_length); float* push_grad_values = reinterpret_cast(total_grad_values_xpu); - PushCopy<<<8, 64, stream>>>(gm_src_ptr, push_grad_values, push_offset, - push_float_num_, c_total_length, hidden_size, batch_size, total_dims, - skip_offset, cvm_offset, key2slot, slots, slot_num); + if (expand_embed_dim > 0 && pull_info_.expand_size > 0) { // nncross + // FeaturePushCopyNNCross + ExpandPushGetOp op; + FeaturePushCopyNNCross(place, + &op, + push_offset, + gm_src_ptr, + total_dims, + key2slot, + slots, + slot_inner_offset, + push_grad_values, + c_total_length, + hidden_size, + expand_embed_dim, + slot_num, + push_float_num, // dst sizeof + cvm_offset, + 
skip_offset, + batch_size, + expand_only); + } else if (pull_info_.expand_size < 0 && + expand_embed_dim == cvm_offset + expand_embed_dim_ && + hidden_size == cvm_offset + embedx_dim_) { // var + // FeaturePushCopyVariable + // TODO: + } else { + // FeaturePushCopy + // TODO: + float* real_gm_src_ptr; + for (int i = 0; i < slot_num; i++) { + if(gm_src_ptr[i] != 0) { + real_gm_src_ptr = const_cast(gm_src_ptr[i]); + break; + } + } + PushCopy<<<8, 64, stream>>>(real_gm_src_ptr, push_grad_values, push_offset, + push_float_num_, c_total_length, hidden_size, batch_size, total_dims, + skip_offset, cvm_offset, key2slot, slots, slot_num); + } xpu_wait(stream); } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index d6d0f6a21ade3..a94704727f962 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1092,6 +1092,7 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_); } + argument_.SetXpuPrecision(config_.xpu_precision_); if (config_.lite_engine_enabled()) { argument_.SetCpuMathLibraryNumThreads( config_.cpu_math_library_num_threads()); diff --git a/paddle/fluid/operators/collective/c_mixallgather_op.cc b/paddle/fluid/operators/collective/c_mixallgather_op.cc index be6e6f414483d..f98ca2a8bd7a5 100644 --- a/paddle/fluid/operators/collective/c_mixallgather_op.cc +++ b/paddle/fluid/operators/collective/c_mixallgather_op.cc @@ -418,6 +418,39 @@ class CMixAllGatherOpXPUKernel : public framework::OpKernel { TRACE_SCOPE_START("bkcl_all_reduce", xpu_wait(stream)); #endif + + // static int target_id = std::getenv("HSQ_XPURT_TARGET_DEVICE")!=NULL ? + // std::stoi(std::string(std::getenv("HSQ_XPURT_TARGET_DEVICE"))) : + // 0; + // int dev_id = place.GetDeviceId();//xpu_ctx->dev().id(); + // if(dev_id==target_id) { + // printf("[hsq] c_mixallgather_op, in_tensors.size():%d, numel:%d\n", (int)in_tensors.size(), (int)numel); + // printf("[hsq] offset:["); + // int64_t offset = 0; + // for (size_t i = 0; i < in_tensors.size(); ++i) { + // int64_t len = in_tensors[i]->numel(); + // printf("%d, ", (int)offset); + // offset += len; + // } + // printf("]\n"); + + // std::vector h_recvbuff(numel); + // xpu_memcpy(h_recvbuff.data(), recvbuff, h_recvbuff.size() * sizeof(T), XPU_DEVICE_TO_HOST); + // std::cout<<"[hsq] before all_reduce recvbuff: ["; + // for (int i = 0; i < std::min((int)h_recvbuff.size(), 100); i++) { + // std::cout<comm(), recvbuff, diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op_xpu.cc b/paddle/fluid/operators/fused/fused_seqpool_cvm_op_xpu.cc index 39dda54d08fe6..38d164299a753 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op_xpu.cc +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op_xpu.cc @@ -61,7 +61,9 @@ template class FusedSeqpoolCVMOpXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { +#ifdef TRACE_PROFILE TRACE_SCOPE_START("FusedSeqpoolCVMOpXPUKernel Compute", xpu_wait(ctx.template device_context().x_context()->xpu_stream)); +#endif auto ins = ctx.MultiInput("X"); auto out = ctx.MultiOutput("Out"); std::string pooltype = ctx.Attr("pooltype"); @@ -111,6 +113,17 @@ class FusedSeqpoolCVMOpXPUKernel : public framework::OpKernel { paddle::platform::errors::InvalidArgument( "The output of dims[1] should be dividable of (w-2)")); } + // int w = ins[0]->numel() / x0_dims[0]; + // if(use_cvm) { + // 
PADDLE_ENFORCE_EQ(y_dims[1] % w, 0, + // paddle::platform::errors::InvalidArgument( + // "The output of dims[1] should be dividable of w")); + // } + // else{ + // PADDLE_ENFORCE_EQ(y_dims[1] % (w-2), 0, + // paddle::platform::errors::InvalidArgument( + // "The output of dims[1] should be dividable of (w-2)")); + // } std::vector cpu_x_addr_vec(slot_num, 0); std::vector cpu_y_addr_vec(slot_num, 0); @@ -130,8 +143,9 @@ class FusedSeqpoolCVMOpXPUKernel : public framework::OpKernel { } lod_index += x_lod.size(); } - +#ifdef TRACE_PROFILE TRACE_SCOPE_START("xpu::sequence_sum_pool_cvm", xpu_wait(xpu_context->xpu_stream);); +#endif int r = xpu::sequence_sum_pool_cvm(xpu_context, cpu_x_addr_vec, cpu_y_addr_vec, @@ -153,7 +167,7 @@ class FusedSeqpoolCVMOpXPUKernel : public framework::OpKernel { embed_thres_size); PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, platform::errors::External( - "The sequence_sum_pool_cvm_concat XPU OP return wrong value[%d %s]", + "The sequence_sum_pool_cvm XPU OP return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); TRACE_SCOPE_END("xpu::sequence_sum_pool_cvm", xpu_wait(xpu_context->xpu_stream);); TRACE_SCOPE_END("FusedSeqpoolCVMOpXPUKernel Compute", xpu_wait(xpu_context->xpu_stream)); @@ -164,7 +178,9 @@ template class FusedSeqpoolCVMGradOpXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { +#ifdef TRACE_PROFILE TRACE_SCOPE_START("FusedSeqpoolCVMGradOpXPUKernel Compute", xpu_wait(ctx.template device_context().x_context()->xpu_stream)); +#endif auto dOut = ctx.MultiInput(framework::GradVarName("Out")); auto xs = ctx.MultiInput("X"); const Tensor* cvm = ctx.Input("CVM"); @@ -205,6 +221,7 @@ class FusedSeqpoolCVMGradOpXPUKernel : public framework::OpKernel { total_values.set_offset(offset); dx->ShareBufferWith(total_values); offset += dx->numel() * sizeof(T); + // printf("[hsq] xs[%d]!=UNDEFINED\n", k); } T* dx_data = dx->mutable_data(place); // T* dx_data = dx->mutable_data(place); diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.h b/paddle/fluid/operators/pull_box_extended_sparse_op.h index 15fa2c598dc2c..22996763f170f 100644 --- a/paddle/fluid/operators/pull_box_extended_sparse_op.h +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.h @@ -18,6 +18,7 @@ #include "paddle/fluid/framework/fleet/box_wrapper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/tensor_formatter.h" namespace paddle { namespace operators { @@ -25,6 +26,7 @@ namespace operators { template static void PullBoxExtendedSparseFunctor( const framework::ExecutionContext &ctx) { + // printf("[hsq] hi from PullBoxExtendedSparseFunctor\n"); auto inputs = ctx.MultiInput("Ids"); auto outputs = ctx.MultiOutput("Out"); auto outputs_extend = ctx.MultiOutput("OutExtend"); @@ -33,6 +35,103 @@ static void PullBoxExtendedSparseFunctor( const auto slot_size = inputs.size(); std::vector all_keys(slot_size); + int total_dims0 = 0; + for (size_t i = 0; i < outputs.size(); ++i) { + total_dims0 += outputs[i]->dims()[0]; + } + int total_expand_dims0 = 0; + for (size_t i = 0; i < outputs_extend.size(); ++i) { + total_expand_dims0 += outputs_extend[i]->dims()[0]; + } + + int max_total_dims0 = total_dims0; + bool is_expand_slot_small = true; + if(total_dims0>total_expand_dims0) { + is_expand_slot_small = true; + max_total_dims0 = total_dims0; + } else { + is_expand_slot_small = false; + max_total_dims0 = total_expand_dims0; + } + + std::vector 
slot_dims0_offset(slot_size); + int offset = 0; + int dims1 = 0; + int expand_dims1 = 0; + + size_t embedx_offset = 0; + size_t expand_offset = 0; + for (int i = 0; i < (int)slot_size; i++) { + slot_dims0_offset[i] = offset; + if(flags.empty()) { + offset += outputs[i]->dims()[0]; + } else { + if(is_expand_slot_small==true){ + if (flags[i] & 0x01) { + offset += outputs[embedx_offset]->dims()[0]; + dims1 = outputs[embedx_offset]->dims()[1]; + embedx_offset++; + } else { + offset += 0; + } + if(flags[i] & 0x02) { + expand_dims1 = outputs_extend[expand_offset]->dims()[1]; + expand_offset++; + } + } else { + if (flags[i] & 0x02) { + offset += outputs_extend[expand_offset]->dims()[0]; + expand_dims1 = outputs_extend[expand_offset]->dims()[1]; + expand_offset++; + } else { + offset += 0; + } + if(flags[i] & 0x01) { + dims1 = outputs[embedx_offset]->dims()[1]; + embedx_offset++; + } + } + } + } + + framework::LoDTensor total_values; + total_values.Resize(phi::make_ddim({max_total_dims0*(dims1+expand_dims1)})); + total_values.mutable_data(ctx.GetPlace()); + // framework::LoDTensor total_values_expand; + // total_values_expand.Resize(phi::make_ddim({max_total_dims0*outputs_extend[i]->dims()[1]})); + // total_values_expand.mutable_data(ctx.GetPlace()); + + + // int total_length = 0; + // for (size_t i = 0; i < outputs.size(); ++i) { + // total_length += outputs[i]->numel(); + // } + // int total_length_expand = 0; + // for (size_t i = 0; i < outputs_extend.size(); ++i) {//outputs_extend.size() maybe less than slot_size + // total_length_expand += outputs_extend[i]->numel(); + // } + // if(ctx.GetPlace().GetDeviceId()==0) { + // printf("[hsq] going to mutable continue tensor\n"); + // } + // framework::LoDTensor total_values; + // total_values.Resize(phi::make_ddim({total_length+total_length_expand})); + // total_values.mutable_data(ctx.GetPlace()); + // int offset = 0; + // int offset_expand = 0; + + // // int total_length_expand = 0; + // // for (size_t i = 0; i < outputs_extend.size(); ++i) {//outputs_extend.size() maybe less than slot_size + // // total_length_expand += outputs_extend[i]->numel(); + // // } + // // framework::LoDTensor total_values_expand; + // // total_values_expand.Resize(phi::make_ddim({total_length_expand})); + // // total_values_expand.mutable_data(ctx.GetPlace()); + // // int offset_expand = 0; + // if(ctx.GetPlace().GetDeviceId()==0) { + // printf("[hsq] end of mutable continue tensor\n"); + // printf("[hsq] slot_size:%d, outputs.size():%d, outputs_extend.size():%d, sizeof(T): %d\n", (int)slot_size, (int)outputs.size(), (int)outputs_extend.size(), (int)sizeof(T)); + // // printf("[hsq] slot_size:%d, outputs.size():%d, outputs_extend.size():%d, total_length:%d, sizeof(T): %d\n", (int)slot_size, (int)outputs.size(), (int)outputs_extend.size(), (int)total_length, (int)sizeof(T)); + // } // BoxPS only supports float now std::vector all_values(slot_size * 2); std::vector slot_lengths(slot_size); @@ -43,9 +142,29 @@ static void PullBoxExtendedSparseFunctor( reinterpret_cast(slot->data()); all_keys[i] = single_slot_keys; slot_lengths[i] = slot->numel(); + // total_values.set_offset(offset); + // outputs[i]->ShareBufferWith(total_values); + if(outputs[embedx_offset]->numel()==0) { + outputs[embedx_offset]->set_layout(paddle::framework::DataLayout::UNDEFINED); + } else { + int offset = slot_dims0_offset[i]*dims1* sizeof(T); + total_values.set_offset(offset); + outputs[i]->ShareBufferWith(total_values); + } auto *output = outputs[i]->mutable_data(ctx.GetPlace()); + // offset += 
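// Packing scheme used by this functor: all embed outputs and all expand
// outputs share one contiguous LoDTensor total_values of
// max_total_dims0 * (dims1 + expand_dims1) elements. Slot i's embed view
// is bound with
//   total_values.set_offset(slot_dims0_offset[i] * dims1 * sizeof(T));
//   outputs[i]->ShareBufferWith(total_values);
// and its expand view starts past the whole embed block, at
// max_total_dims0 * dims1 * sizeof(T) plus the slot's own offset; outputs
// with numel() == 0 are tagged DataLayout::UNDEFINED instead of being bound.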
outputs[i]->numel() * sizeof(T); all_values[i] = reinterpret_cast(output); + // total_values.set_offset(total_length* sizeof(T)+offset_expand); + // outputs_extend[i]->ShareBufferWith(total_values); + if(outputs_extend[expand_offset]->numel()==0) { + outputs_extend[expand_offset]->set_layout(paddle::framework::DataLayout::UNDEFINED); + } else { + int offset = slot_dims0_offset[i]*expand_dims1* sizeof(T); + total_values.set_offset(max_total_dims0*dims1* sizeof(T)+offset); + outputs_extend[i]->ShareBufferWith(total_values); + } auto *output_extend = outputs_extend[i]->mutable_data(ctx.GetPlace()); + // offset_expand += outputs_extend[i]->numel() * sizeof(T); all_values[i + slot_size] = reinterpret_cast(output_extend); } } else { @@ -58,14 +177,34 @@ static void PullBoxExtendedSparseFunctor( all_keys[i] = single_slot_keys; slot_lengths[i] = slot->numel(); if (flags[i] & 0x01) { + // total_values.set_offset(offset); + // outputs[embedx_offset]->ShareBufferWith(total_values); + if(outputs[embedx_offset]->numel()==0) { + outputs[embedx_offset]->set_layout(paddle::framework::DataLayout::UNDEFINED); + } else { + int offset = slot_dims0_offset[i]*dims1* sizeof(T); + total_values.set_offset(offset); + outputs[embedx_offset]->ShareBufferWith(total_values); + } auto *output = outputs[embedx_offset]->mutable_data(ctx.GetPlace()); + // offset += outputs[embedx_offset]->numel() * sizeof(T); all_values[i] = reinterpret_cast(output); ++embedx_offset; } else { all_values[i] = 0; } if (flags[i] & 0x02) { + // total_values.set_offset(offset); + // outputs_extend[expand_offset]->ShareBufferWith(total_values); + if(outputs_extend[expand_offset]->numel()==0) { + outputs_extend[expand_offset]->set_layout(paddle::framework::DataLayout::UNDEFINED); + } else { + int offset = slot_dims0_offset[i]*expand_dims1* sizeof(T); + total_values.set_offset(max_total_dims0*dims1* sizeof(T)+offset); + outputs_extend[expand_offset]->ShareBufferWith(total_values); + } auto *output_extend = outputs_extend[expand_offset]->mutable_data(ctx.GetPlace()); + // offset_expand += outputs_extend[expand_offset]->numel() * sizeof(T); all_values[i + slot_size] = reinterpret_cast(output_extend); ++expand_offset; } else { @@ -73,14 +212,85 @@ static void PullBoxExtendedSparseFunctor( } } } + total_values.set_offset(0); #ifdef PADDLE_WITH_BOX_PS - // int skip_offset = ctx.Attr("offset"); - // auto emb_size = ctx.Attr("emb_size"); - // auto emb_extended_size = ctx.Attr("emb_extended_size"); - // auto expand_only = ctx.Attr("expand_only"); - // auto box_ptr = paddle::framework::BoxWrapper::GetInstance(); - // box_ptr->PullSparse(ctx.GetPlace(), all_keys, all_values, slot_lengths, - // emb_size, emb_extended_size, skip_offset, expand_only); + int skip_offset = ctx.Attr("offset"); + auto emb_size = ctx.Attr("emb_size"); + auto emb_extended_size = ctx.Attr("emb_extended_size"); + auto expand_only = ctx.Attr("expand_only"); + auto box_ptr = paddle::framework::BoxWrapper::GetInstance(); + static int target_id = std::getenv("HSQ_XPURT_TARGET_DEVICE")!=NULL ? 
+ std::stoi(std::string(std::getenv("HSQ_XPURT_TARGET_DEVICE"))) : + 0; + target_id = target_id; + int dev_id = ctx.GetPlace().GetDeviceId();//xpu_ctx->dev().id(); + dev_id= dev_id; + // if(dev_id==target_id) { + // printf("[hsq] dev_id:%d, 1.going to call box_ptr->PullSparse\n", dev_id); + + // int output_index = 0; + // int output_expand_index = 0; + // printf("[hsq] total_dims0:%d, total_expand_dims0:%d, max_total_dims0:%d, dims1:%d, expand_dims1:%d\n",total_dims0, total_expand_dims0, max_total_dims0, dims1, expand_dims1); + // printf("[hsq] total_values's ptr: %p, ptr_end:%p\n", total_values.data(), total_values.data()+total_values.numel()); + // printf("[hsq] pull_box_extend_sparse tensor shape:\n"); + // for(int i = 0; i < (int)slot_size; i++) { + // printf("[hsq] input[%d].shape: [", i); + // for(int j =0;j<(int)inputs[i]->dims().size();j++){ + // printf("%d,", (int)inputs[i]->dims()[j]); + // } + // printf("]\n"); + + // if(flags[i] & 0x01) { + // printf("[hsq] output[%d].shape: [", i); + // for(int j =0;j<(int)outputs[output_index]->dims().size();j++){ + // printf("%d,", (int)outputs[output_index]->dims()[j]); + // } + // printf("], ptr_begin:%p, ptr_end:%p, slot_dims0_offset[%d]: %d\n", outputs[output_index]->data(), outputs[output_index]->data()+outputs[output_index]->numel(), i, slot_dims0_offset[i]); + // output_index++; + // } + + // if(flags[i] & 0x02) { + // printf("[hsq] output_expand[%d].shape: [", i); + // for(int j =0;j<(int)outputs_extend[output_expand_index]->dims().size();j++){ + // printf("%d,", (int)outputs_extend[output_expand_index]->dims()[j]); + // } + // printf("], ptr_begin:%p, ptr_end:%p\n", outputs_extend[output_expand_index]->data(), outputs_extend[output_expand_index]->data()+outputs_extend[output_expand_index]->numel()); + // output_expand_index++; + // } + // } + // } + box_ptr->PullSparse(ctx.GetPlace(), all_keys, all_values, slot_lengths, + emb_size, emb_extended_size, skip_offset, expand_only); + if (std::getenv("DUMP_XPU_PUSH_SPARSE_INPUT") != nullptr) { + auto names = ctx.OutputNames("Out"); + for (int i = 0; i (Attr("summarize"))); + // formatter.SetPrintFilePath("dev"+std::to_string(ctx.GetPlace().device)+".push_sparse_input.txt"); + std::string message = std::string("---embs_all_")+std::to_string(i)+std::string("---"); + formatter.Print(*(outputs[i]), name, message); + } + names = ctx.OutputNames("OutExtend"); + for (int i = 0; i (Attr("summarize"))); + // formatter.SetPrintFilePath("dev"+std::to_string(ctx.GetPlace().device)+".push_sparse_input.txt"); + std::string message = std::string("---expand_all_")+std::to_string(i)+std::string("---"); + formatter.Print(*(outputs_extend[i]), name, message); + } + } #endif } @@ -161,14 +371,46 @@ static void PushBoxExtendedSparseFunctor( } } #ifdef PADDLE_WITH_BOX_PS - // int skip_offset = ctx.Attr("offset"); - // auto emb_size = ctx.Attr("emb_size"); - // auto emb_extended_size = ctx.Attr("emb_extended_size"); - // auto expand_only = ctx.Attr("expand_only"); - // auto box_ptr = paddle::framework::BoxWrapper::GetInstance(); - // box_ptr->PushSparseGrad(ctx.GetPlace(), all_keys, all_grad_values, - // slot_lengths, emb_size, emb_extended_size, batch_size, - // skip_offset, expand_only); + int skip_offset = ctx.Attr("offset"); + auto emb_size = ctx.Attr("emb_size"); + auto emb_extended_size = ctx.Attr("emb_extended_size"); + auto expand_only = ctx.Attr("expand_only"); + auto box_ptr = paddle::framework::BoxWrapper::GetInstance(); + // printf("[hsq] gping to call box_ptr->PushSparseGrad\n"); + if 
(std::getenv("DUMP_XPU_PUSH_SPARSE_INPUT") != nullptr) { + auto names = ctx.InputNames(framework::GradVarName("OutExtend")); + for (int i = (d_output_extend.size()-1); i >=0; i--) { + TensorFormatter formatter; + // const std::string &name = ctx.InputNames(framework::GradVarName("Out"))[i]; + const std::string &name = names[i]; + formatter.SetPrintTensorType(true); + formatter.SetPrintTensorShape(true); + formatter.SetPrintTensorLod(true); + formatter.SetPrintTensorLayout(true); + // formatter.SetSummarize(static_cast(Attr("summarize"))); + // formatter.SetPrintFilePath("dev"+std::to_string(ctx.GetPlace().device)+".push_sparse_input.txt"); + std::string message = std::string("---expand_all_")+std::to_string(i)+std::string("---"); + formatter.Print(*(d_output_extend[i]), "print_" + name, message); + } + + names = ctx.InputNames(framework::GradVarName("Out")); + for (int i = (d_output.size()-1); i >=0; i--) { + TensorFormatter formatter; + // const std::string &name = ctx.InputNames(framework::GradVarName("Out"))[i]; + const std::string &name = names[i]; + formatter.SetPrintTensorType(true); + formatter.SetPrintTensorShape(true); + formatter.SetPrintTensorLod(true); + formatter.SetPrintTensorLayout(true); + // formatter.SetSummarize(static_cast(Attr("summarize"))); + // formatter.SetPrintFilePath("dev"+std::to_string(ctx.GetPlace().device)+".push_sparse_input.txt"); + std::string message = std::string("---embs_all_")+std::to_string(i)+std::string("---"); + formatter.Print(*(d_output[i]), "print_" + name, message); + } + } + box_ptr->PushSparseGrad(ctx.GetPlace(), all_keys, all_grad_values, + slot_lengths, emb_size, emb_extended_size, batch_size, + skip_offset, expand_only); #endif } diff --git a/paddle/fluid/operators/rank_attention_op_xpu.cc b/paddle/fluid/operators/rank_attention_op_xpu.cc index 817ed4cc2b795..cd911b2530b45 100644 --- a/paddle/fluid/operators/rank_attention_op_xpu.cc +++ b/paddle/fluid/operators/rank_attention_op_xpu.cc @@ -56,6 +56,24 @@ class RankAttention2XPUKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); T* out_data = Out->mutable_data(ctx.GetPlace()); + // if(ctx.GetPlace().GetDeviceId()==0) { + // printf("[hsq] rank_attention input ptr:%p, rank_offset ptr:%p, param ptr:%p, out ptr:%p, ins_num: %d, x_fea_dim:%d, max_rank:%d, para_row:%d, para_col:%d\n", X->data(), rank_offset->data(), param->data(), out_data, (int)ins_num, (int)x_fea_dim, (int)max_rank, (int)para_row, (int)para_col); + + // std::vector h_mat(rank_offset->numel()); + // xpu_memcpy(h_mat.data(), rank_offset->data(), rank_offset->numel() * sizeof(int), XPU_DEVICE_TO_HOST); + + // if(ins_num*(2*max_rank+1)!=rank_offset->numel()){ + // printf("[hsq] check error\n"); + // } + // std::cout<<"[hsq] mat_out: ["; + // for (int i = 0; i < ins_num; i++) { + // std::cout<<"ins_id: "<(dev_ctx.x_context(), ins_num, x_fea_dim, X->data(), max_rank, rank_offset->data(), para_row, para_col, param->data(), out_data); @@ -63,6 +81,7 @@ class RankAttention2XPUKernel : public framework::OpKernel { ret, XPU_SUCCESS, platform::errors::External("The rank_attention2 XPU kernel return wrong value[%d %s]", ret, XPUAPIErrorMsg[ret])); + // } } }; diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 5134577fd4ef5..b4da8758619c1 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -581,6 +581,10 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, 
From 3b09fd0839b79fd5dd65fb3b2f3d2caea119ae4c Mon Sep 17 00:00:00 2001
From: HuangShiqing
Date: Thu, 9 Nov 2023 10:21:12 +0800
Subject: [PATCH 05/20] abacus-aibox-842 fix the gm leak

---
 paddle/fluid/framework/fleet/box_wrapper_impl.h     | 1 +
 paddle/fluid/framework/fleet/box_wrapper_kernel.kps | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h
index 291bb0036af0f..68b9c33d37435 100644
--- a/paddle/fluid/framework/fleet/box_wrapper_impl.h
+++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h
@@ -935,6 +935,7 @@ void BoxWrapper::PushSparseGradCaseXPU(const paddle::platform::Place& place,
                                expand_embed_dim,
                                push_float_num_,
                                expand_only);
+  xpu_free(d_slot_inner_offset);

   if(dev_id==target_id && count==target_count) {
     std::string file_path = "dev"+std::to_string(dev_id)+"_count"+std::to_string(count)+"_push_copy_output.txt";
diff --git a/paddle/fluid/framework/fleet/box_wrapper_kernel.kps b/paddle/fluid/framework/fleet/box_wrapper_kernel.kps
index 68160fd1f492b..4c68f1500c2b7 100644
--- a/paddle/fluid/framework/fleet/box_wrapper_kernel.kps
+++ b/paddle/fluid/framework/fleet/box_wrapper_kernel.kps
@@ -440,12 +440,12 @@ inline void FeaturePullCopyNNCross(
                                         skip_offset,
                                         cvm_offset,
                                         slot_num);
-    xpu_free(d_xpu_values);
   } else {
     // PullCopyNNCrossWithEmb
     // TODO:
     ;
   }
+  xpu_free(d_xpu_values);
   xpu_wait(stream);
 #ifdef TRACE_PROFILE
   TRACE_SCOPE_END("PullCopyNNCross", );
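Patch 05 fixes a global-memory leak by hoisting the free out of a single branch. The general shape of the bug and its fix, sketched with illustrative names (only d_xpu_values and expand_only come from the patch):

    float** d_xpu_values = nullptr;
    xpu_malloc(reinterpret_cast<void**>(&d_xpu_values), values_bytes);  // values_bytes is assumed
    if (expand_only) {
      // launch PullCopyNNCross(...); previously freed d_xpu_values here only
    } else {
      // unimplemented fallback; previously fell through without freeing
    }
    xpu_free(d_xpu_values);  // now released on both paths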
From c906979a25e911fe91ad3a058d29b1f2b9cc7e40 Mon Sep 17 00:00:00 2001
From: HuangShiqing
Date: Wed, 22 Nov 2023 09:45:30 +0800
Subject: [PATCH 06/20] abacus-aibox-842 delete some debug info

---
 paddle/fluid/framework/boxps_worker.cc        |   8 -
 paddle/fluid/framework/data_feed.cc           | 174 ---------------
 paddle/fluid/framework/data_feed.kps          |   1 +
 .../fluid/framework/fleet/box_wrapper_impl.h  | 201 +-----------------
 .../framework/fleet/box_wrapper_kernel.kps    | 160 ++------------
 .../fluid/inference/api/analysis_predictor.cc |   1 -
 .../operators/collective/c_mixallgather_op.cc |  33 ---
 .../fused/fused_seqpool_cvm_op_xpu.cc         |  12 --
 .../operators/pull_box_extended_sparse_op.h   |  92 --------
 .../fluid/operators/rank_attention_op_xpu.cc  |  18 --
 10 files changed, 24 insertions(+), 676 deletions(-)

diff --git a/paddle/fluid/framework/boxps_worker.cc b/paddle/fluid/framework/boxps_worker.cc
index 63ca8931f19f9..17fffb84077a2 100644
--- a/paddle/fluid/framework/boxps_worker.cc
+++ b/paddle/fluid/framework/boxps_worker.cc
@@ -600,14 +600,6 @@ void BoxPSWorker::CreateDeviceResource(const ProgramDesc& main_prog) {
         var_num += 1;
       }
     }
-    // printf("[hsq] name: %s in BoxPSWorker::CreateDeviceResource\n", name.c_str());
-    // printf("[hsq] sync_mode_:%d, dense_table_:%p\n", sync_mode_, dense_table_);
-    // printf("[hsq] root_tensor.numel(): %d\n", (int)root_tensor.numel());
-    // const void* p = root_tensor.data();
-    // printf("[hsq] root_tensor.data(): %p\n",p);
-    // std::cout<<"[hsq] place_: "<data();
-    // printf("[hsq] gpu_tensor->data(): %p\n",p1);
     if (!gpu_tensor->initialized() && place_ == root_tensor.place()) {
       auto dim = root_tensor.dims();
       gpu_tensor->ShareDataWith(root_tensor).Resize(dim);
diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc
index 7257f2ef238c1..640e66e272304 100644
--- a/paddle/fluid/framework/data_feed.cc
+++ b/paddle/fluid/framework/data_feed.cc
@@ -3596,7 +3596,6 @@ int SlotPaddleBoxDataFeed::GetCurrentPhase() {
   }
 }

-#include
 void SlotPaddleBoxDataFeed::GetRankOffsetGPU(const int pv_num,
                                              const int ins_num) {
 #if defined(PADDLE_WITH_CUDA) && defined(_LINUX) || defined(PADDLE_WITH_XPU_KP) && !defined(CPU_DATA_FEED)
@@ -3611,166 +3610,6 @@ void SlotPaddleBoxDataFeed::GetRankOffsetGPU(const int pv_num,
                          value.d_ad_offset.data(), col);
 #elif defined(PADDLE_WITH_XPU_KP)
-  // if(this->place_.GetDeviceId()==0)
-  // {
-  //   std::vector h_mat(rank_offset_->numel());
-  //   std::vector h_rank(value.d_rank.numel());
-  //   std::vector h_cmatch(value.d_cmatch.numel());
-  //   std::vector h_ad_offset(value.d_ad_offset.numel());
-  //   xpu_memcpy(h_mat.data(), tensor_ptr, rank_offset_->numel() * sizeof(int), XPU_DEVICE_TO_HOST);
-  //   xpu_memcpy(h_rank.data(), value.d_rank.data(), value.d_rank.numel() * sizeof(int), XPU_DEVICE_TO_HOST);
-  //   xpu_memcpy(h_cmatch.data(), value.d_cmatch.data(), value.d_cmatch.numel() * sizeof(int), XPU_DEVICE_TO_HOST);
-  //   xpu_memcpy(h_ad_offset.data(), value.d_ad_offset.data(), value.d_ad_offset.numel() * sizeof(int), XPU_DEVICE_TO_HOST);
-
-  //   printf("[hsq] ins_num:%d, pv_num:%d, max_rank:%d, col:%d\n", ins_num, pv_num, max_rank, col);
-
-  //   std::cout<<"[hsq] h_ad_offset: [";
-  //   for (int i = 0; i < (int)h_ad_offset.size(); i++) {
-  //     std::cout<place_.GetDeviceId();
-  //   std::ofstream fo;
-  //   fo.open("h_num_"+std::to_string(id)+".txt");
-  //   fo << ins_num << " " << pv_num << " " << max_rank << " " << col << " ";
-  //   fo.close();
-
-  //   fo.open("h_mat_"+std::to_string(id)+".txt");
-  //   fo << (int)h_mat.size() << " ";
-  //   for (int i = 0; i < (int)h_mat.size(); i++) {
-  //     fo << h_mat[i] << " ";//-1.0~1.0
-  //   }
-  //   fo.close();
-
-  //   fo.open("h_rank_"+std::to_string(id)+".txt");
-  //   fo << (int)h_rank.size() << " ";
-  //   for (int i = 0; i < (int)h_rank.size(); i++) {
-  //     fo << h_rank[i] << " ";//-1.0~1.0
-  //   }
-  //   fo.close();
-
-  //   fo.open("h_cmatch_"+std::to_string(id)+".txt");
-  //   fo << (int)h_cmatch.size() << " ";
-  //   for (int i = 0; i < (int)h_cmatch.size(); i++) {
-  //     fo << h_cmatch[i] << " ";//-1.0~1.0
-  //   }
-  //   fo.close();
-
-  //   fo.open("h_ad_offset_"+std::to_string(id)+".txt");
-  //   fo << (int)h_ad_offset.size() << " ";
-  //   for (int i = 0; i < (int)h_ad_offset.size(); i++) {
-  //     fo << h_ad_offset[i] << " ";//-1.0~1.0
-  //   }
-  //   fo.close();
-  // }
-
-// if(this->place_.GetDeviceId()==0) {
-//   int pv_num_2;
-//   int ins_num_2;
-//   int max_rank_2;
-//   int cols_2;
-//   std::vector h_mat;
-//   std::vector h_ad_rank;
-//   std::vector h_cmatch;
-//   std::vector h_pv_offset;
-
-//   std::ifstream fi;
-//   int size;
-
-//   std::string id = std::to_string(this->place_.GetDeviceId());
-//   fi.open("h_num_"+id+".txt");
-//   fi >> ins_num_2 >> pv_num_2 >> max_rank_2 >> cols_2;
-//   fi.close();
-//   printf("[hsq] ins_num: %d, pv_num: %d, max_rank:%d, col:%d\n", ins_num, pv_num, max_rank, col);
-//   printf("[hsq] ins_num_2: %d, pv_num_2: %d, max_rank_2:%d, cols_2:%d\n", ins_num_2, pv_num_2, max_rank_2, cols_2);
-
-//   fi.open("h_mat_"+id+".txt");
-//   fi >> size;
-//   if(size!=ins_num_2*cols_2) {
-//     printf("[hsq] error in h_mat size\n");
-//   }
-//   for (int i = 0; i < size; i++) {
-//     int val;
-//     fi >> val;
-//     h_mat.push_back(val);
-//   }
-//   fi.close();
-
-//   fi.open("h_rank_"+id+".txt");
-//   fi >> size;
-//   if(size!=ins_num_2) {
-//     printf("[hsq] error in h_ad_rank size, which %d should be %d\n", size, ins_num);
-//   }
-//   for (int i = 0; i < size; i++) {
-//     int val;
-//     fi >> val;
-//     h_ad_rank.push_back(val);
-//   }
-//   fi.close();
-
-//   fi.open("h_cmatch_"+id+".txt");
-//   fi >> size;
-//   if(size!=ins_num_2) {
-//     printf("[hsq] error in h_cmatch size\n");
-//   }
-//   for (int i = 0; i < size; i++) {
-//     int val;
-//     fi >> val;
-//     h_cmatch.push_back(val);
-//   }
-//   fi.close();
-
-//   fi.open("h_ad_offset_"+id+".txt");
-//   fi >> size;
-//   if(size!=pv_num_2+1) {
-//     printf("[hsq] error in h_pv_offset size\n");
-//   }
-//   for (int i = 0; i < size; i++) {
-//     int val;
-//     fi >> val;
-//     h_pv_offset.push_back(val);
-//   }
-//   fi.close();
-//   // prepare buffer on xpu
-//   void *d_mat = nullptr;
-//   void *d_ad_rank = nullptr;
-//   void *d_cmatch = nullptr;
-//   void *d_pv_offset = nullptr;
-
-//   xpu_malloc((void **)&d_mat, h_mat.size() * sizeof(int));
-//   xpu_malloc((void **)&d_ad_rank, h_ad_rank.size() * sizeof(int));
-//   xpu_malloc((void **)&d_cmatch, h_cmatch.size() * sizeof(int));
-//   xpu_malloc((void **)&d_pv_offset, h_pv_offset.size() * sizeof(int));
-
-//   // copy input to xpu
-//   xpu_memcpy(d_mat, h_mat.data(), h_mat.size() * sizeof(int), XPUMemcpyKind::XPU_HOST_TO_DEVICE);
-//   xpu_memcpy(tensor_ptr, h_mat.data(), h_mat.size() * sizeof(int), XPUMemcpyKind::XPU_HOST_TO_DEVICE);
-//   xpu_memcpy(d_ad_rank, h_ad_rank.data(), h_ad_rank.size() * sizeof(int), XPUMemcpyKind::XPU_HOST_TO_DEVICE);
-//   xpu_memcpy(d_cmatch, h_cmatch.data(), h_cmatch.size() * sizeof(int), XPUMemcpyKind::XPU_HOST_TO_DEVICE);
-//   xpu_memcpy(d_pv_offset, h_pv_offset.data(), h_pv_offset.size() * sizeof(int), XPUMemcpyKind::XPU_HOST_TO_DEVICE);
-
-//   std::cout<<"[hsq] place: "<place_<place_, tensor_ptr, ins_num_2, pv_num_2, max_rank_2,
-  //                    (int*)d_ad_rank, (int*)d_cmatch,
-  //                    (int*)d_pv_offset, cols_2);
-
-//   // std::vector h_mat_out(ins_num_2*cols_2);
-//   // xpu_memcpy(h_mat_out.data(), d_mat, h_mat_out.size() * sizeof(int), XPUMemcpyKind::XPU_DEVICE_TO_HOST);
-
-//   // std::cout<<"[hsq] mat_out: [";
-//   // for (int i = 0; i < ins_num_2*cols_2; i++) {
-//   // std::cout<place_);
   auto ctx = static_cast(dev_ctx)->x_context();
   int r = xpu::constant(ctx, tensor_ptr, rank_offset_->numel(), 0);
@@ -3783,19 +3622,6 @@ void SlotPaddleBoxDataFeed::GetRankOffsetGPU(const int pv_num,
   DataFeedPdboxXpuKernelHelper::CopyRankOffset(this->place_, tensor_ptr, ins_num, pv_num, max_rank,
                                                value.d_rank.data(), value.d_cmatch.data(),
                                                value.d_ad_offset.data(), col);
-  // std::vector h_mat_out(ins_num*col);
-  // xpu_memcpy(h_mat_out.data(), tensor_ptr, h_mat_out.size() * sizeof(int), XPUMemcpyKind::XPU_DEVICE_TO_HOST);
-
-  // if(this->place_.GetDeviceId()==0) {
-  //   std::cout<<"[hsq] mat_out: [";
-  //   for (int i = 0; i < ins_num; i++) {
-  //     for( int j = 0; j < col; j++ ) {
-  //       std::cout<
-void hsq_dump(void* d_ptr,
-              int len,
-              std::string path,
-              bool need_print,
-              int oneline_count,
-              int print_len,
-              std::string print_name,
-              std::string mode = "") {
-  std::vector h_buf(len);
-  xpu_memcpy(h_buf.data(), d_ptr, h_buf.size() * sizeof(T), XPU_DEVICE_TO_HOST);
-
-  std::ofstream fo;
-  if(mode=="app") {
-    fo.open(path, std::ofstream::app);
-  } else {
-    fo.open(path);
-  }
-  if(oneline_count) {
-    for (int i = 0; i < (int)h_buf.size()/oneline_count; i++) {
-      for(int j=0; j& keys,
                                  const std::vector& values,
@@ -487,14 +437,6 @@ void BoxWrapper::PullSparseCaseXPU(const paddle::platform::Place& place,
   TRACE_SCOPE_START("PullSparseXPU", xpu_wait(ctx_xpu->xpu_stream));
 #endif
   pull_boxps_timer.Start();
-  static int target_id = std::getenv("HSQ_XPURT_TARGET_DEVICE")!=NULL ?
-                             std::stoi(std::string(std::getenv("HSQ_XPURT_TARGET_DEVICE"))) :
-                             0;
-  int dev_id = place.GetDeviceId();//xpu_ctx->dev().id();
-// if(dev_id==target_id) {
-//   printf("[hsq] dev_id:%d, 2.going to call boxps_ptr_->PullSparseXPU\n", dev_id);
-//   printf("[hsq] total_length: %d, feature_pull_size_: %d, total_bytes: %d\n", (int)total_length, (int)feature_pull_size_, (int)total_bytes);
-// }
   boxps_ptr_->PullSparseXPU(total_keys, total_values_xpu,
                             static_cast(total_length), device_id);
   pull_boxps_timer.Pause();
@@ -516,9 +458,6 @@ void BoxWrapper::PullSparseCaseXPU(const paddle::platform::Place& place,
   } else {
     pull_offset = dev.pull_offset.data();
   }
-// if(dev_id==target_id) {
-//   printf("[hsq] pull_offset.expand_size: %d, pull_offset.expand: %d\n", pull_info_.expand_size, pull_info_.expand);
-// }
   float** xpu_values = dev.values_ptr_tensor.mutable_data(
       static_cast(values.size() * sizeof(float*)), place);
@@ -528,64 +467,10 @@ void BoxWrapper::PullSparseCaseXPU(const paddle::platform::Place& place,
 #ifdef TRACE_PROFILE
   TRACE_SCOPE_START("CopyForPull", xpu_wait(ctx_xpu->xpu_stream));
 #endif
-// if(dev_id==target_id) {
-//   printf("[hsq] dev_id:%d, 3.going to call box_wrapper_kernel_->CopyForPull\n", dev_id);
-
-//   // std::vector h_key2slot(total_length);
-//   // xpu_memcpy(h_key2slot.data(), key2slot, h_key2slot.size() * sizeof(int), XPU_DEVICE_TO_HOST);
-//   // std::cout<<"[hsq] box_wrapper_kernel_->CopyForPull's key2slot: [";
-//   // for(int i =0;i<300;i++) {
-//   // std::cout<CopyForPull(place, xpu_keys, (float**)values.data(), total_values_xpu, pull_offset, slot_lengths_lod.data(), slot_num, key2slot, hidden_size, expand_embed_dim, total_length, total_dims, skip_offset, expand_only);
-  static int target_count = std::getenv("HSQ_BOXPS_TARGET_COUNT")!=NULL ?
-                                std::stoi(std::string(std::getenv("HSQ_BOXPS_TARGET_COUNT"))) :
-                                0;
-  static int count = 0;
-  if(dev_id==target_id && count==target_count) {
-    for(int i=0;i(values[i],
-               slot_lengths[i]*hidden_size,
-               file_path,
-               false, // need_print
-               hidden_size, // oneline_count
-               100,
-               file_path); // print_name
-    }
-
-    for(int i=slot_num;i<2*slot_num;i++) {
-      if(values[i]==nullptr)
-        continue;
-      std::string file_path = "dev"+std::to_string(dev_id)+"_count"+std::to_string(count)+"_pull_copy_output_expand"+std::to_string(i)+".txt";
-      hsq_dump(values[i],
-               slot_lengths[i-slot_num]*expand_embed_dim,
-               file_path,
-               false, // need_print
-               expand_embed_dim, // oneline_count
-               100,
-               file_path); // print_name
-    }
-  }
-  if(dev_id==target_id) {
-    count ++;
-  }
 #ifdef TRACE_PROFILE
   TRACE_SCOPE_END("CopyForPull", xpu_wait(ctx_xpu->xpu_stream));
   TRACE_SCOPE_END("pull copy", xpu_wait(ctx_xpu->xpu_stream));
@@ -864,71 +749,6 @@ void BoxWrapper::PushSparseGradCaseXPU(const paddle::platform::Place& place,
   xpu_malloc((void **)&d_slot_inner_offset, total_length * sizeof(int));
   xpu_memcpy(d_slot_inner_offset, slot_inner_offset.data(), total_length * sizeof(int),
              XPU_HOST_TO_DEVICE);
-  static int target_id = std::getenv("HSQ_XPURT_TARGET_DEVICE")!=NULL ?
-                             std::stoi(std::string(std::getenv("HSQ_XPURT_TARGET_DEVICE"))) :
-                             0;
-  int dev_id = place.GetDeviceId();//xpu_ctx->dev().id();
-
-  static int target_count = std::getenv("HSQ_BOXPS_TARGET_COUNT")!=NULL ?
-                                std::stoi(std::string(std::getenv("HSQ_BOXPS_TARGET_COUNT"))) :
-                                0;
-  static int count = 18;
-  if(dev_id==target_id && count==target_count) {
-    for(int i=0;i((void*)grad_values[i],
-               slot_lengths[i]*hidden_size,
-               file_path,
-               false, // need_print
-               hidden_size, // oneline_count
-               100,
-               file_path); // print_name
-    }
-
-    for(int i=slot_num;i<2*slot_num;i++) {
-      if(grad_values[i]==nullptr)
-        continue;
-      std::string file_path = "dev"+std::to_string(dev_id)+"_count"+std::to_string(count)+"_push_copy_input_expand"+std::to_string(i)+".txt";
-      hsq_dump((void*)grad_values[i],
-               slot_lengths[i-slot_num]*expand_embed_dim,
-               file_path,
-               false, // need_print
-               expand_embed_dim, // oneline_count
-               100,
-               file_path); // print_name
-    }
-    std::string file_path = "dev"+std::to_string(dev_id)+"_count"+std::to_string(count)+"_push_copy_input_key2slot.txt";
-    hsq_dump((void*)key2slot,
-             (int)total_length,
-             file_path,
-             false, // need_print
-             expand_embed_dim, // oneline_count
-             100,
-             file_path);
-    file_path = "dev"+std::to_string(dev_id)+"_count"+std::to_string(count)+"_push_copy_input_slot.txt";
-    hsq_dump((void*)slot_vector,
-             (int)slot_num,
-             file_path,
-             false, // need_print
-             1, // oneline_count
-             100,
-             file_path);
-
-    file_path = "dev"+std::to_string(dev_id)+"_count"+std::to_string(count)+"_push_copy_input_slot_inner_offset.txt";
-    hsq_dump((void*)d_slot_inner_offset,
-             (int)total_length,
-             file_path,
-             false, // need_print
-             1, // oneline_count
-             100,
-             file_path);
-  }
-
   box_wrapper_kernel_->CopyForPush(place, xpu_values, total_grad_values_xpu,
                                    push_offset, total_length, slot_vector,
                                    (int*)d_slot_inner_offset, slot_lens, slot_num,
                                    hidden_size, batch_size, total_dims, skip_offset, key2slot,
@@ -937,21 +757,6 @@ void BoxWrapper::PushSparseGradCaseXPU(const paddle::platform::Place& place,
                                    expand_embed_dim,
                                    push_float_num_,
                                    expand_only);
   xpu_free(d_slot_inner_offset);
-  if(dev_id==target_id && count==target_count) {
-    std::string file_path = "dev"+std::to_string(dev_id)+"_count"+std::to_string(count)+"_push_copy_output.txt";
-    hsq_dump(total_grad_values_xpu,
-             total_length*push_float_num_,
-             file_path,
-             false, // need_print
-             push_float_num_, // oneline_count
-             100,
-             file_path); // print_name
-  }
-  if(dev_id==target_id) {
-    count ++;
-  }
-
   push_boxps_timer.Resume();
 #ifdef TRACE_PROFILE
   TRACE_SCOPE_END("CopyForPush", xpu_wait(ctx_xpu->xpu_stream));
@@ -959,14 +764,10 @@ void BoxWrapper::PushSparseGradCaseXPU(const paddle::platform::Place& place,
   TRACE_SCOPE_START("PushSparseXPU", xpu_wait(ctx_xpu->xpu_stream));
 #endif
-// if(dev_id==target_id){
-//   printf("[hsq] going to call boxps_ptr_->PushSparseXPU\n");
-// }
+
   int ret = boxps_ptr_->PushSparseXPU(total_keys,
                                       reinterpret_cast(total_grad_values_xpu),
                                       static_cast(total_length), device_id);
-  // int ret = 0;
-  // total_keys = total_keys;
   PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
                                 "PushSparseXPU failed in BoxPS."));
diff --git a/paddle/fluid/framework/fleet/box_wrapper_kernel.kps b/paddle/fluid/framework/fleet/box_wrapper_kernel.kps
index 4c68f1500c2b7..b6ec83564c54f 100644
--- a/paddle/fluid/framework/fleet/box_wrapper_kernel.kps
+++ b/paddle/fluid/framework/fleet/box_wrapper_kernel.kps
@@ -267,15 +267,6 @@ __global__ void PullCopyNNCross(const TEmbedxOp* op,
   mfence();
   xpu_sync_all();
-  // if(thread_id==0) {
-  //   printf("[hsq] max_slot_num:%d, slot_num:%d\n", max_slot_num, slot_num);
-  //   printf("[hsq] sm_slot_len:%d\n", sm_slot_len);
-  //   //TODO: why this line will error?
-  //   // printf("[hsq] max_slot_num:%d, slot_num:%d, sm_slot_len:%d\n", max_slot_num, slot_num, sm_slot_len);
-  //   for(int i=0;i(xpu_values[slot_id], total_dims[slot_id])
   if (expand_only) {
-    static int target_id = std::getenv("HSQ_XPURT_TARGET_DEVICE")!=NULL ?
-                               std::stoi(std::string(std::getenv("HSQ_XPURT_TARGET_DEVICE"))) :
-                               0;
-    // if(place.GetDeviceId()==target_id) {
-    //   printf("[hsq] total_values_xpu ptr: %p, total_length:%d, hidden_size:%d, expand_embed_dim:%d, pull_float_num:%d, skip_offset:%d, cvm_offset:%d, expand_only:%d, slot_num:%d\n", total_values_xpu, total_length, hidden_size, expand_embed_dim, pull_float_num, skip_offset, cvm_offset, expand_only, slot_num);
-    //   // std::vector h_total_values(total_length*pull_float_num);
-    //   // xpu_memcpy(h_total_values.data(), total_values_xpu, sizeof(float) * total_length*pull_float_num, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
-    //   // for (int i = 0; i < 100; i++) {
-    //   //   printf("[hsq] i:%d, slot:%d, show:%f, clk:%f, embedding_size:%d, expand_size:%d\n", i, h_total_values[i*pull_float_num], h_total_values[i*pull_float_num+1], h_total_values[i*pull_float_num+2], h_total_values[i*pull_float_num+4], h_total_values[i*pull_float_num+13]);
-    //   // }
-    // }
     PullCopyNNCross<<<8, 64, stream>>>(d_op,
                                        scale,
                                        info,
@@ -443,7 +407,7 @@ inline void FeaturePullCopyNNCross(
   } else {
     // PullCopyNNCrossWithEmb
     // TODO:
-    ;
+    CHECK(false) << "PullCopyNNCrossWithEmb not implement";
   }
   xpu_free(d_xpu_values);
   xpu_wait(stream);
@@ -637,6 +601,7 @@ void BoxWrapperKernel::CopyForPull(
              expand_embed_dim == cvm_offset + expand_embed_dim_ &&
              hidden_size == cvm_offset + embedx_dim_) {  // var
     // TODO:
+    CHECK(false) << "FeaturePullCopyVariable not implement";
     // FeaturePullCopyVariable(
     //     op, pull_offset, pull_float_num_, stream, gpu_keys, gpu_values,
    //     total_values_gpu, hidden_size, embedx_dim_, expand_embed_dim_,
    //     pull_embedx_scale_, cvm_offset, gpu_restore_idx, skip_offset);
   } else {  // normal and adam
-      // total_values_xpu->(xpu_values[slot_id], total_dims[slot_id])
-      FeaturePullCopy(place,
-                      &op,
-                      pull_embedx_scale_,
-                      pull_offset,
-                      total_dims,
-                      xpu_values,
-                      total_keys_xpu,
-                      (float*)total_values_xpu,
-                      xpu_restore_idx,
-                      slot_lens,
-                      slot_num,
-                      (int)total_length,
-                      hidden_size,
-                      (int)pull_float_num_,
-                      skip_offset,
-                      cvm_offset);
+    // total_values_xpu->(xpu_values[slot_id], total_dims[slot_id])
+    FeaturePullCopy(place,
+                    &op,
+                    pull_embedx_scale_,
+                    pull_offset,
+                    total_dims,
+                    xpu_values,
+                    total_keys_xpu,
+                    (float*)total_values_xpu,
+                    xpu_restore_idx,
+                    slot_lens,
+                    slot_num,
+                    (int)total_length,
+                    hidden_size,
+                    (int)pull_float_num_,
+                    skip_offset,
+                    cvm_offset);
   }
 } else {
   EmbedxNormalOp op;
@@ -742,14 +707,6 @@ __global__ void PushCopyNNCross(const TExpandPushGetOp* op,
   mfence();
   xpu_sync_all();
-  // if(thread_id==0) {
-  //   printf("[hsq] sm_slots:[");
-  //   for(int i=0;i<<<8, 64, stream>>>(d_op,
                                       info,
                                       reinterpret_cast(gm_src),//src
@@ -950,7 +850,7 @@ inline void FeaturePushCopyNNCross(
   } else {
     // PullCopyNNCrossWithEmb
     // TODO:
-    ;
+    CHECK(false) << "PullCopyNNCrossWithEmb not implement";
   }
 #ifdef TRACE_PROFILE
   xpu_wait(stream);
@@ -1002,21 +902,6 @@ __global__ void PushCopy(float* src_vals,
     sm_slots[i] = lm_slot;
   }

-  // __shared__ float* sm_src_vals_ptr[max_slot_num];
-  // for (int i = cid; i < sm_slot_len; i += ncores) {
-  //   GM2SM(src_vals + i, sm_src_vals_ptr + i, sizeof(float*));
-  // }
-  // mfence();
-  // xpu_sync_all();
-
-  // __local__ float* lm_src_vals_ptr[1];
-  // for(int i=0;i { TRACE_SCOPE_START("bkcl_all_reduce", xpu_wait(stream));
 #endif
-
-  // static int target_id = std::getenv("HSQ_XPURT_TARGET_DEVICE")!=NULL ?
-  //                            std::stoi(std::string(std::getenv("HSQ_XPURT_TARGET_DEVICE"))) :
-  //                            0;
-  // int dev_id = place.GetDeviceId();//xpu_ctx->dev().id();
-  // if(dev_id==target_id) {
-  //   printf("[hsq] c_mixallgather_op, in_tensors.size():%d, numel:%d\n", (int)in_tensors.size(), (int)numel);
-  //   printf("[hsq] offset:[");
-  //   int64_t offset = 0;
-  //   for (size_t i = 0; i < in_tensors.size(); ++i) {
-  //     int64_t len = in_tensors[i]->numel();
-  //     printf("%d, ", (int)offset);
-  //     offset += len;
-  //   }
-  //   printf("]\n");
-
-  //   std::vector h_recvbuff(numel);
-  //   xpu_memcpy(h_recvbuff.data(), recvbuff, h_recvbuff.size() * sizeof(T), XPU_DEVICE_TO_HOST);
-  //   std::cout<<"[hsq] before all_reduce recvbuff: [";
-  //   for (int i = 0; i < std::min((int)h_recvbuff.size(), 100); i++) {
-  //     std::cout<comm(), recvbuff,
diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op_xpu.cc b/paddle/fluid/operators/fused/fused_seqpool_cvm_op_xpu.cc
index 38d164299a753..b38b8aa82be9b 100644
--- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op_xpu.cc
+++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op_xpu.cc
@@ -113,17 +113,6 @@ class FusedSeqpoolCVMOpXPUKernel : public framework::OpKernel {
                             paddle::platform::errors::InvalidArgument(
                                 "The output of dims[1] should be dividable of (w-2)"));
     }
-    // int w = ins[0]->numel() / x0_dims[0];
-    // if(use_cvm) {
-    //   PADDLE_ENFORCE_EQ(y_dims[1] % w, 0,
-    //                     paddle::platform::errors::InvalidArgument(
-    //                         "The output of dims[1] should be dividable of w"));
-    // }
-    // else{
-    //   PADDLE_ENFORCE_EQ(y_dims[1] % (w-2), 0,
-    //                     paddle::platform::errors::InvalidArgument(
-    //                         "The output of dims[1] should be dividable of (w-2)"));
-    // }
     std::vector cpu_x_addr_vec(slot_num, 0);
     std::vector cpu_y_addr_vec(slot_num, 0);
@@ -221,7 +210,6 @@ class FusedSeqpoolCVMGradOpXPUKernel : public framework::OpKernel {
         total_values.set_offset(offset);
         dx->ShareBufferWith(total_values);
         offset += dx->numel() * sizeof(T);
-        // printf("[hsq] xs[%d]!=UNDEFINED\n", k);
       }
       T* dx_data = dx->mutable_data(place);
       // T* dx_data = dx->mutable_data(place);
diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.h b/paddle/fluid/operators/pull_box_extended_sparse_op.h
index 22996763f170f..1f3c7797c3e10 100644
--- a/paddle/fluid/operators/pull_box_extended_sparse_op.h
+++ b/paddle/fluid/operators/pull_box_extended_sparse_op.h
@@ -26,7 +26,6 @@ namespace operators {
 template
 static void PullBoxExtendedSparseFunctor(
     const framework::ExecutionContext &ctx) {
-  // printf("[hsq] hi from PullBoxExtendedSparseFunctor\n");
   auto inputs = ctx.MultiInput("Ids");
   auto outputs = ctx.MultiOutput("Out");
   auto outputs_extend = ctx.MultiOutput("OutExtend");
@@ -97,41 +96,7 @@ static void PullBoxExtendedSparseFunctor(
   framework::LoDTensor total_values;
   total_values.Resize(phi::make_ddim({max_total_dims0*(dims1+expand_dims1)}));
   total_values.mutable_data(ctx.GetPlace());
-  // framework::LoDTensor total_values_expand;
-  // total_values_expand.Resize(phi::make_ddim({max_total_dims0*outputs_extend[i]->dims()[1]}));
-  // total_values_expand.mutable_data(ctx.GetPlace());
-
-  // int total_length = 0;
-  // for (size_t i = 0; i < outputs.size(); ++i) {
-  //   total_length += outputs[i]->numel();
-  // }
-  // int total_length_expand = 0;
-  // for (size_t i = 0; i < outputs_extend.size(); ++i) {//outputs_extend.size() maybe less than slot_size
-  //   total_length_expand += outputs_extend[i]->numel();
-  // }
-  // if(ctx.GetPlace().GetDeviceId()==0) {
-  //   printf("[hsq] going to mutable continue tensor\n");
-  // }
-  // framework::LoDTensor total_values;
-  // total_values.Resize(phi::make_ddim({total_length+total_length_expand}));
-  // total_values.mutable_data(ctx.GetPlace());
-  // int offset = 0;
-  // int offset_expand = 0;
-
-  // // int total_length_expand = 0;
-  // // for (size_t i = 0; i < outputs_extend.size(); ++i) {//outputs_extend.size() maybe less than slot_size
-  // //   total_length_expand += outputs_extend[i]->numel();
-  // // }
-  // // framework::LoDTensor total_values_expand;
-  // // total_values_expand.Resize(phi::make_ddim({total_length_expand}));
-  // // total_values_expand.mutable_data(ctx.GetPlace());
-  // // int offset_expand = 0;
-  // if(ctx.GetPlace().GetDeviceId()==0) {
-  //   printf("[hsq] end of mutable continue tensor\n");
-  //   printf("[hsq] slot_size:%d, outputs.size():%d, outputs_extend.size():%d, sizeof(T): %d\n", (int)slot_size, (int)outputs.size(), (int)outputs_extend.size(), (int)sizeof(T));
-  //   // printf("[hsq] slot_size:%d, outputs.size():%d, outputs_extend.size():%d, total_length:%d, sizeof(T): %d\n", (int)slot_size, (int)outputs.size(), (int)outputs_extend.size(), (int)total_length, (int)sizeof(T));
-  // }
   // BoxPS only supports float now
   std::vector all_values(slot_size * 2);
   std::vector slot_lengths(slot_size);
@@ -142,8 +107,6 @@ static void PullBoxExtendedSparseFunctor(
           reinterpret_cast(slot->data());
       all_keys[i] = single_slot_keys;
       slot_lengths[i] = slot->numel();
-      // total_values.set_offset(offset);
-      // outputs[i]->ShareBufferWith(total_values);
       if(outputs[embedx_offset]->numel()==0) {
         outputs[embedx_offset]->set_layout(paddle::framework::DataLayout::UNDEFINED);
       } else {
@@ -152,10 +115,7 @@ static void PullBoxExtendedSparseFunctor(
         outputs[i]->ShareBufferWith(total_values);
       }
       auto *output = outputs[i]->mutable_data(ctx.GetPlace());
-      // offset += outputs[i]->numel() * sizeof(T);
       all_values[i] = reinterpret_cast(output);
-      // total_values.set_offset(total_length* sizeof(T)+offset_expand);
-      // outputs_extend[i]->ShareBufferWith(total_values);
      if(outputs_extend[expand_offset]->numel()==0) {
        outputs_extend[expand_offset]->set_layout(paddle::framework::DataLayout::UNDEFINED);
      } else {
@@ -164,7 +124,6 @@ static void PullBoxExtendedSparseFunctor(
         outputs_extend[i]->ShareBufferWith(total_values);
       }
       auto *output_extend = outputs_extend[i]->mutable_data(ctx.GetPlace());
-      // offset_expand += outputs_extend[i]->numel() * sizeof(T);
       all_values[i + slot_size] = reinterpret_cast(output_extend);
     }
   } else {
@@ -177,8 +136,6 @@ static void PullBoxExtendedSparseFunctor(
       all_keys[i] = single_slot_keys;
       slot_lengths[i] = slot->numel();
       if (flags[i] & 0x01) {
-        // total_values.set_offset(offset);
-        // outputs[embedx_offset]->ShareBufferWith(total_values);
         if(outputs[embedx_offset]->numel()==0) {
           outputs[embedx_offset]->set_layout(paddle::framework::DataLayout::UNDEFINED);
         } else {
@@ -187,15 +144,12 @@ static void PullBoxExtendedSparseFunctor(
           outputs[embedx_offset]->ShareBufferWith(total_values);
         }
         auto *output = outputs[embedx_offset]->mutable_data(ctx.GetPlace());
-        // offset += outputs[embedx_offset]->numel() * sizeof(T);
         all_values[i] = reinterpret_cast(output);
         ++embedx_offset;
       } else {
         all_values[i] = 0;
       }
       if (flags[i] & 0x02) {
-        // total_values.set_offset(offset);
-        // outputs_extend[expand_offset]->ShareBufferWith(total_values);
         if(outputs_extend[expand_offset]->numel()==0) {
           outputs_extend[expand_offset]->set_layout(paddle::framework::DataLayout::UNDEFINED);
         } else {
@@ -204,7 +158,6 @@ static void PullBoxExtendedSparseFunctor(
           outputs_extend[expand_offset]->ShareBufferWith(total_values);
         }
         auto *output_extend = outputs_extend[expand_offset]->mutable_data(ctx.GetPlace());
-        // offset_expand += outputs_extend[expand_offset]->numel() * sizeof(T);
         all_values[i + slot_size] = reinterpret_cast(output_extend);
         ++expand_offset;
       } else {
@@ -219,53 +172,12 @@ static void PullBoxExtendedSparseFunctor(
   auto emb_extended_size = ctx.Attr("emb_extended_size");
   auto expand_only = ctx.Attr("expand_only");
   auto box_ptr = paddle::framework::BoxWrapper::GetInstance();
-  static int target_id = std::getenv("HSQ_XPURT_TARGET_DEVICE")!=NULL ?
-                             std::stoi(std::string(std::getenv("HSQ_XPURT_TARGET_DEVICE"))) :
-                             0;
-  target_id = target_id;
-  int dev_id = ctx.GetPlace().GetDeviceId();//xpu_ctx->dev().id();
-  dev_id= dev_id;
-  // if(dev_id==target_id) {
-  //   printf("[hsq] dev_id:%d, 1.going to call box_ptr->PullSparse\n", dev_id);
-
-  //   int output_index = 0;
-  //   int output_expand_index = 0;
-  //   printf("[hsq] total_dims0:%d, total_expand_dims0:%d, max_total_dims0:%d, dims1:%d, expand_dims1:%d\n",total_dims0, total_expand_dims0, max_total_dims0, dims1, expand_dims1);
-  //   printf("[hsq] total_values's ptr: %p, ptr_end:%p\n", total_values.data(), total_values.data()+total_values.numel());
-  //   printf("[hsq] pull_box_extend_sparse tensor shape:\n");
-  //   for(int i = 0; i < (int)slot_size; i++) {
-  //     printf("[hsq] input[%d].shape: [", i);
-  //     for(int j =0;j<(int)inputs[i]->dims().size();j++){
-  //       printf("%d,", (int)inputs[i]->dims()[j]);
-  //     }
-  //     printf("]\n");
-
-  //     if(flags[i] & 0x01) {
-  //       printf("[hsq] output[%d].shape: [", i);
-  //       for(int j =0;j<(int)outputs[output_index]->dims().size();j++){
-  //         printf("%d,", (int)outputs[output_index]->dims()[j]);
-  //       }
-  //       printf("], ptr_begin:%p, ptr_end:%p, slot_dims0_offset[%d]: %d\n", outputs[output_index]->data(), outputs[output_index]->data()+outputs[output_index]->numel(), i, slot_dims0_offset[i]);
-  //       output_index++;
-  //     }
-
-  //     if(flags[i] & 0x02) {
-  //       printf("[hsq] output_expand[%d].shape: [", i);
-  //       for(int j =0;j<(int)outputs_extend[output_expand_index]->dims().size();j++){
-  //         printf("%d,", (int)outputs_extend[output_expand_index]->dims()[j]);
-  //       }
-  //       printf("], ptr_begin:%p, ptr_end:%p\n", outputs_extend[output_expand_index]->data(), outputs_extend[output_expand_index]->data()+outputs_extend[output_expand_index]->numel());
-  //       output_expand_index++;
-  //     }
-  //   }
-  // }
   box_ptr->PullSparse(ctx.GetPlace(), all_keys, all_values, slot_lengths,
                       emb_size, emb_extended_size, skip_offset, expand_only);
   if (std::getenv("DUMP_XPU_PUSH_SPARSE_INPUT") != nullptr) {
     auto names = ctx.OutputNames("Out");
     for (int i = 0; i ("emb_extended_size");
   auto expand_only = ctx.Attr("expand_only");
   auto box_ptr = paddle::framework::BoxWrapper::GetInstance();
-  // printf("[hsq] gping to call box_ptr->PushSparseGrad\n");
   if (std::getenv("DUMP_XPU_PUSH_SPARSE_INPUT") != nullptr) {
     auto names = ctx.InputNames(framework::GradVarName("OutExtend"));
     for (int i = (d_output_extend.size()-1); i >=0; i--) {
       TensorFormatter formatter;
-      // const std::string &name = ctx.InputNames(framework::GradVarName("Out"))[i];
       const std::string &name = names[i];
       formatter.SetPrintTensorType(true);
       formatter.SetPrintTensorShape(true);
@@ -396,7 +305,6 @@ static void PushBoxExtendedSparseFunctor(
     names = ctx.InputNames(framework::GradVarName("Out"));
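Patch 06 strips the ad-hoc printf/hsq_dump scaffolding but keeps the env-gated TensorFormatter dumps, so the pull/push paths stay inspectable without recompiling. A minimal sketch of the surviving pattern, assuming a tensor and name in scope (both placeholders here):

    if (std::getenv("DUMP_XPU_PUSH_SPARSE_INPUT") != nullptr) {  // no-op unless the env var is set
      TensorFormatter formatter;
      formatter.SetPrintTensorType(true);
      formatter.SetPrintTensorShape(true);
      formatter.SetPrintTensorLod(true);
      formatter.SetPrintTensorLayout(true);
      formatter.Print(tensor, name, "---embs_all_0---");  // tensor/name/message are illustrative
    }

The same gate wraps both the pull outputs (Out/OutExtend) and the push gradients, which keeps the hot path free of I/O in production runs.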
     for (int i = (d_output.size()-1); i >=0; i--) {
       TensorFormatter formatter;
-      // const std::string &name = ctx.InputNames(framework::GradVarName("Out"))[i];
       const std::string &name = names[i];
       formatter.SetPrintTensorType(true);
       formatter.SetPrintTensorShape(true);
diff --git a/paddle/fluid/operators/rank_attention_op_xpu.cc b/paddle/fluid/operators/rank_attention_op_xpu.cc
index cd911b2530b45..f252ddba1c316 100644
--- a/paddle/fluid/operators/rank_attention_op_xpu.cc
+++ b/paddle/fluid/operators/rank_attention_op_xpu.cc
@@ -56,23 +56,6 @@ class RankAttention2XPUKernel : public framework::OpKernel {
     auto& dev_ctx = ctx.template device_context();
     T* out_data = Out->mutable_data(ctx.GetPlace());
-    // if(ctx.GetPlace().GetDeviceId()==0) {
-    //   printf("[hsq] rank_attention input ptr:%p, rank_offset ptr:%p, param ptr:%p, out ptr:%p, ins_num: %d, x_fea_dim:%d, max_rank:%d, para_row:%d, para_col:%d\n", X->data(), rank_offset->data(), param->data(), out_data, (int)ins_num, (int)x_fea_dim, (int)max_rank, (int)para_row, (int)para_col);
-
-    //   std::vector h_mat(rank_offset->numel());
-    //   xpu_memcpy(h_mat.data(), rank_offset->data(), rank_offset->numel() * sizeof(int), XPU_DEVICE_TO_HOST);
-
-    //   if(ins_num*(2*max_rank+1)!=rank_offset->numel()){
-    //     printf("[hsq] check error\n");
-    //   }
-    //   std::cout<<"[hsq] mat_out: [";
-    //   for (int i = 0; i < ins_num; i++) {
-    //     std::cout<<"ins_id: "<(dev_ctx.x_context(), ins_num, x_fea_dim,
                                  X->data(), max_rank, rank_offset->data(), para_row, para_col,
@@ -81,7 +64,6 @@ class RankAttention2XPUKernel : public framework::OpKernel {
         ret, XPU_SUCCESS,
         platform::errors::External("The rank_attention2 XPU kernel return wrong value[%d %s]",
                                    ret, XPUAPIErrorMsg[ret]));
-    // }
   }
 };

From 0cc9d7c132826450460a22a12167d320b633d0e5 Mon Sep 17 00:00:00 2001
From: HuangShiqing
Date: Wed, 22 Nov 2023 15:28:36 +0800
Subject: [PATCH 07/20] abacus-aibox-842 fix some place

---
 .../fluid/framework/fleet/box_wrapper_impl.h  | 19 ++++++++++--------
 .../operators/pull_box_extended_sparse_op.h   | 20 +++++++++----------
 2 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h
index 70638232e9fe1..0e5d4c1dd065b 100644
--- a/paddle/fluid/framework/fleet/box_wrapper_impl.h
+++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h
@@ -740,14 +740,18 @@ void BoxWrapper::PushSparseGradCaseXPU(const paddle::platform::Place& place,
   // }
   std::vector slot_inner_offset(total_length);
   int out_count = 0;
-  for(int i=0;i(d_slot_inner_offset_tmp->ptr());
+  memory::Copy(place,
+               d_slot_inner_offset,
+               platform::CPUPlace(),
+               slot_inner_offset.data(),
+               total_length * sizeof(int));
   box_wrapper_kernel_->CopyForPush(place, xpu_values, total_grad_values_xpu,
                                    push_offset, total_length, slot_vector,
                                    (int*)d_slot_inner_offset, slot_lens, slot_num,
@@ -755,7 +759,6 @@ void BoxWrapper::PushSparseGradCaseXPU(const paddle::platform::Place& place,
                                    expand_embed_dim,
                                    push_float_num_,
                                    expand_only);
-  xpu_free(d_slot_inner_offset);

   push_boxps_timer.Resume();
 #ifdef TRACE_PROFILE
diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.h b/paddle/fluid/operators/pull_box_extended_sparse_op.h
index 1f3c7797c3e10..459d9f4937024 100644
--- a/paddle/fluid/operators/pull_box_extended_sparse_op.h
+++ b/paddle/fluid/operators/pull_box_extended_sparse_op.h
@@ -65,7 +65,7 @@ static void PullBoxExtendedSparseFunctor(
       if(flags.empty()) {
         offset += outputs[i]->dims()[0];
       } else {
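Patch 07's allocation change is worth calling out: memory::Alloc returns an RAII allocation bound to the device place, so the manual xpu_free() can be dropped without reintroducing the patch 05 leak. Sketched, with only the names from the hunk:

    // holder releases the device memory automatically when it goes out of scope
    auto d_slot_inner_offset_tmp = memory::Alloc(place, total_length * sizeof(int));
    int* d_slot_inner_offset = reinterpret_cast<int*>(d_slot_inner_offset_tmp->ptr());
    memory::Copy(place, d_slot_inner_offset, platform::CPUPlace(),
                 slot_inner_offset.data(), total_length * sizeof(int));
    // ... launch CopyForPush(...); no xpu_free() needed afterwards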
+        if(is_expand_slot_small){
           if (flags[i] & 0x01) {
             offset += outputs[embedx_offset]->dims()[0];
             dims1 = outputs[embedx_offset]->dims()[1];
@@ -94,7 +94,7 @@ static void PullBoxExtendedSparseFunctor(
   }

   framework::LoDTensor total_values;
-  total_values.Resize(phi::make_ddim({max_total_dims0*(dims1+expand_dims1)}));
+  total_values.Resize(phi::make_ddim({max_total_dims0 * (dims1 + expand_dims1)}));
   total_values.mutable_data(ctx.GetPlace());

   // BoxPS only supports float now
@@ -107,10 +107,10 @@ static void PullBoxExtendedSparseFunctor(
           reinterpret_cast(slot->data());
       all_keys[i] = single_slot_keys;
       slot_lengths[i] = slot->numel();
-      if(outputs[embedx_offset]->numel()==0) {
+      if (outputs[embedx_offset]->numel() == 0) {
         outputs[embedx_offset]->set_layout(paddle::framework::DataLayout::UNDEFINED);
       } else {
-        int offset = slot_dims0_offset[i]*dims1* sizeof(T);
+        size_t offset = slot_dims0_offset[i] * dims1 * sizeof(T);
         total_values.set_offset(offset);
         outputs[i]->ShareBufferWith(total_values);
       }
@@ -119,8 +119,8 @@ static void PullBoxExtendedSparseFunctor(
       if(outputs_extend[expand_offset]->numel()==0) {
         outputs_extend[expand_offset]->set_layout(paddle::framework::DataLayout::UNDEFINED);
       } else {
-        int offset = slot_dims0_offset[i]*expand_dims1* sizeof(T);
-        total_values.set_offset(max_total_dims0*dims1* sizeof(T)+offset);
+        size_t offset = slot_dims0_offset[i] * expand_dims1 * sizeof(T);
+        total_values.set_offset(max_total_dims0 * dims1 * sizeof(T) + offset);
         outputs_extend[i]->ShareBufferWith(total_values);
       }
       auto *output_extend = outputs_extend[i]->mutable_data(ctx.GetPlace());
@@ -136,10 +136,10 @@ static void PullBoxExtendedSparseFunctor(
       all_keys[i] = single_slot_keys;
       slot_lengths[i] = slot->numel();
       if (flags[i] & 0x01) {
-        if(outputs[embedx_offset]->numel()==0) {
+        if (outputs[embedx_offset]->numel() == 0) {
           outputs[embedx_offset]->set_layout(paddle::framework::DataLayout::UNDEFINED);
         } else {
-          int offset = slot_dims0_offset[i]*dims1* sizeof(T);
+          size_t offset = slot_dims0_offset[i] * dims1 * sizeof(T);
           total_values.set_offset(offset);
           outputs[embedx_offset]->ShareBufferWith(total_values);
         }
@@ -153,8 +153,8 @@ static void PullBoxExtendedSparseFunctor(
         if(outputs_extend[expand_offset]->numel()==0) {
           outputs_extend[expand_offset]->set_layout(paddle::framework::DataLayout::UNDEFINED);
         } else {
-          int offset = slot_dims0_offset[i]*expand_dims1* sizeof(T);
-          total_values.set_offset(max_total_dims0*dims1* sizeof(T)+offset);
+          size_t offset = slot_dims0_offset[i] * expand_dims1 * sizeof(T);
+          total_values.set_offset(max_total_dims0 * dims1 * sizeof(T) + offset);
           outputs_extend[expand_offset]->ShareBufferWith(total_values);
         }
         auto *output_extend = outputs_extend[expand_offset]->mutable_data(ctx.GetPlace());

From 1f131dd1fd6703d62444f7e83a4c4177de62f084 Mon Sep 17 00:00:00 2001
From: HuangShiqing
Date: Fri, 24 Nov 2023 14:31:22 +0800
Subject: [PATCH 08/20] abacus-aibox-899 adjust the data_feed's disable_random

---
 paddle/fluid/framework/data_feed.kps                | 3 +--
 paddle/fluid/framework/data_set.cc                  | 9 +++++----
 paddle/fluid/framework/fleet/box_wrapper_kernel.kps | 2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/framework/data_feed.kps b/paddle/fluid/framework/data_feed.kps
index 796ac1c87509a..a793dc2056e98 100644
--- a/paddle/fluid/framework/data_feed.kps
+++ b/paddle/fluid/framework/data_feed.kps
@@ -1,9 +1,8 @@
 #ifdef PADDLE_WITH_XPU_KP
 #include "paddle/fluid/framework/data_feed_xpu_kernel_helper.h"
 #include "xpu/kernel/xtdk.h"
-// #include "xpu/kernel/debug.h"
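Patch 08's data_set.cc change reads more clearly as a sketch: when random update is disabled, a single reader thread preserves deterministic sample order, and read_ins_ref_ is kept equal to the number of readers actually spawned so the completion count still balances. Names come from the patch:

    int read_thread_num = thread_num_;
    if (disable_random_update_) {
      read_thread_num = 1;  // one reader => deterministic order
    }
    read_ins_ref_ = read_thread_num;  // must match the spawn count below
    for (int64_t i = 0; i < read_thread_num; ++i) {
      // enqueue reader i on down_pool_ ...
    }

The earlier code keyed this off disable_shuffle_ and mutated read_ins_ref_ directly; the patch switches the trigger to disable_random_update_ and introduces the local so the loop bound and the refcount cannot drift apart.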
#include "xpu/kernel/xtdk_math.h" -#include "xpu/kernel/xtdk_io.h" +// #include "xpu/kernel/xtdk_io.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index d0129cd9151dd..66e4a6f665ed4 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -2113,12 +2113,13 @@ void PadBoxSlotDataset::PreLoadIntoDisk(const std::string& path, VLOG(3) << "RegisterClientToClientMsgHandler done"; } CHECK(slot_pool_ != nullptr) << "slotrecord pool nullptr"; - read_ins_ref_ = thread_num_; - if (disable_shuffle_) { - read_ins_ref_ = 1; + int read_thread_num = thread_num_; + if (disable_random_update_) { + read_thread_num = 1; } + read_ins_ref_ = read_thread_num; CHECK(down_pool_ != nullptr) << "down_pool nullptr"; - for (int64_t i = 0; i < read_ins_ref_; ++i) { + for (int64_t i = 0; i < read_thread_num; ++i) { wait_futures_.emplace_back(down_pool_->Run([this, i]() { platform::Timer timer; timer.Start(); diff --git a/paddle/fluid/framework/fleet/box_wrapper_kernel.kps b/paddle/fluid/framework/fleet/box_wrapper_kernel.kps index b6ec83564c54f..b00ac3031745f 100644 --- a/paddle/fluid/framework/fleet/box_wrapper_kernel.kps +++ b/paddle/fluid/framework/fleet/box_wrapper_kernel.kps @@ -30,7 +30,7 @@ limitations under the License. */ #include "xpu/kernel/xtdk_simd.h" #ifdef TRACE_PROFILE -#include "xpu/kernel/xtdk_io.h" +// #include "xpu/kernel/xtdk_io.h" #include // The producer side. From cae23ba8ce8427aef1744cf8e1b9a65938063a45 Mon Sep 17 00:00:00 2001 From: zmxdream Date: Mon, 27 Nov 2023 14:01:08 +0800 Subject: [PATCH 09/20] fix pull_box_extend_sparse --- paddle/fluid/operators/pull_box_extended_sparse_op.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.h b/paddle/fluid/operators/pull_box_extended_sparse_op.h index 459d9f4937024..3807cf020abe0 100644 --- a/paddle/fluid/operators/pull_box_extended_sparse_op.h +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.h @@ -107,8 +107,8 @@ static void PullBoxExtendedSparseFunctor( reinterpret_cast(slot->data()); all_keys[i] = single_slot_keys; slot_lengths[i] = slot->numel(); - if (outputs[embedx_offset]->numel() == 0) { - outputs[embedx_offset]->set_layout(paddle::framework::DataLayout::UNDEFINED); + if (outputs[i]->numel() == 0) { + outputs[i]->set_layout(paddle::framework::DataLayout::UNDEFINED); } else { size_t offset = slot_dims0_offset[i] * dims1 * sizeof(T); total_values.set_offset(offset); @@ -116,8 +116,8 @@ static void PullBoxExtendedSparseFunctor( } auto *output = outputs[i]->mutable_data(ctx.GetPlace()); all_values[i] = reinterpret_cast(output); - if(outputs_extend[expand_offset]->numel()==0) { - outputs_extend[expand_offset]->set_layout(paddle::framework::DataLayout::UNDEFINED); + if(outputs_extend[i]->numel()==0) { + outputs_extend[i]->set_layout(paddle::framework::DataLayout::UNDEFINED); } else { size_t offset = slot_dims0_offset[i] * expand_dims1 * sizeof(T); total_values.set_offset(max_total_dims0 * dims1 * sizeof(T) + offset); From 92f99c7658496b00c620fc81d49c181fa4e8a91f Mon Sep 17 00:00:00 2001 From: zmxdream Date: Mon, 27 Nov 2023 14:08:38 +0800 Subject: [PATCH 10/20] fix pull_box_extend_sparse --- paddle/fluid/operators/pull_box_extended_sparse_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.h b/paddle/fluid/operators/pull_box_extended_sparse_op.h index 3807cf020abe0..084aacf58c872 
--- a/paddle/fluid/operators/pull_box_extended_sparse_op.h
+++ b/paddle/fluid/operators/pull_box_extended_sparse_op.h
@@ -116,7 +116,7 @@ static void PullBoxExtendedSparseFunctor(
       }
       auto *output = outputs[i]->mutable_data(ctx.GetPlace());
       all_values[i] = reinterpret_cast(output);
-      if(outputs_extend[i]->numel()==0) {
+      if(outputs_extend[i]->numel() == 0) {
         outputs_extend[i]->set_layout(paddle::framework::DataLayout::UNDEFINED);
       } else {
         size_t offset = slot_dims0_offset[i] * expand_dims1 * sizeof(T);

From afa8c29c2521a47f66cbe6296f7d619000d90676 Mon Sep 17 00:00:00 2001
From: HuangShiqing
Date: Mon, 27 Nov 2023 10:15:42 +0800
Subject: [PATCH 11/20] abacus-aibox-901 fix the asq's pushcopy compatible error

---
 .../framework/fleet/box_wrapper_kernel.kps    | 25 +++++++++++--------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/framework/fleet/box_wrapper_kernel.kps b/paddle/fluid/framework/fleet/box_wrapper_kernel.kps
index b00ac3031745f..24a8fa84468f4 100644
--- a/paddle/fluid/framework/fleet/box_wrapper_kernel.kps
+++ b/paddle/fluid/framework/fleet/box_wrapper_kernel.kps
@@ -858,7 +858,7 @@ inline void FeaturePushCopyNNCross(
 #endif
 }

-__global__ void PushCopy(float* src_vals,
+__global__ void PushCopy(unsigned long long* src_vals,
                          float* dest_vals,
                          boxps::FeaturePushOffset* push_offset,
                          const int push_float_num,
@@ -902,9 +902,21 @@ __global__ void PushCopy(float* src_vals,
     sm_slots[i] = lm_slot;
   }

+  __shared__ uint64_t sm_src_vals_ptr[max_slot_num];
+  for (int i = cid; i < sm_slot_len; i += ncores) {
+    GM2SM(src_vals + i, sm_src_vals_ptr + i, sizeof(uint64_t));
+  }
+
   mfence();
   xpu_sync_all();

+  __local__ uint64_t lm_src_vals_ptr[1];
+  for (int i = 0; i < slot_num; i++) {
+    if (sm_src_vals_ptr[i] != 0) {
+      lm_src_vals_ptr[0] = sm_src_vals_ptr[i];
+      break;
+    }
+  }
   GM2LM(push_offset, &info, sizeof(boxps::FeaturePushOffset));

   float scale = -1. * batch_size;
@@ -917,7 +929,7 @@ __global__ void PushCopy(unsigned long long* src_vals,
     int count_per_loop =
         min(per_thread_per_loop_len, total_length - gm_offset);
-    GM2LM(src_vals + gm_offset * hidden_size, lm_src_vals,
+    GM2LM((__global_ptr__ float*)lm_src_vals_ptr[0] + gm_offset * hidden_size, lm_src_vals,
           count_per_loop * hidden_size * sizeof(float));
     GM2LM(total_dims + gm_offset, lm_total_dims,
           count_per_loop * sizeof(int));
@@ -1024,14 +1036,7 @@ void BoxWrapperKernel::CopyForPush(
   } else {
     // FeaturePushCopy
     // TODO:
-    float* real_gm_src_ptr;
-    for (int i = 0; i < slot_num; i++) {
-      if(gm_src_ptr[i] != 0) {
-        real_gm_src_ptr = const_cast(gm_src_ptr[i]);
-        break;
-      }
-    }
-    PushCopy<<<8, 64, stream>>>(real_gm_src_ptr, push_grad_values, push_offset,
+    PushCopy<<<8, 64, stream>>>(reinterpret_cast(gm_src_ptr), push_grad_values, push_offset,
                                 push_float_num_, c_total_length, hidden_size, batch_size,
                                 total_dims, skip_offset, cvm_offset, key2slot, slots, slot_num);
   }
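The shape of patch 11's fix: instead of the host dereferencing the per-slot pointer table (gm_src_ptr holds device addresses, so reading gm_src_ptr[i] on the host is invalid) and passing one raw float*, the kernel now receives the whole table as 64-bit integers and scans it on device. Condensed from the hunks above:

    // host side: hand the pointer table itself to the kernel
    PushCopy<<<8, 64, stream>>>(
        reinterpret_cast<unsigned long long*>(gm_src_ptr),  // table of per-slot src pointers
        push_grad_values, push_offset, /* ... */);

    // device side: stage the table into shared memory, then pick the
    // first non-null entry as the base address for the GM2LM loads
    for (int i = 0; i < slot_num; i++) {
      if (sm_src_vals_ptr[i] != 0) { lm_src_vals_ptr[0] = sm_src_vals_ptr[i]; break; }
    }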
From 50e8d5f6218fe4783fa3d0a3bc40b91e3a1f4bc4 Mon Sep 17 00:00:00 2001
From: xiayanming
Date: Fri, 24 Nov 2023 17:27:31 +0800
Subject: [PATCH 12/20] fix cvrq check nan

---
 .../framework/fleet/box_wrapper_kernel.kps    | 222 ++++++++++++++++--
 1 file changed, 204 insertions(+), 18 deletions(-)

diff --git a/paddle/fluid/framework/fleet/box_wrapper_kernel.kps b/paddle/fluid/framework/fleet/box_wrapper_kernel.kps
index 24a8fa84468f4..34f25cb0389ce 100644
--- a/paddle/fluid/framework/fleet/box_wrapper_kernel.kps
+++ b/paddle/fluid/framework/fleet/box_wrapper_kernel.kps
@@ -30,7 +30,7 @@ limitations under the License. */
 #include "xpu/kernel/xtdk_simd.h"

 #ifdef TRACE_PROFILE
-// #include "xpu/kernel/xtdk_io.h"
+#include "xpu/kernel/xtdk_io.h"
 #include
 // The producer side.
@@ -70,6 +70,15 @@ struct ExpandPushGetOp {
   }
 };

+struct ExpandPushEmdGetOp {
+  __device__ float get(float* expand, const int& row,
+                       const int& expand_id,
+                       const int& hidden,
+                       const int& expand_dim) const {
+    return expand[row * (hidden + expand_dim) + hidden + expand_id];
+  }
+};
+
 template
 __device__ void set_byfloat(float* dest, const T& val) {
   (*reinterpret_cast(dest)) = val;
 }
@@ -340,6 +349,152 @@ __global__ void PullCopyNNCross(const TEmbedxOp* op,
   }
 }

+template
+__global__ void PullCopyNNCrossWithEmb(const TEmbedxOp* op,
+                                       const float scale,
+                                       const boxps::FeaturePullOffset* info,
+                                       int* total_dims,
+                                       unsigned long long* dst_vals,
+                                       const int* key2slot,
+                                       float* total_values,
+                                       const uint32_t* restore_idx,
+                                       const int total_length,
+                                       const int max_cols_num,
+                                       const int hidden_size,
+                                       const int expand_embed_dim,
+                                       const int pull_float_num,
+                                       const int skip_offset,
+                                       const int cvm_offset,
+                                       const int slot_num) {
+  int cid = core_id();
+  int ncores = core_num();
+  if (cid >= ncores) {
+    return;
+  }
+  int thread_id = cluster_id() * ncores + cid;
+  int nthreads = cluster_num() * ncores;
+
+  const int buf_length = 5;
+  int per_thread_len = roundup_div(total_length, nthreads);
+  int per_thread_loop_count = roundup_div(per_thread_len, buf_length);
+  int per_thread_per_loop_len = roundup_div(per_thread_len, per_thread_loop_count);
+
+  __local__ float lm_total_values[buf_length * pull_float_num];
+  __local__ float lm_dst_vals[buf_length * hidden_size];
+  __local__ float lm_dst_expand_vals[buf_length * (hidden_size + expand_embed_dim)];
+  __local__ int lm_key2slot[buf_length];
+  __local__ int lm_total_dims[buf_length];
+  __local__ uint32_t lm_restore_idx[buf_length];
+  __local__ boxps::FeaturePullOffset lm_info[1];
+  __local__ TEmbedxOp lm_op[1];
+
+  const int max_slot_num = 1000;
+  int sm_slot_len = min(max_slot_num, slot_num);
+  __shared__ uint64_t sm_dst_vals_ptr[max_slot_num];
+  __shared__ uint64_t sm_dst_expand_vals_ptr[max_slot_num];
+  for (int i = cid; i < sm_slot_len; i += ncores) {
+    GM2SM(dst_vals + i, sm_dst_vals_ptr + i, sizeof(uint64_t));
+    GM2SM(dst_vals + slot_num + i, sm_dst_expand_vals_ptr + i, sizeof(uint64_t));
+  }
+  mfence();
+  xpu_sync_all();
+
+  __local__ uint64_t lm_dst_vals_ptr[1];
+  for(int i=0;i= total_length) {
+      return;
+    }
+
+    int len = min(per_thread_per_loop_len, total_length - gm_offset);
+    if(restore_idx != nullptr) {
+      GM2LM(restore_idx + gm_offset, lm_restore_idx, len * sizeof(uint32_t));
+    }
+    int pos = (restore_idx != nullptr) ? lm_restore_idx[gm_offset] : gm_offset;
+    GM2LM(total_values + pos * pull_float_num, lm_total_values, len * pull_float_num * sizeof(float));
+    GM2LM(total_dims + gm_offset, lm_total_dims, len * sizeof(int));
+    GM2LM(key2slot + gm_offset, lm_key2slot, len * sizeof(int));
+
+    for (int j = 0; j < len; j++) {
+      // mfence();
+      // cvm offset
+      for (int k = 0; k < cvm_offset; ++k) {
+        //TODO:consider xpu_value[slot_id]==nullptr?
+        if (sm_dst_vals_ptr[lm_key2slot[j]] != 0) {
+          lm_dst_vals[j * hidden_size + k] = lm_total_values[j * pull_float_num + lm_info[0].show + skip_offset + k];
+        }
+        if (sm_dst_expand_vals_ptr[lm_key2slot[j]] != 0) {
+          lm_dst_expand_vals[j * (hidden_size + expand_embed_dim) + k] = lm_total_values[j * pull_float_num + lm_info[0].show + skip_offset + k];
+        }
+      }
+
+      // embedx
+      // embedx flags + expand flags && *(keys[x] + y) != 0 && *(keys[x] + y)
+      int embedx_size = *((int *)&(lm_total_values[j * pull_float_num + lm_info[0].embedx_size]));
+      // int embedx_size = 0;
+      // TODO: expand_size = expand_embed_dim?
+      int expand_size = *((int *)&(lm_total_values[j * pull_float_num + lm_info[0].expand_size]));
+      lm_total_dims[j] = static_cast(embedx_size > 0) | static_cast((expand_size > 0) << 1);
+
+      if (sm_dst_vals_ptr[lm_key2slot[j]] != 0) {
+        for (int k = cvm_offset; k < cvm_offset + embedx_size; ++k) {
+          lm_op[0].copy(lm_dst_vals + j * hidden_size + k,
+                        lm_total_values + j * pull_float_num + lm_info[0].embedx,
+                        k - cvm_offset,
+                        scale);
+        }
+
+        for (int k = cvm_offset + embedx_size; k < hidden_size; ++k) {
+          lm_dst_vals[j * hidden_size + k] = 0;
+        }
+      }
+
+      if (sm_dst_expand_vals_ptr[lm_key2slot[j]] != 0) {
+        for (int k = cvm_offset; k < cvm_offset + embedx_size; ++k) {
+          lm_op[0].copy(lm_dst_expand_vals + j * (hidden_size + expand_embed_dim) + k,
+                        lm_total_values + j * pull_float_num + lm_info[0].embedx,
+                        k - cvm_offset,
+                        scale);
+        }
+
+        for (int k = cvm_offset + embedx_size; k < hidden_size; ++k) {
+          lm_dst_expand_vals[j * (hidden_size + expand_embed_dim) + k] = 0;
+        }
+      }
+
+      // expand
+      if (sm_dst_expand_vals_ptr[lm_key2slot[j]] == 0) {
+        continue;
+      }
+
+      for (int k = hidden_size; k < hidden_size + expand_size; ++k) {
+        lm_op[0].copy(lm_dst_expand_vals + j * (hidden_size + expand_embed_dim) + k,
+                      lm_total_values + j * pull_float_num + lm_info[0].expand,
+                      k - hidden_size,
+                      scale);
+      }
+      for (int k = hidden_size + expand_size; k < max_cols_num; ++k) {
+        lm_dst_expand_vals[j * (hidden_size + expand_embed_dim) + k] = 0;
+      }
+    }
+    mfence();
+
+    LM2GM(lm_total_dims, total_dims + gm_offset, len * sizeof(int));
+    LM2GM(lm_dst_vals, ((__global_ptr__ float*)lm_dst_vals_ptr[0] + gm_offset * hidden_size), len * hidden_size * sizeof(float));
+    LM2GM(lm_dst_expand_vals, ((__global_ptr__ float*)lm_dst_vals_ptr[0] + total_length * hidden_size + gm_offset * (hidden_size + expand_embed_dim)), len * (hidden_size + expand_embed_dim) * sizeof(float));
+    mfence();
+  }
+}
+
 template
 inline void FeaturePullCopyNNCross(
     const paddle::platform::Place& place,
@@ -405,9 +560,22 @@ inline void FeaturePullCopyNNCross(
                                        cvm_offset,
                                        slot_num);
   } else {
-    // PullCopyNNCrossWithEmb
-    // TODO:
-    CHECK(false) << "PullCopyNNCrossWithEmb not implement";
+    PullCopyNNCrossWithEmb<<<8, 64, stream>>>(d_op,
+                                              scale,
+                                              info,
+                                              total_dims,
+                                              reinterpret_cast(d_xpu_values),
+                                              key2slot,
+                                              total_values_xpu,
+                                              xpu_restore_idx,
+                                              total_length,
+                                              (hidden_size + expand_embed_dim),
+                                              hidden_size,
+                                              expand_embed_dim,
+                                              pull_float_num,
+                                              skip_offset,
+                                              cvm_offset,
+                                              slot_num);
   }
   xpu_free(d_xpu_values);
   xpu_wait(stream);
@@ -816,21 +984,18 @@ inline void FeaturePushCopyNNCross(
   auto ctx_xpu = static_cast(dev_ctx)->x_context();
   auto stream = ctx_xpu->xpu_stream;

-  auto d_op_tmp = memory::Alloc(place, sizeof(TExpandPushGetOp));
-  TExpandPushGetOp* d_op = reinterpret_cast(d_op_tmp->ptr());
-  memory::Copy(place,
-               d_op,
-               platform::CPUPlace(),
-               op,
-               sizeof(TExpandPushGetOp));
 #ifdef TRACE_PROFILE
   TRACE_SCOPE_START("PushCopyNNCross",
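Patch 12's new ExpandPushEmdGetOp encodes the fused row layout that the WithEmb path writes: each row stores the hidden embedding first, then the expand embedding. The addressing it performs, restated as a plain function with the struct's own parameter names:

    // row layout: [ hidden floats | expand_dim floats ]
    float expand_element(const float* expand, int row, int expand_id,
                         int hidden, int expand_dim) {
      return expand[row * (hidden + expand_dim) + hidden + expand_id];
    }

By contrast, the existing ExpandPushGetOp indexes rows of width expand_dim only, which is why the expand_only and WithEmb paths now select different op types before launching the shared PushCopyNNCross kernel.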
xpu_wait(stream)); #endif if (expand_only) { - // TODO: - // if (d_sort_idx != nullptr){ - // } + ExpandPushGetOp op; + auto d_op_tmp = memory::Alloc(place, sizeof(ExpandPushGetOp)); + ExpandPushGetOp* d_op = reinterpret_cast(d_op_tmp->ptr()); + memory::Copy(place, + d_op, + platform::CPUPlace(), + &op, + sizeof(ExpandPushGetOp)); PushCopyNNCross<<<8, 64, stream>>>(d_op, info, reinterpret_cast(gm_src),//src @@ -848,9 +1013,30 @@ inline void FeaturePushCopyNNCross( skip_offset, bs); } else { - // PullCopyNNCrossWithEmb - // TODO: - CHECK(false) << "PullCopyNNCrossWithEmb not implement"; + ExpandPushEmdGetOp op; + auto d_op_tmp = memory::Alloc(place, sizeof(ExpandPushEmdGetOp)); + ExpandPushEmdGetOp* d_op = reinterpret_cast(d_op_tmp->ptr()); + memory::Copy(place, + d_op, + platform::CPUPlace(), + &op, + sizeof(ExpandPushEmdGetOp)); + PushCopyNNCross<<<8, 64, stream>>>(d_op, + info, + reinterpret_cast(gm_src),//src + total_dims, + key2slot, + slot_vector, + slot_inner_offset, + push_grad_values,//dst + total_length, + hidden_size, + expand_embed_dim, + slot_num, + push_float_num, + cvm_offset, + skip_offset, + bs); } #ifdef TRACE_PROFILE xpu_wait(stream); From d46e4ec3762ba5debeae3fecd90fe90f13fb68b1 Mon Sep 17 00:00:00 2001 From: HuangShiqing Date: Fri, 1 Dec 2023 11:31:19 +0800 Subject: [PATCH 13/20] abacus-aibox-906 add the support of op index_select/grad --- paddle/fluid/operators/index_select_op_xpu.cc | 234 ++++++++++++++++++ .../fluid/platform/device/xpu/xpu2_op_list.h | 4 + 2 files changed, 238 insertions(+) create mode 100644 paddle/fluid/operators/index_select_op_xpu.cc diff --git a/paddle/fluid/operators/index_select_op_xpu.cc b/paddle/fluid/operators/index_select_op_xpu.cc new file mode 100644 index 0000000000000..8e59a54df9de6 --- /dev/null +++ b/paddle/fluid/operators/index_select_op_xpu.cc @@ -0,0 +1,234 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
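+
+// index_select gathers slices of X along dimension `dim` according to an
+// integer index tensor: e.g. X of shape [4, 3] with dim = 0 and Index = [0, 2]
+// yields Out of shape [2, 3]. In this XPU port the forward pass maps onto
+// xpu::gather and the backward pass onto xpu::index_select_grad; both paths
+// accept INT32 or INT64 indices, which is validated below before dispatch.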
+ +#include "paddle/fluid/operators/index_select_op.h" +#include "paddle/phi/kernels/cpu/index_select_impl.h" + +namespace paddle { +namespace operators { +using LoDTensor = framework::LoDTensor; + +template +class IndexSelectXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto x = ctx.Input("X"); + auto index = ctx.Input("Index"); + auto out = ctx.Output("Out"); + auto dim = ctx.Attr("dim"); + + auto place = ctx.GetPlace(); + auto xpu_context = ctx.template device_context().x_context(); + + const T* x_data = x->data(); + T* out_data = out->mutable_data(place); + + int index_len = index->dims()[0]; + + const auto& index_type = index->dtype(); + bool index_type_match = + index_type == phi::DataType::INT64 || index_type == phi::DataType::INT32; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + // static int target_id = std::getenv("HSQ_XPURT_TARGET_DEVICE")!=NULL ? + // std::stoi(std::string(std::getenv("HSQ_XPURT_TARGET_DEVICE"))) : + // 0; + // int dev_id = ctx.GetPlace().GetDeviceId(); + // // if(dev_id == target_id) { + // // printf("[hsq] input shape: %d, %d\n", (int)x->dims()[0], (int)x->dims()[1]); + // // printf("[hsq] index_len: %d\n", index_len); + // // printf("[hsq] out shape: %d, %d\n", (int)out->dims()[0], (int)out->dims()[1]); + // // } + + // auto cpu_device_ctx = platform::DeviceContextPool::Instance().Get(phi::CPUPlace()); + // framework::ExecutionContext cpu_execution_ctx(ctx.GetOp(), ctx.scope(), *cpu_device_ctx, ctx.Context()); + + // LoDTensor x_cpu_copy; + // framework::TensorCopySync(*x, platform::CPUPlace(), &x_cpu_copy); + // LoDTensor index_cpu_copy; + // framework::TensorCopySync(*index, platform::CPUPlace(), &index_cpu_copy); + // LoDTensor out_cpu_copy; + // framework::TensorCopySync(*out, platform::CPUPlace(), &out_cpu_copy); + + // if (index_type == phi::DataType::INT32) { + // IndexSelectInner(cpu_execution_ctx, &x_cpu_copy, index_cpu_copy, &out_cpu_copy, dim); + // } else if (index_type == phi::DataType::INT64) { + // IndexSelectInner(cpu_execution_ctx, &x_cpu_copy, index_cpu_copy, &out_cpu_copy, dim); + // } + + int r = -1; + std::vector xshape = phi::vectorize(x->dims()); + if (index_type == phi::DataType::INT64) { + const int64_t* index_data = index->data(); + r = xpu::gather(xpu_context, x_data, index_data, out_data, xshape, index_len, dim); + } else { + const int* index_data = index->data(); + r = xpu::gather(xpu_context, x_data, index_data, out_data, xshape, index_len, dim); + } + + // LoDTensor out_ref_cpu_copy; + // framework::TensorCopySync(*out, platform::CPUPlace(), &out_ref_cpu_copy); + // bool correct = true; + // float diff = 1e-5; + // for (int i = 0; i < out_ref_cpu_copy.numel(); i++) { + // T* ref_data = out_ref_cpu_copy.data(); + // T* cpu_data = out_cpu_copy.data(); + // if(std::abs(*(ref_data + i) - *(cpu_data+i)) > diff) { + // correct = false; + // printf("[hsq] error in %d, out_ref_cpu_copy[%d]=%f, out_cpu_copy[%d]=%f\n", i, i, *(ref_data+i), i, *(cpu_data+i)); + // break; + // } + // } + // if(dev_id == target_id) { + // if(correct) { + // printf("[hsq] index_select op test passed\n"); + // } + // } + + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::External( + "The index_select XPU OP return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + } +}; + +template +class 
IndexSelectGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto x = ctx.Input("X"); + auto index = ctx.Input("Index"); + auto out_grad = ctx.Input(framework::GradVarName("Out")); + auto x_grad = ctx.Output(framework::GradVarName("X")); + auto dim = ctx.Attr("dim"); + + auto place = ctx.GetPlace(); + auto xpu_context = ctx.template device_context().x_context(); + + const auto& index_type = index->dtype(); + bool index_type_match = + index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + const T* x_data = x->data(); + const T* out_grad_data = out_grad->data(); + T* x_grad_data = x_grad->mutable_data(place); + + // auto cpu_device_ctx = platform::DeviceContextPool::Instance().Get(phi::CPUPlace()); + // framework::ExecutionContext cpu_execution_ctx(ctx.GetOp(), ctx.scope(), *cpu_device_ctx, ctx.Context()); + + // LoDTensor out_grad_cpu_copy; + // framework::TensorCopySync(*out_grad, platform::CPUPlace(), &out_grad_cpu_copy); + // LoDTensor index_cpu_copy; + // framework::TensorCopySync(*index, platform::CPUPlace(), &index_cpu_copy); + // LoDTensor x_grad_cpu_copy; + // framework::TensorCopySync(*x_grad, platform::CPUPlace(), &x_grad_cpu_copy); + // if (index_type == phi::DataType::INT32) { + // IndexSelectGradInner(cpu_execution_ctx, out_grad_cpu_copy, index_cpu_copy, &x_grad_cpu_copy, dim); + // } else if (index_type == phi::DataType::INT64) { + // IndexSelectGradInner( + // cpu_execution_ctx, out_grad_cpu_copy, index_cpu_copy, &x_grad_cpu_copy, dim); + // } + + int r = -1; + std::vector out_grad_shape = phi::vectorize(out_grad->dims()); + std::vector x_grad_shape = phi::vectorize(x_grad->dims()); + + // static int target_id = std::getenv("HSQ_XPURT_TARGET_DEVICE")!=NULL ? 
+ // std::stoi(std::string(std::getenv("HSQ_XPURT_TARGET_DEVICE"))) : + // 0; + // int dev_id = ctx.GetPlace().GetDeviceId(); + // // if(dev_id == target_id) { + // // printf("[hsq] out_grad_shape:["); + // // for(int i = 0; i < (int)out_grad_shape.size(); i++) { + // // printf("%d, ", (int)out_grad_shape[i]); + // // } + // // printf("]\n"); + + // // printf("[hsq] x_grad_shape:["); + // // for(int i = 0; i < (int)x_grad_shape.size(); i++) { + // // printf("%d, ", (int)x_grad_shape[i]); + // // } + // // printf("]\n"); + // // } + if (index_type == phi::DataType::INT64) { + const int64_t* index_data = index->data(); + r = xpu::index_select_grad(xpu_context, + x_data, + index_data, + out_grad_data, + dim, + x_grad_data, + out_grad_shape, + x_grad_shape); + } else { + const int* index_data = index->data(); + r = xpu::index_select_grad(xpu_context, + x_data, + index_data, + out_grad_data, + dim, + x_grad_data, + out_grad_shape, + x_grad_shape); + } + + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::External( + "The index_select_grad XPU OP return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + + // LoDTensor x_grad_ref_cpu_copy; + // framework::TensorCopySync(*x_grad, platform::CPUPlace(), &x_grad_ref_cpu_copy); + // bool correct = true; + // float diff = 1e-5; + // for (int i = 0; i < x_grad_ref_cpu_copy.numel(); i++) { + // T* ref_data = x_grad_ref_cpu_copy.data(); + // T* cpu_data = x_grad_cpu_copy.data(); + // if(std::abs(*(ref_data + i) - *(cpu_data+i)) > diff) { + // correct = false; + // printf("[hsq] error in %d, out_ref_cpu_copy[%d]=%f, out_cpu_copy[%d]=%f\n", i, i, *(ref_data+i), i, *(cpu_data+i)); + // break; + // } + // } + + // if(dev_id == target_id) { + // if(correct) { + // printf("[hsq] index_select_grad op test passed\n"); + // } + // } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL(index_select, ops::IndexSelectXPUKernel) + +REGISTER_OP_XPU_KERNEL(index_select_grad, ops::IndexSelectGradXPUKernel) diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index b4da8758619c1..df5585b3128ce 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -585,6 +585,10 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"push_box_extended_sparse", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"index_select", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"index_select_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, }; return s_xpu2_kernels; } From b72cb653849e149f3ce2221bdac2bee1a32ebbb3 Mon Sep 17 00:00:00 2001 From: HuangShiqing Date: Fri, 1 Dec 2023 15:24:16 +0800 Subject: [PATCH 14/20] abacus-aibox-906 remove the debug info --- cmake/external/xpu.cmake | 2 +- paddle/fluid/operators/index_select_op_xpu.cc | 99 ------------------- 2 files changed, 1 insertion(+), 100 deletions(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 7649af9a0ccfd..3e3d3849b8e0d 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -106,7 +106,7 @@ if (WITH_BOX_PS OR WITH_XPU_KP) CACHE STRING "" FORCE) #"https://klx-sdk-release-public.su.bcebos.com/xdnn/release/2.6.0.1/${XPU_XDNN_DIR_NAME}.tar.gz" set(XPU_XDNN_URL - "https://klx-sdk-release-public.su.bcebos.com/xdnn_train/dev/paddlebox/20231115/${XPU_XDNN_DIR_NAME}.tar.gz" + 
"https://klx-sdk-release-public.su.bcebos.com/xdnn_train/dev/paddlebox/20231201/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) set(SCALOPUS_URL "https://klx-sdk-release-public.su.bcebos.com/xdnn_train/dev/paddlebox/20230306/scalopus.tar.gz" diff --git a/paddle/fluid/operators/index_select_op_xpu.cc b/paddle/fluid/operators/index_select_op_xpu.cc index 8e59a54df9de6..3c4172e7abfcb 100644 --- a/paddle/fluid/operators/index_select_op_xpu.cc +++ b/paddle/fluid/operators/index_select_op_xpu.cc @@ -48,32 +48,6 @@ class IndexSelectXPUKernel : public framework::OpKernel { phi::DataType::INT32, phi::DataType::INT64)); - // static int target_id = std::getenv("HSQ_XPURT_TARGET_DEVICE")!=NULL ? - // std::stoi(std::string(std::getenv("HSQ_XPURT_TARGET_DEVICE"))) : - // 0; - // int dev_id = ctx.GetPlace().GetDeviceId(); - // // if(dev_id == target_id) { - // // printf("[hsq] input shape: %d, %d\n", (int)x->dims()[0], (int)x->dims()[1]); - // // printf("[hsq] index_len: %d\n", index_len); - // // printf("[hsq] out shape: %d, %d\n", (int)out->dims()[0], (int)out->dims()[1]); - // // } - - // auto cpu_device_ctx = platform::DeviceContextPool::Instance().Get(phi::CPUPlace()); - // framework::ExecutionContext cpu_execution_ctx(ctx.GetOp(), ctx.scope(), *cpu_device_ctx, ctx.Context()); - - // LoDTensor x_cpu_copy; - // framework::TensorCopySync(*x, platform::CPUPlace(), &x_cpu_copy); - // LoDTensor index_cpu_copy; - // framework::TensorCopySync(*index, platform::CPUPlace(), &index_cpu_copy); - // LoDTensor out_cpu_copy; - // framework::TensorCopySync(*out, platform::CPUPlace(), &out_cpu_copy); - - // if (index_type == phi::DataType::INT32) { - // IndexSelectInner(cpu_execution_ctx, &x_cpu_copy, index_cpu_copy, &out_cpu_copy, dim); - // } else if (index_type == phi::DataType::INT64) { - // IndexSelectInner(cpu_execution_ctx, &x_cpu_copy, index_cpu_copy, &out_cpu_copy, dim); - // } - int r = -1; std::vector xshape = phi::vectorize(x->dims()); if (index_type == phi::DataType::INT64) { @@ -84,25 +58,6 @@ class IndexSelectXPUKernel : public framework::OpKernel { r = xpu::gather(xpu_context, x_data, index_data, out_data, xshape, index_len, dim); } - // LoDTensor out_ref_cpu_copy; - // framework::TensorCopySync(*out, platform::CPUPlace(), &out_ref_cpu_copy); - // bool correct = true; - // float diff = 1e-5; - // for (int i = 0; i < out_ref_cpu_copy.numel(); i++) { - // T* ref_data = out_ref_cpu_copy.data(); - // T* cpu_data = out_cpu_copy.data(); - // if(std::abs(*(ref_data + i) - *(cpu_data+i)) > diff) { - // correct = false; - // printf("[hsq] error in %d, out_ref_cpu_copy[%d]=%f, out_cpu_copy[%d]=%f\n", i, i, *(ref_data+i), i, *(cpu_data+i)); - // break; - // } - // } - // if(dev_id == target_id) { - // if(correct) { - // printf("[hsq] index_select op test passed\n"); - // } - // } - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, platform::errors::External( "The index_select XPU OP return wrong value[%d %s]", @@ -139,43 +94,9 @@ class IndexSelectGradXPUKernel : public framework::OpKernel { const T* out_grad_data = out_grad->data(); T* x_grad_data = x_grad->mutable_data(place); - // auto cpu_device_ctx = platform::DeviceContextPool::Instance().Get(phi::CPUPlace()); - // framework::ExecutionContext cpu_execution_ctx(ctx.GetOp(), ctx.scope(), *cpu_device_ctx, ctx.Context()); - - // LoDTensor out_grad_cpu_copy; - // framework::TensorCopySync(*out_grad, platform::CPUPlace(), &out_grad_cpu_copy); - // LoDTensor index_cpu_copy; - // framework::TensorCopySync(*index, platform::CPUPlace(), &index_cpu_copy); - // 
LoDTensor x_grad_cpu_copy; - // framework::TensorCopySync(*x_grad, platform::CPUPlace(), &x_grad_cpu_copy); - // if (index_type == phi::DataType::INT32) { - // IndexSelectGradInner(cpu_execution_ctx, out_grad_cpu_copy, index_cpu_copy, &x_grad_cpu_copy, dim); - // } else if (index_type == phi::DataType::INT64) { - // IndexSelectGradInner( - // cpu_execution_ctx, out_grad_cpu_copy, index_cpu_copy, &x_grad_cpu_copy, dim); - // } - int r = -1; std::vector out_grad_shape = phi::vectorize(out_grad->dims()); std::vector x_grad_shape = phi::vectorize(x_grad->dims()); - - // static int target_id = std::getenv("HSQ_XPURT_TARGET_DEVICE")!=NULL ? - // std::stoi(std::string(std::getenv("HSQ_XPURT_TARGET_DEVICE"))) : - // 0; - // int dev_id = ctx.GetPlace().GetDeviceId(); - // // if(dev_id == target_id) { - // // printf("[hsq] out_grad_shape:["); - // // for(int i = 0; i < (int)out_grad_shape.size(); i++) { - // // printf("%d, ", (int)out_grad_shape[i]); - // // } - // // printf("]\n"); - - // // printf("[hsq] x_grad_shape:["); - // // for(int i = 0; i < (int)x_grad_shape.size(); i++) { - // // printf("%d, ", (int)x_grad_shape[i]); - // // } - // // printf("]\n"); - // // } if (index_type == phi::DataType::INT64) { const int64_t* index_data = index->data(); r = xpu::index_select_grad(xpu_context, @@ -202,26 +123,6 @@ class IndexSelectGradXPUKernel : public framework::OpKernel { platform::errors::External( "The index_select_grad XPU OP return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); - - // LoDTensor x_grad_ref_cpu_copy; - // framework::TensorCopySync(*x_grad, platform::CPUPlace(), &x_grad_ref_cpu_copy); - // bool correct = true; - // float diff = 1e-5; - // for (int i = 0; i < x_grad_ref_cpu_copy.numel(); i++) { - // T* ref_data = x_grad_ref_cpu_copy.data(); - // T* cpu_data = x_grad_cpu_copy.data(); - // if(std::abs(*(ref_data + i) - *(cpu_data+i)) > diff) { - // correct = false; - // printf("[hsq] error in %d, out_ref_cpu_copy[%d]=%f, out_cpu_copy[%d]=%f\n", i, i, *(ref_data+i), i, *(cpu_data+i)); - // break; - // } - // } - - // if(dev_id == target_id) { - // if(correct) { - // printf("[hsq] index_select_grad op test passed\n"); - // } - // } } }; From 6fb5de3754e137942eef7358104cde4a13535434 Mon Sep 17 00:00:00 2001 From: HuangShiqing Date: Tue, 5 Dec 2023 14:06:03 +0800 Subject: [PATCH 15/20] abacus-aibox-911 add the support of op partial_concat/grad --- .../fluid/operators/partial_concat_op_xpu.cc | 279 ++++++++++++++++++ .../fluid/platform/device/xpu/xpu2_op_list.h | 4 + 2 files changed, 283 insertions(+) create mode 100644 paddle/fluid/operators/partial_concat_op_xpu.cc diff --git a/paddle/fluid/operators/partial_concat_op_xpu.cc b/paddle/fluid/operators/partial_concat_op_xpu.cc new file mode 100644 index 0000000000000..03765a30bcfbf --- /dev/null +++ b/paddle/fluid/operators/partial_concat_op_xpu.cc @@ -0,0 +1,279 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
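+
+// partial_concat copies the column range [start_index, start_index + length)
+// out of every 2-D input and concatenates the pieces along axis 1, so N
+// inputs of shape [batch_size, in_size] produce an output of shape
+// [batch_size, N * length]. The XPU kernels below build each piece with
+// xpu::slice and join them with a single xpu::concat; the gradient kernel
+// reverses this by slicing the output gradient per input and using xpu::pad
+// to write each piece back at column start_index, zero-filling the rest.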
+ +#include "paddle/fluid/operators/partial_concat_op.h" + +namespace paddle { +namespace operators { +using LoDTensor = framework::LoDTensor; + +template +class PartialConcatXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto in_vars = ctx.MultiInput("X"); + Tensor *out = ctx.Output("Out"); + PADDLE_ENFORCE_EQ(in_vars[0] != nullptr, + true, + platform::errors::InvalidArgument( + "The input of partial concat should not be null.")); + + auto input_dim = in_vars[0]->dims(); + PADDLE_ENFORCE_EQ(input_dim.size(), + 2, + platform::errors::InvalidArgument( + "Only supports 2-D array with batch size in the 1st " + "dimension and data in the 2nd.")); + auto in_size = input_dim[1]; + // may be negative + auto start_index = ctx.Attr("start_index"); + start_index = ComputeStartIndex(start_index, in_size); + + auto partial_len = ctx.Attr("length"); + if (partial_len < 0) { + partial_len = in_size - start_index; + } + //TODO: what if partial_len > in_size + auto xpu_context = ctx.template device_context().x_context(); + + int in_num = in_vars.size(); + int batch_size = input_dim[0]; + // int out_batch_len = partial_len * in_num; + + std::vector tmp_tensors(in_num); + std::vector tmp_tensors_data(in_num); + std::vector> tmp_outs_shape(in_num); + for (size_t i = 0; i < in_vars.size(); i++) { + tmp_tensors[i].Resize(phi::make_ddim({batch_size, partial_len})); + tmp_tensors_data[i] = tmp_tensors[i].mutable_data(ctx.GetPlace()); + + tmp_outs_shape[i] = std::vector({batch_size, partial_len}); + + const T* input_data = in_vars[i]->data(); + + std::vector xshape = phi::vectorize(in_vars[i]->dims()); + std::vector starts = {0, start_index}; + std::vector ends = {batch_size, start_index + partial_len + 1};//要截取的x的每个维度的终止坐标(不包含) + + int r = xpu::slice(xpu_context, + input_data, + const_cast(tmp_tensors_data[i]), + xshape, + starts, + ends); + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::External( + "The partial_concat XPU OP's slice return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + } + + T* out_data = out->mutable_data(ctx.GetPlace()); + + // static int target_id = std::getenv("HSQ_XPURT_TARGET_DEVICE")!=NULL ? 
+ // std::stoi(std::string(std::getenv("HSQ_XPURT_TARGET_DEVICE"))) : + // 0; + // int dev_id = ctx.GetPlace().GetDeviceId(); + // // if(dev_id == target_id) { + // // printf("[hsq] in_vars.size(): %d, start_index: %d, partial_len: %d\n", in_num, start_index, partial_len); + // // printf("[hsq] input shape: "); + // // for (size_t i = 0; i < in_vars.size(); ++i) { + // // printf("[%d, %d], ", (int)in_vars[i]->dims()[0], (int)in_vars[i]->dims()[1]); + // // } + // // printf("]\n"); + // // } + // // auto cpu_device_ctx = platform::DeviceContextPool::Instance().Get(phi::CPUPlace()); + // std::vector x_cpu_copys(in_num); + // for (size_t i = 0; i < in_vars.size(); i++) { + // framework::TensorCopySync(*(in_vars[i]), platform::CPUPlace(), &(x_cpu_copys[i])); + // } + // Tensor out_cpu_copy; + // framework::TensorCopySync(*out, platform::CPUPlace(), &out_cpu_copy); + // T* out_cpu_data = out_cpu_copy.data(); + // for (size_t i = 0; i < in_vars.size(); ++i) { + // for (int j = 0; j < batch_size; ++j) { + // const T* in_data = x_cpu_copys[i].data(); + // memcpy(out_cpu_data + out_batch_len * j + partial_len * i, + // in_data + in_size * j + start_index, + // partial_len * sizeof(T)); + // } + // } + + int axis = 1; + int r = xpu::concat(xpu_context, + tmp_tensors_data, + out_data, + tmp_outs_shape, + axis); + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::External( + "The partial_concat XPU OP's concat return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + + // Tensor out_ref_cpu_copy; + // framework::TensorCopySync(*out, platform::CPUPlace(), &out_ref_cpu_copy); + // bool correct = true; + // float diff = 1e-5; + // for (int i = 0; i < out_ref_cpu_copy.numel(); i++) { + // T* ref_data = out_ref_cpu_copy.data(); + // T* cpu_data = out_cpu_copy.data(); + // if(std::abs(*(ref_data + i) - *(cpu_data+i)) > diff) { + // correct = false; + // printf("[hsq] error in %d, out_ref_cpu_copy[%d]=%f, out_cpu_copy[%d]=%f\n", i, i, *(ref_data+i), i, *(cpu_data+i)); + // break; + // } + // } + // if(dev_id == target_id) { + // if(correct) { + // printf("[hsq] partial_concat op test passed\n"); + // } + // } + } +}; + +template +class PartialConcatGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *out_grad = ctx.Input(framework::GradVarName("Out")); + auto ins = ctx.MultiInput("X"); + auto xs_grad = ctx.MultiOutput(framework::GradVarName("X")); + + PADDLE_ENFORCE_EQ(ins[0] != nullptr, + true, + platform::errors::InvalidArgument( + "The input of partial concat should not be null.")); + // all parameters + int batch_size = ins[0]->dims()[0]; + int in_size = ins[0]->dims()[1]; + // may be negative + auto start_index = ctx.Attr("start_index"); + start_index = ComputeStartIndex(start_index, in_size); + auto partial_len = ctx.Attr("length"); + if (partial_len < 0) { + partial_len = in_size - start_index; + } + + auto in_num = ins.size(); + + auto xpu_context = ctx.template device_context().x_context(); + + std::vector tmp_tensors(in_num); + std::vector tmp_tensors_data(in_num); + + // std::vector xs_grad_cpu_copys(in_num); + // std::vector xs_grad_ref_cpu_copys(in_num); + + const T* out_grad_data = out_grad->data(); + for (size_t i = 0; i < in_num; i++) { + tmp_tensors[i].Resize(phi::make_ddim({batch_size, partial_len})); + tmp_tensors_data[i] = tmp_tensors[i].mutable_data(ctx.GetPlace()); + + std::vector xshape = phi::vectorize(out_grad->dims()); + std::vector starts = {0, int(partial_len * i)}; + std::vector ends = 
{batch_size, int(partial_len * i + partial_len + 1)};// end coordinates (exclusive) of each dimension of the slice of x + + int r = xpu::slice(xpu_context, + out_grad_data, + const_cast(tmp_tensors_data[i]), + xshape, + starts, + ends); + PADDLE_ENFORCE_EQ( + r, + xpu::Error_t::SUCCESS, + platform::errors::External("The partial_concat_grad XPU OP's slice " + "return wrong value[%d %s]", + r, + XPUAPIErrorMsg[r])); + + std::vector tmp_shape = {batch_size, partial_len}; + std::vector pad_left = {0, start_index}; + std::vector pad_right = {0, in_size - start_index - partial_len}; + T* xs_grad_data = xs_grad[i]->mutable_data(ctx.GetPlace()); + + // framework::TensorCopySync(*(xs_grad[i]), platform::CPUPlace(), &(xs_grad_cpu_copys[i])); + + r = xpu::pad(xpu_context, + tmp_tensors_data[i], + xs_grad_data, + tmp_shape, + pad_left, + pad_right, + T(0)); + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::External( + "The partial_concat_grad XPU OP's pad return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + + // framework::TensorCopySync(*(xs_grad[i]), platform::CPUPlace(), &(xs_grad_ref_cpu_copys[i])); + } + + + // auto grad_batch_len = partial_len * in_num; + // auto all_length = grad_batch_len * batch_size; + // Tensor out_grad_cpu_copy; + // framework::TensorCopySync(*out_grad, platform::CPUPlace(), &out_grad_cpu_copy); + + // // initialize + // auto& place = + // *ctx.template device_context().eigen_device(); + // for (size_t i = 0; i < xs_grad_cpu_copys.size(); ++i) { + // // xs_grad_cpu_copys[i]->mutable_data(ctx.GetPlace()); + // auto dxt = framework::EigenVector::Flatten(xs_grad_cpu_copys[i]); + // dxt.device(place) = dxt.constant(static_cast(0)); + // } + + // auto* out_grad_t = out_grad_cpu_copy.data(); + // for (size_t id = 0; id < all_length; id += partial_len) { + // int bs_id = id / grad_batch_len; + // int bs_index = id % grad_batch_len; + // int var_id = bs_index / partial_len; + // auto* out_t = xs_grad_ref_cpu_copys[var_id].data(); + // memcpy(out_t + bs_id * in_size + start_index, + // out_grad_t + id, + // partial_len * sizeof(T)); + // } + + // bool correct = true; + // float diff = 1e-5; + // for (size_t i = 0; i < in_num; i++) { + // T* ref_data = xs_grad_ref_cpu_copys[i].data(); + // T* cpu_data = xs_grad_cpu_copys[i].data(); + // for (int j = 0; j < xs_grad_cpu_copys[i].numel(); j++) { + + // if(std::abs(*(ref_data + j) - *(cpu_data+j)) > diff) { + // correct = false; + // printf("[hsq] error in %d, out_ref_cpu_copy[%d]=%f, out_cpu_copy[%d]=%f\n", j, j, *(ref_data+j), j, *(cpu_data+j)); + // break; + // } + // } + // } + // static int target_id = std::getenv("HSQ_XPURT_TARGET_DEVICE")!=NULL ? 
+ // std::stoi(std::string(std::getenv("HSQ_XPURT_TARGET_DEVICE"))) : + // 0; + // int dev_id = ctx.GetPlace().GetDeviceId(); + // if(dev_id == target_id) { + // if(correct) { + // printf("[hsq] partial_concat_grad op test passed\n"); + // } + // } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL(partial_concat, ops::PartialConcatXPUKernel) +REGISTER_OP_XPU_KERNEL(partial_concat_grad, ops::PartialConcatGradXPUKernel) diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index df5585b3128ce..d438f0e8d2a2d 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -589,6 +589,10 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"index_select_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"partial_concat", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"partial_concat_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, }; return s_xpu2_kernels; } From 9b85788de919f4b7123c6b29c882f67ca0f93b02 Mon Sep 17 00:00:00 2001 From: HuangShiqing Date: Tue, 5 Dec 2023 14:09:43 +0800 Subject: [PATCH 16/20] abacus-aibox-911 remove the debug info --- .../fluid/operators/partial_concat_op_xpu.cc | 106 ------------------ 1 file changed, 106 deletions(-) diff --git a/paddle/fluid/operators/partial_concat_op_xpu.cc b/paddle/fluid/operators/partial_concat_op_xpu.cc index 03765a30bcfbf..c95cfa8aa703d 100644 --- a/paddle/fluid/operators/partial_concat_op_xpu.cc +++ b/paddle/fluid/operators/partial_concat_op_xpu.cc @@ -49,7 +49,6 @@ class PartialConcatXPUKernel : public framework::OpKernel { int in_num = in_vars.size(); int batch_size = input_dim[0]; - // int out_batch_len = partial_len * in_num; std::vector tmp_tensors(in_num); std::vector tmp_tensors_data(in_num); @@ -80,35 +79,6 @@ class PartialConcatXPUKernel : public framework::OpKernel { T* out_data = out->mutable_data(ctx.GetPlace()); - // static int target_id = std::getenv("HSQ_XPURT_TARGET_DEVICE")!=NULL ? 
- // std::stoi(std::string(std::getenv("HSQ_XPURT_TARGET_DEVICE"))) : - // 0; - // int dev_id = ctx.GetPlace().GetDeviceId(); - // // if(dev_id == target_id) { - // // printf("[hsq] in_vars.size(): %d, start_index: %d, partial_len: %d\n", in_num, start_index, partial_len); - // // printf("[hsq] input shape: "); - // // for (size_t i = 0; i < in_vars.size(); ++i) { - // // printf("[%d, %d], ", (int)in_vars[i]->dims()[0], (int)in_vars[i]->dims()[1]); - // // } - // // printf("]\n"); - // // } - // // auto cpu_device_ctx = platform::DeviceContextPool::Instance().Get(phi::CPUPlace()); - // std::vector x_cpu_copys(in_num); - // for (size_t i = 0; i < in_vars.size(); i++) { - // framework::TensorCopySync(*(in_vars[i]), platform::CPUPlace(), &(x_cpu_copys[i])); - // } - // Tensor out_cpu_copy; - // framework::TensorCopySync(*out, platform::CPUPlace(), &out_cpu_copy); - // T* out_cpu_data = out_cpu_copy.data(); - // for (size_t i = 0; i < in_vars.size(); ++i) { - // for (int j = 0; j < batch_size; ++j) { - // const T* in_data = x_cpu_copys[i].data(); - // memcpy(out_cpu_data + out_batch_len * j + partial_len * i, - // in_data + in_size * j + start_index, - // partial_len * sizeof(T)); - // } - // } - int axis = 1; int r = xpu::concat(xpu_context, tmp_tensors_data, @@ -119,25 +89,6 @@ class PartialConcatXPUKernel : public framework::OpKernel { platform::errors::External( "The partial_concat XPU OP's concat return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); - - // Tensor out_ref_cpu_copy; - // framework::TensorCopySync(*out, platform::CPUPlace(), &out_ref_cpu_copy); - // bool correct = true; - // float diff = 1e-5; - // for (int i = 0; i < out_ref_cpu_copy.numel(); i++) { - // T* ref_data = out_ref_cpu_copy.data(); - // T* cpu_data = out_cpu_copy.data(); - // if(std::abs(*(ref_data + i) - *(cpu_data+i)) > diff) { - // correct = false; - // printf("[hsq] error in %d, out_ref_cpu_copy[%d]=%f, out_cpu_copy[%d]=%f\n", i, i, *(ref_data+i), i, *(cpu_data+i)); - // break; - // } - // } - // if(dev_id == target_id) { - // if(correct) { - // printf("[hsq] partial_concat op test passed\n"); - // } - // } } }; @@ -171,9 +122,6 @@ class PartialConcatGradXPUKernel : public framework::OpKernel { std::vector tmp_tensors(in_num); std::vector tmp_tensors_data(in_num); - // std::vector xs_grad_cpu_copys(in_num); - // std::vector xs_grad_ref_cpu_copys(in_num); - const T* out_grad_data = out_grad->data(); for (size_t i = 0; i < in_num; i++) { tmp_tensors[i].Resize(phi::make_ddim({batch_size, partial_len})); @@ -202,8 +150,6 @@ class PartialConcatGradXPUKernel : public framework::OpKernel { std::vector pad_right = {0, in_size - start_index - partial_len}; T* xs_grad_data = xs_grad[i]->mutable_data(ctx.GetPlace()); - // framework::TensorCopySync(*(xs_grad[i]), platform::CPUPlace(), &(xs_grad_cpu_copys[i])); - r = xpu::pad(xpu_context, tmp_tensors_data[i], xs_grad_data, @@ -215,59 +161,7 @@ class PartialConcatGradXPUKernel : public framework::OpKernel { platform::errors::External( "The partial_concat_grad XPU OP's pad return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); - - // framework::TensorCopySync(*(xs_grad[i]), platform::CPUPlace(), &(xs_grad_ref_cpu_copys[i])); } - - - // auto grad_batch_len = partial_len * in_num; - // auto all_length = grad_batch_len * batch_size; - // Tensor out_grad_cpu_copy; - // framework::TensorCopySync(*out_grad, platform::CPUPlace(), &out_grad_cpu_copy); - - // // initialize - // auto& place = - // *ctx.template device_context().eigen_device(); - // for (size_t i = 0; i < 
xs_grad_cpu_copys.size(); ++i) { - // // xs_grad_cpu_copys[i]->mutable_data(ctx.GetPlace()); - // auto dxt = framework::EigenVector::Flatten(xs_grad_cpu_copys[i]); - // dxt.device(place) = dxt.constant(static_cast(0)); - // } - - // auto* out_grad_t = out_grad_cpu_copy.data(); - // for (size_t id = 0; id < all_length; id += partial_len) { - // int bs_id = id / grad_batch_len; - // int bs_index = id % grad_batch_len; - // int var_id = bs_index / partial_len; - // auto* out_t = xs_grad_ref_cpu_copys[var_id].data(); - // memcpy(out_t + bs_id * in_size + start_index, - // out_grad_t + id, - // partial_len * sizeof(T)); - // } - - // bool correct = true; - // float diff = 1e-5; - // for (size_t i = 0; i < in_num; i++) { - // T* ref_data = xs_grad_ref_cpu_copys[i].data(); - // T* cpu_data = xs_grad_cpu_copys[i].data(); - // for (int j = 0; j < xs_grad_cpu_copys[i].numel(); j++) { - - // if(std::abs(*(ref_data + j) - *(cpu_data+j)) > diff) { - // correct = false; - // printf("[hsq] error in %d, out_ref_cpu_copy[%d]=%f, out_cpu_copy[%d]=%f\n", j, j, *(ref_data+j), j, *(cpu_data+j)); - // break; - // } - // } - // } - // static int target_id = std::getenv("HSQ_XPURT_TARGET_DEVICE")!=NULL ? - // std::stoi(std::string(std::getenv("HSQ_XPURT_TARGET_DEVICE"))) : - // 0; - // int dev_id = ctx.GetPlace().GetDeviceId(); - // if(dev_id == target_id) { - // if(correct) { - // printf("[hsq] partial_concat_grad op test passed\n"); - // } - // } } }; From 7a658063157f6cbf3e2c9472aaa39b5897b0cca0 Mon Sep 17 00:00:00 2001 From: HuangShiqing Date: Tue, 12 Dec 2023 11:40:24 +0800 Subject: [PATCH 17/20] abacus-aibox-919 fix the partial_concat's shape error --- paddle/fluid/operators/partial_concat_op_xpu.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/partial_concat_op_xpu.cc b/paddle/fluid/operators/partial_concat_op_xpu.cc index c95cfa8aa703d..c72b5f7c8cadf 100644 --- a/paddle/fluid/operators/partial_concat_op_xpu.cc +++ b/paddle/fluid/operators/partial_concat_op_xpu.cc @@ -63,7 +63,7 @@ class PartialConcatXPUKernel : public framework::OpKernel { std::vector xshape = phi::vectorize(in_vars[i]->dims()); std::vector starts = {0, start_index}; - std::vector ends = {batch_size, start_index + partial_len + 1};// end coordinates (exclusive) of each dimension of the slice of x + std::vector ends = {batch_size, start_index + partial_len};// end coordinates (exclusive) of each dimension of the slice of x int r = xpu::slice(xpu_context, input_data, @@ -129,7 +129,7 @@ class PartialConcatGradXPUKernel : public framework::OpKernel { std::vector xshape = phi::vectorize(out_grad->dims()); std::vector starts = {0, int(partial_len * i)}; - std::vector ends = {batch_size, int(partial_len * i + partial_len + 1)};// end coordinates (exclusive) of each dimension of the slice of x + std::vector ends = {batch_size, int(partial_len * i + partial_len)};// end coordinates (exclusive) of each dimension of the slice of x int r = xpu::slice(xpu_context, out_grad_data, From 69554bdcbc23fa71415e3c15eacdd49d2025bf4e Mon Sep 17 00:00:00 2001 From: tanzhipeng Date: Wed, 13 Dec 2023 09:50:39 +0000 Subject: [PATCH 18/20] optimize load fc tunefile performance. 
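
The tune-file loading optimization ships in the prebuilt xdnn_train package
itself; this patch only advances the pinned package date from 20231201 to
20231212 so that builds pick up the updated binaries.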
--- cmake/external/xpu.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 3e3d3849b8e0d..12741a1df5729 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -106,7 +106,7 @@ if (WITH_BOX_PS OR WITH_XPU_KP) CACHE STRING "" FORCE) #"https://klx-sdk-release-public.su.bcebos.com/xdnn/release/2.6.0.1/${XPU_XDNN_DIR_NAME}.tar.gz" set(XPU_XDNN_URL - "https://klx-sdk-release-public.su.bcebos.com/xdnn_train/dev/paddlebox/20231201/${XPU_XDNN_DIR_NAME}.tar.gz" + "https://klx-sdk-release-public.su.bcebos.com/xdnn_train/dev/paddlebox/20231212/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) set(SCALOPUS_URL "https://klx-sdk-release-public.su.bcebos.com/xdnn_train/dev/paddlebox/20230306/scalopus.tar.gz" From bd8f5f82f329b3c882c780f710ecb7620eb9cef8 Mon Sep 17 00:00:00 2001 From: Yeeland Date: Mon, 18 Dec 2023 15:57:32 +0800 Subject: [PATCH 19/20] fix: modify the default value of rank_offset to -1 --- paddle/fluid/framework/data_feed.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 paddle/fluid/framework/data_feed.cc diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc old mode 100644 new mode 100755 index 640e66e272304..7b68f28fd00be --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -3612,7 +3612,7 @@ void SlotPaddleBoxDataFeed::GetRankOffsetGPU(const int pv_num, #elif defined(PADDLE_WITH_XPU_KP) auto dev_ctx = platform::DeviceContextPool::Instance().Get(this->place_); auto ctx = static_cast(dev_ctx)->x_context(); - int r = xpu::constant(ctx, tensor_ptr, rank_offset_->numel(), 0); + int r = xpu::constant(ctx, tensor_ptr, rank_offset_->numel(), -1); PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( From a253e5df14fc1f8766362521ae09edc01d531092 Mon Sep 17 00:00:00 2001 From: jack603047588 <603047588@qq.com> Date: Mon, 18 Dec 2023 17:10:46 +0800 Subject: [PATCH 20/20] abacus-aibox-923 fix need_dump bug in xpu --- paddle/fluid/framework/device_worker.cc | 9 +++++---- paddle/fluid/framework/fleet/box_wrapper.h | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/device_worker.cc b/paddle/fluid/framework/device_worker.cc index cade806f150ff..b6216b9572c6a 100644 --- a/paddle/fluid/framework/device_worker.cc +++ b/paddle/fluid/framework/device_worker.cc @@ -620,16 +620,17 @@ std::set used_slot_set; cpu_tensors[i].ShareDataWith(tensor); } #ifdef PADDLE_WITH_XPU_KP - auto & fid2sign_map = paddle::framework::BoxWrapper::GetInstance()->GetFid2SginMap(); - if (used_slot_set.find(field) != used_slot_set.end()) { + auto fid2sign_map_ptr = paddle::framework::BoxWrapper::GetInstance()->GetFid2SginMap(); + if (used_slot_set.find(field) != used_slot_set.end() \ + && fid2sign_map_ptr != nullptr && fid2sign_map_ptr->size() > 0) { auto t_dtype = framework::TransToProtoVarType(cpu_tensors[i].dtype()); if (t_dtype == proto::VarType::INT64) { size_t numel = cpu_tensors[i].numel(); int64_t * slot_data = cpu_tensors[i].data(); for (size_t j = 0; j < numel; ++j) { uint64_t fid = static_cast(slot_data[j]); - PADDLE_ENFORCE_LT(fid, fid2sign_map.size()); - uint64_t sign = fid2sign_map[fid]; + PADDLE_ENFORCE_LT(fid, fid2sign_map_ptr->size()); + uint64_t sign = (*fid2sign_map_ptr)[fid]; PADDLE_ENFORCE(sign > 0 || (sign == 0 && fid == 0), platform::errors::PreconditionNotMet( "sign can only be 0 when fid is 0, fid:%llu, sign:%llu", diff --git 
a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h index dfcfb1d4ba7b2..1f6be13e904e4 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.h +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -684,7 +684,7 @@ class BoxWrapper { void SetDataFuncForCacheManager(int batch_num, std::function>*)> data_func); int PrepareNextBatch(int dev_id); - std::vector & GetFid2SginMap() { return *fid2sign_map_; } + std::vector * GetFid2SginMap() { return fid2sign_map_; } #endif boxps::PSAgentBase* GetAgent(); @@ -969,7 +969,7 @@ class BoxWrapper { int gpu_num_ = GetDeviceCount(); #ifdef PADDLE_WITH_XPU_KP bool use_xpu_sparse_map_; - std::vector * fid2sign_map_; + std::vector * fid2sign_map_ = nullptr; std::unique_ptr box_wrapper_kernel_; #endif
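
For reference, a minimal standalone sketch of the null-safety pattern this
last patch adopts for the fid-to-sign table (class and function names here
are simplified stand-ins, not the real BoxWrapper API):

#include <cstdint>
#include <vector>

// The member starts out null and is only populated on the XPU sparse path,
// so the getter hands back the raw pointer instead of dereferencing it.
class BoxWrapperSketch {
 public:
  std::vector<uint64_t>* GetFid2SignMap() { return fid2sign_map_; }

 private:
  std::vector<uint64_t>* fid2sign_map_ = nullptr;  // may never be set
};

// Callers (such as the dump path in device_worker.cc) must check both that
// the table exists and that it is non-empty before indexing into it.
uint64_t LookupSign(BoxWrapperSketch& box, uint64_t fid) {
  auto* map = box.GetFid2SignMap();
  if (map == nullptr || map->empty()) {
    return 0;  // dump is skipped when no table was loaded
  }
  return (*map)[fid];
}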