Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix need_dump error in dual-box xpu training #36

Closed
wants to merge 20 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions cmake/external/xpu.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -97,16 +97,16 @@ set(XPU_PACK_DEPENCE_URL
if (WITH_BOX_PS OR WITH_XPU_KP)
set(XPU_XRE_DIR_NAME "xre-bdcentos_x86_64")
set(XPU_XDNN_DIR_NAME "xdnn-bdcentos_x86_64")
set(XPU_XCCL_DIR_NAME "xccl_socket-bdcentos_x86_64")
set(XPU_XCCL_DIR_NAME "xccl_rdma-bdcentos_x86_64")
set(XPU_XRE_URL
"https://klx-sdk-release-public.su.bcebos.com/xre/release/4.0.28.1/${XPU_XRE_DIR_NAME}.tar.gz"
CACHE STRING "" FORCE)
set(XPU_XCCL_URL
"https://klx-sdk-release-public.su.bcebos.com/xccl/release/1.0.62.1/${XPU_XCCL_DIR_NAME}.tar.gz"
"https://klx-sdk-release-public.su.bcebos.com/xccl/release/1.1.6.1/${XPU_XCCL_DIR_NAME}.tar.gz"
CACHE STRING "" FORCE)
#"https://klx-sdk-release-public.su.bcebos.com/xdnn/release/2.6.0.1/${XPU_XDNN_DIR_NAME}.tar.gz"
set(XPU_XDNN_URL
"https://klx-sdk-release-public.su.bcebos.com/xdnn_train/dev/paddlebox/20231031/${XPU_XDNN_DIR_NAME}.tar.gz"
"https://klx-sdk-release-public.su.bcebos.com/xdnn_train/dev/paddlebox/20231212/${XPU_XDNN_DIR_NAME}.tar.gz"
CACHE STRING "" FORCE)
set(SCALOPUS_URL
"https://klx-sdk-release-public.su.bcebos.com/xdnn_train/dev/paddlebox/20230306/scalopus.tar.gz"
Expand Down
3 changes: 2 additions & 1 deletion cmake/xpu_kp.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ message(STATUS "Build with XPU_CLANG=" ${XPU_CLANG})

# The host sysroot of XPU compiler is gcc-8.2
if(NOT HOST_SYSROOT)
set(HOST_SYSROOT /opt/compiler/gcc-8.2)
#set(HOST_SYSROOT /opt/compiler/gcc-8.2)
set(HOST_SYSROOT /opt/compiler/gcc-10)
endif()

if(NOT IS_DIRECTORY ${HOST_SYSROOT})
Expand Down
10 changes: 10 additions & 0 deletions paddle/fluid/framework/data_feed.cc
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -3595,6 +3595,7 @@ int SlotPaddleBoxDataFeed::GetCurrentPhase() {
return box_ptr->Phase();
}
}

void SlotPaddleBoxDataFeed::GetRankOffsetGPU(const int pv_num,
const int ins_num) {
#if defined(PADDLE_WITH_CUDA) && defined(_LINUX) || defined(PADDLE_WITH_XPU_KP) && !defined(CPU_DATA_FEED)
Expand All @@ -3609,6 +3610,15 @@ void SlotPaddleBoxDataFeed::GetRankOffsetGPU(const int pv_num,
value.d_ad_offset.data<int>(), col);

#elif defined(PADDLE_WITH_XPU_KP)
auto dev_ctx = platform::DeviceContextPool::Instance().Get(this->place_);
auto ctx = static_cast<platform::XPUDeviceContext*>(dev_ctx)->x_context();
int r = xpu::constant<int>(ctx, tensor_ptr, rank_offset_->numel(), -1);
PADDLE_ENFORCE_EQ(r,
XPU_SUCCESS,
platform::errors::External(
"XPU constant kernel return wrong value[%d %s]",
r,
XPUAPIErrorMsg[r]));
DataFeedPdboxXpuKernelHelper::CopyRankOffset(this->place_, tensor_ptr, ins_num, pv_num, max_rank,
value.d_rank.data<int>(), value.d_cmatch.data<int>(),
value.d_ad_offset.data<int>(), col);
Expand Down
5 changes: 3 additions & 2 deletions paddle/fluid/framework/data_feed.kps
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#ifdef PADDLE_WITH_XPU_KP
#include "paddle/fluid/framework/data_feed_xpu_kernel_helper.h"
#include "xpu/kernel/xtdk.h"
// #include "xpu/kernel/debug.h"
#include "xpu/kernel/xtdk_math.h"
// #include "xpu/kernel/xtdk_io.h"

namespace paddle {
namespace framework {
Expand Down Expand Up @@ -93,6 +93,7 @@ __global__ void CopyRankOffsetKernel(int* mat,
}
}
}
mfence();
LM2GM(lm_mat, mat + lm_pv_offset[pv_offset_left_index] * col, ad_num * col * sizeof(int));
pv_offset_left_index = pv_offset_right_index;
}
Expand Down Expand Up @@ -376,7 +377,7 @@ void DataFeedPdboxXpuKernelHelper::CopyRankOffset(const paddle::platform::Place&
stream = static_cast<platform::XPUDeviceContext*>(dev_ctx)
->x_context()
->xpu_stream;
CopyRankOffsetKernel<<<4, 64, stream>>>(dest, ranks, cmatchs, ad_offsets, ins_num, pv_num, max_rank, cols);
CopyRankOffsetKernel<<<8, 64, stream>>>(dest, ranks, cmatchs, ad_offsets, pv_num, ins_num, max_rank, cols);
xpu_wait(stream);
}

Expand Down
14 changes: 10 additions & 4 deletions paddle/fluid/framework/data_set.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2113,9 +2113,13 @@ void PadBoxSlotDataset::PreLoadIntoDisk(const std::string& path,
VLOG(3) << "RegisterClientToClientMsgHandler done";
}
CHECK(slot_pool_ != nullptr) << "slotrecord pool nullptr";
read_ins_ref_ = thread_num_;
int read_thread_num = thread_num_;
if (disable_random_update_) {
read_thread_num = 1;
}
read_ins_ref_ = read_thread_num;
CHECK(down_pool_ != nullptr) << "down_pool nullptr";
for (int64_t i = 0; i < thread_num_; ++i) {
for (int64_t i = 0; i < read_thread_num; ++i) {
wait_futures_.emplace_back(down_pool_->Run([this, i]() {
platform::Timer timer;
timer.Start();
Expand Down Expand Up @@ -2785,8 +2789,10 @@ void PadBoxSlotDataset::PrepareTrain(void) {
// join or aucrunner mode enable pv
if (enable_pv_merge_ && (box_ptr->Phase() & 0x01 == 1 ||
box_ptr->Mode() == 1)) {
std::shuffle(input_pv_ins_.begin(), input_pv_ins_.end(),
BoxWrapper::LocalRandomEngine());
if (!disable_random_update_) {
std::shuffle(input_pv_ins_.begin(), input_pv_ins_.end(),
BoxWrapper::LocalRandomEngine());
}
// 分数据到各线程里面
int batchsize = reinterpret_cast<SlotPaddleBoxDataFeed*>(readers_[0].get())
->GetPvBatchSize();
Expand Down
9 changes: 5 additions & 4 deletions paddle/fluid/framework/device_worker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -620,16 +620,17 @@ std::set<std::string> used_slot_set;
cpu_tensors[i].ShareDataWith(tensor);
}
#ifdef PADDLE_WITH_XPU_KP
auto & fid2sign_map = paddle::framework::BoxWrapper::GetInstance()->GetFid2SginMap();
if (used_slot_set.find(field) != used_slot_set.end()) {
auto fid2sign_map_ptr = paddle::framework::BoxWrapper::GetInstance()->GetFid2SginMap();
if (used_slot_set.find(field) != used_slot_set.end() \
&& fid2sign_map_ptr != nullptr && fid2sign_map_ptr->size() > 0) {
auto t_dtype = framework::TransToProtoVarType(cpu_tensors[i].dtype());
if (t_dtype == proto::VarType::INT64) {
size_t numel = cpu_tensors[i].numel();
int64_t * slot_data = cpu_tensors[i].data<int64_t>();
for (size_t j = 0; j < numel; ++j) {
uint64_t fid = static_cast<uint64_t>(slot_data[j]);
PADDLE_ENFORCE_LT(fid, fid2sign_map.size());
uint64_t sign = fid2sign_map[fid];
PADDLE_ENFORCE_LT(fid, fid2sign_map_ptr->size());
uint64_t sign = (*fid2sign_map_ptr)[fid];
PADDLE_ENFORCE(sign > 0 || (sign == 0 && fid == 0),
platform::errors::PreconditionNotMet(
"sign can only be 0 when fid is 0, fid:%llu, sign:%llu",
Expand Down
4 changes: 2 additions & 2 deletions paddle/fluid/framework/fleet/box_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -684,7 +684,7 @@ class BoxWrapper {
void SetDataFuncForCacheManager(int batch_num,
std::function<void(int, std::vector<std::pair<uint64_t*, int>>*)> data_func);
int PrepareNextBatch(int dev_id);
std::vector<uint64_t> & GetFid2SginMap() { return *fid2sign_map_; }
std::vector<uint64_t> * GetFid2SginMap() { return fid2sign_map_; }
#endif

boxps::PSAgentBase* GetAgent();
Expand Down Expand Up @@ -969,7 +969,7 @@ class BoxWrapper {
int gpu_num_ = GetDeviceCount();
#ifdef PADDLE_WITH_XPU_KP
bool use_xpu_sparse_map_;
std::vector<uint64_t> * fid2sign_map_;
std::vector<uint64_t> * fid2sign_map_ = nullptr;
std::unique_ptr<BoxWrapperKernel> box_wrapper_kernel_;
#endif

Expand Down
34 changes: 27 additions & 7 deletions paddle/fluid/framework/fleet/box_wrapper_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -731,16 +731,34 @@ void BoxWrapper::PushSparseGradCaseXPU(const paddle::platform::Place& place,

TRACE_SCOPE_START("CopyForPush", xpu_wait(ctx_xpu->xpu_stream));
#endif
float* real_grad_values;
// float* real_grad_values;
// for (int i = 0; i < slot_num; i++) {
// if(grad_values[i] != nullptr) {
// real_grad_values = const_cast<float*>(grad_values[i]);
// break;
// }
// }
std::vector<int> slot_inner_offset(total_length);
int out_count = 0;
for (int i = 0; i < slot_num; i++) {
if(grad_values[i] != nullptr) {
real_grad_values = const_cast<float*>(grad_values[i]);
break;
for (int64_t j = 0; j < slot_lengths[i]; j++) {
slot_inner_offset[out_count++] = j;
}
}
box_wrapper_kernel_->CopyForPush(place, real_grad_values, total_grad_values_xpu,
push_offset, total_length, slot_vector, slot_lens, slot_num,
hidden_size, batch_size, total_dims, skip_offset, key2slot);
auto d_slot_inner_offset_tmp = memory::Alloc(place, total_length * sizeof(int));
int* d_slot_inner_offset = reinterpret_cast<int*>(d_slot_inner_offset_tmp->ptr());
memory::Copy(place,
d_slot_inner_offset,
platform::CPUPlace(),
slot_inner_offset.data(),
total_length * sizeof(int));

box_wrapper_kernel_->CopyForPush(place, xpu_values, total_grad_values_xpu,
push_offset, total_length, slot_vector, (int*)d_slot_inner_offset, slot_lens, slot_num,
hidden_size, batch_size, total_dims, skip_offset, key2slot,
expand_embed_dim,
push_float_num_,
expand_only);

push_boxps_timer.Resume();
#ifdef TRACE_PROFILE
Expand All @@ -749,9 +767,11 @@ void BoxWrapper::PushSparseGradCaseXPU(const paddle::platform::Place& place,

TRACE_SCOPE_START("PushSparseXPU", xpu_wait(ctx_xpu->xpu_stream));
#endif

int ret = boxps_ptr_->PushSparseXPU(total_keys,
reinterpret_cast<void*>(total_grad_values_xpu),
static_cast<int>(total_length), device_id);

PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
"PushSparseXPU failed in BoxPS."));
push_boxps_timer.Pause();
Expand Down
8 changes: 6 additions & 2 deletions paddle/fluid/framework/fleet/box_wrapper_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,18 +48,22 @@ void CopyForPull(

void CopyForPush(
const paddle::platform::Place& place,
float* gm_src_ptr,
float** gm_src_ptr,
void* total_grad_values_xpu,
boxps::FeaturePushOffset* push_offset,
const int64_t total_length,
const int* slots,
const int* slot_inner_offset,
const int64_t* slot_lens,
const int slot_num,
const int hidden_size,
const int batch_size,
const int* total_dims,
const int skip_offset,
const int* key2slot);
const int* key2slot,
const int expand_embed_dim,
const int push_float_num,
bool expand_only);

public:
const static int MAX_SLOT_SIZE = 10240;
Expand Down
Loading