[GPUPS]fix debug info (PaddlePaddle#23)
* fix debug info

* fix debug info

* fix debug info

* fix
zmxdream authored Jun 24, 2022
1 parent 2c61e92 commit 6db0596
Showing 4 changed files with 10 additions and 65 deletions.
23 changes: 8 additions & 15 deletions paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h
@@ -72,6 +72,8 @@ __global__ void search_kernel(Table* table,
     auto it = table->find(keys[i]);
     if (it != table->end()) {
       vals[i] = it->second;
+    } else {
+      printf("pull miss key: %llu", keys[i]);
     }
   }
 }
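Context for the hunk above (not part of the commit): the table keys are 64-bit feature signs, so device-side printf needs %llu; the %d used before this commit reads only 32 bits and garbles the logged key. A minimal standalone sketch of the miss-logging pattern, with hypothetical names:

#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the hash-table miss path: keys are 64-bit
// feature signs, so device printf takes %llu (plus a cast); %d would
// truncate them to 32 bits.
__global__ void log_missing_keys(const uint64_t* keys, const bool* found,
                                 size_t len) {
  const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len && !found[i]) {
    printf("pull miss key: %llu\n", (unsigned long long)keys[i]);
  }
}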
@@ -101,20 +103,7 @@ __global__ void dy_mf_search_kernel(Table* table,
         cur->mf[j] = input.mf[j];
       }
     } else {
-      if (keys[i] != 0) printf("pull miss key: %d", keys[i]);
-      FeatureValue* cur = (FeatureValue*)(vals + i * pull_feature_value_size);
-      cur->delta_score = 0;
-      cur->show = 0;
-      cur->clk = 0;
-      cur->slot = -1;
-      cur->lr = 0;
-      cur->lr_g2sum = 0;
-      cur->mf_size = 0;
-      cur->mf_dim = 8;
-      cur->cpu_ptr;
-      for (int j = 0; j < cur->mf_dim + 1; j++) {
-        cur->mf[j] = 0;
-      }
+      if (keys[i] != 0) printf("pull miss key: %llu", keys[i]);
     }
   }
 }
@@ -191,6 +180,8 @@ __global__ void update_kernel(Table* table,
     auto it = table->find(keys[i]);
     if (it != table->end()) {
       sgd.update_value((it.getter())->second, grads[i], p_state[i]);
+    } else {
+      printf("push miss key: %llu", keys[i]);
     }
   }
 }
@@ -205,6 +196,8 @@ __global__ void update_kernel(Table* table,
     auto it = table->find(keys[i]);
     if (it != table->end()) {
       sgd.update_value((it.getter())->second, grads[i]);
+    } else {
+      printf("push miss key: %llu", keys[i]);
     }
   }
 }
@@ -221,7 +214,7 @@ __global__ void dy_mf_update_kernel(Table* table,
       FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size);
       sgd.dy_mf_update_value((it.getter())->second, *cur);
     } else {
-      if (keys[i] != 0) printf("push miss key: %d", keys[i]);
+      if (keys[i] != 0) printf("push miss key: %llu", keys[i]);
     }
   }
 }
8 changes: 0 additions & 8 deletions paddle/fluid/framework/fleet/heter_ps/heter_comm.h
@@ -232,14 +232,6 @@ class HeterComm {
   int multi_node_{0};
   std::vector<ncclComm_t> nccl_inner_comms_;
   std::vector<ncclComm_t> nccl_inter_comms_;
-  std::vector<double> mg_time_1;
-  std::vector<double> mg_time_2;
-  std::vector<double> mg_time_3;
-  std::vector<double> mg_time_4;
-  std::vector<double> mg_time_5;
-  std::vector<double> mg_time_6;
-  std::vector<double> mg_time_7;
-  std::vector<double> mg_time_8;
   int node_size_;
   std::vector<std::shared_ptr<cub::CachingDeviceAllocator>> allocators_;
   int multi_mf_dim_{8};
33 changes: 0 additions & 33 deletions paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
@@ -176,14 +176,6 @@ HeterComm<KeyType, ValType, GradType>::HeterComm(
       storage_[i].init(feanum_, resource_->dev_id(i));
     }
   }
-  mg_time_1 = std::vector<double>(resource_->total_gpu(), 0.0);
-  mg_time_2 = std::vector<double>(resource_->total_gpu(), 0.0);
-  mg_time_3 = std::vector<double>(resource_->total_gpu(), 0.0);
-  mg_time_4 = std::vector<double>(resource_->total_gpu(), 0.0);
-  mg_time_5 = std::vector<double>(resource_->total_gpu(), 0.0);
-  mg_time_6 = std::vector<double>(resource_->total_gpu(), 0.0);
-  mg_time_7 = std::vector<double>(resource_->total_gpu(), 0.0);
-  mg_time_8 = std::vector<double>(resource_->total_gpu(), 0.0);
   init_path();
 }

@@ -498,24 +490,6 @@ HeterComm<KeyType, ValType, GradType>::~HeterComm() {
       delete table;
       table = nullptr;
     }
-    for (size_t i = 1; i < mg_time_1.size(); i++) {
-      mg_time_1[0] += mg_time_1[i];
-      mg_time_2[0] += mg_time_2[i];
-      mg_time_3[0] += mg_time_3[i];
-      mg_time_4[0] += mg_time_4[i];
-      mg_time_5[0] += mg_time_5[i];
-      mg_time_6[0] += mg_time_6[i];
-      mg_time_7[0] += mg_time_7[i];
-      mg_time_8[0] += mg_time_8[i];
-    }
-    VLOG(0) << "yxfffff::mg_1::merge: " << mg_time_1[0];
-    VLOG(0) << "yxf::mg_2:pull: " << mg_time_2[0];
-    VLOG(0) << "yxf::mg_3:push: " << mg_time_3[0];
-    VLOG(0) << "yxf::mg_4:sort: " << mg_time_4[0];
-    VLOG(0) << "yxf::mg_5:encode: " << mg_time_5[0];
-    VLOG(0) << "yxf::mg_6:sum: " << mg_time_6[0];
-    VLOG(0) << "yxf::mg_7:merge_kernel: " << mg_time_7[0];
-    VLOG(0) << "yxf::mg_8:merge_kernel_1: " << mg_time_8[0];
   }
 }
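The deleted destructor block followed a per-device-counter pattern: one timing slot per GPU, folded into slot 0 at teardown and logged. A minimal sketch of that aggregation (hypothetical helper, not a PaddlePaddle API):

#include <numeric>
#include <vector>

// Fold per-GPU timing slots into a single total, as the removed debug
// code did by accumulating every slot into element 0 before logging.
double total_elapsed_sec(const std::vector<double>& per_gpu_sec) {
  return std::accumulate(per_gpu_sec.begin(), per_gpu_sec.end(), 0.0);
}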

@@ -739,7 +713,6 @@ void HeterComm<KeyType, ValType, GradType>::merge_grad(int gpu_num,
       d_idx, d_index, len, 0, 8 * sizeof(KeyType), stream));
   PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
   timeline.Pause();
-  mg_time_4[gpu_num] += timeline.ElapsedSec();
   timeline.Start();
   temp_storage_bytes = 0;
   PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode(
@@ -757,7 +730,6 @@ void HeterComm<KeyType, ValType, GradType>::merge_grad(int gpu_num,
                   cudaMemcpyDeviceToHost, stream);
   PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
   timeline.Pause();
-  mg_time_5[gpu_num] += timeline.ElapsedSec();
   timeline.Start();
 
   assert(d_merged_size > 0);
@@ -776,23 +748,20 @@ void HeterComm<KeyType, ValType, GradType>::merge_grad(int gpu_num,
                                      uniq_len, stream));
   PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
   timeline.Pause();
-  mg_time_6[gpu_num] += timeline.ElapsedSec();
   timeline.Start();
   grid_size = (uniq_len - 1) / block_size_ + 1;
   merge_gradient_kernel<<<grid_size, block_size_, 0, stream>>>(
       d_offset, d_fea_num_info_ptr, d_index, (char*)d_grads,
       (char*)d_merge_grads_ptr, uniq_len, grad_value_size, merger_);
   PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
   timeline.Pause();
-  mg_time_7[gpu_num] += timeline.ElapsedSec();
   timeline.Start();
 
   PADDLE_ENFORCE_GPU_SUCCESS(
       cudaMemcpyAsync(d_grads, d_merge_grads_ptr, grad_value_size * uniq_len,
                       cudaMemcpyDeviceToDevice, stream));
   PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
   timeline.Pause();
-  mg_time_1[gpu_num] += timeline.ElapsedSec();
   timeline.Start();
 }
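Apart from the mg_time_* bookkeeping, merge_grad is unchanged: it still radix-sorts the keys, run-length-encodes them to find duplicates, and merges gradients per unique key. A self-contained sketch of the CUB encode step, with hypothetical buffer names, using CUB's usual two-phase call (a null first pass to size the scratch buffer, then the real pass):

#include <cub/cub.cuh>
#include <cstdint>

// Count duplicates among sorted keys with CUB run-length encoding.
// d_unique_keys/d_counts/d_num_unique are device buffers sized by the
// caller; the counts drive the per-key gradient merge that follows.
void encode_sorted_keys(const uint64_t* d_keys_sorted,
                        uint64_t* d_unique_keys, int* d_counts,
                        int* d_num_unique, int len, cudaStream_t stream) {
  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  // First call with a null workspace only computes temp_storage_bytes.
  cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes,
                                     d_keys_sorted, d_unique_keys, d_counts,
                                     d_num_unique, len, stream);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes,
                                     d_keys_sorted, d_unique_keys, d_counts,
                                     d_num_unique, len, stream);
  cudaStreamSynchronize(stream);
  cudaFree(d_temp_storage);
}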

@@ -938,7 +907,6 @@ void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num,
       ptr_tables_[i]->rwlock_->UNLock();
     }
     time_lines[i].Pause();
-    mg_time_2[i] += time_lines[i].ElapsedSec();
   }
 
   if (!multi_mf_dim_) {
@@ -1098,7 +1066,6 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int gpu_num,
        ptr_tables_[i]->rwlock_->UNLock();
      }
      time_lines[i].Pause();
-     mg_time_3[i] += time_lines[i].ElapsedSec();
    }
  }
  for (int i = 0; i < total_gpu; ++i) {
11 changes: 2 additions & 9 deletions paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
@@ -186,7 +186,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) {
       for (const auto feasign : feasign_v) {
         int shard_id = feasign % thread_keys_shard_num_;
         if (slot_idx >= slot_index_vec_.size()) {
-          VLOG(0) << "yxf::WRONG:::slot_idx: " << slot_idx << " size: " <<
+          VLOG(0) << "WRONG::slot_idx: " << slot_idx << " slot_index_vec_size: " <<
           slot_index_vec_.size();
         }
         int dim_id = slot_index_vec_[slot_idx];
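The guarded lookup above maps a slot to its embedding dim; the shard id on the line before it is a plain modulo over the feature sign. A minimal sketch of that partitioning (hypothetical helper, not the PaddlePaddle API):

#include <cstdint>
#include <vector>

// Assign feature signs to shards by modulo, mirroring
// `feasign % thread_keys_shard_num_` in PreBuildTask.
std::vector<std::vector<uint64_t>> shard_feasigns(
    const std::vector<uint64_t>& feasigns, int shard_num) {
  std::vector<std::vector<uint64_t>> shards(shard_num);
  for (uint64_t feasign : feasigns) {
    shards[feasign % shard_num].push_back(feasign);
  }
  return shards;
}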
@@ -1126,8 +1126,7 @@ void PSGPUWrapper::EndPass() {
         if (gpu_val->mf_size > 0) {
           for (int x = 0; x < gpu_val->mf_dim + 1; x++) {
             if (x + 8 >= int(downpour_value->size())) {
-              VLOG(0) << "yxfff::14: x: " << x
-                      << " size: " << downpour_value_size;
+              VLOG(0) << "x: " << x << " size: "<< downpour_value_size;
             }
             cpu_val[x + 8] = gpu_val->mf[x];
           }
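The `x + 8` offset lines up with the eight scalar fields of FeatureValue (delta_score, show, clk, slot, lr, lr_g2sum, mf_size, mf_dim) that precede the embedding, as listed in the block deleted from dy_mf_search_kernel above. A sketch of the copy-back under that layout assumption (hypothetical helper; the 8-field layout is inferred, not a documented spec):

#include <cstddef>
#include <vector>

constexpr std::size_t kFixedFields = 8;  // scalar stats ahead of mf[]

// Copy mf_dim + 1 embedding values into the flattened CPU value after
// the fixed fields, growing the buffer if the bounds check would fail.
void copy_mf_to_cpu(const float* gpu_mf, int mf_dim,
                    std::vector<float>* downpour_value) {
  const std::size_t need =
      kFixedFields + static_cast<std::size_t>(mf_dim) + 1;
  if (downpour_value->size() < need) downpour_value->resize(need, 0.0f);
  float* cpu_val = downpour_value->data();
  for (int x = 0; x < mf_dim + 1; ++x) {
    cpu_val[kFixedFields + x] = gpu_mf[x];
  }
}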
@@ -1165,12 +1164,6 @@ void PSGPUWrapper::EndPass() {
   gpu_free_channel_->Put(current_task_);
   fleet_ptr->pslib_ptr_->_worker_ptr->release_table_mutex(this->table_id_);
   timer.Pause();
-  // timer.Pause();
-  // VLOG(1) << "EndPass end, cost time: " << timer.ElapsedSec() << "s";
-  // VLOG(1) << "yxf::pull: " << time_1;
-  // VLOG(1) << "yxf::pull_1: " << time_2;
-  // VLOG(1) << "yxf::push: " << time_3;
-  // VLOG(1) << "yxf::push_1: " << time_4;
   VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s";
 }

