[GPUPS]fix debug info (PaddlePaddle#23)
* fix debug info

* fix debug info

* fix debug info

* fix
zmxdream authored Jun 24, 2022
1 parent 2c61e92 commit 6db0596
Showing 4 changed files with 10 additions and 65 deletions.
23 changes: 8 additions & 15 deletions paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h
@@ -72,6 +72,8 @@ __global__ void search_kernel(Table* table,
     auto it = table->find(keys[i]);
     if (it != table->end()) {
       vals[i] = it->second;
+    } else {
+      printf("pull miss key: %llu", keys[i]);
     }
   }
 }
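Context for the hunk above (not part of the commit): the table keys are 64-bit feature signs, so device-side printf needs %llu; the %d used before this commit reads only 32 bits and garbles the logged key. A minimal standalone sketch of the miss-logging pattern, with hypothetical names:

#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the hash-table miss path: keys are 64-bit
// feature signs, so device printf takes %llu (plus a cast); %d would
// truncate them to 32 bits.
__global__ void log_missing_keys(const uint64_t* keys, const bool* found,
                                 size_t len) {
  const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len && !found[i]) {
    printf("pull miss key: %llu\n", (unsigned long long)keys[i]);
  }
}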
@@ -101,20 +103,7 @@ __global__ void dy_mf_search_kernel(Table* table,
         cur->mf[j] = input.mf[j];
       }
     } else {
-      if (keys[i] != 0) printf("pull miss key: %d", keys[i]);
-      FeatureValue* cur = (FeatureValue*)(vals + i * pull_feature_value_size);
-      cur->delta_score = 0;
-      cur->show = 0;
-      cur->clk = 0;
-      cur->slot = -1;
-      cur->lr = 0;
-      cur->lr_g2sum = 0;
-      cur->mf_size = 0;
-      cur->mf_dim = 8;
-      cur->cpu_ptr;
-      for (int j = 0; j < cur->mf_dim + 1; j++) {
-        cur->mf[j] = 0;
-      }
+      if (keys[i] != 0) printf("pull miss key: %llu", keys[i]);
     }
   }
 }
@@ -191,6 +180,8 @@ __global__ void update_kernel(Table* table,
     auto it = table->find(keys[i]);
     if (it != table->end()) {
       sgd.update_value((it.getter())->second, grads[i], p_state[i]);
+    } else {
+      printf("push miss key: %llu", keys[i]);
     }
   }
 }
@@ -205,6 +196,8 @@ __global__ void update_kernel(Table* table,
     auto it = table->find(keys[i]);
     if (it != table->end()) {
       sgd.update_value((it.getter())->second, grads[i]);
+    } else {
+      printf("push miss key: %llu", keys[i]);
     }
   }
 }
@@ -221,7 +214,7 @@ __global__ void dy_mf_update_kernel(Table* table,
       FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size);
       sgd.dy_mf_update_value((it.getter())->second, *cur);
     } else {
-      if (keys[i] != 0) printf("push miss key: %d", keys[i]);
+      if (keys[i] != 0) printf("push miss key: %llu", keys[i]);
     }
   }
 }
8 changes: 0 additions & 8 deletions paddle/fluid/framework/fleet/heter_ps/heter_comm.h
@@ -232,14 +232,6 @@ class HeterComm {
   int multi_node_{0};
   std::vector<ncclComm_t> nccl_inner_comms_;
   std::vector<ncclComm_t> nccl_inter_comms_;
-  std::vector<double> mg_time_1;
-  std::vector<double> mg_time_2;
-  std::vector<double> mg_time_3;
-  std::vector<double> mg_time_4;
-  std::vector<double> mg_time_5;
-  std::vector<double> mg_time_6;
-  std::vector<double> mg_time_7;
-  std::vector<double> mg_time_8;
   int node_size_;
   std::vector<std::shared_ptr<cub::CachingDeviceAllocator>> allocators_;
   int multi_mf_dim_{8};
33 changes: 0 additions & 33 deletions paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
@@ -176,14 +176,6 @@ HeterComm<KeyType, ValType, GradType>::HeterComm(
       storage_[i].init(feanum_, resource_->dev_id(i));
     }
   }
-  mg_time_1 = std::vector<double>(resource_->total_gpu(), 0.0);
-  mg_time_2 = std::vector<double>(resource_->total_gpu(), 0.0);
-  mg_time_3 = std::vector<double>(resource_->total_gpu(), 0.0);
-  mg_time_4 = std::vector<double>(resource_->total_gpu(), 0.0);
-  mg_time_5 = std::vector<double>(resource_->total_gpu(), 0.0);
-  mg_time_6 = std::vector<double>(resource_->total_gpu(), 0.0);
-  mg_time_7 = std::vector<double>(resource_->total_gpu(), 0.0);
-  mg_time_8 = std::vector<double>(resource_->total_gpu(), 0.0);
   init_path();
 }

@@ -498,24 +490,6 @@ HeterComm<KeyType, ValType, GradType>::~HeterComm() {
       delete table;
       table = nullptr;
     }
-    for (size_t i = 1; i < mg_time_1.size(); i++) {
-      mg_time_1[0] += mg_time_1[i];
-      mg_time_2[0] += mg_time_2[i];
-      mg_time_3[0] += mg_time_3[i];
-      mg_time_4[0] += mg_time_4[i];
-      mg_time_5[0] += mg_time_5[i];
-      mg_time_6[0] += mg_time_6[i];
-      mg_time_7[0] += mg_time_7[i];
-      mg_time_8[0] += mg_time_8[i];
-    }
-    VLOG(0) << "yxfffff::mg_1::merge: " << mg_time_1[0];
-    VLOG(0) << "yxf::mg_2:pull: " << mg_time_2[0];
-    VLOG(0) << "yxf::mg_3:push: " << mg_time_3[0];
-    VLOG(0) << "yxf::mg_4:sort: " << mg_time_4[0];
-    VLOG(0) << "yxf::mg_5:encode: " << mg_time_5[0];
-    VLOG(0) << "yxf::mg_6:sum: " << mg_time_6[0];
-    VLOG(0) << "yxf::mg_7:merge_kernel: " << mg_time_7[0];
-    VLOG(0) << "yxf::mg_8:merge_kernel_1: " << mg_time_8[0];
   }
 }
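The deleted destructor block followed a per-device-counter pattern: one timing slot per GPU, folded into slot 0 at teardown and logged. A minimal sketch of that aggregation (hypothetical helper, not a PaddlePaddle API):

#include <numeric>
#include <vector>

// Fold per-GPU timing slots into a single total, as the removed debug
// code did by accumulating every slot into element 0 before logging.
double total_elapsed_sec(const std::vector<double>& per_gpu_sec) {
  return std::accumulate(per_gpu_sec.begin(), per_gpu_sec.end(), 0.0);
}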

@@ -739,7 +713,6 @@ void HeterComm<KeyType, ValType, GradType>::merge_grad(int gpu_num,
       d_idx, d_index, len, 0, 8 * sizeof(KeyType), stream));
   PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
   timeline.Pause();
-  mg_time_4[gpu_num] += timeline.ElapsedSec();
   timeline.Start();
   temp_storage_bytes = 0;
   PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode(
@@ -757,7 +730,6 @@ void HeterComm<KeyType, ValType, GradType>::merge_grad(int gpu_num,
                   cudaMemcpyDeviceToHost, stream);
   PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
   timeline.Pause();
-  mg_time_5[gpu_num] += timeline.ElapsedSec();
   timeline.Start();
 
   assert(d_merged_size > 0);
@@ -776,23 +748,20 @@ void HeterComm<KeyType, ValType, GradType>::merge_grad(int gpu_num,
                                      uniq_len, stream));
   PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
   timeline.Pause();
-  mg_time_6[gpu_num] += timeline.ElapsedSec();
   timeline.Start();
   grid_size = (uniq_len - 1) / block_size_ + 1;
   merge_gradient_kernel<<<grid_size, block_size_, 0, stream>>>(
       d_offset, d_fea_num_info_ptr, d_index, (char*)d_grads,
       (char*)d_merge_grads_ptr, uniq_len, grad_value_size, merger_);
   PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
   timeline.Pause();
-  mg_time_7[gpu_num] += timeline.ElapsedSec();
   timeline.Start();
 
   PADDLE_ENFORCE_GPU_SUCCESS(
       cudaMemcpyAsync(d_grads, d_merge_grads_ptr, grad_value_size * uniq_len,
                       cudaMemcpyDeviceToDevice, stream));
   PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
   timeline.Pause();
-  mg_time_1[gpu_num] += timeline.ElapsedSec();
   timeline.Start();
 }
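Apart from the mg_time_* bookkeeping, merge_grad is unchanged: it still radix-sorts the keys, run-length-encodes them to find duplicates, and merges gradients per unique key. A self-contained sketch of the CUB encode step, with hypothetical buffer names, using CUB's usual two-phase call (a null first pass to size the scratch buffer, then the real pass):

#include <cub/cub.cuh>
#include <cstdint>

// Count duplicates among sorted keys with CUB run-length encoding.
// d_unique_keys/d_counts/d_num_unique are device buffers sized by the
// caller; the counts drive the per-key gradient merge that follows.
void encode_sorted_keys(const uint64_t* d_keys_sorted,
                        uint64_t* d_unique_keys, int* d_counts,
                        int* d_num_unique, int len, cudaStream_t stream) {
  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  // First call with a null workspace only computes temp_storage_bytes.
  cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes,
                                     d_keys_sorted, d_unique_keys, d_counts,
                                     d_num_unique, len, stream);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes,
                                     d_keys_sorted, d_unique_keys, d_counts,
                                     d_num_unique, len, stream);
  cudaStreamSynchronize(stream);
  cudaFree(d_temp_storage);
}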

@@ -938,7 +907,6 @@ void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num,
       ptr_tables_[i]->rwlock_->UNLock();
     }
     time_lines[i].Pause();
-    mg_time_2[i] += time_lines[i].ElapsedSec();
   }
 
   if (!multi_mf_dim_) {
@@ -1098,7 +1066,6 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int gpu_num,
        ptr_tables_[i]->rwlock_->UNLock();
      }
      time_lines[i].Pause();
-     mg_time_3[i] += time_lines[i].ElapsedSec();
    }
  }
  for (int i = 0; i < total_gpu; ++i) {
11 changes: 2 additions & 9 deletions paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
@@ -186,7 +186,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) {
       for (const auto feasign : feasign_v) {
         int shard_id = feasign % thread_keys_shard_num_;
         if (slot_idx >= slot_index_vec_.size()) {
-          VLOG(0) << "yxf::WRONG:::slot_idx: " << slot_idx << " size: " <<
+          VLOG(0) << "WRONG::slot_idx: " << slot_idx << " slot_index_vec_size: " <<
           slot_index_vec_.size();
         }
         int dim_id = slot_index_vec_[slot_idx];
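The guarded lookup above maps a slot to its embedding dim; the shard id on the line before it is a plain modulo over the feature sign. A minimal sketch of that partitioning (hypothetical helper, not the PaddlePaddle API):

#include <cstdint>
#include <vector>

// Assign feature signs to shards by modulo, mirroring
// `feasign % thread_keys_shard_num_` in PreBuildTask.
std::vector<std::vector<uint64_t>> shard_feasigns(
    const std::vector<uint64_t>& feasigns, int shard_num) {
  std::vector<std::vector<uint64_t>> shards(shard_num);
  for (uint64_t feasign : feasigns) {
    shards[feasign % shard_num].push_back(feasign);
  }
  return shards;
}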
@@ -1126,8 +1126,7 @@ void PSGPUWrapper::EndPass() {
         if (gpu_val->mf_size > 0) {
           for (int x = 0; x < gpu_val->mf_dim + 1; x++) {
             if (x + 8 >= int(downpour_value->size())) {
-              VLOG(0) << "yxfff::14: x: " << x
-                      << " size: " << downpour_value_size;
+              VLOG(0) << "x: " << x << " size: "<< downpour_value_size;
             }
             cpu_val[x + 8] = gpu_val->mf[x];
           }
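The `x + 8` offset lines up with the eight scalar fields of FeatureValue (delta_score, show, clk, slot, lr, lr_g2sum, mf_size, mf_dim) that precede the embedding, as listed in the block deleted from dy_mf_search_kernel above. A sketch of the copy-back under that layout assumption (hypothetical helper; the 8-field layout is inferred, not a documented spec):

#include <cstddef>
#include <vector>

constexpr std::size_t kFixedFields = 8;  // scalar stats ahead of mf[]

// Copy mf_dim + 1 embedding values into the flattened CPU value after
// the fixed fields, growing the buffer if the bounds check would fail.
void copy_mf_to_cpu(const float* gpu_mf, int mf_dim,
                    std::vector<float>* downpour_value) {
  const std::size_t need =
      kFixedFields + static_cast<std::size_t>(mf_dim) + 1;
  if (downpour_value->size() < need) downpour_value->resize(need, 0.0f);
  float* cpu_val = downpour_value->data();
  for (int x = 0; x < mf_dim + 1; ++x) {
    cpu_val[kFixedFields + x] = gpu_mf[x];
  }
}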
@@ -1165,12 +1164,6 @@ void PSGPUWrapper::EndPass() {
   gpu_free_channel_->Put(current_task_);
   fleet_ptr->pslib_ptr_->_worker_ptr->release_table_mutex(this->table_id_);
   timer.Pause();
-  // timer.Pause();
-  // VLOG(1) << "EndPass end, cost time: " << timer.ElapsedSec() << "s";
-  // VLOG(1) << "yxf::pull: " << time_1;
-  // VLOG(1) << "yxf::pull_1: " << time_2;
-  // VLOG(1) << "yxf::push: " << time_3;
-  // VLOG(1) << "yxf::push_1: " << time_4;
   VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s";
 }

