Move the not_need_stop tensor to CPU

PaddlePaddle · Feb 24, 2025 · 2fb3378 · 2fb3378
1 parent 3102788
commit 2fb3378
Show file tree

Hide file tree

Showing 5 changed files with 13 additions and 129 deletions.
diff --git a/csrc/gpu/save_with_output_msg.cc b/csrc/gpu/save_with_output_msg.cc
@@ -27,20 +27,19 @@ struct msgdata {
 };
 
 void SaveOutMmsg(const paddle::Tensor& x,
-                 const paddle::Tensor& not_need_stop,
+                 const paddle::Tensor& not_need_stop, // cpu
                  int64_t rank_id) {
     if (rank_id > 0) return;
     auto x_cpu = x.copy_to(paddle::CPUPlace(), false);
     int64_t *x_data = x_cpu.data<int64_t>();
-    auto not_need_stop_cpu = not_need_stop.copy_to(paddle::CPUPlace(), false);
-    bool* not_need_stop_data = not_need_stop_cpu.data<bool>();
+    auto not_need_stop_data = not_need_stop.data<bool>()[0];
 
     static struct msgdata msg_sed;
     static key_t key = ftok("./", 1);
     static int msgid = msgget(key, IPC_CREAT | 0666);
 
     msg_sed.mtype = 1;
-    msg_sed.mtext[0] = not_need_stop_data[0] ? 1 : -1;
+    msg_sed.mtext[0] = not_need_stop_data ? 1 : -1;
     int bsz = x.shape()[0];
     msg_sed.mtext[1] = bsz;
     for (int i = 2; i < bsz + 2; i++) {

diff --git a/csrc/gpu/update_inputs.cu b/csrc/gpu/update_inputs.cu
diff --git a/csrc/gpu/update_inputs_v2.cu b/csrc/gpu/update_inputs_v2.cu
@@ -110,7 +110,7 @@ __global__ void update_inputs_kernel_v2(
 
 void UpdateInputesV2(const paddle::Tensor& stop_flags,
                const paddle::Tensor& step_idx,
-               const paddle::Tensor& not_need_stop,
+               const paddle::Tensor& not_need_stop, // cpu
                const paddle::Tensor& seq_lens_this_time,
                const paddle::Tensor& seq_lens_encoder,
                const paddle::Tensor& seq_lens_decoder,
@@ -125,8 +125,11 @@ void UpdateInputesV2(const paddle::Tensor& stop_flags,
   const int now_bsz = seq_lens_this_time.shape()[0];
   const int input_ids_stride = input_ids.shape()[1];
   const int end_length = end_ids.shape()[0];
+
+  auto not_need_stop_gpu = not_need_stop.copy_to(stop_flags.place(), false);
+
   update_inputs_kernel_v2<1024><<<1, 1024, 0, input_ids.stream()>>>(
-    const_cast<bool*>(not_need_stop.data<bool>()),
+    const_cast<bool*>(not_need_stop_gpu.data<bool>()),
     const_cast<int64_t*>(step_idx.data<int64_t>()),
     const_cast<bool*>(stop_flags.data<bool>()),
     const_cast<int*>(seq_lens_this_time.data<int>()),
@@ -144,6 +147,10 @@ void UpdateInputesV2(const paddle::Tensor& stop_flags,
     input_ids_stride,
     end_length
   );
+
+  auto not_need_stop_cpu = not_need_stop_gpu.copy_to(not_need_stop.place(), false);
+  bool *not_need_stop_data = const_cast<bool*>(not_need_stop.data<bool>());
+  not_need_stop_data[0] = not_need_stop_cpu.data<bool>()[0];
 }
 
 PD_BUILD_OP(update_inputs_v2)

diff --git a/csrc/setup_cuda.py b/csrc/setup_cuda.py
@@ -96,7 +96,6 @@ def get_gencode_flags():
     "./gpu/rebuild_padding_v2.cu",
     "./gpu/set_value_by_flags_v2.cu",
     "./gpu/stop_generation_multi_ends_v2.cu",
-    "./gpu/update_inputs.cu",
     "./gpu/get_output.cc",
     "./gpu/save_with_output_msg.cc",
     "./gpu/write_int8_cache_kv.cu",

diff --git a/llm/predict/predictor.py b/llm/predict/predictor.py
@@ -979,7 +979,7 @@ def _preprocess(self, input_text: list[str]):
             shape=[self.config.batch_size, 1], fill_value=0, dtype="int32"
         )
         self.model_inputs["step_idx"] = paddle.full(shape=[self.config.batch_size, 1], fill_value=0, dtype="int64")
-        self.model_inputs["not_need_stop"] = paddle.full(shape=[1], fill_value=True, dtype="bool")
+        self.model_inputs["not_need_stop"] = paddle.full(shape=[1], fill_value=True, dtype="bool").cpu()  # cpu
         self.model_inputs["stop_flags"] = paddle.full(
             shape=[self.config.batch_size, 1], fill_value=False, dtype="bool"
         )