|
15 | 15 | #include "helper.h" |
16 | 16 | #include "paddle/extension.h" |
17 | 17 | #ifndef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU |
18 | | -#include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h" |
19 | 18 | #include "paddle/phi/core/memory/memcpy.h" |
20 | 19 | #endif |
21 | 20 | #include "utils.cuh" |
@@ -288,13 +287,9 @@ void GetBlockShapeAndSplitKVBlock( |
288 | 287 | seq_lens_encoder.data<int>(), |
289 | 288 | max_len_tensor_gpu.data<int>(), |
290 | 289 | bsz); |
291 | | - // Note (sunxin): Skip capturing the DtoH copy (it's time-consuming); CPU data |
292 | | - // is only for branching in attention. |
293 | | -#ifndef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU |
294 | | - if (!phi::backends::gpu::IsCUDAGraphCapturing()) |
295 | | -#endif |
296 | | - max_len_tensor_cpu.copy_( |
297 | | - max_len_tensor_gpu, max_len_tensor_cpu.place(), false); |
| 290 | + |
| 291 | + max_len_tensor_cpu.copy_( |
| 292 | + max_len_tensor_gpu, max_len_tensor_cpu.place(), false); |
298 | 293 |
|
299 | 294 | auto max_len_cpu_ptr = max_len_tensor_cpu.data<int>(); |
300 | 295 | int max_len_this_time = max_len_cpu_ptr[0]; |
@@ -403,13 +398,9 @@ void GetBlockShapeAndSplitKVBlock( |
403 | 398 | bsz, |
404 | 399 | decoder_block_shape_q, |
405 | 400 | group_size); |
406 | | - // Note (sunxin): Skip capturing the DtoH copy (it's time-consuming); CPU |
407 | | - // data is only for branching in attention. |
408 | | -#ifndef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU |
409 | | - if (!phi::backends::gpu::IsCUDAGraphCapturing()) |
410 | | -#endif |
411 | | - decoder_num_blocks_cpu.copy_( |
412 | | - decoder_num_blocks_device, decoder_num_blocks_cpu.place(), false); |
| 401 | + |
| 402 | + decoder_num_blocks_cpu.copy_( |
| 403 | + decoder_num_blocks_device, decoder_num_blocks_cpu.place(), false); |
413 | 404 | PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync( |
414 | 405 | decoder_chunk_size_device.data<int>(), 64, sizeof(int32_t), stream)); |
415 | 406 | } |
|
0 commit comments