From 9afec886a2609177c3f3ae1742ecf9d20066e21f Mon Sep 17 00:00:00 2001
From: tianyuzhou668 <2431054748@qq.com>
Date: Tue, 8 Jul 2025 10:43:19 +0800
Subject: [PATCH 1/2] [ILUVATAR_GPU] Remove non-essential patches.

---
 .../iluvatar_gpu/patches/paddle-corex.patch | 40 ++------------------
 1 file changed, 3 insertions(+), 37 deletions(-)

diff --git a/backends/iluvatar_gpu/patches/paddle-corex.patch b/backends/iluvatar_gpu/patches/paddle-corex.patch
index c2b68174b06..7194d2bdd3e 100644
--- a/backends/iluvatar_gpu/patches/paddle-corex.patch
+++ b/backends/iluvatar_gpu/patches/paddle-corex.patch
@@ -1,12 +1,10 @@
-From bbd64f7c4396f12bc13020ba959e2cc89f1011bf Mon Sep 17 00:00:00 2001
+From 228ab20dfcf53fe7af6ce903b7b3d60333518233 Mon Sep 17 00:00:00 2001
 From: "tianyu.zhou"
 Date: Mon, 26 May 2025 14:21:47 +0800
-Subject: [PATCH] Adapt for Iluvatar 0707.
+Subject: [PATCH] Adapt for Iluvatar 0708.
 
-Fix bug.
 ---
  CMakeLists.txt                                |  2 +-
- paddle/common/backend_header.h                |  2 +-
  .../operators/collective/recv_v2_op.cu.cc     |  2 +-
  .../operators/collective/send_v2_op.cu.cc     |  2 +-
  .../fluid/platform/device/gpu/nccl_helper.h   |  2 +-
@@ -39,8 +37,7 @@ Fix bug.
  paddle/phi/kernels/squeeze_kernel.cc          |  2 ++
  paddle/phi/kernels/strided_slice_kernel.cc    |  2 ++
  paddle/phi/kernels/unsqueeze_kernel.cc        |  2 ++
- paddle/utils/flat_hash_map.h                  |  5 ++++
- 35 files changed, 122 insertions(+), 37 deletions(-)
+ 33 files changed, 116 insertions(+), 36 deletions(-)
 
 diff --git a/CMakeLists.txt b/CMakeLists.txt
 index b2c4e6a650..c51f3df1f0 100755
@@ -55,19 +52,6 @@ index b2c4e6a650..c51f3df1f0 100755
  option(CINN_WITH_CUDNN "Compile CINN with CUDNN support" ON)
  option(WITH_PIP_CUDA_LIBRARIES "Paddle uses the CUDA library provided by NVIDIA" ON)
-diff --git a/paddle/common/backend_header.h b/paddle/common/backend_header.h
-index 240b04c3ad..82f5aca792 100644
---- a/paddle/common/backend_header.h
-+++ b/paddle/common/backend_header.h
-@@ -18,7 +18,7 @@
- #include <cuda_fp16.h>
- #endif
- 
--#if defined(__CUDACC__) && CUDA_VERSION >= 11000
-+#if defined(__CUDACC__) && (CUDA_VERSION >= 11000 || defined(PADDLE_WITH_COREX))
- #define PADDLE_CUDA_BF16
- #include <cuda_bf16.h>
- #endif
 diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc
 index ab866f015c..10a8111637 100644
 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc
 +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc
@@ -703,23 +687,5 @@ index c30752337d..dc1ca3bdc5 100644
 
  #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  PD_REGISTER_KERNEL(unsqueeze,
                     GPU,
-diff --git a/paddle/utils/flat_hash_map.h b/paddle/utils/flat_hash_map.h
-index b643fc1a57..960bc7d9b1 100644
---- a/paddle/utils/flat_hash_map.h
-+++ b/paddle/utils/flat_hash_map.h
-@@ -683,8 +683,13 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal {
-   }
- 
-   size_t num_buckets_for_reserve(size_t num_elements) const {
-+#ifdef PADDLE_WITH_COREX
-+    return static_cast<size_t>(std::ceil(
-+        num_elements / std::min((double)0.5, static_cast<double>(_max_load_factor))));
-+#else
-     return static_cast<size_t>(std::ceil(
-         num_elements / std::min(0.5, static_cast<double>(_max_load_factor))));
-+#endif
-   }
-   void rehash_for_other_container(const sherwood_v3_table &other) {
-     rehash(
-- 
2.34.1


From 517b0de259c51e012b73b4cb8d5401361cee92da Mon Sep 17 00:00:00 2001
From: tianyuzhou668 <2431054748@qq.com>
Date: Wed, 9 Jul 2025 18:47:35 +0800
Subject: [PATCH 2/2] [ILUVATAR_GPU] Update flash_attn kernel.

---
 backends/iluvatar_gpu/common/cuda_flags.cc    |  6 ++++++
 .../cuda_kernels/flash_attn_grad_kernel.cu    |  5 ++++-
 .../kernels/cuda_kernels/flash_attn_kernel.cu |  8 ++++++--
 .../iluvatar_gpu/patches/paddle-corex.patch   | 20 +++-----------------
 4 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/backends/iluvatar_gpu/common/cuda_flags.cc b/backends/iluvatar_gpu/common/cuda_flags.cc
index 3c55d83b803..8f721f45ccc 100644
--- a/backends/iluvatar_gpu/common/cuda_flags.cc
+++ b/backends/iluvatar_gpu/common/cuda_flags.cc
@@ -291,4 +291,10 @@ PHI_DEFINE_EXPORTED_int32(
     1,
     "Whether use the impMode of ixdnn for flash attention "
     ", default is CUDNN_FATTN_LEAST_MEM_MODE.");
+
+PHI_DEFINE_EXPORTED_int32(
+    ixdnn_causal_mode,
+    0,
+    "Whether use the causalMode of ixdnn for flash attention "
+    ", default is 0.");
 #endif
diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/flash_attn_grad_kernel.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/flash_attn_grad_kernel.cu
index c97d69fb289..f4a58f1a8a0 100644
--- a/backends/iluvatar_gpu/kernels/cuda_kernels/flash_attn_grad_kernel.cu
+++ b/backends/iluvatar_gpu/kernels/cuda_kernels/flash_attn_grad_kernel.cu
@@ -29,6 +29,7 @@
 #include "paddle/phi/kernels/reduce_sum_kernel.h"
 #include "paddle/phi/kernels/slice_kernel.h"
 COMMON_DECLARE_int32(ixdnn_imp_mode);
+COMMON_DECLARE_int32(ixdnn_causal_mode);
 
 COMMON_DECLARE_bool(cudnn_deterministic);
 
@@ -253,7 +254,7 @@ void FlashAttnUnpaddedGradBaseKernel(
   bool is_mha = (num_heads == num_heads_k);
 
   int64_t total_q = dims[0];
-  bool is_unpad = (total_q == batch_size * max_seqlen_q) ? false : true;
+  bool is_unpad = true;
   const int64_t head_size_rounded = head_size + 32 - head_size % 32;
 
   DenseTensor q_padded, k_padded, v_padded, out_padded, dout_padded;
@@ -313,6 +314,7 @@ void FlashAttnUnpaddedGradBaseKernel(
   flashAttnInfo.softmax_scale = std::sqrt(1.f / head_size);
   flashAttnInfo.dropout_prob = dropout;
   flashAttnInfo.is_causal = causal;
+  flashAttnInfo.causal_mode = FLAGS_ixdnn_causal_mode;
   // flashAttnInfo.is_alibi = use_alibi;
   // flashAttnInfo.alibi_mode = alibi_mode;
   flashAttnInfo.return_softmax_lse = false;
@@ -718,6 +720,7 @@ void FlashAttnGradBaseKernel(
   flashAttnInfo.softmax_scale = softmax_scale;
   flashAttnInfo.dropout_prob = dropout;
   flashAttnInfo.is_causal = causal;
+  flashAttnInfo.causal_mode = FLAGS_ixdnn_causal_mode;
   // flashAttnInfo.is_alibi = use_alibi;
   // flashAttnInfo.alibi_mode = alibi_mode;
   flashAttnInfo.return_softmax_lse = false;
diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/flash_attn_kernel.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/flash_attn_kernel.cu
index fb40af9a70f..fa29f489bb9 100644
--- a/backends/iluvatar_gpu/kernels/cuda_kernels/flash_attn_kernel.cu
+++ b/backends/iluvatar_gpu/kernels/cuda_kernels/flash_attn_kernel.cu
@@ -26,6 +26,7 @@
 #include "paddle/phi/kernels/pad_kernel.h"
 #include "paddle/phi/kernels/slice_kernel.h"
 COMMON_DECLARE_int32(ixdnn_imp_mode);
+COMMON_DECLARE_int32(ixdnn_causal_mode);
 namespace phi {
 
 template <typename T, typename Context>
@@ -100,7 +101,7 @@ void FlashAttnUnpaddedBaseKernel(
   // TODO(umiswing): add shape check
   // ixdnn
   int64_t total_q = dims[0];
-  bool is_unpad = (total_q == batch_size * max_seqlen_q) ? false : true;
+  bool is_unpad = true;
   const int64_t head_size_rounded = head_size + 32 - head_size % 32;
 
   DenseTensor q_padded, k_padded, v_padded;
@@ -152,7 +153,7 @@ void FlashAttnUnpaddedBaseKernel(
   flashAttnInfo.softmax_scale = std::sqrt(1.f / head_size);
   flashAttnInfo.dropout_prob = is_test ? 0.0f : dropout;
   flashAttnInfo.is_causal = causal;
-  flashAttnInfo.causal_mode = 1;
+  flashAttnInfo.causal_mode = FLAGS_ixdnn_causal_mode;
   // flashAttnInfo.is_alibi = use_alibi;
   // flashAttnInfo.alibi_mode = alibi_mode;
   flashAttnInfo.return_softmax_lse = true;
@@ -283,6 +284,7 @@ void FlashAttnUnpaddedBaseKernel(
         out->data(),
         softmax_lse->data()));
 
+  cudaDeviceSynchronize();
   out->Resize({total_q, num_heads, head_size});
 
   phi::dynload::cudnnDestroyFlashAttnDescriptor(flashAttnDesc);
@@ -511,6 +513,7 @@ void FlashAttnBaseKernel(
   flashAttnInfo.softmax_scale = std::sqrt(1.f / head_size);
   flashAttnInfo.dropout_prob = dropout;
   flashAttnInfo.is_causal = causal;
+  flashAttnInfo.causal_mode = FLAGS_ixdnn_causal_mode;
   // flashAttnInfo.is_alibi = use_alibi;
   // flashAttnInfo.alibi_mode = alibi_mode;
   flashAttnInfo.return_softmax_lse = true;
@@ -642,6 +645,7 @@ void FlashAttnBaseKernel(
         out->data(),
         softmax_lse->data()));
 
+  cudaDeviceSynchronize();
   phi::dynload::cudnnDestroyFlashAttnDescriptor(flashAttnDesc);
   PADDLE_ENFORCE_GPU_SUCCESS(
       phi::dynload::cudnnDestroyTensorDescriptor(q_desc));
diff --git a/backends/iluvatar_gpu/patches/paddle-corex.patch b/backends/iluvatar_gpu/patches/paddle-corex.patch
index 7194d2bdd3e..3952781c76b 100644
--- a/backends/iluvatar_gpu/patches/paddle-corex.patch
+++ b/backends/iluvatar_gpu/patches/paddle-corex.patch
@@ -1,7 +1,7 @@
-From 228ab20dfcf53fe7af6ce903b7b3d60333518233 Mon Sep 17 00:00:00 2001
+From 23d11cd970ef7b9067edefcb979a4f298a44709a Mon Sep 17 00:00:00 2001
 From: "tianyu.zhou"
 Date: Mon, 26 May 2025 14:21:47 +0800
-Subject: [PATCH] Adapt for Iluvatar 0708.
+Subject: [PATCH] Adapt for Iluvatar 0709.
 
 ---
  CMakeLists.txt                                |  2 +-
@@ -24,7 +24,6 @@ Subject: [PATCH] Adapt for Iluvatar 0708.
  paddle/phi/core/utils/data_type.h             |  2 +-
  paddle/phi/kernels/funcs/affine_grid_utils.h  |  2 ++
  paddle/phi/kernels/funcs/segmented_array.h    |  8 +++++++
- paddle/phi/kernels/funcs/select_impl.cu.h     |  2 +-
  paddle/phi/kernels/funcs/softmax_impl.h       |  1 +
  paddle/phi/kernels/gpu/c_embedding_kernel.cu  |  2 +-
  paddle/phi/kernels/gpu/elementwise_grad.h     |  4 ++++
@@ -37,7 +36,7 @@ Subject: [PATCH] Adapt for Iluvatar 0708.
  paddle/phi/kernels/squeeze_kernel.cc          |  2 ++
  paddle/phi/kernels/strided_slice_kernel.cc    |  2 ++
  paddle/phi/kernels/unsqueeze_kernel.cc        |  2 ++
- 33 files changed, 116 insertions(+), 36 deletions(-)
+ 32 files changed, 115 insertions(+), 35 deletions(-)
 
 diff --git a/CMakeLists.txt b/CMakeLists.txt
 index b2c4e6a650..c51f3df1f0 100755
@@ -434,19 +433,6 @@ index dad852093e..8ebceb7074 100644
     auto ptr = allocation->ptr();
     allocations.emplace_back(std::move(allocation));
 
-diff --git a/paddle/phi/kernels/funcs/select_impl.cu.h b/paddle/phi/kernels/funcs/select_impl.cu.h
-index f96e392d94..56629c9b15 100644
---- a/paddle/phi/kernels/funcs/select_impl.cu.h
-+++ b/paddle/phi/kernels/funcs/select_impl.cu.h
-@@ -396,7 +396,7 @@ void SelectKernel(const KPDevice &dev_ctx,
-   using CT = int64_t;  // set Count_data Type
-   const int t_size = sizeof(CT);
- 
--  const phi::GPUPlace &cuda_place = dev_ctx.GetPlace();
-+  const phi::CustomPlace &cuda_place = dev_ctx.GetPlace();
-   phi::CPUPlace cpu_place = phi::CPUPlace();
- 
-   // 1.1 get stored data num of per block
 diff --git a/paddle/phi/kernels/funcs/softmax_impl.h b/paddle/phi/kernels/funcs/softmax_impl.h
 index 8f6b0fdd32..b95ea07883 100644
 --- a/paddle/phi/kernels/funcs/softmax_impl.h