[DSK] Implement mla use matrix-absorption #9875

Open

wants to merge 39 commits into base: develop

Changes from all commits (39 commits)
7ec7f02
mla python part
yuanlehome Feb 17, 2025
1e05b16
absorb mla optimizer
lizhenyun01 Feb 17, 2025
3e1fbe5
Merge pull request #13 from lizhenyun01/deepseek_mla
yuanlehome Feb 17, 2025
22e4e3d
fix c8/c4 dtype in mla
lizhenyun01 Feb 17, 2025
90a158a
Merge pull request #14 from lizhenyun01/deepseek_mla
yuanlehome Feb 17, 2025
1bd78d0
add weight_only part 1
yuanlehome Feb 17, 2025
0aab4a5
dy can run
yuanlehome Feb 17, 2025
1bb8ef8
static can run
yuanlehome Feb 17, 2025
2d68bea
nothing
yuanlehome Feb 17, 2025
ebf1a76
refine network
yuanlehome Feb 18, 2025
871f2e6
fix write cache
lizhenyun01 Feb 18, 2025
8a1e982
Merge pull request #15 from lizhenyun01/deepseek_mla
yuanlehome Feb 18, 2025
549f709
update
yuanlehome Feb 18, 2025
1165609
Merge branch 'deepseek-v3-mla' of https://github.com/yuanlehome/Paddl…
yuanlehome Feb 18, 2025
7dc8c55
add pd_throw
yuanlehome Feb 18, 2025
bbd0051
add pd_throw
yuanlehome Feb 18, 2025
eabd751
fix mla_atn
lizhenyun01 Feb 18, 2025
820bb38
Merge branch 'deepseek-v3-mla' into deepseek_mla
yuanlehome Feb 18, 2025
71e4a7d
Merge pull request #16 from lizhenyun01/deepseek_mla
yuanlehome Feb 18, 2025
d0f40f6
fix mla
lizhenyun01 Feb 19, 2025
657d67d
Merge pull request #17 from lizhenyun01/deepseek_mla
yuanlehome Feb 19, 2025
58a020b
update network
yuanlehome Feb 19, 2025
d060c98
weight only support group wise
yuanlehome Feb 19, 2025
a11bb32
fix MLA
Feb 20, 2025
1ecbb23
Merge pull request #18 from lizhenyun01/deepseek-v3-mla
yuanlehome Feb 20, 2025
2a96da0
update split kv_b
yuanlehome Feb 20, 2025
5e44eeb
fix
yuanlehome Feb 20, 2025
4ed7180
refine if
yuanlehome Feb 21, 2025
184765e
half support new absorb
yuanlehome Feb 21, 2025
9e2ea0e
weight only support new absorb
yuanlehome Feb 21, 2025
4d90d61
fix
yuanlehome Feb 21, 2025
4f1d25c
fix bf16
yuanlehome Feb 22, 2025
8673095
optimize mla
lizhenyun01 Feb 22, 2025
680ed55
Merge pull request #19 from lizhenyun01/deepseek-v3-mla
yuanlehome Feb 22, 2025
5fcaf18
set kv_cache's bsz=1
lizhenyun01 Feb 22, 2025
b425f74
Merge pull request #20 from lizhenyun01/deepseek-v3-mla
yuanlehome Feb 22, 2025
d824c2a
delete max_batch_size
yuanlehome Feb 22, 2025
3102788
refine if
yuanlehome Feb 24, 2025
2fb3378
not_need_stop to cpu
yuanlehome Feb 24, 2025
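
For context on the title: "matrix absorption" in MLA folds the key up-projection W_UK into the query and defers the value up-projection W_UV until after the attention-weighted sum, so attention can run directly over the compressed latent KV cache instead of the materialized per-head keys and values. Below is a minimal NumPy sketch of that identity; all names, shapes, and the omitted 1/sqrt(d) scaling and RoPE terms are illustrative assumptions, not part of this PR's API.

import numpy as np

d_c, d_head, seq = 512, 128, 16                       # latent dim, per-head dim, cached tokens
rng = np.random.default_rng(0)
W_UK = rng.standard_normal((d_c, d_head)) / np.sqrt(d_c)  # key up-projection (one head)
W_UV = rng.standard_normal((d_c, d_head)) / np.sqrt(d_c)  # value up-projection (one head)
c_kv = rng.standard_normal((seq, d_c))                # compressed latent KV cache
q_nope = rng.standard_normal(d_head)                  # non-RoPE part of one query head

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

# Naive path: materialize per-head keys/values, attend in head space.
k, v = c_kv @ W_UK, c_kv @ W_UV                       # (seq, d_head) each
o_naive = softmax(k @ q_nope) @ v                     # (d_head,)

# Absorbed path: fold W_UK into the query and apply W_UV after the weighted
# sum, so the attention itself runs over the compact latent cache.
q_latent = W_UK @ q_nope                              # (d_c,)
o_absorb = (softmax(c_kv @ q_latent) @ c_kv) @ W_UV   # (d_head,)

assert np.allclose(o_naive, o_absorb)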
34 changes: 25 additions & 9 deletions csrc/gpu/append_attention.cu
@@ -62,7 +62,8 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
const float out_linear_in_scale,
const int speculate_max_draft_token_num,
const bool causal,
const bool speculate_decoder) {
const bool speculate_decoder,
const bool mla_use_absorb) {
typedef PDTraits<D> traits_;
typedef typename traits_::DataType DataType_;
typedef typename traits_::data_t data_t;
@@ -144,6 +145,7 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
kv_num_blocks_data,
max_input_length,
use_neox_rotary_style,
mla_use_absorb,
main_stream,
&qkv_out,
const_cast<paddle::Tensor*>(&key_cache),
@@ -171,6 +173,7 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
kv_num_blocks_data,
max_input_length,
use_neox_rotary_style,
mla_use_absorb,
main_stream,
&qkv_out,
const_cast<paddle::Tensor*>(&key_cache),
@@ -212,6 +215,7 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
causal,
false,
true,
mla_use_absorb,
main_stream,
&fmha_out);
break;
@@ -250,6 +254,7 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
causal,
false,
true,
mla_use_absorb,
main_stream,
&fmha_out);
break;
@@ -293,12 +298,13 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
causal,
false,
true,
mla_use_absorb,
main_stream,
&fmha_out);
}
}

if (max_dec_len_this_time_data > 0) {
if (!mla_use_absorb && max_dec_len_this_time_data > 0) {
cudaStream_t exec_stream;
if (max_enc_len_this_time_data > 0) {
cudaStreamWaitEvent(decoder_stream, main_event);
@@ -440,6 +446,7 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
causal,
!speculate_decoder,
!speculate_decoder,
mla_use_absorb,
exec_stream,
&fmha_out);
break;
@@ -478,6 +485,7 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
causal,
!speculate_decoder,
!speculate_decoder,
mla_use_absorb,
exec_stream,
&fmha_out);
break;
@@ -522,6 +530,7 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
causal,
!speculate_decoder,
!speculate_decoder,
mla_use_absorb,
exec_stream,
&fmha_out);
}
@@ -578,7 +587,8 @@ std::vector<paddle::Tensor> AppendAttention(
const float out_linear_in_scale,
const int speculate_max_draft_token_num,
const bool causal,
const bool speculate_decoder) {
const bool speculate_decoder,
const bool mla_use_absorb) {
AppendAttnMetaData meta_data;

const auto& qkv_dims = qkv.dims();
@@ -641,7 +651,8 @@ std::vector<paddle::Tensor> AppendAttention(
out_linear_in_scale,
speculate_max_draft_token_num,
causal,
speculate_decoder);
speculate_decoder,
mla_use_absorb);
}
case paddle::DataType::BFLOAT16: {
return AppendAttentionKernel<paddle::DataType::BFLOAT16>(
@@ -688,7 +699,8 @@ std::vector<paddle::Tensor> AppendAttention(
out_linear_in_scale,
speculate_max_draft_token_num,
causal,
speculate_decoder);
speculate_decoder,
mla_use_absorb);
}
case paddle::DataType::INT32: {
if (compute_dtype == "bf16") {
@@ -736,7 +748,8 @@ std::vector<paddle::Tensor> AppendAttention(
out_linear_in_scale,
speculate_max_draft_token_num,
causal,
speculate_decoder);
speculate_decoder,
mla_use_absorb);
} else if (compute_dtype == "fp16") {
return AppendAttentionKernel<paddle::DataType::FLOAT16>(
meta_data,
@@ -782,7 +795,8 @@ std::vector<paddle::Tensor> AppendAttention(
out_linear_in_scale,
speculate_max_draft_token_num,
causal,
speculate_decoder);
speculate_decoder,
mla_use_absorb);
} else {
PD_THROW("Only supported attr of compute_dtype in ['fp16', 'bf16'].");
break;
@@ -886,7 +900,8 @@ std::vector<paddle::DataType> AppendAttentionInferDtype(
const float out_linear_in_scale,
const int speculate_max_draft_token_num,
const bool causal,
const bool speculate_decoder) {
const bool speculate_decoder,
const bool mla_use_absorb) {
if (compute_dtype == "bf16") {
if (out_linear_in_scale > 0.0) {
if (fabs(quant_max_bound - 127.0f) < 0.000001) {
@@ -963,7 +978,8 @@ PD_BUILD_OP(append_attention)
"out_linear_in_scale: float",
"speculate_max_draft_token_num: int",
"causal: bool",
"speculate_decoder: bool"})
"speculate_decoder: bool",
"mla_use_absorb: bool"})
.SetKernelFn(PD_KERNEL(AppendAttention))
.SetInferShapeFn(PD_INFER_SHAPE(AppendAttentionInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(AppendAttentionInferDtype));
16 changes: 11 additions & 5 deletions csrc/gpu/append_attn/append_attention_c16_impl.cuh
@@ -58,7 +58,8 @@ __global__ void multi_query_append_attention_kernel(
float *__restrict__ tmp_m, // [token_num, num_chunks, num_heads]
float *__restrict__ tmp_d, // [token_num, num_chunks, num_heads]
OutT *__restrict__ out,
const int speculate_max_draft_token_num = 5) {
const int speculate_max_draft_token_num = 5,
const bool mla_use_absorb = false) {
constexpr uint32_t num_vecs_per_head_qk =
HEAD_DIM_QK / num_elems_per_128b<T>();
constexpr uint32_t num_vecs_per_head_v = HEAD_DIM_V / num_elems_per_128b<T>();
@@ -221,7 +222,7 @@ __global__ void multi_query_append_attention_kernel(
wid * 4 + tid / 8, tid % 8);

uint32_t kv_idx_base = chunk_start;
int block_id = __ldg(&block_table_now[kv_idx_base / BLOCK_SIZE]);
int block_id = mla_use_absorb ? kv_idx_base / BLOCK_SIZE : __ldg(&block_table_now[kv_idx_base / BLOCK_SIZE]);
const uint32_t const_offset_k = kv_head_idx * k_h_stride +
(wid * 4 + tid / 8) * k_b_stride +
tid % 8 * num_elems_per_128b<T>();
@@ -327,7 +328,7 @@ __global__ void multi_query_append_attention_kernel(
__syncthreads();

kv_idx_base += num_frags_z * 16;
block_id = __ldg(&block_table_now[kv_idx_base / BLOCK_SIZE]);
block_id = mla_use_absorb ? kv_idx_base / BLOCK_SIZE : __ldg(&block_table_now[kv_idx_base / BLOCK_SIZE]);
if (block_id < 0) {
block_id = 0;
}
@@ -1023,6 +1024,7 @@ void MultiQueryAppendAttention(
const float in_scale,
const int speculate_max_draft_token_num,
const bool is_decoder,
const bool mla_use_absorb,
cudaStream_t &stream,
paddle::Tensor *out) {
using NV_TYPE = typename cascade_attn_type_traits<T>::type;
@@ -1133,7 +1135,8 @@ void MultiQueryAppendAttention(
nullptr,
nullptr,
reinterpret_cast<OUT_NV_TYPE *>(out->data<OutT>()),
speculate_max_draft_token_num);
speculate_max_draft_token_num,
mla_use_absorb);

} else {
phi::Allocator::AllocationPtr tmp_workspace, tmp_m, tmp_d;
@@ -1191,7 +1194,8 @@ void MultiQueryAppendAttention(
static_cast<float *>(tmp_m->ptr()),
static_cast<float *>(tmp_d->ptr()),
reinterpret_cast<OUT_NV_TYPE *>(out->data<OutT>()),
speculate_max_draft_token_num);
speculate_max_draft_token_num,
mla_use_absorb);
// merge
constexpr int vec_size = num_elems_per_128b<NV_TYPE>();
if (is_decoder) {
@@ -1549,6 +1553,7 @@ void CascadeAppendAttentionC16Kernel(
const bool causal,
const bool is_decoder,
const bool enable_prefill,
const bool mla_use_absorb,
cudaStream_t &stream,
paddle::Tensor *out) {
const auto token_num = meta_data.token_nums;
@@ -1613,6 +1618,7 @@ void CascadeAppendAttentionC16Kernel(
in_scale,
speculate_max_draft_token_num,
is_decoder,
mla_use_absorb,
stream,
out);
})})})})})})})
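
In the kernel hunks above, the block-table lookup is bypassed when mla_use_absorb is set: block_id becomes kv_idx_base / BLOCK_SIZE instead of __ldg(&block_table_now[kv_idx_base / BLOCK_SIZE]). A plausible reading, consistent with the "set kv_cache's bsz=1" commit, is that the absorbed path stores the latent KV cache in one contiguous buffer, so logical and physical block ids coincide. A hypothetical Python sketch of the two paths (names are illustrative, not this PR's API):

def physical_block_id(kv_idx_base, block_size, block_table_now, mla_use_absorb):
    if mla_use_absorb:
        # Absorbed-MLA path (assumed contiguous, bsz = 1 cache): no indirection needed.
        return kv_idx_base // block_size
    # Default paged path: translate the logical block through the per-sequence block table.
    return block_table_now[kv_idx_base // block_size]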
3 changes: 3 additions & 0 deletions csrc/gpu/append_attn/append_attention_kernel.h
@@ -57,6 +57,7 @@ void CascadeAppendAttentionC16Kernel(
const bool causal,
const bool is_decoder,
const bool enable_prefill,
const bool mla_use_absorb,
cudaStream_t& stream,
paddle::Tensor* out);

@@ -190,6 +191,7 @@ void CascadeAppendAttentionKernel(
const bool causal,
const bool is_decoder,
const bool enable_prefill,
const bool mla_use_absorb,
cudaStream_t& stream,
paddle::Tensor* out) {
if (cache_quant_type_str == "none") {
@@ -224,6 +226,7 @@ void CascadeAppendAttentionKernel(
causal,
is_decoder,
enable_prefill,
mla_use_absorb,
stream,
out);
} else if (cache_quant_type_str == "cache_int8") {