PaddlePaddle · l1cacheDell · Mar 28, 2025 · Apr 2, 2025 · Apr 16, 2025 · Apr 16, 2025
diff --git a/csrc/gpu/append_attn/append_attention_c16_impl.cuh b/csrc/gpu/append_attn/append_attention_c16_impl.cuh
@@ -1429,7 +1429,6 @@ void MultiQueryAppendAttention(
           static_cast<float *>(tmp_d->ptr()),
           reinterpret_cast<OUT_NV_TYPE *>(out->data<OutT>()),
           speculate_max_draft_token_num);
-
       // merge
       constexpr int vec_size = num_elems_per_128b<NV_TYPE>();
       if (is_decoder) {

diff --git a/csrc/gpu/append_attn/get_block_shape_and_split_kv_block.cu b/csrc/gpu/append_attn/get_block_shape_and_split_kv_block.cu
@@ -219,7 +219,6 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
   );
   auto max_len_kv_cpu =
       max_len_kv.copy_to(paddle::CPUPlace(), false);
-
   // decoder
   int max_dec_len_this_time_data = max_dec_len_this_time.data<int>()[0];
   if (max_dec_len_this_time_data > 0) {

diff --git a/csrc/gpu/append_attn/utils.cuh b/csrc/gpu/append_attn/utils.cuh
@@ -284,6 +284,16 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
       __VA_ARGS__                                       \
       break;                                            \
     }                                                   \
+    case 256: {                                         \
+      constexpr size_t HEAD_DIM = 256;                  \
+      __VA_ARGS__                                       \
+      break;                                            \
+    }                                                   \
+    case 512: {                                         \
+      constexpr size_t HEAD_DIM = 512;                  \
+      __VA_ARGS__                                       \
+      break;                                            \
+    }                                                   \
     default: {                                          \
       PD_THROW("not support the head_dim: ", head_dim); \
     }                                                   \
@@ -377,6 +387,9 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
   } else if (group_size == 8) {                              \
     constexpr size_t GROUP_SIZE = 8;                         \
     __VA_ARGS__                                              \
+  } else if (group_size == 64) {                             \
+    constexpr size_t GROUP_SIZE = 64;                        \
+    __VA_ARGS__                                              \
   } else if (group_size == 16) {                             \
     constexpr size_t GROUP_SIZE = 16;                        \
     __VA_ARGS__                                              \