@@ -633,6 +633,7 @@ struct vk_flash_attn_push_constants {
     uint32_t nev2;
     uint32_t nev3;
     uint32_t nem1;
+    uint32_t nem2;
 
     uint32_t nb01;
     uint32_t nb02;
@@ -643,7 +644,6 @@ struct vk_flash_attn_push_constants {
     uint32_t nb21;
     uint32_t nb22;
     uint32_t nb23;
-    uint32_t nb31;
 
     float scale;
     float max_bias;
@@ -658,6 +658,7 @@ struct vk_flash_attn_push_constants {
     uint32_t split_kv;
     uint32_t k_num;
 };
+static_assert(sizeof(vk_flash_attn_push_constants) <= 128, "sizeof(vk_flash_attn_push_constants) must be <= 128");
 
 struct vk_op_push_constants {
     uint32_t KX;
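The 128-byte bound is not arbitrary: the Vulkan spec only guarantees VkPhysicalDeviceLimits::maxPushConstantsSize >= 128, so a push constant block larger than that could fail to bind on a conformant device. A minimal sketch of the corresponding runtime check (the device handle and placement are illustrative, not the backend's actual init path):

VkPhysicalDeviceProperties props;
vkGetPhysicalDeviceProperties(physical_device, &props);
// Guaranteed >= 128 on any conformant implementation; the static_assert above
// keeps the struct within that portable floor.
assert(props.limits.maxPushConstantsSize >= sizeof(vk_flash_attn_push_constants));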
@@ -756,6 +757,14 @@ struct vk_op_rope_push_constants {
 struct vk_op_soft_max_push_constants {
     uint32_t KX;
     uint32_t KY;
+    uint32_t ne00;
+    uint32_t ne01;
+    uint32_t ne02;
+    uint32_t ne12;
+    uint32_t ne13;
+    uint32_t nb11;
+    uint32_t nb12;
+    uint32_t nb13;
     float scale;
     float max_bias;
     float m0;
@@ -6043,7 +6052,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
     GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
 
     const uint32_t nem1 = mask ? mask->ne[1] : 0;
-    const uint32_t nbm1 = mask ? mask->nb[1] : 0;
+    const uint32_t nem2 = mask ? mask->ne[2] : 0;
 
     const uint32_t D = neq0;
     uint32_t N = neq1;
@@ -6206,7 +6215,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
     // Try to use split_k when KV is large enough to be worth the overhead
     if (workgroups_x == 1 && shader_core_count > 0 && KV >= 512) {
         // Try to run two workgroups per SM.
-        split_k = ctx->device->shader_core_count * 2 / workgroups_y;
+        split_k = ctx->device->shader_core_count * 2 / (workgroups_y * workgroups_z);
         if (split_k > 1) {
             // Try to evenly split KV into split_k chunks, but it needs to be a multiple
             // of "align", so recompute split_k based on that.
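A worked example of why the divisor now includes workgroups_z: the dispatch covers workgroups_y (heads) times workgroups_z (batches) workgroups before splitting, so dividing by workgroups_y alone overcounts the available parallelism once ne3 > 1. Illustrative numbers, not measured values:

// Hypothetical device with 32 shader cores, 8 heads, batch of 2:
// old: split_k = 32 * 2 / 8       = 8 -> 8 * 8 * 2 = 128 workgroups (4 per SM)
// new: split_k = 32 * 2 / (8 * 2) = 4 -> 4 * 8 * 2 = 64 workgroups (2 per SM),
// matching the "two workgroups per SM" target in the comment above.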
@@ -6216,9 +6225,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
         }
     }
 
-    // Reserve space for split_k temporaries. For each split, we need to store the O matrix (D x ne1)
-    // and the per-row m and L values (ne1 rows).
-    const uint64_t split_k_size = split_k > 1 ? (D * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k : 0;
+    // Reserve space for split_k temporaries. For each split x batch, we need to store the O matrix (D x ne1)
+    // and the per-row m and L values (ne1 rows). We store all the matrices first, followed by the rows.
+    const uint64_t split_k_size = split_k > 1 ? (D * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k * ne3 : 0;
     if (split_k_size > ctx->device->max_memory_allocation_size) {
         GGML_ABORT("Requested preallocation size is too large");
     }
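A sketch of the offsets implied by the comment's layout (all O matrices packed first, then the m/L rows). The helper names and the split-major ordering within each batch are assumptions for illustration, not the shader's actual addressing:

// Byte offset of the O matrix for split k, batch i3.
size_t o_offset(uint32_t D, uint32_t ne1, uint32_t split_k,
                uint32_t k, uint32_t i3) {
    return ((size_t)i3 * split_k + k) * D * ne1 * sizeof(float);
}

// Byte offset of the m/L rows for split k, batch i3: they start after all
// split_k * ne3 matrices, two floats (m and L) per row.
size_t ml_offset(uint32_t D, uint32_t ne1, uint32_t split_k, uint32_t ne3,
                 uint32_t k, uint32_t i3) {
    const size_t matrices = (size_t)D * ne1 * sizeof(float) * split_k * ne3;
    return matrices + ((size_t)i3 * split_k + k) * ne1 * 2 * sizeof(float);
}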
@@ -6310,11 +6319,10 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
         (uint32_t)neq2, (uint32_t)neq3,
         (uint32_t)nek2, (uint32_t)nek3,
         (uint32_t)nev2, (uint32_t)nev3,
-        nem1,
+        nem1, nem2,
         q_stride, (uint32_t)nbq2, (uint32_t)nbq3,
         k_stride, (uint32_t)nbk2, (uint32_t)nbk3,
         v_stride, (uint32_t)nbv2, (uint32_t)nbv3,
-        nbm1,
         scale, max_bias, logit_softcap,
         mask != nullptr, n_head_log2, m0, m1,
         gqa_ratio, split_kv, split_k };
@@ -6337,13 +6345,13 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
             pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z });
 
         ggml_vk_sync_buffers(subctx);
-        const std::array<uint32_t, 3> pc2 = { D, (uint32_t)ne1, split_k };
+        const std::array<uint32_t, 4> pc2 = { D, (uint32_t)ne1, (uint32_t)ne3, split_k };
         ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce,
             {
                 vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
                 vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
             },
-            pc2, { (uint32_t)ne1, 1, 1 });
+            pc2, { (uint32_t)ne1, 1, (uint32_t)ne3 });
     } else {
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
             {
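With the grid now { ne1, 1, ne3 }, each reduce workgroup handles one (row, batch) pair. A host-side sketch of the standard numerically stable split-k merge such a reduce performs, using the stored per-split row maxima m[k] and sums L[k]; this models the math, not the shader source:

// For one row: rescale each split's partial O row by exp(m[k] - m_max),
// accumulate, and normalize by the combined sum.
float m_max = -INFINITY;
for (uint32_t k = 0; k < split_k; ++k) m_max = std::max(m_max, m[k]);

float L_total = 0.0f;
for (uint32_t k = 0; k < split_k; ++k) L_total += expf(m[k] - m_max) * L[k];

for (uint32_t d = 0; d < D; ++d) {
    float acc = 0.0f;
    for (uint32_t k = 0; k < split_k; ++k) acc += expf(m[k] - m_max) * O[k][d];
    out[d] = acc / L_total;
}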
@@ -7669,7 +7677,13 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
     const uint32_t nrows_x = (uint32_t)ggml_nrows(src0);
     const uint32_t nrows_y = (uint32_t)src0->ne[1];
 
-    const uint32_t n_head_kv = nrows_x/nrows_y;
+    const uint32_t ne12 = src1 ? (uint32_t)(src1->ne[2]) : 0u;
+    const uint32_t ne13 = src1 ? (uint32_t)(src1->ne[3]) : 0u;
+    const uint32_t nb11 = src1 ? (uint32_t)(src1->nb[1] / src1->nb[0]) : 0u;
+    const uint32_t nb12 = src1 ? (uint32_t)(src1->nb[2] / src1->nb[0]) : 0u;
+    const uint32_t nb13 = src1 ? (uint32_t)(src1->nb[3] / src1->nb[0]) : 0u;
+
+    const uint32_t n_head_kv = src0->ne[2];
     const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
 
     const float m0 = powf(2.0f, -(max_bias) / n_head_log2);
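Note the nb1x values convert ggml's byte strides to element strides by dividing by nb[0], since the shader indexes the mask as a typed array. A worked example, with illustrative sizes for a contiguous F32 mask:

// src1->ne = {512, 64, 8, 2} (F32) => src1->nb = {4, 2048, 131072, 1048576} bytes
// nb11 = 2048 / 4    = 512     elements (one row)
// nb12 = 131072 / 4  = 32768   elements (one src1->ne[2] plane)
// nb13 = 1048576 / 4 = 262144  elements (one batch)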
@@ -7678,6 +7692,9 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
     ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX, {
         ncols,
         src1 != nullptr ? nrows_y : (uint32_t)0,
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],
+        ne12, ne13,
+        nb11, nb12, nb13,
         scale, max_bias,
         m0, m1,
         n_head_log2,
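These push constants let the shader broadcast the mask over src0's higher dims following ggml's usual semantics: a size-1 dim in src1 repeats across the corresponding src0 dim. A host-side mirror of the lookup the shader would perform (the function name and the exact index mapping are illustrative assumptions; the mask path is only taken when src1 != nullptr):

// Element offset into src1 for the src0 row at (i01, i02, i03).
uint32_t mask_row_offset(uint32_t i01, uint32_t i02, uint32_t i03,
                         uint32_t ne12, uint32_t ne13,
                         uint32_t nb11, uint32_t nb12, uint32_t nb13) {
    const uint32_t i11 = i01;        // rows map 1:1
    const uint32_t i12 = i02 % ne12; // broadcast across src0->ne[2]
    const uint32_t i13 = i03 % ne13; // broadcast across src0->ne[3]
    return i11 * nb11 + i12 * nb12 + i13 * nb13;
}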
@@ -10251,11 +10268,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
         if (op->src[3] && op->src[3]->type != GGML_TYPE_F16) {
             return false;
         }
-        // TODO: support broadcast
-        // ref: https://github.com/ggml-org/llama.cpp/pull/14435
-        if (op->src[0]->ne[3] != 1) {
-            return false;
-        }
         // It's straightforward to support different K/V dequant, but would
         // significantly increase the number of pipelines
         if (op->src[1]->type != op->src[2]->type) {
@@ -10416,13 +10428,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
         case GGML_OP_DIAG_MASK_INF:
             return true;
         case GGML_OP_SOFT_MAX:
-            // TODO: support batching
-            if (op->src[0]->ne[3] != 1) {
-                return false;
-            }
-            // TODO: support broadcast
-            // ref: https://github.com/ggml-org/llama.cpp/pull/14435
-            return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1);
         case GGML_OP_SOFT_MAX_BACK:
         case GGML_OP_ARGSORT:
         case GGML_OP_SUM: