Commit 1bbb143

ikawrakow and Iwan Kawrakow authored
Allow q8_0 KV cache for head size 256 (#330)
* Allow q8_0 KV cache for head size 256
* We need also these

Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent 05dbbea · commit 1bbb143
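In short: this commit extends the CUDA backend's quantized KV-cache support to models with attention head size 256 (for example, some Gemma variants). The change has two parts: ggml_backend_cuda_supports_op now reports FLASH_ATTN_EXT as supported for this shape with q8_0 K/V, and the matching flash-attention vector kernels are dispatched and instantiated. Assuming the usual llama.cpp-style flags, which this repository also provides, such a cache would be requested at run time with -fa -ctk q8_0 -ctv q8_0.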

File tree: 4 files changed, +19 −0 lines changed


ggml/src/ggml-cuda.cu

Lines changed: 5 additions & 0 deletions
@@ -3578,6 +3578,11 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
             if (op->src[0]->ne[0] == 128) {
                 return true;
             }
+            if (op->src[1]->ne[0] == 256 && op->src[2]->ne[0] == 256 &&
+                (op->src[1]->type == GGML_TYPE_F16 || op->src[1]->type == GGML_TYPE_Q8_0) &&
+                (op->src[2]->type == GGML_TYPE_F16 || op->src[2]->type == GGML_TYPE_Q8_0)) {
+                return true;
+            }
             if (op->src[1]->ne[0] == 192 && op->src[2]->ne[0] == 128) {
                 return (op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) ||
                        (op->src[1]->type == GGML_TYPE_Q8_0 && op->src[2]->type == GGML_TYPE_Q8_0);

ggml/src/ggml-cuda/fattn.cu

Lines changed: 4 additions & 0 deletions
@@ -248,6 +248,7 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, gg
     FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)

     FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
+    FATTN_VEC_F16_CASE(256, GGML_TYPE_Q8_0,GGML_TYPE_Q8_0)

     FATTN_VEC_F16_CASE(128, GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_NL)
     FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_IQ4_NL)

@@ -265,6 +266,7 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, gg
     FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
     FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
     FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
+    FATTN_VEC_F16_CASE(256, GGML_TYPE_Q8_0,GGML_TYPE_Q8_0)

     FATTN_VEC_F16_CASE(128, GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_NL)
     FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_IQ4_NL)

@@ -347,6 +349,7 @@ static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, gg
     FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)

     FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
+    FATTN_VEC_F32_CASE(256, GGML_TYPE_Q8_0,GGML_TYPE_Q8_0)

     FATTN_VEC_F32_CASE_DKDV(192, 128, GGML_TYPE_F16, GGML_TYPE_F16)
     FATTN_VEC_F32_CASE_DKDV(192, 128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)

@@ -358,6 +361,7 @@ static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, gg
     FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
     FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
     FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
+    FATTN_VEC_F32_CASE(256, GGML_TYPE_Q8_0,GGML_TYPE_Q8_0)

     FATTN_VEC_F32_CASE_DKDV(192, 128, GGML_TYPE_F16, GGML_TYPE_F16)
     FATTN_VEC_F32_CASE_DKDV(192, 128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-q8_0-q8_0.cu

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-q8_0-q8_0.cu

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
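Each of these two generated files provides the explicit template instantiation that the new dispatch cases link against; keeping one instantiation per (head size, K type, V type) combination in its own translation unit, as generate_cu_files.py does, bounds per-file compile time and memory. The "We need also these" in the commit message refers to these files: without them the new cases would compile but fail at link time. For reference, in upstream llama.cpp the DECL macro expands to roughly the following explicit instantiation (the definition in this repository may differ):

// Sketch based on upstream llama.cpp; the exact definition in this
// repository may differ. This forces the compiler to emit the kernel
// template for the given head size and K/V types in this file.
#define DECL_FATTN_VEC_F16_CASE(D, type_K, type_V)                           \
    template void ggml_cuda_flash_attn_ext_vec_f16_case                      \
    <D, type_K, type_V>(ggml_backend_cuda_context & ctx, ggml_tensor * dst)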
