diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 77af0e7fb6a..e795f7ae47b 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -427,7 +427,16 @@ extern "C" { // GGML_TYPE_IQ4_NL_4_8 = 37, // GGML_TYPE_IQ4_NL_8_8 = 38, GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block) - GGML_TYPE_COUNT = 40, + // IDs must match ik_llama.cpp for GGUF interoperability + // IDs 40-136 reserved + GGML_TYPE_IQ2_K = 137, // 2.375 bpw + GGML_TYPE_IQ3_K = 138, // 3.44 bpw + GGML_TYPE_IQ4_K = 139, // 4.5 bpw + GGML_TYPE_IQ5_K = 140, // 5.5 bpw + GGML_TYPE_IQ6_K = 141, // 6.625 bpw + // IDs 142-156 reserved for future ik_llama.cpp types; bump GGML_TYPE_COUNT when one is added + + GGML_TYPE_COUNT = 142, }; // precision diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 93ab7ea446e..ae8a1afc36e 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -427,6 +427,62 @@ typedef struct { } block_iq4_xs; static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding"); +// IQ2_K: 2-bit K quantization (GGML_TYPE_IQ2_K = 137) +// 2.375 bpw (76 bytes / 256 values * 8 = 2.375) +typedef struct { + ggml_half d; // 2 bytes - per-block scale + uint16_t extra; // 2 bytes - extra info + uint8_t scales[QK_K/32]; // 8 bytes - per-32 scales + uint8_t qs[QK_K/4]; // 64 bytes - 2-bit quantized values +} block_iq2_k; +static_assert(sizeof(block_iq2_k) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/32 + QK_K/4, "wrong iq2_k block size/padding"); + +// IQ3_K: 3-bit K quantization (GGML_TYPE_IQ3_K = 138) +// 3.44 bpw (110 bytes / 256 values * 8 = 3.4375) +typedef struct { + ggml_half d; // 2 bytes - per-block scale + uint16_t extra; // 2 bytes - extra info + uint16_t scales_h; // 2 bytes - high bits of scales + uint8_t scales_l[QK_K/32]; // 8 bytes - low bits of scales + uint8_t qs[QK_K/4]; // 64 bytes - low 2 bits + uint8_t qh[QK_K/8]; // 32 bytes - high 1 bit +} block_iq3_k; +static_assert(sizeof(block_iq3_k) == sizeof(ggml_half) + 2*sizeof(uint16_t) + QK_K/32 + QK_K/4 + QK_K/8, 
"wrong iq3_k block size/padding"); + +// IQ4_K: 4-bit K quantization (GGML_TYPE_IQ4_K = 139) +// 4.5 bpw (144 bytes / 256 values * 8 = 4.5) +typedef struct { + ggml_half d; // 2 bytes - per-block scale + uint16_t extra; // 2 bytes - extra info + uint8_t scales_h[QK_K/64]; // 4 bytes - high bits of scales + uint8_t scales_l[QK_K/32]; // 8 bytes - low bits of scales + uint8_t qs[QK_K/2]; // 128 bytes - 4-bit quantized values +} block_iq4_k; +static_assert(sizeof(block_iq4_k) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/2 + 3*QK_K/64, "wrong iq4_k block size/padding"); + +// IQ5_K: 5-bit K quantization (GGML_TYPE_IQ5_K = 140) +// 5.5 bpw (176 bytes / 256 values * 8 = 5.5) +typedef struct { + ggml_half d; // 2 bytes - per-block scale + uint16_t extra; // 2 bytes - extra info + uint8_t scales_h[QK_K/64]; // 4 bytes - high bits of scales + uint8_t scales_l[QK_K/32]; // 8 bytes - low bits of scales + uint8_t qs[QK_K/2]; // 128 bytes - low 4 bits + uint8_t qh[QK_K/8]; // 32 bytes - high 1 bit +} block_iq5_k; +static_assert(sizeof(block_iq5_k) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/2 + QK_K/8 + 3*QK_K/64, "wrong iq5_k block size/padding"); + +// IQ6_K: 6-bit K quantization (GGML_TYPE_IQ6_K = 141) +// 6.625 bpw (212 bytes / 256 values * 8 = 6.625) +typedef struct { + ggml_half d; // 2 bytes - per-block scale + uint16_t extra; // 2 bytes - extra info + int8_t scales[QK_K/16]; // 16 bytes - signed scales + uint8_t qs[QK_K/2]; // 128 bytes - low 4 bits + uint8_t qh[QK_K/4]; // 64 bytes - high 2 bits +} block_iq6_k; +static_assert(sizeof(block_iq6_k) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/2 + QK_K/4 + QK_K/16, "wrong iq6_k block size/padding"); + #endif // GGML_COMMON_DECL #endif // GGML_COMMON_DECL @@ -1089,6 +1145,46 @@ GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16) -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113, GGML_TABLE_END() +// IQ2_K lookup table +GGML_TABLE_BEGIN(int8_t, iq2nl_values, 8) + -31, -13, 1, 17, -26, -8, 6, 22 
+GGML_TABLE_END() + +GGML_TABLE_BEGIN(uint16_t, iq2kl_values, 32) + 0xe9c1, 0x0dc1, 0xc1d8, 0xf6d8, 0x0dd8, 0x2fd8, 0xd8e9, 0xe9e9, 0x01e9, 0x0de9, 0x1ce9, 0xc1f6, 0x01f6, 0x0df6, 0x2ff6, 0xe901, + 0xf601, 0x0101, 0x0d01, 0x1c01, 0xd80d, 0xe90d, 0xf60d, 0x010d, 0x0d0d, 0xc11c, 0xe91c, 0x011c, 0x1c1c, 0x2f1c, 0xe92f, 0x0d2f, +GGML_TABLE_END() + +// IQ3_K lookup table +GGML_TABLE_BEGIN(int8_t, iq3nl_values, 16) + -63, -40, -23, -10, 1, 13, 28, 47, + -59, -36, -19, -6, 5, 17, 32, 51, +GGML_TABLE_END() + +// IQ4_K lookup table +GGML_TABLE_BEGIN(int8_t, iq4k_values, 32) + -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113, + -123, -100, -79, -61, -45, -31, -18, -6, 5, 17, 29, 42, 57, 73, 93, 117 +GGML_TABLE_END() + +// IQ5_K lookup table +GGML_TABLE_BEGIN(int8_t, iq5nl_values, 64) + -126, -114, -103, -92, -83, -74, -65, -57, -50, -43, -36, -30, -24, -18, -12, -6, -1, 5, 11, 17, 23, 29, 36, 43, 51, 59, 68, 77, 87, 97, 109, 121, + -124, -112, -101, -90, -81, -72, -63, -55, -48, -41, -34, -28, -22, -16, -10, -4, 1, 7, 13, 19, 25, 31, 38, 45, 53, 61, 70, 79, 89, 99, 111, 123, +GGML_TABLE_END() + +// IQ6_K lookup table +GGML_TABLE_BEGIN(int8_t, iq6nl_values, 128) + -127, -121, -115, -109, -104, -98, -93, -88, -84, -79, -74, -70, -66, -62, -58, -54, + -51, -47, -44, -40, -37, -34, -31, -28, -25, -22, -19, -16, -13, -11, -8, -5, + -2, 0, 3, 6, 9, 12, 14, 17, 20, 23, 27, 30, 33, 36, 40, 44, + 47, 51, 55, 59, 63, 68, 72, 77, 82, 87, 92, 98, 103, 109, 115, 121, + -126, -120, -114, -108, -103, -97, -92, -87, -83, -78, -73, -69, -65, -61, -57, -53, + -50, -46, -43, -39, -36, -33, -30, -27, -24, -21, -18, -15, -12, -10, -7, -4, + -1, 1, 4, 7, 10, 13, 15, 18, 21, 24, 28, 31, 34, 37, 41, 45, + 48, 52, 56, 60, 64, 69, 73, 78, 83, 88, 93, 99, 104, 110, 116, 122, +GGML_TABLE_END() + // e2m1 values (doubled) // ref: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf GGML_TABLE_BEGIN(int8_t, kvalues_mxfp4, 16) diff --git 
a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 64eb01a4e18..db2df122055 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -387,6 +387,36 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { [GGML_TYPE_I32] = { .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32, }, + [GGML_TYPE_IQ2_K] = { + .from_float = quantize_row_iq2_k, + .vec_dot = ggml_vec_dot_iq2_k_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_IQ3_K] = { + .from_float = quantize_row_iq3_k, + .vec_dot = ggml_vec_dot_iq3_k_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_IQ4_K] = { + .from_float = quantize_row_iq4_k, + .vec_dot = ggml_vec_dot_iq4_k_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_IQ5_K] = { + .from_float = quantize_row_iq5_k, + .vec_dot = ggml_vec_dot_iq5_k_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_IQ6_K] = { + .from_float = quantize_row_iq6_k, + .vec_dot = ggml_vec_dot_iq6_k_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, }; const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) { diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index b7a70e06f1d..58dec5e13de 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -4847,6 +4847,11 @@ void ggml_compute_forward_get_rows( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ2_K: + case GGML_TYPE_IQ3_K: + case GGML_TYPE_IQ4_K: + case GGML_TYPE_IQ5_K: + case GGML_TYPE_IQ6_K: { ggml_compute_forward_get_rows_q(params, dst); } break; @@ -5572,6 +5577,11 @@ void ggml_compute_forward_clamp( case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: case GGML_TYPE_Q8_K: + case GGML_TYPE_IQ2_K: + case GGML_TYPE_IQ3_K: + case GGML_TYPE_IQ4_K: + case GGML_TYPE_IQ5_K: + case GGML_TYPE_IQ6_K: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: diff --git 
a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index 365cb36d2d7..51aa58bd354 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -1,3 +1,7 @@ +// =========================== IQ*_K quantization types (Iwan Kawrakow) +// Ported from ik_llama.cpp - https://github.com/ikawrakow/ik_llama.cpp +// MIT License - Copyright (C) 2024 Iwan Kawrakow + #define GGML_COMMON_IMPL_C #include "ggml-common.h" @@ -110,6 +114,28 @@ void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRI quantize_row_q8_K_ref(x, y, k); } +//===================================== IQ*_K quants ====================================== + +void quantize_row_iq2_k(const float * x, void * vy, int64_t k) { + quantize_row_iq2_k_ref(x, vy, k); +} + +void quantize_row_iq3_k(const float * x, void * vy, int64_t k) { + quantize_row_iq3_k_ref(x, vy, k); +} + +void quantize_row_iq4_k(const float * x, void * vy, int64_t k) { + quantize_row_iq4_k_ref(x, vy, k); +} + +void quantize_row_iq5_k(const float * x, void * vy, int64_t k) { + quantize_row_iq5_k_ref(x, vy, k); +} + +void quantize_row_iq6_k(const float * x, void * vy, int64_t k) { + quantize_row_iq6_k_ref(x, vy, k); +} + //===================================== Dot products ================================= void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { @@ -1191,3 +1217,255 @@ void quantize_row_iq4_xs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, assert(k % QK_K == 0); quantize_iq4_xs(x, y, 1, k, NULL); } + +void ggml_vec_dot_iq2_k_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(bs); + UNUSED(bx); + UNUSED(by); + UNUSED(nrc); + + const block_iq2_k * GGML_RESTRICT x = (const block_iq2_k *)vx; + const block_q8_K * GGML_RESTRICT y = 
(const block_q8_K *)vy; + + const int nb = n / QK_K; + + float sumf = 0; + for (int ibl = 0; ibl < nb; ++ibl) { + const float d = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + const uint8_t * qs = x[ibl].qs; + const int8_t * qy = y[ibl].qs; + uint16_t extra = x[ibl].extra; + + float sumi = 0; + int shift = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + // Two 16-element sub-blocks per 32-element block + const int ls1 = (x[ibl].scales[ib32] & 0xf) - 8; + const int ls2 = (x[ibl].scales[ib32] >> 4) - 8; + const int8_t * values1 = extra & 1 ? iq2nl_values + 4 : iq2nl_values; + const int8_t * values2 = extra & 2 ? iq2nl_values + 4 : iq2nl_values; + extra >>= 2; + + int suml1 = 0, suml2 = 0; + for (int j = 0; j < 16; ++j) { + suml1 += qy[j+ 0] * values1[(qs[j+ 0] >> shift) & 3]; + suml2 += qy[j+16] * values2[(qs[j+16] >> shift) & 3]; + } + sumi += ls1 * suml1 + ls2 * suml2; + qy += 32; + shift += 2; + if (shift == 8) { qs += 32; shift = 0; } + } + sumf += d * sumi; + } + *s = sumf; +} + +void ggml_vec_dot_iq3_k_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(bs); + UNUSED(bx); + UNUSED(by); + UNUSED(nrc); + + const block_iq3_k * GGML_RESTRICT x = (const block_iq3_k *)vx; + const block_q8_K * GGML_RESTRICT y = (const block_q8_K *)vy; + + const int nb = n / QK_K; + + float sumf = 0; + for (int ibl = 0; ibl < nb; ++ibl) { + const float d = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + const uint8_t * qs = x[ibl].qs; + const uint8_t * qh = x[ibl].qh; + const int8_t * qy = y[ibl].qs; + uint16_t sh = x[ibl].scales_h; + uint16_t extra = x[ibl].extra; + + float sumi = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + float dl1 = (2*(x[ibl].scales_l[ib32] & 0xf) + 1) * ((sh & 1) ? -1 : 1); + float dl2 = (2*(x[ibl].scales_l[ib32] >> 4) + 1) * ((sh & 2) ? -1 : 1); + sh >>= 2; + const int8_t * values1 = extra & 1 ? 
iq3nl_values + 8 : iq3nl_values; + const int8_t * values2 = extra & 2 ? iq3nl_values + 8 : iq3nl_values; + extra >>= 2; + int shift_l = 2*(ib32%4); + int shift_h = ib32%8; + int suml1 = 0, suml2 = 0; + for (int j = 0; j < 16; ++j) { + suml1 += qy[j+ 0] * values1[((qs[j+ 0] >> shift_l) & 3) | (((qh[j+ 0] >> shift_h) & 1) << 2)]; + suml2 += qy[j+16] * values2[((qs[j+16] >> shift_l) & 3) | (((qh[j+16] >> shift_h) & 1) << 2)]; + } + sumi += dl1*suml1 + dl2*suml2; + qy += 32; + if (shift_l == 6) qs += 32; + } + sumf += d * sumi; + } + *s = sumf; +} + +void ggml_vec_dot_iq4_k_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(bs); + UNUSED(bx); + UNUSED(by); + UNUSED(nrc); + + const block_iq4_k * GGML_RESTRICT x = (const block_iq4_k *)vx; + const block_q8_K * GGML_RESTRICT y = (const block_q8_K *)vy; + + const int nb = n / QK_K; + + float sumf = 0; + for (int ibl = 0; ibl < nb; ++ibl) { + const float d = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + const uint8_t * qs = x[ibl].qs; + const int8_t * qy = y[ibl].qs; + uint16_t extra = x[ibl].extra; + + float sumi = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + const uint8_t sh = x[ibl].scales_h[ib/2] >> 4*(ib%2); + const float dl1 = ((x[ibl].scales_l[ib] & 0xf) | ((sh << 4) & 0x30)) - 32; + const float dl2 = ((x[ibl].scales_l[ib] >> 4) | ((sh << 2) & 0x30)) - 32; + const int8_t * values1 = extra & 1 ? iq4k_values + 16 : iq4k_values; + const int8_t * values2 = extra & 2 ? 
iq4k_values + 16 : iq4k_values; + extra >>= 2; + int suml1 = 0, suml2 = 0; + for (int j = 0; j < 16; ++j) { + suml1 += qy[j+ 0] * values1[qs[j] & 0xf]; + suml2 += qy[j+16] * values2[qs[j] >> 4]; + } + sumi += dl1*suml1 + dl2*suml2; + qy += 32; + qs += 16; + } + sumf += d * sumi; + } + *s = sumf; +} + +void ggml_vec_dot_iq5_k_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(bs); + UNUSED(bx); + UNUSED(by); + UNUSED(nrc); + + const block_iq5_k * GGML_RESTRICT x = (const block_iq5_k *)vx; + const block_q8_K * GGML_RESTRICT y = (const block_q8_K *)vy; + + const int nb = n / QK_K; + + float sumf = 0; + for (int ibl = 0; ibl < nb; ++ibl) { + const float d = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + const uint8_t * qs = x[ibl].qs; + const uint8_t * qh = x[ibl].qh; + const int8_t * qy = y[ibl].qs; + uint16_t extra = x[ibl].extra; + + float sumi = 0; + int shift = 0; + for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { + float dl1 = ((x[ibl].scales_l[2*ib64+0] & 0xf) | ((x[ibl].scales_h[ib64] << 4) & 0x30)) - 32; + float dl2 = ((x[ibl].scales_l[2*ib64+0] >> 4) | ((x[ibl].scales_h[ib64] << 2) & 0x30)) - 32; + float dl3 = ((x[ibl].scales_l[2*ib64+1] & 0xf) | ((x[ibl].scales_h[ib64] >> 0) & 0x30)) - 32; + float dl4 = ((x[ibl].scales_l[2*ib64+1] >> 4) | ((x[ibl].scales_h[ib64] >> 2) & 0x30)) - 32; + const int8_t * values1 = iq5nl_values + ((extra & 1) << 5); + const int8_t * values2 = iq5nl_values + ((extra & 2) << 4); + const int8_t * values3 = iq5nl_values + ((extra & 4) << 3); + const int8_t * values4 = iq5nl_values + ((extra & 8) << 2); + int suml1 = 0, suml2 = 0, suml3 = 0, suml4 = 0; + for (int j = 0; j < 16; ++j) { + suml1 += qy[j+ 0] * values1[(qs[j+ 0] & 0xf) | (((qh[j+ 0] >> shift) & 1) << 4)]; + suml2 += qy[j+16] * values2[(qs[j+16] & 0xf) | (((qh[j+16] >> shift) & 1) << 4)]; + suml3 += qy[j+32] * values3[(qs[j+ 0] >> 4) | 
(((qh[j+ 0] >> shift) & 2) << 3)]; + suml4 += qy[j+48] * values4[(qs[j+16] >> 4) | (((qh[j+16] >> shift) & 2) << 3)]; + } + sumi += dl1*suml1 + dl2*suml2 + dl3*suml3 + dl4*suml4; + qy += 64; + qs += 32; + extra >>= 4; + shift += 2; + if (shift == 8) { qh += 32; shift = 0; } + } + sumf += d * sumi; + } + *s = sumf; +} + +// IQ6_K uses polynomial dequantization +#define A_IQ6K -127.f +#define B_IQ6K 6.2568f +#define C_IQ6K 0.11218f +#define D_IQ6K 0.0011972f +#define S_IQ6K 1.f + +void ggml_vec_dot_iq6_k_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(bs); + UNUSED(bx); + UNUSED(by); + UNUSED(nrc); + + const block_iq6_k * GGML_RESTRICT x = (const block_iq6_k *)vx; + const block_q8_K * GGML_RESTRICT y = (const block_q8_K *)vy; + + const int nb = n / QK_K; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const int8_t * sl = x[i].scales; + const int8_t * q8 = y[i].qs; + + uint16_t extra = x[i].extra; + + int shift = 0; + int sumb = 0; + for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { + int dl1 = sl[4*ib64 + 0]; + int dl2 = sl[4*ib64 + 1]; + int dl3 = sl[4*ib64 + 2]; + int dl4 = sl[4*ib64 + 3]; + float m1 = extra & 1 ? S_IQ6K : 0; + float m2 = extra & 2 ? S_IQ6K : 0; + float m3 = extra & 4 ? S_IQ6K : 0; + float m4 = extra & 8 ? 
S_IQ6K : 0; + float sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0; + for (int j = 0; j < 16; ++j) { + float q1 = ((qs[j+ 0] & 0xf) | (((qh[j+ 0] >> shift) & 0x03) << 4)); + float q2 = ((qs[j+16] & 0xf) | (((qh[j+16] >> shift) & 0x03) << 4)); + float q3 = ((qs[j+ 0] >> 4) | (((qh[j+ 0] >> shift) & 0x0c) << 2)); + float q4 = ((qs[j+16] >> 4) | (((qh[j+16] >> shift) & 0x0c) << 2)); + float v1 = A_IQ6K + q1*(B_IQ6K + q1*(-C_IQ6K + q1*D_IQ6K)) + m1; + float v2 = A_IQ6K + q2*(B_IQ6K + q2*(-C_IQ6K + q2*D_IQ6K)) + m2; + float v3 = A_IQ6K + q3*(B_IQ6K + q3*(-C_IQ6K + q3*D_IQ6K)) + m3; + float v4 = A_IQ6K + q4*(B_IQ6K + q4*(-C_IQ6K + q4*D_IQ6K)) + m4; + sumi1 += q8[j+ 0] * v1; + sumi2 += q8[j+16] * v2; + sumi3 += q8[j+32] * v3; + sumi4 += q8[j+48] * v4; + } + sumb += dl1 * sumi1 + dl2 * sumi2 + dl3 * sumi3 + dl4 * sumi4; + q8 += 64; + qs += 32; + extra >>= 4; + shift += 4; + if (shift == 8) { qh += 32; shift = 0; } + } + sumf += d * sumb; + } + *s = sumf; +} diff --git a/ggml/src/ggml-cpu/quants.h b/ggml/src/ggml-cpu/quants.h index d83eb1b144d..ecf649dcd84 100644 --- a/ggml/src/ggml-cpu/quants.h +++ b/ggml/src/ggml-cpu/quants.h @@ -34,6 +34,12 @@ void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, i void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_iq2_k(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_iq3_k(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_iq4_k(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_iq5_k(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_iq6_k(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); + // Dot product void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, 
size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); @@ -62,6 +68,12 @@ void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq2_k_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq3_k_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq4_k_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq5_k_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq6_k_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); + // Generic implementation void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index de5cbd75e86..babd5ae1b68 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -1,3 +1,7 @@ +// =========================== IQ*_K quantization types (Iwan Kawrakow) +// Ported from 
ik_llama.cpp - https://github.com/ikawrakow/ik_llama.cpp +// MIT License - Copyright (C) 2024 Iwan Kawrakow + #define GGML_COMMON_IMPL_C #include "ggml-common.h" @@ -4997,6 +5001,1269 @@ void quantize_row_iq2_s_ref(const float * GGML_RESTRICT x, block_iq2_s * GGML_RE quantize_iq2_s(x, y, 1, k, NULL); } +// Helper function: nearest integer rounding +static inline int nearest_int_ks(float fval) { + assert(fabsf(fval) <= 4194303.f); + float val = fval + 12582912.f; + int i; memcpy(&i, &val, sizeof(int)); + return (i & 0x007fffff) - 0x00400000; +} + +// Helper function: find best index in IQ2 lookup table +static inline int best_index_iq2nl(const int8_t * values, float x) { + int idx = x < values[1] ? 0 : x > values[2] ? 2 : 1; + return x - values[idx] < values[idx+1] - x ? idx : idx + 1; +} + +// Helper function: find best index in IQ3 lookup table +static inline int best_index_iq3nl(const int8_t * values, float x) { + int idx = x < values[2] ? (x < values[1] ? 0 : 1) : (x < values[5] ? (x < values[3] ? 2 : (x < values[4] ? 3 : 4)) : (x < values[6] ? 5 : 6)); + return x - values[idx] < values[idx+1] - x ? 
idx : idx + 1; +} + +// Helper lookup table for fast index finding in IQ4 quantization +static const int8_t iq4nl_index[241] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 17, 17, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 18, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 19, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 20, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 21, 21, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 22, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 23, 23, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 24, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 25, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 26, 26, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 27, 27, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 28, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 29, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 30, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15 +}; + +// Find the best index in the IQ4 lookup table for a given value +static inline int best_index_iq4nl(const int8_t * values, float x) { + int ix = (int)x - values[0]; + if (ix < 0 || ix >= 241) return ix < 0 ? 0 : 15; + ix = iq4nl_index[ix]; + return ix < 16 ? ix : x - values[ix-16] < values[ix-15] - x ? 
ix-16 : ix-15; +} + +// Helper function: find best index in IQ5 lookup table +static inline int best_index_iq5nl(const int8_t * values, float x) { + int ix = (int)(x + 128) - 128; + if (ix < -127) return 0; + if (ix > 127) return 31; + // Simple linear search for 32 values + int best = 0; + float best_diff = fabsf(x - values[0]); + for (int i = 1; i < 32; ++i) { + float diff = fabsf(x - values[i]); + if (diff < best_diff) { + best_diff = diff; + best = i; + } + } + return best; +} + +// Comparison function for qsort +typedef struct { float val; int idx; } float_idx_pair_t; +static int compare_float_idx(const void * a, const void * b) { + const float_idx_pair_t * pa = (const float_idx_pair_t *)a; + const float_idx_pair_t * pb = (const float_idx_pair_t *)b; + return (pa->val > pb->val) - (pa->val < pb->val); +} + +void dequantize_row_iq2_k(const block_iq2_k * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + const uint8_t * qs = x[i].qs; + + uint16_t extra = x[i].extra; + + int shift = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + float dl1 = d * ((x[i].scales[ib32] & 0xf) - 8); + float dl2 = d * ((x[i].scales[ib32] >> 4) - 8); + const int8_t * values1 = extra & 1 ? iq2nl_values + 4 : iq2nl_values; + const int8_t * values2 = extra & 2 ? 
iq2nl_values + 4 : iq2nl_values; + extra >>= 2; + for (int j = 0; j < 16; ++j) { + y[j+ 0] = dl1 * values1[(qs[j+ 0] >> shift) & 3]; + y[j+16] = dl2 * values2[(qs[j+16] >> shift) & 3]; + } + y += 32; + shift += 2; + if (shift == 8) { qs += 32; shift = 0; } + } + } +} + +void dequantize_row_iq3_k(const block_iq3_k * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + + uint16_t sh = x[i].scales_h; + uint16_t extra = x[i].extra; + + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + float dl1 = d * ((2*(x[i].scales_l[ib32] & 0xf) + 1) * ((sh & 1) ? -1 : 1)); + float dl2 = d * ((2*(x[i].scales_l[ib32] >> 4) + 1) * ((sh & 2) ? -1 : 1)); + sh >>= 2; + const int8_t * values1 = extra & 1 ? iq3nl_values + 8 : iq3nl_values; + const int8_t * values2 = extra & 2 ? iq3nl_values + 8 : iq3nl_values; + extra >>= 2; + int shift_l = 2*(ib32%4); + int shift_h = ib32%8; + for (int j = 0; j < 16; ++j) { + y[j+ 0] = dl1 * values1[((qs[j+ 0] >> shift_l) & 3) | (((qh[j+ 0] >> shift_h) & 1) << 2)]; + y[j+16] = dl2 * values2[((qs[j+16] >> shift_l) & 3) | (((qh[j+16] >> shift_h) & 1) << 2)]; + } + y += 32; + if (shift_l == 6) qs += 32; + } + } +} + +void dequantize_row_iq4_k(const block_iq4_k * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + const uint8_t * qs = x[i].qs; + const float d = GGML_FP16_TO_FP32(x[i].d); + + uint16_t extra = x[i].extra; + + for (int ib = 0; ib < QK_K/32; ++ib) { + const uint8_t sh = x[i].scales_h[ib/2] >> 4*(ib%2); + const float dl1 = d * (((x[i].scales_l[ib] & 0xf) | ((sh << 4) & 0x30)) - 32); + const float dl2 = d * (((x[i].scales_l[ib] >> 4) | ((sh << 2) & 0x30)) - 32); + const int8_t * values1 = extra & 1 ? 
iq4k_values + 16 : iq4k_values; + const int8_t * values2 = extra & 2 ? iq4k_values + 16 : iq4k_values; + extra >>= 2; + for (int j = 0; j < 16; ++j) { + y[j+ 0] = dl1 * values1[qs[j] & 0xf]; + y[j+16] = dl2 * values2[qs[j] >> 4]; + } + y += 32; + qs += 16; + } + } +} + +void dequantize_row_iq5_k(const block_iq5_k * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint8_t * sl = x[i].scales_l; + const uint8_t * sh = x[i].scales_h; + + uint16_t extra = x[i].extra; + + int shift = 0; + for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { + float dl1 = d * (((sl[2*ib64+0] & 0xf) | ((sh[ib64] << 4) & 0x30)) - 32); + float dl2 = d * (((sl[2*ib64+0] >> 4) | ((sh[ib64] << 2) & 0x30)) - 32); + float dl3 = d * (((sl[2*ib64+1] & 0xf) | ((sh[ib64] >> 0) & 0x30)) - 32); + float dl4 = d * (((sl[2*ib64+1] >> 4) | ((sh[ib64] >> 2) & 0x30)) - 32); + const int8_t * values1 = iq5nl_values + ((extra & 1) << 5); + const int8_t * values2 = iq5nl_values + ((extra & 2) << 4); + const int8_t * values3 = iq5nl_values + ((extra & 4) << 3); + const int8_t * values4 = iq5nl_values + ((extra & 8) << 2); + for (int j = 0; j < 16; ++j) { + y[j+ 0] = dl1 * values1[(qs[j+ 0] & 0xf) | (((qh[j+ 0] >> shift) & 1) << 4)]; + y[j+16] = dl2 * values2[(qs[j+16] & 0xf) | (((qh[j+16] >> shift) & 1) << 4)]; + y[j+32] = dl3 * values3[(qs[j+ 0] >> 4) | (((qh[j+ 0] >> shift) & 2) << 3)]; + y[j+48] = dl4 * values4[(qs[j+16] >> 4) | (((qh[j+16] >> shift) & 2) << 3)]; + } + y += 64; + qs += 32; + extra >>= 4; + shift += 2; + if (shift == 8) { qh += 32; shift = 0; } + } + } +} + +// IQ6_K constants for polynomial dequantization +#define A_IQ6K -127.f +#define B_IQ6K 6.2568f +#define C_IQ6K 0.11218f +#define D_IQ6K 0.0011972f +#define S_IQ6K 1.f + +void dequantize_row_iq6_k(const block_iq6_k * GGML_RESTRICT x, float * 
GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const int8_t * sl = x[i].scales; + + uint16_t extra = x[i].extra; + + int shift = 0; + for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { + float dl1 = d * sl[4*ib64 + 0]; + float dl2 = d * sl[4*ib64 + 1]; + float dl3 = d * sl[4*ib64 + 2]; + float dl4 = d * sl[4*ib64 + 3]; + float m1 = extra & 1 ? S_IQ6K : 0; + float m2 = extra & 2 ? S_IQ6K : 0; + float m3 = extra & 4 ? S_IQ6K : 0; + float m4 = extra & 8 ? S_IQ6K : 0; + for (int j = 0; j < 16; ++j) { + float q1 = ((qs[j+ 0] & 0xf) | (((qh[j+ 0] >> shift) & 0x03) << 4)); + float q2 = ((qs[j+16] & 0xf) | (((qh[j+16] >> shift) & 0x03) << 4)); + float q3 = ((qs[j+ 0] >> 4) | (((qh[j+ 0] >> shift) & 0x0c) << 2)); + float q4 = ((qs[j+16] >> 4) | (((qh[j+16] >> shift) & 0x0c) << 2)); + y[j+ 0] = dl1 * (A_IQ6K + q1*(B_IQ6K + q1*(-C_IQ6K + q1*D_IQ6K)) + m1); + y[j+16] = dl2 * (A_IQ6K + q2*(B_IQ6K + q2*(-C_IQ6K + q2*D_IQ6K)) + m2); + y[j+32] = dl3 * (A_IQ6K + q3*(B_IQ6K + q3*(-C_IQ6K + q3*D_IQ6K)) + m3); + y[j+48] = dl4 * (A_IQ6K + q4*(B_IQ6K + q4*(-C_IQ6K + q4*D_IQ6K)) + m4); + } + y += 64; + qs += 32; + extra >>= 4; + shift += 4; + if (shift == 8) { qh += 32; shift = 0; } + } + } +} + +// Helper function: make quantized scales for IQ*_K types +static float make_qx_quants_k(int n, int nmax, const float * x, int8_t * L, const float * qw) { + float max = 0; + float amax = 0; + for (int i = 0; i < n; ++i) { + float ax = fabsf(x[i]); + if (ax > amax) { amax = ax; max = x[i]; } + } + if (!amax) { // all zero + for (int i = 0; i < n; ++i) L[i] = 0; + return 0.f; + } + float iscale = -nmax / max; + float sumlx = 0; + float suml2 = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int_ks(iscale * x[i]); + l = MAX(-nmax, MIN(nmax-1, l)); + L[i] = l + nmax; + float w = qw ? 
qw[i] : 1.0f; + sumlx += w*x[i]*l; + suml2 += w*l*l; + } + float scale = suml2 ? sumlx/suml2 : 0.0f; + float best = scale * sumlx; + for (int is = -9; is <= 9; ++is) { + if (is == 0) continue; + iscale = -(nmax + 0.1f*is) / max; + sumlx = suml2 = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int_ks(iscale * x[i]); + l = MAX(-nmax, MIN(nmax-1, l)); + float w = qw ? qw[i] : 1.0f; + sumlx += w*x[i]*l; + suml2 += w*l*l; + } + if (suml2 > 0 && sumlx*sumlx > best*suml2) { + for (int i = 0; i < n; ++i) { + int l = nearest_int_ks(iscale * x[i]); + L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); + } + scale = sumlx/suml2; best = scale*sumlx; + } + } + return scale; +} + +// Block size for IQ2_K quantization (16 values per sub-block) +#define IQ2K_BLOCK_SIZE 16 + +static void quantize_row_iq2_k_impl(const float * GGML_RESTRICT x, block_iq2_k * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) { + float scales[QK_K/IQ2K_BLOCK_SIZE]; + float weight[IQ2K_BLOCK_SIZE]; + float sumx[IQ2K_BLOCK_SIZE+1], sumw[IQ2K_BLOCK_SIZE+1]; + float sw[QK_K/IQ2K_BLOCK_SIZE]; + int8_t Ls[QK_K/IQ2K_BLOCK_SIZE]; + float_idx_pair_t pairs[IQ2K_BLOCK_SIZE]; + + const int8_t * shifted_values = iq2nl_values + 4; + + for (int64_t ibl = 0; ibl < n_per_row/QK_K; ++ibl) { + memset(&y[ibl], 0, sizeof(block_iq2_k)); + y[ibl].d = GGML_FP32_TO_FP16(0.f); + + const float * xbl = x + ibl*QK_K; + float sumx2 = 0; + for (int j = 0; j < QK_K; ++j) sumx2 += xbl[j]*xbl[j]; + const float sigma2 = 1.5f*sumx2/QK_K; + + uint16_t extra = 0; + float max_abs_scale = 0; + + for (int ib = 0; ib < QK_K/IQ2K_BLOCK_SIZE; ++ib) { + const float * xb = xbl + IQ2K_BLOCK_SIZE*ib; + if (quant_weights) { + const float * qw = quant_weights + ibl*QK_K + ib*IQ2K_BLOCK_SIZE; + for (int j = 0; j < IQ2K_BLOCK_SIZE; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < IQ2K_BLOCK_SIZE; ++j) weight[j] = 0.25f*sigma2 + xb[j]*xb[j]; + } + sw[ib] = 0; + for (int j = 0; j < IQ2K_BLOCK_SIZE; ++j) { + 
sw[ib] += weight[j]; + pairs[j].val = xb[j]; + pairs[j].idx = j; + } + + // Sort by value + qsort(pairs, IQ2K_BLOCK_SIZE, sizeof(float_idx_pair_t), compare_float_idx); + + sumx[0] = sumw[0] = 0; + for (int j = 0; j < IQ2K_BLOCK_SIZE; ++j) { + int jj = pairs[j].idx; + sumw[j+1] = sumw[j] + weight[jj]; + sumx[j+1] = sumx[j] + weight[jj]*xb[jj]; + } + + float best = 0, d = 0; + int is_shifted = 0; + float sumqx, sumq2; + + // Search for optimal quantization boundaries + for (int i1 = 0; i1 < IQ2K_BLOCK_SIZE; ++i1) { + for (int i2 = i1; i2 < IQ2K_BLOCK_SIZE; ++i2) { + for (int i3 = i2; i3 < IQ2K_BLOCK_SIZE; ++i3) { + // Try non-shifted values + sumqx = (sumx[i1] - sumx[0])*iq2nl_values[0] + (sumx[i2] - sumx[i1])*iq2nl_values[1] + + (sumx[i3] - sumx[i2])*iq2nl_values[2] + (sumx[IQ2K_BLOCK_SIZE] - sumx[i3])*iq2nl_values[3]; + sumq2 = (sumw[i1] - sumw[0])*iq2nl_values[0]*iq2nl_values[0] + (sumw[i2] - sumw[i1])*iq2nl_values[1]*iq2nl_values[1] + + (sumw[i3] - sumw[i2])*iq2nl_values[2]*iq2nl_values[2] + (sumw[IQ2K_BLOCK_SIZE] - sumw[i3])*iq2nl_values[3]*iq2nl_values[3]; + if (sumq2 > 0 && sumqx*sumqx > best*sumq2) { + d = sumqx/sumq2; best = d*sumqx; is_shifted = 0; + } + // Try shifted values + sumqx = (sumx[i1] - sumx[0])*shifted_values[0] + (sumx[i2] - sumx[i1])*shifted_values[1] + + (sumx[i3] - sumx[i2])*shifted_values[2] + (sumx[IQ2K_BLOCK_SIZE] - sumx[i3])*shifted_values[3]; + sumq2 = (sumw[i1] - sumw[0])*shifted_values[0]*shifted_values[0] + (sumw[i2] - sumw[i1])*shifted_values[1]*shifted_values[1] + + (sumw[i3] - sumw[i2])*shifted_values[2]*shifted_values[2] + (sumw[IQ2K_BLOCK_SIZE] - sumw[i3])*shifted_values[3]*shifted_values[3]; + if (sumq2 > 0 && sumqx*sumqx > best*sumq2) { + d = sumqx/sumq2; best = d*sumqx; is_shifted = 1; + } + // Try reversed non-shifted values + sumqx = (sumx[i1] - sumx[0])*iq2nl_values[3] + (sumx[i2] - sumx[i1])*iq2nl_values[2] + + (sumx[i3] - sumx[i2])*iq2nl_values[1] + (sumx[IQ2K_BLOCK_SIZE] - sumx[i3])*iq2nl_values[0]; + sumq2 = (sumw[i1] 
- sumw[0])*iq2nl_values[3]*iq2nl_values[3] + (sumw[i2] - sumw[i1])*iq2nl_values[2]*iq2nl_values[2] + + (sumw[i3] - sumw[i2])*iq2nl_values[1]*iq2nl_values[1] + (sumw[IQ2K_BLOCK_SIZE] - sumw[i3])*iq2nl_values[0]*iq2nl_values[0]; + if (sumq2 > 0 && sumqx*sumqx > best*sumq2) { + d = sumqx/sumq2; best = d*sumqx; is_shifted = 0; + } + // Try reversed shifted values + sumqx = (sumx[i1] - sumx[0])*shifted_values[3] + (sumx[i2] - sumx[i1])*shifted_values[2] + + (sumx[i3] - sumx[i2])*shifted_values[1] + (sumx[IQ2K_BLOCK_SIZE] - sumx[i3])*shifted_values[0]; + sumq2 = (sumw[i1] - sumw[0])*shifted_values[3]*shifted_values[3] + (sumw[i2] - sumw[i1])*shifted_values[2]*shifted_values[2] + + (sumw[i3] - sumw[i2])*shifted_values[1]*shifted_values[1] + (sumw[IQ2K_BLOCK_SIZE] - sumw[i3])*shifted_values[0]*shifted_values[0]; + if (sumq2 > 0 && sumqx*sumqx > best*sumq2) { + d = sumqx/sumq2; best = d*sumqx; is_shifted = 1; + } + } + } + } + scales[ib] = d; + if (is_shifted) extra |= (1 << ib); + + float abs_scale = fabsf(scales[ib]); + if (abs_scale > max_abs_scale) max_abs_scale = abs_scale; + } + + if (!max_abs_scale) continue; + float d = make_qx_quants_k(QK_K/IQ2K_BLOCK_SIZE, 8, scales, Ls, sw); + if (!d) continue; + + y[ibl].extra = extra; + float id = 1/d; + + float final_sumqx = 0, final_sumq2 = 0; + for (int ib = 0; ib < QK_K/IQ2K_BLOCK_SIZE; ++ib) { + int ls = nearest_int_ks(id*scales[ib]); + ls = MAX(-8, MIN(7, ls)); + y[ibl].scales[ib/2] |= ((ls + 8) << 4*(ib%2)); + float dl = d * ls; + if (dl) { + const int8_t * block_values = y[ibl].extra & (1 << ib) ? 
shifted_values : iq2nl_values; + const float * xb = xbl + IQ2K_BLOCK_SIZE*ib; + if (quant_weights) { + const float * qw = quant_weights + ibl*QK_K + ib*IQ2K_BLOCK_SIZE; + for (int j = 0; j < IQ2K_BLOCK_SIZE; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < IQ2K_BLOCK_SIZE; ++j) weight[j] = 0.25f*sigma2 + xb[j]*xb[j]; + } + float idl = 1/dl; + int ib32 = ib/2; + int offset = 16*(ib%2); + uint8_t * qs = y[ibl].qs + 32*(ib32/4) + offset; + for (int j = 0; j < 16; ++j) { + const float al = idl*xb[j]; + int ibest = best_index_iq2nl(block_values, al); + qs[j] |= (ibest << 2*(ib32%4)); + float w = weight[j]; + float q = block_values[ibest]*ls; + final_sumqx += w*q*xb[j]; + final_sumq2 += w*q*q; + } + } + } + y[ibl].d = GGML_FP32_TO_FP16(1.030f*(final_sumq2 > 0 ? final_sumqx/final_sumq2 : d)); + } +} + +void quantize_row_iq2_k_ref(const float * GGML_RESTRICT x, block_iq2_k * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + quantize_row_iq2_k_impl(x, y, k, NULL); +} + +size_t quantize_iq2_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + GGML_ASSERT(n_per_row % QK_K == 0); + const int64_t nblock = n_per_row / QK_K; + char * qrow = (char *)dst; + for (int64_t row = 0; row < nrow; ++row) { + quantize_row_iq2_k_impl(src, (block_iq2_k *)qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += nblock * sizeof(block_iq2_k); + } + return nrow * nblock * sizeof(block_iq2_k); +} + +#define IQ3K_BLOCK_SIZE 16 +#define IQ3K_NTRY 3 + +static void quantize_row_iq3_k_impl(const float * GGML_RESTRICT x, block_iq3_k * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) { + float scales[QK_K/IQ3K_BLOCK_SIZE]; + float weight[IQ3K_BLOCK_SIZE]; + uint8_t L[IQ3K_BLOCK_SIZE]; + + const int8_t * shifted_values = iq3nl_values + 8; + + for (int64_t ibl = 0; ibl < n_per_row/QK_K; ++ibl) { + memset(&y[ibl], 0, sizeof(block_iq3_k)); + y[ibl].d = 
GGML_FP32_TO_FP16(0.f); + + const float * xbl = x + ibl*QK_K; + float sumx2 = 0; + for (int j = 0; j < QK_K; ++j) sumx2 += xbl[j]*xbl[j]; + const float sigma2 = 1.5f*sumx2/QK_K; + + uint16_t extra = 0; + float max_abs_scale = 0; + + for (int ib = 0; ib < QK_K/IQ3K_BLOCK_SIZE; ++ib) { + const float * xb = xbl + IQ3K_BLOCK_SIZE*ib; + if (quant_weights) { + const float * qw = quant_weights + ibl*QK_K + ib*IQ3K_BLOCK_SIZE; + for (int j = 0; j < IQ3K_BLOCK_SIZE; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < IQ3K_BLOCK_SIZE; ++j) weight[j] = 0.25f*sigma2 + xb[j]*xb[j]; + } + float amax = 0, max = 0; + for (int j = 0; j < IQ3K_BLOCK_SIZE; ++j) { + float ax = fabsf(xb[j]); + if (ax > amax) { amax = ax; max = xb[j]; } + } + if (amax < 1e-9f) { + scales[ib] = 0; + continue; + } + float d = IQ3K_NTRY > 0 ? -max/iq3nl_values[0] : max/iq3nl_values[0]; + float id = 1/d; + float sumqx_p = 0, sumq2_p = 0; + float sumqx_m = 0, sumq2_m = 0; + float best = 0; + for (int j = 0; j < IQ3K_BLOCK_SIZE; ++j) { + float w = weight[j]; + float al = id*xb[j]; + int l = best_index_iq3nl(iq3nl_values, al); + float q = iq3nl_values[l]; + sumqx_p += w*q*xb[j]; + sumq2_p += w*q*q; + l = best_index_iq3nl(iq3nl_values, -al); + q = iq3nl_values[l]; + sumqx_m += w*q*xb[j]; + sumq2_m += w*q*q; + } + if (sumq2_p > 0) { + d = sumqx_p/sumq2_p; + best = d*sumqx_p; + } + if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + d = sumqx_m/sumq2_m; best = d*sumqx_m; + } + int is_shifted = 0; + for (int itry = -IQ3K_NTRY; itry <= IQ3K_NTRY; ++itry) { + id = (2*itry + iq3nl_values[0])/max; + sumqx_p = sumq2_p = 0; + sumqx_m = sumq2_m = 0; + for (int j = 0; j < IQ3K_BLOCK_SIZE; ++j) { + float w = weight[j]; + float al = id*xb[j]; + int l = best_index_iq3nl(iq3nl_values, al); + float q = iq3nl_values[l]; + sumqx_p += w*q*xb[j]; + sumq2_p += w*q*q; + l = best_index_iq3nl(iq3nl_values, -al); + q = iq3nl_values[l]; + sumqx_m += w*q*xb[j]; + sumq2_m += w*q*q; + } + if (sumq2_p > 
0 && sumqx_p*sumqx_p > best*sumq2_p) { + d = sumqx_p/sumq2_p; best = d * sumqx_p; is_shifted = 0; + } + if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + d = sumqx_m/sumq2_m; best = d * sumqx_m; is_shifted = 0; + } + id = (2*itry + shifted_values[0])/max; + sumqx_p = sumq2_p = 0; + sumqx_m = sumq2_m = 0; + for (int j = 0; j < IQ3K_BLOCK_SIZE; ++j) { + float w = weight[j]; + float al = id*xb[j]; + int l = best_index_iq3nl(shifted_values, al); + float q = shifted_values[l]; + sumqx_p += w*q*xb[j]; + sumq2_p += w*q*q; + l = best_index_iq3nl(shifted_values, -al); + q = shifted_values[l]; + sumqx_m += w*q*xb[j]; + sumq2_m += w*q*q; + } + if (sumq2_p > 0 && sumqx_p*sumqx_p > best*sumq2_p) { + d = sumqx_p/sumq2_p; best = d * sumqx_p; is_shifted = 1; + } + if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + d = sumqx_m/sumq2_m; best = d * sumqx_m; is_shifted = 1; + } + } + if (!d) { + scales[ib] = 0; continue; + } + + const int8_t * block_values = is_shifted ? shifted_values : iq3nl_values; + float sumqx = 0, sumq2 = 0; + id = 1/d; + for (int j = 0; j < IQ3K_BLOCK_SIZE; ++j) { + float w = weight[j]; + float al = id*xb[j]; + int l = best_index_iq3nl(block_values, al); + L[j] = l; + float q = block_values[l]; + sumqx += w*q*xb[j]; + sumq2 += w*q*q; + } + if (sumq2 > 0) d = sumqx/sumq2; + + // Iterative refinement + for (int iter = 0; iter < 128; ++iter) { + float gmax = 0; + int best_j = -1, dir = 0; + for (int j = 0; j < IQ3K_BLOCK_SIZE; ++j) { + float w = weight[j]; + float g = d * w * (xb[j] - d*block_values[L[j]]); + if (g > 0 && L[j] < 7) { + if (g > gmax) { gmax = g; best_j = j; dir = 1; } + } + else if (g < 0 && L[j] > 0) { + if (-g > gmax) { gmax = -g; best_j = j; dir = -1; } + } + } + if (best_j < 0) break; + + float w = weight[best_j]; + sumqx += w*xb[best_j]*(block_values[L[best_j]+dir] - block_values[L[best_j]]); + sumq2 += w*(block_values[L[best_j]+dir]*block_values[L[best_j]+dir] - block_values[L[best_j]]*block_values[L[best_j]]); + L[best_j] += dir; + 
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) { + d = sumqx/sumq2; best = d*sumqx; + } + else if (iter > 8) break; + } + + scales[ib] = d; + if (is_shifted) extra |= (1 << ib); + + float abs_scale = fabsf(scales[ib]); + if (abs_scale > max_abs_scale) max_abs_scale = abs_scale; + } + + if (!max_abs_scale) continue; + + float d = max_abs_scale/31; + y[ibl].extra = extra; + float id = 1/d; + + float sumqx = 0, sumq2 = 0; + for (int ib = 0; ib < QK_K/IQ3K_BLOCK_SIZE; ++ib) { + int ls = nearest_int_ks(0.5f*(id*fabsf(scales[ib])-1)); + ls = MAX(0, MIN(15, ls)); + y[ibl].scales_l[ib/2] |= (ls << 4*(ib%2)); + if (scales[ib] < 0) y[ibl].scales_h |= (1 << ib); + ls = (2*ls + 1) * (scales[ib] < 0 ? -1 : 1); + float dl = d * ls; + if (dl) { + const int8_t * block_values = y[ibl].extra & (1 << ib) ? shifted_values : iq3nl_values; + const float * xb = xbl + IQ3K_BLOCK_SIZE*ib; + if (quant_weights) { + const float * qw = quant_weights + ibl*QK_K + ib*IQ3K_BLOCK_SIZE; + for (int j = 0; j < IQ3K_BLOCK_SIZE; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < IQ3K_BLOCK_SIZE; ++j) weight[j] = 0.25f*sigma2 + xb[j]*xb[j]; + } + float idl = 1/dl; + int ib32 = ib/2; + int offset = 16*(ib%2); + uint8_t * qs = y[ibl].qs + 32*(ib32/4) + offset; + uint8_t * qh = y[ibl].qh + 32*(ib32/8) + offset; + for (int j = 0; j < 16; ++j) { + const float al = idl*xb[j]; + int ibest = best_index_iq3nl(block_values, al); + qs[j] |= ((ibest & 3) << 2*(ib32%4)); + qh[j] |= ((ibest >> 2) << (ib32%8)); + float w = weight[j]; + float q = block_values[ibest]*ls; + sumqx += w*q*xb[j]; + sumq2 += w*q*q; + } + } + } + y[ibl].d = GGML_FP32_TO_FP16(1.01f*(sumq2 > 0 ? 
sumqx/sumq2 : d)); + } +} + +void quantize_row_iq3_k_ref(const float * GGML_RESTRICT x, block_iq3_k * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + quantize_row_iq3_k_impl(x, y, k, NULL); +} + +size_t quantize_iq3_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + GGML_ASSERT(n_per_row % QK_K == 0); + const int64_t nblock = n_per_row / QK_K; + char * qrow = (char *)dst; + for (int64_t row = 0; row < nrow; ++row) { + quantize_row_iq3_k_impl(src, (block_iq3_k *)qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += nblock * sizeof(block_iq3_k); + } + return nrow * nblock * sizeof(block_iq3_k); +} + +#define IQ4K_BLOCK_SIZE 16 +#define IQ4K_NTRY 7 + +static void quantize_row_iq4_k_impl(const float * GGML_RESTRICT x, block_iq4_k * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) { + const int super_block_size = QK_K; + const int block_size = IQ4K_BLOCK_SIZE; + const int ntry = IQ4K_NTRY; + const int64_t nblock = n_per_row / super_block_size; + + float weight[IQ4K_BLOCK_SIZE]; + float scales[QK_K/IQ4K_BLOCK_SIZE]; + uint8_t L[QK_K]; + + const int8_t * values = iq4k_values; + const int8_t * shifted_values = iq4k_values + 16; + + for (int64_t ibl = 0; ibl < nblock; ++ibl) { + const float * xbl = x + ibl * super_block_size; + const float * qw = quant_weights ? 
quant_weights + ibl * super_block_size : NULL; + block_iq4_k * ybl = y + ibl; + + float sigma2 = 0; + for (int j = 0; j < super_block_size; ++j) sigma2 += xbl[j]*xbl[j]; + sigma2 *= 2.f/super_block_size; + + memset(ybl, 0, sizeof(block_iq4_k)); + ybl->d = GGML_FP32_TO_FP16(0.f); + + uint16_t * scales_h = (uint16_t *)ybl->scales_h; + + float max_scale = 0, amax_scale = 0; + uint16_t extra = 0; + + for (int ib = 0; ib < super_block_size/block_size; ++ib) { + const float * xb = xbl + ib*block_size; + if (qw) { + const float * qwb = qw + ib*block_size; + for (int j = 0; j < block_size; ++j) weight[j] = qwb[j] * sqrtf(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < block_size; ++j) weight[j] = xb[j]*xb[j]; + } + float amax = 0, max = 0; + for (int j = 0; j < block_size; ++j) { + float ax = fabsf(xb[j]); + if (ax > amax) { + amax = ax; max = xb[j]; + } + } + if (amax < 1e-30f) { + scales[ib] = 0; + continue; + } + float d = ntry > 0 ? -max/values[0] : max/values[0]; + float id = 1/d; + float sumqx_p = 0, sumq2_p = 0; + float sumqx_m = 0, sumq2_m = 0; + for (int j = 0; j < block_size; ++j) { + float w = weight[j]; + float al = id*xb[j]; + int l = best_index_iq4nl(values, al); + float q = values[l]; + sumqx_p += w*q*xb[j]; + sumq2_p += w*q*q; + l = best_index_iq4nl(values, -al); + q = values[l]; + sumqx_m += w*q*xb[j]; + sumq2_m += w*q*q; + } + d = sumqx_p/sumq2_p; + bool is_shifted = false; + float best = d*sumqx_p; + if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + d = sumqx_m/sumq2_m; best = d*sumqx_m; + } + for (int itry = -ntry; itry <= ntry; ++itry) { + id = (itry + values[0])/max; + sumqx_p = sumq2_p = 0; + sumqx_m = sumq2_m = 0; + for (int j = 0; j < block_size; ++j) { + float w = weight[j]; + float al = id*xb[j]; + int l = best_index_iq4nl(values, al); + float q = values[l]; + sumqx_p += w*q*xb[j]; + sumq2_p += w*q*q; + l = best_index_iq4nl(values, -al); + q = values[l]; + sumqx_m += w*q*xb[j]; + sumq2_m += w*q*q; + } + if (sumq2_p > 0 && 
sumqx_p*sumqx_p > best*sumq2_p) { + d = sumqx_p/sumq2_p; best = d * sumqx_p; is_shifted = false; + } + if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + d = sumqx_m/sumq2_m; best = d * sumqx_m; is_shifted = false; + } + id = (itry + shifted_values[0])/max; + sumqx_p = sumq2_p = 0; + sumqx_m = sumq2_m = 0; + for (int j = 0; j < block_size; ++j) { + float w = weight[j]; + float al = id*xb[j]; + int l = best_index_iq4nl(shifted_values, al); + float q = shifted_values[l]; + sumqx_p += w*q*xb[j]; + sumq2_p += w*q*q; + l = best_index_iq4nl(shifted_values, -al); + q = shifted_values[l]; + sumqx_m += w*q*xb[j]; + sumq2_m += w*q*q; + } + if (sumq2_p > 0 && sumqx_p*sumqx_p > best*sumq2_p) { + d = sumqx_p/sumq2_p; best = d * sumqx_p; is_shifted = true; + } + if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + d = sumqx_m/sumq2_m; best = d * sumqx_m; is_shifted = true; + } + } + if (is_shifted) extra |= (1 << ib); + scales[ib] = d; + float abs_d = fabsf(d); + if (abs_d > amax_scale) { + amax_scale = abs_d; max_scale = d; + } + } + + float d = -max_scale/32; + ybl->d = GGML_FP32_TO_FP16(d); + ybl->extra = extra; + float id = d != 0.0f ? 1/d : 0.f; + float sumqx = 0, sumq2 = 0; + + for (int ib = 0; ib < super_block_size/block_size; ++ib) { + const int8_t * block_values = extra & (1 << ib) ? shifted_values : values; + int l = nearest_int(id*scales[ib]); + l = MAX(-32, MIN(31, l)); + float dl = d * l; + float idl = dl != 0.0f ? 
1/dl : 0.f; + uint8_t * Lb = L + ib*block_size; + const float * xb = xbl + ib*block_size; + if (qw) { + const float * qwb = qw + ib*block_size; + for (int j = 0; j < block_size; ++j) weight[j] = qwb[j] * sqrtf(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < block_size; ++j) weight[j] = xb[j]*xb[j]; + } + for (int j = 0; j < block_size; ++j) { + Lb[j] = best_index_iq4nl(block_values, idl*xb[j]); + float w = weight[j]; + float q = block_values[Lb[j]]*l; + sumqx += w*q*xb[j]; + sumq2 += w*q*q; + } + l += 32; + uint8_t l_l = l & 0xf; + uint8_t l_h = l >> 4; + if (ib%2 == 0) ybl->scales_l[ib/2] = l_l; + else ybl->scales_l[ib/2] |= (l_l << 4); + scales_h[ib/8] |= (l_h << 2*(ib%8)); + } + if (sumq2 > 0) ybl->d = GGML_FP32_TO_FP16(sumqx/sumq2); + + for (int i = 0; i < super_block_size/32; ++i) { + for (int j = 0; j < 16; ++j) { + ybl->qs[16*i + j] = L[32*i + j] | (L[32*i + 16 + j] << 4); + } + } + } +} + +void quantize_row_iq4_k_ref(const float * GGML_RESTRICT x, block_iq4_k * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + quantize_row_iq4_k_impl(x, y, k, NULL); +} + +size_t quantize_iq4_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + GGML_ASSERT(n_per_row % QK_K == 0); + const int64_t nblock = n_per_row / QK_K; + char * qrow = (char *)dst; + for (int64_t row = 0; row < nrow; ++row) { + quantize_row_iq4_k_impl(src, (block_iq4_k *)qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += nblock * sizeof(block_iq4_k); + } + return nrow * nblock * sizeof(block_iq4_k); +} + +#define IQ5K_BLOCK_SIZE 16 +#define IQ5K_NTRY 5 + +static void quantize_row_iq5_k_impl(const float * GGML_RESTRICT x, block_iq5_k * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) { + const int ntry = IQ5K_NTRY; + const float step = 1.f; + const int64_t nblock = n_per_row / QK_K; + + float scales[QK_K/IQ5K_BLOCK_SIZE]; + float weight[IQ5K_BLOCK_SIZE]; + + const int8_t * shifted_values = 
iq5nl_values + 32; + + for (int64_t ibl = 0; ibl < nblock; ++ibl) { + memset(&y[ibl], 0, sizeof(block_iq5_k)); + y[ibl].d = GGML_FP32_TO_FP16(0.f); + + const float * xbl = x + ibl*QK_K; + float sumx2 = 0; + for (int j = 0; j < QK_K; ++j) sumx2 += xbl[j]*xbl[j]; + const float sigma2 = 2*sumx2/QK_K; + + float max_scale = 0, max_abs_scale = 0; + uint16_t extra = 0; + + for (int ib = 0; ib < QK_K/16; ++ib) { + const float * xb = xbl + 16*ib; + const float * qw = quant_weights ? quant_weights + ibl*QK_K + ib*16 : NULL; + if (qw) { + for (int j = 0; j < 16; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < 16; ++j) weight[j] = 0.25f*sigma2 + xb[j]*xb[j]; + } + float amax = 0, max = 0; + for (int j = 0; j < 16; ++j) { + float ax = fabsf(xb[j]); + if (ax > amax) { + amax = ax; max = xb[j]; + } + } + if (amax < 1e-30f) { + scales[ib] = 0; + continue; + } + float d = ntry > 0 ? -max/iq5nl_values[0] : max/iq5nl_values[0]; + float id = 1/d; + float sumqx_p = 0, sumq2_p = 0; + float sumqx_m = 0, sumq2_m = 0; + for (int j = 0; j < 16; ++j) { + float w = weight[j]; + float al = id*xb[j]; + int l = best_index_iq5nl(iq5nl_values, al); + float q = iq5nl_values[l]; + sumqx_p += w*q*xb[j]; + sumq2_p += w*q*q; + l = best_index_iq5nl(iq5nl_values, -al); + q = iq5nl_values[l]; + sumqx_m += w*q*xb[j]; + sumq2_m += w*q*q; + } + d = sumqx_p/sumq2_p; + float best = d*sumqx_p; + if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + d = sumqx_m/sumq2_m; best = d*sumqx_m; + } + bool is_shifted = false; + for (int itry = -ntry; itry <= ntry; ++itry) { + id = (itry*step + iq5nl_values[0])/max; + sumqx_p = sumq2_p = 0; + sumqx_m = sumq2_m = 0; + for (int j = 0; j < 16; ++j) { + float w = weight[j]; + float al = id*xb[j]; + int l = best_index_iq5nl(iq5nl_values, al); + float q = iq5nl_values[l]; + sumqx_p += w*q*xb[j]; + sumq2_p += w*q*q; + l = best_index_iq5nl(iq5nl_values, -al); + q = iq5nl_values[l]; + sumqx_m += w*q*xb[j]; + sumq2_m += w*q*q; + } + if 
(sumq2_p > 0 && sumqx_p*sumqx_p > best*sumq2_p) { + d = sumqx_p/sumq2_p; best = d * sumqx_p; is_shifted = false; + } + if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + d = sumqx_m/sumq2_m; best = d * sumqx_m; is_shifted = false; + } + id = (itry*step + shifted_values[0])/max; + sumqx_p = sumq2_p = 0; + sumqx_m = sumq2_m = 0; + for (int j = 0; j < 16; ++j) { + float w = weight[j]; + float al = id*xb[j]; + int l = best_index_iq5nl(shifted_values, al); + float q = shifted_values[l]; + sumqx_p += w*q*xb[j]; + sumq2_p += w*q*q; + l = best_index_iq5nl(shifted_values, -al); + q = shifted_values[l]; + sumqx_m += w*q*xb[j]; + sumq2_m += w*q*q; + } + if (sumq2_p > 0 && sumqx_p*sumqx_p > best*sumq2_p) { + d = sumqx_p/sumq2_p; best = d * sumqx_p; is_shifted = true; + } + if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + d = sumqx_m/sumq2_m; best = d * sumqx_m; is_shifted = true; + } + } + if (d != 0.0f) { + const int8_t * block_values = is_shifted ? shifted_values : iq5nl_values; + float sumqx = 0, sumq2 = 0; + id = 1/d; + for (int j = 0; j < 16; ++j) { + float w = weight[j]; + float al = id*xb[j]; + int l = best_index_iq5nl(block_values, al); + float q = block_values[l]; + sumqx += w*q*xb[j]; + sumq2 += w*q*q; + } + if (sumq2 > 0) d = sumqx/sumq2; + } + scales[ib] = d; + if (is_shifted) extra |= (1 << ib); + + float abs_scale = fabsf(scales[ib]); + if (abs_scale > max_abs_scale) { + max_abs_scale = abs_scale; max_scale = scales[ib]; + } + } + + if (max_abs_scale < 1e-30f) continue; + float d = -max_scale/32; + y[ibl].d = GGML_FP32_TO_FP16(d); + y[ibl].extra = extra; + + float id = 1/d; + + float sumqx = 0, sumq2 = 0; + for (int ib = 0; ib < QK_K/16; ++ib) { + int ls = nearest_int(id*scales[ib]); + ls = MAX(-32, MIN(31, ls)); + int uls = ls + 32; + y[ibl].scales_l[ib/2] |= ((uls & 0xf) << 4*(ib%2)); + y[ibl].scales_h[ib/4] |= ((uls >> 4) << 2*(ib%4)); + float dl = d * ls; + if (dl != 0.0f) { + const int8_t * block_values = y[ibl].extra & (1 << ib) ? 
shifted_values : iq5nl_values; + const float * xb = xbl + 16*ib; + const float * qw = quant_weights ? quant_weights + ibl*QK_K + ib*16 : NULL; + if (qw) { + for (int j = 0; j < 16; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < 16; ++j) weight[j] = 0.25f*sigma2 + xb[j]*xb[j]; + } + float idl = 1/dl; + int ib32 = ib/2; + int offset = 16*(ib%2); + uint8_t * qs = y[ibl].qs + 32*(ib32/2) + offset; + uint8_t * qh = y[ibl].qh + 32*(ib32/8) + offset; + for (int j = 0; j < 16; ++j) { + const float al = idl*xb[j]; + int ibest = best_index_iq5nl(block_values, al); + qs[j] |= ((ibest & 0xf) << 4*(ib32%2)); + qh[j] |= ((ibest >> 4) << (ib32%8)); + float w = weight[j]; + float q = block_values[ibest]*ls; + sumqx += w*q*xb[j]; + sumq2 += w*q*q; + } + } + } + if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(sumqx/sumq2); + } +} + +void quantize_row_iq5_k_ref(const float * GGML_RESTRICT x, block_iq5_k * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + quantize_row_iq5_k_impl(x, y, k, NULL); +} + +size_t quantize_iq5_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + GGML_ASSERT(n_per_row % QK_K == 0); + const int64_t nblock = n_per_row / QK_K; + char * qrow = (char *)dst; + for (int64_t row = 0; row < nrow; ++row) { + quantize_row_iq5_k_impl(src, (block_iq5_k *)qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += nblock * sizeof(block_iq5_k); + } + return nrow * nblock * sizeof(block_iq5_k); +} + +#define IQ6K_BLOCK_SIZE 16 +#define IQ6K_NTRY 5 +#define S_IQ6K 1.f + +static const uint8_t iq6nl_index[249] = { + 0, 0, 0, 64, 1, 1, 1, 1, 1, 65, 2, 2, 2, 2, 2, 66, 3, 3, 3, 3, 67, 67, 4, 4, 4, 4, 68, 5, 5, 5, 5, 69, + 69, 6, 6, 6, 70, 70, 7, 7, 7, 71, 8, 8, 8, 72, 72, 9, 9, 9, 73, 73, 10, 10, 10, 74, 11, 11, 11, 75, 12, 12, 12, 76, + 13, 13, 13, 77, 14, 14, 14, 78, 15, 15, 79, 79, 16, 16, 80, 17, 17, 81, 81, 18, 18, 82, 19, 19, 83, 83, 20, 84, 84, 21, 85, 85, 
+ 22, 86, 86, 23, 87, 87, 24, 88, 88, 25, 89, 89, 26, 90, 90, 27, 91, 91, 28, 92, 29, 93, 93, 30, 94, 94, 31, 95, 95, 32, 96, 33, + 97, 97, 34, 98, 98, 35, 99, 99, 36, 100, 100, 37, 101, 38, 102, 102, 39, 103, 103, 40, 104, 104, 41, 41, 105, 42, 42, 106, 106, 43, 107, 107, + 44, 108, 108, 45, 45, 109, 46, 46, 46, 110, 47, 47, 111, 111, 48, 48, 112, 49, 49, 49, 113, 50, 50, 50, 114, 51, 51, 51, 115, 52, 52, 52, + 116, 116, 53, 53, 53, 117, 54, 54, 54, 118, 118, 55, 55, 55, 119, 119, 56, 56, 56, 120, 120, 57, 57, 57, 121, 121, 58, 58, 58, 58, 122, 59, + 59, 59, 59, 123, 123, 60, 60, 60, 60, 124, 61, 61, 61, 61, 61, 125, 62, 62, 62, 62, 62, 126, 63, 63, 63, +}; + +static inline int best_index_iq6nl_f(const float * values, float x) { + int ix = (int)(x - values[0]); + if (ix < 0 || ix >= 249) return ix < 0 ? 0 : 63; + ix = iq6nl_index[ix]; + return ix < 64 ? ix : x - values[ix-64] < values[ix-63] - x ? ix-64 : ix-63; +} + +static void quantize_row_iq6_k_impl(const float * GGML_RESTRICT x, block_iq6_k * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights, const float * values, const float * shifted_values) { + const int ntry = IQ6K_NTRY; + const float step = 1.f; + const int64_t nblock = n_per_row / QK_K; + + float scales[QK_K/IQ6K_BLOCK_SIZE]; + float weight[IQ6K_BLOCK_SIZE]; + + for (int64_t ibl = 0; ibl < nblock; ++ibl) { + memset(&y[ibl], 0, sizeof(block_iq6_k)); + y[ibl].d = GGML_FP32_TO_FP16(0.f); + + const float * xbl = x + ibl*QK_K; + float sumx2 = 0; + for (int j = 0; j < QK_K; ++j) sumx2 += xbl[j]*xbl[j]; + const float sigma2 = 2*sumx2/QK_K; + + float max_scale = 0, max_abs_scale = 0; + uint16_t extra = 0; + + for (int ib = 0; ib < QK_K/16; ++ib) { + const float * xb = xbl + 16*ib; + const float * qw = quant_weights ? 
quant_weights + ibl*QK_K + ib*16 : NULL; + if (qw) { + for (int j = 0; j < 16; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < 16; ++j) weight[j] = 0.25f*sigma2 + xb[j]*xb[j]; + } + float amax = 0, max = 0; + for (int j = 0; j < 16; ++j) { + float ax = fabsf(xb[j]); + if (ax > amax) { + amax = ax; max = xb[j]; + } + } + if (amax < 1e-30f) { + scales[ib] = 0; + continue; + } + float d = ntry > 0 ? -max/values[0] : max/values[0]; + float id = 1/d; + float sumqx_p = 0, sumq2_p = 0; + float sumqx_m = 0, sumq2_m = 0; + for (int j = 0; j < 16; ++j) { + float w = weight[j]; + float al = id*xb[j]; + int l = best_index_iq6nl_f(values, al); + float q = values[l]; + sumqx_p += w*q*xb[j]; + sumq2_p += w*q*q; + l = best_index_iq6nl_f(values, -al); + q = values[l]; + sumqx_m += w*q*xb[j]; + sumq2_m += w*q*q; + } + d = sumqx_p/sumq2_p; + float best = d*sumqx_p; + if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + d = sumqx_m/sumq2_m; best = d*sumqx_m; + } + bool is_shifted = false; + for (int itry = -ntry; itry <= ntry; ++itry) { + id = (itry*step + values[0])/max; + sumqx_p = sumq2_p = 0; + sumqx_m = sumq2_m = 0; + for (int j = 0; j < 16; ++j) { + float w = weight[j]; + float al = id*xb[j]; + int l = best_index_iq6nl_f(values, al); + float q = values[l]; + sumqx_p += w*q*xb[j]; + sumq2_p += w*q*q; + l = best_index_iq6nl_f(values, -al); + q = values[l]; + sumqx_m += w*q*xb[j]; + sumq2_m += w*q*q; + } + if (sumq2_p > 0 && sumqx_p*sumqx_p > best*sumq2_p) { + d = sumqx_p/sumq2_p; best = d * sumqx_p; is_shifted = false; + } + if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + d = sumqx_m/sumq2_m; best = d * sumqx_m; is_shifted = false; + } + id = (itry*step + shifted_values[0])/max; + sumqx_p = sumq2_p = 0; + sumqx_m = sumq2_m = 0; + for (int j = 0; j < 16; ++j) { + float w = weight[j]; + float al = id*xb[j]; + int l = best_index_iq6nl_f(shifted_values, al); + float q = shifted_values[l]; + sumqx_p += w*q*xb[j]; + sumq2_p += w*q*q; + l = 
best_index_iq6nl_f(shifted_values, -al); + q = shifted_values[l]; + sumqx_m += w*q*xb[j]; + sumq2_m += w*q*q; + } + if (sumq2_p > 0 && sumqx_p*sumqx_p > best*sumq2_p) { + d = sumqx_p/sumq2_p; best = d * sumqx_p; is_shifted = true; + } + if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + d = sumqx_m/sumq2_m; best = d * sumqx_m; is_shifted = true; + } + } + if (d != 0.0f) { + const float * block_values = is_shifted ? shifted_values : values; + float sumqx = 0, sumq2 = 0; + id = 1/d; + for (int j = 0; j < 16; ++j) { + float w = weight[j]; + float al = id*xb[j]; + int l = best_index_iq6nl_f(block_values, al); + float q = block_values[l]; + sumqx += w*q*xb[j]; + sumq2 += w*q*q; + } + if (sumq2 > 0) d = sumqx/sumq2; + } + scales[ib] = d; + if (is_shifted) extra |= (1 << ib); + + float abs_scale = fabsf(scales[ib]); + if (abs_scale > max_abs_scale) { + max_abs_scale = abs_scale; max_scale = scales[ib]; + } + } + + if (max_abs_scale < 1e-30f) continue; + float d = -max_scale/127; + y[ibl].d = GGML_FP32_TO_FP16(d); + y[ibl].extra = extra; + + float id = 1/d; + + float sumqx = 0, sumq2 = 0; + for (int ib = 0; ib < QK_K/16; ++ib) { + int ls = nearest_int(id*scales[ib]); + ls = MAX(-127, MIN(127, ls)); + y[ibl].scales[ib] = ls; + float dl = d * ls; + if (dl != 0.0f) { + const float * block_values = y[ibl].extra & (1 << ib) ? shifted_values : values; + const float * xb = xbl + 16*ib; + const float * qw = quant_weights ? 
quant_weights + ibl*QK_K + ib*16 : NULL; + if (qw) { + for (int j = 0; j < 16; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < 16; ++j) weight[j] = 0.25f*sigma2 + xb[j]*xb[j]; + } + float idl = 1/dl; + int ib32 = ib/2; + int offset = 16*(ib%2); + uint8_t * qs = y[ibl].qs + 32*(ib32/2) + offset; + uint8_t * qh = y[ibl].qh + 32*(ib32/4) + offset; + for (int j = 0; j < 16; ++j) { + const float al = idl*xb[j]; + int ibest = best_index_iq6nl_f(block_values, al); + qs[j] |= ((ibest & 0xf) << 4*(ib32%2)); + qh[j] |= ((ibest >> 4) << 2*(ib32%4)); + float w = weight[j]; + float q = block_values[ibest]*ls; + sumqx += w*q*xb[j]; + sumq2 += w*q*q; + } + } + } + if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(sumqx/sumq2); + } +} + +void quantize_row_iq6_k_ref(const float * GGML_RESTRICT x, block_iq6_k * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + quantize_iq6_k(x, (void *)y, 1, k, NULL); +} + +size_t quantize_iq6_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + GGML_ASSERT(n_per_row % QK_K == 0); + const int64_t nblock = n_per_row / QK_K; + + // Build float values array from int8_t iq6nl_values + float values[128]; + for (int i = 0; i < 64; ++i) { + values[i] = iq6nl_values[i]; + values[i+64] = values[i] + S_IQ6K; + } + + char * qrow = (char *)dst; + for (int64_t row = 0; row < nrow; ++row) { + quantize_row_iq6_k_impl(src, (block_iq6_k *)qrow, n_per_row, quant_weights, values, values + 64); + src += n_per_row; + qrow += nblock * sizeof(block_iq6_k); + } + return nrow * nblock * sizeof(block_iq6_k); +} + // =============================== data validation static bool validate_float(float f, size_t i) { @@ -5308,6 +6575,14 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb); } break; + // IQ*_K types - no validation needed (per-block scales are embedded in block data) + 
case GGML_TYPE_IQ2_K: + case GGML_TYPE_IQ3_K: + case GGML_TYPE_IQ4_K: + case GGML_TYPE_IQ5_K: + case GGML_TYPE_IQ6_K: + break; + case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 3b688f31c21..69e598d4590 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -39,6 +39,12 @@ GGML_API void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_ GGML_API void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_iq2_k_ref (const float * GGML_RESTRICT x, block_iq2_k * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_iq3_k_ref (const float * GGML_RESTRICT x, block_iq3_k * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_iq4_k_ref (const float * GGML_RESTRICT x, block_iq4_k * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_iq5_k_ref (const float * GGML_RESTRICT x, block_iq5_k * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_iq6_k_ref (const float * GGML_RESTRICT x, block_iq6_k * GGML_RESTRICT y, int64_t k); + // Dequantization GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); @@ -69,6 +75,12 @@ GGML_API void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, floa GGML_API void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_iq2_k (const block_iq2_k * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_iq3_k (const block_iq3_k * GGML_RESTRICT x, float * GGML_RESTRICT y, 
int64_t k); +GGML_API void dequantize_row_iq4_k (const block_iq4_k * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_iq5_k (const block_iq5_k * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_iq6_k (const block_iq6_k * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); + // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") GGML_API size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); @@ -80,6 +92,12 @@ GGML_API size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RE GGML_API size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_iq2_k (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_iq3_k (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_iq4_k (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_iq5_k (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_iq6_k (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); + GGML_API size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t 
n_per_row, const float * imatrix); GGML_API size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index ed819eaa4c5..5d8f001ce4b 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -896,6 +896,46 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .type_size = 0, .is_quantized = false, }, + [GGML_TYPE_IQ2_K] = { + .type_name = "iq2_k", + .blck_size = QK_K, + .type_size = sizeof(block_iq2_k), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq2_k, + .from_float_ref = (ggml_from_float_t) quantize_row_iq2_k_ref, + }, + [GGML_TYPE_IQ3_K] = { + .type_name = "iq3_k", + .blck_size = QK_K, + .type_size = sizeof(block_iq3_k), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq3_k, + .from_float_ref = (ggml_from_float_t) quantize_row_iq3_k_ref, + }, + [GGML_TYPE_IQ4_K] = { + .type_name = "iq4_k", + .blck_size = QK_K, + .type_size = sizeof(block_iq4_k), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq4_k, + .from_float_ref = (ggml_from_float_t) quantize_row_iq4_k_ref, + }, + [GGML_TYPE_IQ5_K] = { + .type_name = "iq5_k", + .blck_size = QK_K, + .type_size = sizeof(block_iq5_k), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq5_k, + .from_float_ref = (ggml_from_float_t) quantize_row_iq5_k_ref, + }, + [GGML_TYPE_IQ6_K] = { + .type_name = "iq6_k", + .blck_size = QK_K, + .type_size = sizeof(block_iq6_k), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq6_k, + .from_float_ref = (ggml_from_float_t) quantize_row_iq6_k_ref, + }, }; const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) { @@ -1251,7 +1291,7 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) { } } else { - nbytes = tensor->ne[0]*tensor->nb[0]/blck_size; + nbytes = tensor->nb[1]; for (int i = 1; i < GGML_MAX_DIMS; ++i) { nbytes 
+= (tensor->ne[i] - 1)*tensor->nb[i]; } @@ -1732,7 +1772,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( } result->nb[0] = ggml_type_size(type); - result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type)); + result->nb[1] = ggml_row_size(type, result->ne[0]); for (int i = 2; i < GGML_MAX_DIMS; i++) { result->nb[i] = result->nb[i - 1]*result->ne[i - 1]; } @@ -7585,6 +7625,11 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ2_K: result = quantize_iq2_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ3_K: result = quantize_iq3_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ4_K: result = quantize_iq4_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ5_K: result = quantize_iq5_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ6_K: result = quantize_iq6_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index ed0d7f2cae1..f61142d8c94 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -595,7 +595,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par // calculate byte offsets given the tensor shape and type info.t.nb[0] = type_size; - info.t.nb[1] = info.t.nb[0]*(info.t.ne[0]/blck_size); + info.t.nb[1] = ggml_row_size(info.t.type, info.t.ne[0]); for 
(int j = 2; j < GGML_MAX_DIMS; ++j) { info.t.nb[j] = info.t.nb[j - 1]*info.t.ne[j - 1]; } @@ -1153,7 +1153,7 @@ void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggm GGML_ASSERT(tensor->ne[0] % blck_size == 0 && "tensor row size not divisible by block size of new type"); tensor->nb[0] = type_size; - tensor->nb[1] = tensor->nb[0]*(tensor->ne[0]/blck_size); + tensor->nb[1] = ggml_row_size(type, tensor->ne[0]); for (int i = 2; i < GGML_MAX_DIMS; i++) { tensor->nb[i] = tensor->nb[i - 1]*tensor->ne[i - 1]; } diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 689acdc65de..969c06b6f76 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -3753,6 +3753,14 @@ class GGMLQuantizationType(IntEnum): TQ1_0 = 34 TQ2_0 = 35 MXFP4 = 39 + # IDs must match ik_llama.cpp for GGUF interoperability + # IDs 40-136 reserved + IQ2_K = 137 # 2.375 bpw + IQ3_K = 138 # 3.44 bpw + IQ4_K = 139 # 4.5 bpw + IQ5_K = 140 # 5.5 bpw + IQ6_K = 141 # 6.625 bpw + # IDs 142-156 reserved class ExpertGatingFuncType(IntEnum): @@ -3910,6 +3918,11 @@ class VisionProjectorType: GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13), GGMLQuantizationType.TQ2_0: (256, 2 + 64), GGMLQuantizationType.MXFP4: (32, 1 + 16), + GGMLQuantizationType.IQ2_K: (256, 2 + 2 + QK_K // 32 + QK_K // 4), # d(2) + extra(2) + scales[8] + qs[64] = 76 + GGMLQuantizationType.IQ3_K: (256, 2 + 2 + 2 + QK_K // 32 + QK_K // 4 + QK_K // 8), # d(2) + extra(2) + scales_h(2) + scales_l[8] + qs[64] + qh[32] = 110 + GGMLQuantizationType.IQ4_K: (256, 2 + 2 + QK_K // 64 + QK_K // 32 + QK_K // 2), # d(2) + extra(2) + scales_h[4] + scales_l[8] + qs[128] = 144 + GGMLQuantizationType.IQ5_K: (256, 2 + 2 + QK_K // 64 + QK_K // 32 + QK_K // 2 + QK_K // 8), # d(2) + extra(2) + scales_h[4] + scales_l[8] + qs[128] + qh[32] = 176 + GGMLQuantizationType.IQ6_K: (256, 2 + 2 + QK_K // 16 + QK_K // 2 + QK_K // 4), # d(2) + extra(2) + scales[16] + qs[128] + qh[64] = 212 } diff --git a/include/llama.h 
b/include/llama.h index 077f66dc651..5844ce3bde3 100644 --- a/include/llama.h +++ b/include/llama.h @@ -153,6 +153,17 @@ extern "C" { LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors + // IDs must match ik_llama.cpp for GGUF interoperability + // IDs 135-137 reserved + LLAMA_FTYPE_MOSTLY_IQ2_K = 138, // except 1d tensors, ~2.375 bpw + LLAMA_FTYPE_MOSTLY_IQ3_K = 139, // except 1d tensors, ~3.44 bpw + LLAMA_FTYPE_MOSTLY_IQ4_K = 140, // except 1d tensors, 4.5 bpw + LLAMA_FTYPE_MOSTLY_IQ5_K = 141, // except 1d tensors, 5.5 bpw + LLAMA_FTYPE_MOSTLY_IQ6_K = 142, // except 1d tensors, 6.625 bpw + // IDs 143-156 reserved + + // refer to ik_llama.cpp for IDs 200+ (some are reserved) + LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 1501e392ca8..7e14d4ec52f 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -61,6 +61,11 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ2_K: return "IQ2_K - 2.375 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_K: return "IQ3_K - 3.44 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ4_K: return "IQ4_K - 4.5 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ5_K: return "IQ5_K - 5.5 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ6_K: return "IQ6_K - 6.625 bpw"; default: return "unknown, may not work"; } @@ -709,6 +714,11 @@ llama_model_loader::llama_model_loader( case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; + case GGML_TYPE_IQ2_K: ftype = LLAMA_FTYPE_MOSTLY_IQ2_K; break; + case GGML_TYPE_IQ3_K: ftype = LLAMA_FTYPE_MOSTLY_IQ3_K; break; + case GGML_TYPE_IQ4_K: 
ftype = LLAMA_FTYPE_MOSTLY_IQ4_K; break; + case GGML_TYPE_IQ5_K: ftype = LLAMA_FTYPE_MOSTLY_IQ5_K; break; + case GGML_TYPE_IQ6_K: ftype = LLAMA_FTYPE_MOSTLY_IQ6_K; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 24770430e1c..0045231e906 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -219,8 +219,11 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - new_type = GGML_TYPE_Q5_K; + ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K) { + new_type = !qs.has_output ? GGML_TYPE_IQ4_K : GGML_TYPE_Q5_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_K || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) { + new_type = !qs.has_output ? GGML_TYPE_IQ5_K : GGML_TYPE_Q6_K; } else if (new_type != GGML_TYPE_Q8_0) { new_type = GGML_TYPE_Q6_K; @@ -283,6 +286,15 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) { new_type = GGML_TYPE_Q4_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_K) { + new_type = qs.model.hparams.n_gqa() >= 2 ? GGML_TYPE_IQ4_K : GGML_TYPE_IQ3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_K && qs.model.hparams.n_gqa() >= 2) { + new_type = GGML_TYPE_IQ4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_K && qs.model.hparams.n_gqa() >= 2) { + new_type = GGML_TYPE_IQ5_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? 
GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } @@ -393,6 +405,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_K) { + new_type = GGML_TYPE_IQ3_K; + } } } else { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; @@ -531,6 +546,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; + case LLAMA_FTYPE_MOSTLY_IQ2_K: default_type = GGML_TYPE_IQ2_K; break; + case LLAMA_FTYPE_MOSTLY_IQ3_K: default_type = GGML_TYPE_IQ3_K; break; + case LLAMA_FTYPE_MOSTLY_IQ4_K: default_type = GGML_TYPE_IQ4_K; break; + case LLAMA_FTYPE_MOSTLY_IQ5_K: default_type = GGML_TYPE_IQ5_K; break; + case LLAMA_FTYPE_MOSTLY_IQ6_K: default_type = GGML_TYPE_IQ6_K; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index 037c0582bbb..c1bab2636c8 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -81,7 +81,6 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) { // Total dot product error static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float * test_data2) { GGML_UNUSED(qfns); - std::vector tmp_q1(2*test_size); std::vector tmp_q2(2*test_size); @@ -137,18 +136,21 @@ int main(int argc, char * argv[]) { const ggml_type ei = (ggml_type)i; - printf("Testing %s\n", ggml_type_name((ggml_type) i)); + 
printf("Testing %s", ggml_type_name((ggml_type) i)); ggml_quantize_init(ei); if (qfns_cpu->from_float && qfns->to_float) { + printf("\n"); const float total_error = total_quantization_error(qfns, qfns_cpu, test_size, test_data.data()); const float max_quantization_error = type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : type == GGML_TYPE_TQ2_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : type == GGML_TYPE_IQ2_S ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : + type == GGML_TYPE_IQ2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : type == GGML_TYPE_IQ3_S ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : + type == GGML_TYPE_IQ3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS : MAX_QUANTIZATION_TOTAL_ERROR; failed = !(total_error < max_quantization_error); num_failed += failed; @@ -165,7 +167,8 @@ int main(int argc, char * argv[]) { const float vec_dot_error = dot_product_error(qfns, qfns_cpu, test_size, test_data.data(), test_data2.data()); const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS || - type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S + type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S || + type == GGML_TYPE_IQ2_K || type == GGML_TYPE_IQ3_K ? MAX_DOT_PRODUCT_ERROR_LOWBIT : type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0 ? 
MAX_DOT_PRODUCT_ERROR_TERNARY @@ -175,6 +178,8 @@ int main(int argc, char * argv[]) { if (failed || verbose) { printf("%5s dot product error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error); } + } else { + printf(" (skipping - no quantization)\n"); } } diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 59bf9bd3fd0..f5505b9c8b3 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -54,6 +54,11 @@ static const std::vector QUANT_OPTIONS = { { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", }, { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", }, { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", }, + { "IQ2_K", LLAMA_FTYPE_MOSTLY_IQ2_K, " 2.375 bpw IQ2_K quantization", }, + { "IQ3_K", LLAMA_FTYPE_MOSTLY_IQ3_K, " 3.44 bpw IQ3_K quantization", }, + { "IQ4_K", LLAMA_FTYPE_MOSTLY_IQ4_K, " 4.5 bpw IQ4_K quantization", }, + { "IQ5_K", LLAMA_FTYPE_MOSTLY_IQ5_K, " 5.5 bpw IQ5_K quantization", }, + { "IQ6_K", LLAMA_FTYPE_MOSTLY_IQ6_K, " 6.625 bpw IQ6_K quantization", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", }, { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },