From 978c38965b03b9eb61a1ef6c6065df67aa9b25f5 Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Wed, 25 Feb 2026 10:51:29 +0100 Subject: [PATCH 1/9] Q3_PT --- ggml/include/ggml.h | 4 +- ggml/src/ggml-common.h | 13 + ggml/src/ggml-cpu/arch-fallback.h | 3 + ggml/src/ggml-cpu/arch/arm/quants.c | 4 + ggml/src/ggml-cpu/arch/loongarch/quants.c | 4 + ggml/src/ggml-cpu/arch/powerpc/quants.c | 4 + ggml/src/ggml-cpu/arch/riscv/quants.c | 4 + ggml/src/ggml-cpu/arch/s390/quants.c | 4 + ggml/src/ggml-cpu/arch/wasm/quants.c | 4 + ggml/src/ggml-cpu/arch/x86/quants.c | 4 + ggml/src/ggml-cpu/ggml-cpu.c | 6 + ggml/src/ggml-cpu/ops.cpp | 1 + ggml/src/ggml-cpu/quants.c | 56 +++ ggml/src/ggml-cpu/quants.h | 2 + ggml/src/ggml-quants.c | 493 ++++++++++++++++++++++ ggml/src/ggml-quants.h | 20 + ggml/src/ggml.c | 11 + ggml/src/gguf.cpp | 82 ++-- gguf-py/gguf/constants.py | 2 + include/llama.h | 1 + src/llama-model-loader.cpp | 3 + src/llama-model.cpp | 48 +++ src/llama-quant.cpp | 109 ++++- tools/quantize/quantize.cpp | 6 + 24 files changed, 857 insertions(+), 31 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index fcc51f1f71a..a4f9ab70f32 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -427,7 +427,8 @@ extern "C" { // GGML_TYPE_IQ4_NL_4_8 = 37, // GGML_TYPE_IQ4_NL_8_8 = 38, GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block) - GGML_TYPE_COUNT = 40, + GGML_TYPE_Q3_PT = 40, // 3.875 bpw per-tensor Lloyd-Max, 16-elem affine sub-blocks + GGML_TYPE_COUNT = 41, }; // precision @@ -455,6 +456,7 @@ extern "C" { GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors + GGML_FTYPE_MOSTLY_Q3_PT = 26, // except 1d tensors GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 93ab7ea446e..cd99d6e643b 100644 --- 
a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -427,6 +427,19 @@ typedef struct { } block_iq4_xs; static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding"); +// 3.875 bpw - per-tensor Lloyd-Max scalar quantization +// 256 elements = 16 sub-blocks of 16, 8-entry level table trained per tensor +// Layout: 2 (d) + 2 (dmin) + 24 (scales: 32x6-bit) + 96 (qs: 256x3-bit) = 124 bytes +typedef struct { + ggml_half d; // 2 bytes: global scale for 16-elem sub-block ranges + ggml_half dmin; // 2 bytes: global scale for sub-block neg_mins + uint8_t scales[3*QK_K/32]; // 24 bytes: 32 x 6-bit (indices 0..15 = ranges, 16..31 = neg_mins) + uint8_t qs[3*QK_K/8]; // 96 bytes: 256 x 3-bit Lloyd-Max level index, sequential +} block_q3_pt; +static_assert(sizeof(block_q3_pt) == 124, "wrong q3_pt block size"); + +#define IQ3KL_N_LEVELS 8 + #endif // GGML_COMMON_DECL #endif // GGML_COMMON_DECL diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h index 4dfe28e1d64..907ab853b1e 100644 --- a/ggml/src/ggml-cpu/arch-fallback.h +++ b/ggml/src/ggml-cpu/arch-fallback.h @@ -31,6 +31,7 @@ #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K #define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0 #define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K +#define ggml_vec_dot_q3_pt_q8_K_generic ggml_vec_dot_q3_pt_q8_K // repack.cpp #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8 @@ -184,6 +185,7 @@ #define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K #define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0 #define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K +#define ggml_vec_dot_q3_pt_q8_K_generic ggml_vec_dot_q3_pt_q8_K #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0 // repack.cpp #define 
ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4 @@ -276,6 +278,7 @@ #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K #define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0 #define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K +#define ggml_vec_dot_q3_pt_q8_K_generic ggml_vec_dot_q3_pt_q8_K #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0 // repack.cpp #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4 diff --git a/ggml/src/ggml-cpu/arch/arm/quants.c b/ggml/src/ggml-cpu/arch/arm/quants.c index b390ab61c78..dafd66fe68c 100644 --- a/ggml/src/ggml-cpu/arch/arm/quants.c +++ b/ggml/src/ggml-cpu/arch/arm/quants.c @@ -3766,6 +3766,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo #endif } +void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); +} + void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); diff --git a/ggml/src/ggml-cpu/arch/loongarch/quants.c b/ggml/src/ggml-cpu/arch/loongarch/quants.c index f531e916b9e..e22447c70a9 100644 --- a/ggml/src/ggml-cpu/arch/loongarch/quants.c +++ b/ggml/src/ggml-cpu/arch/loongarch/quants.c @@ -1956,6 +1956,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo #endif } +void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); +} + #if defined(__loongarch_asx) static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { const __m256i a = __lasx_xvmulwev_h_b(x, y); diff 
--git a/ggml/src/ggml-cpu/arch/powerpc/quants.c b/ggml/src/ggml-cpu/arch/powerpc/quants.c index d3dfd049eaf..3ad7582ddb3 100644 --- a/ggml/src/ggml-cpu/arch/powerpc/quants.c +++ b/ggml/src/ggml-cpu/arch/powerpc/quants.c @@ -2000,6 +2000,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo #endif } +void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); +} + void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); diff --git a/ggml/src/ggml-cpu/arch/riscv/quants.c b/ggml/src/ggml-cpu/arch/riscv/quants.c index bf9f4df1182..0718931ad4c 100644 --- a/ggml/src/ggml-cpu/arch/riscv/quants.c +++ b/ggml/src/ggml-cpu/arch/riscv/quants.c @@ -2285,6 +2285,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo #endif } +void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); +} + static void ggml_vec_dot_tq1_0_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index 19d225a4837..636eb871390 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -1466,3 +1466,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v #endif } +void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, 
const void * GGML_RESTRICT vy, size_t by, int nrc) { + ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); +} + diff --git a/ggml/src/ggml-cpu/arch/wasm/quants.c b/ggml/src/ggml-cpu/arch/wasm/quants.c index 74a359e6d12..52d990c11ee 100644 --- a/ggml/src/ggml-cpu/arch/wasm/quants.c +++ b/ggml/src/ggml-cpu/arch/wasm/quants.c @@ -1219,3 +1219,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi #endif } +void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); +} + diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c index 74d699f633d..a88f49d9d5c 100644 --- a/ggml/src/ggml-cpu/arch/x86/quants.c +++ b/ggml/src/ggml-cpu/arch/x86/quants.c @@ -3303,6 +3303,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo #endif } +void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); +} + void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 64eb01a4e18..03f58c45f17 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -384,6 +384,12 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, + [GGML_TYPE_Q3_PT] = { + // from_float not set — requires per-tensor level-table initialization via iq3kl_set_levels() + .vec_dot = ggml_vec_dot_q3_pt_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, [GGML_TYPE_I32]
= { .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32, }, diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index b7a70e06f1d..67424ecc00e 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -5569,6 +5569,7 @@ void ggml_compute_forward_clamp( case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: + case GGML_TYPE_Q3_PT: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: case GGML_TYPE_Q8_K: diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index 365cb36d2d7..7165d448253 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -1001,6 +1001,62 @@ void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = sumf; } +void ggml_vec_dot_q3_pt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q3_pt * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + const float * levels = iq3kl_get_tensor_levels(vx); + GGML_ASSERT(levels != NULL && "Q3_PT levels not set for tensor"); + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float xd = GGML_CPU_FP16_TO_FP32(x[i].d); + const float xdmin = GGML_CPU_FP16_TO_FP32(x[i].dmin); + const float yd = y[i].d; + const uint8_t * sc = x[i].scales; + const uint8_t * qs = x[i].qs; + const int8_t * q8 = y[i].qs; + + float block_sum = 0.f; + for (int ib = 0; ib < QK_K/16; ++ib) { + // Inline 6-bit unpack for range scale (index ib) and neg_min scale (index ib + QK_K/16) + const int sbit0 = ib * 6, sbyte0 = sbit0 / 8, soff0 = sbit0 % 8; + const int sbit1 = (ib + QK_K/16) * 6, sbyte1 = sbit1 / 8, soff1 = sbit1 % 8; + uint8_t qrange = (sc[sbyte0] >> soff0) & 0x3F; + if (soff0 > 2) { qrange |= (uint8_t)((sc[sbyte0+1] << (8 - soff0)) & 0x3F); } + uint8_t qnmin = 
(sc[sbyte1] >> soff1) & 0x3F; + if (soff1 > 2) { qnmin |= (uint8_t)((sc[sbyte1+1] << (8 - soff1)) & 0x3F); } + const float range = xd * (float)qrange; + const float sub_min = -xdmin * (float)qnmin; + + float sum_lq = 0.f; + for (int j = 0; j < 16; ++j) { + // Inline 3-bit unpack + const int qk = ib * 16 + j; + const int qbit = qk * 3; + const int qbyte = qbit / 8; + const int qoff = qbit % 8; + int q = (qs[qbyte] >> qoff) & 0x7; + if (qoff > 5) { q |= (int)((qs[qbyte+1] << (8 - qoff)) & 0x7); } + sum_lq += levels[q] * (float)q8[qk]; + } + // min contribution uses precomputed 16-element sum from block_q8_K.bsums + block_sum += sum_lq * range + sub_min * (float)y[i].bsums[ib]; + } + sumf += block_sum * yd; + } + *s = sumf; +} + void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); diff --git a/ggml/src/ggml-cpu/quants.h b/ggml/src/ggml-cpu/quants.h index d83eb1b144d..9e4cd2179b2 100644 --- a/ggml/src/ggml-cpu/quants.h +++ b/ggml/src/ggml-cpu/quants.h @@ -61,6 +61,7 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q3_pt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); // Generic implementation void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * 
GGML_RESTRICT vy, int64_t k); @@ -91,6 +92,7 @@ void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q3_pt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); #ifdef __cplusplus } diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index de5cbd75e86..1d53400de13 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -709,6 +709,15 @@ static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint } } +// Extract only the scale (not min) from Q4_K-style packed scales +static inline void get_scale_k4_only(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d) { + if (j < 4) { + *d = q[j] & 63; + } else { + *d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); + } +} + //========================- 2-bit (de)-quantization void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k) { @@ -4062,6 +4071,485 @@ void quantize_row_iq3_s_ref(const float * GGML_RESTRICT x, block_iq3_s * GGML_RE quantize_iq3_s(x, y, 1, k, NULL); } +// ====================== Q3_PT: 3.875 bpw per-tensor Lloyd-Max ====================== +// +// Block format (124 bytes per QK_K=256 elements): +// d (2 bytes): global scale for 16-element sub-block ranges +// dmin (2 bytes): global scale for sub-block neg_mins (-sub_min) +// scales[24] : 32 × 6-bit values (0..15 = 
ranges, 16..31 = neg_mins) +// qs[96] : 256 × 3-bit indices into 8-entry per-tensor Lloyd-Max level table +// +// Per-tensor: 8 float32 "levels" in [0,1] from Lloyd-Max training on affine-normalized +// 16-element sub-block values. Stored in GGUF as "q3_pt.levels" (float32 array). + +// Global levels (used during quantization for the current tensor) +static float iq3kl_levels[IQ3KL_N_LEVELS]; +static bool iq3kl_levels_set = false; + +void iq3kl_set_levels(const float * levels) { + memcpy(iq3kl_levels, levels, IQ3KL_N_LEVELS * sizeof(float)); + iq3kl_levels_set = true; +} + +const float * iq3kl_get_levels(void) { + return iq3kl_levels_set ? iq3kl_levels : NULL; +} + +void iq3kl_free_levels(void) { + iq3kl_levels_set = false; +} + +// Per-tensor levels registry for inference (range-based lookup by data address) +#define IQ3KL_MAX_TENSORS 1024 + +typedef struct { + const void * data; + size_t nbytes; + float levels[IQ3KL_N_LEVELS]; +} iq3kl_tensor_entry; + +static iq3kl_tensor_entry iq3kl_tensor_registry[IQ3KL_MAX_TENSORS]; +static int iq3kl_tensor_registry_count = 0; + +GGML_API void iq3kl_register_tensor_levels(const void * data, size_t nbytes, const float * levels) { + if (iq3kl_tensor_registry_count >= IQ3KL_MAX_TENSORS) { return; } + for (int i = 0; i < iq3kl_tensor_registry_count; ++i) { + if (iq3kl_tensor_registry[i].data == data) { + iq3kl_tensor_registry[i].nbytes = nbytes; + memcpy(iq3kl_tensor_registry[i].levels, levels, IQ3KL_N_LEVELS * sizeof(float)); + return; + } + } + iq3kl_tensor_registry[iq3kl_tensor_registry_count].data = data; + iq3kl_tensor_registry[iq3kl_tensor_registry_count].nbytes = nbytes; + memcpy(iq3kl_tensor_registry[iq3kl_tensor_registry_count].levels, levels, IQ3KL_N_LEVELS * sizeof(float)); + iq3kl_tensor_registry_count++; +} + +GGML_API void iq3kl_clear_tensor_levels(void) { + iq3kl_tensor_registry_count = 0; +} + +GGML_API const float * iq3kl_get_tensor_levels(const void * data_ptr) { + const uint8_t * p = (const uint8_t *)data_ptr; + 
for (int i = 0; i < iq3kl_tensor_registry_count; ++i) { + const uint8_t * base = (const uint8_t *)iq3kl_tensor_registry[i].data; + if (p >= base && p < base + iq3kl_tensor_registry[i].nbytes) { + return iq3kl_tensor_registry[i].levels; + } + } + return iq3kl_get_levels(); +} + +void iq3kl_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, float levels_out[IQ3KL_N_LEVELS]) { + + const int64_t n_sub = n_per_row / 16; // 16-element sub-blocks per row + + // Binning parameters + const int N_BINS = 8192; + const float bin_width = 1.0f / N_BINS; + float * bin_sum_w = (float *)calloc(N_BINS, sizeof(float)); + float * bin_sum_wt = (float *)calloc(N_BINS, sizeof(float)); + GGML_ASSERT(bin_sum_w && bin_sum_wt); + + // First pass: bin the affine-normalized values with their weights + for (int64_t row = 0; row < nrow; ++row) { + const float * xrow = data + row * n_per_row; + for (int64_t ib = 0; ib < n_sub; ++ib) { + const float * xb = xrow + ib * 16; + const int col_base = (int)(ib * 16); + float sb_min = xb[0], sb_max = xb[0]; + for (int j = 1; j < 16; ++j) { + if (xb[j] < sb_min) sb_min = xb[j]; + if (xb[j] > sb_max) sb_max = xb[j]; + } + const float sb_range = sb_max - sb_min; + for (int j = 0; j < 16; ++j) { + float w = 1.0f; + if (imatrix) { + w = imatrix[col_base + j]; + if (w < 1e-10f) w = 1e-10f; + } + if (sb_range > 1e-6f) { + w *= sb_range; + float t = (xb[j] - sb_min) / sb_range; + int bin_idx = (int)(t * N_BINS); + if (bin_idx >= N_BINS) bin_idx = N_BINS - 1; + bin_sum_w[bin_idx] += w; + bin_sum_wt[bin_idx] += w * t; + } + } + } + } + + // Initialize 8 levels uniformly in [0, 1] + float levels[IQ3KL_N_LEVELS]; + for (int k = 0; k < IQ3KL_N_LEVELS; ++k) { + levels[k] = (float)k / (IQ3KL_N_LEVELS - 1); + } + + // Lloyd-Max (weighted k-means) iterations with early convergence + for (int iter = 0; iter < 300; ++iter) { + float sum_w [IQ3KL_N_LEVELS] = {0}; + float sum_wt[IQ3KL_N_LEVELS] = {0}; + + // Process bins instead of 
individual values + for (int b = 0; b < N_BINS; ++b) { + if (bin_sum_w[b] < 1e-12f) continue; + const float t = (b + 0.5f) * bin_width; // representative value at bin center + int best = 0; + float best_d2 = (t - levels[0]) * (t - levels[0]); + for (int k = 1; k < IQ3KL_N_LEVELS; ++k) { + float d2 = (t - levels[k]) * (t - levels[k]); + if (d2 < best_d2) { best_d2 = d2; best = k; } + } + sum_w [best] += bin_sum_w[b]; + sum_wt[best] += bin_sum_wt[b]; + } + + // Check for early convergence + float max_delta = 0.0f; + for (int k = 0; k < IQ3KL_N_LEVELS; ++k) { + if (sum_w[k] > 1e-12f) { + float new_level = sum_wt[k] / sum_w[k]; + max_delta = fmaxf(max_delta, fabsf(new_level - levels[k])); + levels[k] = new_level; + } + } + if (max_delta < 1e-10f) break; + + // Keep levels sorted (insertion sort — 8 elements) + for (int k = 1; k < IQ3KL_N_LEVELS; ++k) { + float v = levels[k]; int m = k - 1; + while (m >= 0 && levels[m] > v) { levels[m+1] = levels[m]; m--; } + levels[m+1] = v; + } + } + + memcpy(levels_out, levels, IQ3KL_N_LEVELS * sizeof(float)); + iq3kl_set_levels(levels); + free(bin_sum_w); + free(bin_sum_wt); +} + +// --- Q3_PT bit-packing helpers --- + +// 6-bit sequential packing: 32 values in 24 bytes (4 values per 3 bytes). +// Indices 0..15 = sub-block ranges, 16..31 = sub-block neg_mins. +static inline uint8_t iq3kl_sc_get(const uint8_t * GGML_RESTRICT sc, int i) { + const int bit = i * 6; + const int byte = bit / 8; + const int off = bit % 8; + uint8_t val = (sc[byte] >> off) & 0x3F; + if (off > 2) { val |= (uint8_t)((sc[byte+1] << (8 - off)) & 0x3F); } + return val; +} + +static inline void iq3kl_sc_set(uint8_t * GGML_RESTRICT sc, int i, uint8_t v) { + const int bit = i * 6; + const int byte = bit / 8; + const int off = bit % 8; + sc[byte] |= (uint8_t)((v & 0x3F) << off); + if (off > 2) { sc[byte+1] |= (uint8_t)(v >> (8 - off)); } +} + +// 3-bit sequential packing: 256 values in 96 bytes (8 values per 3 bytes). 
+static inline int iq3kl_unpack3(const uint8_t * GGML_RESTRICT qs, int k) { + const int bit = k * 3; + const int byte = bit / 8; + const int off = bit % 8; + int val = (qs[byte] >> off) & 0x7; + if (off > 5) { val |= (int)((qs[byte+1] << (8 - off)) & 0x7); } + return val; +} + +static inline void iq3kl_pack3(uint8_t * GGML_RESTRICT qs, int k, int v) { + const int bit = k * 3; + const int byte = bit / 8; + const int off = bit % 8; + qs[byte] |= (uint8_t)((v & 0x7) << off); + if (off > 5) { qs[byte+1] |= (uint8_t)((v & 0x7) >> (8 - off)); } +} + +void dequantize_row_q3_pt(const block_q3_pt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + const float * L = iq3kl_get_tensor_levels(x); + GGML_ASSERT(L != NULL && "Q3_PT levels not set for tensor"); + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + const float dmin = GGML_FP16_TO_FP32(x[i].dmin); + const uint8_t * sc = x[i].scales; + const uint8_t * qs = x[i].qs; + + for (int ib = 0; ib < QK_K/16; ++ib) { + const float range = d * (float)iq3kl_sc_get(sc, ib); + const float sub_min = -dmin * (float)iq3kl_sc_get(sc, ib + QK_K/16); + for (int j = 0; j < 16; ++j) { + const int q = iq3kl_unpack3(qs, ib*16 + j); + y[ib*16 + j] = L[q] * range + sub_min; + } + } + y += QK_K; + } +} + +#define IQ3KL_REFINE_ITERS 5 + +// Find the optimal global d-scale for 6-bit (nmax=63) sub-block range quantization, +// minimizing Σ_i weights[i] * (vals[i] - d * clamp(round(vals[i]/d), 0, nmax))^2. +// Tries d = vals[i] / nmax as "anchor" for each sub-block i (O(n^2), n=QK_K/16=16). +// Without imatrix all weights are equal and the winner is always max/nmax, so this is a no-op. +// With imatrix it can redirect scale resolution to important sub-blocks at the cost of +// less important ones that would otherwise dominate via raw max(). 
+static float iq3kl_find_optimal_d(const float * GGML_RESTRICT vals, + const float * GGML_RESTRICT weights, + int n, int nmax) { + float max_val = 0.f; + for (int i = 0; i < n; ++i) { if (vals[i] > max_val) max_val = vals[i]; } + if (max_val < 1e-6f) return 0.f; + float best_d = max_val / (float)nmax, best_err = FLT_MAX; + for (int i = 0; i < n; ++i) { + if (vals[i] < 1e-6f) continue; + const float d_cand = vals[i] / (float)nmax; + float err = 0.f; + for (int j = 0; j < n; ++j) { + int q = (int)(vals[j] / d_cand + 0.5f); + if (q > nmax) q = nmax; + const float delta = vals[j] - d_cand * (float)q; + err += weights[j] * delta * delta; + } + if (err < best_err) { best_err = err; best_d = d_cand; } + } + return best_d; +} + +static void quantize_row_q3_pt_impl(const float * GGML_RESTRICT x, + void * GGML_RESTRICT vy, + int64_t n, + const float * GGML_RESTRICT quant_weights) { + GGML_ASSERT(iq3kl_levels_set && "Q3_PT levels not set - call iq3kl_set_levels() first"); + GGML_ASSERT(n % QK_K == 0); + + const int64_t nbl = n / QK_K; + block_q3_pt * y = (block_q3_pt *) vy; + const float * L = iq3kl_levels; + + for (int ibl = 0; ibl < nbl; ++ibl) { + const float * xbl = x + QK_K * ibl; + block_q3_pt * blk = &y[ibl]; + + float sigma2 = 0; + if (quant_weights) { + for (int i = 0; i < QK_K; ++i) { + sigma2 += xbl[i] * xbl[i]; + } + sigma2 = 2.f * sigma2 / QK_K; + } + + // Per-sub-block importance weights: sum of AWQ weights over 16 elements. + // Used by iq3kl_find_optimal_d() to direct scale resolution toward important sub-blocks. 
+ float w_ib[QK_K / 16]; + for (int ib = 0; ib < QK_K / 16; ++ib) { + float wsum = 0.f; + if (quant_weights) { + for (int j = 0; j < 16; ++j) { + const int elem = ib * 16 + j; + wsum += quant_weights[QK_K * ibl + elem] * sqrtf(sigma2 + xbl[elem] * xbl[elem]); + } + } else { + wsum = 16.f; // uniform — find_optimal_d is a no-op (max/63 always wins) + } + w_ib[ib] = wsum; + } + + // Compute per-sub-block ranges and neg_mins from raw min/max + float sub_ranges[QK_K / 16]; + float neg_mins[QK_K / 16]; + for (int ib = 0; ib < QK_K / 16; ++ib) { + const float * xb = xbl + ib * 16; + float sb_min = xb[0], sb_max = xb[0]; + for (int j = 1; j < 16; ++j) { + if (xb[j] < sb_min) { + sb_min = xb[j]; + } + if (xb[j] > sb_max) { + sb_max = xb[j]; + } + } + sub_ranges[ib] = sb_max - sb_min; + neg_mins[ib] = MAX(-sb_min, 0.f); + } + + // Pre-refinement: one weighted-LS pass with continuous (float) ranges before 6-bit + // quantization. Finds better initial (range, neg_min) from the raw min/max assignments, + // avoiding scale quantization noise in the very first set of level assignments. + for (int ib = 0; ib < QK_K / 16; ++ib) { + const float * xb = xbl + ib * 16; + if (sub_ranges[ib] < 1e-6f) { + continue; + } + const float inv_range0 = 1.f / sub_ranges[ib]; + const float sub_min0 = -neg_mins[ib]; + double sA = 0, sB = 0, sC = 0, sD = 0, sE = 0; + for (int j = 0; j < 16; ++j) { + const int elem = ib * 16 + j; + const float xj = xb[j]; + const float w = quant_weights ? 
quant_weights[QK_K * ibl + elem] * sqrtf(sigma2 + xj * xj) : 1.0f; + const float t = (xj - sub_min0) * inv_range0; + int best = 0; + float best_d2 = (t - L[0]) * (t - L[0]); + for (int k = 1; k < IQ3KL_N_LEVELS; ++k) { + const float d2 = (t - L[k]) * (t - L[k]); + if (d2 < best_d2) { + best_d2 = d2; + best = k; + } + } + const float lq = L[best]; + sA += (double) w * (double) lq * (double) lq; + sB += (double) w * (double) lq; + sC += (double) w; + sD += (double) w * (double) xj * (double) lq; + sE += (double) w * (double) xj; + } + const double det = sA * sC - sB * sB; + if (det > 1e-20) { + const float nr = (float) ((sD * sC - sE * sB) / det); + const float nm = (float) (-(sE * sA - sD * sB) / det); + if (nr > 0.f) { + sub_ranges[ib] = nr; + } + if (nm > 0.f) { + neg_mins[ib] = nm; + } + } + } + + // Importance-weighted d/dmin search (replaces plain max/63) + float d_val = iq3kl_find_optimal_d(sub_ranges, w_ib, QK_K / 16, 63); + float dmin_val = iq3kl_find_optimal_d(neg_mins, w_ib, QK_K / 16, 63); + + // Quantize ranges and neg_mins to 6-bit + memset(blk->scales, 0, sizeof(blk->scales)); + memset(blk->qs, 0, sizeof(blk->qs)); + const float inv_d = d_val > 0 ? 1.f / d_val : 0.f; + const float inv_dmin = dmin_val > 0 ? 1.f / dmin_val : 0.f; + for (int ib = 0; ib < QK_K / 16; ++ib) { + uint8_t sc = MIN(63, nearest_int(inv_d * sub_ranges[ib])); + uint8_t sm = MIN(63, nearest_int(inv_dmin * neg_mins[ib])); + iq3kl_sc_set(blk->scales, ib, sc); + iq3kl_sc_set(blk->scales, ib + QK_K / 16, sm); + } + blk->d = GGML_FP32_TO_FP16(d_val); + blk->dmin = GGML_FP32_TO_FP16(dmin_val); + + // Initial level assignment + for (int ib = 0; ib < QK_K / 16; ++ib) { + const float range = d_val * (float) iq3kl_sc_get(blk->scales, ib); + const float sub_min = -dmin_val * (float) iq3kl_sc_get(blk->scales, ib + QK_K / 16); + const float inv_range = range > 1e-6f ? 
1.f / range : 0.f; + for (int j = 0; j < 16; ++j) { + const int elem = ib * 16 + j; + const float t = (xbl[elem] - sub_min) * inv_range; + int best = 0; + float best_d2 = (t - L[0]) * (t - L[0]); + for (int k = 1; k < IQ3KL_N_LEVELS; ++k) { + const float d2 = (t - L[k]) * (t - L[k]); + if (d2 < best_d2) { + best_d2 = d2; + best = k; + } + } + iq3kl_pack3(blk->qs, elem, best); + } + } + + // Iterative refinement: weighted LS for (range, neg_min) + importance-weighted d/dmin. + for (int iter = 0; iter < IQ3KL_REFINE_ITERS; ++iter) { + for (int ib = 0; ib < QK_K / 16; ++ib) { + double sA = 0, sB = 0, sC = 0, sD = 0, sE = 0; + for (int j = 0; j < 16; ++j) { + const int elem = ib * 16 + j; + const float xj = xbl[elem]; + const float w = quant_weights ? quant_weights[QK_K * ibl + elem] * sqrtf(sigma2 + xj * xj) : 1.0f; + const float lq = L[iq3kl_unpack3(blk->qs, elem)]; + sA += (double) w * (double) lq * (double) lq; + sB += (double) w * (double) lq; + sC += (double) w; + sD += (double) w * (double) xj * (double) lq; + sE += (double) w * (double) xj; + } + const double det = sA * sC - sB * sB; + if (det < 1e-20) { + continue; + } + const float new_range = (float) ((sD * sC - sE * sB) / det); + const float new_negmin = (float) (-(sE * sA - sD * sB) / det); + sub_ranges[ib] = new_range > 0.f ? new_range : 0.f; + neg_mins[ib] = new_negmin > 0.f ? new_negmin : 0.f; + } + + // Importance-weighted d/dmin search on updated sub_ranges/neg_mins + d_val = iq3kl_find_optimal_d(sub_ranges, w_ib, QK_K / 16, 63); + dmin_val = iq3kl_find_optimal_d(neg_mins, w_ib, QK_K / 16, 63); + + // Re-pack scales + memset(blk->scales, 0, sizeof(blk->scales)); + const float inv_d2 = d_val > 0 ? 1.f / d_val : 0.f; + const float inv_dmin2 = dmin_val > 0 ? 
1.f / dmin_val : 0.f; + for (int ib = 0; ib < QK_K / 16; ++ib) { + uint8_t sc = MIN(63, nearest_int(inv_d2 * sub_ranges[ib])); + uint8_t sm = MIN(63, nearest_int(inv_dmin2 * neg_mins[ib])); + iq3kl_sc_set(blk->scales, ib, sc); + iq3kl_sc_set(blk->scales, ib + QK_K / 16, sm); + } + blk->d = GGML_FP32_TO_FP16(d_val); + blk->dmin = GGML_FP32_TO_FP16(dmin_val); + + // Re-assign levels + memset(blk->qs, 0, sizeof(blk->qs)); + for (int ib = 0; ib < QK_K / 16; ++ib) { + const float range = d_val * (float) iq3kl_sc_get(blk->scales, ib); + const float sub_min = -dmin_val * (float) iq3kl_sc_get(blk->scales, ib + QK_K / 16); + const float inv_range = range > 1e-6f ? 1.f / range : 0.f; + for (int j = 0; j < 16; ++j) { + const int elem = ib * 16 + j; + const float t = (xbl[elem] - sub_min) * inv_range; + int best = 0; + float best_d2 = (t - L[0]) * (t - L[0]); + for (int k = 1; k < IQ3KL_N_LEVELS; ++k) { + const float d2 = (t - L[k]) * (t - L[k]); + if (d2 < best_d2) { + best_d2 = d2; + best = k; + } + } + iq3kl_pack3(blk->qs, elem, best); + } + } + } + } +} + +size_t quantize_q3_pt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + GGML_ASSERT(n_per_row % QK_K == 0); + int64_t nblock = n_per_row / QK_K; + char * qrow = (char *)dst; + for (int64_t row = 0; row < nrow; ++row) { + quantize_row_q3_pt_impl(src, qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += nblock * sizeof(block_q3_pt); + } + return nrow * nblock * sizeof(block_q3_pt); +} + +void quantize_row_q3_pt_ref(const float * GGML_RESTRICT x, block_q3_pt * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + quantize_q3_pt(x, y, 1, k, NULL); +} // =================================== 1.5 bpw =================================================== @@ -5307,6 +5795,10 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte { VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb); } break; + case GGML_TYPE_Q3_PT: + 
{ + VALIDATE_ROW_DATA_D_F16_IMPL(block_q3_pt, data, nb); + } break; case GGML_TYPE_I8: case GGML_TYPE_I16: @@ -5323,3 +5815,4 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte return true; } + diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 3b688f31c21..78abfde2e13 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -96,6 +96,26 @@ GGML_API size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTR GGML_API size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API void quantize_row_q3_pt_ref(const float * GGML_RESTRICT x, block_q3_pt * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q3_pt(const block_q3_pt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API size_t quantize_q3_pt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); + +// Q3_PT levels management (per-tensor Lloyd-Max levels in [0,1]) +GGML_API void iq3kl_set_levels(const float * levels); // set global levels (quantization) +GGML_API const float * iq3kl_get_levels(void); +GGML_API void iq3kl_free_levels(void); + +// Per-tensor levels registry (inference — range-based lookup by data address) +GGML_API void iq3kl_register_tensor_levels(const void * data, size_t nbytes, const float * levels); +GGML_API void iq3kl_clear_tensor_levels(void); +GGML_API const float * iq3kl_get_tensor_levels(const void * data_ptr); + +// Train 8 Lloyd-Max levels from tensor data via weighted k-means on affine-normalized +// 16-element sub-block values. Also sets the global levels via iq3kl_set_levels(). +// data: float array [nrow * n_per_row], imatrix: importance weights [n_per_row] or NULL. 
+GGML_API void iq3kl_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, float levels_out[8]); + GGML_API void iq2xs_init_impl(enum ggml_type type); GGML_API void iq2xs_free_impl(enum ggml_type type); GGML_API void iq3xs_init_impl(int grid_size); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index e9529fbb662..7869e340fd3 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -896,6 +896,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .type_size = 0, .is_quantized = false, }, + [GGML_TYPE_Q3_PT] = { + .type_name = "q3_pt", + .blck_size = QK_K, + .type_size = sizeof(block_q3_pt), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q3_pt, + .from_float_ref = (ggml_from_float_t) quantize_row_q3_pt_ref, + }, }; const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) { @@ -1386,6 +1394,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break; case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break; case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break; + case GGML_FTYPE_MOSTLY_Q3_PT: wtype = GGML_TYPE_Q3_PT; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; } @@ -7530,6 +7539,7 @@ void ggml_quantize_init(enum ggml_type type) { case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break; case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break; case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break; + case GGML_TYPE_Q3_PT: break; // levels set externally via iq3kl_set_levels() default: // nothing break; } @@ -7606,6 +7616,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case 
GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q3_PT: result = quantize_q3_pt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index cbeedf6c4b6..0dc97d0b7fd 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -1327,37 +1327,63 @@ struct gguf_writer_base { if (kv.is_array) { write(GGUF_TYPE_ARRAY); - write(kv.get_type()); + const enum gguf_type elem_type = kv.get_type(); + write(elem_type); write(ne); + // Write array element data based on element type + switch (elem_type) { + case GGUF_TYPE_UINT8: + case GGUF_TYPE_INT8: + case GGUF_TYPE_UINT16: + case GGUF_TYPE_INT16: + case GGUF_TYPE_UINT32: + case GGUF_TYPE_INT32: + case GGUF_TYPE_FLOAT32: + case GGUF_TYPE_UINT64: + case GGUF_TYPE_INT64: + case GGUF_TYPE_FLOAT64: { + // Write raw bytes inline for array data + for (size_t i = 0; i < kv.data.size(); ++i) { + write(kv.data[i]); + } + } break; + case GGUF_TYPE_BOOL: { + for (size_t i = 0; i < ne; ++i) { + write(kv.get_val(i)); + } + } break; + case GGUF_TYPE_STRING: { + for (size_t i = 0; i < ne; ++i) { + write(kv.get_val(i)); + } + } break; + case GGUF_TYPE_ARRAY: + default: GGML_ABORT("invalid array element type"); + } } else { write(kv.get_type()); - } - - switch (kv.get_type()) { - case GGUF_TYPE_UINT8: - case GGUF_TYPE_INT8: - case GGUF_TYPE_UINT16: - case GGUF_TYPE_INT16: - case GGUF_TYPE_UINT32: - case GGUF_TYPE_INT32: - case GGUF_TYPE_FLOAT32: - case GGUF_TYPE_UINT64: - case GGUF_TYPE_INT64: - case GGUF_TYPE_FLOAT64: { - write(kv.data); - } break; - case GGUF_TYPE_BOOL: { - for (size_t i = 0; i < ne; ++i) { - write(kv.get_val(i)); - } - } break; - case GGUF_TYPE_STRING: { - for (size_t i = 0; i < ne; ++i) { - write(kv.get_val(i)); - } - } break; - case GGUF_TYPE_ARRAY: - default: GGML_ABORT("invalid type"); + 
switch (kv.get_type()) { + case GGUF_TYPE_UINT8: + case GGUF_TYPE_INT8: + case GGUF_TYPE_UINT16: + case GGUF_TYPE_INT16: + case GGUF_TYPE_UINT32: + case GGUF_TYPE_INT32: + case GGUF_TYPE_FLOAT32: + case GGUF_TYPE_UINT64: + case GGUF_TYPE_INT64: + case GGUF_TYPE_FLOAT64: { + write(kv.data); + } break; + case GGUF_TYPE_BOOL: { + write(kv.get_val(0)); + } break; + case GGUF_TYPE_STRING: { + write(kv.get_val(0)); + } break; + case GGUF_TYPE_ARRAY: + default: GGML_ABORT("invalid type"); + } } } diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 839c6e787fc..a1a771ccfdf 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -3773,6 +3773,7 @@ class GGMLQuantizationType(IntEnum): TQ1_0 = 34 TQ2_0 = 35 MXFP4 = 39 + Q3_PT = 40 class ExpertGatingFuncType(IntEnum): @@ -3930,6 +3931,7 @@ class VisionProjectorType: GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13), GGMLQuantizationType.TQ2_0: (256, 2 + 64), GGMLQuantizationType.MXFP4: (32, 1 + 16), + GGMLQuantizationType.Q3_PT: (256, 2 + 2 + 3 * 256 // 32 + 3 * 256 // 8), } diff --git a/include/llama.h b/include/llama.h index 077f66dc651..2eee1ab0865 100644 --- a/include/llama.h +++ b/include/llama.h @@ -152,6 +152,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q3_PT = 39, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 1501e392ca8..78ab5e8a877 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -1,6 +1,7 @@ #include "llama-model-loader.h" #include "ggml.h" +#include "llama.h" #include #include @@ -54,6 +55,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw"; case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw"; case 
LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw"; + case LLAMA_FTYPE_MOSTLY_Q3_PT: return "Q3_PT - 3.875 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw"; @@ -709,6 +711,7 @@ llama_model_loader::llama_model_loader( case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; + case GGML_TYPE_Q3_PT: ftype = LLAMA_FTYPE_MOSTLY_Q3_PT; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index dabf3b3086e..0dda192142b 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -15,6 +15,13 @@ #include "models/models.h" +// Q3_PT levels functions (defined in ggml-quants.c) +extern "C" { + void iq3kl_set_levels(const float * levels); + void iq3kl_register_tensor_levels(const void * data, size_t nbytes, const float * levels); + void iq3kl_clear_tensor_levels(void); +} + #include #include #include @@ -7850,6 +7857,47 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } + // Q3_PT: load per-tensor levels from GGUF metadata and register them. + // Must happen AFTER load_all_data so tensor data pointers are valid. 
+ { + static const size_t IQ3KL_N_LEVELS = 8; + int64_t lv_idx = gguf_find_key(ml.meta.get(), "q3_pt.levels"); + if (lv_idx >= 0) { + const float * lv_data = (const float *)gguf_get_arr_data(ml.meta.get(), lv_idx); + const size_t lv_len = gguf_get_arr_n(ml.meta.get(), lv_idx); + + // Build tensor-name to slot index map (GGUF file order = quantizer order) + std::unordered_map name_to_slot; + { + const int64_t n_tensors = gguf_get_n_tensors(ml.meta.get()); + for (int64_t ti = 0; ti < n_tensors; ++ti) { + name_to_slot[gguf_get_tensor_name(ml.meta.get(), ti)] = (size_t)ti; + } + } + + iq3kl_clear_tensor_levels(); + int n_registered = 0; + + for (auto & [ctx, buf_map] : ctx_buf_maps) { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type != GGML_TYPE_Q3_PT || t->data == nullptr) { continue; } + auto it = name_to_slot.find(ggml_get_name(t)); + if (it == name_to_slot.end()) { continue; } + const size_t lv_offset = it->second * IQ3KL_N_LEVELS; + if (lv_offset + IQ3KL_N_LEVELS > lv_len) { continue; } + iq3kl_register_tensor_levels(t->data, ggml_nbytes(t), lv_data + lv_offset); + if (n_registered == 0) { + iq3kl_set_levels(lv_data + lv_offset); // global fallback + } + n_registered++; + } + } + if (n_registered > 0) { + LLAMA_LOG_INFO("%s: registered %d Q3_PT per-tensor level tables\n", __func__, n_registered); + } + } + } + if (use_mmap_buffer) { for (auto & mapping : ml.mappings) { pimpl->mappings.emplace_back(std::move(mapping)); diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 24770430e1c..6c3e8a613ec 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1,7 +1,9 @@ #include "llama-quant.h" +#include "ggml.h" #include "llama-impl.h" #include "llama-model.h" #include "llama-model-loader.h" +#include "llama.h" #include #include @@ -13,6 +15,13 @@ #include #include +// Q3_PT levels functions (defined in ggml-quants.c) +extern "C" { + void iq3kl_train_levels(const float * data, int64_t nrow, 
int64_t n_per_row, + const float * imatrix, float levels_out[8]); + void iq3kl_set_levels(const float * levels); +} + // Quantization types. Changes to this struct must be replicated in quantize.cpp struct tensor_quantization { std::string name; @@ -248,6 +257,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ3_S; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT) { + new_type = GGML_TYPE_IQ4_XS; + } else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) { new_type = GGML_TYPE_Q4_K; } @@ -286,6 +298,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT) { + new_type = GGML_TYPE_Q3_PT; + } else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) { new_type = GGML_TYPE_Q4_K; } @@ -355,6 +370,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT) { + new_type = GGML_TYPE_IQ4_XS; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { if (arch == LLM_ARCH_FALCON) { new_type = i_layer < n_layer/16 ? 
GGML_TYPE_Q6_K : @@ -484,6 +502,7 @@ static bool tensor_type_requires_imatrix(const ggml_tensor * t, const ggml_type dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S || dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_M || + dst_type == GGML_TYPE_Q3_PT || ( // Q2_K_S is the worst k-quant type - only allow it without imatrix for token embeddings dst_type == GGML_TYPE_Q2_K && ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(t->name, "token_embd.weight") != 0 ) @@ -531,6 +550,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; + case LLAMA_FTYPE_MOSTLY_Q3_PT: default_type = GGML_TYPE_Q3_PT; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -747,6 +767,83 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const auto tn = LLM_TN(model.arch); + // Q3_PT two-pass approach: train all per-tensor levels BEFORE opening the output + // file, so the levels KV entry is already populated at the time of the metadata placeholder. 
+ static const size_t IQ3KL_N_LEVELS = 8; + std::vector iq3kl_all_levels; // indexed by position in tensors[] + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT && !params->dry_run) { + LLAMA_LOG_INFO("%s: Q3_PT pass 1: training per-tensor levels...\n", __func__); + iq3kl_all_levels.assign(tensors.size() * IQ3KL_N_LEVELS, 0.0f); + + // Temporary dequant buffer for pass 1 (reuse f32_conv_buf / read_data declared below) + std::vector> p1_read_data; + std::vector> p1_f32_buf; + std::vector p1_workers; + p1_workers.reserve(nthread); + + for (size_t ti = 0; ti < tensors.size(); ++ti) { + ggml_tensor * tensor = tensors[ti]->tensor; + const std::string tname = ggml_get_name(tensor); + + // Determine whether this tensor will be Q3_PT (mirror the pass-2 logic) + bool quantize = tname.rfind("weight") == tname.size() - 6; + quantize &= (ggml_n_dims(tensor) >= 2); + quantize &= tname.find("_norm.weight") == std::string::npos; + quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos; + if (!quantize) { continue; } + + ggml_type new_type = default_type; + if (!params->pure) { + new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); + } + if (new_type != GGML_TYPE_Q3_PT) { continue; } + + // Load tensor data + const size_t tsz = ggml_nbytes(tensor); + if (!ml.use_mmap) { + if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); } + tensor->data = p1_read_data.data(); + } + ml.load_data_for(tensor); + + // Dequantize to f32 if needed + const int64_t nelements = ggml_nelements(tensor); + float * f32_data; + if (tensor->type == GGML_TYPE_F32) { + f32_data = (float *) tensor->data; + } else { + llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread); + f32_data = (float *) p1_f32_buf.data(); + } + + // Resolve imatrix + const float * imatrix = nullptr; + if (imatrix_data) { + auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped)); + if (it2 != imatrix_data->end() && + it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) { + 
imatrix = it2->second.data(); + } + } + + const int64_t n_per_row = tensor->ne[0]; + const int64_t nrows = tensor->ne[1]; + + LLAMA_LOG_INFO("%s: Q3_PT levels for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name); + iq3kl_train_levels(f32_data, nrows, n_per_row, imatrix, + iq3kl_all_levels.data() + ti * IQ3KL_N_LEVELS); + } + + // All levels ready — store in GGUF metadata before the file is opened + for (auto & ctx : ctx_outs) { + if (ctx) { + gguf_set_arr_data(ctx.get(), "q3_pt.levels", GGUF_TYPE_FLOAT32, + iq3kl_all_levels.data(), iq3kl_all_levels.size()); + } + } + LLAMA_LOG_INFO("%s: Q3_PT pass 1 complete.\n", __func__); + } + // no output file for --dry-run if (!params->dry_run) { new_ofstream(0); @@ -755,6 +852,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // flag for `--dry-run`, to let the user know if imatrix will be required for a real // quantization, as a courtesy bool will_require_imatrix = false; + size_t tensor_pass2_idx = 0; // index into tensors[], used for Q3_PT levels lookup for (const auto * it : tensors) { const auto & weight = *it; @@ -846,7 +944,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: new_type = default_type; // get more optimal quantization type based on the tensor shape, layer, etc. 
- if (!params->pure && ggml_is_quantized(default_type)) { + if (ggml_is_quantized(default_type)) { // if the user provided tensor types - use those bool manual = false; if (params->tensor_types) { @@ -865,7 +963,7 @@ } // if not manual - use the standard logic for choosing the quantization type based on the selected mixture - if (!manual) { + if (!manual && !params->pure) { new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); } @@ -1012,6 +1110,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1; + // Q3_PT: set the per-tensor levels (trained in pass 1) as global for quantization + if (new_type == GGML_TYPE_Q3_PT) { + iq3kl_set_levels(iq3kl_all_levels.data() + tensor_pass2_idx * IQ3KL_N_LEVELS); + } + // quantize each expert separately since they have different importance matrices new_size = 0; for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) { @@ -1058,6 +1161,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: fout.write((const char *) new_data, new_size); zeros(fout, GGML_PAD(new_size, align) - new_size); } // no --dry-run + + tensor_pass2_idx++; } // iterate over tensors if (!params->dry_run) { diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 59bf9bd3fd0..526a3935556 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -37,6 +37,7 @@ static const std::vector QUANT_OPTIONS = { { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", }, { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", }, { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", }, + { "Q3_PT", LLAMA_FTYPE_MOSTLY_Q3_PT, " 3.875 bpw quantization", }, { "IQ3_S", 
LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", }, { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, @@ -159,6 +160,9 @@ static void usage(const char * executable) { printf(" WARNING: this is an advanced option, use with care.\n"); printf(" --dry-run\n"); printf(" calculate and show the final quantization size without performing quantization\n"); + printf(" --threads n\n"); + printf(" number of threads to use for cross-tensor parallelization (default: 0, use same as within-tensor)\n"); + printf(" when n > 0, enables parallel quantization of multiple tensors simultaneously\n"); printf(" example: llama-quantize --dry-run model-f32.gguf Q4_K\n\n"); printf("note: --include-weights and --exclude-weights cannot be used together\n\n"); printf("-----------------------------------------------------------------------------\n"); From aca2e6ce7909064dbfbf3bd168e9ad3db1826e6c Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Thu, 26 Feb 2026 23:18:35 +0100 Subject: [PATCH 2/9] Missing renames --- ggml/src/ggml-common.h | 2 +- ggml/src/ggml-cpu/ggml-cpu.c | 2 +- ggml/src/ggml-cpu/quants.c | 2 +- ggml/src/ggml-quants.c | 156 +++++++++++++++++------------------ ggml/src/ggml-quants.h | 16 ++-- ggml/src/ggml.c | 2 +- src/llama-model.cpp | 18 ++-- src/llama-quant.cpp | 18 ++-- 8 files changed, 108 insertions(+), 108 deletions(-) diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index cd99d6e643b..b1a2fa4f1a5 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -438,7 +438,7 @@ typedef struct { } block_q3_pt; static_assert(sizeof(block_q3_pt) == 124, "wrong q3_pt block size"); -#define IQ3KL_N_LEVELS 8 +#define Q3PT_N_LEVELS 8 
#endif // GGML_COMMON_DECL #endif // GGML_COMMON_DECL diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 03f58c45f17..314d5c1ccae 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -385,7 +385,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .nrows = 1, }, [GGML_TYPE_Q3_PT] = { - // from_float not set — requires codebook initialization via iq3kl_set_codebook() + // from_float not set — requires codebook initialization via q3pt_set_codebook() .vec_dot = ggml_vec_dot_q3_pt_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index 7165d448253..f3e912112e9 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -1014,7 +1014,7 @@ void ggml_vec_dot_q3_pt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const int nb = n / QK_K; - const float * levels = iq3kl_get_tensor_levels(vx); + const float * levels = q3pt_get_tensor_levels(vx); GGML_ASSERT(levels != NULL && "Q3_PT levels not set for tensor"); float sumf = 0.f; diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 1d53400de13..6802f0e5235 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -4083,66 +4083,66 @@ void quantize_row_iq3_s_ref(const float * GGML_RESTRICT x, block_iq3_s * GGML_RE // 16-element sub-block values. Stored in GGUF as "q3_pt.levels" (float32 array). 
// Global levels (used during quantization for the current tensor) -static float iq3kl_levels[IQ3KL_N_LEVELS]; -static bool iq3kl_levels_set = false; +static float q3pt_levels[Q3PT_N_LEVELS]; +static bool q3pt_levels_set = false; -void iq3kl_set_levels(const float * levels) { - memcpy(iq3kl_levels, levels, IQ3KL_N_LEVELS * sizeof(float)); - iq3kl_levels_set = true; +void q3pt_set_levels(const float * levels) { + memcpy(q3pt_levels, levels, Q3PT_N_LEVELS * sizeof(float)); + q3pt_levels_set = true; } -const float * iq3kl_get_levels(void) { - return iq3kl_levels_set ? iq3kl_levels : NULL; +const float * q3pt_get_levels(void) { + return q3pt_levels_set ? q3pt_levels : NULL; } -void iq3kl_free_levels(void) { - iq3kl_levels_set = false; +void q3pt_free_levels(void) { + q3pt_levels_set = false; } // Per-tensor levels registry for inference (range-based lookup by data address) -#define IQ3KL_MAX_TENSORS 1024 +#define Q3PT_MAX_TENSORS 1024 typedef struct { const void * data; size_t nbytes; - float levels[IQ3KL_N_LEVELS]; -} iq3kl_tensor_entry; - -static iq3kl_tensor_entry iq3kl_tensor_registry[IQ3KL_MAX_TENSORS]; -static int iq3kl_tensor_registry_count = 0; - -GGML_API void iq3kl_register_tensor_levels(const void * data, size_t nbytes, const float * levels) { - if (iq3kl_tensor_registry_count >= IQ3KL_MAX_TENSORS) { return; } - for (int i = 0; i < iq3kl_tensor_registry_count; ++i) { - if (iq3kl_tensor_registry[i].data == data) { - iq3kl_tensor_registry[i].nbytes = nbytes; - memcpy(iq3kl_tensor_registry[i].levels, levels, IQ3KL_N_LEVELS * sizeof(float)); + float levels[Q3PT_N_LEVELS]; +} q3pt_tensor_entry; + +static q3pt_tensor_entry q3pt_tensor_registry[Q3PT_MAX_TENSORS]; +static int q3pt_tensor_registry_count = 0; + +GGML_API void q3pt_register_tensor_levels(const void * data, size_t nbytes, const float * levels) { + if (q3pt_tensor_registry_count >= Q3PT_MAX_TENSORS) { return; } + for (int i = 0; i < q3pt_tensor_registry_count; ++i) { + if (q3pt_tensor_registry[i].data == 
data) { + q3pt_tensor_registry[i].nbytes = nbytes; + memcpy(q3pt_tensor_registry[i].levels, levels, Q3PT_N_LEVELS * sizeof(float)); return; } } - iq3kl_tensor_registry[iq3kl_tensor_registry_count].data = data; - iq3kl_tensor_registry[iq3kl_tensor_registry_count].nbytes = nbytes; - memcpy(iq3kl_tensor_registry[iq3kl_tensor_registry_count].levels, levels, IQ3KL_N_LEVELS * sizeof(float)); - iq3kl_tensor_registry_count++; + q3pt_tensor_registry[q3pt_tensor_registry_count].data = data; + q3pt_tensor_registry[q3pt_tensor_registry_count].nbytes = nbytes; + memcpy(q3pt_tensor_registry[q3pt_tensor_registry_count].levels, levels, Q3PT_N_LEVELS * sizeof(float)); + q3pt_tensor_registry_count++; } -GGML_API void iq3kl_clear_tensor_levels(void) { - iq3kl_tensor_registry_count = 0; +GGML_API void q3pt_clear_tensor_levels(void) { + q3pt_tensor_registry_count = 0; } -GGML_API const float * iq3kl_get_tensor_levels(const void * data_ptr) { +GGML_API const float * q3pt_get_tensor_levels(const void * data_ptr) { const uint8_t * p = (const uint8_t *)data_ptr; - for (int i = 0; i < iq3kl_tensor_registry_count; ++i) { - const uint8_t * base = (const uint8_t *)iq3kl_tensor_registry[i].data; - if (p >= base && p < base + iq3kl_tensor_registry[i].nbytes) { - return iq3kl_tensor_registry[i].levels; + for (int i = 0; i < q3pt_tensor_registry_count; ++i) { + const uint8_t * base = (const uint8_t *)q3pt_tensor_registry[i].data; + if (p >= base && p < base + q3pt_tensor_registry[i].nbytes) { + return q3pt_tensor_registry[i].levels; } } - return iq3kl_get_levels(); + return q3pt_get_levels(); } -void iq3kl_train_levels(const float * data, int64_t nrow, int64_t n_per_row, - const float * imatrix, float levels_out[IQ3KL_N_LEVELS]) { +void q3pt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, float levels_out[Q3PT_N_LEVELS]) { const int64_t n_sub = n_per_row / 16; // 16-element sub-blocks per row @@ -4184,15 +4184,15 @@ void iq3kl_train_levels(const float * 
data, int64_t nrow, int64_t n_per_row, } // Initialize 8 levels uniformly in [0, 1] - float levels[IQ3KL_N_LEVELS]; - for (int k = 0; k < IQ3KL_N_LEVELS; ++k) { - levels[k] = (float)k / (IQ3KL_N_LEVELS - 1); + float levels[Q3PT_N_LEVELS]; + for (int k = 0; k < Q3PT_N_LEVELS; ++k) { + levels[k] = (float)k / (Q3PT_N_LEVELS - 1); } // Lloyd-Max (weighted k-means) iterations with early convergence for (int iter = 0; iter < 300; ++iter) { - float sum_w [IQ3KL_N_LEVELS] = {0}; - float sum_wt[IQ3KL_N_LEVELS] = {0}; + float sum_w [Q3PT_N_LEVELS] = {0}; + float sum_wt[Q3PT_N_LEVELS] = {0}; // Process bins instead of individual values for (int b = 0; b < N_BINS; ++b) { @@ -4200,7 +4200,7 @@ void iq3kl_train_levels(const float * data, int64_t nrow, int64_t n_per_row, const float t = (b + 0.5f) * bin_width; // representative value at bin center int best = 0; float best_d2 = (t - levels[0]) * (t - levels[0]); - for (int k = 1; k < IQ3KL_N_LEVELS; ++k) { + for (int k = 1; k < Q3PT_N_LEVELS; ++k) { float d2 = (t - levels[k]) * (t - levels[k]); if (d2 < best_d2) { best_d2 = d2; best = k; } } @@ -4210,7 +4210,7 @@ void iq3kl_train_levels(const float * data, int64_t nrow, int64_t n_per_row, // Check for early convergence float max_delta = 0.0f; - for (int k = 0; k < IQ3KL_N_LEVELS; ++k) { + for (int k = 0; k < Q3PT_N_LEVELS; ++k) { if (sum_w[k] > 1e-12f) { float new_level = sum_wt[k] / sum_w[k]; max_delta = fmaxf(max_delta, fabsf(new_level - levels[k])); @@ -4220,15 +4220,15 @@ void iq3kl_train_levels(const float * data, int64_t nrow, int64_t n_per_row, if (max_delta < 1e-10f) break; // Keep levels sorted (insertion sort — 8 elements) - for (int k = 1; k < IQ3KL_N_LEVELS; ++k) { + for (int k = 1; k < Q3PT_N_LEVELS; ++k) { float v = levels[k]; int m = k - 1; while (m >= 0 && levels[m] > v) { levels[m+1] = levels[m]; m--; } levels[m+1] = v; } } - memcpy(levels_out, levels, IQ3KL_N_LEVELS * sizeof(float)); - iq3kl_set_levels(levels); + memcpy(levels_out, levels, Q3PT_N_LEVELS * 
sizeof(float)); + q3pt_set_levels(levels); free(bin_sum_w); free(bin_sum_wt); } @@ -4237,7 +4237,7 @@ void iq3kl_train_levels(const float * data, int64_t nrow, int64_t n_per_row, // 6-bit sequential packing: 32 values in 24 bytes (4 values per 3 bytes). // Indices 0..15 = sub-block ranges, 16..31 = sub-block neg_mins. -static inline uint8_t iq3kl_sc_get(const uint8_t * GGML_RESTRICT sc, int i) { +static inline uint8_t q3pt_sc_get(const uint8_t * GGML_RESTRICT sc, int i) { const int bit = i * 6; const int byte = bit / 8; const int off = bit % 8; @@ -4246,7 +4246,7 @@ static inline uint8_t iq3kl_sc_get(const uint8_t * GGML_RESTRICT sc, int i) { return val; } -static inline void iq3kl_sc_set(uint8_t * GGML_RESTRICT sc, int i, uint8_t v) { +static inline void q3pt_sc_set(uint8_t * GGML_RESTRICT sc, int i, uint8_t v) { const int bit = i * 6; const int byte = bit / 8; const int off = bit % 8; @@ -4255,7 +4255,7 @@ static inline void iq3kl_sc_set(uint8_t * GGML_RESTRICT sc, int i, uint8_t v) { } // 3-bit sequential packing: 256 values in 96 bytes (8 values per 3 bytes). 
-static inline int iq3kl_unpack3(const uint8_t * GGML_RESTRICT qs, int k) { +static inline int q3pt_unpack3(const uint8_t * GGML_RESTRICT qs, int k) { const int bit = k * 3; const int byte = bit / 8; const int off = bit % 8; @@ -4264,7 +4264,7 @@ static inline int iq3kl_unpack3(const uint8_t * GGML_RESTRICT qs, int k) { return val; } -static inline void iq3kl_pack3(uint8_t * GGML_RESTRICT qs, int k, int v) { +static inline void q3pt_pack3(uint8_t * GGML_RESTRICT qs, int k, int v) { const int bit = k * 3; const int byte = bit / 8; const int off = bit % 8; @@ -4275,7 +4275,7 @@ static inline void iq3kl_pack3(uint8_t * GGML_RESTRICT qs, int k, int v) { void dequantize_row_q3_pt(const block_q3_pt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; - const float * L = iq3kl_get_tensor_levels(x); + const float * L = q3pt_get_tensor_levels(x); GGML_ASSERT(L != NULL && "Q3_PT levels not set for tensor"); for (int i = 0; i < nb; i++) { @@ -4285,10 +4285,10 @@ void dequantize_row_q3_pt(const block_q3_pt * GGML_RESTRICT x, float * GGML_REST const uint8_t * qs = x[i].qs; for (int ib = 0; ib < QK_K/16; ++ib) { - const float range = d * (float)iq3kl_sc_get(sc, ib); - const float sub_min = -dmin * (float)iq3kl_sc_get(sc, ib + QK_K/16); + const float range = d * (float)q3pt_sc_get(sc, ib); + const float sub_min = -dmin * (float)q3pt_sc_get(sc, ib + QK_K/16); for (int j = 0; j < 16; ++j) { - const int q = iq3kl_unpack3(qs, ib*16 + j); + const int q = q3pt_unpack3(qs, ib*16 + j); y[ib*16 + j] = L[q] * range + sub_min; } } @@ -4296,7 +4296,7 @@ void dequantize_row_q3_pt(const block_q3_pt * GGML_RESTRICT x, float * GGML_REST } } -#define IQ3KL_REFINE_ITERS 5 +#define Q3PT_REFINE_ITERS 5 // Find the optimal global d-scale for 6-bit (nmax=63) sub-block range quantization, // minimizing Σ_i weights[i] * (vals[i] - d * clamp(round(vals[i]/d), 0, nmax))^2. 
@@ -4304,7 +4304,7 @@ void dequantize_row_q3_pt(const block_q3_pt * GGML_RESTRICT x, float * GGML_REST // Without imatrix all weights are equal and the winner is always max/nmax, so this is a no-op. // With imatrix it can redirect scale resolution to important sub-blocks at the cost of // less important ones that would otherwise dominate via raw max(). -static float iq3kl_find_optimal_d(const float * GGML_RESTRICT vals, +static float q3pt_find_optimal_d(const float * GGML_RESTRICT vals, const float * GGML_RESTRICT weights, int n, int nmax) { float max_val = 0.f; @@ -4330,12 +4330,12 @@ static void quantize_row_q3_pt_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) { - GGML_ASSERT(iq3kl_levels_set && "Q3_PT levels not set - call iq3kl_set_levels() first"); + GGML_ASSERT(q3pt_levels_set && "Q3_PT levels not set - call q3pt_set_levels() first"); GGML_ASSERT(n % QK_K == 0); const int64_t nbl = n / QK_K; block_q3_pt * y = (block_q3_pt *) vy; - const float * L = iq3kl_levels; + const float * L = q3pt_levels; for (int ibl = 0; ibl < nbl; ++ibl) { const float * xbl = x + QK_K * ibl; @@ -4350,7 +4350,7 @@ static void quantize_row_q3_pt_impl(const float * GGML_RESTRICT x, } // Per-sub-block importance weights: sum of AWQ weights over 16 elements. - // Used by iq3kl_find_optimal_d() to direct scale resolution toward important sub-blocks. + // Used by q3pt_find_optimal_d() to direct scale resolution toward important sub-blocks. 
float w_ib[QK_K / 16]; for (int ib = 0; ib < QK_K / 16; ++ib) { float wsum = 0.f; @@ -4401,7 +4401,7 @@ static void quantize_row_q3_pt_impl(const float * GGML_RESTRICT x, const float t = (xj - sub_min0) * inv_range0; int best = 0; float best_d2 = (t - L[0]) * (t - L[0]); - for (int k = 1; k < IQ3KL_N_LEVELS; ++k) { + for (int k = 1; k < Q3PT_N_LEVELS; ++k) { const float d2 = (t - L[k]) * (t - L[k]); if (d2 < best_d2) { best_d2 = d2; @@ -4429,8 +4429,8 @@ static void quantize_row_q3_pt_impl(const float * GGML_RESTRICT x, } // Importance-weighted d/dmin search (replaces plain max/63) - float d_val = iq3kl_find_optimal_d(sub_ranges, w_ib, QK_K / 16, 63); - float dmin_val = iq3kl_find_optimal_d(neg_mins, w_ib, QK_K / 16, 63); + float d_val = q3pt_find_optimal_d(sub_ranges, w_ib, QK_K / 16, 63); + float dmin_val = q3pt_find_optimal_d(neg_mins, w_ib, QK_K / 16, 63); // Quantize ranges and neg_mins to 6-bit memset(blk->scales, 0, sizeof(blk->scales)); @@ -4440,42 +4440,42 @@ static void quantize_row_q3_pt_impl(const float * GGML_RESTRICT x, for (int ib = 0; ib < QK_K / 16; ++ib) { uint8_t sc = MIN(63, nearest_int(inv_d * sub_ranges[ib])); uint8_t sm = MIN(63, nearest_int(inv_dmin * neg_mins[ib])); - iq3kl_sc_set(blk->scales, ib, sc); - iq3kl_sc_set(blk->scales, ib + QK_K / 16, sm); + q3pt_sc_set(blk->scales, ib, sc); + q3pt_sc_set(blk->scales, ib + QK_K / 16, sm); } blk->d = GGML_FP32_TO_FP16(d_val); blk->dmin = GGML_FP32_TO_FP16(dmin_val); // Initial level assignment for (int ib = 0; ib < QK_K / 16; ++ib) { - const float range = d_val * (float) iq3kl_sc_get(blk->scales, ib); - const float sub_min = -dmin_val * (float) iq3kl_sc_get(blk->scales, ib + QK_K / 16); + const float range = d_val * (float) q3pt_sc_get(blk->scales, ib); + const float sub_min = -dmin_val * (float) q3pt_sc_get(blk->scales, ib + QK_K / 16); const float inv_range = range > 1e-6f ? 
1.f / range : 0.f; for (int j = 0; j < 16; ++j) { const int elem = ib * 16 + j; const float t = (xbl[elem] - sub_min) * inv_range; int best = 0; float best_d2 = (t - L[0]) * (t - L[0]); - for (int k = 1; k < IQ3KL_N_LEVELS; ++k) { + for (int k = 1; k < Q3PT_N_LEVELS; ++k) { const float d2 = (t - L[k]) * (t - L[k]); if (d2 < best_d2) { best_d2 = d2; best = k; } } - iq3kl_pack3(blk->qs, elem, best); + q3pt_pack3(blk->qs, elem, best); } } // Iterative refinement: weighted LS for (range, neg_min) + importance-weighted d/dmin. - for (int iter = 0; iter < IQ3KL_REFINE_ITERS; ++iter) { + for (int iter = 0; iter < Q3PT_REFINE_ITERS; ++iter) { for (int ib = 0; ib < QK_K / 16; ++ib) { double sA = 0, sB = 0, sC = 0, sD = 0, sE = 0; for (int j = 0; j < 16; ++j) { const int elem = ib * 16 + j; const float xj = xbl[elem]; const float w = quant_weights ? quant_weights[QK_K * ibl + elem] * sqrtf(sigma2 + xj * xj) : 1.0f; - const float lq = L[iq3kl_unpack3(blk->qs, elem)]; + const float lq = L[q3pt_unpack3(blk->qs, elem)]; sA += (double) w * (double) lq * (double) lq; sB += (double) w * (double) lq; sC += (double) w; @@ -4493,8 +4493,8 @@ static void quantize_row_q3_pt_impl(const float * GGML_RESTRICT x, } // Importance-weighted d/dmin search on updated sub_ranges/neg_mins - d_val = iq3kl_find_optimal_d(sub_ranges, w_ib, QK_K / 16, 63); - dmin_val = iq3kl_find_optimal_d(neg_mins, w_ib, QK_K / 16, 63); + d_val = q3pt_find_optimal_d(sub_ranges, w_ib, QK_K / 16, 63); + dmin_val = q3pt_find_optimal_d(neg_mins, w_ib, QK_K / 16, 63); // Re-pack scales memset(blk->scales, 0, sizeof(blk->scales)); @@ -4503,8 +4503,8 @@ static void quantize_row_q3_pt_impl(const float * GGML_RESTRICT x, for (int ib = 0; ib < QK_K / 16; ++ib) { uint8_t sc = MIN(63, nearest_int(inv_d2 * sub_ranges[ib])); uint8_t sm = MIN(63, nearest_int(inv_dmin2 * neg_mins[ib])); - iq3kl_sc_set(blk->scales, ib, sc); - iq3kl_sc_set(blk->scales, ib + QK_K / 16, sm); + q3pt_sc_set(blk->scales, ib, sc); + q3pt_sc_set(blk->scales, 
ib + QK_K / 16, sm); } blk->d = GGML_FP32_TO_FP16(d_val); blk->dmin = GGML_FP32_TO_FP16(dmin_val); @@ -4512,22 +4512,22 @@ static void quantize_row_q3_pt_impl(const float * GGML_RESTRICT x, // Re-assign levels memset(blk->qs, 0, sizeof(blk->qs)); for (int ib = 0; ib < QK_K / 16; ++ib) { - const float range = d_val * (float) iq3kl_sc_get(blk->scales, ib); - const float sub_min = -dmin_val * (float) iq3kl_sc_get(blk->scales, ib + QK_K / 16); + const float range = d_val * (float) q3pt_sc_get(blk->scales, ib); + const float sub_min = -dmin_val * (float) q3pt_sc_get(blk->scales, ib + QK_K / 16); const float inv_range = range > 1e-6f ? 1.f / range : 0.f; for (int j = 0; j < 16; ++j) { const int elem = ib * 16 + j; const float t = (xbl[elem] - sub_min) * inv_range; int best = 0; float best_d2 = (t - L[0]) * (t - L[0]); - for (int k = 1; k < IQ3KL_N_LEVELS; ++k) { + for (int k = 1; k < Q3PT_N_LEVELS; ++k) { const float d2 = (t - L[k]) * (t - L[k]); if (d2 < best_d2) { best_d2 = d2; best = k; } } - iq3kl_pack3(blk->qs, elem, best); + q3pt_pack3(blk->qs, elem, best); } } } diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 78abfde2e13..fd8501be4fe 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -101,19 +101,19 @@ GGML_API void dequantize_row_q3_pt(const block_q3_pt * GGML_RESTRICT x, float * GGML_API size_t quantize_q3_pt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); // Q3_PT levels management (per-tensor Lloyd-Max levels in [0,1]) -GGML_API void iq3kl_set_levels(const float * levels); // set global levels (quantization) -GGML_API const float * iq3kl_get_levels(void); -GGML_API void iq3kl_free_levels(void); +GGML_API void q3pt_set_levels(const float * levels); // set global levels (quantization) +GGML_API const float * q3pt_get_levels(void); +GGML_API void q3pt_free_levels(void); // Per-tensor levels registry (inference — range-based lookup by data address) -GGML_API 
void iq3kl_register_tensor_levels(const void * data, size_t nbytes, const float * levels); -GGML_API void iq3kl_clear_tensor_levels(void); -GGML_API const float * iq3kl_get_tensor_levels(const void * data_ptr); +GGML_API void q3pt_register_tensor_levels(const void * data, size_t nbytes, const float * levels); +GGML_API void q3pt_clear_tensor_levels(void); +GGML_API const float * q3pt_get_tensor_levels(const void * data_ptr); // Train 8 Lloyd-Max levels from tensor data via weighted k-means on affine-normalized -// 16-element sub-block values. Also sets the global levels via iq3kl_set_levels(). +// 16-element sub-block values. Also sets the global levels via q3pt_set_levels(). // data: float array [nrow * n_per_row], imatrix: importance weights [n_per_row] or NULL. -GGML_API void iq3kl_train_levels(const float * data, int64_t nrow, int64_t n_per_row, +GGML_API void q3pt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, float levels_out[8]); GGML_API void iq2xs_init_impl(enum ggml_type type); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 7869e340fd3..5ef37ca9e64 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -7539,7 +7539,7 @@ void ggml_quantize_init(enum ggml_type type) { case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break; case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break; case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break; - case GGML_TYPE_Q3_PT: break; // levels set externally via iq3kl_set_levels() + case GGML_TYPE_Q3_PT: break; // levels set externally via q3pt_set_levels() default: // nothing break; } diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 0dda192142b..1843654adfe 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -17,9 +17,9 @@ // Q3_PT levels functions (defined in ggml-quants.c) extern "C" { - void iq3kl_set_levels(const float * levels); - void iq3kl_register_tensor_levels(const void * data, size_t nbytes, const float * levels); - void iq3kl_clear_tensor_levels(void); + 
void q3pt_set_levels(const float * levels); + void q3pt_register_tensor_levels(const void * data, size_t nbytes, const float * levels); + void q3pt_clear_tensor_levels(void); } #include @@ -7860,7 +7860,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // Q3_PT: load per-tensor levels from GGUF metadata and register them. // Must happen AFTER load_all_data so tensor data pointers are valid. { - static const size_t IQ3KL_N_LEVELS = 8; + static const size_t Q3PT_N_LEVELS = 8; int64_t lv_idx = gguf_find_key(ml.meta.get(), "q3_pt.levels"); if (lv_idx >= 0) { const float * lv_data = (const float *)gguf_get_arr_data(ml.meta.get(), lv_idx); @@ -7875,7 +7875,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } - iq3kl_clear_tensor_levels(); + q3pt_clear_tensor_levels(); int n_registered = 0; for (auto & [ctx, buf_map] : ctx_buf_maps) { @@ -7883,11 +7883,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (t->type != GGML_TYPE_Q3_PT || t->data == nullptr) { continue; } auto it = name_to_slot.find(ggml_get_name(t)); if (it == name_to_slot.end()) { continue; } - const size_t lv_offset = it->second * IQ3KL_N_LEVELS; - if (lv_offset + IQ3KL_N_LEVELS > lv_len) { continue; } - iq3kl_register_tensor_levels(t->data, ggml_nbytes(t), lv_data + lv_offset); + const size_t lv_offset = it->second * Q3PT_N_LEVELS; + if (lv_offset + Q3PT_N_LEVELS > lv_len) { continue; } + q3pt_register_tensor_levels(t->data, ggml_nbytes(t), lv_data + lv_offset); if (n_registered == 0) { - iq3kl_set_levels(lv_data + lv_offset); // global fallback + q3pt_set_levels(lv_data + lv_offset); // global fallback } n_registered++; } diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 6c3e8a613ec..95ed79bdc27 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -17,9 +17,9 @@ // Q3_PT levels functions (defined in ggml-quants.c) extern "C" { - void iq3kl_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + void q3pt_train_levels(const float * 
data, int64_t nrow, int64_t n_per_row, const float * imatrix, float levels_out[8]); - void iq3kl_set_levels(const float * levels); + void q3pt_set_levels(const float * levels); } // Quantization types. Changes to this struct must be replicated in quantize.cpp @@ -769,11 +769,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // Q3_PT two-pass approach: train all per-tensor levels BEFORE opening the output // file, so the levels KV entry is already populated at the time of the metadata placeholder. - static const size_t IQ3KL_N_LEVELS = 8; - std::vector iq3kl_all_levels; // indexed by position in tensors[] + static const size_t Q3PT_N_LEVELS = 8; + std::vector q3pt_all_levels; // indexed by position in tensors[] if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT && !params->dry_run) { LLAMA_LOG_INFO("%s: Q3_PT pass 1: training per-tensor levels...\n", __func__); - iq3kl_all_levels.assign(tensors.size() * IQ3KL_N_LEVELS, 0.0f); + q3pt_all_levels.assign(tensors.size() * Q3PT_N_LEVELS, 0.0f); // Temporary dequant buffer for pass 1 (reuse f32_conv_buf / read_data declared below) std::vector> p1_read_data; @@ -830,15 +830,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const int64_t nrows = tensor->ne[1]; LLAMA_LOG_INFO("%s: Q3_PT levels for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name); - iq3kl_train_levels(f32_data, nrows, n_per_row, imatrix, - iq3kl_all_levels.data() + ti * IQ3KL_N_LEVELS); + q3pt_train_levels(f32_data, nrows, n_per_row, imatrix, + q3pt_all_levels.data() + ti * Q3PT_N_LEVELS); } // All levels ready — store in GGUF metadata before the file is opened for (auto & ctx : ctx_outs) { if (ctx) { gguf_set_arr_data(ctx.get(), "q3_pt.levels", GGUF_TYPE_FLOAT32, - iq3kl_all_levels.data(), iq3kl_all_levels.size()); + q3pt_all_levels.data(), q3pt_all_levels.size()); } } LLAMA_LOG_INFO("%s: Q3_PT pass 1 complete.\n", __func__); @@ -1112,7 +1112,7 @@ static void 
llama_model_quantize_impl(const std::string & fname_inp, const std:: // Q3_PT: set the per-tensor levels (trained in pass 1) as global for quantization if (new_type == GGML_TYPE_Q3_PT) { - iq3kl_set_levels(iq3kl_all_levels.data() + tensor_pass2_idx * IQ3KL_N_LEVELS); + q3pt_set_levels(q3pt_all_levels.data() + tensor_pass2_idx * Q3PT_N_LEVELS); } // quantize each expert separately since they have different importance matrices From 4fd75dfe7a95068aa4b4d8749363e6e06397db28 Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Fri, 27 Feb 2026 18:05:11 +0100 Subject: [PATCH 3/9] Q3_KPT --- ggml/include/ggml.h | 4 +- ggml/src/ggml-common.h | 7 + ggml/src/ggml-cpu/ggml-cpu.c | 6 + ggml/src/ggml-cpu/ops.cpp | 1 + ggml/src/ggml-cpu/quants.c | 75 ++++++ ggml/src/ggml-cpu/quants.h | 2 + ggml/src/ggml-quants.c | 499 ++++++++++++++++++++++++++++++++++- ggml/src/ggml-quants.h | 13 + ggml/src/ggml.c | 11 + include/llama.h | 1 + src/llama-model-loader.cpp | 2 + src/llama-model.cpp | 47 ++++ src/llama-quant.cpp | 111 +++++++- tools/quantize/quantize.cpp | 1 + 14 files changed, 765 insertions(+), 15 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index a4f9ab70f32..7b0e7ec1235 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -428,7 +428,8 @@ extern "C" { // GGML_TYPE_IQ4_NL_8_8 = 38, GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block) GGML_TYPE_Q3_PT = 40, // 3.875 bpw per-tensor Lloyd-Max, 16-elem affine sub-blocks - GGML_TYPE_COUNT = 41, + GGML_TYPE_Q3_KPT = 41, // Q3_K with learned per-tensor levels (3.4375 bpw) + GGML_TYPE_COUNT = 42, }; // precision @@ -465,6 +466,7 @@ extern "C" { GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors + GGML_FTYPE_MOSTLY_Q3_KPT = 27, // except 1d tensors }; // available tensor operations: diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index b1a2fa4f1a5..43e6f6b6f86 100644 --- a/ggml/src/ggml-common.h +++ 
b/ggml/src/ggml-common.h @@ -276,6 +276,7 @@ typedef struct { } block_q2_K; static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); + // 3-bit quantization // weight is represented as x = a * q // 16 blocks of 16 elements each @@ -305,6 +306,12 @@ typedef struct { } block_q4_K; static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding"); +// Q3_KPT: Q3_K with learned per-tensor levels +// Reuses block_q3_K structure but maps 3-bit indices through learned level table +typedef block_q3_K block_q3_kpt; +#define Q3KPT_N_LEVELS 8 + + // 5-bit quantization // 8 blocks of 32 elements each // weight is represented as x = a * q + b diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 314d5c1ccae..8d3b90cc0de 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -390,6 +390,12 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, + [GGML_TYPE_Q3_KPT] = { + // from_float not set — requires level initialization via q3kpt_set_levels() + .vec_dot = ggml_vec_dot_q3_kpt_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, [GGML_TYPE_I32] = { .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32, }, diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 67424ecc00e..b7cd334b8f3 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -5557,6 +5557,7 @@ void ggml_compute_forward_clamp( case GGML_TYPE_MXFP4: case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: + case GGML_TYPE_Q3_KPT: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index f3e912112e9..ccfe593b00e 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -1100,6 +1100,81 @@ void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = 
sumf; } +// Q3_KPT vec_dot - similar to Q3_K but with learned levels +void ggml_vec_dot_q3_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q3_kpt * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + const float * levels = q3kpt_get_tensor_levels(vx); + GGML_ASSERT(levels != NULL && "Q3_KPT levels not set for tensor"); + + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d); + const float yd = y[i].d; + const uint8_t * q = x[i].qs; + const uint8_t * hm = x[i].hmask; + const int8_t * q8 = y[i].qs; + uint8_t m = 1; + + uint32_t aux32[4]; + memcpy(aux32, x[i].scales, 12); + uint32_t tmp = aux32[2]; + aux32[2] = ((aux32[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + aux32[3] = ((aux32[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + aux32[0] = (aux32[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + aux32[1] = (aux32[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + const uint8_t * aux = (const uint8_t *)aux32; + + int is = 0; + float block_sum = 0.f; + for (int blk = 0; blk < QK_K; blk += 128) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + int sc1 = (int)aux[is] - 32; + int sc2 = (int)aux[is+1] - 32; + is += 2; + float dl1 = d_all * sc1; + float dl2 = d_all * sc2; + + float sum1 = 0.f, sum2 = 0.f; + for (int l = 0; l < 16; ++l) { + int k_idx = ((q[l+0] >> shift) & 3) + ((hm[l+0] & m) ? 4 : 0); + sum1 += (levels[k_idx] * 7.0f - 4.0f) * (float)q8[l+0]; + } + for (int l = 0; l < 16; ++l) { + int k_idx = ((q[l+16] >> shift) & 3) + ((hm[l+16] & m) ? 
4 : 0); + sum2 += (levels[k_idx] * 7.0f - 4.0f) * (float)q8[l+16]; + } + block_sum += dl1 * sum1 + dl2 * sum2; + + shift += 2; + m <<= 1; + q8 += 32; + } + q += 32; + } + sumf += block_sum * yd; + } + *s = sumf; +} + +void ggml_vec_dot_q3_kpt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + ggml_vec_dot_q3_kpt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); +} + void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); diff --git a/ggml/src/ggml-cpu/quants.h b/ggml/src/ggml-cpu/quants.h index 9e4cd2179b2..24d08d6d67c 100644 --- a/ggml/src/ggml-cpu/quants.h +++ b/ggml/src/ggml-cpu/quants.h @@ -62,6 +62,8 @@ void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q3_pt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q3_kpt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q3_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); // Generic implementation void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 
6802f0e5235..3de383cb6a4 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -4071,16 +4071,495 @@ void quantize_row_iq3_s_ref(const float * GGML_RESTRICT x, block_iq3_s * GGML_RE quantize_iq3_s(x, y, 1, k, NULL); } -// ====================== Q3_PT: 3.875 bpw per-tensor Lloyd-Max ====================== +// ====================== Q3_KPT: Q3_K with learned per-tensor levels ====================== // -// Block format (124 bytes per QK_K=256 elements): -// d (2 bytes): global scale for 16-element sub-block ranges -// dmin (2 bytes): global scale for sub-block neg_mins (-sub_min) -// scales[24] : 32 × 6-bit values (0..15 = ranges, 16..31 = neg_mins) -// qs[96] : 256 × 3-bit indices into 8-entry per-tensor Lloyd-Max level table +// Block format: Identical to block_q3_K (110 bytes per QK_K=256 elements) +// hmask[QK_K/8] : high bit for 3-bit indices +// qs[QK_K/4] : low 2 bits for 3-bit indices +// scales[12] : 6-bit quantized scales +// d : super-block scale // -// Per-tensor: 8 float32 "levels" in [0,1] from Lloyd-Max training on affine-normalized -// 16-element sub-block values. Stored in GGUF as "q3_pt.levels" (float32 array). +// The difference from Q3_K: instead of q ∈ {-4,-3,-2,-1,0,1,2,3}, +// we use learned levels L[0..7] and compute: x = d * sc * (L[k] - 4) +// where k is the 3-bit index. +// +// Per-tensor: 8 float32 "levels" in [0,1] from Lloyd-Max training. +// Stored in GGUF as "q3_kpt.levels" (float32 array). + +static float q3kpt_levels[Q3KPT_N_LEVELS]; +static bool q3kpt_levels_set = false; + +GGML_API void q3kpt_set_levels(const float * levels) { + memcpy(q3kpt_levels, levels, Q3KPT_N_LEVELS * sizeof(float)); + q3kpt_levels_set = true; +} + +GGML_API const float * q3kpt_get_levels(void) { + return q3kpt_levels_set ? 
q3kpt_levels : NULL; +} + +GGML_API void q3kpt_free_levels(void) { + q3kpt_levels_set = false; +} + +// Per-tensor levels registry for inference +#define Q3KPT_MAX_TENSORS 1024 + +typedef struct { + const void * data; + size_t nbytes; + float levels[Q3KPT_N_LEVELS]; +} q3kpt_tensor_entry; + +static q3kpt_tensor_entry q3kpt_tensor_registry[Q3KPT_MAX_TENSORS]; +static int q3kpt_tensor_registry_count = 0; + +GGML_API void q3kpt_register_tensor_levels(const void * data, size_t nbytes, const float * levels) { + if (q3kpt_tensor_registry_count >= Q3KPT_MAX_TENSORS) { return; } + for (int i = 0; i < q3kpt_tensor_registry_count; ++i) { + if (q3kpt_tensor_registry[i].data == data) { + q3kpt_tensor_registry[i].nbytes = nbytes; + memcpy(q3kpt_tensor_registry[i].levels, levels, Q3KPT_N_LEVELS * sizeof(float)); + return; + } + } + q3kpt_tensor_registry[q3kpt_tensor_registry_count].data = data; + q3kpt_tensor_registry[q3kpt_tensor_registry_count].nbytes = nbytes; + memcpy(q3kpt_tensor_registry[q3kpt_tensor_registry_count].levels, levels, Q3KPT_N_LEVELS * sizeof(float)); + q3kpt_tensor_registry_count++; +} + +GGML_API void q3kpt_clear_tensor_levels(void) { + q3kpt_tensor_registry_count = 0; +} + +GGML_API const float * q3kpt_get_tensor_levels(const void * data_ptr) { + const uint8_t * p = (const uint8_t *)data_ptr; + for (int i = 0; i < q3kpt_tensor_registry_count; ++i) { + const uint8_t * base = (const uint8_t *)q3kpt_tensor_registry[i].data; + if (p >= base && p < base + q3kpt_tensor_registry[i].nbytes) { + return q3kpt_tensor_registry[i].levels; + } + } + return q3kpt_get_levels(); +} + +// Train levels in the symmetric quantization space +GGML_API void q3kpt_train_levels(const float * data, + int64_t nrow, + int64_t n_per_row, + const float * imatrix, + float levels_out[Q3KPT_N_LEVELS]) { + // Binning parameters + const int N_BINS = 8192; + const float bin_width = 1.0f / N_BINS; + float * bin_sum_w = (float *) calloc(N_BINS, sizeof(float)); + float * bin_sum_wt = (float *) 
calloc(N_BINS, sizeof(float)); + GGML_ASSERT(bin_sum_w && bin_sum_wt); + + const int nb = (int) (n_per_row / QK_K); + + // Single pass: use simple max_abs/4 scale estimation per sub-block, then bin + for (int64_t row = 0; row < nrow; ++row) { + const float * xrow = data + row * n_per_row; + + for (int i = 0; i < nb; i++) { + const float * x = xrow + i * QK_K; + + for (int j = 0; j < QK_K / 16; ++j) { + // Simple symmetric scale: max_abs / 4 + float amax = 0; + for (int l = 0; l < 16; ++l) { + float ax = fabsf(x[16 * j + l]); + if (ax > amax) { + amax = ax; + } + } + if (amax < 1e-10f) { + continue; + } + + float d = amax / 4.0f; + float inv_d = 1.0f / d; + + for (int l = 0; l < 16; ++l) { + float val = x[16 * j + l] * inv_d; + // Map from [-4, 3] symmetric space to [0, 1] + float t = (val + 4.0f) / 7.0f; + + if (t < 0.0f) { + t = 0.0f; + } + if (t > 1.0f) { + t = 1.0f; + } + + int bin_idx = (int) (t * N_BINS); + if (bin_idx >= N_BINS) { + bin_idx = N_BINS - 1; + } + + int elem = i * QK_K + 16 * j + l; + float w = imatrix ? 
imatrix[elem] : 1.0f; + if (w < 1e-10f) { + w = 1e-10f; + } + w *= d * d; + + bin_sum_w[bin_idx] += w; + bin_sum_wt[bin_idx] += w * t; + } + } + } + } + + // Initialize 8 levels uniformly in [0, 1] + float levels[Q3KPT_N_LEVELS]; + for (int k = 0; k < Q3KPT_N_LEVELS; ++k) { + levels[k] = (float) k / (Q3KPT_N_LEVELS - 1); + } + + // Lloyd-Max iterations on bins + for (int iter = 0; iter < 100; ++iter) { + float sum_w[Q3KPT_N_LEVELS] = { 0 }; + float sum_wt[Q3KPT_N_LEVELS] = { 0 }; + + for (int b = 0; b < N_BINS; ++b) { + if (bin_sum_w[b] < 1e-12f) { + continue; + } + const float t = (b + 0.5f) * bin_width; + int best = 0; + float best_d2 = (t - levels[0]) * (t - levels[0]); + for (int k = 1; k < Q3KPT_N_LEVELS; ++k) { + float d2 = (t - levels[k]) * (t - levels[k]); + if (d2 < best_d2) { + best_d2 = d2; + best = k; + } + } + sum_w[best] += bin_sum_w[b]; + sum_wt[best] += bin_sum_wt[b]; + } + + float max_delta = 0.0f; + for (int k = 0; k < Q3KPT_N_LEVELS; ++k) { + if (sum_w[k] > 1e-12f) { + float new_level = sum_wt[k] / sum_w[k]; + max_delta = fmaxf(max_delta, fabsf(new_level - levels[k])); + levels[k] = new_level; + } + } + if (max_delta < 1e-10f) { + break; + } + + for (int k = 1; k < Q3KPT_N_LEVELS; ++k) { + float v = levels[k]; + int m = k - 1; + while (m >= 0 && levels[m] > v) { + levels[m + 1] = levels[m]; + m--; + } + levels[m + 1] = v; + } + } + + memcpy(levels_out, levels, Q3KPT_N_LEVELS * sizeof(float)); + q3kpt_set_levels(levels); + free(bin_sum_w); + free(bin_sum_wt); +} + +void dequantize_row_q3_kpt(const block_q3_kpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + const float * levels = q3kpt_get_tensor_levels(x); + GGML_ASSERT(levels != NULL && "Q3_KPT levels not set for tensor"); + + // levels are in [0,1], map to approximate [-4, 3] range for Q3_K compatibility + // The dequant formula: y = d * sc * (L[k] * 8 - 4) = d * sc * (L[k] - 0.5) * 8 + // But simpler: store shifted levels and 
use: y = d * sc * L_shifted[k] + // where L_shifted[k] = (L[k] - 0.5) * 8 or just use (L[k] - 4) if L is in [0,7] + + // Actually, let's use: reconstructed = d * sc * (L[k] - 4) + // where L[k] is in [0, 7] (shifted from [0,1]) + + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + for (int i = 0; i < nb; i++) { + const float d_all = GGML_FP16_TO_FP32(x[i].d); + const uint8_t * q = x[i].qs; + const uint8_t * hm = x[i].hmask; + uint8_t m = 1; + + uint32_t aux32[4]; + memcpy(aux32, x[i].scales, 12); + uint32_t tmp = aux32[2]; + aux32[2] = ((aux32[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + aux32[3] = ((aux32[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + aux32[0] = (aux32[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + aux32[1] = (aux32[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + const uint8_t * aux = (const uint8_t *) aux32; + + int is = 0; + for (int n = 0; n < QK_K; n += 128) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + int sc1 = (int) aux[is] - 32; + int sc2 = (int) aux[is + 1] - 32; + is += 2; + float dl1 = d_all * sc1; + float dl2 = d_all * sc2; + + for (int l = 0; l < 16; ++l) { + int k_idx = ((q[l + 0] >> shift) & 3) + ((hm[l + 0] & m) ? 4 : 0); + y[l + 0] = dl1 * (levels[k_idx] * 7.0f - 4.0f); + } + for (int l = 0; l < 16; ++l) { + int k_idx = ((q[l + 16] >> shift) & 3) + ((hm[l + 16] & m) ? 4 : 0); + y[l + 16] = dl2 * (levels[k_idx] * 7.0f - 4.0f); + } + y += 32; + shift += 2; + m <<= 1; + } + q += 32; + } + } +} + +// Helper: find optimal symmetric scale for non-uniform mapped levels. +// Closely mirrors make_qx_quants but uses nearest-mapped-level assignment +// instead of rounding to nearest integer. +// mapped_levels[k] = levels[k]*7 - 4, k=0..7. +// Returns the per-sub-block scale d such that x[i] ≈ d * ml[L[i]]. +// L[i] gets the best level index [0..7]. 
+static float make_q3kpt_quants(int n, + const float * GGML_RESTRICT x, + int8_t * GGML_RESTRICT L, + const float * GGML_RESTRICT weight, + const float * mapped_levels) { + // Find the most negative and most positive mapped levels + float ml_neg = mapped_levels[0], ml_pos = mapped_levels[Q3KPT_N_LEVELS - 1]; + + // Precompute boundaries for branchless nearest-level search + float bounds[Q3KPT_N_LEVELS - 1]; + for (int k = 0; k < Q3KPT_N_LEVELS - 1; ++k) { + bounds[k] = 0.5f * (mapped_levels[k] + mapped_levels[k + 1]); + } + + // Find max absolute value in data (and its sign) + float max = 0, amax = 0; + for (int i = 0; i < n; ++i) { + float ax = fabsf(x[i]); + if (ax > amax) { + amax = ax; + max = x[i]; + } + } + if (amax < GROUP_MAX_EPS) { + // Find level closest to 0 + int zero_k = 0; + float zero_d = fabsf(mapped_levels[0]); + for (int k = 1; k < Q3KPT_N_LEVELS; ++k) { + if (fabsf(mapped_levels[k]) < zero_d) { + zero_d = fabsf(mapped_levels[k]); + zero_k = k; + } + } + for (int i = 0; i < n; ++i) { + L[i] = zero_k; + } + return 0.f; + } + + float best_scale = 0; + float best_obj = 0; + bool first = true; + + for (int is = -15; is <= 15; ++is) { + float iscales[2] = { + -(fabsf(ml_neg) + 0.1f * is) / max, // map max to ml_neg (Q3_K style) + (fabsf(ml_pos) + 0.1f * is) / max // map max to ml_pos + }; + + for (int opt = 0; opt < 2; ++opt) { + float iscale = iscales[opt]; + + float sumlx = 0, suml2 = 0; + for (int i = 0; i < n; ++i) { + float scaled = x[i] * iscale; + // Branchless nearest level assignment + int best_k = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]) + + (scaled > bounds[3]) + (scaled > bounds[4]) + (scaled > bounds[5]) + (scaled > bounds[6]); + float w = weight ? 
weight[i] : x[i] * x[i]; + sumlx += w * x[i] * mapped_levels[best_k]; + suml2 += w * mapped_levels[best_k] * mapped_levels[best_k]; + } + + if (suml2 > 0 && (first || sumlx * sumlx > best_obj * suml2)) { + float scale = sumlx / suml2; + best_obj = scale * sumlx; + best_scale = scale; + first = false; + // Re-assign L with this iscale + for (int i = 0; i < n; ++i) { + float scaled = x[i] * iscale; + int best_k = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]) + + (scaled > bounds[3]) + (scaled > bounds[4]) + (scaled > bounds[5]) + (scaled > bounds[6]); + L[i] = best_k; + } + } + } + } + return best_scale; +} + +static void quantize_row_q3_kpt_impl(const float * GGML_RESTRICT x, + block_q3_kpt * GGML_RESTRICT y, + int64_t n_per_row, + const float * GGML_RESTRICT quant_weights) { + assert(n_per_row % QK_K == 0); + const int nb = n_per_row / QK_K; + const float * levels = q3kpt_get_levels(); + GGML_ASSERT(levels != NULL && "Q3_KPT levels not set - call q3kpt_set_levels() first"); + + // Precompute mapped levels: ml[k] = levels[k] * 7 - 4 + float mapped_levels[Q3KPT_N_LEVELS]; + for (int k = 0; k < Q3KPT_N_LEVELS; ++k) { + mapped_levels[k] = levels[k] * 7.0f - 4.0f; + } + + // Precompute boundaries for branchless nearest-level search + float bounds[Q3KPT_N_LEVELS - 1]; + for (int k = 0; k < Q3KPT_N_LEVELS - 1; ++k) { + bounds[k] = 0.5f * (mapped_levels[k] + mapped_levels[k + 1]); + } + + int8_t L[QK_K]; + float scales[QK_K / 16]; + float weight[16]; + float sw[QK_K / 16]; + int8_t Ls[QK_K / 16]; + + for (int i = 0; i < nb; i++) { + float sumx2 = 0; + for (int j = 0; j < QK_K; ++j) { + sumx2 += x[j] * x[j]; + } + float sigma2 = 2 * sumx2 / QK_K; + + // First pass: find per-sub-block scales optimized for mapped levels + for (int j = 0; j < QK_K / 16; ++j) { + if (quant_weights) { + const float * qw = quant_weights + QK_K * i + 16 * j; + for (int l = 0; l < 16; ++l) { + weight[l] = qw[l] * sqrtf(sigma2 + x[16 * j + l] * x[16 * j + l]); + } + } else { + 
for (int l = 0; l < 16; ++l) { + weight[l] = x[16 * j + l] * x[16 * j + l]; + } + } + float sumw = 0; + for (int l = 0; l < 16; ++l) { + sumw += weight[l]; + } + sw[j] = sumw; + + scales[j] = make_q3kpt_quants(16, x + 16 * j, L + 16 * j, weight, mapped_levels); + } + + // Two-tier scale quantization (identical to Q3_K) + memset(y[i].scales, 0, 12); + float d_block = make_qx_quants(QK_K / 16, 32, scales, Ls, 1, sw); + for (int j = 0; j < QK_K / 16; ++j) { + int l = Ls[j]; + if (j < 8) { + y[i].scales[j] = l & 0xF; + } else { + y[i].scales[j - 8] |= ((l & 0xF) << 4); + } + l >>= 4; + y[i].scales[j % 4 + 8] |= (l << (2 * (j / 4))); + } + y[i].d = GGML_FP32_TO_FP16(d_block); + + // Second pass: level assignment using the quantized scales but + // assigning nearest LEARNED LEVEL instead of nearest integer + int8_t sc; + for (int j = 0; j < QK_K / 16; ++j) { + sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j - 8] >> 4; + sc = (sc | (((y[i].scales[8 + j % 4] >> (2 * (j / 4))) & 3) << 4)) - 32; + float d = GGML_FP16_TO_FP32(y[i].d) * sc; + if (!d) { + // Find level closest to 0 for zero-scale sub-blocks + int zero_k = 0; + float zero_dist = fabsf(mapped_levels[0]); + for (int k = 1; k < Q3KPT_N_LEVELS; ++k) { + if (fabsf(mapped_levels[k]) < zero_dist) { + zero_dist = fabsf(mapped_levels[k]); + zero_k = k; + } + } + for (int ii = 0; ii < 16; ++ii) { + L[16 * j + ii] = zero_k; + } + continue; + } + for (int ii = 0; ii < 16; ++ii) { + float scaled = x[16 * j + ii] / d; + // Branchless nearest level assignment + int best_k = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]) + + (scaled > bounds[3]) + (scaled > bounds[4]) + (scaled > bounds[5]) + (scaled > bounds[6]); + L[16 * j + ii] = best_k; + } + } + + // Pack level indices (same bit layout as Q3_K) + memset(y[i].hmask, 0, QK_K / 8); + int m = 0; + uint8_t hm = 1; + for (int j = 0; j < QK_K; ++j) { + if (L[j] > 3) { + y[i].hmask[m] |= hm; + L[j] -= 4; + } + if (++m == QK_K / 8) { + m = 0; + hm <<= 1; + } + 
} + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + y[i].qs[j / 4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); + } + } + x += QK_K; + } +} + +size_t quantize_q3_kpt(const float * GGML_RESTRICT src, + void * GGML_RESTRICT dst, + int64_t nrow, + int64_t n_per_row, + const float * imatrix) { + size_t row_size = ggml_row_size(GGML_TYPE_Q3_KPT, n_per_row); + char * qrow = (char *) dst; + for (int64_t row = 0; row < nrow; ++row) { + quantize_row_q3_kpt_impl(src, (block_q3_kpt *) qrow, n_per_row, imatrix); + src += n_per_row; + qrow += row_size; + } + return nrow * row_size; +} + +void quantize_row_q3_kpt_ref(const float * GGML_RESTRICT x, block_q3_kpt * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + quantize_q3_kpt(x, y, 1, k, NULL); +} // Global levels (used during quantization for the current tensor) static float q3pt_levels[Q3PT_N_LEVELS]; @@ -5799,6 +6278,10 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte { VALIDATE_ROW_DATA_D_F16_IMPL(block_q3_pt, data, nb); } break; + case GGML_TYPE_Q3_KPT: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_q3_kpt, data, nb); + } break; case GGML_TYPE_I8: case GGML_TYPE_I16: diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index fd8501be4fe..d047972940d 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -25,6 +25,7 @@ GGML_API void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 GGML_API void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_q3_kpt_ref(const float * GGML_RESTRICT x, block_q3_kpt * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, 
block_q5_K * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k); @@ -51,6 +52,7 @@ GGML_API void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_API void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q3_kpt(const block_q3_kpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); @@ -78,6 +80,17 @@ GGML_API size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RE GGML_API size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_q3_kpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); + +// Q3_KPT level management +GGML_API void q3kpt_set_levels(const float * levels); +GGML_API const float * q3kpt_get_levels(void); +GGML_API void q3kpt_free_levels(void); +GGML_API void q3kpt_register_tensor_levels(const void * data, size_t nbytes, const float * levels); +GGML_API void q3kpt_clear_tensor_levels(void); +GGML_API const float * q3kpt_get_tensor_levels(const void * data_ptr); +GGML_API 
void q3kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, float levels_out[Q3KPT_N_LEVELS]); GGML_API size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 5ef37ca9e64..1cae9356c97 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -904,6 +904,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_q3_pt, .from_float_ref = (ggml_from_float_t) quantize_row_q3_pt_ref, }, + [GGML_TYPE_Q3_KPT] = { + .type_name = "q3_kpt", + .blck_size = QK_K, + .type_size = sizeof(block_q3_kpt), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q3_kpt, + .from_float_ref = (ggml_from_float_t) quantize_row_q3_kpt_ref, + }, }; const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) { @@ -1395,6 +1403,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break; case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break; case GGML_FTYPE_MOSTLY_Q3_PT: wtype = GGML_TYPE_Q3_PT; break; + case GGML_FTYPE_MOSTLY_Q3_KPT: wtype = GGML_TYPE_Q3_KPT; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; } @@ -7540,6 +7549,7 @@ void ggml_quantize_init(enum ggml_type type) { case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break; case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break; case GGML_TYPE_Q3_PT: break; // levels set externally via q3pt_set_levels() + case GGML_TYPE_Q3_KPT: break; // levels set externally via q3kpt_set_levels() default: // nothing break; } @@ -7617,6 +7627,7 @@ size_t ggml_quantize_chunk( case 
GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q3_PT: result = quantize_q3_pt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q3_KPT: result = quantize_q3_kpt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); diff --git a/include/llama.h b/include/llama.h index 2eee1ab0865..37a076d89a6 100644 --- a/include/llama.h +++ b/include/llama.h @@ -153,6 +153,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q3_PT = 39, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q3_KPT = 40, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 78ab5e8a877..a3f3aca8813 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -56,6 +56,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw"; case LLAMA_FTYPE_MOSTLY_Q3_PT: return "Q3_PT - 3.25 bpw"; + case LLAMA_FTYPE_MOSTLY_Q3_KPT: return "Q3_KPT - Q3_K with learned levels"; case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw"; @@ -712,6 +713,7 @@ llama_model_loader::llama_model_loader( case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; case GGML_TYPE_Q3_PT: ftype = LLAMA_FTYPE_MOSTLY_Q3_PT; break; + case GGML_TYPE_Q3_KPT: 
ftype = LLAMA_FTYPE_MOSTLY_Q3_KPT; break;
            default:
                {
                    LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 1843654adfe..d0c4d6c69da 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -22,6 +22,13 @@ extern "C" {
 void q3pt_clear_tensor_levels(void);
 }
 
+// Q3_KPT levels functions (defined in ggml-quants.c)
+extern "C" {
+void q3kpt_set_levels(const float * levels);
+void q3kpt_register_tensor_levels(const void * data, size_t nbytes, const float * levels);
+void q3kpt_clear_tensor_levels(void);
+}
+
 #include <algorithm>
 #include <cassert>
 #include <cmath>
@@ -7898,6 +7905,46 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         }
     }
 
+    // Q3_KPT: load per-tensor levels from GGUF metadata and register them.
+    {
+        static const size_t Q3KPT_N_LEVELS = 8;
+        int64_t lv_idx = gguf_find_key(ml.meta.get(), "q3_kpt.levels");
+        if (lv_idx >= 0) {
+            const float * lv_data = (const float *)gguf_get_arr_data(ml.meta.get(), lv_idx);
+            const size_t lv_len = gguf_get_arr_n(ml.meta.get(), lv_idx);
+
+            // Build tensor-name to slot index map (GGUF file order = quantizer order)
+            std::unordered_map<std::string, size_t> name_to_slot;
+            {
+                const int64_t n_tensors = gguf_get_n_tensors(ml.meta.get());
+                for (int64_t ti = 0; ti < n_tensors; ++ti) {
+                    name_to_slot[gguf_get_tensor_name(ml.meta.get(), ti)] = (size_t)ti;
+                }
+            }
+
+            q3kpt_clear_tensor_levels();
+            int n_registered = 0;
+
+            for (auto & [ctx, buf_map] : ctx_buf_maps) {
+                for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+                    if (t->type != GGML_TYPE_Q3_KPT || t->data == nullptr) { continue; }
+                    auto it = name_to_slot.find(ggml_get_name(t));
+                    if (it == name_to_slot.end()) { continue; }
+                    const size_t lv_offset = it->second * Q3KPT_N_LEVELS;
+                    if (lv_offset + Q3KPT_N_LEVELS > lv_len) { continue; }
+                    q3kpt_register_tensor_levels(t->data, ggml_nbytes(t), lv_data + lv_offset);
+                    if (n_registered == 0) {
+                        q3kpt_set_levels(lv_data + 
lv_offset); // global fallback + } + n_registered++; + } + } + if (n_registered > 0) { + LLAMA_LOG_INFO("%s: registered %d Q3_KPT per-tensor level tables\n", __func__, n_registered); + } + } + } + if (use_mmap_buffer) { for (auto & mapping : ml.mappings) { pimpl->mappings.emplace_back(std::move(mapping)); diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 95ed79bdc27..cfcf1a0d99d 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -22,6 +22,13 @@ extern "C" { void q3pt_set_levels(const float * levels); } +// Q3_KPT levels functions (defined in ggml-quants.c) +extern "C" { + void q3kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, float levels_out[8]); + void q3kpt_set_levels(const float * levels); +} + // Quantization types. Changes to this struct must be replicated in quantize.cpp struct tensor_quantization { std::string name; @@ -260,6 +267,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT) { new_type = GGML_TYPE_IQ4_XS; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) { + new_type = GGML_TYPE_Q4_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) { new_type = GGML_TYPE_Q4_K; } @@ -307,7 +317,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) { new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; @@ -358,10 +368,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) { new_type = i_layer < n_layer/8 ? 
GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) { new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K - : GGML_TYPE_Q3_K; + : (ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT ? GGML_TYPE_Q3_KPT : GGML_TYPE_Q3_K); } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 || (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) { @@ -402,13 +412,13 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) { + ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) { new_type = GGML_TYPE_Q5_K; } } else { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K; } @@ -417,7 +427,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } } else if (name.find("attn_qkv.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { + if (ftype == 
LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
@@ -551,6 +561,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         case LLAMA_FTYPE_MOSTLY_IQ3_S:   default_type = GGML_TYPE_IQ3_S;  break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M:   default_type = GGML_TYPE_IQ3_S;  break;
         case LLAMA_FTYPE_MOSTLY_Q3_PT:   default_type = GGML_TYPE_Q3_PT;  break;
+        case LLAMA_FTYPE_MOSTLY_Q3_KPT:  default_type = GGML_TYPE_Q3_KPT; break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -844,6 +855,89 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         LLAMA_LOG_INFO("%s: Q3_PT pass 1 complete.\n", __func__);
     }
 
+    // Q3_KPT two-pass approach: train all per-tensor levels BEFORE opening the output
+    static const size_t Q3KPT_N_LEVELS = 8;
+    std::vector<float> q3kpt_all_levels; // indexed by position in tensors[]
+    if (ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT && !params->dry_run) {
+        LLAMA_LOG_INFO("%s: Q3_KPT pass 1: training per-tensor levels...\n", __func__);
+        q3kpt_all_levels.assign(tensors.size() * Q3KPT_N_LEVELS, 0.0f);
+
+        // Temporary dequant buffer for pass 1
+        std::vector<no_init<uint8_t>> p1_read_data;
+        std::vector<no_init<float>> p1_f32_buf;
+        std::vector<std::thread> p1_workers;
+        p1_workers.reserve(nthread);
+
+        for (size_t ti = 0; ti < tensors.size(); ++ti) {
+            ggml_tensor * tensor = tensors[ti]->tensor;
+            const std::string tname = ggml_get_name(tensor);
+
+            // Determine whether this tensor will be Q3_KPT (mirror the pass-2 logic)
+            bool quantize = tname.rfind("weight") == tname.size() - 6;
+            quantize &= (ggml_n_dims(tensor) >= 2);
+            quantize &= tname.find("_norm.weight") == std::string::npos;
+            quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
+            if (!quantize) { continue; }
+
+            ggml_type new_type = default_type;
+            if (!params->pure) {
+                
new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); + } + if (params->token_embedding_type < GGML_TYPE_COUNT && + (tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) { + new_type = params->token_embedding_type; + } + if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") { + new_type = params->output_tensor_type; + } + if (new_type != GGML_TYPE_Q3_KPT) { continue; } + + // Load tensor data + const size_t tsz = ggml_nbytes(tensor); + if (!ml.use_mmap) { + if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); } + tensor->data = p1_read_data.data(); + } + ml.load_data_for(tensor); + + // Dequantize to f32 if needed + const int64_t nelements = ggml_nelements(tensor); + float * f32_data; + if (tensor->type == GGML_TYPE_F32) { + f32_data = (float *) tensor->data; + } else { + llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread); + f32_data = (float *) p1_f32_buf.data(); + } + + // Resolve imatrix + const float * imatrix = nullptr; + if (imatrix_data) { + auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped)); + if (it2 != imatrix_data->end() && + it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) { + imatrix = it2->second.data(); + } + } + + const int64_t n_per_row = tensor->ne[0]; + const int64_t nrows = tensor->ne[1]; + + LLAMA_LOG_INFO("%s: Q3_KPT levels for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name); + q3kpt_train_levels(f32_data, nrows, n_per_row, imatrix, + q3kpt_all_levels.data() + ti * Q3KPT_N_LEVELS); + } + + // All levels ready — store in GGUF metadata before the file is opened + for (auto & ctx : ctx_outs) { + if (ctx) { + gguf_set_arr_data(ctx.get(), "q3_kpt.levels", GGUF_TYPE_FLOAT32, + q3kpt_all_levels.data(), q3kpt_all_levels.size()); + } + } + LLAMA_LOG_INFO("%s: Q3_KPT pass 1 complete.\n", __func__); + } + // no output file for --dry-run if (!params->dry_run) { new_ofstream(0); @@ -1115,6 +1209,11 @@ static void 
llama_model_quantize_impl(const std::string & fname_inp, const std:: q3pt_set_levels(q3pt_all_levels.data() + tensor_pass2_idx * Q3PT_N_LEVELS); } + // Q3_KPT: set the per-tensor levels (trained in pass 1) as global for quantization + if (new_type == GGML_TYPE_Q3_KPT) { + q3kpt_set_levels(q3kpt_all_levels.data() + tensor_pass2_idx * Q3KPT_N_LEVELS); + } + // quantize each expert separately since they have different importance matrices new_size = 0; for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) { diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 526a3935556..7bfa3fb6601 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -38,6 +38,7 @@ static const std::vector QUANT_OPTIONS = { { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", }, { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", }, { "Q3_PT", LLAMA_FTYPE_MOSTLY_Q3_PT, " 3.25 bpw quantization", }, + { "Q3_KPT", LLAMA_FTYPE_MOSTLY_Q3_KPT, " Q3_K with learned per-tensor levels" }, { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", }, { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, From e395d080c25410f6d3bb85c7e2a601165524bd75 Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Sat, 28 Feb 2026 19:59:00 +0100 Subject: [PATCH 4/9] Q4_DPT (= IQ4_NL with trained kvalues) --- ggml/include/ggml.h | 4 +- ggml/src/ggml-common.h | 5 + ggml/src/ggml-cpu/ggml-cpu.c | 6 + ggml/src/ggml-cpu/ops.cpp | 1 + ggml/src/ggml-cpu/quants.c | 34 ++++ ggml/src/ggml-cpu/quants.h | 2 + ggml/src/ggml-quants.c | 297 +++++++++++++++++++++++++++++++++++ ggml/src/ggml-quants.h | 21 +++ ggml/src/ggml.c | 11 ++ include/llama.h | 1 + src/llama-model-loader.cpp | 2 + src/llama-model.cpp | 48 ++++++ src/llama-quant.cpp | 98 ++++++++++++ tests/CMakeLists.txt | 4 + tests/test-quant-q3kpt.cpp | 191 ++++++++++++++++++++++ tests/test-quant-q4dpt.cpp | 175 
+++++++++++++++++++++ tools/quantize/quantize.cpp | 1 + 17 files changed, 900 insertions(+), 1 deletion(-) create mode 100644 tests/test-quant-q3kpt.cpp create mode 100644 tests/test-quant-q4dpt.cpp diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 7b0e7ec1235..b022730f8db 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -429,7 +429,8 @@ extern "C" { GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block) GGML_TYPE_Q3_PT = 40, // 3.875 bpw per-tensor Lloyd-Max, 16-elem affine sub-blocks GGML_TYPE_Q3_KPT = 41, // Q3_K with learned per-tensor levels (3.4375 bpw) - GGML_TYPE_COUNT = 42, + GGML_TYPE_Q4_DPT = 42, // IQ4_NL with learned per-tensor int8 levels (4.125 bpw) + GGML_TYPE_COUNT = 43, }; // precision @@ -467,6 +468,7 @@ extern "C" { GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors GGML_FTYPE_MOSTLY_Q3_KPT = 27, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_DPT = 28, // except 1d tensors }; // available tensor operations: diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 43e6f6b6f86..11f7a6bb9cf 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -447,6 +447,11 @@ static_assert(sizeof(block_q3_pt) == 124, "wrong q3_pt block size"); #define Q3PT_N_LEVELS 8 +// Q4_DPT: IQ4_NL with learned per-tensor int8 levels (4.125 bpw) +// Block format: identical to block_iq4_nl (2 + 16 = 18 bytes per 32 elements) +typedef block_iq4_nl block_q4_dpt; +#define Q4DPT_N_LEVELS 16 + #endif // GGML_COMMON_DECL #endif // GGML_COMMON_DECL diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 8d3b90cc0de..f4441b32d94 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -396,6 +396,12 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, + [GGML_TYPE_Q4_DPT] = { + // from_float not set — requires level initialization via q4dpt_set_levels() + .vec_dot = 
ggml_vec_dot_q4_dpt_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + }, [GGML_TYPE_I32] = { .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32, }, diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index b7cd334b8f3..73107dcb8f9 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -5558,6 +5558,7 @@ void ggml_compute_forward_clamp( case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: case GGML_TYPE_Q3_KPT: + case GGML_TYPE_Q4_DPT: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index ccfe593b00e..1e90cef3f9c 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -1175,6 +1175,40 @@ void ggml_vec_dot_q3_kpt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v ggml_vec_dot_q3_kpt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); } +void ggml_vec_dot_q4_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK4_NL == 0); + static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); + + const block_q4_dpt * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + const int nb = n / QK4_NL; + + const int8_t * values = q4dpt_get_tensor_levels(vx); + GGML_ASSERT(values != NULL && "Q4_DPT levels not set for tensor"); + + float sumf = 0; + for (int ib = 0; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d) * GGML_CPU_FP16_TO_FP32(x[ib].d); + int32_t blk = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + blk += (int32_t)y[ib].qs[j+ 0] * (int32_t)values[x[ib].qs[j] & 0xf]; + blk += (int32_t)y[ib].qs[j+QK4_NL/2] * (int32_t)values[x[ib].qs[j] >> 4]; + } + sumf += d * (float)blk; + } + *s = sumf; +} + +void ggml_vec_dot_q4_dpt_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const 
void * GGML_RESTRICT vy, size_t by, int nrc) { + ggml_vec_dot_q4_dpt_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); +} + void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); diff --git a/ggml/src/ggml-cpu/quants.h b/ggml/src/ggml-cpu/quants.h index 24d08d6d67c..1377a6a59bd 100644 --- a/ggml/src/ggml-cpu/quants.h +++ b/ggml/src/ggml-cpu/quants.h @@ -64,6 +64,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void ggml_vec_dot_q3_pt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q3_kpt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q3_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q4_dpt_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q4_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); // Generic implementation void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 3de383cb6a4..6c38c553016 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -4561,6 +4561,299 @@ void quantize_row_q3_kpt_ref(const float * GGML_RESTRICT x, block_q3_kpt * GGML_ quantize_q3_kpt(x, y, 1, k, NULL); } +// Forward declaration needed since quantize_row_iq4_nl_impl is defined later in this file. 
+static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, + const float * GGML_RESTRICT x, + ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l, + float * scales, float * weight, uint8_t * L, + const int8_t * values, + const float * quant_weights, + const int ntry); + +// ====================== Q4_DPT: IQ4_NL with learned per-tensor int8 levels ====================== +// +// Block format: identical to block_iq4_nl (18 bytes per QK4_NL=32 elements) +// d : ggml_half — per-block scale +// qs : QK4_NL/2 bytes — 4-bit indices into the 16-entry level table +// +// The difference from IQ4_NL: instead of the fixed kvalues_iq4nl int8 table, +// we use 16 int8 levels learned per-tensor via weighted Lloyd-Max k-means. +// Normalization: symmetric (x/amax), bin domain [-1, 1]. +// Levels stored in GGUF as "q4_dpt.levels" (int8 array, 16 values per tensor). + +static int8_t q4dpt_levels[Q4DPT_N_LEVELS]; +static bool q4dpt_levels_set = false; + +void q4dpt_set_levels(const int8_t * levels) { + memcpy(q4dpt_levels, levels, Q4DPT_N_LEVELS * sizeof(int8_t)); + q4dpt_levels_set = true; +} + +const int8_t * q4dpt_get_levels(void) { + return q4dpt_levels_set ? 
q4dpt_levels : NULL; +} + +void q4dpt_free_levels(void) { + q4dpt_levels_set = false; +} + +// Per-tensor levels registry (inference — range-based lookup by data address) +#define Q4DPT_MAX_TENSORS 1024 + +typedef struct { + const void * data; + size_t nbytes; + int8_t levels[Q4DPT_N_LEVELS]; +} q4dpt_tensor_entry; + +static q4dpt_tensor_entry q4dpt_tensor_registry[Q4DPT_MAX_TENSORS]; +static int q4dpt_tensor_registry_count = 0; + +void q4dpt_register_tensor_levels(const void * data, size_t nbytes, const int8_t * levels) { + if (q4dpt_tensor_registry_count >= Q4DPT_MAX_TENSORS) { return; } + for (int i = 0; i < q4dpt_tensor_registry_count; ++i) { + if (q4dpt_tensor_registry[i].data == data) { + q4dpt_tensor_registry[i].nbytes = nbytes; + memcpy(q4dpt_tensor_registry[i].levels, levels, Q4DPT_N_LEVELS * sizeof(int8_t)); + return; + } + } + q4dpt_tensor_registry[q4dpt_tensor_registry_count].data = data; + q4dpt_tensor_registry[q4dpt_tensor_registry_count].nbytes = nbytes; + memcpy(q4dpt_tensor_registry[q4dpt_tensor_registry_count].levels, levels, Q4DPT_N_LEVELS * sizeof(int8_t)); + q4dpt_tensor_registry_count++; +} + +void q4dpt_clear_tensor_levels(void) { + q4dpt_tensor_registry_count = 0; +} + +const int8_t * q4dpt_get_tensor_levels(const void * data_ptr) { + const uint8_t * p = (const uint8_t *) data_ptr; + for (int i = 0; i < q4dpt_tensor_registry_count; ++i) { + const uint8_t * base = (const uint8_t *) q4dpt_tensor_registry[i].data; + if (p >= base && p < base + q4dpt_tensor_registry[i].nbytes) { + return q4dpt_tensor_registry[i].levels; + } + } + return q4dpt_get_levels(); +} + +// Train 16 Lloyd-Max int8 levels. +// Bins x/amax values from 32-element IQ4_NL-style blocks into [-1,1], +// runs weighted k-means initialized from IQ4_NL, rounds float centroids to sorted int8[16]. 
void q4dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
                        const float * imatrix, int8_t levels_out[Q4DPT_N_LEVELS]) {
    // Histogram resolution over the normalized domain [-1, 1].
    const int N_BINS = 8192;
    const float bin_width = 2.0f / N_BINS;
    float * bin_sum_w = (float *) calloc(N_BINS, sizeof(float));   // sum of weights per bin
    float * bin_sum_wt = (float *) calloc(N_BINS, sizeof(float));  // weighted sum of values per bin
    GGML_ASSERT(bin_sum_w && bin_sum_wt);

    const int64_t n_blocks = n_per_row / QK4_NL;

    // Build weighted histogram: normalize each block by amax, bin into [-1, 1]
    for (int64_t row = 0; row < nrow; ++row) {
        const float * xrow = data + row * n_per_row;
        for (int64_t ib = 0; ib < n_blocks; ++ib) {
            const float * xb = xrow + ib * QK4_NL;
            float amax = 0.0f;
            for (int j = 0; j < QK4_NL; ++j) {
                float ax = fabsf(xb[j]);
                if (ax > amax) { amax = ax; }
            }
            // Skip (near-)zero blocks: they contribute nothing and 1/amax would blow up.
            if (amax < 1e-10f) { continue; }
            const float inv_amax = 1.0f / amax;
            for (int j = 0; j < QK4_NL; ++j) {
                float w = 1.0f;
                if (imatrix) {
                    // imatrix is per-column (n_per_row entries), shared across rows.
                    w = imatrix[ib * QK4_NL + j];
                    if (w < 1e-10f) { w = 1e-10f; }
                }
                // Weight by amax^2 so blocks with larger dynamic range dominate,
                // matching their larger contribution to absolute reconstruction error.
                w *= amax * amax;
                float t = xb[j] * inv_amax;
                int bin_idx = (int)((t + 1.0f) * 0.5f * N_BINS);
                if (bin_idx < 0) { bin_idx = 0; }
                if (bin_idx >= N_BINS) { bin_idx = N_BINS - 1; }
                bin_sum_w[bin_idx] += w;
                bin_sum_wt[bin_idx] += w * t;
            }
        }
    }

    // Initialize from IQ4_NL values (normalized to [-1, 1])
    float levels_f[Q4DPT_N_LEVELS];
    for (int k = 0; k < Q4DPT_N_LEVELS; ++k) {
        levels_f[k] = (float)kvalues_iq4nl[k] / 127.0f;
    }

    // Lloyd-Max iterations: assign each non-empty bin to its nearest level,
    // then move each level to the weighted centroid of its assigned bins.
    for (int iter = 0; iter < 200; ++iter) {
        float sw[Q4DPT_N_LEVELS] = { 0 };
        float swt[Q4DPT_N_LEVELS] = { 0 };
        for (int b = 0; b < N_BINS; ++b) {
            if (bin_sum_w[b] < 1e-12f) { continue; }
            float t = -1.0f + (b + 0.5f) * bin_width;  // bin center
            int best = 0;
            float bd = (t - levels_f[0]) * (t - levels_f[0]);
            for (int k = 1; k < Q4DPT_N_LEVELS; ++k) {
                float d = (t - levels_f[k]) * (t - levels_f[k]);
                if (d < bd) { bd = d; best = k; }
            }
            sw[best] += bin_sum_w[b];
            swt[best] += bin_sum_wt[b];
        }
        float max_delta = 0.0f;
        for (int k = 0; k < Q4DPT_N_LEVELS; ++k) {
            // Levels with no assigned mass keep their previous position.
            if (sw[k] > 1e-12f) {
                float nl = swt[k] / sw[k];
                max_delta = fmaxf(max_delta, fabsf(nl - levels_f[k]));
                levels_f[k] = nl;
            }
        }
        if (max_delta < 1e-10f) { break; }  // converged
        // Insertion sort to keep levels ordered
        for (int k = 1; k < Q4DPT_N_LEVELS; ++k) {
            float v = levels_f[k];
            int m = k - 1;
            while (m >= 0 && levels_f[m] > v) { levels_f[m+1] = levels_f[m]; m--; }
            levels_f[m+1] = v;
        }
    }

    // Round float centroids to int8, preserve sort order
    // (rounding is monotone, so the sorted order survives; duplicates are possible).
    for (int k = 0; k < Q4DPT_N_LEVELS; ++k) {
        int v = (int)roundf(levels_f[k] * 127.0f);
        if (v < -128) { v = -128; }
        if (v > 127) { v = 127; }
        levels_out[k] = (int8_t)v;
    }
    // Publish as the current global table so a subsequent quantize call can use it.
    q4dpt_set_levels(levels_out);

    free(bin_sum_w);
    free(bin_sum_wt);
}

// Dequantize k Q4_DPT-packed values into y. The level table is resolved from
// the per-tensor registry by the address of x (global table as fallback).
void dequantize_row_q4_dpt(const block_q4_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    assert(k % QK4_NL == 0);
    const int64_t nb = k / QK4_NL;
    const int8_t * values = q4dpt_get_tensor_levels(x);
    GGML_ASSERT(values != NULL && "Q4_DPT levels not set for tensor");

    for (int i = 0; i < nb; i++) {
        const uint8_t * qs = x[i].qs;
        const float d = GGML_FP16_TO_FP32(x[i].d);
        // Low nibble -> first half of the block, high nibble -> second half.
        for (int j = 0; j < QK4_NL/2; ++j) {
            y[j] = d * (float)values[qs[j] & 0xf];
            y[j + QK4_NL/2] = d * (float)values[qs[j] >> 4];
        }
        y += QK4_NL;
    }
}

// Quantize one 32-element block using float levels and optimal per-block scale.
// Uses scale perturbation (ntry) to improve accuracy, mirroring quantize_row_iq4_nl_impl.
+static void quantize_block_q4_dpt(const float * GGML_RESTRICT xb, block_q4_dpt * GGML_RESTRICT out, + const int8_t * values, const float * qw, int ntry) { + float amax = 0.0f; + for (int j = 0; j < QK4_NL; ++j) { + float ax = fabsf(xb[j]); + if (ax > amax) { amax = ax; } + } + if (amax < 1e-10f) { + out->d = 0; + memset(out->qs, 0, QK4_NL/2); + return; + } + + // Find magnitude of the level furthest from zero + float max_abs_level = 0.0f; + for (int k = 0; k < Q4DPT_N_LEVELS; ++k) { + float al = fabsf((float)values[k]); + if (al > max_abs_level) { max_abs_level = al; } + } + if (max_abs_level < 1e-10f) { max_abs_level = 1.0f; } + + const float d_base = amax / max_abs_level; + float best_err = 1e30f; + float best_d = d_base; + uint8_t best_L[QK4_NL]; + memset(best_L, 0, sizeof(best_L)); + + for (int itry = -ntry; itry <= ntry; ++itry) { + float d = d_base * (1.0f + (float)itry * (0.1f / ntry)); + if (d < 1e-20f) { continue; } + const float inv_d = 1.0f / d; + + // Nearest-level assignment in scale-normalized space + uint8_t L[QK4_NL]; + for (int j = 0; j < QK4_NL; ++j) { + float t = xb[j] * inv_d; + int bk = 0; + float bd = fabsf(t - (float)values[0]); + for (int k = 1; k < Q4DPT_N_LEVELS; ++k) { + float dist = fabsf(t - (float)values[k]); + if (dist < bd) { bd = dist; bk = k; } + } + L[j] = (uint8_t)bk; + } + + // Optimal scale via weighted least-squares: d* = sum(w*x*lvl) / sum(w*lvl^2) + float num = 0.0f, den = 0.0f; + for (int j = 0; j < QK4_NL; ++j) { + float w = qw ? qw[j] : 1.0f; + float lf = (float)values[L[j]]; + num += w * xb[j] * lf; + den += w * lf * lf; + } + d = den > 1e-20f ? num / den : d_base; + + // Recompute error with optimal d + float err = 0.0f; + for (int j = 0; j < QK4_NL; ++j) { + float w = qw ? 
qw[j] : 1.0f; + float e = xb[j] - d * (float)values[L[j]]; + err += w * e * e; + } + if (err < best_err) { + best_err = err; + best_d = d; + memcpy(best_L, L, QK4_NL); + } + } + + out->d = GGML_FP32_TO_FP16(best_d); + for (int j = 0; j < QK4_NL/2; ++j) { + out->qs[j] = best_L[j] | (best_L[j + QK4_NL/2] << 4); + } +} + +size_t quantize_q4_dpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, + int64_t nrow, int64_t n_per_row, const float * quant_weights) { + GGML_ASSERT(n_per_row % QK4_NL == 0); + const int8_t * values = q4dpt_get_levels(); + GGML_ASSERT(values != NULL && "Q4_DPT levels not set - call q4dpt_set_levels() first"); + + const int64_t nblock = n_per_row / QK4_NL; + char * qrow = (char *) dst; + + for (int64_t row = 0; row < nrow; ++row) { + block_q4_dpt * q4 = (block_q4_dpt *) qrow; + for (int64_t ibl = 0; ibl < nblock; ++ibl) { + const float * qw = quant_weights ? quant_weights + QK4_NL * ibl : NULL; + quantize_block_q4_dpt(src + QK4_NL * ibl, &q4[ibl], values, qw, 7); + } + src += n_per_row; + qrow += nblock * sizeof(block_q4_dpt); + } + return (size_t) nrow * nblock * sizeof(block_q4_dpt); +} + +void quantize_row_q4_dpt_ref(const float * GGML_RESTRICT x, block_q4_dpt * GGML_RESTRICT y, int64_t k) { + assert(k % QK4_NL == 0); + quantize_q4_dpt(x, y, 1, k, NULL); +} + // Global levels (used during quantization for the current tensor) static float q3pt_levels[Q3PT_N_LEVELS]; static bool q3pt_levels_set = false; @@ -6282,6 +6575,10 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte { VALIDATE_ROW_DATA_D_F16_IMPL(block_q3_kpt, data, nb); } break; + case GGML_TYPE_Q4_DPT: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_q4_dpt, data, nb); + } break; case GGML_TYPE_I8: case GGML_TYPE_I16: diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index d047972940d..48206702767 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -129,6 +129,27 @@ GGML_API const float * q3pt_get_tensor_levels(const void * 
data_ptr); GGML_API void q3pt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, float levels_out[8]); +// Q4_DPT: IQ4_NL with learned per-tensor int8 levels +GGML_API void dequantize_row_q4_dpt(const block_q4_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_q4_dpt_ref(const float * GGML_RESTRICT x, block_q4_dpt * GGML_RESTRICT y, int64_t k); +GGML_API size_t quantize_q4_dpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); + +// Q4_DPT levels management (per-tensor Lloyd-Max int8 levels) +GGML_API void q4dpt_set_levels(const int8_t * levels); +GGML_API const int8_t * q4dpt_get_levels(void); +GGML_API void q4dpt_free_levels(void); + +// Per-tensor levels registry (inference) +GGML_API void q4dpt_register_tensor_levels(const void * data, size_t nbytes, const int8_t * levels); +GGML_API void q4dpt_clear_tensor_levels(void); +GGML_API const int8_t * q4dpt_get_tensor_levels(const void * data_ptr); + +// Train 16 Lloyd-Max int8 levels from tensor data. +// Bins normalized values (x/amax) in [-1,1], runs weighted k-means, rounds to sorted int8[16]. +// Also sets the global levels via q4dpt_set_levels(). 
+GGML_API void q4dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, int8_t levels_out[Q4DPT_N_LEVELS]); + GGML_API void iq2xs_init_impl(enum ggml_type type); GGML_API void iq2xs_free_impl(enum ggml_type type); GGML_API void iq3xs_init_impl(int grid_size); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 1cae9356c97..630a2d6fdd5 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -912,6 +912,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_q3_kpt, .from_float_ref = (ggml_from_float_t) quantize_row_q3_kpt_ref, }, + [GGML_TYPE_Q4_DPT] = { + .type_name = "q4_dpt", + .blck_size = QK4_NL, + .type_size = sizeof(block_q4_dpt), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q4_dpt, + .from_float_ref = (ggml_from_float_t) quantize_row_q4_dpt_ref, + }, }; const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) { @@ -1404,6 +1412,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break; case GGML_FTYPE_MOSTLY_Q3_PT: wtype = GGML_TYPE_Q3_PT; break; case GGML_FTYPE_MOSTLY_Q3_KPT: wtype = GGML_TYPE_Q3_KPT; break; + case GGML_FTYPE_MOSTLY_Q4_DPT: wtype = GGML_TYPE_Q4_DPT; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; } @@ -7550,6 +7559,7 @@ void ggml_quantize_init(enum ggml_type type) { case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break; case GGML_TYPE_Q3_PT: break; // levels set externally via q3pt_set_levels() case GGML_TYPE_Q3_KPT: break; // levels set externally via q3kpt_set_levels() + case GGML_TYPE_Q4_DPT: break; // levels set externally via q4dpt_set_levels() default: // nothing break; } @@ -7628,6 +7638,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, 
imatrix); break; case GGML_TYPE_Q3_PT: result = quantize_q3_pt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q3_KPT: result = quantize_q3_kpt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q4_DPT: result = quantize_q4_dpt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); diff --git a/include/llama.h b/include/llama.h index 37a076d89a6..12d9d545302 100644 --- a/include/llama.h +++ b/include/llama.h @@ -154,6 +154,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q3_PT = 39, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q3_KPT = 40, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_DPT = 41, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index a3f3aca8813..104be97f651 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -57,6 +57,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw"; case LLAMA_FTYPE_MOSTLY_Q3_PT: return "Q3_PT - 3.25 bpw"; case LLAMA_FTYPE_MOSTLY_Q3_KPT: return "Q3_KPT - Q3_K with learned levels"; + case LLAMA_FTYPE_MOSTLY_Q4_DPT: return "Q4_DPT - IQ4_NL with learned levels"; case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw"; @@ -714,6 +715,7 @@ llama_model_loader::llama_model_loader( case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; case GGML_TYPE_Q3_PT: ftype = LLAMA_FTYPE_MOSTLY_Q3_PT; break; case GGML_TYPE_Q3_KPT: ftype = LLAMA_FTYPE_MOSTLY_Q3_KPT; break; + case GGML_TYPE_Q4_DPT: ftype = LLAMA_FTYPE_MOSTLY_Q4_DPT; break; default: { LLAMA_LOG_WARN("%s: unknown type 
%s\n", __func__, ggml_type_name(type_max)); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index d0c4d6c69da..bef7c57dc0b 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -29,6 +29,13 @@ extern "C" { void q3kpt_clear_tensor_levels(void); } +// Q4_DPT levels functions (defined in ggml-quants.c) +extern "C" { + void q4dpt_set_levels(const int8_t * levels); + void q4dpt_register_tensor_levels(const void * data, size_t nbytes, const int8_t * levels); + void q4dpt_clear_tensor_levels(void); +} + #include #include #include @@ -7945,6 +7952,47 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } + // Q4_DPT: load per-tensor int8 levels from GGUF metadata and register them. + // Must happen AFTER load_all_data so tensor data pointers are valid. + { + static const size_t Q4DPT_N_LEVELS = 16; + int64_t lv_idx = gguf_find_key(ml.meta.get(), "q4_dpt.levels"); + if (lv_idx >= 0) { + const int8_t * lv_data = (const int8_t *) gguf_get_arr_data(ml.meta.get(), lv_idx); + const size_t lv_len = gguf_get_arr_n(ml.meta.get(), lv_idx); + + // Build tensor-name to slot index map (GGUF file order = quantizer order) + std::unordered_map name_to_slot; + { + const int64_t n_tensors = gguf_get_n_tensors(ml.meta.get()); + for (int64_t ti = 0; ti < n_tensors; ++ti) { + name_to_slot[gguf_get_tensor_name(ml.meta.get(), ti)] = (size_t) ti; + } + } + + q4dpt_clear_tensor_levels(); + int n_registered = 0; + + for (auto & [ctx, buf_map] : ctx_buf_maps) { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type != GGML_TYPE_Q4_DPT || t->data == nullptr) { continue; } + auto it = name_to_slot.find(ggml_get_name(t)); + if (it == name_to_slot.end()) { continue; } + const size_t lv_offset = it->second * Q4DPT_N_LEVELS; + if (lv_offset + Q4DPT_N_LEVELS > lv_len) { continue; } + q4dpt_register_tensor_levels(t->data, ggml_nbytes(t), lv_data + lv_offset); + if (n_registered == 0) { + q4dpt_set_levels(lv_data + 
lv_offset); // global fallback + } + n_registered++; + } + } + if (n_registered > 0) { + LLAMA_LOG_INFO("%s: registered %d Q4_DPT per-tensor level tables\n", __func__, n_registered); + } + } + } + if (use_mmap_buffer) { for (auto & mapping : ml.mappings) { pimpl->mappings.emplace_back(std::move(mapping)); diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index cfcf1a0d99d..cef8f18abb2 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -29,6 +29,13 @@ extern "C" { void q3kpt_set_levels(const float * levels); } +// Q4_DPT levels functions (defined in ggml-quants.c) +extern "C" { + void q4dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, int8_t levels_out[16]); + void q4dpt_set_levels(const int8_t * levels); +} + // Quantization types. Changes to this struct must be replicated in quantize.cpp struct tensor_quantization { std::string name; @@ -270,6 +277,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) { new_type = GGML_TYPE_Q4_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_DPT) { + new_type = GGML_TYPE_IQ4_XS; + } else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) { new_type = GGML_TYPE_Q4_K; } @@ -562,6 +572,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_Q3_PT: default_type = GGML_TYPE_Q3_PT; break; case LLAMA_FTYPE_MOSTLY_Q3_KPT: default_type = GGML_TYPE_Q3_KPT; break; + case LLAMA_FTYPE_MOSTLY_Q4_DPT: default_type = GGML_TYPE_Q4_DPT; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -938,6 +949,88 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: LLAMA_LOG_INFO("%s: Q3_KPT pass 1 complete.\n", __func__); } + // Q4_DPT two-pass approach: train all per-tensor int8 levels BEFORE opening the 
output + // file, so the levels KV entry is already populated at the time of the metadata placeholder. + static const size_t Q4DPT_N_LEVELS = 16; + std::vector q4dpt_all_levels; // indexed by position in tensors[] + if (ftype == LLAMA_FTYPE_MOSTLY_Q4_DPT && !params->dry_run) { + LLAMA_LOG_INFO("%s: Q4_DPT pass 1: training per-tensor int8 levels...\n", __func__); + q4dpt_all_levels.assign(tensors.size() * Q4DPT_N_LEVELS, (int8_t)0); + + std::vector> p1_read_data; + std::vector> p1_f32_buf; + std::vector p1_workers; + p1_workers.reserve(nthread); + + for (size_t ti = 0; ti < tensors.size(); ++ti) { + ggml_tensor * tensor = tensors[ti]->tensor; + const std::string tname = ggml_get_name(tensor); + + bool quantize = tname.rfind("weight") == tname.size() - 6; + quantize &= (ggml_n_dims(tensor) >= 2); + quantize &= tname.find("_norm.weight") == std::string::npos; + quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos; + if (!quantize) { continue; } + + ggml_type new_type = default_type; + if (!params->pure) { + new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); + } + if (params->token_embedding_type < GGML_TYPE_COUNT && + (tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) { + new_type = params->token_embedding_type; + } + if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") { + new_type = params->output_tensor_type; + } + if (new_type != GGML_TYPE_Q4_DPT) { continue; } + + // Load tensor data + const size_t tsz = ggml_nbytes(tensor); + if (!ml.use_mmap) { + if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); } + tensor->data = p1_read_data.data(); + } + ml.load_data_for(tensor); + + // Dequantize to f32 if needed + const int64_t nelements = ggml_nelements(tensor); + float * f32_data; + if (tensor->type == GGML_TYPE_F32) { + f32_data = (float *) tensor->data; + } else { + llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread); + f32_data = (float *) p1_f32_buf.data(); 
+ } + + // Resolve imatrix + const float * imatrix = nullptr; + if (imatrix_data) { + auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped)); + if (it2 != imatrix_data->end() && + it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) { + imatrix = it2->second.data(); + } + } + + const int64_t n_per_row = tensor->ne[0]; + const int64_t nrows = tensor->ne[1]; + + LLAMA_LOG_INFO("%s: Q4_DPT levels for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name); + q4dpt_train_levels(f32_data, nrows, n_per_row, imatrix, + q4dpt_all_levels.data() + ti * Q4DPT_N_LEVELS); + } + + // Store in GGUF metadata before the file is opened + for (auto & ctx : ctx_outs) { + if (ctx) { + gguf_set_arr_data(ctx.get(), "q4_dpt.levels", GGUF_TYPE_INT8, + q4dpt_all_levels.data(), q4dpt_all_levels.size()); + } + } + LLAMA_LOG_INFO("%s: Q4_DPT pass 1 complete.\n", __func__); + } + // no output file for --dry-run if (!params->dry_run) { new_ofstream(0); @@ -1214,6 +1307,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: q3kpt_set_levels(q3kpt_all_levels.data() + tensor_pass2_idx * Q3KPT_N_LEVELS); } + // Q4_DPT: set the per-tensor levels (trained in pass 1) as global for quantization + if (new_type == GGML_TYPE_Q4_DPT) { + q4dpt_set_levels(q4dpt_all_levels.data() + tensor_pass2_idx * Q4DPT_N_LEVELS); + } + // quantize each expert separately since they have different importance matrices new_size = 0; for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 350bffc3157..c3640896444 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -252,6 +252,10 @@ if (NOT GGML_BACKEND_DL) llama_build_and_test(test-rope.cpp) endif() +# New quant tests +llama_build_and_test(test-quant-q3kpt.cpp) +llama_build_and_test(test-quant-q4dpt.cpp) + # libmtmd set(LLAMA_TEST_NAME test-mtmd-c-api) llama_build_and_test(test-mtmd-c-api.c) diff --git a/tests/test-quant-q3kpt.cpp 
b/tests/test-quant-q3kpt.cpp new file mode 100644 index 00000000000..19a0e2a0bee --- /dev/null +++ b/tests/test-quant-q3kpt.cpp @@ -0,0 +1,191 @@ +// test-quant-q3kpt.cpp +// Quantization accuracy test for Q3_KPT (Q3_K with per-tensor learned levels). + +#include "ggml-backend.h" +#include "ggml.h" +#include +#include + +extern "C" { +void q3kpt_train_levels(const float * data, + int64_t nrow, + int64_t n_per_row, + const float * imatrix, + float levels_out[8]); +void q3kpt_set_levels(const float * levels); +const float * q3kpt_get_tensor_levels(const void * data_ptr); +size_t quantize_q3_kpt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +void quantize_row_q8_K_ref(const float * x, void * y, int64_t k); +} + +#define Q3KPT_N_LEVELS 8 + +#include +#include +#include +#include +#include +#include +#include +#include + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +static float rmse(const float * a, const float * b, size_t n) { + double s = 0; + for (size_t i = 0; i < n; ++i) { + double d = (double) a[i] - (double) b[i]; + s += d * d; + } + return (float) std::sqrt(s / (double) n); +} + +static float std_quant_rmse(ggml_type type, const float * data, size_t nrow, size_t n_per_row) { + const size_t rs = ggml_row_size(type, n_per_row); + std::vector qb(nrow * rs); + std::vector dq(nrow * n_per_row); + ggml_quantize_chunk(type, data, qb.data(), 0, nrow, n_per_row, nullptr); + const ggml_type_traits * tr = ggml_get_type_traits(type); + for (size_t r = 0; r < nrow; ++r) { + tr->to_float(qb.data() + r * rs, dq.data() + r * n_per_row, (int64_t) n_per_row); + } + return rmse(data, dq.data(), nrow * n_per_row); +} + +// Run actual Q3_KPT quantization: train levels, set, quantize, dequantize, return RMSE +static float q3kpt_rmse_actual(const float * data, size_t nrow, size_t n_per_row) { + float 
levels[Q3KPT_N_LEVELS]; + q3kpt_train_levels(data, (int64_t) nrow, (int64_t) n_per_row, nullptr, levels); + q3kpt_set_levels(levels); + const size_t rs = ggml_row_size(GGML_TYPE_Q3_KPT, n_per_row); + std::vector qb(nrow * rs); + std::vector dq(nrow * n_per_row); + quantize_q3_kpt(data, qb.data(), (int64_t) nrow, (int64_t) n_per_row, nullptr); + const ggml_type_traits * tr = ggml_get_type_traits(GGML_TYPE_Q3_KPT); + for (size_t r = 0; r < nrow; ++r) { + tr->to_float(qb.data() + r * rs, dq.data() + r * n_per_row, (int64_t) n_per_row); + } + return rmse(data, dq.data(), nrow * n_per_row); +} + +// --------------------------------------------------------------------------- +// Test cases +// --------------------------------------------------------------------------- +struct TestCase { + std::string name; + std::vector data; + size_t nrow, n_per_row; +}; + +// --------------------------------------------------------------------------- +// Main +// --------------------------------------------------------------------------- +int main(int argc, char ** argv) { + (void)argc; + (void)argv; + + ggml_backend_load_all(); + + std::mt19937 rng(0xdeadbeef); + std::vector cases; + + { + TestCase tc; + tc.name = "Gaussian(0,0.02) 64x4096"; + tc.nrow = 64; + tc.n_per_row = 4096; + tc.data.resize(tc.nrow * tc.n_per_row); + std::normal_distribution nd(0, 0.02f); + for (auto & v : tc.data) { + v = nd(rng); + } + cases.push_back(std::move(tc)); + } + { + TestCase tc; + tc.name = "Gaussian(0,0.05) 32x8192"; + tc.nrow = 32; + tc.n_per_row = 8192; + tc.data.resize(tc.nrow * tc.n_per_row); + std::normal_distribution nd(0, 0.05f); + for (auto & v : tc.data) { + v = nd(rng); + } + cases.push_back(std::move(tc)); + } + { + TestCase tc; + tc.name = "Gaussian(0,0.01) 128x2048"; + tc.nrow = 128; + tc.n_per_row = 2048; + tc.data.resize(tc.nrow * tc.n_per_row); + std::normal_distribution nd(0, 0.01f); + for (auto & v : tc.data) { + v = nd(rng); + } + cases.push_back(std::move(tc)); + } + { + 
TestCase tc; + tc.name = "Laplace(0,0.01) 64x4096"; + tc.nrow = 64; + tc.n_per_row = 4096; + tc.data.resize(tc.nrow * tc.n_per_row); + std::exponential_distribution ed(100.f); + std::bernoulli_distribution sgnd(0.5f); + for (auto & v : tc.data) { + v = ed(rng); + if (sgnd(rng)) { + v = -v; + } + } + cases.push_back(std::move(tc)); + } + { + TestCase tc; + tc.name = "Uniform(-0.1,0.1) 64x4096"; + tc.nrow = 64; + tc.n_per_row = 4096; + tc.data.resize(tc.nrow * tc.n_per_row); + std::uniform_real_distribution ud(-0.1f, 0.1f); + for (auto & v : tc.data) { + v = ud(rng); + } + cases.push_back(std::move(tc)); + } + + printf("Q3_KPT quantization accuracy vs Q3_K (lower=better; 1.00=Q3_K baseline)\n\n"); + printf("%-28s %7s %7s %7s\n", "Test", "Q3_K", "Q3_KPT", "Ratio"); + printf("%-28s %7s %7s %7s\n", "----------------------------", "-------", "-------", "-------"); + + int tc_idx = 0; + bool any_fail = false; + for (auto & tc : cases) { + fprintf(stderr, "[%u/%zu] %s... ", ++tc_idx, cases.size(), tc.name.c_str()); + fflush(stderr); + + float q3k_rmse = std_quant_rmse(GGML_TYPE_Q3_K, tc.data.data(), tc.nrow, tc.n_per_row); + float q3kpt_rmse = q3kpt_rmse_actual(tc.data.data(), tc.nrow, tc.n_per_row); + + fprintf(stderr, "done\n"); + + float ratio = q3kpt_rmse / q3k_rmse; + // Q3_KPT should be competitive with or better than Q3_K + bool ok = (ratio < 1.2f); // Allow 20% slack for now + if (!ok) { + any_fail = true; + } + printf("%-28s %7.6f %7.6f %7.4f%s\n", tc.name.c_str(), q3k_rmse, q3kpt_rmse, ratio, ok ? 
"" : " FAIL"); + fflush(stdout); + } + + if (any_fail) { + fprintf(stderr, "\nFAIL: Q3_KPT RMSE significantly worse than Q3_K on some test cases\n"); + return 1; + } + + printf("\nPASS\n"); + return 0; +} diff --git a/tests/test-quant-q4dpt.cpp b/tests/test-quant-q4dpt.cpp new file mode 100644 index 00000000000..2e4c5f947e7 --- /dev/null +++ b/tests/test-quant-q4dpt.cpp @@ -0,0 +1,175 @@ +// test-quant-q4dpt.cpp +// Quantization accuracy test for Q4_DPT (IQ4_NL with per-tensor learned int8 levels). + +#include "ggml-backend.h" +#include "ggml.h" +#include +#include + +extern "C" { +void q4dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, int8_t levels_out[16]); +void q4dpt_set_levels(const int8_t * levels); +size_t quantize_q4_dpt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +} + +#define Q4DPT_N_LEVELS 16 + +#include +#include +#include +#include +#include +#include +#include +#include + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +static float rmse(const float * a, const float * b, size_t n) { + double s = 0; + for (size_t i = 0; i < n; ++i) { + double d = (double) a[i] - (double) b[i]; + s += d * d; + } + return (float) std::sqrt(s / (double) n); +} + +static float std_quant_rmse(ggml_type type, const float * data, size_t nrow, size_t n_per_row) { + const size_t rs = ggml_row_size(type, n_per_row); + std::vector qb(nrow * rs); + std::vector dq(nrow * n_per_row); + ggml_quantize_chunk(type, data, qb.data(), 0, nrow, n_per_row, nullptr); + const ggml_type_traits * tr = ggml_get_type_traits(type); + for (size_t r = 0; r < nrow; ++r) { + tr->to_float(qb.data() + r * rs, dq.data() + r * n_per_row, (int64_t) n_per_row); + } + return rmse(data, dq.data(), nrow * n_per_row); +} + +// Run Q4_DPT: train levels, set, quantize, dequantize, return RMSE +static 
float q4dpt_rmse_actual(const float * data, size_t nrow, size_t n_per_row) { + int8_t levels[Q4DPT_N_LEVELS]; + q4dpt_train_levels(data, (int64_t) nrow, (int64_t) n_per_row, nullptr, levels); + q4dpt_set_levels(levels); + const size_t rs = ggml_row_size(GGML_TYPE_Q4_DPT, n_per_row); + std::vector qb(nrow * rs); + std::vector dq(nrow * n_per_row); + quantize_q4_dpt(data, qb.data(), (int64_t) nrow, (int64_t) n_per_row, nullptr); + const ggml_type_traits * tr = ggml_get_type_traits(GGML_TYPE_Q4_DPT); + for (size_t r = 0; r < nrow; ++r) { + tr->to_float(qb.data() + r * rs, dq.data() + r * n_per_row, (int64_t) n_per_row); + } + return rmse(data, dq.data(), nrow * n_per_row); +} + +// --------------------------------------------------------------------------- +// Test cases +// --------------------------------------------------------------------------- +struct TestCase { + std::string name; + std::vector data; + size_t nrow, n_per_row; +}; + +// --------------------------------------------------------------------------- +// Main +// --------------------------------------------------------------------------- +int main(int argc, char ** argv) { + (void) argc; + (void) argv; + + ggml_backend_load_all(); + + std::mt19937 rng(0xdeadbeef); + std::vector cases; + + { + TestCase tc; + tc.name = "Gaussian(0,0.02) 64x4096"; + tc.nrow = 64; + tc.n_per_row = 4096; + tc.data.resize(tc.nrow * tc.n_per_row); + std::normal_distribution nd(0, 0.02f); + for (auto & v : tc.data) { v = nd(rng); } + cases.push_back(std::move(tc)); + } + { + TestCase tc; + tc.name = "Gaussian(0,0.05) 32x8192"; + tc.nrow = 32; + tc.n_per_row = 8192; + tc.data.resize(tc.nrow * tc.n_per_row); + std::normal_distribution nd(0, 0.05f); + for (auto & v : tc.data) { v = nd(rng); } + cases.push_back(std::move(tc)); + } + { + TestCase tc; + tc.name = "Gaussian(0,0.01) 128x2048"; + tc.nrow = 128; + tc.n_per_row = 2048; + tc.data.resize(tc.nrow * tc.n_per_row); + std::normal_distribution nd(0, 0.01f); + for (auto & v : 
tc.data) { v = nd(rng); } + cases.push_back(std::move(tc)); + } + { + TestCase tc; + tc.name = "Laplace(0,0.01) 64x4096"; + tc.nrow = 64; + tc.n_per_row = 4096; + tc.data.resize(tc.nrow * tc.n_per_row); + std::exponential_distribution ed(100.f); + std::bernoulli_distribution sgnd(0.5f); + for (auto & v : tc.data) { + v = ed(rng); + if (sgnd(rng)) { v = -v; } + } + cases.push_back(std::move(tc)); + } + { + TestCase tc; + tc.name = "Uniform(-0.1,0.1) 64x4096"; + tc.nrow = 64; + tc.n_per_row = 4096; + tc.data.resize(tc.nrow * tc.n_per_row); + std::uniform_real_distribution ud(-0.1f, 0.1f); + for (auto & v : tc.data) { v = ud(rng); } + cases.push_back(std::move(tc)); + } + + printf("Q4_DPT quantization accuracy vs IQ4_NL (lower=better; 1.00=IQ4_NL baseline)\n\n"); + printf("%-28s %7s %7s %7s\n", "Test", "IQ4_NL", "Q4_DPT", "Ratio"); + printf("%-28s %7s %7s %7s\n", "----------------------------", "-------", "-------", "-------"); + + int tc_idx = 0; + bool any_fail = false; + for (auto & tc : cases) { + fprintf(stderr, "[%u/%zu] %s... ", ++tc_idx, cases.size(), tc.name.c_str()); + fflush(stderr); + + float iq4nl_rmse = std_quant_rmse(GGML_TYPE_IQ4_NL, tc.data.data(), tc.nrow, tc.n_per_row); + float q4dpt_rmse = q4dpt_rmse_actual(tc.data.data(), tc.nrow, tc.n_per_row); + + fprintf(stderr, "done\n"); + + float ratio = q4dpt_rmse / iq4nl_rmse; + // Q4_DPT should be competitive with or better than IQ4_NL + bool ok = (ratio < 1.2f); + if (!ok) { any_fail = true; } + printf("%-28s %7.6f %7.6f %7.4f%s\n", + tc.name.c_str(), iq4nl_rmse, q4dpt_rmse, ratio, ok ? 
"" : " FAIL"); + fflush(stdout); + } + + if (any_fail) { + fprintf(stderr, "\nFAIL: Q4_DPT RMSE significantly worse than IQ4_NL on some test cases\n"); + return 1; + } + + printf("\nPASS\n"); + return 0; +} diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 7bfa3fb6601..43bb822d8f0 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -39,6 +39,7 @@ static const std::vector QUANT_OPTIONS = { { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", }, { "Q3_PT", LLAMA_FTYPE_MOSTLY_Q3_PT, " 3.25 bpw quantization", }, { "Q3_KPT", LLAMA_FTYPE_MOSTLY_Q3_KPT, " Q3_K with learned per-tensor levels" }, + { "Q4_DPT", LLAMA_FTYPE_MOSTLY_Q4_DPT, " IQ4_NL with learned per-tensor int8 levels" }, { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", }, { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, From 2300458ad2eaaf2c9ae6de803097a569b1edc5ff Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Sun, 1 Mar 2026 04:34:43 +0100 Subject: [PATCH 5/9] Dynamic version of IQ4_NL - now with UGLY HACKY CUDA KERNELS --- ggml/src/ggml-cpu/arch-fallback.h | 3 + ggml/src/ggml-cpu/arch/x86/quants.c | 88 ++++++++ ggml/src/ggml-cpu/llamafile/sgemm.cpp | 42 +++- ggml/src/ggml-cpu/quants.c | 4 - ggml/src/ggml-cuda/common.cuh | 11 + ggml/src/ggml-cuda/convert.cu | 33 +++ ggml/src/ggml-cuda/convert.cuh | 3 + ggml/src/ggml-cuda/ggml-cuda.cu | 7 +- ggml/src/ggml-cuda/mmq.cu | 20 ++ ggml/src/ggml-cuda/mmq.cuh | 77 +++++++ ggml/src/ggml-cuda/mmvq.cu | 19 ++ ggml/src/ggml-cuda/vecdotq.cuh | 24 +++ ggml/src/ggml-quants.c | 242 +++++++++++++-------- tests/CMakeLists.txt | 1 + tests/test-quant-q4dpt-experiment.cpp | 289 ++++++++++++++++++++++++++ 15 files changed, 764 insertions(+), 99 deletions(-) create mode 100644 tests/test-quant-q4dpt-experiment.cpp diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h index 
907ab853b1e..8b8ad90ab51 100644 --- a/ggml/src/ggml-cpu/arch-fallback.h +++ b/ggml/src/ggml-cpu/arch-fallback.h @@ -32,6 +32,7 @@ #define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0 #define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K #define ggml_vec_dot_q3_pt_q8_K_generic ggml_vec_dot_q3_pt_q8_K +#define ggml_vec_dot_q4_dpt_q8_0_generic ggml_vec_dot_q4_dpt_q8_0 // repack.cpp #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8 @@ -186,6 +187,7 @@ #define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0 #define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K #define ggml_vec_dot_q3_pt_q8_K_generic ggml_vec_dot_q3_pt_q8_K +#define ggml_vec_dot_q4_dpt_q8_0_generic ggml_vec_dot_q4_dpt_q8_0 #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0 // repack.cpp #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4 @@ -279,6 +281,7 @@ #define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0 #define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K #define ggml_vec_dot_q3_pt_q8_K_generic ggml_vec_dot_q3_pt_q8_K +#define ggml_vec_dot_q4_dpt_q8_0_generic ggml_vec_dot_q4_dpt_q8_0 #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0 // repack.cpp #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4 diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c index a88f49d9d5c..aeac7dcc7bf 100644 --- a/ggml/src/ggml-cpu/arch/x86/quants.c +++ b/ggml/src/ggml-cpu/arch/x86/quants.c @@ -3717,6 +3717,94 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v *s = sumf; }

// Dot product of one Q4_DPT row (4-bit indices into a per-tensor int8 level table)
// with a Q8_0 row. Same block layout as IQ4_NL, but the 16 dequant values are
// resolved from the CPU-side registry by the row's address instead of a fixed
// constant table, so the LUT register must be loaded per call.
void ggml_vec_dot_q4_dpt_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK4_NL == 0);
    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");

    const block_q4_dpt * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    const int nb = n / QK4_NL;

    // Per-tensor levels — looked up from the CPU-side registry once per call.
    const int8_t * values = q4dpt_get_tensor_levels(vx);
    GGML_ASSERT(values != NULL && "Q4_DPT levels not set for tensor");

    int ib = 0;
    float sumf = 0;

#if defined __AVX2__

    // values128 holds the 16 int8 levels; _mm_shuffle_epi8 treats it as a
    // 16-entry LUT indexed by each masked nibble — vectorized dequantization.
    const __m128i values128 = _mm_loadu_si128((const __m128i*)values);
    const __m128i m4b = _mm_set1_epi8(0x0f);
    const __m256i mone = _mm256_set1_epi16(1);

    // Two independent accumulators, two blocks per iteration.
    __m256 accum1 = _mm256_setzero_ps();
    __m256 accum2 = _mm256_setzero_ps();
    for (; ib + 1 < nb; ib += 2) {
        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs);
        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs);
        const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs);
        const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs);
        // Low nibbles -> lower lane (first half of block), high nibbles -> upper lane.
        const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
                                              _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
        const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
                                              _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
        const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
        const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
        // Horizontal-reduce the 16-bit products to 32-bit lanes.
        const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
        const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
        // Scale each block's integer dot product by d(x)*d(y) and accumulate.
        accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)),
                                 _mm256_cvtepi32_ps(p_1), accum1);
        accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)),
                                 _mm256_cvtepi32_ps(p_2), accum2);
    }

    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));

#elif defined __AVX__
    // 128-bit variant of the same scheme: LUT-dequantize the nibbles,
    // multiply-accumulate against q8, scale by the paired fp16 deltas.
    const __m128i values128 = _mm_loadu_si128((const __m128i*)values);
    const __m128i m4b = _mm_set1_epi8(0x0f);

    __m256 accum = _mm256_setzero_ps();
    for (; ib + 1 < nb; ib += 2) {
        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
        const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
        const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
        const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
        const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);

        const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
        const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
        const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
        const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));

        const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1);
        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
    }

    sumf = hsum_float_8(accum);

#endif
    // Scalar tail: handles the odd trailing block after the paired SIMD loops
    // (and the whole row when neither AVX2 nor AVX is available).
    for (; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
        int sumi1 = 0, sumi2 = 0;
        for (int j = 0; j < QK4_NL/2; ++j) {
            sumi1 += y[ib].qs[j+ 0] * values[x[ib].qs[j] & 0xf];
            sumi2 += y[ib].qs[j+QK4_NL/2] * values[x[ib].qs[j] >> 4];
        }
        sumf += d * (sumi1 + sumi2);
    }
    *s = sumf;
}

 void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index da412fd009b..809c5dc108f
100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -1349,16 +1349,20 @@ class tinyBLAS_Q0_AVX { const TA *A, int64_t lda, const TB *B, int64_t ldb, TC *C, int64_t ldc, - int ith, int nth) + int ith, int nth, + const int8_t * custom_table = nullptr) : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { - const int8_t kvalues_iq4nl[16] = { - -127, -104, -83, -65, - -49, -35, -22, -10, - 1, 13, 25, 38, - 53, 69, 89, 113 - }; - - iq4nlt = _mm_loadu_si128((const __m128i *)kvalues_iq4nl); + if (custom_table) { + iq4nlt = _mm_loadu_si128((const __m128i *)custom_table); + } else { + const int8_t kvalues_iq4nl[16] = { + -127, -104, -83, -65, + -49, -35, -22, -10, + 1, 13, 25, 38, + 53, 69, 89, 113 + }; + iq4nlt = _mm_loadu_si128((const __m128i *)kvalues_iq4nl); + } } void matmul(int64_t m, int64_t n) { @@ -4013,6 +4017,26 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64 #endif } + case GGML_TYPE_Q4_DPT: { + if (Btype != GGML_TYPE_Q8_0) + return false; +#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) + // Q4_DPT has identical block layout to IQ4_NL (block_q4_dpt = block_iq4_nl) + // but uses a per-tensor lookup table instead of the fixed IQ4_NL values. 
+ const int8_t * levels = q4dpt_get_tensor_levels(A); + if (!levels) return false; + tinyBLAS_Q0_AVX tb{ + k, (const block_iq4_nl *)A, lda, + (const block_q8_0 *)B, ldb, + (float *)C, ldc, + params->ith, params->nth, levels}; + tb.matmul(m, n); + return true; +#else + return false; +#endif + } + default: return false; } diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index 1e90cef3f9c..13f57c96e9f 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -1205,10 +1205,6 @@ void ggml_vec_dot_q4_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = sumf; } -void ggml_vec_dot_q4_dpt_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - ggml_vec_dot_q4_dpt_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); -} - void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 36d8a3aaab2..140b73a3d95 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1015,6 +1015,17 @@ struct ggml_cuda_type_traits { static constexpr int qi = QI4_NL; }; +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = QK4_NL; + static constexpr int qr = QR4_NL; + static constexpr int qi = QI4_NL; +}; + +// Per-tensor lookup table for Q4_DPT (device global memory). +// Each TU gets its own copy; initialized via cudaGetSymbolAddress + cudaMemcpyAsync before use. 
+__device__ int8_t q4dpt_levels_cuda[16]; + template<> struct ggml_cuda_type_traits { static constexpr int qk = QK_K; diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index 09b6d5db6a0..c803e871349 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -593,12 +593,41 @@ static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int64_t dequantize_block_iq1_s<<>>(vx, y); } +void ggml_cuda_set_q4dpt_levels(const int8_t * levels, cudaStream_t stream) { + int8_t * d_q4dpt_levels; + CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q4dpt_levels, q4dpt_levels_cuda)); + CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, levels, 16, cudaMemcpyHostToDevice, stream)); +} + template static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { const int nb = (k + QK_K - 1) / QK_K; dequantize_block_iq4_nl<<>>(vx, y); } +template +static __global__ void dequantize_block_q4_dpt(const void * __restrict__ vx, dst_t * __restrict__ yy) { + const int64_t i = blockIdx.x; + const block_q4_dpt * x = (const block_q4_dpt *) vx + i*(QK_K/QK4_NL); + + const int64_t tid = threadIdx.x; + const int64_t il = tid/8; // 0...3 + const int64_t ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 4*il; + const uint8_t * q4 = x[ib].qs + 4*il; + const float d = (float)x[ib].d; + for (int j = 0; j < 4; ++j) { + y[j+ 0] = d * q4dpt_levels_cuda[q4[j] & 0xf]; + y[j+16] = d * q4dpt_levels_cuda[q4[j] >> 4]; + } +} + +template +static void dequantize_row_q4_dpt_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + const int nb = (k + QK_K - 1) / QK_K; + dequantize_block_q4_dpt<<>>(vx, y); +} + template static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { const int nb = k / QK_K; @@ -709,6 +738,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { return dequantize_row_iq1_m_cuda; case GGML_TYPE_IQ4_NL: return 
dequantize_row_iq4_nl_cuda; + case GGML_TYPE_Q4_DPT: + return dequantize_row_q4_dpt_cuda; case GGML_TYPE_IQ4_XS: return dequantize_row_iq4_xs_cuda; case GGML_TYPE_IQ3_S: @@ -760,6 +791,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_row_iq1_m_cuda; case GGML_TYPE_IQ4_NL: return dequantize_row_iq4_nl_cuda; + case GGML_TYPE_Q4_DPT: + return dequantize_row_q4_dpt_cuda; case GGML_TYPE_IQ4_XS: return dequantize_row_iq4_xs_cuda; case GGML_TYPE_IQ3_S: diff --git a/ggml/src/ggml-cuda/convert.cuh b/ggml/src/ggml-cuda/convert.cuh index 09f9a33f909..c211b2e9369 100644 --- a/ggml/src/ggml-cuda/convert.cuh +++ b/ggml/src/ggml-cuda/convert.cuh @@ -31,6 +31,9 @@ to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type); to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type); to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type); +// Set the Q4_DPT lookup table in device constant memory. +void ggml_cuda_set_q4dpt_levels(const int8_t * levels, cudaStream_t stream); + template __host__ __device__ inline dst_t ggml_cuda_cast(src_t x) { if constexpr (std::is_same_v) { diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 7e6d3303549..fabbf0daeb6 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -4618,6 +4618,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_Q4_DPT: case GGML_TYPE_IQ4_XS: case GGML_TYPE_BF16: return true; @@ -4652,7 +4653,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g { return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 || op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q5_0 || - op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL) && + op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || 
op->type == GGML_TYPE_IQ4_NL || + op->type == GGML_TYPE_Q4_DPT) && op->src[0]->type == GGML_TYPE_F32 && (op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32); } break; @@ -4705,6 +4707,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) { return true; } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_DPT) { + return true; + } if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_I32) { return true; } diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index 9a69f41d159..5cb1f6f9123 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -2,6 +2,8 @@ #include "mmq.cuh" #include "quantize.cuh" #include "mmid.cuh" +#include "convert.cuh" +#include "ggml-quants.h" static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) { switch (args.type_x) { @@ -62,6 +64,9 @@ static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, con case GGML_TYPE_IQ4_NL: mul_mat_q_case(ctx, args, stream); break; + case GGML_TYPE_Q4_DPT: + mul_mat_q_case(ctx, args, stream); + break; default: GGML_ABORT("fatal error"); break; @@ -79,6 +84,15 @@ void ggml_cuda_mul_mat_q( cudaStream_t stream = ctx.stream(); const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; + // Set Q4_DPT lookup table if needed + if (src0->type == GGML_TYPE_Q4_DPT) { + const int8_t * levels = q4dpt_get_tensor_levels(src0->data); + GGML_ASSERT(levels != NULL && "Q4_DPT tensor levels not set"); + int8_t * d_q4dpt_levels; + CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q4dpt_levels, q4dpt_levels_cuda)); + CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, levels, 16, cudaMemcpyHostToDevice, stream)); + } + const size_t ts_src0 = ggml_type_size(src0->type); const size_t ts_src1 = ggml_type_size(src1->type); const size_t ts_dst = ggml_type_size(dst->type); @@ -286,6 +300,7 @@ bool 
ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_Q4_DPT: mmq_supported = true; break; default: @@ -364,3 +379,8 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t return (!GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; } + +// Q4_DPT must be instantiated in this TU (not a separate template-instance file) +// because it accesses the TU-local __device__ variable q4dpt_levels_cuda, +// which is initialized by the code above. +DECL_MMQ_CASE(GGML_TYPE_Q4_DPT); diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 255e59f6fc6..a55e567fba8 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -1,6 +1,7 @@ #pragma once #include "common.cuh" +#include "ggml.h" #include "vecdotq.cuh" #include "mma.cuh" @@ -86,6 +87,7 @@ static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) { return MMQ_Q8_1_DS_LAYOUT_DS4; case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_Q4_DPT: return MMQ_Q8_1_DS_LAYOUT_D4; default: GGML_ABORT("fatal error"); @@ -202,6 +204,7 @@ static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml case GGML_TYPE_IQ1_S: return MMQ_DP4A_TXS_Q8_0; case GGML_TYPE_IQ4_XS: return MMQ_DP4A_TXS_Q8_0; case GGML_TYPE_IQ4_NL: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_Q4_DPT: return MMQ_DP4A_TXS_Q8_0; default: return tile_x_sizes{0, 0, 0}; } } @@ -243,6 +246,7 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) { case GGML_TYPE_IQ1_S: return MMQ_MMA_TILE_X_K_Q8_0; case GGML_TYPE_IQ4_XS: return MMQ_MMA_TILE_X_K_Q8_0; case GGML_TYPE_IQ4_NL: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_Q4_DPT: return MMQ_MMA_TILE_X_K_Q8_0; default: return 0; } } @@ -2681,6 +2685,71 @@ template static __device__ __forceinline__ void loa } } +template static __device__ __forceinline__ void load_tiles_q4_dpt( + const 
char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); + +#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) + int * x_qs = (int *) x_tile; + float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2); +#else + constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_DPT, mmq_y); + int * x_qs = (int *) x_tile; + float * x_df = (float *) (x_qs + txs.qs); +#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) + + constexpr int threads_per_row = MMQ_ITER_K / (4 * QR4_NL); + constexpr int nrows = warp_size / threads_per_row; + const int txi = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x; + const int kbx = txi / QI4_NL; + const int kqsx = txi % QI4_NL; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) { + int i = i0 + (nrows == 1 ? 
threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row); + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_dpt * bxi = (const block_q4_dpt *) x + kbx0 + i*stride + kbx; + + const int aux_q4 = get_int_b2(bxi->qs, kqsx); + const int2 v = get_int_from_table_16(aux_q4, q4dpt_levels_cuda); + const int k0 = kbx * (2 * QI4_NL) + kqsx; + +#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) + x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + 0] = v.x; + x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + QI4_NL] = v.y; +#else + x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0 + 0] = v.x; + x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0 + QI4_NL] = v.y; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) + } + + constexpr int blocks_per_tile_x_row = MMQ_TILE_NE_K / QI4_NL; + constexpr int rows_per_warp = warp_size / blocks_per_tile_x_row; + const int kbxd = threadIdx.x % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * rows_per_warp) { + int i = i0 + threadIdx.y * rows_per_warp + threadIdx.x / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_dpt * bxi = (const block_q4_dpt *) x + kbx0 + i*stride + kbxd; + +#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) + x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd] = __half2float(bxi->d); +#else + x_df[i*(MMQ_TILE_NE_K/QI4_NL) + i/QI4_NL + kbxd] = __half2float(bxi->d); +#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) + } +} + template static __device__ __forceinline__ void load_tiles_iq2_xxs( const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { constexpr int nwarps = mmq_get_nwarps_device(); @@ -3357,6 +3426,14 @@ struct mmq_type_traits { static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a; }; +template +struct mmq_type_traits { + static 
constexpr int vdr = VDR_Q4_DPT_Q8_1_MMQ; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_dpt; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a; +}; + template struct mmq_type_traits { static constexpr int vdr = VDR_IQ4_XS_Q8_1_MMQ; diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index ce25ccf427c..83434c6a09d 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -2,6 +2,8 @@ #include "quantize.cuh" #include "unary.cuh" #include "vecdotq.cuh" +#include "convert.cuh" +#include "ggml-quants.h" #include @@ -27,6 +29,7 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) case GGML_TYPE_IQ1_S: return vec_dot_iq1_s_q8_1; case GGML_TYPE_IQ1_M: return vec_dot_iq1_m_q8_1; case GGML_TYPE_IQ4_NL: return vec_dot_iq4_nl_q8_1; + case GGML_TYPE_Q4_DPT: return vec_dot_q4_dpt_q8_1; case GGML_TYPE_IQ4_XS: return vec_dot_iq4_xs_q8_1; case GGML_TYPE_IQ3_S: return vec_dot_iq3_s_q8_1; default: return nullptr; @@ -52,6 +55,7 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) { case GGML_TYPE_IQ3_XXS: return VDR_IQ3_XXS_Q8_1_MMVQ; case GGML_TYPE_IQ3_S: return VDR_IQ3_S_Q8_1_MMVQ; case GGML_TYPE_IQ4_NL: return VDR_IQ4_NL_Q8_1_MMVQ; + case GGML_TYPE_Q4_DPT: return VDR_Q4_DPT_Q8_1_MMVQ; case GGML_TYPE_IQ4_XS: return VDR_IQ4_XS_Q8_1_MMVQ; default: return 1; } @@ -617,6 +621,12 @@ static void mul_mat_vec_q_switch_type( nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); break; + case GGML_TYPE_Q4_DPT: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, 
stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); + break; case GGML_TYPE_IQ4_XS: mul_mat_vec_q_switch_ncols_dst (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, @@ -646,6 +656,15 @@ void ggml_cuda_mul_mat_vec_q( cudaStream_t stream = ctx.stream(); + // Set Q4_DPT lookup table if needed + if (src0->type == GGML_TYPE_Q4_DPT) { + const int8_t * levels = q4dpt_get_tensor_levels(src0->data); + GGML_ASSERT(levels != NULL && "Q4_DPT tensor levels not set"); + int8_t * d_q4dpt_levels; + CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q4dpt_levels, q4dpt_levels_cuda)); + CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, levels, 16, cudaMemcpyHostToDevice, stream)); + } + const size_t ts_src0 = ggml_type_size(src0->type); const size_t ts_src1 = ggml_type_size(src1->type); const size_t ts_dst = ggml_type_size(dst->type); diff --git a/ggml/src/ggml-cuda/vecdotq.cuh b/ggml/src/ggml-cuda/vecdotq.cuh index ab803aca21b..7f0377b8385 100644 --- a/ggml/src/ggml-cuda/vecdotq.cuh +++ b/ggml/src/ggml-cuda/vecdotq.cuh @@ -1208,6 +1208,30 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1( return d * sumi; } +#define VDR_Q4_DPT_Q8_1_MMVQ 2 +#define VDR_Q4_DPT_Q8_1_MMQ 4 + +static __device__ __forceinline__ float vec_dot_q4_dpt_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { + + const block_q4_dpt * bq4 = (const block_q4_dpt *) vbq + kbx; + + const int * q8 = (const int *) bq8_1->qs + iqs; + + int sumi = 0; +#pragma unroll + for (int l = 0; l < VDR_Q4_DPT_Q8_1_MMVQ; ++l) { + const int aux_q4 = get_int_b2(bq4->qs, iqs + l); + const int2 v = get_int_from_table_16(aux_q4, q4dpt_levels_cuda); + + sumi = ggml_cuda_dp4a(v.x, q8[l + 0], sumi); + sumi = ggml_cuda_dp4a(v.y, q8[l + 4], sumi); + } + + const float d = __half2float(bq4->d) * __low2float(bq8_1->ds); + return d * sumi; +} + #define VDR_IQ4_XS_Q8_1_MMVQ 4 #define VDR_IQ4_XS_Q8_1_MMQ 4 diff --git 
a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 6c38c553016..e9c07c7ca27 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -4639,9 +4639,47 @@ const int8_t * q4dpt_get_tensor_levels(const void * data_ptr) { return q4dpt_get_levels(); } +// Run Lloyd-Max iterations on a pre-built histogram. +// levels[] is updated in-place (and kept sorted). +static void q4dpt_run_lloyd_max(const float * bin_sum_w, const float * bin_sum_wt, + float * levels, int n_bins, float bin_width, int max_iter) { + for (int iter = 0; iter < max_iter; ++iter) { + float sw[Q4DPT_N_LEVELS] = { 0 }; + float swt[Q4DPT_N_LEVELS] = { 0 }; + for (int b = 0; b < n_bins; ++b) { + if (bin_sum_w[b] < 1e-12f) { continue; } + float t = -1.0f + (b + 0.5f) * bin_width; + int best = 0; + float bd = (t - levels[0]) * (t - levels[0]); + for (int k = 1; k < Q4DPT_N_LEVELS; ++k) { + float d = (t - levels[k]) * (t - levels[k]); + if (d < bd) { bd = d; best = k; } + } + sw[best] += bin_sum_w[b]; + swt[best] += bin_sum_wt[b]; + } + float max_delta = 0.0f; + for (int k = 0; k < Q4DPT_N_LEVELS; ++k) { + if (sw[k] > 1e-12f) { + float nl = swt[k] / sw[k]; + max_delta = fmaxf(max_delta, fabsf(nl - levels[k])); + levels[k] = nl; + } + } + if (max_delta < 1e-10f) { break; } + for (int k = 1; k < Q4DPT_N_LEVELS; ++k) { + float v = levels[k]; + int m = k - 1; + while (m >= 0 && levels[m] > v) { levels[m+1] = levels[m]; m--; } + levels[m+1] = v; + } + } +} + // Train 16 Lloyd-Max int8 levels. // Bins x/amax values from 32-element IQ4_NL-style blocks into [-1,1], -// runs weighted k-means initialized from IQ4_NL, rounds float centroids to sorted int8[16]. +// runs weighted k-means (seeded from IQ4_NL values), then rounds float +// centroids to sorted int8[16] with post-rounding local search. 
void q4dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t levels_out[Q4DPT_N_LEVELS]) { const int N_BINS = 8192; @@ -4681,53 +4719,77 @@ void q4dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, } } - // Initialize from IQ4_NL values (normalized to [-1, 1]) - float levels_f[Q4DPT_N_LEVELS]; + // Initialize from IQ4_NL values normalized to [-1, 1], then run Lloyd-Max + float best_levels[Q4DPT_N_LEVELS]; for (int k = 0; k < Q4DPT_N_LEVELS; ++k) { - levels_f[k] = (float)kvalues_iq4nl[k] / 127.0f; - } - - // Lloyd-Max iterations - for (int iter = 0; iter < 200; ++iter) { - float sw[Q4DPT_N_LEVELS] = { 0 }; - float swt[Q4DPT_N_LEVELS] = { 0 }; - for (int b = 0; b < N_BINS; ++b) { - if (bin_sum_w[b] < 1e-12f) { continue; } - float t = -1.0f + (b + 0.5f) * bin_width; - int best = 0; - float bd = (t - levels_f[0]) * (t - levels_f[0]); - for (int k = 1; k < Q4DPT_N_LEVELS; ++k) { - float d = (t - levels_f[k]) * (t - levels_f[k]); - if (d < bd) { bd = d; best = k; } - } - sw[best] += bin_sum_w[b]; - swt[best] += bin_sum_wt[b]; - } - float max_delta = 0.0f; - for (int k = 0; k < Q4DPT_N_LEVELS; ++k) { - if (sw[k] > 1e-12f) { - float nl = swt[k] / sw[k]; - max_delta = fmaxf(max_delta, fabsf(nl - levels_f[k])); - levels_f[k] = nl; - } - } - if (max_delta < 1e-10f) { break; } - // Insertion sort to keep levels ordered - for (int k = 1; k < Q4DPT_N_LEVELS; ++k) { - float v = levels_f[k]; - int m = k - 1; - while (m >= 0 && levels_f[m] > v) { levels_f[m+1] = levels_f[m]; m--; } - levels_f[m+1] = v; - } + best_levels[k] = (float)kvalues_iq4nl[k] / 127.0f; } + q4dpt_run_lloyd_max(bin_sum_w, bin_sum_wt, best_levels, N_BINS, bin_width, 500); // Round float centroids to int8, preserve sort order + int8_t levels_i8[Q4DPT_N_LEVELS]; for (int k = 0; k < Q4DPT_N_LEVELS; ++k) { - int v = (int)roundf(levels_f[k] * 127.0f); + int v = (int)roundf(best_levels[k] * 127.0f); if (v < -128) { v = -128; } if (v > 127) { v = 127; } - 
levels_out[k] = (int8_t)v; + levels_i8[k] = (int8_t)v; } + + // Post-rounding local search: try ±1 adjustments to each level greedily. + // The int8 rounding can introduce sub-optimal level placement; this + // hill-climbing on discrete int8 values often recovers a better solution. + for (int pass = 0; pass < 10; ++pass) { + int improved = 0; + for (int k = 0; k < Q4DPT_N_LEVELS; ++k) { + // Evaluate current histogram MSE with int8 levels + float cur_levels[Q4DPT_N_LEVELS]; + for (int i = 0; i < Q4DPT_N_LEVELS; ++i) { + cur_levels[i] = (float)levels_i8[i] / 127.0f; + } + float cur_mse = 0.0f; + for (int b = 0; b < N_BINS; ++b) { + if (bin_sum_w[b] < 1e-12f) { continue; } + float t = -1.0f + (b + 0.5f) * bin_width; + float bd = (t - cur_levels[0]) * (t - cur_levels[0]); + for (int i = 1; i < Q4DPT_N_LEVELS; ++i) { + float d = (t - cur_levels[i]) * (t - cur_levels[i]); + if (d < bd) { bd = d; } + } + cur_mse += bin_sum_w[b] * bd; + } + + int8_t best_val = levels_i8[k]; + int8_t lo = (k > 0) ? (int8_t)(levels_i8[k-1] + 1) : -128; + int8_t hi = (k < Q4DPT_N_LEVELS - 1) ? 
(int8_t)(levels_i8[k+1] - 1) : 127; + for (int delta = -1; delta <= 1; delta += 2) { + int8_t nv = (int8_t)(levels_i8[k] + delta); + if (nv < lo || nv > hi) { continue; } + cur_levels[k] = (float)nv / 127.0f; + float test_mse = 0.0f; + for (int b = 0; b < N_BINS; ++b) { + if (bin_sum_w[b] < 1e-12f) { continue; } + float t = -1.0f + (b + 0.5f) * bin_width; + float bd = (t - cur_levels[0]) * (t - cur_levels[0]); + for (int i = 1; i < Q4DPT_N_LEVELS; ++i) { + float d = (t - cur_levels[i]) * (t - cur_levels[i]); + if (d < bd) { bd = d; } + } + test_mse += bin_sum_w[b] * bd; + } + if (test_mse < cur_mse) { + best_val = nv; + cur_mse = test_mse; + improved = 1; + } + cur_levels[k] = (float)levels_i8[k] / 127.0f; // restore + } + levels_i8[k] = best_val; + } + if (!improved) { break; } + } + + memcpy(levels_out, levels_i8, Q4DPT_N_LEVELS * sizeof(int8_t)); + q4dpt_set_levels(levels_out); free(bin_sum_w); @@ -4751,14 +4813,14 @@ void dequantize_row_q4_dpt(const block_q4_dpt * GGML_RESTRICT x, float * GGML_RE } } -// Quantize one 32-element block using float levels and optimal per-block scale. -// Uses scale perturbation (ntry) to improve accuracy, mirroring quantize_row_iq4_nl_impl. +// Quantize one 32-element block using int8 levels and optimal per-block scale. +// IQ4_NL-style scale perturbation with negative-scale support and final re-assignment. 
static void quantize_block_q4_dpt(const float * GGML_RESTRICT xb, block_q4_dpt * GGML_RESTRICT out, const int8_t * values, const float * qw, int ntry) { - float amax = 0.0f; + float amax = 0.0f, max_val = 0.0f; for (int j = 0; j < QK4_NL; ++j) { float ax = fabsf(xb[j]); - if (ax > amax) { amax = ax; } + if (ax > amax) { amax = ax; max_val = xb[j]; } } if (amax < 1e-10f) { out->d = 0; @@ -4766,62 +4828,72 @@ static void quantize_block_q4_dpt(const float * GGML_RESTRICT xb, block_q4_dpt * return; } - // Find magnitude of the level furthest from zero - float max_abs_level = 0.0f; - for (int k = 0; k < Q4DPT_N_LEVELS; ++k) { - float al = fabsf((float)values[k]); - if (al > max_abs_level) { max_abs_level = al; } - } - if (max_abs_level < 1e-10f) { max_abs_level = 1.0f; } + // Initial scale: d = -max/values[0] (allows negative d for asymmetric levels) + float d = ntry > 0 ? -max_val / (float)values[0] : max_val / (float)values[0]; + float id = (fabsf(d) > 1e-20f) ? 1.0f / d : 0.0f; - const float d_base = amax / max_abs_level; - float best_err = 1e30f; - float best_d = d_base; + // Initial assignment + optimal scale via least-squares + uint8_t L[QK4_NL]; + float sumqx = 0.0f, sumq2 = 0.0f; + for (int j = 0; j < QK4_NL; ++j) { + float al = id * xb[j]; + int bk = 0; + float bd = fabsf(al - (float)values[0]); + for (int k = 1; k < Q4DPT_N_LEVELS; ++k) { + float dist = fabsf(al - (float)values[k]); + if (dist < bd) { bd = dist; bk = k; } + } + L[j] = (uint8_t)bk; + float q = (float)values[bk]; + float w = qw ? qw[j] : 1.0f; + sumqx += w * q * xb[j]; + sumq2 += w * q * q; + } + d = (sumq2 > 1e-20f) ? 
sumqx / sumq2 : d; + float best = d * sumqx; uint8_t best_L[QK4_NL]; - memset(best_L, 0, sizeof(best_L)); + memcpy(best_L, L, QK4_NL); + float best_d = d; + // Scale perturbation: id = (itry + values[0]) / max_val (IQ4_NL-style) for (int itry = -ntry; itry <= ntry; ++itry) { - float d = d_base * (1.0f + (float)itry * (0.1f / ntry)); - if (d < 1e-20f) { continue; } - const float inv_d = 1.0f / d; - - // Nearest-level assignment in scale-normalized space - uint8_t L[QK4_NL]; + id = ((float)itry + (float)values[0]) / max_val; + sumqx = sumq2 = 0.0f; for (int j = 0; j < QK4_NL; ++j) { - float t = xb[j] * inv_d; + float al = id * xb[j]; int bk = 0; - float bd = fabsf(t - (float)values[0]); + float bd = fabsf(al - (float)values[0]); for (int k = 1; k < Q4DPT_N_LEVELS; ++k) { - float dist = fabsf(t - (float)values[k]); + float dist = fabsf(al - (float)values[k]); if (dist < bd) { bd = dist; bk = k; } } L[j] = (uint8_t)bk; - } - - // Optimal scale via weighted least-squares: d* = sum(w*x*lvl) / sum(w*lvl^2) - float num = 0.0f, den = 0.0f; - for (int j = 0; j < QK4_NL; ++j) { - float w = qw ? qw[j] : 1.0f; - float lf = (float)values[L[j]]; - num += w * xb[j] * lf; - den += w * lf * lf; - } - d = den > 1e-20f ? num / den : d_base; - - // Recompute error with optimal d - float err = 0.0f; - for (int j = 0; j < QK4_NL; ++j) { + float q = (float)values[bk]; float w = qw ? qw[j] : 1.0f; - float e = xb[j] - d * (float)values[L[j]]; - err += w * e * e; + sumqx += w * q * xb[j]; + sumq2 += w * q * q; } - if (err < best_err) { - best_err = err; - best_d = d; + if (sumq2 > 0.0f && sumqx * sumqx > best * sumq2) { + d = sumqx / sumq2; + best = d * sumqx; + best_d = d; memcpy(best_L, L, QK4_NL); } } + // Final re-assignment using the best scale + id = (fabsf(best_d) > 1e-20f) ? 
1.0f / best_d : 0.0f; + for (int j = 0; j < QK4_NL; ++j) { + float al = id * xb[j]; + int bk = 0; + float bd = fabsf(al - (float)values[0]); + for (int k = 1; k < Q4DPT_N_LEVELS; ++k) { + float dist = fabsf(al - (float)values[k]); + if (dist < bd) { bd = dist; bk = k; } + } + best_L[j] = (uint8_t)bk; + } + out->d = GGML_FP32_TO_FP16(best_d); for (int j = 0; j < QK4_NL/2; ++j) { out->qs[j] = best_L[j] | (best_L[j + QK4_NL/2] << 4); @@ -4841,7 +4913,7 @@ size_t quantize_q4_dpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst block_q4_dpt * q4 = (block_q4_dpt *) qrow; for (int64_t ibl = 0; ibl < nblock; ++ibl) { const float * qw = quant_weights ? quant_weights + QK4_NL * ibl : NULL; - quantize_block_q4_dpt(src + QK4_NL * ibl, &q4[ibl], values, qw, 7); + quantize_block_q4_dpt(src + QK4_NL * ibl, &q4[ibl], values, qw, 15); } src += n_per_row; qrow += nblock * sizeof(block_q4_dpt); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index c3640896444..19bcf9a57a8 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -255,6 +255,7 @@ endif() # New quant tests llama_build_and_test(test-quant-q3kpt.cpp) llama_build_and_test(test-quant-q4dpt.cpp) +llama_build(test-quant-q4dpt-experiment.cpp) # libmtmd set(LLAMA_TEST_NAME test-mtmd-c-api) diff --git a/tests/test-quant-q4dpt-experiment.cpp b/tests/test-quant-q4dpt-experiment.cpp new file mode 100644 index 00000000000..ba6fbab48dc --- /dev/null +++ b/tests/test-quant-q4dpt-experiment.cpp @@ -0,0 +1,289 @@ +// test-quant-q4dpt-experiment.cpp +// Sweep quantization knobs to find the best combination for Gaussian data. 
+ +#include "ggml-backend.h" +#include "ggml.h" +#include +#include +#include +#include +#include +#include +#include + +extern "C" { +void q4dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, int8_t levels_out[16]); +void q4dpt_set_levels(const int8_t * levels); +void dequantize_row_q4_dpt(const void * x, float * y, int64_t k); +} + +static const int8_t kvalues_iq4nl[16] = { + -127, -104, -83, -65, -49, -35, -22, -10, + 1, 13, 25, 38, 53, 69, 89, 113, +}; + +#define Q4DPT_N_LEVELS 16 +#define QK4_NL 32 + +// --- block_q4_dpt layout (same as block_iq4_nl) --- +#pragma pack(push, 1) +struct block_q4_dpt { + uint16_t d; // f16 delta + uint8_t qs[QK4_NL/2]; // nibbles +}; +#pragma pack(pop) + +// --- Configurable quantization function --- +struct QuantConfig { + bool use_element_weights; // weight[j] = xb[j]^2 vs uniform + bool allow_neg_scale; // d can be negative (IQ4_NL-style init) + bool iq4nl_perturbation; // id = (itry+values[0])/max vs d = d_base*(1+itry*frac) + bool final_reassign; // re-assign after finding best scale + float perturb_range; // perturbation range for generic perturbation (e.g., 0.1) + int ntry; // number of perturbation steps + const char * name; +}; + +static int best_index(int n, const int8_t * values, float x) { + int best = 0; + float bd = fabsf(x - (float)values[0]); + for (int k = 1; k < n; ++k) { + float d = fabsf(x - (float)values[k]); + if (d < bd) { bd = d; best = k; } + } + return best; +} + +static void quantize_block_experiment(const float * xb, block_q4_dpt * out, + const int8_t * values, const QuantConfig & cfg) { + float amax = 0.0f, max_val = 0.0f; + for (int j = 0; j < QK4_NL; ++j) { + float ax = fabsf(xb[j]); + if (ax > amax) { amax = ax; max_val = xb[j]; } + } + if (amax < 1e-10f) { + out->d = 0; + memset(out->qs, 0, QK4_NL/2); + return; + } + + // Weights + float weight[QK4_NL]; + if (cfg.use_element_weights) { + for (int j = 0; j < QK4_NL; ++j) { weight[j] = xb[j] * xb[j]; } + } 
else { + for (int j = 0; j < QK4_NL; ++j) { weight[j] = 1.0f; } + } + + // Find max abs level + float max_abs_level = 0.0f; + for (int k = 0; k < Q4DPT_N_LEVELS; ++k) { + float al = fabsf((float)values[k]); + if (al > max_abs_level) max_abs_level = al; + } + if (max_abs_level < 1e-10f) max_abs_level = 1.0f; + + // Initial scale + float d; + if (cfg.allow_neg_scale) { + d = cfg.ntry > 0 ? -max_val / (float)values[0] : max_val / (float)values[0]; + } else { + d = amax / max_abs_level; + } + float id = (fabsf(d) > 1e-20f) ? 1.0f / d : 0.0f; + + // Initial assignment + uint8_t L[QK4_NL]; + float sumqx = 0.0f, sumq2 = 0.0f; + for (int j = 0; j < QK4_NL; ++j) { + float al = id * xb[j]; + L[j] = (uint8_t)best_index(Q4DPT_N_LEVELS, values, al); + float q = (float)values[L[j]]; + float w = weight[j]; + sumqx += w * q * xb[j]; + sumq2 += w * q * q; + } + d = (sumq2 > 1e-20f) ? sumqx / sumq2 : d; + float best_metric = d * sumqx; // sumqx^2/sumq2 proxy + uint8_t best_L[QK4_NL]; + memcpy(best_L, L, QK4_NL); + float best_d = d; + + int ntry = cfg.ntry; + + // Scale perturbation + if (cfg.iq4nl_perturbation) { + // IQ4_NL-style: id = (itry + values[0]) / max_val + for (int itry = -ntry; itry <= ntry; ++itry) { + id = ((float)itry + (float)values[0]) / max_val; + sumqx = sumq2 = 0.0f; + for (int j = 0; j < QK4_NL; ++j) { + float al = id * xb[j]; + L[j] = (uint8_t)best_index(Q4DPT_N_LEVELS, values, al); + float q = (float)values[L[j]]; + float w = weight[j]; + sumqx += w * q * xb[j]; + sumq2 += w * q * q; + } + if (sumq2 > 0.0f && sumqx * sumqx > best_metric * sumq2) { + d = sumqx / sumq2; + best_metric = d * sumqx; + best_d = d; + memcpy(best_L, L, QK4_NL); + } + } + } else { + // Generic: d = d_base * (1 + itry * range / ntry) + float d_base = best_d; + for (int itry = -ntry; itry <= ntry; ++itry) { + d = d_base * (1.0f + (float)itry * (cfg.perturb_range / ntry)); + if (fabsf(d) < 1e-20f) continue; + id = 1.0f / d; + sumqx = sumq2 = 0.0f; + for (int j = 0; j < QK4_NL; ++j) { + 
float al = id * xb[j];
+                L[j] = (uint8_t)best_index(Q4DPT_N_LEVELS, values, al);
+                float q = (float)values[L[j]];
+                float w = weight[j];
+                sumqx += w * q * xb[j];
+                sumq2 += w * q * q;
+            }
+            if (sumq2 > 0.0f && sumqx * sumqx > best_metric * sumq2) {
+                d = sumqx / sumq2;
+                best_metric = d * sumqx;
+                best_d = d;
+                memcpy(best_L, L, QK4_NL);
+            }
+        }
+    }
+
+    // Final re-assignment
+    if (cfg.final_reassign) {
+        id = (fabsf(best_d) > 1e-20f) ? 1.0f / best_d : 0.0f;
+        for (int j = 0; j < QK4_NL; ++j) {
+            float al = id * xb[j];
+            best_L[j] = (uint8_t)best_index(Q4DPT_N_LEVELS, values, al);
+        }
+    }
+
+    // Store FP16 scale via memcpy of the raw half-float bits
+    uint16_t d_fp16;
+    {
+        float f = best_d;
+        // Use ggml's conversion
+        ggml_fp16_t h = ggml_fp32_to_fp16(f);
+        memcpy(&d_fp16, &h, sizeof(d_fp16));
+    }
+    out->d = d_fp16;
+    for (int j = 0; j < QK4_NL/2; ++j) {
+        out->qs[j] = best_L[j] | (best_L[j + QK4_NL/2] << 4);
+    }
+}
+
+// ---------------------------------------------------------------------------
+static float rmse_vec(const float * a, const float * b, size_t n) {
+    double s = 0;
+    for (size_t i = 0; i < n; ++i) { double d = (double)a[i] - (double)b[i]; s += d*d; }
+    return (float)std::sqrt(s / (double)n);
+}
+
+static float experiment_rmse(const float * data, size_t nrow, size_t n_per_row,
+                             const int8_t * levels, const QuantConfig & cfg) {
+    q4dpt_set_levels(levels);
+    size_t nblock = n_per_row / QK4_NL;
+    size_t total = nrow * n_per_row;
+    std::vector<block_q4_dpt> qblocks(nrow * nblock);
+    std::vector<float> deq(total);
+
+    for (size_t row = 0; row < nrow; ++row) {
+        for (size_t ib = 0; ib < nblock; ++ib) {
+            quantize_block_experiment(
+                data + row * n_per_row + ib * QK4_NL,
+                &qblocks[row * nblock + ib], levels, cfg);
+        }
+    }
+
+    // Dequantize
+    const ggml_type_traits * tr = ggml_get_type_traits(GGML_TYPE_Q4_DPT);
+    for (size_t row = 0; row < nrow; ++row) {
+        tr->to_float((const void *)&qblocks[row * nblock],
+                     deq.data() + row * n_per_row, (int64_t)n_per_row);
+    }
+
+    return 
rmse_vec(data, deq.data(), total);
+}
+
+static float iq4nl_rmse(const float * data, size_t nrow, size_t n_per_row) {
+    size_t rs = ggml_row_size(GGML_TYPE_IQ4_NL, n_per_row);
+    std::vector<uint8_t> qb(nrow * rs);
+    std::vector<float> dq(nrow * n_per_row);
+    ggml_quantize_chunk(GGML_TYPE_IQ4_NL, data, qb.data(), 0, nrow, n_per_row, nullptr);
+    const ggml_type_traits * tr = ggml_get_type_traits(GGML_TYPE_IQ4_NL);
+    for (size_t r = 0; r < nrow; ++r) {
+        tr->to_float(qb.data() + r * rs, dq.data() + r * n_per_row, (int64_t)n_per_row);
+    }
+    return rmse_vec(data, dq.data(), nrow * n_per_row);
+}
+
+int main() {
+    ggml_backend_load_all();
+
+    // Generate Gaussian data
+    std::mt19937 rng(0xdeadbeef);
+    const size_t nrow = 64, ncol = 4096;
+    std::vector<float> data(nrow * ncol);
+    std::normal_distribution<float> nd(0, 0.02f);
+    for (auto & v : data) { v = nd(rng); }
+
+    float ref = iq4nl_rmse(data.data(), nrow, ncol);
+    printf("IQ4_NL reference RMSE: %.6f\n\n", ref);
+
+    // Train levels
+    int8_t trained_levels[Q4DPT_N_LEVELS];
+    q4dpt_train_levels(data.data(), (int64_t)nrow, (int64_t)ncol, nullptr, trained_levels);
+
+    printf("Trained levels: ");
+    for (int k = 0; k < Q4DPT_N_LEVELS; ++k) printf("%4d", trained_levels[k]);
+    printf("\nIQ4_NL levels: ");
+    for (int k = 0; k < Q4DPT_N_LEVELS; ++k) printf("%4d", kvalues_iq4nl[k]);
+    printf("\n\n");
+
+    // Define configurations to test
+    QuantConfig configs[] = {
+        // elem_w, neg_scale, iq4nl_perturb, final_reassign, perturb_range, ntry, name
+        { false, false, false, false, 0.10f, 7, "A: baseline (uniform w, pos scale, generic 10%)" },
+        { true, false, false, false, 0.10f, 7, "B: elem_w only" },
+        { false, true, true, false, 0.0f, 7, "C: neg_scale + iq4nl_perturb" },
+        { true, true, true, false, 0.0f, 7, "D: elem_w + neg_scale + iq4nl_perturb" },
+        { true, true, true, true, 0.0f, 7, "E: D + final_reassign" },
+        { false, false, false, false, 0.05f, 7, "F: uniform w, pos scale, generic 5%" },
+        { false, false, false, false, 0.20f, 7, "G: uniform w, pos scale, 
generic 20%" }, + { false, false, false, false, 0.10f, 15,"H: baseline ntry=15" }, + { true, false, false, false, 0.10f, 15,"I: elem_w ntry=15" }, + { false, true, true, true, 0.0f, 7, "J: neg_scale + iq4nl_perturb + reassign (no elem_w)" }, + { false, true, true, true, 0.0f, 15,"K: J with ntry=15" }, + { true, true, true, true, 0.0f, 15,"L: all features ntry=15" }, + { false, false, false, true, 0.10f, 7, "M: baseline + reassign" }, + { true, false, false, true, 0.10f, 7, "N: elem_w + reassign" }, + { false, true, false, false, 0.10f, 7, "O: neg_scale + generic 10%" }, + { false, true, false, true, 0.10f, 7, "P: neg_scale + generic 10% + reassign" }, + { true, true, false, true, 0.10f, 7, "Q: elem_w + neg_scale + generic 10% + reassign" }, + }; + int nconfigs = sizeof(configs) / sizeof(configs[0]); + + printf("%-55s %8s %8s %8s %8s\n", "Configuration", "Trained", "IQ4_NL", "Tr.Ratio", "NL.Ratio"); + printf("%-55s %8s %8s %8s %8s\n", + "-------------------------------------------------------", "--------", "--------", "--------", "--------"); + + for (int c = 0; c < nconfigs; ++c) { + float rmse_trained = experiment_rmse(data.data(), nrow, ncol, trained_levels, configs[c]); + float rmse_iq4nl = experiment_rmse(data.data(), nrow, ncol, kvalues_iq4nl, configs[c]); + printf("%-55s %8.6f %8.6f %8.4f %8.4f\n", + configs[c].name, rmse_trained, rmse_iq4nl, + rmse_trained / ref, rmse_iq4nl / ref); + } + + printf("\n(Ratio < 1.0 = better than IQ4_NL's native quantization)\n"); + return 0; +} From f4081f42daf17e70005a79887aa73902e1f7065f Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Sun, 1 Mar 2026 16:14:02 +0100 Subject: [PATCH 6/9] Attempts at civilizing the passing of kvalues (not working yet) --- ggml/src/ggml-cpu/arch/x86/quants.c | 3 +- ggml/src/ggml-cpu/ggml-cpu.c | 9 ++ ggml/src/ggml-cpu/llamafile/sgemm.cpp | 2 +- ggml/src/ggml-cpu/quants.c | 6 +- ggml/src/ggml-cuda/convert.cu | 2 +- ggml/src/ggml-cuda/ggml-cuda.cu | 6 + ggml/src/ggml-cuda/mmq.cu | 10 +- 
ggml/src/ggml-cuda/mmvq.cu | 10 +- ggml/src/ggml-quants.c | 143 +++---------------- ggml/src/ggml-quants.h | 13 +- src/llama-context.cpp | 6 +- src/llama-graph.cpp | 57 ++++++++ src/llama-graph.h | 39 ++++++ src/llama-model.cpp | 189 +++++++++----------------- src/llama-model.h | 7 + 15 files changed, 225 insertions(+), 277 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c index aeac7dcc7bf..b726ef63584 100644 --- a/ggml/src/ggml-cpu/arch/x86/quants.c +++ b/ggml/src/ggml-cpu/arch/x86/quants.c @@ -3731,8 +3731,7 @@ void ggml_vec_dot_q4_dpt_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v const int nb = n / QK4_NL; - // Per-tensor levels — looked up from the CPU-side registry once per call. - const int8_t * values = q4dpt_get_tensor_levels(vx); + const int8_t * values = (const int8_t *)ggml_quant_get_current_levels(GGML_TYPE_Q4_DPT); GGML_ASSERT(values != NULL && "Q4_DPT levels not set for tensor"); int ib = 0; diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index f4441b32d94..e94dffd3281 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3,6 +3,7 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" +#include "ggml-quants.h" #include "traits.h" #include "ggml-cpu-impl.h" #include "ggml-impl.h" @@ -1278,6 +1279,14 @@ void ggml_compute_forward_mul_mat( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows + // Set current per-tensor quantization levels from graph input (src[2] or src[3]) + { + const int levels_src = (dst->op == GGML_OP_MUL_MAT_ID) ? 
3 : 2; + if (dst->src[levels_src] && dst->src[levels_src]->data) { + ggml_quant_set_current_levels(src0->type, dst->src[levels_src]->data); + } + } + // TODO: extract to "extra_op" #if GGML_USE_LLAMAFILE // broadcast factors diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index 809c5dc108f..ef2b0a2e852 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -4023,7 +4023,7 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64 #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) // Q4_DPT has identical block layout to IQ4_NL (block_q4_dpt = block_iq4_nl) // but uses a per-tensor lookup table instead of the fixed IQ4_NL values. - const int8_t * levels = q4dpt_get_tensor_levels(A); + const int8_t * levels = (const int8_t *)ggml_quant_get_current_levels(GGML_TYPE_Q4_DPT); if (!levels) return false; tinyBLAS_Q0_AVX tb{ k, (const block_iq4_nl *)A, lda, diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index 13f57c96e9f..9a01a110e52 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -1014,7 +1014,7 @@ void ggml_vec_dot_q3_pt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const int nb = n / QK_K; - const float * levels = q3pt_get_tensor_levels(vx); + const float * levels = (const float *)ggml_quant_get_current_levels(GGML_TYPE_Q3_PT); GGML_ASSERT(levels != NULL && "Q3_PT levels not set for tensor"); float sumf = 0.f; @@ -1114,7 +1114,7 @@ void ggml_vec_dot_q3_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const int nb = n / QK_K; - const float * levels = q3kpt_get_tensor_levels(vx); + const float * levels = (const float *)ggml_quant_get_current_levels(GGML_TYPE_Q3_KPT); GGML_ASSERT(levels != NULL && "Q3_KPT levels not set for tensor"); const uint32_t kmask1 = 0x03030303; @@ -1189,7 +1189,7 @@ void ggml_vec_dot_q4_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const int 
nb = n / QK4_NL;
 
-    const int8_t * values = q4dpt_get_tensor_levels(vx);
+    const int8_t * values = (const int8_t *)ggml_quant_get_current_levels(GGML_TYPE_Q4_DPT);
     GGML_ASSERT(values != NULL && "Q4_DPT levels not set for tensor");
 
     float sumf = 0;
diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
index c803e871349..3fb03091991 100644
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -596,7 +596,7 @@ static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int64_t
 void ggml_cuda_set_q4dpt_levels(const int8_t * levels, cudaStream_t stream) {
     int8_t * d_q4dpt_levels;
     CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q4dpt_levels, q4dpt_levels_cuda));
-    CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, levels, 16, cudaMemcpyHostToDevice, stream));
+    CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, levels, 16, cudaMemcpyDeviceToDevice, stream));
 }
 
 template <typename dst_t>
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index fabbf0daeb6..4118087103d 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -1341,6 +1341,12 @@ static void ggml_cuda_op_mul_mat_cublas(
         ggml_cuda_pool_alloc<float> src1_ddq_as_f32(ctx.pool(id));
         if (src0->type != GGML_TYPE_F32) {
+            // Set Q4_DPT levels in convert.cu's TU before dequantize
+            if (src0->type == GGML_TYPE_Q4_DPT) {
+                const int levels_src = (dst->op == GGML_OP_MUL_MAT_ID) ? 
3 : 2; + GGML_ASSERT(dst->src[levels_src] && dst->src[levels_src]->data); + ggml_cuda_set_q4dpt_levels((const int8_t *)dst->src[levels_src]->data, stream); + } const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); GGML_ASSERT(to_fp32_cuda != nullptr); src0_ddq_as_f32.alloc(row_diff*ne00); diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index 5cb1f6f9123..72f125661aa 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -3,7 +3,6 @@ #include "quantize.cuh" #include "mmid.cuh" #include "convert.cuh" -#include "ggml-quants.h" static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) { switch (args.type_x) { @@ -84,13 +83,14 @@ void ggml_cuda_mul_mat_q( cudaStream_t stream = ctx.stream(); const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; - // Set Q4_DPT lookup table if needed + // Set Q4_DPT lookup table from graph input levels + // MUL_MAT: levels in src[2], MUL_MAT_ID: levels in src[3] (src[2] is expert IDs) if (src0->type == GGML_TYPE_Q4_DPT) { - const int8_t * levels = q4dpt_get_tensor_levels(src0->data); - GGML_ASSERT(levels != NULL && "Q4_DPT tensor levels not set"); + const int levels_src = (dst->op == GGML_OP_MUL_MAT_ID) ? 
3 : 2; + GGML_ASSERT(dst->src[levels_src] && dst->src[levels_src]->data && "Q4_DPT MUL_MAT requires levels"); int8_t * d_q4dpt_levels; CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q4dpt_levels, q4dpt_levels_cuda)); - CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, levels, 16, cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, dst->src[levels_src]->data, 16, cudaMemcpyDeviceToDevice, stream)); } const size_t ts_src0 = ggml_type_size(src0->type); diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 83434c6a09d..74707055756 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -3,7 +3,6 @@ #include "unary.cuh" #include "vecdotq.cuh" #include "convert.cuh" -#include "ggml-quants.h" #include @@ -656,13 +655,14 @@ void ggml_cuda_mul_mat_vec_q( cudaStream_t stream = ctx.stream(); - // Set Q4_DPT lookup table if needed + // Set Q4_DPT lookup table from graph input levels + // MUL_MAT: levels in src[2], MUL_MAT_ID: levels in src[3] (src[2] is expert IDs) if (src0->type == GGML_TYPE_Q4_DPT) { - const int8_t * levels = q4dpt_get_tensor_levels(src0->data); - GGML_ASSERT(levels != NULL && "Q4_DPT tensor levels not set"); + const int levels_src = (dst->op == GGML_OP_MUL_MAT_ID) ? 
3 : 2; + GGML_ASSERT(dst->src[levels_src] && dst->src[levels_src]->data && "Q4_DPT MUL_MAT requires levels"); int8_t * d_q4dpt_levels; CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q4dpt_levels, q4dpt_levels_cuda)); - CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, levels, 16, cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, dst->src[levels_src]->data, 16, cudaMemcpyDeviceToDevice, stream)); } const size_t ts_src0 = ggml_type_size(src0->type); diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index e9c07c7ca27..92041557ce0 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -4092,6 +4092,7 @@ static bool q3kpt_levels_set = false; GGML_API void q3kpt_set_levels(const float * levels) { memcpy(q3kpt_levels, levels, Q3KPT_N_LEVELS * sizeof(float)); q3kpt_levels_set = true; + ggml_quant_set_current_levels(GGML_TYPE_Q3_KPT, q3kpt_levels); } GGML_API const float * q3kpt_get_levels(void) { @@ -4102,47 +4103,6 @@ GGML_API void q3kpt_free_levels(void) { q3kpt_levels_set = false; } -// Per-tensor levels registry for inference -#define Q3KPT_MAX_TENSORS 1024 - -typedef struct { - const void * data; - size_t nbytes; - float levels[Q3KPT_N_LEVELS]; -} q3kpt_tensor_entry; - -static q3kpt_tensor_entry q3kpt_tensor_registry[Q3KPT_MAX_TENSORS]; -static int q3kpt_tensor_registry_count = 0; - -GGML_API void q3kpt_register_tensor_levels(const void * data, size_t nbytes, const float * levels) { - if (q3kpt_tensor_registry_count >= Q3KPT_MAX_TENSORS) { return; } - for (int i = 0; i < q3kpt_tensor_registry_count; ++i) { - if (q3kpt_tensor_registry[i].data == data) { - q3kpt_tensor_registry[i].nbytes = nbytes; - memcpy(q3kpt_tensor_registry[i].levels, levels, Q3KPT_N_LEVELS * sizeof(float)); - return; - } - } - q3kpt_tensor_registry[q3kpt_tensor_registry_count].data = data; - q3kpt_tensor_registry[q3kpt_tensor_registry_count].nbytes = nbytes; - memcpy(q3kpt_tensor_registry[q3kpt_tensor_registry_count].levels, levels, Q3KPT_N_LEVELS * 
sizeof(float)); - q3kpt_tensor_registry_count++; -} - -GGML_API void q3kpt_clear_tensor_levels(void) { - q3kpt_tensor_registry_count = 0; -} - -GGML_API const float * q3kpt_get_tensor_levels(const void * data_ptr) { - const uint8_t * p = (const uint8_t *)data_ptr; - for (int i = 0; i < q3kpt_tensor_registry_count; ++i) { - const uint8_t * base = (const uint8_t *)q3kpt_tensor_registry[i].data; - if (p >= base && p < base + q3kpt_tensor_registry[i].nbytes) { - return q3kpt_tensor_registry[i].levels; - } - } - return q3kpt_get_levels(); -} // Train levels in the symmetric quantization space GGML_API void q3kpt_train_levels(const float * data, @@ -4274,7 +4234,7 @@ GGML_API void q3kpt_train_levels(const float * data, void dequantize_row_q3_kpt(const block_q3_kpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; - const float * levels = q3kpt_get_tensor_levels(x); + const float * levels = (const float *)ggml_quant_get_current_levels(GGML_TYPE_Q3_KPT); GGML_ASSERT(levels != NULL && "Q3_KPT levels not set for tensor"); // levels are in [0,1], map to approximate [-4, 3] range for Q3_K compatibility @@ -4587,6 +4547,7 @@ static bool q4dpt_levels_set = false; void q4dpt_set_levels(const int8_t * levels) { memcpy(q4dpt_levels, levels, Q4DPT_N_LEVELS * sizeof(int8_t)); q4dpt_levels_set = true; + ggml_quant_set_current_levels(GGML_TYPE_Q4_DPT, q4dpt_levels); } const int8_t * q4dpt_get_levels(void) { @@ -4597,47 +4558,6 @@ void q4dpt_free_levels(void) { q4dpt_levels_set = false; } -// Per-tensor levels registry (inference — range-based lookup by data address) -#define Q4DPT_MAX_TENSORS 1024 - -typedef struct { - const void * data; - size_t nbytes; - int8_t levels[Q4DPT_N_LEVELS]; -} q4dpt_tensor_entry; - -static q4dpt_tensor_entry q4dpt_tensor_registry[Q4DPT_MAX_TENSORS]; -static int q4dpt_tensor_registry_count = 0; - -void q4dpt_register_tensor_levels(const void * data, size_t nbytes, const int8_t * levels) { - if 
(q4dpt_tensor_registry_count >= Q4DPT_MAX_TENSORS) { return; } - for (int i = 0; i < q4dpt_tensor_registry_count; ++i) { - if (q4dpt_tensor_registry[i].data == data) { - q4dpt_tensor_registry[i].nbytes = nbytes; - memcpy(q4dpt_tensor_registry[i].levels, levels, Q4DPT_N_LEVELS * sizeof(int8_t)); - return; - } - } - q4dpt_tensor_registry[q4dpt_tensor_registry_count].data = data; - q4dpt_tensor_registry[q4dpt_tensor_registry_count].nbytes = nbytes; - memcpy(q4dpt_tensor_registry[q4dpt_tensor_registry_count].levels, levels, Q4DPT_N_LEVELS * sizeof(int8_t)); - q4dpt_tensor_registry_count++; -} - -void q4dpt_clear_tensor_levels(void) { - q4dpt_tensor_registry_count = 0; -} - -const int8_t * q4dpt_get_tensor_levels(const void * data_ptr) { - const uint8_t * p = (const uint8_t *) data_ptr; - for (int i = 0; i < q4dpt_tensor_registry_count; ++i) { - const uint8_t * base = (const uint8_t *) q4dpt_tensor_registry[i].data; - if (p >= base && p < base + q4dpt_tensor_registry[i].nbytes) { - return q4dpt_tensor_registry[i].levels; - } - } - return q4dpt_get_levels(); -} // Run Lloyd-Max iterations on a pre-built histogram. // levels[] is updated in-place (and kept sorted). 
@@ -4799,7 +4719,7 @@ void q4dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, void dequantize_row_q4_dpt(const block_q4_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK4_NL == 0); const int64_t nb = k / QK4_NL; - const int8_t * values = q4dpt_get_tensor_levels(x); + const int8_t * values = (const int8_t *)ggml_quant_get_current_levels(GGML_TYPE_Q4_DPT); GGML_ASSERT(values != NULL && "Q4_DPT levels not set for tensor"); for (int i = 0; i < nb; i++) { @@ -4933,6 +4853,7 @@ static bool q3pt_levels_set = false; void q3pt_set_levels(const float * levels) { memcpy(q3pt_levels, levels, Q3PT_N_LEVELS * sizeof(float)); q3pt_levels_set = true; + ggml_quant_set_current_levels(GGML_TYPE_Q3_PT, q3pt_levels); } const float * q3pt_get_levels(void) { @@ -4943,47 +4864,6 @@ void q3pt_free_levels(void) { q3pt_levels_set = false; } -// Per-tensor levels registry for inference (range-based lookup by data address) -#define Q3PT_MAX_TENSORS 1024 - -typedef struct { - const void * data; - size_t nbytes; - float levels[Q3PT_N_LEVELS]; -} q3pt_tensor_entry; - -static q3pt_tensor_entry q3pt_tensor_registry[Q3PT_MAX_TENSORS]; -static int q3pt_tensor_registry_count = 0; - -GGML_API void q3pt_register_tensor_levels(const void * data, size_t nbytes, const float * levels) { - if (q3pt_tensor_registry_count >= Q3PT_MAX_TENSORS) { return; } - for (int i = 0; i < q3pt_tensor_registry_count; ++i) { - if (q3pt_tensor_registry[i].data == data) { - q3pt_tensor_registry[i].nbytes = nbytes; - memcpy(q3pt_tensor_registry[i].levels, levels, Q3PT_N_LEVELS * sizeof(float)); - return; - } - } - q3pt_tensor_registry[q3pt_tensor_registry_count].data = data; - q3pt_tensor_registry[q3pt_tensor_registry_count].nbytes = nbytes; - memcpy(q3pt_tensor_registry[q3pt_tensor_registry_count].levels, levels, Q3PT_N_LEVELS * sizeof(float)); - q3pt_tensor_registry_count++; -} - -GGML_API void q3pt_clear_tensor_levels(void) { - q3pt_tensor_registry_count = 0; -} - -GGML_API const 
float * q3pt_get_tensor_levels(const void * data_ptr) { - const uint8_t * p = (const uint8_t *)data_ptr; - for (int i = 0; i < q3pt_tensor_registry_count; ++i) { - const uint8_t * base = (const uint8_t *)q3pt_tensor_registry[i].data; - if (p >= base && p < base + q3pt_tensor_registry[i].nbytes) { - return q3pt_tensor_registry[i].levels; - } - } - return q3pt_get_levels(); -} void q3pt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, float levels_out[Q3PT_N_LEVELS]) { @@ -5119,7 +4999,7 @@ static inline void q3pt_pack3(uint8_t * GGML_RESTRICT qs, int k, int v) { void dequantize_row_q3_pt(const block_q3_pt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; - const float * L = q3pt_get_tensor_levels(x); + const float * L = (const float *)ggml_quant_get_current_levels(GGML_TYPE_Q3_PT); GGML_ASSERT(L != NULL && "Q3_PT levels not set for tensor"); for (int i = 0; i < nb; i++) { @@ -6668,3 +6548,14 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte return true; } +// Unified per-type current-levels pointer (set by CPU MUL_MAT dispatch from graph input) +static const void * ggml_quant_current_levels[GGML_TYPE_COUNT] = { NULL }; + +void ggml_quant_set_current_levels(enum ggml_type type, const void * data) { + ggml_quant_current_levels[type] = data; +} + +const void * ggml_quant_get_current_levels(enum ggml_type type) { + return ggml_quant_current_levels[type]; +} + diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 48206702767..a6fef9ef43c 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -86,9 +86,6 @@ GGML_API size_t quantize_q3_kpt(const float * GGML_RESTRICT src, void * GGML_RES GGML_API void q3kpt_set_levels(const float * levels); GGML_API const float * q3kpt_get_levels(void); GGML_API void q3kpt_free_levels(void); -GGML_API void q3kpt_register_tensor_levels(const void * data, size_t nbytes, const float * 
levels); -GGML_API void q3kpt_clear_tensor_levels(void); -GGML_API const float * q3kpt_get_tensor_levels(const void * data_ptr); GGML_API void q3kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, float levels_out[Q3KPT_N_LEVELS]); GGML_API size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); @@ -119,9 +116,6 @@ GGML_API const float * q3pt_get_levels(void); GGML_API void q3pt_free_levels(void); // Per-tensor levels registry (inference — range-based lookup by data address) -GGML_API void q3pt_register_tensor_levels(const void * data, size_t nbytes, const float * levels); -GGML_API void q3pt_clear_tensor_levels(void); -GGML_API const float * q3pt_get_tensor_levels(const void * data_ptr); // Train 8 Lloyd-Max levels from tensor data via weighted k-means on affine-normalized // 16-element sub-block values. Also sets the global levels via q3pt_set_levels(). @@ -139,10 +133,9 @@ GGML_API void q4dpt_set_levels(const int8_t * levels); GGML_API const int8_t * q4dpt_get_levels(void); GGML_API void q4dpt_free_levels(void); -// Per-tensor levels registry (inference) -GGML_API void q4dpt_register_tensor_levels(const void * data, size_t nbytes, const int8_t * levels); -GGML_API void q4dpt_clear_tensor_levels(void); -GGML_API const int8_t * q4dpt_get_tensor_levels(const void * data_ptr); +// Unified per-type current-levels pointer (set by CPU MUL_MAT dispatch from graph input src[2]) +GGML_API void ggml_quant_set_current_levels(enum ggml_type type, const void * data); +GGML_API const void * ggml_quant_get_current_levels(enum ggml_type type); // Train 16 Lloyd-Max int8 levels from tensor data. // Bins normalized values (x/amax) in [-1,1], runs weighted k-means, rounds to sorted int8[16]. 
diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 98d055d34ef..819c80d148b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2044,8 +2044,10 @@ llm_graph_params llama_context::graph_params( /*.cvec =*/ cvec.get(), /*.loras =*/ loras.get(), /*.mctx =*/ mctx, - /*.cross =*/ &cross, - /*.samplers =*/ sampling.samplers, + /*.cross =*/ &cross, + /*.quant_levels_data =*/ &model.quant_levels_data, + /*.quant_level_index =*/ &model.quant_level_index, + /*.samplers =*/ sampling.samplers, /*.n_outputs =*/ n_outputs, /*.cb =*/ graph_get_cb(), /*.res =*/ res, diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 23a86ea2905..f696e50b93b 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -877,12 +877,15 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : loras (params.loras), mctx (params.mctx), cross (params.cross), + quant_levels_data(params.quant_levels_data), + quant_level_index(params.quant_level_index), samplers (params.samplers), cb_func (params.cb), res (params.res), ctx0 (res->get_ctx()), gf (res->get_gf()) { res->set_params(params); + build_inp_quant_levels(); } void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const { @@ -1684,6 +1687,60 @@ ggml_tensor * llm_graph_context::build_inp_cross_embd() const { return cur; } +void llm_graph_context::build_inp_quant_levels() { + if (!quant_levels_data || quant_levels_data->empty()) { return; } + + auto inp = std::make_unique(*quant_levels_data); + + for (const auto & [type, data] : *quant_levels_data) { + if (data.empty()) { continue; } + ggml_type tensor_type; + int64_t n_elem; + if (type == GGML_TYPE_Q4_DPT) { + tensor_type = GGML_TYPE_I8; + n_elem = (int64_t)data.size(); + } else { // Q3_PT, Q3_KPT + tensor_type = GGML_TYPE_F32; + n_elem = (int64_t)(data.size() / sizeof(float)); + } + inp->levels[type] = ggml_new_tensor_1d(ctx0, tensor_type, n_elem); + ggml_set_input(inp->levels[type]); + } + + quant_levels_inp = inp.get(); + 
res->add_input(std::move(inp)); +} + +void llm_graph_context::attach_quant_levels() { + if (!quant_levels_inp || !quant_level_index) { return; } + + const int n_nodes = ggml_graph_n_nodes(gf); + for (int i = 0; i < n_nodes; i++) { + ggml_tensor * node = ggml_graph_node(gf, i); + if (node->op != GGML_OP_MUL_MAT && node->op != GGML_OP_MUL_MAT_ID) { continue; } + + ggml_tensor * w = node->src[0]; + ggml_tensor * levels_all = quant_levels_inp->levels[w->type]; + if (!levels_all) { continue; } + + auto it = quant_level_index->find(ggml_get_name(w)); + if (it == quant_level_index->end()) { continue; } + + const int64_t n_levels = (w->type == GGML_TYPE_Q4_DPT) ? 16 : 8; + const size_t elem_size = ggml_type_size(levels_all->type); + ggml_tensor * lv = ggml_view_1d(ctx0, levels_all, + n_levels, (int64_t)(it->second * (size_t)n_levels * elem_size)); + + // MUL_MAT: src[0]=weights, src[1]=input, src[2] is free + // MUL_MAT_ID: src[0]=weights, src[1]=input, src[2]=expert_ids, src[3] is free + if (node->op == GGML_OP_MUL_MAT) { + node->src[2] = lv; + } else { + node->src[3] = lv; + } + } +} + ggml_tensor * llm_graph_context::build_inp_pos_bucket_enc() const { auto inp = std::make_unique(hparams); diff --git a/src/llama-graph.h b/src/llama-graph.h index e8f006977d2..e849c541fa8 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -501,6 +501,34 @@ class llm_graph_input_sampling : public llm_graph_input_i { std::map samplers; }; +class llm_graph_input_quant_levels : public llm_graph_input_i { +public: + llm_graph_input_quant_levels( + const std::unordered_map> & data) + : levels_data(data) {} + virtual ~llm_graph_input_quant_levels() = default; + + void set_input(const llama_ubatch * ubatch) override { + GGML_UNUSED(ubatch); + for (const auto & [type, data] : levels_data) { + if (levels[type] && !data.empty()) { + ggml_backend_tensor_set(levels[type], data.data(), 0, data.size()); + } + } + } + + bool can_reuse(const llm_graph_params & params) override { + 
GGML_UNUSED(params); + return true; // levels don't change between batches + } + + // per-type input tensors, created during graph build + ggml_tensor * levels[GGML_TYPE_COUNT] = {}; + + // reference to model's host-side level data for populating + const std::unordered_map> & levels_data; +}; + // // llm_graph_result // @@ -534,6 +562,10 @@ struct llm_graph_params { const llama_memory_context_i * mctx; const llama_cross * cross; + // per-tensor quantization levels (Q4_DPT, Q3_PT, Q3_KPT) + const std::unordered_map> * quant_levels_data = nullptr; + const std::unordered_map * quant_level_index = nullptr; + std::map samplers; static bool samplers_equal( @@ -742,6 +774,11 @@ struct llm_graph_context { const llama_memory_context_i * mctx; const llama_cross * cross; + // per-tensor quantization levels + const std::unordered_map> * quant_levels_data = nullptr; + const std::unordered_map * quant_level_index = nullptr; + llm_graph_input_quant_levels * quant_levels_inp = nullptr; + std::map samplers; const llm_graph_cb & cb_func; @@ -852,6 +889,8 @@ struct llm_graph_context { ggml_tensor * build_inp_cls() const; ggml_tensor * build_inp_cross_embd() const; + void build_inp_quant_levels(); + void attach_quant_levels(); ggml_tensor * build_inp_pos_bucket_enc() const; ggml_tensor * build_inp_pos_bucket_dec() const; ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index bef7c57dc0b..f03278485b4 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -15,25 +15,13 @@ #include "models/models.h" -// Q3_PT levels functions (defined in ggml-quants.c) +// Q3_PT/Q3_KPT/Q4_DPT: global fallback levels functions (defined in ggml-quants.c) +// These remain temporarily for the global fallback used by dequantize paths. +// The per-tensor registry functions are removed; per-tensor levels now use graph inputs. 
extern "C" { - void q3pt_set_levels(const float * levels); - void q3pt_register_tensor_levels(const void * data, size_t nbytes, const float * levels); - void q3pt_clear_tensor_levels(void); -} - -// Q3_KPT levels functions (defined in ggml-quants.c) -extern "C" { - void q3kpt_set_levels(const float * levels); - void q3kpt_register_tensor_levels(const void * data, size_t nbytes, const float * levels); - void q3kpt_clear_tensor_levels(void); -} - -// Q4_DPT levels functions (defined in ggml-quants.c) -extern "C" { - void q4dpt_set_levels(const int8_t * levels); - void q4dpt_register_tensor_levels(const void * data, size_t nbytes, const int8_t * levels); - void q4dpt_clear_tensor_levels(void); + void q3pt_set_levels(const float * levels); + void q3kpt_set_levels(const float * levels); + void q4dpt_set_levels(const int8_t * levels); } #include @@ -7871,124 +7859,79 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } - // Q3_PT: load per-tensor levels from GGUF metadata and register them. - // Must happen AFTER load_all_data so tensor data pointers are valid. + // Load per-tensor quantization levels from GGUF metadata into model fields. + // These will be used as graph inputs (see llm_graph_input_quant_levels). 
{ - static const size_t Q3PT_N_LEVELS = 8; - int64_t lv_idx = gguf_find_key(ml.meta.get(), "q3_pt.levels"); - if (lv_idx >= 0) { - const float * lv_data = (const float *)gguf_get_arr_data(ml.meta.get(), lv_idx); - const size_t lv_len = gguf_get_arr_n(ml.meta.get(), lv_idx); - - // Build tensor-name to slot index map (GGUF file order = quantizer order) - std::unordered_map name_to_slot; - { - const int64_t n_tensors = gguf_get_n_tensors(ml.meta.get()); - for (int64_t ti = 0; ti < n_tensors; ++ti) { - name_to_slot[gguf_get_tensor_name(ml.meta.get(), ti)] = (size_t)ti; - } - } - - q3pt_clear_tensor_levels(); - int n_registered = 0; - - for (auto & [ctx, buf_map] : ctx_buf_maps) { - for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { - if (t->type != GGML_TYPE_Q3_PT || t->data == nullptr) { continue; } - auto it = name_to_slot.find(ggml_get_name(t)); - if (it == name_to_slot.end()) { continue; } - const size_t lv_offset = it->second * Q3PT_N_LEVELS; - if (lv_offset + Q3PT_N_LEVELS > lv_len) { continue; } - q3pt_register_tensor_levels(t->data, ggml_nbytes(t), lv_data + lv_offset); - if (n_registered == 0) { - q3pt_set_levels(lv_data + lv_offset); // global fallback - } - n_registered++; - } - } - if (n_registered > 0) { - LLAMA_LOG_INFO("%s: registered %d Q3_PT per-tensor level tables\n", __func__, n_registered); + // Build tensor-name to GGUF-order slot index map (shared across all types) + std::unordered_map gguf_name_to_idx; + { + const int64_t n_tensors = gguf_get_n_tensors(ml.meta.get()); + for (int64_t ti = 0; ti < n_tensors; ++ti) { + gguf_name_to_idx[gguf_get_tensor_name(ml.meta.get(), ti)] = (size_t)ti; } } - } - - // Q3_KPT: load per-tensor levels from GGUF metadata and register them. 
- { - static const size_t Q3KPT_N_LEVELS = 8; - int64_t lv_idx = gguf_find_key(ml.meta.get(), "q3_kpt.levels"); - if (lv_idx >= 0) { - const float * lv_data = (const float *)gguf_get_arr_data(ml.meta.get(), lv_idx); - const size_t lv_len = gguf_get_arr_n(ml.meta.get(), lv_idx); - // Build tensor-name to slot index map (GGUF file order = quantizer order) - std::unordered_map name_to_slot; - { - const int64_t n_tensors = gguf_get_n_tensors(ml.meta.get()); - for (int64_t ti = 0; ti < n_tensors; ++ti) { - name_to_slot[gguf_get_tensor_name(ml.meta.get(), ti)] = (size_t)ti; - } - } - - q3kpt_clear_tensor_levels(); - int n_registered = 0; + struct level_type_info { + ggml_type type; + const char * gguf_key; + size_t n_levels; // number of level values per tensor + size_t elem_bytes; // size of each level value + }; - for (auto & [ctx, buf_map] : ctx_buf_maps) { - for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { - if (t->type != GGML_TYPE_Q3_KPT || t->data == nullptr) { continue; } - auto it = name_to_slot.find(ggml_get_name(t)); - if (it == name_to_slot.end()) { continue; } - const size_t lv_offset = it->second * Q3KPT_N_LEVELS; - if (lv_offset + Q3KPT_N_LEVELS > lv_len) { continue; } - q3kpt_register_tensor_levels(t->data, ggml_nbytes(t), lv_data + lv_offset); - if (n_registered == 0) { - q3kpt_set_levels(lv_data + lv_offset); // global fallback - } - n_registered++; - } - } - if (n_registered > 0) { - LLAMA_LOG_INFO("%s: registered %d Q3_KPT per-tensor level tables\n", __func__, n_registered); - } - } - } + const level_type_info level_types[] = { + { GGML_TYPE_Q3_PT, "q3_pt.levels", 8, sizeof(float) }, + { GGML_TYPE_Q3_KPT, "q3_kpt.levels", 8, sizeof(float) }, + { GGML_TYPE_Q4_DPT, "q4_dpt.levels", 16, sizeof(int8_t) }, + }; - // Q4_DPT: load per-tensor int8 levels from GGUF metadata and register them. - // Must happen AFTER load_all_data so tensor data pointers are valid. 
- { - static const size_t Q4DPT_N_LEVELS = 16; - int64_t lv_idx = gguf_find_key(ml.meta.get(), "q4_dpt.levels"); - if (lv_idx >= 0) { - const int8_t * lv_data = (const int8_t *) gguf_get_arr_data(ml.meta.get(), lv_idx); - const size_t lv_len = gguf_get_arr_n(ml.meta.get(), lv_idx); + for (const auto & lt : level_types) { + int64_t lv_idx = gguf_find_key(ml.meta.get(), lt.gguf_key); + if (lv_idx < 0) { continue; } - // Build tensor-name to slot index map (GGUF file order = quantizer order) - std::unordered_map name_to_slot; - { - const int64_t n_tensors = gguf_get_n_tensors(ml.meta.get()); - for (int64_t ti = 0; ti < n_tensors; ++ti) { - name_to_slot[gguf_get_tensor_name(ml.meta.get(), ti)] = (size_t) ti; - } - } + const uint8_t * lv_raw = (const uint8_t *)gguf_get_arr_data(ml.meta.get(), lv_idx); + const size_t lv_arr_n = gguf_get_arr_n(ml.meta.get(), lv_idx); - q4dpt_clear_tensor_levels(); - int n_registered = 0; + // Single pass: assign sequential slot indices and copy levels data + const size_t bytes_per_tensor = lt.n_levels * lt.elem_bytes; + auto & data = quant_levels_data[lt.type]; + size_t slot = 0; + bool global_set = false; for (auto & [ctx, buf_map] : ctx_buf_maps) { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { - if (t->type != GGML_TYPE_Q4_DPT || t->data == nullptr) { continue; } - auto it = name_to_slot.find(ggml_get_name(t)); - if (it == name_to_slot.end()) { continue; } - const size_t lv_offset = it->second * Q4DPT_N_LEVELS; - if (lv_offset + Q4DPT_N_LEVELS > lv_len) { continue; } - q4dpt_register_tensor_levels(t->data, ggml_nbytes(t), lv_data + lv_offset); - if (n_registered == 0) { - q4dpt_set_levels(lv_data + lv_offset); // global fallback + if (t->type != lt.type) { continue; } + + auto it = gguf_name_to_idx.find(ggml_get_name(t)); + if (it == gguf_name_to_idx.end()) { continue; } + + const size_t gguf_offset = it->second * lt.n_levels; + if (gguf_offset + lt.n_levels > lv_arr_n) { continue; } + 
+ // Record slot index and append levels data + quant_level_index[ggml_get_name(t)] = slot; + data.resize((slot + 1) * bytes_per_tensor); + memcpy(data.data() + slot * bytes_per_tensor, + lv_raw + gguf_offset * lt.elem_bytes, + bytes_per_tensor); + slot++; + + // Set the global fallback from the first tensor's levels + if (!global_set) { + if (lt.type == GGML_TYPE_Q4_DPT) { + q4dpt_set_levels((const int8_t *)(lv_raw + gguf_offset * lt.elem_bytes)); + } else if (lt.type == GGML_TYPE_Q3_PT) { + q3pt_set_levels((const float *)(lv_raw + gguf_offset * lt.elem_bytes)); + } else if (lt.type == GGML_TYPE_Q3_KPT) { + q3kpt_set_levels((const float *)(lv_raw + gguf_offset * lt.elem_bytes)); + } + global_set = true; } - n_registered++; } } - if (n_registered > 0) { - LLAMA_LOG_INFO("%s: registered %d Q4_DPT per-tensor level tables\n", __func__, n_registered); + + if (slot > 0) { + LLAMA_LOG_INFO("%s: loaded %zu %s per-tensor level tables\n", + __func__, slot, lt.gguf_key); } } } @@ -9013,6 +8956,8 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { llm->res->set_outputs(); + llm->attach_quant_levels(); + return llm->res->get_gf(); } diff --git a/src/llama-model.h b/src/llama-model.h index d7c3e7d1c1a..00899a705c1 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -511,6 +511,13 @@ struct llama_model { // for keeping track of associated LoRA adapters std::unordered_set loras; + // host-side levels data for dynamic quantization types (Q4_DPT, Q3_PT, Q3_KPT) + // indexed by ggml_type, raw bytes (int8 for Q4_DPT, float for Q3_PT/Q3_KPT) + std::unordered_map> quant_levels_data; + + // maps tensor name → slot index within quant_levels_data for that type + std::unordered_map quant_level_index; + int64_t t_load_us = 0; int64_t t_start_us = 0; From 72fb85470dd0e01c36f835ab8f79b5ee6564e6c4 Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Sun, 1 Mar 2026 18:57:16 +0100 Subject: [PATCH 7/9] Sanitize the registry --- ggml/src/ggml-cpu/ggml-cpu.c | 9 
++-- ggml/src/ggml-cuda/ggml-cuda.cu | 8 +-- ggml/src/ggml-cuda/mmq.cu | 11 ++-- ggml/src/ggml-cuda/mmvq.cu | 11 ++-- ggml/src/ggml-quants.c | 35 +++++++++++++ ggml/src/ggml-quants.h | 6 +++ src/llama-context.cpp | 6 +-- src/llama-graph.cpp | 57 --------------------- src/llama-graph.h | 38 -------------- src/llama-model.cpp | 89 +++++++++++++++++---------------- src/llama-model.h | 23 ++++++--- 11 files changed, 127 insertions(+), 166 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index e94dffd3281..e45a4f688a5 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1279,11 +1279,12 @@ void ggml_compute_forward_mul_mat( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // Set current per-tensor quantization levels from graph input (src[2] or src[3]) + // Set current per-tensor quantization levels from registry { - const int levels_src = (dst->op == GGML_OP_MUL_MAT_ID) ? 3 : 2; - if (dst->src[levels_src] && dst->src[levels_src]->data) { - ggml_quant_set_current_levels(src0->type, dst->src[levels_src]->data); + size_t levels_size; + const void * levels = ggml_quant_get_tensor_aux_data(src0, &levels_size); + if (levels) { + ggml_quant_set_current_levels(src0->type, levels); } } diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 4118087103d..034553bf806 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3,6 +3,7 @@ #include "ggml-backend-impl.h" #include "ggml-cuda/common.cuh" +#include "ggml-quants.h" #include "ggml-cuda/acc.cuh" #include "ggml-cuda/add-id.cuh" #include "ggml-cuda/arange.cuh" @@ -1343,9 +1344,10 @@ static void ggml_cuda_op_mul_mat_cublas( if (src0->type != GGML_TYPE_F32) { // Set Q4_DPT levels in convert.cu's TU before dequantize if (src0->type == GGML_TYPE_Q4_DPT) { - const int levels_src = (dst->op == GGML_OP_MUL_MAT_ID) ? 
3 : 2; - GGML_ASSERT(dst->src[levels_src] && dst->src[levels_src]->data); - ggml_cuda_set_q4dpt_levels((const int8_t *)dst->src[levels_src]->data, stream); + size_t levels_size; + const void * levels = ggml_quant_get_tensor_aux_data(src0, &levels_size); + GGML_ASSERT(levels && "Q4_DPT MUL_MAT requires levels"); + ggml_cuda_set_q4dpt_levels((const int8_t *)levels, stream); } const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); GGML_ASSERT(to_fp32_cuda != nullptr); diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index 72f125661aa..fbc75b6f6f0 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -3,6 +3,7 @@ #include "quantize.cuh" #include "mmid.cuh" #include "convert.cuh" +#include "ggml-quants.h" static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) { switch (args.type_x) { @@ -83,14 +84,14 @@ void ggml_cuda_mul_mat_q( cudaStream_t stream = ctx.stream(); const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; - // Set Q4_DPT lookup table from graph input levels - // MUL_MAT: levels in src[2], MUL_MAT_ID: levels in src[3] (src[2] is expert IDs) + // Set Q4_DPT lookup table from per-tensor registry if (src0->type == GGML_TYPE_Q4_DPT) { - const int levels_src = (dst->op == GGML_OP_MUL_MAT_ID) ? 
3 : 2; - GGML_ASSERT(dst->src[levels_src] && dst->src[levels_src]->data && "Q4_DPT MUL_MAT requires levels"); + size_t levels_size; + const void * levels = ggml_quant_get_tensor_aux_data(src0, &levels_size); + GGML_ASSERT(levels && "Q4_DPT MUL_MAT requires levels (register with ggml_quant_set_tensor_aux_data)"); int8_t * d_q4dpt_levels; CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q4dpt_levels, q4dpt_levels_cuda)); - CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, dst->src[levels_src]->data, 16, cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, levels, levels_size, cudaMemcpyHostToDevice, stream)); } const size_t ts_src0 = ggml_type_size(src0->type); diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 74707055756..788cf6749e6 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -3,6 +3,7 @@ #include "unary.cuh" #include "vecdotq.cuh" #include "convert.cuh" +#include "ggml-quants.h" #include @@ -655,14 +656,14 @@ void ggml_cuda_mul_mat_vec_q( cudaStream_t stream = ctx.stream(); - // Set Q4_DPT lookup table from graph input levels - // MUL_MAT: levels in src[2], MUL_MAT_ID: levels in src[3] (src[2] is expert IDs) + // Set Q4_DPT lookup table from per-tensor registry if (src0->type == GGML_TYPE_Q4_DPT) { - const int levels_src = (dst->op == GGML_OP_MUL_MAT_ID) ? 
3 : 2; - GGML_ASSERT(dst->src[levels_src] && dst->src[levels_src]->data && "Q4_DPT MUL_MAT requires levels"); + size_t levels_size; + const void * levels = ggml_quant_get_tensor_aux_data(src0, &levels_size); + GGML_ASSERT(levels && "Q4_DPT MUL_MAT requires levels (register with ggml_quant_set_tensor_aux_data)"); int8_t * d_q4dpt_levels; CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q4dpt_levels, q4dpt_levels_cuda)); - CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, dst->src[levels_src]->data, 16, cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, levels, levels_size, cudaMemcpyHostToDevice, stream)); } const size_t ts_src0 = ggml_type_size(src0->type); diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 92041557ce0..f3771ebb93c 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -6559,3 +6559,38 @@ const void * ggml_quant_get_current_levels(enum ggml_type type) { return ggml_quant_current_levels[type]; } +// Per-tensor auxiliary data registry (indexed by tensor struct pointer) +// Allows backends to look up aux data for a specific weight tensor +// Key is tensor struct pointer (stable), value is aux data pointer +#define GGML_QUANT_AUX_HASH_SIZE 1024 + +static struct { + const void * tensor_ptr; // ggml_tensor struct pointer (key) + const void * aux_data; // aux data pointer (levels, kvalues, etc.) 
+ size_t aux_size; // aux data size in bytes +} ggml_quant_aux_registry[GGML_QUANT_AUX_HASH_SIZE]; + +static size_t ggml_quant_aux_hash(const void * ptr) { + return (size_t)((uintptr_t)ptr >> 4) % GGML_QUANT_AUX_HASH_SIZE; +} + +void ggml_quant_set_tensor_aux_data(const void * tensor_ptr, const void * aux_data, size_t aux_size) { + size_t h = ggml_quant_aux_hash(tensor_ptr); + ggml_quant_aux_registry[h].tensor_ptr = tensor_ptr; + ggml_quant_aux_registry[h].aux_data = aux_data; + ggml_quant_aux_registry[h].aux_size = aux_size; +} + +const void * ggml_quant_get_tensor_aux_data(const void * tensor_ptr, size_t * out_size) { + size_t h = ggml_quant_aux_hash(tensor_ptr); + if (ggml_quant_aux_registry[h].tensor_ptr == tensor_ptr) { + if (out_size) *out_size = ggml_quant_aux_registry[h].aux_size; + return ggml_quant_aux_registry[h].aux_data; + } + return NULL; +} + +void ggml_quant_clear_aux_registry(void) { + memset(ggml_quant_aux_registry, 0, sizeof(ggml_quant_aux_registry)); +} + diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index a6fef9ef43c..35fb38c6496 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -137,6 +137,12 @@ GGML_API void q4dpt_free_levels(void); GGML_API void ggml_quant_set_current_levels(enum ggml_type type, const void * data); GGML_API const void * ggml_quant_get_current_levels(enum ggml_type type); +// Per-tensor auxiliary data registry (indexed by tensor pointer) +// Allows backends to look up aux data for a specific weight tensor +GGML_API void ggml_quant_set_tensor_aux_data(const void * tensor_data, const void * aux_data, size_t aux_size); +GGML_API const void * ggml_quant_get_tensor_aux_data(const void * tensor_data, size_t * out_size); +GGML_API void ggml_quant_clear_aux_registry(void); + // Train 16 Lloyd-Max int8 levels from tensor data. // Bins normalized values (x/amax) in [-1,1], runs weighted k-means, rounds to sorted int8[16]. // Also sets the global levels via q4dpt_set_levels(). 
diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 819c80d148b..98d055d34ef 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2044,10 +2044,8 @@ llm_graph_params llama_context::graph_params( /*.cvec =*/ cvec.get(), /*.loras =*/ loras.get(), /*.mctx =*/ mctx, - /*.cross =*/ &cross, - /*.quant_levels_data =*/ &model.quant_levels_data, - /*.quant_level_index =*/ &model.quant_level_index, - /*.samplers =*/ sampling.samplers, + /*.cross =*/ &cross, + /*.samplers =*/ sampling.samplers, /*.n_outputs =*/ n_outputs, /*.cb =*/ graph_get_cb(), /*.res =*/ res, diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index f696e50b93b..23a86ea2905 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -877,15 +877,12 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : loras (params.loras), mctx (params.mctx), cross (params.cross), - quant_levels_data(params.quant_levels_data), - quant_level_index(params.quant_level_index), samplers (params.samplers), cb_func (params.cb), res (params.res), ctx0 (res->get_ctx()), gf (res->get_gf()) { res->set_params(params); - build_inp_quant_levels(); } void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const { @@ -1687,60 +1684,6 @@ ggml_tensor * llm_graph_context::build_inp_cross_embd() const { return cur; } -void llm_graph_context::build_inp_quant_levels() { - if (!quant_levels_data || quant_levels_data->empty()) { return; } - - auto inp = std::make_unique(*quant_levels_data); - - for (const auto & [type, data] : *quant_levels_data) { - if (data.empty()) { continue; } - ggml_type tensor_type; - int64_t n_elem; - if (type == GGML_TYPE_Q4_DPT) { - tensor_type = GGML_TYPE_I8; - n_elem = (int64_t)data.size(); - } else { // Q3_PT, Q3_KPT - tensor_type = GGML_TYPE_F32; - n_elem = (int64_t)(data.size() / sizeof(float)); - } - inp->levels[type] = ggml_new_tensor_1d(ctx0, tensor_type, n_elem); - ggml_set_input(inp->levels[type]); - } - - quant_levels_inp = inp.get(); - 
res->add_input(std::move(inp)); -} - -void llm_graph_context::attach_quant_levels() { - if (!quant_levels_inp || !quant_level_index) { return; } - - const int n_nodes = ggml_graph_n_nodes(gf); - for (int i = 0; i < n_nodes; i++) { - ggml_tensor * node = ggml_graph_node(gf, i); - if (node->op != GGML_OP_MUL_MAT && node->op != GGML_OP_MUL_MAT_ID) { continue; } - - ggml_tensor * w = node->src[0]; - ggml_tensor * levels_all = quant_levels_inp->levels[w->type]; - if (!levels_all) { continue; } - - auto it = quant_level_index->find(ggml_get_name(w)); - if (it == quant_level_index->end()) { continue; } - - const int64_t n_levels = (w->type == GGML_TYPE_Q4_DPT) ? 16 : 8; - const size_t elem_size = ggml_type_size(levels_all->type); - ggml_tensor * lv = ggml_view_1d(ctx0, levels_all, - n_levels, (int64_t)(it->second * (size_t)n_levels * elem_size)); - - // MUL_MAT: src[0]=weights, src[1]=input, src[2] is free - // MUL_MAT_ID: src[0]=weights, src[1]=input, src[2]=expert_ids, src[3] is free - if (node->op == GGML_OP_MUL_MAT) { - node->src[2] = lv; - } else { - node->src[3] = lv; - } - } -} - ggml_tensor * llm_graph_context::build_inp_pos_bucket_enc() const { auto inp = std::make_unique(hparams); diff --git a/src/llama-graph.h b/src/llama-graph.h index e849c541fa8..0bf4faf5783 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -501,33 +501,6 @@ class llm_graph_input_sampling : public llm_graph_input_i { std::map samplers; }; -class llm_graph_input_quant_levels : public llm_graph_input_i { -public: - llm_graph_input_quant_levels( - const std::unordered_map> & data) - : levels_data(data) {} - virtual ~llm_graph_input_quant_levels() = default; - - void set_input(const llama_ubatch * ubatch) override { - GGML_UNUSED(ubatch); - for (const auto & [type, data] : levels_data) { - if (levels[type] && !data.empty()) { - ggml_backend_tensor_set(levels[type], data.data(), 0, data.size()); - } - } - } - - bool can_reuse(const llm_graph_params & params) override { - 
GGML_UNUSED(params); - return true; // levels don't change between batches - } - - // per-type input tensors, created during graph build - ggml_tensor * levels[GGML_TYPE_COUNT] = {}; - - // reference to model's host-side level data for populating - const std::unordered_map> & levels_data; -}; // // llm_graph_result @@ -562,10 +535,6 @@ struct llm_graph_params { const llama_memory_context_i * mctx; const llama_cross * cross; - // per-tensor quantization levels (Q4_DPT, Q3_PT, Q3_KPT) - const std::unordered_map> * quant_levels_data = nullptr; - const std::unordered_map * quant_level_index = nullptr; - std::map samplers; static bool samplers_equal( @@ -774,11 +743,6 @@ struct llm_graph_context { const llama_memory_context_i * mctx; const llama_cross * cross; - // per-tensor quantization levels - const std::unordered_map> * quant_levels_data = nullptr; - const std::unordered_map * quant_level_index = nullptr; - llm_graph_input_quant_levels * quant_levels_inp = nullptr; - std::map samplers; const llm_graph_cb & cb_func; @@ -889,8 +853,6 @@ struct llm_graph_context { ggml_tensor * build_inp_cls() const; ggml_tensor * build_inp_cross_embd() const; - void build_inp_quant_levels(); - void attach_quant_levels(); ggml_tensor * build_inp_pos_bucket_enc() const; ggml_tensor * build_inp_pos_bucket_dec() const; ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f03278485b4..777ea85fc00 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -22,6 +22,7 @@ extern "C" { void q3pt_set_levels(const float * levels); void q3kpt_set_levels(const float * levels); void q4dpt_set_levels(const int8_t * levels); + void ggml_quant_set_tensor_aux_data(const void * tensor_data, const void * aux_data, size_t aux_size); } #include @@ -7859,15 +7860,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } - // Load per-tensor quantization levels from GGUF metadata into model fields. 
- // These will be used as graph inputs (see llm_graph_input_quant_levels). + // Load per-tensor quantization auxiliary data (levels/kvalues) from GGUF metadata. + // Indexed by weight tensor pointer for direct lookup during inference. { - // Build tensor-name to GGUF-order slot index map (shared across all types) - std::unordered_map gguf_name_to_idx; - { - const int64_t n_tensors = gguf_get_n_tensors(ml.meta.get()); - for (int64_t ti = 0; ti < n_tensors; ++ti) { - gguf_name_to_idx[gguf_get_tensor_name(ml.meta.get(), ti)] = (size_t)ti; + // Build tensor name to tensor pointer map + std::unordered_map name_to_tensor; + for (auto & [ctx, buf_map] : ctx_buf_maps) { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + name_to_tensor[ggml_get_name(t)] = t; } } @@ -7891,47 +7891,50 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const uint8_t * lv_raw = (const uint8_t *)gguf_get_arr_data(ml.meta.get(), lv_idx); const size_t lv_arr_n = gguf_get_arr_n(ml.meta.get(), lv_idx); - // Single pass: assign sequential slot indices and copy levels data - const size_t bytes_per_tensor = lt.n_levels * lt.elem_bytes; - auto & data = quant_levels_data[lt.type]; - size_t slot = 0; + size_t tensor_count = 0; bool global_set = false; - for (auto & [ctx, buf_map] : ctx_buf_maps) { - for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { - if (t->type != lt.type) { continue; } - - auto it = gguf_name_to_idx.find(ggml_get_name(t)); - if (it == gguf_name_to_idx.end()) { continue; } - - const size_t gguf_offset = it->second * lt.n_levels; - if (gguf_offset + lt.n_levels > lv_arr_n) { continue; } - - // Record slot index and append levels data - quant_level_index[ggml_get_name(t)] = slot; - data.resize((slot + 1) * bytes_per_tensor); - memcpy(data.data() + slot * bytes_per_tensor, - lv_raw + gguf_offset * lt.elem_bytes, - bytes_per_tensor); - slot++; - - // Set the global fallback from the 
first tensor's levels - if (!global_set) { - if (lt.type == GGML_TYPE_Q4_DPT) { - q4dpt_set_levels((const int8_t *)(lv_raw + gguf_offset * lt.elem_bytes)); - } else if (lt.type == GGML_TYPE_Q3_PT) { - q3pt_set_levels((const float *)(lv_raw + gguf_offset * lt.elem_bytes)); - } else if (lt.type == GGML_TYPE_Q3_KPT) { - q3kpt_set_levels((const float *)(lv_raw + gguf_offset * lt.elem_bytes)); - } - global_set = true; + // Iterate over GGUF slots to find matching tensors + for (size_t gguf_slot = 0; gguf_slot < lv_arr_n / lt.n_levels; ++gguf_slot) { + std::string tensor_name = gguf_get_tensor_name(ml.meta.get(), gguf_slot); + auto it = name_to_tensor.find(tensor_name); + if (it == name_to_tensor.end()) { continue; } + + ggml_tensor* t = it->second; + if (t->type != lt.type) { continue; } + + const size_t gguf_offset = gguf_slot * lt.n_levels; + + // Store directly indexed by tensor pointer + auto & aux = tensor_aux_data[t]; + aux.type = lt.type; + aux.host_data.assign( + lv_raw + gguf_offset * lt.elem_bytes, + lv_raw + (gguf_offset + lt.n_levels) * lt.elem_bytes + ); + aux.aux_tensor = nullptr; // Will be created during graph build + + // Register in global registry for backend access + ggml_quant_set_tensor_aux_data(t, aux.host_data.data(), aux.host_data.size()); + + tensor_count++; + + // Set the global fallback from the first tensor's levels + if (!global_set) { + if (lt.type == GGML_TYPE_Q4_DPT) { + q4dpt_set_levels((const int8_t *)(lv_raw + gguf_offset * lt.elem_bytes)); + } else if (lt.type == GGML_TYPE_Q3_PT) { + q3pt_set_levels((const float *)(lv_raw + gguf_offset * lt.elem_bytes)); + } else if (lt.type == GGML_TYPE_Q3_KPT) { + q3kpt_set_levels((const float *)(lv_raw + gguf_offset * lt.elem_bytes)); } + global_set = true; } } - if (slot > 0) { + if (tensor_count > 0) { LLAMA_LOG_INFO("%s: loaded %zu %s per-tensor level tables\n", - __func__, slot, lt.gguf_key); + __func__, tensor_count, lt.gguf_key); } } } @@ -8956,8 +8959,6 @@ ggml_cgraph * 
llama_model::build_graph(const llm_graph_params & params) const { llm->res->set_outputs(); - llm->attach_quant_levels(); - return llm->res->get_gf(); } diff --git a/src/llama-model.h b/src/llama-model.h index 00899a705c1..92f24bb85bf 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -511,12 +511,23 @@ struct llama_model { // for keeping track of associated LoRA adapters std::unordered_set loras; - // host-side levels data for dynamic quantization types (Q4_DPT, Q3_PT, Q3_KPT) - // indexed by ggml_type, raw bytes (int8 for Q4_DPT, float for Q3_PT/Q3_KPT) - std::unordered_map> quant_levels_data; - - // maps tensor name → slot index within quant_levels_data for that type - std::unordered_map quant_level_index; + // host-side auxiliary data for dynamic quantization types (Q4_DPT, Q3_PT, Q3_KPT) + // indexed by weight tensor pointer, allows separate GPU placement of aux data + struct tensor_auxiliary { + ggml_type type; // Quantization type this aux data is for + std::vector host_data; // Host copy of aux data (levels or kvalues) + struct ggml_tensor * aux_tensor; // Separate ggml tensor for backend placement + }; + + // Hash function for ggml_tensor pointers (reuse existing ggml_hash pattern) + struct ggml_tensor_ptr_hash { + size_t operator()(const ggml_tensor* t) const noexcept { + return (size_t)(uintptr_t)t >> 4; // Same as ggml_hash() + } + }; + + // Per-tensor auxiliary data lookup - indexed by WEIGHT tensor pointer + std::unordered_map tensor_aux_data; int64_t t_load_us = 0; int64_t t_start_us = 0; From 09de1292cd31552069022b49c29e6e736dff3d1b Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Wed, 4 Mar 2026 22:01:32 +0100 Subject: [PATCH 8/9] PROPER handling of level passing. 
So good that ggml_tensor had some free padding at the end :) --- ggml/include/ggml-cpu.h | 2 +- ggml/include/ggml.h | 12 +- ggml/src/ggml-blas/ggml-blas.cpp | 6 +- ggml/src/ggml-common.h | 17 + ggml/src/ggml-cpu/arch/arm/quants.c | 10 +- ggml/src/ggml-cpu/arch/loongarch/quants.c | 18 +- ggml/src/ggml-cpu/arch/powerpc/quants.c | 24 +- ggml/src/ggml-cpu/arch/riscv/quants.c | 20 +- ggml/src/ggml-cpu/arch/s390/quants.c | 24 +- ggml/src/ggml-cpu/arch/wasm/quants.c | 14 +- ggml/src/ggml-cpu/arch/x86/quants.c | 194 +++- ggml/src/ggml-cpu/ggml-cpu.c | 31 +- ggml/src/ggml-cpu/llamafile/sgemm.cpp | 4 +- ggml/src/ggml-cpu/llamafile/sgemm.h | 2 +- ggml/src/ggml-cpu/ops.cpp | 26 +- ggml/src/ggml-cpu/quants.c | 178 +++- ggml/src/ggml-cpu/quants.h | 118 +-- ggml/src/ggml-cpu/vec.cpp | 9 +- ggml/src/ggml-cpu/vec.h | 8 +- ggml/src/ggml-cuda/common.cuh | 10 + ggml/src/ggml-cuda/convert.cu | 34 + ggml/src/ggml-cuda/ggml-cuda.cu | 6 +- ggml/src/ggml-cuda/mmq.cu | 21 +- ggml/src/ggml-cuda/mmq.cuh | 11 + ggml/src/ggml-cuda/mmvq.cu | 18 +- ggml/src/ggml-cuda/vecdotq.cuh | 37 + ggml/src/ggml-quants.c | 1021 +++++++++++++++++++-- ggml/src/ggml-quants.h | 103 ++- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 4 +- ggml/src/ggml.c | 39 +- include/llama.h | 1 + src/llama-model-loader.cpp | 2 + src/llama-model.cpp | 29 +- src/llama-quant.cpp | 118 ++- tests/CMakeLists.txt | 2 + tests/test-backend-ops.cpp | 2 +- tests/test-quant-q2dpt.cpp | 170 ++++ tests/test-quant-q2kpt.cpp | 358 ++++++++ tests/test-quant-q3kpt.cpp | 5 +- tests/test-quant-q4dpt-experiment.cpp | 4 +- tests/test-quant-q4dpt.cpp | 4 +- tests/test-quantize-fns.cpp | 6 +- tests/test-quantize-perf.cpp | 2 +- tests/test-quantize-stats.cpp | 2 +- tools/export-lora/export-lora.cpp | 2 +- tools/quantize/quantize.cpp | 1 + 46 files changed, 2298 insertions(+), 431 deletions(-) create mode 100644 tests/test-quant-q2dpt.cpp create mode 100644 tests/test-quant-q2kpt.cpp diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index 
e3e067c916f..8783049ed84 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -111,7 +111,7 @@ extern "C" { // Internal types and functions exposed for tests and benchmarks typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, - const void * GGML_RESTRICT y, size_t by, int nrc); + const void * GGML_RESTRICT y, size_t by, int nrc, const void * levels); struct ggml_type_traits_cpu { ggml_from_float_t from_float; diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index b022730f8db..5799761e0f4 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -430,7 +430,9 @@ extern "C" { GGML_TYPE_Q3_PT = 40, // 3.875 bpw per-tensor Lloyd-Max, 16-elem affine sub-blocks GGML_TYPE_Q3_KPT = 41, // Q3_K with learned per-tensor levels (3.4375 bpw) GGML_TYPE_Q4_DPT = 42, // IQ4_NL with learned per-tensor int8 levels (4.125 bpw) - GGML_TYPE_COUNT = 43, + GGML_TYPE_Q2_DPT = 43, // 2-bit with learned per-tensor int8 levels (2.5 bpw) + GGML_TYPE_Q2_KPT = 44, // Q2_K with learned per-tensor float levels (2.625 bpw) + GGML_TYPE_COUNT = 45, }; // precision @@ -469,6 +471,7 @@ extern "C" { GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors GGML_FTYPE_MOSTLY_Q3_KPT = 27, // except 1d tensors GGML_FTYPE_MOSTLY_Q4_DPT = 28, // except 1d tensors + GGML_FTYPE_MOSTLY_Q2_KPT = 29, // except 1d tensors }; // available tensor operations: @@ -687,9 +690,8 @@ extern "C" { char name[GGML_MAX_NAME]; - void * extra; // extra things e.g. for ggml-cuda.cu - - char padding[8]; + void * extra; // extra things e.g. 
for ggml-cuda.cu + void * quant_levels; // per-tensor quantization levels (replaces char padding[8]; same size on 64-bit) }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); @@ -2707,7 +2709,7 @@ extern "C" { # define GGML_RESTRICT restrict # endif #endif - typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); + typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); struct ggml_type_traits { diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index 2e9ddf2240d..2320ef84358 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -80,7 +80,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg #ifdef GGML_USE_OPENMP #pragma omp parallel for num_threads(n_threads) for (int64_t i01 = 0; i01 < ne01; i01++) { - to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, src0->quant_levels); } #else for (int i = 1; i < n_threads; i++) { @@ -89,7 +89,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg if (start < end) { ctx->tasks.push_back(std::async(std::launch::async, [=]() { for (int64_t i01 = start; i01 < end; i01++) { - to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, src0->quant_levels); } })); } @@ -99,7 +99,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg const int64_t start = 0; const int64_t end = ne01/n_threads; for (int64_t i01 = start; i01 < end; i01++) { - to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, src0->quant_levels); } } #endif diff --git 
a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 11f7a6bb9cf..991c005c3b2 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -452,6 +452,23 @@ static_assert(sizeof(block_q3_pt) == 124, "wrong q3_pt block size"); typedef block_iq4_nl block_q4_dpt; #define Q4DPT_N_LEVELS 16 +// Q2_DPT: 2-bit per-tensor Lloyd-Max scalar quantization (2.5 bpw) +// Block format: 2 bytes (FP16 scale) + 8 bytes (2-bit indices for 32 elements) = 10 bytes per block +// 4 learned int8 levels per tensor, optimized via Lloyd-Max k-means +typedef struct { + ggml_half d; // 2 bytes: FP16 scale (delta) + uint8_t qs[8]; // 8 bytes: 2-bit indices (4 values per byte, 32 elements total) +} block_q2_dpt; +static_assert(sizeof(block_q2_dpt) == sizeof(ggml_half) + 8, "wrong q2_dpt block size/padding"); + +#define QK2_DPT 32 +#define Q2DPT_N_LEVELS 4 + +// Q2_KPT: Q2_K with learned per-tensor float levels (2.625 bpw) +// Reuses block_q2_K structure but maps 2-bit indices through learned level table +typedef block_q2_K block_q2_kpt; +#define Q2KPT_N_LEVELS 4 + #endif // GGML_COMMON_DECL #endif // GGML_COMMON_DECL diff --git a/ggml/src/ggml-cpu/arch/arm/quants.c b/ggml/src/ggml-cpu/arch/arm/quants.c index dafd66fe68c..cab8a708242 100644 --- a/ggml/src/ggml-cpu/arch/arm/quants.c +++ b/ggml/src/ggml-cpu/arch/arm/quants.c @@ -137,7 +137,7 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in //===================================== Dot products ================================= -void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -430,7 +430,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t 
bs, const voi *s = sumf; } -void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_1; const int nb = n / qk; @@ -650,7 +650,7 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo *s = sumf; } -void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -762,7 +762,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi *s = sumf; } -void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_1; const int nb = n / qk; @@ -880,7 +880,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi *s = sumf; } -void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; diff --git 
a/ggml/src/ggml-cpu/arch/loongarch/quants.c b/ggml/src/ggml-cpu/arch/loongarch/quants.c index e22447c70a9..37db3e18925 100644 --- a/ggml/src/ggml-cpu/arch/loongarch/quants.c +++ b/ggml/src/ggml-cpu/arch/loongarch/quants.c @@ -644,7 +644,7 @@ static inline __m128i get_scale_shuffle(int i) { } #endif -void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -772,7 +772,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi *s = sumf; } -void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_1; const int nb = n / qk; @@ -827,11 +827,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(y); UNUSED(ib); UNUSED(sumf); - ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -880,11 +880,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const 
voi UNUSED(sumf); UNUSED(x); UNUSED(y); - ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_1; const int nb = n / qk; @@ -936,11 +936,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(sumf); UNUSED(x); UNUSED(y); - ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -983,7 +983,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(sumf); UNUSED(x); UNUSED(y); - ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } diff --git a/ggml/src/ggml-cpu/arch/powerpc/quants.c b/ggml/src/ggml-cpu/arch/powerpc/quants.c index 3ad7582ddb3..063fce318f2 100644 --- a/ggml/src/ggml-cpu/arch/powerpc/quants.c +++ b/ggml/src/ggml-cpu/arch/powerpc/quants.c @@ -141,7 +141,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i //===================================== Dot products ================================= -void 
ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -207,11 +207,11 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(y); UNUSED(ib); UNUSED(sumf); - ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_1; const int nb = n / qk; @@ -274,7 +274,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(y); UNUSED(ib); UNUSED(sumf); - ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } @@ -340,11 +340,11 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo UNUSED(y); UNUSED(ib); UNUSED(sumf); - ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * 
levels) { const int qk = QK8_0; const int nb = n / qk; @@ -412,11 +412,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(sumf); UNUSED(x); UNUSED(y); - ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_1; const int nb = n / qk; @@ -488,11 +488,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(sumf); UNUSED(x); UNUSED(y); - ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -557,7 +557,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(y); UNUSED(ib); UNUSED(sumf); - ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } @@ -2194,7 +2194,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v UNUSED(nb); UNUSED(ib); UNUSED(sumf); - ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels); 
#endif } diff --git a/ggml/src/ggml-cpu/arch/riscv/quants.c b/ggml/src/ggml-cpu/arch/riscv/quants.c index 0718931ad4c..27f7661e767 100644 --- a/ggml/src/ggml-cpu/arch/riscv/quants.c +++ b/ggml/src/ggml-cpu/arch/riscv/quants.c @@ -115,7 +115,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i //===================================== Dot products ================================= -void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { #if defined(__riscv_v) const int qk = QK8_0; const int nb = n / qk; @@ -166,11 +166,11 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi *s = sumf; #else - ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { #if defined(__riscv_v) const int qk = QK8_1; const int nb = n / qk; @@ -217,11 +217,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi *s = sumf; #else - ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_0_q8_0(int n, 
float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { #if defined(__riscv_v) const int qk = QK8_0; const int nb = n / qk; @@ -271,11 +271,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi *s = sumf; #else - ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { #if defined(__riscv_v) const int qk = QK8_1; const int nb = n / qk; @@ -324,11 +324,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi *s = sumf; #else - ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -372,7 +372,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(ib); UNUSED(sumf); - ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index 636eb871390..da5a4b6f089 100644 --- 
a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -146,7 +146,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i //===================================== Dot products ================================= -void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -201,11 +201,11 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(y); UNUSED(ib); UNUSED(sumf); - ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_1; const int nb = n / qk; @@ -258,7 +258,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(y); UNUSED(ib); UNUSED(sumf); - ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } @@ -353,11 +353,11 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo UNUSED(y); UNUSED(ib); UNUSED(sumf); - ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t 
bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -495,11 +495,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(y); UNUSED(ib); UNUSED(sumf); - ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_1; const int nb = n / qk; @@ -648,11 +648,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(y); UNUSED(ib); UNUSED(sumf); - ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -698,7 +698,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(y); UNUSED(ib); UNUSED(sumf); - ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } @@ 
-1390,7 +1390,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v UNUSED(nb); UNUSED(ib); UNUSED(sumf); - ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } diff --git a/ggml/src/ggml-cpu/arch/wasm/quants.c b/ggml/src/ggml-cpu/arch/wasm/quants.c index 52d990c11ee..77fb7228b0a 100644 --- a/ggml/src/ggml-cpu/arch/wasm/quants.c +++ b/ggml/src/ggml-cpu/arch/wasm/quants.c @@ -229,7 +229,7 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in //===================================== Dot products ================================= -void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -355,7 +355,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi *s = sumf; } -void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -442,11 +442,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(sumf); UNUSED(x); UNUSED(y); - ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t 
bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_1; const int nb = n / qk; @@ -537,11 +537,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(sumf); UNUSED(x); UNUSED(y); - ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -605,7 +605,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(y); UNUSED(ib); UNUSED(sumf); - ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c index b726ef63584..7eff7712678 100644 --- a/ggml/src/ggml-cpu/arch/x86/quants.c +++ b/ggml/src/ggml-cpu/arch/x86/quants.c @@ -540,7 +540,7 @@ static inline __m128i get_scale_shuffle(int i) { } #endif -void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -698,7 +698,7 @@ void ggml_vec_dot_q4_0_q8_0(int 
n, float * GGML_RESTRICT s, size_t bs, const voi *s = sumf; } -void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_1; const int nb = n / qk; @@ -753,11 +753,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(x); UNUSED(y); UNUSED(ib); - ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -843,7 +844,7 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo *s = sumf; } -void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -919,11 +920,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(ib); UNUSED(x); UNUSED(y); - ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q5_1_q8_1(int 
n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_1; const int nb = n / qk; @@ -1005,11 +1006,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(ib); UNUSED(x); UNUSED(y); - ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -1077,7 +1078,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi *s = sumf; } -void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -1205,11 +1207,12 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, 
const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -1271,11 +1274,12 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -1463,11 +1467,12 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -1735,11 +1740,12 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, 
nrc, levels); #endif } -void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -1913,11 +1919,12 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(kmask2); UNUSED(kmask3); UNUSED(utmp); - ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -2123,11 +2130,12 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(kmask2); UNUSED(kmask3); UNUSED(utmp); - ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -2328,7 +2336,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, 
size_t bs, const voi UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } @@ -2369,7 +2377,8 @@ static const int8_t keven_signs_q2xs[1024] = { }; #endif -void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -2483,11 +2492,12 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -2780,11 +2790,12 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * 
GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -2965,11 +2976,12 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -3089,11 +3101,12 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -3299,15 +3312,17 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const 
void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); +void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); + ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); } -void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -3422,11 +3437,12 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -3629,11 +3645,12 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo UNUSED(y); UNUSED(nb); UNUSED(scale); - ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, 
size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -3717,7 +3734,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v *s = sumf; } -void ggml_vec_dot_q4_dpt_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_dpt_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -3731,7 +3748,7 @@ void ggml_vec_dot_q4_dpt_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v const int nb = n / QK4_NL; - const int8_t * values = (const int8_t *)ggml_quant_get_current_levels(GGML_TYPE_Q4_DPT); + const int8_t * values = (const int8_t *)levels; GGML_ASSERT(values != NULL && "Q4_DPT levels not set for tensor"); int ib = 0; @@ -3804,7 +3821,96 @@ void ggml_vec_dot_q4_dpt_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v *s = sumf; } -void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q2_dpt_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK2_DPT == 0); + static_assert(QK2_DPT == QK8_0, "QK2_DPT and QK8_0 must be the same"); + + const block_q2_dpt * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + const int nb = n / QK2_DPT; + + const int8_t * values = (const 
int8_t *)levels; + GGML_ASSERT(values != NULL && "Q2_DPT levels not set for tensor"); + + int ib = 0; + float sumf = 0; + +#if defined __AVX2__ + + const __m128i values128 = _mm_loadu_si128((const __m128i*)values); + const __m128i m3 = _mm_set1_epi8(0x03); + + __m256 accum = _mm256_setzero_ps(); + for (; ib + 1 < nb; ib += 2) { + const __m128i q2bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs); + const __m128i q2bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs); + const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs); + const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs); + + // Extract 2-bit indices and lookup values - process 8 elements at a time + // For each byte of q2bits, we have 4 x 2-bit indices + const __m128i q2_01_l = _mm_shuffle_epi8(values128, _mm_and_si128(q2bits_1, m3)); + const __m128i q2_01_h = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q2bits_1, 2), m3)); + const __m128i q2_02_l = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q2bits_1, 4), m3)); + const __m128i q2_02_h = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q2bits_1, 6), m3)); + const __m128i q2_11_l = _mm_shuffle_epi8(values128, _mm_and_si128(q2bits_2, m3)); + const __m128i q2_11_h = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q2bits_2, 2), m3)); + const __m128i q2_12_l = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q2bits_2, 4), m3)); + const __m128i q2_12_h = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q2bits_2, 6), m3)); + + // Combine pairs into __m256i + const __m256i q4b_1a = MM256_SET_M128I(q2_01_h, q2_01_l); + const __m256i q4b_1b = MM256_SET_M128I(q2_02_h, q2_02_l); + const __m256i q4b_2a = MM256_SET_M128I(q2_11_h, q2_11_l); + const __m256i q4b_2b = MM256_SET_M128I(q2_12_h, q2_12_l); + + // Split q8 into pairs and compute dot products + const __m256i q8b_1a = _mm256_and_si256(q8b_1, _mm256_set1_epi16(0x00ff)); + const __m256i q8b_1b = 
_mm256_srli_epi16(q8b_1, 8); + const __m256i q8b_2a = _mm256_and_si256(q8b_2, _mm256_set1_epi16(0x00ff)); + const __m256i q8b_2b = _mm256_srli_epi16(q8b_2, 8); + + const __m256i p16_1a = mul_add_epi8(q4b_1a, q8b_1a); + const __m256i p16_1b = mul_add_epi8(q4b_1b, q8b_1b); + const __m256i p16_2a = mul_add_epi8(q4b_2a, q8b_2a); + const __m256i p16_2b = mul_add_epi8(q4b_2b, q8b_2b); + + const __m256i mone = _mm256_set1_epi16(1); + const __m256i p_1 = _mm256_add_epi32(_mm256_madd_epi16(p16_1a, mone), _mm256_madd_epi16(p16_1b, mone)); + const __m256i p_2 = _mm256_add_epi32(_mm256_madd_epi16(p16_2a, mone), _mm256_madd_epi16(p16_2b, mone)); + + accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)), + _mm256_cvtepi32_ps(p_1), accum); + accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)), + _mm256_cvtepi32_ps(p_2), accum); + } + + sumf = hsum_float_8(accum); + +#endif + for (; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); + int sumi = 0; + for (int j = 0; j < QK2_DPT/4; ++j) { + uint8_t q = x[ib].qs[j]; + sumi += y[ib].qs[j*4 + 0] * values[(q >> 0) & 3]; + sumi += y[ib].qs[j*4 + 1] * values[(q >> 2) & 3]; + sumi += y[ib].qs[j*4 + 2] * values[(q >> 4) & 3]; + sumi += y[ib].qs[j*4 + 3] * values[(q >> 6) & 3]; + } + sumf += d * sumi; + } + *s = sumf; +} + +void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -3906,6 +4012,6 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } diff --git 
a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index e45a4f688a5..070d1341358 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -403,6 +403,18 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, }, + [GGML_TYPE_Q2_DPT] = { + // from_float not set — requires level initialization via q2dpt_set_levels() + .vec_dot = ggml_vec_dot_q2_dpt_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + }, + [GGML_TYPE_Q2_KPT] = { + // from_float not set — requires level initialization via q2kpt_set_levels() + .vec_dot = ggml_vec_dot_q2_kpt_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, [GGML_TYPE_I32] = { .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32, }, @@ -1234,7 +1246,7 @@ static void ggml_compute_forward_mul_mat_one_chunk( //} for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { - vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot); + vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? 
src1_col_stride : 0), num_rows_per_vec_dot, src0->quant_levels); } for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { @@ -1279,15 +1291,6 @@ void ggml_compute_forward_mul_mat( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // Set current per-tensor quantization levels from registry - { - size_t levels_size; - const void * levels = ggml_quant_get_tensor_aux_data(src0, &levels_size); - if (levels) { - ggml_quant_set_current_levels(src0->type, levels); - } - } - // TODO: extract to "extra_op" #if GGML_USE_LLAMAFILE // broadcast factors @@ -1309,7 +1312,8 @@ void ggml_compute_forward_mul_mat( nb1/ggml_type_size(dst->type), src0->type, src1->type, - dst->type)) + dst->type, + src0->quant_levels)) goto UseGgmlGemm1; return; } @@ -1377,7 +1381,8 @@ UseGgmlGemm1:; nb1/ggml_type_size(dst->type), src0->type, vec_dot_type, - dst->type)) + dst->type, + src0->quant_levels)) goto UseGgmlGemm2; return; } @@ -1511,7 +1516,7 @@ static void ggml_compute_forward_mul_mat_id_one_chunk( float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2)); for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) { - vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1); + vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1, src0->quant_levels); } memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir0_end) - iir0)*sizeof(float)); diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index ef2b0a2e852..39599461b0b 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -3679,7 +3679,7 @@ class tinyBLAS_PPC { */ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C, - int64_t ldc, int Atype, int Btype, int Ctype) { + int64_t ldc, int Atype, int Btype, int Ctype, const void * quant_levels) { assert(m >= 0); assert(n >= 0); @@ -4023,7 
+4023,7 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64 #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) // Q4_DPT has identical block layout to IQ4_NL (block_q4_dpt = block_iq4_nl) // but uses a per-tensor lookup table instead of the fixed IQ4_NL values. - const int8_t * levels = (const int8_t *)ggml_quant_get_current_levels(GGML_TYPE_Q4_DPT); + const int8_t * levels = (const int8_t *)quant_levels; if (!levels) return false; tinyBLAS_Q0_AVX tb{ k, (const block_iq4_nl *)A, lda, diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.h b/ggml/src/ggml-cpu/llamafile/sgemm.h index 867b0c04aee..117a36560e7 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.h +++ b/ggml/src/ggml-cpu/llamafile/sgemm.h @@ -18,7 +18,7 @@ extern "C" { bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t, const void *, int64_t, const void *, int64_t, void *, int64_t, - int, int, int); + int, int, int, const void * quant_levels); #ifdef __cplusplus } diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 73107dcb8f9..ad18e34cc66 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -519,7 +519,7 @@ static void ggml_compute_forward_dup_from_q( dequantize_row_q( (const void *) ((char *) src0->data + x_offset), - (float *) ((char *) dst->data + dst_offset), qk); + (float *) ((char *) dst->data + dst_offset), qk, src0->quant_levels); } } @@ -639,7 +639,7 @@ static void ggml_compute_forward_add_q_f32( assert(ne00 % 32 == 0); // unquantize row from src0 to temp buffer - dequantize_row_q(src0_row, wdata, ne00); + dequantize_row_q(src0_row, wdata, ne00, src0->quant_levels); // add src1 ggml_vec_acc_f32(ne00, wdata, src1_row); // quantize row to dst @@ -972,7 +972,7 @@ static void ggml_compute_forward_add1_q_f32( assert(ne0 % 32 == 0); // unquantize row from src0 to temp buffer - dequantize_row_q(src0_row, wdata, ne0); + dequantize_row_q(src0_row, wdata, ne0, src0->quant_levels); // add 
src1 ggml_vec_acc1_f32(ne0, wdata, v); // quantize row to dst @@ -4315,7 +4315,7 @@ static void ggml_compute_forward_out_prod_q_f32( float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); - dequantize_row_q(s0, wdata, ne0); + dequantize_row_q(s0, wdata, ne0, src0->quant_levels); ggml_vec_mad_f32(ne0, d, wdata, *s1); } } @@ -4690,7 +4690,7 @@ static void ggml_compute_forward_get_rows_q( dequantize_row_q( (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), - (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); + (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc, src0->quant_levels); } } @@ -5424,7 +5424,7 @@ static void ggml_compute_forward_soft_max_ext_back_f32( // linear runtime, no additional memory float dot_y_dy = 0; - ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1); + ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1, nullptr); ggml_vec_cpy_f32 (nc, dx, dy); ggml_vec_acc1_f32 (nc, dx, -dot_y_dy); ggml_vec_mul_f32 (nc, dx, dx, y); @@ -5991,7 +5991,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( float v = 0; ggml_vec_dot_f16(ne02, &v, 0, (ggml_fp16_t *) wdata_src + i1n, 0, - (ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1); + (ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1, nullptr); dst_data[i10*s0 + i00] += v; } } @@ -6079,7 +6079,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32( float v = 0; ggml_vec_dot_f32(ne02, &v, 0, wdata_src + i1n, 0, - wdata_kernel + i00*ne02, 0, 1); + wdata_kernel + i00*ne02, 0, 1, nullptr); dst_data[i10*s0 + i00] += v; } } @@ -6992,7 +6992,7 @@ void ggml_compute_forward_conv_transpose_2d( float v = 0; ggml_vec_dot_f16(ne03, &v, 0, wdata_src + i1n, 0, - wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1); + wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1, nullptr); dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v; } } @@ -8242,7 +8242,7 @@ static void 
ggml_compute_forward_flash_attn_ext_f16_one_chunk( float s; // KQ value const char * k_data = (const char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3); - kq_vec_dot(DK, &s, 0, k_data, 0, Q_q, 0, 1); + kq_vec_dot(DK, &s, 0, k_data, 0, Q_q, 0, 1, k->quant_levels); s = s*scale; // scale KQ value @@ -8289,7 +8289,7 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk( // V += v*expf(s - M) if (v_to_float) { - v_to_float(v_data, V32, DV); + v_to_float(v_data, V32, DV, v->quant_levels); ggml_vec_mad_f32(DV, VKQ32, V32, vs); } else { // V is F32 @@ -9002,7 +9002,7 @@ static void ggml_compute_forward_flash_attn_back_f32( ggml_vec_dot_f32(neq0, S + i1, 0, (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0, - (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1); + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1, nullptr); } // scale @@ -9116,7 +9116,7 @@ static void ggml_compute_forward_flash_attn_back_f32( // S = SM * (S - dot(SM, S)) float dot_SM_gradSM = 0; - ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1); + ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1, nullptr); ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); ggml_vec_mul_f32 (masked_begin, S, S, SM); diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index 9a01a110e52..58746db8316 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -112,7 +112,7 @@ void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRI //===================================== Dot products ================================= -void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { 
const int qk = QK8_0; const int nb = n / qk; @@ -149,7 +149,7 @@ void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c } // TODO: add WASM SIMD -void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_1; const int nb = n / qk; @@ -185,7 +185,7 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } -void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -216,7 +216,7 @@ void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = sumf; } -void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -259,7 +259,7 @@ void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } -void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_1_q8_1_generic(int n, float * 
GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_1; const int nb = n / qk; @@ -302,7 +302,7 @@ void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } -void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -332,7 +332,7 @@ void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } -void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -384,7 +384,7 @@ void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = sumf; } -void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -416,7 +416,7 @@ void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = sumf; } -void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT 
vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -468,7 +468,7 @@ void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } -void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -547,7 +547,7 @@ void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } -void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -622,7 +622,7 @@ void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } -void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -702,7 +702,7 @@ void ggml_vec_dot_q5_K_q8_K_generic(int n, float * 
GGML_RESTRICT s, size_t bs, c *s = sumf; } -void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -757,7 +757,7 @@ void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } -void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -799,7 +799,7 @@ void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs *s = 0.125f * sumf; } -void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -849,7 +849,7 @@ void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = 0.125f * sumf; } -void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * 
GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -901,7 +901,7 @@ void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = 0.125f * sumf; } -void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -945,7 +945,7 @@ void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs *s = 0.25f * sumf; } -void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -1001,7 +1001,7 @@ void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = sumf; } -void ggml_vec_dot_q3_pt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q3_pt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -1014,8 +1014,8 @@ void ggml_vec_dot_q3_pt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const int nb = n / QK_K; - const float * levels = (const float *)ggml_quant_get_current_levels(GGML_TYPE_Q3_PT); - GGML_ASSERT(levels != 
NULL && "Q3_PT levels not set for tensor"); + const float * lv = (const float *)levels; + GGML_ASSERT(lv != NULL && "Q3_PT levels not set for tensor"); float sumf = 0.f; for (int i = 0; i < nb; ++i) { @@ -1047,7 +1047,7 @@ void ggml_vec_dot_q3_pt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const int qoff = qbit % 8; int q = (qs[qbyte] >> qoff) & 0x7; if (qoff > 5) { q |= (int)((qs[qbyte+1] << (8 - qoff)) & 0x7); } - sum_lq += levels[q] * (float)q8[qk]; + sum_lq += lv[q] * (float)q8[qk]; } // min contribution uses precomputed 16-element sum from block_q8_K.bsums block_sum += sum_lq * range + sub_min * (float)y[i].bsums[ib]; @@ -1057,7 +1057,7 @@ void ggml_vec_dot_q3_pt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = sumf; } -void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -1101,7 +1101,7 @@ void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } // Q3_KPT vec_dot - similar to Q3_K but with learned levels -void ggml_vec_dot_q3_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q3_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -1114,8 +1114,8 @@ void ggml_vec_dot_q3_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const int nb = n / QK_K; - const float * levels = (const float *)ggml_quant_get_current_levels(GGML_TYPE_Q3_KPT); - 
GGML_ASSERT(levels != NULL && "Q3_KPT levels not set for tensor"); + const float * lv = (const float *)levels; + GGML_ASSERT(lv != NULL && "Q3_KPT levels not set for tensor"); const uint32_t kmask1 = 0x03030303; const uint32_t kmask2 = 0x0f0f0f0f; @@ -1152,11 +1152,11 @@ void ggml_vec_dot_q3_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, float sum1 = 0.f, sum2 = 0.f; for (int l = 0; l < 16; ++l) { int k_idx = ((q[l+0] >> shift) & 3) + ((hm[l+0] & m) ? 4 : 0); - sum1 += (levels[k_idx] * 7.0f - 4.0f) * (float)q8[l+0]; + sum1 += (lv[k_idx] * 7.0f - 4.0f) * (float)q8[l+0]; } for (int l = 0; l < 16; ++l) { int k_idx = ((q[l+16] >> shift) & 3) + ((hm[l+16] & m) ? 4 : 0); - sum2 += (levels[k_idx] * 7.0f - 4.0f) * (float)q8[l+16]; + sum2 += (lv[k_idx] * 7.0f - 4.0f) * (float)q8[l+16]; } block_sum += dl1 * sum1 + dl2 * sum2; @@ -1171,11 +1171,86 @@ void ggml_vec_dot_q3_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = sumf; } -void ggml_vec_dot_q3_kpt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - ggml_vec_dot_q3_kpt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); +void ggml_vec_dot_q3_kpt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + ggml_vec_dot_q3_kpt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); } -void ggml_vec_dot_q4_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +// Q2_KPT vec_dot - similar to Q2_K but with learned levels +void ggml_vec_dot_q2_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); 
+ + const block_q2_kpt * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + const float * lv = (const float *)levels; + GGML_ASSERT(lv != NULL && "Q2_KPT levels not set for tensor"); + + // Precompute mapped levels: ml[k] = levels[k] * 3.0 + float ml[Q2KPT_N_LEVELS]; + for (int k = 0; k < Q2KPT_N_LEVELS; ++k) { + ml[k] = lv[k] * 3.0f; + } + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + // Min term: accumulate integer bsums * min_scale (same as Q2_K) + int summs = 0; + for (int j = 0; j < 16; ++j) { + summs += y[i].bsums[j] * (sc[j] >> 4); + } + + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + // Scale term: need floating-point because levels are non-uniform + int is = 0; + float fsum = 0; + for (int k = 0; k < QK_K/128; ++k) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + int d_sc = sc[is++] & 0xF; + float suml = 0; + for (int l = 0; l < 16; ++l) { + int idx = (q2[l] >> shift) & 3; + suml += ml[idx] * (float)q8[l]; + } + fsum += d_sc * suml; + + d_sc = sc[is++] & 0xF; + suml = 0; + for (int l = 16; l < 32; ++l) { + int idx = (q2[l] >> shift) & 3; + suml += ml[idx] * (float)q8[l]; + } + fsum += d_sc * suml; + + shift += 2; + q8 += 32; + } + q2 += 32; + } + sumf += dall * fsum - dmin * summs; + } + *s = sumf; +} + +void ggml_vec_dot_q2_kpt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + ggml_vec_dot_q2_kpt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); +} + +void ggml_vec_dot_q4_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -1189,7 
+1264,7 @@ void ggml_vec_dot_q4_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const int nb = n / QK4_NL; - const int8_t * values = (const int8_t *)ggml_quant_get_current_levels(GGML_TYPE_Q4_DPT); + const int8_t * values = (const int8_t *)levels; GGML_ASSERT(values != NULL && "Q4_DPT levels not set for tensor"); float sumf = 0; @@ -1205,7 +1280,40 @@ void ggml_vec_dot_q4_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = sumf; } -void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q2_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK2_DPT == 0); + static_assert(QK2_DPT == QK8_0, "QK2_DPT and QK8_0 must be the same"); + + const block_q2_dpt * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + const int nb = n / QK2_DPT; + + const int8_t * values = (const int8_t *)levels; + GGML_ASSERT(values != NULL && "Q2_DPT levels not set for tensor"); + + float sumf = 0; + for (int ib = 0; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d) * GGML_CPU_FP16_TO_FP32(x[ib].d); + int32_t blk = 0; + for (int j = 0; j < QK2_DPT/4; ++j) { + uint8_t q = x[ib].qs[j]; + blk += (int32_t)y[ib].qs[j*4 + 0] * (int32_t)values[(q >> 0) & 3]; + blk += (int32_t)y[ib].qs[j*4 + 1] * (int32_t)values[(q >> 2) & 3]; + blk += (int32_t)y[ib].qs[j*4 + 2] * (int32_t)values[(q >> 4) & 3]; + blk += (int32_t)y[ib].qs[j*4 + 3] * (int32_t)values[(q >> 6) & 3]; + } + sumf += d * (float)blk; + } + *s = sumf; +} + +void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * 
levels) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -1266,7 +1374,7 @@ void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = sumf; } -void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -1295,7 +1403,7 @@ void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = sumf; } -void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); diff --git a/ggml/src/ggml-cpu/quants.h b/ggml/src/ggml-cpu/quants.h index 1377a6a59bd..973e552f02f 100644 --- a/ggml/src/ggml-cpu/quants.h +++ b/ggml/src/ggml-cpu/quants.h @@ -35,68 +35,72 @@ void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); // Dot product -void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t 
by, int nrc); -void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); - -void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); - -void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); - -void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); - -void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT 
vy, size_t by, int nrc); -void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q3_pt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q3_kpt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q3_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q4_dpt_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q4_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, 
size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); + +void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); + +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, 
size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); + +void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); + +void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int 
nrc, const void * levels); +void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q3_pt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q3_kpt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q3_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q4_dpt_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q4_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q2_dpt_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q2_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q2_kpt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q2_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); // Generic implementation void 
quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); - -void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); - -void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); - -void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void 
* GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void 
ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q3_pt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); + +void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); + +void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); + +void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const 
void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, 
const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q3_pt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); #ifdef __cplusplus } diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp index d0e4001338a..ebf63280ad7 100644 --- a/ggml/src/ggml-cpu/vec.cpp +++ b/ggml/src/ggml-cpu/vec.cpp @@ -8,7 +8,8 @@ ggml_fp16_t ggml_table_gelu_f16[1 << 16]; // precomputed quick gelu table for f16 (128 KB) ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16]; -void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) { +void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); GGML_UNUSED(nrc); GGML_UNUSED(bx); @@ -136,7 +137,8 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G *s = sumf; } -void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc) { +void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, 
size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); GGML_UNUSED(nrc); GGML_UNUSED(bx); @@ -261,7 +263,8 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * *s = sumf; } -void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) { +void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); GGML_UNUSED(nrc); GGML_UNUSED(bx); diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index 3198b33b509..ed0f7555673 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -39,9 +39,9 @@ extern ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16]; // fundamental operations // -void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc); -void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc); -void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc); +void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc, const void * levels); +void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc, const void * levels); +void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc, const void * levels); void 
ggml_vec_silu_f32(const int n, float * y, const float * x); ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); //it will also center y ( y = y - mean ) @@ -863,7 +863,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float #endif } -inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); } +inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1, NULL); *s = sqrtf(*s); } inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 140b73a3d95..0f00f7cc075 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1026,6 +1026,16 @@ struct ggml_cuda_type_traits { // Each TU gets its own copy; initialized via cudaGetSymbolAddress + cudaMemcpyAsync before use. __device__ int8_t q4dpt_levels_cuda[16]; +// Per-tensor lookup table for Q2_DPT (4 int8 levels). 
+__device__ int8_t q2dpt_levels_cuda[4]; + +template<> +struct ggml_cuda_type_traits<GGML_TYPE_Q2_DPT> { + static constexpr int qk = QK2_DPT; + static constexpr int qr = 4; // 4 elements per "quantum" (2-bit) + static constexpr int qi = 1; // 1 uint32 per block +}; + template<> struct ggml_cuda_type_traits { static constexpr int qk = QK_K; diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index 3fb03091991..b73fa9058bc 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -599,6 +599,12 @@ void ggml_cuda_set_q4dpt_levels(const int8_t * levels, cudaStream_t stream) { CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, levels, 16, cudaMemcpyDeviceToDevice, stream)); } +void ggml_cuda_set_q2dpt_levels(const int8_t * levels, cudaStream_t stream) { + int8_t * d_q2dpt_levels; + CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q2dpt_levels, q2dpt_levels_cuda)); + CUDA_CHECK(cudaMemcpyAsync(d_q2dpt_levels, levels, 4, cudaMemcpyDeviceToDevice, stream)); +} + template <typename dst_t> static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { const int nb = (k + QK_K - 1) / QK_K; @@ -628,6 +634,30 @@ static void dequantize_row_q4_dpt_cuda(const void * vx, dst_t * y, const int64_t dequantize_block_q4_dpt<<>>(vx, y); } +template <typename dst_t> +static __global__ void dequantize_block_q2_dpt(const void * __restrict__ vx, dst_t * __restrict__ yy) { + const int64_t i = blockIdx.x; + const block_q2_dpt * x = (const block_q2_dpt *) vx + i*(QK_K/QK2_DPT); + + const int64_t tid = threadIdx.x; + const int64_t il = tid/8; // 0...3 + const int64_t ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 4*il; + const uint8_t * q2 = x[ib].qs + il; + const float d = (float)x[ib].d; + uint8_t q = q2[0]; + y[ 0] = d * q2dpt_levels_cuda[(q >> 0) & 3]; + y[ 1] = d * q2dpt_levels_cuda[(q >> 2) & 3]; + y[ 2] = d * q2dpt_levels_cuda[(q >> 4) & 3]; + y[ 3] = d * q2dpt_levels_cuda[(q >> 6) & 3]; +} + +template <typename dst_t> +static void dequantize_row_q2_dpt_cuda(const void
* vx, dst_t * y, const int64_t k, cudaStream_t stream) { + const int nb = (k + QK_K - 1) / QK_K; + dequantize_block_q2_dpt<<<nb, 64, 0, stream>>>(vx, y); + } + template <typename dst_t> static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { const int nb = k / QK_K; @@ -740,6 +770,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { return dequantize_row_iq4_nl_cuda; case GGML_TYPE_Q4_DPT: return dequantize_row_q4_dpt_cuda; + case GGML_TYPE_Q2_DPT: + return dequantize_row_q2_dpt_cuda; case GGML_TYPE_IQ4_XS: return dequantize_row_iq4_xs_cuda; case GGML_TYPE_IQ3_S: @@ -793,6 +825,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_row_iq4_nl_cuda; case GGML_TYPE_Q4_DPT: return dequantize_row_q4_dpt_cuda; + case GGML_TYPE_Q2_DPT: + return dequantize_row_q2_dpt_cuda; case GGML_TYPE_IQ4_XS: return dequantize_row_iq4_xs_cuda; case GGML_TYPE_IQ3_S: diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 034553bf806..2876570da3e 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1344,10 +1344,8 @@ static void ggml_cuda_op_mul_mat_cublas( if (src0->type != GGML_TYPE_F32) { // Set Q4_DPT levels in convert.cu's TU before dequantize if (src0->type == GGML_TYPE_Q4_DPT) { - size_t levels_size; - const void * levels = ggml_quant_get_tensor_aux_data(src0, &levels_size); - GGML_ASSERT(levels && "Q4_DPT MUL_MAT requires levels"); - ggml_cuda_set_q4dpt_levels((const int8_t *)levels, stream); + GGML_ASSERT(src0->quant_levels && "Q4_DPT MUL_MAT requires levels (set tensor->quant_levels)"); + ggml_cuda_set_q4dpt_levels((const int8_t *)src0->quant_levels, stream); } const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); GGML_ASSERT(to_fp32_cuda != nullptr); diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index fbc75b6f6f0..1931f35e170 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -67,6 +67,9 @@ static void
ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, con case GGML_TYPE_Q4_DPT: mul_mat_q_case<GGML_TYPE_Q4_DPT>(ctx, args, stream); break; + case GGML_TYPE_Q2_DPT: + mul_mat_q_case<GGML_TYPE_Q2_DPT>(ctx, args, stream); + break; default: GGML_ABORT("fatal error"); break; @@ -84,14 +87,20 @@ void ggml_cuda_mul_mat_q( cudaStream_t stream = ctx.stream(); const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; - // Set Q4_DPT lookup table from per-tensor registry + // Set Q4_DPT lookup table from tensor's quant_levels if (src0->type == GGML_TYPE_Q4_DPT) { - size_t levels_size; - const void * levels = ggml_quant_get_tensor_aux_data(src0, &levels_size); - GGML_ASSERT(levels && "Q4_DPT MUL_MAT requires levels (register with ggml_quant_set_tensor_aux_data)"); + GGML_ASSERT(src0->quant_levels && "Q4_DPT MUL_MAT requires levels (set tensor->quant_levels)"); int8_t * d_q4dpt_levels; CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q4dpt_levels, q4dpt_levels_cuda)); - CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, levels, levels_size, cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, src0->quant_levels, Q4DPT_N_LEVELS * sizeof(int8_t), cudaMemcpyHostToDevice, stream)); + } + + // Set Q2_DPT lookup table from tensor's quant_levels + if (src0->type == GGML_TYPE_Q2_DPT) { + GGML_ASSERT(src0->quant_levels && "Q2_DPT MUL_MAT requires levels (set tensor->quant_levels)"); + int8_t * d_q2dpt_levels; + CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q2dpt_levels, q2dpt_levels_cuda)); + CUDA_CHECK(cudaMemcpyAsync(d_q2dpt_levels, src0->quant_levels, Q2DPT_N_LEVELS * sizeof(int8_t), cudaMemcpyHostToDevice, stream)); } const size_t ts_src0 = ggml_type_size(src0->type); @@ -302,6 +311,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_NL: case GGML_TYPE_Q4_DPT: + case GGML_TYPE_Q2_DPT: mmq_supported = true; break; default: @@ -385,3 +395,4 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t
ne11, int64_t // because it accesses the TU-local __device__ variable q4dpt_levels_cuda, // which is initialized by the code above. DECL_MMQ_CASE(GGML_TYPE_Q4_DPT); +DECL_MMQ_CASE(GGML_TYPE_Q2_DPT); diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index a55e567fba8..d411859d8be 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -88,6 +88,7 @@ static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) { case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_NL: case GGML_TYPE_Q4_DPT: + case GGML_TYPE_Q2_DPT: return MMQ_Q8_1_DS_LAYOUT_D4; default: GGML_ABORT("fatal error"); @@ -205,6 +206,7 @@ static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml case GGML_TYPE_IQ4_XS: return MMQ_DP4A_TXS_Q8_0; case GGML_TYPE_IQ4_NL: return MMQ_DP4A_TXS_Q8_0; case GGML_TYPE_Q4_DPT: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_Q2_DPT: return MMQ_DP4A_TXS_Q8_0_16; default: return tile_x_sizes{0, 0, 0}; } } @@ -247,6 +249,7 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) { case GGML_TYPE_IQ4_XS: return MMQ_MMA_TILE_X_K_Q8_0; case GGML_TYPE_IQ4_NL: return MMQ_MMA_TILE_X_K_Q8_0; case GGML_TYPE_Q4_DPT: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_Q2_DPT: return MMQ_MMA_TILE_X_K_Q8_0; default: return 0; } } @@ -3434,6 +3437,14 @@ struct mmq_type_traits { static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a; }; +template +struct mmq_type_traits { + static constexpr int vdr = VDR_Q2_DPT_Q8_1_MMQ; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_dpt; // Reuse Q4_DPT loader (same layout) + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a; +}; + template struct mmq_type_traits { static constexpr int vdr = VDR_IQ4_XS_Q8_1_MMQ; diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 788cf6749e6..1e8eb34fd50 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ 
b/ggml/src/ggml-cuda/mmvq.cu @@ -30,6 +30,7 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) case GGML_TYPE_IQ1_M: return vec_dot_iq1_m_q8_1; case GGML_TYPE_IQ4_NL: return vec_dot_iq4_nl_q8_1; case GGML_TYPE_Q4_DPT: return vec_dot_q4_dpt_q8_1; + case GGML_TYPE_Q2_DPT: return vec_dot_q2_dpt_q8_1; case GGML_TYPE_IQ4_XS: return vec_dot_iq4_xs_q8_1; case GGML_TYPE_IQ3_S: return vec_dot_iq3_s_q8_1; default: return nullptr; @@ -56,6 +57,7 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) { case GGML_TYPE_IQ3_S: return VDR_IQ3_S_Q8_1_MMVQ; case GGML_TYPE_IQ4_NL: return VDR_IQ4_NL_Q8_1_MMVQ; case GGML_TYPE_Q4_DPT: return VDR_Q4_DPT_Q8_1_MMVQ; + case GGML_TYPE_Q2_DPT: return VDR_Q2_DPT_Q8_1_MMVQ; case GGML_TYPE_IQ4_XS: return VDR_IQ4_XS_Q8_1_MMVQ; default: return 1; } @@ -656,14 +658,20 @@ void ggml_cuda_mul_mat_vec_q( cudaStream_t stream = ctx.stream(); - // Set Q4_DPT lookup table from per-tensor registry + // Set Q4_DPT lookup table from tensor's quant_levels if (src0->type == GGML_TYPE_Q4_DPT) { - size_t levels_size; - const void * levels = ggml_quant_get_tensor_aux_data(src0, &levels_size); - GGML_ASSERT(levels && "Q4_DPT MUL_MAT requires levels (register with ggml_quant_set_tensor_aux_data)"); + GGML_ASSERT(src0->quant_levels && "Q4_DPT MUL_MAT requires levels (set tensor->quant_levels)"); int8_t * d_q4dpt_levels; CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q4dpt_levels, q4dpt_levels_cuda)); - CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, levels, levels_size, cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, src0->quant_levels, Q4DPT_N_LEVELS * sizeof(int8_t), cudaMemcpyHostToDevice, stream)); + } + + // Set Q2_DPT lookup table from tensor's quant_levels + if (src0->type == GGML_TYPE_Q2_DPT) { + GGML_ASSERT(src0->quant_levels && "Q2_DPT MUL_MAT requires levels (set tensor->quant_levels)"); + int8_t * d_q2dpt_levels; + CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q2dpt_levels, q2dpt_levels_cuda)); 
+ CUDA_CHECK(cudaMemcpyAsync(d_q2dpt_levels, src0->quant_levels, Q2DPT_N_LEVELS * sizeof(int8_t), cudaMemcpyHostToDevice, stream)); } const size_t ts_src0 = ggml_type_size(src0->type); diff --git a/ggml/src/ggml-cuda/vecdotq.cuh b/ggml/src/ggml-cuda/vecdotq.cuh index 7f0377b8385..2cfb100f492 100644 --- a/ggml/src/ggml-cuda/vecdotq.cuh +++ b/ggml/src/ggml-cuda/vecdotq.cuh @@ -1232,6 +1232,43 @@ static __device__ __forceinline__ float vec_dot_q4_dpt_q8_1( return d * sumi; } +// Q2_DPT: 2-bit quantization with 4 learned levels +// Helper: lookup 4 int8 levels using 2-bit indices packed in a 32-bit int +static __device__ __forceinline__ int4 get_int_from_table_4(const int & q2, const int8_t * table) { + int4 result; + result.x = table[(q2 >> 0) & 3]; + result.y = table[(q2 >> 8) & 3]; + result.z = table[(q2 >> 16) & 3]; + result.w = table[(q2 >> 24) & 3]; + return result; +} + +#define VDR_Q2_DPT_Q8_1_MMVQ 4 +#define VDR_Q2_DPT_Q8_1_MMQ 8 + +static __device__ __forceinline__ float vec_dot_q2_dpt_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { + + const block_q2_dpt * bq2 = (const block_q2_dpt *) vbq + kbx; + + const int * q8 = (const int *) bq8_1->qs + iqs; + + int sumi = 0; +#pragma unroll + for (int l = 0; l < VDR_Q2_DPT_Q8_1_MMVQ; ++l) { + const int aux_q2 = get_int_b4(bq2->qs, l); + const int4 v = get_int_from_table_4(aux_q2, q2dpt_levels_cuda); + + sumi = ggml_cuda_dp4a(v.x, q8[l + 0], sumi); + sumi = ggml_cuda_dp4a(v.y, q8[l + 4], sumi); + sumi = ggml_cuda_dp4a(v.z, q8[l + 8], sumi); + sumi = ggml_cuda_dp4a(v.w, q8[l + 12], sumi); + } + + const float d = __half2float(bq2->d) * __low2float(bq8_1->ds); + return d * sumi; +} + #define VDR_IQ4_XS_Q8_1_MMVQ 4 #define VDR_IQ4_XS_Q8_1_MMQ 4 diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index f3771ebb93c..52d7c681f7a 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -304,7 +304,7 @@ void quantize_row_mxfp4_ref(const float * 
GGML_RESTRICT x, block_mxfp4 * GGML_RE } } -void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { static const int qk = QK4_0; assert(k % qk == 0); @@ -324,7 +324,7 @@ void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRI } } -void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { static const int qk = QK4_1; assert(k % qk == 0); @@ -345,7 +345,7 @@ void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRI } } -void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { static const int qk = QK5_0; assert(k % qk == 0); @@ -371,7 +371,7 @@ void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRI } } -void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { static const int qk = QK5_1; assert(k % qk == 0); @@ -398,7 +398,7 @@ void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRI } } -void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { static const int qk = QK8_0; assert(k % qk == 0); @@ -414,7 +414,7 @@ void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRI } } -void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * 
GGML_RESTRICT y, int64_t k) { +void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { static const int qk = QK_MXFP4; assert(k % qk == 0); @@ -790,7 +790,7 @@ void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_REST } } -void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1134,7 +1134,7 @@ void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_REST } } -void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1358,7 +1358,7 @@ void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_REST } } -void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1560,7 +1560,7 @@ void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_REST } } -void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -1768,7 +1768,7 @@ void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_REST } } -void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * 
GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2221,7 +2221,7 @@ size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, return nrow * row_size; } -void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2260,7 +2260,7 @@ void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_REST } } -void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2281,7 +2281,7 @@ void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_REST // ====================== "True" 2-bit (de)-quantization -void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2309,7 +2309,7 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_ // ====================== 2.3125 bpw (de)-quantization -void dequantize_row_iq2_xs(const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_iq2_xs(const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2336,7 +2336,7 @@ void dequantize_row_iq2_xs(const block_iq2_xs * GGML_RESTRICT x, float * GGML_RE // ====================== 2.5625 bpw (de)-quantization -void dequantize_row_iq2_s(const block_iq2_s * GGML_RESTRICT x, 
float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_iq2_s(const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2368,7 +2368,7 @@ void dequantize_row_iq2_s(const block_iq2_s * GGML_RESTRICT x, float * GGML_REST // ====================== 3.0625 bpw (de)-quantization -void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2400,7 +2400,7 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_ // ====================== 3.3125 bpw (de)-quantization -void dequantize_row_iq3_s(const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_iq3_s(const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2443,7 +2443,7 @@ void dequantize_row_iq3_s(const block_iq3_s * GGML_RESTRICT x, float * GGML_REST // ====================== 1.5625 bpw (de)-quantization -void dequantize_row_iq1_s(const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_iq1_s(const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2468,7 +2468,7 @@ void dequantize_row_iq1_s(const block_iq1_s * GGML_RESTRICT x, float * GGML_REST } } -void dequantize_row_iq1_m(const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_iq1_m(const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2518,7 +2518,7 @@ void dequantize_row_iq1_m(const block_iq1_m * GGML_RESTRICT x, 
float * GGML_REST } } -void dequantize_row_iq4_nl(const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_iq4_nl(const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK4_NL == 0); const int64_t nb = k / QK4_NL; @@ -2536,7 +2536,7 @@ void dequantize_row_iq4_nl(const block_iq4_nl * GGML_RESTRICT x, float * GGML_RE } } -void dequantize_row_iq4_xs(const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_iq4_xs(const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2600,7 +2600,7 @@ void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_REST } } -void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -4092,7 +4092,6 @@ static bool q3kpt_levels_set = false; GGML_API void q3kpt_set_levels(const float * levels) { memcpy(q3kpt_levels, levels, Q3KPT_N_LEVELS * sizeof(float)); q3kpt_levels_set = true; - ggml_quant_set_current_levels(GGML_TYPE_Q3_KPT, q3kpt_levels); } GGML_API const float * q3kpt_get_levels(void) { @@ -4231,11 +4230,11 @@ GGML_API void q3kpt_train_levels(const float * data, free(bin_sum_wt); } -void dequantize_row_q3_kpt(const block_q3_kpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q3_kpt(const block_q3_kpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; - const float * levels = (const float *)ggml_quant_get_current_levels(GGML_TYPE_Q3_KPT); - GGML_ASSERT(levels != NULL && "Q3_KPT levels not set for tensor"); + const float * lv = (const float *)levels; + GGML_ASSERT(lv != NULL && 
"Q3_KPT levels not set for tensor"); // levels are in [0,1], map to approximate [-4, 3] range for Q3_K compatibility // The dequant formula: y = d * sc * (L[k] * 8 - 4) = d * sc * (L[k] - 0.5) * 8 @@ -4275,11 +4274,11 @@ void dequantize_row_q3_kpt(const block_q3_kpt * GGML_RESTRICT x, float * GGML_RE for (int l = 0; l < 16; ++l) { int k_idx = ((q[l + 0] >> shift) & 3) + ((hm[l + 0] & m) ? 4 : 0); - y[l + 0] = dl1 * (levels[k_idx] * 7.0f - 4.0f); + y[l + 0] = dl1 * (lv[k_idx] * 7.0f - 4.0f); } for (int l = 0; l < 16; ++l) { int k_idx = ((q[l + 16] >> shift) & 3) + ((hm[l + 16] & m) ? 4 : 0); - y[l + 16] = dl2 * (levels[k_idx] * 7.0f - 4.0f); + y[l + 16] = dl2 * (lv[k_idx] * 7.0f - 4.0f); } y += 32; shift += 2; @@ -4547,7 +4546,6 @@ static bool q4dpt_levels_set = false; void q4dpt_set_levels(const int8_t * levels) { memcpy(q4dpt_levels, levels, Q4DPT_N_LEVELS * sizeof(int8_t)); q4dpt_levels_set = true; - ggml_quant_set_current_levels(GGML_TYPE_Q4_DPT, q4dpt_levels); } const int8_t * q4dpt_get_levels(void) { @@ -4560,18 +4558,20 @@ void q4dpt_free_levels(void) { // Run Lloyd-Max iterations on a pre-built histogram. -// levels[] is updated in-place (and kept sorted). +// levels[] (n_levels entries) is updated in-place (and kept sorted). 
static void q4dpt_run_lloyd_max(const float * bin_sum_w, const float * bin_sum_wt, - float * levels, int n_bins, float bin_width, int max_iter) { + float * levels, int n_levels, int n_bins, float bin_width, int max_iter) { + // sw/swt sized for the max possible n_levels (Q4DPT_N_LEVELS) + float sw[Q4DPT_N_LEVELS] = { 0 }; + float swt[Q4DPT_N_LEVELS] = { 0 }; for (int iter = 0; iter < max_iter; ++iter) { - float sw[Q4DPT_N_LEVELS] = { 0 }; - float swt[Q4DPT_N_LEVELS] = { 0 }; + for (int k = 0; k < n_levels; ++k) { sw[k] = 0; swt[k] = 0; } for (int b = 0; b < n_bins; ++b) { if (bin_sum_w[b] < 1e-12f) { continue; } float t = -1.0f + (b + 0.5f) * bin_width; int best = 0; float bd = (t - levels[0]) * (t - levels[0]); - for (int k = 1; k < Q4DPT_N_LEVELS; ++k) { + for (int k = 1; k < n_levels; ++k) { float d = (t - levels[k]) * (t - levels[k]); if (d < bd) { bd = d; best = k; } } @@ -4579,7 +4579,7 @@ static void q4dpt_run_lloyd_max(const float * bin_sum_w, const float * bin_sum_w swt[best] += bin_sum_wt[b]; } float max_delta = 0.0f; - for (int k = 0; k < Q4DPT_N_LEVELS; ++k) { + for (int k = 0; k < n_levels; ++k) { if (sw[k] > 1e-12f) { float nl = swt[k] / sw[k]; max_delta = fmaxf(max_delta, fabsf(nl - levels[k])); @@ -4587,7 +4587,7 @@ static void q4dpt_run_lloyd_max(const float * bin_sum_w, const float * bin_sum_w } } if (max_delta < 1e-10f) { break; } - for (int k = 1; k < Q4DPT_N_LEVELS; ++k) { + for (int k = 1; k < n_levels; ++k) { float v = levels[k]; int m = k - 1; while (m >= 0 && levels[m] > v) { levels[m+1] = levels[m]; m--; } @@ -4644,7 +4644,7 @@ void q4dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, for (int k = 0; k < Q4DPT_N_LEVELS; ++k) { best_levels[k] = (float)kvalues_iq4nl[k] / 127.0f; } - q4dpt_run_lloyd_max(bin_sum_w, bin_sum_wt, best_levels, N_BINS, bin_width, 500); + q4dpt_run_lloyd_max(bin_sum_w, bin_sum_wt, best_levels, Q4DPT_N_LEVELS, N_BINS, bin_width, 500); // Round float centroids to int8, preserve sort order int8_t 
levels_i8[Q4DPT_N_LEVELS]; @@ -4716,10 +4716,10 @@ void q4dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, free(bin_sum_wt); } -void dequantize_row_q4_dpt(const block_q4_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q4_dpt(const block_q4_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK4_NL == 0); const int64_t nb = k / QK4_NL; - const int8_t * values = (const int8_t *)ggml_quant_get_current_levels(GGML_TYPE_Q4_DPT); + const int8_t * values = (const int8_t *)levels; GGML_ASSERT(values != NULL && "Q4_DPT levels not set for tensor"); for (int i = 0; i < nb; i++) { @@ -4846,6 +4846,889 @@ void quantize_row_q4_dpt_ref(const float * GGML_RESTRICT x, block_q4_dpt * GGML_ quantize_q4_dpt(x, y, 1, k, NULL); } +//////////////////////////////////////////////////////////////////////////////// +// Q2_DPT - 2-bit per-tensor Lloyd-Max quantization (2.5 bpw) +//////////////////////////////////////////////////////////////////////////////// + +// Global levels (used during quantization for the current tensor) +static int8_t q2dpt_levels[Q2DPT_N_LEVELS]; +static bool q2dpt_levels_set = false; + +void q2dpt_set_levels(const int8_t * levels) { + memcpy(q2dpt_levels, levels, Q2DPT_N_LEVELS * sizeof(int8_t)); + q2dpt_levels_set = true; +} + +const int8_t * q2dpt_get_levels(void) { + return q2dpt_levels_set ? q2dpt_levels : NULL; +} + +void q2dpt_free_levels(void) { + q2dpt_levels_set = false; +} + +// Lloyd-Max k-means for Q2_DPT: train 4 int8 levels from weight data. +// Bins normalized values (x/amax) in [-1,1], runs weighted k-means, rounds to sorted int8[4]. +// Also sets the global levels via q2dpt_set_levels(). 
+void q2dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, int8_t levels_out[Q2DPT_N_LEVELS]) { + GGML_ASSERT(nrow * n_per_row > 0); + GGML_ASSERT(n_per_row % QK2_DPT == 0); + + const int N_BINS = 8192; + const float bin_width = 2.0f / N_BINS; + + // Allocate and clear histogram buffers + float * bin_sum_w = (float *) calloc(N_BINS, sizeof(float)); + float * bin_sum_wt = (float *) calloc(N_BINS, sizeof(float)); + GGML_ASSERT(bin_sum_w && bin_sum_wt); + + // Build histogram: bin normalized values (x/amax), weight by amax^2 + for (int64_t row = 0; row < nrow; ++row) { + const float * row_data = data + row * n_per_row; + const float * row_w = imatrix ? imatrix + row * n_per_row : NULL; + + for (int64_t ibl = 0; ibl < n_per_row / QK2_DPT; ++ibl) { + const float * block = row_data + ibl * QK2_DPT; + const float * w = row_w ? row_w + ibl * QK2_DPT : NULL; + + // Find max abs in block + float amax = 0.0f; + for (int j = 0; j < QK2_DPT; ++j) { + float ax = fabsf(block[j]); + if (ax > amax) amax = ax; + } + if (amax < 1e-10f) continue; + + // Bin normalized values + for (int j = 0; j < QK2_DPT; ++j) { + float x = block[j] / amax; + float wt = amax * amax * (w ? w[j] : 1.0f); + int bin = (int)((x + 1.0f) / bin_width); + bin = (bin < 0) ? 0 : (bin >= N_BINS) ? 
N_BINS - 1 : bin; + bin_sum_w[bin] += wt; + bin_sum_wt[bin] += x * wt; + } + } + } + + // Initialize from Q4_DPT levels (subsample to 4 levels) + const int8_t * q4dpt_init = q4dpt_get_levels(); + float best_levels[Q2DPT_N_LEVELS]; + if (q4dpt_init) { + // Subsample Q4_DPT's 16 levels to 4 levels + best_levels[0] = (float)q4dpt_init[0] / 127.0f; + best_levels[1] = (float)q4dpt_init[5] / 127.0f; + best_levels[2] = (float)q4dpt_init[10] / 127.0f; + best_levels[3] = (float)q4dpt_init[15] / 127.0f; + } else { + // Fallback: uniform asymmetric initialization + best_levels[0] = -1.0f; + best_levels[1] = -0.33f; + best_levels[2] = 0.33f; + best_levels[3] = 1.0f; + } + + // Run Lloyd-Max iterations + q4dpt_run_lloyd_max(bin_sum_w, bin_sum_wt, best_levels, Q2DPT_N_LEVELS, N_BINS, bin_width, 500); + + // Round to int8 and enforce sorted order + int8_t levels_i8[Q2DPT_N_LEVELS]; + for (int k = 0; k < Q2DPT_N_LEVELS; ++k) { + int v = (int)roundf(best_levels[k] * 127.0f); + if (v < -128) v = -128; + if (v > 127) v = 127; + levels_i8[k] = (int8_t)v; + } + + // Greedy local search: try +/-1 adjustments + float base_score = 0.0f; + for (int bin = 0; bin < N_BINS; ++bin) { + if (bin_sum_w[bin] > 0) { + float x = bin_sum_wt[bin] / bin_sum_w[bin]; + float best_dist = fabsf(x - best_levels[0]); + for (int k = 1; k < Q2DPT_N_LEVELS; ++k) { + float dist = fabsf(x - best_levels[k]); + if (dist < best_dist) best_dist = dist; + } + base_score += best_dist * bin_sum_w[bin]; + } + } + + for (int pass = 0; pass < 10; ++pass) { + bool improved = false; + for (int k = 0; k < Q2DPT_N_LEVELS; ++k) { + int8_t orig = levels_i8[k]; + for (int delta = -1; delta <= 1; delta += 2) { + int8_t trial = (int8_t)(orig + delta); + if (k > 0 && trial <= levels_i8[k-1]) continue; + if (k < Q2DPT_N_LEVELS - 1 && trial >= levels_i8[k+1]) continue; + + levels_i8[k] = trial; + float cur_levels[Q2DPT_N_LEVELS]; + for (int i = 0; i < Q2DPT_N_LEVELS; ++i) + cur_levels[i] = (float)levels_i8[i] / 127.0f; + + float 
cur_score = 0.0f; + for (int bin = 0; bin < N_BINS; ++bin) { + if (bin_sum_w[bin] > 0) { + float x = bin_sum_wt[bin] / bin_sum_w[bin]; + float best_dist = fabsf(x - cur_levels[0]); + for (int i = 1; i < Q2DPT_N_LEVELS; ++i) { + float dist = fabsf(x - cur_levels[i]); + if (dist < best_dist) best_dist = dist; + } + cur_score += best_dist * bin_sum_w[bin]; + } + } + + if (cur_score < base_score) { + base_score = cur_score; + improved = true; + } else { + levels_i8[k] = orig; + } + } + } + if (!improved) break; + } + + memcpy(levels_out, levels_i8, Q2DPT_N_LEVELS * sizeof(int8_t)); + q2dpt_set_levels(levels_i8); + + free(bin_sum_w); + free(bin_sum_wt); +} + +void dequantize_row_q2_dpt(const block_q2_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { + assert(k % QK2_DPT == 0); + const int64_t nb = k / QK2_DPT; + const int8_t * values = (const int8_t *)levels; + GGML_ASSERT(values != NULL && "Q2_DPT levels not set for tensor"); + + for (int i = 0; i < nb; i++) { + const uint8_t * qs = x[i].qs; + const float d = GGML_FP16_TO_FP32(x[i].d); + for (int j = 0; j < QK2_DPT/4; ++j) { + uint8_t q = qs[j]; + y[j*4 + 0] = d * (float)values[(q >> 0) & 3]; + y[j*4 + 1] = d * (float)values[(q >> 2) & 3]; + y[j*4 + 2] = d * (float)values[(q >> 4) & 3]; + y[j*4 + 3] = d * (float)values[(q >> 6) & 3]; + } + y += QK2_DPT; + } +} + +// Strategy bitmask for quantize_block_q2_dpt (for A/B testing). +// Bit 0: level-anchor CD (approach A) +// Bit 1: boundary sweep+CD (approach B) +// Bit 2: dual-extreme CD (approach C: max_val AND min_val anchors) +// Bit 3: element-anchor CD (approach D: every xb[j]/values[k] as anchor) +// Bit 4: brute-force monotone partition (approach E: exhaustive search, O(n^3) per block) +static int q2dpt_quant_strategy = 0x3; // default: A+B + +void q2dpt_set_quant_strategy(int s) { q2dpt_quant_strategy = s; } + +// Refine d via iterated CD until convergence. Returns best d. 
+static float q2dpt_cd_refine(const float * GGML_RESTRICT xb, const float * qw, + const int8_t * values, float d) { + for (int iter = 0; iter < 8; ++iter) { + float id = (fabsf(d) > 1e-20f) ? 1.0f / d : 0.0f; + float sumqx = 0.0f, sumq2 = 0.0f; + for (int j = 0; j < QK2_DPT; ++j) { + float al = id * xb[j]; + int bk = 0; + float bd = fabsf(al - (float)values[0]); + for (int m = 1; m < Q2DPT_N_LEVELS; ++m) { + float dist = fabsf(al - (float)values[m]); + if (dist < bd) { bd = dist; bk = m; } + } + float q = (float)values[bk]; + float w = qw ? qw[j] : 1.0f; + sumqx += w * q * xb[j]; + sumq2 += w * q * q; + } + if (sumq2 < 1e-20f) break; + float d_new = sumqx / sumq2; + if (fabsf(d_new - d) < 1e-8f * fabsf(d)) break; + d = d_new; + } + return d; +} + +// Evaluate a candidate d: returns objective, fills L[]. +static float q2dpt_eval(const float * GGML_RESTRICT xb, const float * qw, + const int8_t * values, float d, uint8_t * L) { + float id = (fabsf(d) > 1e-20f) ? 1.0f / d : 0.0f; + float sumqx = 0.0f, sumq2 = 0.0f; + for (int j = 0; j < QK2_DPT; ++j) { + float al = id * xb[j]; + int bk = 0; + float bd = fabsf(al - (float)values[0]); + for (int m = 1; m < Q2DPT_N_LEVELS; ++m) { + float dist = fabsf(al - (float)values[m]); + if (dist < bd) { bd = dist; bk = m; } + } + L[j] = (uint8_t)bk; + float q = (float)values[bk]; + float w = qw ? qw[j] : 1.0f; + sumqx += w * q * xb[j]; + sumq2 += w * q * q; + } + if (sumq2 < 1e-20f) return -1e30f; + return (sumqx / sumq2) * sumqx; +} + +// Helper: try a single starting d, refine via CD, update best if improved. 
+static inline void q2dpt_try_d(const float * GGML_RESTRICT xb, const float * qw, + const int8_t * values, float d_init, + float * best, float * best_d, uint8_t * best_L) { + uint8_t L[QK2_DPT]; + float d = q2dpt_cd_refine(xb, qw, values, d_init); + float score = q2dpt_eval(xb, qw, values, d, L); + if (score > *best) { + *best = score; *best_d = d; + memcpy(best_L, L, QK2_DPT); + } +} + +// Quantize one 32-element block using 4 int8 levels and optimal per-block scale. +// The q2dpt_quant_strategy bitmask selects which search approaches are used: +// Bit 0 (A): level anchors + CD (d = max_val / values[k] for each k) +// Bit 1 (B): boundary sweep + CD (d = xb[j] / boundary for each j and boundary) +// Bit 2 (C): dual-extreme anchors + CD (A using both max_val AND min_val) +// Bit 3 (D): element-anchor scan + CD (d = xb[j] / values[k] for each j, k) +// Bit 4 (E): brute-force monotone partition (exhaustive over all C(35,3) partitions) +static void quantize_block_q2_dpt(const float * GGML_RESTRICT xb, block_q2_dpt * GGML_RESTRICT out, + const int8_t * values, const float * qw, int ntry) { + (void)ntry; + const int strat = q2dpt_quant_strategy; + + float amax = 0.0f, max_val = 0.0f, min_val = 0.0f; + for (int j = 0; j < QK2_DPT; ++j) { + float ax = fabsf(xb[j]); + if (ax > amax) { amax = ax; max_val = xb[j]; } + if (xb[j] < min_val) min_val = xb[j]; + if (xb[j] > max_val) max_val = xb[j]; + } + if (amax < 1e-10f) { + out->d = 0; + memset(out->qs, 0, QK2_DPT/4); + return; + } + + uint8_t best_L[QK2_DPT]; + float best = -1e30f; + float best_d = 0.0f; + + // --- A: level-anchor CD (4 starting points) --- + if (strat & 0x1) { + for (int k = 0; k < Q2DPT_N_LEVELS; ++k) { + if (values[k] == 0) continue; + q2dpt_try_d(xb, qw, values, max_val / (float)values[k], + &best, &best_d, best_L); + } + } + + // --- B: boundary-crossing sweep + CD --- + if (strat & 0x2) { + for (int b = 0; b < Q2DPT_N_LEVELS - 1; ++b) { + float bnd = ((float)values[b] + (float)values[b + 1]) * 0.5f; + 
if (fabsf(bnd) < 0.5f) continue; + for (int j = 0; j < QK2_DPT; ++j) { + if (fabsf(xb[j]) < 1e-12f) continue; + q2dpt_try_d(xb, qw, values, xb[j] / bnd, + &best, &best_d, best_L); + } + } + } + + // --- C: dual-extreme anchors + CD (8 starting points) --- + if (strat & 0x4) { + float extremes[2] = { max_val, min_val }; + for (int e = 0; e < 2; ++e) { + for (int k = 0; k < Q2DPT_N_LEVELS; ++k) { + if (values[k] == 0) continue; + q2dpt_try_d(xb, qw, values, extremes[e] / (float)values[k], + &best, &best_d, best_L); + } + } + } + + // --- D: element-anchor scan + CD (32 x 4 starting points) --- + if (strat & 0x8) { + for (int j = 0; j < QK2_DPT; ++j) { + if (fabsf(xb[j]) < 1e-12f) continue; + for (int k = 0; k < Q2DPT_N_LEVELS; ++k) { + if (values[k] == 0) continue; + q2dpt_try_d(xb, qw, values, xb[j] / (float)values[k], + &best, &best_d, best_L); + } + } + } + + // --- E: brute-force monotone partition enumeration --- + // For a single scale d, the optimal assignment must be monotone on sorted x: + // if x_i < x_j then L[i] <= L[j] (for d>0) or L[i] >= L[j] (for d<0). + // We enumerate all C(32+3, 3) = C(35,3) = 6545 ways to partition 32 sorted + // elements into 4 groups, score each in O(1) using prefix sums, then pick best. + if (strat & 0x10) { + // Sort elements by value, keeping original indices + int idx[QK2_DPT]; + for (int j = 0; j < QK2_DPT; ++j) idx[j] = j; + // Simple insertion sort (only 32 elements) + for (int i = 1; i < QK2_DPT; ++i) { + int t = idx[i]; + float tv = xb[t]; + int j = i - 1; + while (j >= 0 && xb[idx[j]] > tv) { + idx[j + 1] = idx[j]; + --j; + } + idx[j + 1] = t; + } + + // Build weighted prefix sums: swx[i] = sum_{j0 maps sorted ascending to values ascending, + // d<0 maps sorted ascending to values descending. 
+ for (int flip = 0; flip < 2; ++flip) { + float v[Q2DPT_N_LEVELS]; + if (flip == 0) { + for (int k = 0; k < Q2DPT_N_LEVELS; ++k) v[k] = (float)values[k]; + } else { + for (int k = 0; k < Q2DPT_N_LEVELS; ++k) v[k] = (float)values[Q2DPT_N_LEVELS - 1 - k]; + } + + // Precompute per-level pair products for scoring + float vv[Q2DPT_N_LEVELS]; + for (int k = 0; k < Q2DPT_N_LEVELS; ++k) vv[k] = v[k] * v[k]; + + float bf_best = -1e30f; + int bf_b1 = 0, bf_b2 = 0, bf_b3 = 0; + + // Enumerate partition boundaries b1 <= b2 <= b3 where group k = [b_k, b_{k+1}) + // b0=0, b4=32. 0 <= b1 <= b2 <= b3 <= 32. + for (int b1 = 0; b1 <= QK2_DPT; ++b1) { + // Segment 0: indices [0, b1) assigned to v[0] + float s0_wx = swx[b1] - swx[0]; + float s0_w = sw[b1] - sw[0]; + for (int b2 = b1; b2 <= QK2_DPT; ++b2) { + // Segment 1: indices [b1, b2) assigned to v[1] + float s01_wx = s0_wx + (swx[b2] - swx[b1]); + float s01_w = s0_w + (sw[b2] - sw[b1]); + // sumqx and sumq2 for segments 0+1 can be expressed but we need all 4. + // For efficiency, compute incrementally for b3. + float partial_sumqx = v[0] * s0_wx + v[1] * (swx[b2] - swx[b1]); + float partial_sumq2 = vv[0] * s0_w + vv[1] * (sw[b2] - sw[b1]); + (void)s01_wx; (void)s01_w; + for (int b3 = b2; b3 <= QK2_DPT; ++b3) { + // Segment 2: [b2, b3) -> v[2], Segment 3: [b3, 32) -> v[3] + float seg2_wx = swx[b3] - swx[b2]; + float seg2_w = sw[b3] - sw[b2]; + float seg3_wx = swx[QK2_DPT] - swx[b3]; + float seg3_w = sw[QK2_DPT] - sw[b3]; + + float sumqx = partial_sumqx + v[2] * seg2_wx + v[3] * seg3_wx; + float sumq2 = partial_sumq2 + vv[2] * seg2_w + vv[3] * seg3_w; + + if (sumq2 < 1e-20f) continue; + // score = d * sumqx = sumqx^2 / sumq2 (only valid when sumqx > 0) + float score = sumqx * sumqx / sumq2; + // d = sumqx / sumq2; for validity we need d > 0 when flip==0, d < 0 when flip==1 + // i.e. 
sumqx > 0 for flip==0, sumqx < 0 for flip==1 + if (flip == 0 && sumqx <= 0.0f) continue; + if (flip == 1 && sumqx >= 0.0f) continue; + if (score > bf_best) { + bf_best = score; + bf_b1 = b1; bf_b2 = b2; bf_b3 = b3; + } + } + } + } + + if (bf_best > -1e29f) { + // Reconstruct the assignment for the best partition + uint8_t L_bf[QK2_DPT]; + for (int j = 0; j < bf_b1; ++j) { + int orig = idx[j]; + L_bf[orig] = flip == 0 ? 0 : (Q2DPT_N_LEVELS - 1); + } + for (int j = bf_b1; j < bf_b2; ++j) { + int orig = idx[j]; + L_bf[orig] = flip == 0 ? 1 : (Q2DPT_N_LEVELS - 2); + } + for (int j = bf_b2; j < bf_b3; ++j) { + int orig = idx[j]; + L_bf[orig] = flip == 0 ? 2 : (Q2DPT_N_LEVELS - 1 - 2); + } + for (int j = bf_b3; j < QK2_DPT; ++j) { + int orig = idx[j]; + L_bf[orig] = flip == 0 ? 3 : 0; + } + // Compute d from this assignment + float sumqx = 0.0f, sumq2 = 0.0f; + for (int j = 0; j < QK2_DPT; ++j) { + float q = (float)values[L_bf[j]]; + float w = qw ? qw[j] : 1.0f; + sumqx += w * q * xb[j]; + sumq2 += w * q * q; + } + if (sumq2 > 1e-20f) { + float d_bf = sumqx / sumq2; + float score = (sumqx / sumq2) * sumqx; + if (score > best) { + best = score; + best_d = d_bf; + memcpy(best_L, L_bf, QK2_DPT); + } + } + } + } + } + + // Final re-assignment with best scale + float id = (fabsf(best_d) > 1e-20f) ? 
1.0f / best_d : 0.0f; + for (int j = 0; j < QK2_DPT; ++j) { + float al = id * xb[j]; + int bk = 0; + float bd = fabsf(al - (float)values[0]); + for (int k = 1; k < Q2DPT_N_LEVELS; ++k) { + float dist = fabsf(al - (float)values[k]); + if (dist < bd) { bd = dist; bk = k; } + } + best_L[j] = (uint8_t)bk; + } + + // Pack 2-bit indices: 4 values per byte + out->d = GGML_FP32_TO_FP16(best_d); + for (int j = 0; j < QK2_DPT/4; ++j) { + out->qs[j] = best_L[j*4] | (best_L[j*4+1] << 2) | (best_L[j*4+2] << 4) | (best_L[j*4+3] << 6); + } +} + +size_t quantize_q2_dpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, + int64_t nrow, int64_t n_per_row, const float * quant_weights) { + GGML_ASSERT(n_per_row % QK2_DPT == 0); + const int8_t * values = q2dpt_get_levels(); + GGML_ASSERT(values != NULL && "Q2_DPT levels not set - call q2dpt_set_levels() first"); + + const int64_t nblock = n_per_row / QK2_DPT; + char * qrow = (char *) dst; + + for (int64_t row = 0; row < nrow; ++row) { + block_q2_dpt * q2 = (block_q2_dpt *) qrow; + for (int64_t ibl = 0; ibl < nblock; ++ibl) { + const float * qw = quant_weights ? quant_weights + QK2_DPT * ibl : NULL; + quantize_block_q2_dpt(src + QK2_DPT * ibl, &q2[ibl], values, qw, 7); + } + src += n_per_row; + qrow += nblock * sizeof(block_q2_dpt); + } + return (size_t) nrow * nblock * sizeof(block_q2_dpt); +} + +void quantize_row_q2_dpt_ref(const float * GGML_RESTRICT x, block_q2_dpt * GGML_RESTRICT y, int64_t k) { + assert(k % QK2_DPT == 0); + quantize_q2_dpt(x, y, 1, k, NULL); +} + +// ====================== Q2_KPT: Q2_K with learned per-tensor float levels ====================== + +static float q2kpt_levels[Q2KPT_N_LEVELS]; +static bool q2kpt_levels_set = false; + +void q2kpt_set_levels(const float * levels) { + memcpy(q2kpt_levels, levels, Q2KPT_N_LEVELS * sizeof(float)); + q2kpt_levels_set = true; +} + +const float * q2kpt_get_levels(void) { + return q2kpt_levels_set ? 
q2kpt_levels : NULL; +} + +void q2kpt_free_levels(void) { + q2kpt_levels_set = false; +} + +// Train 4 Lloyd-Max float levels from tensor data for Q2_KPT. +// Uses Q2_K-style scale+min estimation to normalize sub-block values to [0,1]. +GGML_API void q2kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, float levels_out[Q2KPT_N_LEVELS]) { + const int N_BINS = 8192; + const float bin_width = 1.0f / N_BINS; + float * bin_sum_w = (float *) calloc(N_BINS, sizeof(float)); + float * bin_sum_wt = (float *) calloc(N_BINS, sizeof(float)); + GGML_ASSERT(bin_sum_w && bin_sum_wt); + + const int nb = (int)(n_per_row / QK_K); + + // Single pass: for each 16-element sub-block, estimate scale+min, normalize to [0,1], bin + for (int64_t row = 0; row < nrow; ++row) { + const float * xrow = data + row * n_per_row; + + for (int i = 0; i < nb; i++) { + const float * x = xrow + i * QK_K; + + for (int j = 0; j < QK_K / 16; ++j) { + // Find min and max of sub-block + float xmin = x[16 * j], xmax = x[16 * j]; + for (int l = 1; l < 16; ++l) { + if (x[16 * j + l] < xmin) xmin = x[16 * j + l]; + if (x[16 * j + l] > xmax) xmax = x[16 * j + l]; + } + // Q2_K clamps min to <= 0 + if (xmin > 0) xmin = 0; + float range = xmax - xmin; + if (range < 1e-10f) continue; + + float inv_range = 1.0f / range; + + for (int l = 0; l < 16; ++l) { + // Normalize to [0, 1]: t = (x - min) / range + float t = (x[16 * j + l] - xmin) * inv_range; + if (t < 0.0f) t = 0.0f; + if (t > 1.0f) t = 1.0f; + + int bin_idx = (int)(t * N_BINS); + if (bin_idx >= N_BINS) bin_idx = N_BINS - 1; + + int elem = i * QK_K + 16 * j + l; + float w = imatrix ? 
imatrix[elem] : 1.0f; + if (w < 1e-10f) w = 1e-10f; + // Weight by range² (like Q3_KPT weights by d²) + w *= range * range; + + bin_sum_w[bin_idx] += w; + bin_sum_wt[bin_idx] += w * t; + } + } + } + } + + // Initialize 4 levels uniformly in [0, 1] + float levels[Q2KPT_N_LEVELS]; + for (int k = 0; k < Q2KPT_N_LEVELS; ++k) { + levels[k] = (float)k / (Q2KPT_N_LEVELS - 1); + } + + // Lloyd-Max iterations on bins + for (int iter = 0; iter < 100; ++iter) { + float sum_w[Q2KPT_N_LEVELS] = { 0 }; + float sum_wt[Q2KPT_N_LEVELS] = { 0 }; + + for (int b = 0; b < N_BINS; ++b) { + if (bin_sum_w[b] < 1e-12f) continue; + const float t = (b + 0.5f) * bin_width; + int best = 0; + float bd2 = (t - levels[0]) * (t - levels[0]); + for (int k = 1; k < Q2KPT_N_LEVELS; ++k) { + float d2 = (t - levels[k]) * (t - levels[k]); + if (d2 < bd2) { bd2 = d2; best = k; } + } + sum_w[best] += bin_sum_w[b]; + sum_wt[best] += bin_sum_wt[b]; + } + + float max_delta = 0.0f; + for (int k = 0; k < Q2KPT_N_LEVELS; ++k) { + if (sum_w[k] > 1e-12f) { + float new_level = sum_wt[k] / sum_w[k]; + max_delta = fmaxf(max_delta, fabsf(new_level - levels[k])); + levels[k] = new_level; + } + } + if (max_delta < 1e-10f) break; + + // Sort levels + for (int k = 1; k < Q2KPT_N_LEVELS; ++k) { + float v = levels[k]; + int m = k - 1; + while (m >= 0 && levels[m] > v) { + levels[m + 1] = levels[m]; + m--; + } + levels[m + 1] = v; + } + } + + memcpy(levels_out, levels, Q2KPT_N_LEVELS * sizeof(float)); + q2kpt_set_levels(levels); + free(bin_sum_w); + free(bin_sum_wt); +} + +void dequantize_row_q2_kpt(const block_q2_kpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + const float * lv = (const float *)levels; + GGML_ASSERT(lv != NULL && "Q2_KPT levels not set for tensor"); + + // Precompute mapped levels: ml[k] = levels[k] * 3.0 + float ml[Q2KPT_N_LEVELS]; + for (int i = 0; i < Q2KPT_N_LEVELS; ++i) { + ml[i] = lv[i] * 3.0f; + } + + for (int 
i = 0; i < nb; i++) { + const float d_all = GGML_FP16_TO_FP32(x[i].d); + const float m_all = GGML_FP16_TO_FP32(x[i].dmin); + const uint8_t * q = x[i].qs; + + int is = 0; + for (int n = 0; n < QK_K; n += 128) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + uint8_t sc = x[i].scales[is++]; + float dl = d_all * (sc & 0xF); + float mn = m_all * (sc >> 4); + for (int l = 0; l < 16; ++l) { + int idx = (q[l] >> shift) & 3; + *y++ = dl * ml[idx] - mn; + } + + sc = x[i].scales[is++]; + dl = d_all * (sc & 0xF); + mn = m_all * (sc >> 4); + for (int l = 0; l < 16; ++l) { + int idx = (q[l + 16] >> shift) & 3; + *y++ = dl * ml[idx] - mn; + } + + shift += 2; + } + q += 32; + } + } +} + +// Helper: find optimal (scale, min) for non-uniform mapped levels with offset. +// mapped_levels[k] = levels[k]*3, k=0..3. +// Model: x[i] ≈ scale * ml[L[i]] - min_offset, with min_offset >= 0. +// Returns the per-sub-block scale; *the_min receives the min offset. +// L[i] gets the best level index [0..3]. +static float make_q2kpt_quants(int n, + const float * GGML_RESTRICT x, + uint8_t * GGML_RESTRICT L, + float * GGML_RESTRICT the_min, + const float * mapped_levels, + const float * weight) { + // Precompute boundaries for nearest-level assignment + float bounds[Q2KPT_N_LEVELS - 1]; + for (int k = 0; k < Q2KPT_N_LEVELS - 1; ++k) { + bounds[k] = 0.5f * (mapped_levels[k] + mapped_levels[k + 1]); + } + + float xmin = x[0], xmax = x[0]; + for (int i = 1; i < n; ++i) { + if (x[i] < xmin) xmin = x[i]; + if (x[i] > xmax) xmax = x[i]; + } + if (xmin > 0) xmin = 0; + if (xmax <= xmin) { + for (int i = 0; i < n; ++i) L[i] = 0; + *the_min = -xmin; + return 0.f; + } + + float ml_max = mapped_levels[Q2KPT_N_LEVELS - 1]; + + float best_scale = 0, best_min = 0; + float best_obj = 0; + bool first = true; + + // Try multiple trial scales (perturbations around initial estimate) + for (int is = -9; is <= 36; ++is) { + float iscale = (ml_max + 0.1f * is) / (xmax - xmin); + float trial_min = -xmin; + + // Assign 
each element to nearest mapped level + float sum_l = 0, sum_l2 = 0, sum_lx = 0; + float sum_x = 0, sum_w = 0; + for (int i = 0; i < n; ++i) { + float scaled = iscale * (x[i] + trial_min); + // Nearest level assignment + int best_k = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]); + float ml_val = mapped_levels[best_k]; + float w = weight ? weight[i] : x[i] * x[i]; + sum_l += w * ml_val; + sum_l2 += w * ml_val * ml_val; + sum_lx += w * ml_val * x[i]; + sum_x += w * x[i]; + sum_w += w; + } + + // 2D least-squares: x[i] ≈ A * ml[L[i]] + B + // Normal equations: [sum_l2, sum_l; sum_l, sum_w] [A; B] = [sum_lx; sum_x] + float D = sum_w * sum_l2 - sum_l * sum_l; + if (D > 0) { + float this_scale = (sum_w * sum_lx - sum_x * sum_l) / D; + float this_min = (sum_l2 * sum_x - sum_l * sum_lx) / D; + + // min_offset = -B, must be >= 0 (i.e. B <= 0) + if (this_min > 0) { + this_min = 0; + this_scale = sum_lx / sum_l2; + } + + float cur_error = 0; + for (int i = 0; i < n; ++i) { + float scaled = (x[i] - this_min) / (this_scale > 1e-15f ? this_scale : 1e-15f); + int best_k = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]); + float diff = this_scale * mapped_levels[best_k] + this_min - x[i]; + float w = weight ? weight[i] : x[i] * x[i]; + cur_error += w * diff * diff; + } + + if (first || cur_error < best_obj) { + best_obj = cur_error; + best_scale = this_scale; + best_min = this_min; + first = false; + + // Store assignment + for (int i = 0; i < n; ++i) { + float scaled = (x[i] - best_min) / (best_scale > 1e-15f ? 
best_scale : 1e-15f); + int best_k = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]); + L[i] = best_k; + } + } + } + } + + *the_min = -best_min; // Store as positive offset (Q2_K convention: ml = dmin * min_q) + return best_scale; +} + +static void quantize_row_q2_kpt_impl(const float * GGML_RESTRICT x, + block_q2_kpt * GGML_RESTRICT y, + int64_t n_per_row, + const float * GGML_RESTRICT quant_weights) { + assert(n_per_row % QK_K == 0); + const int nb = n_per_row / QK_K; + const float * levels = q2kpt_get_levels(); + GGML_ASSERT(levels != NULL && "Q2_KPT levels not set - call q2kpt_set_levels() first"); + + // Precompute mapped levels: ml[k] = levels[k] * 3.0 + float mapped_levels[Q2KPT_N_LEVELS]; + for (int k = 0; k < Q2KPT_N_LEVELS; ++k) { + mapped_levels[k] = levels[k] * 3.0f; + } + + // Precompute boundaries for nearest-level assignment + float bounds[Q2KPT_N_LEVELS - 1]; + for (int k = 0; k < Q2KPT_N_LEVELS - 1; ++k) { + bounds[k] = 0.5f * (mapped_levels[k] + mapped_levels[k + 1]); + } + + uint8_t L[QK_K]; + float mins[QK_K/16]; + float scales[QK_K/16]; + float sw[QK_K/16]; + float weight[16]; + uint8_t Ls[QK_K/16], Lm[QK_K/16]; + + for (int i = 0; i < nb; i++) { + memset(sw, 0, QK_K/16*sizeof(float)); + float sumx2 = 0; + for (int j = 0; j < QK_K; ++j) sumx2 += x[j] * x[j]; + float sigma2 = sumx2 / QK_K; + + // First pass: find per-sub-block scales optimized for mapped levels + for (int j = 0; j < QK_K/16; ++j) { + if (quant_weights) { + const float * qw = quant_weights + QK_K * i + 16 * j; + for (int l = 0; l < 16; ++l) + weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l] * x[16*j + l]); + } else { + for (int l = 0; l < 16; ++l) + weight[l] = x[16*j + l] * x[16*j + l]; + } + for (int l = 0; l < 16; ++l) sw[j] += weight[l]; + + scales[j] = make_q2kpt_quants(16, x + 16*j, L + 16*j, &mins[j], + mapped_levels, weight); + } + + // Two-tier scale quantization (identical to Q2_K): + // Quantize scales [0..15] and mins [0..15] separately using 
make_qp_quants + float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw); + float mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw); + + y[i].d = GGML_FP32_TO_FP16(dm); + y[i].dmin = GGML_FP32_TO_FP16(mm); + dm = GGML_FP16_TO_FP32(y[i].d); + mm = GGML_FP16_TO_FP32(y[i].dmin); + + for (int j = 0; j < QK_K/16; ++j) { + y[i].scales[j] = Ls[j] | (Lm[j] << 4); + } + + // Second pass: re-assign with quantized scales using nearest mapped level + for (int j = 0; j < QK_K/16; ++j) { + const float d = dm * (y[i].scales[j] & 0xF); + if (!d) { + // Assign to level closest to 0 for zero-scale sub-blocks + int zero_k = 0; + float zero_d = fabsf(mapped_levels[0]); + for (int k = 1; k < Q2KPT_N_LEVELS; ++k) { + if (fabsf(mapped_levels[k]) < zero_d) { + zero_d = fabsf(mapped_levels[k]); + zero_k = k; + } + } + for (int ii = 0; ii < 16; ++ii) L[16*j + ii] = zero_k; + continue; + } + const float m = mm * (y[i].scales[j] >> 4); + for (int ii = 0; ii < 16; ++ii) { + float scaled = (x[16*j + ii] + m) / d; + // Nearest mapped level assignment + int best_k = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]); + L[16*j + ii] = best_k; + } + } + + // Pack 2-bit indices (same layout as Q2_K) + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) + | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); + } + } + + x += QK_K; + } +} + +size_t quantize_q2_kpt(const float * GGML_RESTRICT src, + void * GGML_RESTRICT dst, + int64_t nrow, + int64_t n_per_row, + const float * imatrix) { + size_t row_size = ggml_row_size(GGML_TYPE_Q2_KPT, n_per_row); + char * qrow = (char *) dst; + for (int64_t row = 0; row < nrow; ++row) { + quantize_row_q2_kpt_impl(src, (block_q2_kpt *) qrow, n_per_row, imatrix); + src += n_per_row; + qrow += row_size; + } + return nrow * row_size; +} + +void quantize_row_q2_kpt_ref(const float * GGML_RESTRICT x, block_q2_kpt * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + quantize_q2_kpt(x, y, 1, 
k, NULL); +} + // Global levels (used during quantization for the current tensor) static float q3pt_levels[Q3PT_N_LEVELS]; static bool q3pt_levels_set = false; @@ -4853,7 +5736,6 @@ static bool q3pt_levels_set = false; void q3pt_set_levels(const float * levels) { memcpy(q3pt_levels, levels, Q3PT_N_LEVELS * sizeof(float)); q3pt_levels_set = true; - ggml_quant_set_current_levels(GGML_TYPE_Q3_PT, q3pt_levels); } const float * q3pt_get_levels(void) { @@ -4996,10 +5878,10 @@ static inline void q3pt_pack3(uint8_t * GGML_RESTRICT qs, int k, int v) { if (off > 5) { qs[byte+1] |= (uint8_t)((v & 0x7) >> (8 - off)); } } -void dequantize_row_q3_pt(const block_q3_pt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q3_pt(const block_q3_pt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int nb = k / QK_K; - const float * L = (const float *)ggml_quant_get_current_levels(GGML_TYPE_Q3_PT); + const float * L = (const float *)levels; GGML_ASSERT(L != NULL && "Q3_PT levels not set for tensor"); for (int i = 0; i < nb; i++) { @@ -6531,6 +7413,14 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte { VALIDATE_ROW_DATA_D_F16_IMPL(block_q4_dpt, data, nb); } break; + case GGML_TYPE_Q2_DPT: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_q2_dpt, data, nb); + } break; + case GGML_TYPE_Q2_KPT: + { + VALIDATE_ROW_DATA_DM_F16_IMPL(block_q2_kpt, data, nb, d, dmin); + } break; case GGML_TYPE_I8: case GGML_TYPE_I16: @@ -6548,49 +7438,4 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte return true; } -// Unified per-type current-levels pointer (set by CPU MUL_MAT dispatch from graph input) -static const void * ggml_quant_current_levels[GGML_TYPE_COUNT] = { NULL }; - -void ggml_quant_set_current_levels(enum ggml_type type, const void * data) { - ggml_quant_current_levels[type] = data; -} - -const void * ggml_quant_get_current_levels(enum ggml_type type) { - 
return ggml_quant_current_levels[type]; -} - -// Per-tensor auxiliary data registry (indexed by tensor struct pointer) -// Allows backends to look up aux data for a specific weight tensor -// Key is tensor struct pointer (stable), value is aux data pointer -#define GGML_QUANT_AUX_HASH_SIZE 1024 - -static struct { - const void * tensor_ptr; // ggml_tensor struct pointer (key) - const void * aux_data; // aux data pointer (levels, kvalues, etc.) - size_t aux_size; // aux data size in bytes -} ggml_quant_aux_registry[GGML_QUANT_AUX_HASH_SIZE]; - -static size_t ggml_quant_aux_hash(const void * ptr) { - return (size_t)((uintptr_t)ptr >> 4) % GGML_QUANT_AUX_HASH_SIZE; -} - -void ggml_quant_set_tensor_aux_data(const void * tensor_ptr, const void * aux_data, size_t aux_size) { - size_t h = ggml_quant_aux_hash(tensor_ptr); - ggml_quant_aux_registry[h].tensor_ptr = tensor_ptr; - ggml_quant_aux_registry[h].aux_data = aux_data; - ggml_quant_aux_registry[h].aux_size = aux_size; -} - -const void * ggml_quant_get_tensor_aux_data(const void * tensor_ptr, size_t * out_size) { - size_t h = ggml_quant_aux_hash(tensor_ptr); - if (ggml_quant_aux_registry[h].tensor_ptr == tensor_ptr) { - if (out_size) *out_size = ggml_quant_aux_registry[h].aux_size; - return ggml_quant_aux_registry[h].aux_data; - } - return NULL; -} - -void ggml_quant_clear_aux_registry(void) { - memset(ggml_quant_aux_registry, 0, sizeof(ggml_quant_aux_registry)); -} diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 35fb38c6496..e1b1ac375c5 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -41,35 +41,35 @@ GGML_API void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_ GGML_API void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k); // Dequantization -GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT 
x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -//GGML_API void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); - -GGML_API void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); - -GGML_API void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_q3_kpt(const block_q3_kpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); - -GGML_API void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); - -GGML_API void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * 
GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +//GGML_API void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); + +GGML_API void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); + +GGML_API void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_q3_kpt(const block_q3_kpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, 
const void * levels); +GGML_API void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); + +GGML_API void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); + +GGML_API void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); // Quantization utilizing an importance matrix (a.k.a. 
"Activation aWare Quantization") GGML_API size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); @@ -107,7 +107,7 @@ GGML_API size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTR GGML_API size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API void quantize_row_q3_pt_ref(const float * GGML_RESTRICT x, block_q3_pt * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_q3_pt(const block_q3_pt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q3_pt(const block_q3_pt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); GGML_API size_t quantize_q3_pt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); // Q3_PT levels management (per-tensor Lloyd-Max levels in [0,1]) @@ -124,7 +124,7 @@ GGML_API void q3pt_train_levels(const float * data, int64_t nrow, int64 const float * imatrix, float levels_out[8]); // Q4_DPT: IQ4_NL with learned per-tensor int8 levels -GGML_API void dequantize_row_q4_dpt(const block_q4_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q4_dpt(const block_q4_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); GGML_API void quantize_row_q4_dpt_ref(const float * GGML_RESTRICT x, block_q4_dpt * GGML_RESTRICT y, int64_t k); GGML_API size_t quantize_q4_dpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); @@ -133,15 +133,38 @@ GGML_API void q4dpt_set_levels(const int8_t * levels); GGML_API const int8_t * q4dpt_get_levels(void); GGML_API void q4dpt_free_levels(void); -// Unified per-type current-levels pointer (set by CPU MUL_MAT dispatch from graph input src[2]) -GGML_API void 
ggml_quant_set_current_levels(enum ggml_type type, const void * data); -GGML_API const void * ggml_quant_get_current_levels(enum ggml_type type); - -// Per-tensor auxiliary data registry (indexed by tensor pointer) -// Allows backends to look up aux data for a specific weight tensor -GGML_API void ggml_quant_set_tensor_aux_data(const void * tensor_data, const void * aux_data, size_t aux_size); -GGML_API const void * ggml_quant_get_tensor_aux_data(const void * tensor_data, size_t * out_size); -GGML_API void ggml_quant_clear_aux_registry(void); +// Q2_DPT: 2-bit with learned per-tensor int8 levels +GGML_API void dequantize_row_q2_dpt(const block_q2_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void quantize_row_q2_dpt_ref(const float * GGML_RESTRICT x, block_q2_dpt * GGML_RESTRICT y, int64_t k); +GGML_API size_t quantize_q2_dpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); + +// Q2_DPT levels management (per-tensor Lloyd-Max int8 levels) +GGML_API void q2dpt_set_levels(const int8_t * levels); +GGML_API const int8_t * q2dpt_get_levels(void); +GGML_API void q2dpt_free_levels(void); +GGML_API void q2dpt_set_quant_strategy(int s); + +// Train 4 Lloyd-Max int8 levels from tensor data for Q2_DPT. +// Bins normalized values (x/amax) in [-1,1], runs weighted k-means, rounds to sorted int8[4]. +// Also sets the global levels via q2dpt_set_levels(). 
+GGML_API void q2dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, int8_t levels_out[Q2DPT_N_LEVELS]); + +// Q2_KPT: Q2_K with learned per-tensor float levels +GGML_API void dequantize_row_q2_kpt(const block_q2_kpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void quantize_row_q2_kpt_ref(const float * GGML_RESTRICT x, block_q2_kpt * GGML_RESTRICT y, int64_t k); +GGML_API size_t quantize_q2_kpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); + +// Q2_KPT levels management (per-tensor float levels in [0,1]) +GGML_API void q2kpt_set_levels(const float * levels); +GGML_API const float * q2kpt_get_levels(void); +GGML_API void q2kpt_free_levels(void); + +// Train 4 Lloyd-Max float levels from tensor data for Q2_KPT. +// Bins normalized sub-block values in [0,1], runs weighted k-means for 4 centroids. +// Also sets the global levels via q2kpt_set_levels(). +GGML_API void q2kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, float levels_out[Q2KPT_N_LEVELS]); // Train 16 Lloyd-Max int8 levels from tensor data. // Bins normalized values (x/amax) in [-1,1], runs weighted k-means, rounds to sorted int8[16]. 
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 0fae68628b6..8cad75ec427 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -11917,7 +11917,7 @@ static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr); } -static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, ggml_type quant) { +static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, ggml_type quant, const void * levels = nullptr) { if (quant == GGML_TYPE_F32) { memcpy(to, from, sizeof(float) * ne); return; @@ -11927,7 +11927,7 @@ static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, gg ggml_to_float_t dequant_fn = tt->to_float; - dequant_fn(from, to, ne); + dequant_fn(from, to, ne, levels); } static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 630a2d6fdd5..56d9313e258 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -456,6 +456,11 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) { } } +static void ggml_fp16_to_fp32_row_leveled(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t n, const void * levels) { + GGML_UNUSED(levels); + ggml_fp16_to_fp32_row((const ggml_fp16_t *)x, y, n); +} + void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) { int i = 0; for (; i < n; ++i) { @@ -470,6 +475,11 @@ void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) { } } +static void ggml_bf16_to_fp32_row_leveled(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t n, const void * levels) { + GGML_UNUSED(levels); + ggml_bf16_to_fp32_row((const ggml_bf16_t *)x, y, n); +} + void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) { for (int i = 0; i < n; i++) { y[i] = ggml_compute_fp32_to_bf16(x[i]); 
@@ -648,7 +658,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .blck_size = 1, .type_size = sizeof(ggml_fp16_t), .is_quantized = false, - .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row, + .to_float = ggml_fp16_to_fp32_row_leveled, .from_float_ref = (ggml_from_float_t) ggml_fp32_to_fp16_row, }, [GGML_TYPE_Q4_0] = { @@ -841,7 +851,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .blck_size = 1, .type_size = sizeof(ggml_bf16_t), .is_quantized = false, - .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row, + .to_float = ggml_bf16_to_fp32_row_leveled, .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref, }, [31] = { // GGML_TYPE_Q4_0_4_4 @@ -920,6 +930,22 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_q4_dpt, .from_float_ref = (ggml_from_float_t) quantize_row_q4_dpt_ref, }, + [GGML_TYPE_Q2_DPT] = { + .type_name = "q2_dpt", + .blck_size = QK2_DPT, + .type_size = sizeof(block_q2_dpt), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q2_dpt, + .from_float_ref = (ggml_from_float_t) quantize_row_q2_dpt_ref, + }, + [GGML_TYPE_Q2_KPT] = { + .type_name = "q2_kpt", + .blck_size = QK_K, + .type_size = sizeof(block_q2_kpt), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q2_kpt, + .from_float_ref = (ggml_from_float_t) quantize_row_q2_kpt_ref, + }, }; const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) { @@ -1413,6 +1439,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_Q3_PT: wtype = GGML_TYPE_Q3_PT; break; case GGML_FTYPE_MOSTLY_Q3_KPT: wtype = GGML_TYPE_Q3_KPT; break; case GGML_FTYPE_MOSTLY_Q4_DPT: wtype = GGML_TYPE_Q4_DPT; break; + case GGML_FTYPE_MOSTLY_Q2_KPT: wtype = GGML_TYPE_Q2_KPT; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; } @@ -7557,9 
+7584,10 @@ void ggml_quantize_init(enum ggml_type type) { case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break; case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break; case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break; - case GGML_TYPE_Q3_PT: break; // levels set externally via q3pt_set_levels() - case GGML_TYPE_Q3_KPT: break; // levels set externally via q3kpt_set_levels() - case GGML_TYPE_Q4_DPT: break; // levels set externally via q4dpt_set_levels() + case GGML_TYPE_Q3_PT: break; // levels stored in tensor->quant_levels + case GGML_TYPE_Q3_KPT: break; // levels stored in tensor->quant_levels + case GGML_TYPE_Q4_DPT: break; // levels stored in tensor->quant_levels + case GGML_TYPE_Q2_KPT: break; // levels stored in tensor->quant_levels default: // nothing break; } @@ -7639,6 +7667,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_Q3_PT: result = quantize_q3_pt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q3_KPT: result = quantize_q3_kpt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q4_DPT: result = quantize_q4_dpt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q2_KPT: result = quantize_q2_kpt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); diff --git a/include/llama.h b/include/llama.h index 12d9d545302..dd705134bfa 100644 --- a/include/llama.h +++ b/include/llama.h @@ -155,6 +155,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q3_PT = 39, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q3_KPT = 40, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_DPT = 41, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q2_KPT = 42, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 104be97f651..b7a002c27ec 100644 --- 
a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -58,6 +58,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q3_PT: return "Q3_PT - 3.875 bpw"; case LLAMA_FTYPE_MOSTLY_Q3_KPT: return "Q3_KPT - Q3_K with learned levels"; case LLAMA_FTYPE_MOSTLY_Q4_DPT: return "Q4_DPT - IQ4_NL with learned levels"; + case LLAMA_FTYPE_MOSTLY_Q2_KPT: return "Q2_KPT - Q2_K with learned levels"; case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw"; @@ -716,6 +717,7 @@ llama_model_loader::llama_model_loader( case GGML_TYPE_Q3_PT: ftype = LLAMA_FTYPE_MOSTLY_Q3_PT; break; case GGML_TYPE_Q3_KPT: ftype = LLAMA_FTYPE_MOSTLY_Q3_KPT; break; case GGML_TYPE_Q4_DPT: ftype = LLAMA_FTYPE_MOSTLY_Q4_DPT; break; + case GGML_TYPE_Q2_KPT: ftype = LLAMA_FTYPE_MOSTLY_Q2_KPT; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 777ea85fc00..3012746cfb5 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -15,15 +15,6 @@ #include "models/models.h" -// Q3_PT/Q3_KPT/Q4_DPT: global fallback levels functions (defined in ggml-quants.c) -// These remain temporarily for the global fallback used by dequantize paths. -// The per-tensor registry functions are removed; per-tensor levels now use graph inputs.
-extern "C" { - void q3pt_set_levels(const float * levels); - void q3kpt_set_levels(const float * levels); - void q4dpt_set_levels(const int8_t * levels); - void ggml_quant_set_tensor_aux_data(const void * tensor_data, const void * aux_data, size_t aux_size); -} #include #include @@ -7882,6 +7873,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { { GGML_TYPE_Q3_PT, "q3_pt.levels", 8, sizeof(float) }, { GGML_TYPE_Q3_KPT, "q3_kpt.levels", 8, sizeof(float) }, { GGML_TYPE_Q4_DPT, "q4_dpt.levels", 16, sizeof(int8_t) }, + { GGML_TYPE_Q2_KPT, "q2_kpt.levels", 4, sizeof(float) }, }; for (const auto & lt : level_types) { @@ -7892,7 +7884,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const size_t lv_arr_n = gguf_get_arr_n(ml.meta.get(), lv_idx); size_t tensor_count = 0; - bool global_set = false; // Iterate over GGUF slots to find matching tensors for (size_t gguf_slot = 0; gguf_slot < lv_arr_n / lt.n_levels; ++gguf_slot) { @@ -7912,24 +7903,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) { lv_raw + gguf_offset * lt.elem_bytes, lv_raw + (gguf_offset + lt.n_levels) * lt.elem_bytes ); - aux.aux_tensor = nullptr; // Will be created during graph build + aux.aux_tensor = nullptr; - // Register in global registry for backend access - ggml_quant_set_tensor_aux_data(t, aux.host_data.data(), aux.host_data.size()); + // Set quant_levels directly on the tensor + t->quant_levels = aux.host_data.data(); tensor_count++; - - // Set the global fallback from the first tensor's levels - if (!global_set) { - if (lt.type == GGML_TYPE_Q4_DPT) { - q4dpt_set_levels((const int8_t *)(lv_raw + gguf_offset * lt.elem_bytes)); - } else if (lt.type == GGML_TYPE_Q3_PT) { - q3pt_set_levels((const float *)(lv_raw + gguf_offset * lt.elem_bytes)); - } else if (lt.type == GGML_TYPE_Q3_KPT) { - q3kpt_set_levels((const float *)(lv_raw + gguf_offset * lt.elem_bytes)); - } - global_set = true; - } } if (tensor_count > 0) { diff --git a/src/llama-quant.cpp 
b/src/llama-quant.cpp index cef8f18abb2..93b89952991 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -36,6 +36,13 @@ extern "C" { void q4dpt_set_levels(const int8_t * levels); } +// Q2_KPT levels functions (defined in ggml-quants.c) +extern "C" { + void q2kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, float levels_out[4]); + void q2kpt_set_levels(const float * levels); +} + // Quantization types. Changes to this struct must be replicated in quantize.cpp struct tensor_quantization { std::string name; @@ -151,7 +158,7 @@ static void llama_tensor_dequantize_impl( } else if (tensor->type == GGML_TYPE_BF16) { ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements); } else if (ggml_is_quantized(tensor->type)) { - qtype->to_float(tensor->data, f32_output, nelements); + qtype->to_float(tensor->data, f32_output, nelements, tensor->quant_levels); } else { GGML_ABORT("fatal error"); // unreachable } @@ -181,13 +188,14 @@ static void llama_tensor_dequantize_impl( size_t thr_elems = thr_blocks * block_size; // number of elements for this thread size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread - auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) { + const void * quant_levels = tensor->quant_levels; + auto compute = [qtype, quant_levels] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) { if (typ == GGML_TYPE_F16) { ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels); } else if (typ == GGML_TYPE_BF16) { ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels); } else { - qtype->to_float(inbuf, outbuf, nels); + qtype->to_float(inbuf, outbuf, nels, quant_levels); } }; workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems); @@ -280,6 +288,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == 
LLAMA_FTYPE_MOSTLY_Q4_DPT) { new_type = GGML_TYPE_IQ4_XS; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) { + new_type = GGML_TYPE_Q4_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) { new_type = GGML_TYPE_Q4_K; } @@ -309,7 +320,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } } } else if (name.find("attn_v.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) { new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) { @@ -372,6 +383,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); int i_layer = info.first, n_layer = info.second; if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K; } @@ -422,11 +434,12 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) { + ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT || + ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) { new_type = GGML_TYPE_Q5_K; } } else { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; + if (ftype == 
LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K; @@ -573,6 +586,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: case LLAMA_FTYPE_MOSTLY_Q3_PT: default_type = GGML_TYPE_Q3_PT; break; case LLAMA_FTYPE_MOSTLY_Q3_KPT: default_type = GGML_TYPE_Q3_KPT; break; case LLAMA_FTYPE_MOSTLY_Q4_DPT: default_type = GGML_TYPE_Q4_DPT; break; + case LLAMA_FTYPE_MOSTLY_Q2_KPT: default_type = GGML_TYPE_Q2_KPT; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -1031,6 +1045,83 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: LLAMA_LOG_INFO("%s: Q4_DPT pass 1 complete.\n", __func__); } + // Q2_KPT two-pass approach: train all per-tensor float levels BEFORE opening the output + static const size_t Q2KPT_N_LEVELS_SZ = 4; + std::vector q2kpt_all_levels; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT && !params->dry_run) { + LLAMA_LOG_INFO("%s: Q2_KPT pass 1: training per-tensor levels...\n", __func__); + q2kpt_all_levels.assign(tensors.size() * Q2KPT_N_LEVELS_SZ, 0.0f); + + std::vector> p1_read_data; + std::vector> p1_f32_buf; + std::vector p1_workers; + p1_workers.reserve(nthread); + + for (size_t ti = 0; ti < tensors.size(); ++ti) { + ggml_tensor * tensor = tensors[ti]->tensor; + const std::string tname = ggml_get_name(tensor); + + bool quantize = tname.rfind("weight") == tname.size() - 6; + quantize &= (ggml_n_dims(tensor) >= 2); + quantize &= tname.find("_norm.weight") == std::string::npos; + quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos; + if (!quantize) { continue; } + + ggml_type new_type = default_type; + if (!params->pure) { + new_type = 
llama_tensor_get_type(qs, new_type, tensor, ftype); + } + if (params->token_embedding_type < GGML_TYPE_COUNT && + (tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) { + new_type = params->token_embedding_type; + } + if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") { + new_type = params->output_tensor_type; + } + if (new_type != GGML_TYPE_Q2_KPT) { continue; } + + const size_t tsz = ggml_nbytes(tensor); + if (!ml.use_mmap) { + if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); } + tensor->data = p1_read_data.data(); + } + ml.load_data_for(tensor); + + const int64_t nelements = ggml_nelements(tensor); + float * f32_data; + if (tensor->type == GGML_TYPE_F32) { + f32_data = (float *) tensor->data; + } else { + llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread); + f32_data = (float *) p1_f32_buf.data(); + } + + const float * imatrix = nullptr; + if (imatrix_data) { + auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped)); + if (it2 != imatrix_data->end() && + it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) { + imatrix = it2->second.data(); + } + } + + const int64_t n_per_row = tensor->ne[0]; + const int64_t nrows = tensor->ne[1]; + + LLAMA_LOG_INFO("%s: Q2_KPT levels for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name); + q2kpt_train_levels(f32_data, nrows, n_per_row, imatrix, + q2kpt_all_levels.data() + ti * Q2KPT_N_LEVELS_SZ); + } + + for (auto & ctx : ctx_outs) { + if (ctx) { + gguf_set_arr_data(ctx.get(), "q2_kpt.levels", GGUF_TYPE_FLOAT32, + q2kpt_all_levels.data(), q2kpt_all_levels.size()); + } + } + LLAMA_LOG_INFO("%s: Q2_KPT pass 1 complete.\n", __func__); + } + // no output file for --dry-run if (!params->dry_run) { new_ofstream(0); @@ -1312,6 +1403,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: q4dpt_set_levels(q4dpt_all_levels.data() + tensor_pass2_idx * Q4DPT_N_LEVELS); } + // Q2_KPT: set the 
per-tensor levels (trained in pass 1) as global for quantization + if (new_type == GGML_TYPE_Q2_KPT) { + q2kpt_set_levels(q2kpt_all_levels.data() + tensor_pass2_idx * Q2KPT_N_LEVELS_SZ); + // DIAGNOSTIC: print levels being used for quantization + static int q2kpt_pass2_diag = 0; + if (q2kpt_pass2_diag < 5) { + const float * lv = q2kpt_all_levels.data() + tensor_pass2_idx * Q2KPT_N_LEVELS_SZ; + LLAMA_LOG_INFO("%s: [Q2_KPT pass2 diag] ti=%zu tensor='%s' levels=[%.4f,%.4f,%.4f,%.4f]\n", + __func__, tensor_pass2_idx, ggml_get_name(tensor), lv[0], lv[1], lv[2], lv[3]); + q2kpt_pass2_diag++; + } + } + // quantize each expert separately since they have different importance matrices new_size = 0; for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) { @@ -1329,7 +1433,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row); std::vector deq(nrows*n_per_row); const ggml_type_traits * qtype = ggml_get_type_traits(new_type); - qtype->to_float(new_data_03, deq.data(), deq.size()); + qtype->to_float(new_data_03, deq.data(), deq.size(), nullptr); double err = 0.0f; for (int i = 0; i < (int) deq.size(); ++i) { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 19bcf9a57a8..35268ee4f9f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -255,6 +255,8 @@ endif() # New quant tests llama_build_and_test(test-quant-q3kpt.cpp) llama_build_and_test(test-quant-q4dpt.cpp) +llama_build_and_test(test-quant-q2dpt.cpp) +llama_build_and_test(test-quant-q2kpt.cpp) llama_build(test-quant-q4dpt-experiment.cpp) # libmtmd diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index e8e237c6ec8..988e34a8949 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -259,7 +259,7 @@ static std::vector tensor_to_float(const ggml_tensor * t) { } else if (t->type == GGML_TYPE_I8) { tv.push_back((float)*(int8_t *) &buf[i]); } else if (quantized) { - 
tt->to_float(&buf[i], vq.data(), bs); + tt->to_float(&buf[i], vq.data(), bs, nullptr); tv.insert(tv.end(), vq.begin(), vq.end()); } else { GGML_ABORT("fatal error"); diff --git a/tests/test-quant-q2dpt.cpp b/tests/test-quant-q2dpt.cpp new file mode 100644 index 00000000000..d47a69b5619 --- /dev/null +++ b/tests/test-quant-q2dpt.cpp @@ -0,0 +1,170 @@ +// test-quant-q2dpt.cpp +// Quantization accuracy test for Q2_DPT (2-bit with per-tensor learned int8 levels). +// Compares multiple per-block scale-search strategies against Q2_K baseline. + +#include "ggml-backend.h" +#include "ggml.h" +#include +#include + +extern "C" { +void q2dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, int8_t levels_out[4]); +void q2dpt_set_levels(const int8_t * levels); +void q2dpt_set_quant_strategy(int s); +size_t quantize_q2_dpt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +} + +#define Q2DPT_N_LEVELS 4 + +#include +#include +#include +#include +#include +#include +#include +#include + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +static float rmse(const float * a, const float * b, size_t n) { + double s = 0; + for (size_t i = 0; i < n; ++i) { + double d = (double) a[i] - (double) b[i]; + s += d * d; + } + return (float) std::sqrt(s / (double) n); +} + +static float std_quant_rmse(ggml_type type, const float * data, size_t nrow, size_t n_per_row) { + const size_t rs = ggml_row_size(type, n_per_row); + std::vector qb(nrow * rs); + std::vector dq(nrow * n_per_row); + ggml_quantize_chunk(type, data, qb.data(), 0, nrow, n_per_row, nullptr); + const ggml_type_traits * tr = ggml_get_type_traits(type); + for (size_t r = 0; r < nrow; ++r) { + tr->to_float(qb.data() + r * rs, dq.data() + r * n_per_row, (int64_t) n_per_row, nullptr); + } + return rmse(data, dq.data(), nrow * 
n_per_row); +} + +// Run Q2_DPT with a given strategy: train levels, set, quantize, dequantize, return RMSE +static float q2dpt_rmse_with_strategy(const float * data, size_t nrow, size_t n_per_row, int strategy) { + std::vector imatrix_train(nrow * n_per_row, 1.0f); + std::vector imatrix_quant(n_per_row, 1.0f); + + int8_t levels[Q2DPT_N_LEVELS]; + q2dpt_train_levels(data, (int64_t) nrow, (int64_t) n_per_row, imatrix_train.data(), levels); + q2dpt_set_levels(levels); + q2dpt_set_quant_strategy(strategy); + + const size_t rs = ggml_row_size(GGML_TYPE_Q2_DPT, n_per_row); + std::vector qb(nrow * rs); + std::vector dq(nrow * n_per_row); + quantize_q2_dpt(data, qb.data(), (int64_t) nrow, (int64_t) n_per_row, imatrix_quant.data()); + const ggml_type_traits * tr = ggml_get_type_traits(GGML_TYPE_Q2_DPT); + for (size_t r = 0; r < nrow; ++r) { + tr->to_float(qb.data() + r * rs, dq.data() + r * n_per_row, (int64_t) n_per_row, levels); + } + return rmse(data, dq.data(), nrow * n_per_row); +} + +// --------------------------------------------------------------------------- +// Test cases +// --------------------------------------------------------------------------- +struct TestCase { + std::string name; + std::vector data; + size_t nrow, n_per_row; +}; + +// --------------------------------------------------------------------------- +// Main +// --------------------------------------------------------------------------- +int main(void) { + ggml_backend_load_all(); + + std::vector cases; + std::mt19937 gen(42); + + // Test case 1: Gaussian(0, 0.02) 64x4096 + { + auto & tc = cases.emplace_back(); + tc.name = "Gauss(0,0.02) 64x4096"; + tc.nrow = 64; + tc.n_per_row = 4096; + tc.data.resize(tc.nrow * tc.n_per_row); + std::normal_distribution dist(0.0f, 0.02f); + for (auto & v : tc.data) v = dist(gen); + } + + // Test case 2: Laplace(0, 0.01) 64x4096 + { + auto & tc = cases.emplace_back(); + tc.name = "Laplace(0,0.01) 64x4096"; + tc.nrow = 64; + tc.n_per_row = 4096; + 
tc.data.resize(tc.nrow * tc.n_per_row); + std::piecewise_constant_distribution dist( + 2, -1.0f, 1.0f, [](float x){ return std::exp(-std::abs(x)/0.01f); }); + for (auto & v : tc.data) v = dist(gen); + } + + // Test case 3: Uniform(-0.1, 0.1) 64x4096 + { + auto & tc = cases.emplace_back(); + tc.name = "Uniform(-0.1,0.1) 64x4096"; + tc.nrow = 64; + tc.n_per_row = 4096; + tc.data.resize(tc.nrow * tc.n_per_row); + std::uniform_real_distribution dist(-0.1f, 0.1f); + for (auto & v : tc.data) v = dist(gen); + } + + // Strategies to test: { bitmask, label } + struct Strategy { int mask; const char * label; }; + Strategy strategies[] = { + { 0x1, "A:lvl-anchor" }, // 4 level-anchored CD starts + { 0x2, "B:bnd-sweep" }, // boundary-crossing sweep + CD + { 0x4, "C:dual-extreme" }, // max_val + min_val anchors + CD + { 0x8, "D:elem-anchor" }, // element-anchor scan + CD + { 0x10, "E:brute-force" }, // exhaustive monotone partition + { 0x3, "A+B" }, // best of A and B combined + { 0x1F, "A+B+C+D+E" }, // everything + }; + const int n_strat = (int)(sizeof(strategies) / sizeof(strategies[0])); + + // Header + printf("Q2_DPT per-block strategy comparison (ratio vs Q2_K; lower=better)\n\n"); + printf("%-26s %8s", "Test", "Q2_K"); + for (int s = 0; s < n_strat; ++s) + printf(" %14s", strategies[s].label); + printf("\n"); + printf("%-26s %8s", "--------------------------", "--------"); + for (int s = 0; s < n_strat; ++s) + printf(" %14s", "--------------"); + printf("\n"); + + for (size_t i = 0; i < cases.size(); ++i) { + auto & tc = cases[i]; + printf("%-26s", tc.name.c_str()); + fflush(stdout); + + float rmse_q2_k = std_quant_rmse(GGML_TYPE_Q2_K, tc.data.data(), tc.nrow, tc.n_per_row); + printf(" %8.6f", rmse_q2_k); + fflush(stdout); + + for (int s = 0; s < n_strat; ++s) { + float r = q2dpt_rmse_with_strategy(tc.data.data(), tc.nrow, tc.n_per_row, strategies[s].mask); + printf(" %14.4f", r / rmse_q2_k); + fflush(stdout); + } + printf("\n"); + } + + printf("\n"); + return 0; +} diff 
--git a/tests/test-quant-q2kpt.cpp b/tests/test-quant-q2kpt.cpp new file mode 100644 index 00000000000..1b43b8fa1f4 --- /dev/null +++ b/tests/test-quant-q2kpt.cpp @@ -0,0 +1,358 @@ +// test-quant-q2kpt.cpp +// Correctness and accuracy test for Q2_KPT (Q2_K with learned per-tensor float levels). +// Tests: +// 1. Level training sanity (levels in [0,1], strictly increasing) +// 2. Round-trip RMSE vs Q2_K and Q2_KPT-uniform-levels across distributions +// 3. Manual vec-dot consistency: for a single QK_K row, verify that +// dequantize_row + manual dot == hand-rolled accumulation matches. + +#include "ggml-backend.h" +#include "ggml.h" +#include +#include +#include +#include +#include +#include +#include +#include + +// Q2_K/Q2_KPT block size (same as QK_K in ggml-common.h) +#define MY_QK_K 256 + +// --------------------------------------------------------------------------- +// Declarations for Q2_KPT internals (all in libggml-base.so) +// --------------------------------------------------------------------------- +extern "C" { + void q2kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, float levels_out[4]); + void q2kpt_set_levels(const float * levels); + const float *q2kpt_get_levels(void); + size_t quantize_q2_kpt(const float * src, void * dst, + int64_t nrows, int64_t n_per_row, + const float * imatrix); +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +static float rmse(const float * a, const float * b, size_t n) { + double s = 0; + for (size_t i = 0; i < n; ++i) { + double d = (double)a[i] - (double)b[i]; + s += d * d; + } + return (float)std::sqrt(s / (double)n); +} + +// Quantize float data to `type`, dequantize back, return RMSE. 
+static float std_quant_rmse(ggml_type type, const float * data, + size_t nrow, size_t n_per_row) { + const size_t rs = ggml_row_size(type, n_per_row); + std::vector qb(nrow * rs); + std::vector dq(nrow * n_per_row); + ggml_quantize_chunk(type, data, qb.data(), 0, nrow, n_per_row, nullptr); + const ggml_type_traits * tr = ggml_get_type_traits(type); + for (size_t r = 0; r < nrow; ++r) { + tr->to_float(qb.data() + r * rs, dq.data() + r * n_per_row, + (int64_t)n_per_row, nullptr); + } + return rmse(data, dq.data(), nrow * n_per_row); +} + +// Train Q2_KPT levels on data, quantize (with imatrix=1), dequantize, return RMSE. +static float q2kpt_rmse(const float * data, size_t nrow, size_t n_per_row, + float out_levels[4]) { + std::vector imatrix(n_per_row, 1.0f); + + q2kpt_train_levels(data, (int64_t)nrow, (int64_t)n_per_row, + imatrix.data(), out_levels); + q2kpt_set_levels(out_levels); + + const size_t rs = ggml_row_size(GGML_TYPE_Q2_KPT, n_per_row); + std::vector qb(nrow * rs); + std::vector dq(nrow * n_per_row); + + for (size_t r = 0; r < nrow; ++r) { + quantize_q2_kpt(data + r * n_per_row, + qb.data() + r * rs, + 1, (int64_t)n_per_row, + imatrix.data()); + } + const ggml_type_traits * tr = ggml_get_type_traits(GGML_TYPE_Q2_KPT); + for (size_t r = 0; r < nrow; ++r) { + tr->to_float(qb.data() + r * rs, + dq.data() + r * n_per_row, + (int64_t)n_per_row, out_levels); + } + return rmse(data, dq.data(), nrow * n_per_row); +} + +// --------------------------------------------------------------------------- +// main +// --------------------------------------------------------------------------- +int main(void) { + ggml_backend_load_all(); + + std::mt19937 gen(42); + bool all_ok = true; + + // ----------------------------------------------------------------------- + // Section 1: Level training sanity + // ----------------------------------------------------------------------- + printf("=== Section 1: Trained level values ===\n"); + { + const size_t nrow = 32, n_per_row = 
MY_QK_K; + std::vector data(nrow * n_per_row); + std::normal_distribution dist(0.0f, 0.02f); + for (auto & v : data) v = dist(gen); + + std::vector imatrix(n_per_row, 1.0f); + float levels[4]; + q2kpt_train_levels(data.data(), (int64_t)nrow, (int64_t)n_per_row, + imatrix.data(), levels); + printf(" Trained levels: [%.6f, %.6f, %.6f, %.6f]\n", + levels[0], levels[1], levels[2], levels[3]); + bool ordered = (levels[0] < levels[1]) && (levels[1] < levels[2]) && + (levels[2] < levels[3]); + bool in_range = (levels[0] >= 0.0f) && (levels[3] <= 1.0f); + printf(" Strictly increasing: %s In [0,1]: %s\n", + ordered ? "YES" : "NO", in_range ? "YES" : "NO"); + if (!ordered || !in_range) { + printf(" FAIL: levels are malformed!\n"); + all_ok = false; + } else { + printf(" PASS\n"); + } + } + printf("\n"); + + // ----------------------------------------------------------------------- + // Section 2: Round-trip RMSE vs Q2_K + // ----------------------------------------------------------------------- + printf("=== Section 2: Round-trip RMSE (ratio vs Q2_K) ===\n"); + printf("%-30s %10s %10s %8s %s\n", "Distribution", "Q2_K", "Q2_KPT", "ratio", ""); + printf("%-30s %10s %10s %8s\n", + "------------------------------", "----------", "----------", "--------"); + + const size_t nrow = 64, n_per_row = 4 * MY_QK_K; + + auto run_dist_test = [&](const char * name, std::vector & data) { + float r_q2k = std_quant_rmse(GGML_TYPE_Q2_K, data.data(), nrow, n_per_row); + float levels[4]; + float r_kpt = q2kpt_rmse(data.data(), nrow, n_per_row, levels); + float ratio = r_kpt / (r_q2k + 1e-10f); + // Sanity: Q2_KPT at same BPW should be within 3x of Q2_K + bool ok = ratio < 3.0f; + printf("%-30s %10.6f %10.6f %8.4f %s levels=[%.3f,%.3f,%.3f,%.3f]\n", + name, r_q2k, r_kpt, ratio, ok ? 
"PASS" : "FAIL", + levels[0], levels[1], levels[2], levels[3]); + if (!ok) all_ok = false; + }; + + { + std::vector data(nrow * n_per_row); + std::normal_distribution dist(0.0f, 0.02f); + for (auto & v : data) v = dist(gen); + run_dist_test("Gaussian(0, 0.02)", data); + } + { + std::vector data(nrow * n_per_row); + std::exponential_distribution edist(100.0f); + std::uniform_int_distribution sign_d(0, 1); + for (auto & v : data) v = edist(gen) * (sign_d(gen) ? 1.0f : -1.0f); + run_dist_test("Laplace(0, 0.01)", data); + } + { + std::vector data(nrow * n_per_row); + std::uniform_real_distribution dist(-0.1f, 0.1f); + for (auto & v : data) v = dist(gen); + run_dist_test("Uniform(-0.1, 0.1)", data); + } + printf("\n"); + + // ----------------------------------------------------------------------- + // Section 3: Uniform-levels baseline: Q2_KPT with {0, 1/3, 2/3, 1} + // should behave similarly to Q2_K (both 2-bit, same BPW) + // ----------------------------------------------------------------------- + printf("=== Section 3: Uniform-level baseline ===\n"); + { + float uniform_levels[4] = {0.0f, 1.0f/3.0f, 2.0f/3.0f, 1.0f}; + q2kpt_set_levels(uniform_levels); + + std::vector data(nrow * n_per_row); + std::normal_distribution dist(0.0f, 0.02f); + for (auto & v : data) v = dist(gen); + + // Q2_K baseline + float r_q2k = std_quant_rmse(GGML_TYPE_Q2_K, data.data(), nrow, n_per_row); + + // Q2_KPT with uniform levels (no re-training) + const size_t rs = ggml_row_size(GGML_TYPE_Q2_KPT, n_per_row); + std::vector qb(nrow * rs); + std::vector dq(nrow * n_per_row); + for (size_t r = 0; r < nrow; ++r) { + quantize_q2_kpt(data.data() + r * n_per_row, + qb.data() + r * rs, 1, (int64_t)n_per_row, nullptr); + } + const ggml_type_traits * tr = ggml_get_type_traits(GGML_TYPE_Q2_KPT); + for (size_t r = 0; r < nrow; ++r) { + tr->to_float(qb.data() + r * rs, dq.data() + r * n_per_row, (int64_t)n_per_row, uniform_levels); + } + float r_kpt_unif = rmse(data.data(), dq.data(), nrow * 
n_per_row); + float ratio = r_kpt_unif / (r_q2k + 1e-10f); + bool ok = ratio < 5.0f; // with uniform levels Q2_KPT uses different quantizer + printf(" Q2_K: %.6f\n", r_q2k); + printf(" Q2_KPT(unif): %.6f (ratio %.4f vs Q2_K) %s\n", + r_kpt_unif, ratio, ok ? "PASS" : "FAIL (too bad)"); + if (!ok) all_ok = false; + } + printf("\n"); + + // ----------------------------------------------------------------------- + // Section 4: Dequant-only consistency check + // Quantize with known levels, dequantize, verify specific values. + // ----------------------------------------------------------------------- + printf("=== Section 4: Dequant value spot-check ===\n"); + { + // Use uniform levels so we know exactly what the mapping should be + float ulev[4] = {0.0f, 1.0f/3.0f, 2.0f/3.0f, 1.0f}; + q2kpt_set_levels(ulev); + + // Create data spanning [0, 3] (to fill levels[k]*3 = {0,1,2,3}) + // A single QK_K block with values {0, 1, 2, 3, 0, 1, 2, 3, ...} + std::vector data(MY_QK_K); + for (int i = 0; i < MY_QK_K; ++i) { + // Values 0,1,2,3 repeating - with uniform levels these should quantize + // to indices 0,1,2,3 and dequant as 0, d*1, d*2, d*3 for some d + data[i] = (float)(i % 4); + } + + const size_t rs = ggml_row_size(GGML_TYPE_Q2_KPT, MY_QK_K); + std::vector qb(rs); + std::vector dq(MY_QK_K); + + quantize_q2_kpt(data.data(), qb.data(), 1, MY_QK_K, nullptr); + const ggml_type_traits * tr = ggml_get_type_traits(GGML_TYPE_Q2_KPT); + tr->to_float(qb.data(), dq.data(), MY_QK_K, ulev); + + float err = rmse(data.data(), dq.data(), MY_QK_K); + printf(" Input pattern {0,1,2,3,...} with uniform levels:\n"); + printf(" First 12 input: %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f\n", + data[0], data[1], data[2], data[3], data[4], data[5], + data[6], data[7], data[8], data[9], data[10], data[11]); + printf(" First 12 dequant: %.4f %.4f %.4f %.4f %.4f %.4f %.4f %.4f %.4f %.4f %.4f %.4f\n", + dq[0], dq[1], dq[2], dq[3], dq[4], dq[5], + dq[6], dq[7], dq[8], dq[9], dq[10], 
dq[11]); + printf(" RMSE: %.6f\n", err); + // Expect dequant ≈ input * scale (some small error due to scale quantization) + bool ok = err < 0.5f; // very generous: at most 0.5 absolute error on 0-3 range + printf(" %s\n", ok ? "PASS" : "FAIL (reconstruction wildly wrong)"); + if (!ok) all_ok = false; + } + printf("\n"); + + // ----------------------------------------------------------------------- + // Section 5: Vec-dot via ggml_mul_mat vs dequant-based reference + // This exercises the full inference path: level dispatch + vec_dot kernel. + // ----------------------------------------------------------------------- + printf("=== Section 5: Vec-dot via ggml_mul_mat ===\n"); + + // Get CPU backend after load_all + ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (!cpu_dev) { + printf(" CPU backend device not found, skipping\n"); + } else { + ggml_backend_t backend = ggml_backend_dev_init(cpu_dev, nullptr); + + // Set trained levels + float ml_levels[4]; + { + const int nrow2 = 16, nprow2 = MY_QK_K; + std::vector<float> td(nrow2 * nprow2); + std::normal_distribution<float> dist(0.0f, 0.02f); + for (auto & v : td) v = dist(gen); + std::vector<float> im(nprow2, 1.0f); + q2kpt_train_levels(td.data(), nrow2, nprow2, im.data(), ml_levels); + q2kpt_set_levels(ml_levels); + } + printf(" Levels: [%.4f, %.4f, %.4f, %.4f]\n", + ml_levels[0], ml_levels[1], ml_levels[2], ml_levels[3]); + + const int ne0 = 4 * MY_QK_K; // columns (embedding) + const int ne1 = 4; // rows (output features) + + // Generate weight and activation data + std::vector<float> weights(ne1 * ne0), acts(ne0); + { + std::normal_distribution<float> dw(0.0f, 0.02f); + std::normal_distribution<float> da(0.0f, 1.0f); + for (auto & v : weights) v = dw(gen); + for (auto & v : acts) v = da(gen); + } + + // Quantize weights to Q2_KPT + const size_t rs = ggml_row_size(GGML_TYPE_Q2_KPT, ne0); + std::vector<uint8_t> qw(ne1 * rs); + for (int r = 0; r < ne1; ++r) + quantize_q2_kpt(weights.data() + r * ne0, qw.data() + r * rs, 1, 
ne0, nullptr); + + // Reference: dequant weights x float acts + const ggml_type_traits * tr = ggml_get_type_traits(GGML_TYPE_Q2_KPT); + std::vector<float> dw(ne1 * ne0); + for (int r = 0; r < ne1; ++r) + tr->to_float(qw.data() + r * rs, dw.data() + r * ne0, ne0, ml_levels); + + std::vector<float> ref_out(ne1, 0.0f); + for (int r = 0; r < ne1; ++r) + for (int c = 0; c < ne0; ++c) + ref_out[r] += dw[r * ne0 + c] * acts[c]; + + // ggml_mul_mat - use no_alloc so ggml_backend_alloc_ctx_tensors works + const size_t ctx_mem = 1 * 1024 * 1024; // just for tensor metadata + ggml_init_params params = { ctx_mem, nullptr, true }; + ggml_context * ctx = ggml_init(params); + + ggml_tensor * W = ggml_new_tensor_2d(ctx, GGML_TYPE_Q2_KPT, ne0, ne1); + ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ne0); + ggml_tensor * y = ggml_mul_mat(ctx, W, x); + + ggml_cgraph * graph = ggml_new_graph(ctx); + ggml_build_forward_expand(graph, y); + + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend); + W->quant_levels = ml_levels; // set per-tensor levels for inference + ggml_backend_tensor_set(W, qw.data(), 0, (int64_t)(ne1 * rs)); + ggml_backend_tensor_set(x, acts.data(), 0, ne0 * sizeof(float)); + + ggml_backend_graph_compute(backend, graph); + + std::vector<float> got_out(ne1); + ggml_backend_tensor_get(y, got_out.data(), 0, ne1 * sizeof(float)); + + ggml_backend_buffer_free(buf); + ggml_free(ctx); + ggml_backend_free(backend); + + float max_rel = 0; + for (int r = 0; r < ne1; ++r) { + float rel = std::abs(got_out[r] - ref_out[r]) / + (std::abs(ref_out[r]) + 1e-9f); + max_rel = std::max(max_rel, rel); + printf(" row %d: ref=%.6f got=%.6f rel_err=%.3e\n", + r, ref_out[r], got_out[r], rel); + } + bool vd_ok = max_rel < 0.01f; + printf(" max_rel_err=%.3e %s\n", max_rel, vd_ok ? 
"PASS" : "FAIL"); + if (!vd_ok) all_ok = false; + } + printf("\n"); + + // ----------------------------------------------------------------------- + // Summary + // ----------------------------------------------------------------------- + printf("=== Summary: %s ===\n", all_ok ? "ALL PASS" : "SOME FAILURES"); + + return all_ok ? 0 : 1; +} diff --git a/tests/test-quant-q3kpt.cpp b/tests/test-quant-q3kpt.cpp index 19a0e2a0bee..7dedda89a3a 100644 --- a/tests/test-quant-q3kpt.cpp +++ b/tests/test-quant-q3kpt.cpp @@ -13,7 +13,6 @@ void q3kpt_train_levels(const float * data, const float * imatrix, float levels_out[8]); void q3kpt_set_levels(const float * levels); -const float * q3kpt_get_tensor_levels(const void * data_ptr); size_t quantize_q3_kpt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix); void quantize_row_q8_K_ref(const float * x, void * y, int64_t k); } @@ -49,7 +48,7 @@ static float std_quant_rmse(ggml_type type, const float * data, size_t nrow, siz ggml_quantize_chunk(type, data, qb.data(), 0, nrow, n_per_row, nullptr); const ggml_type_traits * tr = ggml_get_type_traits(type); for (size_t r = 0; r < nrow; ++r) { - tr->to_float(qb.data() + r * rs, dq.data() + r * n_per_row, (int64_t) n_per_row); + tr->to_float(qb.data() + r * rs, dq.data() + r * n_per_row, (int64_t) n_per_row, nullptr); } return rmse(data, dq.data(), nrow * n_per_row); } @@ -65,7 +64,7 @@ static float q3kpt_rmse_actual(const float * data, size_t nrow, size_t n_per_row quantize_q3_kpt(data, qb.data(), (int64_t) nrow, (int64_t) n_per_row, nullptr); const ggml_type_traits * tr = ggml_get_type_traits(GGML_TYPE_Q3_KPT); for (size_t r = 0; r < nrow; ++r) { - tr->to_float(qb.data() + r * rs, dq.data() + r * n_per_row, (int64_t) n_per_row); + tr->to_float(qb.data() + r * rs, dq.data() + r * n_per_row, (int64_t) n_per_row, levels); } return rmse(data, dq.data(), nrow * n_per_row); } diff --git a/tests/test-quant-q4dpt-experiment.cpp 
b/tests/test-quant-q4dpt-experiment.cpp index ba6fbab48dc..fbccec9fa85 100644 --- a/tests/test-quant-q4dpt-experiment.cpp +++ b/tests/test-quant-q4dpt-experiment.cpp @@ -208,7 +208,7 @@ static float experiment_rmse(const float * data, size_t nrow, size_t n_per_row, const ggml_type_traits * tr = ggml_get_type_traits(GGML_TYPE_Q4_DPT); for (size_t row = 0; row < nrow; ++row) { tr->to_float((const void *)&qblocks[row * nblock], - deq.data() + row * n_per_row, (int64_t)n_per_row); + deq.data() + row * n_per_row, (int64_t)n_per_row, levels); } return rmse_vec(data, deq.data(), total); @@ -221,7 +221,7 @@ static float iq4nl_rmse(const float * data, size_t nrow, size_t n_per_row) { ggml_quantize_chunk(GGML_TYPE_IQ4_NL, data, qb.data(), 0, nrow, n_per_row, nullptr); const ggml_type_traits * tr = ggml_get_type_traits(GGML_TYPE_IQ4_NL); for (size_t r = 0; r < nrow; ++r) { - tr->to_float(qb.data() + r * rs, dq.data() + r * n_per_row, (int64_t)n_per_row); + tr->to_float(qb.data() + r * rs, dq.data() + r * n_per_row, (int64_t)n_per_row, nullptr); } return rmse_vec(data, dq.data(), nrow * n_per_row); } diff --git a/tests/test-quant-q4dpt.cpp b/tests/test-quant-q4dpt.cpp index 2e4c5f947e7..75912d3ed16 100644 --- a/tests/test-quant-q4dpt.cpp +++ b/tests/test-quant-q4dpt.cpp @@ -44,7 +44,7 @@ static float std_quant_rmse(ggml_type type, const float * data, size_t nrow, siz ggml_quantize_chunk(type, data, qb.data(), 0, nrow, n_per_row, nullptr); const ggml_type_traits * tr = ggml_get_type_traits(type); for (size_t r = 0; r < nrow; ++r) { - tr->to_float(qb.data() + r * rs, dq.data() + r * n_per_row, (int64_t) n_per_row); + tr->to_float(qb.data() + r * rs, dq.data() + r * n_per_row, (int64_t) n_per_row, nullptr); } return rmse(data, dq.data(), nrow * n_per_row); } @@ -60,7 +60,7 @@ static float q4dpt_rmse_actual(const float * data, size_t nrow, size_t n_per_row quantize_q4_dpt(data, qb.data(), (int64_t) nrow, (int64_t) n_per_row, nullptr); const ggml_type_traits * tr = 
ggml_get_type_traits(GGML_TYPE_Q4_DPT); for (size_t r = 0; r < nrow; ++r) { - tr->to_float(qb.data() + r * rs, dq.data() + r * n_per_row, (int64_t) n_per_row); + tr->to_float(qb.data() + r * rs, dq.data() + r * n_per_row, (int64_t) n_per_row, levels); } return rmse(data, dq.data(), nrow * n_per_row); } diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index 037c0582bbb..fcb02ddba59 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -50,7 +50,7 @@ static float total_quantization_error(const ggml_type_traits * qfns, const ggml_ std::vector tmp_out(test_size); qfns_cpu->from_float(test_data, tmp_q.data(), test_size); - qfns->to_float(tmp_q.data(), tmp_out.data(), test_size); + qfns->to_float(tmp_q.data(), tmp_out.data(), test_size, nullptr); return array_rmse(test_data, tmp_out.data(), test_size); } @@ -62,10 +62,10 @@ static float reference_quantization_error(const ggml_type_traits * qfns, const g // FIXME: why is done twice? qfns_cpu->from_float(test_data, tmp_q.data(), test_size); - qfns->to_float(tmp_q.data(), tmp_out.data(), test_size); + qfns->to_float(tmp_q.data(), tmp_out.data(), test_size, nullptr); qfns->from_float_ref(test_data, tmp_q.data(), test_size); - qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size); + qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size, nullptr); return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size); } diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index cac0782dee9..6b0d87ed5e4 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -309,7 +309,7 @@ int main(int argc, char * argv[]) { for (size_t size : params.test_sizes) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void) -> float { - qfns->to_float(test_q1, test_out, size); + qfns->to_float(test_q1, test_out, size, nullptr); return test_out[0]; }; size_t quantized_size = ggml_row_size(type, size); diff --git 
a/tests/test-quantize-stats.cpp b/tests/test-quantize-stats.cpp index de587d456d0..c56f1011875 100644 --- a/tests/test-quantize-stats.cpp +++ b/tests/test-quantize-stats.cpp @@ -158,7 +158,7 @@ static void test_roundtrip_on_chunk( } else { qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size); } - qfns.to_float(quantized_scratch, output_scratch, chunk_size); + qfns.to_float(quantized_scratch, output_scratch, chunk_size, nullptr); update_error_stats(chunk_size, input_scratch, output_scratch, stats); } diff --git a/tools/export-lora/export-lora.cpp b/tools/export-lora/export-lora.cpp index 41f426208f8..97b89c8188a 100644 --- a/tools/export-lora/export-lora.cpp +++ b/tools/export-lora/export-lora.cpp @@ -317,7 +317,7 @@ struct lora_merge_ctx { auto nels = ggml_nelements(inp_base); const auto * qtype = ggml_get_type_traits(base->type); std::vector dequant_buf(nels * sizeof(float)); - qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels); + qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels, nullptr); ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size()); } else { ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base)); diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 43bb822d8f0..1a0acb8ba51 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -40,6 +40,7 @@ static const std::vector QUANT_OPTIONS = { { "Q3_PT", LLAMA_FTYPE_MOSTLY_Q3_PT, " 3.25 bpw quantization", }, { "Q3_KPT", LLAMA_FTYPE_MOSTLY_Q3_KPT, " Q3_K with learned per-tensor levels" }, { "Q4_DPT", LLAMA_FTYPE_MOSTLY_Q4_DPT, " IQ4_NL with learned per-tensor int8 levels" }, + { "Q2_KPT", LLAMA_FTYPE_MOSTLY_Q2_KPT, " Q2_K with learned per-tensor float levels" }, { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", }, { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, From 
4e2aa5393cb3171a237781cddc7cae257bd43c21 Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Fri, 13 Mar 2026 16:36:49 +0100 Subject: [PATCH 9/9] Q2_DPT, per superblock scales --- ggml/include/ggml-cpu.h | 3 +- ggml/include/ggml.h | 1 + ggml/src/ggml-blas/ggml-blas.cpp | 17 +- ggml/src/ggml-cpu/arch/x86/quants.c | 7 + ggml/src/ggml-cpu/ggml-cpu.c | 31 +- ggml/src/ggml-cpu/ops.cpp | 42 ++- ggml/src/ggml-cpu/quants.c | 42 ++- ggml/src/ggml-cuda/convert.cuh | 3 + ggml/src/ggml-quants.c | 535 ++++++++++++++++++++++------ ggml/src/ggml-quants.h | 8 +- ggml/src/ggml.c | 3 +- pocs/vdot/q8dot.cpp | 4 +- pocs/vdot/vdot.cpp | 4 +- src/llama-model.cpp | 26 +- src/llama-quant.cpp | 117 +++--- tests/test-quant-q2kpt.cpp | 49 ++- tests/test-quantize-fns.cpp | 2 +- tests/test-quantize-perf.cpp | 2 +- 18 files changed, 694 insertions(+), 202 deletions(-) diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index 8783049ed84..9c3a6da5dcf 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -117,7 +117,8 @@ extern "C" { ggml_from_float_t from_float; ggml_vec_dot_t vec_dot; enum ggml_type vec_dot_type; - int64_t nrows; // number of rows to process simultaneously + int64_t nrows; // number of rows to process simultaneously + size_t levels_row_stride; // bytes to add per row to get next row's quant_levels (0 = per-tensor) }; GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 5799761e0f4..78ac0abcd07 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2720,6 +2720,7 @@ extern "C" { bool is_quantized; ggml_to_float_t to_float; ggml_from_float_t from_float_ref; + size_t levels_row_stride; // bytes to advance quant_levels per row (0 = per-tensor) }; GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type); diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index 2320ef84358..bc4a2a5ed10 
100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -1,5 +1,15 @@ #include "ggml-impl.h" #include "ggml-blas.h" + +// Helper: compute quant_levels stride for a given row. +// For Q2_KPT (per-block levels), stride depends on tensor width. +static inline size_t ggml_quant_levels_stride(ggml_type type, size_t constant_stride, int64_t ne0) { + if (type == GGML_TYPE_Q2_KPT) { + return (size_t)(ne0 / 256) * 4 * sizeof(float); + } + return constant_stride; +} + #include "ggml-backend-impl.h" #include @@ -77,10 +87,11 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1); const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1); + const size_t lrs = ggml_quant_levels_stride(src0->type, ggml_get_type_traits(src0->type)->levels_row_stride, src0->ne[0]); #ifdef GGML_USE_OPENMP #pragma omp parallel for num_threads(n_threads) for (int64_t i01 = 0; i01 < ne01; i01++) { - to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, src0->quant_levels); + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, (const char*)src0->quant_levels + i01*lrs); } #else for (int i = 1; i < n_threads; i++) { @@ -89,7 +100,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg if (start < end) { ctx->tasks.push_back(std::async(std::launch::async, [=]() { for (int64_t i01 = start; i01 < end; i01++) { - to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, src0->quant_levels); + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, (const char*)src0->quant_levels + i01*lrs); } })); } @@ -99,7 +110,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg const int64_t start = 0; const int64_t end = ne01/n_threads; for (int64_t i01 = start; i01 < end; i01++) { - to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, 
src0->quant_levels); + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, (const char*)src0->quant_levels + i01*lrs); } } #endif diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c index 7eff7712678..f609b5d1868 100644 --- a/ggml/src/ggml-cpu/arch/x86/quants.c +++ b/ggml/src/ggml-cpu/arch/x86/quants.c @@ -541,6 +541,7 @@ static inline __m128i get_scale_shuffle(int i) { #endif void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); const int qk = QK8_0; const int nb = n / qk; @@ -699,6 +700,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); const int qk = QK8_1; const int nb = n / qk; @@ -845,6 +847,7 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo } void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); const int qk = QK8_0; const int nb = n / qk; @@ -925,6 +928,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); const int qk = QK8_1; const int nb = n / qk; @@ -1011,6 +1015,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT 
vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); const int qk = QK8_0; const int nb = n / qk; @@ -3735,6 +3740,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v } void ggml_vec_dot_q4_dpt_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -3822,6 +3828,7 @@ void ggml_vec_dot_q4_dpt_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v } void ggml_vec_dot_q2_dpt_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 070d1341358..f2a6a6d66ba 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -414,6 +414,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_q2_kpt_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, + .levels_row_stride = 0, // computed dynamically: (ne0/QK_K)*Q2KPT_N_LEVELS*sizeof(float) }, [GGML_TYPE_I32] = { .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32, @@ -1184,8 +1185,15 @@ static void ggml_compute_forward_mul_mat_one_chunk( const bool src1_cont = ggml_is_contiguous(src1); - ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; - enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + // For Q2_KPT, levels are per-block: stride = (ne00 / QK_K) * Q2KPT_N_LEVELS * sizeof(float) + // ne00 is the number of elements per row in src0 (input dimension), NOT ne0 (= ne01 = output rows). + // For non-square matrices (e.g. 
ffn_up: [hidden, intermediate]) ne00 != ne01, so ne00 is correct. + // For other types, use the static stride from type_traits_cpu + const size_t levels_row_stride = (type == GGML_TYPE_Q2_KPT) + ? (ne00 / QK_K) * Q2KPT_N_LEVELS * sizeof(float) + : type_traits_cpu[type].levels_row_stride; // broadcast factors const int64_t r2 = ne12 / ne02; @@ -1246,7 +1254,11 @@ static void ggml_compute_forward_mul_mat_one_chunk( //} for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { - vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot, src0->quant_levels); + // For Q2_KPT, levels are stored per-expert: [expert0_rows, expert1_rows, ...] + // So for 3D tensors we need to index by (i03 * ne01 + ir0) + const size_t levels_row_idx = (type == GGML_TYPE_Q2_KPT && ne03 > 1) ? (i03 * ne01 + ir0) : ir0; + const void * row_levels = (const char*)src0->quant_levels + levels_row_idx * levels_row_stride; + vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot, row_levels); } for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { @@ -1482,8 +1494,14 @@ static void ggml_compute_forward_mul_mat_id_one_chunk( const enum ggml_type type = src0->type; - ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; - enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + // For Q2_KPT, levels are per-block: stride = (ne00 / QK_K) * Q2KPT_N_LEVELS * sizeof(float) + // ne00 is the input dimension (elements per row in src0), NOT ne0 (= ne01 = output rows). 
+ // For other types, use the static stride from type_traits_cpu + const size_t levels_row_stride = (type == GGML_TYPE_Q2_KPT) + ? (ne00 / QK_K) * Q2KPT_N_LEVELS * sizeof(float) + : type_traits_cpu[type].levels_row_stride; const int64_t blck_0 = 16; const int64_t blck_1 = 16; @@ -1516,7 +1534,8 @@ static void ggml_compute_forward_mul_mat_id_one_chunk( float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2)); for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) { - vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1, src0->quant_levels); + const void * row_levels = (const char*)src0->quant_levels + (cur_a * ne01 + ir0) * levels_row_stride; + vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1, row_levels); } memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir0_end) - iir0)*sizeof(float)); diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index ad18e34cc66..6a6df0a157a 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -8,6 +8,19 @@ #include "unary-ops.h" #include "vec.h" +// Helper: compute quant_levels stride for a given row. +// For most types this is the constant levels_row_stride from type_traits. +// For Q2_KPT (per-block levels), stride depends on tensor width (ne[0]). 
+static inline size_t ggml_quant_levels_stride(ggml_type type, size_t constant_stride, int64_t ne0) { + if (type == GGML_TYPE_Q2_KPT) { + // Q2_KPT has Q2KPT_N_LEVELS floats per 256-element block + // Stride = (ne0 / 256) * Q2KPT_N_LEVELS * sizeof(float) + return (size_t)(ne0 / 256) * 4 * sizeof(float); + } + return constant_stride; +} + + #include #include #include @@ -517,9 +530,11 @@ static void ggml_compute_forward_dup_from_q( const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13; + const size_t q_lrs0 = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]); dequantize_row_q( (const void *) ((char *) src0->data + x_offset), - (float *) ((char *) dst->data + dst_offset), qk, src0->quant_levels); + (float *) ((char *) dst->data + dst_offset), qk, + (const char*)src0->quant_levels + i01 * q_lrs0); } } @@ -639,7 +654,8 @@ static void ggml_compute_forward_add_q_f32( assert(ne00 % 32 == 0); // unquantize row from src0 to temp buffer - dequantize_row_q(src0_row, wdata, ne00, src0->quant_levels); + const size_t q_lrs_add = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]); + dequantize_row_q(src0_row, wdata, ne00, (const char*)src0->quant_levels + i1 * q_lrs_add); // add src1 ggml_vec_acc_f32(ne00, wdata, src1_row); // quantize row to dst @@ -972,7 +988,8 @@ static void ggml_compute_forward_add1_q_f32( assert(ne0 % 32 == 0); // unquantize row from src0 to temp buffer - dequantize_row_q(src0_row, wdata, ne0, src0->quant_levels); + const size_t q_lrs_add = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]); + dequantize_row_q(src0_row, wdata, ne0, (const char*)src0->quant_levels + i1 * q_lrs_add); // add src1 ggml_vec_acc1_f32(ne0, wdata, v); // quantize row to dst @@ -4315,7 +4332,8 @@ static void 
ggml_compute_forward_out_prod_q_f32( float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); - dequantize_row_q(s0, wdata, ne0, src0->quant_levels); + const size_t q_lrs_op = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]); + dequantize_row_q(s0, wdata, ne0, (const char*)src0->quant_levels + i01 * q_lrs_op); ggml_vec_mad_f32(ne0, d, wdata, *s1); } } @@ -4688,9 +4706,21 @@ static void ggml_compute_forward_get_rows_q( GGML_ASSERT(i01 >= 0 && i01 < ne01); + const size_t q_lrs_gr = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]); + // For Q2_KPT with 3D tensors, levels are indexed by [i12 * ne02 * ne01 + i11 * ne01 + i01] + // For 2D tensors, levels are indexed by [i11 * ne01 + i01] (or just [i01] if ne02 == 1) + size_t levels_row_idx; + if (type == GGML_TYPE_Q2_KPT && ne03 > 1) { + levels_row_idx = (i12 * ne02 + i11) * ne01 + i01; + } else if (type == GGML_TYPE_Q2_KPT) { + levels_row_idx = i11 * ne01 + i01; + } else { + levels_row_idx = i01; + } dequantize_row_q( (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), - (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc, src0->quant_levels); + (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc, + (const char*)src0->quant_levels + levels_row_idx * q_lrs_gr); } } @@ -5572,6 +5602,8 @@ void ggml_compute_forward_clamp( case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_Q3_PT: + case GGML_TYPE_Q2_KPT: + case GGML_TYPE_Q2_DPT: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: case GGML_TYPE_Q8_K: diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index 58746db8316..93d0956aa3d 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -113,6 +113,7 @@ void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * 
GGML_RESTRI //===================================== Dot products ================================= void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); const int qk = QK8_0; const int nb = n / qk; @@ -150,6 +151,7 @@ void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c // TODO: add WASM SIMD void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); const int qk = QK8_1; const int nb = n / qk; @@ -186,6 +188,7 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c } void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -217,6 +220,7 @@ void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, } void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); const int qk = QK8_0; const int nb = n / qk; @@ -260,6 +264,7 @@ void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c } void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); const int qk = QK8_1; const int nb = n / qk; @@ -303,6 +308,7 @@ void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c } void ggml_vec_dot_q8_0_q8_0_generic(int n, float * 
GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); const int qk = QK8_0; const int nb = n / qk; @@ -333,6 +339,7 @@ void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c } void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -385,6 +392,7 @@ void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -417,6 +425,7 @@ void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -469,6 +478,7 @@ void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c } void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -548,6 +558,7 @@ void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c } void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); 
assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -621,7 +632,6 @@ void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c for (int l = 0; l < 8; ++l) sumf += sums[l]; *s = sumf; } - void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { assert(n % QK_K == 0); assert(nrc == 1); @@ -652,6 +662,7 @@ void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c float sumf = 0; for (int i = 0; i < nb; ++i) { + GGML_UNUSED(levels); const uint8_t * GGML_RESTRICT q4 = x[i].qs; const uint8_t * GGML_RESTRICT hm = x[i].qh; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -703,6 +714,7 @@ void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c } void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -758,6 +770,7 @@ void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c } void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -800,6 +813,7 @@ void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs } void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -850,6 +864,7 @@ void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } void 
ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -902,6 +917,7 @@ void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -946,6 +962,7 @@ void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs } void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -1002,6 +1019,7 @@ void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } void ggml_vec_dot_q3_pt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -1058,6 +1076,7 @@ void ggml_vec_dot_q3_pt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -1102,6 +1121,7 @@ void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, // Q3_KPT vec_dot - similar to Q3_K but with learned levels void ggml_vec_dot_q3_kpt_q8_K_generic(int n, float * 
GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -1192,15 +1212,18 @@ void ggml_vec_dot_q2_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const float * lv = (const float *)levels; GGML_ASSERT(lv != NULL && "Q2_KPT levels not set for tensor"); - // Precompute mapped levels: ml[k] = levels[k] * 3.0 - float ml[Q2KPT_N_LEVELS]; - for (int k = 0; k < Q2KPT_N_LEVELS; ++k) { - ml[k] = lv[k] * 3.0f; - } - float sumf = 0; for (int i = 0; i < nb; ++i) { + // Per-block levels: block i uses lv[i*4 + 0..3] + const float * block_lv = lv + i * Q2KPT_N_LEVELS; + + // Precompute mapped levels for this block: ml[k] = levels[k] * 3.0 + float ml[Q2KPT_N_LEVELS]; + for (int k = 0; k < Q2KPT_N_LEVELS; ++k) { + ml[k] = block_lv[k] * 3.0f; + } + const uint8_t * q2 = x[i].qs; const int8_t * q8 = y[i].qs; const uint8_t * sc = x[i].scales; @@ -1251,6 +1274,7 @@ void ggml_vec_dot_q2_kpt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v } void ggml_vec_dot_q4_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -1281,6 +1305,7 @@ void ggml_vec_dot_q4_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, } void ggml_vec_dot_q2_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -1314,6 +1339,7 @@ void ggml_vec_dot_q2_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, } void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * 
GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -1375,6 +1401,7 @@ void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -1404,6 +1431,7 @@ void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, } void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); diff --git a/ggml/src/ggml-cuda/convert.cuh b/ggml/src/ggml-cuda/convert.cuh index c211b2e9369..de2036fec90 100644 --- a/ggml/src/ggml-cuda/convert.cuh +++ b/ggml/src/ggml-cuda/convert.cuh @@ -34,6 +34,9 @@ to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type); // Set the Q4_DPT lookup table in device constant memory. void ggml_cuda_set_q4dpt_levels(const int8_t * levels, cudaStream_t stream); +// Set the Q2_DPT lookup table in device constant memory. 
+void ggml_cuda_set_q2dpt_levels(const int8_t * levels, cudaStream_t stream); + template __host__ __device__ inline dst_t ggml_cuda_cast(src_t x) { if constexpr (std::is_same_v) { diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 52d7c681f7a..9676e0aceab 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -5346,17 +5346,338 @@ void quantize_row_q2_dpt_ref(const float * GGML_RESTRICT x, block_q2_dpt * GGML_ static float q2kpt_levels[Q2KPT_N_LEVELS]; static bool q2kpt_levels_set = false; +// Global level storage for Q2_KPT (per-block levels for last quantized tensor) +static float *q2kpt_block_levels = NULL; +static size_t q2kpt_max_levels = 0; +static size_t q2kpt_cur_levels = 0; + +// Prepare the levels buffer for a tensor with given dimensions. +// This should be called before parallel quantization to pre-allocate storage. +void q2kpt_prepare_levels(int64_t nrows, int64_t n_per_row) { + const int nb = (int)(n_per_row / QK_K); + const size_t total_levels = (size_t)nrows * nb * Q2KPT_N_LEVELS; + if (total_levels > q2kpt_max_levels) { + q2kpt_block_levels = (float *) realloc(q2kpt_block_levels, total_levels * sizeof(float)); + q2kpt_max_levels = total_levels; + } + q2kpt_cur_levels = total_levels; +} + void q2kpt_set_levels(const float * levels) { memcpy(q2kpt_levels, levels, Q2KPT_N_LEVELS * sizeof(float)); q2kpt_levels_set = true; } const float * q2kpt_get_levels(void) { + // Return per-block levels if available, otherwise global levels + if (q2kpt_block_levels && q2kpt_cur_levels > 0) { + return q2kpt_block_levels; + } return q2kpt_levels_set ? q2kpt_levels : NULL; } void q2kpt_free_levels(void) { q2kpt_levels_set = false; + if (q2kpt_block_levels) { + free(q2kpt_block_levels); + q2kpt_block_levels = NULL; + q2kpt_max_levels = 0; + q2kpt_cur_levels = 0; + } +} + +// Train 4 Lloyd-Max float levels for a single 256-element block. +// Normalizes sub-block values to [0,1] using Q2_K-style scale+min estimation, then runs k-means. 
+// Forward declaration (defined later in this file) +static float make_q2kpt_quants(int n, const float * GGML_RESTRICT x, + uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, + const float * mapped_levels, const float * weight); + +// ---- q2kpt_quantize_block_given_levels ---------------------------------------- +// Quantize one QK_K-element block using caller-specified levels (no training). +// block_x: QK_K floats of original data +// y: output block_q2_kpt (filled in place) +// quant_weights: QK_K importance weights (or NULL → use x[i]²) +// sigma2: mean(x²) for the block (for weight formula) +// levels: Q2KPT_N_LEVELS values in [0,1], must be sorted ascending +// ------------------------------------------------------------------------------- +static void q2kpt_quantize_block_given_levels( + const float * GGML_RESTRICT block_x, + block_q2_kpt * GGML_RESTRICT y, + const float * GGML_RESTRICT quant_weights, + float sigma2, + const float levels[Q2KPT_N_LEVELS]) { + + float mapped_levels[Q2KPT_N_LEVELS]; + float bounds[Q2KPT_N_LEVELS - 1]; + for (int k = 0; k < Q2KPT_N_LEVELS; ++k) mapped_levels[k] = levels[k] * 3.0f; + for (int k = 0; k < Q2KPT_N_LEVELS - 1; ++k) + bounds[k] = 0.5f * (mapped_levels[k] + mapped_levels[k + 1]); + + uint8_t L[QK_K]; + float mins[QK_K / 16], scales[QK_K / 16], sw[QK_K / 16]; + float weight[16]; + uint8_t Ls[QK_K / 16], Lm[QK_K / 16]; + + memset(sw, 0, sizeof(sw)); + float sumx2 = sigma2 * QK_K; // reconstitute (or recompute below) + + for (int j = 0; j < QK_K / 16; ++j) { + const float * bx = block_x + 16 * j; + if (quant_weights) { + const float * qw = quant_weights + 16 * j; + for (int l = 0; l < 16; ++l) + weight[l] = qw[l] * sqrtf(sigma2 + bx[l] * bx[l]); + } else { + for (int l = 0; l < 16; ++l) + weight[l] = bx[l] * bx[l]; + } + for (int l = 0; l < 16; ++l) sw[j] += weight[l]; + scales[j] = make_q2kpt_quants(16, bx, L + 16 * j, &mins[j], mapped_levels, weight); + } + + float dm = make_qp_quants(QK_K / 16, 15, scales, Ls, 
sw); + float mm = make_qp_quants(QK_K / 16, 15, mins, Lm, sw); + + y->d = GGML_FP32_TO_FP16(dm); + y->dmin = GGML_FP32_TO_FP16(mm); + dm = GGML_FP16_TO_FP32(y->d); + mm = GGML_FP16_TO_FP32(y->dmin); + + for (int j = 0; j < QK_K / 16; ++j) y->scales[j] = Ls[j] | (Lm[j] << 4); + + // Second pass: reassign with quantized scales + for (int j = 0; j < QK_K / 16; ++j) { + const float d = dm * (y->scales[j] & 0xF); + if (!d) { + int zero_k = 0; + float zero_d = fabsf(mapped_levels[0]); + for (int k = 1; k < Q2KPT_N_LEVELS; ++k) { + if (fabsf(mapped_levels[k]) < zero_d) { zero_d = fabsf(mapped_levels[k]); zero_k = k; } + } + for (int ii = 0; ii < 16; ++ii) L[16 * j + ii] = zero_k; + continue; + } + const float m = mm * (y->scales[j] >> 4); + for (int ii = 0; ii < 16; ++ii) { + float scaled = (block_x[16 * j + ii] + m) / d; + L[16 * j + ii] = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]); + } + } + + // Pack 2-bit indices (Q2_K layout) + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + y->qs[j / 4 + l] = L[j + l] | (L[j + l + 32] << 2) + | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); + } + } + + (void)sumx2; +} + +// ---- Histogram Lloyd-Max helper ---------------------------------------------- +// Runs weighted Lloyd-Max iterations on a pre-built histogram. 
+// bin_centers[b]: representative value for bin b (weighted centroid) +// bin_w[b]: total weight of data in bin b +// levels[]: centroids, in/out (must be sorted ascending on entry) +// ------------------------------------------------------------------------------- +static void q2kpt_histogram_lloyd_max( + int n_bins, const float * bin_centers, const float * bin_w, + float * levels, int n_levels, int n_iter) { + + for (int iter = 0; iter < n_iter; ++iter) { + float sum_w[Q2KPT_N_LEVELS] = { 0.0f }; + float sum_wt[Q2KPT_N_LEVELS] = { 0.0f }; + + for (int b = 0; b < n_bins; ++b) { + float w = bin_w[b]; + if (w < 1e-30f) continue; + float t = bin_centers[b]; + int best = 0; + float bd2 = (t - levels[0]) * (t - levels[0]); + for (int k = 1; k < n_levels; ++k) { + float d2 = (t - levels[k]) * (t - levels[k]); + if (d2 < bd2) { bd2 = d2; best = k; } + } + sum_w[best] += w; + sum_wt[best] += w * t; + } + + float new_levels[Q2KPT_N_LEVELS]; + float max_delta = 0.0f; + for (int k = 0; k < n_levels; ++k) { + new_levels[k] = (sum_w[k] > 1e-30f) ? sum_wt[k] / sum_w[k] : levels[k]; + } + // Sort ascending (insertion sort, n_levels=4) + for (int k = 1; k < n_levels; ++k) { + float v = new_levels[k]; int m = k - 1; + while (m >= 0 && new_levels[m] > v) { new_levels[m + 1] = new_levels[m]; --m; } + new_levels[m + 1] = v; + } + for (int k = 0; k < n_levels; ++k) + max_delta = fmaxf(max_delta, fabsf(new_levels[k] - levels[k])); + memcpy(levels, new_levels, n_levels * sizeof(float)); + if (max_delta < 1e-7f) break; + } +} + +// ---- q2kpt_optimize_block_levels ---------------------------------------------- +// Full closed-loop EM training for one QK_K block using histogram binning: +// 1. Init: histogram Lloyd-Max on normalized [0,1] sub-block values +// 2. EM cycles: full E-step → build effective-T histogram → cheap Lloyd-Max +// 3. 
Final quantize with best levels seen +// +// block_x: QK_K floats of original data +// block_y: workspace / output (holds the best quantization on return) +// quant_weights: QK_K per-element importance weights (or NULL → use x[i]²) +// sigma2: mean(x²) for the block +// levels_out: Q2KPT_N_LEVELS trained levels in [0,1], ascending +// ------------------------------------------------------------------------------- +#define Q2KPT_N_BINS 128 // histogram bins +#define Q2KPT_INIT_LLOYD 10 // Lloyd-Max iters on init histogram +#define Q2KPT_N_EM_CYCLES 4 // number of full E-step calls +#define Q2KPT_LLOYD_PER_CYCLE 10 // cheap histogram Lloyd-Max iters per cycle + +static void q2kpt_optimize_block_levels( + const float * GGML_RESTRICT block_x, + block_q2_kpt * GGML_RESTRICT block_y, + const float * GGML_RESTRICT quant_weights, + float sigma2, + float levels_out[Q2KPT_N_LEVELS]) { + + const float inv_bins = 1.0f / Q2KPT_N_BINS; + + // ---- Build per-element weights and sub-block-normalised values ----------- + float weights[QK_K]; + float norm_vals[QK_K]; + + for (int sb = 0; sb < QK_K / 16; ++sb) { + const float * xsb = block_x + sb * 16; + float xmin = xsb[0], xmax = xsb[0]; + for (int l = 1; l < 16; ++l) { + if (xsb[l] < xmin) xmin = xsb[l]; + if (xsb[l] > xmax) xmax = xsb[l]; + } + if (xmin > 0.0f) xmin = 0.0f; + float range = xmax - xmin; + float inv_range = (range > 1e-10f) ? 1.0f / range : 0.0f; + + for (int l = 0; l < 16; ++l) { + int el = sb * 16 + l; + float t = (range > 1e-10f) ? 
+ fmaxf(0.0f, fminf(1.0f, (xsb[l] - xmin) * inv_range)) : 0.0f; + norm_vals[el] = t; + + float w; + if (quant_weights) { + w = quant_weights[el] * sqrtf(sigma2 + xsb[l] * xsb[l]); + } else { + w = xsb[l] * xsb[l]; + } + // Scale by range² so normalised-space errors match actual-space + weights[el] = fmaxf(w * range * range, 1e-30f); + } + } + + // ---- Phase 1: Init levels via histogram Lloyd-Max on norm_vals ---------- + float bin_w[Q2KPT_N_BINS]; + float bin_wt[Q2KPT_N_BINS]; + float bin_centers[Q2KPT_N_BINS]; + + memset(bin_w, 0, sizeof(bin_w)); + memset(bin_wt, 0, sizeof(bin_wt)); + + for (int i = 0; i < QK_K; ++i) { + float t = norm_vals[i]; + int b = (int)(t * Q2KPT_N_BINS); + if (b >= Q2KPT_N_BINS) b = Q2KPT_N_BINS - 1; + bin_w[b] += weights[i]; + bin_wt[b] += weights[i] * t; + } + for (int b = 0; b < Q2KPT_N_BINS; ++b) + bin_centers[b] = (bin_w[b] > 1e-30f) ? bin_wt[b] / bin_w[b] : (b + 0.5f) * inv_bins; + + float levels[Q2KPT_N_LEVELS]; + for (int k = 0; k < Q2KPT_N_LEVELS; ++k) + levels[k] = (float)k / (Q2KPT_N_LEVELS - 1); + + q2kpt_histogram_lloyd_max(Q2KPT_N_BINS, bin_centers, bin_w, + levels, Q2KPT_N_LEVELS, Q2KPT_INIT_LLOYD); + + // ---- Phase 2: EM cycles ------------------------------------------------- + // Each cycle: full E-step → build effective-T histogram → cheap Lloyd-Max. + // Effective-T: T_i = (x_i + B_i) / A_i, W_i = w_i * A_i² + // The M-step optimal level for class k is the W-weighted mean of T_i in k. 
+ float best_levels[Q2KPT_N_LEVELS]; + memcpy(best_levels, levels, sizeof(levels)); + float best_err = 1e38f; + + float eff_bin_w[Q2KPT_N_BINS]; + float eff_bin_wt[Q2KPT_N_BINS]; + float eff_bin_centers[Q2KPT_N_BINS]; + + for (int cycle = 0; cycle < Q2KPT_N_EM_CYCLES; ++cycle) { + + // Full E-step + q2kpt_quantize_block_given_levels(block_x, block_y, quant_weights, sigma2, levels); + + float d_all_q = GGML_FP16_TO_FP32(block_y->d); + float dmin_q = GGML_FP16_TO_FP32(block_y->dmin); + + memset(eff_bin_w, 0, sizeof(eff_bin_w)); + memset(eff_bin_wt, 0, sizeof(eff_bin_wt)); + float err = 0.0f; + + for (int el = 0; el < QK_K; ++el) { + int sb = el / 16; + int k_j = block_y->scales[sb] & 0xF; + int m_j = block_y->scales[sb] >> 4; + + float A = d_all_q * (float)k_j * 3.0f; + float mn = dmin_q * (float)m_j; + + int qs_byte = (el / 128) * 32 + el % 32; + int shift = ((el % 128) / 32) * 2; + int idx = (block_y->qs[qs_byte] >> shift) & 3; + + float w = quant_weights ? + quant_weights[el] * sqrtf(sigma2 + block_x[el] * block_x[el]) : + block_x[el] * block_x[el]; + w = fmaxf(w, 1e-30f); + + float y_approx = A * levels[idx] - mn; + float diff = y_approx - block_x[el]; + err += w * diff * diff; + + // Build effective-T histogram for histogram Lloyd-Max M-step + if (A > 1e-10f) { + float T = fmaxf(0.0f, fminf(1.0f, (block_x[el] + mn) / A)); + float W = w * A * A; + int b = (int)(T * Q2KPT_N_BINS); + if (b >= Q2KPT_N_BINS) b = Q2KPT_N_BINS - 1; + eff_bin_w[b] += W; + eff_bin_wt[b] += W * T; + } + } + + if (err < best_err) { + best_err = err; + memcpy(best_levels, levels, sizeof(levels)); + } + + for (int b = 0; b < Q2KPT_N_BINS; ++b) + eff_bin_centers[b] = (eff_bin_w[b] > 1e-30f) + ? 
eff_bin_wt[b] / eff_bin_w[b] + : (b + 0.5f) * inv_bins; + + q2kpt_histogram_lloyd_max(Q2KPT_N_BINS, eff_bin_centers, eff_bin_w, + levels, Q2KPT_N_LEVELS, Q2KPT_LLOYD_PER_CYCLE); + } + + // Final quantize with the best levels found across all cycles + memcpy(levels_out, best_levels, sizeof(best_levels)); + q2kpt_quantize_block_given_levels(block_x, block_y, quant_weights, sigma2, best_levels); } // Train 4 Lloyd-Max float levels from tensor data for Q2_KPT. @@ -5472,13 +5793,16 @@ void dequantize_row_q2_kpt(const block_q2_kpt * GGML_RESTRICT x, float * GGML_RE const float * lv = (const float *)levels; GGML_ASSERT(lv != NULL && "Q2_KPT levels not set for tensor"); - // Precompute mapped levels: ml[k] = levels[k] * 3.0 - float ml[Q2KPT_N_LEVELS]; - for (int i = 0; i < Q2KPT_N_LEVELS; ++i) { - ml[i] = lv[i] * 3.0f; - } - for (int i = 0; i < nb; i++) { + // Per-block levels: block i uses lv[i*4 + 0..3] + const float * block_lv = lv + i * Q2KPT_N_LEVELS; + + // Precompute mapped levels: ml[k] = levels[k] * 3.0 + float ml[Q2KPT_N_LEVELS]; + for (int j = 0; j < Q2KPT_N_LEVELS; ++j) { + ml[j] = block_lv[j] * 3.0f; + } + const float d_all = GGML_FP16_TO_FP32(x[i].d); const float m_all = GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * q = x[i].qs; @@ -5545,17 +5869,15 @@ static float make_q2kpt_quants(int n, float best_obj = 0; bool first = true; - // Try multiple trial scales (perturbations around initial estimate) + // Grid search: try multiple trial scales for (int is = -9; is <= 36; ++is) { float iscale = (ml_max + 0.1f * is) / (xmax - xmin); float trial_min = -xmin; - // Assign each element to nearest mapped level float sum_l = 0, sum_l2 = 0, sum_lx = 0; float sum_x = 0, sum_w = 0; for (int i = 0; i < n; ++i) { float scaled = iscale * (x[i] + trial_min); - // Nearest level assignment int best_k = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]); float ml_val = mapped_levels[best_k]; float w = weight ? 
weight[i] : x[i] * x[i]; @@ -5566,14 +5888,11 @@ static float make_q2kpt_quants(int n, sum_w += w; } - // 2D least-squares: x[i] ≈ A * ml[L[i]] + B - // Normal equations: [sum_l2, sum_l; sum_l, sum_w] [A; B] = [sum_lx; sum_x] float D = sum_w * sum_l2 - sum_l * sum_l; if (D > 0) { float this_scale = (sum_w * sum_lx - sum_x * sum_l) / D; float this_min = (sum_l2 * sum_x - sum_l * sum_lx) / D; - // min_offset = -B, must be >= 0 (i.e. B <= 0) if (this_min > 0) { this_min = 0; this_scale = sum_lx / sum_l2; @@ -5593,140 +5912,138 @@ static float make_q2kpt_quants(int n, best_scale = this_scale; best_min = this_min; first = false; - - // Store assignment - for (int i = 0; i < n; ++i) { - float scaled = (x[i] - best_min) / (best_scale > 1e-15f ? best_scale : 1e-15f); - int best_k = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]); - L[i] = best_k; - } } } } - *the_min = -best_min; // Store as positive offset (Q2_K convention: ml = dmin * min_q) + // Inner EM refinement from best found by grid search: iterate + // assign→refit→assign until convergence (fixes Problem 3) + for (int refine = 0; refine < 8; ++refine) { + float sum_l = 0, sum_l2 = 0, sum_lx = 0, sum_x = 0, sum_w = 0; + for (int i = 0; i < n; ++i) { + float scaled = (x[i] - best_min) / (best_scale > 1e-15f ? best_scale : 1e-15f); + int best_k = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]); + float ml_val = mapped_levels[best_k]; + float w = weight ? weight[i] : x[i] * x[i]; + sum_l += w * ml_val; + sum_l2 += w * ml_val * ml_val; + sum_lx += w * ml_val * x[i]; + sum_x += w * x[i]; + sum_w += w; + } + float D = sum_w * sum_l2 - sum_l * sum_l; + if (D <= 0) break; + float new_scale = (sum_w * sum_lx - sum_x * sum_l) / D; + float new_min = (sum_l2 * sum_x - sum_l * sum_lx) / D; + if (new_min > 0) { + new_min = 0; + new_scale = sum_l2 > 1e-30f ? 
sum_lx / sum_l2 : 0.f; + } + float cur_error = 0; + for (int i = 0; i < n; ++i) { + float scaled = (x[i] - new_min) / (new_scale > 1e-15f ? new_scale : 1e-15f); + int best_k = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]); + float diff = new_scale * mapped_levels[best_k] + new_min - x[i]; + float w = weight ? weight[i] : x[i] * x[i]; + cur_error += w * diff * diff; + } + if (cur_error >= best_obj - 1e-12f * best_obj) { break; } + best_obj = cur_error; + best_scale = new_scale; + best_min = new_min; + } + + // Final assignment with best (scale, min) + for (int i = 0; i < n; ++i) { + float scaled = (x[i] - best_min) / (best_scale > 1e-15f ? best_scale : 1e-15f); + L[i] = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]); + } + + *the_min = -best_min; return best_scale; } static void quantize_row_q2_kpt_impl(const float * GGML_RESTRICT x, block_q2_kpt * GGML_RESTRICT y, int64_t n_per_row, - const float * GGML_RESTRICT quant_weights) { + const float * GGML_RESTRICT quant_weights, + const float * GGML_RESTRICT imatrix, + float * GGML_RESTRICT block_levels) { assert(n_per_row % QK_K == 0); - const int nb = n_per_row / QK_K; - const float * levels = q2kpt_get_levels(); - GGML_ASSERT(levels != NULL && "Q2_KPT levels not set - call q2kpt_set_levels() first"); + const int nb = n_per_row / QK_K; + GGML_ASSERT(block_levels != NULL && "block_levels buffer must be provided"); - // Precompute mapped levels: ml[k] = levels[k] * 3.0 - float mapped_levels[Q2KPT_N_LEVELS]; - for (int k = 0; k < Q2KPT_N_LEVELS; ++k) { - mapped_levels[k] = levels[k] * 3.0f; - } + for (int i = 0; i < nb; ++i) { + const float * block_x = x + i * QK_K; - // Precompute boundaries for nearest-level assignment - float bounds[Q2KPT_N_LEVELS - 1]; - for (int k = 0; k < Q2KPT_N_LEVELS - 1; ++k) { - bounds[k] = 0.5f * (mapped_levels[k] + mapped_levels[k + 1]); - } - - uint8_t L[QK_K]; - float mins[QK_K/16]; - float scales[QK_K/16]; - float sw[QK_K/16]; - float weight[16]; - 
uint8_t Ls[QK_K/16], Lm[QK_K/16]; + // Per-block quant_weights and imatrix slices (fixes the imatrix offset bug: + // previously the full-row imatrix was passed and indexed from 0 for every block) + const float * block_qw = quant_weights ? quant_weights + i * QK_K : NULL; + const float * block_im = imatrix ? imatrix + i * QK_K : NULL; - for (int i = 0; i < nb; i++) { - memset(sw, 0, QK_K/16*sizeof(float)); - float sumx2 = 0; - for (int j = 0; j < QK_K; ++j) sumx2 += x[j] * x[j]; + float sumx2 = 0.0f; + for (int j = 0; j < QK_K; ++j) sumx2 += block_x[j] * block_x[j]; float sigma2 = sumx2 / QK_K; - // First pass: find per-sub-block scales optimized for mapped levels - for (int j = 0; j < QK_K/16; ++j) { - if (quant_weights) { - const float * qw = quant_weights + QK_K * i + 16 * j; - for (int l = 0; l < 16; ++l) - weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l] * x[16*j + l]); - } else { - for (int l = 0; l < 16; ++l) - weight[l] = x[16*j + l] * x[16*j + l]; - } - for (int l = 0; l < 16; ++l) sw[j] += weight[l]; - - scales[j] = make_q2kpt_quants(16, x + 16*j, L + 16*j, &mins[j], - mapped_levels, weight); - } - - // Two-tier scale quantization (identical to Q2_K): - // Quantize scales [0..15] and mins [0..15] separately using make_qp_quants - float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw); - float mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw); - - y[i].d = GGML_FP32_TO_FP16(dm); - y[i].dmin = GGML_FP32_TO_FP16(mm); - dm = GGML_FP16_TO_FP32(y[i].d); - mm = GGML_FP16_TO_FP32(y[i].dmin); + float block_lv[Q2KPT_N_LEVELS]; + // Runs k-means init + EM loop; fills block_lv AND writes the best + // quantized block into y[i] as a side-effect. 
+ q2kpt_optimize_block_levels(block_x, &y[i], block_qw, sigma2, block_lv); - for (int j = 0; j < QK_K/16; ++j) { - y[i].scales[j] = Ls[j] | (Lm[j] << 4); - } + memcpy(block_levels + i * Q2KPT_N_LEVELS, block_lv, Q2KPT_N_LEVELS * sizeof(float)); - // Second pass: re-assign with quantized scales using nearest mapped level - for (int j = 0; j < QK_K/16; ++j) { - const float d = dm * (y[i].scales[j] & 0xF); - if (!d) { - // Assign to level closest to 0 for zero-scale sub-blocks - int zero_k = 0; - float zero_d = fabsf(mapped_levels[0]); - for (int k = 1; k < Q2KPT_N_LEVELS; ++k) { - if (fabsf(mapped_levels[k]) < zero_d) { - zero_d = fabsf(mapped_levels[k]); - zero_k = k; - } - } - for (int ii = 0; ii < 16; ++ii) L[16*j + ii] = zero_k; - continue; - } - const float m = mm * (y[i].scales[j] >> 4); - for (int ii = 0; ii < 16; ++ii) { - float scaled = (x[16*j + ii] + m) / d; - // Nearest mapped level assignment - int best_k = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]); - L[16*j + ii] = best_k; - } - } - - // Pack 2-bit indices (same layout as Q2_K) - for (int j = 0; j < QK_K; j += 128) { - for (int l = 0; l < 32; ++l) { - y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) - | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); - } - } - - x += QK_K; + (void)block_im; // imatrix is folded into block_qw; retained for future use } } size_t quantize_q2_kpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, + int64_t start_row, int64_t nrow, int64_t n_per_row, const float * imatrix) { size_t row_size = ggml_row_size(GGML_TYPE_Q2_KPT, n_per_row); char * qrow = (char *) dst; + const int nb = (int)(n_per_row / QK_K); + const size_t total_levels = (size_t)nrow * nb * Q2KPT_N_LEVELS; + const size_t levels_needed = (size_t)(start_row + nrow) * nb * Q2KPT_N_LEVELS; + + // Ensure buffer is large enough (should have been pre-allocated via q2kpt_prepare_levels) + if (levels_needed > q2kpt_max_levels) { + q2kpt_block_levels = (float *) realloc(q2kpt_block_levels, 
levels_needed * sizeof(float)); + q2kpt_max_levels = levels_needed; + } + q2kpt_cur_levels = levels_needed; + + // Temporary buffer for one row's block levels + float * row_block_levels = (float *) malloc(nb * Q2KPT_N_LEVELS * sizeof(float)); + for (int64_t row = 0; row < nrow; ++row) { - quantize_row_q2_kpt_impl(src, (block_q2_kpt *) qrow, n_per_row, imatrix); + // Quantize row with per-block trained levels + quantize_row_q2_kpt_impl(src, (block_q2_kpt *) qrow, n_per_row, imatrix, imatrix, row_block_levels); + // Copy this row's block levels to the global buffer at the correct offset + memcpy(q2kpt_block_levels + (start_row + row) * nb * Q2KPT_N_LEVELS, row_block_levels, + nb * Q2KPT_N_LEVELS * sizeof(float)); src += n_per_row; qrow += row_size; } + free(row_block_levels); return nrow * row_size; } +// Train per-row levels for all rows of a tensor and write to out_levels. +// out_levels must be pre-allocated to nrow * Q2KPT_N_LEVELS floats. +void q2kpt_train_all_row_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, float * out_levels) { + for (int64_t r = 0; r < nrow; ++r) { + q2kpt_train_levels(data + r * n_per_row, 1, n_per_row, imatrix, + out_levels + r * Q2KPT_N_LEVELS); + } +} + void quantize_row_q2_kpt_ref(const float * GGML_RESTRICT x, block_q2_kpt * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); - quantize_q2_kpt(x, y, 1, k, NULL); + quantize_q2_kpt(x, y, 0, 1, k, NULL); } // Global levels (used during quantization for the current tensor) diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index e1b1ac375c5..e0c0484c19c 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -153,12 +153,14 @@ GGML_API void q2dpt_train_levels(const float * data, int64_t nrow, int // Q2_KPT: Q2_K with learned per-tensor float levels GGML_API void dequantize_row_q2_kpt(const block_q2_kpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); GGML_API void quantize_row_q2_kpt_ref(const float * 
GGML_RESTRICT x, block_q2_kpt * GGML_RESTRICT y, int64_t k); -GGML_API size_t quantize_q2_kpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_q2_kpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t start_row, int64_t nrows, int64_t n_per_row, const float * imatrix); // Q2_KPT levels management (per-tensor float levels in [0,1]) GGML_API void q2kpt_set_levels(const float * levels); GGML_API const float * q2kpt_get_levels(void); GGML_API void q2kpt_free_levels(void); +// Prepare levels buffer for a tensor with given dimensions (call before parallel quantization) +GGML_API void q2kpt_prepare_levels(int64_t nrows, int64_t n_per_row); // Train 4 Lloyd-Max float levels from tensor data for Q2_KPT. // Bins normalized sub-block values in [0,1], runs weighted k-means for 4 centroids. @@ -166,6 +168,10 @@ GGML_API void q2kpt_free_levels(void); GGML_API void q2kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, float levels_out[Q2KPT_N_LEVELS]); +// Train per-row levels for all rows: writes nrow * Q2KPT_N_LEVELS floats to out_levels. +GGML_API void q2kpt_train_all_row_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, float * out_levels); + // Train 16 Lloyd-Max int8 levels from tensor data. // Bins normalized values (x/amax) in [-1,1], runs weighted k-means, rounds to sorted int8[16]. // Also sets the global levels via q4dpt_set_levels(). 
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 56d9313e258..7a70ce3919c 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -945,6 +945,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q2_kpt, .from_float_ref = (ggml_from_float_t) quantize_row_q2_kpt_ref, + .levels_row_stride = 0, // computed dynamically: (ne[0]/256)*4*sizeof(float) }, }; @@ -7667,7 +7668,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_Q3_PT: result = quantize_q3_pt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q3_KPT: result = quantize_q3_kpt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q4_DPT: result = quantize_q4_dpt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q2_KPT: result = quantize_q2_kpt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q2_KPT: result = quantize_q2_kpt (src + start, (char *) dst + start_row * row_size, start_row, nrows, n_per_row, imatrix); break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); diff --git a/pocs/vdot/q8dot.cpp b/pocs/vdot/q8dot.cpp index 3df6e1f4211..bdf6414aad4 100644 --- a/pocs/vdot/q8dot.cpp +++ b/pocs/vdot/q8dot.cpp @@ -157,8 +157,8 @@ int main(int argc, char** argv) { t1 = std::chrono::high_resolution_clock::now(); float fs; - if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1); - else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1); + if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1, nullptr); + else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1, nullptr); t2 = std::chrono::high_resolution_clock::now(); t = 1e-3*std::chrono::duration_cast(t2-t1).count(); if (iloop > 3) ggml.addResult(fs, t); diff --git 
a/pocs/vdot/vdot.cpp b/pocs/vdot/vdot.cpp index 2dca62848bc..a78fabc28c0 100644 --- a/pocs/vdot/vdot.cpp +++ b/pocs/vdot/vdot.cpp @@ -285,8 +285,8 @@ int main(int argc, char** argv) { else { const auto * vdot = ggml_get_type_traits_cpu(funcs_cpu->vec_dot_type); vdot->from_float(y1.data(), q8.data(), kVecSize); - if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1); - else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1); + if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1, nullptr); + else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1, nullptr); } sumq += result; t2 = std::chrono::high_resolution_clock::now(); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 3012746cfb5..1f13f62b909 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -7873,7 +7873,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { { GGML_TYPE_Q3_PT, "q3_pt.levels", 8, sizeof(float) }, { GGML_TYPE_Q3_KPT, "q3_kpt.levels", 8, sizeof(float) }, { GGML_TYPE_Q4_DPT, "q4_dpt.levels", 16, sizeof(int8_t) }, - { GGML_TYPE_Q2_KPT, "q2_kpt.levels", 4, sizeof(float) }, }; for (const auto & lt : level_types) { @@ -7916,6 +7915,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) { __func__, tensor_count, lt.gguf_key); } } + + // Q2_KPT: per-block levels stored as per-tensor GGUF keys "{tensor_name}.q2kpt_levels" + // Each key holds n_blocks * Q2KPT_N_LEVELS floats for that tensor (4 floats per 256-element block). 
+ { + size_t q2kpt_loaded = 0; + for (auto & [tname, t] : name_to_tensor) { + if (t->type != GGML_TYPE_Q2_KPT) { continue; } + const std::string key = tname + ".q2kpt_levels"; + int64_t lv_idx = gguf_find_key(ml.meta.get(), key.c_str()); + if (lv_idx < 0) { continue; } + + const uint8_t * lv_raw = (const uint8_t *)gguf_get_arr_data(ml.meta.get(), lv_idx); + const size_t lv_n = gguf_get_arr_n(ml.meta.get(), lv_idx); + + auto & aux = tensor_aux_data[t]; + aux.type = GGML_TYPE_Q2_KPT; + aux.host_data.assign(lv_raw, lv_raw + lv_n * sizeof(float)); + aux.aux_tensor = nullptr; + t->quant_levels = aux.host_data.data(); + q2kpt_loaded++; + } + if (q2kpt_loaded > 0) { + LLAMA_LOG_INFO("%s: loaded %zu Q2_KPT per-block level tables\n", __func__, q2kpt_loaded); + } + } } if (use_mmap_buffer) { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 93b89952991..a677323c560 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -36,12 +36,12 @@ extern "C" { void q4dpt_set_levels(const int8_t * levels); } -// Q2_KPT levels functions (defined in ggml-quants.c) -extern "C" { - void q2kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, - const float * imatrix, float levels_out[4]); - void q2kpt_set_levels(const float * levels); -} +// Q2_KPT levels are handled internally by quantize_q2_kpt +#define Q2KPT_N_LEVELS 4 +#define QK_K 256 +extern "C" const float * q2kpt_get_levels(void); +extern "C" void q2kpt_prepare_levels(int64_t nrows, int64_t n_per_row); +extern "C" void q2kpt_free_levels(void); // Quantization types. 
Changes to this struct must be replicated in quantize.cpp struct tensor_quantization { @@ -1045,12 +1045,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: LLAMA_LOG_INFO("%s: Q4_DPT pass 1 complete.\n", __func__); } - // Q2_KPT two-pass approach: train all per-tensor float levels BEFORE opening the output - static const size_t Q2KPT_N_LEVELS_SZ = 4; - std::vector q2kpt_all_levels; + // Q2_KPT two-pass approach: train all per-block levels BEFORE opening the output + // file, so the levels KV entry is already populated at the time of the metadata placeholder. + // Per-block levels: 4 floats per 256-element block. + struct q2kpt_tensor_levels { + std::string name; + std::vector levels; // nrows * (n_per_row / QK_K) * Q2KPT_N_LEVELS floats + }; + std::vector q2kpt_all_levels; if (ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT && !params->dry_run) { - LLAMA_LOG_INFO("%s: Q2_KPT pass 1: training per-tensor levels...\n", __func__); - q2kpt_all_levels.assign(tensors.size() * Q2KPT_N_LEVELS_SZ, 0.0f); + LLAMA_LOG_INFO("%s: Q2_KPT pass 1: training per-block levels...\n", __func__); std::vector> p1_read_data; std::vector> p1_f32_buf; @@ -1061,6 +1065,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ggml_tensor * tensor = tensors[ti]->tensor; const std::string tname = ggml_get_name(tensor); + // Determine whether this tensor will be Q2_KPT (mirror the pass-2 logic) bool quantize = tname.rfind("weight") == tname.size() - 6; quantize &= (ggml_n_dims(tensor) >= 2); quantize &= tname.find("_norm.weight") == std::string::npos; @@ -1080,6 +1085,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } if (new_type != GGML_TYPE_Q2_KPT) { continue; } + // Load tensor data const size_t tsz = ggml_nbytes(tensor); if (!ml.use_mmap) { if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); } @@ -1087,6 +1093,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } 
ml.load_data_for(tensor); + // Dequantize to f32 if needed const int64_t nelements = ggml_nelements(tensor); float * f32_data; if (tensor->type == GGML_TYPE_F32) { @@ -1096,6 +1103,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: f32_data = (float *) p1_f32_buf.data(); } + // Resolve imatrix const float * imatrix = nullptr; if (imatrix_data) { auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped)); @@ -1108,15 +1116,49 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const int64_t n_per_row = tensor->ne[0]; const int64_t nrows = tensor->ne[1]; - LLAMA_LOG_INFO("%s: Q2_KPT levels for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name); - q2kpt_train_levels(f32_data, nrows, n_per_row, imatrix, - q2kpt_all_levels.data() + ti * Q2KPT_N_LEVELS_SZ); + // Allocate levels buffer for this tensor + const int nb = n_per_row / QK_K; + const size_t n_levels = (size_t)nrows * tensor->ne[2] * nb * Q2KPT_N_LEVELS; + q2kpt_all_levels.push_back({tname, std::vector(n_levels)}); + + LLAMA_LOG_INFO("%s: Q2_KPT levels for [%zu/%zu] %s (%zu floats)\n", + __func__, ti+1, tensors.size(), tensor->name, n_levels); + + // Train levels by running quantization internally + // We need to quantize to f32 -> Q2_KPT -> f32 to get the trained levels + std::vector> p1_qbuf(ggml_nbytes(tensor)); + const size_t row_size = ggml_row_size(GGML_TYPE_Q2_KPT, n_per_row); + + // Prepare levels buffer for this tensor + q2kpt_free_levels(); + q2kpt_prepare_levels(nrows * tensor->ne[2], n_per_row); + + // Quantize each expert slice + const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; + for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) { + const float * f32_data_03 = f32_data + i03 * nelements_matrix; + void * q_data_03 = (char *)p1_qbuf.data() + row_size * i03 * nrows; + const float * imatrix_03 = imatrix ? 
imatrix + i03 * n_per_row : nullptr; + + // start_row must be the absolute row index for correct levels indexing + ggml_quantize_chunk(GGML_TYPE_Q2_KPT, f32_data_03, q_data_03, i03 * nrows, nrows, n_per_row, imatrix_03); + } + + // Copy trained levels to our storage + const float * trained_levels = q2kpt_get_levels(); + if (trained_levels) { + memcpy(q2kpt_all_levels.back().levels.data(), trained_levels, n_levels * sizeof(float)); + } } - for (auto & ctx : ctx_outs) { - if (ctx) { - gguf_set_arr_data(ctx.get(), "q2_kpt.levels", GGUF_TYPE_FLOAT32, - q2kpt_all_levels.data(), q2kpt_all_levels.size()); + // Store all levels in GGUF metadata before the file is opened + for (const auto & tl : q2kpt_all_levels) { + for (auto & ctx : ctx_outs) { + if (ctx) { + const std::string key = tl.name + ".q2kpt_levels"; + gguf_set_arr_data(ctx.get(), key.c_str(), GGUF_TYPE_FLOAT32, + tl.levels.data(), tl.levels.size()); + } } } LLAMA_LOG_INFO("%s: Q2_KPT pass 1 complete.\n", __func__); @@ -1403,17 +1445,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: q4dpt_set_levels(q4dpt_all_levels.data() + tensor_pass2_idx * Q4DPT_N_LEVELS); } - // Q2_KPT: set the per-tensor levels (trained in pass 1) as global for quantization + // Q2_KPT: quantize_q2_kpt trains per-block levels internally. + // Levels were already trained and saved to GGUF in pass 1. + // We still need to allocate the levels buffer for quantization to work correctly. 
if (new_type == GGML_TYPE_Q2_KPT) { - q2kpt_set_levels(q2kpt_all_levels.data() + tensor_pass2_idx * Q2KPT_N_LEVELS_SZ); - // DIAGNOSTIC: print levels being used for quantization - static int q2kpt_pass2_diag = 0; - if (q2kpt_pass2_diag < 5) { - const float * lv = q2kpt_all_levels.data() + tensor_pass2_idx * Q2KPT_N_LEVELS_SZ; - LLAMA_LOG_INFO("%s: [Q2_KPT pass2 diag] ti=%zu tensor='%s' levels=[%.4f,%.4f,%.4f,%.4f]\n", - __func__, tensor_pass2_idx, ggml_get_name(tensor), lv[0], lv[1], lv[2], lv[3]); - q2kpt_pass2_diag++; - } + const int64_t total_rows = nrows * tensor->ne[2]; + q2kpt_free_levels(); // Clear any stale levels from previous tensor + q2kpt_prepare_levels(total_rows, n_per_row); // Allocate for this tensor } // quantize each expert separately since they have different importance matrices @@ -1424,29 +1462,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr; new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use); - - // TODO: temporary sanity check that the F16 -> MXFP4 is lossless -#if 0 - if (new_type == GGML_TYPE_MXFP4) { - auto * x = f32_data_03; - - //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row); - std::vector deq(nrows*n_per_row); - const ggml_type_traits * qtype = ggml_get_type_traits(new_type); - qtype->to_float(new_data_03, deq.data(), deq.size(), nullptr); - - double err = 0.0f; - for (int i = 0; i < (int) deq.size(); ++i) { - err += fabsf(deq[i] - x[i]); - //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) { - if (deq[i] != x[i]) { - LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]); - } - } - //LLAMA_LOG_INFO("err = %f\n", err); - GGML_ASSERT(err == 0.00000); - } -#endif } LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0); } diff --git a/tests/test-quant-q2kpt.cpp 
b/tests/test-quant-q2kpt.cpp index 1b43b8fa1f4..d40df2297fd 100644 --- a/tests/test-quant-q2kpt.cpp +++ b/tests/test-quant-q2kpt.cpp @@ -23,13 +23,16 @@ // --------------------------------------------------------------------------- // Declarations for Q2_KPT internals (all in libggml-base.so) // --------------------------------------------------------------------------- +#define Q2KPT_N_LEVELS 4 + extern "C" { void q2kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, float levels_out[4]); void q2kpt_set_levels(const float * levels); + void q2kpt_prepare_levels(int64_t nrows, int64_t n_per_row); const float *q2kpt_get_levels(void); size_t quantize_q2_kpt(const float * src, void * dst, - int64_t nrows, int64_t n_per_row, + int64_t start_row, int64_t nrows, int64_t n_per_row, const float * imatrix); } @@ -62,14 +65,22 @@ static float std_quant_rmse(ggml_type type, const float * data, } // Train Q2_KPT levels on data, quantize (with imatrix=1), dequantize, return RMSE. +// Q2_KPT has per-block levels (4 floats per 256-elem block), so we need to handle that. 
static float q2kpt_rmse(const float * data, size_t nrow, size_t n_per_row, float out_levels[4]) { std::vector imatrix(n_per_row, 1.0f); + // Train initial levels (used as starting point, but quantize will train per-row) q2kpt_train_levels(data, (int64_t)nrow, (int64_t)n_per_row, imatrix.data(), out_levels); - q2kpt_set_levels(out_levels); - + + const int nb = (int)(n_per_row / MY_QK_K); // blocks per row + const size_t total_levels = nrow * nb * Q2KPT_N_LEVELS; + std::vector all_levels(total_levels); + + // Prepare level storage for per-block levels + q2kpt_prepare_levels((int64_t)nrow, (int64_t)n_per_row); + const size_t rs = ggml_row_size(GGML_TYPE_Q2_KPT, n_per_row); std::vector qb(nrow * rs); std::vector dq(nrow * n_per_row); @@ -77,14 +88,21 @@ static float q2kpt_rmse(const float * data, size_t nrow, size_t n_per_row, for (size_t r = 0; r < nrow; ++r) { quantize_q2_kpt(data + r * n_per_row, qb.data() + r * rs, - 1, (int64_t)n_per_row, + r, 1, (int64_t)n_per_row, imatrix.data()); } + + // Get the trained per-block levels + const float * trained_levels = q2kpt_get_levels(); + memcpy(all_levels.data(), trained_levels, total_levels * sizeof(float)); + + // Dequant each row with its own per-block levels const ggml_type_traits * tr = ggml_get_type_traits(GGML_TYPE_Q2_KPT); for (size_t r = 0; r < nrow; ++r) { tr->to_float(qb.data() + r * rs, dq.data() + r * n_per_row, - (int64_t)n_per_row, out_levels); + (int64_t)n_per_row, + all_levels.data() + r * nb * Q2KPT_N_LEVELS); } return rmse(data, dq.data(), nrow * n_per_row); } @@ -178,8 +196,8 @@ int main(void) { // ----------------------------------------------------------------------- printf("=== Section 3: Uniform-level baseline ===\n"); { - float uniform_levels[4] = {0.0f, 1.0f/3.0f, 2.0f/3.0f, 1.0f}; - q2kpt_set_levels(uniform_levels); + float uniform_levels_4[4] = {0.0f, 1.0f/3.0f, 2.0f/3.0f, 1.0f}; + q2kpt_set_levels(uniform_levels_4); std::vector data(nrow * n_per_row); std::normal_distribution dist(0.0f, 0.02f); 
@@ -189,16 +207,25 @@ int main(void) { float r_q2k = std_quant_rmse(GGML_TYPE_Q2_K, data.data(), nrow, n_per_row); // Q2_KPT with uniform levels (no re-training) + // Need per-block levels: repeat uniform_levels for each block + const int nb = n_per_row / MY_QK_K; // blocks per row + std::vector uniform_levels(nb * 4); + for (int b = 0; b < nb; ++b) { + for (int k = 0; k < 4; ++k) { + uniform_levels[b * 4 + k] = uniform_levels_4[k]; + } + } + const size_t rs = ggml_row_size(GGML_TYPE_Q2_KPT, n_per_row); std::vector qb(nrow * rs); std::vector dq(nrow * n_per_row); for (size_t r = 0; r < nrow; ++r) { quantize_q2_kpt(data.data() + r * n_per_row, - qb.data() + r * rs, 1, (int64_t)n_per_row, nullptr); + qb.data() + r * rs, r, 1, (int64_t)n_per_row, nullptr); } const ggml_type_traits * tr = ggml_get_type_traits(GGML_TYPE_Q2_KPT); for (size_t r = 0; r < nrow; ++r) { - tr->to_float(qb.data() + r * rs, dq.data() + r * n_per_row, (int64_t)n_per_row, uniform_levels); + tr->to_float(qb.data() + r * rs, dq.data() + r * n_per_row, (int64_t)n_per_row, uniform_levels.data()); } float r_kpt_unif = rmse(data.data(), dq.data(), nrow * n_per_row); float ratio = r_kpt_unif / (r_q2k + 1e-10f); @@ -233,7 +260,7 @@ int main(void) { std::vector qb(rs); std::vector dq(MY_QK_K); - quantize_q2_kpt(data.data(), qb.data(), 1, MY_QK_K, nullptr); + quantize_q2_kpt(data.data(), qb.data(), 0, 1, MY_QK_K, nullptr); const ggml_type_traits * tr = ggml_get_type_traits(GGML_TYPE_Q2_KPT); tr->to_float(qb.data(), dq.data(), MY_QK_K, ulev); @@ -296,7 +323,7 @@ int main(void) { const size_t rs = ggml_row_size(GGML_TYPE_Q2_KPT, ne0); std::vector qw(ne1 * rs); for (int r = 0; r < ne1; ++r) - quantize_q2_kpt(weights.data() + r * ne0, qw.data() + r * rs, 1, ne0, nullptr); + quantize_q2_kpt(weights.data() + r * ne0, qw.data() + r * rs, r, 1, ne0, nullptr); // Reference: dequant weights x float acts const ggml_type_traits * tr = ggml_get_type_traits(GGML_TYPE_Q2_KPT); diff --git a/tests/test-quantize-fns.cpp 
b/tests/test-quantize-fns.cpp index fcb02ddba59..d03df2ec71f 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -91,7 +91,7 @@ static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_tr vdot->from_float(test_data2, tmp_q2.data(), test_size); float result = INFINITY; - qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1); + qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1, nullptr); const float dot_ref = dot_product(test_data1, test_data2, test_size); diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index 6b0d87ed5e4..92597a04f46 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -341,7 +341,7 @@ int main(int argc, char * argv[]) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void) -> float { float result; - qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1); + qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1, nullptr); return result; }; size_t quantized_size = ggml_row_size(type, size);