Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions ggml/include/ggml-cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,13 +111,14 @@ extern "C" {
// Internal types and functions exposed for tests and benchmarks

typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
const void * GGML_RESTRICT y, size_t by, int nrc, const void * levels);

struct ggml_type_traits_cpu {
ggml_from_float_t from_float;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously
int64_t nrows; // number of rows to process simultaneously
size_t levels_row_stride; // bytes to add per row to get next row's quant_levels (0 = per-tensor)
};

GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
Expand Down
19 changes: 14 additions & 5 deletions ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,12 @@ extern "C" {
// GGML_TYPE_IQ4_NL_4_8 = 37,
// GGML_TYPE_IQ4_NL_8_8 = 38,
GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
GGML_TYPE_COUNT = 40,
GGML_TYPE_Q3_PT = 40, // 3.875 bpw per-tensor Lloyd-Max, 16-elem affine sub-blocks
GGML_TYPE_Q3_KPT = 41, // Q3_K with learned per-tensor levels (3.4375 bpw)
GGML_TYPE_Q4_DPT = 42, // IQ4_NL with learned per-tensor int8 levels (4.125 bpw)
GGML_TYPE_Q2_DPT = 43, // 2-bit with learned per-tensor int8 levels (2.5 bpw)
GGML_TYPE_Q2_KPT = 44, // Q2_K with learned per-tensor float levels (2.625 bpw)
GGML_TYPE_COUNT = 45,
};

// precision
Expand Down Expand Up @@ -455,6 +460,7 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
GGML_FTYPE_MOSTLY_Q3_PT = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
Expand All @@ -463,6 +469,9 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_Q3_KPT = 27, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_DPT = 28, // except 1d tensors
GGML_FTYPE_MOSTLY_Q2_KPT = 29, // except 1d tensors
};

// available tensor operations:
Expand Down Expand Up @@ -681,9 +690,8 @@ extern "C" {

char name[GGML_MAX_NAME];

void * extra; // extra things e.g. for ggml-cuda.cu

char padding[8];
void * extra; // extra things e.g. for ggml-cuda.cu
void * quant_levels; // per-tensor quantization levels (replaces char padding[8]; same size on 64-bit)
};

static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
Expand Down Expand Up @@ -2701,7 +2709,7 @@ extern "C" {
# define GGML_RESTRICT restrict
# endif
#endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

struct ggml_type_traits {
Expand All @@ -2712,6 +2720,7 @@ extern "C" {
bool is_quantized;
ggml_to_float_t to_float;
ggml_from_float_t from_float_ref;
size_t levels_row_stride; // bytes to advance quant_levels per row (0 = per-tensor)
};

GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
Expand Down
17 changes: 14 additions & 3 deletions ggml/src/ggml-blas/ggml-blas.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
#include "ggml-impl.h"
#include "ggml-blas.h"

// Helper: compute the byte stride between consecutive rows' quant_levels data.
//
// Most PT ("per-tensor") types have either a single shared level table
// (constant_stride == 0) or a fixed per-row stride taken from the type traits.
// Q2_KPT is the exception: it stores a level table per super-block, so the
// per-row stride scales with the row width ne0.
//
// params:
//   type            - quantization type of the tensor
//   constant_stride - levels_row_stride from the type traits (0 = per-tensor table)
//   ne0             - number of elements per row (tensor width)
// returns: bytes to advance the quant_levels pointer per row
//
// NOTE(review): 256 is presumably QK_K (elements per super-block) and 4 is
// presumably Q2KPT_N_LEVELS (float levels per block) — confirm against
// ggml-common.h and consider using the named macros if they are in scope here.
static inline size_t ggml_quant_levels_stride(ggml_type type, size_t constant_stride, int64_t ne0) {
    if (type == GGML_TYPE_Q2_KPT) {
        return (size_t)(ne0 / 256) * 4 * sizeof(float);
    }
    return constant_stride;
}

#include "ggml-backend-impl.h"

#include <future>
Expand Down Expand Up @@ -77,10 +87,11 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1);
const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1);

const size_t lrs = ggml_quant_levels_stride(src0->type, ggml_get_type_traits(src0->type)->levels_row_stride, src0->ne[0]);
#ifdef GGML_USE_OPENMP
#pragma omp parallel for num_threads(n_threads)
for (int64_t i01 = 0; i01 < ne01; i01++) {
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, (const char*)src0->quant_levels + i01*lrs);
}
#else
for (int i = 1; i < n_threads; i++) {
Expand All @@ -89,7 +100,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
if (start < end) {
ctx->tasks.push_back(std::async(std::launch::async, [=]() {
for (int64_t i01 = start; i01 < end; i01++) {
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, (const char*)src0->quant_levels + i01*lrs);
}
}));
}
Expand All @@ -99,7 +110,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
const int64_t start = 0;
const int64_t end = ne01/n_threads;
for (int64_t i01 = start; i01 < end; i01++) {
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, (const char*)src0->quant_levels + i01*lrs);
}
}
#endif
Expand Down
42 changes: 42 additions & 0 deletions ggml/src/ggml-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@ typedef struct {
} block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");


// 3-bit quantization
// weight is represented as x = a * q
// 16 blocks of 16 elements each
Expand Down Expand Up @@ -305,6 +306,12 @@ typedef struct {
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");

// Q3_KPT: Q3_K with learned per-tensor levels
// Reuses block_q3_K structure but maps 3-bit indices through learned level table
typedef block_q3_K block_q3_kpt;
#define Q3KPT_N_LEVELS 8


// 5-bit quantization
// 8 blocks of 32 elements each
// weight is represented as x = a * q + b
Expand Down Expand Up @@ -427,6 +434,41 @@ typedef struct {
} block_iq4_xs;
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");

// 3.875 bpw - per-tensor Lloyd-Max scalar quantization
// 256 elements = 16 sub-blocks of 16, 8-entry level table trained per tensor
// Layout: 2 (d) + 2 (dmin) + 24 (scales: 32x6-bit) + 96 (qs: 256x3-bit) = 124 bytes
typedef struct {
ggml_half d; // 2 bytes: global scale for 16-elem sub-block ranges
ggml_half dmin; // 2 bytes: global scale for sub-block neg_mins
uint8_t scales[3*QK_K/32]; // 24 bytes: 32 x 6-bit (indices 0..15 = ranges, 16..31 = neg_mins)
uint8_t qs[3*QK_K/8]; // 96 bytes: 256 x 3-bit Lloyd-Max level index, sequential
} block_q3_pt;
static_assert(sizeof(block_q3_pt) == 124, "wrong q3_pt block size");

#define Q3PT_N_LEVELS 8

// Q4_DPT: IQ4_NL with learned per-tensor int8 levels (4.125 bpw)
// Block format: identical to block_iq4_nl (2 + 16 = 18 bytes per 32 elements)
typedef block_iq4_nl block_q4_dpt;
#define Q4DPT_N_LEVELS 16

// Q2_DPT: 2-bit per-tensor Lloyd-Max scalar quantization (2.5 bpw)
// Block format: 2 bytes (FP16 scale) + 8 bytes (2-bit indices for 32 elements) = 10 bytes per block
// 4 learned int8 levels per tensor, optimized via Lloyd-Max k-means
typedef struct {
ggml_half d; // 2 bytes: FP16 scale (delta)
uint8_t qs[8]; // 8 bytes: 2-bit indices (4 values per byte, 32 elements total)
} block_q2_dpt;
static_assert(sizeof(block_q2_dpt) == sizeof(ggml_half) + 8, "wrong q2_dpt block size/padding");

#define QK2_DPT 32
#define Q2DPT_N_LEVELS 4

// Q2_KPT: Q2_K with learned per-tensor float levels (2.625 bpw)
// Reuses block_q2_K structure but maps 2-bit indices through learned level table
typedef block_q2_K block_q2_kpt;
#define Q2KPT_N_LEVELS 4

#endif // GGML_COMMON_DECL

Expand Down
6 changes: 6 additions & 0 deletions ggml/src/ggml-cpu/arch-fallback.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
#define ggml_vec_dot_q3_pt_q8_K_generic ggml_vec_dot_q3_pt_q8_K
#define ggml_vec_dot_q4_dpt_q8_0_generic ggml_vec_dot_q4_dpt_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
Expand Down Expand Up @@ -184,6 +186,8 @@
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
#define ggml_vec_dot_q3_pt_q8_K_generic ggml_vec_dot_q3_pt_q8_K
#define ggml_vec_dot_q4_dpt_q8_0_generic ggml_vec_dot_q4_dpt_q8_0
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
Expand Down Expand Up @@ -276,6 +280,8 @@
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
#define ggml_vec_dot_q3_pt_q8_K_generic ggml_vec_dot_q3_pt_q8_K
#define ggml_vec_dot_q4_dpt_q8_0_generic ggml_vec_dot_q4_dpt_q8_0
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
Expand Down
14 changes: 9 additions & 5 deletions ggml/src/ggml-cpu/arch/arm/quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in

//===================================== Dot products =================================

void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;

Expand Down Expand Up @@ -430,7 +430,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
}

void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_1;
const int nb = n / qk;

Expand Down Expand Up @@ -650,7 +650,7 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
*s = sumf;
}

void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;

Expand Down Expand Up @@ -762,7 +762,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
}

void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_1;
const int nb = n / qk;

Expand Down Expand Up @@ -880,7 +880,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
}

void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;

Expand Down Expand Up @@ -3766,6 +3766,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
#endif
}

// Q3_PT x Q8_K dot product — no ARM-specific kernel yet, forward to the
// scalar/generic implementation.
//
// NOTE(review): this PR changes the ggml_vec_dot_t typedef to carry a trailing
// `const void * levels` (per-tensor quantization level table); every vec_dot
// called through the type-traits function-pointer table must match that
// signature, otherwise the indirect call is undefined behavior in C. The
// original added wrapper omitted the parameter — it is added and forwarded
// here. Assumes ggml_vec_dot_q3_pt_q8_K_generic was updated to the new
// signature as well — confirm against ggml-cpu/quants.c.
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
}

void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
Expand Down
22 changes: 13 additions & 9 deletions ggml/src/ggml-cpu/arch/loongarch/quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -644,7 +644,7 @@ static inline __m128i get_scale_shuffle(int i) {
}
#endif

void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;

Expand Down Expand Up @@ -772,7 +772,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
}

void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_1;
const int nb = n / qk;

Expand Down Expand Up @@ -827,11 +827,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}

void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;

Expand Down Expand Up @@ -880,11 +880,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(sumf);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}

void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_1;
const int nb = n / qk;

Expand Down Expand Up @@ -936,11 +936,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(sumf);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}

void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;

Expand Down Expand Up @@ -983,7 +983,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(sumf);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}

Expand Down Expand Up @@ -1956,6 +1956,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
#endif
}

// Q3_PT x Q8_K dot product — no LoongArch-specific kernel yet, forward to the
// scalar/generic implementation.
//
// NOTE(review): signature updated to match the new ggml_vec_dot_t typedef,
// which now takes a trailing `const void * levels` (per-tensor quantization
// level table). Without it, invoking this function through the type-traits
// function-pointer table is undefined behavior in C; the other vec_dot
// functions in this file already forward `levels` the same way. Assumes
// ggml_vec_dot_q3_pt_q8_K_generic was updated to the new signature as well —
// confirm against ggml-cpu/quants.c.
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
}

#if defined(__loongarch_asx)
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
const __m256i a = __lasx_xvmulwev_h_b(x, y);
Expand Down
Loading