Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IQ1_S_R4: better 1.5 bpw quants #185

Merged
merged 10 commits into from
Feb 5, 2025
2 changes: 2 additions & 0 deletions examples/quantize/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
{ "IQ2_M_R4", LLAMA_FTYPE_MOSTLY_IQ2_M_R4, " 2.7 bpw quantization", },
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
{ "IQ1_S_R4", LLAMA_FTYPE_MOSTLY_IQ1_S_R4, " 1.5 bpw quantization", },
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
{ "IQ1_BN", LLAMA_FTYPE_MOSTLY_IQ1_BN, " 1.62 bpw quantization (Bitnet)", },
{ "IQ2_BN", LLAMA_FTYPE_MOSTLY_IQ2_BN, " 2.00 bpw quantization (Bitnet)", },
Expand Down Expand Up @@ -510,6 +511,7 @@ int main(int argc, char ** argv) {
params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 ||
params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4 ||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4 ||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M)) {
fprintf(stderr, "\n==========================================================================================================\n");
fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
Expand Down
2 changes: 2 additions & 0 deletions ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -427,6 +427,7 @@ extern "C" {
GGML_TYPE_IQ2_XXS_R4= 216,
GGML_TYPE_IQ2_XS_R4 = 217,
GGML_TYPE_IQ3_XXS_R4= 218,
GGML_TYPE_IQ1_S_R4 = 219,
GGML_TYPE_IQ4_NL_R4 = 220,
GGML_TYPE_IQ3_S_R4 = 221,
GGML_TYPE_IQ2_S_R4 = 222,
Expand Down Expand Up @@ -510,6 +511,7 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ2_XXS_R4= 215, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ2_XS_R4 = 216, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ3_XXS_R4= 217, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_S_R4 = 218, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ4_NL_R4 = 219, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ3_S_R4 = 220, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ2_S_R4 = 221, // except 1d tensors
Expand Down
6 changes: 6 additions & 0 deletions ggml/src/ggml-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,12 @@ typedef struct {
} block_iq1_s;
static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");

// 1.5 bpw, row-interleaved variant of IQ1_S ("_r4" = 4 rows interleaved):
// one block packs the quants of a 32-element group from each of 4 consecutive
// rows, 24 bytes total (16 + 8), i.e. 6 bytes per 32 weights = 1.5 bpw.
// Per-row scales are not stored here; they live in the row metadata
// (see row_meta_size for GGML_TYPE_IQ1_S_R4 in ggml.c).
typedef struct {
uint8_t qs[16]; // low 8 bits of the IQ1_S grid indices, 4 bytes per row
uint16_t qh[4]; // one per row: high grid-index bits + shift/sign info -- NOTE(review): presumably same packing as block_iq1_s.qh, confirm against ggml-quants.c
} block_iq1_s_r4;
static_assert(sizeof(block_iq1_s_r4) == 24, "wrong iq1_s_r4 block size/padding");

// 1.75 bpw
typedef struct {
uint8_t qs[QK_K/8]; // grid index, low 8 bits
Expand Down
289 changes: 203 additions & 86 deletions ggml/src/ggml-quants.c

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions ggml/src/ggml-quants.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGM
void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
void quantize_row_iq1_bn_ref (const float * GGML_RESTRICT x, block_iq1_bn * GGML_RESTRICT y, int64_t k);
void quantize_row_iq1_s_ref (const float * GGML_RESTRICT x, block_iq1_s * GGML_RESTRICT y, int64_t k);

void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
Expand All @@ -66,6 +67,7 @@ void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq1_bn (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq1_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

// Dequantization
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
Expand Down Expand Up @@ -148,6 +150,9 @@ void iq2xs_free_impl(enum ggml_type type);
void iq3xs_init_impl(int grid_size);
void iq3xs_free_impl(int grid_size);

// Shared IQ1_S quantization kernel: finds the best 1-bit grid representation
// for one block of `block_size` values `xb` with importance weights `weight`.
// Outputs: quant levels in `L`, block scale in `the_scale`, packed grid index
// in `the_index`, and the chosen shift in `the_shift`; `pairs`, `sumx`, `sumw`
// are caller-provided scratch buffers.
// NOTE(review): exact scratch-buffer sizing/contract is defined by the
// implementation in ggml-quants.c (not visible here) -- confirm before use.
// Exposed so both the iq1_s and iq1_s_r4 quantizers can reuse it.
void iq1s_process_1block(int block_size, const float * xb, const float * weight, int8_t * L,
float * the_scale, uint16_t * the_index, int * the_shift, float * pairs, float * sumx, float * sumw);

#if defined(__ARM_FEATURE_SVE)
extern int ggml_sve_cnt_b;
#endif
Expand Down
27 changes: 25 additions & 2 deletions ggml/src/ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -1176,13 +1176,26 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.type_size = sizeof(block_iq1_s),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq1_s,
.from_float = NULL,
.from_float_ref = NULL,
.from_float = quantize_row_iq1_s,
.from_float_ref = (ggml_from_float_t)quantize_row_iq1_s_ref,
.vec_dot = ggml_vec_dot_iq1_s_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
.row_meta_size = 0,
},
// Type traits for the 4-row-interleaved 1.5 bpw type. Unlike the QK_K-based
// types above, the logical block is 32 elements; one block_iq1_s_r4 (24 bytes)
// covers 32 elements of 4 rows, hence type_size = sizeof(block)/4 = 6 bytes
// per 32 weights = 1.5 bpw.
[GGML_TYPE_IQ1_S_R4] = {
.type_name = "iq1_s_r4",
.blck_size = 32,
.type_size = sizeof(block_iq1_s_r4)/4,
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq1_s_r4,
.from_float = quantize_row_iq1_s_r4,
.from_float_ref = (ggml_from_float_t)quantize_row_iq1_s_r4_ref,
.vec_dot = vec_dot_iq1_s_r4_q8_k,
.vec_dot_type = GGML_TYPE_Q8_1_X4, // activations quantized to Q8_1_X4 for the dot product
.nrows = 1,
.row_meta_size = 2, // NOTE(review): 2 bytes of per-row metadata -- presumably one fp16 row scale; confirm in quantize_iq1_s_r4
},
[GGML_TYPE_IQ1_M] = {
.type_name = "iq1_m",
.blck_size = QK_K,
Expand Down Expand Up @@ -4387,6 +4400,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
case GGML_FTYPE_MOSTLY_IQ3_S_R4: wtype = GGML_TYPE_IQ3_S_R4; break;
case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
case GGML_FTYPE_MOSTLY_IQ2_S_R4: wtype = GGML_TYPE_IQ2_S_R4; break;
case GGML_FTYPE_MOSTLY_IQ1_S_R4: wtype = GGML_TYPE_IQ1_S_R4; break;
case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break;
case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break;
case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break;
Expand Down Expand Up @@ -10934,6 +10948,7 @@ static void ggml_compute_forward_add(
case GGML_TYPE_IQ3_S_R4:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_S_R4:
case GGML_TYPE_IQ1_S_R4:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
Expand Down Expand Up @@ -11402,6 +11417,7 @@ static void ggml_compute_forward_add1(
case GGML_TYPE_IQ3_S_R4:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_S_R4:
case GGML_TYPE_IQ1_S_R4:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
Expand Down Expand Up @@ -11567,6 +11583,7 @@ static void ggml_compute_forward_acc(
case GGML_TYPE_IQ3_S_R4:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_S_R4:
case GGML_TYPE_IQ1_S_R4:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
Expand Down Expand Up @@ -14805,6 +14822,7 @@ static void ggml_compute_forward_out_prod(
case GGML_TYPE_IQ3_S_R4:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_S_R4:
case GGML_TYPE_IQ1_S_R4:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
Expand Down Expand Up @@ -15210,6 +15228,7 @@ static void ggml_compute_forward_set(
case GGML_TYPE_IQ3_S_R4:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_S_R4:
case GGML_TYPE_IQ1_S_R4:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
Expand Down Expand Up @@ -15509,6 +15528,7 @@ static void ggml_compute_forward_get_rows(
case GGML_TYPE_IQ3_S_R4:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_S_R4:
case GGML_TYPE_IQ1_S_R4:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
Expand Down Expand Up @@ -16137,6 +16157,7 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_IQ3_S_R4:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_S_R4:
case GGML_TYPE_IQ1_S_R4:
case GGML_TYPE_Q8_K:
case GGML_TYPE_Q8_K64:
case GGML_TYPE_Q8_K16:
Expand Down Expand Up @@ -22893,6 +22914,7 @@ void ggml_quantize_init(enum ggml_type type) {
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break;
case GGML_TYPE_IQ1_S_R4:iq2xs_init_impl(GGML_TYPE_IQ1_S); break;
case GGML_TYPE_IQ3_XXS_R4:
case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
case GGML_TYPE_IQ3_S_R4:
Expand Down Expand Up @@ -22975,6 +22997,7 @@ size_t ggml_quantize_chunk(
case GGML_TYPE_IQ3_S_R4:result = quantize_iq3_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_S_R4:result = quantize_iq2_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ1_S_R4:result = quantize_iq1_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ1_BN: result = quantize_iq1_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
Expand Down
Loading