Skip to content

Commit 4ade4c5

Browse files
ikawrakow and Iwan Kawrakow authored
IQ2_K_R4 (#146)
* iq2_k_r4: Zen4
* iq2_k_r4: NEON
* iq2_k_r4: better matrix x vector multiplication on NEON

---------

Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent d69344f commit 4ade4c5

File tree

10 files changed

+430
-11
lines changed

10 files changed

+430
-11
lines changed

examples/quantize/quantize.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
5353
{ "IQ4_KS", LLAMA_FTYPE_MOSTLY_IQ4_KS, " 4.25 bpw non-linear quantization", },
5454
{ "IQ4_KSS", LLAMA_FTYPE_MOSTLY_IQ4_KSS, " 4.0 bpw non-linear quantization", },
5555
{ "IQ2_K", LLAMA_FTYPE_MOSTLY_IQ2_K, " 2.375 bpw non-linear quantization",},
56+
{ "IQ2_K_R4", LLAMA_FTYPE_MOSTLY_IQ2_K_R4, "IQ2_K repacked",},
5657
{ "IQ2_KS", LLAMA_FTYPE_MOSTLY_IQ2_KS, " 2.1875 bpw non-linear quantization",},
5758
{ "IQ3_K", LLAMA_FTYPE_MOSTLY_IQ3_K, " 3.44 bpw non-linear quantization", },
5859
{ "IQ3_K_R4", LLAMA_FTYPE_MOSTLY_IQ3_K_R4, "IQ3_K repacked", },

ggml/include/ggml.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,7 @@ extern "C" {
423423
GGML_TYPE_BF16_R16 = 230,
424424
GGML_TYPE_Q6_0_R4 = 233,
425425
GGML_TYPE_IQ2_BN_R4 = 335,
426+
GGML_TYPE_IQ2_K_R4 = 337,
426427
GGML_TYPE_IQ3_K_R4 = 338,
427428
GGML_TYPE_IQ4_K_R4 = 339,
428429
GGML_TYPE_Q8_K_R8 = 399,
@@ -498,6 +499,7 @@ extern "C" {
498499
GGML_FTYPE_MOSTLY_BF16_R16 = 224, // except 1d tensors
499500
GGML_FTYPE_MOSTLY_Q6_0_R4 = 227, // except 1d tensors
500501
GGML_FTYPE_MOSTLY_IQ2_BN_R4 = 329, // except 1d tensors
502+
GGML_FTYPE_MOSTLY_IQ2_K_R4 = 330, // except 1d tensors
501503
GGML_FTYPE_MOSTLY_IQ3_K_R4 = 331, // except 1d tensors
502504
GGML_FTYPE_MOSTLY_IQ4_K_R4 = 332, // except 1d tensors
503505
GGML_FTYPE_MOSTLY_Q8_K_R8 = 399, // except 1d tensors

ggml/src/ggml-common.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -521,6 +521,14 @@ typedef struct {
521521
} block_iq2_k;
522522
static_assert(sizeof(block_iq2_k) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/32 + QK_K/4, "wrong iq2_k block size/padding");
523523

524+
// Repacked ("_r4") counterpart of block_iq2_k. Each member is four times the
// size of the matching term in the block_iq2_k size assert, and the
// static_assert below pins the total to exactly 4*sizeof(block_iq2_k).
// NOTE(review): presumably holds four interleaved rows - confirm against the repacking code.
typedef struct {
525+
ggml_half d[4]; // 4 x ggml_half (the block_iq2_k assert counts one ggml_half)
526+
uint8_t extra[8]; // 8 bytes = 4 x sizeof(uint16_t)
527+
uint8_t scales[QK_K/8]; // 4 x QK_K/32 bytes
528+
uint8_t qs[QK_K]; // 4 x QK_K/4 bytes
529+
} block_iq2_k_r4;
530+
static_assert(sizeof(block_iq2_k_r4) == 4*sizeof(block_iq2_k), "wrong iq2_k_r4 block size/padding");
531+
524532
typedef struct {
525533
uint16_t extra;
526534
uint8_t scales[QK_K/64];

ggml/src/ggml-quants.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15207,6 +15207,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
1520715207
case GGML_TYPE_Q4_K_R4: break;
1520815208
case GGML_TYPE_Q5_K_R4: break;
1520915209
case GGML_TYPE_Q6_K_R4: break;
15210+
case GGML_TYPE_IQ2_K_R4: break;
1521015211
case GGML_TYPE_IQ3_K_R4: break;
1521115212
case GGML_TYPE_IQ4_K_R4: break;
1521215213
case GGML_TYPE_Q8_K_R8: break;

ggml/src/ggml.c

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1308,6 +1308,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
13081308
.nrows = 1,
13091309
.row_meta_size = 0,
13101310
},
1311+
// Type-traits table entry for GGML_TYPE_IQ2_K_R4: wires up the name, block
// geometry, and the (de)quantization and dot-product callbacks for this type.
[GGML_TYPE_IQ2_K_R4] = {
1312+
.type_name = "iq2_k_r4",
1313+
.blck_size = QK_K,
1314+
// NOTE(review): this is sizeof(block_iq2_k), not sizeof(block_iq2_k_r4).
// block_iq2_k_r4 is asserted to be 4*sizeof(block_iq2_k), so this looks like
// a per-row size for a block of QK_K values - confirm this is intentional.
.type_size = sizeof(block_iq2_k),
1315+
.is_quantized = true,
1316+
.to_float = (ggml_to_float_t) dequantize_row_iq2_k_r4,
1317+
.from_float = quantize_row_iq2_k_r4,
1318+
.from_float_ref = (ggml_from_float_t)quantize_row_iq2_k_r4_ref,
1319+
.vec_dot = vec_dot_iq2_k_r4_q8_k,
1320+
// The other operand of vec_dot is expected in Q8_K format.
.vec_dot_type = GGML_TYPE_Q8_K,
1321+
.nrows = 1,
1322+
.row_meta_size = 0,
1323+
},
13111324
[GGML_TYPE_IQ2_KS] = {
13121325
.type_name = "iq2_ks",
13131326
.blck_size = QK_K,
@@ -4173,6 +4186,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
41734186
case GGML_FTYPE_MOSTLY_IQ4_KS: wtype = GGML_TYPE_IQ4_KS; break;
41744187
case GGML_FTYPE_MOSTLY_IQ4_KSS: wtype = GGML_TYPE_IQ4_KSS; break;
41754188
case GGML_FTYPE_MOSTLY_IQ2_K: wtype = GGML_TYPE_IQ2_K; break;
4189+
case GGML_FTYPE_MOSTLY_IQ2_K_R4: wtype = GGML_TYPE_IQ2_K_R4; break;
41764190
case GGML_FTYPE_MOSTLY_IQ2_KS: wtype = GGML_TYPE_IQ2_KS; break;
41774191
case GGML_FTYPE_MOSTLY_IQ3_K: wtype = GGML_TYPE_IQ3_K; break;
41784192
case GGML_FTYPE_MOSTLY_IQ4_K: wtype = GGML_TYPE_IQ4_K; break;
@@ -10711,6 +10725,7 @@ static void ggml_compute_forward_add(
1071110725
case GGML_TYPE_IQ4_KS:
1071210726
case GGML_TYPE_IQ4_KSS:
1071310727
case GGML_TYPE_IQ2_K:
10728+
case GGML_TYPE_IQ2_K_R4:
1071410729
case GGML_TYPE_IQ2_KS:
1071510730
case GGML_TYPE_IQ3_K:
1071610731
case GGML_TYPE_IQ4_K:
@@ -11168,6 +11183,7 @@ static void ggml_compute_forward_add1(
1116811183
case GGML_TYPE_IQ4_KS:
1116911184
case GGML_TYPE_IQ4_KSS:
1117011185
case GGML_TYPE_IQ2_K:
11186+
case GGML_TYPE_IQ2_K_R4:
1117111187
case GGML_TYPE_IQ2_KS:
1117211188
case GGML_TYPE_IQ3_K:
1117311189
case GGML_TYPE_IQ4_K:
@@ -11322,6 +11338,7 @@ static void ggml_compute_forward_acc(
1132211338
case GGML_TYPE_IQ4_KS:
1132311339
case GGML_TYPE_IQ4_KSS:
1132411340
case GGML_TYPE_IQ2_K:
11341+
case GGML_TYPE_IQ2_K_R4:
1132511342
case GGML_TYPE_IQ2_KS:
1132611343
case GGML_TYPE_IQ3_K:
1132711344
case GGML_TYPE_IQ4_K:
@@ -14522,6 +14539,7 @@ static void ggml_compute_forward_out_prod(
1452214539
case GGML_TYPE_IQ4_KS:
1452314540
case GGML_TYPE_IQ4_KSS:
1452414541
case GGML_TYPE_IQ2_K:
14542+
case GGML_TYPE_IQ2_K_R4:
1452514543
case GGML_TYPE_IQ2_KS:
1452614544
case GGML_TYPE_IQ3_K:
1452714545
case GGML_TYPE_IQ4_K:
@@ -14916,6 +14934,7 @@ static void ggml_compute_forward_set(
1491614934
case GGML_TYPE_IQ4_KS:
1491714935
case GGML_TYPE_IQ4_KSS:
1491814936
case GGML_TYPE_IQ2_K:
14937+
case GGML_TYPE_IQ2_K_R4:
1491914938
case GGML_TYPE_IQ2_KS:
1492014939
case GGML_TYPE_IQ3_K:
1492114940
case GGML_TYPE_IQ4_K:
@@ -15204,6 +15223,7 @@ static void ggml_compute_forward_get_rows(
1520415223
case GGML_TYPE_IQ4_KS:
1520515224
case GGML_TYPE_IQ4_KSS:
1520615225
case GGML_TYPE_IQ2_K:
15226+
case GGML_TYPE_IQ2_K_R4:
1520715227
case GGML_TYPE_IQ2_KS:
1520815228
case GGML_TYPE_IQ3_K:
1520915229
case GGML_TYPE_IQ4_K:
@@ -15821,6 +15841,7 @@ static void ggml_compute_forward_clamp(
1582115841
case GGML_TYPE_IQ4_KS:
1582215842
case GGML_TYPE_IQ4_KSS:
1582315843
case GGML_TYPE_IQ2_K:
15844+
case GGML_TYPE_IQ2_K_R4:
1582415845
case GGML_TYPE_IQ2_KS:
1582515846
case GGML_TYPE_IQ3_K:
1582615847
case GGML_TYPE_IQ4_K:
@@ -22666,6 +22687,7 @@ size_t ggml_quantize_chunk(
2266622687
case GGML_TYPE_IQ4_KS: result = quantize_iq4_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
2266722688
case GGML_TYPE_IQ4_KSS: result = quantize_iq4_kss(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
2266822689
case GGML_TYPE_IQ2_K: result = quantize_iq2_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
22690+
case GGML_TYPE_IQ2_K_R4:result = quantize_iq2_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
2266922691
case GGML_TYPE_IQ2_KS: result = quantize_iq2_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
2267022692
case GGML_TYPE_IQ3_K: result = quantize_iq3_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
2267122693
case GGML_TYPE_IQ4_K: result = quantize_iq4_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;

0 commit comments

Comments
 (0)