Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/quantize-stats/quantize-stats.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ static void test_roundtrip_on_chunk(
}

if (use_reference) {
qfns.from_float_reference(input_scratch, quantized_scratch, chunk_size);
qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
} else {
qfns.from_float(input_scratch, quantized_scratch, chunk_size);
}
Expand Down
50 changes: 25 additions & 25 deletions ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -714,9 +714,9 @@ extern "C" {
GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN

GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type);
GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
GGML_API GGML_CALL int64_t ggml_blck_size(enum ggml_type type);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is more than just a name change but returning a larger integer should be safe I think.

GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row

GGML_DEPRECATED(
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
Expand Down Expand Up @@ -2410,31 +2410,31 @@ extern "C" {
#endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr,
int64_t k, int64_t bx);
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef void (*ggml_from_float_to_mat_t)
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);

typedef struct {
const char * type_name;
int blck_size;
size_t type_size;
bool is_quantized;
ggml_to_float_t to_float;
ggml_from_float_t from_float;
ggml_from_float_t from_float_reference;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously;
int64_t ncols; // number of columns to process simultaneously;
int64_t interleave_blcksize; // interleave elements in blocks of interleave_blcksize;
const char * type_name;
int64_t blck_size;
int64_t blck_size_interleave; // interleave elements in blocks
size_t type_size;
bool is_quantized;
ggml_to_float_t to_float;
ggml_from_float_t from_float;
ggml_from_float_t from_float_ref;
ggml_from_float_to_mat_t from_float_to_mat;
ggml_gemv_t gemv;
ggml_gemm_t gemm;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously
int64_t ncols; // number of columns to process simultaneously
ggml_gemv_t gemv;
ggml_gemm_t gemm;
} ggml_type_traits_t;

GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
Expand Down
68 changes: 36 additions & 32 deletions ggml/src/ggml-aarch64.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,51 +20,51 @@

// Functions to create the interleaved data layout formats

// interleave 4 block_q4_0s in blocks of interleave_blcksize
// interleave 4 block_q4_0s in blocks of blck_size_interleave
// returns an interleaved block_q4_0x4
// in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks
// first, then interleave quants from 4 block_q4_0s in blocks of interleave_blcksize
// first, then interleave quants from 4 block_q4_0s in blocks of blck_size_interleave
//
// - in : an array of block_q4_0 pointers
// - interleave_blcksize : the block_q4_0 quants bytes are interleaved in blocks of
// interleave_blcksize bytes
// - blck_size_interleave : the block_q4_0 quants bytes are interleaved in blocks of
// blck_size_interleave bytes
// - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes
// from bias offset form to pure sign form (this saves subtract
// operations durin unpacking)
//
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int interleave_blcksize, unsigned int xor_mask) {
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
block_q4_0x4 out;

for (int i = 0; i < 4; i++) {
out.d[i] = in[i].d;
}

for (int i = 0; i < QK4_0 * 2; i++) {
int src_offset = (i / (4 * interleave_blcksize)) * interleave_blcksize;
int src_id = (i % (4 * interleave_blcksize)) / interleave_blcksize;
src_offset += (i % interleave_blcksize);
int src_offset = (i / (4 * blck_size_interleave)) * blck_size_interleave;
int src_id = (i % (4 * blck_size_interleave)) / blck_size_interleave;
src_offset += (i % blck_size_interleave);

out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
}

return out;
}

// interleave 8 block_q4_0s in blocks of interleave_blcksize
// interleave 8 block_q4_0s in blocks of blck_size_interleave
// returns an interleaved block_q4_0x8
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
// first, then interleave quants from 8 block_q4_0s in blocks of interleave_blcksize
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int interleave_blcksize, unsigned int xor_mask) {
// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
block_q4_0x8 out;

for (int i = 0; i < 8; i++) {
out.d[i] = in[i].d;
}

for (int i = 0; i < QK4_0 * 4; i++) {
int src_offset = (i / (8 * interleave_blcksize)) * interleave_blcksize;
int src_id = (i % (8 * interleave_blcksize)) / interleave_blcksize;
src_offset += (i % interleave_blcksize);
int src_offset = (i / (8 * blck_size_interleave)) * blck_size_interleave;
int src_id = (i % (8 * blck_size_interleave)) / blck_size_interleave;
src_offset += (i % blck_size_interleave);

out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
}
Expand Down Expand Up @@ -135,7 +135,7 @@ void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k)
}
#else
// scalar
const int interleave_blcksize = 4;
const int blck_size_interleave = 4;
float srcv[4][QK8_0];
float id[4];

Expand All @@ -155,12 +155,12 @@ void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k)
}

for (int j = 0; j < QK8_0 * 4; j++) {
int src_offset = (j / (4 * interleave_blcksize)) * interleave_blcksize;
int src_id = (j % (4 * interleave_blcksize)) / interleave_blcksize;
src_offset += (j % interleave_blcksize);
int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
src_offset += (j % blck_size_interleave);

float x0 = srcv[src_id][src_offset] * id[src_id];
y[i].qs[j] = roundf(x0);;
y[i].qs[j] = roundf(x0);
}
}
#endif
Expand Down Expand Up @@ -253,7 +253,7 @@ void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int64_t k)
}
#else
// scalar
const int interleave_blcksize = 8;
const int blck_size_interleave = 8;
float srcv[4][QK8_0];
float id[4];

Expand All @@ -273,26 +273,30 @@ void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int64_t k)
}

for (int j = 0; j < QK8_0 * 4; j++) {
int src_offset = (j / (4 * interleave_blcksize)) * interleave_blcksize;
int src_id = (j % (4 * interleave_blcksize)) / interleave_blcksize;
src_offset += (j % interleave_blcksize);
int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
src_offset += (j % blck_size_interleave);

float x0 = srcv[src_id][src_offset] * id[src_id];
y[i].qs[j] = roundf(x0);;
y[i].qs[j] = roundf(x0);
}
}
#endif
}

void quantize_mat_q8_0(const float * restrict x, void * restrict vy, int64_t nrow, int64_t n_per_row, int64_t interleave_blcksize) {
void quantize_mat_q8_0(const float * restrict x, void * restrict vy, int64_t nrow, int64_t n_per_row, int64_t blck_size_interleave) {
assert(nrow == 4);
UNUSED(nrow);
if (interleave_blcksize == 4) quantize_q8_0_4x4(x, vy, n_per_row);
else if (interleave_blcksize == 8) quantize_q8_0_4x8(x, vy, n_per_row);
else assert(false);
if (blck_size_interleave == 4) {
quantize_q8_0_4x4(x, vy, n_per_row);
} else if (blck_size_interleave == 8) {
quantize_q8_0_4x8(x, vy, n_per_row);
} else {
assert(false);
}
}

static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int interleave_blcksize) {
static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blck_size_interleave) {
assert(n_per_row % QK4_0 == 0);
const int nb = n_per_row / QK4_0;

Expand All @@ -311,15 +315,15 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds
for (int64_t x = 0; x < nb; x++) {

for (int i = 0; i < nrows_interleaved; i++ ) {
quantize_row_q4_0_reference(src + b + i * n_per_row + x * QK4_0, (block_q4_0 *) dst_tmp + i, QK4_0);
quantize_row_q4_0_ref(src + b + i * n_per_row + x * QK4_0, (block_q4_0 *) dst_tmp + i, QK4_0);
}

if (nrows_interleaved == 8) {
*(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, interleave_blcksize, 0x88);
*(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave, 0x88);
out_ptr = (block_q4_0x8 *) out_ptr + 1;
}
else if (nrows_interleaved == 4) {
*(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, interleave_blcksize, 0x88);
*(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave, 0x88);
out_ptr = (block_q4_0x4 *) out_ptr + 1;
}
}
Expand Down
14 changes: 7 additions & 7 deletions ggml/src/ggml-aarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,22 +16,22 @@ extern "C" {
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t interleave_blcksize);
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);

// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

// GEMV
void ggml_gemv_q4_0_4x4_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

// GEMM
void ggml_gemm_q4_0_4x4_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

#ifdef __cplusplus
}
Expand Down
Loading