Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,9 @@ const std::vector<ggml_type> kv_cache_types = {
GGML_TYPE_TURBO2_0,
GGML_TYPE_TURBO3_0,
GGML_TYPE_TURBO4_0,
GGML_TYPE_TBQ2_0,
GGML_TYPE_TBQ3_0,
GGML_TYPE_TBQ4_0,
};

static ggml_type kv_cache_type_from_str(const std::string & s) {
Expand Down
5 changes: 4 additions & 1 deletion ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,10 @@ extern "C" {
GGML_TYPE_TURBO3_0 = 41, // TurboQuant 3-bit KV cache: 2-bit PolarQuant + 1-bit QJL
GGML_TYPE_TURBO4_0 = 42, // TurboQuant 4-bit KV cache: 3-bit PolarQuant + 1-bit QJL
GGML_TYPE_TURBO2_0 = 43, // TurboQuant 2-bit KV cache: 2-bit PolarQuant, no QJL
GGML_TYPE_COUNT = 44,
GGML_TYPE_TBQ3_0 = 44, // TBQ 3-bit KV cache: SRHT + Lloyd-Max 8-level codebook
GGML_TYPE_TBQ4_0 = 45, // TBQ 4-bit KV cache: SRHT + Lloyd-Max 16-level codebook
GGML_TYPE_TBQ2_0 = 46, // TBQ 2-bit KV cache: SRHT + Lloyd-Max 4-level codebook
GGML_TYPE_COUNT = 47,
};

// precision
Expand Down
1 change: 1 addition & 0 deletions ggml/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ add_library(ggml-base
ggml-quants.c
ggml-quants.h
ggml-turbo-quant.c
ggml-tbq-quant.c
gguf.cpp)

set_target_properties(ggml-base PROPERTIES
Expand Down
30 changes: 30 additions & 0 deletions ggml/src/ggml-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,36 @@ typedef struct {
} block_turbo4_0; // 68 bytes total
static_assert(sizeof(block_turbo4_0) == 2*sizeof(ggml_half) + QK_TURBO4*3/8 + QK_TURBO4/8, "wrong turbo4_0 block size/padding");

// TBQ 3-bit: SRHT + Lloyd-Max 8-level codebook
// Per block: 3-bit packed indices (48 bytes) + norm(fp16) = 50 bytes per 128 values
// = 3.125 bits/value → 5.12× compression vs fp16
#define QK_TBQ3 128
typedef struct {
uint8_t qs[48]; // 3-bit packed codebook indices (128 * 3 / 8 = 48)
ggml_half norm; // L2 norm of the original block
} block_tbq3_0;
static_assert(sizeof(block_tbq3_0) == 50, "wrong tbq3_0 block size/padding");

// TBQ 4-bit: SRHT + Lloyd-Max 16-level codebook
// Per block: 4-bit packed indices (64 bytes) + norm(fp16) = 66 bytes per 128 values
// = 4.125 bits/value → 3.88× compression vs fp16
#define QK_TBQ4 128
typedef struct {
uint8_t qs[64]; // 4-bit packed codebook indices (128 * 4 / 8 = 64)
ggml_half norm; // L2 norm of the original block
} block_tbq4_0;
static_assert(sizeof(block_tbq4_0) == 66, "wrong tbq4_0 block size/padding");

// TBQ 2-bit: SRHT + Lloyd-Max 4-level codebook
// Per block: 2-bit packed indices (32 bytes) + norm(fp16) = 34 bytes per 128 values
// = 2.125 bits/value → 7.53× compression vs fp16
#define QK_TBQ2 128
typedef struct {
uint8_t qs[32]; // 2-bit packed codebook indices (128 * 2 / 8 = 32)
ggml_half norm; // L2 norm (corrected: ||x|| / ||centroids||)
} block_tbq2_0;
static_assert(sizeof(block_tbq2_0) == 34, "wrong tbq2_0 block size/padding");

//
// Super-block quantization structures
//
Expand Down
18 changes: 18 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,24 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
[GGML_TYPE_TBQ2_0] = {
.from_float = quantize_row_tbq2_0,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
[GGML_TYPE_TBQ3_0] = {
.from_float = quantize_row_tbq3_0,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
[GGML_TYPE_TBQ4_0] = {
.from_float = quantize_row_tbq4_0,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
[GGML_TYPE_I32] = {
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
},
Expand Down
8 changes: 8 additions & 0 deletions ggml/src/ggml-cpu/quants.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,14 @@ void quantize_row_turbo2_0_ref(const float * GGML_RESTRICT x, block_turbo2_0 * G
void quantize_row_turbo3_0_ref(const float * GGML_RESTRICT x, block_turbo3_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_turbo4_0_ref(const float * GGML_RESTRICT x, block_turbo4_0 * GGML_RESTRICT y, int64_t k);

// TBQ (SRHT + Lloyd-Max) — impl in ggml-tbq-quant.c
//
// Row quantizers: read floats from x and write the quantized result to y.
// k is presumably the number of input values (a multiple of the 128-value
// TBQ block size) — TODO confirm against the implementation.
void quantize_row_tbq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_tbq3_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_tbq4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
// Dot-product prototypes using the usual ggml vec_dot parameter list.
// NOTE(review): the TBQ entries in type_traits_cpu set .vec_dot = NULL and
// .vec_dot_type = GGML_TYPE_F32, so these q8_K-named functions do not appear
// to be wired into the CPU traits table — confirm they are defined and used,
// or drop the declarations.
void ggml_vec_dot_tbq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tbq3_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tbq4_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

Expand Down
6 changes: 6 additions & 0 deletions ggml/src/ggml-cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,12 @@ if (CUDAToolkit_FOUND)
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "template-instances/fattn-vec*turbo4_0*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "template-instances/fattn-vec*tbq2_0*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "template-instances/fattn-vec*tbq3_0*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "template-instances/fattn-vec*tbq4_0*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
endif()

ggml_add_backend_library(ggml-cuda
Expand Down
Loading