spiritbuun · dusterbloom · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026
diff --git a/common/arg.cpp b/common/arg.cpp
@@ -401,6 +401,9 @@ const std::vector<ggml_type> kv_cache_types = {
     GGML_TYPE_TURBO2_0,
     GGML_TYPE_TURBO3_0,
     GGML_TYPE_TURBO4_0,
+    GGML_TYPE_TBQ2_0,
+    GGML_TYPE_TBQ3_0,
+    GGML_TYPE_TBQ4_0,
 };
 
 static ggml_type kv_cache_type_from_str(const std::string & s) {

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
@@ -431,7 +431,10 @@ extern "C" {
         GGML_TYPE_TURBO3_0 = 41, // TurboQuant 3-bit KV cache: 2-bit PolarQuant + 1-bit QJL
         GGML_TYPE_TURBO4_0 = 42, // TurboQuant 4-bit KV cache: 3-bit PolarQuant + 1-bit QJL
         GGML_TYPE_TURBO2_0 = 43, // TurboQuant 2-bit KV cache: 2-bit PolarQuant, no QJL
-        GGML_TYPE_COUNT   = 44,
+        GGML_TYPE_TBQ3_0  = 44, // TBQ 3-bit KV cache: SRHT + Lloyd-Max 8-level codebook
+        GGML_TYPE_TBQ4_0  = 45, // TBQ 4-bit KV cache: SRHT + Lloyd-Max 16-level codebook
+        GGML_TYPE_TBQ2_0  = 46, // TBQ 2-bit KV cache: SRHT + Lloyd-Max 4-level codebook
+        GGML_TYPE_COUNT   = 47,
     };
 
     // precision

diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
@@ -206,6 +206,7 @@ add_library(ggml-base
             ggml-quants.c
             ggml-quants.h
             ggml-turbo-quant.c
+            ggml-tbq-quant.c
             gguf.cpp)
 
 set_target_properties(ggml-base PROPERTIES

diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
@@ -304,6 +304,36 @@ typedef struct {
 } block_turbo4_0;                       // 68 bytes total
 static_assert(sizeof(block_turbo4_0) == 2*sizeof(ggml_half) + QK_TURBO4*3/8 + QK_TURBO4/8, "wrong turbo4_0 block size/padding");
 
+// TBQ 3-bit KV-cache block: SRHT + Lloyd-Max 8-level codebook
+// Per block: 3-bit packed indices (QK_TBQ3*3/8 = 48 bytes) + norm(fp16) = 50 bytes per 128 values
+// = 3.125 bits/value → 5.12× compression vs fp16 (sizes below derive from QK_TBQ3 so they cannot drift)
+#define QK_TBQ3 128
+typedef struct {
+    uint8_t  qs[QK_TBQ3*3/8]; // 3-bit packed codebook indices (128 * 3 / 8 = 48 bytes)
+    ggml_half norm;           // L2 norm of the original block
+} block_tbq3_0;
+static_assert(sizeof(block_tbq3_0) == sizeof(ggml_half) + QK_TBQ3*3/8, "wrong tbq3_0 block size/padding");
+
+// TBQ 4-bit KV-cache block: SRHT + Lloyd-Max 16-level codebook
+// Per block: 4-bit packed indices (QK_TBQ4*4/8 = 64 bytes) + norm(fp16) = 66 bytes per 128 values
+// = 4.125 bits/value → 3.88× compression vs fp16 (sizes below derive from QK_TBQ4 so they cannot drift)
+#define QK_TBQ4 128
+typedef struct {
+    uint8_t  qs[QK_TBQ4*4/8]; // 4-bit packed codebook indices (128 * 4 / 8 = 64 bytes)
+    ggml_half norm;           // L2 norm of the original block
+} block_tbq4_0;
+static_assert(sizeof(block_tbq4_0) == sizeof(ggml_half) + QK_TBQ4*4/8, "wrong tbq4_0 block size/padding");
+
+// TBQ 2-bit KV-cache block: SRHT + Lloyd-Max 4-level codebook
+// Per block: 2-bit packed indices (QK_TBQ2*2/8 = 32 bytes) + norm(fp16) = 34 bytes per 128 values
+// = 2.125 bits/value → 7.53× compression vs fp16 (sizes below derive from QK_TBQ2 so they cannot drift)
+#define QK_TBQ2 128
+typedef struct {
+    uint8_t  qs[QK_TBQ2*2/8]; // 2-bit packed codebook indices (128 * 2 / 8 = 32 bytes)
+    ggml_half norm;           // L2 norm (corrected: ||x|| / ||centroids||)
+} block_tbq2_0;
+static_assert(sizeof(block_tbq2_0) == sizeof(ggml_half) + QK_TBQ2*2/8, "wrong tbq2_0 block size/padding");
+
 //
 // Super-block quantization structures
 //

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -408,6 +408,24 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_F32,
         .nrows                    = 1,
     },
+    [GGML_TYPE_TBQ2_0] = {
+        .from_float               = quantize_row_tbq2_0,
+        .vec_dot                  = NULL,
+        .vec_dot_type             = GGML_TYPE_F32,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_TBQ3_0] = {
+        .from_float               = quantize_row_tbq3_0,
+        .vec_dot                  = NULL,
+        .vec_dot_type             = GGML_TYPE_F32,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_TBQ4_0] = {
+        .from_float               = quantize_row_tbq4_0,
+        .vec_dot                  = NULL,
+        .vec_dot_type             = GGML_TYPE_F32,
+        .nrows                    = 1,
+    },
     [GGML_TYPE_I32] = {
         .from_float               = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
     },

diff --git a/ggml/src/ggml-cpu/quants.h b/ggml/src/ggml-cpu/quants.h
@@ -38,6 +38,14 @@ void quantize_row_turbo2_0_ref(const float * GGML_RESTRICT x, block_turbo2_0 * G
 void quantize_row_turbo3_0_ref(const float * GGML_RESTRICT x, block_turbo3_0 * GGML_RESTRICT y, int64_t k);
 void quantize_row_turbo4_0_ref(const float * GGML_RESTRICT x, block_turbo4_0 * GGML_RESTRICT y, int64_t k);
 
+// TBQ (SRHT + Lloyd-Max) row quantizers and vec_dot entry points — impl in ggml-tbq-quant.c. NOTE(review): the CPU type traits register .vec_dot = NULL for the TBQ types; confirm the vec_dot_*_q8_K symbols below are defined and actually used.
+void quantize_row_tbq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_tbq3_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_tbq4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void ggml_vec_dot_tbq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_tbq3_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_tbq4_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
 void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 

diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -128,6 +128,12 @@ if (CUDAToolkit_FOUND)
         list(APPEND GGML_SOURCES_CUDA ${SRCS})
         file(GLOB   SRCS "template-instances/fattn-vec*turbo4_0*.cu")
         list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        file(GLOB   SRCS "template-instances/fattn-vec*tbq2_0*.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        file(GLOB   SRCS "template-instances/fattn-vec*tbq3_0*.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        file(GLOB   SRCS "template-instances/fattn-vec*tbq4_0*.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})
     endif()
 
     ggml_add_backend_library(ggml-cuda