diff --git a/common/arg.cpp b/common/arg.cpp index 0e7fc4e519a..b6e4ae8f0e6 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -387,6 +387,7 @@ const std::vector kv_cache_types = { GGML_TYPE_IQ4_NL, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, + GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO4_0, }; diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index e385a2daa1f..7df582e619a 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -430,7 +430,8 @@ extern "C" { GGML_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale) GGML_TYPE_TURBO3_0 = 41, // TurboQuant 3-bit KV cache: 2-bit PolarQuant + 1-bit QJL GGML_TYPE_TURBO4_0 = 42, // TurboQuant 4-bit KV cache: 3-bit PolarQuant + 1-bit QJL - GGML_TYPE_COUNT = 43, + GGML_TYPE_TURBO2_0 = 43, // TurboQuant 2-bit KV cache: 2-bit PolarQuant (no QJL) + GGML_TYPE_COUNT = 44, }; // precision @@ -2490,7 +2491,9 @@ extern "C" { GGML_API struct ggml_tensor * ggml_turbo_wht( struct ggml_context * ctx, struct ggml_tensor * a, - int direction); + int direction, + int group_size, // 0 = auto (64 or 128 from ne[0]) + struct ggml_tensor * scale); // NULL = no InnerQ scaling // custom operators diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index cea656134a9..7b56097282f 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -293,6 +293,18 @@ typedef struct { } block_turbo4_0; // 68 bytes total static_assert(sizeof(block_turbo4_0) == 2*sizeof(ggml_half) + QK_TURBO4*3/8 + QK_TURBO4/8, "wrong turbo4_0 block size/padding"); +// TurboQuant 2-bit: 2-bit PolarQuant indices only (no QJL) +// Per block: norm(fp16) + 2-bit indices (8 bytes) = 10 bytes per 32 values +// = 2.5 bits/value → 6.4× compression vs fp16 +// 4 centroids (Lloyd-Max for N(0, 1/128)): {-0.133462, -0.039994, 0.039994, 0.133462} +#define QK_TURBO2 32 // Block size 32 +#define QK_TURBO2_GROUP 128 // rotation group size = head_dim +typedef struct { + ggml_half norm; // 2 bytes: corrected L2 norm + uint8_t qs[QK_TURBO2 / 4]; // 8 bytes: 2-bit indices (4 per byte) +} block_turbo2_0; // 10 bytes total +static_assert(sizeof(block_turbo2_0) == sizeof(ggml_half) + QK_TURBO2/4, "wrong turbo2_0 block size/padding"); + // // Super-block quantization structures // diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 5943d7f19be..65c7c9931af 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -7,6 +7,7 @@ #include "ggml-cpu-impl.h" #include "ggml-impl.h" #include "quants.h" +#include "ggml-quants.h" #include "ggml-threading.h" #include "unary-ops.h" #include "binary-ops.h" @@ -204,6 +205,14 @@ typedef pthread_t ggml_thread_t; #include #endif +// Forward declarations — defined below, after utility functions +static void ggml_vec_dot_turbo3_0_f32(int n, float * GGML_RESTRICT s, size_t bs, + const void * GGML_RESTRICT vx, size_t bx, + const void * GGML_RESTRICT vy, size_t by, int nrc); +static void ggml_vec_dot_turbo2_0_f32(int n, float * GGML_RESTRICT s, size_t bs, + const void * GGML_RESTRICT vx, size_t bx, + const void * GGML_RESTRICT vy, size_t by, int nrc); + static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = { .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp32, @@ -393,6 +402,18 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { [GGML_TYPE_I32] = { .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32, }, + [GGML_TYPE_TURBO3_0] = { + .from_float = (ggml_from_float_t) quantize_row_turbo3_0_ref, + .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_turbo3_0_f32, 
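+        // this CPU path dequantizes the whole turbo3 row to f32 and takes a plain
+        // scalar dot product (see ggml_vec_dot_turbo3_0_f32 below)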
+ .vec_dot_type = GGML_TYPE_F32, + .nrows = 1, + }, + [GGML_TYPE_TURBO2_0] = { + .from_float = (ggml_from_float_t) quantize_row_turbo2_0_ref, + .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_turbo2_0_f32, + .vec_dot_type = GGML_TYPE_F32, + .nrows = 1, + }, }; const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) { @@ -3318,6 +3339,46 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g return ggml_graph_compute(cgraph, &cplan); } +// TurboQuant3 vec_dot: dequantize turbo3 block to f32, then dot with f32 operand. +// Used by CPU flash attention for models with D not supported by CUDA FA (e.g. D=192). +static void ggml_vec_dot_turbo3_0_f32(int n, float * GGML_RESTRICT s, size_t bs, + const void * GGML_RESTRICT vx, size_t bx, + const void * GGML_RESTRICT vy, size_t by, int nrc) { + GGML_ASSERT(nrc == 1); + GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc); + + // Dequantize turbo3 to f32 temp buffer, then dot + float tmp[4096]; // max head_dim + GGML_ASSERT(n <= 4096); + ggml_get_type_traits(GGML_TYPE_TURBO3_0)->to_float(vx, tmp, n); + + const float * y = (const float *)vy; + float sum = 0.0f; + for (int i = 0; i < n; i++) { + sum += tmp[i] * y[i]; + } + *s = sum; +} + +// TurboQuant2 vec_dot: dequantize turbo2 block to f32, then dot with f32 operand. +static void ggml_vec_dot_turbo2_0_f32(int n, float * GGML_RESTRICT s, size_t bs, + const void * GGML_RESTRICT vx, size_t bx, + const void * GGML_RESTRICT vy, size_t by, int nrc) { + GGML_ASSERT(nrc == 1); + GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc); + + float tmp[4096]; + GGML_ASSERT(n <= 4096); + ggml_get_type_traits(GGML_TYPE_TURBO2_0)->to_float(vx, tmp, n); + + const float * y = (const float *)vy; + float sum = 0.0f; + for (int i = 0; i < n; i++) { + sum += tmp[i] * y[i]; + } + *s = sum; +} + void ggml_cpu_fp32_to_fp32(const float * x, float * y, int64_t n) { memcpy(y, x, n * sizeof(float)); } diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 9e41111da96..cec4855f2d0 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -4926,6 +4926,14 @@ static void ggml_compute_forward_set_rows_f32( ggml_from_float_t const from_float = ggml_get_type_traits_cpu(dst->type)->from_float; + // For turbo types: communicate WHT group size to the quantize function via global + if (dst->type == GGML_TYPE_TURBO3_0 || dst->type == GGML_TYPE_TURBO4_0 || dst->type == GGML_TYPE_TURBO2_0) { + extern int turbo3_cpu_wht_group_size; + int gs = 0; + memcpy(&gs, dst->op_params, sizeof(int)); + turbo3_cpu_wht_group_size = (gs == 64 || gs == 128) ? gs : 0; + } + for (int64_t i03 = 0; i03 < ne03; ++i03) { for (int64_t i02 = 0; i02 < ne02; ++i02) { for (int64_t i = ir0; i < ir1; ++i) { @@ -10626,17 +10634,23 @@ static void ggml_compute_forward_turbo_wht_f32( const ggml_compute_params * params, ggml_tensor * dst) { const ggml_tensor * src = dst->src[0]; + const ggml_tensor * scale_tensor = dst->src[1]; // InnerQ scale_inv (may be NULL) const float * src_data = (const float *) src->data; float * dst_data = (float *) dst->data; + const float * scale_inv = scale_tensor ? (const float *) scale_tensor->data : NULL; int direction; - memcpy(&direction, dst->op_params, sizeof(int)); + int group_size; + memcpy(&direction, dst->op_params + 0, sizeof(int)); + memcpy(&group_size, dst->op_params + sizeof(int), sizeof(int)); - const float * s_first = (direction == 0) ? turbo_wht_s1 : turbo_wht_s2; - const float * s_second = (direction == 0) ? 
turbo_wht_s2 : turbo_wht_s1; + const int64_t head_dim = src->ne[0]; + const int64_t n_heads = ggml_nelements(src) / head_dim; + const int64_t groups_per_head = head_dim / group_size; + const int tail_size = (int)(head_dim % group_size); + const int64_t n_groups = groups_per_head * n_heads; - const int64_t n_total = ggml_nelements(src); - const int64_t n_groups = n_total / 128; + const float inv_sqrt = 1.0f / sqrtf((float)group_size); // Parallel over groups const int64_t ith = params->ith; @@ -10644,16 +10658,31 @@ static void ggml_compute_forward_turbo_wht_f32( const int64_t grp_start = (n_groups * ith) / nth; const int64_t grp_end = (n_groups * (ith + 1)) / nth; + // Select sign arrays: for 64-group, use first 64 elements of the 128-element arrays + const float * s_first = (direction == 0) ? turbo_wht_s1 : turbo_wht_s2; + const float * s_second = (direction == 0) ? turbo_wht_s2 : turbo_wht_s1; + for (int64_t g = grp_start; g < grp_end; g++) { - float x[128]; - const float * in = src_data + g * 128; + const int64_t head_idx = g / groups_per_head; + const int64_t grp_in_head = g % groups_per_head; + const int64_t base = head_idx * head_dim + grp_in_head * group_size; + + float x[128]; // max group_size + const float * in = src_data + base; + + // InnerQ forward: apply scale_inv BEFORE signs+WHT (for Q pre-rotation) + if (direction == 0 && scale_inv != NULL) { + for (int i = 0; i < group_size; i++) x[i] = in[i] * scale_inv[i % group_size]; + } else { + for (int i = 0; i < group_size; i++) x[i] = in[i]; + } // Apply first signs - for (int i = 0; i < 128; i++) x[i] = in[i] * s_first[i]; + for (int i = 0; i < group_size; i++) x[i] *= s_first[i]; - // WHT butterfly (7 stages) - for (int h = 1; h < 128; h *= 2) { - for (int i = 0; i < 128; i += h * 2) { + // WHT butterfly (log2(group_size) stages) + for (int h = 1; h < group_size; h *= 2) { + for (int i = 0; i < group_size; i += h * 2) { for (int j = i; j < i + h; j++) { float a = x[j], b = x[j + h]; x[j] = a + b; @@ -10663,10 +10692,23 @@ static void ggml_compute_forward_turbo_wht_f32( } // Normalize + second signs - const float inv_sqrt_128 = 0.08838834764831845f; - float * out = dst_data + g * 128; - for (int i = 0; i < 128; i++) { - out[i] = x[i] * inv_sqrt_128 * s_second[i]; + float * out = dst_data + base; + for (int i = 0; i < group_size; i++) { + float val = x[i] * inv_sqrt * s_second[i]; + // InnerQ inverse: apply scale_inv AFTER WHT+signs (for V un-rotation) + if (direction == 1 && scale_inv != NULL) { + val *= scale_inv[i % group_size]; + } + out[i] = val; + } + } + + // Copy tail elements unchanged (identity pass-through) + if (tail_size > 0 && ith == 0) { + const int64_t tail_offset = groups_per_head * group_size; + for (int64_t h = 0; h < n_heads; h++) { + const int64_t base = h * head_dim + tail_offset; + memcpy(dst_data + base, src_data + base, tail_size * sizeof(float)); } } } diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt index 419862101d1..94e48651388 100644 --- a/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ggml/src/ggml-cuda/CMakeLists.txt @@ -120,7 +120,13 @@ if (CUDAToolkit_FOUND) template-instances/fattn-vec-instance-f16-f16.cu template-instances/fattn-vec-instance-q4_0-q4_0.cu template-instances/fattn-vec-instance-q8_0-q8_0.cu - template-instances/fattn-vec-instance-bf16-bf16.cu) + template-instances/fattn-vec-instance-bf16-bf16.cu + template-instances/fattn-vec-instance-turbo3_0-turbo3_0.cu + template-instances/fattn-vec-instance-turbo3_0-q8_0.cu + 
template-instances/fattn-vec-instance-q8_0-turbo3_0.cu + template-instances/fattn-vec-instance-turbo2_0-turbo2_0.cu + template-instances/fattn-vec-instance-turbo2_0-q8_0.cu + template-instances/fattn-vec-instance-q8_0-turbo2_0.cu) endif() ggml_add_backend_library(ggml-cuda diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index 79ccfe568a2..e7269f95931 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -1,5 +1,6 @@ #include "convert.cuh" #include "dequantize.cuh" +#include "turbo-quant.cuh" #include @@ -756,6 +757,10 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { return dequantize_row_mxfp4_cuda; case GGML_TYPE_NVFP4: return dequantize_row_nvfp4_cuda; + case GGML_TYPE_TURBO3_0: + return dequantize_block_cont_cuda; + case GGML_TYPE_TURBO2_0: + return dequantize_block_cont_cuda; case GGML_TYPE_F32: return convert_unary_cont_cuda; case GGML_TYPE_BF16: @@ -809,6 +814,10 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_row_mxfp4_cuda; case GGML_TYPE_NVFP4: return dequantize_row_nvfp4_cuda; + case GGML_TYPE_TURBO3_0: + return dequantize_block_cont_cuda; + case GGML_TYPE_TURBO2_0: + return dequantize_block_cont_cuda; case GGML_TYPE_F16: return convert_unary_cont_cuda; case GGML_TYPE_BF16: @@ -832,6 +841,10 @@ to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) { return dequantize_block_cuda; case GGML_TYPE_Q8_0: return dequantize_block_cuda; + case GGML_TYPE_TURBO3_0: + return dequantize_block_cuda; + case GGML_TYPE_TURBO2_0: + return dequantize_block_cuda; case GGML_TYPE_BF16: return convert_unary_cuda; default: @@ -874,6 +887,10 @@ to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type) { return dequantize_block_cuda; case GGML_TYPE_Q8_0: return dequantize_block_cuda; + case GGML_TYPE_TURBO3_0: + return dequantize_block_cuda; + case GGML_TYPE_TURBO2_0: + return dequantize_block_cuda; case GGML_TYPE_BF16: return convert_unary_cuda; default: diff --git a/ggml/src/ggml-cuda/dequantize.cuh b/ggml/src/ggml-cuda/dequantize.cuh index e060fb29fdc..513d748522a 100644 --- a/ggml/src/ggml-cuda/dequantize.cuh +++ b/ggml/src/ggml-cuda/dequantize.cuh @@ -1,4 +1,5 @@ #include "common.cuh" +#include "turbo-quant.cuh" static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, float2 & v){ const block_q4_0 * x = (const block_q4_0 *) vx; @@ -75,3 +76,20 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in v.x *= d; v.y *= d; } + +// Turbo3: 3-bit PolarQuant (2-bit qs + 1-bit sign), block size 32 +// iqs is the element index within the block (even), produces elements iqs and iqs+1 +static __device__ __forceinline__ void dequantize_turbo3_0(const void * vx, const int64_t ib, const int iqs, float2 & v){ + const block_turbo3_0 * x = (const block_turbo3_0 *) vx; + const float norm = __half2float(x[ib].norm); + v.x = turbo3_dequant_element(&x[ib], iqs + 0, norm); + v.y = turbo3_dequant_element(&x[ib], iqs + 1, norm); +} + +// Turbo2: 2-bit PolarQuant (2-bit qs only, no sign), block size 32 +static __device__ __forceinline__ void dequantize_turbo2_0(const void * vx, const int64_t ib, const int iqs, float2 & v){ + const block_turbo2_0 * x = (const block_turbo2_0 *) vx; + const float norm = __half2float(x[ib].norm); + v.x = turbo2_dequant_element(&x[ib], iqs + 0, norm); + v.y = turbo2_dequant_element(&x[ib], iqs + 1, norm); +} diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index c59a4db3999..66e5ba620a1 
100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -3,6 +3,7 @@ #include "common.cuh" #include "convert.cuh" #include "vecdotq.cuh" +#include "turbo-quant.cuh" #include @@ -288,6 +289,115 @@ static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_q8_0( return sum; } +// Turbo3 KQ dot product: dequantize K from turbo3 blocks, dot with Q (float2/half2) +// Uses float Q path (like f16), not q8_1 integer path. +// Q_v is half2[] or float2[] with D/2 pairs, partitioned nthreads-strided. +// +// Matches the f16 pattern: outer loop steps by nthreads*cpy_ne, inner loop +// processes cpy_ne pairs per thread per iteration so Q_v and K indices stay aligned. +// elem0 = 2*k_KQ is always even, so elem0 and elem0+1 always share the same +// turbo3 block (ib), qs byte, and signs byte — loaded once per pair. +template +static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_turbo3_0( + const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { + + const block_turbo3_0 * K_turbo = (const block_turbo3_0 *) K_c; + GGML_UNUSED(Q_q8); + GGML_UNUSED(Q_ds_v); + + constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes(); + constexpr int cpy_ne = cpy_nb / 4; + + float sum = 0.0f; + +#pragma unroll + for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += nthreads*cpy_ne) { +#pragma unroll + for (int k_KQ_1 = 0; k_KQ_1 < cpy_ne; ++k_KQ_1) { + const int k_KQ = k_KQ_0 + (threadIdx.x % nthreads)*cpy_ne + k_KQ_1; + + // elem0 is always even; elem0 and elem1 are always in the same block, + // the same qs byte (j0%4 ∈ {0,2}), and the same signs byte (j0%8 ∈ {0,2,4,6}). + const int elem0 = k_KQ * 2; // always even + const int ib = elem0 / QK_TURBO3; // shared block index + const int j0 = elem0 % QK_TURBO3; // always even, 0..30 + + // Single loads for the shared block fields + const float norm = __half2float(K_turbo[ib].norm); + const uint8_t qs_byte = K_turbo[ib].qs[j0 / 4]; // covers both j0 and j0+1 + const uint8_t sgn_byte = K_turbo[ib].signs[j0 / 8]; // covers both j0 and j0+1 + + // Extract 3-bit indices for elem0 and elem1 from shared bytes + const int shift = (j0 % 4) * 2; // 0 or 4 + const uint8_t idx0 = ((qs_byte >> shift) & 0x3) | (((sgn_byte >> (j0 % 8)) & 0x1) << 2); + const uint8_t idx1 = ((qs_byte >> (shift+2)) & 0x3) | (((sgn_byte >> (j0 % 8 + 1)) & 0x1) << 2); + + float2 kv; + kv.x = TURBO_CENTROIDS_3BIT[idx0] * norm; + kv.y = TURBO_CENTROIDS_3BIT[idx1] * norm; + +#ifdef V_DOT2_F32_F16_AVAILABLE + const half2 qv = ((const half2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]; + ggml_cuda_mad(sum, make_float2(kv.x, kv.y), __half22float2(qv)); +#else + const float2 qv = ((const float2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]; + sum += kv.x * qv.x + kv.y * qv.y; +#endif // V_DOT2_F32_F16_AVAILABLE + } + } + + return sum; +} + +// Turbo2 KQ dot product: dequantize K from turbo2 blocks, dot with Q (float2/half2) +// Same structure as turbo3 but reads 2-bit indices from qs only (no signs). 
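+// Worked example (illustrative values): k_KQ = 3 gives elem0 = 6, so ib = 0, j0 = 6,
+// and both indices come from qs[1]: idx0 = (qs[1] >> 4) & 0x3, idx1 = (qs[1] >> 6) & 0x3.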
+template +static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_turbo2_0( + const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { + + const block_turbo2_0 * K_turbo = (const block_turbo2_0 *) K_c; + GGML_UNUSED(Q_q8); + GGML_UNUSED(Q_ds_v); + + constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes(); + constexpr int cpy_ne = cpy_nb / 4; + + float sum = 0.0f; + +#pragma unroll + for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += nthreads*cpy_ne) { +#pragma unroll + for (int k_KQ_1 = 0; k_KQ_1 < cpy_ne; ++k_KQ_1) { + const int k_KQ = k_KQ_0 + (threadIdx.x % nthreads)*cpy_ne + k_KQ_1; + + const int elem0 = k_KQ * 2; + const int ib = elem0 / QK_TURBO2; + const int j0 = elem0 % QK_TURBO2; + + const float norm = __half2float(K_turbo[ib].norm); + const uint8_t qs_byte = K_turbo[ib].qs[j0 / 4]; + + const int shift = (j0 % 4) * 2; + const uint8_t idx0 = (qs_byte >> shift) & 0x3; + const uint8_t idx1 = (qs_byte >> (shift+2)) & 0x3; + + float2 kv; + kv.x = TURBO_CENTROIDS_2BIT[idx0] * norm; + kv.y = TURBO_CENTROIDS_2BIT[idx1] * norm; + +#ifdef V_DOT2_F32_F16_AVAILABLE + const half2 qv = ((const half2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]; + ggml_cuda_mad(sum, make_float2(kv.x, kv.y), __half22float2(qv)); +#else + const float2 qv = ((const float2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]; + sum += kv.x * qv.x + kv.y * qv.y; +#endif // V_DOT2_F32_F16_AVAILABLE + } + } + + return sum; +} + template static __device__ __forceinline__ void quantize_q8_1_to_shared( const float * __restrict__ x, const float scale, int * __restrict__ yq32, void * __restrict__ yds) { @@ -577,6 +687,126 @@ static __device__ __forceinline__ void dequantize_V_q8_0(const void * __restrict } } +// Turbo3 V dequantize: extract `ne` float/half values at position i0. +// +// Optimised for the ne==4 path (used by the VEC kernel with turbo3 V): +// i0 is always a multiple of 4 from the VEC kernel access pattern, so all 4 +// elements share one qs byte and one signs byte — we load each once. +template +static __device__ __forceinline__ void dequantize_V_turbo3_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) { + const block_turbo3_0 * x = (const block_turbo3_0 *) vx; + + const int64_t ib = i0 / QK_TURBO3; + const int j0 = i0 % QK_TURBO3; + const float norm = __half2float(x[ib].norm); + + static_assert(ne == 2 || ne == 4, "bad ne"); + + if constexpr (ne == 4) { + // When j0 % 4 == 0 (always true from VEC kernel), all 4 elements share one + // qs byte (4 elements per byte) and one signs byte (8 elements per byte). 
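+        // e.g. i0 = 40 (illustrative): ib = 1, j0 = 8, so qs_byte = qs[2],
+        // sgn_byte = signs[1] and shift_s = 0; the four indices then come from
+        // bit pairs 0/2/4/6 of qs_byte plus sign bits 0..3 of sgn_byte.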
+ const uint8_t qs_byte = x[ib].qs[j0 / 4]; + const uint8_t sgn_byte = x[ib].signs[j0 / 8]; + const int shift_s = j0 % 8; // 0 or 4 + + const uint8_t idx0 = ((qs_byte >> 0) & 0x3) | (((sgn_byte >> (shift_s+0)) & 0x1) << 2); + const uint8_t idx1 = ((qs_byte >> 2) & 0x3) | (((sgn_byte >> (shift_s+1)) & 0x1) << 2); + const uint8_t idx2 = ((qs_byte >> 4) & 0x3) | (((sgn_byte >> (shift_s+2)) & 0x1) << 2); + const uint8_t idx3 = ((qs_byte >> 6) & 0x3) | (((sgn_byte >> (shift_s+3)) & 0x1) << 2); + +#ifdef FP16_AVAILABLE + if constexpr (std::is_same_v) { + ((half2 *) dst)[0] = make_half2( + __float2half(TURBO_CENTROIDS_3BIT[idx0] * norm), + __float2half(TURBO_CENTROIDS_3BIT[idx1] * norm)); + ((half2 *) dst)[1] = make_half2( + __float2half(TURBO_CENTROIDS_3BIT[idx2] * norm), + __float2half(TURBO_CENTROIDS_3BIT[idx3] * norm)); + } else +#endif // FP16_AVAILABLE + if constexpr (std::is_same_v) { + ((float2 *) dst)[0] = make_float2( + TURBO_CENTROIDS_3BIT[idx0] * norm, + TURBO_CENTROIDS_3BIT[idx1] * norm); + ((float2 *) dst)[1] = make_float2( + TURBO_CENTROIDS_3BIT[idx2] * norm, + TURBO_CENTROIDS_3BIT[idx3] * norm); + } else { + static_assert(std::is_same_v, "unsupported type"); + } + } else { // ne == 2 +#ifdef FP16_AVAILABLE + if constexpr (std::is_same_v) { + float v0 = turbo3_dequant_element(&x[ib], j0, norm); + float v1 = turbo3_dequant_element(&x[ib], j0+1, norm); + ((half2 *) dst)[0] = make_half2(__float2half(v0), __float2half(v1)); + } else +#endif // FP16_AVAILABLE + if constexpr (std::is_same_v) { + ((float *) dst)[0] = turbo3_dequant_element(&x[ib], j0, norm); + ((float *) dst)[1] = turbo3_dequant_element(&x[ib], j0+1, norm); + } else { + static_assert(std::is_same_v, "unsupported type"); + } + } +} + +// Turbo2 V dequantize: extract `ne` float/half values at position i0. 
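+// For reference, a single turbo2 element dequantizes as (sketch of what
+// turbo2_dequant_element is assumed to compute: a 2-bit index, 4 per byte, scaled by
+// the block's corrected norm):
+//   idx = (b->qs[j / 4] >> ((j % 4) * 2)) & 0x3;
+//   val = TURBO_CENTROIDS_2BIT[idx] * __half2float(b->norm);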
+template +static __device__ __forceinline__ void dequantize_V_turbo2_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) { + const block_turbo2_0 * x = (const block_turbo2_0 *) vx; + + const int64_t ib = i0 / QK_TURBO2; + const int j0 = i0 % QK_TURBO2; + const float norm = __half2float(x[ib].norm); + + static_assert(ne == 2 || ne == 4, "bad ne"); + + if constexpr (ne == 4) { + const uint8_t qs_byte = x[ib].qs[j0 / 4]; + + const uint8_t idx0 = (qs_byte >> 0) & 0x3; + const uint8_t idx1 = (qs_byte >> 2) & 0x3; + const uint8_t idx2 = (qs_byte >> 4) & 0x3; + const uint8_t idx3 = (qs_byte >> 6) & 0x3; + +#ifdef FP16_AVAILABLE + if constexpr (std::is_same_v) { + ((half2 *) dst)[0] = make_half2( + __float2half(TURBO_CENTROIDS_2BIT[idx0] * norm), + __float2half(TURBO_CENTROIDS_2BIT[idx1] * norm)); + ((half2 *) dst)[1] = make_half2( + __float2half(TURBO_CENTROIDS_2BIT[idx2] * norm), + __float2half(TURBO_CENTROIDS_2BIT[idx3] * norm)); + } else +#endif // FP16_AVAILABLE + if constexpr (std::is_same_v) { + ((float2 *) dst)[0] = make_float2( + TURBO_CENTROIDS_2BIT[idx0] * norm, + TURBO_CENTROIDS_2BIT[idx1] * norm); + ((float2 *) dst)[1] = make_float2( + TURBO_CENTROIDS_2BIT[idx2] * norm, + TURBO_CENTROIDS_2BIT[idx3] * norm); + } else { + static_assert(std::is_same_v, "unsupported type"); + } + } else { // ne == 2 +#ifdef FP16_AVAILABLE + if constexpr (std::is_same_v) { + float v0 = turbo2_dequant_element(&x[ib], j0, norm); + float v1 = turbo2_dequant_element(&x[ib], j0+1, norm); + ((half2 *) dst)[0] = make_half2(__float2half(v0), __float2half(v1)); + } else +#endif // FP16_AVAILABLE + if constexpr (std::is_same_v) { + ((float *) dst)[0] = turbo2_dequant_element(&x[ib], j0, norm); + ((float *) dst)[1] = turbo2_dequant_element(&x[ib], j0+1, norm); + } else { + static_assert(std::is_same_v, "unsupported type"); + } + } +} + template constexpr __device__ vec_dot_KQ_t get_vec_dot_KQ() { if constexpr (type_K == GGML_TYPE_F16) { @@ -593,6 +823,10 @@ constexpr __device__ vec_dot_KQ_t get_vec_dot_KQ() { return vec_dot_fattn_vec_KQ_q8_0; } else if constexpr (type_K == GGML_TYPE_BF16) { return vec_dot_fattn_vec_KQ_bf16; + } else if constexpr (type_K == GGML_TYPE_TURBO3_0) { + return vec_dot_fattn_vec_KQ_turbo3_0; + } else if constexpr (type_K == GGML_TYPE_TURBO2_0) { + return vec_dot_fattn_vec_KQ_turbo2_0; } else { static_assert(type_K == -1, "bad type"); return nullptr; @@ -615,6 +849,10 @@ constexpr __device__ dequantize_V_t get_dequantize_V() { return dequantize_V_q8_0; } else if constexpr (type_V == GGML_TYPE_BF16) { return dequantize_V_bf16; + } else if constexpr (type_V == GGML_TYPE_TURBO3_0) { + return dequantize_V_turbo3_0; + } else if constexpr (type_V == GGML_TYPE_TURBO2_0) { + return dequantize_V_turbo2_0; } else { static_assert(type_V == -1, "bad type"); return nullptr; diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh index f0bd42a5761..78170cbab35 100644 --- a/ggml/src/ggml-cuda/fattn-vec.cuh +++ b/ggml/src/ggml-cuda/fattn-vec.cuh @@ -75,17 +75,20 @@ static __global__ void flash_attn_ext_vec( #endif // GGML_USE_HIP constexpr int nthreads = ggml_cuda_fattn_vec_get_nthreads_device(); - constexpr int nthreads_KQ = (type_K == GGML_TYPE_F16 || type_K == GGML_TYPE_BF16) ? 128 / cpy_nb : nthreads_KQ_q; - constexpr int nthreads_V = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16) ? 
128 / cpy_nb : nthreads_V_q; + // Turbo3 uses the float Q path (like f16/bf16), not q8_1 integer path + constexpr bool K_is_unquantized = (type_K == GGML_TYPE_F16 || type_K == GGML_TYPE_BF16 || type_K == GGML_TYPE_TURBO3_0 || type_K == GGML_TYPE_TURBO2_0); + constexpr bool V_is_unquantized = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16 || type_V == GGML_TYPE_TURBO3_0 || type_V == GGML_TYPE_TURBO2_0); + constexpr int nthreads_KQ = K_is_unquantized ? 128 / cpy_nb : nthreads_KQ_q; + constexpr int nthreads_V = V_is_unquantized ? ((type_V == GGML_TYPE_TURBO3_0 || type_V == GGML_TYPE_TURBO2_0) ? nthreads_V_q : 128 / cpy_nb) : nthreads_V_q; static_assert(WARP_SIZE % nthreads_KQ == 0, "bad nthreads_K"); static_assert(WARP_SIZE % nthreads_V == 0, "bad nthreads_V"); - constexpr int V_rows_per_thread = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16) ? 2*cpy_ne : 4; + constexpr int V_rows_per_thread = V_is_unquantized ? ((type_V == GGML_TYPE_TURBO3_0 || type_V == GGML_TYPE_TURBO2_0) ? 4 : 2*cpy_ne) : 4; constexpr int V_cols_per_iter = WARP_SIZE / nthreads_V; constexpr vec_dot_KQ_t vec_dot_KQ = get_vec_dot_KQ(); - constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16 && type_K != GGML_TYPE_BF16; + constexpr bool Q_q8_1 = !K_is_unquantized; #ifdef V_DOT2_F32_F16_AVAILABLE constexpr dequantize_V_t dequantize_V = get_dequantize_V(); #else @@ -120,6 +123,14 @@ static __global__ void flash_attn_ext_vec( __shared__ float KQ[ne_KQ > ne_combine ? ne_KQ : ne_combine]; #endif // V_DOT2_F32_F16_AVAILABLE + // Sparse V: skip V dequant for positions with negligible attention weights. + // At long context, most V positions contribute < 1e-6 to the output — skipping + // their dequant saves significant compute (especially for quantized V types). + constexpr float sparse_v_threshold_f = 1e-6f; +#ifdef V_DOT2_F32_F16_AVAILABLE + const half sparse_v_threshold_h = __float2half(sparse_v_threshold_f); +#endif + float KQ_max[ncols]; float KQ_sum[ncols]; #pragma unroll @@ -320,6 +331,17 @@ static __global__ void flash_attn_ext_vec( for (int j = 0; j < ncols; ++j) { KQ_k[j] = __half2half2(KQ[j*nthreads + k]); } + + // Sparse V: skip V dequant if all attention weights for this position are negligible + { + bool dominated = true; +#pragma unroll + for (int j = 0; j < ncols; ++j) { + if (__hgt(__low2half(KQ_k[j]), sparse_v_threshold_h)) { dominated = false; break; } + } + if (dominated) { continue; } + } + #pragma unroll for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) { half2 tmp[V_rows_per_thread/2]; @@ -349,6 +371,17 @@ static __global__ void flash_attn_ext_vec( for (int j = 0; j < ncols; ++j) { KQ_k[j] = KQ[j*nthreads + k]; } + + // Sparse V: skip V dequant if all attention weights for this position are negligible + { + bool dominated = true; +#pragma unroll + for (int j = 0; j < ncols; ++j) { + if (KQ_k[j] >= sparse_v_threshold_f) { dominated = false; break; } + } + if (dominated) { continue; } + } + #pragma unroll for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) { float2 tmp[V_rows_per_thread/2]; @@ -598,3 +631,31 @@ EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_0) EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_1) EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q8_0) EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_BF16) + +// TurboQuant3 — turbo3 K + turbo3 V (KV cache uses same type) +extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0); +extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0); +extern DECL_FATTN_VEC_CASE(256, 
GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0); + +// Mixed turbo3/q8_0 KV cache types +extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0); +extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0); +extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0); + +extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_TURBO3_0); +extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_TURBO3_0); +extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_TURBO3_0); + +// TurboQuant2 -- turbo2 K + turbo2 V +extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0); +extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0); +extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0); + +// Mixed turbo2/q8_0 KV cache types +extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0); +extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0); +extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0); + +extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_TURBO2_0); +extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_TURBO2_0); +extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_TURBO2_0); diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index a25a890db6d..7e8ff3d8fb1 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -280,6 +280,20 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_BF16) #endif // GGML_CUDA_FA_ALL_QUANTS + // TurboQuant3 KV cache types (always enabled) + FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0) + + // Mixed turbo3/q8_0 KV cache types + FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0) + FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_TURBO3_0) + + // TurboQuant2 KV cache types (always enabled) + FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0) + + // Mixed turbo2/q8_0 KV cache types + FATTN_VEC_CASES_ALL_D(GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0) + FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_TURBO2_0) + GGML_ABORT("fatal error"); } @@ -354,7 +368,14 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const #ifndef GGML_CUDA_FA_ALL_QUANTS if (K->type != V->type) { - return BEST_FATTN_KERNEL_NONE; + // Allow mixed turbo/q8_0 KV types + const bool turbo_q8_mix = (K->type == GGML_TYPE_TURBO3_0 && V->type == GGML_TYPE_Q8_0) || + (K->type == GGML_TYPE_Q8_0 && V->type == GGML_TYPE_TURBO3_0) || + (K->type == GGML_TYPE_TURBO2_0 && V->type == GGML_TYPE_Q8_0) || + (K->type == GGML_TYPE_Q8_0 && V->type == GGML_TYPE_TURBO2_0); + if (!turbo_q8_mix) { + return BEST_FATTN_KERNEL_NONE; + } } #endif // GGML_CUDA_FA_ALL_QUANTS @@ -372,6 +393,18 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const case GGML_TYPE_Q8_0: case GGML_TYPE_BF16: break; + case GGML_TYPE_TURBO3_0: + // turbo3 VEC kernel instantiated for D in {64, 128, 256}. + if (K->ne[0] % 64 != 0) { + return BEST_FATTN_KERNEL_NONE; + } + break; + case GGML_TYPE_TURBO2_0: + // turbo2 VEC kernel instantiated for D in {64, 128, 256}. 
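+            // Matches the SET_ROWS support check in ggml-cuda.cu: turbo caches are
+            // only written for head dims divisible by 64.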
+ if (K->ne[0] % 64 != 0) { + return BEST_FATTN_KERNEL_NONE; + } + break; default: return BEST_FATTN_KERNEL_NONE; } diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index cc80eb3ffc2..c89e64ef1c6 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -56,6 +56,7 @@ #include "ggml-cuda/gated_delta_net.cuh" #include "ggml-cuda/set.cuh" #include "ggml-cuda/set-rows.cuh" +#include "ggml-cuda/turbo-wht.cuh" #include "ggml-cuda/pad_reflect_1d.cuh" #include "ggml-cuda/solve_tri.cuh" #include "ggml-cuda/tri.cuh" @@ -2510,6 +2511,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_SET_ROWS: ggml_cuda_op_set_rows(ctx, dst); break; + case GGML_OP_TURBO_WHT: + ggml_cuda_turbo_wht(ctx, dst); + break; case GGML_OP_SET: ggml_cuda_op_set(ctx, dst); break; @@ -4835,9 +4839,14 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } break; case GGML_OP_SET_ROWS: { + // turbo types require head_dim divisible by 64 (supports 64 and 128 WHT groups) + if ((op->type == GGML_TYPE_TURBO3_0 || op->type == GGML_TYPE_TURBO2_0) && op->src[0]->ne[0] % 64 != 0) { + return false; + } return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 || op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q5_0 || - op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL) && + op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL || + op->type == GGML_TYPE_TURBO3_0 || op->type == GGML_TYPE_TURBO2_0) && op->src[0]->type == GGML_TYPE_F32 && (op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32); } break; @@ -4964,6 +4973,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_CLAMP: case GGML_OP_LOG: return true; + case GGML_OP_TURBO_WHT: + return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 && + op->src[0]->ne[0] % 64 == 0; // head dim must be divisible by 64 (supports 64 and 128 WHT groups) case GGML_OP_SSM_SCAN: { if (op->src[3]->ne[0] == 1) { // Mamba2 diff --git a/ggml/src/ggml-cuda/set-rows.cu b/ggml/src/ggml-cuda/set-rows.cu index 631de7e8fa5..569185c4a2b 100644 --- a/ggml/src/ggml-cuda/set-rows.cu +++ b/ggml/src/ggml-cuda/set-rows.cu @@ -1,5 +1,6 @@ #include "set-rows.cuh" #include "cpy-utils.cuh" +#include "turbo-quant.cuh" typedef void (*set_rows_kernel_t)(const char * src, char * dst); @@ -209,6 +210,724 @@ static void set_rows_cuda( } } +// ---- TurboQuant3 set_rows: GROUP_SIZE-element groups with WHT rotation + norm correction ---- +// +// Templated on GROUP_SIZE (128 or 64). +// Parallel kernel: one CUDA block per group, GROUP_SIZE threads per block. +// Thread j handles element j within the group. +// +// Steps (all parallel): +// 1. Load element j from global memory +// 2. Parallel L2 norm (warp reduce + inter-warp via shared memory) +// 3. Normalize +// 4. Forward WHT (log2(GROUP_SIZE) butterfly stages, shared memory) +// 5. Quantize element j to 3-bit centroid index +// 6. Pack qs (warp shuffle) and signs (__ballot_sync) into turbo3 block, no atomics +// 7. Parallel reconstruction norm (same pattern as step 2) +// 8. 
Write corrected norm (one thread per sub-block) + +template +__launch_bounds__(128) // max of 128 or 64 +static __global__ void k_set_rows_turbo3( + const float * __restrict__ src0, + const idx_t * __restrict__ src1, + block_turbo3_0 * __restrict__ dst, + const int64_t ne00, + const int64_t ne01, + const int64_t ne10, + const int64_t ne11, + const int64_t ne12, + const int64_t ne13, + const int64_t s01, + const int64_t s02, + const int64_t s03, + const int64_t s10, + const int64_t s11, + const int64_t s12, + const int64_t s1, + const int64_t s2, + const int64_t s3) { + + static_assert(GROUP_SIZE == 128 || GROUP_SIZE == 64, "GROUP_SIZE must be 128 or 64"); + + // blockIdx.x = flat group index; threadIdx.x = element within group (0..GROUP_SIZE-1) + const int j = threadIdx.x; + + // Decode blockIdx.x → (i_grp, i01, i02, i03) + constexpr int blocks_per_group = GROUP_SIZE / QK_TURBO3; + const int64_t n_groups_per_row = ne00 / GROUP_SIZE; + const int64_t g = blockIdx.x; + const int64_t i_grp = g % n_groups_per_row; + int64_t tmp = g / n_groups_per_row; + const int64_t i01 = tmp % ne01; + tmp = tmp / ne01; + const int64_t i02 = tmp % ne12; + const int64_t i03 = tmp / ne12; + + const int64_t i12 = i02; + const int64_t i11 = i01 % ne11; + const int64_t i10 = i01; + + const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12); + const float * src_row = src0 + i01*s01 + i02*s02 + i03*s03; + block_turbo3_0 * dst_row_ptr = (block_turbo3_0 *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3); + block_turbo3_0 * blk_base = dst_row_ptr + i_grp * blocks_per_group; + + // ---- Step 1: Load element j (coalesced) ---- + __shared__ float x[GROUP_SIZE]; + x[j] = src_row[i_grp * GROUP_SIZE + j]; + __syncthreads(); + + // ---- InnerQ: calibrate on original (unscaled) values ---- + if (d_innerq_calibrating) { + atomicAdd(&d_innerq_sq_accum[j], x[j] * x[j]); + if (j == 0) atomicAdd(&d_innerq_count, 1); + } + + // ---- InnerQ: apply channel scale (only when active) ---- + if (d_innerq_active) { + x[j] *= d_innerq_scale[j]; + } + __syncthreads(); + + // ---- Step 2: Parallel L2 norm ---- + constexpr int n_warps = GROUP_SIZE / WARP_SIZE; + __shared__ float warp_accum[n_warps]; + float v = x[j]; + float v2 = v * v; + for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) + v2 += __shfl_xor_sync(0xffffffff, v2, offset); + if (j % WARP_SIZE == 0) + warp_accum[j / WARP_SIZE] = v2; + __syncthreads(); + + __shared__ float s_norm_sq; + if (j == 0) { + float total = 0.0f; + for (int w = 0; w < n_warps; w++) total += warp_accum[w]; + s_norm_sq = total; + } + __syncthreads(); + const float grp_norm = sqrtf(s_norm_sq); + const float inv_norm = (grp_norm > 1e-10f) ? 1.0f / grp_norm : 0.0f; + + // ---- Step 3: Normalize ---- + x[j] *= inv_norm; + __syncthreads(); + + // ---- Step 4: Forward WHT (signs1 → butterfly → signs2, normalized) ---- + if (GROUP_SIZE == 128) { + x[j] *= TURBO_WHT_SIGNS1[j]; + } else { + x[j] *= TURBO_WHT_SIGNS1_64[j]; + } + __syncthreads(); + +#define WHT_STAGE_SHARED(h) \ + if (j % (2*(h)) < (h)) { float a = x[j], b = x[j+(h)]; x[j] = a+b; x[j+(h)] = a-b; } \ + __syncthreads(); + + // Butterfly stages: loop from h=1 to hqs[qs_byte_idx] = qs_byte; + + // Pack signs: 8 elements per byte, 1 bit each. __ballot_sync across warp. 
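+    // Each warp covers 32 consecutive elements, i.e. exactly one turbo3 block, so the
+    // 32-bit ballot holds that block's sign bits; lanes 0, 8, 16 and 24 each write one
+    // byte of signs[] from their slice of the ballot word.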
+ const uint32_t ballot = __ballot_sync(0xffffffff, (idx >> 2) & 1); + const int signs_byte_idx = lane / 8; + const uint8_t signs_byte = (uint8_t)((ballot >> (signs_byte_idx * 8)) & 0xFF); + if (lane % 8 == 0) blk->signs[signs_byte_idx] = signs_byte; + + // ---- Step 7: Reconstruction norm (parallel, same pattern as step 2) ---- + const float c = TURBO_CENTROIDS_3BIT[idx]; + float rc = c * c; + for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) + rc += __shfl_xor_sync(0xffffffff, rc, offset); + if (j % WARP_SIZE == 0) + warp_accum[j / WARP_SIZE] = rc; + __syncthreads(); + + __shared__ float s_recon_sq; + if (j == 0) { + float total = 0.0f; + for (int w = 0; w < n_warps; w++) total += warp_accum[w]; + s_recon_sq = total; + } + __syncthreads(); + const float recon_norm = sqrtf(s_recon_sq); + const float corrected_norm = (recon_norm > 1e-10f) ? grp_norm / recon_norm : grp_norm; + + // ---- Step 8: Write corrected norm (one thread per turbo3 sub-block) ---- + if (lane == 0) blk->norm = __float2half(corrected_norm); + + GGML_UNUSED(ne10); + GGML_UNUSED(ne13); +} + +// ---- TurboQuant3 tail kernel: straight 3-bit quantize without WHT rotation ---- +// +// For head dims not divisible by 128 (e.g. 576 = 4*128 + 64), the remainder +// elements can't use the 128-element WHT. They are quantised directly into +// standard turbo3 blocks. Q is also NOT rotated for these positions (the graph +// guards on ne[0] % 128), so stays in the original space. +// +// One CUDA block per row, with tail_size threads (must be multiple of 32). + +template +static __global__ void k_set_rows_turbo3_tail( + const float * __restrict__ src0, + const idx_t * __restrict__ src1, + block_turbo3_0 * __restrict__ dst, + const int64_t ne00, + const int64_t ne01, + const int64_t ne10, + const int64_t ne11, + const int64_t ne12, + const int64_t ne13, + const int64_t s01, + const int64_t s02, + const int64_t s03, + const int64_t s10, + const int64_t s11, + const int64_t s12, + const int64_t s1, + const int64_t s2, + const int64_t s3, + const int tail_size) { + + const int j = threadIdx.x; // 0 .. 
tail_size-1 + + // Decode blockIdx.x → (i01, i02, i03) + int64_t tmp = blockIdx.x; + const int64_t i01 = tmp % ne01; tmp /= ne01; + const int64_t i02 = tmp % ne12; + const int64_t i03 = tmp / ne12; + + const int64_t i11 = i01 % ne11; + const int64_t i10 = i01; + const int64_t i12 = i02; + + const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12); + const float * src_row = src0 + i01*s01 + i02*s02 + i03*s03; + block_turbo3_0 * dst_row_ptr = (block_turbo3_0 *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3); + + // Tail starts after all full 128-element groups + const int64_t n_full = ne00 / QK_TURBO3_GROUP; + const int64_t tail_start = n_full * QK_TURBO3_GROUP; + block_turbo3_0 * blk_base = dst_row_ptr + n_full * (QK_TURBO3_GROUP / QK_TURBO3); + + // ---- Load ---- + const float val = src_row[tail_start + j]; + + // ---- L2 norm over the tail group (warp reduce + inter-warp) ---- + const int n_warps = tail_size / WARP_SIZE; + const int warp_id = j / WARP_SIZE; + const int lane = j % WARP_SIZE; + + __shared__ float warp_accum[4]; // max 3 warps (tail ≤ 96) + float v2 = val * val; + for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) + v2 += __shfl_xor_sync(0xffffffff, v2, offset); + if (lane == 0) warp_accum[warp_id] = v2; + __syncthreads(); + + __shared__ float s_norm_sq; + if (j == 0) { + float total = 0.0f; + for (int w = 0; w < n_warps; w++) total += warp_accum[w]; + s_norm_sq = total; + } + __syncthreads(); + const float grp_norm = sqrtf(s_norm_sq); + const float inv_norm = (grp_norm > 1e-10f) ? 1.0f / grp_norm : 0.0f; + + // ---- Normalize (no WHT!) ---- + const float rv = val * inv_norm; + + // ---- Quantize ---- + const uint8_t idx = turbo_nearest_centroid_3bit(rv); + + // ---- Pack qs and signs (same warp-cooperative logic) ---- + block_turbo3_0 * blk = blk_base + warp_id; + + const uint8_t my_low2 = idx & 0x3; + uint8_t qs_byte = 0; +#pragma unroll + for (int k = 0; k < 4; k++) { + uint8_t contrib = __shfl_sync(0xffffffff, my_low2, (lane & ~3) + k); + qs_byte |= contrib << (k * 2); + } + if (lane % 4 == 0) blk->qs[lane / 4] = qs_byte; + + const uint32_t ballot = __ballot_sync(0xffffffff, (idx >> 2) & 1); + const int signs_byte_idx = lane / 8; + const uint8_t signs_byte = (uint8_t)((ballot >> (signs_byte_idx * 8)) & 0xFF); + if (lane % 8 == 0) blk->signs[signs_byte_idx] = signs_byte; + + // ---- Reconstruction norm ---- + const float c = TURBO_CENTROIDS_3BIT[idx]; + float rc = c * c; + for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) + rc += __shfl_xor_sync(0xffffffff, rc, offset); + if (lane == 0) warp_accum[warp_id] = rc; + __syncthreads(); + + __shared__ float s_recon_sq; + if (j == 0) { + float total = 0.0f; + for (int w = 0; w < n_warps; w++) total += warp_accum[w]; + s_recon_sq = total; + } + __syncthreads(); + const float recon_norm = sqrtf(s_recon_sq); + const float corrected_norm = (recon_norm > 1e-10f) ? grp_norm / recon_norm : grp_norm; + + if (lane == 0) blk->norm = __float2half(corrected_norm); + + GGML_UNUSED(ne10); + GGML_UNUSED(ne13); +} + +template +static void set_rows_cuda_turbo3( + ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst) { + + const float * src0_d = (const float *)src0->data; + const idx_t * src1_d = (const idx_t *)src1->data; + + GGML_TENSOR_BINARY_OP_LOCALS + GGML_ASSERT(ne00 % QK_TURBO3 == 0); // must be block-aligned (32) + + cudaStream_t stream = ctx.stream(); + + // Read WHT group size from op_params (set by llama-kv-cache.cpp based on head_dim). 
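+    // With group_size 128, a head_dim-128 row is a single WHT group of
+    // 128 / QK_TURBO3 = 4 turbo3 blocks; group_size 64 serves head dims that are a
+    // multiple of 64 but not of 128.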
+ // Default to 128 if not set (backward compat with head_dim=128 models). + int group_size = 128; + memcpy(&group_size, dst->op_params, sizeof(int)); + if (group_size != 64 && group_size != 128) group_size = 128; + GGML_ASSERT(ne00 % group_size == 0); + + const int64_t n_full_groups = ne00 / group_size; + const int tail_size = (int)(ne00 % group_size); + + const int64_t s01 = nb01/sizeof(float); + const int64_t s02 = nb02/sizeof(float); + const int64_t s03 = nb03/sizeof(float); + const int64_t s10 = nb10/sizeof(idx_t); + const int64_t s11 = nb11/sizeof(idx_t); + const int64_t s12 = nb12/sizeof(idx_t); + + // InnerQ: check/finalize calibration before kernel launch + turbo_innerq_check_finalize(group_size, ne00); + + // Launch 1: full groups with WHT rotation + if (n_full_groups > 0) { + const int64_t ne_total = n_full_groups * ne01 * ne02 * ne03; + if (group_size == 128) { + k_set_rows_turbo3<<<(int)ne_total, 128, 0, stream>>>( + src0_d, src1_d, (block_turbo3_0 *)dst->data, + ne00, ne01, ne10, ne11, ne12, ne13, + s01, s02, s03, s10, s11, s12, + nb1, nb2, nb3); + } else { + k_set_rows_turbo3<<<(int)ne_total, 64, 0, stream>>>( + src0_d, src1_d, (block_turbo3_0 *)dst->data, + ne00, ne01, ne10, ne11, ne12, ne13, + s01, s02, s03, s10, s11, s12, + nb1, nb2, nb3); + } + } + + // Launch 2: tail elements (no WHT, straight quantize) + // Not needed for 64-aligned dims but kept for potential future use + if (tail_size > 0) { + GGML_ASSERT(tail_size % QK_TURBO3 == 0); // tail must be block-aligned + const int64_t n_rows = ne01 * ne02 * ne03; + k_set_rows_turbo3_tail<<<(int)n_rows, tail_size, 0, stream>>>( + src0_d, src1_d, (block_turbo3_0 *)dst->data, + ne00, ne01, ne10, ne11, ne12, ne13, + s01, s02, s03, s10, s11, s12, + nb1, nb2, nb3, tail_size); + } +} + +// ---- TurboQuant2 set_rows: GROUP_SIZE-element groups with WHT rotation + norm correction ---- +// +// Same structure as turbo3 but 2-bit quantization only (no signs byte). 
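+// For reference, the 2-bit quantizer reduces to three midpoint compares; a sketch of
+// what turbo_nearest_centroid_2bit is assumed to compute (using TURBO_MID_2BIT):
+//   idx = (v > TURBO_MID_2BIT[0]) + (v > TURBO_MID_2BIT[1]) + (v > TURBO_MID_2BIT[2]);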
+ +template +__launch_bounds__(128) +static __global__ void k_set_rows_turbo2( + const float * __restrict__ src0, + const idx_t * __restrict__ src1, + block_turbo2_0 * __restrict__ dst, + const int64_t ne00, + const int64_t ne01, + const int64_t ne10, + const int64_t ne11, + const int64_t ne12, + const int64_t ne13, + const int64_t s01, + const int64_t s02, + const int64_t s03, + const int64_t s10, + const int64_t s11, + const int64_t s12, + const int64_t s1, + const int64_t s2, + const int64_t s3) { + + static_assert(GROUP_SIZE == 128 || GROUP_SIZE == 64, "GROUP_SIZE must be 128 or 64"); + + const int j = threadIdx.x; + + constexpr int blocks_per_group = GROUP_SIZE / QK_TURBO2; + const int64_t n_groups_per_row = ne00 / GROUP_SIZE; + const int64_t g = blockIdx.x; + const int64_t i_grp = g % n_groups_per_row; + int64_t tmp = g / n_groups_per_row; + const int64_t i01 = tmp % ne01; + tmp = tmp / ne01; + const int64_t i02 = tmp % ne12; + const int64_t i03 = tmp / ne12; + + const int64_t i12 = i02; + const int64_t i11 = i01 % ne11; + const int64_t i10 = i01; + + const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12); + const float * src_row = src0 + i01*s01 + i02*s02 + i03*s03; + block_turbo2_0 * dst_row_ptr = (block_turbo2_0 *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3); + block_turbo2_0 * blk_base = dst_row_ptr + i_grp * blocks_per_group; + + // ---- Step 1: Load element j (coalesced) ---- + __shared__ float x[GROUP_SIZE]; + x[j] = src_row[i_grp * GROUP_SIZE + j]; + __syncthreads(); + + // ---- InnerQ: calibrate on original (unscaled) values ---- + if (d_innerq_calibrating) { + atomicAdd(&d_innerq_sq_accum[j], x[j] * x[j]); + if (j == 0) atomicAdd(&d_innerq_count, 1); + } + + // ---- InnerQ: apply channel scale (only when active) ---- + if (d_innerq_active) { + x[j] *= d_innerq_scale[j]; + } + __syncthreads(); + + // ---- Step 2: Parallel L2 norm ---- + constexpr int n_warps = GROUP_SIZE / WARP_SIZE; + __shared__ float warp_accum[n_warps]; + float v = x[j]; + float v2 = v * v; + for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) + v2 += __shfl_xor_sync(0xffffffff, v2, offset); + if (j % WARP_SIZE == 0) + warp_accum[j / WARP_SIZE] = v2; + __syncthreads(); + + __shared__ float s_norm_sq; + if (j == 0) { + float total = 0.0f; + for (int w = 0; w < n_warps; w++) total += warp_accum[w]; + s_norm_sq = total; + } + __syncthreads(); + const float grp_norm = sqrtf(s_norm_sq); + const float inv_norm = (grp_norm > 1e-10f) ? 1.0f / grp_norm : 0.0f; + + // ---- Step 3: Normalize ---- + x[j] *= inv_norm; + __syncthreads(); + + // ---- Step 4: Forward WHT ---- + if (GROUP_SIZE == 128) { + x[j] *= TURBO_WHT_SIGNS1[j]; + } else { + x[j] *= TURBO_WHT_SIGNS1_64[j]; + } + __syncthreads(); + +#define WHT_STAGE_SHARED_T2(h) \ + if (j % (2*(h)) < (h)) { float a = x[j], b = x[j+(h)]; x[j] = a+b; x[j+(h)] = a-b; } \ + __syncthreads(); + + WHT_STAGE_SHARED_T2(1) + WHT_STAGE_SHARED_T2(2) + WHT_STAGE_SHARED_T2(4) + WHT_STAGE_SHARED_T2(8) + WHT_STAGE_SHARED_T2(16) + WHT_STAGE_SHARED_T2(32) + if (GROUP_SIZE == 128) { WHT_STAGE_SHARED_T2(64) } +#undef WHT_STAGE_SHARED_T2 + + constexpr float inv_sqrt_group = (GROUP_SIZE == 128) ? 
0.08838834764831845f : 0.125f; + if (GROUP_SIZE == 128) { + x[j] = x[j] * inv_sqrt_group * TURBO_WHT_SIGNS2[j]; + } else { + x[j] = x[j] * inv_sqrt_group * TURBO_WHT_SIGNS2_64[j]; + } + __syncthreads(); + + // ---- Step 5: Quantize element j to 2-bit centroid ---- + const float rv = x[j]; + const uint8_t idx = turbo_nearest_centroid_2bit(rv); + + // ---- Step 6: Pack qs (warp-cooperative, no atomics) ---- + const int warp_id = j / WARP_SIZE; + const int lane = j % WARP_SIZE; + block_turbo2_0 * blk = blk_base + warp_id; + + // Pack qs: 4 elements per byte, 2 bits each. + const uint8_t my_bits = idx & 0x3; + uint8_t qs_byte = 0; +#pragma unroll + for (int k = 0; k < 4; k++) { + uint8_t contrib = __shfl_sync(0xffffffff, my_bits, (lane & ~3) + k); + qs_byte |= contrib << (k * 2); + } + if (lane % 4 == 0) blk->qs[lane / 4] = qs_byte; + + // No signs packing needed for turbo2 + + // ---- Step 7: Reconstruction norm ---- + const float c = TURBO_CENTROIDS_2BIT[idx]; + float rc = c * c; + for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) + rc += __shfl_xor_sync(0xffffffff, rc, offset); + if (j % WARP_SIZE == 0) + warp_accum[j / WARP_SIZE] = rc; + __syncthreads(); + + __shared__ float s_recon_sq; + if (j == 0) { + float total = 0.0f; + for (int w = 0; w < n_warps; w++) total += warp_accum[w]; + s_recon_sq = total; + } + __syncthreads(); + const float recon_norm = sqrtf(s_recon_sq); + const float corrected_norm = (recon_norm > 1e-10f) ? grp_norm / recon_norm : grp_norm; + + // ---- Step 8: Write corrected norm ---- + if (lane == 0) blk->norm = __float2half(corrected_norm); + + GGML_UNUSED(ne10); + GGML_UNUSED(ne13); +} + +// ---- TurboQuant2 tail kernel: straight 2-bit quantize without WHT rotation ---- + +template +static __global__ void k_set_rows_turbo2_tail( + const float * __restrict__ src0, + const idx_t * __restrict__ src1, + block_turbo2_0 * __restrict__ dst, + const int64_t ne00, + const int64_t ne01, + const int64_t ne10, + const int64_t ne11, + const int64_t ne12, + const int64_t ne13, + const int64_t s01, + const int64_t s02, + const int64_t s03, + const int64_t s10, + const int64_t s11, + const int64_t s12, + const int64_t s1, + const int64_t s2, + const int64_t s3, + const int tail_size) { + + const int j = threadIdx.x; + + int64_t tmp = blockIdx.x; + const int64_t i01 = tmp % ne01; tmp /= ne01; + const int64_t i02 = tmp % ne12; + const int64_t i03 = tmp / ne12; + + const int64_t i11 = i01 % ne11; + const int64_t i10 = i01; + const int64_t i12 = i02; + + const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12); + const float * src_row = src0 + i01*s01 + i02*s02 + i03*s03; + block_turbo2_0 * dst_row_ptr = (block_turbo2_0 *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3); + + const int64_t n_full = ne00 / QK_TURBO2_GROUP; + const int64_t tail_start = n_full * QK_TURBO2_GROUP; + block_turbo2_0 * blk_base = dst_row_ptr + n_full * (QK_TURBO2_GROUP / QK_TURBO2); + + // ---- Load ---- + const float val = src_row[tail_start + j]; + + // ---- L2 norm ---- + const int n_warps = tail_size / WARP_SIZE; + const int warp_id = j / WARP_SIZE; + const int lane = j % WARP_SIZE; + + __shared__ float warp_accum[4]; + float v2 = val * val; + for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) + v2 += __shfl_xor_sync(0xffffffff, v2, offset); + if (lane == 0) warp_accum[warp_id] = v2; + __syncthreads(); + + __shared__ float s_norm_sq; + if (j == 0) { + float total = 0.0f; + for (int w = 0; w < n_warps; w++) total += warp_accum[w]; + s_norm_sq = total; + } + __syncthreads(); + const float 
grp_norm = sqrtf(s_norm_sq); + const float inv_norm = (grp_norm > 1e-10f) ? 1.0f / grp_norm : 0.0f; + + // ---- Normalize (no WHT!) ---- + const float rv = val * inv_norm; + + // ---- Quantize ---- + const uint8_t idx = turbo_nearest_centroid_2bit(rv); + + // ---- Pack qs ---- + block_turbo2_0 * blk = blk_base + warp_id; + + const uint8_t my_bits = idx & 0x3; + uint8_t qs_byte = 0; +#pragma unroll + for (int k = 0; k < 4; k++) { + uint8_t contrib = __shfl_sync(0xffffffff, my_bits, (lane & ~3) + k); + qs_byte |= contrib << (k * 2); + } + if (lane % 4 == 0) blk->qs[lane / 4] = qs_byte; + + // ---- Reconstruction norm ---- + const float c = TURBO_CENTROIDS_2BIT[idx]; + float rc = c * c; + for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) + rc += __shfl_xor_sync(0xffffffff, rc, offset); + if (lane == 0) warp_accum[warp_id] = rc; + __syncthreads(); + + __shared__ float s_recon_sq; + if (j == 0) { + float total = 0.0f; + for (int w = 0; w < n_warps; w++) total += warp_accum[w]; + s_recon_sq = total; + } + __syncthreads(); + const float recon_norm = sqrtf(s_recon_sq); + const float corrected_norm = (recon_norm > 1e-10f) ? grp_norm / recon_norm : grp_norm; + + if (lane == 0) blk->norm = __float2half(corrected_norm); + + GGML_UNUSED(ne10); + GGML_UNUSED(ne13); + GGML_UNUSED(ne00); +} + +template +static void set_rows_cuda_turbo2( + ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst) { + + const float * src0_d = (const float *)src0->data; + const idx_t * src1_d = (const idx_t *)src1->data; + + GGML_TENSOR_BINARY_OP_LOCALS + GGML_ASSERT(ne00 % QK_TURBO2 == 0); + + cudaStream_t stream = ctx.stream(); + + int group_size = 128; + memcpy(&group_size, dst->op_params, sizeof(int)); + if (group_size != 64 && group_size != 128) group_size = 128; + GGML_ASSERT(ne00 % group_size == 0); + + const int64_t n_full_groups = ne00 / group_size; + const int tail_size = (int)(ne00 % group_size); + + const int64_t s01 = nb01/sizeof(float); + const int64_t s02 = nb02/sizeof(float); + const int64_t s03 = nb03/sizeof(float); + const int64_t s10 = nb10/sizeof(idx_t); + const int64_t s11 = nb11/sizeof(idx_t); + const int64_t s12 = nb12/sizeof(idx_t); + + // InnerQ: check/finalize calibration before kernel launch + turbo_innerq_check_finalize(group_size, ne00); + + if (n_full_groups > 0) { + const int64_t ne_total = n_full_groups * ne01 * ne02 * ne03; + if (group_size == 128) { + k_set_rows_turbo2<<<(int)ne_total, 128, 0, stream>>>( + src0_d, src1_d, (block_turbo2_0 *)dst->data, + ne00, ne01, ne10, ne11, ne12, ne13, + s01, s02, s03, s10, s11, s12, + nb1, nb2, nb3); + } else { + k_set_rows_turbo2<<<(int)ne_total, 64, 0, stream>>>( + src0_d, src1_d, (block_turbo2_0 *)dst->data, + ne00, ne01, ne10, ne11, ne12, ne13, + s01, s02, s03, s10, s11, s12, + nb1, nb2, nb3); + } + } + + if (tail_size > 0) { + GGML_ASSERT(tail_size % QK_TURBO2 == 0); + const int64_t n_rows = ne01 * ne02 * ne03; + k_set_rows_turbo2_tail<<<(int)n_rows, tail_size, 0, stream>>>( + src0_d, src1_d, (block_turbo2_0 *)dst->data, + ne00, ne01, ne10, ne11, ne12, ne13, + s01, s02, s03, s10, s11, s12, + nb1, nb2, nb3, tail_size); + } +} + template static void set_rows_cuda(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const src_t * src0_d = (const src_t *)src0->data; @@ -309,6 +1028,10 @@ static void set_rows_cuda(ggml_backend_cuda_context & ctx, const ggml_tensor * s nb1, nb2, nb3, stream ); + } else if (dst->type == GGML_TYPE_TURBO3_0) { + 
set_rows_cuda_turbo3(ctx, src0, src1, dst); + } else if (dst->type == GGML_TYPE_TURBO2_0) { + set_rows_cuda_turbo2(ctx, src0, src1, dst); } else { GGML_ABORT("unsupported type %s", ggml_type_name(dst->type)); } diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo2_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo2_0.cu new file mode 100644 index 00000000000..3630d871af4 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo2_0.cu @@ -0,0 +1,7 @@ +// Mixed KV: q8_0 K + turbo2 V + +#include "../fattn-vec.cuh" + +DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_TURBO2_0); +DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_TURBO2_0); +DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_TURBO2_0); diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo3_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo3_0.cu new file mode 100644 index 00000000000..c8a4d9f8993 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-turbo3_0.cu @@ -0,0 +1,7 @@ +// Mixed KV: q8_0 K + turbo3 V + +#include "../fattn-vec.cuh" + +DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_TURBO3_0); +DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_TURBO3_0); +DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_TURBO3_0); diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-q8_0.cu new file mode 100644 index 00000000000..c9cd1afa8eb --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-q8_0.cu @@ -0,0 +1,7 @@ +// Mixed KV: turbo2 K + q8_0 V + +#include "../fattn-vec.cuh" + +DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0); +DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0); +DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_Q8_0); diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-turbo2_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-turbo2_0.cu new file mode 100644 index 00000000000..f68422d398a --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-turbo2_0-turbo2_0.cu @@ -0,0 +1,7 @@ +// TurboQuant2 CUDA flash attention vec kernel instantiation + +#include "../fattn-vec.cuh" + +DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0); +DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0); +DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO2_0, GGML_TYPE_TURBO2_0); diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-q8_0.cu new file mode 100644 index 00000000000..601e4da41d4 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-q8_0.cu @@ -0,0 +1,7 @@ +// Mixed KV: turbo3 K + q8_0 V + +#include "../fattn-vec.cuh" + +DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0); +DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0); +DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_Q8_0); diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo3_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo3_0.cu new file mode 100644 index 00000000000..0322a3b0d09 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo3_0.cu @@ -0,0 +1,7 @@ +// TurboQuant3 CUDA flash attention vec kernel 
instantiation + +#include "../fattn-vec.cuh" + +DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0); +DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0); +DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0); diff --git a/ggml/src/ggml-cuda/turbo-innerq.cu b/ggml/src/ggml-cuda/turbo-innerq.cu new file mode 100644 index 00000000000..a8455aafc37 --- /dev/null +++ b/ggml/src/ggml-cuda/turbo-innerq.cu @@ -0,0 +1,32 @@ +#include "turbo-innerq.cuh" +#include + +// Host-side shared state for InnerQ cross-TU communication +bool g_innerq_finalized = false; +float g_innerq_scale_inv_host[INNERQ_MAX_CHANNELS] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +}; + +static bool g_innerq_tensor_needs_update = false; + +void turbo_innerq_publish(const float * scale_inv, int group_size) { + for (int i = 0; i < group_size && i < INNERQ_MAX_CHANNELS; i++) { + g_innerq_scale_inv_host[i] = scale_inv[i]; + } + for (int i = group_size; i < INNERQ_MAX_CHANNELS; i++) { + g_innerq_scale_inv_host[i] = 1.0f; + } + g_innerq_finalized = true; + g_innerq_tensor_needs_update = true; +} + +bool turbo_innerq_needs_tensor_update(void) { + return g_innerq_tensor_needs_update; +} + +void turbo_innerq_mark_tensor_updated(void) { + g_innerq_tensor_needs_update = false; +} diff --git a/ggml/src/ggml-cuda/turbo-innerq.cuh b/ggml/src/ggml-cuda/turbo-innerq.cuh new file mode 100644 index 00000000000..90103aee339 --- /dev/null +++ b/ggml/src/ggml-cuda/turbo-innerq.cuh @@ -0,0 +1,21 @@ +#pragma once + +// TurboQuant InnerQ per-channel equalization — cross-TU shared state +// The host-side state lives in turbo-innerq.cu; device-side state is per-TU +// in turbo-quant.cuh (only set-rows.cu needs device access). + +#define INNERQ_MAX_CHANNELS 128 + +// Host-side shared state (defined in turbo-innerq.cu) +extern bool g_innerq_finalized; +extern float g_innerq_scale_inv_host[INNERQ_MAX_CHANNELS]; + +// Called from set-rows.cu after InnerQ finalization to publish scale_inv +void turbo_innerq_publish(const float * scale_inv, int group_size); + +// Called from llama-kv-cache.cpp (or equivalent) to check if tensor needs update +// Returns true if there are new scale_inv values to upload +bool turbo_innerq_needs_tensor_update(void); + +// Called after tensor update to clear the flag +void turbo_innerq_mark_tensor_updated(void); diff --git a/ggml/src/ggml-cuda/turbo-quant.cuh b/ggml/src/ggml-cuda/turbo-quant.cuh new file mode 100644 index 00000000000..8590cae1145 --- /dev/null +++ b/ggml/src/ggml-cuda/turbo-quant.cuh @@ -0,0 +1,361 @@ +/* + * TurboQuant CUDA kernels for KV cache compression + * Based on: arXiv 2504.19874 (ICLR 2026) + * + * Implements GGML_TYPE_TURBO3_0 (3-bit PolarQuant, block size 32) + * Constants, WHT rotation, quantize/dequantize device functions. 
+ */ + +#pragma once + +#include "common.cuh" +#include "turbo-innerq.cuh" +#include +#include + +// ---- Quantization ratios for dequantize_block template ---- +#define QR_TURBO3 1 // Each dequantize call produces 2 consecutive elements (like q8_0) +#define QR_TURBO2 1 // Each dequantize call produces 2 consecutive elements (like q8_0) + +// ---- 2-bit centroids (Lloyd-Max for N(0, 1/128)) ---- + +static __constant__ float TURBO_CENTROIDS_2BIT[4] = { + -0.133462f, -0.039994f, 0.039994f, 0.133462f +}; + +static __constant__ float TURBO_MID_2BIT[3] = { + -0.086728f, 0.0f, 0.086728f +}; + +// ---- 3-bit centroids (Lloyd-Max for N(0, 1/128)) ---- + +static __constant__ float TURBO_CENTROIDS_3BIT[8] = { + -0.190685f, -0.117832f, -0.065717f, -0.021460f, + 0.021460f, 0.065717f, 0.117832f, 0.190685f +}; + +// ---- Midpoints for nearest centroid lookup ---- + +static __constant__ float TURBO_MID_3BIT[7] = { + -0.154259f, -0.091775f, -0.043589f, 0.0f, + 0.043589f, 0.091775f, 0.154259f +}; + +// ---- WHT sign arrays (seed=42) ---- + +static __constant__ float TURBO_WHT_SIGNS1[128] = { + -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, + -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, + -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, + 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, + -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, + 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f +}; + +static __constant__ float TURBO_WHT_SIGNS2[128] = { + 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, + 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, + 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, + 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, + -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, + 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, + -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f +}; + +// ---- 64-element WHT sign arrays (first 64 of the 128-element arrays) ---- + +static __constant__ float TURBO_WHT_SIGNS1_64[64] = { + -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, + -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f +}; + +static __constant__ float TURBO_WHT_SIGNS2_64[64] = { + 1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 
-1.0f, -1.0f, -1.0f, + 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, + 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f +}; + +// ---- Fast Walsh-Hadamard Transform (in-place, normalized) ---- +// O(n log n) = 896 ops for n=128 + +static __device__ __forceinline__ void turbo_fwht_128(float * x) { + for (int h = 1; h < 128; h *= 2) { + for (int i = 0; i < 128; i += h * 2) { + for (int j = i; j < i + h; j++) { + float a = x[j]; + float b = x[j + h]; + x[j] = a + b; + x[j + h] = a - b; + } + } + } + const float inv_sqrt_128 = 0.08838834764831845f; + for (int i = 0; i < 128; i++) { + x[i] *= inv_sqrt_128; + } +} + +// ---- Fast Walsh-Hadamard Transform for 64-element groups ---- +// O(n log n) = 384 ops for n=64 + +static __device__ __forceinline__ void turbo_fwht_64(float * x) { + for (int h = 1; h < 64; h *= 2) { + for (int i = 0; i < 64; i += h * 2) { + for (int j = i; j < i + h; j++) { + float a = x[j]; + float b = x[j + h]; + x[j] = a + b; + x[j + h] = a - b; + } + } + } + const float inv_sqrt_64 = 0.125f; + for (int i = 0; i < 64; i++) { + x[i] *= inv_sqrt_64; + } +} + +// ---- Forward rotation: signs1 → FWHT → signs2 ---- + +static __device__ __forceinline__ void turbo_rotate_forward(float * x) { + for (int i = 0; i < 128; i++) x[i] *= TURBO_WHT_SIGNS1[i]; + turbo_fwht_128(x); + for (int i = 0; i < 128; i++) x[i] *= TURBO_WHT_SIGNS2[i]; +} + +// ---- Forward rotation for 64-element groups ---- + +static __device__ __forceinline__ void turbo_rotate_forward_64(float * x) { + for (int i = 0; i < 64; i++) x[i] *= TURBO_WHT_SIGNS1_64[i]; + turbo_fwht_64(x); + for (int i = 0; i < 64; i++) x[i] *= TURBO_WHT_SIGNS2_64[i]; +} + +// ---- InnerQ per-channel equalization ---- +// Equalizes K channel variances before WHT rotation to reduce quantization error. +// Enabled via TURBO_INNERQ=N env var (N = calibration token count). +// Math: = preserves dot products. 
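For reference, the dot-product identity the "Math:" comment above appeals to (its angle-bracket expressions were lost in extraction) is presumably dot(Q/s, s*K) = dot(Q, K): dividing the query channels by the same per-channel scales that multiply K leaves attention scores unchanged. The same argument covers the rotation itself: signs1 -> FWHT -> signs2 is a product of +-1 diagonal matrices and a normalized Hadamard matrix, all orthogonal, so it preserves dot products, and signs2 -> FWHT -> signs1 undoes it exactly. A standalone host-side sanity check of both properties follows; it is not part of the patch, the helper names are illustrative, and it uses random +-1 signs where the real kernels use the fixed seed-42 arrays.

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define N 128

static void fwht(float * x) {                       // normalized in-place WHT, same shape as turbo_fwht_128
    for (int h = 1; h < N; h *= 2)
        for (int i = 0; i < N; i += 2*h)
            for (int j = i; j < i + h; j++) {
                float a = x[j], b = x[j + h];
                x[j] = a + b; x[j + h] = a - b;
            }
    for (int i = 0; i < N; i++) x[i] *= 0.08838834764831845f;  // 1/sqrt(128)
}

static void rotate(float * x, const float * sa, const float * sb) {  // sa -> WHT -> sb
    for (int i = 0; i < N; i++) x[i] *= sa[i];
    fwht(x);
    for (int i = 0; i < N; i++) x[i] *= sb[i];
}

int main(void) {
    float s1[N], s2[N], q[N], k[N], qr[N], kr[N];
    srand(42);
    for (int i = 0; i < N; i++) {
        s1[i] = (rand() & 1) ? 1.0f : -1.0f;
        s2[i] = (rand() & 1) ? 1.0f : -1.0f;
        q[i]  = (float)rand()/RAND_MAX - 0.5f;
        k[i]  = (float)rand()/RAND_MAX - 0.5f;
        qr[i] = q[i]; kr[i] = k[i];
    }
    rotate(qr, s1, s2);                              // forward: signs1 -> WHT -> signs2
    rotate(kr, s1, s2);
    double d0 = 0, d1 = 0;
    for (int i = 0; i < N; i++) { d0 += q[i]*k[i]; d1 += qr[i]*kr[i]; }
    printf("dot before %.6f  after %.6f\n", d0, d1); // identical up to fp rounding

    rotate(qr, s2, s1);                              // inverse: signs2 -> WHT -> signs1
    double err = 0;
    for (int i = 0; i < N; i++) err += fabs(qr[i] - q[i]);
    printf("round-trip L1 error %.3g\n", err);       // ~1e-6
    return 0;
}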
+// INNERQ_MAX_CHANNELS is defined in turbo-innerq.cuh + +static __device__ float d_innerq_scale[INNERQ_MAX_CHANNELS]; +static __device__ float d_innerq_scale_inv[INNERQ_MAX_CHANNELS]; +static __device__ float d_innerq_sq_accum[INNERQ_MAX_CHANNELS]; +static __device__ int d_innerq_count; +static __device__ int d_innerq_active; // 0 = scales are identity, 1 = scales applied +static __device__ int d_innerq_calibrating; // 1 = accumulating K² stats + +static int innerq_enabled = 0; // host: 0=off, 1=calibrating, 2=active +static int innerq_target_tokens = 0; +static float innerq_strength = 0.5f; +static bool innerq_initialized = false; + +// Host: read TURBO_INNERQ env, start calibration if enabled +static void turbo_innerq_init(void) { + if (innerq_initialized) return; + innerq_initialized = true; + + const char * env = getenv("TURBO_INNERQ"); + if (!env || atoi(env) <= 0) { + innerq_enabled = 0; + return; + } + innerq_target_tokens = atoi(env); + innerq_enabled = 1; // calibrating + + const char * env_str = getenv("TURBO_INNERQ_STRENGTH"); + if (env_str) innerq_strength = atof(env_str); + if (innerq_strength <= 0.0f || innerq_strength > 1.0f) innerq_strength = 0.5f; + + // Zero accumulators and set calibrating flag on device + float zeros[INNERQ_MAX_CHANNELS] = {0}; + int zero = 0, one = 1; + cudaMemcpyToSymbol(d_innerq_sq_accum, zeros, sizeof(zeros)); + cudaMemcpyToSymbol(d_innerq_count, &zero, sizeof(int)); + cudaMemcpyToSymbol(d_innerq_active, &zero, sizeof(int)); + cudaMemcpyToSymbol(d_innerq_calibrating, &one, sizeof(int)); + + GGML_LOG_INFO("%s: InnerQ calibration started (target=%d tokens, strength=%.2f)\n", + __func__, innerq_target_tokens, innerq_strength); +} + +// Host: finalize calibration — compute scales, upload, activate +static void turbo_innerq_finalize(int group_size) { + // Read accumulators from device + float sq_accum[INNERQ_MAX_CHANNELS]; + int count = 0; + cudaMemcpyFromSymbol(sq_accum, d_innerq_sq_accum, group_size * sizeof(float)); + cudaMemcpyFromSymbol(&count, d_innerq_count, sizeof(int)); + + if (count <= 0) { + GGML_LOG_WARN("%s: InnerQ calibration got 0 tokens, disabling\n", __func__); + innerq_enabled = 0; + int zero = 0; + cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)); + return; + } + + // Compute per-channel RMS + float rms[INNERQ_MAX_CHANNELS]; + float mean_rms = 0.0f; + float max_ratio = 0.0f, min_ratio = 1e30f; + for (int i = 0; i < group_size; i++) { + rms[i] = sqrtf(sq_accum[i] / (float)count); + mean_rms += rms[i]; + } + mean_rms /= (float)group_size; + + // Compute scale[i] = (mean_rms / channel_rms[i])^strength, clamp to [0.5, 2.0] + float scale[INNERQ_MAX_CHANNELS]; + float scale_inv[INNERQ_MAX_CHANNELS]; + for (int i = 0; i < group_size; i++) { + float ratio = (rms[i] > 1e-10f) ? 
(mean_rms / rms[i]) : 1.0f; + float s = powf(ratio, innerq_strength); + if (s < 0.5f) s = 0.5f; + if (s > 2.0f) s = 2.0f; + scale[i] = s; + scale_inv[i] = 1.0f / s; + if (ratio > max_ratio) max_ratio = ratio; + if (ratio < min_ratio) min_ratio = ratio; + } + + // Auto-skip if max channel ratio < 1.2 (already balanced) + if (max_ratio < 1.2f && min_ratio > (1.0f / 1.2f)) { + GGML_LOG_INFO("%s: InnerQ auto-disabled (channels already balanced, max_ratio=%.3f)\n", + __func__, max_ratio); + innerq_enabled = 0; + int zero = 0; + cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)); + return; + } + + // Stop calibrating, upload scales, activate + int zero = 0, one = 1; + cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)); + cudaMemcpyToSymbol(d_innerq_scale, scale, group_size * sizeof(float)); + cudaMemcpyToSymbol(d_innerq_scale_inv, scale_inv, group_size * sizeof(float)); + cudaDeviceSynchronize(); // ensure scales are visible before activating + cudaMemcpyToSymbol(d_innerq_active, &one, sizeof(int)); + + innerq_enabled = 2; // active + + // Publish scale_inv to shared host state for cross-TU tensor update + turbo_innerq_publish(scale_inv, group_size); + + GGML_LOG_INFO("%s: InnerQ finalized (%d tokens, max_ratio=%.3f, min_ratio=%.3f)\n", + __func__, count, max_ratio, min_ratio); +} + +// Host: called before each set_rows kernel launch +static void turbo_innerq_check_finalize(int group_size, int64_t ne00) { + if (!innerq_initialized) { + turbo_innerq_init(); + } + if (innerq_enabled == 0) return; + + // InnerQ only works when each WHT group = one head (group_size == head_dim). + // For standard models: ne00 = n_heads * head_dim, group_size = head_dim → ne00 % group_size == 0, fine. + // For non-standard models (head_dim > group_size, e.g. GLM 576 → 64-group): + // ne00 = head_dim (single head), group_size = 64, ne00/group_size = 9 groups per head → WRONG. + // Detect: if ne00 / group_size doesn't divide evenly into standard head counts (1,2,4,8,16,32,64,128), + // it's likely multi-group-per-head. Simpler check: group_size < 128 means head_dim > 128. 
+ const bool multi_group_per_head = (group_size < 128); // 64-group → head_dim > 128, multi-group + if (multi_group_per_head) { + if (innerq_enabled == 1) { + GGML_LOG_WARN("%s: InnerQ disabled (ne00=%lld != group_size=%d, multi-group heads)\n", + __func__, (long long)ne00, group_size); + innerq_enabled = 0; + int zero = 0; + cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)); + } + return; + } + + // Check if calibration is complete + if (innerq_enabled == 1) { + int count = 0; + cudaMemcpyFromSymbol(&count, d_innerq_count, sizeof(int)); + if (count >= innerq_target_tokens) { + turbo_innerq_finalize(group_size); + } + } +} + +// Host: check if InnerQ is currently active (finalized) +static bool turbo_innerq_is_active(void) { + return innerq_enabled == 2; +} + +// ---- Nearest 3-bit centroid index ---- + +static __device__ __forceinline__ uint8_t turbo_nearest_centroid_3bit(float val) { + if (val < TURBO_MID_3BIT[0]) return 0; + else if (val < TURBO_MID_3BIT[1]) return 1; + else if (val < TURBO_MID_3BIT[2]) return 2; + else if (val < TURBO_MID_3BIT[3]) return 3; + else if (val < TURBO_MID_3BIT[4]) return 4; + else if (val < TURBO_MID_3BIT[5]) return 5; + else if (val < TURBO_MID_3BIT[6]) return 6; + else return 7; +} + +// ---- Per-block quantize (32 elements, expects already-rotated input) ---- +// Used by set_rows after group-level WHT rotation + +static __device__ void quantize_f32_turbo3_0_block(const float * __restrict__ src, + block_turbo3_0 * __restrict__ dst) { + for (int j = 0; j < QK_TURBO3 / 4; j++) dst->qs[j] = 0; + for (int j = 0; j < QK_TURBO3 / 8; j++) dst->signs[j] = 0; + + for (int j = 0; j < QK_TURBO3; j++) { + uint8_t idx = turbo_nearest_centroid_3bit(src[j]); + dst->qs[j / 4] |= (idx & 0x3) << ((j % 4) * 2); + if (idx & 0x4) { + dst->signs[j / 8] |= (1 << (j % 8)); + } + } +} + +// ---- Inline dequant helper: extract one float from turbo3 block ---- + +static __device__ __forceinline__ float turbo3_dequant_element( + const block_turbo3_0 * __restrict__ x, int j, float norm) { + uint8_t low2 = (x->qs[j / 4] >> ((j % 4) * 2)) & 0x3; + uint8_t hi1 = (x->signs[j / 8] >> (j % 8)) & 0x1; + uint8_t idx = low2 | (hi1 << 2); + return TURBO_CENTROIDS_3BIT[idx] * norm; +} + +// ---- Nearest 2-bit centroid index ---- + +static __device__ __forceinline__ uint8_t turbo_nearest_centroid_2bit(float val) { + if (val < TURBO_MID_2BIT[0]) return 0; + else if (val < TURBO_MID_2BIT[1]) return 1; + else if (val < TURBO_MID_2BIT[2]) return 2; + else return 3; +} + +// ---- Per-block quantize for turbo2 (32 elements, expects already-rotated input) ---- + +static __device__ void quantize_f32_turbo2_0_block(const float * __restrict__ src, + block_turbo2_0 * __restrict__ dst) { + for (int j = 0; j < QK_TURBO2 / 4; j++) dst->qs[j] = 0; + + for (int j = 0; j < QK_TURBO2; j++) { + uint8_t idx = turbo_nearest_centroid_2bit(src[j]); + dst->qs[j / 4] |= (idx & 0x3) << ((j % 4) * 2); + } +} + +// ---- Inline dequant helper: extract one float from turbo2 block ---- + +static __device__ __forceinline__ float turbo2_dequant_element( + const block_turbo2_0 * __restrict__ x, int j, float norm) { + uint8_t idx = (x->qs[j / 4] >> ((j % 4) * 2)) & 0x3; + return TURBO_CENTROIDS_2BIT[idx] * norm; +} diff --git a/ggml/src/ggml-cuda/turbo-wht.cu b/ggml/src/ggml-cuda/turbo-wht.cu new file mode 100644 index 00000000000..ea9b63229e5 --- /dev/null +++ b/ggml/src/ggml-cuda/turbo-wht.cu @@ -0,0 +1,174 @@ +#include "turbo-quant.cuh" +#include "turbo-wht.cuh" + +// ─── CUDA kernel 
────────────────────────────────────────────────────────────── +// +// Templated on direction and group_size (128 or 64). +// One block per group, group_size threads per block. +// direction: 0 = forward (signs1 → WHT → signs2), 1 = inverse (signs2 → WHT → signs1) +// +// When head_dim is not a multiple of group_size, only the full groups +// within each head are processed. Tail elements are left unchanged (identity). +// +// Algorithm mirrors the CPU implementation in ggml-cpu/ops.cpp: +// 1. Apply s_first elementwise +// 2. Radix-2 Hadamard butterfly (log2(group_size) stages, in-place) +// 3. Normalize by 1/sqrt(group_size) and apply s_second elementwise +// +// InnerQ scale_inv: when non-null, applies per-channel inverse scaling for +// Q/V equalization. For forward (Q rotation): multiply BEFORE signs+WHT. +// For inverse (V un-rotation): multiply AFTER WHT+signs. + +template +static __global__ void k_turbo_wht_f32(const float * __restrict__ src, + float * __restrict__ dst, + const float * __restrict__ scale_inv, + int64_t n_groups, + int64_t head_dim, + int64_t groups_per_head) { + static_assert(group_size == 128 || group_size == 64, "group_size must be 128 or 64"); + + const int64_t g = blockIdx.x; + if (g >= n_groups) return; + + const int t = threadIdx.x; // 0 .. group_size-1 + + // Map group index to position in the tensor: + // each head has groups_per_head full groups, then a gap of tail elements. + const int64_t head_idx = g / groups_per_head; + const int64_t grp_in_head = g % groups_per_head; + const int64_t base = head_idx * head_dim + grp_in_head * group_size; + + __shared__ float x[group_size]; + + // Load from global memory + x[t] = src[base + t]; + __syncthreads(); + + // InnerQ forward: apply scale_inv BEFORE signs+WHT (for Q pre-rotation) + if (direction == 0 && scale_inv != nullptr) { + x[t] *= scale_inv[t % group_size]; + __syncthreads(); + } + + // Apply first sign array + if (group_size == 128) { + x[t] *= (direction == 0) ? TURBO_WHT_SIGNS1[t] : TURBO_WHT_SIGNS2[t]; + } else { + x[t] *= (direction == 0) ? TURBO_WHT_SIGNS1_64[t] : TURBO_WHT_SIGNS2_64[t]; + } + __syncthreads(); + + // WHT butterfly — log2(group_size) stages. + // In stage h, threads where (t % (2h)) < h read x[t] and x[t+h], + // then write x[t] = a+b and x[t+h] = a-b. Each active thread + // owns a disjoint pair, so no intra-stage conflicts exist. +#define WHT_STAGE(h) \ + if (t % (2*(h)) < (h)) { float a = x[t], b = x[t+(h)]; x[t] = a+b; x[t+(h)] = a-b; } \ + __syncthreads(); + + WHT_STAGE(1) + WHT_STAGE(2) + WHT_STAGE(4) + WHT_STAGE(8) + WHT_STAGE(16) + WHT_STAGE(32) + if (group_size == 128) { WHT_STAGE(64) } +#undef WHT_STAGE + + // Normalize and apply second sign array, write to output + constexpr float inv_sqrt = (group_size == 128) ? 0.08838834764831845f : 0.125f; + float result; + if (group_size == 128) { + result = x[t] * inv_sqrt * + ((direction == 0) ? TURBO_WHT_SIGNS2[t] : TURBO_WHT_SIGNS1[t]); + } else { + result = x[t] * inv_sqrt * + ((direction == 0) ? 
TURBO_WHT_SIGNS2_64[t] : TURBO_WHT_SIGNS1_64[t]); + } + + // InnerQ inverse: apply scale_inv AFTER WHT+signs (for V un-rotation) + if (direction == 1 && scale_inv != nullptr) { + result *= scale_inv[t % group_size]; + } + + dst[base + t] = result; +} + +// ─── Simple copy kernel for tail elements (identity pass-through) ──────────── + +static __global__ void k_turbo_wht_copy_tail(const float * __restrict__ src, + float * __restrict__ dst, + int64_t n_heads, + int64_t head_dim, + int64_t tail_offset, + int tail_size) { + const int64_t i = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n_heads * tail_size) return; + + const int64_t head_idx = i / tail_size; + const int64_t tail_elem = i % tail_size; + const int64_t offset = head_idx * head_dim + tail_offset + tail_elem; + dst[offset] = src[offset]; +} + +// ─── Dispatch ───────────────────────────────────────────────────────────────── + +void ggml_cuda_turbo_wht(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src = dst->src[0]; + const ggml_tensor * scale_tensor = dst->src[1]; // InnerQ scale_inv (may be NULL) + + GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src)); + GGML_ASSERT(ggml_is_contiguous(dst)); + + int direction; + int group_size; + memcpy(&direction, dst->op_params + 0, sizeof(int)); + memcpy(&group_size, dst->op_params + sizeof(int), sizeof(int)); + + const int64_t head_dim = src->ne[0]; + const int64_t n_heads = ggml_nelements(src) / head_dim; + + GGML_ASSERT(group_size == 64 || group_size == 128); + const int64_t groups_per_head = head_dim / group_size; + const int tail_size = (int)(head_dim % group_size); + const int64_t n_groups = groups_per_head * n_heads; + + const float * src_ptr = (const float *) src->data; + float * dst_ptr = (float *) dst->data; + const float * scale_inv_ptr = scale_tensor ? 
(const float *) scale_tensor->data : nullptr; + + cudaStream_t stream = ctx.stream(); + + // Process full groups + if (n_groups > 0) { + dim3 blocks(n_groups); + if (group_size == 128) { + dim3 threads(128); + if (direction == 0) { + k_turbo_wht_f32<0, 128><<>>(src_ptr, dst_ptr, scale_inv_ptr, n_groups, head_dim, groups_per_head); + } else { + k_turbo_wht_f32<1, 128><<>>(src_ptr, dst_ptr, scale_inv_ptr, n_groups, head_dim, groups_per_head); + } + } else { + dim3 threads(64); + if (direction == 0) { + k_turbo_wht_f32<0, 64><<>>(src_ptr, dst_ptr, scale_inv_ptr, n_groups, head_dim, groups_per_head); + } else { + k_turbo_wht_f32<1, 64><<>>(src_ptr, dst_ptr, scale_inv_ptr, n_groups, head_dim, groups_per_head); + } + } + } + + // Pass through tail elements unchanged (no rotation) + // Not needed for 64-aligned dims but kept for completeness + if (tail_size > 0) { + const int64_t total_tail = n_heads * tail_size; + const int block_sz = 256; + const int n_blocks = (int)((total_tail + block_sz - 1) / block_sz); + k_turbo_wht_copy_tail<<>>( + src_ptr, dst_ptr, n_heads, head_dim, groups_per_head * group_size, tail_size); + } +} diff --git a/ggml/src/ggml-cuda/turbo-wht.cuh b/ggml/src/ggml-cuda/turbo-wht.cuh new file mode 100644 index 00000000000..3038a1ab082 --- /dev/null +++ b/ggml/src/ggml-cuda/turbo-wht.cuh @@ -0,0 +1,5 @@ +#pragma once + +#include "common.cuh" + +void ggml_cuda_turbo_wht(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 7432b91b19b..051378ec3bc 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -106,6 +106,9 @@ GGML_API void dequantize_row_turbo3_0(const block_turbo3_0 * GGML_RESTRICT x, fl GGML_API void dequantize_row_turbo4_0(const block_turbo4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API size_t quantize_turbo3_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_turbo4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API void quantize_row_turbo2_0_ref(const float * GGML_RESTRICT x, block_turbo2_0 * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_turbo2_0(const block_turbo2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API size_t quantize_turbo2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API void iq2xs_init_impl(enum ggml_type type); GGML_API void iq2xs_free_impl(enum ggml_type type); diff --git a/ggml/src/ggml-turbo-quant.c b/ggml/src/ggml-turbo-quant.c index 6b0afbf900a..29b0d2a1538 100644 --- a/ggml/src/ggml-turbo-quant.c +++ b/ggml/src/ggml-turbo-quant.c @@ -2,8 +2,8 @@ * TurboQuant: KV cache compression via PolarQuant + QJL * Based on: arXiv 2504.19874 (ICLR 2026) * - * Implements GGML_TYPE_TURBO3_0 (3-bit) and GGML_TYPE_TURBO4_0 (4-bit) - * for use as --cache-type-k turbo3 --cache-type-v turbo3 in llama-server. + * Implements GGML_TYPE_TURBO2_0 (2-bit), GGML_TYPE_TURBO3_0 (3-bit) and + * GGML_TYPE_TURBO4_0 (4-bit) for use as --cache-type-k turboN in llama-server. 
*/ #include "ggml-quants.h" @@ -15,6 +15,9 @@ #include #include +/* Global: WHT group size for CPU quantize path (set by CPU SET_ROWS handler) */ +int turbo3_cpu_wht_group_size = 0; + /* ---------- constants ---------- */ #define TURBO_SEED_ROTATION 42 @@ -170,18 +173,110 @@ static int nearest_centroid_3bit(float val) { return 7; } -/* ---------- TURBO3_0: 2-bit PolarQuant + 1-bit QJL ---------- */ +/* ---------- WHT sign arrays (must match CUDA/Metal, seed=42) ---------- */ + +static const float turbo_cpu_s1[128] = { + -1,1,1,-1,-1,1,-1,1,-1,-1,1,1,1,1,1,1,1,-1,1,-1,1,-1,-1,1,1,1,-1,1,1,-1,-1,-1, + -1,1,1,-1,1,1,-1,1,-1,1,1,-1,-1,1,-1,1,1,1,1,-1,-1,-1,-1,-1,1,-1,1,1,1,1,-1,1, + -1,-1,1,-1,-1,-1,1,-1,-1,-1,1,-1,-1,-1,1,1,1,-1,-1,1,1,1,-1,-1,1,1,-1,1,1,-1,1,-1, + -1,1,1,-1,1,-1,1,-1,1,1,1,1,-1,1,-1,1,1,-1,1,1,-1,-1,-1,-1,-1,1,1,-1,1,1,-1,1 +}; + +static const float turbo_cpu_s2[128] = { + 1,1,1,1,-1,1,1,-1,1,-1,-1,-1,1,-1,-1,-1,1,1,-1,-1,1,-1,1,-1,1,-1,-1,1,-1,1,1,1, + 1,1,-1,-1,-1,1,-1,-1,-1,-1,-1,-1,1,1,1,-1,1,-1,1,1,1,-1,-1,1,-1,-1,-1,-1,-1,-1,1,1, + 1,-1,1,-1,-1,-1,-1,1,-1,1,-1,1,-1,-1,1,1,-1,1,-1,1,1,-1,1,-1,-1,-1,-1,1,-1,-1,1,-1, + 1,-1,1,1,1,-1,-1,1,-1,1,-1,1,1,-1,-1,1,-1,1,-1,1,1,-1,1,-1,1,-1,-1,-1,-1,-1,1,-1 +}; + +/* ---------- CPU forward WHT (in-place, group_size elements) ---------- */ + +static void turbo_cpu_fwht(float * x, int group_size) { + const float * s1 = turbo_cpu_s1; + const float * s2 = turbo_cpu_s2; + const float inv_sqrt = (group_size == 128) ? 0.08838834764831845f : 0.125f; + + // signs1 + for (int i = 0; i < group_size; i++) x[i] *= s1[i]; + + // butterfly stages + for (int h = 1; h < group_size; h *= 2) { + for (int i = 0; i < group_size; i += h * 2) { + for (int j = i; j < i + h; j++) { + float a = x[j], b = x[j + h]; + x[j] = a + b; + x[j + h] = a - b; + } + } + } + + // normalize + signs2 + for (int i = 0; i < group_size; i++) x[i] *= inv_sqrt * s2[i]; +} + +/* ---------- TURBO3_0: 3-bit PolarQuant with WHT rotation ---------- */ void quantize_row_turbo3_0_ref(const float * GGML_RESTRICT x, block_turbo3_0 * GGML_RESTRICT y, int64_t k) { - // Stub — Metal shader handles quantize on GPU. CPU path is simplified. assert(k % QK_TURBO3 == 0); - const int nb = k / QK_TURBO3; - for (int i = 0; i < nb; i++) { - float norm = 0.0f; - for (int j = 0; j < QK_TURBO3; j++) norm += x[i*QK_TURBO3 + j] * x[i*QK_TURBO3 + j]; - y[i].norm = GGML_FP32_TO_FP16(sqrtf(norm)); - memset(y[i].qs, 0, QK_TURBO3 / 4); - memset(y[i].signs, 0, QK_TURBO3 / 8); + + // Read WHT group size from global (set by CPU SET_ROWS handler before each call). + // Fallback: 128 if row is 128-aligned, else 64. + extern int turbo3_cpu_wht_group_size; + int group_size = turbo3_cpu_wht_group_size; + if (group_size != 64 && group_size != 128) { + group_size = (k % 128 == 0) ? 128 : 64; + } + if (k % group_size != 0) group_size = (group_size == 128) ? 64 : 128; + assert(k % group_size == 0); + + const int n_groups = k / group_size; + const int blocks_per_group = group_size / QK_TURBO3; + + for (int g = 0; g < n_groups; g++) { + const float * grp_src = x + g * group_size; + block_turbo3_0 * grp_dst = y + g * blocks_per_group; + + // 1. L2 norm over the group + float norm_sq = 0.0f; + float buf[128]; // max group_size + for (int j = 0; j < group_size; j++) { + buf[j] = grp_src[j]; + norm_sq += buf[j] * buf[j]; + } + float grp_norm = sqrtf(norm_sq); + float inv_norm = (grp_norm > 1e-10f) ? 1.0f / grp_norm : 0.0f; + + // 2. Normalize + for (int j = 0; j < group_size; j++) buf[j] *= inv_norm; + + // 3. 
Forward WHT rotation + turbo_cpu_fwht(buf, group_size); + + // 4. Quantize + pack into sub-blocks + float recon_sq = 0.0f; + for (int b = 0; b < blocks_per_group; b++) { + block_turbo3_0 * blk = &grp_dst[b]; + const int off = b * QK_TURBO3; + + memset(blk->qs, 0, QK_TURBO3 / 4); + memset(blk->signs, 0, QK_TURBO3 / 8); + + for (int j = 0; j < QK_TURBO3; j++) { + int idx = nearest_centroid_3bit(buf[off + j]); + blk->qs[j / 4] |= (idx & 0x3) << ((j % 4) * 2); + if (idx & 0x4) { + blk->signs[j / 8] |= (1 << (j % 8)); + } + recon_sq += CENTROIDS_3BIT[idx] * CENTROIDS_3BIT[idx]; + } + } + + // 5. Corrected norm: grp_norm / recon_norm (matching CUDA kernel) + float recon_norm = sqrtf(recon_sq); + float corrected = (recon_norm > 1e-10f) ? grp_norm / recon_norm : grp_norm; + for (int b = 0; b < blocks_per_group; b++) { + grp_dst[b].norm = GGML_FP32_TO_FP16(corrected); + } } } @@ -216,6 +311,94 @@ size_t quantize_turbo3_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT d return nrows * row_size; } +/* ---------- TURBO2_0: 2-bit PolarQuant (no QJL) ---------- */ + +void quantize_row_turbo2_0_ref(const float * GGML_RESTRICT x, block_turbo2_0 * GGML_RESTRICT y, int64_t k) { + assert(k % QK_TURBO2 == 0); + + extern int turbo3_cpu_wht_group_size; + int group_size = turbo3_cpu_wht_group_size; + if (group_size != 64 && group_size != 128) { + group_size = (k % 128 == 0) ? 128 : 64; + } + if (k % group_size != 0) group_size = (group_size == 128) ? 64 : 128; + assert(k % group_size == 0); + + const int n_groups = k / group_size; + const int blocks_per_group = group_size / QK_TURBO2; + + for (int g = 0; g < n_groups; g++) { + const float * grp_src = x + g * group_size; + block_turbo2_0 * grp_dst = y + g * blocks_per_group; + + /* 1. L2 norm over the group */ + float norm_sq = 0.0f; + float buf[128]; + for (int j = 0; j < group_size; j++) { + buf[j] = grp_src[j]; + norm_sq += buf[j] * buf[j]; + } + float grp_norm = sqrtf(norm_sq); + float inv_norm = (grp_norm > 1e-10f) ? 1.0f / grp_norm : 0.0f; + + /* 2. Normalize */ + for (int j = 0; j < group_size; j++) buf[j] *= inv_norm; + + /* 3. Forward WHT rotation */ + turbo_cpu_fwht(buf, group_size); + + /* 4. Quantize + pack into sub-blocks */ + float recon_sq = 0.0f; + for (int b = 0; b < blocks_per_group; b++) { + block_turbo2_0 * blk = &grp_dst[b]; + const int off = b * QK_TURBO2; + + memset(blk->qs, 0, QK_TURBO2 / 4); + + for (int j = 0; j < QK_TURBO2; j++) { + int idx = nearest_centroid_2bit(buf[off + j]); + blk->qs[j / 4] |= (idx & 0x3) << ((j % 4) * 2); + recon_sq += CENTROIDS_2BIT[idx] * CENTROIDS_2BIT[idx]; + } + } + + /* 5. Corrected norm */ + float recon_norm = sqrtf(recon_sq); + float corrected = (recon_norm > 1e-10f) ? 
grp_norm / recon_norm : grp_norm; + for (int b = 0; b < blocks_per_group; b++) { + grp_dst[b].norm = GGML_FP32_TO_FP16(corrected); + } + } +} + +void dequantize_row_turbo2_0(const block_turbo2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + assert(k % QK_TURBO2 == 0); + const int nb = k / QK_TURBO2; + for (int block = 0; block < nb; block++) { + float norm = GGML_FP16_TO_FP32(x[block].norm); + for (int j = 0; j < QK_TURBO2; j++) { + uint8_t idx = (x[block].qs[j/4] >> ((j%4)*2)) & 0x3; + y[block * QK_TURBO2 + j] = CENTROIDS_2BIT[idx] * norm; + } + } +} + +size_t quantize_turbo2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, + int64_t nrows, int64_t n_per_row, const float * imatrix) { + GGML_UNUSED(imatrix); + assert(n_per_row % QK_TURBO2 == 0); + + size_t row_size = (n_per_row / QK_TURBO2) * sizeof(block_turbo2_0); + for (int64_t row = 0; row < nrows; row++) { + quantize_row_turbo2_0_ref( + src + row * n_per_row, + (block_turbo2_0 *)((char *)dst + row * row_size), + n_per_row + ); + } + return nrows * row_size; +} + /* ---------- TURBO4_0: 3-bit PolarQuant + 1-bit QJL ---------- */ void quantize_row_turbo4_0_ref(const float * GGML_RESTRICT x, block_turbo4_0 * GGML_RESTRICT y, int64_t k) { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index af42112aaec..43513d9d650 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -742,6 +742,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_turbo4_0, .from_float_ref = (ggml_from_float_t) quantize_row_turbo4_0_ref, }, + [GGML_TYPE_TURBO2_0] = { + .type_name = "turbo2", + .blck_size = QK_TURBO2, + .type_size = sizeof(block_turbo2_0), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_turbo2_0, + .from_float_ref = (ggml_from_float_t) quantize_row_turbo2_0_ref, + }, [GGML_TYPE_Q2_K] = { .type_name = "q2_K", .blck_size = QK_K, @@ -6217,19 +6225,29 @@ struct ggml_tensor * ggml_gated_delta_net( struct ggml_tensor * ggml_turbo_wht( struct ggml_context * ctx, struct ggml_tensor * a, - int direction) { + int direction, + int group_size, + struct ggml_tensor * scale) { GGML_ASSERT(ggml_is_contiguous(a)); GGML_ASSERT(a->type == GGML_TYPE_F32); - GGML_ASSERT(a->ne[0] % 128 == 0); // ne[0] must be divisible by rotation group size GGML_ASSERT(direction == 0 || direction == 1); + // Auto-detect group size from tensor dimension if not specified + if (group_size == 0) { + group_size = (a->ne[0] % 128 == 0) ? 
128 : 64; + } + GGML_ASSERT(group_size == 64 || group_size == 128); + GGML_ASSERT(a->ne[0] % group_size == 0); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, a->ne); result->op = GGML_OP_TURBO_WHT; result->src[0] = a; + result->src[1] = scale; // InnerQ scale_inv (NULL = no scaling) - // Store direction in op_params: 0 = forward, 1 = inverse - memcpy(result->op_params, &direction, sizeof(int)); + // Store direction and group_size in op_params + memcpy(result->op_params + 0, &direction, sizeof(int)); + memcpy(result->op_params + sizeof(int), &group_size, sizeof(int)); return result; } @@ -7717,6 +7735,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_TURBO3_0: result = quantize_turbo3_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_TURBO4_0: result = quantize_turbo4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_TURBO2_0: result = quantize_turbo2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index a66e03a7dcd..ff2b881cf26 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1800,11 +1800,9 @@ ggml_tensor * llm_graph_context::build_attn_mha( k = ggml_permute(ctx0, k, 0, 2, 1, 3); v = ggml_permute(ctx0, v, 0, 2, 1, 3); - // TODO: TurboQuant pre-rotate-queries optimization (WIP — PPL 23.5 vs 6.19 target) - // The graph-side rotation approach works mechanically (ggml_mul_mat rotates correctly) - // but gives 4x worse PPL than dequant-side rotation for unknown reasons. - // Keeping dequant inverse rotation for now until this is resolved. - // See: docs/turbo-speed-investigation.md for full debugging history + // TurboQuant note: graph-side Q rotation (pre-rotate-queries) is implemented below + // in the flash-attn path. The VEC kernel bug (wrong Q/K stride in + // vec_dot_fattn_vec_KQ_turbo3_0) was fixed in fattn-common.cuh to match f16 pattern. ggml_tensor * cur; @@ -1832,6 +1830,20 @@ ggml_tensor * llm_graph_context::build_attn_mha( ggml_flash_attn_ext_add_sinks(cur, sinks); ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32); + // TurboQuant: inverse WHT on FA output when V values are WHT-rotated. + // For MLA, V is a view of K with different ne[0] (e.g. V=512, K=576). + // Group size must come from K (which determines the WHT rotation), not V. + if (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0 || v->type == GGML_TYPE_TURBO2_0) { + const bool k_is_turbo = (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0); + const ggml_tensor * group_src = k_is_turbo ? k : v; + const int turbo_group = (group_src->ne[0] % 128 == 0) ? 128 : 64; + if (cur->ne[0] % turbo_group == 0) { + if (!ggml_is_contiguous(cur)) { cur = ggml_cont(ctx0, cur); } + ggml_tensor * innerq_scale = mctx ? mctx->get_turbo_innerq_scale_inv() : nullptr; + cur = ggml_turbo_wht(ctx0, cur, 1, turbo_group, innerq_scale); // 1 = inverse + } + } + if (v_mla) { #if 0 // v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens. 
@@ -1898,6 +1910,18 @@ ggml_tensor * llm_graph_context::build_attn_mha( ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); cb(kqv, "kqv", il); + // TurboQuant: inverse WHT on attention output (non-FA path) + if (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0 || v->type == GGML_TYPE_TURBO2_0) { + const bool k_is_turbo = (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0); + const ggml_tensor * group_src = k_is_turbo ? k : v; + const int turbo_group = (group_src->ne[0] % 128 == 0) ? 128 : 64; + if (kqv->ne[0] % turbo_group == 0) { + if (!ggml_is_contiguous(kqv)) { kqv = ggml_cont(ctx0, kqv); } + ggml_tensor * innerq_scale = mctx ? mctx->get_turbo_innerq_scale_inv() : nullptr; + kqv = ggml_turbo_wht(ctx0, kqv, 1, turbo_group, innerq_scale); + } + } + // for MLA with the absorption optimization, we need to "decompress" from MQA back to MHA if (v_mla) { kqv = ggml_mul_mat(ctx0, v_mla, kqv); @@ -1915,8 +1939,7 @@ ggml_tensor * llm_graph_context::build_attn_mha( } } - // TODO: TurboQuant V inverse rotation (WIP — part of pre-rotate-queries optimization) - // See comment above for status + // TurboQuant: graph-side inverse WHT on attention output (undoes V rotation) ggml_build_forward_expand(gf, cur); @@ -2068,23 +2091,38 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * v = mctx_cur->get_v(ctx0, il); // TurboQuant pre-rotate-queries: O(d log d) WHT rotation via custom op - // Q shape: (n_embd_head, n_head, n_tokens) — ne[0] divisible by 128 - // No reshape/cont/matmul needed — the custom kernel handles groups internally - if (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0) { - if (q->ne[0] % 128 == 0) { - if (!ggml_is_contiguous(q)) { q = ggml_cont(ctx0, q); } - q = ggml_turbo_wht(ctx0, q, 0); // 0 = forward + // Q shape: (n_embd_head, n_head, n_tokens) + // For zero-padded models (head_dim not 128-aligned), pad Q to match padded K dim first. + if (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0) { + // Pad Q per-head to next multiple of 128 if needed + if (q->ne[0] % 128 != 0) { + const int64_t pad = ((q->ne[0] + 127) / 128) * 128 - q->ne[0]; + q = ggml_pad(ctx0, q, pad, 0, 0, 0); } + if (!ggml_is_contiguous(q)) { q = ggml_cont(ctx0, q); } + ggml_tensor * innerq_scale = mctx_cur->get_turbo_innerq_scale_inv(); + q = ggml_turbo_wht(ctx0, q, 0, 0, innerq_scale); // 0 = forward, 0 = auto group size from q->ne[0] } ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il); cb(cur, "kqv_out", il); - // TurboQuant V un-rotation: O(d log d) inverse WHT on attention output - if (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0) { - if (cur->ne[0] % 128 == 0) { - if (!ggml_is_contiguous(cur)) { cur = ggml_cont(ctx0, cur); } - cur = ggml_turbo_wht(ctx0, cur, 1); // 1 = inverse + // TurboQuant: if V was padded, the output has padded dimensions. + // Extract original V head_dim after inverse WHT (applied inside build_attn_mha). 
+ if (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0) { + const int64_t orig_v_head = hparams.n_embd_head_v(il); + // cur is 2D: (n_embd_head * n_head, n_tokens) after build_attn_mha + const int64_t padded_v_head = v->ne[0]; + if (padded_v_head != orig_v_head) { + // Reshape to 4D, extract original head_dim, reshape back to 2D + const int64_t n_head_v = hparams.n_head_kv(il); + const int64_t n_tokens_cur = cur->ne[1]; + cur = ggml_reshape_3d(ctx0, cur, padded_v_head, n_head_v, n_tokens_cur); + // ggml_view_3d to extract first orig_v_head elements per head + cur = ggml_view_3d(ctx0, cur, orig_v_head, n_head_v, n_tokens_cur, + cur->nb[1], cur->nb[2], 0); + cur = ggml_cont(ctx0, cur); + cur = ggml_reshape_2d(ctx0, cur, orig_v_head * n_head_v, n_tokens_cur); } } @@ -2168,9 +2206,39 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * k = mctx_cur->get_k(ctx0, il); ggml_tensor * v = ggml_view_4d(ctx0, k, v_cur->ne[0], k->ne[1], k->ne[2], k->ne[3], k->nb[1], k->nb[2], k->nb[3], 0); + // TurboQuant: pre-rotate Q for K-only (MLA) attention + // For zero-padded models, pad Q to match padded K dim first. + if (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0) { + // Pad Q per-head to next multiple of 128 if needed + if (q->ne[0] % 128 != 0) { + const int64_t pad = ((q->ne[0] + 127) / 128) * 128 - q->ne[0]; + q = ggml_pad(ctx0, q, pad, 0, 0, 0); + } + if (!ggml_is_contiguous(q)) { q = ggml_cont(ctx0, q); } + ggml_tensor * innerq_scale = mctx_cur->get_turbo_innerq_scale_inv(); + q = ggml_turbo_wht(ctx0, q, 0, 0, innerq_scale); // 0 = forward, 0 = auto group size + } + ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il); cb(cur, "kqv_out", il); + // TurboQuant: if V was padded (MLA: V is view of K, may have padded dim), + // extract original V head_dim after inverse WHT. 
+ if (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0) { + const int64_t orig_v_head = v_cur->ne[0]; // original V head_dim from model + const int64_t padded_v_head = v->ne[0]; // padded V head_dim in cache + if (padded_v_head != orig_v_head) { + // cur is 2D: (padded_v_head * n_head, n_tokens) after build_attn_mha + const int64_t n_head_v = hparams.n_head_kv(il); + const int64_t n_tokens_cur = cur->ne[1]; + cur = ggml_reshape_3d(ctx0, cur, padded_v_head, n_head_v, n_tokens_cur); + cur = ggml_view_3d(ctx0, cur, orig_v_head, n_head_v, n_tokens_cur, + cur->nb[1], cur->nb[2], 0); + cur = ggml_cont(ctx0, cur); + cur = ggml_reshape_2d(ctx0, cur, orig_v_head * n_head_v, n_tokens_cur); + } + } + if (wo) { cur = build_lora_mm(wo, cur); if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) { @@ -2235,9 +2303,35 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * k = mctx_cur->get_k(ctx0, il); ggml_tensor * v = mctx_cur->get_v(ctx0, il); + // TurboQuant: pre-rotate Q for ISWA attention (same logic as non-ISWA) + if (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0) { + if (q->ne[0] % 128 != 0) { + const int64_t pad = ((q->ne[0] + 127) / 128) * 128 - q->ne[0]; + q = ggml_pad(ctx0, q, pad, 0, 0, 0); + } + if (!ggml_is_contiguous(q)) { q = ggml_cont(ctx0, q); } + ggml_tensor * innerq_scale = mctx_cur->get_turbo_innerq_scale_inv(); + q = ggml_turbo_wht(ctx0, q, 0, 0, innerq_scale); + } + ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il); cb(cur, "kqv_out", il); + // TurboQuant: if V was padded, extract original V head_dim after inverse WHT + if (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0) { + const int64_t orig_v_head = hparams.n_embd_head_v(il); + const int64_t padded_v_head = v->ne[0]; + if (padded_v_head != orig_v_head) { + const int64_t n_head_v = hparams.n_head_kv(il); + const int64_t n_tokens_cur = cur->ne[1]; + cur = ggml_reshape_3d(ctx0, cur, padded_v_head, n_head_v, n_tokens_cur); + cur = ggml_view_3d(ctx0, cur, orig_v_head, n_head_v, n_tokens_cur, + cur->nb[1], cur->nb[2], 0); + cur = ggml_cont(ctx0, cur); + cur = ggml_reshape_2d(ctx0, cur, orig_v_head * n_head_v, n_tokens_cur); + } + } + if (wo) { cur = build_lora_mm(wo, cur); } diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 2c61d96d693..6b7302e8fb8 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -13,6 +13,25 @@ #include #include +// InnerQ: cross-TU shared state for CUDA per-channel equalization. +// These are defined in ggml-cuda/turbo-innerq.cu (when CUDA is enabled). +// When CUDA is not available, we provide stub implementations. 
+#ifndef INNERQ_MAX_CHANNELS +#define INNERQ_MAX_CHANNELS 128 +#endif + +#ifdef GGML_USE_CUDA +extern bool g_innerq_finalized; +extern float g_innerq_scale_inv_host[INNERQ_MAX_CHANNELS]; +extern bool turbo_innerq_needs_tensor_update(void); +extern void turbo_innerq_mark_tensor_updated(void); +#else +static bool g_innerq_finalized = false; +static float g_innerq_scale_inv_host[INNERQ_MAX_CHANNELS] = {}; +static bool turbo_innerq_needs_tensor_update(void) { return false; } +static void turbo_innerq_mark_tensor_updated(void) {} +#endif + // // llama_kv_cache // @@ -51,8 +70,8 @@ llama_kv_cache::llama_kv_cache( auto it = ctx_map.find(buft); if (it == ctx_map.end()) { ggml_init_params params = { - // +2 for turbo rotation matrices (turbo_rotation + turbo_rotation_inv) - /*.mem_size =*/ size_t((2u*(1 + n_stream)*n_layer_kv + 2)*ggml_tensor_overhead()), + // +3 for turbo rotation matrices (turbo_rotation + turbo_rotation_inv + turbo_innerq_scale_inv) + /*.mem_size =*/ size_t((2u*(1 + n_stream)*n_layer_kv + 3)*ggml_tensor_overhead()), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; @@ -133,6 +152,13 @@ llama_kv_cache::llama_kv_cache( throw std::runtime_error("failed to create ggml context for kv cache"); } + // TurboQuant zero-padding: for models with non-128-aligned head_dim (e.g. DeepSeek + // head_dim_k=192), pad each head to the next multiple of 128. The padded zeros don't + // affect dot products since WHT preserves inner products: + // = = + <0, 0> = + const uint32_t n_embd_head_k = hparams.n_embd_head_k(il); + + const bool has_k = true; const bool has_v = !is_mla; @@ -151,7 +177,7 @@ llama_kv_cache::llama_kv_cache( } return mode; }(); - const bool is_turbo = (type_k == GGML_TYPE_TURBO3_0 || type_k == GGML_TYPE_TURBO4_0); + const bool is_turbo = (type_k == GGML_TYPE_TURBO3_0 || type_k == GGML_TYPE_TURBO4_0 || type_k == GGML_TYPE_TURBO2_0); const uint32_t n_layer = hparams.n_layer; if (adaptive_mode == 1 && is_turbo && n_layer >= 8) { if (il < 4 || il >= n_layer - 4) { @@ -165,8 +191,35 @@ llama_kv_cache::llama_kv_cache( } } } - ggml_tensor * k = has_k ? ggml_new_tensor_3d(ctx, layer_type_k, n_embd_k_gqa, kv_size, n_stream) : nullptr; - ggml_tensor * v = has_v ? 
ggml_new_tensor_3d(ctx, layer_type_v, n_embd_v_gqa, kv_size, n_stream) : nullptr; + // For turbo types, pad K head_dim to next multiple of 128 for full WHT groups + uint32_t n_embd_k_gqa_eff = n_embd_k_gqa; + const bool k_is_turbo = (layer_type_k == GGML_TYPE_TURBO3_0 || layer_type_k == GGML_TYPE_TURBO4_0 || layer_type_k == GGML_TYPE_TURBO2_0); + if (k_is_turbo && n_embd_head_k % 128 != 0) { + const uint32_t padded_head_k = ((n_embd_head_k + 127) / 128) * 128; + const uint32_t n_head_kv = n_embd_k_gqa / n_embd_head_k; + n_embd_k_gqa_eff = n_head_kv * padded_head_k; + if (il == 0) { + LLAMA_LOG_INFO("%s: turbo zero-padding K head_dim %u -> %u (cache %u -> %u)\n", + __func__, n_embd_head_k, padded_head_k, n_embd_k_gqa, n_embd_k_gqa_eff); + } + } + + // For turbo types, pad V head_dim to next multiple of 128 if needed + const uint32_t n_embd_head_v = hparams.n_embd_head_v(il); + uint32_t n_embd_v_gqa_eff = n_embd_v_gqa; + const bool v_is_turbo = (layer_type_v == GGML_TYPE_TURBO3_0 || layer_type_v == GGML_TYPE_TURBO4_0 || layer_type_v == GGML_TYPE_TURBO2_0); + if (v_is_turbo && !is_mla && n_embd_head_v % 128 != 0) { + const uint32_t padded_head_v = ((n_embd_head_v + 127) / 128) * 128; + const uint32_t n_head_kv = n_embd_v_gqa / n_embd_head_v; + n_embd_v_gqa_eff = n_head_kv * padded_head_v; + if (il == 0) { + LLAMA_LOG_INFO("%s: turbo zero-padding V head_dim %u -> %u (cache %u -> %u)\n", + __func__, n_embd_head_v, padded_head_v, n_embd_v_gqa, n_embd_v_gqa_eff); + } + } + + ggml_tensor * k = has_k ? ggml_new_tensor_3d(ctx, layer_type_k, n_embd_k_gqa_eff, kv_size, n_stream) : nullptr; + ggml_tensor * v = has_v ? ggml_new_tensor_3d(ctx, layer_type_v, n_embd_v_gqa_eff, kv_size, n_stream) : nullptr; has_k && ggml_format_name(k, "cache_k_l%d", il); has_v && ggml_format_name(v, "cache_v_l%d", il); @@ -175,8 +228,8 @@ llama_kv_cache::llama_kv_cache( std::vector v_stream; for (uint32_t s = 0; s < n_stream; ++s) { - k_stream.push_back(has_k ? ggml_view_2d(ctx, k, n_embd_k_gqa, kv_size, k->nb[1], s*k->nb[2]) : nullptr); - v_stream.push_back(has_v ? ggml_view_2d(ctx, v, n_embd_v_gqa, kv_size, v->nb[1], s*v->nb[2]) : nullptr); + k_stream.push_back(has_k ? ggml_view_2d(ctx, k, n_embd_k_gqa_eff, kv_size, k->nb[1], s*k->nb[2]) : nullptr); + v_stream.push_back(has_v ? 
ggml_view_2d(ctx, v, n_embd_v_gqa_eff, kv_size, v->nb[1], s*v->nb[2]) : nullptr); } map_layer_ids[il] = layers.size(); @@ -185,11 +238,15 @@ llama_kv_cache::llama_kv_cache( // TurboQuant: create rotation matrix tensors (once, shared across layers) if (turbo_rotation == nullptr && - (type_k == GGML_TYPE_TURBO3_0 || type_k == GGML_TYPE_TURBO4_0)) { + (type_k == GGML_TYPE_TURBO3_0 || type_k == GGML_TYPE_TURBO4_0 || type_k == GGML_TYPE_TURBO2_0)) { turbo_rotation = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128, 128); ggml_format_name(turbo_rotation, "turbo_rotation"); // R^T turbo_rotation_inv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128, 128); ggml_format_name(turbo_rotation_inv, "turbo_rotation_inv"); // R + + // InnerQ: per-channel scale_inv tensor (128 floats, initialized to all 1.0) + turbo_innerq_scale_inv = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, INNERQ_MAX_CHANNELS); + ggml_format_name(turbo_innerq_scale_inv, "turbo_innerq_scale_inv"); } } @@ -247,6 +304,14 @@ llama_kv_cache::llama_kv_cache( // ggml_mul_mat(A,x) computes A@x for row-major stored A (verified by test) ggml_backend_tensor_set(turbo_rotation, TURBO_ROTATION_R, 0, 128 * 128 * sizeof(float)); ggml_backend_tensor_set(turbo_rotation_inv, TURBO_ROTATION_RT, 0, 128 * 128 * sizeof(float)); + + // Initialize InnerQ scale_inv to all 1.0 (identity scaling) + if (turbo_innerq_scale_inv != nullptr && turbo_innerq_scale_inv->buffer != nullptr) { + float ones[INNERQ_MAX_CHANNELS]; + for (int i = 0; i < INNERQ_MAX_CHANNELS; i++) ones[i] = 1.0f; + ggml_backend_tensor_set(turbo_innerq_scale_inv, ones, 0, INNERQ_MAX_CHANNELS * sizeof(float)); + } + LLAMA_LOG_INFO("%s: TurboQuant rotation matrices initialized (128x128)\n", __func__); } ctxs_bufs.emplace_back(std::move(ctx), buf); @@ -282,6 +347,13 @@ void llama_kv_cache::clear(bool data) { #include "turbo-rotation-data.h" ggml_backend_tensor_set(turbo_rotation, TURBO_ROTATION_R, 0, 128 * 128 * sizeof(float)); ggml_backend_tensor_set(turbo_rotation_inv, TURBO_ROTATION_RT, 0, 128 * 128 * sizeof(float)); + + // Re-initialize InnerQ scale_inv to all 1.0 + if (turbo_innerq_scale_inv != nullptr && turbo_innerq_scale_inv->buffer != nullptr) { + float ones[INNERQ_MAX_CHANNELS]; + for (int i = 0; i < INNERQ_MAX_CHANNELS; i++) ones[i] = 1.0f; + ggml_backend_tensor_set(turbo_innerq_scale_inv, ones, 0, INNERQ_MAX_CHANNELS * sizeof(float)); + } } } } @@ -1088,13 +1160,24 @@ ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_k const uint64_t kv_size = get_size(); const uint64_t n_embd_k_gqa = k->ne[0]; - assert(n_embd_k_gqa == hparams.n_embd_k_gqa(il)); + // For turbo-padded caches, n_embd_k_gqa may be larger than hparams value + const bool k_is_turbo = (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0); + if (k_is_turbo) { + assert(n_embd_k_gqa >= hparams.n_embd_k_gqa(il)); + } else { + assert(n_embd_k_gqa == hparams.n_embd_k_gqa(il)); + } + + // Use padded head_dim for turbo types so the full padded data is returned + const uint32_t head_k = hparams.n_embd_head_k(il); + const uint32_t head_k_eff = (k_is_turbo && head_k % 128 != 0) + ? 
((head_k + 127) / 128) * 128 : head_k; const uint32_t ns = sinfo.s1 - sinfo.s0 + 1; return ggml_view_4d(ctx, k, - hparams.n_embd_head_k(il), hparams.n_head_kv(il), n_kv, ns, - ggml_row_size(k->type, hparams.n_embd_head_k(il)), + head_k_eff, hparams.n_head_kv(il), n_kv, ns, + ggml_row_size(k->type, head_k_eff), ggml_row_size(k->type, n_embd_k_gqa), ggml_row_size(k->type, n_embd_k_gqa*kv_size), ggml_row_size(k->type, n_embd_k_gqa*kv_size)*sinfo.s0); @@ -1108,27 +1191,33 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k const uint64_t kv_size = get_size(); const uint64_t n_embd_v_gqa = v->ne[0]; - // [TAG_V_CACHE_VARIABLE] + // [TAG_V_CACHE_VARIABLE] — for turbo-padded V, cache may be larger assert(n_embd_v_gqa >= hparams.n_embd_v_gqa(il)); + // Use padded head_dim for turbo types + const bool v_is_turbo = (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0 || v->type == GGML_TYPE_TURBO2_0); + const uint32_t head_v = hparams.n_embd_head_v(il); + const uint32_t head_v_eff = (v_is_turbo && head_v % 128 != 0) + ? ((head_v + 127) / 128) * 128 : head_v; + const uint32_t ns = sinfo.s1 - sinfo.s0 + 1; if (!v_trans) { // note: v->nb[1] <= v->nb[2] return ggml_view_4d(ctx, v, - hparams.n_embd_head_v(il), hparams.n_head_kv(il), n_kv, ns, - ggml_row_size(v->type, hparams.n_embd_head_v(il)), // v->nb[1] - ggml_row_size(v->type, n_embd_v_gqa), // v->nb[2] - ggml_row_size(v->type, n_embd_v_gqa*kv_size), // v->nb[3] + head_v_eff, hparams.n_head_kv(il), n_kv, ns, + ggml_row_size(v->type, head_v_eff), // v->nb[1] + ggml_row_size(v->type, n_embd_v_gqa), // v->nb[2] + ggml_row_size(v->type, n_embd_v_gqa*kv_size), // v->nb[3] ggml_row_size(v->type, n_embd_v_gqa*kv_size)*sinfo.s0); } // note: v->nb[1] > v->nb[2] return ggml_view_4d(ctx, v, - n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v(il), ns, - ggml_row_size(v->type, kv_size*hparams.n_embd_head_v(il)), // v->nb[1] - ggml_row_size(v->type, kv_size), // v->nb[2] - ggml_row_size(v->type, kv_size*n_embd_v_gqa), // v->nb[3] + n_kv, hparams.n_head_kv(il), head_v_eff, ns, + ggml_row_size(v->type, kv_size*head_v_eff), // v->nb[1] + ggml_row_size(v->type, kv_size), // v->nb[2] + ggml_row_size(v->type, kv_size*n_embd_v_gqa), // v->nb[3] ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0); } @@ -1139,11 +1228,22 @@ ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggm ggml_tensor * k = layers[ikv].k; - const int64_t n_embd_head = k_cur->ne[0]; + int64_t n_embd_head = k_cur->ne[0]; const int64_t n_head = k_cur->ne[1]; const int64_t n_tokens = k_cur->ne[2]; - const int64_t n_embd_gqa = n_embd_head*n_head; + // Turbo zero-padding: pad each head to next multiple of 128 before merging dims. + // k_cur shape here is (n_embd_head, n_head, n_tokens). + // ggml_pad pads ne[0] with zeros — exactly what we need per-head. + const bool k_is_turbo = (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0); + const bool k_needs_pad = k_is_turbo && (n_embd_head % 128 != 0); + if (k_needs_pad) { + const int64_t pad_amount = ((n_embd_head + 127) / 128) * 128 - n_embd_head; + k_cur = ggml_pad(ctx, k_cur, pad_amount, 0, 0, 0); + n_embd_head = k_cur->ne[0]; // now 128-aligned + } + + int64_t n_embd_gqa = n_embd_head * n_head; // we can merge dims 0 and 1 // TODO: add ggml helper function for this? 
@@ -1164,7 +1264,16 @@ ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggm } // store the current K values into the cache - return ggml_set_rows(ctx, k, k_cur, k_idxs); + ggml_tensor * result = ggml_set_rows(ctx, k, k_cur, k_idxs); + + // For turbo: store WHT group size in op_params so the CUDA kernel knows. + // With zero-padding, all groups are always full 128-element WHT groups. + if (k_is_turbo) { + int32_t wht_group = 128; // always 128 with padding + memcpy(result->op_params, &wht_group, sizeof(int32_t)); + } + + return result; } ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const { @@ -1174,11 +1283,20 @@ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggm auto * v = layers[ikv].v; - const int64_t n_embd_head = v_cur->ne[0]; + int64_t n_embd_head = v_cur->ne[0]; const int64_t n_head = v_cur->ne[1]; const int64_t n_tokens = v_cur->ne[2]; - const int64_t n_embd_gqa = n_embd_head*n_head; + // Turbo zero-padding: pad V head_dim to next multiple of 128 + const bool v_is_turbo = (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0 || v->type == GGML_TYPE_TURBO2_0); + const bool v_needs_pad = v_is_turbo && (n_embd_head % 128 != 0); + if (v_needs_pad) { + const int64_t pad_amount = ((n_embd_head + 127) / 128) * 128 - n_embd_head; + v_cur = ggml_pad(ctx, v_cur, pad_amount, 0, 0, 0); + n_embd_head = v_cur->ne[0]; // now 128-aligned + } + + int64_t n_embd_gqa = n_embd_head * n_head; // we can merge dims 0 and 1 GGML_ASSERT(ggml_row_size(v_cur->type, n_embd_head) == v_cur->nb[1]); @@ -1199,7 +1317,13 @@ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggm v = ggml_reshape_2d(ctx, v, n_embd_gqa, kv_size*n_stream); } - return ggml_set_rows(ctx, v, v_cur, v_idxs); + ggml_tensor * result = ggml_set_rows(ctx, v, v_cur, v_idxs); + // With zero-padding, all groups are always full 128-element WHT groups + if (v_is_turbo) { + int32_t wht_group = 128; // always 128 with padding + memcpy(result->op_params, &wht_group, sizeof(int32_t)); + } + return result; } if (ggml_row_size(v_cur->type, n_embd_gqa) == v_cur->nb[2]) { @@ -2282,6 +2406,16 @@ bool llama_kv_cache_context::apply() { kv->apply_ubatch(sinfos[i_cur], ubatches[i_cur]); n_kv = kv->get_n_kv(sinfos[i_cur]); + // InnerQ: check if CUDA calibration finalized and tensor needs update + if (kv->get_turbo_innerq_scale_inv() != nullptr && turbo_innerq_needs_tensor_update()) { + ggml_tensor * t = kv->get_turbo_innerq_scale_inv(); + if (t->buffer != nullptr) { + ggml_backend_tensor_set(t, g_innerq_scale_inv_host, 0, INNERQ_MAX_CHANNELS * sizeof(float)); + turbo_innerq_mark_tensor_updated(); + LLAMA_LOG_INFO("%s: InnerQ scale_inv tensor updated\n", __func__); + } + } + return true; } @@ -2323,6 +2457,10 @@ ggml_tensor * llama_kv_cache_context::get_turbo_rot_inverse() const { return kv->get_turbo_rotation_inv(); } +ggml_tensor * llama_kv_cache_context::get_turbo_innerq_scale_inv() const { + return kv->get_turbo_innerq_scale_inv(); +} + ggml_tensor * llama_kv_cache_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const { return kv->cpy_k(ctx, k_cur, k_idxs, il, sinfos[i_cur]); } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 72b9588bdac..3f2009265a3 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -168,6 +168,9 @@ class llama_kv_cache : public llama_memory_i { ggml_tensor * get_turbo_rotation() const { 
return turbo_rotation; } ggml_tensor * get_turbo_rotation_inv() const { return turbo_rotation_inv; } + // TurboQuant InnerQ: per-channel scale_inv for Q/V equalization + ggml_tensor * get_turbo_innerq_scale_inv() const { return turbo_innerq_scale_inv; } + // store k_cur and v_cur in the cache based on the provided head location ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const; ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const; @@ -259,6 +262,9 @@ class llama_kv_cache : public llama_memory_i { ggml_tensor * turbo_rotation = nullptr; // R (forward rotation) ggml_tensor * turbo_rotation_inv = nullptr; // R^T = R^{-1} (inverse rotation) + // TurboQuant InnerQ: per-channel scale_inv for Q/V equalization (128 floats) + ggml_tensor * turbo_innerq_scale_inv = nullptr; + // model layer id -> KV cache layer id std::unordered_map map_layer_ids; @@ -350,6 +356,9 @@ class llama_kv_cache_context : public llama_memory_context_i { ggml_tensor * get_turbo_rot_forward() const override; ggml_tensor * get_turbo_rot_inverse() const override; + // TurboQuant InnerQ: per-channel scale_inv for Q/V equalization + ggml_tensor * get_turbo_innerq_scale_inv() const override; + // store k_cur and v_cur in the cache based on the provided head location // note: the heads in k_cur and v_cur should be layed out contiguously in memory // - k_cur [n_embd_head_k, n_head_k, n_tokens] diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp index 73415f52d24..3499e84b7e2 100644 --- a/src/llama-memory-hybrid.cpp +++ b/src/llama-memory-hybrid.cpp @@ -271,6 +271,10 @@ ggml_tensor * llama_memory_hybrid_context::get_turbo_rot_inverse() const { return ctx_attn ? ctx_attn->get_turbo_rot_inverse() : nullptr; } +ggml_tensor * llama_memory_hybrid_context::get_turbo_innerq_scale_inv() const { + return ctx_attn ? ctx_attn->get_turbo_innerq_scale_inv() : nullptr; +} + const llama_memory_recurrent_context * llama_memory_hybrid_context::get_recr() const { return static_cast(ctx_recr.get()); } diff --git a/src/llama-memory-hybrid.h b/src/llama-memory-hybrid.h index f9469ad25ab..87dd3487692 100644 --- a/src/llama-memory-hybrid.h +++ b/src/llama-memory-hybrid.h @@ -122,6 +122,7 @@ class llama_memory_hybrid_context : public llama_memory_context_i { // TurboQuant: delegate to the KV cache context ggml_tensor * get_turbo_rot_forward() const override; ggml_tensor * get_turbo_rot_inverse() const override; + ggml_tensor * get_turbo_innerq_scale_inv() const override; // // llama_memory_hybrid_context diff --git a/src/llama-memory.h b/src/llama-memory.h index f815222e855..2704096da71 100644 --- a/src/llama-memory.h +++ b/src/llama-memory.h @@ -64,6 +64,10 @@ struct llama_memory_context_i { // Returns null for non-turbo memory types. Override in KV cache contexts. virtual ggml_tensor * get_turbo_rot_forward() const { return nullptr; } virtual ggml_tensor * get_turbo_rot_inverse() const { return nullptr; } + + // TurboQuant InnerQ: get per-channel scale_inv tensor for Q/V equalization + // Returns nullptr when InnerQ is not active. Override in KV cache contexts. 
+ virtual ggml_tensor * get_turbo_innerq_scale_inv() const { return nullptr; } }; using llama_memory_context_ptr = std::unique_ptr; diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index b79a5270b52..0e0cb2c83ba 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -747,8 +747,8 @@ struct server_context_impl { int n_ctx_slot = llama_n_ctx_seq(ctx); if (n_ctx_slot > n_ctx_train) { - SRV_WRN("the slot context (%d) exceeds the training context of the model (%d) - capping\n", n_ctx_slot, n_ctx_train); - n_ctx_slot = n_ctx_train; + SRV_WRN("the slot context (%d) exceeds the training context of the model (%d) - using rope scaling to extend\n", n_ctx_slot, n_ctx_train); + // Do not cap: caller has configured rope scaling (--rope-scale / --rope-scaling yarn) to handle extended context. } slots.clear();
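For reference, below is a minimal standalone sketch (not part of the patch; the helper names are illustrative only) of the 128-alignment rule the turbo paths above rely on: the effective head size used by the get_k/get_v views and the zero-pad amount passed to ggml_pad in cpy_k/cpy_v.

#include <cassert>
#include <cstdint>

// Effective head size: round head_dim up to the next multiple of 128,
// mirroring head_k_eff / head_v_eff in llama_kv_cache::get_k / get_v.
static uint32_t turbo_head_dim_eff(uint32_t head_dim) {
    return ((head_dim + 127) / 128) * 128;
}

// Zeros appended to ne[0] per head, mirroring pad_amount in cpy_k / cpy_v.
static int64_t turbo_pad_amount(int64_t head_dim) {
    return ((head_dim + 127) / 128) * 128 - head_dim;
}

int main() {
    assert(turbo_head_dim_eff(128) == 128); // already aligned: views use the real head_dim
    assert(turbo_head_dim_eff( 80) == 128); // an 80-dim head is stored as 128 values
    assert(turbo_pad_amount( 80)   ==  48); // 48 zeros per head before ggml_set_rows
    assert(turbo_pad_amount(128)   ==   0); // aligned heads need no padding
    return 0;
}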