4 changes: 2 additions & 2 deletions ggml/include/ggml-rpc.h
@@ -8,10 +8,10 @@ extern "C" {
 
 #define RPC_PROTO_MAJOR_VERSION 3
 #define RPC_PROTO_MINOR_VERSION 6
-#define RPC_PROTO_PATCH_VERSION 1
+#define RPC_PROTO_PATCH_VERSION 2
 
 #ifdef __cplusplus
-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
+static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
 #endif
 
 #define GGML_RPC_MAX_SERVERS 16
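
The static_assert here is a compile-time tripwire: serialized RPC graphs reference ops by enum value, so any change to GGML_OP_COUNT is effectively a wire-format change, and the build fails until the patch version is bumped in the same commit. A minimal sketch of the same guard pattern, with hypothetical names (my_op, MY_PROTO_PATCH_VERSION) rather than ggml identifiers:

// Hypothetical op enum: appending a member changes MY_OP_COUNT automatically.
enum my_op { MY_OP_ADD, MY_OP_MUL, MY_OP_COUNT };

// Bump whenever the set of ops (i.e. the wire format) changes.
#define MY_PROTO_PATCH_VERSION 2

// Breaks the build as soon as my_op grows, forcing a review of the version macro.
static_assert(MY_OP_COUNT == 2, "my_op changed - update MY_PROTO_PATCH_VERSION");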
3 changes: 3 additions & 0 deletions ggml/src/ggml-cpu/ops.cpp
@@ -5594,6 +5594,9 @@ void ggml_compute_forward_clamp(
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_TQ3_1S:
         case GGML_TYPE_TQ4_1S:
+        case GGML_TYPE_TURBO2_0:
+        case GGML_TYPE_TURBO3_0:
+        case GGML_TYPE_TURBO4_0:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
30 changes: 15 additions & 15 deletions ggml/src/ggml-cuda/turbo-quant.cuh
@@ -176,10 +176,10 @@ static void turbo_innerq_init(void) {
     // Zero accumulators and set calibrating flag on device
     float zeros[INNERQ_MAX_CHANNELS] = {0};
     int zero = 0, one = 1;
-    cudaMemcpyToSymbol(d_innerq_sq_accum, zeros, sizeof(zeros));
-    cudaMemcpyToSymbol(d_innerq_count, &zero, sizeof(int));
-    cudaMemcpyToSymbol(d_innerq_active, &zero, sizeof(int));
-    cudaMemcpyToSymbol(d_innerq_calibrating, &one, sizeof(int));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_sq_accum, zeros, sizeof(zeros)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_count, &zero, sizeof(int)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_active, &zero, sizeof(int)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &one, sizeof(int)));
 
     GGML_LOG_INFO("%s: InnerQ calibration started (target=%d tokens, strength=%.2f)\n",
                   __func__, innerq_target_tokens, innerq_strength);
@@ -190,14 +190,14 @@ static void turbo_innerq_finalize(int group_size) {
     // Read accumulators from device
     float sq_accum[INNERQ_MAX_CHANNELS];
     int count = 0;
-    cudaMemcpyFromSymbol(sq_accum, d_innerq_sq_accum, group_size * sizeof(float));
-    cudaMemcpyFromSymbol(&count, d_innerq_count, sizeof(int));
+    CUDA_CHECK(cudaMemcpyFromSymbol(sq_accum, d_innerq_sq_accum, group_size * sizeof(float)));
+    CUDA_CHECK(cudaMemcpyFromSymbol(&count, d_innerq_count, sizeof(int)));
 
     if (count <= 0) {
         GGML_LOG_WARN("%s: InnerQ calibration got 0 tokens, disabling\n", __func__);
         innerq_enabled = 0;
         int zero = 0;
-        cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int));
+        CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
         return;
     }
 
@@ -231,17 +231,17 @@ static void turbo_innerq_finalize(int group_size) {
                       __func__, max_ratio);
         innerq_enabled = 0;
         int zero = 0;
-        cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int));
+        CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
         return;
     }
 
     // Stop calibrating, upload scales, activate
     int zero = 0, one = 1;
-    cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int));
-    cudaMemcpyToSymbol(d_innerq_scale, scale, group_size * sizeof(float));
-    cudaMemcpyToSymbol(d_innerq_scale_inv, scale_inv, group_size * sizeof(float));
-    cudaDeviceSynchronize(); // ensure scales are visible before activating
-    cudaMemcpyToSymbol(d_innerq_active, &one, sizeof(int));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_scale, scale, group_size * sizeof(float)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_scale_inv, scale_inv, group_size * sizeof(float)));
+    CUDA_CHECK(cudaDeviceSynchronize()); // ensure scales are visible before activating
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_active, &one, sizeof(int)));
 
     innerq_enabled = 2; // active
 
@@ -272,15 +272,15 @@ static void turbo_innerq_check_finalize(int group_size, int64_t ne00) {
                       __func__, (long long)ne00, group_size);
             innerq_enabled = 0;
             int zero = 0;
-            cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int));
+            CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
         }
         return;
     }
 
     // Check if calibration is complete
     if (innerq_enabled == 1) {
         int count = 0;
-        cudaMemcpyFromSymbol(&count, d_innerq_count, sizeof(int));
+        CUDA_CHECK(cudaMemcpyFromSymbol(&count, d_innerq_count, sizeof(int)));
         if (count >= innerq_target_tokens) {
             turbo_innerq_finalize(group_size);
         }
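
The change in this file is pure error-handling hygiene: every CUDA runtime call now goes through CUDA_CHECK, so a failed symbol copy aborts loudly instead of leaving the calibration state machine half-initialized. llama.cpp defines CUDA_CHECK in its CUDA backend; a minimal stand-alone equivalent of the pattern (a sketch, not the project's exact macro) looks like this:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Evaluate the runtime call once; on failure, report the error string with
// file/line context and abort, so bad state never propagates silently.
#define MY_CUDA_CHECK(call)                                           \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",              \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            abort();                                                  \
        }                                                             \
    } while (0)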
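
Note also the ordering in turbo_innerq_finalize: the scales are uploaded, cudaDeviceSynchronize() drains the device, and only then is d_innerq_active set, so no kernel can observe the active flag while the scale tables are still in flight. A reduced sketch of that publish-then-flag pattern, reusing MY_CUDA_CHECK from above; d_scale and d_active are illustrative stand-ins, and this compiles as host code under nvcc:

#include <cuda_runtime.h>

__device__ float d_scale[64]; // per-channel scales that kernels read
__device__ int   d_active;    // kernels use d_scale only when this is 1

// Write the payload first, make it globally visible, and only then flip
// the flag that kernels gate on.
static void publish_scales(const float * scale, int n) {
    const int one = 1;
    MY_CUDA_CHECK(cudaMemcpyToSymbol(d_scale, scale, n * sizeof(float)));
    MY_CUDA_CHECK(cudaDeviceSynchronize()); // scales visible before the flag
    MY_CUDA_CHECK(cudaMemcpyToSymbol(d_active, &one, sizeof(int)));
}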
27 changes: 22 additions & 5 deletions tests/test-quantize-fns.cpp
@@ -48,7 +48,10 @@ static float array_rmse(const float * a1, const float * a2, size_t n) {
 
 // Total quantization error on test data
 static float total_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
-    std::vector<uint8_t> tmp_q(2*test_size);
+    // Buffer must be large enough for the row's byte size. For types whose
+    // vec_dot_type is GGML_TYPE_F32 (e.g. turbo quants), from_float writes
+    // test_size*sizeof(float) bytes, which exceeds the legacy 2*test_size sizing.
+    std::vector<uint8_t> tmp_q(std::max<size_t>(2*test_size, test_size * sizeof(float)));
     std::vector<float> tmp_out(test_size);
 
     qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
@@ -58,7 +61,7 @@ static float total_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
 
 // Total quantization error on test data
 static float reference_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
-    std::vector<uint8_t> tmp_q(2*test_size);
+    std::vector<uint8_t> tmp_q(std::max<size_t>(2*test_size, test_size * sizeof(float)));
     std::vector<float> tmp_out(test_size);
     std::vector<float> tmp_out_ref(test_size);
 
@@ -84,8 +87,10 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
 static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float * test_data2) {
     GGML_UNUSED(qfns);
 
-    std::vector<uint8_t> tmp_q1(2*test_size);
-    std::vector<uint8_t> tmp_q2(2*test_size);
+    // For turbo quants vec_dot_type is GGML_TYPE_F32, so vdot->from_float writes
+    // test_size*sizeof(float) bytes. Size buffers accordingly.
+    std::vector<uint8_t> tmp_q1(std::max<size_t>(2*test_size, test_size * sizeof(float)));
+    std::vector<uint8_t> tmp_q2(std::max<size_t>(2*test_size, test_size * sizeof(float)));
 
     const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
 
@@ -137,6 +142,16 @@ int main(int argc, char * argv[]) {
             continue;
         }
 
+        // TurboQuant KV-cache types (TURBO2_0/TURBO3_0/TURBO4_0) intentionally keep
+        // their dequantized output in the WHT-rotated domain; the inverse WHT is
+        // applied separately via GGML_OP_TURBO_WHT in the attention graph. They do
+        // not round-trip through float space, so the total/reference/dot-product
+        // error tests in this harness are not applicable.
+        if (type == GGML_TYPE_TURBO2_0 || type == GGML_TYPE_TURBO3_0 || type == GGML_TYPE_TURBO4_0) {
+            printf("Testing %s (skipped: rotated-domain KV quant)\n", ggml_type_name(type));
+            continue;
+        }
 
+        const ggml_type ei = (ggml_type)i;
 
         printf("Testing %s\n", ggml_type_name((ggml_type) i));
@@ -152,6 +167,7 @@ int main(int argc, char * argv[]) {
             type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
             type == GGML_TYPE_IQ3_S ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
             type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS :
+            type == GGML_TYPE_TQ3_1S ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
             type == GGML_TYPE_NVFP4 ? MAX_QUANTIZATION_TOTAL_ERROR_FP4 : MAX_QUANTIZATION_TOTAL_ERROR;
         failed = !(total_error < max_quantization_error);
         num_failed += failed;
@@ -168,7 +184,8 @@ int main(int argc, char * argv[]) {
 
         const float vec_dot_error = dot_product_error(qfns, qfns_cpu, test_size, test_data.data(), test_data2.data());
         const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
-                                        type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
+                                        type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S ||
+                                        type == GGML_TYPE_TQ3_1S
                                             ? MAX_DOT_PRODUCT_ERROR_LOWBIT
                                             : type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0
                                                 ? MAX_DOT_PRODUCT_ERROR_TERNARY
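
The buffer-sizing fix above is worth spelling out. The legacy 2*test_size bound assumes at most about two bytes per quantized element, which holds for the block-quant formats, but a type whose vec_dot_type is GGML_TYPE_F32 stages plain floats, four bytes per element. An alternative would be to derive the bound from ggml_row_size, which ggml already exposes; required_quant_buffer below is a hypothetical helper for illustration, not part of this PR:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include "ggml.h"

// Bytes needed to hold one row of n elements, in either the type's own
// layout or its vec_dot_type staging layout, whichever is larger.
static size_t required_quant_buffer(ggml_type type, ggml_type vec_dot_type, size_t n) {
    return std::max(ggml_row_size(type, (int64_t) n),
                    ggml_row_size(vec_dot_type, (int64_t) n));
}

// Usage in the tests would then read:
//   std::vector<uint8_t> tmp_q(required_quant_buffer(type, qfns_cpu->vec_dot_type, test_size));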
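
The rotated-domain skip also deserves a concrete picture. A Walsh-Hadamard transform is an orthogonal rotation computed with only additions and subtractions, and per the comment in the diff the TurboQuant types quantize in that rotated basis, deferring the inverse rotation to GGML_OP_TURBO_WHT. A quantize/dequantize round trip therefore lands back in the rotated domain, so comparing it against the unrotated input, as this harness does, would measure the rotation rather than the quantization error. A generic in-place WHT sketch follows; it illustrates the math, not the PR's kernel:

#include <cstddef>

// In-place fast Walsh-Hadamard transform; n must be a power of two.
// The Hadamard matrix H satisfies H*H = n*I, so applying wht() twice
// and dividing by n recovers the original vector.
static void wht(float * x, size_t n) {
    for (size_t h = 1; h < n; h *= 2) {
        for (size_t i = 0; i < n; i += 2 * h) {
            for (size_t j = i; j < i + h; ++j) {
                const float a = x[j];
                const float b = x[j + h];
                x[j]     = a + b;
                x[j + h] = a - b;
            }
        }
    }
}

// Inverse: the forward transform followed by uniform 1/n scaling.
static void wht_inverse(float * x, size_t n) {
    wht(x, n);
    for (size_t i = 0; i < n; ++i) {
        x[i] /= (float) n;
    }
}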