Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions ggml/src/ggml-quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -1916,7 +1916,7 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
if (x[i] < min) min = x[i];
if (x[i] > max) max = x[i];
}
if (max == min) {
if (max - min < 1e-10f) {
for (int i = 0; i < n; ++i) L[i] = 0;
*the_min = 0;
return 0.f;
Expand Down Expand Up @@ -1971,7 +1971,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
sum_x += w * x[i];
}
if (min > 0) min = 0;
if (max == min) {
if (max - min < 1e-10f) {
for (int i = 0; i < n; ++i) L[i] = 0;
*the_min = -min;
return 0.f;
Expand Down Expand Up @@ -2218,7 +2218,7 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f
if (min > 0) {
min = 0;
}
if (max <= min) {
if (max - min < 1e-10f) {
memset(L, 0, n);
*the_min = -min;
return 0.f;
Expand Down Expand Up @@ -2340,7 +2340,7 @@ static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t *
for (int i = 0; i < n; ++i) {
max = MAX(max, x[i]);
}
if (!max) { // all zero
if (max < 1e-16f) { // all zero
for (int i = 0; i < n; ++i) { L[i] = 0; }
return 0.f;
}
Expand Down Expand Up @@ -2733,6 +2733,10 @@ void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, in
float av_x = sqrtf(sum_x2/32);
for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
if (isnan(scales[j])) {
printf("Oops: NaN scale\n");
GGML_ABORT("Fatal error");
}
float scale = scales[j];
if (scale > max_scale) {
max_scale = scale;
Expand Down Expand Up @@ -2846,10 +2850,18 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
for (int l = 0; l < 32; ++l) sumw += weights[l];
sw[j] = sumw;
scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
if (isnan(scales[j])) {
printf("%s: got NaN scale\n", __func__);
GGML_ABORT("Fatal error");
}
}

float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw);
float m_block = make_qp_quants(QK_K/32, 63, mins, Lm, sw);
if (isnan(d_block) || isnan(m_block)) {
printf("%s: d_block = %g, m_block = %g\n", __func__, (double)d_block, (double)m_block);
GGML_ABORT("Fatal error");
}
for (int j = 0; j < QK_K/32; ++j) {
uint8_t ls = Ls[j];
uint8_t lm = Lm[j];
Expand Down
50 changes: 38 additions & 12 deletions ggml/src/iqk/iqk_quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1053,9 +1053,16 @@ void quantize_row_iq2_k_impl(const float * x, void * vy, int n_per_row, const fl
for (int j = 0; j < kBlockSize; ++j) weight[j] = 0.25f*sigma2 + xb[j]*xb[j];
}
sw[ib] = 0;
float amax = 0;
for (int j = 0; j < kBlockSize; ++j) {
sw[ib] += weight[j];
pairs[j] = {xb[j], j};
float ax = std::abs(xb[j]);
amax = std::max(amax, ax);
}
if (amax < 1e-16f) {
scales[ib] = 0;
continue;
}
std::sort(pairs.begin(), pairs.end());
sumx[0] = sumw[0] = 0;
Expand Down Expand Up @@ -1269,9 +1276,16 @@ void quantize_row_iq2_ks_impl(const float * x, void * vy, int n_per_row, const f
for (int j = 0; j < kBlockSize; ++j) weight[j] = 0.25f*sigma2 + xb[j]*xb[j];
}
sw[ib] = 0;
float amax = 0;
for (int j = 0; j < kBlockSize; ++j) {
sw[ib] += weight[j];
pairs[j] = {xb[j], j};
float ax = std::abs(xb[j]);
amax = std::max(amax, ax);
}
if (amax < 1e-16f) {
scales[ib] = 0;
continue;
}
//float amax = 0, max = 0;
//for (int j = 0; j < kBlockSize; ++j) {
Expand Down Expand Up @@ -1678,7 +1692,7 @@ void quantize_row_iq2_kl_impl(const float * x, void * vy, int n_per_row, const f
amax = ax; max = xb[j];
}
}
if (!amax) {
if (amax < 1e-16f) {
scales[ib] = 0;
continue;
}
Expand Down Expand Up @@ -1929,7 +1943,7 @@ static void quantize_row_iq3_k_impl(const float * x, void * vy, int n_per_row, c
amax = ax; max = xb[j];
}
}
if (amax < 1e-9f) {
if (amax < 1e-16f) {
scales[ib] = 0;
continue;
}
Expand Down Expand Up @@ -2216,7 +2230,7 @@ static void quantize_row_iq3_ks_impl(const int super_block_size, const int block
amax = ax; max = xb[j];
}
}
if (amax < 1e-9f) {
if (amax < 1e-16f) {
scales[ib] = 0;
continue;
}
Expand Down Expand Up @@ -2544,7 +2558,7 @@ static void quantize_row_iq4_k_impl_bs16(const int super_block_size, const int b
amax = ax; max = xb[j];
}
}
if (!amax) {
if (amax < 1e-16f) {
scales[ib] = 0;
continue;
}
Expand Down Expand Up @@ -2862,7 +2876,7 @@ void quantize_row_iq5_k_impl(const float * x, void * vy, int n_per_row, const fl
amax = ax; max = xb[j];
}
}
if (!amax) {
if (amax < 1e-16f) {
scales[ib] = 0;
continue;
}
Expand Down Expand Up @@ -3216,7 +3230,7 @@ void quantize_row_iq6_k_impl(const float * x, void * vy, int n_per_row, const fl
amax = ax; max = xb[j];
}
}
if (!amax) {
if (amax < 1e-16f) {
scales[ib] = 0;
continue;
}
Expand Down Expand Up @@ -3918,7 +3932,7 @@ static void quantize_row_iq4_k_impl_bs128(const int super_block_size, const int
amax = ax; max = xb[j];
}
}
if (!amax) {
if (amax < 1e-16f) {
scales[ib] = 0;
continue;
}
Expand Down Expand Up @@ -4167,7 +4181,7 @@ static void quantize_row_iq5_ks_impl(const int super_block_size, const int block
amax = ax; max = xb[j];
}
}
if (amax < 1e-15f) {
if (amax < 1e-16f) {
scales[ib] = 0;
continue;
}
Expand Down Expand Up @@ -4470,7 +4484,7 @@ static void quantize_row_iq4_kss_impl(int n_per_row, const float * x, char * cy,
amax = ax; max = xb[j];
}
}
if (!amax) {
if (amax < 1e-16f) {
scales[ib] = 0;
continue;
}
Expand Down Expand Up @@ -8733,6 +8747,11 @@ void quantize_row_iq1_kt_impl(const float * x, void * vy, int n_per_row, const f
float ax = std::abs(xb[j]);
amax = std::max(amax, ax);
}
if (amax < 1e-16f) {
scales[ib] = 0.0f;
for (int ig = 0; ig < Q::kNg; ++ig) all_idx[(ibl*Q::kSuperBlockSize + ib*Q::kBlockSize)/Q::kGroupSize + ig] = 0;
continue;
}
float scale_0 = std::max(90.f, 124.f*amax/amax_row);
quantizer.find_best_match( amax/scale_0, xb, weight, best_idx);
auto [dp, score_p] = quantizer.find_best_scale(xb, weight, best_idx);
Expand Down Expand Up @@ -8998,6 +9017,11 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f
float ax = std::abs(xb[j]);
amax = std::max(amax, ax);
}
if (amax < 1e-16f) {
scales[ib] = 0.0f;
for (int ig = 0; ig < Q::kNg; ++ig) all_idx[(ibl*Q::kSuperBlockSize + ib*Q::kBlockSize)/Q::kGroupSize + ig] = 0;
continue;
}
float scale_0 = std::max(90.f, 124.f*amax/amax_row);
quantizer.find_best_match( amax/scale_0, xb, weight, best_idx);
auto [dp, score_p] = quantizer.find_best_scale(xb, weight, best_idx);
Expand Down Expand Up @@ -9289,8 +9313,10 @@ void quantize_row_iq3_kt_impl(const float * x, void * vy, int n_per_row, const f
xaux[j] = ax;
amax = std::max(amax, ax);
}
scales[ib] = 0;
if (!amax) continue;
if (amax < 1e-16f) {
scales[ib] = 0.0f;
continue;
}

//quantizer.find_best_match(amax/96.f, xaux, weight, best_idx+Q::kNg);
//scales[ib] = quantizer.find_best_scale(xaux, weight, best_idx+Q::kNg).first;
Expand Down Expand Up @@ -9577,7 +9603,7 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
float ax = std::abs(xaux[j]);
amax = std::max(amax, ax);
}
if (!amax) {
if (amax < 1e-16f) {
scales[ib] = 0;
continue;
}
Expand Down