Mozilla-Ocho · jart · Aug 24, 2024 · Aug 23, 2024 · Aug 24, 2024 · Aug 24, 2024
diff --git a/llama.cpp/ggml-quants.inc b/llama.cpp/ggml-quants.inc
@@ -1950,6 +1950,44 @@ void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, in
 
     const float q4scale = 15.f;
 
+    // [kawrakow] Detect TriNet
+    {
+        int n = k;
+        float max = 0;
+        for (int j = 0; j < n; ++j) {
+            float ax = fabsf(x[j]);
+            max = MAX(max, ax);
+        }
+        float mse0 = 0, mse = 0;
+        for (int j = 0; j < n; ++j) {
+            int l = x[j] < -0.5f*max ? -1 : x[j] < 0.5f*max ? 0 : 1;
+            mse0 += x[j]*x[j];
+            float diff = x[j] - max*l;
+            mse += diff*diff;
+        }
+        if (mse < 0.1f*mse0) {
+            // yes, most likely trinet
+            // => simply set all block scales to 1, set dmin = d = max, set quants to -1, 0, 1
+            for (int ibl = 0; ibl < nb; ++ibl) {
+                y[ibl].d = GGML_FP32_TO_FP16(max);
+                y[ibl].dmin = GGML_FP32_TO_FP16(max);
+                for (int ib = 0; ib < QK_K/16; ++ib) y[ibl].scales[ib] = 1 | (1 << 4);
+                const float * xb = x + QK_K * ibl;
+                for (int j = 0; j < QK_K; ++j) {
+                    L[j] = xb[j] < -0.5f*max ? 0 : xb[j] < 0.5f*max ? 1 : 2;
+                }
+                uint8_t * qs = y[ibl].qs;
+                for (int j = 0; j < QK_K; j += 128) {
+                    for (int l = 0; l < 32; ++l) {
+                        qs[l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
+                    }
+                    qs += 32;
+                }
+            }
+            return;
+        }
+    }
+
     for (int i = 0; i < nb; i++) {
         float max_scale = 0; // as we are deducting the min, scales are always positive
         float max_min = 0;

diff --git a/llama.cpp/llama.cpp b/llama.cpp/llama.cpp
@@ -16234,12 +16234,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     }
                 }
             }
-            if ((new_type == GGML_TYPE_IQ2_XXS ||
-                 new_type == GGML_TYPE_IQ2_XS  ||
+            if (!params->ignore_imatrix_rules && !imatrix && // [kawrakow] - be able to ignore imatrix rules
+                (new_type == GGML_TYPE_IQ2_XS  ||
                  new_type == GGML_TYPE_IQ2_S   ||
                  new_type == GGML_TYPE_IQ1_S   ||
                 (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight"))  ||
-                (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
+                (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0))) {
                 LLAMA_LOG_ERROR("\n\n============================================================\n");
                 LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
                 LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
@@ -16606,6 +16606,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.only_copy                   =*/ false,
         /*.pure                        =*/ false,
         /*.keep_split                  =*/ false,
+        /*.ignore_imatrix_rules        =*/ false, // [kawrakow]
         /*.imatrix                     =*/ nullptr,
         /*.kv_overrides                =*/ nullptr,
     };

diff --git a/llama.cpp/llama.h b/llama.cpp/llama.h
@@ -354,6 +354,7 @@ extern "C" {
         bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         bool pure;                           // quantize all tensors to the default type
         bool keep_split;                     // quantize to the same number of shards
+        bool ignore_imatrix_rules;           // [kawrakow] If set to true, the built-in rules for refusing to quantize into certain quants without imatrix are ignored
         void * imatrix;                      // pointer to importance matrix data
         void * kv_overrides;                 // pointer to vector containing overrides
     } llama_model_quantize_params;

diff --git a/llama.cpp/quantize/quantize.cpp b/llama.cpp/quantize/quantize.cpp
@@ -104,6 +104,7 @@ static void usage(const char * executable) {
     printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
     printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
     printf("  --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
+    printf("  --ignore-imatrix-rules: ignore built-in rules for mandatory imatrix for certain quantization types\n"); // [kawrakow]
     printf("  --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
     printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
     printf("  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
@@ -268,6 +269,8 @@ int main(int argc, char ** argv) {
     for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
         if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
             params.quantize_output_tensor = false;
+        } else if (strcmp(argv[arg_idx], "--ignore-imatrix-rules") == 0) {
+            params.ignore_imatrix_rules = true; // [kawrakow]
         } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
             if (arg_idx < argc-1) {
                 params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
@@ -422,11 +425,12 @@ int main(int argc, char ** argv) {
         }
     }
 
-    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
+    if (!params.ignore_imatrix_rules && imatrix_data.empty() && // [kawrakpow] - be able to ignore imatrix rules
+        (params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
          params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S  ||
          params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
          params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S  ||
-         params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) {
+         params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M)) {
         fprintf(stderr, "\n==========================================================================================================\n");
         fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
         fprintf(stderr, "==========================================================================================================\n\n\n");