From d766c5a62e53776133ef2145fb05ca815b9f33d1 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 13 Apr 2024 13:53:59 +0200 Subject: [PATCH 01/13] imatrix: save the dataset file used in the output file --- examples/imatrix/imatrix.cpp | 74 ++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 73609d3e6ae00..18ea8016ff104 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -23,6 +23,7 @@ struct Stats { }; struct StatParams { + std::string dataset; std::string ofile = "imatrix.dat"; int n_output_frequency = 10; int verbosity = 1; @@ -46,7 +47,7 @@ class IMatrixCollector { std::vector m_src1_data; std::vector m_ids; // the expert ids from ggml_mul_mat_id // - void save_imatrix(const char * file_name) const; + void save_imatrix(const char * file_name, const char * dataset) const; void keep_imatrix(int ncall) const; }; @@ -184,7 +185,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } void IMatrixCollector::save_imatrix() const { - save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str()); + save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str()); } void IMatrixCollector::keep_imatrix(int ncall) const { @@ -192,24 +193,30 @@ void IMatrixCollector::keep_imatrix(int ncall) const { if (file_name.empty()) file_name = "imatrix.dat"; file_name += ".at_"; file_name += std::to_string(ncall); - save_imatrix(file_name.c_str()); + save_imatrix(file_name.c_str(), m_params.dataset.c_str()); } -void IMatrixCollector::save_imatrix(const char * fname) const { +void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const { std::ofstream out(fname, std::ios::binary); int n_entries = m_stats.size(); - out.write((const char*)&n_entries, sizeof(n_entries)); - for (auto& p : m_stats) { + out.write((const char *) &n_entries, sizeof(n_entries)); + for (const auto & p : m_stats) { int len = p.first.size(); - out.write((const char*)&len, sizeof(len)); + out.write((const char *) &len, sizeof(len)); out.write(p.first.c_str(), len); - out.write((const char*)&p.second.ncall, sizeof(p.second.ncall)); + out.write((const char *) &p.second.ncall, sizeof(p.second.ncall)); int nval = p.second.values.size(); - out.write((const char*)&nval, sizeof(nval)); - if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float)); + out.write((const char *) &nval, sizeof(nval)); + if (nval > 0) out.write((const char *) p.second.values.data(), nval * sizeof(float)); } + + // Write the dataset name at the end of the file to later on specify it in quantize + int n_dataset = strlen(dataset); + out.write((const char *) &n_dataset, sizeof(n_dataset)); + out.write(dataset, n_dataset); + if (m_params.verbosity > 0) { - fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname); + fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname); } } @@ -532,6 +539,29 @@ int main(int argc, char ** argv) { } } + gpt_params params; + params.n_batch = 512; + if (!gpt_params_parse(args.size(), args.data(), params)) { + return 1; + } + + params.logits_all = true; + params.n_batch = std::min(params.n_batch, params.n_ctx); + + print_build_info(); + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); + + 
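+    // the RNG below is only consumed by gpt_random_prompt() when --random-prompt is set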
std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + sparams.dataset = params.prompt_file; g_collector.set_parameters(std::move(sparams)); if (!combine_files.empty()) { @@ -570,28 +600,6 @@ int main(int argc, char ** argv) { } } - gpt_params params; - params.n_batch = 512; - if (!gpt_params_parse(args.size(), args.data(), params)) { - return 1; - } - - params.logits_all = true; - params.n_batch = std::min(params.n_batch, params.n_ctx); - - print_build_info(); - - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - - fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.random_prompt) { - params.prompt = gpt_random_prompt(rng); - } - llama_backend_init(); llama_numa_init(params.numa); From 01e779593004fb767b4578c46d30954f111d4368 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 13 Apr 2024 14:49:23 +0200 Subject: [PATCH 02/13] llama: support kv overrides type string string --- llama.cpp | 6 ++++++ llama.h | 6 ++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index b93c1abcd85d6..29caae419c19e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2801,6 +2801,7 @@ namespace GGUFMeta { case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool"; case LLAMA_KV_OVERRIDE_TYPE_INT: return "int"; case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float"; + case LLAMA_KV_OVERRIDE_TYPE_STR: return "str"; } return "unknown"; } @@ -2820,6 +2821,9 @@ namespace GGUFMeta { case LLAMA_KV_OVERRIDE_TYPE_FLOAT: { LLAMA_LOG_INFO("%.6f\n", ovrd->float_value); } break; + case LLAMA_KV_OVERRIDE_TYPE_STR: { + LLAMA_LOG_INFO("%s\n", ovrd->str_value); + } break; default: // Shouldn't be possible to end up here, but just in case... 
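                    // (every llama_model_kv_override_type value has a case above; a new tag must be handled there too)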
throw std::runtime_error( @@ -13698,6 +13702,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s gguf_set_val_i32(ctx_out, o.key, o.int_value); } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) { gguf_set_val_bool(ctx_out, o.key, o.bool_value); + } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) { + gguf_set_val_str(ctx_out, o.key, o.str_value); } else { LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key); } diff --git a/llama.h b/llama.h index b5da686f7b7e5..89428501141e3 100644 --- a/llama.h +++ b/llama.h @@ -195,6 +195,7 @@ extern "C" { LLAMA_KV_OVERRIDE_TYPE_INT, LLAMA_KV_OVERRIDE_TYPE_FLOAT, LLAMA_KV_OVERRIDE_TYPE_BOOL, + LLAMA_KV_OVERRIDE_TYPE_STR, }; struct llama_model_kv_override { @@ -202,8 +203,9 @@ extern "C" { enum llama_model_kv_override_type tag; union { int64_t int_value; - double float_value; - bool bool_value; + double float_value; + bool bool_value; + char * str_value = nullptr; }; }; From a9202fb155de9a5939f62588c16698be920cd88c Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 13 Apr 2024 14:49:48 +0200 Subject: [PATCH 03/13] common: factorize KV Overrides parsing between common and server --- common/common.cpp | 82 ++++++++++++++++++++------------------ common/common.h | 2 + examples/server/server.cpp | 36 +---------------- 3 files changed, 48 insertions(+), 72 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index dda514785171b..ca10ae098ac38 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -157,6 +157,48 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { return result; } +bool parse_kv_override(const char * data, std::vector & overrides) { + const char* sep = strchr(data, '='); + if (sep == nullptr || sep - data >= 128) { + fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data); + return false; + } + llama_model_kv_override kvo; + std::strncpy(kvo.key, data, sep - data); + kvo.key[sep - data] = 0; + sep++; + if (strncmp(sep, "int:", 4) == 0) { + sep += 4; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; + kvo.int_value = std::atol(sep); + } else if (strncmp(sep, "float:", 6) == 0) { + sep += 6; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; + kvo.float_value = std::atof(sep); + } else if (strncmp(sep, "bool:", 5) == 0) { + sep += 5; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; + if (std::strcmp(sep, "true") == 0) { + kvo.bool_value = true; + } else if (std::strcmp(sep, "false") == 0) { + kvo.bool_value = false; + } else { + fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data); + return false; + } + + } else if (strncmp(sep, "str:", 4) == 0) { + sep += 4; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; + kvo.str_value = strdup(sep); + } else { + fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data); + return false; + } + overrides.emplace_back(std::move(kvo)); + return true; +} + bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { llama_sampling_params& sparams = params.sparams; @@ -1153,47 +1195,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } - char* sep = strchr(argv[i], '='); - if (sep == nullptr || sep - argv[i] >= 128) { - fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]); - invalid_param = true; - return true; - } - struct llama_model_kv_override kvo; - std::strncpy(kvo.key, argv[i], sep - argv[i]); - kvo.key[sep - argv[i]] = 0; - sep++; - if 
(strncmp(sep, "int:", 4) == 0) { - sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; - kvo.int_value = std::atol(sep); - } - else if (strncmp(sep, "float:", 6) == 0) { - sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; - kvo.float_value = std::atof(sep); - } - else if (strncmp(sep, "bool:", 5) == 0) { - sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; - if (std::strcmp(sep, "true") == 0) { - kvo.bool_value = true; - } - else if (std::strcmp(sep, "false") == 0) { - kvo.bool_value = false; - } - else { - fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]); - invalid_param = true; - return true; - } - } - else { + if (!parse_kv_override(argv[i], params.kv_overrides)) { fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); invalid_param = true; return true; } - params.kv_overrides.push_back(kvo); return true; } #ifndef LOG_DISABLE_LOGS @@ -1461,7 +1467,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); - printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); + printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); printf(" -ptc N, --print-token-count N\n"); printf(" print token count every N tokens (default: %d)\n", params.n_print); printf("\n"); diff --git a/common/common.h b/common/common.h index 65272b0baaa41..aa3c06a68d556 100644 --- a/common/common.h +++ b/common/common.h @@ -169,6 +169,8 @@ struct gpt_params { std::string image = ""; // path to an image file }; +bool parse_kv_override(const char * data, std::vector & overrides); + bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params); bool gpt_params_parse(int argc, char ** argv, gpt_params & params); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 634e653ada284..64f4da94d9908 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2372,7 +2372,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); - printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); + printf(" types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n"); printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n"); printf(" --chat-template JINJA_TEMPLATE\n"); @@ -2803,43 +2803,11 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, invalid_param = true; break; } - char * sep = strchr(argv[i], '='); - if (sep == nullptr || sep - argv[i] >= 128) { - fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]); - invalid_param = true; - break; - } - - struct llama_model_kv_override kvo; - std::strncpy(kvo.key, argv[i], sep - argv[i]); - kvo.key[sep - argv[i]] = 0; - sep++; - if (strncmp(sep, "int:", 4) == 0) { - sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; - kvo.int_value = std::atol(sep); - } else if (strncmp(sep, "float:", 6) == 0) { - sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; - kvo.float_value = std::atof(sep); - } else if (strncmp(sep, "bool:", 5) == 0) { - sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; - if (std::strcmp(sep, "true") == 0) { - kvo.bool_value = true; - } else if (std::strcmp(sep, "false") == 0) { - kvo.bool_value = false; - } else { - fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]); - invalid_param = true; - break; - } - } else { + if (!parse_kv_override(argv[i], params.kv_overrides)) { fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); invalid_param = true; break; } - params.kv_overrides.push_back(kvo); } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); server_print_usage(argv[0], default_params, default_sparams); From 262c95ab6349e82af50584e6019c4a9d3424cad9 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 13 Apr 2024 14:50:32 +0200 Subject: [PATCH 04/13] quantize: add imatrix n entries and dataset KV metadata quantize: factorize KV Overrides parsing between common #6656 --- Makefile | 2 +- examples/quantize/CMakeLists.txt | 2 +- examples/quantize/quantize.cpp | 74 ++++++++++++++------------------ 3 files changed, 35 insertions(+), 43 deletions(-) diff --git a/Makefile b/Makefile index 7a69ad1b3c14f..61e2e005a0590 100644 --- a/Makefile +++ b/Makefile @@ -760,7 +760,7 @@ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml. 
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS) +quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index 6f374a2bd3b46..6b977fde86ab2 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -1,6 +1,6 @@ set(TARGET quantize) add_executable(${TARGET} quantize.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 64cb6db19d004..7d5e1d5bb332a 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -8,7 +8,6 @@ #include #include #include -#include struct quant_option { std::string name; @@ -53,6 +52,8 @@ static const std::vector QUANT_OPTIONS = { { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, }; +static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix.dataset"; +static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.n_entries"; static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) { std::string ftype_str; @@ -112,7 +113,7 @@ static void usage(const char * executable) { exit(1); } -static void load_imatrix(const std::string & imatrix_file, std::unordered_map> & imatrix_data) { +static void load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map> & imatrix_data) { std::ifstream in(imatrix_file.c_str(), std::ios::binary); if (!in) { printf("%s: failed to open %s\n",__func__, imatrix_file.c_str()); @@ -159,15 +160,27 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map dataset_as_vec(dataset_len+1); + in.read((char *)dataset_as_vec.data(), dataset_len); + dataset_as_vec[dataset_len] = 0; + imatrix_dataset = std::string{dataset_as_vec.data()}; + printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str()); + } printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str()); } static void prepare_imatrix(const std::string & imatrix_file, + std::string & imatrix_dataset, const std::vector & included_weights, const std::vector & excluded_weights, std::unordered_map> & imatrix_data) { if (!imatrix_file.empty()) { - load_imatrix(imatrix_file, imatrix_data); + load_imatrix(imatrix_file, imatrix_dataset, imatrix_data); } if (imatrix_data.empty()) { return; @@ -210,43 +223,6 @@ static ggml_type parse_ggml_type(const char * arg) { return result; } -static bool parse_kv_override(const char * data, std::vector & overrides) { - const char* sep = strchr(data, '='); - if (sep == nullptr || sep - data >= 128) { - fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data); - return false; - } - llama_model_kv_override kvo; - std::strncpy(kvo.key, data, sep - data); - kvo.key[sep - data] = 0; - sep++; - if (strncmp(sep, "int:", 4) == 0) { - 
sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; - kvo.int_value = std::atol(sep); - } else if (strncmp(sep, "float:", 6) == 0) { - sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; - kvo.float_value = std::atof(sep); - } else if (strncmp(sep, "bool:", 5) == 0) { - sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; - if (std::strcmp(sep, "true") == 0) { - kvo.bool_value = true; - } else if (std::strcmp(sep, "false") == 0) { - kvo.bool_value = false; - } else { - fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data); - return false; - } - } else { - fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data); - return false; - } - overrides.emplace_back(std::move(kvo)); - return true; -} - int main(int argc, char ** argv) { if (argc < 3) { usage(argv[0]); @@ -313,10 +289,26 @@ int main(int argc, char ** argv) { usage(argv[0]); } + std::string imatrix_dataset; std::unordered_map> imatrix_data; - prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data); + prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data); if (!imatrix_data.empty()) { params.imatrix = &imatrix_data; + if (!imatrix_dataset.empty()) { + llama_model_kv_override kvo; + std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET); + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; + kvo.str_value = strdup(imatrix_dataset.c_str()); + kv_overrides.emplace_back(std::move(kvo)); + } + + { + llama_model_kv_override kvo; + std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES); + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; + kvo.int_value = imatrix_data.size(); + kv_overrides.emplace_back(std::move(kvo)); + } } if (!kv_overrides.empty()) { kv_overrides.emplace_back(); From cbc43aa411c7ca7064612838cc6711ee1381fbb9 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 13 Apr 2024 15:12:03 +0200 Subject: [PATCH 05/13] llama: remove kv override str_value initialization as it does not compile on some toolchain --- llama.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.h b/llama.h index 89428501141e3..0cc0d3f291428 100644 --- a/llama.h +++ b/llama.h @@ -205,7 +205,7 @@ extern "C" { int64_t int_value; double float_value; bool bool_value; - char * str_value = nullptr; + char * str_value; }; }; From 851de160dd3b4b3f48eac94e914aaa861eaa2ddd Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 13 Apr 2024 18:03:10 +0200 Subject: [PATCH 06/13] quantize: add imatrix m_last_call as `quantize.imatrix.chunks_count` --- examples/imatrix/imatrix.cpp | 3 +++ examples/quantize/quantize.cpp | 28 +++++++++++++++++++++------- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 18ea8016ff104..b2d81311536a8 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -210,6 +210,9 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co if (nval > 0) out.write((const char *) p.second.values.data(), nval * sizeof(float)); } + // Write the number of call the matrix was computed with + out.write((const char *) &m_last_call, sizeof(m_last_call)); + // Write the dataset name at the end of the file to later on specify it in quantize int n_dataset = strlen(dataset); out.write((const char *) &n_dataset, sizeof(n_dataset)); diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 7d5e1d5bb332a..dcc1fec39347c 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -53,7 
+53,8 @@ static const std::vector QUANT_OPTIONS = { }; static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix.dataset"; -static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.n_entries"; +static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count"; +static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count"; static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) { std::string ftype_str; @@ -113,7 +114,7 @@ static void usage(const char * executable) { exit(1); } -static void load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map> & imatrix_data) { +static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map> & imatrix_data) { std::ifstream in(imatrix_file.c_str(), std::ios::binary); if (!in) { printf("%s: failed to open %s\n",__func__, imatrix_file.c_str()); @@ -162,7 +163,9 @@ static void load_imatrix(const std::string & imatrix_file, std::string & imatrix } // latest imatrix version contains the dataset filename at the end of the file + int m_last_call = 0; if (in.peek() != EOF) { + in.read((char *)&m_last_call, sizeof(m_last_call)); int dataset_len; in.read((char *)&dataset_len, sizeof(dataset_len)); std::vector dataset_as_vec(dataset_len+1); @@ -171,19 +174,21 @@ static void load_imatrix(const std::string & imatrix_file, std::string & imatrix imatrix_dataset = std::string{dataset_as_vec.data()}; printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str()); } - printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str()); + printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call); + return m_last_call; } -static void prepare_imatrix(const std::string & imatrix_file, +static int prepare_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, const std::vector & included_weights, const std::vector & excluded_weights, std::unordered_map> & imatrix_data) { + int m_last_call = -1; if (!imatrix_file.empty()) { - load_imatrix(imatrix_file, imatrix_dataset, imatrix_data); + m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data); } if (imatrix_data.empty()) { - return; + return m_last_call; } if (!excluded_weights.empty()) { for (auto& name : excluded_weights) { @@ -209,6 +214,7 @@ static void prepare_imatrix(const std::string & imatrix_file, if (!imatrix_data.empty()) { printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size())); } + return m_last_call; } static ggml_type parse_ggml_type(const char * arg) { @@ -291,7 +297,7 @@ int main(int argc, char ** argv) { std::string imatrix_dataset; std::unordered_map> imatrix_data; - prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data); + int m_last_call = prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data); if (!imatrix_data.empty()) { params.imatrix = &imatrix_data; if (!imatrix_dataset.empty()) { @@ -309,6 +315,14 @@ int main(int argc, char ** argv) { kvo.int_value = imatrix_data.size(); kv_overrides.emplace_back(std::move(kvo)); } + + if (m_last_call > 0) { + llama_model_kv_override kvo; + std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS); + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; 
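+            // m_last_call is the chunk count read back from the imatrix file trailer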
+ kvo.int_value = m_last_call; + kv_overrides.emplace_back(std::move(kvo)); + } } if (!kv_overrides.empty()) { kv_overrides.emplace_back(); From 0d82da6f79b3031ad5c0250d2d3af5df1d7b31a3 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 13 Apr 2024 19:06:50 +0200 Subject: [PATCH 07/13] quantize: add imatrix filename in KV --- examples/quantize/quantize.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index dcc1fec39347c..a0d9a2f46f6f4 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -52,6 +52,7 @@ static const std::vector QUANT_OPTIONS = { { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, }; +static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file"; static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix.dataset"; static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count"; static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count"; @@ -300,6 +301,13 @@ int main(int argc, char ** argv) { int m_last_call = prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data); if (!imatrix_data.empty()) { params.imatrix = &imatrix_data; + { + llama_model_kv_override kvo; + std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE); + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; + kvo.str_value = strdup(imatrix_file.c_str()); + kv_overrides.emplace_back(std::move(kvo)); + } if (!imatrix_dataset.empty()) { llama_model_kv_override kvo; std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET); From 82e4187f95f969b90b9e27c660cb5168c3e33ec7 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 19 Apr 2024 13:16:42 +0200 Subject: [PATCH 08/13] llama: add llama_model_kv_override_free --- common/common.cpp | 1 - llama.cpp | 18 ++++++++++++++++++ llama.h | 3 +++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 29fa5cdbca6b4..9838a538fc62a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -263,7 +263,6 @@ bool parse_kv_override(const char * data, std::vector & fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data); return false; } - } else if (strncmp(sep, "str:", 4) == 0) { sep += 4; kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; diff --git a/llama.cpp b/llama.cpp index 8bf1fdbc6602b..2907283f4e82c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2929,6 +2929,16 @@ namespace GGUFMeta { return false; } + template + static typename std::enable_if::value, char *>::type + try_override(T & target, const struct llama_model_kv_override * ovrd) { + if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) { + target = ovrd->str_value; + return true; + } + return false; + } + template static typename std::enable_if::value, bool>::type try_override(T & target, const struct llama_model_kv_override * ovrd) { @@ -14977,6 +14987,14 @@ void llama_free_model(struct llama_model * model) { delete model; } +void llama_model_kv_override_free(struct llama_model_kv_override * kv_overrides) { + for (const struct llama_model_kv_override *p = kv_overrides; p->key[0] != 0; p++) { + if (p->tag == LLAMA_KV_OVERRIDE_TYPE_STR) { + delete p->str_value; + } + } +} + struct llama_context * llama_new_context_with_model( struct llama_model * model, struct llama_context_params params) { diff --git a/llama.h b/llama.h index 0cc0d3f291428..0f215f336c2e6 100644 --- a/llama.h +++ b/llama.h @@ -209,6 
+209,9 @@ extern "C" { }; }; + // Frees all allocated memory + LLAMA_API void llama_model_kv_override_free(struct llama_model_kv_override * ctx); + struct llama_model_params { int32_t n_gpu_layers; // number of layers to store in VRAM enum llama_split_mode split_mode; // how to split the model across multiple GPUs From aa0e28f8fcc540e4c01b71ff65761551518edc27 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 20 Apr 2024 10:17:03 +0200 Subject: [PATCH 09/13] common: add llama_model_kv_override_free common: free kv override if used after model loading --- common/common.cpp | 12 ++++++++++++ common/common.h | 3 +++ llama.cpp | 8 -------- llama.h | 3 --- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 9838a538fc62a..c30b9e59fecd4 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2232,6 +2232,10 @@ std::tuple llama_init_from_gpt_par return std::make_tuple(nullptr, nullptr); } + if (!params.kv_overrides.empty()) { + llama_model_kv_override_free(params.kv_overrides.data()); + } + auto cparams = llama_context_params_from_gpt_params(params); llama_context * lctx = llama_new_context_with_model(model, cparams); @@ -2952,3 +2956,11 @@ llama_control_vector_data llama_control_vector_load(const std::vectorkey[0] != 0; p++) { + if (p->tag == LLAMA_KV_OVERRIDE_TYPE_STR) { + delete p->str_value; + } + } +} diff --git a/common/common.h b/common/common.h index 562d3a1195f65..916023e2504a9 100644 --- a/common/common.h +++ b/common/common.h @@ -172,6 +172,9 @@ struct gpt_params { bool parse_kv_override(const char * data, std::vector & overrides); +// Frees all allocated memory +void llama_model_kv_override_free(struct llama_model_kv_override * ctx); + bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params); bool gpt_params_parse(int argc, char ** argv, gpt_params & params); diff --git a/llama.cpp b/llama.cpp index 2907283f4e82c..9d9f7b4e12628 100644 --- a/llama.cpp +++ b/llama.cpp @@ -14987,14 +14987,6 @@ void llama_free_model(struct llama_model * model) { delete model; } -void llama_model_kv_override_free(struct llama_model_kv_override * kv_overrides) { - for (const struct llama_model_kv_override *p = kv_overrides; p->key[0] != 0; p++) { - if (p->tag == LLAMA_KV_OVERRIDE_TYPE_STR) { - delete p->str_value; - } - } -} - struct llama_context * llama_new_context_with_model( struct llama_model * model, struct llama_context_params params) { diff --git a/llama.h b/llama.h index 0f215f336c2e6..0cc0d3f291428 100644 --- a/llama.h +++ b/llama.h @@ -209,9 +209,6 @@ extern "C" { }; }; - // Frees all allocated memory - LLAMA_API void llama_model_kv_override_free(struct llama_model_kv_override * ctx); - struct llama_model_params { int32_t n_gpu_layers; // number of layers to store in VRAM enum llama_split_mode split_mode; // how to split the model across multiple GPUs From 4bd26644bf690ac482d9c60bced2228e8ad7948a Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 20 Apr 2024 13:24:58 +0200 Subject: [PATCH 10/13] llama: finally move the string KV override value to the stack --- common/common.cpp | 14 +------------- common/common.h | 3 --- examples/quantize/quantize.cpp | 4 ++-- llama.h | 2 +- 4 files changed, 4 insertions(+), 19 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index c30b9e59fecd4..f07b4d1a4e95b 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -266,7 +266,7 @@ bool parse_kv_override(const char * data, std::vector & } else if (strncmp(sep, "str:", 4) == 0) { sep += 4; kvo.tag = 
LLAMA_KV_OVERRIDE_TYPE_STR; - kvo.str_value = strdup(sep); + strncpy(kvo.str_value, sep, 128); } else { fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data); return false; @@ -2232,10 +2232,6 @@ std::tuple llama_init_from_gpt_par return std::make_tuple(nullptr, nullptr); } - if (!params.kv_overrides.empty()) { - llama_model_kv_override_free(params.kv_overrides.data()); - } - auto cparams = llama_context_params_from_gpt_params(params); llama_context * lctx = llama_new_context_with_model(model, cparams); @@ -2956,11 +2952,3 @@ llama_control_vector_data llama_control_vector_load(const std::vectorkey[0] != 0; p++) { - if (p->tag == LLAMA_KV_OVERRIDE_TYPE_STR) { - delete p->str_value; - } - } -} diff --git a/common/common.h b/common/common.h index 916023e2504a9..562d3a1195f65 100644 --- a/common/common.h +++ b/common/common.h @@ -172,9 +172,6 @@ struct gpt_params { bool parse_kv_override(const char * data, std::vector & overrides); -// Frees all allocated memory -void llama_model_kv_override_free(struct llama_model_kv_override * ctx); - bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params); bool gpt_params_parse(int argc, char ** argv, gpt_params & params); diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index a0d9a2f46f6f4..b6464be3dfd5d 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -305,14 +305,14 @@ int main(int argc, char ** argv) { llama_model_kv_override kvo; std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE); kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; - kvo.str_value = strdup(imatrix_file.c_str()); + strncpy(kvo.str_value, imatrix_file.c_str(), 128); kv_overrides.emplace_back(std::move(kvo)); } if (!imatrix_dataset.empty()) { llama_model_kv_override kvo; std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET); kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; - kvo.str_value = strdup(imatrix_dataset.c_str()); + strncpy(kvo.str_value, imatrix_dataset.c_str(), 128); kv_overrides.emplace_back(std::move(kvo)); } diff --git a/llama.h b/llama.h index 0cc0d3f291428..73d9733f78cfa 100644 --- a/llama.h +++ b/llama.h @@ -201,11 +201,11 @@ extern "C" { struct llama_model_kv_override { char key[128]; enum llama_model_kv_override_type tag; + char str_value[128]; union { int64_t int_value; double float_value; bool bool_value; - char * str_value; }; }; From 5cf8ccb19191b829ce3701bf3a9d236457a2bf1a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 21 Apr 2024 20:06:30 +0300 Subject: [PATCH 11/13] llama : minor --- common/common.cpp | 14 ++++++------- examples/quantize/quantize.cpp | 8 ++++---- llama.cpp | 37 ++++++++++++---------------------- llama.h | 12 ++++++----- 4 files changed, 31 insertions(+), 40 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index f07b4d1a4e95b..277244e5ad0be 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -235,7 +235,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } bool parse_kv_override(const char * data, std::vector & overrides) { - const char* sep = strchr(data, '='); + const char * sep = strchr(data, '='); if (sep == nullptr || sep - data >= 128) { fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data); return false; @@ -247,18 +247,18 @@ bool parse_kv_override(const char * data, std::vector & if (strncmp(sep, "int:", 4) == 0) { sep += 4; kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; - kvo.int_value = std::atol(sep); + kvo.val_i64 = std::atol(sep); } else if (strncmp(sep, "float:", 6) == 0) { sep += 6; kvo.tag = 
LLAMA_KV_OVERRIDE_TYPE_FLOAT; - kvo.float_value = std::atof(sep); + kvo.val_f64 = std::atof(sep); } else if (strncmp(sep, "bool:", 5) == 0) { sep += 5; kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; if (std::strcmp(sep, "true") == 0) { - kvo.bool_value = true; + kvo.val_bool = true; } else if (std::strcmp(sep, "false") == 0) { - kvo.bool_value = false; + kvo.val_bool = false; } else { fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data); return false; @@ -266,7 +266,7 @@ bool parse_kv_override(const char * data, std::vector & } else if (strncmp(sep, "str:", 4) == 0) { sep += 4; kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; - strncpy(kvo.str_value, sep, 128); + strncpy(kvo.val_str, sep, 128); } else { fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data); return false; @@ -276,7 +276,7 @@ bool parse_kv_override(const char * data, std::vector & } bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { - llama_sampling_params& sparams = params.sparams; + llama_sampling_params & sparams = params.sparams; if (arg == "-s" || arg == "--seed") { if (++i >= argc) { diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index b6464be3dfd5d..4419c04712c96 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -305,14 +305,14 @@ int main(int argc, char ** argv) { llama_model_kv_override kvo; std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE); kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; - strncpy(kvo.str_value, imatrix_file.c_str(), 128); + strncpy(kvo.val_str, imatrix_file.c_str(), 128); kv_overrides.emplace_back(std::move(kvo)); } if (!imatrix_dataset.empty()) { llama_model_kv_override kvo; std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET); kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; - strncpy(kvo.str_value, imatrix_dataset.c_str(), 128); + strncpy(kvo.val_str, imatrix_dataset.c_str(), 128); kv_overrides.emplace_back(std::move(kvo)); } @@ -320,7 +320,7 @@ int main(int argc, char ** argv) { llama_model_kv_override kvo; std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES); kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; - kvo.int_value = imatrix_data.size(); + kvo.val_i64 = imatrix_data.size(); kv_overrides.emplace_back(std::move(kvo)); } @@ -328,7 +328,7 @@ int main(int argc, char ** argv) { llama_model_kv_override kvo; std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS); kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; - kvo.int_value = m_last_call; + kvo.val_i64 = m_last_call; kv_overrides.emplace_back(std::move(kvo)); } } diff --git a/llama.cpp b/llama.cpp index 9d9f7b4e12628..14c7f67416f1c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2875,16 +2875,16 @@ namespace GGUFMeta { __func__, override_type_to_str(ovrd->tag), ovrd->key); switch (ovrd->tag) { case LLAMA_KV_OVERRIDE_TYPE_BOOL: { - LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false"); + LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false"); } break; case LLAMA_KV_OVERRIDE_TYPE_INT: { - LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value); + LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64); } break; case LLAMA_KV_OVERRIDE_TYPE_FLOAT: { - LLAMA_LOG_INFO("%.6f\n", ovrd->float_value); + LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64); } break; case LLAMA_KV_OVERRIDE_TYPE_STR: { - LLAMA_LOG_INFO("%s\n", ovrd->str_value); + LLAMA_LOG_INFO("%s\n", ovrd->val_str); } break; default: // Shouldn't be possible to end up here, but just in case... 
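                    // (the runtime_error thrown for unsupported tags follows unchanged)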
@@ -2903,7 +2903,7 @@ namespace GGUFMeta { static typename std::enable_if::value, bool>::type try_override(OT & target, const struct llama_model_kv_override * ovrd) { if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) { - target = ovrd->bool_value; + target = ovrd->val_bool; return true; } return false; @@ -2913,7 +2913,7 @@ namespace GGUFMeta { static typename std::enable_if::value && std::is_integral::value, bool>::type try_override(OT & target, const struct llama_model_kv_override * ovrd) { if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) { - target = ovrd->int_value; + target = ovrd->val_i64; return true; } return false; @@ -2923,33 +2923,22 @@ namespace GGUFMeta { static typename std::enable_if::value, bool>::type try_override(T & target, const struct llama_model_kv_override * ovrd) { if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) { - target = ovrd->float_value; + target = ovrd->val_f64; return true; } return false; } template - static typename std::enable_if::value, char *>::type + static typename std::enable_if::value, bool>::type try_override(T & target, const struct llama_model_kv_override * ovrd) { if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) { - target = ovrd->str_value; + target = ovrd->val_str; return true; } return false; } - template - static typename std::enable_if::value, bool>::type - try_override(T & target, const struct llama_model_kv_override * ovrd) { - (void)target; - (void)ovrd; - if (!ovrd) { return false; } - // Currently, we should never end up here so it would be a bug if we do. - throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n", - ovrd ? ovrd->key : "NULL")); - } - static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) { if (try_override(target, ovrd)) { return true; @@ -14276,13 +14265,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s for (auto & o : overrides) { if (o.key[0] == 0) break; if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) { - gguf_set_val_f32(ctx_out, o.key, o.float_value); + gguf_set_val_f32(ctx_out, o.key, o.val_f64); } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) { - gguf_set_val_i32(ctx_out, o.key, o.int_value); + gguf_set_val_i32(ctx_out, o.key, o.val_i64); } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) { - gguf_set_val_bool(ctx_out, o.key, o.bool_value); + gguf_set_val_bool(ctx_out, o.key, o.val_bool); } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) { - gguf_set_val_str(ctx_out, o.key, o.str_value); + gguf_set_val_str(ctx_out, o.key, o.val_str); } else { LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key); } diff --git a/llama.h b/llama.h index 73d9733f78cfa..afe3c0466447d 100644 --- a/llama.h +++ b/llama.h @@ -199,13 +199,15 @@ extern "C" { }; struct llama_model_kv_override { - char key[128]; enum llama_model_kv_override_type tag; - char str_value[128]; + + char key[128]; + union { - int64_t int_value; - double float_value; - bool bool_value; + int64_t val_i64; + double val_f64; + bool val_bool; + char val_str[128]; }; }; From 8360e0c96049a81ef970e4db03b79106558ecfc7 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sun, 21 Apr 2024 21:00:34 +0200 Subject: [PATCH 12/13] =?UTF-8?q?no=20need=20to=20add=20a=20NUL=20to=20the?= =?UTF-8?q?=C2=A0std::vector,=C2=A0std::string=C2=A0can=20be=20initialized?= =?UTF-8?q?=20from=20a=20pair=20of=20iterators.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Co-authored-by: slaren --- examples/quantize/quantize.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 4419c04712c96..a934f699a857f 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -169,10 +169,9 @@ static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_ in.read((char *)&m_last_call, sizeof(m_last_call)); int dataset_len; in.read((char *)&dataset_len, sizeof(dataset_len)); - std::vector dataset_as_vec(dataset_len+1); - in.read((char *)dataset_as_vec.data(), dataset_len); - dataset_as_vec[dataset_len] = 0; - imatrix_dataset = std::string{dataset_as_vec.data()}; + std::vector dataset_as_vec(dataset_len); + in.read(dataset_as_vec.data(), dataset_len); + imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end()); printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str()); } printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call); From bcbdd28cfd42ac25b5753a1c5bd7e1f991f4b653 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 26 Apr 2024 12:10:39 +0200 Subject: [PATCH 13/13] kv override: ensure string termination --- common/common.cpp | 7 ++++++- examples/quantize/quantize.cpp | 6 ++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index f64943f678f22..007864dc784aa 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -266,7 +266,12 @@ bool parse_kv_override(const char * data, std::vector & } else if (strncmp(sep, "str:", 4) == 0) { sep += 4; kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; - strncpy(kvo.val_str, sep, 128); + if (strlen(sep) > 127) { + fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data); + return false; + } + strncpy(kvo.val_str, sep, 127); + kvo.val_str[127] = '\0'; } else { fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data); return false; diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index e910c1e3a7fdc..432cc2b4feadf 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -307,14 +307,16 @@ int main(int argc, char ** argv) { llama_model_kv_override kvo; std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE); kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; - strncpy(kvo.val_str, imatrix_file.c_str(), 128); + strncpy(kvo.val_str, imatrix_file.c_str(), 127); + kvo.val_str[127] = '\0'; kv_overrides.emplace_back(std::move(kvo)); } if (!imatrix_dataset.empty()) { llama_model_kv_override kvo; std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET); kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; - strncpy(kvo.val_str, imatrix_dataset.c_str(), 128); + strncpy(kvo.val_str, imatrix_dataset.c_str(), 127); + kvo.val_str[127] = '\0'; kv_overrides.emplace_back(std::move(kvo)); }
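
After this series, the imatrix binary layout written by save_imatrix() is: an int32 entry count; per entry an int32 name length, the name bytes, an int32 call count, an int32 value count, and the raw float values; then a trailer holding the int32 chunk count (m_last_call) and an int32-length-prefixed dataset name. The sketch below is a minimal standalone reader mirroring load_imatrix() in examples/quantize/quantize.cpp; it is illustrative only (the struct and helper names are not from the patches) and error handling is trimmed:

    #include <fstream>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct imatrix_entry {
        int                ncall = 0; // number of times the tensor was evaluated
        std::vector<float> values;    // accumulated importance values
    };

    // illustrative reader for the post-series imatrix layout
    static bool read_imatrix(const char * fname,
                             std::unordered_map<std::string, imatrix_entry> & data,
                             std::string & dataset, int & n_chunks) {
        std::ifstream in(fname, std::ios::binary);
        if (!in) {
            return false;
        }
        int n_entries = 0;
        in.read((char *) &n_entries, sizeof(n_entries));
        for (int i = 0; i < n_entries && in; ++i) {
            int len = 0;
            in.read((char *) &len, sizeof(len));
            std::string name(len, '\0');
            in.read(&name[0], len);
            imatrix_entry e;
            in.read((char *) &e.ncall, sizeof(e.ncall));
            int nval = 0;
            in.read((char *) &nval, sizeof(nval));
            e.values.resize(nval);
            if (nval > 0) {
                in.read((char *) e.values.data(), nval * sizeof(float));
            }
            data.emplace(std::move(name), std::move(e));
        }
        // trailer added by this series; absent in files written by older builds
        n_chunks = 0;
        if (in.peek() != EOF) {
            in.read((char *) &n_chunks, sizeof(n_chunks));
            int dataset_len = 0;
            in.read((char *) &dataset_len, sizeof(dataset_len));
            std::vector<char> buf(dataset_len);
            in.read(buf.data(), dataset_len);
            dataset.assign(buf.begin(), buf.end());
        }
        return bool(in);
    }

Once a model is quantized with such a file, the metadata lands under the quantize.imatrix.file, quantize.imatrix.dataset, quantize.imatrix.entries_count and quantize.imatrix.chunks_count keys and can be inspected with any GGUF metadata dumper.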