#include <cstring>
#include <vector>
#include <string>
+#include <unordered_map>
+#include <fstream>
+#include <cmath>
+#include <algorithm>

struct quant_option {
    std::string name;
@@ -17,6 +21,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
    { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
+    { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
+    { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
@@ -72,22 +78,108 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
//
[[noreturn]]
static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
    printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
    printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
    printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
+    printf("  --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
+    printf("  --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
+    printf("  --exclude-weights tensor_name: do not use importance matrix for this/these tensor(s)\n");
+    printf("  Note: --include-weights and --exclude-weights cannot be used together\n");
    printf("\nAllowed quantization types:\n");
    for (auto & it : QUANT_OPTIONS) {
        if (it.name != "COPY") {
            printf("  %2d  or  ", it.ftype);
        } else {
            printf("          ");
        }
-        printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
+        printf("%-7s : %s\n", it.name.c_str(), it.desc.c_str());
    }
    exit(1);
}
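As a concrete illustration of the new flags, an invocation might look like this (file names are illustrative, and the binary name assumes the default quantize build target):

    ./quantize --imatrix imatrix.dat --exclude-weights output.weight model-f32.gguf model-iq2_xs.gguf IQ2_XS 8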

+static void load_imatrix(const std::string & imatrix_file, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+    std::ifstream in(imatrix_file.c_str(), std::ios::binary);
+    if (!in) {
+        printf("%s: failed to open %s\n", __func__, imatrix_file.c_str());
+        return;
+    }
+    int n_entries;
+    in.read((char *)&n_entries, sizeof(n_entries));
+    if (in.fail() || n_entries < 1) {
+        printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
+        return;
+    }
+    for (int i = 0; i < n_entries; ++i) {
+        int len; in.read((char *)&len, sizeof(len));
+        std::vector<char> name_as_vec(len+1);
+        in.read((char *)name_as_vec.data(), len);
+        if (in.fail()) {
+            printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
+            return;
+        }
+        name_as_vec[len] = 0;
+        std::string name{name_as_vec.data()};
+        auto & e = imatrix_data[std::move(name)];
+        int ncall;
+        in.read((char *)&ncall, sizeof(ncall));
+        int nval;
+        in.read((char *)&nval, sizeof(nval));
+        if (in.fail() || nval < 1) {
+            printf("%s: failed reading number of values for entry %d\n", __func__, i+1);
+            imatrix_data = {};
+            return;
+        }
+        e.resize(nval);
+        in.read((char *)e.data(), nval*sizeof(float));
+        if (in.fail()) {
+            printf("%s: failed reading data for entry %d\n", __func__, i+1);
+            imatrix_data = {};
+            return;
+        }
+        if (ncall > 0) {
+            for (auto & v : e) v /= ncall;
+        }
+    }
+    printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
+}
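For reference, load_imatrix above implies a simple binary layout: an int32 entry count, then per entry a length-prefixed name, an int32 call count, an int32 value count, and the raw float values. Below is a minimal sketch of a matching writer, assuming that layout (write_imatrix is a hypothetical helper, not part of this commit):

    // Hypothetical writer for the layout read by load_imatrix above:
    // [int32 n_entries] then per entry:
    // [int32 len][len name bytes][int32 ncall][int32 nval][nval floats]
    static void write_imatrix(const std::string & fname,
            const std::unordered_map<std::string, std::vector<float>> & data, int ncall) {
        std::ofstream out(fname.c_str(), std::ios::binary);
        int n_entries = int(data.size());
        out.write((const char *)&n_entries, sizeof(n_entries));
        for (const auto & e : data) {
            int len = int(e.first.size());
            out.write((const char *)&len, sizeof(len));
            out.write(e.first.data(), len);
            out.write((const char *)&ncall, sizeof(ncall)); // values are divided by ncall on load
            int nval = int(e.second.size());
            out.write((const char *)&nval, sizeof(nval));
            out.write((const char *)e.second.data(), nval*sizeof(float));
        }
    }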
+
+static void prepare_imatrix(const std::string & imatrix_file,
+        const std::vector<std::string> & included_weights,
+        const std::vector<std::string> & excluded_weights,
+        std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+    if (!imatrix_file.empty()) {
+        load_imatrix(imatrix_file, imatrix_data);
+    }
+    if (imatrix_data.empty()) {
+        return;
+    }
+    if (!excluded_weights.empty()) {
+        for (auto & name : excluded_weights) {
+            for (auto it = imatrix_data.begin(); it != imatrix_data.end(); ) {
+                auto pos = it->first.find(name);
+                if (pos != std::string::npos) it = imatrix_data.erase(it);
+                else ++it;
+            }
+        }
+    }
+    if (!included_weights.empty()) {
+        std::unordered_map<std::string, std::vector<float>> tmp;
+        for (auto & name : included_weights) {
+            for (auto & e : imatrix_data) {
+                auto pos = e.first.find(name);
+                if (pos != std::string::npos) {
+                    tmp.emplace(std::move(e));
+                }
+            }
+        }
+        imatrix_data = std::move(tmp);
+    }
+    if (!imatrix_data.empty()) {
+        printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
+    }
+}
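Note that prepare_imatrix matches --include-weights/--exclude-weights arguments by substring (std::string::find), not by exact tensor name. A small sketch with hypothetical tensor names:

    // Hypothetical check of the substring-matching behavior: "attn" keeps
    // every entry whose name contains it; an empty file name skips loading.
    static void imatrix_filter_demo() {
        std::unordered_map<std::string, std::vector<float>> data = {
            { "blk.0.attn_k.weight", { 1.0f, 2.0f } },
            { "blk.0.ffn_up.weight", { 3.0f, 4.0f } },
        };
        prepare_imatrix("", { "attn" }, {}, data);
        // data now holds only "blk.0.attn_k.weight"
    }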
+
int main(int argc, char ** argv) {
    if (argc < 3) {
        usage(argv[0]);
@@ -96,6 +188,8 @@ int main(int argc, char ** argv) {
    llama_model_quantize_params params = llama_model_quantize_default_params();

    int arg_idx = 1;
+    std::string imatrix_file;
+    std::vector<std::string> included_weights, excluded_weights;

    for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
        if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@@ -104,15 +198,43 @@ int main(int argc, char ** argv) {
            params.allow_requantize = true;
        } else if (strcmp(argv[arg_idx], "--pure") == 0) {
            params.pure = true;
+        } else if (strcmp(argv[arg_idx], "--imatrix") == 0) {
+            if (arg_idx < argc-1) {
+                imatrix_file = argv[++arg_idx];
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--include-weights") == 0) {
+            if (arg_idx < argc-1) {
+                included_weights.push_back(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--exclude-weights") == 0) {
+            if (arg_idx < argc-1) {
+                excluded_weights.push_back(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
        } else {
            usage(argv[0]);
        }
    }

    if (argc - arg_idx < 2) {
+        printf("%s: bad arguments\n", argv[0]);
+        usage(argv[0]);
+    }
+    if (!included_weights.empty() && !excluded_weights.empty()) {
        usage(argv[0]);
    }

+    std::unordered_map<std::string, std::vector<float>> imatrix_data;
+    prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data);
+    if (!imatrix_data.empty()) {
+        params.imatrix = &imatrix_data;
+    }
+
    llama_backend_init(false);

    // parse command line arguments
@@ -163,6 +285,13 @@ int main(int argc, char ** argv) {
        }
    }

+    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) && imatrix_data.empty()) {
+        fprintf(stderr, "\n===============================================================================================\n");
+        fprintf(stderr, "Please do not use IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
+        fprintf(stderr, "===============================================================================================\n\n\n");
+        return 1;
+    }
+
    print_build_info();

    fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());