@@ -313,6 +313,7 @@ def prepare_tensors(self):
                         gguf.MODEL_TENSOR.OUTPUT,
                         gguf.MODEL_TENSOR.ATTN_V,
                         gguf.MODEL_TENSOR.ATTN_K,
+                        gguf.MODEL_TENSOR.ATTN_QKV,
                     )
                 ):
                     if self.ftype in (
@@ -323,9 +324,8 @@ def prepare_tensors(self):
                     elif self.ftype in (
                         gguf.LlamaFileType.MOSTLY_Q5_0,
                         gguf.LlamaFileType.MOSTLY_Q5_1,
-                        # gguf.LlamaFileType.MOSTLY_Q6_0,
                     ):
-                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                        data_qtype = gguf.GGMLQuantizationType.Q6_0
 
                 # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
                 if isinstance(data_qtype, bool):
@@ -343,8 +343,8 @@ def prepare_tensors(self):
                         data_qtype = gguf.GGMLQuantizationType.Q5_0
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_1:
                         data_qtype = gguf.GGMLQuantizationType.Q5_1
-                    # elif self.ftype == gguf.LlamaFileType.MOSTLY_Q6_0: // To be implemented?
-                        # data_qtype = gguf.GGMLQuantizationType.Q6_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q6_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q6_0
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                         data_qtype = gguf.GGMLQuantizationType.Q8_0
                     else:
@@ -419,12 +419,12 @@ def prepare_metadata(self, vocab_only: bool):
         logger.info("Set model quantization version")
         self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
 
-        logger.info("****************************************************************************************")
-        logger.info("** quantizing to `Q4_0`,`Q4_1`,`Q5_0`, or `Q5_1` is not equiv to using `llama-quantize`")
-        logger.info("** `Q4_0`,`Q4_1` are here using embeddings, output, attn_k and attn_v in q5_0")
-        logger.info("** `Q5_0`,`Q5_1` are here using embeddings, output, attn_k and attn_v in q8_0")
-        logger.info("** This, in order to generate a small but reliable conversion to create an iMatrix file.")
-        logger.info("****************************************************************************************")
+        logger.info("***************************************************************************************************")
+        logger.info("** Converting to `q4_0`,`q4_1`,`q5_0`,`q5_1` or `q6_0` is not equivalent to using `llama-quantize`!")
+        logger.info("** With ftype `q4_0`/`q4_1`, embeddings, output, attn_k and attn_v/attn_qkv are converted to q5_0.")
+        logger.info("** With ftype `q5_0`/`q5_1`, embeddings, output, attn_k and attn_v/attn_qkv are converted to q6_0.")
+        logger.info("** This keeps the conversion small but viable, e.g. as a base from which to compute an iMatrix file.")
+        logger.info("***************************************************************************************************")
 
     def write(self):
         self.prepare_tensors()
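
For clarity, here is a minimal sketch (not part of the patch) of the per-tensor behaviour the messages above describe. It assumes the fork's gguf-py exposes LlamaFileType.MOSTLY_Q6_0 and GGMLQuantizationType.Q6_0, as this commit requires, and is_sensitive_tensor is a hypothetical stand-in for the match_model_tensor_name() checks against TOKEN_EMBD, OUTPUT, ATTN_K, ATTN_V and ATTN_QKV:

import gguf

def pick_qtype(ftype: gguf.LlamaFileType, is_sensitive_tensor: bool) -> gguf.GGMLQuantizationType:
    # Tensors that degrade badly under quantization (embeddings, output, attn_k, attn_v/qkv)
    # get bumped one step up instead of using the requested ftype directly.
    if is_sensitive_tensor:
        if ftype in (gguf.LlamaFileType.MOSTLY_Q4_0, gguf.LlamaFileType.MOSTLY_Q4_1):
            return gguf.GGMLQuantizationType.Q5_0
        if ftype in (gguf.LlamaFileType.MOSTLY_Q5_0, gguf.LlamaFileType.MOSTLY_Q5_1):
            return gguf.GGMLQuantizationType.Q6_0  # previously Q8_0
    # Everything else gets the quant type matching the requested ftype.
    return {
        gguf.LlamaFileType.MOSTLY_Q4_0: gguf.GGMLQuantizationType.Q4_0,
        gguf.LlamaFileType.MOSTLY_Q4_1: gguf.GGMLQuantizationType.Q4_1,
        gguf.LlamaFileType.MOSTLY_Q5_0: gguf.GGMLQuantizationType.Q5_0,
        gguf.LlamaFileType.MOSTLY_Q5_1: gguf.GGMLQuantizationType.Q5_1,
        gguf.LlamaFileType.MOSTLY_Q6_0: gguf.GGMLQuantizationType.Q6_0,
        gguf.LlamaFileType.MOSTLY_Q8_0: gguf.GGMLQuantizationType.Q8_0,
    }[ftype]

In other words, the sensitive tensors now cap out at q6_0 rather than q8_0 when a q5_0 or q5_1 conversion is requested.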
@@ -4113,8 +4113,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, q4_0, q4_1, q5_0, q5_1 for a smaller conversion to then create an iMatrix file for example, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "q6_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, q4_0, q4_1, q5_0, q5_1 or q6_0 for a smaller conversion that can then be used, for example, to create an iMatrix file, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
41194119    parser .add_argument (
41204120        "--bigendian" , action = "store_true" ,
@@ -4204,7 +4204,7 @@ def main() -> None:
         "q4_1": gguf.LlamaFileType.MOSTLY_Q4_1,
         "q5_0": gguf.LlamaFileType.MOSTLY_Q5_0,
         "q5_1": gguf.LlamaFileType.MOSTLY_Q5_1,
-        # "q6_0": gguf.LlamaFileType.MOSTLY_Q6_0,
+        "q6_0": gguf.LlamaFileType.MOSTLY_Q6_0,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
         "auto": gguf.LlamaFileType.GUESSED,
     }
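
And an illustrative check (again not part of the patch) of what the new CLI choice resolves to end to end, reusing the hypothetical pick_qtype() sketch above; the inline dictionary mirrors the ftype map in main():

import gguf

# What `--outtype q6_0` now selects, per the entry enabled by this commit.
ftype = {"q6_0": gguf.LlamaFileType.MOSTLY_Q6_0}["q6_0"]
print(pick_qtype(ftype, is_sensitive_tensor=False))  # GGMLQuantizationType.Q6_0 for bulk tensors
print(pick_qtype(ftype, is_sensitive_tensor=True))   # also Q6_0: no extra bump is applied at q6_0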