48 changes: 45 additions & 3 deletions convert_hf_to_gguf.py
100755 → 100644
@@ -306,6 +306,27 @@ def prepare_tensors(self):
):
data_qtype = gguf.GGMLQuantizationType.F32

if data_qtype is False and any(
self.match_model_tensor_name(new_name, key, bid)
for key in (
gguf.MODEL_TENSOR.TOKEN_EMBD,
gguf.MODEL_TENSOR.OUTPUT,
gguf.MODEL_TENSOR.ATTN_V,
gguf.MODEL_TENSOR.ATTN_K,
)
):
if self.ftype in (
gguf.LlamaFileType.MOSTLY_Q4_0,
gguf.LlamaFileType.MOSTLY_Q4_1,
):
data_qtype = gguf.GGMLQuantizationType.Q5_0
elif self.ftype in (
gguf.LlamaFileType.MOSTLY_Q5_0,
gguf.LlamaFileType.MOSTLY_Q5_1,
# gguf.LlamaFileType.MOSTLY_Q6_0,
):
data_qtype = gguf.GGMLQuantizationType.Q8_0

# No override (data_qtype is False), or wants to be quantized (data_qtype is True)
if isinstance(data_qtype, bool):
if self.ftype == gguf.LlamaFileType.ALL_F32:
@@ -314,6 +335,16 @@ def prepare_tensors(self):
data_qtype = gguf.GGMLQuantizationType.F16
elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
data_qtype = gguf.GGMLQuantizationType.BF16
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_0:
data_qtype = gguf.GGMLQuantizationType.Q4_0
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_1:
data_qtype = gguf.GGMLQuantizationType.Q4_1
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_0:
data_qtype = gguf.GGMLQuantizationType.Q5_0
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_1:
data_qtype = gguf.GGMLQuantizationType.Q5_1
# elif self.ftype == gguf.LlamaFileType.MOSTLY_Q6_0: // To be implemented?
# data_qtype = gguf.GGMLQuantizationType.Q6_0
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
data_qtype = gguf.GGMLQuantizationType.Q8_0
else:
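Taken together, the hunks above implement a two-tier scheme: the requested low-bit type is used for most tensors, while token embeddings, the output tensor, attn_k and attn_v are bumped one tier up. A minimal, self-contained sketch of that decision flow (tensor matching is simplified here; the real code uses `self.match_model_tensor_name` and the `gguf` enums):

```python
# Illustrative sketch only: mirrors the override logic added in prepare_tensors().
SENSITIVE = ("token_embd", "output", "attn_k", "attn_v")

def pick_qtype(tensor_name: str, ftype: str) -> str:
    """Return the quantization type for one tensor under the new scheme."""
    if any(key in tensor_name for key in SENSITIVE):
        if ftype in ("q4_0", "q4_1"):
            return "q5_0"   # low-bit requests keep sensitive tensors at q5_0
        if ftype in ("q5_0", "q5_1"):
            return "q8_0"   # mid-bit requests keep sensitive tensors at q8_0
    return ftype            # everything else uses the requested type

assert pick_qtype("blk.0.attn_v.weight", "q4_0") == "q5_0"
assert pick_qtype("blk.0.ffn_down.weight", "q4_0") == "q4_0"
```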
@@ -387,6 +418,13 @@ def prepare_metadata(self, vocab_only: bool):

logger.info("Set model quantization version")
self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)

logger.info("****************************************************************************************")
logger.info("** quantizing to `Q4_0`,`Q4_1`,`Q5_0`, or `Q5_1`is not equiv to using `llama-quantize`")
logger.info("** `Q4_0`,`Q4_1` are here using embeddings, output, attn_k and attn_v in q5_0")
logger.info("** `Q5_0`,`Q5_1` are here using embeddings, output, attn_k and attn_v in q8_0")
logger.info("** This, in order to generate a small but reliable conversion to create an iMatrix file.")
logger.info("****************************************************************************************")

def write(self):
self.prepare_tensors()
@@ -3375,7 +3413,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
if match and int(match.group(1)) >= block_count:
return []


# process the experts separately
if name.find("mlp.experts") != -1:
n_experts = self.hparams["n_routed_experts"]
@@ -4076,8 +4113,8 @@ def parse_args() -> argparse.Namespace:
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
)
parser.add_argument(
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "auto"], default="f16",
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, q4_0, q4_1, q5_0, q5_1 for a smaller conversion to then create an iMatrix file for example, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
)
parser.add_argument(
"--bigendian", action="store_true",
@@ -4163,6 +4200,11 @@ def main() -> None:
"f32": gguf.LlamaFileType.ALL_F32,
"f16": gguf.LlamaFileType.MOSTLY_F16,
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
"q4_0": gguf.LlamaFileType.MOSTLY_Q4_0,
"q4_1": gguf.LlamaFileType.MOSTLY_Q4_1,
"q5_0": gguf.LlamaFileType.MOSTLY_Q5_0,
"q5_1": gguf.LlamaFileType.MOSTLY_Q5_1,
# "q6_0": gguf.LlamaFileType.MOSTLY_Q6_0,
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
"auto": gguf.LlamaFileType.GUESSED,
}
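For context on the new `--outtype` choices, a usage sketch (paths are placeholders; the `--outfile` flag and the `llama-imatrix` tool name and flags are assumptions carried over from mainline llama.cpp and may differ on this fork):

```sh
# Direct low-bit conversion; embeddings/output/attn_k/attn_v stay at q5_0 (see above)
python convert_hf_to_gguf.py /path/to/hf-model --outtype q4_0 --outfile model-q4_0.gguf

# Possible follow-up: compute an imatrix from the small conversion (assumed tool/flags)
./llama-imatrix -m model-q4_0.gguf -f calibration.txt -o model.imatrix
```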
1 change: 1 addition & 0 deletions ggml/src/ggml-cuda/mmvq.cu
@@ -652,6 +652,7 @@ bool ggml_cuda_mmvq_type_supported(ggml_type src0_type) {
case GGML_TYPE_IQ4_KSS:
case GGML_TYPE_IQ2_KS:
case GGML_TYPE_IQ5_K:
case GGML_TYPE_IQ5_KS:
case GGML_TYPE_IQ6_K:
case GGML_TYPE_IQ3_S:
return true;
7 changes: 4 additions & 3 deletions src/llama.cpp
@@ -18803,7 +18803,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4) {
new_type = !qs.has_output ? GGML_TYPE_IQ4_K_R4 : GGML_TYPE_Q5_K_R4;
}
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S_R4 ||
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_KL ||
ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S_R4 ||
ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KSS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS_R4) && !qs.has_output) {
new_type = GGML_TYPE_IQ5_K;
}
@@ -19165,8 +19166,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_K ||
ftype == LLAMA_FTYPE_MOSTLY_IQ4_KSS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS_R4 ||
ftype == LLAMA_FTYPE_MOSTLY_IQ5_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ5_KS_R4 ||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_R4 ||
ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS_R8 ||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_KL ||
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS_R8 ||
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ3_KT ||
ftype == LLAMA_FTYPE_MOSTLY_Q2_K_R4|| ftype == LLAMA_FTYPE_MOSTLY_IQ4_K_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K_R4 ||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_K_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S_R4) {