From df03a303f8915d87475ed74a3678b0fc01951495 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 10 Mar 2026 11:59:04 -0500 Subject: [PATCH 1/2] llama-quant : correct `n_attention_wv` usage In #19770, I introduced a regression in the way the `quantize_state_impl` counter values were initialized. I was incrementing and using `n_attention_wv` in the same loop, when its value should already have been finalized by the time we're deciding tensor types in `llama_tensor_get_type_impl` (for `use_more_bits`). I never observed a difference in any of [my tests](https://github.com/ggml-org/llama.cpp/pull/19770#issuecomment-4000424712) - it was only after @bartowski kindly pointed this out that I realized it was incorrect. (Thanks!) --- src/llama-quant.cpp | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3b0d234fbed..c8b19c5f198 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -870,9 +870,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: quantize_state_impl qs(model, params); - // these need to be set to n_layer by default - qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; - if (params->only_copy) { ftype = ml.ftype; } @@ -979,6 +976,25 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // compute tensor metadata once and cache it std::vector metadata(tensors.size()); + // initialize quantization state before preliminary loop (counters for use_more_bits) + { + for (size_t i = 0; i < tensors.size(); ++i) { + const auto * it = tensors[i]; + const struct ggml_tensor * tensor = it->tensor; + const char * name = tensor->name; + const auto cat = tensor_get_category(name); + if (category_is_attn_v(cat)) { + ++qs.n_attention_wv; + } + if (cat == tensor_category::OUTPUT) { + qs.has_tied_embeddings = false; + } + metadata[i].category = cat; // save and re-use the category while we're at it + } + // these 
also need to be set to n_layer by default + qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer; + } + // flag for --dry-run bool will_require_imatrix = false; @@ -991,16 +1007,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const struct ggml_tensor * tensor = it->tensor; const std::string name = ggml_get_name(tensor); - metadata[i].category = tensor_get_category(name); - - if (category_is_attn_v(metadata[i].category)) { - ++qs.n_attention_wv; - } - - if (tensor_name_match_output_weight(name.c_str())) { - qs.has_tied_embeddings = false; - } - uint16_t i_split = params->keep_split ? it->idx : 0; if (!ctx_outs[i_split]) { ctx_outs[i_split].reset(gguf_init_empty()); From a988640b2a9ebc1631cf3ed449794b556bac58b4 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 10 Mar 2026 12:12:20 -0500 Subject: [PATCH 2/2] simplify --- src/llama-quant.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c8b19c5f198..8e8ce231249 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -979,10 +979,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // initialize quantization state before preliminary loop (counters for use_more_bits) { for (size_t i = 0; i < tensors.size(); ++i) { - const auto * it = tensors[i]; - const struct ggml_tensor * tensor = it->tensor; - const char * name = tensor->name; - const auto cat = tensor_get_category(name); + const auto cat = tensor_get_category(tensors[i]->tensor->name); if (category_is_attn_v(cat)) { ++qs.n_attention_wv; }