From df03a303f8915d87475ed74a3678b0fc01951495 Mon Sep 17 00:00:00 2001
From: ddh0 <chemist-mulches-39@icloud.com>
Date: Tue, 10 Mar 2026 11:59:04 -0500
Subject: [PATCH 1/2] llama-quant : correct `n_attention_wv` usage

In #19770, I introduced a regression in the way the
`quantize_state_impl` counter values were initialized. I was
incrementing and using `n_attention_wv` in the same loop, when it should
already hold its final value by the time we're deciding tensor types in
`llama_tensor_get_type_impl` (for `use_more_bits`).

I never observed a difference in any of [my
tests](https://github.com/ggml-org/llama.cpp/pull/19770#issuecomment-4000424712);
it was only after @bartowski kindly pointed this out that I realized
it was incorrect. (Thanks!)
---
 src/llama-quant.cpp | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 3b0d234fbed..c8b19c5f198 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -870,9 +870,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
     quantize_state_impl qs(model, params);
 
-    // these need to be set to n_layer by default
-    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
-
     if (params->only_copy) {
         ftype = ml.ftype;
     }
@@ -979,6 +976,25 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     // compute tensor metadata once and cache it
     std::vector<tensor_metadata> metadata(tensors.size());
 
+    // initialize quantization state before preliminary loop (counters for use_more_bits)
+    {
+        for (size_t i = 0; i < tensors.size(); ++i) {
+            const auto * it = tensors[i];
+            const struct ggml_tensor * tensor = it->tensor;
+            const char * name = tensor->name;
+            const auto cat = tensor_get_category(name);
+            if (category_is_attn_v(cat)) {
+                ++qs.n_attention_wv;
+            }
+            if (cat == tensor_category::OUTPUT) {
+                qs.has_tied_embeddings = false;
+            }
+            metadata[i].category = cat; // save and re-use the category while we're at it
+        }
+        // these also need to be set to n_layer by default
+        qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
+    }
+
     // flag for --dry-run
     bool will_require_imatrix = false;
 
@@ -991,16 +1007,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         const struct ggml_tensor * tensor = it->tensor;
         const std::string name = ggml_get_name(tensor);
 
-        metadata[i].category = tensor_get_category(name);
-
-        if (category_is_attn_v(metadata[i].category)) {
-            ++qs.n_attention_wv;
-        }
-
-        if (tensor_name_match_output_weight(name.c_str())) {
-            qs.has_tied_embeddings = false;
-        }
-
         uint16_t i_split = params->keep_split ? it->idx : 0;
         if (!ctx_outs[i_split]) {
             ctx_outs[i_split].reset(gguf_init_empty());

From a988640b2a9ebc1631cf3ed449794b556bac58b4 Mon Sep 17 00:00:00 2001
From: ddh0 <chemist-mulches-39@icloud.com>
Date: Tue, 10 Mar 2026 12:12:20 -0500
Subject: [PATCH 2/2] simplify

---
 src/llama-quant.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index c8b19c5f198..8e8ce231249 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -979,10 +979,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     // initialize quantization state before preliminary loop (counters for use_more_bits)
     {
         for (size_t i = 0; i < tensors.size(); ++i) {
-            const auto * it = tensors[i];
-            const struct ggml_tensor * tensor = it->tensor;
-            const char * name = tensor->name;
-            const auto cat = tensor_get_category(name);
+            const auto cat = tensor_get_category(tensors[i]->tensor->name);
             if (category_is_attn_v(cat)) {
                 ++qs.n_attention_wv;
             }