gguf-py/gguf/gguf_writer.py (2 additions, 2 deletions)
@@ -714,8 +714,8 @@ def add_max_alibi_bias(self, bias: float) -> None:
     def add_clamp_kqv(self, value: float) -> None:
         self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)

-    def add_shared_kv_layers(self, value: float) -> None:
-        self.add_float32(Keys.Attention.SHARED_KV_LAYERS.format(arch=self.arch), value)
+    def add_shared_kv_layers(self, value: int) -> None:
+        self.add_uint32(Keys.Attention.SHARED_KV_LAYERS.format(arch=self.arch), value)

     def add_sliding_window_pattern(self, value: Sequence[bool]) -> None:
         self.add_array(Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch), value)
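Since the shared-KV layer count is a whole number of layers, the writer method now takes an int and stores it with add_uint32 instead of add_float32. Below is a minimal usage sketch of the updated gguf-py writer API; the file name, architecture string, and layer count are placeholder assumptions, not values from this PR.

# Hedged sketch: writing the shared-KV-layers count as an integer metadata key.
from gguf import GGUFWriter

writer = GGUFWriter("model.gguf", arch="llama")  # placeholder path and arch
writer.add_shared_kv_layers(2)                   # layer count, now stored as uint32
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()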
src/llama-quant.cpp (3 additions, 0 deletions)
@@ -894,6 +894,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
            new_type = params->token_embedding_type;
        }
+       if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "per_layer_token_embd.weight") == 0) {
+           new_type = params->token_embedding_type;
+       }
        if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
            new_type = params->output_tensor_type;
        }
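The llama-quant.cpp change extends the existing token_embedding_type override, previously applied only to token_embd.weight, to the per-layer token embedding tensor per_layer_token_embd.weight, so both embedding tensors receive the requested type during quantization. Below is a hedged sketch of one way to check the result with gguf-py after quantizing; the reader calls and file name are assumptions, not part of this PR.

# Hedged sketch: inspect the quantized file and print the type chosen for
# the per-layer token embedding tensor.
from gguf import GGUFReader

reader = GGUFReader("model-quant.gguf")  # placeholder path
for tensor in reader.tensors:
    if tensor.name == "per_layer_token_embd.weight":
        print(tensor.name, tensor.tensor_type)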