Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 129 additions & 3 deletions examples/debug/debug.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,106 @@
#include "common.h"
#include "log.h"
#include "llama.h"

#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <optional>
#include <regex>
#include <string>
#include <system_error>
#include <vector>

// ---------------------------------------------------------------------------
// Tensor-saving callback
// Saves each tensor matching the filter to <save_tensors_dir>/<name>.bin
// (flat float32, row-major matching PyTorch layout) and a .shape sidecar.
// ---------------------------------------------------------------------------

// Decode one element of a (possibly strided) tensor to float32.
// `nb` holds per-dimension byte strides, so this also works for
// non-contiguous views. Unsupported (quantized) types yield 0.0f;
// the caller filters those out before dumping.
static float tensor_elem_to_f32(const uint8_t * data, ggml_type type,
                                const size_t * nb,
                                int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
    const uint8_t * elem = data + i0 * nb[0] + i1 * nb[1] + i2 * nb[2] + i3 * nb[3];
    if (type == GGML_TYPE_F32) {
        return *(const float *) elem;
    }
    if (type == GGML_TYPE_F16) {
        return ggml_fp16_to_fp32(*(const ggml_fp16_t *) elem);
    }
    if (type == GGML_TYPE_BF16) {
        return ggml_bf16_to_fp32(*(const ggml_bf16_t *) elem);
    }
    return 0.0f; // skip quantized
}

// Callback data that dumps every filter-matching tensor to disk as a flat
// float32 .bin plus a ".shape" sidecar listing ne0..ne3.
struct save_tensor_callback_data : base_callback_data {
    // Output directory; created on first save if it does not exist.
    std::string save_dir;

    save_tensor_callback_data() = default;

    // Installs save_cb as the eval callback, overriding the one set by
    // base_callback_data. NOTE: `this` is stored in params.cb_eval_user_data,
    // so the object must live (at a stable address) for the whole run —
    // callers construct it in-place for that reason.
    save_tensor_callback_data(common_params & params,
                              const std::vector<std::string> & filter_patterns,
                              std::string dir)
        : base_callback_data(params, filter_patterns), save_dir(std::move(dir)) {
        // Override the callback set by base_callback_data
        params.cb_eval = save_cb;
        params.cb_eval_user_data = this;
    }

    // ggml eval callback. On the "ask" pass, opt in to every tensor; on the
    // data pass, save tensors matching the filter. Always returns true so
    // graph evaluation continues.
    static bool save_cb(struct ggml_tensor * t, bool ask, void * user_data) {
        auto * cb = static_cast<save_tensor_callback_data *>(user_data);

        if (ask) return true;

        // Check filter: an empty filter list means "save everything".
        bool match = cb->tensor_filters.empty();
        for (const auto & f : cb->tensor_filters) {
            if (std::regex_search(t->name, f)) { match = true; break; }
        }
        // Quantized types cannot be decoded by tensor_elem_to_f32 — skip them.
        if (!match || ggml_is_quantized(t->type)) return true;

        // Fetch data (may be on device)
        const bool is_host = ggml_backend_buffer_is_host(t->buffer);
        const size_t n_bytes = ggml_nbytes(t);
        if (!is_host) {
            cb->data.resize(n_bytes);
            ggml_backend_tensor_get(t, cb->data.data(), 0, n_bytes);
        }
        const uint8_t * raw = is_host ? (const uint8_t *) t->data : cb->data.data();

        // Flatten to float32 in GGML dimension order (ne[0] fastest = innermost).
        // Element access goes through nb[] strides, so non-contiguous views are
        // flattened correctly.
        const int64_t ne0 = t->ne[0], ne1 = t->ne[1], ne2 = t->ne[2], ne3 = t->ne[3];
        const int64_t n_elem = ne0 * ne1 * ne2 * ne3;
        std::vector<float> floats;
        floats.reserve(n_elem);
        for (int64_t i3 = 0; i3 < ne3; i3++)
            for (int64_t i2 = 0; i2 < ne2; i2++)
                for (int64_t i1 = 0; i1 < ne1; i1++)
                    for (int64_t i0 = 0; i0 < ne0; i0++)
                        floats.push_back(tensor_elem_to_f32(raw, t->type, t->nb, i0, i1, i2, i3));

        // Create output directory. Use the error_code overload: the throwing
        // overload would propagate an exception out of a C callback on failure.
        std::error_code ec;
        std::filesystem::create_directories(cb->save_dir, ec);
        if (ec) {
            LOG("Failed to create directory '%s': %s\n", cb->save_dir.c_str(), ec.message().c_str());
            return true;
        }

        // Sanitize tensor name for filename (replace '/' with '_')
        std::string safe_name = t->name;
        for (char & c : safe_name) if (c == '/') c = '_';

        // Write binary (report, but do not abort on, I/O failures)
        {
            std::string path = cb->save_dir + "/" + safe_name + ".bin";
            std::ofstream f(path, std::ios::binary);
            f.write((const char *) floats.data(), n_elem * sizeof(float));
            if (!f) {
                LOG("Failed to write tensor data to %s\n", path.c_str());
            }
        }
        // Write shape sidecar
        {
            std::string path = cb->save_dir + "/" + safe_name + ".shape";
            std::ofstream f(path);
            f << ne0 << " " << ne1 << " " << ne2 << " " << ne3 << "\n";
            if (!f) {
                LOG("Failed to write shape sidecar to %s\n", path.c_str());
            }
        }

        LOG("Saved tensor '%s' [%lld %lld %lld %lld] (%lld elems) to %s/\n",
            t->name, (long long)ne0, (long long)ne1, (long long)ne2, (long long)ne3,
            (long long)n_elem, cb->save_dir.c_str());

        return true;
    }
};

static void print_usage(int /*argc*/, char ** argv) {
const std::string usage_template = R"(
example usage:
Expand All @@ -25,7 +117,13 @@ static void print_usage(int /*argc*/, char ** argv) {

{prog} -m model.gguf -p "Hello my name is" --save-logits

Add --embedding to save embeddings)" "\n";
Add --embedding to save embeddings

Save intermediate tensors for comparison:

{prog} -m model.gguf -p "Hello my name is" --tensor-filter "inp_scaled|attn_norm-0|attn_out-0|result_norm" --save-tensors data/tensors-llm

Each matching tensor is written as a flat float32 .bin + .shape sidecar.)" "\n";

// Fix the source code indentation above that is introduced by the raw string literal.
std::string usage = std::regex_replace(usage_template, std::regex("\\n {8}"), "\n");
Expand Down Expand Up @@ -213,6 +311,24 @@ static bool run(llama_context * ctx, const common_params & params) {
int main(int argc, char ** argv) {
common_params params;

// Pre-parse --save-tensors <dir> before handing off to common_params_parse.
// The option is consumed here and removed from argv so common_params_parse
// does not see an unknown flag.
std::string save_tensors_dir;
{
std::vector<char *> new_argv;
new_argv.push_back(argv[0]);
for (int i = 1; i < argc; i++) {
if (std::string(argv[i]) == "--save-tensors" && i + 1 < argc) {
save_tensors_dir = argv[++i];
} else {
new_argv.push_back(argv[i]);
}
}
argc = static_cast<int>(new_argv.size());
for (int i = 0; i < argc; i++) argv[i] = new_argv[i];
}

common_init();

if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DEBUG, print_usage)) {
Expand All @@ -222,7 +338,17 @@ int main(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);

base_callback_data cb_data(params, params.tensor_filter);
// Use save_tensor_callback_data when --save-tensors is given, otherwise
// the standard verbose callback.
// Use optional+emplace to construct in-place so that the constructor's
// `this` pointer (stored in params.cb_eval_user_data) is stable.
std::optional<save_tensor_callback_data> save_cb;
std::optional<base_callback_data> base_cb;
if (!save_tensors_dir.empty()) {
save_cb.emplace(params, params.tensor_filter, save_tensors_dir);
} else {
base_cb.emplace(params, params.tensor_filter);
}

auto llama_init = common_init_from_params(params);

Expand Down
4 changes: 4 additions & 0 deletions examples/model-conversion/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ causal-verify-logits: causal-run-original-model causal-run-converted-model
@MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/compare-logits.py
@MODEL_PATH="$(MODEL_PATH)" ./scripts/utils/check-nmse.py -m ${MODEL_PATH}

# Like causal-verify-logits, but without the causal-run-original-model
# dependency — assumes the original-model outputs are already on disk.
causal-verify-logits-no-org: causal-run-converted-model
	@MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/compare-logits.py
	@MODEL_PATH="$(MODEL_PATH)" ./scripts/utils/check-nmse.py -m ${MODEL_PATH}

causal-run-original-embeddings:
@./scripts/causal/run-casual-gen-embeddings-org.py

Expand Down
104 changes: 104 additions & 0 deletions examples/model-conversion/scripts/causal/compare-tensors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#!/usr/bin/env python3
"""Compare intermediate tensors saved by run-org-model.py (--dump-tensors)
and llama-debug (--save-tensors) to locate the first point of divergence
in the Gemma4 (or any other) forward pass.

Usage:
python compare-tensors.py <pytorch-dir> <llamacpp-dir> [tensor ...]

If no tensor names are given, all .bin files found in pytorch-dir are compared.
"""

import sys
import os
import numpy as np
from pathlib import Path


def nmse_db(ref, test):
    """Return (NMSE, NMSE in dB) of `test` against reference `ref`.

    NMSE = mean((ref - test)^2) / var(ref). For a degenerate reference
    (constant or all-zero, i.e. variance ~ 0) the ratio is undefined:
    report 0.0 only when the error itself is ~0, and inf otherwise —
    previously this case always reported 0.0, masking real divergence.
    """
    mse = float(np.mean((ref - test) ** 2))
    var = float(np.var(ref))
    if var > 1e-30:
        nmse = mse / var
    else:
        nmse = 0.0 if mse <= 1e-30 else float("inf")
    # +1e-300 keeps log10 finite when nmse == 0.
    return nmse, 10 * np.log10(nmse + 1e-300)


def load_tensor(directory, name):
    """Load `<name>.bin` (flat float32) and its optional `<name>.shape` sidecar.

    Returns (array, shape). If the .bin file is absent, returns (None, None);
    if only the sidecar is absent, shape is None.
    """
    base = Path(directory)
    bin_path = base / f"{name}.bin"
    if not bin_path.exists():
        return None, None

    values = np.fromfile(bin_path, dtype=np.float32)

    sidecar = base / f"{name}.shape"
    if sidecar.exists():
        dims = tuple(int(tok) for tok in sidecar.read_text().split())
    else:
        dims = None
    return values, dims


def compare(ref_dir, llm_dir, tensor_names):
    """Print a per-tensor comparison table (NMSE, dB, max/mean abs error)
    between the PyTorch dump dir and the llama.cpp dump dir.

    With an empty `tensor_names`, every *.bin found in `ref_dir` is compared.
    """
    ref_root = Path(ref_dir)
    llm_root = Path(llm_dir)

    names = tensor_names or sorted(p.stem for p in ref_root.glob("*.bin"))

    col = 28
    print(f"\n{'Tensor':<{col}} {'N-elem':>8} {'NMSE':>12} {'dB':>8} {'MaxErr':>10} {'MeanErr':>10}")
    print("-" * (col + 60))

    missing_seen = False
    for name in names:
        ref, ref_shape = load_tensor(ref_root, name)
        llm, llm_shape = load_tensor(llm_root, name)

        if ref is None:
            print(f"{name:<{col}} MISSING in pytorch dir")
            missing_seen = True
            continue
        if llm is None:
            print(f"{name:<{col}} MISSING in llamacpp dir")
            missing_seen = True
            continue

        # Shapes may differ (e.g. PyTorch saves [seq, hidden] as [seq, hidden]
        # while llama.cpp saves [hidden, seq] flattened the same way), but the
        # element count must match.
        if ref.size != llm.size:
            print(f"{name:<{col}} SIZE MISMATCH ref={ref.size} (shape {ref_shape}) llm={llm.size} (shape {llm_shape})")
            continue

        nmse, db = nmse_db(ref, llm)
        abs_err = np.abs(ref - llm)
        max_err = float(abs_err.max())
        mean_err = float(abs_err.mean())

        # Severity flag, most severe threshold first.
        if nmse > 0.1:
            flag = " ❌ DIVERGED"
        elif nmse > 1e-2:
            flag = " ⚠ high"
        elif nmse > 1e-4:
            flag = " ~ ok"
        else:
            flag = " ✓"

        print(f"{name:<{col}} {ref.size:>8d} {nmse:>12.3e} {db:>8.2f} {max_err:>10.4f} {mean_err:>10.4f}{flag}")

    print()
    if missing_seen:
        print("Some tensors were missing — check that both runs used the same prompt and filters.")


def main():
    """CLI entry point: compare-tensors.py <pytorch-dir> <llamacpp-dir> [tensor ...].

    Prints usage and exits with status 1 when fewer than two directories
    are supplied.
    """
    argv = sys.argv[1:]
    if len(argv) < 2:
        print(__doc__)
        sys.exit(1)

    compare(argv[0], argv[1], argv[2:])


if __name__ == "__main__":
main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Tensor pair config for Gemma4 model validation.
# Maps llama.cpp cb() tensor names to PyTorch hook tensor names.
#
# Used by validate-conversion.py.

model_type: gemma4

# Total number of layers. Can be overridden with --n-layers.
# Gemma4-E2B has 35 layers (0-34); Gemma4-27B has 62.
n_layers: 35

# Tensors that are not layer-specific.
# 'llama' names are saved by llama-debug --save-tensors.
# 'torch' names are saved by run-org-model.py --dump-tensors.
global_tensors:
- llama: inp_scaled
torch: inp_scaled
description: "Embedding output after sqrt(n_embd) scaling (BF16 semantics)"

- llama: result_norm
torch: result_norm
description: "Final RMS norm output (last token only)"

# Per-layer tensors. Use {layer} as a placeholder for the layer index.
# Only dumped for the target layers (first, middle, last).
layer_tensors:
- llama: "attn_norm-{layer}"
torch: "attn_norm-{layer}"
description: "Attention input RMS norm * weight"

- llama: "attn_raw_out-{layer}"
torch: "attn_raw_out-{layer}"
description: "Raw attention output (before post-attention norm)"

- llama: "attn_post_norm-{layer}"
torch: "attn_post_norm-{layer}"
description: "Post-attention norm * weight (before residual add)"

- llama: "attn_out-{layer}"
torch: "attn_out-{layer}"
description: "After residual add, before FFN branch"

- llama: "ffn_norm-{layer}"
torch: "ffn_norm-{layer}"
description: "Pre-FFN RMS norm * weight"

- llama: "ffn_post_norm-{layer}"
torch: "ffn_post_norm-{layer}"
description: "Post-FFN norm * weight (before residual add)"

- llama: "l_out-{layer}"
torch: "l_out-{layer}"
description: "Full layer output (after FFN residual add)"
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,4 @@ echo $MODEL_TESTING_PROMPT

cmake --build ${BUILD_DIR} --target llama-debug -j8

${BUILD_DIR}/bin/llama-debug -m "$CONVERTED_MODEL" -p "$MODEL_TESTING_PROMPT" --save-logits
${BUILD_DIR}/bin/llama-debug --temp 0 -m "$CONVERTED_MODEL" -p "$MODEL_TESTING_PROMPT" --save-logits
Loading
Loading