Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 129 additions & 3 deletions examples/debug/debug.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,106 @@
#include "common.h"
#include "log.h"
#include "llama.h"

#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <optional>
#include <regex>
#include <string>
#include <system_error>
#include <vector>

// ---------------------------------------------------------------------------
// Tensor-saving callback
// Saves each tensor matching the filter to <save_tensors_dir>/<name>.bin
// (flat float32, row-major matching PyTorch layout) and a .shape sidecar.
// ---------------------------------------------------------------------------

// Decode one element of a (possibly strided) tensor to float32.
// `nb` holds per-dimension byte strides, so this also works for
// non-contiguous views. Unsupported (quantized) types yield 0.0f;
// the caller filters those out before dumping.
static float tensor_elem_to_f32(const uint8_t * data, ggml_type type,
                                const size_t * nb,
                                int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
    const uint8_t * elem = data + i0 * nb[0] + i1 * nb[1] + i2 * nb[2] + i3 * nb[3];
    if (type == GGML_TYPE_F32) {
        return *(const float *) elem;
    }
    if (type == GGML_TYPE_F16) {
        return ggml_fp16_to_fp32(*(const ggml_fp16_t *) elem);
    }
    if (type == GGML_TYPE_BF16) {
        return ggml_bf16_to_fp32(*(const ggml_bf16_t *) elem);
    }
    return 0.0f; // skip quantized
}

// Callback data that dumps every filter-matching tensor to disk as a flat
// float32 .bin plus a ".shape" sidecar listing ne0..ne3.
struct save_tensor_callback_data : base_callback_data {
    // Output directory; created on first save if it does not exist.
    std::string save_dir;

    save_tensor_callback_data() = default;

    // Installs save_cb as the eval callback, overriding the one set by
    // base_callback_data. NOTE: `this` is stored in params.cb_eval_user_data,
    // so the object must live (at a stable address) for the whole run —
    // callers construct it in-place for that reason.
    save_tensor_callback_data(common_params & params,
                              const std::vector<std::string> & filter_patterns,
                              std::string dir)
        : base_callback_data(params, filter_patterns), save_dir(std::move(dir)) {
        // Override the callback set by base_callback_data
        params.cb_eval = save_cb;
        params.cb_eval_user_data = this;
    }

    // ggml eval callback. On the "ask" pass, opt in to every tensor; on the
    // data pass, save tensors matching the filter. Always returns true so
    // graph evaluation continues.
    static bool save_cb(struct ggml_tensor * t, bool ask, void * user_data) {
        auto * cb = static_cast<save_tensor_callback_data *>(user_data);

        if (ask) return true;

        // Check filter: an empty filter list means "save everything".
        bool match = cb->tensor_filters.empty();
        for (const auto & f : cb->tensor_filters) {
            if (std::regex_search(t->name, f)) { match = true; break; }
        }
        // Quantized types cannot be decoded by tensor_elem_to_f32 — skip them.
        if (!match || ggml_is_quantized(t->type)) return true;

        // Fetch data (may be on device)
        const bool is_host = ggml_backend_buffer_is_host(t->buffer);
        const size_t n_bytes = ggml_nbytes(t);
        if (!is_host) {
            cb->data.resize(n_bytes);
            ggml_backend_tensor_get(t, cb->data.data(), 0, n_bytes);
        }
        const uint8_t * raw = is_host ? (const uint8_t *) t->data : cb->data.data();

        // Flatten to float32 in GGML dimension order (ne[0] fastest = innermost).
        // Element access goes through nb[] strides, so non-contiguous views are
        // flattened correctly.
        const int64_t ne0 = t->ne[0], ne1 = t->ne[1], ne2 = t->ne[2], ne3 = t->ne[3];
        const int64_t n_elem = ne0 * ne1 * ne2 * ne3;
        std::vector<float> floats;
        floats.reserve(n_elem);
        for (int64_t i3 = 0; i3 < ne3; i3++)
            for (int64_t i2 = 0; i2 < ne2; i2++)
                for (int64_t i1 = 0; i1 < ne1; i1++)
                    for (int64_t i0 = 0; i0 < ne0; i0++)
                        floats.push_back(tensor_elem_to_f32(raw, t->type, t->nb, i0, i1, i2, i3));

        // Create output directory. Use the error_code overload: the throwing
        // overload would propagate an exception out of a C callback on failure.
        std::error_code ec;
        std::filesystem::create_directories(cb->save_dir, ec);
        if (ec) {
            LOG("Failed to create directory '%s': %s\n", cb->save_dir.c_str(), ec.message().c_str());
            return true;
        }

        // Sanitize tensor name for filename (replace '/' with '_')
        std::string safe_name = t->name;
        for (char & c : safe_name) if (c == '/') c = '_';

        // Write binary (report, but do not abort on, I/O failures)
        {
            std::string path = cb->save_dir + "/" + safe_name + ".bin";
            std::ofstream f(path, std::ios::binary);
            f.write((const char *) floats.data(), n_elem * sizeof(float));
            if (!f) {
                LOG("Failed to write tensor data to %s\n", path.c_str());
            }
        }
        // Write shape sidecar
        {
            std::string path = cb->save_dir + "/" + safe_name + ".shape";
            std::ofstream f(path);
            f << ne0 << " " << ne1 << " " << ne2 << " " << ne3 << "\n";
            if (!f) {
                LOG("Failed to write shape sidecar to %s\n", path.c_str());
            }
        }

        LOG("Saved tensor '%s' [%lld %lld %lld %lld] (%lld elems) to %s/\n",
            t->name, (long long)ne0, (long long)ne1, (long long)ne2, (long long)ne3,
            (long long)n_elem, cb->save_dir.c_str());

        return true;
    }
};

static void print_usage(int /*argc*/, char ** argv) {
const std::string usage_template = R"(
example usage:
Expand All @@ -25,7 +117,13 @@ static void print_usage(int /*argc*/, char ** argv) {

{prog} -m model.gguf -p "Hello my name is" --save-logits

Add --embedding to save embeddings)" "\n";
Add --embedding to save embeddings

Save intermediate tensors for comparison:

{prog} -m model.gguf -p "Hello my name is" --tensor-filter "inp_scaled|attn_norm-0|attn_out-0|result_norm" --save-tensors data/tensors-llm

Each matching tensor is written as a flat float32 .bin + .shape sidecar.)" "\n";

// Fix the source code indentation above that is introduced by the raw string literal.
std::string usage = std::regex_replace(usage_template, std::regex("\\n {8}"), "\n");
Expand Down Expand Up @@ -213,6 +311,24 @@ static bool run(llama_context * ctx, const common_params & params) {
int main(int argc, char ** argv) {
common_params params;

// Pre-parse --save-tensors <dir> before handing off to common_params_parse.
// The option is consumed here and removed from argv so common_params_parse
// does not see an unknown flag.
std::string save_tensors_dir;
{
std::vector<char *> new_argv;
new_argv.push_back(argv[0]);
for (int i = 1; i < argc; i++) {
if (std::string(argv[i]) == "--save-tensors" && i + 1 < argc) {
save_tensors_dir = argv[++i];
} else {
new_argv.push_back(argv[i]);
}
}
argc = static_cast<int>(new_argv.size());
for (int i = 0; i < argc; i++) argv[i] = new_argv[i];
}

common_init();

if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DEBUG, print_usage)) {
Expand All @@ -222,7 +338,17 @@ int main(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);

base_callback_data cb_data(params, params.tensor_filter);
// Use save_tensor_callback_data when --save-tensors is given, otherwise
// the standard verbose callback.
// Use optional+emplace to construct in-place so that the constructor's
// `this` pointer (stored in params.cb_eval_user_data) is stable.
std::optional<save_tensor_callback_data> save_cb;
std::optional<base_callback_data> base_cb;
if (!save_tensors_dir.empty()) {
save_cb.emplace(params, params.tensor_filter, save_tensors_dir);
} else {
base_cb.emplace(params, params.tensor_filter);
}

auto llama_init = common_init_from_params(params);

Expand Down
4 changes: 4 additions & 0 deletions examples/model-conversion/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ causal-verify-logits: causal-run-original-model causal-run-converted-model
@MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/compare-logits.py
@MODEL_PATH="$(MODEL_PATH)" ./scripts/utils/check-nmse.py -m ${MODEL_PATH}

# Like causal-verify-logits, but without the causal-run-original-model
# dependency — assumes the original-model outputs are already on disk.
causal-verify-logits-no-org: causal-run-converted-model
	@MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/compare-logits.py
	@MODEL_PATH="$(MODEL_PATH)" ./scripts/utils/check-nmse.py -m ${MODEL_PATH}

causal-run-original-embeddings:
@./scripts/causal/run-casual-gen-embeddings-org.py

Expand Down
104 changes: 104 additions & 0 deletions examples/model-conversion/scripts/causal/compare-tensors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#!/usr/bin/env python3
"""Compare intermediate tensors saved by run-org-model.py (--dump-tensors)
and llama-debug (--save-tensors) to locate the first point of divergence
in the Gemma4 (or any other) forward pass.

Usage:
python compare-tensors.py <pytorch-dir> <llamacpp-dir> [tensor ...]

If no tensor names are given, all .bin files found in pytorch-dir are compared.
"""

import sys
import os
import numpy as np
from pathlib import Path


def nmse_db(ref, test):
    """Return (NMSE, NMSE in dB) of `test` against reference `ref`.

    NMSE = mean((ref - test)^2) / var(ref). For a degenerate reference
    (constant or all-zero, i.e. variance ~ 0) the ratio is undefined:
    report 0.0 only when the error itself is ~0, and inf otherwise —
    previously this case always reported 0.0, masking real divergence.
    """
    mse = float(np.mean((ref - test) ** 2))
    var = float(np.var(ref))
    if var > 1e-30:
        nmse = mse / var
    else:
        nmse = 0.0 if mse <= 1e-30 else float("inf")
    # +1e-300 keeps log10 finite when nmse == 0.
    return nmse, 10 * np.log10(nmse + 1e-300)


def load_tensor(directory, name):
    """Load `<name>.bin` (flat float32) and its optional `<name>.shape` sidecar.

    Returns (array, shape). If the .bin file is absent, returns (None, None);
    if only the sidecar is absent, shape is None.
    """
    base = Path(directory)
    bin_path = base / f"{name}.bin"
    if not bin_path.exists():
        return None, None

    values = np.fromfile(bin_path, dtype=np.float32)

    sidecar = base / f"{name}.shape"
    if sidecar.exists():
        dims = tuple(int(tok) for tok in sidecar.read_text().split())
    else:
        dims = None
    return values, dims


def compare(ref_dir, llm_dir, tensor_names):
    """Print a per-tensor comparison table (NMSE, dB, max/mean abs error)
    between the PyTorch dump dir and the llama.cpp dump dir.

    With an empty `tensor_names`, every *.bin found in `ref_dir` is compared.
    """
    ref_root = Path(ref_dir)
    llm_root = Path(llm_dir)

    names = tensor_names or sorted(p.stem for p in ref_root.glob("*.bin"))

    col = 28
    print(f"\n{'Tensor':<{col}} {'N-elem':>8} {'NMSE':>12} {'dB':>8} {'MaxErr':>10} {'MeanErr':>10}")
    print("-" * (col + 60))

    missing_seen = False
    for name in names:
        ref, ref_shape = load_tensor(ref_root, name)
        llm, llm_shape = load_tensor(llm_root, name)

        if ref is None:
            print(f"{name:<{col}} MISSING in pytorch dir")
            missing_seen = True
            continue
        if llm is None:
            print(f"{name:<{col}} MISSING in llamacpp dir")
            missing_seen = True
            continue

        # Shapes may differ (e.g. PyTorch saves [seq, hidden] as [seq, hidden]
        # while llama.cpp saves [hidden, seq] flattened the same way), but the
        # element count must match.
        if ref.size != llm.size:
            print(f"{name:<{col}} SIZE MISMATCH ref={ref.size} (shape {ref_shape}) llm={llm.size} (shape {llm_shape})")
            continue

        nmse, db = nmse_db(ref, llm)
        abs_err = np.abs(ref - llm)
        max_err = float(abs_err.max())
        mean_err = float(abs_err.mean())

        # Severity flag, most severe threshold first.
        if nmse > 0.1:
            flag = " ❌ DIVERGED"
        elif nmse > 1e-2:
            flag = " ⚠ high"
        elif nmse > 1e-4:
            flag = " ~ ok"
        else:
            flag = " ✓"

        print(f"{name:<{col}} {ref.size:>8d} {nmse:>12.3e} {db:>8.2f} {max_err:>10.4f} {mean_err:>10.4f}{flag}")

    print()
    if missing_seen:
        print("Some tensors were missing — check that both runs used the same prompt and filters.")


def main():
    """CLI entry point: compare-tensors.py <pytorch-dir> <llamacpp-dir> [tensor ...].

    Prints usage and exits with status 1 when fewer than two directories
    are supplied.
    """
    argv = sys.argv[1:]
    if len(argv) < 2:
        print(__doc__)
        sys.exit(1)

    compare(argv[0], argv[1], argv[2:])


if __name__ == "__main__":
main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Tensor pair config for Gemma4 model validation.
# Maps llama.cpp cb() tensor names to PyTorch hook tensor names.
#
# Used by validate-conversion.py.

model_type: gemma4

# Total number of layers. Can be overridden with --n-layers.
# Gemma4-E2B has 35 layers (0-34); Gemma4-27B has 62.
n_layers: 35

# Tensors that are not layer-specific.
# 'llama' names are saved by llama-debug --save-tensors.
# 'torch' names are saved by run-org-model.py --dump-tensors.
global_tensors:
- llama: inp_scaled
torch: inp_scaled
description: "Embedding output after sqrt(n_embd) scaling (BF16 semantics)"

- llama: result_norm
torch: result_norm
description: "Final RMS norm output (last token only)"

# Per-layer tensors. Use {layer} as a placeholder for the layer index.
# Only dumped for the target layers (first, middle, last).
layer_tensors:
- llama: "attn_norm-{layer}"
torch: "attn_norm-{layer}"
description: "Attention input RMS norm * weight"

- llama: "attn_raw_out-{layer}"
torch: "attn_raw_out-{layer}"
description: "Raw attention output (before post-attention norm)"

- llama: "attn_post_norm-{layer}"
torch: "attn_post_norm-{layer}"
description: "Post-attention norm * weight (before residual add)"

- llama: "attn_out-{layer}"
torch: "attn_out-{layer}"
description: "After residual add, before FFN branch"

- llama: "ffn_norm-{layer}"
torch: "ffn_norm-{layer}"
description: "Pre-FFN RMS norm * weight"

- llama: "ffn_post_norm-{layer}"
torch: "ffn_post_norm-{layer}"
description: "Post-FFN norm * weight (before residual add)"

- llama: "l_out-{layer}"
torch: "l_out-{layer}"
description: "Full layer output (after FFN residual add)"
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,4 @@ echo $MODEL_TESTING_PROMPT

cmake --build ${BUILD_DIR} --target llama-debug -j8

${BUILD_DIR}/bin/llama-debug -m "$CONVERTED_MODEL" -p "$MODEL_TESTING_PROMPT" --save-logits
${BUILD_DIR}/bin/llama-debug --temp 0 -m "$CONVERTED_MODEL" -p "$MODEL_TESTING_PROMPT" --save-logits
Loading
Loading