From ddf4b9dbc1bb447289bbd79b80d4fc04772526c4 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Mon, 23 Feb 2026 17:44:57 +0100
Subject: [PATCH 01/18] tests: allow loading test-backend-ops tests from json

---
 tests/test-backend-ops.cpp | 160 ++++++++++++++++++++++++++++++++++---
 1 file changed, 148 insertions(+), 12 deletions(-)
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 32a83b001d8..6b0d8277cd0 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -31,16 +31,20 @@
 #include <cstring>
 #include <ctime>
 #include <future>
+#include <fstream>
 #include <memory>
 #include <random>
 #include <regex>
 #include <set>
+#include <sstream>
 #include <string>
 #include <string_view>
 #include <thread>
 #include <vector>
 #include <unordered_map>
 
+#include <nlohmann/json.hpp>
+
 #ifdef __EMSCRIPTEN__
 #   define N_THREADS 1
 #else
@@ -420,6 +424,10 @@ static std::string var_to_str(ggml_scale_mode mode) {
     return str;
 }
 
+static std::string var_to_str(ggml_op op) {
+    return ggml_op_name(op);
+}
+
 #define VAR_TO_STR(x) (#x "=" + var_to_str(x))
 
 #define VARS_TO_STR1(a) VAR_TO_STR(a)
@@ -6648,6 +6656,81 @@ struct test_diag : public test_case {
     }
 };
 
+// Deserializable generic test case
+struct input_tensor {
+    ggml_type type;
+    std::array<int64_t, 4> ne;
+};
+
+static std::string var_to_str(const std::vector<input_tensor>& sources) {
+    std::ostringstream oss;
+    bool first = true;
+    for (const auto& src : sources) {
+        if (!first) oss << ",";
+        oss << ggml_type_name(src.type) << "[" << src.ne[0] << "," << src.ne[1] << "," << src.ne[2] << "," << src.ne[3] << "]";
+        first = false;
+    }
+    return oss.str();
+}
+
+// Formats op_params as "[index:value,...]" listing only the non-zero entries,
+// so an all-zero parameter block is printed as "[]".
+static std::string var_to_str(const std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)>& params) {
+    std::ostringstream out;
+    const char * separator = "";
+    out << "[";
+    for (size_t idx = 0; idx < params.size(); ++idx) {
+        if (params[idx] == 0) continue; // zero-valued entries are omitted
+        out << separator << idx << ":" << params[idx];
+        separator = ",";
+    }
+    out << "]";
+    return out.str();
+}
+
+
+struct test_generic_op : public test_case {
+    const ggml_op op;
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+    const std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params;
+
+    const std::vector<input_tensor> sources;
+
+    std::string vars() override {
+        return VARS_TO_STR5(op, type, ne, op_params, sources);
+    }
+
+    test_generic_op(ggml_op op, ggml_type type, std::array<int64_t, 4> ne,
+                    std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params,
+                    std::vector<input_tensor> sources)
+        : op(op), type(type), ne(ne), op_params(op_params), sources(sources) {}
+
+    // Build a single-node graph: one output tensor carrying the requested op, its op_params and its source tensors.
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        const size_t source_count = std::min(sources.size(), (size_t)GGML_MAX_SRC);
+
+        std::array<ggml_tensor *, GGML_MAX_SRC> source_tensors;
+        for (size_t i = 0; i < source_count; ++i) {
+            const input_tensor& src = sources[i];
+            source_tensors[i] = ggml_new_tensor_4d(ctx, src.type, src.ne[0], src.ne[1], src.ne[2], src.ne[3]);
+        }
+
+        ggml_tensor * out = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
+        out->op = op;
+        for (size_t i = 0; i < source_count; ++i) {
+            out->src[i] = source_tensors[i];
+        }
+
+        memcpy(out->op_params, op_params.data(), GGML_MAX_OP_PARAMS);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+
+};
+
 
 enum llm_norm_type {
     LLM_NORM,
@@ -8733,8 +8816,41 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     return test_cases;
 }
 
+static std::vector<std::unique_ptr<test_case>> make_test_cases_from_json(const char * path) {
+    std::ifstream f(path);
+    nlohmann::json data = nlohmann::json::parse(f);
+
+    GGML_ASSERT(data.is_array());
+
+    std::vector<std::unique_ptr<test_case>> test_cases;
+
+    for (const auto& input_case : data) {
+        const ggml_op op = input_case["op"];
+        const ggml_type type = input_case["type"];
+        auto ne_arr = input_case["ne"];
+        const std::array<int64_t, 4> ne = {ne_arr[0], ne_arr[1], ne_arr[2], ne_arr[3]};
+
+        auto op_arr = input_case["op_params"];
+        std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params = {};
+        for (size_t i = 0; i < op_arr.size() && i < op_params.size(); i++) {
+            op_params[i] = op_arr[i];
+        }
+
+        std::vector<input_tensor> sources;
+        for (const auto& src : input_case["sources"]) {
+            auto ne_arr = src["ne"];
+            const std::array<int64_t, 4> src_ne = {ne_arr[0], ne_arr[1], ne_arr[2], ne_arr[3]};
+            sources.push_back({(ggml_type)src["type"], src_ne});
+        }
+
+        test_cases.emplace_back(new test_generic_op(op, type, ne, op_params, sources));
+    }
+
+    return test_cases;
+}
+
 static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_names_filter, const char * params_filter,
-                         printer * output_printer) {
+                         printer * output_printer, const char * test_json_path) {
     auto filter_test_cases = [](std::vector<std::unique_ptr<test_case>> & test_cases, const char * params_filter) {
         if (params_filter == nullptr) {
             return;
@@ -8752,9 +8868,26 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         }
     };
 
+    std::vector<std::unique_ptr<test_case>> test_cases;
+
+    if (test_json_path == nullptr) {
+        switch (mode) {
+        case MODE_TEST:
+        case MODE_GRAD:
+        case MODE_SUPPORT:
+            test_cases = make_test_cases_eval();
+            break;
+        case MODE_PERF:
+            test_cases = make_test_cases_perf();
+            break;
+        }
+    } else {
+        test_cases = make_test_cases_from_json(test_json_path);
+    }
+
+    filter_test_cases(test_cases, params_filter);
+
     if (mode == MODE_TEST) {
-        auto test_cases = make_test_cases_eval();
-        filter_test_cases(test_cases, params_filter);
         ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
         if (backend_cpu == NULL) {
             test_operation_info info("", "", "CPU");
@@ -8794,8 +8927,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     }
 
     if (mode == MODE_GRAD) {
-        auto test_cases = make_test_cases_eval();
-        filter_test_cases(test_cases, params_filter);
         size_t n_ok = 0;
         for (auto & test : test_cases) {
             if (test->eval_grad(backend, op_names_filter, output_printer)) {
@@ -8808,8 +8939,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     }
 
     if (mode == MODE_PERF) {
-        auto test_cases = make_test_cases_perf();
-        filter_test_cases(test_cases, params_filter);
         for (auto & test : test_cases) {
             test->eval_perf(backend, op_names_filter, output_printer);
         }
@@ -8817,9 +8946,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     }
 
     if (mode == MODE_SUPPORT) {
-        auto test_cases = make_test_cases_eval();
-        filter_test_cases(test_cases, params_filter);
-
         // Filter out fusion cases
         test_cases.erase(
             std::remove_if(test_cases.begin(), test_cases.end(), [](const std::unique_ptr<test_case> & tc) {
@@ -8938,7 +9064,8 @@ static void show_test_coverage() {
 }
 
 static void usage(char ** argv) {
-    printf("Usage: %s [mode] [-o <op,..>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>] [--list-ops] [--show-coverage]\n", argv[0]);
+    printf("Usage: %s [mode] [-o <op,..>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>] [--list-ops]", argv[0]);
+    printf(" [--show-coverage] [--test-json <path>]\n");
     printf("    valid modes:\n");
     printf("      - test (default, compare with CPU backend for correctness)\n");
     printf("      - grad (compare gradients from backpropagation with method of finite differences)\n");
@@ -8949,6 +9076,7 @@ static void usage(char ** argv) {
     printf("    --output specifies output format (default: console, options: console, sql, csv)\n");
     printf("    --list-ops lists all available GGML operations\n");
     printf("    --show-coverage shows test coverage\n");
+    printf("    --test-json reads test operators from a json\n");
 }
 
 int main(int argc, char ** argv) {
@@ -8957,6 +9085,7 @@ int main(int argc, char ** argv) {
     const char * op_names_filter = nullptr;
     const char * backend_filter = nullptr;
     const char * params_filter = nullptr;
+    const char * test_json_path = nullptr;
 
     for (int i = 1; i < argc; i++) {
         if (strcmp(argv[i], "test") == 0) {
@@ -9004,6 +9133,13 @@ int main(int argc, char ** argv) {
         } else if (strcmp(argv[i], "--show-coverage") == 0) {
             show_test_coverage();
             return 0;
+        } else if (strcmp(argv[i], "--test-json") == 0) {
+            if (i + 1 < argc) {
+                test_json_path = argv[++i];
+            } else {
+                usage(argv);
+                return 1;
+            }
         } else {
             usage(argv);
             return 1;
@@ -9056,7 +9192,7 @@ int main(int argc, char ** argv) {
                                                              false, "", ggml_backend_dev_description(dev),
                                                              total / 1024 / 1024, free / 1024 / 1024, true));
 
-        bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get());
+        bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get(), test_json_path);
 
         if (ok) {
             n_ok++;

From ff557ead958520adb417a858f3afc25d6dcd624e Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Tue, 24 Feb 2026 11:04:16 +0100
Subject: [PATCH 02/18] add error threshold based on op

---
 tests/test-backend-ops.cpp | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 6b0d8277cd0..6ef5b1c90b7 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -6728,7 +6728,27 @@ struct test_generic_op : public test_case {
         return out;
     }
 
-
+    double max_nmse_err() override {
+        switch (op) {
+        case GGML_OP_MUL_MAT:
+        case GGML_OP_MUL_MAT_ID:
+        case GGML_OP_OUT_PROD:
+        case GGML_OP_FLASH_ATTN_EXT:
+        case GGML_OP_CONV_TRANSPOSE_2D:
+        case GGML_OP_IM2COL:
+        case GGML_OP_CONV_2D:
+        case GGML_OP_CONV_3D:
+        case GGML_OP_SET_ROWS:
+        case GGML_OP_CPY:
+            return 5e-4;
+        case GGML_OP_SOFT_MAX:
+            return 1e-6;
+        case GGML_OP_RWKV_WKV7:
+            return 5e-3;
+        default:
+            return 1e-7;
+        }
+    }
 };
 
 

From e77c43e5dff6146e2c30efbd22d6698adb2a6d8a Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Tue, 24 Feb 2026 11:16:34 +0100
Subject: [PATCH 03/18] add error when file cannot be read

---
 tests/test-backend-ops.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 6ef5b1c90b7..9e92e2de67d 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -8838,6 +8838,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
 
 static std::vector<std::unique_ptr<test_case>> make_test_cases_from_json(const char * path) {
     std::ifstream f(path);
+
+    if (!f.is_open()) {
+        throw std::runtime_error("Unable to read JSON file");
+    }
+
     nlohmann::json data = nlohmann::json::parse(f);
 
     GGML_ASSERT(data.is_array());

From c2450b5ac1349db45e0d10c6e6659dd9134549a6 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Tue, 24 Feb 2026 15:05:08 +0100
Subject: [PATCH 04/18] add graph operator json extraction tool

---
 common/arg.cpp                              |   3 +-
 common/common.h                             |   1 +
 tools/CMakeLists.txt                        |   1 +
 tools/export-graph-ops/CMakeLists.txt       |   9 ++
 tools/export-graph-ops/export-graph-ops.cpp | 147 ++++++++++++++++++++
 5 files changed, 160 insertions(+), 1 deletion(-)
 create mode 100644 tools/export-graph-ops/CMakeLists.txt
 create mode 100644 tools/export-graph-ops/export-graph-ops.cpp

diff --git a/common/arg.cpp b/common/arg.cpp
index 41da8563d63..24c0efc5cd8 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2666,7 +2666,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.out_file = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_RESULTS}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE,
+	    	    LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_JSON}));
     add_opt(common_arg(
         {"-ofreq", "--output-frequency"}, "N",
         string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
diff --git a/common/common.h b/common/common.h
index ffaeefd7c94..9f51d9629c8 100644
--- a/common/common.h
+++ b/common/common.h
@@ -105,6 +105,7 @@ enum llama_example {
     LLAMA_EXAMPLE_FINETUNE,
     LLAMA_EXAMPLE_FIT_PARAMS,
     LLAMA_EXAMPLE_RESULTS,
+    LLAMA_EXAMPLE_EXPORT_GRAPH_JSON,
 
     LLAMA_EXAMPLE_COUNT,
 };
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index b433c91d85e..849bdbafc42 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -39,4 +39,5 @@ else()
     endif()
     add_subdirectory(fit-params)
     add_subdirectory(results)
+    add_subdirectory(export-graph-ops)
 endif()
diff --git a/tools/export-graph-ops/CMakeLists.txt b/tools/export-graph-ops/CMakeLists.txt
new file mode 100644
index 00000000000..32b61376d93
--- /dev/null
+++ b/tools/export-graph-ops/CMakeLists.txt
@@ -0,0 +1,9 @@
+set(TARGET llama-export-graph-ops)
+add_executable(${TARGET} export-graph-ops.cpp)
+target_include_directories(${TARGET} PRIVATE ../../src)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} RUNTIME)
+endif()
\ No newline at end of file
diff --git a/tools/export-graph-ops/export-graph-ops.cpp b/tools/export-graph-ops/export-graph-ops.cpp
new file mode 100644
index 00000000000..9d65b91d573
--- /dev/null
+++ b/tools/export-graph-ops/export-graph-ops.cpp
@@ -0,0 +1,147 @@
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+#include "llama-context.h"
+#include "llama-graph.h"
+#include "ggml.h"
+
+#include "nlohmann/json.hpp"
+
+#include <array>
+#include <vector>
+#include <set>
+#include <fstream>
+#include <iostream>
+
+struct input_tensor {
+    ggml_type type;
+    std::array<int64_t, 4> ne;
+
+    input_tensor(ggml_type type, int64_t * ne): type(type) {
+        memcpy(this->ne.data(), ne, 4 * sizeof(int64_t));
+    }
+
+    bool operator<(const input_tensor &b) const {
+        return std::tie(type, ne) <
+               std::tie(b.type, b.ne);
+    }
+};
+
+struct test_object {
+    ggml_op op;
+    ggml_type type;
+    std::array<int64_t, 4> ne;
+    std::vector<int32_t> op_params;
+    std::vector<input_tensor> sources;
+
+    nlohmann::json to_json() const {
+        nlohmann::json test;
+
+        test["op"] = op;
+        test["op_name"] = ggml_op_name(op);
+
+        test["type"] = type;
+        test["type_name"] = ggml_type_name(type);
+
+        test["ne"] = { ne[0], ne[1], ne[2], ne[3] };
+
+        test["op_params"] = op_params;
+
+        nlohmann::json j_sources = nlohmann::json::array();
+        for (size_t s = 0; s < sources.size(); s++) {
+            j_sources.push_back({
+                {"type", sources[s].type},
+                {"type_name", ggml_type_name(sources[s].type)},
+                {"ne", { sources[s].ne[0], sources[s].ne[1], sources[s].ne[2], sources[s].ne[3] }},
+            });
+        }
+
+        test["sources"] = j_sources;
+
+        return test;
+    }
+
+    bool operator<(const test_object &b) const {
+        return std::tie(op, type, ne, op_params, sources) <
+               std::tie(b.op, b.type, b.ne, b.op_params, b.sources);
+    }
+};
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_JSON)) {
+        return 1;
+    }
+
+    common_init();
+
+    // Load CPU-only
+    ggml_backend_dev_t cpu_device = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    params.devices = { cpu_device, nullptr };
+    params.fit_params = false;
+    params.n_gpu_layers = 0;
+
+    params.warmup = false;
+
+    auto init_result = common_init_from_params(params);
+
+    llama_context * ctx = init_result->context();
+    auto * cgraph = ctx->get_gf_res_reserve()->get_gf();
+
+    std::set<test_object> tests;
+
+    int n_nodes = ggml_graph_n_nodes(cgraph);
+    int n_skipped = 0;
+    for (int i = 0; i < n_nodes; i++) {
+        ggml_tensor * node = ggml_graph_node(cgraph, i);
+
+        if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE) {
+            n_skipped++;
+            continue;
+        }
+
+        test_object test;
+
+        test.op = node->op;
+        test.type = node->type;
+        memcpy(&test.ne, node->ne, 4 * sizeof(int64_t));
+
+        test.op_params.resize(GGML_MAX_OP_PARAMS / sizeof(int32_t));
+        memcpy(test.op_params.data(), node->op_params, GGML_MAX_OP_PARAMS);
+
+        for (size_t s = 0; s < GGML_MAX_SRC; s++) {
+            if (node->src[s] == nullptr) {
+                break;
+            }
+
+            test.sources.emplace_back(node->src[s]->type, node->src[s]->ne);
+        }
+
+        tests.insert(test);
+    }
+
+    LOG_INF("%d unique ops extracted, %d total nodes, %d skipped (view ops)\n",
+            (int) tests.size(), n_nodes, n_skipped);
+
+    nlohmann::json output_list = nlohmann::json::array();
+
+    for (const auto& test : tests) {
+        output_list.push_back(test.to_json());
+    }
+
+    if (!params.out_file.empty()) {
+        std::ofstream f(params.out_file);
+
+        if (!f.is_open()) {
+            throw std::runtime_error("Unable to open output file");
+        }
+
+        f << output_list.dump(2) << std::endl;
+    } else {
+        std::cout << output_list.dump(2) << std::endl;
+    }
+
+    return 0;
+}
\ No newline at end of file

From a9dc28ee51f1a7c6958fa08cb1c78c76562d6e60 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Tue, 24 Feb 2026 22:05:54 +0100
Subject: [PATCH 05/18] add nb parameter for non-contiguous input tensors

---
 tests/test-backend-ops.cpp                  | 114 +++++++++++++++++++-
 tools/export-graph-ops/export-graph-ops.cpp |  92 ++++++++++------
 2 files changed, 172 insertions(+), 34 deletions(-)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 9e92e2de67d..3bc4aa44c23 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -6660,6 +6660,7 @@ struct test_diag : public test_case {
 struct input_tensor {
     ggml_type type;
     std::array<int64_t, 4> ne;
+    std::array<size_t, 4> nb; // strides (0 = use default contiguous strides)
 };
 
 static std::string var_to_str(const std::vector<input_tensor>& sources) {
@@ -6668,6 +6669,9 @@ static std::string var_to_str(const std::vector<input_tensor>& sources) {
     for (const auto& src : sources) {
         if (!first) oss << ",";
         oss << ggml_type_name(src.type) << "[" << src.ne[0] << "," << src.ne[1] << "," << src.ne[2] << "," << src.ne[3] << "]";
+        if (src.nb[0] != 0) {
+            oss << "nb[" << src.nb[0] << "," << src.nb[1] << "," << src.nb[2] << "," << src.nb[3] << "]";
+        }
         first = false;
     }
     return oss.str();
@@ -6713,7 +6717,35 @@ struct test_generic_op : public test_case {
         std::array<ggml_tensor *, GGML_MAX_SRC> source_tensors;
         for (size_t i = 0; i < source_count; ++i) {
             const input_tensor& src = sources[i];
-            source_tensors[i] = ggml_new_tensor_4d(ctx, src.type, src.ne[0], src.ne[1], src.ne[2], src.ne[3]);
+            bool has_nb = src.nb[0] != 0;
+
+            if (has_nb) {
+                // Compute the total buffer size using the same method as ggml_nbytes
+                size_t total_size;
+                const size_t blck_size = ggml_blck_size(src.type);
+                if (blck_size == 1) {
+                    total_size = ggml_type_size(src.type);
+                    for (int d = 0; d < 4; d++) {
+                        total_size += (src.ne[d] - 1) * src.nb[d];
+                    }
+                } else {
+                    total_size = src.ne[0] * src.nb[0] / blck_size;
+                    for (int d = 1; d < 4; d++) {
+                        total_size += (src.ne[d] - 1) * src.nb[d];
+                    }
+                }
+
+                // Convert bytes to elements, padded to block size for quantized types
+                const size_t type_size = ggml_type_size(src.type);
+                size_t backing_elements = (total_size * blck_size + type_size - 1) / type_size;
+                backing_elements = ((backing_elements + blck_size - 1) / blck_size) * blck_size;
+                ggml_tensor * backing = ggml_new_tensor_1d(ctx, src.type, backing_elements);
+                source_tensors[i] = ggml_view_4d(ctx, backing,
+                    src.ne[0], src.ne[1], src.ne[2], src.ne[3],
+                    src.nb[1], src.nb[2], src.nb[3], 0);
+            } else {
+                source_tensors[i] = ggml_new_tensor_4d(ctx, src.type, src.ne[0], src.ne[1], src.ne[2], src.ne[3]);
+            }
         }
 
         ggml_tensor * out = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
@@ -6733,7 +6765,6 @@ struct test_generic_op : public test_case {
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
         case GGML_OP_OUT_PROD:
-        case GGML_OP_FLASH_ATTN_EXT:
         case GGML_OP_CONV_TRANSPOSE_2D:
         case GGML_OP_IM2COL:
         case GGML_OP_CONV_2D:
@@ -6745,10 +6776,82 @@ struct test_generic_op : public test_case {
             return 1e-6;
         case GGML_OP_RWKV_WKV7:
             return 5e-3;
+        case GGML_OP_FLASH_ATTN_EXT:
+        {
+            // Scale error with kv length to account for accumulating floating point error
+            const int64_t kv = sources[1].ne[1];
+            return 5e-4 * std::max(1.0, kv / 20000.0);
+        }
         default:
             return 1e-7;
         }
     }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        ggml_tensor * out = ggml_get_tensor(ctx, "out");
+
+        for (size_t i = 0; i < sources.size() && i < GGML_MAX_SRC; i++) {
+            ggml_tensor * t = out->src[i];
+            if (!t) {
+                break;
+            }
+
+            // FLASH_ATTN_EXT: src[3] is the KQ mask
+            if (op == GGML_OP_FLASH_ATTN_EXT && i == 3) {
+                init_tensor_kq_mask(t);
+                continue;
+            }
+
+            if ((t->type == GGML_TYPE_I32 || t->type == GGML_TYPE_I64) && !ggml_is_view_op(t->op)) {
+                if (op == GGML_OP_GET_ROWS || op == GGML_OP_GET_ROWS_BACK) {
+                    const int64_t num_rows = sources[0].ne[1];
+                    const int64_t nels = ggml_nelements(t);
+                    std::vector<int32_t> data(nels);
+                    for (int64_t i = 0; i < nels; i++) {
+                        data[i] = rand() % num_rows;
+                    }
+                    ggml_backend_tensor_set(t, data.data(), 0, nels * sizeof(int32_t));
+                } else if (op == GGML_OP_SET_ROWS) {
+                    init_set_rows_row_ids(t, ne[1]);
+                } else if (op == GGML_OP_ROPE) {
+                    const int mode = op_params[2];
+                    const int64_t nels = (mode & GGML_ROPE_TYPE_MROPE) ? ne[2] * 4 : ne[2];
+                    std::vector<int32_t> data(nels);
+                    for (int64_t i = 0; i < nels; i++) {
+                        data[i] = rand() % ne[2];
+                    }
+                    ggml_backend_tensor_set(t, data.data(), 0, nels * sizeof(int32_t));
+                } else if (op == GGML_OP_MUL_MAT_ID || op == GGML_OP_ADD_ID) {
+                    const int64_t n_expert = (op == GGML_OP_MUL_MAT_ID) ? sources[0].ne[2] : sources[1].ne[1];
+                    std::random_device rd;
+                    std::default_random_engine rng(rd());
+                    for (int64_t r = 0; r < ggml_nrows(t); r++) {
+                        std::vector<int32_t> data(t->ne[0]);
+                        for (int32_t i = 0; i < t->ne[0]; i++) {
+                            data[i] = i % n_expert;
+                        }
+                        std::shuffle(data.begin(), data.end(), rng);
+                        ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
+                    }
+                } else if (op == GGML_OP_SSM_SCAN) {
+                    std::random_device rd;
+                    std::default_random_engine rng(rd());
+                    for (int64_t r = 0; r < ggml_nrows(t); r++) {
+                        std::vector<int32_t> data(t->ne[0]);
+                        for (int32_t i = 0; i < t->ne[0]; i++) {
+                            data[i] = i;
+                        }
+                        std::shuffle(data.begin(), data.end(), rng);
+                        ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
+                    }
+                } else {
+                    init_tensor_uniform(t);
+                }
+            } else {
+                init_tensor_uniform(t);
+            }
+        }
+    }
 };
 
 
@@ -8865,7 +8968,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_from_json(const c
         for (const auto& src : input_case["sources"]) {
             auto ne_arr = src["ne"];
             const std::array<int64_t, 4> src_ne = {ne_arr[0], ne_arr[1], ne_arr[2], ne_arr[3]};
-            sources.push_back({(ggml_type)src["type"], src_ne});
+            std::array<size_t, 4> src_nb = {};
+            if (src.contains("nb")) {
+                auto nb_arr = src["nb"];
+                src_nb = {nb_arr[0], nb_arr[1], nb_arr[2], nb_arr[3]};
+            }
+            sources.push_back({(ggml_type)src["type"], src_ne, src_nb});
         }
 
         test_cases.emplace_back(new test_generic_op(op, type, ne, op_params, sources));
diff --git a/tools/export-graph-ops/export-graph-ops.cpp b/tools/export-graph-ops/export-graph-ops.cpp
index 9d65b91d573..3fa08fec3ea 100644
--- a/tools/export-graph-ops/export-graph-ops.cpp
+++ b/tools/export-graph-ops/export-graph-ops.cpp
@@ -4,6 +4,7 @@
 #include "llama.h"
 #include "llama-context.h"
 #include "llama-graph.h"
+#include "llama-memory.h"
 #include "ggml.h"
 
 #include "nlohmann/json.hpp"
@@ -17,14 +18,16 @@
 struct input_tensor {
     ggml_type type;
     std::array<int64_t, 4> ne;
+    std::array<size_t, 4> nb;
 
-    input_tensor(ggml_type type, int64_t * ne): type(type) {
+    input_tensor(ggml_type type, int64_t * ne, size_t * nb): type(type) {
         memcpy(this->ne.data(), ne, 4 * sizeof(int64_t));
+        memcpy(this->nb.data(), nb, 4 * sizeof(size_t));
     }
 
     bool operator<(const input_tensor &b) const {
-        return std::tie(type, ne) <
-               std::tie(b.type, b.ne);
+        return std::tie(type, ne, nb) <
+               std::tie(b.type, b.ne, b.nb);
     }
 };
 
@@ -54,6 +57,7 @@ struct test_object {
                 {"type", sources[s].type},
                 {"type_name", ggml_type_name(sources[s].type)},
                 {"ne", { sources[s].ne[0], sources[s].ne[1], sources[s].ne[2], sources[s].ne[3] }},
+                {"nb", { sources[s].nb[0], sources[s].nb[1], sources[s].nb[2], sources[s].nb[3] }},
             });
         }
 
@@ -68,32 +72,10 @@ struct test_object {
     }
 };
 
-int main(int argc, char ** argv) {
-    common_params params;
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_JSON)) {
-        return 1;
-    }
-
-    common_init();
-
-    // Load CPU-only
-    ggml_backend_dev_t cpu_device = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    params.devices = { cpu_device, nullptr };
-    params.fit_params = false;
-    params.n_gpu_layers = 0;
-
-    params.warmup = false;
-
-    auto init_result = common_init_from_params(params);
-
-    llama_context * ctx = init_result->context();
-    auto * cgraph = ctx->get_gf_res_reserve()->get_gf();
-
-    std::set<test_object> tests;
-
+static void extract_graph_ops(ggml_cgraph * cgraph, const char * label, std::set<test_object> & tests) {
     int n_nodes = ggml_graph_n_nodes(cgraph);
     int n_skipped = 0;
+    int n_before = (int) tests.size();
     for (int i = 0; i < n_nodes; i++) {
         ggml_tensor * node = ggml_graph_node(cgraph, i);
 
@@ -116,17 +98,65 @@ int main(int argc, char ** argv) {
                 break;
             }
 
-            test.sources.emplace_back(node->src[s]->type, node->src[s]->ne);
+            test.sources.emplace_back(node->src[s]->type, node->src[s]->ne, node->src[s]->nb);
         }
 
         tests.insert(test);
     }
 
-    LOG_INF("%d unique ops extracted, %d total nodes, %d skipped (view ops)\n",
-            (int) tests.size(), n_nodes, n_skipped);
+    int n_new = (int) tests.size() - n_before;
+    LOG_INF("%s: %d unique ops, %d total nodes, %d skipped (view ops)\n",
+            label, n_new, n_nodes, n_skipped);
+}
 
-    nlohmann::json output_list = nlohmann::json::array();
+int main(int argc, char ** argv) {
+    common_params params;
 
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_JSON)) {
+        return 1;
+    }
+
+    common_init();
+
+    // Load CPU-only
+    ggml_backend_dev_t cpu_device = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    params.devices = { cpu_device, nullptr };
+    params.fit_params = false;
+    params.n_gpu_layers = 0;
+
+    params.warmup = false;
+
+    auto init_result = common_init_from_params(params);
+
+    llama_context * ctx = init_result->context();
+
+    const auto & cparams = ctx->get_cparams();
+    const uint32_t n_seqs  = cparams.n_seq_max;
+    const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+
+    llama_memory_context_ptr mctx;
+    auto * memory = ctx->get_memory();
+    if (memory) {
+        mctx = memory->init_full();
+    }
+
+    std::set<test_object> tests;
+
+    auto * gf_pp = ctx->graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+    if (!gf_pp) {
+        throw std::runtime_error("failed to reserve prompt processing graph");
+    }
+    extract_graph_ops(gf_pp, "pp", tests);
+
+    auto * gf_tg = ctx->graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get());
+    if (!gf_tg) {
+        throw std::runtime_error("failed to reserve token generation graph");
+    }
+    extract_graph_ops(gf_tg, "tg", tests);
+
+    LOG_INF("%d unique ops total\n", (int) tests.size());
+
+    nlohmann::json output_list = nlohmann::json::array();
     for (const auto& test : tests) {
         output_list.push_back(test.to_json());
     }

From 0c7a5058ef3e32cd314c8bba644f710b16b95066 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Wed, 25 Feb 2026 07:06:20 +0100
Subject: [PATCH 06/18] fix view check

---
 tests/test-backend-ops.cpp | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 3bc4aa44c23..f15c0fb15b2 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -424,10 +424,6 @@ static std::string var_to_str(ggml_scale_mode mode) {
     return str;
 }
 
-static std::string var_to_str(ggml_op op) {
-    return ggml_op_name(op);
-}
-
 #define VAR_TO_STR(x) (#x "=" + var_to_str(x))
 
 #define VARS_TO_STR1(a) VAR_TO_STR(a)
@@ -6702,7 +6698,7 @@ struct test_generic_op : public test_case {
     const std::vector<input_tensor> sources;
 
     std::string vars() override {
-        return VARS_TO_STR5(op, type, ne, op_params, sources);
+        return VARS_TO_STR4(type, ne, op_params, sources);
     }
 
     test_generic_op(ggml_op op, ggml_type type, std::array<int64_t, 4> ne,
@@ -6802,7 +6798,7 @@ struct test_generic_op : public test_case {
                 continue;
             }
 
-            if ((t->type == GGML_TYPE_I32 || t->type == GGML_TYPE_I64) && !ggml_is_view_op(t->op)) {
+            if (t->type == GGML_TYPE_I32 || t->type == GGML_TYPE_I64) {
                 if (op == GGML_OP_GET_ROWS || op == GGML_OP_GET_ROWS_BACK) {
                     const int64_t num_rows = sources[0].ne[1];
                     const int64_t nels = ggml_nelements(t);

From 99e62a629d377f789a8d85180608c1fca7da85cf Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Wed, 25 Feb 2026 09:20:47 +0100
Subject: [PATCH 07/18] only use view if non-contiguous/permuted, use C++
 random instead of rand()

---
 tests/test-backend-ops.cpp                  | 29 +++++++++++++++------
 tools/export-graph-ops/CMakeLists.txt       |  2 +-
 tools/export-graph-ops/export-graph-ops.cpp |  2 +-
 3 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index f15c0fb15b2..e5db0a0f68f 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -6713,9 +6713,21 @@ struct test_generic_op : public test_case {
         std::array<ggml_tensor *, GGML_MAX_SRC> source_tensors;
         for (size_t i = 0; i < source_count; ++i) {
             const input_tensor& src = sources[i];
-            bool has_nb = src.nb[0] != 0;
 
-            if (has_nb) {
+            // Check if the exported strides differ from default contiguous layout.
+            bool needs_view = false;
+            if (src.nb[0] != 0) {
+                const size_t default_nb0 = ggml_type_size(src.type);
+                const size_t default_nb1 = default_nb0 * (src.ne[0] / ggml_blck_size(src.type));
+                const size_t default_nb2 = default_nb1 * src.ne[1];
+                const size_t default_nb3 = default_nb2 * src.ne[2];
+                needs_view = (src.nb[0] != default_nb0 ||
+                              src.nb[1] != default_nb1 ||
+                              src.nb[2] != default_nb2 ||
+                              src.nb[3] != default_nb3);
+            }
+
+            if (needs_view) {
                 // Compute the total buffer size using the same method as ggml_nbytes
                 size_t total_size;
                 const size_t blck_size = ggml_blck_size(src.type);
@@ -6786,6 +6798,9 @@ struct test_generic_op : public test_case {
     void initialize_tensors(ggml_context * ctx) override {
         ggml_tensor * out = ggml_get_tensor(ctx, "out");
 
+        std::random_device rd;
+        std::default_random_engine rng(rd());
+
         for (size_t i = 0; i < sources.size() && i < GGML_MAX_SRC; i++) {
             ggml_tensor * t = out->src[i];
             if (!t) {
@@ -6803,8 +6818,9 @@ struct test_generic_op : public test_case {
                     const int64_t num_rows = sources[0].ne[1];
                     const int64_t nels = ggml_nelements(t);
                     std::vector<int32_t> data(nels);
+                    std::uniform_int_distribution<int32_t> dist(0, num_rows - 1);
                     for (int64_t i = 0; i < nels; i++) {
-                        data[i] = rand() % num_rows;
+                        data[i] = dist(rng);
                     }
                     ggml_backend_tensor_set(t, data.data(), 0, nels * sizeof(int32_t));
                 } else if (op == GGML_OP_SET_ROWS) {
@@ -6813,14 +6829,13 @@ struct test_generic_op : public test_case {
                     const int mode = op_params[2];
                     const int64_t nels = (mode & GGML_ROPE_TYPE_MROPE) ? ne[2] * 4 : ne[2];
                     std::vector<int32_t> data(nels);
+                    std::uniform_int_distribution<int32_t> dist(0, ne[2] - 1);
                     for (int64_t i = 0; i < nels; i++) {
-                        data[i] = rand() % ne[2];
+                        data[i] = dist(rng);
                     }
                     ggml_backend_tensor_set(t, data.data(), 0, nels * sizeof(int32_t));
                 } else if (op == GGML_OP_MUL_MAT_ID || op == GGML_OP_ADD_ID) {
                     const int64_t n_expert = (op == GGML_OP_MUL_MAT_ID) ? sources[0].ne[2] : sources[1].ne[1];
-                    std::random_device rd;
-                    std::default_random_engine rng(rd());
                     for (int64_t r = 0; r < ggml_nrows(t); r++) {
                         std::vector<int32_t> data(t->ne[0]);
                         for (int32_t i = 0; i < t->ne[0]; i++) {
@@ -6830,8 +6845,6 @@ struct test_generic_op : public test_case {
                         ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
                     }
                 } else if (op == GGML_OP_SSM_SCAN) {
-                    std::random_device rd;
-                    std::default_random_engine rng(rd());
                     for (int64_t r = 0; r < ggml_nrows(t); r++) {
                         std::vector<int32_t> data(t->ne[0]);
                         for (int32_t i = 0; i < t->ne[0]; i++) {
diff --git a/tools/export-graph-ops/CMakeLists.txt b/tools/export-graph-ops/CMakeLists.txt
index 32b61376d93..785a3fa7788 100644
--- a/tools/export-graph-ops/CMakeLists.txt
+++ b/tools/export-graph-ops/CMakeLists.txt
@@ -6,4 +6,4 @@ target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 if(LLAMA_TOOLS_INSTALL)
     install(TARGETS ${TARGET} RUNTIME)
-endif()
\ No newline at end of file
+endif()
diff --git a/tools/export-graph-ops/export-graph-ops.cpp b/tools/export-graph-ops/export-graph-ops.cpp
index 3fa08fec3ea..782fd0bc2be 100644
--- a/tools/export-graph-ops/export-graph-ops.cpp
+++ b/tools/export-graph-ops/export-graph-ops.cpp
@@ -174,4 +174,4 @@ int main(int argc, char ** argv) {
     }
 
     return 0;
-}
\ No newline at end of file
+}

From a09708e56d173691369a0862f50769bc73143da4 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Wed, 25 Feb 2026 14:30:54 +0100
Subject: [PATCH 08/18] replace internal API calls with public
 llama_graph_reserve call

---
 include/llama.h                             |  7 +++++++
 src/llama-context.cpp                       | 13 +++++++++++++
 tools/export-graph-ops/CMakeLists.txt       |  1 -
 tools/export-graph-ops/export-graph-ops.cpp | 18 ++++--------------
 4 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/include/llama.h b/include/llama.h
index 0bd10294cb8..594388282b9 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -629,6 +629,13 @@ extern "C" {
             const char * fname_out,
             const llama_model_quantize_params * params);
 
+    // Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
+    LLAMA_API struct ggml_cgraph * llama_graph_reserve(
+            struct llama_context * ctx,
+            uint32_t n_tokens,
+            uint32_t n_seqs,
+            uint32_t n_outputs);
+
     //
     // Adapters
     //
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index ee2669c154e..7ccbb502acc 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -3084,6 +3084,19 @@ uint32_t llama_get_sampled_probs_count_ith(llama_context * ctx, int32_t i) {
     return static_cast<uint32_t>(ctx->get_sampled_probs_count(i));
 }
 
+struct ggml_cgraph * llama_graph_reserve(
+        struct llama_context * ctx,
+        uint32_t n_tokens,
+        uint32_t n_seqs,
+        uint32_t n_outputs) {
+    auto * memory = ctx->get_memory();
+    llama_memory_context_ptr mctx;
+    if (memory) {
+        mctx = memory->init_full();
+    }
+    return ctx->graph_reserve(n_tokens, n_seqs, n_outputs, mctx.get());
+}
+
 // llama adapter API
 
 int32_t llama_set_adapters_lora(
diff --git a/tools/export-graph-ops/CMakeLists.txt b/tools/export-graph-ops/CMakeLists.txt
index 785a3fa7788..95d2ac891f2 100644
--- a/tools/export-graph-ops/CMakeLists.txt
+++ b/tools/export-graph-ops/CMakeLists.txt
@@ -1,6 +1,5 @@
 set(TARGET llama-export-graph-ops)
 add_executable(${TARGET} export-graph-ops.cpp)
-target_include_directories(${TARGET} PRIVATE ../../src)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
diff --git a/tools/export-graph-ops/export-graph-ops.cpp b/tools/export-graph-ops/export-graph-ops.cpp
index 782fd0bc2be..e1aec71c0f3 100644
--- a/tools/export-graph-ops/export-graph-ops.cpp
+++ b/tools/export-graph-ops/export-graph-ops.cpp
@@ -2,9 +2,6 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
-#include "llama-context.h"
-#include "llama-graph.h"
-#include "llama-memory.h"
 #include "ggml.h"
 
 #include "nlohmann/json.hpp"
@@ -130,25 +127,18 @@ int main(int argc, char ** argv) {
 
     llama_context * ctx = init_result->context();
 
-    const auto & cparams = ctx->get_cparams();
-    const uint32_t n_seqs  = cparams.n_seq_max;
-    const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
-    llama_memory_context_ptr mctx;
-    auto * memory = ctx->get_memory();
-    if (memory) {
-        mctx = memory->init_full();
-    }
+    const uint32_t n_seqs  = llama_n_seq_max(ctx);
+    const uint32_t n_tokens = std::min(llama_n_ctx(ctx), llama_n_ubatch(ctx));
 
     std::set<test_object> tests;
 
-    auto * gf_pp = ctx->graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+    auto * gf_pp = llama_graph_reserve(ctx, n_tokens, n_seqs, n_tokens);
     if (!gf_pp) {
         throw std::runtime_error("failed to reserve prompt processing graph");
     }
     extract_graph_ops(gf_pp, "pp", tests);
 
-    auto * gf_tg = ctx->graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get());
+    auto * gf_tg = llama_graph_reserve(ctx, n_seqs, n_seqs, n_seqs);
     if (!gf_tg) {
         throw std::runtime_error("failed to reserve token generation graph");
     }

From c80b146178c5a5e03b7298a35c2c489fcd891055 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Wed, 25 Feb 2026 14:54:26 +0100
Subject: [PATCH 09/18] reduce test description length

---
 tests/test-backend-ops.cpp | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index e5db0a0f68f..9ca9a9464fd 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -6659,13 +6659,27 @@ struct input_tensor {
     std::array<size_t, 4> nb; // strides (0 = use default contiguous strides)
 };
 
+static bool is_non_contiguous(const input_tensor & src) {
+    if (src.nb[0] == 0) {
+        return false;
+    }
+    const size_t default_nb0 = ggml_type_size(src.type);
+    const size_t default_nb1 = default_nb0 * (src.ne[0] / ggml_blck_size(src.type));
+    const size_t default_nb2 = default_nb1 * src.ne[1];
+    const size_t default_nb3 = default_nb2 * src.ne[2];
+    return src.nb[0] != default_nb0 ||
+           src.nb[1] != default_nb1 ||
+           src.nb[2] != default_nb2 ||
+           src.nb[3] != default_nb3;
+}
+
 static std::string var_to_str(const std::vector<input_tensor>& sources) {
     std::ostringstream oss;
     bool first = true;
     for (const auto& src : sources) {
         if (!first) oss << ",";
         oss << ggml_type_name(src.type) << "[" << src.ne[0] << "," << src.ne[1] << "," << src.ne[2] << "," << src.ne[3] << "]";
-        if (src.nb[0] != 0) {
+        if (is_non_contiguous(src)) {
             oss << "nb[" << src.nb[0] << "," << src.nb[1] << "," << src.nb[2] << "," << src.nb[3] << "]";
         }
         first = false;
@@ -6714,20 +6728,7 @@ struct test_generic_op : public test_case {
         for (size_t i = 0; i < source_count; ++i) {
             const input_tensor& src = sources[i];
 
-            // Check if the exported strides differ from default contiguous layout.
-            bool needs_view = false;
-            if (src.nb[0] != 0) {
-                const size_t default_nb0 = ggml_type_size(src.type);
-                const size_t default_nb1 = default_nb0 * (src.ne[0] / ggml_blck_size(src.type));
-                const size_t default_nb2 = default_nb1 * src.ne[1];
-                const size_t default_nb3 = default_nb2 * src.ne[2];
-                needs_view = (src.nb[0] != default_nb0 ||
-                              src.nb[1] != default_nb1 ||
-                              src.nb[2] != default_nb2 ||
-                              src.nb[3] != default_nb3);
-            }
-
-            if (needs_view) {
+            if (is_non_contiguous(src)) {
                 // Compute the total buffer size using the same method as ggml_nbytes
                 size_t total_size;
                 const size_t blck_size = ggml_blck_size(src.type);

From 323f01b316047f5bc5a93045185b2f640d710e6b Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Wed, 25 Feb 2026 15:24:30 +0100
Subject: [PATCH 10/18] fix nb[0] not getting set for view

---
 tests/test-backend-ops.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 9ca9a9464fd..9f16bee3efc 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -6720,7 +6720,6 @@ struct test_generic_op : public test_case {
                     std::vector<input_tensor> sources)
         : op(op), type(type), ne(ne), op_params(op_params), sources(sources) {}
 
-    // Define how a simple GGML compute graph can be constructed for the new GGML op.
     ggml_tensor * build_graph(ggml_context * ctx) override {
         const size_t source_count = std::min(sources.size(), (size_t)GGML_MAX_SRC);
 
@@ -6729,7 +6728,6 @@ struct test_generic_op : public test_case {
             const input_tensor& src = sources[i];
 
             if (is_non_contiguous(src)) {
-                // Compute the total buffer size using the same method as ggml_nbytes
                 size_t total_size;
                 const size_t blck_size = ggml_blck_size(src.type);
                 if (blck_size == 1) {
@@ -6752,6 +6750,8 @@ struct test_generic_op : public test_case {
                 source_tensors[i] = ggml_view_4d(ctx, backing,
                     src.ne[0], src.ne[1], src.ne[2], src.ne[3],
                     src.nb[1], src.nb[2], src.nb[3], 0);
+                // nb[0] does not get set by view_4d, so set it manually
+                source_tensors[i]->nb[0] = src.nb[0];
             } else {
                 source_tensors[i] = ggml_new_tensor_4d(ctx, src.type, src.ne[0], src.ne[1], src.ne[2], src.ne[3]);
             }

From 6e388759a4c38e420bb3db4b527b1df224e922e7 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Wed, 25 Feb 2026 15:54:27 +0100
Subject: [PATCH 11/18] add name to tests

---
 tests/test-backend-ops.cpp                  | 18 ++++++++++++++----
 tools/export-graph-ops/export-graph-ops.cpp |  6 ++++++
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 9f16bee3efc..6103c6db0f7 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -6710,15 +6710,20 @@ struct test_generic_op : public test_case {
     const std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params;
 
     const std::vector<input_tensor> sources;
+    const std::string name;
 
     std::string vars() override {
-        return VARS_TO_STR4(type, ne, op_params, sources);
+        if (name.empty()) {
+            return VARS_TO_STR4(type, ne, op_params, sources);
+        }
+
+        return VARS_TO_STR5(name, type, ne, op_params, sources);
     }
 
     test_generic_op(ggml_op op, ggml_type type, std::array<int64_t, 4> ne,
                     std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params,
-                    std::vector<input_tensor> sources)
-        : op(op), type(type), ne(ne), op_params(op_params), sources(sources) {}
+                    std::vector<input_tensor> sources, std::string name = "")
+        : op(op), type(type), ne(ne), op_params(op_params), sources(sources), name(std::move(name)) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         const size_t source_count = std::min(sources.size(), (size_t)GGML_MAX_SRC);
@@ -8986,7 +8991,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_from_json(const c
             sources.push_back({(ggml_type)src["type"], src_ne, src_nb});
         }
 
-        test_cases.emplace_back(new test_generic_op(op, type, ne, op_params, sources));
+        std::string name;
+        if (input_case.contains("name")) {
+            name = input_case["name"];
+        }
+
+        test_cases.emplace_back(new test_generic_op(op, type, ne, op_params, sources, std::move(name)));
     }
 
     return test_cases;
diff --git a/tools/export-graph-ops/export-graph-ops.cpp b/tools/export-graph-ops/export-graph-ops.cpp
index e1aec71c0f3..5b836cf5b82 100644
--- a/tools/export-graph-ops/export-graph-ops.cpp
+++ b/tools/export-graph-ops/export-graph-ops.cpp
@@ -34,6 +34,7 @@ struct test_object {
     std::array<int64_t, 4> ne;
     std::vector<int32_t> op_params;
     std::vector<input_tensor> sources;
+    std::string name;
 
     nlohmann::json to_json() const {
         nlohmann::json test;
@@ -48,6 +49,10 @@ struct test_object {
 
         test["op_params"] = op_params;
 
+        if (!name.empty()) {
+            test["name"] = name;
+        }
+
         nlohmann::json j_sources = nlohmann::json::array();
         for (size_t s = 0; s < sources.size(); s++) {
             j_sources.push_back({
@@ -98,6 +103,7 @@ static void extract_graph_ops(ggml_cgraph * cgraph, const char * label, std::set
             test.sources.emplace_back(node->src[s]->type, node->src[s]->ne, node->src[s]->nb);
         }
 
+        test.name = node->name;
         tests.insert(test);
     }
 

From 91634e684f866797681b802fa2ab31b81afd93eb Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Sat, 28 Feb 2026 11:41:10 +0100
Subject: [PATCH 12/18] fix inplace error

---
 tests/test-backend-ops.cpp | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 6103c6db0f7..04d70be7780 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -6762,7 +6762,20 @@ struct test_generic_op : public test_case {
             }
         }
 
-        ggml_tensor * out = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
+        // Ops with an inplace flag create a view of src[0] as their output.
+        bool inplace = false;
+        if (op == GGML_OP_SET || op == GGML_OP_ACC) {
+            inplace = op_params[4] != 0;
+        } else if (op == GGML_OP_ADD_REL_POS) {
+            inplace = op_params[0] != 0;
+        }
+
+        ggml_tensor * out;
+        if (inplace && source_count > 0) {
+            out = ggml_view_tensor(ctx, source_tensors[0]);
+        } else {
+            out = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
+        }
         out->op = op;
         for (size_t i = 0; i < source_count; ++i) {
             out->src[i] = source_tensors[i];

From 7feb7a7f67bb5945db8cbd842f4c9ef7f6b5a9e1 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Mon, 2 Mar 2026 13:56:14 +0100
Subject: [PATCH 13/18] use text file instead of json

---
 common/arg.cpp                              |  2 +-
 common/common.h                             |  2 +-
 tests/test-backend-ops.cpp                  | 88 ++++++++++++---------
 tools/export-graph-ops/export-graph-ops.cpp | 73 ++++++++---------
 4 files changed, 87 insertions(+), 78 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 24c0efc5cd8..deb647065c1 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2667,7 +2667,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.out_file = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE,
-	    	    LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_JSON}));
+	    	    LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS}));
     add_opt(common_arg(
         {"-ofreq", "--output-frequency"}, "N",
         string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
diff --git a/common/common.h b/common/common.h
index 9f51d9629c8..7df73903d99 100644
--- a/common/common.h
+++ b/common/common.h
@@ -105,7 +105,7 @@ enum llama_example {
     LLAMA_EXAMPLE_FINETUNE,
     LLAMA_EXAMPLE_FIT_PARAMS,
     LLAMA_EXAMPLE_RESULTS,
-    LLAMA_EXAMPLE_EXPORT_GRAPH_JSON,
+    LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,
 
     LLAMA_EXAMPLE_COUNT,
 };
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 04d70be7780..94f5d103307 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -43,8 +43,6 @@
 #include <vector>
 #include <unordered_map>
 
-#include <nlohmann/json.hpp>
-
 #ifdef __EMSCRIPTEN__
 #   define N_THREADS 1
 #else
@@ -8967,46 +8965,62 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     return test_cases;
 }
 
-static std::vector<std::unique_ptr<test_case>> make_test_cases_from_json(const char * path) {
+static std::vector<std::unique_ptr<test_case>> make_test_cases_from_file(const char * path) {
     std::ifstream f(path);
 
     if (!f.is_open()) {
-        throw std::runtime_error("Unable to read JSON file");
+        throw std::runtime_error("Unable to read test file");
     }
 
-    nlohmann::json data = nlohmann::json::parse(f);
-
-    GGML_ASSERT(data.is_array());
-
     std::vector<std::unique_ptr<test_case>> test_cases;
 
-    for (const auto& input_case : data) {
-        const ggml_op op = input_case["op"];
-        const ggml_type type = input_case["type"];
-        auto ne_arr = input_case["ne"];
-        const std::array<int64_t, 4> ne = {ne_arr[0], ne_arr[1], ne_arr[2], ne_arr[3]};
+    std::string line;
+
+    while (std::getline(f, line)) {
+        std::istringstream iss(line);
 
-        auto op_arr = input_case["op_params"];
+        ggml_op op;
+        ggml_type type;
+        std::array<int64_t, 4> ne;
         std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params = {};
-        for (size_t i = 0; i < op_arr.size() && i < op_params.size(); i++) {
-            op_params[i] = op_arr[i];
+        std::string name;
+        uint64_t tmp;
+
+        iss >> tmp;
+        op = (ggml_op)tmp;
+        iss >> tmp;
+        type = (ggml_type)tmp;
+
+        for (size_t i = 0; i < 4; i++) {
+            iss >> ne[i];
         }
 
-        std::vector<input_tensor> sources;
-        for (const auto& src : input_case["sources"]) {
-            auto ne_arr = src["ne"];
-            const std::array<int64_t, 4> src_ne = {ne_arr[0], ne_arr[1], ne_arr[2], ne_arr[3]};
-            std::array<size_t, 4> src_nb = {};
-            if (src.contains("nb")) {
-                auto nb_arr = src["nb"];
-                src_nb = {nb_arr[0], nb_arr[1], nb_arr[2], nb_arr[3]};
+        iss >> tmp;
+        for (size_t i = 0; i < tmp && i < op_params.size(); i++) {
+            iss >> op_params[i];
+        }
+
+        iss >> tmp;
+
+        size_t num_src = std::min((uint64_t)GGML_MAX_SRC, tmp);
+        std::vector<input_tensor> sources(num_src);
+        for (size_t i = 0; i < num_src; i++) {
+            input_tensor& src = sources[i];
+            iss >> tmp;
+            src.type = (ggml_type)tmp;
+
+            for (size_t i = 0; i < 4; i++) {
+                iss >> src.ne[i];
+            }
+            for (size_t i = 0; i < 4; i++) {
+                iss >> src.nb[i];
             }
-            sources.push_back({(ggml_type)src["type"], src_ne, src_nb});
         }
 
-        std::string name;
-        if (input_case.contains("name")) {
-            name = input_case["name"];
+        iss >> name;
+
+        if (name.length() == 1 && name[0] == '-') {
+            name = "";
         }
 
         test_cases.emplace_back(new test_generic_op(op, type, ne, op_params, sources, std::move(name)));
@@ -9016,7 +9030,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_from_json(const c
 }
 
 static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_names_filter, const char * params_filter,
-                         printer * output_printer, const char * test_json_path) {
+                         printer * output_printer, const char * test_file_path) {
     auto filter_test_cases = [](std::vector<std::unique_ptr<test_case>> & test_cases, const char * params_filter) {
         if (params_filter == nullptr) {
             return;
@@ -9036,7 +9050,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
 
     std::vector<std::unique_ptr<test_case>> test_cases;
 
-    if (test_json_path == nullptr) {
+    if (test_file_path == nullptr) {
         switch (mode) {
         case MODE_TEST:
         case MODE_GRAD:
@@ -9048,7 +9062,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
             break;
         }
     } else {
-        test_cases = make_test_cases_from_json(test_json_path);
+        test_cases = make_test_cases_from_file(test_file_path);
     }
 
     filter_test_cases(test_cases, params_filter);
@@ -9231,7 +9245,7 @@ static void show_test_coverage() {
 
 static void usage(char ** argv) {
     printf("Usage: %s [mode] [-o <op,..>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>] [--list-ops]", argv[0]);
-    printf(" [--show-coverage] [--test-json <path>]\n");
+    printf(" [--show-coverage] [--test-file <path>]\n");
     printf("    valid modes:\n");
     printf("      - test (default, compare with CPU backend for correctness)\n");
     printf("      - grad (compare gradients from backpropagation with method of finite differences)\n");
@@ -9242,7 +9256,7 @@ static void usage(char ** argv) {
     printf("    --output specifies output format (default: console, options: console, sql, csv)\n");
     printf("    --list-ops lists all available GGML operations\n");
     printf("    --show-coverage shows test coverage\n");
-    printf("    --test-json reads test operators from a json\n");
+    printf("    --test-file reads test operators from a test file generated by llama-export-graph-ops\n");
 }
 
 int main(int argc, char ** argv) {
@@ -9251,7 +9265,7 @@ int main(int argc, char ** argv) {
     const char * op_names_filter = nullptr;
     const char * backend_filter = nullptr;
     const char * params_filter = nullptr;
-    const char * test_json_path = nullptr;
+    const char * test_file_path = nullptr;
 
     for (int i = 1; i < argc; i++) {
         if (strcmp(argv[i], "test") == 0) {
@@ -9299,9 +9313,9 @@ int main(int argc, char ** argv) {
         } else if (strcmp(argv[i], "--show-coverage") == 0) {
             show_test_coverage();
             return 0;
-        } else if (strcmp(argv[i], "--test-json") == 0) {
+        } else if (strcmp(argv[i], "--test-file") == 0) {
             if (i + 1 < argc) {
-                test_json_path = argv[++i];
+                test_file_path = argv[++i];
             } else {
                 usage(argv);
                 return 1;
@@ -9358,7 +9372,7 @@ int main(int argc, char ** argv) {
                                                              false, "", ggml_backend_dev_description(dev),
                                                              total / 1024 / 1024, free / 1024 / 1024, true));
 
-        bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get(), test_json_path);
+        bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get(), test_file_path);
 
         if (ok) {
             n_ok++;
diff --git a/tools/export-graph-ops/export-graph-ops.cpp b/tools/export-graph-ops/export-graph-ops.cpp
index 5b836cf5b82..a8a075fa875 100644
--- a/tools/export-graph-ops/export-graph-ops.cpp
+++ b/tools/export-graph-ops/export-graph-ops.cpp
@@ -4,8 +4,6 @@
 #include "llama.h"
 #include "ggml.h"
 
-#include "nlohmann/json.hpp"
-
 #include <array>
 #include <vector>
 #include <set>
@@ -26,6 +24,16 @@ struct input_tensor {
         return std::tie(type, ne, nb) <
                std::tie(b.type, b.ne, b.nb);
     }
+
+    void serialize(std::ostream& out) const {
+        out << type << ' ';
+        for (size_t i = 0; i < 4; i++) {
+            out << ne[i] << ' ';
+        }
+        for (size_t i = 0; i < 4; i++) {
+            out << nb[i] << ' ';
+        }
+    }
 };
 
 struct test_object {
@@ -36,36 +44,29 @@ struct test_object {
     std::vector<input_tensor> sources;
     std::string name;
 
-    nlohmann::json to_json() const {
-        nlohmann::json test;
-
-        test["op"] = op;
-        test["op_name"] = ggml_op_name(op);
-
-        test["type"] = type;
-        test["type_name"] = ggml_type_name(type);
-
-        test["ne"] = { ne[0], ne[1], ne[2], ne[3] };
-
-        test["op_params"] = op_params;
+    void serialize(std::ostream& out) const {
+        out << op << ' ' << type << ' ';
+        for (size_t i = 0; i < 4; i++) {
+            out << ne[i] << ' ';
+        }
 
-        if (!name.empty()) {
-            test["name"] = name;
+        out << op_params.size() << ' ';
+        for (size_t i = 0; i < op_params.size(); i++) {
+            out << op_params[i] << ' ';
         }
 
-        nlohmann::json j_sources = nlohmann::json::array();
+        out << sources.size() << ' ';
         for (size_t s = 0; s < sources.size(); s++) {
-            j_sources.push_back({
-                {"type", sources[s].type},
-                {"type_name", ggml_type_name(sources[s].type)},
-                {"ne", { sources[s].ne[0], sources[s].ne[1], sources[s].ne[2], sources[s].ne[3] }},
-                {"nb", { sources[s].nb[0], sources[s].nb[1], sources[s].nb[2], sources[s].nb[3] }},
-            });
+            sources[s].serialize(out);
         }
 
-        test["sources"] = j_sources;
+        if (!name.empty()) {
+            out << name;
+        } else {
+            out << '-';
+        }
 
-        return test;
+        out << '\n';
     }
 
     bool operator<(const test_object &b) const {
@@ -114,8 +115,9 @@ static void extract_graph_ops(ggml_cgraph * cgraph, const char * label, std::set
 
 int main(int argc, char ** argv) {
     common_params params;
+    params.out_file = "tests.txt";
 
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_JSON)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS)) {
         return 1;
     }
 
@@ -152,21 +154,14 @@ int main(int argc, char ** argv) {
 
     LOG_INF("%d unique ops total\n", (int) tests.size());
 
-    nlohmann::json output_list = nlohmann::json::array();
-    for (const auto& test : tests) {
-        output_list.push_back(test.to_json());
-    }
+    std::ofstream f(params.out_file);
 
-    if (!params.out_file.empty()) {
-        std::ofstream f(params.out_file);
-
-        if (!f.is_open()) {
-            throw std::runtime_error("Unable to open output file");
-        }
+    if (!f.is_open()) {
+        throw std::runtime_error("Unable to open output file");
+    }
 
-        f << output_list.dump(2) << std::endl;
-    } else {
-        std::cout << output_list.dump(2) << std::endl;
+    for (const auto& test : tests) {
+        test.serialize(f);
     }
 
     return 0;

From 8055a050eb4b7d60d85e1f1bf94405ea24d1c6e2 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Fri, 6 Mar 2026 15:06:16 +0100
Subject: [PATCH 14/18] move llama_graph_reserve function to new llama-ext
 header, move export-graph-ops to tests/

---
 include/llama.h                                   |  7 -------
 src/llama-ext.h                                   | 15 +++++++++++++++
 tests/CMakeLists.txt                              |  4 +++-
 .../export-graph-ops.cpp                          |  1 +
 tools/CMakeLists.txt                              |  1 -
 tools/export-graph-ops/CMakeLists.txt             |  8 --------
 6 files changed, 19 insertions(+), 17 deletions(-)
 create mode 100644 src/llama-ext.h
 rename {tools/export-graph-ops => tests}/export-graph-ops.cpp (99%)
 delete mode 100644 tools/export-graph-ops/CMakeLists.txt

diff --git a/include/llama.h b/include/llama.h
index 594388282b9..0bd10294cb8 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -629,13 +629,6 @@ extern "C" {
             const char * fname_out,
             const llama_model_quantize_params * params);
 
-    // Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
-    LLAMA_API struct ggml_cgraph * llama_graph_reserve(
-            struct llama_context * ctx,
-            uint32_t n_tokens,
-            uint32_t n_seqs,
-            uint32_t n_outputs);
-
     //
     // Adapters
     //
diff --git a/src/llama-ext.h b/src/llama-ext.h
new file mode 100644
index 00000000000..9b58e0abb5d
--- /dev/null
+++ b/src/llama-ext.h
@@ -0,0 +1,15 @@
+#ifndef LLAMA_EXT_H
+#define LLAMA_EXT_H
+
+#include "llama-context.h"
+#include "ggml.h"
+#include "stdint.h"
+
+// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
+struct ggml_cgraph * llama_graph_reserve(
+        struct llama_context * ctx,
+        uint32_t n_tokens,
+        uint32_t n_seqs,
+        uint32_t n_outputs);
+
+#endif // LLAMA_EXT_H
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index bb0f0ef0ed8..9582164b580 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -260,6 +260,7 @@ endif()
 set(LLAMA_TEST_NAME test-mtmd-c-api)
 llama_build_and_test(test-mtmd-c-api.c)
 target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd)
+unset(LLAMA_TEST_NAME)
 
 # GGUF model data fetcher library for tests that need real model metadata
 # Only compile when cpp-httplib has SSL support (CPPHTTPLIB_OPENSSL_SUPPORT)
@@ -284,4 +285,5 @@ target_link_libraries(${TEST_TARGET} PRIVATE llama)
 llama_build_and_test(test-alloc.cpp)
 target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
 
-
+llama_build(export-graph-ops.cpp)
+target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
diff --git a/tools/export-graph-ops/export-graph-ops.cpp b/tests/export-graph-ops.cpp
similarity index 99%
rename from tools/export-graph-ops/export-graph-ops.cpp
rename to tests/export-graph-ops.cpp
index a8a075fa875..754089d068e 100644
--- a/tools/export-graph-ops/export-graph-ops.cpp
+++ b/tests/export-graph-ops.cpp
@@ -2,6 +2,7 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
+#include "../src/llama-ext.h"
 #include "ggml.h"
 
 #include <array>
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 849bdbafc42..b433c91d85e 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -39,5 +39,4 @@ else()
     endif()
     add_subdirectory(fit-params)
     add_subdirectory(results)
-    add_subdirectory(export-graph-ops)
 endif()
diff --git a/tools/export-graph-ops/CMakeLists.txt b/tools/export-graph-ops/CMakeLists.txt
deleted file mode 100644
index 95d2ac891f2..00000000000
--- a/tools/export-graph-ops/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-set(TARGET llama-export-graph-ops)
-add_executable(${TARGET} export-graph-ops.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
-
-if(LLAMA_TOOLS_INSTALL)
-    install(TARGETS ${TARGET} RUNTIME)
-endif()

From cd7fe034f8850db00bee29b14866143beb24ca05 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Sat, 7 Mar 2026 07:33:11 +0100
Subject: [PATCH 15/18] fix missing declaration

---
 src/llama-context.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 7ccbb502acc..493b234f011 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -7,6 +7,7 @@
 #include "llama-memory.h"
 #include "llama-mmap.h"
 #include "llama-model.h"
+#include "llama-ext.h"
 
 #include <cinttypes>
 #include <cmath>

From a0c532c8116f72afc46e1c9c04c5d89d96a70225 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Tue, 10 Mar 2026 16:02:27 +0100
Subject: [PATCH 16/18] use pragma once

---
 src/llama-ext.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/llama-ext.h b/src/llama-ext.h
index 9b58e0abb5d..b3ea9108e0d 100644
--- a/src/llama-ext.h
+++ b/src/llama-ext.h
@@ -1,5 +1,4 @@
-#ifndef LLAMA_EXT_H
-#define LLAMA_EXT_H
+#pragma once
 
 #include "llama-context.h"
 #include "ggml.h"
@@ -11,5 +10,3 @@ struct ggml_cgraph * llama_graph_reserve(
         uint32_t n_tokens,
         uint32_t n_seqs,
         uint32_t n_outputs);
-
-#endif // LLAMA_EXT_H

From 3e21a58d8aed6c78e8bac2cf966f799120a79104 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Tue, 10 Mar 2026 16:08:52 +0100
Subject: [PATCH 17/18] fix indent

---
 common/arg.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index deb647065c1..69092d6f9ea 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2667,7 +2667,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.out_file = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE,
-	    	    LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS}));
+                    LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS}));
     add_opt(common_arg(
         {"-ofreq", "--output-frequency"}, "N",
         string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),

From 062e7b1c98f073a039b9a4a8918ce26bdbd58f88 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Wed, 11 Mar 2026 12:38:51 +0100
Subject: [PATCH 18/18] fix Windows build

---
 src/llama-ext.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-ext.h b/src/llama-ext.h
index b3ea9108e0d..13ced783b42 100644
--- a/src/llama-ext.h
+++ b/src/llama-ext.h
@@ -5,7 +5,7 @@
 #include "stdint.h"
 
 // Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
-struct ggml_cgraph * llama_graph_reserve(
+LLAMA_API struct ggml_cgraph * llama_graph_reserve(
         struct llama_context * ctx,
         uint32_t n_tokens,
         uint32_t n_seqs,