From ddf4b9dbc1bb447289bbd79b80d4fc04772526c4 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Mon, 23 Feb 2026 17:44:57 +0100 Subject: [PATCH 01/18] tests: allow loading test-backend-ops tests from json --- tests/test-backend-ops.cpp | 160 ++++++++++++++++++++++++++++++++++--- 1 file changed, 148 insertions(+), 12 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 32a83b001d8..6b0d8277cd0 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -31,16 +31,20 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include #include #include +#include + #ifdef __EMSCRIPTEN__ # define N_THREADS 1 #else @@ -420,6 +424,10 @@ static std::string var_to_str(ggml_scale_mode mode) { return str; } +static std::string var_to_str(ggml_op op) { + return ggml_op_name(op); +} + #define VAR_TO_STR(x) (#x "=" + var_to_str(x)) #define VARS_TO_STR1(a) VAR_TO_STR(a) @@ -6648,6 +6656,81 @@ struct test_diag : public test_case { } }; +// Deserializable generic test case +struct input_tensor { + ggml_type type; + std::array ne; +}; + +static std::string var_to_str(const std::vector& sources) { + std::ostringstream oss; + bool first = true; + for (const auto& src : sources) { + if (!first) oss << ","; + oss << ggml_type_name(src.type) << "[" << src.ne[0] << "," << src.ne[1] << "," << src.ne[2] << "," << src.ne[3] << "]"; + first = false; + } + return oss.str(); +} + +static std::string var_to_str(const std::array& params) { + std::ostringstream oss; + oss << "["; + bool first = true; + for (size_t i = 0; i < params.size(); ++i) { + if (params[i] != 0) { + if (!first) oss << ","; + oss << i << ":" << params[i]; + first = false; + } + } + oss << "]"; + return oss.str(); +} + + +struct test_generic_op : public test_case { + const ggml_op op; + const ggml_type type; + const std::array ne; + const std::array op_params; + + const std::vector sources; + + std::string vars() override { + return VARS_TO_STR5(op, type, ne, op_params, sources); + } + + test_generic_op(ggml_op op, ggml_type type, std::array ne, + std::array op_params, + std::vector sources) + : op(op), type(type), ne(ne), op_params(op_params), sources(sources) {} + + // Define how a simple GGML compute graph can be constructed for the new GGML op. + ggml_tensor * build_graph(ggml_context * ctx) override { + const size_t source_count = std::min(sources.size(), (size_t)GGML_MAX_SRC); + + std::array source_tensors; + for (size_t i = 0; i < source_count; ++i) { + const input_tensor& src = sources[i]; + source_tensors[i] = ggml_new_tensor_4d(ctx, src.type, src.ne[0], src.ne[1], src.ne[2], src.ne[3]); + } + + ggml_tensor * out = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); + out->op = op; + for (size_t i = 0; i < source_count; ++i) { + out->src[i] = source_tensors[i]; + } + + memcpy(out->op_params, op_params.data(), GGML_MAX_OP_PARAMS); + ggml_set_name(out, "out"); + + return out; + } + + +}; + enum llm_norm_type { LLM_NORM, @@ -8733,8 +8816,41 @@ static std::vector> make_test_cases_perf() { return test_cases; } +static std::vector> make_test_cases_from_json(const char * path) { + std::ifstream f(path); + nlohmann::json data = nlohmann::json::parse(f); + + GGML_ASSERT(data.is_array()); + + std::vector> test_cases; + + for (const auto& input_case : data) { + const ggml_op op = input_case["op"]; + const ggml_type type = input_case["type"]; + auto ne_arr = input_case["ne"]; + const std::array ne = {ne_arr[0], ne_arr[1], ne_arr[2], ne_arr[3]}; + + auto op_arr = input_case["op_params"]; + std::array op_params = {}; + for (size_t i = 0; i < op_arr.size() && i < op_params.size(); i++) { + op_params[i] = op_arr[i]; + } + + std::vector sources; + for (const auto& src : input_case["sources"]) { + auto ne_arr = src["ne"]; + const std::array src_ne = {ne_arr[0], ne_arr[1], ne_arr[2], ne_arr[3]}; + sources.push_back({(ggml_type)src["type"], src_ne}); + } + + test_cases.emplace_back(new test_generic_op(op, type, ne, op_params, sources)); + } + + return test_cases; +} + static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_names_filter, const char * params_filter, - printer * output_printer) { + printer * output_printer, const char * test_json_path) { auto filter_test_cases = [](std::vector> & test_cases, const char * params_filter) { if (params_filter == nullptr) { return; @@ -8752,9 +8868,26 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op } }; + std::vector> test_cases; + + if (test_json_path == nullptr) { + switch (mode) { + case MODE_TEST: + case MODE_GRAD: + case MODE_SUPPORT: + test_cases = make_test_cases_eval(); + break; + case MODE_PERF: + test_cases = make_test_cases_perf(); + break; + } + } else { + test_cases = make_test_cases_from_json(test_json_path); + } + + filter_test_cases(test_cases, params_filter); + if (mode == MODE_TEST) { - auto test_cases = make_test_cases_eval(); - filter_test_cases(test_cases, params_filter); ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL); if (backend_cpu == NULL) { test_operation_info info("", "", "CPU"); @@ -8794,8 +8927,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op } if (mode == MODE_GRAD) { - auto test_cases = make_test_cases_eval(); - filter_test_cases(test_cases, params_filter); size_t n_ok = 0; for (auto & test : test_cases) { if (test->eval_grad(backend, op_names_filter, output_printer)) { @@ -8808,8 +8939,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op } if (mode == MODE_PERF) { - auto test_cases = make_test_cases_perf(); - filter_test_cases(test_cases, params_filter); for (auto & test : test_cases) { test->eval_perf(backend, op_names_filter, output_printer); } @@ -8817,9 +8946,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op } if (mode == MODE_SUPPORT) { - auto test_cases = make_test_cases_eval(); - filter_test_cases(test_cases, params_filter); - // Filter out fusion cases test_cases.erase( std::remove_if(test_cases.begin(), test_cases.end(), [](const std::unique_ptr & tc) { @@ -8938,7 +9064,8 @@ static void show_test_coverage() { } static void usage(char ** argv) { - printf("Usage: %s [mode] [-o ] [-b ] [-p ] [--output ] [--list-ops] [--show-coverage]\n", argv[0]); + printf("Usage: %s [mode] [-o ] [-b ] [-p ] [--output ] [--list-ops]", argv[0]); + printf(" [--show-coverage] [--test-json ]\n"); printf(" valid modes:\n"); printf(" - test (default, compare with CPU backend for correctness)\n"); printf(" - grad (compare gradients from backpropagation with method of finite differences)\n"); @@ -8949,6 +9076,7 @@ static void usage(char ** argv) { printf(" --output specifies output format (default: console, options: console, sql, csv)\n"); printf(" --list-ops lists all available GGML operations\n"); printf(" --show-coverage shows test coverage\n"); + printf(" --test-json reads test operators from a json\n"); } int main(int argc, char ** argv) { @@ -8957,6 +9085,7 @@ int main(int argc, char ** argv) { const char * op_names_filter = nullptr; const char * backend_filter = nullptr; const char * params_filter = nullptr; + const char * test_json_path = nullptr; for (int i = 1; i < argc; i++) { if (strcmp(argv[i], "test") == 0) { @@ -9004,6 +9133,13 @@ int main(int argc, char ** argv) { } else if (strcmp(argv[i], "--show-coverage") == 0) { show_test_coverage(); return 0; + } else if (strcmp(argv[i], "--test-json") == 0) { + if (i + 1 < argc) { + test_json_path = argv[++i]; + } else { + usage(argv); + return 1; + } } else { usage(argv); return 1; @@ -9056,7 +9192,7 @@ int main(int argc, char ** argv) { false, "", ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024, true)); - bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get()); + bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get(), test_json_path); if (ok) { n_ok++; From ff557ead958520adb417a858f3afc25d6dcd624e Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Tue, 24 Feb 2026 11:04:16 +0100 Subject: [PATCH 02/18] add error threshold based on op --- tests/test-backend-ops.cpp | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 6b0d8277cd0..6ef5b1c90b7 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -6728,7 +6728,27 @@ struct test_generic_op : public test_case { return out; } - + double max_nmse_err() override { + switch (op) { + case GGML_OP_MUL_MAT: + case GGML_OP_MUL_MAT_ID: + case GGML_OP_OUT_PROD: + case GGML_OP_FLASH_ATTN_EXT: + case GGML_OP_CONV_TRANSPOSE_2D: + case GGML_OP_IM2COL: + case GGML_OP_CONV_2D: + case GGML_OP_CONV_3D: + case GGML_OP_SET_ROWS: + case GGML_OP_CPY: + return 5e-4; + case GGML_OP_SOFT_MAX: + return 1e-6; + case GGML_OP_RWKV_WKV7: + return 5e-3; + default: + return 1e-7; + } + } }; From e77c43e5dff6146e2c30efbd22d6698adb2a6d8a Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Tue, 24 Feb 2026 11:16:34 +0100 Subject: [PATCH 03/18] add error when file cannot be read --- tests/test-backend-ops.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 6ef5b1c90b7..9e92e2de67d 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -8838,6 +8838,11 @@ static std::vector> make_test_cases_perf() { static std::vector> make_test_cases_from_json(const char * path) { std::ifstream f(path); + + if (!f.is_open()) { + throw std::runtime_error("Unable to read JSON file"); + } + nlohmann::json data = nlohmann::json::parse(f); GGML_ASSERT(data.is_array()); From c2450b5ac1349db45e0d10c6e6659dd9134549a6 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Tue, 24 Feb 2026 15:05:08 +0100 Subject: [PATCH 04/18] add graph operator json extraction tool --- common/arg.cpp | 3 +- common/common.h | 1 + tools/CMakeLists.txt | 1 + tools/export-graph-ops/CMakeLists.txt | 9 ++ tools/export-graph-ops/export-graph-ops.cpp | 147 ++++++++++++++++++++ 5 files changed, 160 insertions(+), 1 deletion(-) create mode 100644 tools/export-graph-ops/CMakeLists.txt create mode 100644 tools/export-graph-ops/export-graph-ops.cpp diff --git a/common/arg.cpp b/common/arg.cpp index 41da8563d63..24c0efc5cd8 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2666,7 +2666,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.out_file = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_RESULTS})); + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, + LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_JSON})); add_opt(common_arg( {"-ofreq", "--output-frequency"}, "N", string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), diff --git a/common/common.h b/common/common.h index ffaeefd7c94..9f51d9629c8 100644 --- a/common/common.h +++ b/common/common.h @@ -105,6 +105,7 @@ enum llama_example { LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FIT_PARAMS, LLAMA_EXAMPLE_RESULTS, + LLAMA_EXAMPLE_EXPORT_GRAPH_JSON, LLAMA_EXAMPLE_COUNT, }; diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index b433c91d85e..849bdbafc42 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -39,4 +39,5 @@ else() endif() add_subdirectory(fit-params) add_subdirectory(results) + add_subdirectory(export-graph-ops) endif() diff --git a/tools/export-graph-ops/CMakeLists.txt b/tools/export-graph-ops/CMakeLists.txt new file mode 100644 index 00000000000..32b61376d93 --- /dev/null +++ b/tools/export-graph-ops/CMakeLists.txt @@ -0,0 +1,9 @@ +set(TARGET llama-export-graph-ops) +add_executable(${TARGET} export-graph-ops.cpp) +target_include_directories(${TARGET} PRIVATE ../../src) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() \ No newline at end of file diff --git a/tools/export-graph-ops/export-graph-ops.cpp b/tools/export-graph-ops/export-graph-ops.cpp new file mode 100644 index 00000000000..9d65b91d573 --- /dev/null +++ b/tools/export-graph-ops/export-graph-ops.cpp @@ -0,0 +1,147 @@ +#include "arg.h" +#include "common.h" +#include "log.h" +#include "llama.h" +#include "llama-context.h" +#include "llama-graph.h" +#include "ggml.h" + +#include "nlohmann/json.hpp" + +#include +#include +#include +#include +#include + +struct input_tensor { + ggml_type type; + std::array ne; + + input_tensor(ggml_type type, int64_t * ne): type(type) { + memcpy(this->ne.data(), ne, 4 * sizeof(int64_t)); + } + + bool operator<(const input_tensor &b) const { + return std::tie(type, ne) < + std::tie(b.type, b.ne); + } +}; + +struct test_object { + ggml_op op; + ggml_type type; + std::array ne; + std::vector op_params; + std::vector sources; + + nlohmann::json to_json() const { + nlohmann::json test; + + test["op"] = op; + test["op_name"] = ggml_op_name(op); + + test["type"] = type; + test["type_name"] = ggml_type_name(type); + + test["ne"] = { ne[0], ne[1], ne[2], ne[3] }; + + test["op_params"] = op_params; + + nlohmann::json j_sources = nlohmann::json::array(); + for (size_t s = 0; s < sources.size(); s++) { + j_sources.push_back({ + {"type", sources[s].type}, + {"type_name", ggml_type_name(sources[s].type)}, + {"ne", { sources[s].ne[0], sources[s].ne[1], sources[s].ne[2], sources[s].ne[3] }}, + }); + } + + test["sources"] = j_sources; + + return test; + } + + bool operator<(const test_object &b) const { + return std::tie(op, type, ne, op_params, sources) < + std::tie(b.op, b.type, b.ne, b.op_params, b.sources); + } +}; + +int main(int argc, char ** argv) { + common_params params; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_JSON)) { + return 1; + } + + common_init(); + + // Load CPU-only + ggml_backend_dev_t cpu_device = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + params.devices = { cpu_device, nullptr }; + params.fit_params = false; + params.n_gpu_layers = 0; + + params.warmup = false; + + auto init_result = common_init_from_params(params); + + llama_context * ctx = init_result->context(); + auto * cgraph = ctx->get_gf_res_reserve()->get_gf(); + + std::set tests; + + int n_nodes = ggml_graph_n_nodes(cgraph); + int n_skipped = 0; + for (int i = 0; i < n_nodes; i++) { + ggml_tensor * node = ggml_graph_node(cgraph, i); + + if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE) { + n_skipped++; + continue; + } + + test_object test; + + test.op = node->op; + test.type = node->type; + memcpy(&test.ne, node->ne, 4 * sizeof(int64_t)); + + test.op_params.resize(GGML_MAX_OP_PARAMS / sizeof(int32_t)); + memcpy(test.op_params.data(), node->op_params, GGML_MAX_OP_PARAMS); + + for (size_t s = 0; s < GGML_MAX_SRC; s++) { + if (node->src[s] == nullptr) { + break; + } + + test.sources.emplace_back(node->src[s]->type, node->src[s]->ne); + } + + tests.insert(test); + } + + LOG_INF("%d unique ops extracted, %d total nodes, %d skipped (view ops)\n", + (int) tests.size(), n_nodes, n_skipped); + + nlohmann::json output_list = nlohmann::json::array(); + + for (const auto& test : tests) { + output_list.push_back(test.to_json()); + } + + if (!params.out_file.empty()) { + std::ofstream f(params.out_file); + + if (!f.is_open()) { + throw std::runtime_error("Unable to open output file"); + } + + f << output_list.dump(2) << std::endl; + } else { + std::cout << output_list.dump(2) << std::endl; + } + + return 0; +} \ No newline at end of file From a9dc28ee51f1a7c6958fa08cb1c78c76562d6e60 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Tue, 24 Feb 2026 22:05:54 +0100 Subject: [PATCH 05/18] add nb parameter for non-contiguous input tensors --- tests/test-backend-ops.cpp | 114 +++++++++++++++++++- tools/export-graph-ops/export-graph-ops.cpp | 92 ++++++++++------ 2 files changed, 172 insertions(+), 34 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 9e92e2de67d..3bc4aa44c23 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -6660,6 +6660,7 @@ struct test_diag : public test_case { struct input_tensor { ggml_type type; std::array ne; + std::array nb; // strides (0 = use default contiguous strides) }; static std::string var_to_str(const std::vector& sources) { @@ -6668,6 +6669,9 @@ static std::string var_to_str(const std::vector& sources) { for (const auto& src : sources) { if (!first) oss << ","; oss << ggml_type_name(src.type) << "[" << src.ne[0] << "," << src.ne[1] << "," << src.ne[2] << "," << src.ne[3] << "]"; + if (src.nb[0] != 0) { + oss << "nb[" << src.nb[0] << "," << src.nb[1] << "," << src.nb[2] << "," << src.nb[3] << "]"; + } first = false; } return oss.str(); @@ -6713,7 +6717,35 @@ struct test_generic_op : public test_case { std::array source_tensors; for (size_t i = 0; i < source_count; ++i) { const input_tensor& src = sources[i]; - source_tensors[i] = ggml_new_tensor_4d(ctx, src.type, src.ne[0], src.ne[1], src.ne[2], src.ne[3]); + bool has_nb = src.nb[0] != 0; + + if (has_nb) { + // Compute the total buffer size using the same method as ggml_nbytes + size_t total_size; + const size_t blck_size = ggml_blck_size(src.type); + if (blck_size == 1) { + total_size = ggml_type_size(src.type); + for (int d = 0; d < 4; d++) { + total_size += (src.ne[d] - 1) * src.nb[d]; + } + } else { + total_size = src.ne[0] * src.nb[0] / blck_size; + for (int d = 1; d < 4; d++) { + total_size += (src.ne[d] - 1) * src.nb[d]; + } + } + + // Convert bytes to elements, padded to block size for quantized types + const size_t type_size = ggml_type_size(src.type); + size_t backing_elements = (total_size * blck_size + type_size - 1) / type_size; + backing_elements = ((backing_elements + blck_size - 1) / blck_size) * blck_size; + ggml_tensor * backing = ggml_new_tensor_1d(ctx, src.type, backing_elements); + source_tensors[i] = ggml_view_4d(ctx, backing, + src.ne[0], src.ne[1], src.ne[2], src.ne[3], + src.nb[1], src.nb[2], src.nb[3], 0); + } else { + source_tensors[i] = ggml_new_tensor_4d(ctx, src.type, src.ne[0], src.ne[1], src.ne[2], src.ne[3]); + } } ggml_tensor * out = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); @@ -6733,7 +6765,6 @@ struct test_generic_op : public test_case { case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: case GGML_OP_OUT_PROD: - case GGML_OP_FLASH_ATTN_EXT: case GGML_OP_CONV_TRANSPOSE_2D: case GGML_OP_IM2COL: case GGML_OP_CONV_2D: @@ -6745,10 +6776,82 @@ struct test_generic_op : public test_case { return 1e-6; case GGML_OP_RWKV_WKV7: return 5e-3; + case GGML_OP_FLASH_ATTN_EXT: + { + // Scale error with kv length to account for accumulating floating point error + const int64_t kv = sources[1].ne[1]; + return 5e-4 * std::max(1.0, kv / 20000.0); + } default: return 1e-7; } } + + void initialize_tensors(ggml_context * ctx) override { + ggml_tensor * out = ggml_get_tensor(ctx, "out"); + + for (size_t i = 0; i < sources.size() && i < GGML_MAX_SRC; i++) { + ggml_tensor * t = out->src[i]; + if (!t) { + break; + } + + // FLASH_ATTN_EXT: src[3] is the KQ mask + if (op == GGML_OP_FLASH_ATTN_EXT && i == 3) { + init_tensor_kq_mask(t); + continue; + } + + if ((t->type == GGML_TYPE_I32 || t->type == GGML_TYPE_I64) && !ggml_is_view_op(t->op)) { + if (op == GGML_OP_GET_ROWS || op == GGML_OP_GET_ROWS_BACK) { + const int64_t num_rows = sources[0].ne[1]; + const int64_t nels = ggml_nelements(t); + std::vector data(nels); + for (int64_t i = 0; i < nels; i++) { + data[i] = rand() % num_rows; + } + ggml_backend_tensor_set(t, data.data(), 0, nels * sizeof(int32_t)); + } else if (op == GGML_OP_SET_ROWS) { + init_set_rows_row_ids(t, ne[1]); + } else if (op == GGML_OP_ROPE) { + const int mode = op_params[2]; + const int64_t nels = (mode & GGML_ROPE_TYPE_MROPE) ? ne[2] * 4 : ne[2]; + std::vector data(nels); + for (int64_t i = 0; i < nels; i++) { + data[i] = rand() % ne[2]; + } + ggml_backend_tensor_set(t, data.data(), 0, nels * sizeof(int32_t)); + } else if (op == GGML_OP_MUL_MAT_ID || op == GGML_OP_ADD_ID) { + const int64_t n_expert = (op == GGML_OP_MUL_MAT_ID) ? sources[0].ne[2] : sources[1].ne[1]; + std::random_device rd; + std::default_random_engine rng(rd()); + for (int64_t r = 0; r < ggml_nrows(t); r++) { + std::vector data(t->ne[0]); + for (int32_t i = 0; i < t->ne[0]; i++) { + data[i] = i % n_expert; + } + std::shuffle(data.begin(), data.end(), rng); + ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t)); + } + } else if (op == GGML_OP_SSM_SCAN) { + std::random_device rd; + std::default_random_engine rng(rd()); + for (int64_t r = 0; r < ggml_nrows(t); r++) { + std::vector data(t->ne[0]); + for (int32_t i = 0; i < t->ne[0]; i++) { + data[i] = i; + } + std::shuffle(data.begin(), data.end(), rng); + ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t)); + } + } else { + init_tensor_uniform(t); + } + } else { + init_tensor_uniform(t); + } + } + } }; @@ -8865,7 +8968,12 @@ static std::vector> make_test_cases_from_json(const c for (const auto& src : input_case["sources"]) { auto ne_arr = src["ne"]; const std::array src_ne = {ne_arr[0], ne_arr[1], ne_arr[2], ne_arr[3]}; - sources.push_back({(ggml_type)src["type"], src_ne}); + std::array src_nb = {}; + if (src.contains("nb")) { + auto nb_arr = src["nb"]; + src_nb = {nb_arr[0], nb_arr[1], nb_arr[2], nb_arr[3]}; + } + sources.push_back({(ggml_type)src["type"], src_ne, src_nb}); } test_cases.emplace_back(new test_generic_op(op, type, ne, op_params, sources)); diff --git a/tools/export-graph-ops/export-graph-ops.cpp b/tools/export-graph-ops/export-graph-ops.cpp index 9d65b91d573..3fa08fec3ea 100644 --- a/tools/export-graph-ops/export-graph-ops.cpp +++ b/tools/export-graph-ops/export-graph-ops.cpp @@ -4,6 +4,7 @@ #include "llama.h" #include "llama-context.h" #include "llama-graph.h" +#include "llama-memory.h" #include "ggml.h" #include "nlohmann/json.hpp" @@ -17,14 +18,16 @@ struct input_tensor { ggml_type type; std::array ne; + std::array nb; - input_tensor(ggml_type type, int64_t * ne): type(type) { + input_tensor(ggml_type type, int64_t * ne, size_t * nb): type(type) { memcpy(this->ne.data(), ne, 4 * sizeof(int64_t)); + memcpy(this->nb.data(), nb, 4 * sizeof(size_t)); } bool operator<(const input_tensor &b) const { - return std::tie(type, ne) < - std::tie(b.type, b.ne); + return std::tie(type, ne, nb) < + std::tie(b.type, b.ne, b.nb); } }; @@ -54,6 +57,7 @@ struct test_object { {"type", sources[s].type}, {"type_name", ggml_type_name(sources[s].type)}, {"ne", { sources[s].ne[0], sources[s].ne[1], sources[s].ne[2], sources[s].ne[3] }}, + {"nb", { sources[s].nb[0], sources[s].nb[1], sources[s].nb[2], sources[s].nb[3] }}, }); } @@ -68,32 +72,10 @@ struct test_object { } }; -int main(int argc, char ** argv) { - common_params params; - - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_JSON)) { - return 1; - } - - common_init(); - - // Load CPU-only - ggml_backend_dev_t cpu_device = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); - params.devices = { cpu_device, nullptr }; - params.fit_params = false; - params.n_gpu_layers = 0; - - params.warmup = false; - - auto init_result = common_init_from_params(params); - - llama_context * ctx = init_result->context(); - auto * cgraph = ctx->get_gf_res_reserve()->get_gf(); - - std::set tests; - +static void extract_graph_ops(ggml_cgraph * cgraph, const char * label, std::set & tests) { int n_nodes = ggml_graph_n_nodes(cgraph); int n_skipped = 0; + int n_before = (int) tests.size(); for (int i = 0; i < n_nodes; i++) { ggml_tensor * node = ggml_graph_node(cgraph, i); @@ -116,17 +98,65 @@ int main(int argc, char ** argv) { break; } - test.sources.emplace_back(node->src[s]->type, node->src[s]->ne); + test.sources.emplace_back(node->src[s]->type, node->src[s]->ne, node->src[s]->nb); } tests.insert(test); } - LOG_INF("%d unique ops extracted, %d total nodes, %d skipped (view ops)\n", - (int) tests.size(), n_nodes, n_skipped); + int n_new = (int) tests.size() - n_before; + LOG_INF("%s: %d unique ops, %d total nodes, %d skipped (view ops)\n", + label, n_new, n_nodes, n_skipped); +} - nlohmann::json output_list = nlohmann::json::array(); +int main(int argc, char ** argv) { + common_params params; + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_JSON)) { + return 1; + } + + common_init(); + + // Load CPU-only + ggml_backend_dev_t cpu_device = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + params.devices = { cpu_device, nullptr }; + params.fit_params = false; + params.n_gpu_layers = 0; + + params.warmup = false; + + auto init_result = common_init_from_params(params); + + llama_context * ctx = init_result->context(); + + const auto & cparams = ctx->get_cparams(); + const uint32_t n_seqs = cparams.n_seq_max; + const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + llama_memory_context_ptr mctx; + auto * memory = ctx->get_memory(); + if (memory) { + mctx = memory->init_full(); + } + + std::set tests; + + auto * gf_pp = ctx->graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get()); + if (!gf_pp) { + throw std::runtime_error("failed to reserve prompt processing graph"); + } + extract_graph_ops(gf_pp, "pp", tests); + + auto * gf_tg = ctx->graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get()); + if (!gf_tg) { + throw std::runtime_error("failed to reserve token generation graph"); + } + extract_graph_ops(gf_tg, "tg", tests); + + LOG_INF("%d unique ops total\n", (int) tests.size()); + + nlohmann::json output_list = nlohmann::json::array(); for (const auto& test : tests) { output_list.push_back(test.to_json()); } From 0c7a5058ef3e32cd314c8bba644f710b16b95066 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Wed, 25 Feb 2026 07:06:20 +0100 Subject: [PATCH 06/18] fix view check --- tests/test-backend-ops.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 3bc4aa44c23..f15c0fb15b2 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -424,10 +424,6 @@ static std::string var_to_str(ggml_scale_mode mode) { return str; } -static std::string var_to_str(ggml_op op) { - return ggml_op_name(op); -} - #define VAR_TO_STR(x) (#x "=" + var_to_str(x)) #define VARS_TO_STR1(a) VAR_TO_STR(a) @@ -6702,7 +6698,7 @@ struct test_generic_op : public test_case { const std::vector sources; std::string vars() override { - return VARS_TO_STR5(op, type, ne, op_params, sources); + return VARS_TO_STR4(type, ne, op_params, sources); } test_generic_op(ggml_op op, ggml_type type, std::array ne, @@ -6802,7 +6798,7 @@ struct test_generic_op : public test_case { continue; } - if ((t->type == GGML_TYPE_I32 || t->type == GGML_TYPE_I64) && !ggml_is_view_op(t->op)) { + if (t->type == GGML_TYPE_I32 || t->type == GGML_TYPE_I64) { if (op == GGML_OP_GET_ROWS || op == GGML_OP_GET_ROWS_BACK) { const int64_t num_rows = sources[0].ne[1]; const int64_t nels = ggml_nelements(t); From 99e62a629d377f789a8d85180608c1fca7da85cf Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Wed, 25 Feb 2026 09:20:47 +0100 Subject: [PATCH 07/18] only use view if non-contiguous/permuted, use C++ random instead of rand() --- tests/test-backend-ops.cpp | 29 +++++++++++++++------ tools/export-graph-ops/CMakeLists.txt | 2 +- tools/export-graph-ops/export-graph-ops.cpp | 2 +- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index f15c0fb15b2..e5db0a0f68f 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -6713,9 +6713,21 @@ struct test_generic_op : public test_case { std::array source_tensors; for (size_t i = 0; i < source_count; ++i) { const input_tensor& src = sources[i]; - bool has_nb = src.nb[0] != 0; - if (has_nb) { + // Check if the exported strides differ from default contiguous layout. + bool needs_view = false; + if (src.nb[0] != 0) { + const size_t default_nb0 = ggml_type_size(src.type); + const size_t default_nb1 = default_nb0 * (src.ne[0] / ggml_blck_size(src.type)); + const size_t default_nb2 = default_nb1 * src.ne[1]; + const size_t default_nb3 = default_nb2 * src.ne[2]; + needs_view = (src.nb[0] != default_nb0 || + src.nb[1] != default_nb1 || + src.nb[2] != default_nb2 || + src.nb[3] != default_nb3); + } + + if (needs_view) { // Compute the total buffer size using the same method as ggml_nbytes size_t total_size; const size_t blck_size = ggml_blck_size(src.type); @@ -6786,6 +6798,9 @@ struct test_generic_op : public test_case { void initialize_tensors(ggml_context * ctx) override { ggml_tensor * out = ggml_get_tensor(ctx, "out"); + std::random_device rd; + std::default_random_engine rng(rd()); + for (size_t i = 0; i < sources.size() && i < GGML_MAX_SRC; i++) { ggml_tensor * t = out->src[i]; if (!t) { @@ -6803,8 +6818,9 @@ struct test_generic_op : public test_case { const int64_t num_rows = sources[0].ne[1]; const int64_t nels = ggml_nelements(t); std::vector data(nels); + std::uniform_int_distribution dist(0, num_rows - 1); for (int64_t i = 0; i < nels; i++) { - data[i] = rand() % num_rows; + data[i] = dist(rng); } ggml_backend_tensor_set(t, data.data(), 0, nels * sizeof(int32_t)); } else if (op == GGML_OP_SET_ROWS) { @@ -6813,14 +6829,13 @@ struct test_generic_op : public test_case { const int mode = op_params[2]; const int64_t nels = (mode & GGML_ROPE_TYPE_MROPE) ? ne[2] * 4 : ne[2]; std::vector data(nels); + std::uniform_int_distribution dist(0, ne[2] - 1); for (int64_t i = 0; i < nels; i++) { - data[i] = rand() % ne[2]; + data[i] = dist(rng); } ggml_backend_tensor_set(t, data.data(), 0, nels * sizeof(int32_t)); } else if (op == GGML_OP_MUL_MAT_ID || op == GGML_OP_ADD_ID) { const int64_t n_expert = (op == GGML_OP_MUL_MAT_ID) ? sources[0].ne[2] : sources[1].ne[1]; - std::random_device rd; - std::default_random_engine rng(rd()); for (int64_t r = 0; r < ggml_nrows(t); r++) { std::vector data(t->ne[0]); for (int32_t i = 0; i < t->ne[0]; i++) { @@ -6830,8 +6845,6 @@ struct test_generic_op : public test_case { ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t)); } } else if (op == GGML_OP_SSM_SCAN) { - std::random_device rd; - std::default_random_engine rng(rd()); for (int64_t r = 0; r < ggml_nrows(t); r++) { std::vector data(t->ne[0]); for (int32_t i = 0; i < t->ne[0]; i++) { diff --git a/tools/export-graph-ops/CMakeLists.txt b/tools/export-graph-ops/CMakeLists.txt index 32b61376d93..785a3fa7788 100644 --- a/tools/export-graph-ops/CMakeLists.txt +++ b/tools/export-graph-ops/CMakeLists.txt @@ -6,4 +6,4 @@ target_compile_features(${TARGET} PRIVATE cxx_std_17) if(LLAMA_TOOLS_INSTALL) install(TARGETS ${TARGET} RUNTIME) -endif() \ No newline at end of file +endif() diff --git a/tools/export-graph-ops/export-graph-ops.cpp b/tools/export-graph-ops/export-graph-ops.cpp index 3fa08fec3ea..782fd0bc2be 100644 --- a/tools/export-graph-ops/export-graph-ops.cpp +++ b/tools/export-graph-ops/export-graph-ops.cpp @@ -174,4 +174,4 @@ int main(int argc, char ** argv) { } return 0; -} \ No newline at end of file +} From a09708e56d173691369a0862f50769bc73143da4 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Wed, 25 Feb 2026 14:30:54 +0100 Subject: [PATCH 08/18] replace internal API calls with public llama_graph_reserve call --- include/llama.h | 7 +++++++ src/llama-context.cpp | 13 +++++++++++++ tools/export-graph-ops/CMakeLists.txt | 1 - tools/export-graph-ops/export-graph-ops.cpp | 18 ++++-------------- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/include/llama.h b/include/llama.h index 0bd10294cb8..594388282b9 100644 --- a/include/llama.h +++ b/include/llama.h @@ -629,6 +629,13 @@ extern "C" { const char * fname_out, const llama_model_quantize_params * params); + // Reserve a new compute graph. It is valid until the next call to llama_graph_reserve. + LLAMA_API struct ggml_cgraph * llama_graph_reserve( + struct llama_context * ctx, + uint32_t n_tokens, + uint32_t n_seqs, + uint32_t n_outputs); + // // Adapters // diff --git a/src/llama-context.cpp b/src/llama-context.cpp index ee2669c154e..7ccbb502acc 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -3084,6 +3084,19 @@ uint32_t llama_get_sampled_probs_count_ith(llama_context * ctx, int32_t i) { return static_cast(ctx->get_sampled_probs_count(i)); } +struct ggml_cgraph * llama_graph_reserve( + struct llama_context * ctx, + uint32_t n_tokens, + uint32_t n_seqs, + uint32_t n_outputs) { + auto * memory = ctx->get_memory(); + llama_memory_context_ptr mctx; + if (memory) { + mctx = memory->init_full(); + } + return ctx->graph_reserve(n_tokens, n_seqs, n_outputs, mctx.get()); +} + // llama adapter API int32_t llama_set_adapters_lora( diff --git a/tools/export-graph-ops/CMakeLists.txt b/tools/export-graph-ops/CMakeLists.txt index 785a3fa7788..95d2ac891f2 100644 --- a/tools/export-graph-ops/CMakeLists.txt +++ b/tools/export-graph-ops/CMakeLists.txt @@ -1,6 +1,5 @@ set(TARGET llama-export-graph-ops) add_executable(${TARGET} export-graph-ops.cpp) -target_include_directories(${TARGET} PRIVATE ../../src) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/export-graph-ops/export-graph-ops.cpp b/tools/export-graph-ops/export-graph-ops.cpp index 782fd0bc2be..e1aec71c0f3 100644 --- a/tools/export-graph-ops/export-graph-ops.cpp +++ b/tools/export-graph-ops/export-graph-ops.cpp @@ -2,9 +2,6 @@ #include "common.h" #include "log.h" #include "llama.h" -#include "llama-context.h" -#include "llama-graph.h" -#include "llama-memory.h" #include "ggml.h" #include "nlohmann/json.hpp" @@ -130,25 +127,18 @@ int main(int argc, char ** argv) { llama_context * ctx = init_result->context(); - const auto & cparams = ctx->get_cparams(); - const uint32_t n_seqs = cparams.n_seq_max; - const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - - llama_memory_context_ptr mctx; - auto * memory = ctx->get_memory(); - if (memory) { - mctx = memory->init_full(); - } + const uint32_t n_seqs = llama_n_seq_max(ctx); + const uint32_t n_tokens = std::min(llama_n_ctx(ctx), llama_n_ubatch(ctx)); std::set tests; - auto * gf_pp = ctx->graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get()); + auto * gf_pp = llama_graph_reserve(ctx, n_tokens, n_seqs, n_tokens); if (!gf_pp) { throw std::runtime_error("failed to reserve prompt processing graph"); } extract_graph_ops(gf_pp, "pp", tests); - auto * gf_tg = ctx->graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get()); + auto * gf_tg = llama_graph_reserve(ctx, n_seqs, n_seqs, n_seqs); if (!gf_tg) { throw std::runtime_error("failed to reserve token generation graph"); } From c80b146178c5a5e03b7298a35c2c489fcd891055 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Wed, 25 Feb 2026 14:54:26 +0100 Subject: [PATCH 09/18] reduce test description length --- tests/test-backend-ops.cpp | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index e5db0a0f68f..9ca9a9464fd 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -6659,13 +6659,27 @@ struct input_tensor { std::array nb; // strides (0 = use default contiguous strides) }; +static bool is_non_contiguous(const input_tensor & src) { + if (src.nb[0] == 0) { + return false; + } + const size_t default_nb0 = ggml_type_size(src.type); + const size_t default_nb1 = default_nb0 * (src.ne[0] / ggml_blck_size(src.type)); + const size_t default_nb2 = default_nb1 * src.ne[1]; + const size_t default_nb3 = default_nb2 * src.ne[2]; + return src.nb[0] != default_nb0 || + src.nb[1] != default_nb1 || + src.nb[2] != default_nb2 || + src.nb[3] != default_nb3; +} + static std::string var_to_str(const std::vector& sources) { std::ostringstream oss; bool first = true; for (const auto& src : sources) { if (!first) oss << ","; oss << ggml_type_name(src.type) << "[" << src.ne[0] << "," << src.ne[1] << "," << src.ne[2] << "," << src.ne[3] << "]"; - if (src.nb[0] != 0) { + if (is_non_contiguous(src)) { oss << "nb[" << src.nb[0] << "," << src.nb[1] << "," << src.nb[2] << "," << src.nb[3] << "]"; } first = false; @@ -6714,20 +6728,7 @@ struct test_generic_op : public test_case { for (size_t i = 0; i < source_count; ++i) { const input_tensor& src = sources[i]; - // Check if the exported strides differ from default contiguous layout. - bool needs_view = false; - if (src.nb[0] != 0) { - const size_t default_nb0 = ggml_type_size(src.type); - const size_t default_nb1 = default_nb0 * (src.ne[0] / ggml_blck_size(src.type)); - const size_t default_nb2 = default_nb1 * src.ne[1]; - const size_t default_nb3 = default_nb2 * src.ne[2]; - needs_view = (src.nb[0] != default_nb0 || - src.nb[1] != default_nb1 || - src.nb[2] != default_nb2 || - src.nb[3] != default_nb3); - } - - if (needs_view) { + if (is_non_contiguous(src)) { // Compute the total buffer size using the same method as ggml_nbytes size_t total_size; const size_t blck_size = ggml_blck_size(src.type); From 323f01b316047f5bc5a93045185b2f640d710e6b Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Wed, 25 Feb 2026 15:24:30 +0100 Subject: [PATCH 10/18] fix nb[0] not getting set for view --- tests/test-backend-ops.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 9ca9a9464fd..9f16bee3efc 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -6720,7 +6720,6 @@ struct test_generic_op : public test_case { std::vector sources) : op(op), type(type), ne(ne), op_params(op_params), sources(sources) {} - // Define how a simple GGML compute graph can be constructed for the new GGML op. ggml_tensor * build_graph(ggml_context * ctx) override { const size_t source_count = std::min(sources.size(), (size_t)GGML_MAX_SRC); @@ -6729,7 +6728,6 @@ struct test_generic_op : public test_case { const input_tensor& src = sources[i]; if (is_non_contiguous(src)) { - // Compute the total buffer size using the same method as ggml_nbytes size_t total_size; const size_t blck_size = ggml_blck_size(src.type); if (blck_size == 1) { @@ -6752,6 +6750,8 @@ struct test_generic_op : public test_case { source_tensors[i] = ggml_view_4d(ctx, backing, src.ne[0], src.ne[1], src.ne[2], src.ne[3], src.nb[1], src.nb[2], src.nb[3], 0); + // nb[0] does not get set by view_4d, so set it manually + source_tensors[i]->nb[0] = src.nb[0]; } else { source_tensors[i] = ggml_new_tensor_4d(ctx, src.type, src.ne[0], src.ne[1], src.ne[2], src.ne[3]); } From 6e388759a4c38e420bb3db4b527b1df224e922e7 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Wed, 25 Feb 2026 15:54:27 +0100 Subject: [PATCH 11/18] add name to tests --- tests/test-backend-ops.cpp | 18 ++++++++++++++---- tools/export-graph-ops/export-graph-ops.cpp | 6 ++++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 9f16bee3efc..6103c6db0f7 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -6710,15 +6710,20 @@ struct test_generic_op : public test_case { const std::array op_params; const std::vector sources; + const std::string name; std::string vars() override { - return VARS_TO_STR4(type, ne, op_params, sources); + if (name.empty()) { + return VARS_TO_STR4(type, ne, op_params, sources); + } + + return VARS_TO_STR5(name, type, ne, op_params, sources); } test_generic_op(ggml_op op, ggml_type type, std::array ne, std::array op_params, - std::vector sources) - : op(op), type(type), ne(ne), op_params(op_params), sources(sources) {} + std::vector sources, std::string name = "") + : op(op), type(type), ne(ne), op_params(op_params), sources(sources), name(std::move(name)) {} ggml_tensor * build_graph(ggml_context * ctx) override { const size_t source_count = std::min(sources.size(), (size_t)GGML_MAX_SRC); @@ -8986,7 +8991,12 @@ static std::vector> make_test_cases_from_json(const c sources.push_back({(ggml_type)src["type"], src_ne, src_nb}); } - test_cases.emplace_back(new test_generic_op(op, type, ne, op_params, sources)); + std::string name; + if (input_case.contains("name")) { + name = input_case["name"]; + } + + test_cases.emplace_back(new test_generic_op(op, type, ne, op_params, sources, std::move(name))); } return test_cases; diff --git a/tools/export-graph-ops/export-graph-ops.cpp b/tools/export-graph-ops/export-graph-ops.cpp index e1aec71c0f3..5b836cf5b82 100644 --- a/tools/export-graph-ops/export-graph-ops.cpp +++ b/tools/export-graph-ops/export-graph-ops.cpp @@ -34,6 +34,7 @@ struct test_object { std::array ne; std::vector op_params; std::vector sources; + std::string name; nlohmann::json to_json() const { nlohmann::json test; @@ -48,6 +49,10 @@ struct test_object { test["op_params"] = op_params; + if (!name.empty()) { + test["name"] = name; + } + nlohmann::json j_sources = nlohmann::json::array(); for (size_t s = 0; s < sources.size(); s++) { j_sources.push_back({ @@ -98,6 +103,7 @@ static void extract_graph_ops(ggml_cgraph * cgraph, const char * label, std::set test.sources.emplace_back(node->src[s]->type, node->src[s]->ne, node->src[s]->nb); } + test.name = node->name; tests.insert(test); } From 91634e684f866797681b802fa2ab31b81afd93eb Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Sat, 28 Feb 2026 11:41:10 +0100 Subject: [PATCH 12/18] fix inplace error --- tests/test-backend-ops.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 6103c6db0f7..04d70be7780 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -6762,7 +6762,20 @@ struct test_generic_op : public test_case { } } - ggml_tensor * out = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); + // Ops with an inplace flag create a view of src[0] as their output. + bool inplace = false; + if (op == GGML_OP_SET || op == GGML_OP_ACC) { + inplace = op_params[4] != 0; + } else if (op == GGML_OP_ADD_REL_POS) { + inplace = op_params[0] != 0; + } + + ggml_tensor * out; + if (inplace && source_count > 0) { + out = ggml_view_tensor(ctx, source_tensors[0]); + } else { + out = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); + } out->op = op; for (size_t i = 0; i < source_count; ++i) { out->src[i] = source_tensors[i]; From 7feb7a7f67bb5945db8cbd842f4c9ef7f6b5a9e1 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Mon, 2 Mar 2026 13:56:14 +0100 Subject: [PATCH 13/18] use text file instead of json --- common/arg.cpp | 2 +- common/common.h | 2 +- tests/test-backend-ops.cpp | 88 ++++++++++++--------- tools/export-graph-ops/export-graph-ops.cpp | 73 ++++++++--------- 4 files changed, 87 insertions(+), 78 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 24c0efc5cd8..deb647065c1 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2667,7 +2667,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.out_file = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, - LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_JSON})); + LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS})); add_opt(common_arg( {"-ofreq", "--output-frequency"}, "N", string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), diff --git a/common/common.h b/common/common.h index 9f51d9629c8..7df73903d99 100644 --- a/common/common.h +++ b/common/common.h @@ -105,7 +105,7 @@ enum llama_example { LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FIT_PARAMS, LLAMA_EXAMPLE_RESULTS, - LLAMA_EXAMPLE_EXPORT_GRAPH_JSON, + LLAMA_EXAMPLE_EXPORT_GRAPH_OPS, LLAMA_EXAMPLE_COUNT, }; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 04d70be7780..94f5d103307 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -43,8 +43,6 @@ #include #include -#include - #ifdef __EMSCRIPTEN__ # define N_THREADS 1 #else @@ -8967,46 +8965,62 @@ static std::vector> make_test_cases_perf() { return test_cases; } -static std::vector> make_test_cases_from_json(const char * path) { +static std::vector> make_test_cases_from_file(const char * path) { std::ifstream f(path); if (!f.is_open()) { - throw std::runtime_error("Unable to read JSON file"); + throw std::runtime_error("Unable to read test file"); } - nlohmann::json data = nlohmann::json::parse(f); - - GGML_ASSERT(data.is_array()); - std::vector> test_cases; - for (const auto& input_case : data) { - const ggml_op op = input_case["op"]; - const ggml_type type = input_case["type"]; - auto ne_arr = input_case["ne"]; - const std::array ne = {ne_arr[0], ne_arr[1], ne_arr[2], ne_arr[3]}; + std::string line; + + while (std::getline(f, line)) { + std::istringstream iss(line); - auto op_arr = input_case["op_params"]; + ggml_op op; + ggml_type type; + std::array ne; std::array op_params = {}; - for (size_t i = 0; i < op_arr.size() && i < op_params.size(); i++) { - op_params[i] = op_arr[i]; + std::string name; + uint64_t tmp; + + iss >> tmp; + op = (ggml_op)tmp; + iss >> tmp; + type = (ggml_type)tmp; + + for (size_t i = 0; i < 4; i++) { + iss >> ne[i]; } - std::vector sources; - for (const auto& src : input_case["sources"]) { - auto ne_arr = src["ne"]; - const std::array src_ne = {ne_arr[0], ne_arr[1], ne_arr[2], ne_arr[3]}; - std::array src_nb = {}; - if (src.contains("nb")) { - auto nb_arr = src["nb"]; - src_nb = {nb_arr[0], nb_arr[1], nb_arr[2], nb_arr[3]}; + iss >> tmp; + for (size_t i = 0; i < tmp && i < op_params.size(); i++) { + iss >> op_params[i]; + } + + iss >> tmp; + + size_t num_src = std::min((uint64_t)GGML_MAX_SRC, tmp); + std::vector sources(num_src); + for (size_t i = 0; i < num_src; i++) { + input_tensor& src = sources[i]; + iss >> tmp; + src.type = (ggml_type)tmp; + + for (size_t i = 0; i < 4; i++) { + iss >> src.ne[i]; + } + for (size_t i = 0; i < 4; i++) { + iss >> src.nb[i]; } - sources.push_back({(ggml_type)src["type"], src_ne, src_nb}); } - std::string name; - if (input_case.contains("name")) { - name = input_case["name"]; + iss >> name; + + if (name.length() == 1 && name[0] == '-') { + name = ""; } test_cases.emplace_back(new test_generic_op(op, type, ne, op_params, sources, std::move(name))); @@ -9016,7 +9030,7 @@ static std::vector> make_test_cases_from_json(const c } static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_names_filter, const char * params_filter, - printer * output_printer, const char * test_json_path) { + printer * output_printer, const char * test_file_path) { auto filter_test_cases = [](std::vector> & test_cases, const char * params_filter) { if (params_filter == nullptr) { return; @@ -9036,7 +9050,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op std::vector> test_cases; - if (test_json_path == nullptr) { + if (test_file_path == nullptr) { switch (mode) { case MODE_TEST: case MODE_GRAD: @@ -9048,7 +9062,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op break; } } else { - test_cases = make_test_cases_from_json(test_json_path); + test_cases = make_test_cases_from_file(test_file_path); } filter_test_cases(test_cases, params_filter); @@ -9231,7 +9245,7 @@ static void show_test_coverage() { static void usage(char ** argv) { printf("Usage: %s [mode] [-o ] [-b ] [-p ] [--output ] [--list-ops]", argv[0]); - printf(" [--show-coverage] [--test-json ]\n"); + printf(" [--show-coverage] [--test-file ]\n"); printf(" valid modes:\n"); printf(" - test (default, compare with CPU backend for correctness)\n"); printf(" - grad (compare gradients from backpropagation with method of finite differences)\n"); @@ -9242,7 +9256,7 @@ static void usage(char ** argv) { printf(" --output specifies output format (default: console, options: console, sql, csv)\n"); printf(" --list-ops lists all available GGML operations\n"); printf(" --show-coverage shows test coverage\n"); - printf(" --test-json reads test operators from a json\n"); + printf(" --test-file reads test operators from a test file generated by llama-export-graph-ops\n"); } int main(int argc, char ** argv) { @@ -9251,7 +9265,7 @@ int main(int argc, char ** argv) { const char * op_names_filter = nullptr; const char * backend_filter = nullptr; const char * params_filter = nullptr; - const char * test_json_path = nullptr; + const char * test_file_path = nullptr; for (int i = 1; i < argc; i++) { if (strcmp(argv[i], "test") == 0) { @@ -9299,9 +9313,9 @@ int main(int argc, char ** argv) { } else if (strcmp(argv[i], "--show-coverage") == 0) { show_test_coverage(); return 0; - } else if (strcmp(argv[i], "--test-json") == 0) { + } else if (strcmp(argv[i], "--test-file") == 0) { if (i + 1 < argc) { - test_json_path = argv[++i]; + test_file_path = argv[++i]; } else { usage(argv); return 1; @@ -9358,7 +9372,7 @@ int main(int argc, char ** argv) { false, "", ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024, true)); - bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get(), test_json_path); + bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get(), test_file_path); if (ok) { n_ok++; diff --git a/tools/export-graph-ops/export-graph-ops.cpp b/tools/export-graph-ops/export-graph-ops.cpp index 5b836cf5b82..a8a075fa875 100644 --- a/tools/export-graph-ops/export-graph-ops.cpp +++ b/tools/export-graph-ops/export-graph-ops.cpp @@ -4,8 +4,6 @@ #include "llama.h" #include "ggml.h" -#include "nlohmann/json.hpp" - #include #include #include @@ -26,6 +24,16 @@ struct input_tensor { return std::tie(type, ne, nb) < std::tie(b.type, b.ne, b.nb); } + + void serialize(std::ostream& out) const { + out << type << ' '; + for (size_t i = 0; i < 4; i++) { + out << ne[i] << ' '; + } + for (size_t i = 0; i < 4; i++) { + out << nb[i] << ' '; + } + } }; struct test_object { @@ -36,36 +44,29 @@ struct test_object { std::vector sources; std::string name; - nlohmann::json to_json() const { - nlohmann::json test; - - test["op"] = op; - test["op_name"] = ggml_op_name(op); - - test["type"] = type; - test["type_name"] = ggml_type_name(type); - - test["ne"] = { ne[0], ne[1], ne[2], ne[3] }; - - test["op_params"] = op_params; + void serialize(std::ostream& out) const { + out << op << ' ' << type << ' '; + for (size_t i = 0; i < 4; i++) { + out << ne[i] << ' '; + } - if (!name.empty()) { - test["name"] = name; + out << op_params.size() << ' '; + for (size_t i = 0; i < op_params.size(); i++) { + out << op_params[i] << ' '; } - nlohmann::json j_sources = nlohmann::json::array(); + out << sources.size() << ' '; for (size_t s = 0; s < sources.size(); s++) { - j_sources.push_back({ - {"type", sources[s].type}, - {"type_name", ggml_type_name(sources[s].type)}, - {"ne", { sources[s].ne[0], sources[s].ne[1], sources[s].ne[2], sources[s].ne[3] }}, - {"nb", { sources[s].nb[0], sources[s].nb[1], sources[s].nb[2], sources[s].nb[3] }}, - }); + sources[s].serialize(out); } - test["sources"] = j_sources; + if (!name.empty()) { + out << name; + } else { + out << '-'; + } - return test; + out << '\n'; } bool operator<(const test_object &b) const { @@ -114,8 +115,9 @@ static void extract_graph_ops(ggml_cgraph * cgraph, const char * label, std::set int main(int argc, char ** argv) { common_params params; + params.out_file = "tests.txt"; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_JSON)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS)) { return 1; } @@ -152,21 +154,14 @@ int main(int argc, char ** argv) { LOG_INF("%d unique ops total\n", (int) tests.size()); - nlohmann::json output_list = nlohmann::json::array(); - for (const auto& test : tests) { - output_list.push_back(test.to_json()); - } + std::ofstream f(params.out_file); - if (!params.out_file.empty()) { - std::ofstream f(params.out_file); - - if (!f.is_open()) { - throw std::runtime_error("Unable to open output file"); - } + if (!f.is_open()) { + throw std::runtime_error("Unable to open output file"); + } - f << output_list.dump(2) << std::endl; - } else { - std::cout << output_list.dump(2) << std::endl; + for (const auto& test : tests) { + test.serialize(f); } return 0; From 8055a050eb4b7d60d85e1f1bf94405ea24d1c6e2 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Fri, 6 Mar 2026 15:06:16 +0100 Subject: [PATCH 14/18] move llama_graph_reserve function to new llama-ext header, move export-graph-ops to tests/ --- include/llama.h | 7 ------- src/llama-ext.h | 15 +++++++++++++++ tests/CMakeLists.txt | 4 +++- .../export-graph-ops.cpp | 1 + tools/CMakeLists.txt | 1 - tools/export-graph-ops/CMakeLists.txt | 8 -------- 6 files changed, 19 insertions(+), 17 deletions(-) create mode 100644 src/llama-ext.h rename {tools/export-graph-ops => tests}/export-graph-ops.cpp (99%) delete mode 100644 tools/export-graph-ops/CMakeLists.txt diff --git a/include/llama.h b/include/llama.h index 594388282b9..0bd10294cb8 100644 --- a/include/llama.h +++ b/include/llama.h @@ -629,13 +629,6 @@ extern "C" { const char * fname_out, const llama_model_quantize_params * params); - // Reserve a new compute graph. It is valid until the next call to llama_graph_reserve. - LLAMA_API struct ggml_cgraph * llama_graph_reserve( - struct llama_context * ctx, - uint32_t n_tokens, - uint32_t n_seqs, - uint32_t n_outputs); - // // Adapters // diff --git a/src/llama-ext.h b/src/llama-ext.h new file mode 100644 index 00000000000..9b58e0abb5d --- /dev/null +++ b/src/llama-ext.h @@ -0,0 +1,15 @@ +#ifndef LLAMA_EXT_H +#define LLAMA_EXT_H + +#include "llama-context.h" +#include "ggml.h" +#include "stdint.h" + +// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve. +struct ggml_cgraph * llama_graph_reserve( + struct llama_context * ctx, + uint32_t n_tokens, + uint32_t n_seqs, + uint32_t n_outputs); + +#endif // LLAMA_EXT_H diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index bb0f0ef0ed8..9582164b580 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -260,6 +260,7 @@ endif() set(LLAMA_TEST_NAME test-mtmd-c-api) llama_build_and_test(test-mtmd-c-api.c) target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd) +unset(LLAMA_TEST_NAME) # GGUF model data fetcher library for tests that need real model metadata # Only compile when cpp-httplib has SSL support (CPPHTTPLIB_OPENSSL_SUPPORT) @@ -284,4 +285,5 @@ target_link_libraries(${TEST_TARGET} PRIVATE llama) llama_build_and_test(test-alloc.cpp) target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src) - +llama_build(export-graph-ops.cpp) +target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src) diff --git a/tools/export-graph-ops/export-graph-ops.cpp b/tests/export-graph-ops.cpp similarity index 99% rename from tools/export-graph-ops/export-graph-ops.cpp rename to tests/export-graph-ops.cpp index a8a075fa875..754089d068e 100644 --- a/tools/export-graph-ops/export-graph-ops.cpp +++ b/tests/export-graph-ops.cpp @@ -2,6 +2,7 @@ #include "common.h" #include "log.h" #include "llama.h" +#include "../src/llama-ext.h" #include "ggml.h" #include diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 849bdbafc42..b433c91d85e 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -39,5 +39,4 @@ else() endif() add_subdirectory(fit-params) add_subdirectory(results) - add_subdirectory(export-graph-ops) endif() diff --git a/tools/export-graph-ops/CMakeLists.txt b/tools/export-graph-ops/CMakeLists.txt deleted file mode 100644 index 95d2ac891f2..00000000000 --- a/tools/export-graph-ops/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -set(TARGET llama-export-graph-ops) -add_executable(${TARGET} export-graph-ops.cpp) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) - -if(LLAMA_TOOLS_INSTALL) - install(TARGETS ${TARGET} RUNTIME) -endif() From cd7fe034f8850db00bee29b14866143beb24ca05 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Sat, 7 Mar 2026 07:33:11 +0100 Subject: [PATCH 15/18] fix missing declaration --- src/llama-context.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 7ccbb502acc..493b234f011 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -7,6 +7,7 @@ #include "llama-memory.h" #include "llama-mmap.h" #include "llama-model.h" +#include "llama-ext.h" #include #include From a0c532c8116f72afc46e1c9c04c5d89d96a70225 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Tue, 10 Mar 2026 16:02:27 +0100 Subject: [PATCH 16/18] use pragma once --- src/llama-ext.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/llama-ext.h b/src/llama-ext.h index 9b58e0abb5d..b3ea9108e0d 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -1,5 +1,4 @@ -#ifndef LLAMA_EXT_H -#define LLAMA_EXT_H +#pragma once #include "llama-context.h" #include "ggml.h" @@ -11,5 +10,3 @@ struct ggml_cgraph * llama_graph_reserve( uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs); - -#endif // LLAMA_EXT_H From 3e21a58d8aed6c78e8bac2cf966f799120a79104 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Tue, 10 Mar 2026 16:08:52 +0100 Subject: [PATCH 17/18] fix indent --- common/arg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/arg.cpp b/common/arg.cpp index deb647065c1..69092d6f9ea 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2667,7 +2667,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.out_file = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, - LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS})); + LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS})); add_opt(common_arg( {"-ofreq", "--output-frequency"}, "N", string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), From 062e7b1c98f073a039b9a4a8918ce26bdbd58f88 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Wed, 11 Mar 2026 12:38:51 +0100 Subject: [PATCH 18/18] fix Windows build --- src/llama-ext.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-ext.h b/src/llama-ext.h index b3ea9108e0d..13ced783b42 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -5,7 +5,7 @@ #include "stdint.h" // Reserve a new compute graph. It is valid until the next call to llama_graph_reserve. -struct ggml_cgraph * llama_graph_reserve( +LLAMA_API struct ggml_cgraph * llama_graph_reserve( struct llama_context * ctx, uint32_t n_tokens, uint32_t n_seqs,