Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2666,7 +2666,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.out_file = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_RESULTS}));
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE,
LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS}));
add_opt(common_arg(
{"-ofreq", "--output-frequency"}, "N",
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
Expand Down
1 change: 1 addition & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ enum llama_example {
LLAMA_EXAMPLE_FINETUNE,
LLAMA_EXAMPLE_FIT_PARAMS,
LLAMA_EXAMPLE_RESULTS,
LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,

LLAMA_EXAMPLE_COUNT,
};
Expand Down
14 changes: 14 additions & 0 deletions src/llama-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "llama-memory.h"
#include "llama-mmap.h"
#include "llama-model.h"
#include "llama-ext.h"

#include <cinttypes>
#include <cmath>
Expand Down Expand Up @@ -3084,6 +3085,19 @@ uint32_t llama_get_sampled_probs_count_ith(llama_context * ctx, int32_t i) {
return static_cast<uint32_t>(ctx->get_sampled_probs_count(i));
}

struct ggml_cgraph * llama_graph_reserve(
struct llama_context * ctx,
uint32_t n_tokens,
uint32_t n_seqs,
uint32_t n_outputs) {
auto * memory = ctx->get_memory();
llama_memory_context_ptr mctx;
if (memory) {
mctx = memory->init_full();
}
return ctx->graph_reserve(n_tokens, n_seqs, n_outputs, mctx.get());
}

// llama adapter API

int32_t llama_set_adapters_lora(
Expand Down
12 changes: 12 additions & 0 deletions src/llama-ext.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#pragma once

#include "llama-context.h"
#include "ggml.h"
#include "stdint.h"

// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
LLAMA_API struct ggml_cgraph * llama_graph_reserve(
struct llama_context * ctx,
uint32_t n_tokens,
uint32_t n_seqs,
uint32_t n_outputs);
4 changes: 3 additions & 1 deletion tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,7 @@ endif()
set(LLAMA_TEST_NAME test-mtmd-c-api)
llama_build_and_test(test-mtmd-c-api.c)
target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd)
unset(LLAMA_TEST_NAME)

# GGUF model data fetcher library for tests that need real model metadata
# Only compile when cpp-httplib has SSL support (CPPHTTPLIB_OPENSSL_SUPPORT)
Expand All @@ -284,4 +285,5 @@ target_link_libraries(${TEST_TARGET} PRIVATE llama)
llama_build_and_test(test-alloc.cpp)
target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)


llama_build(export-graph-ops.cpp)
target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
169 changes: 169 additions & 0 deletions tests/export-graph-ops.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include "../src/llama-ext.h"
#include "ggml.h"

#include <array>
#include <vector>
#include <set>
#include <fstream>
#include <iostream>

struct input_tensor {
ggml_type type;
std::array<int64_t, 4> ne;
std::array<size_t, 4> nb;

input_tensor(ggml_type type, int64_t * ne, size_t * nb): type(type) {
memcpy(this->ne.data(), ne, 4 * sizeof(int64_t));
memcpy(this->nb.data(), nb, 4 * sizeof(size_t));
}

bool operator<(const input_tensor &b) const {
return std::tie(type, ne, nb) <
std::tie(b.type, b.ne, b.nb);
}

void serialize(std::ostream& out) const {
out << type << ' ';
for (size_t i = 0; i < 4; i++) {
out << ne[i] << ' ';
}
for (size_t i = 0; i < 4; i++) {
out << nb[i] << ' ';
}
}
};

struct test_object {
ggml_op op;
ggml_type type;
std::array<int64_t, 4> ne;
std::vector<int32_t> op_params;
std::vector<input_tensor> sources;
std::string name;

void serialize(std::ostream& out) const {
out << op << ' ' << type << ' ';
for (size_t i = 0; i < 4; i++) {
out << ne[i] << ' ';
}

out << op_params.size() << ' ';
for (size_t i = 0; i < op_params.size(); i++) {
out << op_params[i] << ' ';
}

out << sources.size() << ' ';
for (size_t s = 0; s < sources.size(); s++) {
sources[s].serialize(out);
}

if (!name.empty()) {
out << name;
} else {
out << '-';
}

out << '\n';
}

bool operator<(const test_object &b) const {
return std::tie(op, type, ne, op_params, sources) <
std::tie(b.op, b.type, b.ne, b.op_params, b.sources);
}
};

static void extract_graph_ops(ggml_cgraph * cgraph, const char * label, std::set<test_object> & tests) {
int n_nodes = ggml_graph_n_nodes(cgraph);
int n_skipped = 0;
int n_before = (int) tests.size();
for (int i = 0; i < n_nodes; i++) {
ggml_tensor * node = ggml_graph_node(cgraph, i);

if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE) {
n_skipped++;
continue;
}

test_object test;

test.op = node->op;
test.type = node->type;
memcpy(&test.ne, node->ne, 4 * sizeof(int64_t));

test.op_params.resize(GGML_MAX_OP_PARAMS / sizeof(int32_t));
memcpy(test.op_params.data(), node->op_params, GGML_MAX_OP_PARAMS);

for (size_t s = 0; s < GGML_MAX_SRC; s++) {
if (node->src[s] == nullptr) {
break;
}

test.sources.emplace_back(node->src[s]->type, node->src[s]->ne, node->src[s]->nb);
}

test.name = node->name;
tests.insert(test);
}

int n_new = (int) tests.size() - n_before;
LOG_INF("%s: %d unique ops, %d total nodes, %d skipped (view ops)\n",
label, n_new, n_nodes, n_skipped);
}

int main(int argc, char ** argv) {
common_params params;
params.out_file = "tests.txt";

if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS)) {
return 1;
}

common_init();

// Load CPU-only
ggml_backend_dev_t cpu_device = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
params.devices = { cpu_device, nullptr };
params.fit_params = false;
params.n_gpu_layers = 0;

params.warmup = false;

auto init_result = common_init_from_params(params);

llama_context * ctx = init_result->context();

const uint32_t n_seqs = llama_n_seq_max(ctx);
const uint32_t n_tokens = std::min(llama_n_ctx(ctx), llama_n_ubatch(ctx));

std::set<test_object> tests;

auto * gf_pp = llama_graph_reserve(ctx, n_tokens, n_seqs, n_tokens);
if (!gf_pp) {
throw std::runtime_error("failed to reserve prompt processing graph");
}
extract_graph_ops(gf_pp, "pp", tests);

auto * gf_tg = llama_graph_reserve(ctx, n_seqs, n_seqs, n_seqs);
if (!gf_tg) {
throw std::runtime_error("failed to reserve token generation graph");
}
extract_graph_ops(gf_tg, "tg", tests);

LOG_INF("%d unique ops total\n", (int) tests.size());

std::ofstream f(params.out_file);

if (!f.is_open()) {
throw std::runtime_error("Unable to open output file");
}

for (const auto& test : tests) {
test.serialize(f);
}

return 0;
}
Loading
Loading