Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ggml/include/gguf.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ extern "C" {
GGML_API struct gguf_context * gguf_init_empty(void);
GGML_API struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params);
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
//GGML_API struct gguf_context * gguf_init_from_buffer(..);
GGML_API struct gguf_context * gguf_init_from_buffer(const void * data, size_t size, struct gguf_init_params params);

GGML_API void gguf_free(struct gguf_context * ctx);

Expand Down
91 changes: 79 additions & 12 deletions ggml/src/gguf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,10 @@ struct gguf_reader {
nbytes_remain = file_remain(file);
}

gguf_reader(const void * data, size_t size)
: data(static_cast<const uint8_t *>(data)), nbytes_remain(size) {
}

// helper for remaining bytes in a file
static uint64_t file_remain(FILE * file) {
const int64_t cur = gguf_ftell(file);
Expand Down Expand Up @@ -260,7 +264,7 @@ struct gguf_reader {
if (nbytes_remain < size) {
return false;
}
const size_t nread = fread(&dst, 1, size, file);
const size_t nread = read_raw(&dst, size);
nbytes_remain -= nread;
return nread == size;
}
Expand Down Expand Up @@ -344,7 +348,7 @@ struct gguf_reader {
return false;
}
dst.resize(static_cast<size_t>(size));
const size_t nread = fread(dst.data(), 1, size, file);
const size_t nread = read_raw(dst.data(), static_cast<size_t>(size));
nbytes_remain -= nread;
return nread == size;
}
Expand All @@ -353,14 +357,64 @@ struct gguf_reader {
if (size > nbytes_remain) {
return false;
}
const size_t nread = fread(dst, 1, size, file);
const size_t nread = read_raw(dst, size);
nbytes_remain -= nread;
return nread == size;
}

uint64_t tell() const {
if (file != nullptr) {
const int64_t cur = gguf_ftell(file);
return cur < 0
? 0
: static_cast<uint64_t>(cur);
}

return data_offset;
}

bool seek(uint64_t absolute_offset) const {
if (file != nullptr) {
const int64_t cur = gguf_ftell(file);
const uint64_t end_offset = cur < 0
? nbytes_remain
: static_cast<uint64_t>(cur) + nbytes_remain;

if (absolute_offset > end_offset || gguf_fseek(file, absolute_offset, SEEK_SET) != 0) {
return false;
}

nbytes_remain = end_offset - absolute_offset;
} else {
const uint64_t end_offset = data_offset + nbytes_remain;
if (absolute_offset > end_offset) {
return false;
}

data_offset = static_cast<size_t>(absolute_offset);
nbytes_remain = end_offset - absolute_offset;
}

return true;
}

private:
FILE * file;
size_t read_raw(void * dst, size_t size) const {
if (file != nullptr) {
return fread(dst, 1, size, file);
} else if (data == nullptr || size > nbytes_remain || data_offset + size < data_offset) {
return 0;
}

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
} else if (data == nullptr || size > nbytes_remain || data_offset + size < data_offset) {
return 0;
}
}
if (data == nullptr || size > nbytes_remain) {
return 0;
}

The last check makes no sense to me. Is it a bug?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I meant to check for an overflow there since both data_offset and size are unsigned


memcpy(dst, data + data_offset, size);
data_offset += size;
return size;
}

FILE * file = nullptr;
const uint8_t * data = nullptr;

mutable size_t data_offset = 0;
mutable uint64_t nbytes_remain;
};

Expand Down Expand Up @@ -394,12 +448,7 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct
return true;
}

struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) {
if (!file) {
return nullptr;
}

const struct gguf_reader gr(file);
static struct gguf_context * gguf_init_from_reader(const struct gguf_reader & gr, struct gguf_init_params params) {
struct gguf_context * ctx = new gguf_context;

bool ok = true;
Expand Down Expand Up @@ -700,14 +749,14 @@ struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_para
GGML_ASSERT(int64_t(ctx->info.size()) == n_tensors);

// we require the data section to be aligned, so take into account any padding
if (gguf_fseek(file, GGML_PAD(gguf_ftell(file), ctx->alignment), SEEK_SET) != 0) {
if (!gr.seek(GGML_PAD(gr.tell(), ctx->alignment))) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (!gr.seek(GGML_PAD(gr.tell(), ctx->alignment))) {
if (n_tensors > 0 && !gr.seek(GGML_PAD(gr.tell(), ctx->alignment))) {

The CI is failing on the edge case of a GGUF file with 0 tensors in which case the file size is not padded. I would say to just handle that edge case like this.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

GGML_LOG_ERROR("%s: failed to seek to beginning of data section\n", __func__);
gguf_free(ctx);
return nullptr;
}

// store the current file offset - this is where the data section starts
ctx->offset = gguf_ftell(file);
ctx->offset = gr.tell();

// compute the total size of the data section, taking into account the alignment
{
Expand Down Expand Up @@ -844,6 +893,24 @@ struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_para
return ctx;
}

struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) {
if (!file) {
return nullptr;
}

const struct gguf_reader gr(file);
return gguf_init_from_reader(gr, params);
}

struct gguf_context * gguf_init_from_buffer(const void * data, size_t size, struct gguf_init_params params) {
if (data == nullptr || size == 0) {
return nullptr;
}

const struct gguf_reader gr(data, size);
return gguf_init_from_reader(gr, params);
}

struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
FILE * file = ggml_fopen(fname, "rb");

Expand Down
20 changes: 19 additions & 1 deletion src/llama-model-loader.cpp

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are you changing the model loader?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When loading a model using llama_model_init_from_user from a gguf metadata gguf_context created with gguf_init_from_buffer then llama_model_loader::files is empty and so the result of the memory_breakdown of the model when loading with no_alloc=true is off and doesn't align with the results we would get when loading the same model from a file.
The changes I made here fix that

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A bugfix like that should be its own PR, remove the model loader changes from this one.

@giladgd giladgd Apr 27, 2026

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. Will open a new PR once this one gets merged.

Update: opened #22566

Original file line number Diff line number Diff line change
Expand Up @@ -697,7 +697,9 @@ llama_model_loader::llama_model_loader(
}

n_kv = gguf_get_n_kv(metadata);
n_tensors = weights_map.size();
n_tensors = files.empty()
? gguf_get_n_tensors(metadata)
: weights_map.size();

fver = (enum llama_fver) gguf_get_version(metadata);

Expand Down Expand Up @@ -1218,6 +1220,12 @@ struct ggml_tensor * llama_model_loader::create_tensor(
const int64_t tid = gguf_find_tensor(metadata, tn.str().c_str());
if (tid != -1) {
type = gguf_get_tensor_type(metadata, tid);
} else if (no_alloc) {
if (flags & TENSOR_NOT_REQUIRED) {
return nullptr;
}

throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
}

// for tensors that are not required some of the dimensions can be invalid:
Expand All @@ -1243,6 +1251,16 @@ struct ggml_tensor * llama_model_loader::create_tensor(
ggml_backend_buffer_type_t buft = buft_for_tensor(&t_meta);
GGML_ASSERT(buft != nullptr);
ggml_context * ctx = ctx_for_buft(buft);

if (flags & TENSOR_DUPLICATED) {
ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
if (t) {
return t;
}
} else {
n_created++;
}

ggml_tensor * ret = ggml_dup_tensor(ctx, &t_meta);
ggml_set_name(ret, tn.str().c_str());
return ret;
Expand Down
4 changes: 4 additions & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,10 @@ llama_build_and_test(test-gguf.cpp)
llama_build_and_test(test-backend-ops.cpp)

llama_build_and_test(test-model-load-cancel.cpp LABEL "model")

llama_build_and_test(test-model-load-buffer.cpp LABEL "model" ARGS "${MODEL_DEST}")
set_tests_properties(test-model-load-buffer PROPERTIES FIXTURES_REQUIRED test-download-model)

llama_build_and_test(test-autorelease.cpp LABEL "model")
llama_build_and_test(test-backend-sampler.cpp LABEL "model")

Expand Down
37 changes: 33 additions & 4 deletions tests/test-gguf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,22 @@ static void helper_write(FILE * file, const void * data, const size_t nbytes) {
GGML_ASSERT(fwrite(data, 1, nbytes, file) == nbytes);
}

static std::vector<uint8_t> read_file_to_buffer(FILE * file) {
GGML_ASSERT(file != nullptr);
GGML_ASSERT(fseek(file, 0, SEEK_END) == 0);

const long size = ftell(file);
GGML_ASSERT(size >= 0);

rewind(file);

std::vector<uint8_t> data(static_cast<size_t>(size));
GGML_ASSERT(fread(data.data(), 1, data.size(), file) == data.size());

rewind(file);
return data;
}

static FILE * get_handcrafted_file(const unsigned int seed, const enum handcrafted_file_type hft, const int extra_bytes = 0) {
FILE * file = tmpfile();

Expand Down Expand Up @@ -1095,10 +1111,11 @@ static bool same_tensor_data(const struct ggml_context * orig, const struct ggml
return ok;
}

static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta) {
static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta, const bool from_buffer = false) {
ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
printf("%s: device=%s, backend=%s, only_meta=%s\n",
__func__, ggml_backend_dev_description(dev), ggml_backend_name(backend), only_meta ? "yes" : "no");
printf("%s: device=%s, backend=%s, only_meta=%s, from_buffer=%s\n",
__func__, ggml_backend_dev_description(dev), ggml_backend_name(backend),
only_meta ? "yes" : "no", from_buffer ? "yes" : "no");

int npass = 0;
int ntest = 0;
Expand Down Expand Up @@ -1133,7 +1150,14 @@ static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned
/*no_alloc =*/ false,
/*ctx =*/ only_meta ? nullptr : &ctx_1,
};
struct gguf_context * gguf_ctx_1 = gguf_init_from_file_ptr(file, gguf_params);
struct gguf_context * gguf_ctx_1 = nullptr;

if (from_buffer) {
const std::vector<uint8_t> data = read_file_to_buffer(file);
gguf_ctx_1 = gguf_init_from_buffer(data.data(), data.size(), gguf_params);
} else {
gguf_ctx_1 = gguf_init_from_file_ptr(file, gguf_params);
}

printf("%s: same_version: ", __func__);
if (gguf_get_version(gguf_ctx_0) == gguf_get_version(gguf_ctx_1)) {
Expand Down Expand Up @@ -1347,6 +1371,11 @@ int main(int argc, char ** argv) {
npass += result.first;
ntest += result.second;
}
{
std::pair<int, int> result = test_roundtrip(dev, seed, /*only_meta=*/false, /*from_buffer=*/true);
npass += result.first;
ntest += result.second;
}

{
std::pair<int, int> result = test_gguf_set_kv(dev, seed);
Expand Down
109 changes: 109 additions & 0 deletions tests/test-model-load-buffer.cpp

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove this file, the test cases in test-gguf.cpp should suffice.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added this file to validate the fix I made in llama-model-loader.cpp and ensure it doesn't break with future changes.
The crux of this test file is the comparison of ->memory_breakdown() from the model loaded from a file and the one loaded from a buffer.

If you remove the changes I made in llama-model-loader.cpp and run this test it'll fail

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I removed this file and will introduce it back in the next PR with the fixes to llama-model-loader.cpp

Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#include "ggml-backend.h"
#include "get-model.h"
#include "llama.h"
#include "gguf.h"

#include "../src/llama-model.h"

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <vector>

static std::vector<uint8_t> read_file_to_buffer(FILE * file) {
if (file == nullptr || fseek(file, 0, SEEK_END) != 0) {
return {};
}

const long size = ftell(file);
if (size < 0) {
return {};
}

rewind(file);

std::vector<uint8_t> data(static_cast<size_t>(size));
if (fread(data.data(), 1, data.size(), file) != data.size()) {
return {};
}

return data;
}

static void set_tensor_data_noop(struct ggml_tensor * tensor, void * userdata) {
GGML_UNUSED(tensor);
GGML_UNUSED(userdata);
}

int main(int argc, char * argv[]) {
char * model_path = get_model_or_exit(argc, argv);
FILE * file = fopen(model_path, "rb");
if (file == nullptr) {
fprintf(stderr, "failed to open model at '%s'\n", model_path);
return EXIT_FAILURE;
}

const std::vector<uint8_t> data = read_file_to_buffer(file);
fclose(file);
if (data.empty()) {
fprintf(stderr, "failed to read model at '%s'\n", model_path);
return EXIT_FAILURE;
}

llama_backend_init();

ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (cpu_dev == nullptr) {
llama_backend_free();
return EXIT_FAILURE;
}

ggml_backend_dev_t devices[] = { cpu_dev, nullptr };

llama_model_params model_params = llama_model_default_params();
model_params.devices = devices;
model_params.no_alloc = true;
model_params.use_mmap = false;
model_params.progress_callback = [](float progress, void * user_data) {
GGML_UNUSED(progress);
GGML_UNUSED(user_data);
return true;
};

gguf_init_params gguf_params = {
/*.no_alloc = */ true,
/*.ctx = */ nullptr,
};
gguf_context * gguf_ctx = gguf_init_from_buffer(data.data(), data.size(), gguf_params);
if (gguf_ctx == nullptr || gguf_get_n_tensors(gguf_ctx) <= 0) {
gguf_free(gguf_ctx);
llama_backend_free();
return EXIT_FAILURE;
}

llama_model * model_from_file = llama_model_load_from_file(model_path, model_params);
if (model_from_file == nullptr) {
gguf_free(gguf_ctx);
llama_backend_free();
return EXIT_FAILURE;
}

llama_model * model_from_buffer = llama_model_init_from_user(gguf_ctx, set_tensor_data_noop, nullptr, model_params);
if (model_from_buffer == nullptr) {
llama_model_free(model_from_file);
gguf_free(gguf_ctx);
llama_backend_free();
return EXIT_FAILURE;
}

const auto mb_from_file = model_from_file->memory_breakdown();
const auto mb_from_buffer = model_from_buffer->memory_breakdown();
const bool ok = !mb_from_file.empty() && mb_from_file == mb_from_buffer;

llama_model_free(model_from_buffer);
llama_model_free(model_from_file);
gguf_free(gguf_ctx);
llama_backend_free();

return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}