ggml-org · JohannesGaessler · May 25, 2026 · Apr 24, 2026 · Apr 24, 2026 · Apr 25, 2026
@@ -79,7 +79,7 @@ extern "C" {
     GGML_API struct gguf_context * gguf_init_empty(void);
     GGML_API struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params);
     GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
-    //GGML_API struct gguf_context * gguf_init_from_buffer(..);
+    GGML_API struct gguf_context * gguf_init_from_buffer(const void * data, size_t size, struct gguf_init_params params);
 
     GGML_API void gguf_free(struct gguf_context * ctx);
 

@@ -233,6 +233,10 @@ struct gguf_reader {
         nbytes_remain = file_remain(file);
     }
 
+    gguf_reader(const void * data, size_t size)
+        : data(static_cast<const uint8_t *>(data)), nbytes_remain(size) {
+    }
+
     // helper for remaining bytes in a file
     static uint64_t file_remain(FILE * file) {
         const int64_t cur = gguf_ftell(file);
@@ -260,7 +264,7 @@ struct gguf_reader {
         if (nbytes_remain < size) {
             return false;
         }
-        const size_t nread = fread(&dst, 1, size, file);
+        const size_t nread = read_raw(&dst, size);
         nbytes_remain -= nread;
         return nread == size;
     }
@@ -344,7 +348,7 @@ struct gguf_reader {
             return false;
         }
         dst.resize(static_cast<size_t>(size));
-        const size_t nread = fread(dst.data(), 1, size, file);
+        const size_t nread = read_raw(dst.data(), static_cast<size_t>(size));
         nbytes_remain -= nread;
         return nread == size;
     }
@@ -353,14 +357,64 @@ struct gguf_reader {
         if (size > nbytes_remain) {
             return false;
         }
-        const size_t nread = fread(dst, 1, size, file);
+        const size_t nread = read_raw(dst, size);
         nbytes_remain -= nread;
         return nread == size;
     }
 
+    uint64_t tell() const {
+        if (file != nullptr) {
+            const int64_t cur = gguf_ftell(file);
+            return cur < 0
+                ? 0
+                : static_cast<uint64_t>(cur);
+        }
+
+        return data_offset;
+    }
+
+    bool seek(uint64_t absolute_offset) const {
+        if (file != nullptr) {
+            const int64_t cur = gguf_ftell(file);
+            const uint64_t end_offset = cur < 0
+                ? nbytes_remain
+                : static_cast<uint64_t>(cur) + nbytes_remain;
+
+            if (absolute_offset > end_offset || gguf_fseek(file, absolute_offset, SEEK_SET) != 0) {
+                return false;
+            }
+
+            nbytes_remain = end_offset - absolute_offset;
+        } else {
+            const uint64_t end_offset = data_offset + nbytes_remain;
+            if (absolute_offset > end_offset) {
+                return false;
+            }
+
+            data_offset   = static_cast<size_t>(absolute_offset);
+            nbytes_remain = end_offset - absolute_offset;
+        }
+
+        return true;
+    }
+
 private:
-    FILE * file;
+    size_t read_raw(void * dst, size_t size) const {
+        if (file != nullptr) {
+            return fread(dst, 1, size, file);
+        } else if (data == nullptr || size > nbytes_remain || data_offset + size < data_offset) {
+            return 0;
+        }
-        } else if (data == nullptr || size > nbytes_remain || data_offset + size < data_offset) {
-            return 0;
-        }
+        }
+        if (data == nullptr || size > nbytes_remain) {
+            return 0;
+        }
-        } else if (data == nullptr || size > nbytes_remain || data_offset + size < data_offset) {
-            return 0;
-        }
+        }
+        if (data == nullptr || size > nbytes_remain) {
+            return 0;
+        }
 
+        memcpy(dst, data + data_offset, size);
+        data_offset += size;
+        return size;
+    }
+
+    FILE * file = nullptr;
+    const uint8_t * data = nullptr;
+
+    mutable size_t data_offset = 0;
     mutable uint64_t nbytes_remain;
 };
 
@@ -394,12 +448,7 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct
     return true;
 }
 
-struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) {
-    if (!file) {
-        return nullptr;
-    }
-
-    const struct gguf_reader gr(file);
+static struct gguf_context * gguf_init_from_reader(const struct gguf_reader & gr, struct gguf_init_params params) {
     struct gguf_context * ctx = new gguf_context;
 
     bool ok = true;
@@ -700,14 +749,14 @@ struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_para
     GGML_ASSERT(int64_t(ctx->info.size()) == n_tensors);
 
     // we require the data section to be aligned, so take into account any padding
-    if (gguf_fseek(file, GGML_PAD(gguf_ftell(file), ctx->alignment), SEEK_SET) != 0) {
+    if (!gr.seek(GGML_PAD(gr.tell(), ctx->alignment))) {
-    if (!gr.seek(GGML_PAD(gr.tell(), ctx->alignment))) {
+    if (n_tensors > 0 && !gr.seek(GGML_PAD(gr.tell(), ctx->alignment))) {
-    if (!gr.seek(GGML_PAD(gr.tell(), ctx->alignment))) {
+    if (n_tensors > 0 && !gr.seek(GGML_PAD(gr.tell(), ctx->alignment))) {
         GGML_LOG_ERROR("%s: failed to seek to beginning of data section\n", __func__);
         gguf_free(ctx);
         return nullptr;
     }
 
     // store the current file offset - this is where the data section starts
-    ctx->offset = gguf_ftell(file);
+    ctx->offset = gr.tell();
 
     // compute the total size of the data section, taking into account the alignment
     {
@@ -844,6 +893,24 @@ struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_para
     return ctx;
 }
 
+struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) {
+    if (!file) {
+        return nullptr;
+    }
+
+    const struct gguf_reader gr(file);
+    return gguf_init_from_reader(gr, params);
+}
+
+struct gguf_context * gguf_init_from_buffer(const void * data, size_t size, struct gguf_init_params params) {
+    if (data == nullptr || size == 0) {
+        return nullptr;
+    }
+
+    const struct gguf_reader gr(data, size);
+    return gguf_init_from_reader(gr, params);
+}
+
 struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
     FILE * file = ggml_fopen(fname, "rb");
 

diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
@@ -697,7 +697,9 @@ llama_model_loader::llama_model_loader(
     }
 
     n_kv      = gguf_get_n_kv(metadata);
-    n_tensors = weights_map.size();
+    n_tensors = files.empty()
+        ? gguf_get_n_tensors(metadata)
+        : weights_map.size();
 
     fver = (enum llama_fver) gguf_get_version(metadata);
 
@@ -1218,6 +1220,12 @@ struct ggml_tensor * llama_model_loader::create_tensor(
         const int64_t tid = gguf_find_tensor(metadata, tn.str().c_str());
         if (tid != -1) {
             type = gguf_get_tensor_type(metadata, tid);
+        } else if (no_alloc) {
+            if (flags & TENSOR_NOT_REQUIRED) {
+                return nullptr;
+            }
+
+            throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
         }
 
         // for tensors that are not required some of the dimensions can be invalid:
@@ -1243,6 +1251,16 @@ struct ggml_tensor * llama_model_loader::create_tensor(
         ggml_backend_buffer_type_t buft = buft_for_tensor(&t_meta);
         GGML_ASSERT(buft != nullptr);
         ggml_context * ctx = ctx_for_buft(buft);
+
+        if (flags & TENSOR_DUPLICATED) {
+            ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
+            if (t) {
+                return t;
+            }
+        } else {
+            n_created++;
+        }
+
         ggml_tensor * ret = ggml_dup_tensor(ctx, &t_meta);
         ggml_set_name(ret, tn.str().c_str());
         return ret;

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -243,6 +243,10 @@ llama_build_and_test(test-gguf.cpp)
 llama_build_and_test(test-backend-ops.cpp)
 
 llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
+
+llama_build_and_test(test-model-load-buffer.cpp LABEL "model" ARGS "${MODEL_DEST}")
+set_tests_properties(test-model-load-buffer PROPERTIES FIXTURES_REQUIRED test-download-model)
+
 llama_build_and_test(test-autorelease.cpp       LABEL "model")
 llama_build_and_test(test-backend-sampler.cpp   LABEL "model")
 

@@ -162,6 +162,22 @@ static void helper_write(FILE * file, const void * data, const size_t nbytes) {
     GGML_ASSERT(fwrite(data, 1, nbytes, file) == nbytes);
 }
 
+static std::vector<uint8_t> read_file_to_buffer(FILE * file) {
+    GGML_ASSERT(file != nullptr);
+    GGML_ASSERT(fseek(file, 0, SEEK_END) == 0);
+
+    const long size = ftell(file);
+    GGML_ASSERT(size >= 0);
+
+    rewind(file);
+
+    std::vector<uint8_t> data(static_cast<size_t>(size));
+    GGML_ASSERT(fread(data.data(), 1, data.size(), file) == data.size());
+
+    rewind(file);
+    return data;
+}
+
 static FILE * get_handcrafted_file(const unsigned int seed, const enum handcrafted_file_type hft, const int extra_bytes = 0) {
     FILE * file = tmpfile();
 
@@ -1095,10 +1111,11 @@ static bool same_tensor_data(const struct ggml_context * orig, const struct ggml
     return ok;
 }
 
-static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta) {
+static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta, const bool from_buffer = false) {
     ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
-    printf("%s: device=%s, backend=%s, only_meta=%s\n",
-        __func__, ggml_backend_dev_description(dev), ggml_backend_name(backend), only_meta ? "yes" : "no");
+    printf("%s: device=%s, backend=%s, only_meta=%s, from_buffer=%s\n",
+        __func__, ggml_backend_dev_description(dev), ggml_backend_name(backend),
+        only_meta ? "yes" : "no", from_buffer ? "yes" : "no");
 
     int npass = 0;
     int ntest = 0;
@@ -1133,7 +1150,14 @@ static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned
         /*no_alloc =*/ false,
         /*ctx      =*/ only_meta ? nullptr : &ctx_1,
     };
-    struct gguf_context * gguf_ctx_1 = gguf_init_from_file_ptr(file, gguf_params);
+    struct gguf_context * gguf_ctx_1 = nullptr;
+
+    if (from_buffer) {
+        const std::vector<uint8_t> data = read_file_to_buffer(file);
+        gguf_ctx_1 = gguf_init_from_buffer(data.data(), data.size(), gguf_params);
+    } else {
+        gguf_ctx_1 = gguf_init_from_file_ptr(file, gguf_params);
+    }
 
     printf("%s: same_version: ", __func__);
     if (gguf_get_version(gguf_ctx_0) == gguf_get_version(gguf_ctx_1)) {
@@ -1347,6 +1371,11 @@ int main(int argc, char ** argv) {
             npass += result.first;
             ntest += result.second;
         }
+        {
+            std::pair<int, int> result = test_roundtrip(dev, seed, /*only_meta=*/false, /*from_buffer=*/true);
+            npass += result.first;
+            ntest += result.second;
+        }
 
         {
             std::pair<int, int> result = test_gguf_set_kv(dev, seed);

diff --git a/tests/test-model-load-buffer.cpp b/tests/test-model-load-buffer.cpp
@@ -0,0 +1,109 @@
+#include "ggml-backend.h"
+#include "get-model.h"
+#include "llama.h"
+#include "gguf.h"
+
+#include "../src/llama-model.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+static std::vector<uint8_t> read_file_to_buffer(FILE * file) {
+    if (file == nullptr || fseek(file, 0, SEEK_END) != 0) {
+        return {};
+    }
+
+    const long size = ftell(file);
+    if (size < 0) {
+        return {};
+    }
+
+    rewind(file);
+
+    std::vector<uint8_t> data(static_cast<size_t>(size));
+    if (fread(data.data(), 1, data.size(), file) != data.size()) {
+        return {};
+    }
+
+    return data;
+}
+
+static void set_tensor_data_noop(struct ggml_tensor * tensor, void * userdata) {
+    GGML_UNUSED(tensor);
+    GGML_UNUSED(userdata);
+}
+
+int main(int argc, char * argv[]) {
+    char * model_path = get_model_or_exit(argc, argv);
+    FILE * file = fopen(model_path, "rb");
+    if (file == nullptr) {
+        fprintf(stderr, "failed to open model at '%s'\n", model_path);
+        return EXIT_FAILURE;
+    }
+
+    const std::vector<uint8_t> data = read_file_to_buffer(file);
+    fclose(file);
+    if (data.empty()) {
+        fprintf(stderr, "failed to read model at '%s'\n", model_path);
+        return EXIT_FAILURE;
+    }
+
+    llama_backend_init();
+
+    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (cpu_dev == nullptr) {
+        llama_backend_free();
+        return EXIT_FAILURE;
+    }
+
+    ggml_backend_dev_t devices[] = { cpu_dev, nullptr };
+
+    llama_model_params model_params = llama_model_default_params();
+    model_params.devices = devices;
+    model_params.no_alloc = true;
+    model_params.use_mmap = false;
+    model_params.progress_callback = [](float progress, void * user_data) {
+        GGML_UNUSED(progress);
+        GGML_UNUSED(user_data);
+        return true;
+    };
+
+    gguf_init_params gguf_params = {
+        /*.no_alloc = */ true,
+        /*.ctx      = */ nullptr,
+    };
+    gguf_context * gguf_ctx = gguf_init_from_buffer(data.data(), data.size(), gguf_params);
+    if (gguf_ctx == nullptr || gguf_get_n_tensors(gguf_ctx) <= 0) {
+        gguf_free(gguf_ctx);
+        llama_backend_free();
+        return EXIT_FAILURE;
+    }
+
+    llama_model * model_from_file = llama_model_load_from_file(model_path, model_params);
+    if (model_from_file == nullptr) {
+        gguf_free(gguf_ctx);
+        llama_backend_free();
+        return EXIT_FAILURE;
+    }
+
+    llama_model * model_from_buffer = llama_model_init_from_user(gguf_ctx, set_tensor_data_noop, nullptr, model_params);
+    if (model_from_buffer == nullptr) {
+        llama_model_free(model_from_file);
+        gguf_free(gguf_ctx);
+        llama_backend_free();
+        return EXIT_FAILURE;
+    }
+
+    const auto mb_from_file   = model_from_file->memory_breakdown();
+    const auto mb_from_buffer = model_from_buffer->memory_breakdown();
+    const bool ok = !mb_from_file.empty() && mb_from_file == mb_from_buffer;
+
+    llama_model_free(model_from_buffer);
+    llama_model_free(model_from_file);
+    gguf_free(gguf_ctx);
+    llama_backend_free();
+
+    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
+}