Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ggml/include/gguf.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ extern "C" {

GGML_API struct gguf_context * gguf_init_empty(void);
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
GGML_API struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params);
//GGML_API struct gguf_context * gguf_init_from_buffer(..);

GGML_API void gguf_free(struct gguf_context * ctx);
Expand Down
1 change: 0 additions & 1 deletion ggml/src/ggml-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -718,6 +718,5 @@ inline bool ggml_check_edges(const struct ggml_cgraph * cgraph,

// expose GGUF internals for test code
GGML_API size_t gguf_type_size(enum gguf_type type);
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
#endif // __cplusplus
8 changes: 6 additions & 2 deletions ggml/src/gguf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,11 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct
return true;
}

struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) {
if (!file) {
return nullptr;
}

const struct gguf_reader gr(file);
struct gguf_context * ctx = new gguf_context;

Expand Down Expand Up @@ -848,7 +852,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
return nullptr;
}

struct gguf_context * result = gguf_init_from_file_impl(file, params);
struct gguf_context * result = gguf_init_from_file_ptr(file, params);
fclose(file);
return result;
}
Expand Down
5 changes: 5 additions & 0 deletions include/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,11 @@ extern "C" {
const char * path_model,
struct llama_model_params params);

// Load a model from an open FILE pointer
// - returns NULL on failure (e.g. when `file` is NULL)
// - the stream is only borrowed: the loader does not fclose() it, the caller remains responsible for closing it
// - the loader seeks within the stream, so its position is not preserved across the call
LLAMA_API struct llama_model * llama_model_load_from_file_ptr(
FILE * file,
struct llama_model_params params);

// Load a model from multiple splits (support custom naming scheme)
// The paths must be in the correct order
LLAMA_API struct llama_model * llama_model_load_from_splits(
Expand Down
23 changes: 21 additions & 2 deletions src/llama-mmap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,14 @@ struct llama_file::impl {
seek(0, SEEK_SET);
}

impl(FILE * file) : owns_fp(false) {
fp = file;
fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET);
}

size_t tell() const {
LARGE_INTEGER li;
li.QuadPart = 0;
Expand Down Expand Up @@ -159,7 +167,7 @@ struct llama_file::impl {
}

~impl() {
if (fp) {
if (fp && owns_fp) {
std::fclose(fp);
}
}
Expand Down Expand Up @@ -209,6 +217,13 @@ struct llama_file::impl {
seek(0, SEEK_SET);
}

// Wrap a stream that was opened by the caller.
// There is no path to record, so a placeholder name is stored instead.
// owns_fp is cleared so that the destructor leaves the stream open.
impl(FILE * file) : fname("(file*)"), owns_fp(false) {
    fp = file;

    // the size is the offset of the end of the stream; rewind to the start afterwards
    seek(0, SEEK_END);
    size = tell();
    seek(0, SEEK_SET);
}

size_t tell() const {
if (fd == -1) {
long ret = std::ftell(fp);
Expand Down Expand Up @@ -353,7 +368,7 @@ struct llama_file::impl {
~impl() {
if (fd != -1) {
close(fd);
} else {
} else if (owns_fp) {
std::fclose(fp);
}
}
Expand All @@ -369,10 +384,14 @@ struct llama_file::impl {

FILE * fp{};
size_t size{};
bool owns_fp = true;
};

// Open `fname` with the given stdio `mode`; all arguments are forwarded to impl.
llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}

// Wrap an already-open stream; forwards to impl(FILE *), which does not take ownership of the stream.
llama_file::llama_file(FILE * file) : pimpl(std::make_unique<impl>(file)) {}

// Defaulted out of line; cleanup of the underlying file is handled by impl's destructor.
llama_file::~llama_file() = default;

// Current position in the file, as reported by impl::tell().
size_t llama_file::tell() const { return pimpl->tell(); }
Expand Down
3 changes: 2 additions & 1 deletion src/llama-mmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,13 @@ using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

struct llama_file {
llama_file(const char * fname, const char * mode, bool use_direct_io = false);
llama_file(FILE * file);
~llama_file();

size_t tell() const;
size_t size() const;

int file_id() const; // fileno overload
int file_id() const;
Comment on lines -23 to +24

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What did "fileno" refer to here and why did you remove this comment?


void seek(size_t offset, int whence) const;

Expand Down
33 changes: 32 additions & 1 deletion src/llama-model-loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,7 @@ llama_model_loader::llama_model_loader(
void * set_tensor_data_ud,
const std::string & fname,
std::vector<std::string> & splits,
FILE * file,
bool use_mmap,
bool use_direct_io,
bool check_tensors,
Expand Down Expand Up @@ -657,6 +658,36 @@ llama_model_loader::llama_model_loader(

LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
}
} else if (file) {
struct ggml_context * ctx = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
};

metadata_ptr.reset(gguf_init_from_file_ptr(file, params));
metadata = metadata_ptr.get();
if (metadata == nullptr) {
throw std::runtime_error(format("%s: failed to load model from file pointer", __func__));
}

get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));

files.emplace_back(new llama_file(file));
contexts.emplace_back(ctx);

// Save tensors data offset info of the main file.
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
std::string tensor_name = std::string(cur->name);
// make sure there is no duplicated tensor names
if (weights_map.find(tensor_name) != weights_map.end()) {
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
}
n_elements += ggml_nelements(cur);
n_bytes += ggml_nbytes(cur);
weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur));
}
} else {
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
Expand All @@ -668,7 +699,7 @@ llama_model_loader::llama_model_loader(
fver = (enum llama_fver) gguf_get_version(metadata);

LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
__func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
__func__, n_kv, n_tensors, fname.empty() ? "(file*)" : fname.c_str(), llama_file_version_name(fver));

// determine file type based on the number of tensors for each quantization and print meta data
// TODO: make optional
Expand Down
1 change: 1 addition & 0 deletions src/llama-model-loader.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ struct llama_model_loader {
void * set_tensor_data_ud,
const std::string & fname,
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
FILE * file,
bool use_mmap,
bool use_direct_io,
bool check_tensors,
Expand Down
2 changes: 1 addition & 1 deletion src/llama-quant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -859,7 +859,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::

std::vector<std::string> splits = {};
llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
fname_inp, splits, /*file*/ nullptr, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
ml.init_mappings(false); // no prefetching

llama_model model(llama_model_default_params());
Expand Down
28 changes: 21 additions & 7 deletions src/llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -828,7 +828,7 @@ int64_t llama_time_us(void) {

// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
const std::string & fname, std::vector<std::string> & splits, FILE * file, llama_model & model, llama_model_params & params) {
// loading time will be recalculated after the first eval, so
// we take page faults deferred by mmap() into consideration
model.t_load_us = 0;
Expand All @@ -837,7 +837,7 @@ static int llama_model_load(struct gguf_context * metadata, llama_model_set_tens
model.t_start_us = tm.t_start_us;

try {
llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io,
llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, file, params.use_mmap, params.use_direct_io,
params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);

ml.print_info();
Expand Down Expand Up @@ -889,8 +889,12 @@ static struct llama_model * llama_model_load_from_file_impl(
void * set_tensor_data_ud,
const std::string & path_model,
std::vector<std::string> & splits,
FILE * file,
struct llama_model_params params) {
GGML_ASSERT((metadata == nullptr) != path_model.empty() && "exactly one out of metadata and path_model needs to be defined");
if (metadata == nullptr && path_model.empty() && !file) {
LLAMA_LOG_ERROR("%s: no model source provided\n", __func__);
return nullptr;
}
Comment on lines -893 to +897

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The logic should remain that exactly one out of the three things needs to be defined. Something like this should work:

GGML_ASSERT(int(metadata != nullptr) + int(!path_model.empty()) + int(file != nullptr) == 1 && "exactly one out of metadata, path_model, and file needs to be defined");

ggml_time_init();

if (!params.vocab_only && ggml_backend_reg_count() == 0) {
Expand Down Expand Up @@ -1011,7 +1015,7 @@ static struct llama_model * llama_model_load_from_file_impl(
props.memory_free/1024/1024);
}

const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, *model, params);
const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, file, *model, params);
GGML_ASSERT(status <= 0);
if (status < 0) {
if (status == -1) {
Expand All @@ -1037,7 +1041,7 @@ struct llama_model * llama_model_init_from_user(
std::vector<std::string> splits = {};
params.use_mmap = false;
params.use_extra_bufts = false;
return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params);
return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, /*file*/ nullptr, params);
}
// deprecated
struct llama_model * llama_load_model_from_file(
Expand All @@ -1050,7 +1054,7 @@ struct llama_model * llama_model_load_from_file(
const char * path_model,
struct llama_model_params params) {
std::vector<std::string> splits = {};
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, params);
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, /*file*/ nullptr, params);
}

struct llama_model * llama_model_load_from_splits(
Expand All @@ -1066,7 +1070,17 @@ struct llama_model * llama_model_load_from_splits(
for (size_t i = 0; i < n_paths; ++i) {
splits.push_back(paths[i]);
}
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, params);
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, /*file*/ nullptr, params);
}

// Load a model from a stdio stream that the caller has already opened.
// Returns nullptr (after logging an error) when `file` is NULL; otherwise the
// result of llama_model_load_from_file_impl() is returned unchanged.
struct llama_model * llama_model_load_from_file_ptr(FILE * file, struct llama_model_params params) {
    if (file == nullptr) {
        LLAMA_LOG_ERROR("%s: file is NULL\n", __func__);
        return nullptr;
    }

    // the stream is the only model source: no user metadata, no path and no explicit split list
    const std::string        no_path;
    std::vector<std::string> no_splits;

    return llama_model_load_from_file_impl(
            /*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
            no_path, no_splits, file, params);
}

void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
Expand Down
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@ llama_build_and_test(test-gguf.cpp)
llama_build_and_test(test-backend-ops.cpp)

llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
llama_build_and_test(test-model-load-fd.cpp LABEL "model")
llama_build_and_test(test-autorelease.cpp LABEL "model")
llama_build_and_test(test-backend-sampler.cpp LABEL "model")

Expand Down
4 changes: 2 additions & 2 deletions tests/test-gguf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -742,7 +742,7 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
/*ctx =*/ hft >= offset_has_data ? &ctx : nullptr,
};

struct gguf_context * gguf_ctx = gguf_init_from_file_impl(file, gguf_params);
struct gguf_context * gguf_ctx = gguf_init_from_file_ptr(file, gguf_params);

if (expect_context_not_null(hft)) {
printf("%s: - context_not_null: ", __func__);
Expand Down Expand Up @@ -1137,7 +1137,7 @@ static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned
/*no_alloc =*/ false,
/*ctx =*/ only_meta ? nullptr : &ctx_1,
};
struct gguf_context * gguf_ctx_1 = gguf_init_from_file_impl(file, gguf_params);
struct gguf_context * gguf_ctx_1 = gguf_init_from_file_ptr(file, gguf_params);

printf("%s: same_version: ", __func__);
if (gguf_get_version(gguf_ctx_0) == gguf_get_version(gguf_ctx_1)) {
Expand Down
54 changes: 54 additions & 0 deletions tests/test-model-load-fd.cpp

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am currently adding file saving/loading to the recently added end-to-end tests in test-llama-archs.cpp via #20503. This will provide test coverage for your newly added code so I don't think we need this additional test.

Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#include "llama.h"
#include "get-model.h"

#include <cstdio>
#include <cstdlib>

#ifdef _WIN32
// Windows stub: the test relies on POSIX descriptor APIs, so it only reports
// that it is being skipped and exits successfully.
int main(int, char **) {
    fputs("skipping on Windows\n", stderr);
    return EXIT_SUCCESS;
}
#else
# include <fcntl.h>
# include <unistd.h>

// Checks that a model can be loaded through llama_model_load_from_file_ptr()
// from a stdio stream built on top of a raw file descriptor.
//
// Only the vocabulary is loaded (vocab_only); the test passes when the model
// loads and reports at least one token.
int main(int argc, char ** argv) {
    auto * model_path = get_model_or_exit(argc, argv);

    llama_backend_init();

    const int fd = open(model_path, O_RDONLY);
    if (fd < 0) {
        fprintf(stderr, "failed to open %s\n", model_path);
        llama_backend_free();
        return EXIT_FAILURE;
    }

    // give stdio its own duplicate so the original descriptor can be closed right away
    const int fd_dup = dup(fd);
    close(fd);
    if (fd_dup < 0) {
        fprintf(stderr, "failed to dup\n");
        llama_backend_free();
        return EXIT_FAILURE;
    }

    FILE * f = fdopen(fd_dup, "rb");
    if (!f) {
        fprintf(stderr, "failed to fdopen\n");
        close(fd_dup); // fdopen() failed, so the descriptor was not adopted by a stream
        llama_backend_free();
        return EXIT_FAILURE;
    }

    auto params = llama_model_default_params();
    params.use_mmap   = true;
    params.vocab_only = true;

    struct llama_model * model = llama_model_load_from_file_ptr(f, params);
    fclose(f); // the stream stays owned by the test, so it is closed here

    if (model == nullptr) {
        fprintf(stderr, "load from file pointer failed\n");
        llama_backend_free();
        return EXIT_FAILURE;
    }

    const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
    fprintf(stderr, "loaded %d tokens via file pointer\n", n_vocab);

    llama_model_free(model);
    llama_backend_free();

    return n_vocab > 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}
#endif // _WIN32