-
Notifications
You must be signed in to change notification settings - Fork 19.8k
llama : add fd-based model loading via llama_model_load_from_fd ( REWORK ) #20402
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
158239a
a4cfaf0
626823b
26c04d4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -464,6 +464,9 @@ extern "C" { | |
| const char * path_model, | ||
| struct llama_model_params params); | ||
|
|
||
| // Load a model from an open FILE pointer | ||
| LLAMA_API struct llama_model * llama_model_load_from_file_ptr(FILE * file, struct llama_model_params params); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please keep the formatting consistent with the surrounding code.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done, and sorry for the misunderstanding :) |
||
|
|
||
| // Load a model from multiple splits (support custom naming scheme) | ||
| // The paths must be in the correct order | ||
| LLAMA_API struct llama_model * llama_model_load_from_splits( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,12 +15,13 @@ using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>; | |
|
|
||
| struct llama_file { | ||
| llama_file(const char * fname, const char * mode, bool use_direct_io = false); | ||
| llama_file(int fd); | ||
| ~llama_file(); | ||
|
|
||
| size_t tell() const; | ||
| size_t size() const; | ||
|
|
||
| int file_id() const; // fileno overload | ||
| int file_id() const; | ||
|
Comment on lines
-23
to
+24
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What did "fileno" refer to here, and why did you remove this comment? |
||
|
|
||
| void seek(size_t offset, int whence) const; | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -828,7 +828,7 @@ int64_t llama_time_us(void) { | |
|
|
||
| // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback | ||
| static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud, | ||
| const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) { | ||
| const std::string & fname, std::vector<std::string> & splits, FILE * file, llama_model & model, llama_model_params & params) { | ||
| // loading time will be recalculated after the first eval, so | ||
| // we take page faults deferred by mmap() into consideration | ||
| model.t_load_us = 0; | ||
|
|
@@ -837,7 +837,7 @@ static int llama_model_load(struct gguf_context * metadata, llama_model_set_tens | |
| model.t_start_us = tm.t_start_us; | ||
|
|
||
| try { | ||
| llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io, | ||
| llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, file, params.use_mmap, params.use_direct_io, | ||
| params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides); | ||
|
|
||
| ml.print_info(); | ||
|
|
@@ -889,8 +889,12 @@ static struct llama_model * llama_model_load_from_file_impl( | |
| void * set_tensor_data_ud, | ||
| const std::string & path_model, | ||
| std::vector<std::string> & splits, | ||
| FILE * file, | ||
| struct llama_model_params params) { | ||
| GGML_ASSERT((metadata == nullptr) != path_model.empty() && "exactly one out of metadata and path_model needs to be defined"); | ||
| if (metadata == nullptr && path_model.empty() && !file) { | ||
| LLAMA_LOG_ERROR("%s: no model source provided\n", __func__); | ||
| return nullptr; | ||
| } | ||
|
Comment on lines
-893
to
+897
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The logic should remain that exactly one out of the three things needs to be defined. Something like this should work: |
||
| ggml_time_init(); | ||
|
|
||
| if (!params.vocab_only && ggml_backend_reg_count() == 0) { | ||
|
|
@@ -1011,7 +1015,7 @@ static struct llama_model * llama_model_load_from_file_impl( | |
| props.memory_free/1024/1024); | ||
| } | ||
|
|
||
| const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, *model, params); | ||
| const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, file, *model, params); | ||
| GGML_ASSERT(status <= 0); | ||
| if (status < 0) { | ||
| if (status == -1) { | ||
|
|
@@ -1037,7 +1041,7 @@ struct llama_model * llama_model_init_from_user( | |
| std::vector<std::string> splits = {}; | ||
| params.use_mmap = false; | ||
| params.use_extra_bufts = false; | ||
| return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params); | ||
| return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, /*file*/ nullptr, params); | ||
| } | ||
| // deprecated | ||
| struct llama_model * llama_load_model_from_file( | ||
|
|
@@ -1050,7 +1054,7 @@ struct llama_model * llama_model_load_from_file( | |
| const char * path_model, | ||
| struct llama_model_params params) { | ||
| std::vector<std::string> splits = {}; | ||
| return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, params); | ||
| return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, /*file*/ nullptr, params); | ||
| } | ||
|
|
||
| struct llama_model * llama_model_load_from_splits( | ||
|
|
@@ -1066,7 +1070,17 @@ struct llama_model * llama_model_load_from_splits( | |
| for (size_t i = 0; i < n_paths; ++i) { | ||
| splits.push_back(paths[i]); | ||
| } | ||
| return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, params); | ||
| return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, /*file*/ nullptr, params); | ||
| } | ||
|
|
||
| struct llama_model * llama_model_load_from_file_ptr(FILE * file, struct llama_model_params params) { | ||
| if (!file) { | ||
| LLAMA_LOG_ERROR("%s: file is NULL\n", __func__); | ||
| return nullptr; | ||
| } | ||
| std::string path_model; | ||
| std::vector<std::string> splits = {}; | ||
| return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, file, params); | ||
| } | ||
|
|
||
| void llama_model_save_to_file(const struct llama_model * model, const char * path_model) { | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I am currently adding file saving/loading to the recently added end-to-end tests in |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,54 @@ | ||
| #include "llama.h" | ||
| #include "get-model.h" | ||
|
|
||
| #include <cstdio> | ||
| #include <cstdlib> | ||
|
|
||
| #ifdef _WIN32 | ||
| int main(int /*argc*/, char ** /*argv*/) { | ||
| fprintf(stderr, "skipping on Windows\n"); | ||
| return EXIT_SUCCESS; | ||
| } | ||
| #else | ||
| # include <fcntl.h> | ||
| # include <unistd.h> | ||
|
|
||
| int main(int argc, char ** argv) { | ||
| auto * model_path = get_model_or_exit(argc, argv); | ||
|
|
||
| llama_backend_init(); | ||
|
|
||
| const int fd = open(model_path, O_RDONLY); | ||
| if (fd < 0) { | ||
| fprintf(stderr, "failed to open %s\n", model_path); | ||
| return EXIT_FAILURE; | ||
| } | ||
|
|
||
| FILE * f = fdopen(dup(fd), "rb"); | ||
| close(fd); | ||
| if (!f) { | ||
| fprintf(stderr, "failed to fdopen\n"); | ||
| return EXIT_FAILURE; | ||
| } | ||
|
|
||
| auto params = llama_model_default_params(); | ||
| params.use_mmap = true; | ||
| params.vocab_only = true; | ||
|
|
||
| struct llama_model * model = llama_model_load_from_file_ptr(f, params); | ||
| fclose(f); | ||
|
|
||
| if (model == nullptr) { | ||
| fprintf(stderr, "load from file pointer failed\n"); | ||
| return EXIT_FAILURE; | ||
| } | ||
|
|
||
| const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model)); | ||
| fprintf(stderr, "loaded %d tokens via file pointer\n", n_vocab); | ||
|
|
||
| llama_model_free(model); | ||
| llama_backend_free(); | ||
|
|
||
| return n_vocab > 0 ? EXIT_SUCCESS : EXIT_FAILURE; | ||
| } | ||
| #endif // _WIN32 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just rename `gguf_init_from_file_impl` to `gguf_init_from_file_ptr` and add the check there. Keep the check in `gguf_init_from_file` since it is associated with a warning.