From 5bc7a7409fc39d2801b3eeddc9e22fdd4bda718d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Tue, 27 May 2025 21:28:22 +0200
Subject: [PATCH] llama: automatically set runtime pars to fit VRAM

---
 common/common.cpp                   | 71 ++++++++++++++++++++++++++++-
 ggml/include/ggml-alloc.h           |  6 ++-
 ggml/include/ggml-backend.h         |  1 +
 ggml/src/ggml-alloc.c               | 60 ++++++++++++++++++++----
 ggml/src/ggml-backend.cpp           | 22 ++++++++-
 include/llama.h                     |  7 +++
 src/llama-context.cpp               | 67 ++++++++++++++++++++-------
 src/llama-context.h                 | 10 +++-
 src/llama-impl.cpp                  | 18 ++++++++
 src/llama-impl.h                    | 12 +++++
 src/llama-kv-cache-recurrent.cpp    | 26 ++++++-----
 src/llama-kv-cache-recurrent.h      |  7 +--
 src/llama-kv-cache-unified-iswa.cpp | 11 +++--
 src/llama-kv-cache-unified-iswa.h   |  5 +-
 src/llama-kv-cache-unified.cpp      | 27 ++++++-----
 src/llama-kv-cache-unified.h        |  7 +--
 src/llama-memory.h                  |  6 +++
 src/llama-model.cpp                 | 46 +++++++++++++++----
 src/llama-model.h                   |  6 ++-
 src/llama.cpp                       | 45 +++++++++++++++---
 20 files changed, 376 insertions(+), 84 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 218f1e1dc0e4d..a83e9d0737a76 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -885,9 +885,78 @@ std::string fs_get_cache_file(const std::string & filename) {
 // Model utils
 //
 
+static void common_fit_to_free_memory(
+        const std::string & path_model, llama_model_params & mparams, llama_context_params & cparams, const size_t margin) {
+
+    std::vector<ggml_backend_dev_t> devices(ggml_backend_dev_count());
+    for (size_t i = 0; i < devices.size(); i++) {
+        devices[i] = ggml_backend_dev_get(i);
+    }
+
+    std::vector<size_t> memory_total(devices.size());
+    std::vector<size_t> memory_free(devices.size());
+    for (size_t i = 0; i < devices.size(); i++) {
+        ggml_backend_dev_memory(devices[i], memory_free.data() + i, memory_total.data() + i);
+    }
+
+    auto get_min_margin = [path_model, memory_free](const llama_model_params & mparams_test, const llama_context_params & cparams_test) {
+        std::vector<size_t> memory_expect(memory_free.size());
+        GGML_ASSERT(llama_expected_memory_use(path_model.c_str(), mparams_test, cparams_test, memory_expect.data()));
+
+        int64_t min_margin = INT64_MAX;
+        for (size_t i = 0; i < memory_free.size(); i++) {
+            min_margin = std::min(min_margin, int64_t(memory_free[i]) - int64_t(memory_expect[i]));
+        }
+        return min_margin;
+    };
+    auto test_ngl = [mparams, cparams, get_min_margin](const int ngl) {
+        llama_model_params mparams_test = mparams;
+        mparams_test.n_gpu_layers = ngl;
+        return get_min_margin(mparams_test, cparams);
+    };
+
+    int ngl_low = 0;
+    int64_t margin_low = test_ngl(ngl_low);
+    if (margin_low < int64_t(margin)) {
+        mparams.n_gpu_layers = ngl_low;
+        return;
+    }
+
+    int ngl_high = 128; // FIXME
+    int64_t margin_high = test_ngl(ngl_high);
+    if (margin_high >= int64_t(margin)) {
+        mparams.n_gpu_layers = ngl_high;
+        return;
+    }
+
+    // TODO bisection is inefficient, better to interpolate if max ngl value is known
+    while (ngl_high - ngl_low > 1) {
+        const int ngl_test = (ngl_high + ngl_low) / 2;
+        const int64_t margin_test = test_ngl(ngl_test);
+
+        if (margin_test < int64_t(margin)) {
+            ngl_high = ngl_test;
+            margin_high = margin_test;
+        } else {
+            ngl_low = ngl_test;
+            margin_low = margin_test;
+        }
+    }
+
+    if (margin_high >= int64_t(margin)) {
+        mparams.n_gpu_layers = ngl_high;
+    } else {
+        mparams.n_gpu_layers = ngl_low;
+    }
+}
+
 struct common_init_result common_init_from_params(common_params & params) {
     common_init_result iparams;
     auto mparams = common_model_params_to_llama(params);
+    auto cparams = 
common_context_params_to_llama(params); + + constexpr size_t margin = 1024*1024*1024; + common_fit_to_free_memory(params.model.path, mparams, cparams, margin); llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); if (model == NULL) { @@ -925,8 +994,6 @@ struct common_init_result common_init_from_params(common_params & params) { } } - auto cparams = common_context_params_to_llama(params); - llama_context * lctx = llama_init_from_model(model, cparams); if (lctx == NULL) { LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str()); diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h index 2cb150fd2a313..3300cc52d9abe 100644 --- a/ggml/include/ggml-alloc.h +++ b/ggml/include/ggml-alloc.h @@ -9,6 +9,7 @@ extern "C" { typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; typedef struct ggml_backend_buffer * ggml_backend_buffer_t; typedef struct ggml_backend * ggml_backend_t; +typedef struct ggml_backend_device * ggml_backend_dev_t; // Tensor allocator struct ggml_tallocr { @@ -58,16 +59,19 @@ GGML_API bool ggml_gallocr_reserve_n( ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, - const int * leaf_buffer_ids); + const int * leaf_buffer_ids, + bool dry_run); // automatic reallocation if the topology changes when using a single buffer // returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers) GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph); GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id); +size_t ggml_gallocr_get_max_size(ggml_gallocr_t galloc, ggml_backend_dev_t dev); // Utils // Create a buffer and allocate all the tensors in a ggml_context +GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend); diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 778927f68217a..ab99400664606 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -293,6 +293,7 @@ extern "C" { GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); // Initialize backend buffers from a measure graph + GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes); // result per backend is written to sizes GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched); diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index 5fd379f6a9461..54138ba6bf002 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -150,6 +150,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs } #endif +// returns the offset for the allocation static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) { size = aligned_offset(NULL, size, alloc->alignment); @@ -472,7 +473,9 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) { } static bool 
ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) { - return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated; + return t->data != NULL // tensor data already set externally + || t->buffer // tensor on external buffer (but may not yet be allocated) + || ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc } static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) { @@ -670,7 +673,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr } } -bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { +bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, + const int * node_buffer_ids, const int * leaf_buffer_ids, bool dry_run) { size_t min_hash_size = graph->n_nodes + graph->n_leafs; // add 25% margin to avoid hash collisions min_hash_size += min_hash_size / 4; @@ -768,7 +772,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c #endif ggml_backend_buffer_free(galloc->buffers[i]); - galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size); + galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], dry_run ? 0 : new_size); if (galloc->buffers[i] == NULL) { GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); return false; @@ -781,7 +785,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c } bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { - return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL); + return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL, /*dry_run =*/ false); } static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) { @@ -934,6 +938,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) { return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]); } +size_t ggml_gallocr_get_max_size(ggml_gallocr_t galloc, ggml_backend_dev_t dev) { + for (int i = 0; i < galloc->n_buffers; i++) { + if (ggml_backend_buft_get_device(galloc->bufts[i]) == dev) { + return ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]); + } + } + return 0; +} + // utils static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) { @@ -984,7 +997,8 @@ static bool alloc_tensor_range(struct ggml_context * ctx, return true; } -ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) { +static ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_impl( + struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool dry_run) { GGML_ASSERT(ggml_get_no_alloc(ctx) == true); size_t alignment = ggml_backend_buft_get_alignment(buft); @@ -992,6 +1006,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte ggml_backend_buffer_t * buffers = NULL; size_t n_buffers = 0; + *nbytes_total = 0; size_t cur_buf_size = 0; struct ggml_tensor * first = ggml_get_first_tensor(ctx); @@ -1003,10 +1018,13 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) { // allocate tensors in the current buffer - if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, 
&n_buffers)) { - return NULL; + if (!dry_run) { + if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) { + return NULL; + } } first = t; + *nbytes_total += cur_buf_size; cur_buf_size = this_size; } else { cur_buf_size += this_size; @@ -1015,15 +1033,23 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte // allocate remaining tensors if (cur_buf_size > 0) { - if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) { - return NULL; + *nbytes_total += cur_buf_size; + if (!dry_run) { + if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) { + return NULL; + } } } + if (dry_run) { + return NULL; + } + if (n_buffers == 0) { #ifndef NDEBUG GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__); #endif + GGML_ASSERT(!buffers); return NULL; } @@ -1033,10 +1059,24 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte } else { buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers); } - free(buffers); + if (buffers) { + free(buffers); // can be NULL if dry_run or context is empty + } return buffer; } +size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) { + size_t nbytes_total = 0; + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*dry_run =*/ true); + GGML_ASSERT(!buf); + return nbytes_total; +} + +ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) { + size_t nbytes_total = 0; + return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*dry_run =*/ false); +} + ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) { return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend)); } diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index b1050ad59c26a..da5579e6ebb9a 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1347,7 +1347,8 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { #ifndef NDEBUG GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed); #endif - ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids); + ggml_gallocr_reserve_n(sched->galloc, &sched->graph, + sched->node_backend_ids, sched->leaf_backend_ids, /*dry_run =*/ false); if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) { GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__); return false; @@ -1546,6 +1547,22 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) { sched->is_alloc = false; } +void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes) { + GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs); + + ggml_backend_sched_split_graph(sched, measure_graph); + + ggml_backend_sched_synchronize(sched); + + GGML_ASSERT(ggml_gallocr_reserve_n(sched->galloc, &sched->graph, + sched->node_backend_ids, sched->leaf_backend_ids, /*dry_run =*/ true)); + for (int ib = 0; ib < sched->n_backends; ib++) { + sizes[ib] = ggml_gallocr_get_max_size(sched->galloc, ggml_backend_get_device(sched->backends[ib])); + } + + ggml_backend_sched_reset(sched); +} + bool 
ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs); @@ -1553,7 +1570,8 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * ggml_backend_sched_synchronize(sched); - if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) { + if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, + sched->node_backend_ids, sched->leaf_backend_ids, /*dry_run =*/ false)) { return false; } diff --git a/include/llama.h b/include/llama.h index 015a57898e22d..08f092d1382e3 100644 --- a/include/llama.h +++ b/include/llama.h @@ -414,6 +414,13 @@ extern "C" { LLAMA_API struct llama_sampler_chain_params llama_sampler_chain_default_params(void); LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void); + // returns success + LLAMA_API bool llama_expected_memory_use( + const char * path_model, + struct llama_model_params mparams, + struct llama_context_params cparams, + size_t * nbytes_expect); + // Initialize the llama + ggml backend // If numa is true, use NUMA optimizations // Call once at the start of the program diff --git a/src/llama-context.cpp b/src/llama-context.cpp index b130b484bcf6f..ff52c867646fa 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -17,7 +17,8 @@ llama_context::llama_context( const llama_model & model, - llama_context_params params) : + llama_context_params params, + bool dry_run) : model(model) { LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__); @@ -192,7 +193,7 @@ llama_context::llama_context( /*.swa_full =*/ params.swa_full, }; - memory.reset(model.create_memory(params_mem, cparams)); + memory.reset(model.create_memory(params_mem, cparams, dry_run)); } // init backends @@ -265,6 +266,8 @@ llama_context::llama_context( // reserve worst-case graph if (!hparams.vocab_only && memory) { + backends_exp_max_size.resize(backend_ptrs.size()); + const uint32_t n_seqs = cparams.n_seq_max; const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); @@ -287,7 +290,7 @@ llama_context::llama_context( // reserve pp graph first so that buffers are only allocated once { - auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mstate.get()); + auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mstate.get(), dry_run); if (!gf) { throw std::runtime_error("failed to allocate compute pp buffers"); } @@ -298,7 +301,7 @@ llama_context::llama_context( // reserve with tg graph to get the number of splits and nodes { - auto * gf = graph_reserve(1, 1, 1, mstate.get()); + auto * gf = graph_reserve(1, 1, 1, mstate.get(), dry_run); if (!gf) { throw std::runtime_error("failed to allocate compute tg buffers"); } @@ -309,16 +312,21 @@ llama_context::llama_context( // reserve again with pp graph to avoid ggml-alloc reallocations during inference { - auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mstate.get()); + auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mstate.get(), dry_run); if (!gf) { throw std::runtime_error("failed to allocate compute pp buffers"); } } + if (!dry_run) { + for (size_t i = 0; i < backend_ptrs.size(); ++i) { + backends_exp_max_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend_ptrs[i]); + } + } + for (size_t i = 0; i < backend_ptrs.size(); ++i) { - ggml_backend_t backend = backend_ptrs[i]; - ggml_backend_buffer_type_t buft = backend_buft[i]; - size_t size = 
ggml_backend_sched_get_buffer_size(sched.get(), backend); + ggml_backend_buffer_type_t buft = backend_buft[i]; + const size_t size = backends_exp_max_size[i]; if (size > 1) { LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, ggml_backend_buft_name(buft), @@ -418,6 +426,10 @@ uint32_t llama_context::n_threads_batch() const { return cparams.n_threads_batch; } +size_t llama_context::total_size(ggml_backend_dev_t dev) const { + return memory->total_size(dev); +} + llama_memory_t llama_context::get_memory() const { return memory.get(); } @@ -476,7 +488,7 @@ bool llama_context::kv_self_update(bool optimize) { const uint32_t n_seqs = cparams.n_seq_max; const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mstate.get()); + auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mstate.get(), /*dry_run =*/ false); if (!gf) { LLAMA_LOG_ERROR("%s: failed to reserve graph after the memory update\n", __func__); } @@ -1232,6 +1244,15 @@ int llama_context::decode(llama_batch & inp_batch) { return 0; } +size_t llama_context::get_expected_max_size(ggml_backend_dev_t dev) const { + for (size_t i = 0; i < backend_buft.size(); i++) { + if (ggml_backend_buft_get_device(backend_buft[i]) == dev) { + return backends_exp_max_size[i]; + } + } + return 0; +} + // // output // @@ -1328,7 +1349,7 @@ ggml_cgraph * llama_context::graph_init() { return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false); } -ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_state_i * mstate) { +ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_state_i * mstate, bool dry_run) { LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs); if (n_tokens % n_seqs != 0) { @@ -1360,9 +1381,17 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u ggml_backend_sched_reset(sched.get()); // initialize scheduler with the specified graph - if (!ggml_backend_sched_reserve(sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - return nullptr; + if (dry_run) { + std::vector tmp(backend_ptrs.size()); + ggml_backend_sched_reserve_size(sched.get(), gf, tmp.data()); + for (size_t i = 0; i < backend_ptrs.size(); i++) { + backends_exp_max_size[i] = std::max(backends_exp_max_size[i], tmp[i]); + } + } else { + if (!ggml_backend_sched_reserve(sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + return nullptr; + } } return gf; @@ -2229,9 +2258,10 @@ llama_context_params llama_context_default_params() { return result; } -llama_context * llama_init_from_model( +llama_context * llama_init_from_model_impl( llama_model * model, - llama_context_params params) { + llama_context_params params, + bool dry_run) { if (!model) { LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__); return nullptr; @@ -2258,7 +2288,7 @@ llama_context * llama_init_from_model( } try { - auto * ctx = new llama_context(*model, params); + auto * ctx = new llama_context(*model, params, dry_run); return ctx; } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to initialize the context: %s\n", __func__, err.what()); @@ -2267,6 +2297,11 @@ llama_context * llama_init_from_model( return nullptr; } +llama_context * llama_init_from_model( + llama_model * 
model, + llama_context_params params) { + return llama_init_from_model_impl(model, params, /*dry_run =*/ false); +} // deprecated llama_context * llama_new_context_with_model( llama_model * model, diff --git a/src/llama-context.h b/src/llama-context.h index 2e0da8c83bd59..07593d1993881 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -24,7 +24,8 @@ struct llama_context { // init scheduler and compute buffers, reserve worst-case graphs llama_context( const llama_model & model, - llama_context_params params); + llama_context_params params, + bool dry_run); ~llama_context(); @@ -46,6 +47,8 @@ struct llama_context { uint32_t n_threads() const; uint32_t n_threads_batch() const; + size_t total_size(ggml_backend_dev_t dev = nullptr) const; + llama_memory_t get_memory() const; // return true of the KV cache was updated @@ -105,6 +108,8 @@ struct llama_context { int encode(llama_batch & inp_batch); int decode(llama_batch & inp_batch); + size_t get_expected_max_size(ggml_backend_dev_t dev) const; + // // state save/load // @@ -197,7 +202,7 @@ struct llama_context { ggml_status graph_compute(ggml_cgraph * gf, bool batched); // reserve a graph with a dummy ubatch of the specified size - ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_state_i * mstate); + ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_state_i * mstate, bool dry_run); private: llm_graph_result_ptr graph_build( @@ -255,6 +260,7 @@ struct llama_context { ggml_backend_t backend_cpu = nullptr; std::vector backends; + std::vector backends_exp_max_size; ggml_context_ptr ctx_compute; diff --git a/src/llama-impl.cpp b/src/llama-impl.cpp index 6ec709dd323a6..109bdaf4fd33f 100644 --- a/src/llama-impl.cpp +++ b/src/llama-impl.cpp @@ -165,3 +165,21 @@ std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0); } } + +size_t ctxs_total_size(const std::vector & ctxs, ggml_backend_dev_t dev) { + ggml_backend_buffer_type_t dev_buft = dev ? ggml_backend_dev_buffer_type(dev) : nullptr; + + size_t total_size = 0; + for (const ggml_context_ptr & ctx : ctxs) { + ggml_tensor * t = ggml_get_first_tensor(ctx.get()); + if (!t || !t->buffer) { + continue; + } + ggml_backend_buffer_type_t ctx_buft = ggml_backend_buffer_get_type(t->buffer); + if (dev_buft && ctx_buft != dev_buft) { + continue; + } + total_size += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), ctx_buft); + } + return total_size; +} diff --git a/src/llama-impl.h b/src/llama-impl.h index 02b1d07f8400d..b34cf7ff55700 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -1,6 +1,7 @@ #pragma once #include "ggml.h" // for ggml_log_level +#include "ggml-cpp.h" #include #include @@ -15,6 +16,11 @@ # define LLAMA_ATTRIBUTE_FORMAT(...) 
#endif +struct llama_context; +struct llama_model; +struct llama_context_params; +llama_context * llama_init_from_model_impl(llama_model * model, llama_context_params params, bool dry_run); + // // logging // @@ -59,3 +65,9 @@ std::string llama_format_tensor_shape(const std::vector & ne); std::string llama_format_tensor_shape(const struct ggml_tensor * t); std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i); + +// calculate the total number of bytes needed to allocate a vector of ggml contexts +// skips ggml contexts without ggml_backend buffers (dummy buffers are ok) +// assumes that all tensors in a context are on the same buffer +// if the optional device dev is set, return the number of bytes needed on that device only +size_t ctxs_total_size(const std::vector & ctxs, ggml_backend_dev_t dev = nullptr); diff --git a/src/llama-kv-cache-recurrent.cpp b/src/llama-kv-cache-recurrent.cpp index f5c6dcd66ce9e..3e601cd2f2c4c 100644 --- a/src/llama-kv-cache-recurrent.cpp +++ b/src/llama-kv-cache-recurrent.cpp @@ -21,7 +21,8 @@ llama_kv_cache_recurrent::llama_kv_cache_recurrent( ggml_type type_v, bool offload, uint32_t kv_size, - uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) { + uint32_t n_seq_max, + bool dry_run) : hparams(model.hparams), n_seq_max(n_seq_max) { const int32_t n_layer = hparams.n_layer; LLAMA_LOG_INFO("%s: kv_size = %u, n_seq_max = %u, type_k = '%s', type_v = '%s', n_layer = %d\n", @@ -97,9 +98,17 @@ llama_kv_cache_recurrent::llama_kv_cache_recurrent( auto * buft = it.first; auto * ctx = it.second; - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); - if (!buf) { - throw std::runtime_error("failed to allocate buffer for kv cache"); + ggml_backend_buffer_t buf; + if (dry_run) { + buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + t->buffer = buf; + } + } else { + buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + throw std::runtime_error("failed to allocate buffer for kv cache"); + } } ggml_backend_buffer_clear(buf, 0); LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); @@ -648,13 +657,8 @@ float llama_kv_cache_recurrent::s_mask(int i) const { return res; } -size_t llama_kv_cache_recurrent::total_size() const { - size_t size = 0; - for (const auto & buf : bufs) { - size += ggml_backend_buffer_get_size(buf.get()); - } - - return size; +size_t llama_kv_cache_recurrent::total_size(ggml_backend_dev_t dev) const { + return ctxs_total_size(ctxs, dev); } size_t llama_kv_cache_recurrent::size_k_bytes() const { diff --git a/src/llama-kv-cache-recurrent.h b/src/llama-kv-cache-recurrent.h index d1da1225655fa..1b0144cb2ec3c 100644 --- a/src/llama-kv-cache-recurrent.h +++ b/src/llama-kv-cache-recurrent.h @@ -21,7 +21,8 @@ class llama_kv_cache_recurrent : public llama_memory_i { ggml_type type_v, bool offload, uint32_t kv_size, - uint32_t n_seq_max); + uint32_t n_seq_max, + bool dry_run); ~llama_kv_cache_recurrent() = default; @@ -50,6 +51,8 @@ class llama_kv_cache_recurrent : public llama_memory_i { llama_pos seq_pos_min(llama_seq_id seq_id) const override; llama_pos seq_pos_max(llama_seq_id seq_id) const override; + size_t total_size(ggml_backend_dev_t dev = nullptr) const override; + bool prepare(const std::vector & ubatches); // find a contiguous slot of kv cells and emplace the ubatch there 
@@ -108,8 +111,6 @@ class llama_kv_cache_recurrent : public llama_memory_i { std::vector ctxs; std::vector bufs; - size_t total_size() const; - size_t size_k_bytes() const; size_t size_v_bytes() const; diff --git a/src/llama-kv-cache-unified-iswa.cpp b/src/llama-kv-cache-unified-iswa.cpp index 28d1826547649..7248ed446453a 100644 --- a/src/llama-kv-cache-unified-iswa.cpp +++ b/src/llama-kv-cache-unified-iswa.cpp @@ -21,7 +21,8 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa( uint32_t kv_size, uint32_t n_seq_max, uint32_t n_ubatch, - uint32_t n_pad) : hparams(model.hparams) { + uint32_t n_pad, + bool dry_run) : hparams(model.hparams) { llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); }; llama_kv_cache_unified::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); }; @@ -42,14 +43,14 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa( kv_base = std::make_unique( model, std::move(filter_base), type_k, type_v, v_trans, offload, size_base, n_seq_max, n_pad, - 0, LLAMA_SWA_TYPE_NONE); + 0, LLAMA_SWA_TYPE_NONE, dry_run); LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa); kv_swa = std::make_unique( model, std::move(filter_swa), type_k, type_v, v_trans, offload, size_swa, n_seq_max, n_pad, - hparams.n_swa, hparams.swa_type); + hparams.n_swa, hparams.swa_type, dry_run); } void llama_kv_cache_unified_iswa::clear(bool data) { @@ -95,6 +96,10 @@ llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const { return kv_swa->seq_pos_max(seq_id); } +size_t llama_kv_cache_unified_iswa::total_size(ggml_backend_dev_t dev) const { + return get_base()->total_size(dev) + get_swa()->total_size(dev); +} + llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) { GGML_UNUSED(embd_pooled); diff --git a/src/llama-kv-cache-unified-iswa.h b/src/llama-kv-cache-unified-iswa.h index 3dbf33ed7b960..bc9b43abc40b3 100644 --- a/src/llama-kv-cache-unified-iswa.h +++ b/src/llama-kv-cache-unified-iswa.h @@ -23,7 +23,8 @@ class llama_kv_cache_unified_iswa : public llama_memory_i { uint32_t kv_size, uint32_t n_seq_max, uint32_t n_ubatch, - uint32_t n_pad); + uint32_t n_pad, + bool dry_run); ~llama_kv_cache_unified_iswa() = default; @@ -54,6 +55,8 @@ class llama_kv_cache_unified_iswa : public llama_memory_i { llama_pos seq_pos_min(llama_seq_id seq_id) const override; llama_pos seq_pos_max(llama_seq_id seq_id) const override; + size_t total_size(ggml_backend_dev_t dev = nullptr) const override; + // state write/load void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override; diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp index 3a40463fd29ca..1e9f36358ca0c 100644 --- a/src/llama-kv-cache-unified.cpp +++ b/src/llama-kv-cache-unified.cpp @@ -27,7 +27,8 @@ llama_kv_cache_unified::llama_kv_cache_unified( uint32_t n_seq_max, uint32_t n_pad, uint32_t n_swa, - llama_swa_type swa_type) : + llama_swa_type swa_type, + bool dry_run) : model(model), hparams(model.hparams), v_trans(v_trans), n_seq_max(n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) { @@ -107,9 +108,17 @@ llama_kv_cache_unified::llama_kv_cache_unified( auto * buft = it.first; auto * ctx = it.second; - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); - if (!buf) { - throw std::runtime_error("failed to allocate buffer for kv cache"); + 
ggml_backend_buffer_t buf; + if (dry_run) { + buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + t->buffer = buf; + } + } else { + buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + throw std::runtime_error("failed to allocate buffer for kv cache"); + } } LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); @@ -835,14 +844,8 @@ void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama } } -size_t llama_kv_cache_unified::total_size() const { - size_t size = 0; - - for (const auto & buf : bufs) { - size += ggml_backend_buffer_get_size(buf.get()); - } - - return size; +size_t llama_kv_cache_unified::total_size(ggml_backend_dev_t dev) const { + return ctxs_total_size(ctxs, dev); } size_t llama_kv_cache_unified::size_k_bytes() const { diff --git a/src/llama-kv-cache-unified.h b/src/llama-kv-cache-unified.h index 49f410ef6ecab..2ff8f7d3258df 100644 --- a/src/llama-kv-cache-unified.h +++ b/src/llama-kv-cache-unified.h @@ -48,7 +48,8 @@ class llama_kv_cache_unified : public llama_memory_i { uint32_t n_seq_max, uint32_t n_pad, uint32_t n_swa, - llama_swa_type swa_type); + llama_swa_type swa_type, + bool dry_run); ~llama_kv_cache_unified() = default; @@ -79,6 +80,8 @@ class llama_kv_cache_unified : public llama_memory_i { llama_pos seq_pos_min(llama_seq_id seq_id) const override; llama_pos seq_pos_max(llama_seq_id seq_id) const override; + size_t total_size(ggml_backend_dev_t dev = nullptr) const override; + // state write/load void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override; @@ -173,8 +176,6 @@ class llama_kv_cache_unified : public llama_memory_i { // return non-empty vector if cells have been moved defrag_info defrag_prepare(int32_t n_max_nodes) const; - size_t total_size() const; - size_t size_k_bytes() const; size_t size_v_bytes() const; diff --git a/src/llama-memory.h b/src/llama-memory.h index 991aae781ba57..6946e0a61b2f8 100644 --- a/src/llama-memory.h +++ b/src/llama-memory.h @@ -108,6 +108,12 @@ struct llama_memory_i { virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0; virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0; + + // + // TODO + // + + virtual size_t total_size(ggml_backend_dev_t dev) const = 0; }; using llama_memory_ptr = std::unique_ptr; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index c41ee24507fca..e0a2702165e63 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1464,7 +1464,7 @@ void llama_model::load_vocab(llama_model_loader & ml) { vocab.load(ml, kv); } -bool llama_model::load_tensors(llama_model_loader & ml) { +bool llama_model::load_tensors(llama_model_loader & ml, bool dry_run) { const auto & split_mode = params.split_mode; const auto & n_gpu_layers = params.n_gpu_layers; const auto & use_mlock = params.use_mlock; @@ -4192,11 +4192,18 @@ bool llama_model::load_tensors(llama_model_loader & ml) { pimpl->bufs.emplace_back(buf); buf_map.emplace(idx, buf); } - } - else { - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); - if (buf == nullptr) { - throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft))); + } else { + ggml_backend_buffer_t buf; + if (dry_run) { + buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // alloc dummy buffer + 
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + t->buffer = buf; + } + } else { + buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (buf == nullptr) { + throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft))); + } } pimpl->bufs.emplace_back(buf); if (use_mlock && ggml_backend_buffer_is_host(buf)) { @@ -4249,6 +4256,18 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } + if (dry_run) { + for (auto & it : ctx_bufs) { + ggml_context * ctx = it.first; + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + if (!t->buffer) { + t->data = (void *) 0x12345678; // so that e.g. tensors which would normally be memory-mapped are treated as allocated + } + } + } + return true; + } + // load tensor data for (auto & it : ctx_bufs) { ggml_context * ctx = it.first; @@ -4291,6 +4310,10 @@ size_t llama_model::n_devices() const { return devices.size(); } +size_t llama_model::total_size(ggml_backend_dev_t dev) const { + return ctxs_total_size(pimpl->ctxs, dev); +} + uint64_t llama_model::n_elements() const { return pimpl->n_elements; } @@ -13203,7 +13226,7 @@ struct llm_build_bailingmoe : public llm_graph_context { } }; -llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const { +llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams, bool dry_run) const { llama_memory_i * res; switch (arch) { @@ -13227,7 +13250,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, GGML_TYPE_F32, cparams.offload_kqv, std::max((uint32_t) 1, cparams.n_seq_max), - cparams.n_seq_max); + cparams.n_seq_max, + dry_run); } break; default: { @@ -13250,7 +13274,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, cparams.n_ctx, cparams.n_seq_max, cparams.n_ubatch, - padding); + padding, + dry_run); } else { GGML_ASSERT(!hparams.is_swa_any()); @@ -13265,7 +13290,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, cparams.n_seq_max, padding, hparams.n_swa, - hparams.swa_type); + hparams.swa_type, + dry_run); } } } diff --git a/src/llama-model.h b/src/llama-model.h index 18b714620bbcf..8772b5f8c20b5 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -376,7 +376,7 @@ struct llama_model { void load_arch (llama_model_loader & ml); void load_hparams(llama_model_loader & ml); void load_vocab (llama_model_loader & ml); - bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback + bool load_tensors(llama_model_loader & ml, bool dry_run); // returns false if cancelled by progress_callback std::string arch_name() const; std::string type_name() const; @@ -387,6 +387,8 @@ struct llama_model { size_t n_tensors() const; size_t n_devices() const; + size_t total_size(ggml_backend_dev_t dev = nullptr) const; + // total number of parameters in the model uint64_t n_elements() const; @@ -408,7 +410,7 @@ struct llama_model { // note: can mutate `cparams` // TODO: move this to new llm_arch_model_i interface - llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const; + llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams, bool dry_run) const; // TODO: move this to new llm_arch_model_i interface llm_graph_result_ptr build_graph( diff --git a/src/llama.cpp b/src/llama.cpp index 
2f06e0f8ce12d..af92ccec4d6ef 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1,6 +1,7 @@ #include "llama-impl.h" #include "llama-chat.h" +#include "llama-context.h" #include "llama-mmap.h" #include "llama-vocab.h" #include "llama-model-loader.h" @@ -84,7 +85,8 @@ int64_t llama_time_us(void) { } // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback -static int llama_model_load(const std::string & fname, std::vector & splits, llama_model & model, llama_model_params & params) { +static int llama_model_load(const std::string & fname, std::vector & splits, llama_model & model, + llama_model_params & params, bool dry_run) { // loading time will be recalculated after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = 0; @@ -123,7 +125,7 @@ static int llama_model_load(const std::string & fname, std::vector return 0; } - if (!model.load_tensors(ml)) { + if (!model.load_tensors(ml, dry_run)) { return -2; } } catch (const std::exception & err) { @@ -137,7 +139,8 @@ static int llama_model_load(const std::string & fname, std::vector static struct llama_model * llama_model_load_from_file_impl( const std::string & path_model, std::vector & splits, - struct llama_model_params params) { + struct llama_model_params params, + bool dry_run) { ggml_time_init(); if (!params.vocab_only && ggml_backend_reg_count() == 0) { @@ -214,7 +217,7 @@ static struct llama_model * llama_model_load_from_file_impl( LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024); } - const int status = llama_model_load(path_model, splits, *model, params); + const int status = llama_model_load(path_model, splits, *model, params, dry_run); GGML_ASSERT(status <= 0); if (status < 0) { if (status == -1) { @@ -230,6 +233,36 @@ static struct llama_model * llama_model_load_from_file_impl( return model; } +bool llama_expected_memory_use(const char * path_model, struct llama_model_params mparams, + struct llama_context_params cparams, size_t * nbytes_expect) { + mparams.use_mmap = false; // FIXME very slow otherwise + + std::vector splits = {}; + llama_model * model = llama_model_load_from_file_impl(path_model, splits, mparams, /*dry_run =*/ true); + if (model == NULL) { + LLAMA_LOG_ERROR("%s: failed to load model '%s'\n", __func__, path_model); + return false; + } + + llama_context * lctx = llama_init_from_model_impl(model, cparams, /*dry_run =*/ true); + if (lctx == NULL) { + LLAMA_LOG_ERROR("%s: failed to create context with model '%s'\n", __func__, path_model); + llama_model_free(model); + return false; + } + + for (size_t i = 0; i < model->n_devices(); i++) { + ggml_backend_dev_t dev = model->devices[i]; + const size_t nbytes_static = model->total_size(dev) + lctx->total_size(dev); + const size_t nbytes_compute = lctx->get_expected_max_size(dev); + nbytes_expect[i] = nbytes_static + nbytes_compute; + LLAMA_LOG_DEBUG("%s: %s: %zu + %zu = %zu MiB\n", __func__, ggml_backend_dev_name(dev), + nbytes_static/(1024*1024), nbytes_compute/(1024*1024), nbytes_expect[i]/(1024*1024)); + } + return true; +} + + // deprecated struct llama_model * llama_load_model_from_file( const char * path_model, @@ -241,7 +274,7 @@ struct llama_model * llama_model_load_from_file( const char * path_model, struct llama_model_params params) { std::vector splits = {}; - return llama_model_load_from_file_impl(path_model, splits, params); + return llama_model_load_from_file_impl(path_model, splits, 
params, /*dry_run =*/ false);
 }
 
 struct llama_model * llama_model_load_from_splits(
@@ -256,7 +289,7 @@ struct llama_model * llama_model_load_from_splits(
     for (size_t i = 0; i < n_paths; ++i) {
         splits.push_back(paths[i]);
     }
-    return llama_model_load_from_file_impl(splits.front(), splits, params);
+    return llama_model_load_from_file_impl(splits.front(), splits, params, /*dry_run =*/ false);
 }
 
 void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
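
For reference, the sketch below shows how a client could use the new llama_expected_memory_use() entry point to check whether a given n_gpu_layers setting is expected to fit into free device memory with a safety margin, mirroring the get_min_margin logic of common_fit_to_free_memory() above. This is a minimal, hypothetical example and not part of the patch: the helper fits_in_free_memory() and the main() driver are illustrative, n_gpu_layers = 99 is an arbitrary candidate value, the 1 GiB margin copies the default hard-coded in common_init_from_params(), and the per-device indexing of nbytes_expect is assumed to follow the same convention as the common.cpp caller in this patch.

// sketch only: assumes the llama_expected_memory_use() declaration added in include/llama.h by this patch
#include "llama.h"
#include "ggml-backend.h"

#include <cstdint>
#include <cstdio>
#include <vector>

// returns true if the expected memory use of (mparams, cparams) leaves at least
// `margin` bytes free on every backend device
static bool fits_in_free_memory(const char * path_model,
        const llama_model_params & mparams, const llama_context_params & cparams, size_t margin) {
    const size_t n_dev = ggml_backend_dev_count();

    // free/total memory currently reported by each backend device
    std::vector<size_t> memory_free(n_dev);
    std::vector<size_t> memory_total(n_dev);
    for (size_t i = 0; i < n_dev; i++) {
        ggml_backend_dev_memory(ggml_backend_dev_get(i), &memory_free[i], &memory_total[i]);
    }

    // dry-run load: expected static + compute bytes per device, no real allocations
    std::vector<size_t> memory_expect(n_dev, 0);
    if (!llama_expected_memory_use(path_model, mparams, cparams, memory_expect.data())) {
        return false; // estimation failed, be conservative
    }

    for (size_t i = 0; i < n_dev; i++) {
        const int64_t headroom = (int64_t) memory_free[i] - (int64_t) memory_expect[i];
        if (headroom < (int64_t) margin) {
            return false;
        }
    }
    return true;
}

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    llama_backend_init();

    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();
    mparams.n_gpu_layers = 99; // candidate value to test

    const size_t margin = 1024u*1024u*1024u; // 1 GiB, same as the default in common_init_from_params()

    const bool ok = fits_in_free_memory(argv[1], mparams, cparams, margin);
    printf("n_gpu_layers = %d is%s expected to fit with a %zu MiB margin\n",
            (int) mparams.n_gpu_layers, ok ? "" : " not", margin/(1024*1024));
    return 0;
}

Note that the patch itself still marks the upper bound of 128 GPU layers and the use_mmap workaround as FIXMEs, so the exact probing strategy is expected to change.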