From e5f1e38be0abac1a671219ac9a726da00530be89 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 29 Dec 2025 22:00:34 +0100 Subject: [PATCH 1/8] lora: count lora nodes in graph_max_nodes --- src/llama-adapter.h | 4 ++++ src/llama-context.cpp | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/llama-adapter.h b/src/llama-adapter.h index 4f65247c0fe..813023d9833 100644 --- a/src/llama-adapter.h +++ b/src/llama-adapter.h @@ -77,6 +77,10 @@ struct llama_adapter_lora { ~llama_adapter_lora() = default; llama_adapter_lora_weight * get_weight(ggml_tensor * w); + + uint32_t get_n_nodes() const { + return ab_map.size() * 2u; + } }; using llama_adapter_loras = std::unordered_map; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 1c530fdc919..a00415e23ab 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1442,7 +1442,11 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const { if (model.arch == LLM_ARCH_QWEN3NEXT) { return std::max(n_tokens * 40, 32u * model.n_tensors()); } - return std::max(1024u, 8u*model.n_tensors()); + uint32_t res = std::max(1024u, 8u*model.n_tensors()); + for (const auto & lora : loras) { + res += lora.first->get_n_nodes(); + } + return res; } llm_graph_result * llama_context::get_gf_res_reserve() const { From fe2f7fc70209aee8248020c6fc3cb9f60e4b2359 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 29 Dec 2025 22:02:50 +0100 Subject: [PATCH 2/8] 3 nodes per weight --- src/llama-adapter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-adapter.h b/src/llama-adapter.h index 813023d9833..acd3e6ed0d8 100644 --- a/src/llama-adapter.h +++ b/src/llama-adapter.h @@ -79,7 +79,7 @@ struct llama_adapter_lora { llama_adapter_lora_weight * get_weight(ggml_tensor * w); uint32_t get_n_nodes() const { - return ab_map.size() * 2u; + return ab_map.size() * 3u; // mul_mat, scale, add } }; From ac6392dbddefd86bdf0f59a80946d76127fcbaad Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 29 Dec 2025 22:05:52 +0100 Subject: [PATCH 3/8] 4 nodes --- src/llama-adapter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-adapter.h b/src/llama-adapter.h index acd3e6ed0d8..0c8cd4e9ea4 100644 --- a/src/llama-adapter.h +++ b/src/llama-adapter.h @@ -79,7 +79,7 @@ struct llama_adapter_lora { llama_adapter_lora_weight * get_weight(ggml_tensor * w); uint32_t get_n_nodes() const { - return ab_map.size() * 3u; // mul_mat, scale, add + return ab_map.size() * 4u; // scale, add, 2 x mul_mat } }; From d66c5cd4474bcdff6d569b528c4693ee6ea627fb Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 30 Dec 2025 00:08:56 +0100 Subject: [PATCH 4/8] keep track n_lora_nodes from llama_model --- include/llama.h | 2 ++ src/llama-adapter.cpp | 15 ++++++++++++--- src/llama-adapter.h | 4 +++- src/llama-context.cpp | 4 +--- src/llama-model.h | 4 ++++ 5 files changed, 22 insertions(+), 7 deletions(-) diff --git a/include/llama.h b/include/llama.h index 4f0124fdc87..8b3c8a7b10a 100644 --- a/include/llama.h +++ b/include/llama.h @@ -607,6 +607,8 @@ extern "C" { // // Load a LoRA adapter from file + // The adapter is valid as long as the associated model is not freed + // All adapters must be loaded before context creation LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init( struct llama_model * model, const char * path_lora); diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index d8eef75a7ad..4158a673585 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -146,9 +146,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) { return nullptr; } -static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) { +static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) { LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora); + llama_model & model = adapter.model; + ggml_context * ctx_init; gguf_init_params meta_gguf_params = { /* .no_alloc = */ true, @@ -411,14 +413,17 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_ } } + // update number of nodes used + adapter.model.n_lora_nodes += adapter.get_n_nodes(); + LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2); } llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) { - llama_adapter_lora * adapter = new llama_adapter_lora(); + llama_adapter_lora * adapter = new llama_adapter_lora(*model); try { - llama_adapter_lora_init_impl(*model, path_lora, *adapter); + llama_adapter_lora_init_impl(path_lora, *adapter); return adapter; } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); @@ -469,6 +474,10 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter, } void llama_adapter_lora_free(llama_adapter_lora * adapter) { + // update number of nodes used + adapter->model.n_lora_nodes -= adapter->get_n_nodes(); + GGML_ASSERT(adapter->model.n_lora_nodes >= 0); + delete adapter; } diff --git a/src/llama-adapter.h b/src/llama-adapter.h index 0c8cd4e9ea4..d0dad8a789e 100644 --- a/src/llama-adapter.h +++ b/src/llama-adapter.h @@ -59,6 +59,8 @@ struct llama_adapter_lora_weight { }; struct llama_adapter_lora { + llama_model & model; + // map tensor name to lora_a_b std::unordered_map ab_map; @@ -73,7 +75,7 @@ struct llama_adapter_lora { // activated lora (aLoRA) std::vector alora_invocation_tokens; - llama_adapter_lora() = default; + llama_adapter_lora(llama_model & model) : model(model) {} ~llama_adapter_lora() = default; llama_adapter_lora_weight * get_weight(ggml_tensor * w); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index a00415e23ab..34dfcd4724b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1443,9 +1443,7 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const { return std::max(n_tokens * 40, 32u * model.n_tensors()); } uint32_t res = std::max(1024u, 8u*model.n_tensors()); - for (const auto & lora : loras) { - res += lora.first->get_n_nodes(); - } + res += model.n_lora_nodes; return res; } diff --git a/src/llama-model.h b/src/llama-model.h index dbe5edc1536..ae62a733731 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -12,6 +12,7 @@ #include #include #include +#include struct llama_cparams; struct llama_ubatch; @@ -475,6 +476,9 @@ struct llama_model { // for quantize-stats only std::vector> tensors_by_name; + // for keeping track of extra nodes used by lora adapters + uint32_t n_lora_nodes = 0; + int64_t t_load_us = 0; int64_t t_start_us = 0; From 11c4867cbde3f7d0611e6b1c0a9885da7354a659 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 30 Dec 2025 00:11:19 +0100 Subject: [PATCH 5/8] fix assert --- src/llama-adapter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index 4158a673585..b77e3a70326 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -475,8 +475,8 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter, void llama_adapter_lora_free(llama_adapter_lora * adapter) { // update number of nodes used + GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes()); adapter->model.n_lora_nodes -= adapter->get_n_nodes(); - GGML_ASSERT(adapter->model.n_lora_nodes >= 0); delete adapter; } From 771a4062fc0b070275ad2fafa822189c402169b5 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 30 Dec 2025 00:14:51 +0100 Subject: [PATCH 6/8] rm redundant header --- src/llama-model.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llama-model.h b/src/llama-model.h index ae62a733731..f4f44a92b63 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -12,7 +12,6 @@ #include #include #include -#include struct llama_cparams; struct llama_ubatch; From f20b386a37e45edb49dafd105934a98eb51e578a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 30 Dec 2025 15:38:25 +0100 Subject: [PATCH 7/8] common: load adapters before context creation --- common/common.cpp | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 58fef595468..79c4756125b 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1109,6 +1109,25 @@ common_init_result::common_init_result(common_params & params) : const llama_vocab * vocab = llama_model_get_vocab(model); + // load and optionally apply lora adapters (must be loaded before context creation) + for (auto & la : params.lora_adapters) { + llama_adapter_lora_ptr lora; + lora.reset(llama_adapter_lora_init(model, la.path.c_str())); + if (lora == nullptr) { + LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str()); + pimpl->model.reset(model); + return; + } + + char buf[1024]; + la.ptr = lora.get(); + llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf)); + la.task_name = buf; + llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf)); + la.prompt_prefix = buf; + pimpl->lora.emplace_back(std::move(lora)); // copy to list of loaded adapters + } + // updates params.sampling // TODO: fix naming common_init_sampler_from_model(model, params.sampling); @@ -1245,24 +1264,6 @@ common_init_result_ptr common_init_from_params(common_params & params) { } } - // load and optionally apply lora adapters - for (auto & la : params.lora_adapters) { - llama_adapter_lora_ptr lora; - lora.reset(llama_adapter_lora_init(model, la.path.c_str())); - if (lora == nullptr) { - LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); - return res; - } - - char buf[1024]; - la.ptr = lora.get(); - llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf)); - la.task_name = buf; - llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf)); - la.prompt_prefix = buf; - res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters - } - if (!params.lora_init_without_apply) { common_set_adapter_lora(lctx, params.lora_adapters); } From 8f637a6615648d027a0cbdddd54e2ce5547012d8 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 30 Dec 2025 15:38:38 +0100 Subject: [PATCH 8/8] use 6 nodes --- src/llama-adapter.cpp | 2 +- src/llama-adapter.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index b77e3a70326..bdc24c2d6b1 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -414,7 +414,7 @@ static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_l } // update number of nodes used - adapter.model.n_lora_nodes += adapter.get_n_nodes(); + model.n_lora_nodes += adapter.get_n_nodes(); LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2); } diff --git a/src/llama-adapter.h b/src/llama-adapter.h index d0dad8a789e..42d64a6e0b5 100644 --- a/src/llama-adapter.h +++ b/src/llama-adapter.h @@ -81,7 +81,7 @@ struct llama_adapter_lora { llama_adapter_lora_weight * get_weight(ggml_tensor * w); uint32_t get_n_nodes() const { - return ab_map.size() * 4u; // scale, add, 2 x mul_mat + return ab_map.size() * 6u; // a, b, scale, add, 2 x mul_mat } };