From e5f1e38be0abac1a671219ac9a726da00530be89 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Mon, 29 Dec 2025 22:00:34 +0100
Subject: [PATCH 1/8] lora: count lora nodes in graph_max_nodes

---
 src/llama-adapter.h   | 4 ++++
 src/llama-context.cpp | 6 +++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/llama-adapter.h b/src/llama-adapter.h
index 4f65247c0fe..813023d9833 100644
--- a/src/llama-adapter.h
+++ b/src/llama-adapter.h
@@ -77,6 +77,10 @@ struct llama_adapter_lora {
     ~llama_adapter_lora() = default;
 
     llama_adapter_lora_weight * get_weight(ggml_tensor * w);
+
+    uint32_t get_n_nodes() const {
+        return ab_map.size() * 2u;
+    }
 };
 
 using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 1c530fdc919..a00415e23ab 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1442,7 +1442,11 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
     if (model.arch == LLM_ARCH_QWEN3NEXT) {
         return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
     }
-    return std::max<uint32_t>(1024u, 8u*model.n_tensors());
+    uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
+    for (const auto & lora : loras) {
+        res += lora.first->get_n_nodes();
+    }
+    return res;
 }
 
 llm_graph_result * llama_context::get_gf_res_reserve() const {

From fe2f7fc70209aee8248020c6fc3cb9f60e4b2359 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Mon, 29 Dec 2025 22:02:50 +0100
Subject: [PATCH 2/8] 3 nodes per weight

---
 src/llama-adapter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-adapter.h b/src/llama-adapter.h
index 813023d9833..acd3e6ed0d8 100644
--- a/src/llama-adapter.h
+++ b/src/llama-adapter.h
@@ -79,7 +79,7 @@ struct llama_adapter_lora {
     llama_adapter_lora_weight * get_weight(ggml_tensor * w);
 
     uint32_t get_n_nodes() const {
-        return ab_map.size() * 2u;
+        return ab_map.size() * 3u; // mul_mat, scale, add
     }
 };
 

From ac6392dbddefd86bdf0f59a80946d76127fcbaad Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Mon, 29 Dec 2025 22:05:52 +0100
Subject: [PATCH 3/8] 4 nodes

---
 src/llama-adapter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-adapter.h b/src/llama-adapter.h
index acd3e6ed0d8..0c8cd4e9ea4 100644
--- a/src/llama-adapter.h
+++ b/src/llama-adapter.h
@@ -79,7 +79,7 @@ struct llama_adapter_lora {
     llama_adapter_lora_weight * get_weight(ggml_tensor * w);
 
     uint32_t get_n_nodes() const {
-        return ab_map.size() * 3u; // mul_mat, scale, add
+        return ab_map.size() * 4u; // scale, add, 2 x mul_mat
     }
 };
 

From d66c5cd4474bcdff6d569b528c4693ee6ea627fb Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Tue, 30 Dec 2025 00:08:56 +0100
Subject: [PATCH 4/8] keep track n_lora_nodes from llama_model

---
 include/llama.h       |  2 ++
 src/llama-adapter.cpp | 15 ++++++++++++---
 src/llama-adapter.h   |  4 +++-
 src/llama-context.cpp |  4 +---
 src/llama-model.h     |  4 ++++
 5 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/include/llama.h b/include/llama.h
index 4f0124fdc87..8b3c8a7b10a 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -607,6 +607,8 @@ extern "C" {
     //
 
     // Load a LoRA adapter from file
+    // The adapter is valid as long as the associated model is not freed
+    // All adapters must be loaded before context creation
     LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
             struct llama_model * model,
             const char * path_lora);
diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp
index d8eef75a7ad..4158a673585 100644
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@@ -146,9 +146,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
     return nullptr;
 }
 
-static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
+static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
 
+    llama_model & model = adapter.model;
+
     ggml_context * ctx_init;
     gguf_init_params meta_gguf_params = {
         /* .no_alloc = */ true,
@@ -411,14 +413,17 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
         }
     }
 
+    // update number of nodes used
+    adapter.model.n_lora_nodes += adapter.get_n_nodes();
+
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }
 
 llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
-    llama_adapter_lora * adapter = new llama_adapter_lora();
+    llama_adapter_lora * adapter = new llama_adapter_lora(*model);
 
     try {
-        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
+        llama_adapter_lora_init_impl(path_lora, *adapter);
         return adapter;
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -469,6 +474,10 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
 }
 
 void llama_adapter_lora_free(llama_adapter_lora * adapter) {
+    // update number of nodes used
+    adapter->model.n_lora_nodes -= adapter->get_n_nodes();
+    GGML_ASSERT(adapter->model.n_lora_nodes >= 0);
+
     delete adapter;
 }
 
diff --git a/src/llama-adapter.h b/src/llama-adapter.h
index 0c8cd4e9ea4..d0dad8a789e 100644
--- a/src/llama-adapter.h
+++ b/src/llama-adapter.h
@@ -59,6 +59,8 @@ struct llama_adapter_lora_weight {
 };
 
 struct llama_adapter_lora {
+    llama_model & model;
+
     // map tensor name to lora_a_b
     std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
 
@@ -73,7 +75,7 @@ struct llama_adapter_lora {
     // activated lora (aLoRA)
     std::vector<llama_token> alora_invocation_tokens;
 
-    llama_adapter_lora() = default;
+    llama_adapter_lora(llama_model & model) : model(model) {}
     ~llama_adapter_lora() = default;
 
     llama_adapter_lora_weight * get_weight(ggml_tensor * w);
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index a00415e23ab..34dfcd4724b 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1443,9 +1443,7 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
         return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
     }
     uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
-    for (const auto & lora : loras) {
-        res += lora.first->get_n_nodes();
-    }
+    res += model.n_lora_nodes;
     return res;
 }
 
diff --git a/src/llama-model.h b/src/llama-model.h
index dbe5edc1536..ae62a733731 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -12,6 +12,7 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+#include <set>
 
 struct llama_cparams;
 struct llama_ubatch;
@@ -475,6 +476,9 @@ struct llama_model {
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
 
+    // for keeping track of extra nodes used by lora adapters
+    uint32_t n_lora_nodes = 0;
+
     int64_t t_load_us  = 0;
     int64_t t_start_us = 0;
 

From 11c4867cbde3f7d0611e6b1c0a9885da7354a659 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Tue, 30 Dec 2025 00:11:19 +0100
Subject: [PATCH 5/8] fix assert

---
 src/llama-adapter.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp
index 4158a673585..b77e3a70326 100644
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@@ -475,8 +475,14 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
 
 void llama_adapter_lora_free(llama_adapter_lora * adapter) {
+    // freeing a null adapter is a no-op, as it was when this function only did `delete adapter`
+    if (adapter == nullptr) {
+        return;
+    }
+
     // update number of nodes used
+    // (check before subtracting: n_lora_nodes is unsigned, so an underflow would wrap around)
+    GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes());
     adapter->model.n_lora_nodes -= adapter->get_n_nodes();
-    GGML_ASSERT(adapter->model.n_lora_nodes >= 0);
 
     delete adapter;
 }

From 771a4062fc0b070275ad2fafa822189c402169b5 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Tue, 30 Dec 2025 00:14:51 +0100
Subject: [PATCH 6/8] rm redundant header

---
 src/llama-model.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/llama-model.h b/src/llama-model.h
index ae62a733731..f4f44a92b63 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -12,7 +12,6 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
-#include <set>
 
 struct llama_cparams;
 struct llama_ubatch;

From f20b386a37e45edb49dafd105934a98eb51e578a Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Tue, 30 Dec 2025 15:38:25 +0100
Subject: [PATCH 7/8] common: load adapters before context creation

---
 common/common.cpp | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 58fef595468..79c4756125b 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1109,6 +1109,25 @@ common_init_result::common_init_result(common_params & params) :
 
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
+    // load lora adapters (must be done before context creation); they are optionally applied to the context later
+    for (auto & la : params.lora_adapters) {
+        llama_adapter_lora_ptr lora;
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
+        if (lora == nullptr) {
+            LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str());
+            pimpl->model.reset(model);
+            return;
+        }
+
+        char buf[1024];
+        la.ptr = lora.get();
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
+        la.task_name = buf;
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
+        la.prompt_prefix = buf;
+        pimpl->lora.emplace_back(std::move(lora)); // move into the list of loaded adapters
+    }
+
     // updates params.sampling
     // TODO: fix naming
     common_init_sampler_from_model(model, params.sampling);
@@ -1245,24 +1264,6 @@ common_init_result_ptr common_init_from_params(common_params & params) {
         }
     }
 
-    // load and optionally apply lora adapters
-    for (auto & la : params.lora_adapters) {
-        llama_adapter_lora_ptr lora;
-        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
-        if (lora == nullptr) {
-            LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
-            return res;
-        }
-
-        char buf[1024];
-        la.ptr = lora.get();
-        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
-        la.task_name = buf;
-        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
-        la.prompt_prefix = buf;
-        res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
-    }
-
     if (!params.lora_init_without_apply) {
         common_set_adapter_lora(lctx, params.lora_adapters);
     }

From 8f637a6615648d027a0cbdddd54e2ce5547012d8 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Tue, 30 Dec 2025 15:38:38 +0100
Subject: [PATCH 8/8] use 6 nodes

---
 src/llama-adapter.cpp | 2 +-
 src/llama-adapter.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp
index b77e3a70326..bdc24c2d6b1 100644
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@@ -414,7 +414,7 @@ static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_l
     }
 
     // update number of nodes used
-    adapter.model.n_lora_nodes += adapter.get_n_nodes();
+    model.n_lora_nodes += adapter.get_n_nodes();
 
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }
diff --git a/src/llama-adapter.h b/src/llama-adapter.h
index d0dad8a789e..42d64a6e0b5 100644
--- a/src/llama-adapter.h
+++ b/src/llama-adapter.h
@@ -81,7 +81,7 @@ struct llama_adapter_lora {
     llama_adapter_lora_weight * get_weight(ggml_tensor * w);
 
     uint32_t get_n_nodes() const {
-        return ab_map.size() * 4u; // scale, add, 2 x mul_mat
+        return ab_map.size() * 6u; // nodes per adapted weight: a, b, scale, add, 2 x mul_mat
     }
 };