Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2293,7 +2293,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"path to LoRA adapter (use comma-separated values to load multiple adapters)",
[](common_params & params, const std::string & value) {
for (const auto & item : string_split<std::string>(value, ',')) {
params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
params.lora_adapters.push_back({ item, 1.0 });
}
}
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
Expand All @@ -2308,7 +2308,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
if (parts.size() != 2) {
throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
}
params.lora_adapters.push_back({ parts[0], std::stof(parts[1]), "", "", nullptr });
params.lora_adapters.push_back({ parts[0], std::stof(parts[1]) });
}
}
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
Expand Down
61 changes: 35 additions & 26 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1083,7 +1083,7 @@ struct common_init_result::impl {
llama_model_ptr model;
llama_context_ptr context;

std::vector<llama_adapter_lora_ptr> lora;
std::vector<common_adapter_lora_info> loras;

std::vector<common_sampler_ptr> samplers;
};
Expand Down Expand Up @@ -1149,6 +1149,27 @@ common_init_result::common_init_result(common_params & params) :
pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
}

// read and load lora adapters
uint64_t n_lora_tensors = 0;
for (auto & la : params.lora_adapters) {
llama_adapter_lora_ptr ptr{ llama_adapter_lora_init(model, la.path.c_str()) };
if (ptr == nullptr) {
LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str());
return;
}
auto & info = pimpl->loras.emplace_back(common_adapter_lora_info{std::move(ptr), la.path, "", ""});

char buf[1024];
auto *lora = info.ptr.get();
llama_adapter_meta_val_str(lora, "adapter.lora.task_name", buf, sizeof(buf));
info.task_name = buf;
llama_adapter_meta_val_str(lora, "adapter.lora.prompt_prefix", buf, sizeof(buf));
info.prompt_prefix = buf;

n_lora_tensors += llama_adapter_lora_get_n_tensors(lora);
}
cparams.n_lora_tensors = n_lora_tensors;

llama_context * lctx = llama_init_from_model(model, cparams);
if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
Expand All @@ -1170,8 +1191,8 @@ common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
return pimpl->samplers[seq_id].get();
}

std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
return pimpl->lora;
const std::vector<common_adapter_lora_info> & common_init_result::loras() const {
return pimpl->loras;
}

void common_init_result::free_context() {
Expand Down Expand Up @@ -1245,26 +1266,8 @@ common_init_result_ptr common_init_from_params(common_params & params) {
}
}

// load and optionally apply lora adapters
for (auto & la : params.lora_adapters) {
llama_adapter_lora_ptr lora;
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
if (lora == nullptr) {
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
return res;
}

char buf[1024];
la.ptr = lora.get();
llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
la.task_name = buf;
llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
la.prompt_prefix = buf;
res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
}

if (!params.lora_init_without_apply) {
common_set_adapter_lora(lctx, params.lora_adapters);
common_set_adapter_lora(lctx, params.lora_adapters, res->loras());
}

if (params.warmup) {
Expand Down Expand Up @@ -1325,11 +1328,17 @@ std::string get_model_endpoint() {
return model_endpoint;
}

void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
void common_set_adapter_lora(
struct llama_context * ctx,
const std::vector<common_adapter_lora_param> & lora_params,
const std::vector<common_adapter_lora_info> & loras
) {

llama_clear_adapter_lora(ctx);
for (auto & la : lora) {
if (la.scale != 0.0f) {
llama_set_adapter_lora(ctx, la.ptr, la.scale);
GGML_ASSERT(loras.size() <= lora_params.size());
for (size_t i = 0; i < loras.size(); i++) {
if (lora_params[i].scale != 0.0f) {
llama_set_adapter_lora(ctx, loras[i].ptr.get(), lora_params[i].scale);
}
}
}
Expand Down
20 changes: 13 additions & 7 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,17 @@ struct common_time_meas {
int64_t & t_acc;
};

struct common_adapter_lora_info {
struct common_adapter_lora_param {
std::string path;
float scale;
};

struct common_adapter_lora_info {
llama_adapter_lora_ptr ptr;

std::string path;
std::string task_name;
std::string prompt_prefix;

struct llama_adapter_lora * ptr;
};

using llama_tokens = std::vector<llama_token>;
Expand Down Expand Up @@ -375,8 +378,8 @@ struct common_params {
std::vector<llama_model_kv_override> kv_overrides;
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_set_adapter_lora)
std::vector<common_adapter_lora_param> lora_adapters; // lora adapter path with user defined scale

std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale

Expand Down Expand Up @@ -691,7 +694,7 @@ struct common_init_result {
llama_context * context();
common_sampler * sampler(llama_seq_id seq_id);

std::vector<llama_adapter_lora_ptr> & lora();
const std::vector<common_adapter_lora_info> & loras() const;

void free_context();

Expand All @@ -709,7 +712,10 @@ struct llama_context_params common_context_params_to_llama(const common_params
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

// clear LoRA adapters from context, then apply new list of adapters
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
void common_set_adapter_lora(
struct llama_context * ctx,
const std::vector<common_adapter_lora_param> & lora_params,
const std::vector<common_adapter_lora_info> & loras);

std::string get_model_endpoint();

Expand Down
5 changes: 5 additions & 0 deletions include/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,8 @@ extern "C" {
bool kv_unified; // use a unified buffer across the input sequences when computing the attention
// try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
// ref: https://github.com/ggml-org/llama.cpp/pull/14363

uint64_t n_lora_tensors;
};

// model quantization parameters
Expand Down Expand Up @@ -626,6 +628,9 @@ extern "C" {
// NOTE: loaded adapters will be freed when the associated model is deleted
LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);

// for llama_context_params::n_lora_tensors
LLAMA_API uint64_t llama_adapter_lora_get_n_tensors(const struct llama_adapter_lora * adapter);

// Get the invocation tokens if the current lora is an alora
LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens (const struct llama_adapter_lora * adapter);
Expand Down
7 changes: 7 additions & 0 deletions src/llama-adapter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,13 @@ void llama_adapter_lora_free(llama_adapter_lora * adapter) {
delete adapter;
}

// number of tensors contributed by this adapter: each ab_map entry holds a
// (lora_a, lora_b) tensor pair, hence the factor of two; nullptr => 0
uint64_t llama_adapter_lora_get_n_tensors(const struct llama_adapter_lora * adapter) {
    return adapter == nullptr ? 0 : uint64_t(adapter->ab_map.size()) * 2;
}

uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
if (!adapter) {
return 0;
Expand Down
11 changes: 6 additions & 5 deletions src/llama-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ llama_context::llama_context(
const uint32_t n_seqs = cparams.n_seq_max;
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);

const size_t max_nodes = this->graph_max_nodes(n_tokens);
const size_t max_nodes = this->graph_max_nodes(n_tokens, params.n_lora_tensors);

LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);

Expand Down Expand Up @@ -1438,11 +1438,11 @@ void llama_context::output_reorder() {
// graph
//

uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
uint32_t llama_context::graph_max_nodes(uint32_t n_tokens, uint32_t n_lora_tensors) const {
if (model.arch == LLM_ARCH_QWEN3NEXT) {
return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
return std::max<uint32_t>(n_tokens * 40, 32u * (model.n_tensors() + n_lora_tensors));
}
return std::max<uint32_t>(1024u, 8u*model.n_tensors());
return std::max<uint32_t>(1024u, 8u * (model.n_tensors() + n_lora_tensors));
}

llm_graph_result * llama_context::get_gf_res_reserve() const {
Expand Down Expand Up @@ -1476,7 +1476,7 @@ ggml_cgraph * llama_context::graph_reserve(
llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);

auto * res = gf_res_reserve.get();

// TODO(review): should the reserve graph assume all LoRA adapters are active,
// so max_nodes accounts for the worst case? confirm before merging
const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);

res->reset();
Expand Down Expand Up @@ -2392,6 +2392,7 @@ llama_context_params llama_context_default_params() {
/*.op_offload =*/ true,
/*.swa_full =*/ true,
/*.kv_unified =*/ false,
/*.n_lora_tensors =*/ 0,
};

return result;
Expand Down
2 changes: 1 addition & 1 deletion src/llama-context.h
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ struct llama_context {
//

public:
uint32_t graph_max_nodes(uint32_t n_tokens) const;
uint32_t graph_max_nodes(uint32_t n_tokens, uint32_t n_lora_tensors) const;

// can reuse the llm_graph_result instance of the context (for example to update a memory module)
llm_graph_result * get_gf_res_reserve() const;
Expand Down
2 changes: 1 addition & 1 deletion tools/export-lora/export-lora.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ struct lora_merge_ctx {

lora_merge_ctx(
std::string & base_fname,
std::vector<common_adapter_lora_info> & lora_files,
std::vector<common_adapter_lora_param> & lora_files,
std::string & outfile,
int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
Expand Down
49 changes: 23 additions & 26 deletions tools/server/server-common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,15 @@ std::string gen_tool_call_id() {
// lora utils
//

bool lora_all_alora(const std::vector<common_adapter_lora_info> & loras) {
bool lora_all_alora(
const std::vector<common_adapter_lora_info> & lora_adapters,
const std::map<int, float> & loras) {

bool found_alora = false;
for (const auto & lora : loras) {
if (lora.scale != 0) {
if (llama_adapter_get_alora_n_invocation_tokens(lora.ptr) == 0) {
for (size_t i = 0; i < lora_adapters.size(); i++) {
auto it = loras.find(i);
if (it != loras.end() && it->second != 0.0f) {
if (llama_adapter_get_alora_n_invocation_tokens(lora_adapters[i].ptr.get()) == 0) {
return false;
}
found_alora = true;
Expand All @@ -102,21 +106,22 @@ bool lora_all_alora(const std::vector<common_adapter_lora_info> & loras) {
}

bool lora_should_clear_cache(
const std::vector<common_adapter_lora_info> & current,
const std::vector<common_adapter_lora_info> & next) {
const std::vector<common_adapter_lora_info> & lora_adapters,
const lora_scales & current,
const lora_scales & next) {

// This should always be called after determining that the two sets are
// _not_ equal. This assert is therefore some slightly wasted work and
// should be safe to remove as long as this method is called correctly.
GGML_ASSERT(!are_lora_equal(current, next));

return (
!(lora_get_enabled_ids(current).empty() || lora_all_alora(current)) ||
!lora_all_alora(next));
!(lora_get_enabled_ids(current).empty() || lora_all_alora(lora_adapters, current)) ||
!lora_all_alora(lora_adapters, next));
}

std::map<int, float> parse_lora_request(const json & data) {
std::map<int, float> lora;
lora_scales parse_lora_request(const json & data) {
lora_scales lora;

// set value
for (const auto & entry : data) {
Expand All @@ -129,25 +134,17 @@ std::map<int, float> parse_lora_request(const json & data) {
}

bool are_lora_equal(
const std::vector<common_adapter_lora_info> & l1,
const std::vector<common_adapter_lora_info> & l2) {
if (l1.size() != l2.size()) {
return false;
}
for (size_t i = 0; i < l1.size(); ++i) {
// we don't check lora.path to reduce the time complexity
if (l1[i].scale != l2[i].scale || l1[i].ptr != l2[i].ptr) {
return false;
}
}
return true;
const lora_scales & l1,
const lora_scales & l2) {

return l1 == l2;
}

std::vector<size_t> lora_get_enabled_ids(const std::vector<common_adapter_lora_info> & loras) {
std::vector<size_t> lora_get_enabled_ids(const lora_scales & loras) {
std::vector<size_t> enabled_ids;
for (size_t i = 0; i < loras.size(); ++i) {
if (loras[i].scale > 0) {
enabled_ids.push_back(i);
for (const auto &it : loras) {
if (it.second != 0.0f) {
enabled_ids.push_back(it.first);
}
}
return enabled_ids;
Expand Down
15 changes: 9 additions & 6 deletions tools/server/server-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,23 +98,26 @@ std::string gen_tool_call_id();
// lora utils
//

using lora_scales = std::map<int, float>;

// check whether the given lora set has only aloras activated (empty => false)
bool lora_all_alora(const std::vector<common_adapter_lora_info> & loras);
bool lora_all_alora(const std::vector<common_adapter_lora_info> & lora_adapters, const lora_scales & loras);

// if the two sets of loras are different, they require a cache clear unless the
// change is only from aloras to aloras.
bool lora_should_clear_cache(
const std::vector<common_adapter_lora_info> & current,
const std::vector<common_adapter_lora_info> & next);
const std::vector<common_adapter_lora_info> & lora_adapters,
const lora_scales & current,
const lora_scales & next);

std::map<int, float> parse_lora_request(const json & data);

bool are_lora_equal(
const std::vector<common_adapter_lora_info> & l1,
const std::vector<common_adapter_lora_info> & l2);
const lora_scales & l1,
const lora_scales & l2);

// get the ids of all enabled loras
std::vector<size_t> lora_get_enabled_ids(const std::vector<common_adapter_lora_info> & loras);
std::vector<size_t> lora_get_enabled_ids(const lora_scales & loras);

//
// server_tokens
Expand Down
Loading