From fdfa89f12e75c37bc0883891151b5f93c2239126 Mon Sep 17 00:00:00 2001 From: Pavel Kostyuchenko Date: Fri, 26 Dec 2025 00:59:07 +0300 Subject: [PATCH 1/4] Fix a crash on multiple active LoRA adapters (issue 18050) Split command-line parameters and runtime adapter info into separate structs. Bump max graph size according to the number of loaded LoRA tensors. --- common/arg.cpp | 4 +- common/common.cpp | 61 ++++++++++-------- common/common.h | 20 +++--- include/llama.h | 5 ++ src/llama-adapter.cpp | 7 +++ src/llama-context.cpp | 11 ++-- src/llama-context.h | 2 +- tools/server/server-common.cpp | 49 +++++++-------- tools/server/server-common.h | 15 +++-- tools/server/server-context.cpp | 106 ++++++++++++++++++-------------- tools/server/server-task.cpp | 8 +-- tools/server/server-task.h | 6 +- 12 files changed, 171 insertions(+), 123 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 774f8731a9f..081905c6da3 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2293,7 +2293,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "path to LoRA adapter (use comma-separated values to load multiple adapters)", [](common_params & params, const std::string & value) { for (const auto & item : string_split(value, ',')) { - params.lora_adapters.push_back({ item, 1.0, "", "", nullptr }); + params.lora_adapters.push_back({ item, 1.0 }); } } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg @@ -2308,7 +2308,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex if (parts.size() != 2) { throw std::invalid_argument("lora-scaled format: FNAME:SCALE"); } - params.lora_adapters.push_back({ parts[0], std::stof(parts[1]), "", "", nullptr }); + params.lora_adapters.push_back({ parts[0], std::stof(parts[1]) }); } } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized 
as "example-specific" arg diff --git a/common/common.cpp b/common/common.cpp index acf2ec841d7..311f77abb62 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1083,7 +1083,7 @@ struct common_init_result::impl { llama_model_ptr model; llama_context_ptr context; - std::vector lora; + std::vector loras; std::vector samplers; }; @@ -1149,6 +1149,27 @@ common_init_result::common_init_result(common_params & params) : pimpl->samplers[i].reset(common_sampler_init(model, params.sampling)); } + // read and load lora adapters + uint64_t n_lora_tensors = 0; + for (auto & la : params.lora_adapters) { + llama_adapter_lora_ptr ptr{ llama_adapter_lora_init(model, la.path.c_str()) }; + if (ptr == nullptr) { + LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str()); + return; + } + auto & info = pimpl->loras.emplace_back(common_adapter_lora_info{std::move(ptr), la.path, "", ""}); + + char buf[1024]; + auto *lora = info.ptr.get(); + llama_adapter_meta_val_str(lora, "adapter.lora.task_name", buf, sizeof(buf)); + info.task_name = buf; + llama_adapter_meta_val_str(lora, "adapter.lora.prompt_prefix", buf, sizeof(buf)); + info.prompt_prefix = buf; + + n_lora_tensors += llama_adapter_lora_get_n_tensors(lora); + } + cparams.n_lora_tensors = n_lora_tensors; + llama_context * lctx = llama_init_from_model(model, cparams); if (lctx == NULL) { LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str()); @@ -1170,8 +1191,8 @@ common_sampler * common_init_result::sampler(llama_seq_id seq_id) { return pimpl->samplers[seq_id].get(); } -std::vector & common_init_result::lora() { - return pimpl->lora; +const std::vector & common_init_result::loras() const { + return pimpl->loras; } void common_init_result::free_context() { @@ -1245,26 +1266,8 @@ common_init_result_ptr common_init_from_params(common_params & params) { } } - // load and optionally apply lora adapters - for (auto & la : params.lora_adapters) { - llama_adapter_lora_ptr lora; 
- lora.reset(llama_adapter_lora_init(model, la.path.c_str())); - if (lora == nullptr) { - LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); - return res; - } - - char buf[1024]; - la.ptr = lora.get(); - llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf)); - la.task_name = buf; - llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf)); - la.prompt_prefix = buf; - res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters - } - if (!params.lora_init_without_apply) { - common_set_adapter_lora(lctx, params.lora_adapters); + common_set_adapter_lora(lctx, params.lora_adapters, res->loras()); } if (params.warmup) { @@ -1325,11 +1328,17 @@ std::string get_model_endpoint() { return model_endpoint; } -void common_set_adapter_lora(struct llama_context * ctx, std::vector & lora) { +void common_set_adapter_lora( + struct llama_context * ctx, + const std::vector & lora_params, + const std::vector & loras + ) { + llama_clear_adapter_lora(ctx); - for (auto & la : lora) { - if (la.scale != 0.0f) { - llama_set_adapter_lora(ctx, la.ptr, la.scale); + GGML_ASSERT(loras.size() <= lora_params.size()); + for (size_t i = 0; i < loras.size(); i++) { + if (lora_params[i].scale != 0.0f) { + llama_set_adapter_lora(ctx, loras[i].ptr.get(), lora_params[i].scale); } } } diff --git a/common/common.h b/common/common.h index 334372073a9..0a72b128370 100644 --- a/common/common.h +++ b/common/common.h @@ -39,14 +39,17 @@ struct common_time_meas { int64_t & t_acc; }; -struct common_adapter_lora_info { +struct common_adapter_lora_param { std::string path; float scale; +}; + +struct common_adapter_lora_info { + llama_adapter_lora_ptr ptr; + std::string path; std::string task_name; std::string prompt_prefix; - - struct llama_adapter_lora * ptr; }; using llama_tokens = std::vector; @@ -375,8 +378,8 @@ struct common_params { std::vector kv_overrides; std::vector tensor_buft_overrides; - bool 
lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply) - std::vector lora_adapters; // lora adapter path with user defined scale + bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_set_adapter_lora) + std::vector lora_adapters; // lora adapter path with user defined scale std::vector control_vectors; // control vector with user defined scale @@ -691,7 +694,7 @@ struct common_init_result { llama_context * context(); common_sampler * sampler(llama_seq_id seq_id); - std::vector & lora(); + const std::vector & loras() const; void free_context(); @@ -709,7 +712,10 @@ struct llama_context_params common_context_params_to_llama(const common_params struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params); // clear LoRA adapters from context, then apply new list of adapters -void common_set_adapter_lora(struct llama_context * ctx, std::vector & lora); +void common_set_adapter_lora( + struct llama_context * ctx, + const std::vector & lora_params, + const std::vector & loras); std::string get_model_endpoint(); diff --git a/include/llama.h b/include/llama.h index f8629300991..3697d26f5e0 100644 --- a/include/llama.h +++ b/include/llama.h @@ -364,6 +364,8 @@ extern "C" { bool kv_unified; // use a unified buffer across the input sequences when computing the attention // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix // ref: https://github.com/ggml-org/llama.cpp/pull/14363 + + uint64_t n_lora_tensors; }; // model quantization parameters @@ -626,6 +628,9 @@ extern "C" { // NOTE: loaded adapters will be free when the associated model is deleted LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter); + // Get the number of tensors in the adapter (0 if adapter is NULL); sum over all adapters to fill llama_context_params::n_lora_tensors + LLAMA_API uint64_t 
llama_adapter_lora_get_n_tensors(const struct llama_adapter_lora * adapter); + // Get the invocation tokens if the current lora is an alora LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter); LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens (const struct llama_adapter_lora * adapter); diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index d8eef75a7ad..4fb72bbc52a 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -472,6 +472,13 @@ void llama_adapter_lora_free(llama_adapter_lora * adapter) { delete adapter; } +uint64_t llama_adapter_lora_get_n_tensors(const struct llama_adapter_lora * adapter) { + if (!adapter) { + return 0; + } + return adapter->ab_map.size() * 2; +} + uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) { if (!adapter) { return 0; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 015ebae71d6..85f3a3864bc 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -283,7 +283,7 @@ llama_context::llama_context( const uint32_t n_seqs = cparams.n_seq_max; const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - const size_t max_nodes = this->graph_max_nodes(n_tokens); + const size_t max_nodes = this->graph_max_nodes(n_tokens, params.n_lora_tensors); LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes); @@ -1438,11 +1438,11 @@ void llama_context::output_reorder() { // graph // -uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const { +uint32_t llama_context::graph_max_nodes(uint32_t n_tokens, uint32_t n_lora_tensors) const { if (model.arch == LLM_ARCH_QWEN3NEXT) { - return std::max(n_tokens * 40, 32u * model.n_tensors()); + return std::max(n_tokens * 40, 32u * (model.n_tensors() + n_lora_tensors)); } - return std::max(1024u, 8u*model.n_tensors()); + return std::max(1024u, 8u * (model.n_tensors() + n_lora_tensors)); } llm_graph_result * 
llama_context::get_gf_res_reserve() const { @@ -1476,7 +1476,7 @@ ggml_cgraph * llama_context::graph_reserve( llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs); auto * res = gf_res_reserve.get(); - + /* build graph with all lora-s active? */ const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT); res->reset(); @@ -2392,6 +2392,7 @@ llama_context_params llama_context_default_params() { /*.op_offload =*/ true, /*.swa_full =*/ true, /*.kv_unified =*/ false, + /*.n_lora_tensors =*/ 0, }; return result; diff --git a/src/llama-context.h b/src/llama-context.h index c31101330e2..d2bad562398 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -201,7 +201,7 @@ struct llama_context { // public: - uint32_t graph_max_nodes(uint32_t n_tokens) const; + uint32_t graph_max_nodes(uint32_t n_tokens, uint32_t n_lora_tensors) const; // can reuse the llm_graph_result instance of the context (for example to update a memory module) llm_graph_result * get_gf_res_reserve() const; diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index b02afaefda1..b062327f9b0 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -88,11 +88,15 @@ std::string gen_tool_call_id() { // lora utils // -bool lora_all_alora(const std::vector & loras) { +bool lora_all_alora( + const std::vector & lora_adapters, + const std::map & loras) { + bool found_alora = false; - for (const auto & lora : loras) { - if (lora.scale != 0) { - if (llama_adapter_get_alora_n_invocation_tokens(lora.ptr) == 0) { + for (size_t i = 0; i < lora_adapters.size(); i++) { + auto it = loras.find(i); + if (it != loras.end() && it->second != 0.0f) { + if (llama_adapter_get_alora_n_invocation_tokens(lora_adapters[i].ptr.get()) == 0) { return false; } found_alora = true; @@ -102,8 +106,9 @@ bool lora_all_alora(const std::vector & loras) { } bool lora_should_clear_cache( - const std::vector & current, - const std::vector & next) { + const 
std::vector & lora_adapters, + const lora_scales & current, + const lora_scales & next) { // This should always be called after determining that the two sets are // _not_ equal. This assert is therefore some slightly wasted work and @@ -111,12 +116,12 @@ bool lora_should_clear_cache( GGML_ASSERT(!are_lora_equal(current, next)); return ( - !(lora_get_enabled_ids(current).empty() || lora_all_alora(current)) || - !lora_all_alora(next)); + !(lora_get_enabled_ids(current).empty() || lora_all_alora(lora_adapters, current)) || + !lora_all_alora(lora_adapters, next)); } -std::map parse_lora_request(const json & data) { - std::map lora; +lora_scales parse_lora_request(const json & data) { + lora_scales lora; // set value for (const auto & entry : data) { @@ -129,25 +134,17 @@ std::map parse_lora_request(const json & data) { } bool are_lora_equal( - const std::vector & l1, - const std::vector & l2) { - if (l1.size() != l2.size()) { - return false; - } - for (size_t i = 0; i < l1.size(); ++i) { - // we don't check lora.path to reduce the time complexity - if (l1[i].scale != l2[i].scale || l1[i].ptr != l2[i].ptr) { - return false; - } - } - return true; + const lora_scales & l1, + const lora_scales & l2) { + + return l1 == l2; } -std::vector lora_get_enabled_ids(const std::vector & loras) { +std::vector lora_get_enabled_ids(const lora_scales & loras) { std::vector enabled_ids; - for (size_t i = 0; i < loras.size(); ++i) { - if (loras[i].scale > 0) { - enabled_ids.push_back(i); + for (const auto &it : loras) { + if (it.second != 0.0f) { + enabled_ids.push_back(it.first); } } return enabled_ids; diff --git a/tools/server/server-common.h b/tools/server/server-common.h index 152a2a3c46c..cf91871fd49 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -98,23 +98,26 @@ std::string gen_tool_call_id(); // lora utils // +using lora_scales = std::map; + // check whether the given lora set has only aloras activated (empty => false) -bool lora_all_alora(const 
std::vector & loras); +bool lora_all_alora(const std::vector & lora_adapters, const lora_scales & loras); // if the two sets of loras are different, they require a cache clear unless the // change is only from aloras to aloras. bool lora_should_clear_cache( - const std::vector & current, - const std::vector & next); + const std::vector & lora_adapters, + const lora_scales & current, + const lora_scales & next); std::map parse_lora_request(const json & data); bool are_lora_equal( - const std::vector & l1, - const std::vector & l2); + const lora_scales & l1, + const lora_scales & l2); // get the ids of all enabled loras -std::vector lora_get_enabled_ids(const std::vector & loras); +std::vector lora_get_enabled_ids(const lora_scales & loras); // // server_tokens diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 94825dc8621..89951d500d0 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -147,7 +147,7 @@ struct server_slot { return res; } - std::vector lora; + lora_scales lora; int32_t alora_invocation_start = -1; // sampling @@ -543,6 +543,10 @@ struct server_context_impl { llama_context * ctx = nullptr; + const std::vector & lora_adapters() const { + return llama_init->loras(); + } + bool vocab_dft_compatible = true; llama_model * model_dft = nullptr; @@ -1048,42 +1052,35 @@ struct server_context_impl { return res; } - std::vector construct_lora_list(const std::map & config) { - std::vector output = params_base.lora_adapters; // copy - for (size_t i = 0; i < output.size(); ++i) { - auto it = config.find(i); - if (it != config.end()) { - output[i].scale = it->second; - } else { - output[i].scale = 0.0f; - } - } - return output; - } - bool launch_slot_with_task(server_slot & slot, server_task && task) { slot.reset(); // process per-request lora adapters if (!task.params.lora.empty()) { - auto task_loras = construct_lora_list(task.params.lora); - if (!are_lora_equal(task_loras, slot.lora)) { + if 
(!are_lora_equal(task.params.lora, slot.lora)) { // if lora has changed, check to see if the cache should be cleared - if (lora_should_clear_cache(slot.lora, task_loras)) { + if (lora_should_clear_cache(lora_adapters(), slot.lora, task.params.lora)) { SLT_INF(slot, "clearing cache for lora change. %zu loras -> %zu loras\n", slot.lora.size(), task.params.lora.size()); slot.prompt.tokens.clear(); } else { - SLT_INF(slot, "keeping cache for alora. %zu target loras\n", task_loras.size()); + SLT_INF(slot, "keeping cache for alora. %zu target loras\n", task.params.lora.size()); } - slot.lora = task_loras; + slot.lora = task.params.lora; } } else { - slot.lora = params_base.lora_adapters; + // fetch default scales from params_base.lora_adapters + lora_scales scales; + for (size_t i = 0; i < params_base.lora_adapters.size(); i++) { + if (params_base.lora_adapters[i].scale != 0.0f) { + scales[i] = params_base.lora_adapters[i].scale; + } + } + slot.lora = scales; } // if using alora, make sure it's only a single one requested and active size_t alora_invocation_start = task.tokens.size(); - if (lora_all_alora(slot.lora)) { + if (lora_all_alora(lora_adapters(), slot.lora)) { const auto & enabled_ids = lora_get_enabled_ids(slot.lora); // TODO: This will error out if a user requests two aloras, but only // provides the activation string for one. 
We could, instead search @@ -1093,7 +1090,7 @@ struct server_context_impl { send_error(task, "Cannot run multiple aLoRAs in a single request", ERROR_TYPE_INVALID_REQUEST); return false; } - const auto & lora = slot.lora[enabled_ids[0]].ptr; + const auto * lora = lora_adapters().at(enabled_ids[0]).ptr.get(); // get the pointer and count for the invocation tokens const uint64_t n_invocation_tokens = llama_adapter_get_alora_n_invocation_tokens(lora); @@ -1122,7 +1119,7 @@ struct server_context_impl { // if the activation string is not found, disable the alora if (alora_invocation_start == task.tokens.size()) { SLT_DBG(slot, "alora %zu requested, but not found. deactivating\n", enabled_ids[0]); - slot.lora[enabled_ids[0]].scale = 0.0f; + slot.lora[enabled_ids[0]] = 0.0f; } else { SLT_DBG(slot, "alora %zu activated starting at %zu\n", enabled_ids[0], alora_invocation_start); slot.alora_invocation_start = alora_invocation_start; @@ -1811,38 +1808,50 @@ struct server_context_impl { case SERVER_TASK_TYPE_GET_LORA: { // TODO @ngxson : make lora_adapters a dedicated member of server_context - auto & loras = params_base.lora_adapters; + const auto & lora_params = params_base.lora_adapters; + GGML_ASSERT(lora_adapters().size() <= lora_params.size()); auto res = std::make_unique(); res->id = task.id; - for (size_t i = 0; i < loras.size(); ++i) { - auto & lora = loras[i]; - std::string alora_invocation_string = ""; - const uint64_t n_alora_tokens = llama_adapter_get_alora_n_invocation_tokens(lora.ptr); + for (size_t i = 0; i < lora_adapters().size(); ++i) { + const auto & param = lora_params[i]; + const auto & adapter = lora_adapters()[i]; + const auto * lora_ptr = adapter.ptr.get(); + std::string alora_invocation_string; + const uint64_t n_alora_tokens = llama_adapter_get_alora_n_invocation_tokens(lora_ptr); llama_tokens alora_invocation_tokens; if (n_alora_tokens) { - const llama_token * alora_tokens = llama_adapter_get_alora_invocation_tokens(lora.ptr); + const llama_token * 
alora_tokens = llama_adapter_get_alora_invocation_tokens(lora_ptr); for (uint64_t j = 0; j < n_alora_tokens; ++j) { alora_invocation_string += common_token_to_piece(vocab, alora_tokens[j]); alora_invocation_tokens.push_back(alora_tokens[j]); } } res->loras.push_back(server_task_result_get_lora::lora{ - lora, - alora_invocation_string, - alora_invocation_tokens, + param.path, + param.scale, + adapter.task_name, + adapter.prompt_prefix, + std::move(alora_invocation_string), + std::move(alora_invocation_tokens), }); } queue_results.send(std::move(res)); } break; case SERVER_TASK_TYPE_SET_LORA: { - auto new_loras = construct_lora_list(task.set_lora); - // logging - for (size_t i = 0; i < new_loras.size(); ++i) { - SRV_INF("set lora adapter idx=%zu scale=%f\n", i, new_loras[i].scale); - } // TODO @ngxson : make lora_adapters a dedicated member of server_context - params_base.lora_adapters = new_loras; + auto & params = params_base.lora_adapters; + for (size_t i = 0; i < params.size(); i++) { + auto it = task.set_lora.find(i); + if (it != task.set_lora.end()) { + params[i].scale = it->second; + } else { + params[i].scale = 0.0f; + } + + SRV_INF("set lora adapter idx=%zu scale=%f\n", i, params[i].scale); + } + auto res = std::make_unique(); res->id = task.id; queue_results.send(std::move(res)); @@ -2368,13 +2377,15 @@ struct server_context_impl { // tokens before the invocation sequence need to be // processed without the adapter in a separate batch, then // the adapter needs to be enabled for the remaining tokens. 
- if (lora_all_alora(slot.lora) && slot.alora_invocation_start - 1 > slot.prompt.n_tokens()) { + if (lora_all_alora(lora_adapters(), slot.lora) && slot.alora_invocation_start - 1 > slot.prompt.n_tokens()) { SLT_DBG(slot, "processing pre-alora tokens without the adapter (n_tokens = %d, alora_invocation_start = %d)\n", slot.prompt.n_tokens(), slot.alora_invocation_start); const auto & enabled_loras = lora_get_enabled_ids(slot.lora); GGML_ASSERT(enabled_loras.size() == 1); - alora_scale = slot.lora[enabled_loras[0]].scale; - slot.lora[enabled_loras[0]].scale = 0.0f; - alora_disabled_id = enabled_loras[0]; + auto it = slot.lora.find(enabled_loras[0]); + GGML_ASSERT(it != slot.lora.end()); + alora_scale = it->second; + alora_disabled_id = it->first; + slot.lora.erase(it); } bool do_checkpoint = params_base.n_ctx_checkpoints > 0; @@ -2509,13 +2520,18 @@ struct server_context_impl { if (slot_batched) { // apply lora, only need to do it once per batch - common_set_adapter_lora(ctx, slot_batched->lora); + llama_clear_adapter_lora(ctx); + for (const auto &scale : slot_batched->lora) { + if (scale.second != 0.0f) { + llama_set_adapter_lora(ctx, lora_adapters().at(scale.first).ptr.get(), scale.second); + } + } // if the lora is temporarily disabled for an alora, re-enable it // for next time - if (alora_scale > 0.0f) { + if (alora_scale != 0.0f) { SRV_DBG("re-enabling alora with scale %f\n", alora_scale); - slot_batched->lora[alora_disabled_id].scale = alora_scale; + slot_batched->lora[alora_disabled_id] = alora_scale; } llama_set_embeddings(ctx, slot_batched->need_embd()); diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 22f5b2059c0..3a157c97a0f 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -1332,10 +1332,10 @@ json server_task_result_get_lora::to_json() { auto & lora = loras[i]; json entry = { {"id", i}, - {"path", lora.info.path}, - {"scale", lora.info.scale}, - {"task_name", lora.info.task_name}, - 
{"prompt_prefix", lora.info.prompt_prefix}, + {"path", lora.path}, + {"scale", lora.scale}, + {"task_name", lora.task_name}, + {"prompt_prefix", lora.prompt_prefix}, }; if (!lora.alora_invocation_tokens.empty()) { entry["alora_invocation_string"] = lora.alora_invocation_string; diff --git a/tools/server/server-task.h b/tools/server/server-task.h index 687770de5e9..9420c27bdea 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -444,7 +444,11 @@ struct server_task_result_slot_erase : server_task_result { struct server_task_result_get_lora : server_task_result { struct lora { - common_adapter_lora_info info; + std::string path; + float scale; + std::string task_name; + std::string prompt_prefix; + std::string alora_invocation_string; llama_tokens alora_invocation_tokens; }; From 210d65de764e4119f68276ad45b450810fcdcd4b Mon Sep 17 00:00:00 2001 From: Pavel Kostyuchenko Date: Fri, 26 Dec 2025 02:05:18 +0300 Subject: [PATCH 2/4] Update export-lora.cpp --- tools/export-lora/export-lora.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/export-lora/export-lora.cpp b/tools/export-lora/export-lora.cpp index f038019b007..c92da6ae0e6 100644 --- a/tools/export-lora/export-lora.cpp +++ b/tools/export-lora/export-lora.cpp @@ -129,7 +129,7 @@ struct lora_merge_ctx { lora_merge_ctx( std::string & base_fname, - std::vector & lora_files, + std::vector & lora_files, std::string & outfile, int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) { fout.exceptions(std::ofstream::failbit); // fail fast on write errors From bccd2c36a901ac5180b57444dc168725c490e0d3 Mon Sep 17 00:00:00 2001 From: Pavel Kostyuchenko Date: Fri, 26 Dec 2025 13:40:55 +0300 Subject: [PATCH 3/4] ping From bd38e5956260ff2009c3dd4397b163b55c2e1369 Mon Sep 17 00:00:00 2001 From: Pavel Kostyuchenko Date: Fri, 26 Dec 2025 17:55:21 +0300 Subject: [PATCH 4/4] Correct alora_scale handling --- tools/server/server-context.cpp | 4 
++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 89951d500d0..72c3ec81106 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2028,7 +2028,7 @@ struct server_context_impl { int32_t n_batch = llama_n_batch(ctx); int32_t n_ubatch = llama_n_ubatch(ctx); - float alora_scale = -1.0f; + float alora_scale = 0.0f; size_t alora_disabled_id = 0; // next, batch any pending prompts without exceeding n_batch @@ -2416,7 +2416,7 @@ struct server_context_impl { // if this is an alora request with pre-invocation // tokens that are not cached, we need to stop filling // this batch at those pre-invocation tokens. - if (alora_scale > 0 && slot.prompt.n_tokens() == slot.alora_invocation_start - 1) { + if (alora_scale != 0.0f && slot.prompt.n_tokens() == slot.alora_invocation_start - 1) { SLT_DBG(slot, "stop prompt batch filling at (n_tokens = %d, alora_invocation_start = %d)\n", slot.prompt.n_tokens(), slot.alora_invocation_start); break; }