Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2293,7 +2293,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"path to LoRA adapter (use comma-separated values to load multiple adapters)",
[](common_params & params, const std::string & value) {
for (const auto & item : string_split<std::string>(value, ',')) {
params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
params.lora_adapters.push_back({ item, 1.0 });
}
}
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
Expand All @@ -2308,7 +2308,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
if (parts.size() != 2) {
throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
}
params.lora_adapters.push_back({ parts[0], std::stof(parts[1]), "", "", nullptr });
params.lora_adapters.push_back({ parts[0], std::stof(parts[1]) });
}
}
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
Expand Down
61 changes: 35 additions & 26 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1083,7 +1083,7 @@ struct common_init_result::impl {
llama_model_ptr model;
llama_context_ptr context;

std::vector<llama_adapter_lora_ptr> lora;
std::vector<common_adapter_lora_info> loras;

std::vector<common_sampler_ptr> samplers;
};
Expand Down Expand Up @@ -1149,6 +1149,27 @@ common_init_result::common_init_result(common_params & params) :
pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
}

// read and load lora adapters
uint64_t n_lora_tensors = 0;
for (auto & la : params.lora_adapters) {
llama_adapter_lora_ptr ptr{ llama_adapter_lora_init(model, la.path.c_str()) };
if (ptr == nullptr) {
LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str());
return;
}
auto & info = pimpl->loras.emplace_back(common_adapter_lora_info{std::move(ptr), la.path, "", ""});

char buf[1024];
auto *lora = info.ptr.get();
llama_adapter_meta_val_str(lora, "adapter.lora.task_name", buf, sizeof(buf));
info.task_name = buf;
llama_adapter_meta_val_str(lora, "adapter.lora.prompt_prefix", buf, sizeof(buf));
info.prompt_prefix = buf;

n_lora_tensors += llama_adapter_lora_get_n_tensors(lora);
}
cparams.n_lora_tensors = n_lora_tensors;

llama_context * lctx = llama_init_from_model(model, cparams);
if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
Expand All @@ -1170,8 +1191,8 @@ common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
return pimpl->samplers[seq_id].get();
}

std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
return pimpl->lora;
const std::vector<common_adapter_lora_info> & common_init_result::loras() const {
return pimpl->loras;
}

void common_init_result::free_context() {
Expand Down Expand Up @@ -1245,26 +1266,8 @@ common_init_result_ptr common_init_from_params(common_params & params) {
}
}

// load and optionally apply lora adapters
for (auto & la : params.lora_adapters) {
llama_adapter_lora_ptr lora;
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
if (lora == nullptr) {
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
return res;
}

char buf[1024];
la.ptr = lora.get();
llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
la.task_name = buf;
llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
la.prompt_prefix = buf;
res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
}

if (!params.lora_init_without_apply) {
common_set_adapter_lora(lctx, params.lora_adapters);
common_set_adapter_lora(lctx, params.lora_adapters, res->loras());
}

if (params.warmup) {
Expand Down Expand Up @@ -1325,11 +1328,17 @@ std::string get_model_endpoint() {
return model_endpoint;
}

void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
void common_set_adapter_lora(
struct llama_context * ctx,
const std::vector<common_adapter_lora_param> & lora_params,
const std::vector<common_adapter_lora_info> & loras
) {

llama_clear_adapter_lora(ctx);
for (auto & la : lora) {
if (la.scale != 0.0f) {
llama_set_adapter_lora(ctx, la.ptr, la.scale);
GGML_ASSERT(loras.size() <= lora_params.size());
for (size_t i = 0; i < loras.size(); i++) {
if (lora_params[i].scale != 0.0f) {
llama_set_adapter_lora(ctx, loras[i].ptr.get(), lora_params[i].scale);
}
}
}
Expand Down
20 changes: 13 additions & 7 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,17 @@ struct common_time_meas {
int64_t & t_acc;
};

struct common_adapter_lora_info {
struct common_adapter_lora_param {
std::string path;
float scale;
};

struct common_adapter_lora_info {
llama_adapter_lora_ptr ptr;

std::string path;
std::string task_name;
std::string prompt_prefix;

struct llama_adapter_lora * ptr;
};

using llama_tokens = std::vector<llama_token>;
Expand Down Expand Up @@ -375,8 +378,8 @@ struct common_params {
std::vector<llama_model_kv_override> kv_overrides;
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_set_adapter_lora)
std::vector<common_adapter_lora_param> lora_adapters; // lora adapter path with user defined scale

std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale

Expand Down Expand Up @@ -691,7 +694,7 @@ struct common_init_result {
llama_context * context();
common_sampler * sampler(llama_seq_id seq_id);

std::vector<llama_adapter_lora_ptr> & lora();
const std::vector<common_adapter_lora_info> & loras() const;

void free_context();

Expand All @@ -709,7 +712,10 @@ struct llama_context_params common_context_params_to_llama(const common_params
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

// clear LoRA adapters from context, then apply new list of adapters
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
void common_set_adapter_lora(
struct llama_context * ctx,
const std::vector<common_adapter_lora_param> & lora_params,
const std::vector<common_adapter_lora_info> & loras);

std::string get_model_endpoint();

Expand Down
5 changes: 5 additions & 0 deletions include/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,8 @@ extern "C" {
bool kv_unified; // use a unified buffer across the input sequences when computing the attention
// try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
// ref: https://github.com/ggml-org/llama.cpp/pull/14363

uint64_t n_lora_tensors;
};

// model quantization parameters
Expand Down Expand Up @@ -626,6 +628,9 @@ extern "C" {
// NOTE: loaded adapters will be freed when the associated model is deleted
LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);

// for llama_context_params::n_lora_tensors
LLAMA_API uint64_t llama_adapter_lora_get_n_tensors(const struct llama_adapter_lora * adapter);

// Get the invocation tokens if the current lora is an alora
LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens (const struct llama_adapter_lora * adapter);
Expand Down
7 changes: 7 additions & 0 deletions src/llama-adapter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,13 @@ void llama_adapter_lora_free(llama_adapter_lora * adapter) {
delete adapter;
}

// number of tensors contributed by this adapter: each ab_map entry holds a
// (lora_a, lora_b) tensor pair, hence the factor of two; nullptr => 0
uint64_t llama_adapter_lora_get_n_tensors(const struct llama_adapter_lora * adapter) {
    return adapter == nullptr ? 0 : uint64_t(adapter->ab_map.size()) * 2;
}

uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
if (!adapter) {
return 0;
Expand Down
11 changes: 6 additions & 5 deletions src/llama-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ llama_context::llama_context(
const uint32_t n_seqs = cparams.n_seq_max;
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);

const size_t max_nodes = this->graph_max_nodes(n_tokens);
const size_t max_nodes = this->graph_max_nodes(n_tokens, params.n_lora_tensors);

LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);

Expand Down Expand Up @@ -1438,11 +1438,11 @@ void llama_context::output_reorder() {
// graph
//

uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
uint32_t llama_context::graph_max_nodes(uint32_t n_tokens, uint32_t n_lora_tensors) const {
if (model.arch == LLM_ARCH_QWEN3NEXT) {
return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
return std::max<uint32_t>(n_tokens * 40, 32u * (model.n_tensors() + n_lora_tensors));
}
return std::max<uint32_t>(1024u, 8u*model.n_tensors());
return std::max<uint32_t>(1024u, 8u * (model.n_tensors() + n_lora_tensors));
}

llm_graph_result * llama_context::get_gf_res_reserve() const {
Expand Down Expand Up @@ -1476,7 +1476,7 @@ ggml_cgraph * llama_context::graph_reserve(
llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);

auto * res = gf_res_reserve.get();

// TODO(review): should the reserve graph assume all LoRA adapters are active,
// so max_nodes accounts for the worst case? confirm before merging
const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);

res->reset();
Expand Down Expand Up @@ -2392,6 +2392,7 @@ llama_context_params llama_context_default_params() {
/*.op_offload =*/ true,
/*.swa_full =*/ true,
/*.kv_unified =*/ false,
/*.n_lora_tensors =*/ 0,
};

return result;
Expand Down
2 changes: 1 addition & 1 deletion src/llama-context.h
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ struct llama_context {
//

public:
uint32_t graph_max_nodes(uint32_t n_tokens) const;
uint32_t graph_max_nodes(uint32_t n_tokens, uint32_t n_lora_tensors) const;

// can reuse the llm_graph_result instance of the context (for example to update a memory module)
llm_graph_result * get_gf_res_reserve() const;
Expand Down
2 changes: 1 addition & 1 deletion tools/export-lora/export-lora.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ struct lora_merge_ctx {

lora_merge_ctx(
std::string & base_fname,
std::vector<common_adapter_lora_info> & lora_files,
std::vector<common_adapter_lora_param> & lora_files,
std::string & outfile,
int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
Expand Down
49 changes: 23 additions & 26 deletions tools/server/server-common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,15 @@ std::string gen_tool_call_id() {
// lora utils
//

bool lora_all_alora(const std::vector<common_adapter_lora_info> & loras) {
bool lora_all_alora(
const std::vector<common_adapter_lora_info> & lora_adapters,
const std::map<int, float> & loras) {

bool found_alora = false;
for (const auto & lora : loras) {
if (lora.scale != 0) {
if (llama_adapter_get_alora_n_invocation_tokens(lora.ptr) == 0) {
for (size_t i = 0; i < lora_adapters.size(); i++) {
auto it = loras.find(i);
if (it != loras.end() && it->second != 0.0f) {
if (llama_adapter_get_alora_n_invocation_tokens(lora_adapters[i].ptr.get()) == 0) {
return false;
}
found_alora = true;
Expand All @@ -102,21 +106,22 @@ bool lora_all_alora(const std::vector<common_adapter_lora_info> & loras) {
}

bool lora_should_clear_cache(
const std::vector<common_adapter_lora_info> & current,
const std::vector<common_adapter_lora_info> & next) {
const std::vector<common_adapter_lora_info> & lora_adapters,
const lora_scales & current,
const lora_scales & next) {

// This should always be called after determining that the two sets are
// _not_ equal. This assert is therefore some slightly wasted work and
// should be safe to remove as long as this method is called correctly.
GGML_ASSERT(!are_lora_equal(current, next));

return (
!(lora_get_enabled_ids(current).empty() || lora_all_alora(current)) ||
!lora_all_alora(next));
!(lora_get_enabled_ids(current).empty() || lora_all_alora(lora_adapters, current)) ||
!lora_all_alora(lora_adapters, next));
}

std::map<int, float> parse_lora_request(const json & data) {
std::map<int, float> lora;
lora_scales parse_lora_request(const json & data) {
lora_scales lora;

// set value
for (const auto & entry : data) {
Expand All @@ -129,25 +134,17 @@ std::map<int, float> parse_lora_request(const json & data) {
}

bool are_lora_equal(
const std::vector<common_adapter_lora_info> & l1,
const std::vector<common_adapter_lora_info> & l2) {
if (l1.size() != l2.size()) {
return false;
}
for (size_t i = 0; i < l1.size(); ++i) {
// we don't check lora.path to reduce the time complexity
if (l1[i].scale != l2[i].scale || l1[i].ptr != l2[i].ptr) {
return false;
}
}
return true;
const lora_scales & l1,
const lora_scales & l2) {

return l1 == l2;
}

std::vector<size_t> lora_get_enabled_ids(const std::vector<common_adapter_lora_info> & loras) {
std::vector<size_t> lora_get_enabled_ids(const lora_scales & loras) {
std::vector<size_t> enabled_ids;
for (size_t i = 0; i < loras.size(); ++i) {
if (loras[i].scale > 0) {
enabled_ids.push_back(i);
for (const auto &it : loras) {
if (it.second != 0.0f) {
enabled_ids.push_back(it.first);
}
}
return enabled_ids;
Expand Down
15 changes: 9 additions & 6 deletions tools/server/server-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,23 +98,26 @@ std::string gen_tool_call_id();
// lora utils
//

using lora_scales = std::map<int, float>;

// check whether the given lora set has only aloras activated (empty => false)
bool lora_all_alora(const std::vector<common_adapter_lora_info> & loras);
bool lora_all_alora(const std::vector<common_adapter_lora_info> & lora_adapters, const lora_scales & loras);

// if the two sets of loras are different, they require a cache clear unless the
// change is only from aloras to aloras.
bool lora_should_clear_cache(
const std::vector<common_adapter_lora_info> & current,
const std::vector<common_adapter_lora_info> & next);
const std::vector<common_adapter_lora_info> & lora_adapters,
const lora_scales & current,
const lora_scales & next);

std::map<int, float> parse_lora_request(const json & data);

bool are_lora_equal(
const std::vector<common_adapter_lora_info> & l1,
const std::vector<common_adapter_lora_info> & l2);
const lora_scales & l1,
const lora_scales & l2);

// get the ids of all enabled loras
std::vector<size_t> lora_get_enabled_ids(const std::vector<common_adapter_lora_info> & loras);
std::vector<size_t> lora_get_enabled_ids(const lora_scales & loras);

//
// server_tokens
Expand Down
Loading