37 changes: 19 additions & 18 deletions common/common.cpp
@@ -1109,6 +1109,25 @@ common_init_result::common_init_result(common_params & params) :

const llama_vocab * vocab = llama_model_get_vocab(model);

// load and optionally apply lora adapters (must be loaded before context creation)
for (auto & la : params.lora_adapters) {
llama_adapter_lora_ptr lora;
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
if (lora == nullptr) {
LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str());
pimpl->model.reset(model);
return;
}

char buf[1024];
la.ptr = lora.get();
llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
la.task_name = buf;
llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
la.prompt_prefix = buf;
pimpl->lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
}

// updates params.sampling
// TODO: fix naming
common_init_sampler_from_model(model, params.sampling);
@@ -1245,24 +1264,6 @@ common_init_result_ptr common_init_from_params(common_params & params) {
}
}

// load and optionally apply lora adapters
for (auto & la : params.lora_adapters) {
llama_adapter_lora_ptr lora;
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
if (lora == nullptr) {
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
return res;
}

char buf[1024];
la.ptr = lora.get();
llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
la.task_name = buf;
llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
la.prompt_prefix = buf;
res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
}

if (!params.lora_init_without_apply) {
common_set_adapter_lora(lctx, params.lora_adapters);
}
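The relocated block also copies the adapter's task name and prompt prefix out of the GGUF metadata. A minimal caller-side sketch of the same lookup (not part of this PR), assuming llama_adapter_meta_val_str follows the convention of llama_model_meta_val_str and returns a negative value when the key is missing, with a placeholder adapter path:

// sketch: read adapter metadata after loading, checking the return value
llama_adapter_lora * adapter = llama_adapter_lora_init(model, "adapter.gguf"); // hypothetical path
if (adapter != nullptr) {
    char buf[1024];
    std::string task_name;
    if (llama_adapter_meta_val_str(adapter, "adapter.lora.task_name", buf, sizeof(buf)) >= 0) {
        task_name = buf; // key present, keep the value
    }
    std::string prompt_prefix;
    if (llama_adapter_meta_val_str(adapter, "adapter.lora.prompt_prefix", buf, sizeof(buf)) >= 0) {
        prompt_prefix = buf;
    }
}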
2 changes: 2 additions & 0 deletions include/llama.h
@@ -607,6 +607,8 @@ extern "C" {
//

// Load a LoRA adapter from file
// The adapter is valid as long as the associated model is not freed
// All adapters must be loaded before context creation
LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
struct llama_model * model,
const char * path_lora);
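The new comment implies a fixed call order: model first, then every adapter, then the context. A minimal sketch of that ordering, assuming the current public API names (llama_model_load_from_file, llama_init_from_model, llama_set_adapter_lora) and placeholder file names; this code is not part of the PR:

// 1. load the model
llama_model * model = llama_model_load_from_file("model.gguf", llama_model_default_params());
// 2. load every LoRA adapter while the model is alive and before any context exists
llama_adapter_lora * adapter = llama_adapter_lora_init(model, "lora.gguf");
// 3. only then create the context, so graph_max_nodes() can account for the adapter nodes
llama_context * ctx = llama_init_from_model(model, llama_context_default_params());
// 4. apply the adapter to the context with a scale of 1.0
llama_set_adapter_lora(ctx, adapter, 1.0f);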
15 changes: 12 additions & 3 deletions src/llama-adapter.cpp
@@ -146,9 +146,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
return nullptr;
}

static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) {
LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

llama_model & model = adapter.model;

ggml_context * ctx_init;
gguf_init_params meta_gguf_params = {
/* .no_alloc = */ true,
@@ -411,14 +413,17 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
}
}

// update number of nodes used
model.n_lora_nodes += adapter.get_n_nodes();

LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}

llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
llama_adapter_lora * adapter = new llama_adapter_lora();
llama_adapter_lora * adapter = new llama_adapter_lora(*model);

try {
llama_adapter_lora_init_impl(*model, path_lora, *adapter);
llama_adapter_lora_init_impl(path_lora, *adapter);
return adapter;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -469,6 +474,10 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
}

void llama_adapter_lora_free(llama_adapter_lora * adapter) {
// update number of nodes used
GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes());
adapter->model.n_lora_nodes -= adapter->get_n_nodes();

delete adapter;
}

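Because the adapter now holds a reference to its model and llama_adapter_lora_free() writes back to model.n_lora_nodes, an adapter must not outlive the model it was created from. A small sketch of the assumed teardown order:

llama_adapter_lora_free(adapter); // decrements model.n_lora_nodes
llama_model_free(model);          // safe only after all of its adapters are freed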
8 changes: 7 additions & 1 deletion src/llama-adapter.h
@@ -59,6 +59,8 @@ struct llama_adapter_lora_weight {
};

struct llama_adapter_lora {
llama_model & model;

// map tensor name to lora_a_b
std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;

@@ -73,10 +75,14 @@ struct llama_adapter_lora {
// activated lora (aLoRA)
std::vector<llama_token> alora_invocation_tokens;

llama_adapter_lora() = default;
llama_adapter_lora(llama_model & model) : model(model) {}
~llama_adapter_lora() = default;

llama_adapter_lora_weight * get_weight(ggml_tensor * w);

uint32_t get_n_nodes() const {
return ab_map.size() * 6u; // a, b, scale, add, 2 x mul_mat
}
};

using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
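For scale: get_n_nodes() charges six graph nodes per entry in ab_map (the a and b tensors, the two mul_mat ops, a scale and an add), so an adapter targeting, say, 64 weight tensors reserves 64 × 6 = 384 extra nodes.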
4 changes: 3 additions & 1 deletion src/llama-context.cpp
@@ -1442,7 +1442,9 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
if (model.arch == LLM_ARCH_QWEN3NEXT) {
return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
}
return std::max<uint32_t>(1024u, 8u*model.n_tensors());
uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
res += model.n_lora_nodes;
return res;
}

llm_graph_result * llama_context::get_gf_res_reserve() const {
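Worked through with hypothetical numbers: for a model with 300 tensors and the 64-tensor adapter from the note above, the reserved graph size grows from max(1024, 8 × 300) = 2400 nodes to 2400 + 384 = 2784 once the adapter is loaded.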
3 changes: 3 additions & 0 deletions src/llama-model.h
@@ -475,6 +475,9 @@ struct llama_model {
// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

// for keeping track of extra nodes used by lora adapters
uint32_t n_lora_nodes = 0;

int64_t t_load_us = 0;
int64_t t_start_us = 0;
