From 8e8e2007269670cb0fae82f6fe17da970210ed07 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Sun, 29 Mar 2026 10:00:49 +0200 Subject: [PATCH 01/15] server: add --models-memory-max parameter to allow dynamically unloading models when they exceed a memory size threshold --- common/arg.cpp | 7 +++ common/common.h | 1 + tools/server/server-context.cpp | 1 + tools/server/server-models.cpp | 91 ++++++++++++++++++++++++--------- tools/server/server-models.h | 1 + 5 files changed, 76 insertions(+), 25 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 6751a55ab0c..852b69d4252 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3072,6 +3072,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.models_max = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX")); + add_opt(common_arg( + {"--models-memory-max"}, "N", + string_format("for router server, maximum memory usage in MB (default: %d, 0 = unlimited)", params.models_memory_max), + [](common_params & params, int value) { + params.models_memory_max = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MEMORY_MAX")); add_opt(common_arg( {"--models-autoload"}, {"--no-models-autoload"}, diff --git a/common/common.h b/common/common.h index 4137a87f1d2..cfc68ce9264 100644 --- a/common/common.h +++ b/common/common.h @@ -610,6 +610,7 @@ struct common_params { std::string models_dir = ""; // directory containing models for the router server std::string models_preset = ""; // directory containing model presets for the router server int models_max = 4; // maximum number of models to load simultaneously + int models_memory_max = 0; // maximum memory usage in MB (0 = unlimited, estimated from model files) bool models_autoload = true; // automatically load models when requested via the router server bool log_json = false; diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index a5372572f01..7a4ac804125 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -3615,6 +3615,7 @@ void server_routes::init_routes() { { "total_slots", params.n_parallel }, { "model_alias", meta->model_name }, { "model_path", meta->model_path }, + { "memory_mb", meta->model_size / (1024 * 1024) }, { "modalities", json { {"vision", meta->has_inp_image}, {"audio", meta->has_inp_audio}, diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 6066611f51c..e60efb9f604 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -302,6 +302,7 @@ void server_models::load_models() { /* port */ 0, /* status */ SERVER_MODEL_STATUS_UNLOADED, /* last_used */ 0, + /* memory_mb */ 0, /* args */ std::vector(), /* exit_code */ 0, /* stop_timeout */ DEFAULT_STOP_TIMEOUT, @@ -496,34 +497,45 @@ std::vector server_models::get_all_meta() { } void server_models::unload_lru() { - if (base_params.models_max <= 0) { + if (base_params.models_max <= 0 && base_params.models_memory_max <= 0) { return; // no limit } - // remove one of the servers if we passed the models_max (least recently used - LRU) - std::string lru_model_name = ""; - int64_t lru_last_used = ggml_time_ms(); - size_t count_active = 0; - { - std::unique_lock lk(mutex); - for (const auto & m : mapping) { - if (m.second.meta.is_running()) { - count_active++; - if (m.second.meta.last_used < lru_last_used) { - lru_model_name = m.first; - lru_last_used = m.second.meta.last_used; + // Keep unloading LRU models until limits are satisfied + while (true) { + std::string lru_model_name = ""; + int64_t lru_last_used = ggml_time_ms(); + size_t count_active = 0; + uint64_t total_memory_mb = 0; + { + std::unique_lock lk(mutex); + for (const auto & m : mapping) { + if (m.second.meta.is_running()) { + count_active++; + total_memory_mb += m.second.meta.memory_mb; + if (m.second.meta.last_used < lru_last_used) { + lru_model_name = m.first; + lru_last_used = m.second.meta.last_used; + } } } } - } - if (!lru_model_name.empty() && count_active >= (size_t)base_params.models_max) { - SRV_INF("models_max limit reached, removing LRU name=%s\n", lru_model_name.c_str()); - unload(lru_model_name); - // wait for unload to complete - { - std::unique_lock lk(mutex); - cv.wait(lk, [this, &lru_model_name]() { - return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED; - }); + // Check if limits exceeded + bool count_exceeded = base_params.models_max > 0 && count_active >= (size_t)base_params.models_max; + bool memory_exceeded = base_params.models_memory_max > 0 && total_memory_mb >= (uint64_t)base_params.models_memory_max; + if (!lru_model_name.empty() && (count_exceeded || memory_exceeded)) { + SRV_INF("limits reached (count=%zu, memory=%lu MB), removing LRU name=%s\n", + count_active, (unsigned long)total_memory_mb, lru_model_name.c_str()); + unload(lru_model_name); + // wait for unload to complete + { + std::unique_lock lk(mutex); + cv.wait(lk, [this, &lru_model_name]() { + return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED; + }); + } + // Loop continues to check if more unloading is needed + } else { + break; // limits satisfied } } } @@ -546,14 +558,18 @@ void server_models::load(const std::string & name) { // exceeding models_max. Without this, the window between unload_lru() // releasing its lock and this lock_guard acquiring allows multiple // threads to each observe capacity and all proceed to load. - if (base_params.models_max > 0) { + if (base_params.models_max > 0 || base_params.models_memory_max > 0) { size_t count_active = 0; + uint64_t total_memory_mb = 0; for (const auto & m : mapping) { if (m.second.meta.is_running()) { count_active++; + total_memory_mb += m.second.meta.memory_mb; } } - if (count_active >= (size_t)base_params.models_max) { + bool count_exceeded = base_params.models_max > 0 && count_active >= (size_t)base_params.models_max; + bool memory_exceeded = base_params.models_memory_max > 0 && total_memory_mb >= (uint64_t)base_params.models_memory_max; + if (count_exceeded || memory_exceeded) { throw std::runtime_error("model limit reached, try again later"); } } @@ -610,10 +626,35 @@ void server_models::load(const std::string & name) { // also handle status report from child process if (stdout_file) { char buffer[4096]; + bool ready_received = false; while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) { LOG("[%5d] %s", port, buffer); std::string str(buffer); if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) { + // Query memory usage from the child's /props endpoint + if (!ready_received) { + ready_received = true; + try { + httplib::Client cli("http://CHILD_ADDR"); + cli.set_connection_timeout(5, 0); + if (auto res = cli.Get("/props")) { + if (res->status == 200) { + json props = json::parse(res->body); + if (props.contains("memory_mb")) { + uint64_t memory_mb = props["memory_mb"].get(); + SRV_INF("model %s loaded, memory usage: %lu MB\n", name.c_str(), (unsigned long)memory_mb); + // Update memory_mb in meta + std::lock_guard lk(this->mutex); + if (mapping.find(name) != mapping.end()) { + mapping[name].meta.memory_mb = memory_mb; + } + } + } + } + } catch (const std::exception & e) { + SRV_WRN("failed to query memory for model %s: %s\n", name.c_str(), e.what()); + } + } this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0); } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) { this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0); diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 1db34b6c4df..c195dbeb26e 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -62,6 +62,7 @@ struct server_model_meta { int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading + uint64_t memory_mb = 0; // estimated memory usage in MB std::vector args; // args passed to the model instance, will be populated by render_args() int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown From 777395f6438dd5fe6dcfb25268575694fc229edb Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Sun, 29 Mar 2026 12:18:51 +0200 Subject: [PATCH 02/15] estimate with to-be-loaded model size included --- include/llama.h | 6 +++++ src/llama-model.cpp | 29 +++++++++++++++++++++++ tools/server/server-models.cpp | 43 ++++++++++++++++++++++++---------- tools/server/server-models.h | 4 ++-- 4 files changed, 67 insertions(+), 15 deletions(-) diff --git a/include/llama.h b/include/llama.h index eb869814097..03b83f40d56 100644 --- a/include/llama.h +++ b/include/llama.h @@ -595,6 +595,12 @@ extern "C" { // Returns the total size of all the tensors in the model in bytes LLAMA_API uint64_t llama_model_size(const struct llama_model * model); + // Returns the total size of all the tensors in the model in bytes from a model path + // without fully loading the model. Uses llama_model_loader with no_alloc=true. + // Returns 0 if the model cannot be loaded or the path is invalid. + // This function can be used to estimate memory requirements before loading a model. + LLAMA_API uint64_t llama_model_size_from_path(const char * path); + // Get the default chat template. Returns nullptr if not available // If name is NULL, returns the default chat template LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f77b2e9217f..3a363f55bee 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -9393,6 +9393,35 @@ uint64_t llama_model_size(const llama_model * model) { return model->size(); } +uint64_t llama_model_size_from_path(const char * path) { + if (!path) { + return 0; + } + + try { + std::vector splits; + + llama_model_loader loader( + /* metadata */ nullptr, + /* set_tensor_data */ nullptr, + /* set_tensor_data_ud */ nullptr, + /* fname */ path, + /* splits */ splits, + /* file */ nullptr, + /* use_mmap */ false, + /* use_direct_io */ false, + /* check_tensors */ false, + /* no_alloc */ true, + /* param_overrides_p */ nullptr, + /* param_tensor_buft_overrides_p */ nullptr + ); + + return loader.n_bytes; + } catch (...) { + return 0; + } +} + const char * llama_model_chat_template(const llama_model * model, const char * name) { const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE) : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE); diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index e60efb9f604..fc5cf7c9fd0 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -496,11 +496,10 @@ std::vector server_models::get_all_meta() { return result; } -void server_models::unload_lru() { +void server_models::unload_lru(uint64_t new_model_memory_mb) { if (base_params.models_max <= 0 && base_params.models_memory_max <= 0) { return; // no limit } - // Keep unloading LRU models until limits are satisfied while (true) { std::string lru_model_name = ""; int64_t lru_last_used = ggml_time_ms(); @@ -519,12 +518,14 @@ void server_models::unload_lru() { } } } - // Check if limits exceeded - bool count_exceeded = base_params.models_max > 0 && count_active >= (size_t)base_params.models_max; - bool memory_exceeded = base_params.models_memory_max > 0 && total_memory_mb >= (uint64_t)base_params.models_memory_max; + bool count_exceeded = base_params.models_max > 0 && + (count_active + 1) >= (size_t)base_params.models_max; + uint64_t projected_memory = total_memory_mb + new_model_memory_mb; + bool memory_exceeded = base_params.models_memory_max > 0 && + projected_memory >= (uint64_t)base_params.models_memory_max; if (!lru_model_name.empty() && (count_exceeded || memory_exceeded)) { - SRV_INF("limits reached (count=%zu, memory=%lu MB), removing LRU name=%s\n", - count_active, (unsigned long)total_memory_mb, lru_model_name.c_str()); + SRV_INF("limits reached (count=%zu, memory=%lu MB + %lu MB new), removing LRU name=%s\n", + count_active, (unsigned long)total_memory_mb, (unsigned long)new_model_memory_mb, lru_model_name.c_str()); unload(lru_model_name); // wait for unload to complete { @@ -533,9 +534,8 @@ void server_models::unload_lru() { return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED; }); } - // Loop continues to check if more unloading is needed } else { - break; // limits satisfied + break; } } } @@ -544,7 +544,26 @@ void server_models::load(const std::string & name) { if (!has_model(name)) { throw std::runtime_error("model name=" + name + " is not found"); } - unload_lru(); + + uint64_t new_model_memory_mb = 0; + if (base_params.models_memory_max > 0) { + std::string model_path; + { + std::lock_guard lk(mutex); + auto & meta = mapping[name].meta; + if (meta.preset.get_option("LLAMA_ARG_MODEL", model_path) && !model_path.empty()) { + uint64_t size_bytes = llama_model_size_from_path(model_path.c_str()); + new_model_memory_mb = size_bytes / (1024 * 1024); + meta.memory_mb = new_model_memory_mb; + if (new_model_memory_mb > 0) { + SRV_INF("model %s estimated size: %lu MB\n", name.c_str(), + (unsigned long)new_model_memory_mb); + } + } + } + } + + unload_lru(new_model_memory_mb); std::lock_guard lk(mutex); @@ -631,7 +650,6 @@ void server_models::load(const std::string & name) { LOG("[%5d] %s", port, buffer); std::string str(buffer); if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) { - // Query memory usage from the child's /props endpoint if (!ready_received) { ready_received = true; try { @@ -642,8 +660,7 @@ void server_models::load(const std::string & name) { json props = json::parse(res->body); if (props.contains("memory_mb")) { uint64_t memory_mb = props["memory_mb"].get(); - SRV_INF("model %s loaded, memory usage: %lu MB\n", name.c_str(), (unsigned long)memory_mb); - // Update memory_mb in meta + SRV_INF("model %s loaded, actual memory: %lu MB\n", name.c_str(), (unsigned long)memory_mb); std::lock_guard lk(this->mutex); if (mapping.find(name) != mapping.end()) { mapping[name].meta.memory_mb = memory_mb; diff --git a/tools/server/server-models.h b/tools/server/server-models.h index c195dbeb26e..29c1c7c6f8d 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -62,7 +62,7 @@ struct server_model_meta { int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading - uint64_t memory_mb = 0; // estimated memory usage in MB + uint64_t memory_mb = 0; // size in MB (estimate before load, actual after load) std::vector args; // args passed to the model instance, will be populated by render_args() int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown @@ -111,7 +111,7 @@ struct server_models { void update_meta(const std::string & name, const server_model_meta & meta); // unload least recently used models if the limit is reached - void unload_lru(); + void unload_lru(uint64_t new_model_memory_mb = 0); // not thread-safe, caller must hold mutex void add_model(server_model_meta && meta); From 2603b4c5bc6b9e5641ca246748c987b974430839 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Tue, 31 Mar 2026 16:18:03 +0200 Subject: [PATCH 03/15] use no_alloc to get memory requirements for model load --- include/llama.h | 6 --- src/llama-model.cpp | 29 ----------- tools/server/server-context.cpp | 1 - tools/server/server-models.cpp | 86 +++++++++++++++++++-------------- tools/server/server-models.h | 2 +- 5 files changed, 51 insertions(+), 73 deletions(-) diff --git a/include/llama.h b/include/llama.h index 03b83f40d56..eb869814097 100644 --- a/include/llama.h +++ b/include/llama.h @@ -595,12 +595,6 @@ extern "C" { // Returns the total size of all the tensors in the model in bytes LLAMA_API uint64_t llama_model_size(const struct llama_model * model); - // Returns the total size of all the tensors in the model in bytes from a model path - // without fully loading the model. Uses llama_model_loader with no_alloc=true. - // Returns 0 if the model cannot be loaded or the path is invalid. - // This function can be used to estimate memory requirements before loading a model. - LLAMA_API uint64_t llama_model_size_from_path(const char * path); - // Get the default chat template. Returns nullptr if not available // If name is NULL, returns the default chat template LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 3a363f55bee..f77b2e9217f 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -9393,35 +9393,6 @@ uint64_t llama_model_size(const llama_model * model) { return model->size(); } -uint64_t llama_model_size_from_path(const char * path) { - if (!path) { - return 0; - } - - try { - std::vector splits; - - llama_model_loader loader( - /* metadata */ nullptr, - /* set_tensor_data */ nullptr, - /* set_tensor_data_ud */ nullptr, - /* fname */ path, - /* splits */ splits, - /* file */ nullptr, - /* use_mmap */ false, - /* use_direct_io */ false, - /* check_tensors */ false, - /* no_alloc */ true, - /* param_overrides_p */ nullptr, - /* param_tensor_buft_overrides_p */ nullptr - ); - - return loader.n_bytes; - } catch (...) { - return 0; - } -} - const char * llama_model_chat_template(const llama_model * model, const char * name) { const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE) : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 7a4ac804125..a5372572f01 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -3615,7 +3615,6 @@ void server_routes::init_routes() { { "total_slots", params.n_parallel }, { "model_alias", meta->model_name }, { "model_path", meta->model_path }, - { "memory_mb", meta->model_size / (1024 * 1024) }, { "modalities", json { {"vision", meta->has_inp_image}, {"audio", meta->has_inp_audio}, diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index fc5cf7c9fd0..42f7a1d2de4 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -540,6 +540,49 @@ void server_models::unload_lru(uint64_t new_model_memory_mb) { } } +static uint64_t get_model_memory_mb(const common_preset& preset) { + common_params params; + preset.apply_to_params(params); + + if(params.model.path.empty()) { + return 0; + } + + struct log_ud_t { + struct { + ggml_log_callback callback; + void * user_data; + } original; + ggml_log_level min_level; + } log_ud; + llama_log_get(&log_ud.original.callback, &log_ud.original.user_data); + log_ud.min_level = GGML_LOG_LEVEL_WARN; + + llama_log_set([](ggml_log_level level, const char * text, void * ud) { + log_ud_t * d = (log_ud_t *) ud; + const ggml_log_level eff = level >= d->min_level ? level : GGML_LOG_LEVEL_DEBUG; + d->original.callback(eff, text, d->original.user_data); + }, &log_ud); + + llama_model_params mparams = common_model_params_to_llama(params); + mparams.no_alloc = true; + mparams.use_mmap = false; + mparams.use_mlock = false; + + llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); + + llama_log_set(log_ud.original.callback, log_ud.original.user_data); + + if (!model) { + return 0; + } + + uint64_t size_bytes = llama_model_size(model); + llama_model_free(model); + + return size_bytes / (1024 * 1024); +} + void server_models::load(const std::string & name) { if (!has_model(name)) { throw std::runtime_error("model name=" + name + " is not found"); @@ -547,19 +590,13 @@ void server_models::load(const std::string & name) { uint64_t new_model_memory_mb = 0; if (base_params.models_memory_max > 0) { - std::string model_path; - { - std::lock_guard lk(mutex); - auto & meta = mapping[name].meta; - if (meta.preset.get_option("LLAMA_ARG_MODEL", model_path) && !model_path.empty()) { - uint64_t size_bytes = llama_model_size_from_path(model_path.c_str()); - new_model_memory_mb = size_bytes / (1024 * 1024); - meta.memory_mb = new_model_memory_mb; - if (new_model_memory_mb > 0) { - SRV_INF("model %s estimated size: %lu MB\n", name.c_str(), - (unsigned long)new_model_memory_mb); - } - } + std::lock_guard lk(mutex); + auto & meta = mapping[name].meta; + new_model_memory_mb = get_model_memory_mb(meta.preset); + meta.memory_mb = new_model_memory_mb; + if (new_model_memory_mb > 0) { + SRV_INF("model %s memory requirements: %lu MB\n", name.c_str(), + (unsigned long)new_model_memory_mb); } } @@ -645,33 +682,10 @@ void server_models::load(const std::string & name) { // also handle status report from child process if (stdout_file) { char buffer[4096]; - bool ready_received = false; while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) { LOG("[%5d] %s", port, buffer); std::string str(buffer); if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) { - if (!ready_received) { - ready_received = true; - try { - httplib::Client cli("http://CHILD_ADDR"); - cli.set_connection_timeout(5, 0); - if (auto res = cli.Get("/props")) { - if (res->status == 200) { - json props = json::parse(res->body); - if (props.contains("memory_mb")) { - uint64_t memory_mb = props["memory_mb"].get(); - SRV_INF("model %s loaded, actual memory: %lu MB\n", name.c_str(), (unsigned long)memory_mb); - std::lock_guard lk(this->mutex); - if (mapping.find(name) != mapping.end()) { - mapping[name].meta.memory_mb = memory_mb; - } - } - } - } - } catch (const std::exception & e) { - SRV_WRN("failed to query memory for model %s: %s\n", name.c_str(), e.what()); - } - } this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0); } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) { this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0); diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 29c1c7c6f8d..2cbdb35b321 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -62,7 +62,7 @@ struct server_model_meta { int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading - uint64_t memory_mb = 0; // size in MB (estimate before load, actual after load) + uint64_t memory_mb = 0; // size in MB std::vector args; // args passed to the model instance, will be populated by render_args() int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown From 9b5af58a9ae8162e57492e2a07a46b22a7cc1bc3 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Tue, 31 Mar 2026 17:37:16 +0200 Subject: [PATCH 04/15] only set model memory_mb if not previously calculated --- tools/server/server-models.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 42f7a1d2de4..1363585dff3 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -592,8 +592,12 @@ void server_models::load(const std::string & name) { if (base_params.models_memory_max > 0) { std::lock_guard lk(mutex); auto & meta = mapping[name].meta; - new_model_memory_mb = get_model_memory_mb(meta.preset); - meta.memory_mb = new_model_memory_mb; + if (meta.memory_mb > 0) { + new_model_memory_mb = meta.memory_mb; + } else { + new_model_memory_mb = get_model_memory_mb(meta.preset); + meta.memory_mb = new_model_memory_mb; + } if (new_model_memory_mb > 0) { SRV_INF("model %s memory requirements: %lu MB\n", name.c_str(), (unsigned long)new_model_memory_mb); From 56122b35ad6679d0efccb929bca5b1ff5420f950 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Thu, 2 Apr 2026 09:24:53 +0200 Subject: [PATCH 05/15] use memory margin instead of total size limit, apply to each device separately --- common/arg.cpp | 8 +- common/common.h | 10 +-- include/llama.h | 6 ++ src/llama-context.cpp | 13 +++ tools/server/server-models.cpp | 139 ++++++++++++++++++++++----------- tools/server/server-models.h | 12 ++- 6 files changed, 132 insertions(+), 56 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 852b69d4252..37e2c8dda1f 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3073,12 +3073,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX")); add_opt(common_arg( - {"--models-memory-max"}, "N", - string_format("for router server, maximum memory usage in MB (default: %d, 0 = unlimited)", params.models_memory_max), + {"--models-memory-margin"}, "N", + string_format("for router server, MB of memory to leave free, per device (default: %d, 0 = unlimited)", params.models_memory_margin), [](common_params & params, int value) { - params.models_memory_max = value; + params.models_memory_margin = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MEMORY_MAX")); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MEMORY_MARGIN")); add_opt(common_arg( {"--models-autoload"}, {"--no-models-autoload"}, diff --git a/common/common.h b/common/common.h index cfc68ce9264..8ac5b9a8bdb 100644 --- a/common/common.h +++ b/common/common.h @@ -607,11 +607,11 @@ struct common_params { std::vector server_tools; // router server configs - std::string models_dir = ""; // directory containing models for the router server - std::string models_preset = ""; // directory containing model presets for the router server - int models_max = 4; // maximum number of models to load simultaneously - int models_memory_max = 0; // maximum memory usage in MB (0 = unlimited, estimated from model files) - bool models_autoload = true; // automatically load models when requested via the router server + std::string models_dir = ""; // directory containing models for the router server + std::string models_preset = ""; // directory containing model presets for the router server + int models_max = 4; // maximum number of models to load simultaneously + int models_memory_margin = 1024; // MB of free memory to preserve per device (0 = disabled) + bool models_autoload = true; // automatically load models when requested via the router server bool log_json = false; diff --git a/include/llama.h b/include/llama.h index eb869814097..72fff81bb25 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1525,6 +1525,12 @@ extern "C" { LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain); LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain); + // Returns the projected memory use (model + context + compute) in bytes + // for the given device within this context. Returns 0 if the device is not used. + LLAMA_API uint64_t llama_context_device_memory( + const struct llama_context * ctx, + ggml_backend_dev_t device); + // // training // diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 8126249e143..79437bbd177 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -3493,6 +3493,19 @@ void llama_perf_context_reset(llama_context * ctx) { ctx->perf_reset(); } +uint64_t llama_context_device_memory(const llama_context * ctx, ggml_backend_dev_t device) { + const bool is_host = ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU; + uint64_t total = 0; + for (const auto & [buft, mb] : ctx->memory_breakdown()) { + const bool matches = is_host ? ggml_backend_buft_is_host(buft) : + ggml_backend_buft_get_device(buft) == device; + if (matches) { + total += mb.total(); + } + } + return total; +} + // // training // diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 1363585dff3..00301be17b6 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -180,6 +180,21 @@ server_models::server_models( LOG_WRN("failed to get server executable path: %s\n", e.what()); LOG_WRN("using original argv[0] as fallback: %s\n", argv[0]); } + + const uint64_t memory_margin = base_params.models_memory_margin * 1024 * 1024; + + if (memory_margin > 0) { + const size_t n_devs = ggml_backend_dev_count(); + for (size_t i = 0; i < n_devs; i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + size_t free, total; + ggml_backend_dev_memory(dev, &free, &total); + if (total > 0) { + memory_per_device[dev] = (free > memory_margin) ? free - memory_margin : 0; + } + } + } + load_models(); } @@ -295,17 +310,17 @@ void server_models::load_models() { // convert presets to server_model_meta and add to mapping for (const auto & preset : final_presets) { server_model_meta meta{ - /* preset */ preset.second, - /* name */ preset.first, - /* aliases */ {}, - /* tags */ {}, - /* port */ 0, - /* status */ SERVER_MODEL_STATUS_UNLOADED, - /* last_used */ 0, - /* memory_mb */ 0, - /* args */ std::vector(), - /* exit_code */ 0, - /* stop_timeout */ DEFAULT_STOP_TIMEOUT, + /* preset */ preset.second, + /* name */ preset.first, + /* aliases */ {}, + /* tags */ {}, + /* port */ 0, + /* status */ SERVER_MODEL_STATUS_UNLOADED, + /* last_used */ 0, + /* memory_per_device */ {}, + /* args */ std::vector(), + /* exit_code */ 0, + /* stop_timeout */ DEFAULT_STOP_TIMEOUT, }; add_model(std::move(meta)); } @@ -496,36 +511,63 @@ std::vector server_models::get_all_meta() { return result; } -void server_models::unload_lru(uint64_t new_model_memory_mb) { - if (base_params.models_max <= 0 && base_params.models_memory_max <= 0) { +uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_memory_per_device) const { + model_memory_map total_memory_per_device; + for (const auto & m : mapping) { + if (m.second.meta.is_running()) { + for (const auto& [key, value] : m.second.meta.memory_per_device) { + total_memory_per_device[key] += value; + } + } + } + + auto get = [](const model_memory_map & m, ggml_backend_dev_t k) { + auto it = m.find(k); + return it != m.end() ? it->second : 0; + }; + + uint64_t memory_exceeded = 0; + + for (const auto& [key, limit] : memory_per_device) { + if (get(new_model_memory_per_device, key) + get(total_memory_per_device, key) > limit) { + memory_exceeded++; + } + } + + return memory_exceeded; +} + +void server_models::unload_lru(const model_memory_map& new_model_memory_per_device) { + const bool check_memory = base_params.models_memory_margin > 0 && !memory_per_device.empty(); + + if (base_params.models_max <= 0 && !check_memory) { return; // no limit } + while (true) { std::string lru_model_name = ""; int64_t lru_last_used = ggml_time_ms(); size_t count_active = 0; - uint64_t total_memory_mb = 0; + uint64_t memory_exceeded = 0; { std::unique_lock lk(mutex); for (const auto & m : mapping) { if (m.second.meta.is_running()) { count_active++; - total_memory_mb += m.second.meta.memory_mb; if (m.second.meta.last_used < lru_last_used) { lru_model_name = m.first; lru_last_used = m.second.meta.last_used; } } } + memory_exceeded = get_memory_exceeded(new_model_memory_per_device); } bool count_exceeded = base_params.models_max > 0 && (count_active + 1) >= (size_t)base_params.models_max; - uint64_t projected_memory = total_memory_mb + new_model_memory_mb; - bool memory_exceeded = base_params.models_memory_max > 0 && - projected_memory >= (uint64_t)base_params.models_memory_max; - if (!lru_model_name.empty() && (count_exceeded || memory_exceeded)) { - SRV_INF("limits reached (count=%zu, memory=%lu MB + %lu MB new), removing LRU name=%s\n", - count_active, (unsigned long)total_memory_mb, (unsigned long)new_model_memory_mb, lru_model_name.c_str()); + + if (!lru_model_name.empty() && (count_exceeded || memory_exceeded > 0)) { + SRV_INF("limits reached (count=%zu, memory margin exceeded on %zu device(s)), removing LRU name=%s\n", + count_active, memory_exceeded, lru_model_name.c_str()); unload(lru_model_name); // wait for unload to complete { @@ -540,12 +582,12 @@ void server_models::unload_lru(uint64_t new_model_memory_mb) { } } -static uint64_t get_model_memory_mb(const common_preset& preset) { +static model_memory_map get_model_memory_per_device(const common_preset& preset) { common_params params; preset.apply_to_params(params); if(params.model.path.empty()) { - return 0; + return {}; } struct log_ud_t { @@ -569,18 +611,32 @@ static uint64_t get_model_memory_mb(const common_preset& preset) { mparams.use_mmap = false; mparams.use_mlock = false; - llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); + llama_model_ptr model{llama_model_load_from_file(params.model.path.c_str(), mparams)}; + + if (!model) { + llama_log_set(log_ud.original.callback, log_ud.original.user_data); + return {}; + } + llama_context_params cparams = common_context_params_to_llama(params); + llama_context_ptr ctx{llama_init_from_model(model.get(), cparams)}; llama_log_set(log_ud.original.callback, log_ud.original.user_data); - if (!model) { - return 0; + if (!ctx) { + return {}; } - uint64_t size_bytes = llama_model_size(model); - llama_model_free(model); + model_memory_map result; + const size_t n_devs = ggml_backend_dev_count(); + for (size_t i = 0; i < n_devs; i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + uint64_t bytes = llama_context_device_memory(ctx.get(), dev); + if (bytes > 0) { + result[dev] = bytes; + } + } - return size_bytes / (1024 * 1024); + return result; } void server_models::load(const std::string & name) { @@ -588,23 +644,18 @@ void server_models::load(const std::string & name) { throw std::runtime_error("model name=" + name + " is not found"); } - uint64_t new_model_memory_mb = 0; - if (base_params.models_memory_max > 0) { + model_memory_map new_model_memory_per_device; + if (base_params.models_memory_margin > 0) { std::lock_guard lk(mutex); auto & meta = mapping[name].meta; - if (meta.memory_mb > 0) { - new_model_memory_mb = meta.memory_mb; - } else { - new_model_memory_mb = get_model_memory_mb(meta.preset); - meta.memory_mb = new_model_memory_mb; - } - if (new_model_memory_mb > 0) { - SRV_INF("model %s memory requirements: %lu MB\n", name.c_str(), - (unsigned long)new_model_memory_mb); + if (meta.memory_per_device.empty()) { + meta.memory_per_device = get_model_memory_per_device(meta.preset); } + + new_model_memory_per_device = meta.memory_per_device; } - unload_lru(new_model_memory_mb); + unload_lru(new_model_memory_per_device); std::lock_guard lk(mutex); @@ -618,17 +669,15 @@ void server_models::load(const std::string & name) { // exceeding models_max. Without this, the window between unload_lru() // releasing its lock and this lock_guard acquiring allows multiple // threads to each observe capacity and all proceed to load. - if (base_params.models_max > 0 || base_params.models_memory_max > 0) { + if (base_params.models_max > 0 || base_params.models_memory_margin > 0) { size_t count_active = 0; - uint64_t total_memory_mb = 0; for (const auto & m : mapping) { if (m.second.meta.is_running()) { count_active++; - total_memory_mb += m.second.meta.memory_mb; } } bool count_exceeded = base_params.models_max > 0 && count_active >= (size_t)base_params.models_max; - bool memory_exceeded = base_params.models_memory_max > 0 && total_memory_mb >= (uint64_t)base_params.models_memory_max; + bool memory_exceeded = get_memory_exceeded(new_model_memory_per_device) > 0; if (count_exceeded || memory_exceeded) { throw std::runtime_error("model limit reached, try again later"); } diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 2cbdb35b321..38d6929a881 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -54,6 +54,8 @@ static std::string server_model_status_to_string(server_model_status status) { } } +using model_memory_map = std::map; + struct server_model_meta { common_preset preset; std::string name; @@ -62,7 +64,7 @@ struct server_model_meta { int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading - uint64_t memory_mb = 0; // size in MB + model_memory_map memory_per_device; // projected bytes per device std::vector args; // args passed to the model instance, will be populated by render_args() int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown @@ -108,14 +110,20 @@ struct server_models { std::vector base_env; common_preset base_preset; // base preset from llama-server CLI args + // available memory per device + std::map memory_per_device; + void update_meta(const std::string & name, const server_model_meta & meta); // unload least recently used models if the limit is reached - void unload_lru(uint64_t new_model_memory_mb = 0); + void unload_lru(const model_memory_map& new_model_memory_per_device); // not thread-safe, caller must hold mutex void add_model(server_model_meta && meta); + // not thread-safe, caller must hold mutex + uint64_t get_memory_exceeded(const model_memory_map& new_model_memory_per_device) const; + public: server_models(const common_params & params, int argc, char ** argv); From 51538c1f7864015601ac470127be06f72c9a6d30 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Thu, 2 Apr 2026 10:07:04 +0200 Subject: [PATCH 06/15] add server memory debug logging --- tools/server/server-models.cpp | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 00301be17b6..37cd81f2ef0 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -181,7 +181,7 @@ server_models::server_models( LOG_WRN("using original argv[0] as fallback: %s\n", argv[0]); } - const uint64_t memory_margin = base_params.models_memory_margin * 1024 * 1024; + const uint64_t memory_margin = (uint64_t)base_params.models_memory_margin * 1024 * 1024; if (memory_margin > 0) { const size_t n_devs = ggml_backend_dev_count(); @@ -190,7 +190,11 @@ server_models::server_models( size_t free, total; ggml_backend_dev_memory(dev, &free, &total); if (total > 0) { - memory_per_device[dev] = (free > memory_margin) ? free - memory_margin : 0; + const uint64_t available = (free > memory_margin) ? free - memory_margin : 0; + memory_per_device[dev] = available; + SRV_DBG("device %s: available memory after margin=%lu MB\n", + ggml_backend_dev_name(dev), + (unsigned long)(available / (1024 * 1024))); } } } @@ -529,7 +533,15 @@ uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_me uint64_t memory_exceeded = 0; for (const auto& [key, limit] : memory_per_device) { - if (get(new_model_memory_per_device, key) + get(total_memory_per_device, key) > limit) { + const uint64_t total_memory = get(total_memory_per_device, key); + const uint64_t new_memory = get(new_model_memory_per_device, key); + SRV_DBG("device %s: total=%lu MB, new=%lu MB, limit=%lu MB\n", + ggml_backend_dev_name(key), + (unsigned long)(total_memory / (1024 * 1024)), + (unsigned long)(new_memory / (1024 * 1024)), + (unsigned long)(limit / (1024 * 1024))); + + if (total_memory + new_memory > limit) { memory_exceeded++; } } From ba2521c6a06c11323b0f22e9ae3d3e4d56e4aa77 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Thu, 2 Apr 2026 11:39:07 +0200 Subject: [PATCH 07/15] move llama_context_device_memory function to llama-ext.h --- include/llama.h | 6 ------ src/llama-ext.h | 6 ++++++ tools/server/server-models.cpp | 2 ++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/llama.h b/include/llama.h index 72fff81bb25..eb869814097 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1525,12 +1525,6 @@ extern "C" { LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain); LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain); - // Returns the projected memory use (model + context + compute) in bytes - // for the given device within this context. Returns 0 if the device is not used. - LLAMA_API uint64_t llama_context_device_memory( - const struct llama_context * ctx, - ggml_backend_dev_t device); - // // training // diff --git a/src/llama-ext.h b/src/llama-ext.h index 8ce29d217cb..ce87fa32a4a 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -88,3 +88,9 @@ LLAMA_API int32_t llama_model_n_devices(const struct llama_model * model); LLAMA_API ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i); LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx); + +// Returns the projected memory use (model + context + compute) in bytes +// for the given device within this context. Returns 0 if the device is not used. +LLAMA_API uint64_t llama_context_device_memory( + const struct llama_context * ctx, + ggml_backend_dev_t device); diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 37cd81f2ef0..ceacf6d7ec9 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -8,6 +8,8 @@ #include // TODO: remove this once we use HTTP client from download.h #include +#include "../../src/llama-ext.h" + #include #include #include From 7500063065f313c88c6c36efdb1e9e2bfe2f397b Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Thu, 2 Apr 2026 11:39:36 +0200 Subject: [PATCH 08/15] fix model count exceeded check --- tools/server/server-models.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index ceacf6d7ec9..48aef5a6a55 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -577,7 +577,7 @@ void server_models::unload_lru(const model_memory_map& new_model_memory_per_devi memory_exceeded = get_memory_exceeded(new_model_memory_per_device); } bool count_exceeded = base_params.models_max > 0 && - (count_active + 1) >= (size_t)base_params.models_max; + (count_active + 1) > (size_t)base_params.models_max; if (!lru_model_name.empty() && (count_exceeded || memory_exceeded > 0)) { SRV_INF("limits reached (count=%zu, memory margin exceeded on %zu device(s)), removing LRU name=%s\n", From 173da43c957e15beb8556b35dacd92a9474af783 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Tue, 7 Apr 2026 13:28:49 +0200 Subject: [PATCH 09/15] improve memory_per_device map naming --- tools/server/server-models.cpp | 14 +++++++------- tools/server/server-models.h | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 48aef5a6a55..22584db1c68 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -193,7 +193,7 @@ server_models::server_models( ggml_backend_dev_memory(dev, &free, &total); if (total > 0) { const uint64_t available = (free > memory_margin) ? free - memory_margin : 0; - memory_per_device[dev] = available; + available_memory_per_device[dev] = available; SRV_DBG("device %s: available memory after margin=%lu MB\n", ggml_backend_dev_name(dev), (unsigned long)(available / (1024 * 1024))); @@ -521,7 +521,7 @@ uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_me model_memory_map total_memory_per_device; for (const auto & m : mapping) { if (m.second.meta.is_running()) { - for (const auto& [key, value] : m.second.meta.memory_per_device) { + for (const auto& [key, value] : m.second.meta.memory_usage_per_device) { total_memory_per_device[key] += value; } } @@ -534,7 +534,7 @@ uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_me uint64_t memory_exceeded = 0; - for (const auto& [key, limit] : memory_per_device) { + for (const auto& [key, limit] : available_memory_per_device) { const uint64_t total_memory = get(total_memory_per_device, key); const uint64_t new_memory = get(new_model_memory_per_device, key); SRV_DBG("device %s: total=%lu MB, new=%lu MB, limit=%lu MB\n", @@ -552,7 +552,7 @@ uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_me } void server_models::unload_lru(const model_memory_map& new_model_memory_per_device) { - const bool check_memory = base_params.models_memory_margin > 0 && !memory_per_device.empty(); + const bool check_memory = base_params.models_memory_margin > 0 && !available_memory_per_device.empty(); if (base_params.models_max <= 0 && !check_memory) { return; // no limit @@ -662,11 +662,11 @@ void server_models::load(const std::string & name) { if (base_params.models_memory_margin > 0) { std::lock_guard lk(mutex); auto & meta = mapping[name].meta; - if (meta.memory_per_device.empty()) { - meta.memory_per_device = get_model_memory_per_device(meta.preset); + if (meta.memory_usage_per_device.empty()) { + meta.memory_usage_per_device = get_model_memory_per_device(meta.preset); } - new_model_memory_per_device = meta.memory_per_device; + new_model_memory_per_device = meta.memory_usage_per_device; } unload_lru(new_model_memory_per_device); diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 38d6929a881..0f2f8f9a192 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -64,7 +64,7 @@ struct server_model_meta { int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading - model_memory_map memory_per_device; // projected bytes per device + model_memory_map memory_usage_per_device; // bytes used per device std::vector args; // args passed to the model instance, will be populated by render_args() int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown @@ -111,7 +111,7 @@ struct server_models { common_preset base_preset; // base preset from llama-server CLI args // available memory per device - std::map memory_per_device; + std::map available_memory_per_device; void update_meta(const std::string & name, const server_model_meta & meta); From 69e3086190009069cf85e6ade57c427c948b10bf Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Tue, 7 Apr 2026 13:35:02 +0200 Subject: [PATCH 10/15] improve variable naming, fix style --- common/arg.cpp | 2 +- tools/server/server-models.cpp | 24 ++++++++++++------------ tools/server/server-models.h | 6 +++--- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 37e2c8dda1f..7ba0f2fc256 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3074,7 +3074,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX")); add_opt(common_arg( {"--models-memory-margin"}, "N", - string_format("for router server, MB of memory to leave free, per device (default: %d, 0 = unlimited)", params.models_memory_margin), + string_format("for router server, MiB of memory to leave free, per device (default: %d, 0 = unlimited)", params.models_memory_margin), [](common_params & params, int value) { params.models_memory_margin = value; } diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 22584db1c68..544798c55eb 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -194,7 +194,7 @@ server_models::server_models( if (total > 0) { const uint64_t available = (free > memory_margin) ? free - memory_margin : 0; available_memory_per_device[dev] = available; - SRV_DBG("device %s: available memory after margin=%lu MB\n", + SRV_DBG("device %s: available memory after margin=%lu MiB\n", ggml_backend_dev_name(dev), (unsigned long)(available / (1024 * 1024))); } @@ -517,11 +517,11 @@ std::vector server_models::get_all_meta() { return result; } -uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_memory_per_device) const { +uint64_t server_models::get_memory_exceeded(const model_memory_map & new_model_memory_per_device) const { model_memory_map total_memory_per_device; for (const auto & m : mapping) { if (m.second.meta.is_running()) { - for (const auto& [key, value] : m.second.meta.memory_usage_per_device) { + for (const auto & [key, value] : m.second.meta.memory_usage_per_device) { total_memory_per_device[key] += value; } } @@ -532,9 +532,9 @@ uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_me return it != m.end() ? it->second : 0; }; - uint64_t memory_exceeded = 0; + size_t count_memory_exceeded = 0; - for (const auto& [key, limit] : available_memory_per_device) { + for (const auto & [key, limit] : available_memory_per_device) { const uint64_t total_memory = get(total_memory_per_device, key); const uint64_t new_memory = get(new_model_memory_per_device, key); SRV_DBG("device %s: total=%lu MB, new=%lu MB, limit=%lu MB\n", @@ -544,14 +544,14 @@ uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_me (unsigned long)(limit / (1024 * 1024))); if (total_memory + new_memory > limit) { - memory_exceeded++; + count_memory_exceeded++; } } - return memory_exceeded; + return count_memory_exceeded; } -void server_models::unload_lru(const model_memory_map& new_model_memory_per_device) { +void server_models::unload_lru(const model_memory_map & new_model_memory_per_device) { const bool check_memory = base_params.models_memory_margin > 0 && !available_memory_per_device.empty(); if (base_params.models_max <= 0 && !check_memory) { @@ -562,7 +562,7 @@ void server_models::unload_lru(const model_memory_map& new_model_memory_per_devi std::string lru_model_name = ""; int64_t lru_last_used = ggml_time_ms(); size_t count_active = 0; - uint64_t memory_exceeded = 0; + size_t count_memory_exceeded = 0; { std::unique_lock lk(mutex); for (const auto & m : mapping) { @@ -574,14 +574,14 @@ void server_models::unload_lru(const model_memory_map& new_model_memory_per_devi } } } - memory_exceeded = get_memory_exceeded(new_model_memory_per_device); + count_memory_exceeded = get_memory_exceeded(new_model_memory_per_device); } bool count_exceeded = base_params.models_max > 0 && (count_active + 1) > (size_t)base_params.models_max; - if (!lru_model_name.empty() && (count_exceeded || memory_exceeded > 0)) { + if (!lru_model_name.empty() && (count_exceeded || count_memory_exceeded > 0)) { SRV_INF("limits reached (count=%zu, memory margin exceeded on %zu device(s)), removing LRU name=%s\n", - count_active, memory_exceeded, lru_model_name.c_str()); + count_active, count_memory_exceeded, lru_model_name.c_str()); unload(lru_model_name); // wait for unload to complete { diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 0f2f8f9a192..f86cc0b2cc4 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -111,18 +111,18 @@ struct server_models { common_preset base_preset; // base preset from llama-server CLI args // available memory per device - std::map available_memory_per_device; + model_memory_map available_memory_per_device; void update_meta(const std::string & name, const server_model_meta & meta); // unload least recently used models if the limit is reached - void unload_lru(const model_memory_map& new_model_memory_per_device); + void unload_lru(const model_memory_map & new_model_memory_per_device); // not thread-safe, caller must hold mutex void add_model(server_model_meta && meta); // not thread-safe, caller must hold mutex - uint64_t get_memory_exceeded(const model_memory_map& new_model_memory_per_device) const; + uint64_t get_memory_exceeded(const model_memory_map & new_model_memory_per_device) const; public: server_models(const common_params & params, int argc, char ** argv); From eb2cf73ff9c54b693487134b80ee24ed15d0a975 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Mon, 13 Apr 2026 10:14:53 +0200 Subject: [PATCH 11/15] also strip models memory margin from child processes --- tools/server/server-models.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 544798c55eb..ef6acb57de8 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -96,6 +96,7 @@ static void unset_reserved_args(common_preset & preset, bool unset_model_args) { preset.unset_option("LLAMA_API_KEY"); preset.unset_option("LLAMA_ARG_MODELS_DIR"); preset.unset_option("LLAMA_ARG_MODELS_MAX"); + preset.unset_option("LLAMA_ARG_MODELS_MEMORY_MARGIN"); preset.unset_option("LLAMA_ARG_MODELS_PRESET"); preset.unset_option("LLAMA_ARG_MODELS_AUTOLOAD"); if (unset_model_args) { From 1a8aec0afd8209e49bc47e99c791d706dd84ef96 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 16 Apr 2026 14:32:47 +0300 Subject: [PATCH 12/15] cont : clean-up --- common/common.h | 2 +- tools/server/server-models.cpp | 129 ++++++++++++++++++--------------- tools/server/server-models.h | 12 +-- 3 files changed, 80 insertions(+), 63 deletions(-) diff --git a/common/common.h b/common/common.h index 8ac5b9a8bdb..2996d354049 100644 --- a/common/common.h +++ b/common/common.h @@ -610,7 +610,7 @@ struct common_params { std::string models_dir = ""; // directory containing models for the router server std::string models_preset = ""; // directory containing model presets for the router server int models_max = 4; // maximum number of models to load simultaneously - int models_memory_margin = 1024; // MB of free memory to preserve per device (0 = disabled) + int models_memory_margin = 1024; // MiB of free memory to preserve per device (0 = disabled) bool models_autoload = true; // automatically load models when requested via the router server bool log_json = false; diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index ef6acb57de8..96a291854d6 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -180,11 +180,11 @@ server_models::server_models( bin_path = get_server_exec_path().string(); } catch (const std::exception & e) { bin_path = argv[0]; - LOG_WRN("failed to get server executable path: %s\n", e.what()); - LOG_WRN("using original argv[0] as fallback: %s\n", argv[0]); + SRV_WRN("failed to get server executable path: %s\n", e.what()); + SRV_WRN("using original argv[0] as fallback: %s\n", argv[0]); } - const uint64_t memory_margin = (uint64_t)base_params.models_memory_margin * 1024 * 1024; + const size_t memory_margin = (size_t) base_params.models_memory_margin * 1024 * 1024; if (memory_margin > 0) { const size_t n_devs = ggml_backend_dev_count(); @@ -193,11 +193,10 @@ server_models::server_models( size_t free, total; ggml_backend_dev_memory(dev, &free, &total); if (total > 0) { - const uint64_t available = (free > memory_margin) ? free - memory_margin : 0; - available_memory_per_device[dev] = available; - SRV_DBG("device %s: available memory after margin=%lu MiB\n", - ggml_backend_dev_name(dev), - (unsigned long)(available / (1024 * 1024))); + const size_t available = (free > memory_margin) ? free - memory_margin : 0; + dmm_available[dev] = available; + SRV_DBG("device %s: available memory after margin=%zu MiB\n", + ggml_backend_dev_name(dev), available / (1024 * 1024)); } } } @@ -518,52 +517,57 @@ std::vector server_models::get_all_meta() { return result; } -uint64_t server_models::get_memory_exceeded(const model_memory_map & new_model_memory_per_device) const { - model_memory_map total_memory_per_device; +int server_models::can_fit(const device_memory_map & dmm_req) const { + device_memory_map dmm_total; for (const auto & m : mapping) { if (m.second.meta.is_running()) { - for (const auto & [key, value] : m.second.meta.memory_usage_per_device) { - total_memory_per_device[key] += value; + for (const auto & [dev, mem] : m.second.meta.dmm_req) { + dmm_total[dev] += mem; } } } - auto get = [](const model_memory_map & m, ggml_backend_dev_t k) { - auto it = m.find(k); - return it != m.end() ? it->second : 0; + auto get = [](const device_memory_map & dmm, ggml_backend_dev_t dev) { + auto it = dmm.find(dev); + return it != dmm.end() ? it->second : 0; }; - size_t count_memory_exceeded = 0; + int res = 0; - for (const auto & [key, limit] : available_memory_per_device) { - const uint64_t total_memory = get(total_memory_per_device, key); - const uint64_t new_memory = get(new_model_memory_per_device, key); - SRV_DBG("device %s: total=%lu MB, new=%lu MB, limit=%lu MB\n", - ggml_backend_dev_name(key), - (unsigned long)(total_memory / (1024 * 1024)), - (unsigned long)(new_memory / (1024 * 1024)), - (unsigned long)(limit / (1024 * 1024))); + for (const auto & [dev, limit] : dmm_available) { + const size_t mem_total = get(dmm_total, dev); + const size_t mem_new = get(dmm_req, dev); - if (total_memory + new_memory > limit) { - count_memory_exceeded++; + SRV_DBG("device %s: total=%zu MiB, new=%zu MiB, limit=%zu MiB\n", + ggml_backend_dev_name(dev), + mem_total / (1024 * 1024), mem_new / (1024 * 1024), limit / (1024 * 1024)); + + if (mem_total + mem_new > limit) { + res++; } } - return count_memory_exceeded; + return res; } -void server_models::unload_lru(const model_memory_map & new_model_memory_per_device) { - const bool check_memory = base_params.models_memory_margin > 0 && !available_memory_per_device.empty(); +void server_models::unload_lru(const device_memory_map & dmm_req) { + const bool check_active = base_params.models_max > 0; + const bool check_memory = base_params.models_memory_margin > 0; - if (base_params.models_max <= 0 && !check_memory) { + if (!check_active && !check_memory) { return; // no limit } + if (check_memory) { + GGML_ASSERT(!dmm_available.empty()); + } + while (true) { - std::string lru_model_name = ""; + std::string lru_model_name; int64_t lru_last_used = ggml_time_ms(); - size_t count_active = 0; - size_t count_memory_exceeded = 0; + + int count_active = 0; + int count_exceed = 0; { std::unique_lock lk(mutex); for (const auto & m : mapping) { @@ -575,14 +579,17 @@ void server_models::unload_lru(const model_memory_map & new_model_memory_per_dev } } } - count_memory_exceeded = get_memory_exceeded(new_model_memory_per_device); + if (check_memory) { + count_exceed = can_fit(dmm_req); + } } - bool count_exceeded = base_params.models_max > 0 && - (count_active + 1) > (size_t)base_params.models_max; - if (!lru_model_name.empty() && (count_exceeded || count_memory_exceeded > 0)) { - SRV_INF("limits reached (count=%zu, memory margin exceeded on %zu device(s)), removing LRU name=%s\n", - count_active, count_memory_exceeded, lru_model_name.c_str()); + const bool active_exceeded = check_active && count_active >= base_params.models_max; + const bool memory_exceeded = check_memory && count_exceed > 0; + + if (!lru_model_name.empty() && (active_exceeded || memory_exceeded)) { + SRV_INF("limits reached (count=%d, memory margin exceeded on %d device(s)), removing LRU name=%s\n", + count_active, count_exceed, lru_model_name.c_str()); unload(lru_model_name); // wait for unload to complete { @@ -597,11 +604,11 @@ void server_models::unload_lru(const model_memory_map & new_model_memory_per_dev } } -static model_memory_map get_model_memory_per_device(const common_preset& preset) { +static device_memory_map get_model_memory_per_device(const common_preset & preset) { common_params params; preset.apply_to_params(params); - if(params.model.path.empty()) { + if (params.model.path.empty()) { return {}; } @@ -641,7 +648,7 @@ static model_memory_map get_model_memory_per_device(const common_preset& preset) return {}; } - model_memory_map result; + device_memory_map result; const size_t n_devs = ggml_backend_dev_count(); for (size_t i = 0; i < n_devs; i++) { ggml_backend_dev_t dev = ggml_backend_dev_get(i); @@ -659,18 +666,19 @@ void server_models::load(const std::string & name) { throw std::runtime_error("model name=" + name + " is not found"); } - model_memory_map new_model_memory_per_device; + device_memory_map dmm_req; if (base_params.models_memory_margin > 0) { + // determine the required memory by the model upon its first load std::lock_guard lk(mutex); auto & meta = mapping[name].meta; - if (meta.memory_usage_per_device.empty()) { - meta.memory_usage_per_device = get_model_memory_per_device(meta.preset); + if (meta.dmm_req.empty()) { + meta.dmm_req = get_model_memory_per_device(meta.preset); } - new_model_memory_per_device = meta.memory_usage_per_device; + dmm_req = meta.dmm_req; } - unload_lru(new_model_memory_per_device); + unload_lru(dmm_req); std::lock_guard lk(mutex); @@ -684,17 +692,24 @@ void server_models::load(const std::string & name) { // exceeding models_max. Without this, the window between unload_lru() // releasing its lock and this lock_guard acquiring allows multiple // threads to each observe capacity and all proceed to load. - if (base_params.models_max > 0 || base_params.models_memory_margin > 0) { - size_t count_active = 0; - for (const auto & m : mapping) { - if (m.second.meta.is_running()) { - count_active++; + { + const bool check_active = base_params.models_max > 0; + const bool check_memory = base_params.models_memory_margin > 0; + + if (check_active || check_memory) { + int count_active = 0; + for (const auto & m : mapping) { + if (m.second.meta.is_running()) { + count_active++; + } + } + + const bool active_exceeded = check_active && count_active >= base_params.models_max; + const bool memory_exceeded = check_memory && can_fit(dmm_req) > 0; + + if (active_exceeded || memory_exceeded) { + throw std::runtime_error("model limit reached, try again later"); } - } - bool count_exceeded = base_params.models_max > 0 && count_active >= (size_t)base_params.models_max; - bool memory_exceeded = get_memory_exceeded(new_model_memory_per_device) > 0; - if (count_exceeded || memory_exceeded) { - throw std::runtime_error("model limit reached, try again later"); } } diff --git a/tools/server/server-models.h b/tools/server/server-models.h index f86cc0b2cc4..567e716bce0 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -54,7 +54,7 @@ static std::string server_model_status_to_string(server_model_status status) { } } -using model_memory_map = std::map; +using device_memory_map = std::map; struct server_model_meta { common_preset preset; @@ -64,7 +64,7 @@ struct server_model_meta { int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading - model_memory_map memory_usage_per_device; // bytes used per device + device_memory_map dmm_req; // bytes required per device std::vector args; // args passed to the model instance, will be populated by render_args() int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown @@ -111,18 +111,20 @@ struct server_models { common_preset base_preset; // base preset from llama-server CLI args // available memory per device - model_memory_map available_memory_per_device; + device_memory_map dmm_available; void update_meta(const std::string & name, const server_model_meta & meta); // unload least recently used models if the limit is reached - void unload_lru(const model_memory_map & new_model_memory_per_device); + void unload_lru(const device_memory_map & dmm_req); // not thread-safe, caller must hold mutex void add_model(server_model_meta && meta); + // return number of devices where the memory limit would be exceeded + // return 0 if the new model would fit on all devices // not thread-safe, caller must hold mutex - uint64_t get_memory_exceeded(const model_memory_map & new_model_memory_per_device) const; + int can_fit(const device_memory_map & dmm_req) const; public: server_models(const common_params & params, int argc, char ** argv); From b1623a614c682bad576ab7dc19cf613b2af94e6d Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Mon, 20 Apr 2026 14:48:55 +0200 Subject: [PATCH 13/15] handle models that need to be downloaded before estimation --- common/arg.cpp | 7 ++ common/common.h | 1 + tools/server/server-models.cpp | 122 ++++++++++++++++++++++++++++++++- tools/server/server-models.h | 25 +++++-- tools/server/server.cpp | 5 ++ 5 files changed, 151 insertions(+), 9 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 7ba0f2fc256..710955a86fb 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3308,6 +3308,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.offline = true; } ).set_env("LLAMA_OFFLINE")); + add_opt(common_arg( + {"--download-only"}, + "Download the model file(s) and exit", + [](common_params & params) { + params.download_only = true; + } + )); add_opt(common_arg( {"-lv", "--verbosity", "--log-verbosity"}, "N", string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n" diff --git a/common/common.h b/common/common.h index 2996d354049..066e5766502 100644 --- a/common/common.h +++ b/common/common.h @@ -482,6 +482,7 @@ struct common_params { int32_t control_vector_layer_start = -1; // layer range for control vector int32_t control_vector_layer_end = -1; // layer range for control vector bool offline = false; + bool download_only = false; // only download the model if required, don't start the server int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 96a291854d6..9f34a8cbc18 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -604,12 +604,33 @@ void server_models::unload_lru(const device_memory_map & dmm_req) { } } +static std::string resolve_model_path(const common_preset & preset) { + common_params params; + preset.apply_to_params(params); + + if (!params.model.path.empty()) { + return params.model.path; + } + + if (!params.model.hf_repo.empty() || !params.model.url.empty()) { + common_download_opts opts; + opts.offline = true; + auto result = common_download_model(params.model, opts); + return result.model_path; + } + + return ""; +} + static device_memory_map get_model_memory_per_device(const common_preset & preset) { common_params params; preset.apply_to_params(params); - if (params.model.path.empty()) { - return {}; + if(params.model.path.empty()) { + params.model.path = resolve_model_path(preset); + if(params.model.path.empty()) { + return {}; + } } struct log_ud_t { @@ -661,11 +682,98 @@ static device_memory_map get_model_memory_per_device(const common_preset & prese return result; } +bool server_models::download_model(const std::string & name) { + std::vector child_args; + std::vector child_env; + { + std::lock_guard lk(mutex); + auto & meta = mapping[name].meta; + child_args = meta.preset.to_args(bin_path); + child_env = base_env; + } + child_args.push_back("--download-only"); + + SRV_INF("downloading model name=%s\n", name.c_str()); + + std::vector argv = to_char_ptr_array(child_args); + std::vector envp = to_char_ptr_array(child_env); + + subprocess_s proc; + int options = subprocess_option_no_window | subprocess_option_combined_stdout_stderr; + if (subprocess_create_ex(argv.data(), options, envp.data(), &proc) != 0) { + SRV_ERR("failed to spawn download process for model name=%s\n", name.c_str()); + return false; + } + + FILE * out = subprocess_stdout(&proc); + if (out) { + char buffer[4096]; + while (fgets(buffer, sizeof(buffer), out) != nullptr) { + LOG("[dl:%s] %s", name.c_str(), buffer); + } + } + + int exit_code = 0; + subprocess_join(&proc, &exit_code); + subprocess_destroy(&proc); + + if (exit_code != 0) { + SRV_ERR("download process for model name=%s exited with code %d\n", name.c_str(), exit_code); + return false; + } + + SRV_INF("download complete for model name=%s\n", name.c_str()); + return true; +} + void server_models::load(const std::string & name) { if (!has_model(name)) { throw std::runtime_error("model name=" + name + " is not found"); } + { + common_preset preset_copy; + { + std::lock_guard lk(mutex); + preset_copy = mapping[name].meta.preset; + } + if (resolve_model_path(preset_copy).empty()) { + { + std::lock_guard lk(mutex); + auto & meta = mapping[name].meta; + if (meta.status != SERVER_MODEL_STATUS_UNLOADED) { + return; + } + meta.status = SERVER_MODEL_STATUS_DOWNLOADING; + cv.notify_all(); + } + std::thread([this, name]() { + if (!download_model(name)) { + update_status(name, SERVER_MODEL_STATUS_UNLOADED, 1); + return; + } + device_memory_map mem; + if (base_params.models_memory_margin > 0) { + std::lock_guard lk(mutex); + auto & meta = mapping[name].meta; + meta.dmm_req = get_model_memory_per_device(meta.preset); + if (meta.dmm_req.empty()) { + SRV_WRN("failed to estimate memory for model %s, memory limits will not apply\n", name.c_str()); + } + mem = meta.dmm_req; + } + update_status(name, SERVER_MODEL_STATUS_UNLOADED, 0); + try { + _load(name, mem); + } catch (const std::exception & e) { + SRV_ERR("failed to load model %s after download: %s\n", name.c_str(), e.what()); + update_status(name, SERVER_MODEL_STATUS_UNLOADED, 1); + } + }).detach(); + return; + } + } + device_memory_map dmm_req; if (base_params.models_memory_margin > 0) { // determine the required memory by the model upon its first load @@ -673,11 +781,18 @@ void server_models::load(const std::string & name) { auto & meta = mapping[name].meta; if (meta.dmm_req.empty()) { meta.dmm_req = get_model_memory_per_device(meta.preset); + if (meta.dmm_req.empty()) { + SRV_WRN("failed to estimate memory for model %s, memory limits will not apply\n", name.c_str()); + } } dmm_req = meta.dmm_req; } + _load(name, dmm_req); +} + +void server_models::_load(const std::string & name, const device_memory_map & dmm_req) { unload_lru(dmm_req); std::lock_guard lk(mutex); @@ -913,7 +1028,8 @@ void server_models::wait_until_loading_finished(const std::string & name) { cv.wait(lk, [this, &name]() { auto it = mapping.find(name); if (it != mapping.end()) { - return it->second.meta.status != SERVER_MODEL_STATUS_LOADING; + return it->second.meta.status != SERVER_MODEL_STATUS_LOADING && + it->second.meta.status != SERVER_MODEL_STATUS_DOWNLOADING; } return false; }); diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 567e716bce0..aa6abf7cac7 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -14,6 +14,9 @@ /** * state diagram: * + * + * ┌► DOWNLOADING ─┐ + * │ ▼ * UNLOADED ──► LOADING ──► LOADED ◄──── SLEEPING * ▲ │ │ ▲ * └───failed───┘ │ │ @@ -21,8 +24,8 @@ * └────────unloaded─────────┘ */ enum server_model_status { - // TODO: also add downloading state when the logic is added SERVER_MODEL_STATUS_UNLOADED, + SERVER_MODEL_STATUS_DOWNLOADING, SERVER_MODEL_STATUS_LOADING, SERVER_MODEL_STATUS_LOADED, SERVER_MODEL_STATUS_SLEEPING @@ -32,6 +35,9 @@ static server_model_status server_model_status_from_string(const std::string & s if (status_str == "unloaded") { return SERVER_MODEL_STATUS_UNLOADED; } + if (status_str == "downloading") { + return SERVER_MODEL_STATUS_DOWNLOADING; + } if (status_str == "loading") { return SERVER_MODEL_STATUS_LOADING; } @@ -46,11 +52,12 @@ static server_model_status server_model_status_from_string(const std::string & s static std::string server_model_status_to_string(server_model_status status) { switch (status) { - case SERVER_MODEL_STATUS_UNLOADED: return "unloaded"; - case SERVER_MODEL_STATUS_LOADING: return "loading"; - case SERVER_MODEL_STATUS_LOADED: return "loaded"; - case SERVER_MODEL_STATUS_SLEEPING: return "sleeping"; - default: return "unknown"; + case SERVER_MODEL_STATUS_UNLOADED: return "unloaded"; + case SERVER_MODEL_STATUS_DOWNLOADING: return "downloading"; + case SERVER_MODEL_STATUS_LOADING: return "loading"; + case SERVER_MODEL_STATUS_LOADED: return "loaded"; + case SERVER_MODEL_STATUS_SLEEPING: return "sleeping"; + default: return "unknown"; } } @@ -126,6 +133,12 @@ struct server_models { // not thread-safe, caller must hold mutex int can_fit(const device_memory_map & dmm_req) const; + // download model files, blocking call (caller must NOT hold mutex) + bool download_model(const std::string & name); + + // Internal helper for model loading + void _load(const std::string & name, const device_memory_map & dmm_req); + public: server_models(const common_params & params, int argc, char ** argv); diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 6566949edf1..4ff962b89fc 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -83,6 +83,11 @@ int main(int argc, char ** argv) { return 1; } + if (params.download_only) { + LOG_INF("%s: model downloaded successfully, exiting\n", __func__); + return 0; + } + // validate batch size for embeddings // embeddings require all tokens to be processed in a single ubatch // see https://github.com/ggml-org/llama.cpp/issues/12836 From cf0ebc4e643155710c10f341ae7752b6f78ac456 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Tue, 21 Apr 2026 13:22:50 +0200 Subject: [PATCH 14/15] load directly from downloaded state --- tools/server/server-models.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 9f34a8cbc18..23ba9c944a1 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -762,7 +762,6 @@ void server_models::load(const std::string & name) { } mem = meta.dmm_req; } - update_status(name, SERVER_MODEL_STATUS_UNLOADED, 0); try { _load(name, mem); } catch (const std::exception & e) { @@ -798,7 +797,7 @@ void server_models::_load(const std::string & name, const device_memory_map & dm std::lock_guard lk(mutex); auto meta = mapping[name].meta; - if (meta.status != SERVER_MODEL_STATUS_UNLOADED) { + if (meta.status != SERVER_MODEL_STATUS_UNLOADED && meta.status != SERVER_MODEL_STATUS_DOWNLOADING) { SRV_INF("model %s is not ready\n", name.c_str()); return; } From a5355a02269570c4810105168e1bf779d3912a84 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Thu, 16 Apr 2026 13:40:13 +0200 Subject: [PATCH 15/15] server: keep router model refcount to avoid unloading models that have running requests this avoids a deadlock when models A and B don't fit together, but both have requests, so the server gets into a loop unloading A, loading B, unloading B, loading A again, and so on --- tools/server/server-http.h | 8 +- tools/server/server-models.cpp | 66 ++++++++++++- tools/server/server-models.h | 7 ++ tools/server/tests/unit/test_router.py | 124 +++++++++++++++++++++++++ 4 files changed, 200 insertions(+), 5 deletions(-) diff --git a/tools/server/server-http.h b/tools/server/server-http.h index 68ae2170cf6..42ea8a8e992 100644 --- a/tools/server/server-http.h +++ b/tools/server/server-http.h @@ -28,7 +28,13 @@ struct server_http_res { return next != nullptr; } - virtual ~server_http_res() = default; + std::function on_destroy = nullptr; + + virtual ~server_http_res() { + if (on_destroy) { + on_destroy(); + } + } }; // unique pointer, used by set_chunked_content_provider diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 23ba9c944a1..379b01a4f03 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -517,6 +517,19 @@ std::vector server_models::get_all_meta() { return result; } +void server_models::inc_refs(const std::string & name) { + std::lock_guard lk(mutex); + mapping[name].active_refs++; +} + +void server_models::dec_refs(const std::string & name) { + { + std::lock_guard lk(mutex); + mapping[name].active_refs--; + } + cv.notify_all(); +} + int server_models::can_fit(const device_memory_map & dmm_req) const { device_memory_map dmm_total; for (const auto & m : mapping) { @@ -573,7 +586,8 @@ void server_models::unload_lru(const device_memory_map & dmm_req) { for (const auto & m : mapping) { if (m.second.meta.is_running()) { count_active++; - if (m.second.meta.last_used < lru_last_used) { + // Only consider idle models + if (m.second.active_refs == 0 && m.second.meta.last_used < lru_last_used) { lru_model_name = m.first; lru_last_used = m.second.meta.last_used; } @@ -598,6 +612,21 @@ void server_models::unload_lru(const device_memory_map & dmm_req) { return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED; }); } + } else if (count_active > 0 && (active_exceeded || memory_exceeded)) { + // No model idle, wait for drain + std::unique_lock lk(mutex); + bool drained = cv.wait_for(lk, std::chrono::seconds(DEFAULT_STOP_TIMEOUT), [this]() { + for (const auto & m : mapping) { + if (m.second.meta.is_running() && m.second.active_refs == 0) { + return true; + } + } + return false; + }); + if (!drained) { + SRV_WRN("%s", "drain timeout, falling back to force eviction\n"); + break; + } } else { break; } @@ -833,6 +862,7 @@ void server_models::_load(const std::string & name, const device_memory_map & dm inst.meta.port = get_free_port(); inst.meta.status = SERVER_MODEL_STATUS_LOADING; inst.meta.last_used = ggml_time_ms(); + inst.active_refs = mapping[name].active_refs; if (inst.meta.port <= 0) { throw std::runtime_error("failed to get a port number"); @@ -1168,10 +1198,18 @@ static bool router_validate_model(std::string & name, server_models & models, bo } // resolve alias to canonical model name name = meta->name; + // To avoid unloading a model before it is loaded, protect with increased ref count before it starts loading + models.inc_refs(name); if (models_autoload) { - models.ensure_model_ready(name); + try { + models.ensure_model_ready(name); + } catch (...) { + models.dec_refs(name); + throw; + } } else { if (!meta->is_running()) { + models.dec_refs(name); res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST)); return false; } @@ -1222,7 +1260,17 @@ void server_models_routes::init_routes() { if (!router_validate_model(name, models, autoload, error_res)) { return error_res; } - return models.proxy_request(req, method, name, false); + server_http_res_ptr proxy; + try { + proxy = models.proxy_request(req, method, name, false); + } catch(...) { + models.dec_refs(name); + throw; + } + proxy->on_destroy = [this, name]() { + this->models.dec_refs(name); + }; + return proxy; }; this->proxy_post = [this](const server_http_req & req) { @@ -1234,7 +1282,17 @@ void server_models_routes::init_routes() { if (!router_validate_model(name, models, autoload, error_res)) { return error_res; } - return models.proxy_request(req, method, name, true); // update last usage for POST request only + server_http_res_ptr proxy; + try { + proxy = models.proxy_request(req, method, name, true); // update last usage for POST request only + } catch(...) { + models.dec_refs(name); + throw; + } + proxy->on_destroy = [this, name]() { + this->models.dec_refs(name); + }; + return proxy; }; this->post_router_models_load = [this](const server_http_req & req) { diff --git a/tools/server/server-models.h b/tools/server/server-models.h index aa6abf7cac7..36cd0296f60 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -100,6 +100,7 @@ struct server_models { std::thread th; server_model_meta meta; FILE * stdin_file = nullptr; + uint64_t active_refs = 0; }; std::mutex mutex; @@ -174,6 +175,12 @@ struct server_models { // proxy an HTTP request to the model instance server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used); + // Increase instance ref counter + void inc_refs(const std::string & name); + + // Decrease instance ref counter + void dec_refs(const std::string & name); + // return true if the current process is a child server instance static bool is_child_server(); diff --git a/tools/server/tests/unit/test_router.py b/tools/server/tests/unit/test_router.py index 79e60db4083..d471ff88b55 100644 --- a/tools/server/tests/unit/test_router.py +++ b/tools/server/tests/unit/test_router.py @@ -1,4 +1,5 @@ import pytest +import threading from utils import * server: ServerProcess @@ -205,3 +206,126 @@ def test_router_api_key_required(): ) assert authed.status_code == 200 assert "error" not in authed.body + + +# --- Drain-aware eviction tests --- + + +def _make_completion(model_id: str, max_tokens: int = 16) -> dict: + """Send a non-streaming completion request. Returns {"content": ..., "error": ...}.""" + result = {"content": "", "error": None} + try: + res = server.make_request("POST", "/v1/chat/completions", data={ + "model": model_id, + "max_tokens": max_tokens, + "messages": [{"role": "user", "content": "hi"}], + }) + if res.status_code == 200: + choices = res.body.get("choices", []) + if choices: + result["content"] = choices[0].get("message", {}).get("content", "") + else: + result["error"] = f"status {res.status_code}: {res.body}" + except Exception as e: + result["error"] = str(e) + return result + + +def test_router_concurrent_no_thrashing(): + """Concurrent requests for different models should all succeed, not thrash.""" + global server + server = ServerPreset.router() + server.models_max = 1 + server.start() + + model_a = "ggml-org/tinygemma3-GGUF:Q8_0" + model_b = "ggml-org/test-model-stories260K:F32" + n_per_model = 3 + results = {} + + def send_request(model_id, idx): + results[(model_id, idx)] = _make_completion(model_id) + + threads = [] + for i in range(n_per_model): + threads.append(threading.Thread(target=send_request, args=(model_a, i))) + threads.append(threading.Thread(target=send_request, args=(model_b, i))) + + for t in threads: + t.start() + for t in threads: + t.join(timeout=300) + + failures = [f"{m} #{i}: {r['error']}" for (m, i), r in results.items() if r["error"] is not None] + assert len(failures) == 0, f"{len(failures)} request(s) failed:\n" + "\n".join(failures) + + +def test_router_concurrent_partial_capacity(): + """With models_max=2 and 3 models, concurrent requests should all succeed.""" + global server + server = ServerPreset.router() + server.models_max = 2 + server.start() + + models = [ + "ggml-org/tinygemma3-GGUF:Q8_0", + "ggml-org/test-model-stories260K:F32", + "ggml-org/test-model-stories260K-infill:F32", + ] + results = {} + + def send_request(model_id, idx): + results[(model_id, idx)] = _make_completion(model_id) + + threads = [] + for model in models: + for i in range(2): + threads.append(threading.Thread(target=send_request, args=(model, i))) + + for t in threads: + t.start() + for t in threads: + t.join(timeout=300) + + failures = [f"{m} #{i}: {r['error']}" for (m, i), r in results.items() if r["error"] is not None] + assert len(failures) == 0, f"{len(failures)} request(s) failed:\n" + "\n".join(failures) + + +def test_router_alternating_requests(): + """Repeated alternating requests between two models should all succeed.""" + global server + server = ServerPreset.router() + server.models_max = 1 + server.start() + + model_a = "ggml-org/tinygemma3-GGUF:Q8_0" + model_b = "ggml-org/test-model-stories260K:F32" + + for i in range(3): + result = _make_completion(model_a) + assert result["error"] is None, f"Round {i} model A failed: {result['error']}" + result = _make_completion(model_b) + assert result["error"] is None, f"Round {i} model B failed: {result['error']}" + + +def test_router_concurrent_same_model(): + """Concurrent requests for the same model should all succeed.""" + global server + server = ServerPreset.router() + server.models_max = 1 + server.start() + + model_id = "ggml-org/tinygemma3-GGUF:Q8_0" + results = {} + + def send_request(idx): + results[idx] = _make_completion(model_id) + + threads = [threading.Thread(target=send_request, args=(i,)) for i in range(6)] + for t in threads: + t.start() + for t in threads: + t.join(timeout=300) + + failures = [f"#{i}: {r['error']}" for i, r in results.items() if r["error"] is not None] + assert len(failures) == 0, f"{len(failures)} request(s) failed:\n" + "\n".join(failures)