From 8e8e2007269670cb0fae82f6fe17da970210ed07 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Sun, 29 Mar 2026 10:00:49 +0200
Subject: [PATCH 01/15] server: add --models-memory-max parameter to allow
 dynamically unloading models when they exceed a memory size threshold

---
 common/arg.cpp                  |  7 +++
 common/common.h                 |  1 +
 tools/server/server-context.cpp |  1 +
 tools/server/server-models.cpp  | 91 ++++++++++++++++++++++++---------
 tools/server/server-models.h    |  1 +
 5 files changed, 76 insertions(+), 25 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 6751a55ab0c..852b69d4252 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3072,6 +3072,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.models_max = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
+    add_opt(common_arg(
+        {"--models-memory-max"}, "N",
+        string_format("for router server, maximum memory usage in MB (default: %d, 0 = unlimited)", params.models_memory_max),
+        [](common_params & params, int value) {
+            params.models_memory_max = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MEMORY_MAX"));
     add_opt(common_arg(
         {"--models-autoload"},
         {"--no-models-autoload"},
diff --git a/common/common.h b/common/common.h
index 4137a87f1d2..cfc68ce9264 100644
--- a/common/common.h
+++ b/common/common.h
@@ -610,6 +610,7 @@ struct common_params {
     std::string models_dir    = ""; // directory containing models for the router server
     std::string models_preset = ""; // directory containing model presets for the router server
     int models_max = 4;             // maximum number of models to load simultaneously
+    int models_memory_max = 0;      // maximum memory usage in MB (0 = unlimited, estimated from model files)
     bool models_autoload = true;    // automatically load models when requested via the router server
 
     bool log_json = false;
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index a5372572f01..7a4ac804125 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -3615,6 +3615,7 @@ void server_routes::init_routes() {
             { "total_slots",                 params.n_parallel },
             { "model_alias",                 meta->model_name },
             { "model_path",                  meta->model_path },
+            { "memory_mb",                   meta->model_size / (1024 * 1024) },
             { "modalities",                  json {
                 {"vision", meta->has_inp_image},
                 {"audio",  meta->has_inp_audio},
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 6066611f51c..e60efb9f604 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -302,6 +302,7 @@ void server_models::load_models() {
             /* port         */ 0,
             /* status       */ SERVER_MODEL_STATUS_UNLOADED,
             /* last_used    */ 0,
+            /* memory_mb    */ 0,
             /* args         */ std::vector<std::string>(),
             /* exit_code    */ 0,
             /* stop_timeout */ DEFAULT_STOP_TIMEOUT,
@@ -496,34 +497,45 @@ std::vector<server_model_meta> server_models::get_all_meta() {
 }
 
 void server_models::unload_lru() {
-    if (base_params.models_max <= 0) {
+    if (base_params.models_max <= 0 && base_params.models_memory_max <= 0) {
         return; // no limit
     }
-    // remove one of the servers if we passed the models_max (least recently used - LRU)
-    std::string lru_model_name = "";
-    int64_t lru_last_used = ggml_time_ms();
-    size_t count_active = 0;
-    {
-        std::unique_lock<std::mutex> lk(mutex);
-        for (const auto & m : mapping) {
-            if (m.second.meta.is_running()) {
-                count_active++;
-                if (m.second.meta.last_used < lru_last_used) {
-                    lru_model_name = m.first;
-                    lru_last_used = m.second.meta.last_used;
+    // Keep unloading LRU models until limits are satisfied
+    while (true) {
+        std::string lru_model_name = "";
+        int64_t lru_last_used = ggml_time_ms();
+        size_t count_active = 0;
+        uint64_t total_memory_mb = 0;
+        {
+            std::unique_lock<std::mutex> lk(mutex);
+            for (const auto & m : mapping) {
+                if (m.second.meta.is_running()) {
+                    count_active++;
+                    total_memory_mb += m.second.meta.memory_mb;
+                    if (m.second.meta.last_used < lru_last_used) {
+                        lru_model_name = m.first;
+                        lru_last_used = m.second.meta.last_used;
+                    }
                 }
             }
         }
-    }
-    if (!lru_model_name.empty() && count_active >= (size_t)base_params.models_max) {
-        SRV_INF("models_max limit reached, removing LRU name=%s\n", lru_model_name.c_str());
-        unload(lru_model_name);
-        // wait for unload to complete
-        {
-            std::unique_lock<std::mutex> lk(mutex);
-            cv.wait(lk, [this, &lru_model_name]() {
-                return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED;
-            });
+        // Check if limits exceeded
+        bool count_exceeded = base_params.models_max > 0 && count_active >= (size_t)base_params.models_max;
+        bool memory_exceeded = base_params.models_memory_max > 0 && total_memory_mb >= (uint64_t)base_params.models_memory_max;
+        if (!lru_model_name.empty() && (count_exceeded || memory_exceeded)) {
+            SRV_INF("limits reached (count=%zu, memory=%lu MB), removing LRU name=%s\n",
+                    count_active, (unsigned long)total_memory_mb, lru_model_name.c_str());
+            unload(lru_model_name);
+            // wait for unload to complete
+            {
+                std::unique_lock<std::mutex> lk(mutex);
+                cv.wait(lk, [this, &lru_model_name]() {
+                    return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED;
+                });
+            }
+            // Loop continues to check if more unloading is needed
+        } else {
+            break; // limits satisfied
         }
     }
 }
@@ -546,14 +558,18 @@ void server_models::load(const std::string & name) {
     // exceeding models_max. Without this, the window between unload_lru()
     // releasing its lock and this lock_guard acquiring allows multiple
     // threads to each observe capacity and all proceed to load.
-    if (base_params.models_max > 0) {
+    if (base_params.models_max > 0 || base_params.models_memory_max > 0) {
         size_t count_active = 0;
+        uint64_t total_memory_mb = 0;
         for (const auto & m : mapping) {
             if (m.second.meta.is_running()) {
                 count_active++;
+                total_memory_mb += m.second.meta.memory_mb;
             }
         }
-        if (count_active >= (size_t)base_params.models_max) {
+        bool count_exceeded = base_params.models_max > 0 && count_active >= (size_t)base_params.models_max;
+        bool memory_exceeded = base_params.models_memory_max > 0 && total_memory_mb >= (uint64_t)base_params.models_memory_max;
+        if (count_exceeded || memory_exceeded) {
             throw std::runtime_error("model limit reached, try again later");
         }
     }
@@ -610,10 +626,35 @@ void server_models::load(const std::string & name) {
             // also handle status report from child process
             if (stdout_file) {
                 char buffer[4096];
+                bool ready_received = false;
                 while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) {
                     LOG("[%5d] %s", port, buffer);
                     std::string str(buffer);
                     if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) {
+                        // Query memory usage from the child's /props endpoint
+                        if (!ready_received) {
+                            ready_received = true;
+                            try {
+                                httplib::Client cli("http://CHILD_ADDR");
+                                cli.set_connection_timeout(5, 0);
+                                if (auto res = cli.Get("/props")) {
+                                    if (res->status == 200) {
+                                        json props = json::parse(res->body);
+                                        if (props.contains("memory_mb")) {
+                                            uint64_t memory_mb = props["memory_mb"].get<uint64_t>();
+                                            SRV_INF("model %s loaded, memory usage: %lu MB\n", name.c_str(), (unsigned long)memory_mb);
+                                            // Update memory_mb in meta
+                                            std::lock_guard<std::mutex> lk(this->mutex);
+                                            if (mapping.find(name) != mapping.end()) {
+                                                mapping[name].meta.memory_mb = memory_mb;
+                                            }
+                                        }
+                                    }
+                                }
+                            } catch (const std::exception & e) {
+                                SRV_WRN("failed to query memory for model %s: %s\n", name.c_str(), e.what());
+                            }
+                        }
                         this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0);
                     } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) {
                         this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0);
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index 1db34b6c4df..c195dbeb26e 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -62,6 +62,7 @@ struct server_model_meta {
     int port = 0;
     server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
     int64_t last_used = 0; // for LRU unloading
+    uint64_t memory_mb = 0; // estimated memory usage in MB
     std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
     int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
     int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown

From 777395f6438dd5fe6dcfb25268575694fc229edb Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Sun, 29 Mar 2026 12:18:51 +0200
Subject: [PATCH 02/15] server: include to-be-loaded model size in limit check

---
 include/llama.h                |  6 +++++
 src/llama-model.cpp            | 29 +++++++++++++++++++++++
 tools/server/server-models.cpp | 43 ++++++++++++++++++++++++----------
 tools/server/server-models.h   |  4 ++--
 4 files changed, 67 insertions(+), 15 deletions(-)

diff --git a/include/llama.h b/include/llama.h
index eb869814097..03b83f40d56 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -595,6 +595,12 @@ extern "C" {
     // Returns the total size of all the tensors in the model in bytes
     LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
 
+    // Returns the total size of all the tensors in the model in bytes from a model path
+    // without fully loading the model. Uses llama_model_loader with no_alloc=true.
+    // Returns 0 if the model cannot be loaded or the path is invalid.
+    // This function can be used to estimate memory requirements before loading a model.
+    LLAMA_API uint64_t llama_model_size_from_path(const char * path);
+
     // Get the default chat template. Returns nullptr if not available
     // If name is NULL, returns the default chat template
     LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index f77b2e9217f..3a363f55bee 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -9393,6 +9393,35 @@ uint64_t llama_model_size(const llama_model * model) {
     return model->size();
 }
 
+uint64_t llama_model_size_from_path(const char * path) {
+    if (!path) {
+        return 0;
+    }
+
+    try {
+        std::vector<std::string> splits;
+
+        llama_model_loader loader(
+            /* metadata                      */ nullptr,
+            /* set_tensor_data               */ nullptr,
+            /* set_tensor_data_ud            */ nullptr,
+            /* fname                         */ path,
+            /* splits                        */ splits,
+            /* file                          */ nullptr,
+            /* use_mmap                      */ false,
+            /* use_direct_io                 */ false,
+            /* check_tensors                 */ false,
+            /* no_alloc                      */ true,
+            /* param_overrides_p             */ nullptr,
+            /* param_tensor_buft_overrides_p */ nullptr
+        );
+
+        return loader.n_bytes;
+    } catch (...) {
+        return 0;
+    }
+}
+
 const char * llama_model_chat_template(const llama_model * model, const char * name) {
     const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
         : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index e60efb9f604..fc5cf7c9fd0 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -496,11 +496,10 @@ std::vector<server_model_meta> server_models::get_all_meta() {
     return result;
 }
 
-void server_models::unload_lru() {
+void server_models::unload_lru(uint64_t new_model_memory_mb) {
     if (base_params.models_max <= 0 && base_params.models_memory_max <= 0) {
         return; // no limit
     }
-    // Keep unloading LRU models until limits are satisfied
     while (true) {
         std::string lru_model_name = "";
         int64_t lru_last_used = ggml_time_ms();
@@ -519,12 +518,14 @@ void server_models::unload_lru() {
                 }
             }
         }
-        // Check if limits exceeded
-        bool count_exceeded = base_params.models_max > 0 && count_active >= (size_t)base_params.models_max;
-        bool memory_exceeded = base_params.models_memory_max > 0 && total_memory_mb >= (uint64_t)base_params.models_memory_max;
+        bool count_exceeded = base_params.models_max > 0 &&
+                              (count_active + 1) >= (size_t)base_params.models_max;
+        uint64_t projected_memory = total_memory_mb + new_model_memory_mb;
+        bool memory_exceeded = base_params.models_memory_max > 0 &&
+                               projected_memory >= (uint64_t)base_params.models_memory_max;
         if (!lru_model_name.empty() && (count_exceeded || memory_exceeded)) {
-            SRV_INF("limits reached (count=%zu, memory=%lu MB), removing LRU name=%s\n",
-                    count_active, (unsigned long)total_memory_mb, lru_model_name.c_str());
+            SRV_INF("limits reached (count=%zu, memory=%lu MB + %lu MB new), removing LRU name=%s\n",
+                    count_active, (unsigned long)total_memory_mb, (unsigned long)new_model_memory_mb, lru_model_name.c_str());
             unload(lru_model_name);
             // wait for unload to complete
             {
@@ -533,9 +534,8 @@ void server_models::unload_lru() {
                     return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED;
                 });
             }
-            // Loop continues to check if more unloading is needed
         } else {
-            break; // limits satisfied
+            break;
         }
     }
 }
@@ -544,7 +544,26 @@ void server_models::load(const std::string & name) {
     if (!has_model(name)) {
         throw std::runtime_error("model name=" + name + " is not found");
     }
-    unload_lru();
+
+    uint64_t new_model_memory_mb = 0;
+    if (base_params.models_memory_max > 0) {
+        std::string model_path;
+        {
+            std::lock_guard<std::mutex> lk(mutex);
+            auto & meta = mapping[name].meta;
+            if (meta.preset.get_option("LLAMA_ARG_MODEL", model_path) && !model_path.empty()) {
+                uint64_t size_bytes = llama_model_size_from_path(model_path.c_str());
+                new_model_memory_mb = size_bytes / (1024 * 1024);
+                meta.memory_mb = new_model_memory_mb;
+                if (new_model_memory_mb > 0) {
+                    SRV_INF("model %s estimated size: %lu MB\n", name.c_str(),
+                            (unsigned long)new_model_memory_mb);
+                }
+            }
+        }
+    }
+
+    unload_lru(new_model_memory_mb);
 
     std::lock_guard<std::mutex> lk(mutex);
 
@@ -631,7 +650,6 @@ void server_models::load(const std::string & name) {
                     LOG("[%5d] %s", port, buffer);
                     std::string str(buffer);
                     if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) {
-                        // Query memory usage from the child's /props endpoint
                         if (!ready_received) {
                             ready_received = true;
                             try {
@@ -642,8 +660,7 @@ void server_models::load(const std::string & name) {
                                         json props = json::parse(res->body);
                                         if (props.contains("memory_mb")) {
                                             uint64_t memory_mb = props["memory_mb"].get<uint64_t>();
-                                            SRV_INF("model %s loaded, memory usage: %lu MB\n", name.c_str(), (unsigned long)memory_mb);
-                                            // Update memory_mb in meta
+                                            SRV_INF("model %s loaded, actual memory: %lu MB\n", name.c_str(), (unsigned long)memory_mb);
                                             std::lock_guard<std::mutex> lk(this->mutex);
                                             if (mapping.find(name) != mapping.end()) {
                                                 mapping[name].meta.memory_mb = memory_mb;
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index c195dbeb26e..29c1c7c6f8d 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -62,7 +62,7 @@ struct server_model_meta {
     int port = 0;
     server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
     int64_t last_used = 0; // for LRU unloading
-    uint64_t memory_mb = 0; // estimated memory usage in MB
+    uint64_t memory_mb = 0; // size in MB (estimate before load, actual after load)
     std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
     int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
     int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
@@ -111,7 +111,7 @@ struct server_models {
     void update_meta(const std::string & name, const server_model_meta & meta);
 
     // unload least recently used models if the limit is reached
-    void unload_lru();
+    void unload_lru(uint64_t new_model_memory_mb = 0);
 
     // not thread-safe, caller must hold mutex
     void add_model(server_model_meta && meta);

From 2603b4c5bc6b9e5641ca246748c987b974430839 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Tue, 31 Mar 2026 16:18:03 +0200
Subject: [PATCH 03/15] use no_alloc to get memory requirements for model load

---
 include/llama.h                 |  6 ---
 src/llama-model.cpp             | 29 -----------
 tools/server/server-context.cpp |  1 -
 tools/server/server-models.cpp  | 86 +++++++++++++++++++--------------
 tools/server/server-models.h    |  2 +-
 5 files changed, 51 insertions(+), 73 deletions(-)

diff --git a/include/llama.h b/include/llama.h
index 03b83f40d56..eb869814097 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -595,12 +595,6 @@ extern "C" {
     // Returns the total size of all the tensors in the model in bytes
     LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
 
-    // Returns the total size of all the tensors in the model in bytes from a model path
-    // without fully loading the model. Uses llama_model_loader with no_alloc=true.
-    // Returns 0 if the model cannot be loaded or the path is invalid.
-    // This function can be used to estimate memory requirements before loading a model.
-    LLAMA_API uint64_t llama_model_size_from_path(const char * path);
-
     // Get the default chat template. Returns nullptr if not available
     // If name is NULL, returns the default chat template
     LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 3a363f55bee..f77b2e9217f 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -9393,35 +9393,6 @@ uint64_t llama_model_size(const llama_model * model) {
     return model->size();
 }
 
-uint64_t llama_model_size_from_path(const char * path) {
-    if (!path) {
-        return 0;
-    }
-
-    try {
-        std::vector<std::string> splits;
-
-        llama_model_loader loader(
-            /* metadata                      */ nullptr,
-            /* set_tensor_data               */ nullptr,
-            /* set_tensor_data_ud            */ nullptr,
-            /* fname                         */ path,
-            /* splits                        */ splits,
-            /* file                          */ nullptr,
-            /* use_mmap                      */ false,
-            /* use_direct_io                 */ false,
-            /* check_tensors                 */ false,
-            /* no_alloc                      */ true,
-            /* param_overrides_p             */ nullptr,
-            /* param_tensor_buft_overrides_p */ nullptr
-        );
-
-        return loader.n_bytes;
-    } catch (...) {
-        return 0;
-    }
-}
-
 const char * llama_model_chat_template(const llama_model * model, const char * name) {
     const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
         : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 7a4ac804125..a5372572f01 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -3615,7 +3615,6 @@ void server_routes::init_routes() {
             { "total_slots",                 params.n_parallel },
             { "model_alias",                 meta->model_name },
             { "model_path",                  meta->model_path },
-            { "memory_mb",                   meta->model_size / (1024 * 1024) },
             { "modalities",                  json {
                 {"vision", meta->has_inp_image},
                 {"audio",  meta->has_inp_audio},
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index fc5cf7c9fd0..42f7a1d2de4 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -540,6 +540,49 @@ void server_models::unload_lru(uint64_t new_model_memory_mb) {
     }
 }
 
+static uint64_t get_model_memory_mb(const common_preset& preset) {
+    common_params params;
+    preset.apply_to_params(params);
+
+    if(params.model.path.empty()) {
+        return 0;
+    }
+
+    struct log_ud_t {
+        struct {
+            ggml_log_callback callback;
+            void * user_data;
+        } original;
+        ggml_log_level min_level;
+    } log_ud;
+    llama_log_get(&log_ud.original.callback, &log_ud.original.user_data);
+    log_ud.min_level = GGML_LOG_LEVEL_WARN;
+
+    llama_log_set([](ggml_log_level level, const char * text, void * ud) {
+        log_ud_t * d = (log_ud_t *) ud;
+        const ggml_log_level eff = level >= d->min_level ? level : GGML_LOG_LEVEL_DEBUG;
+        d->original.callback(eff, text, d->original.user_data);
+    }, &log_ud);
+
+    llama_model_params mparams = common_model_params_to_llama(params);
+    mparams.no_alloc = true;
+    mparams.use_mmap = false;
+    mparams.use_mlock = false;
+
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
+
+    llama_log_set(log_ud.original.callback, log_ud.original.user_data);
+
+    if (!model) {
+        return 0;
+    }
+
+    uint64_t size_bytes = llama_model_size(model);
+    llama_model_free(model);
+
+    return size_bytes / (1024 * 1024);
+}
+
 void server_models::load(const std::string & name) {
     if (!has_model(name)) {
         throw std::runtime_error("model name=" + name + " is not found");
@@ -547,19 +590,13 @@ void server_models::load(const std::string & name) {
 
     uint64_t new_model_memory_mb = 0;
     if (base_params.models_memory_max > 0) {
-        std::string model_path;
-        {
-            std::lock_guard<std::mutex> lk(mutex);
-            auto & meta = mapping[name].meta;
-            if (meta.preset.get_option("LLAMA_ARG_MODEL", model_path) && !model_path.empty()) {
-                uint64_t size_bytes = llama_model_size_from_path(model_path.c_str());
-                new_model_memory_mb = size_bytes / (1024 * 1024);
-                meta.memory_mb = new_model_memory_mb;
-                if (new_model_memory_mb > 0) {
-                    SRV_INF("model %s estimated size: %lu MB\n", name.c_str(),
-                            (unsigned long)new_model_memory_mb);
-                }
-            }
+        std::lock_guard<std::mutex> lk(mutex);
+        auto & meta = mapping[name].meta;
+        new_model_memory_mb = get_model_memory_mb(meta.preset);
+        meta.memory_mb = new_model_memory_mb;
+        if (new_model_memory_mb > 0) {
+            SRV_INF("model %s memory requirements: %lu MB\n", name.c_str(),
+                    (unsigned long)new_model_memory_mb);
         }
     }
 
@@ -645,33 +682,10 @@ void server_models::load(const std::string & name) {
             // also handle status report from child process
             if (stdout_file) {
                 char buffer[4096];
-                bool ready_received = false;
                 while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) {
                     LOG("[%5d] %s", port, buffer);
                     std::string str(buffer);
                     if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) {
-                        if (!ready_received) {
-                            ready_received = true;
-                            try {
-                                httplib::Client cli("http://CHILD_ADDR");
-                                cli.set_connection_timeout(5, 0);
-                                if (auto res = cli.Get("/props")) {
-                                    if (res->status == 200) {
-                                        json props = json::parse(res->body);
-                                        if (props.contains("memory_mb")) {
-                                            uint64_t memory_mb = props["memory_mb"].get<uint64_t>();
-                                            SRV_INF("model %s loaded, actual memory: %lu MB\n", name.c_str(), (unsigned long)memory_mb);
-                                            std::lock_guard<std::mutex> lk(this->mutex);
-                                            if (mapping.find(name) != mapping.end()) {
-                                                mapping[name].meta.memory_mb = memory_mb;
-                                            }
-                                        }
-                                    }
-                                }
-                            } catch (const std::exception & e) {
-                                SRV_WRN("failed to query memory for model %s: %s\n", name.c_str(), e.what());
-                            }
-                        }
                         this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0);
                     } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) {
                         this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0);
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index 29c1c7c6f8d..2cbdb35b321 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -62,7 +62,7 @@ struct server_model_meta {
     int port = 0;
     server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
     int64_t last_used = 0; // for LRU unloading
-    uint64_t memory_mb = 0; // size in MB (estimate before load, actual after load)
+    uint64_t memory_mb = 0; // size in MB
     std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
     int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
     int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown

From 9b5af58a9ae8162e57492e2a07a46b22a7cc1bc3 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Tue, 31 Mar 2026 17:37:16 +0200
Subject: [PATCH 04/15] only set model memory_mb if not previously calculated

---
 tools/server/server-models.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 42f7a1d2de4..1363585dff3 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -592,8 +592,12 @@ void server_models::load(const std::string & name) {
     if (base_params.models_memory_max > 0) {
         std::lock_guard<std::mutex> lk(mutex);
         auto & meta = mapping[name].meta;
-        new_model_memory_mb = get_model_memory_mb(meta.preset);
-        meta.memory_mb = new_model_memory_mb;
+        if (meta.memory_mb > 0) {
+            new_model_memory_mb = meta.memory_mb;
+        } else {
+            new_model_memory_mb = get_model_memory_mb(meta.preset);
+            meta.memory_mb = new_model_memory_mb;
+        }
         if (new_model_memory_mb > 0) {
             SRV_INF("model %s memory requirements: %lu MB\n", name.c_str(),
                     (unsigned long)new_model_memory_mb);

From 56122b35ad6679d0efccb929bca5b1ff5420f950 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Thu, 2 Apr 2026 09:24:53 +0200
Subject: [PATCH 05/15] use memory margin instead of total size limit, apply to
 each device separately

---
 common/arg.cpp                 |   8 +-
 common/common.h                |  10 +--
 include/llama.h                |   6 ++
 src/llama-context.cpp          |  13 +++
 tools/server/server-models.cpp | 139 ++++++++++++++++++++++-----------
 tools/server/server-models.h   |  12 ++-
 6 files changed, 132 insertions(+), 56 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 852b69d4252..37e2c8dda1f 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3073,12 +3073,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
     add_opt(common_arg(
-        {"--models-memory-max"}, "N",
-        string_format("for router server, maximum memory usage in MB (default: %d, 0 = unlimited)", params.models_memory_max),
+        {"--models-memory-margin"}, "N",
+        string_format("for router server, MB of memory to leave free, per device (default: %d, 0 = unlimited)", params.models_memory_margin),
         [](common_params & params, int value) {
-            params.models_memory_max = value;
+            params.models_memory_margin = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MEMORY_MAX"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MEMORY_MARGIN"));
     add_opt(common_arg(
         {"--models-autoload"},
         {"--no-models-autoload"},
diff --git a/common/common.h b/common/common.h
index cfc68ce9264..8ac5b9a8bdb 100644
--- a/common/common.h
+++ b/common/common.h
@@ -607,11 +607,11 @@ struct common_params {
     std::vector<std::string> server_tools;
 
     // router server configs
-    std::string models_dir    = ""; // directory containing models for the router server
-    std::string models_preset = ""; // directory containing model presets for the router server
-    int models_max = 4;             // maximum number of models to load simultaneously
-    int models_memory_max = 0;      // maximum memory usage in MB (0 = unlimited, estimated from model files)
-    bool models_autoload = true;    // automatically load models when requested via the router server
+    std::string models_dir    = "";  // directory containing models for the router server
+    std::string models_preset = "";  // directory containing model presets for the router server
+    int models_max = 4;              // maximum number of models to load simultaneously
+    int models_memory_margin = 1024; // MB of free memory to preserve per device (0 = disabled)
+    bool models_autoload = true;     // automatically load models when requested via the router server
 
     bool log_json = false;
 
diff --git a/include/llama.h b/include/llama.h
index eb869814097..72fff81bb25 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1525,6 +1525,12 @@ extern "C" {
     LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
     LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);
 
+    // Returns the projected memory use (model + context + compute) in bytes
+    // for the given device within this context. Returns 0 if the device is not used.
+    LLAMA_API uint64_t llama_context_device_memory(
+            const struct llama_context * ctx,
+            ggml_backend_dev_t           device);
+
     //
     // training
     //
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 8126249e143..79437bbd177 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -3493,6 +3493,19 @@ void llama_perf_context_reset(llama_context * ctx) {
     ctx->perf_reset();
 }
 
+uint64_t llama_context_device_memory(const llama_context * ctx, ggml_backend_dev_t device) {
+    const bool is_host = ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU;
+    uint64_t total = 0;
+    for (const auto & [buft, mb] : ctx->memory_breakdown()) {
+        const bool matches = is_host ? ggml_backend_buft_is_host(buft) :
+                                       ggml_backend_buft_get_device(buft) == device;
+        if (matches) {
+            total += mb.total();
+        }
+    }
+    return total;
+}
+
 //
 // training
 //
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 1363585dff3..00301be17b6 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -180,6 +180,21 @@ server_models::server_models(
         LOG_WRN("failed to get server executable path: %s\n", e.what());
         LOG_WRN("using original argv[0] as fallback: %s\n", argv[0]);
     }
+
+    const uint64_t memory_margin = base_params.models_memory_margin * 1024 * 1024;
+
+    if (memory_margin > 0) {
+        const size_t n_devs = ggml_backend_dev_count();
+        for (size_t i = 0; i < n_devs; i++) {
+            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+            size_t free, total;
+            ggml_backend_dev_memory(dev, &free, &total);
+            if (total > 0) {
+                memory_per_device[dev] = (free > memory_margin) ? free - memory_margin : 0;
+            }
+        }
+    }
+
     load_models();
 }
 
@@ -295,17 +310,17 @@ void server_models::load_models() {
     // convert presets to server_model_meta and add to mapping
     for (const auto & preset : final_presets) {
         server_model_meta meta{
-            /* preset       */ preset.second,
-            /* name         */ preset.first,
-            /* aliases      */ {},
-            /* tags         */ {},
-            /* port         */ 0,
-            /* status       */ SERVER_MODEL_STATUS_UNLOADED,
-            /* last_used    */ 0,
-            /* memory_mb    */ 0,
-            /* args         */ std::vector<std::string>(),
-            /* exit_code    */ 0,
-            /* stop_timeout */ DEFAULT_STOP_TIMEOUT,
+            /* preset            */ preset.second,
+            /* name              */ preset.first,
+            /* aliases           */ {},
+            /* tags              */ {},
+            /* port              */ 0,
+            /* status            */ SERVER_MODEL_STATUS_UNLOADED,
+            /* last_used         */ 0,
+            /* memory_per_device */ {},
+            /* args              */ std::vector<std::string>(),
+            /* exit_code         */ 0,
+            /* stop_timeout      */ DEFAULT_STOP_TIMEOUT,
         };
         add_model(std::move(meta));
     }
@@ -496,36 +511,63 @@ std::vector<server_model_meta> server_models::get_all_meta() {
     return result;
 }
 
-void server_models::unload_lru(uint64_t new_model_memory_mb) {
-    if (base_params.models_max <= 0 && base_params.models_memory_max <= 0) {
+uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_memory_per_device) const {
+    model_memory_map total_memory_per_device;
+    for (const auto & m : mapping) {
+        if (m.second.meta.is_running()) {
+            for (const auto& [key, value] : m.second.meta.memory_per_device) {
+                total_memory_per_device[key] += value;
+            }
+        }
+    }
+
+    auto get = [](const model_memory_map & m, ggml_backend_dev_t k) {
+        auto it = m.find(k);
+        return it != m.end() ? it->second : 0;
+    };
+
+    uint64_t memory_exceeded = 0;
+
+    for (const auto& [key, limit] : memory_per_device) {
+        if (get(new_model_memory_per_device, key) + get(total_memory_per_device, key) > limit) {
+            memory_exceeded++;
+        }
+    }
+
+    return memory_exceeded;
+}
+
+void server_models::unload_lru(const model_memory_map& new_model_memory_per_device) {
+    const bool check_memory = base_params.models_memory_margin > 0 && !memory_per_device.empty();
+
+    if (base_params.models_max <= 0 && !check_memory) {
         return; // no limit
     }
+
     while (true) {
         std::string lru_model_name = "";
         int64_t lru_last_used = ggml_time_ms();
         size_t count_active = 0;
-        uint64_t total_memory_mb = 0;
+        uint64_t memory_exceeded = 0;
         {
             std::unique_lock<std::mutex> lk(mutex);
             for (const auto & m : mapping) {
                 if (m.second.meta.is_running()) {
                     count_active++;
-                    total_memory_mb += m.second.meta.memory_mb;
                     if (m.second.meta.last_used < lru_last_used) {
                         lru_model_name = m.first;
                         lru_last_used = m.second.meta.last_used;
                     }
                 }
             }
+            memory_exceeded = get_memory_exceeded(new_model_memory_per_device);
         }
         bool count_exceeded = base_params.models_max > 0 &&
                               (count_active + 1) >= (size_t)base_params.models_max;
-        uint64_t projected_memory = total_memory_mb + new_model_memory_mb;
-        bool memory_exceeded = base_params.models_memory_max > 0 &&
-                               projected_memory >= (uint64_t)base_params.models_memory_max;
-        if (!lru_model_name.empty() && (count_exceeded || memory_exceeded)) {
-            SRV_INF("limits reached (count=%zu, memory=%lu MB + %lu MB new), removing LRU name=%s\n",
-                    count_active, (unsigned long)total_memory_mb, (unsigned long)new_model_memory_mb, lru_model_name.c_str());
+
+        if (!lru_model_name.empty() && (count_exceeded || memory_exceeded > 0)) {
+            SRV_INF("limits reached (count=%zu, memory margin exceeded on %zu device(s)), removing LRU name=%s\n",
+                    count_active, memory_exceeded,  lru_model_name.c_str());
             unload(lru_model_name);
             // wait for unload to complete
             {
@@ -540,12 +582,12 @@ void server_models::unload_lru(uint64_t new_model_memory_mb) {
     }
 }
 
-static uint64_t get_model_memory_mb(const common_preset& preset) {
+static model_memory_map get_model_memory_per_device(const common_preset& preset) {
     common_params params;
     preset.apply_to_params(params);
 
     if(params.model.path.empty()) {
-        return 0;
+        return {};
     }
 
     struct log_ud_t {
@@ -569,18 +611,32 @@ static uint64_t get_model_memory_mb(const common_preset& preset) {
     mparams.use_mmap = false;
     mparams.use_mlock = false;
 
-    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
+    llama_model_ptr model{llama_model_load_from_file(params.model.path.c_str(), mparams)};
+
+    if (!model) {
+        llama_log_set(log_ud.original.callback, log_ud.original.user_data);
+        return {};
+    }
 
+    llama_context_params cparams = common_context_params_to_llama(params);
+    llama_context_ptr ctx{llama_init_from_model(model.get(), cparams)};
     llama_log_set(log_ud.original.callback, log_ud.original.user_data);
 
-    if (!model) {
-        return 0;
+    if (!ctx) {
+        return {};
     }
 
-    uint64_t size_bytes = llama_model_size(model);
-    llama_model_free(model);
+    model_memory_map result;
+    const size_t n_devs = ggml_backend_dev_count();
+    for (size_t i = 0; i < n_devs; i++) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        uint64_t bytes = llama_context_device_memory(ctx.get(), dev);
+        if (bytes > 0) {
+            result[dev] = bytes;
+        }
+    }
 
-    return size_bytes / (1024 * 1024);
+    return result;
 }
 
 void server_models::load(const std::string & name) {
@@ -588,23 +644,18 @@ void server_models::load(const std::string & name) {
         throw std::runtime_error("model name=" + name + " is not found");
     }
 
-    uint64_t new_model_memory_mb = 0;
-    if (base_params.models_memory_max > 0) {
+    model_memory_map new_model_memory_per_device;
+    if (base_params.models_memory_margin > 0) {
         std::lock_guard<std::mutex> lk(mutex);
         auto & meta = mapping[name].meta;
-        if (meta.memory_mb > 0) {
-            new_model_memory_mb = meta.memory_mb;
-        } else {
-            new_model_memory_mb = get_model_memory_mb(meta.preset);
-            meta.memory_mb = new_model_memory_mb;
-        }
-        if (new_model_memory_mb > 0) {
-            SRV_INF("model %s memory requirements: %lu MB\n", name.c_str(),
-                    (unsigned long)new_model_memory_mb);
+        if (meta.memory_per_device.empty()) {
+            meta.memory_per_device = get_model_memory_per_device(meta.preset);
         }
+
+        new_model_memory_per_device = meta.memory_per_device;
     }
 
-    unload_lru(new_model_memory_mb);
+    unload_lru(new_model_memory_per_device);
 
     std::lock_guard<std::mutex> lk(mutex);
 
@@ -618,17 +669,15 @@ void server_models::load(const std::string & name) {
     // exceeding models_max. Without this, the window between unload_lru()
     // releasing its lock and this lock_guard acquiring allows multiple
     // threads to each observe capacity and all proceed to load.
-    if (base_params.models_max > 0 || base_params.models_memory_max > 0) {
+    if (base_params.models_max > 0 || base_params.models_memory_margin > 0) {
         size_t count_active = 0;
-        uint64_t total_memory_mb = 0;
         for (const auto & m : mapping) {
             if (m.second.meta.is_running()) {
                 count_active++;
-                total_memory_mb += m.second.meta.memory_mb;
             }
         }
         bool count_exceeded = base_params.models_max > 0 && count_active >= (size_t)base_params.models_max;
-        bool memory_exceeded = base_params.models_memory_max > 0 && total_memory_mb >= (uint64_t)base_params.models_memory_max;
+        bool memory_exceeded = get_memory_exceeded(new_model_memory_per_device) > 0;
         if (count_exceeded || memory_exceeded) {
             throw std::runtime_error("model limit reached, try again later");
         }
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index 2cbdb35b321..38d6929a881 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -54,6 +54,8 @@ static std::string server_model_status_to_string(server_model_status status) {
     }
 }
 
+using model_memory_map = std::map<ggml_backend_dev_t, uint64_t>;
+
 struct server_model_meta {
     common_preset preset;
     std::string name;
@@ -62,7 +64,7 @@ struct server_model_meta {
     int port = 0;
     server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
     int64_t last_used = 0; // for LRU unloading
-    uint64_t memory_mb = 0; // size in MB
+    model_memory_map memory_per_device; // projected bytes per device
     std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
     int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
     int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
@@ -108,14 +110,20 @@ struct server_models {
     std::vector<std::string> base_env;
     common_preset base_preset; // base preset from llama-server CLI args
 
+    // available memory per device
+    std::map<ggml_backend_dev_t, uint64_t> memory_per_device;
+
     void update_meta(const std::string & name, const server_model_meta & meta);
 
     // unload least recently used models if the limit is reached
-    void unload_lru(uint64_t new_model_memory_mb = 0);
+    void unload_lru(const model_memory_map& new_model_memory_per_device);
 
     // not thread-safe, caller must hold mutex
     void add_model(server_model_meta && meta);
 
+    // not thread-safe, caller must hold mutex
+    uint64_t get_memory_exceeded(const model_memory_map& new_model_memory_per_device) const;
+
 public:
     server_models(const common_params & params, int argc, char ** argv);
 

From 51538c1f7864015601ac470127be06f72c9a6d30 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Thu, 2 Apr 2026 10:07:04 +0200
Subject: [PATCH 06/15] add server memory debug logging

---
 tools/server/server-models.cpp | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 00301be17b6..37cd81f2ef0 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -181,7 +181,7 @@ server_models::server_models(
         LOG_WRN("using original argv[0] as fallback: %s\n", argv[0]);
     }
 
-    const uint64_t memory_margin = base_params.models_memory_margin * 1024 * 1024;
+    const uint64_t memory_margin = (uint64_t)base_params.models_memory_margin * 1024 * 1024;
 
     if (memory_margin > 0) {
         const size_t n_devs = ggml_backend_dev_count();
@@ -190,7 +190,11 @@ server_models::server_models(
             size_t free, total;
             ggml_backend_dev_memory(dev, &free, &total);
             if (total > 0) {
-                memory_per_device[dev] = (free > memory_margin) ? free - memory_margin : 0;
+                const uint64_t available = (free > memory_margin) ? free - memory_margin : 0;
+                memory_per_device[dev] = available;
+                SRV_DBG("device %s: available memory after margin=%lu MB\n",
+                    ggml_backend_dev_name(dev),
+                    (unsigned long)(available / (1024 * 1024)));
             }
         }
     }
@@ -529,7 +533,15 @@ uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_me
     uint64_t memory_exceeded = 0;
 
     for (const auto& [key, limit] : memory_per_device) {
-        if (get(new_model_memory_per_device, key) + get(total_memory_per_device, key) > limit) {
+        const uint64_t total_memory = get(total_memory_per_device, key);
+        const uint64_t new_memory = get(new_model_memory_per_device, key);
+        SRV_DBG("device %s: total=%lu MB, new=%lu MB, limit=%lu MB\n",
+            ggml_backend_dev_name(key),
+            (unsigned long)(total_memory / (1024 * 1024)),
+            (unsigned long)(new_memory / (1024 * 1024)),
+            (unsigned long)(limit / (1024 * 1024)));
+
+        if (total_memory + new_memory > limit) {
             memory_exceeded++;
         }
     }

From ba2521c6a06c11323b0f22e9ae3d3e4d56e4aa77 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Thu, 2 Apr 2026 11:39:07 +0200
Subject: [PATCH 07/15] move llama_context_device_memory function to
 llama-ext.h

---
 include/llama.h                | 6 ------
 src/llama-ext.h                | 6 ++++++
 tools/server/server-models.cpp | 2 ++
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/include/llama.h b/include/llama.h
index 72fff81bb25..eb869814097 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1525,12 +1525,6 @@ extern "C" {
     LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
     LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);
 
-    // Returns the projected memory use (model + context + compute) in bytes
-    // for the given device within this context. Returns 0 if the device is not used.
-    LLAMA_API uint64_t llama_context_device_memory(
-            const struct llama_context * ctx,
-            ggml_backend_dev_t           device);
-
     //
     // training
     //
diff --git a/src/llama-ext.h b/src/llama-ext.h
index 8ce29d217cb..ce87fa32a4a 100644
--- a/src/llama-ext.h
+++ b/src/llama-ext.h
@@ -88,3 +88,9 @@ LLAMA_API int32_t llama_model_n_devices(const struct llama_model * model);
 LLAMA_API ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i);
 
 LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx);
+
+// Returns the projected memory use (model + context + compute) in bytes
+// for the given device within this context. Returns 0 if the device is not used.
+LLAMA_API uint64_t llama_context_device_memory(
+        const struct llama_context * ctx,
+        ggml_backend_dev_t           device);
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 37cd81f2ef0..ceacf6d7ec9 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -8,6 +8,8 @@
 #include <cpp-httplib/httplib.h> // TODO: remove this once we use HTTP client from download.h
 #include <sheredom/subprocess.h>
 
+#include "../../src/llama-ext.h"
+
 #include <functional>
 #include <algorithm>
 #include <thread>

From 7500063065f313c88c6c36efdb1e9e2bfe2f397b Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Thu, 2 Apr 2026 11:39:36 +0200
Subject: [PATCH 08/15] fix model count exceeded check

---
 tools/server/server-models.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index ceacf6d7ec9..48aef5a6a55 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -577,7 +577,7 @@ void server_models::unload_lru(const model_memory_map& new_model_memory_per_devi
             memory_exceeded = get_memory_exceeded(new_model_memory_per_device);
         }
         bool count_exceeded = base_params.models_max > 0 &&
-                              (count_active + 1) >= (size_t)base_params.models_max;
+                              (count_active + 1) > (size_t)base_params.models_max;
 
         if (!lru_model_name.empty() && (count_exceeded || memory_exceeded > 0)) {
             SRV_INF("limits reached (count=%zu, memory margin exceeded on %zu device(s)), removing LRU name=%s\n",

From 173da43c957e15beb8556b35dacd92a9474af783 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Tue, 7 Apr 2026 13:28:49 +0200
Subject: [PATCH 09/15] improve memory_per_device map naming

---
 tools/server/server-models.cpp | 14 +++++++-------
 tools/server/server-models.h   |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 48aef5a6a55..22584db1c68 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -193,7 +193,7 @@ server_models::server_models(
             ggml_backend_dev_memory(dev, &free, &total);
             if (total > 0) {
                 const uint64_t available = (free > memory_margin) ? free - memory_margin : 0;
-                memory_per_device[dev] = available;
+                available_memory_per_device[dev] = available;
                 SRV_DBG("device %s: available memory after margin=%lu MB\n",
                     ggml_backend_dev_name(dev),
                     (unsigned long)(available / (1024 * 1024)));
@@ -521,7 +521,7 @@ uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_me
     model_memory_map total_memory_per_device;
     for (const auto & m : mapping) {
         if (m.second.meta.is_running()) {
-            for (const auto& [key, value] : m.second.meta.memory_per_device) {
+            for (const auto& [key, value] : m.second.meta.memory_usage_per_device) {
                 total_memory_per_device[key] += value;
             }
         }
@@ -534,7 +534,7 @@ uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_me
 
     uint64_t memory_exceeded = 0;
 
-    for (const auto& [key, limit] : memory_per_device) {
+    for (const auto& [key, limit] : available_memory_per_device) {
         const uint64_t total_memory = get(total_memory_per_device, key);
         const uint64_t new_memory = get(new_model_memory_per_device, key);
         SRV_DBG("device %s: total=%lu MB, new=%lu MB, limit=%lu MB\n",
@@ -552,7 +552,7 @@ uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_me
 }
 
 void server_models::unload_lru(const model_memory_map& new_model_memory_per_device) {
-    const bool check_memory = base_params.models_memory_margin > 0 && !memory_per_device.empty();
+    const bool check_memory = base_params.models_memory_margin > 0 && !available_memory_per_device.empty();
 
     if (base_params.models_max <= 0 && !check_memory) {
         return; // no limit
@@ -662,11 +662,11 @@ void server_models::load(const std::string & name) {
     if (base_params.models_memory_margin > 0) {
         std::lock_guard<std::mutex> lk(mutex);
         auto & meta = mapping[name].meta;
-        if (meta.memory_per_device.empty()) {
-            meta.memory_per_device = get_model_memory_per_device(meta.preset);
+        if (meta.memory_usage_per_device.empty()) {
+            meta.memory_usage_per_device = get_model_memory_per_device(meta.preset);
         }
 
-        new_model_memory_per_device = meta.memory_per_device;
+        new_model_memory_per_device = meta.memory_usage_per_device;
     }
 
     unload_lru(new_model_memory_per_device);
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index 38d6929a881..0f2f8f9a192 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -64,7 +64,7 @@ struct server_model_meta {
     int port = 0;
     server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
     int64_t last_used = 0; // for LRU unloading
-    model_memory_map memory_per_device; // projected bytes per device
+    model_memory_map memory_usage_per_device; // bytes used per device
     std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
     int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
     int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
@@ -111,7 +111,7 @@ struct server_models {
     common_preset base_preset; // base preset from llama-server CLI args
 
     // available memory per device
-    std::map<ggml_backend_dev_t, uint64_t> memory_per_device;
+    std::map<ggml_backend_dev_t, uint64_t> available_memory_per_device;
 
     void update_meta(const std::string & name, const server_model_meta & meta);
 

From 69e3086190009069cf85e6ade57c427c948b10bf Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Tue, 7 Apr 2026 13:35:02 +0200
Subject: [PATCH 10/15] improve variable naming, fix style

---
 common/arg.cpp                 |  2 +-
 tools/server/server-models.cpp | 24 ++++++++++++------------
 tools/server/server-models.h   |  6 +++---
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 37e2c8dda1f..7ba0f2fc256 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3074,7 +3074,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
     add_opt(common_arg(
         {"--models-memory-margin"}, "N",
-        string_format("for router server, MB of memory to leave free, per device (default: %d, 0 = unlimited)", params.models_memory_margin),
+        string_format("for router server, MiB of memory to leave free, per device (default: %d, 0 = unlimited)", params.models_memory_margin),
         [](common_params & params, int value) {
             params.models_memory_margin = value;
         }
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 22584db1c68..544798c55eb 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -194,7 +194,7 @@ server_models::server_models(
             if (total > 0) {
                 const uint64_t available = (free > memory_margin) ? free - memory_margin : 0;
                 available_memory_per_device[dev] = available;
-                SRV_DBG("device %s: available memory after margin=%lu MB\n",
+                SRV_DBG("device %s: available memory after margin=%lu MiB\n",
                     ggml_backend_dev_name(dev),
                     (unsigned long)(available / (1024 * 1024)));
             }
@@ -517,11 +517,11 @@ std::vector<server_model_meta> server_models::get_all_meta() {
     return result;
 }
 
-uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_memory_per_device) const {
+uint64_t server_models::get_memory_exceeded(const model_memory_map & new_model_memory_per_device) const {
     model_memory_map total_memory_per_device;
     for (const auto & m : mapping) {
         if (m.second.meta.is_running()) {
-            for (const auto& [key, value] : m.second.meta.memory_usage_per_device) {
+            for (const auto & [key, value] : m.second.meta.memory_usage_per_device) {
                 total_memory_per_device[key] += value;
             }
         }
@@ -532,9 +532,9 @@ uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_me
         return it != m.end() ? it->second : 0;
     };
 
-    uint64_t memory_exceeded = 0;
+    size_t count_memory_exceeded = 0;
 
-    for (const auto& [key, limit] : available_memory_per_device) {
+    for (const auto & [key, limit] : available_memory_per_device) {
         const uint64_t total_memory = get(total_memory_per_device, key);
         const uint64_t new_memory = get(new_model_memory_per_device, key);
         SRV_DBG("device %s: total=%lu MB, new=%lu MB, limit=%lu MB\n",
@@ -544,14 +544,14 @@ uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_me
             (unsigned long)(limit / (1024 * 1024)));
 
         if (total_memory + new_memory > limit) {
-            memory_exceeded++;
+            count_memory_exceeded++;
         }
     }
 
-    return memory_exceeded;
+    return count_memory_exceeded;
 }
 
-void server_models::unload_lru(const model_memory_map& new_model_memory_per_device) {
+void server_models::unload_lru(const model_memory_map & new_model_memory_per_device) {
     const bool check_memory = base_params.models_memory_margin > 0 && !available_memory_per_device.empty();
 
     if (base_params.models_max <= 0 && !check_memory) {
@@ -562,7 +562,7 @@ void server_models::unload_lru(const model_memory_map& new_model_memory_per_devi
         std::string lru_model_name = "";
         int64_t lru_last_used = ggml_time_ms();
         size_t count_active = 0;
-        uint64_t memory_exceeded = 0;
+        size_t count_memory_exceeded = 0;
         {
             std::unique_lock<std::mutex> lk(mutex);
             for (const auto & m : mapping) {
@@ -574,14 +574,14 @@ void server_models::unload_lru(const model_memory_map& new_model_memory_per_devi
                     }
                 }
             }
-            memory_exceeded = get_memory_exceeded(new_model_memory_per_device);
+            count_memory_exceeded = get_memory_exceeded(new_model_memory_per_device);
         }
         bool count_exceeded = base_params.models_max > 0 &&
                               (count_active + 1) > (size_t)base_params.models_max;
 
-        if (!lru_model_name.empty() && (count_exceeded || memory_exceeded > 0)) {
+        if (!lru_model_name.empty() && (count_exceeded || count_memory_exceeded > 0)) {
             SRV_INF("limits reached (count=%zu, memory margin exceeded on %zu device(s)), removing LRU name=%s\n",
-                    count_active, memory_exceeded,  lru_model_name.c_str());
+                    count_active, count_memory_exceeded,  lru_model_name.c_str());
             unload(lru_model_name);
             // wait for unload to complete
             {
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index 0f2f8f9a192..f86cc0b2cc4 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -111,18 +111,18 @@ struct server_models {
     common_preset base_preset; // base preset from llama-server CLI args
 
     // available memory per device
-    std::map<ggml_backend_dev_t, uint64_t> available_memory_per_device;
+    model_memory_map available_memory_per_device;
 
     void update_meta(const std::string & name, const server_model_meta & meta);
 
     // unload least recently used models if the limit is reached
-    void unload_lru(const model_memory_map& new_model_memory_per_device);
+    void unload_lru(const model_memory_map & new_model_memory_per_device);
 
     // not thread-safe, caller must hold mutex
     void add_model(server_model_meta && meta);
 
     // not thread-safe, caller must hold mutex
-    uint64_t get_memory_exceeded(const model_memory_map& new_model_memory_per_device) const;
+    uint64_t get_memory_exceeded(const model_memory_map & new_model_memory_per_device) const;
 
 public:
     server_models(const common_params & params, int argc, char ** argv);

From eb2cf73ff9c54b693487134b80ee24ed15d0a975 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Mon, 13 Apr 2026 10:14:53 +0200
Subject: [PATCH 11/15] also strip models memory margin from child processes

---
 tools/server/server-models.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 544798c55eb..ef6acb57de8 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -96,6 +96,7 @@ static void unset_reserved_args(common_preset & preset, bool unset_model_args) {
     preset.unset_option("LLAMA_API_KEY");
     preset.unset_option("LLAMA_ARG_MODELS_DIR");
     preset.unset_option("LLAMA_ARG_MODELS_MAX");
+    preset.unset_option("LLAMA_ARG_MODELS_MEMORY_MARGIN");
     preset.unset_option("LLAMA_ARG_MODELS_PRESET");
     preset.unset_option("LLAMA_ARG_MODELS_AUTOLOAD");
     if (unset_model_args) {

From 1a8aec0afd8209e49bc47e99c791d706dd84ef96 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 16 Apr 2026 14:32:47 +0300
Subject: [PATCH 12/15] cont : clean-up

---
 common/common.h                |   2 +-
 tools/server/server-models.cpp | 129 ++++++++++++++++++---------------
 tools/server/server-models.h   |  12 +--
 3 files changed, 80 insertions(+), 63 deletions(-)

diff --git a/common/common.h b/common/common.h
index 8ac5b9a8bdb..2996d354049 100644
--- a/common/common.h
+++ b/common/common.h
@@ -610,7 +610,7 @@ struct common_params {
     std::string models_dir    = "";  // directory containing models for the router server
     std::string models_preset = "";  // directory containing model presets for the router server
     int models_max = 4;              // maximum number of models to load simultaneously
-    int models_memory_margin = 1024; // MB of free memory to preserve per device (0 = disabled)
+    int models_memory_margin = 1024; // MiB of free memory to preserve per device (0 = disabled)
     bool models_autoload = true;     // automatically load models when requested via the router server
 
     bool log_json = false;
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index ef6acb57de8..96a291854d6 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -180,11 +180,11 @@ server_models::server_models(
         bin_path = get_server_exec_path().string();
     } catch (const std::exception & e) {
         bin_path = argv[0];
-        LOG_WRN("failed to get server executable path: %s\n", e.what());
-        LOG_WRN("using original argv[0] as fallback: %s\n", argv[0]);
+        SRV_WRN("failed to get server executable path: %s\n", e.what());
+        SRV_WRN("using original argv[0] as fallback: %s\n", argv[0]);
     }
 
-    const uint64_t memory_margin = (uint64_t)base_params.models_memory_margin * 1024 * 1024;
+    const size_t memory_margin = (size_t) base_params.models_memory_margin * 1024 * 1024;
 
     if (memory_margin > 0) {
         const size_t n_devs = ggml_backend_dev_count();
@@ -193,11 +193,10 @@ server_models::server_models(
             size_t free, total;
             ggml_backend_dev_memory(dev, &free, &total);
             if (total > 0) {
-                const uint64_t available = (free > memory_margin) ? free - memory_margin : 0;
-                available_memory_per_device[dev] = available;
-                SRV_DBG("device %s: available memory after margin=%lu MiB\n",
-                    ggml_backend_dev_name(dev),
-                    (unsigned long)(available / (1024 * 1024)));
+                const size_t available = (free > memory_margin) ? free - memory_margin : 0;
+                dmm_available[dev] = available;
+                SRV_DBG("device %s: available memory after margin=%zu MiB\n",
+                    ggml_backend_dev_name(dev), available / (1024 * 1024));
             }
         }
     }
@@ -518,52 +517,57 @@ std::vector<server_model_meta> server_models::get_all_meta() {
     return result;
 }
 
-uint64_t server_models::get_memory_exceeded(const model_memory_map & new_model_memory_per_device) const {
-    model_memory_map total_memory_per_device;
+int server_models::can_fit(const device_memory_map & dmm_req) const {
+    device_memory_map dmm_total;
     for (const auto & m : mapping) {
         if (m.second.meta.is_running()) {
-            for (const auto & [key, value] : m.second.meta.memory_usage_per_device) {
-                total_memory_per_device[key] += value;
+            for (const auto & [dev, mem] : m.second.meta.dmm_req) {
+                dmm_total[dev] += mem;
             }
         }
     }
 
-    auto get = [](const model_memory_map & m, ggml_backend_dev_t k) {
-        auto it = m.find(k);
-        return it != m.end() ? it->second : 0;
+    auto get = [](const device_memory_map & dmm, ggml_backend_dev_t dev) {
+        auto it = dmm.find(dev);
+        return it != dmm.end() ? it->second : 0;
     };
 
-    size_t count_memory_exceeded = 0;
+    int res = 0;
 
-    for (const auto & [key, limit] : available_memory_per_device) {
-        const uint64_t total_memory = get(total_memory_per_device, key);
-        const uint64_t new_memory = get(new_model_memory_per_device, key);
-        SRV_DBG("device %s: total=%lu MB, new=%lu MB, limit=%lu MB\n",
-            ggml_backend_dev_name(key),
-            (unsigned long)(total_memory / (1024 * 1024)),
-            (unsigned long)(new_memory / (1024 * 1024)),
-            (unsigned long)(limit / (1024 * 1024)));
+    for (const auto & [dev, limit] : dmm_available) {
+        const size_t mem_total = get(dmm_total, dev);
+        const size_t mem_new   = get(dmm_req,   dev);
 
-        if (total_memory + new_memory > limit) {
-            count_memory_exceeded++;
+        SRV_DBG("device %s: total=%zu MiB, new=%zu MiB, limit=%zu MiB\n",
+            ggml_backend_dev_name(dev),
+            mem_total / (1024 * 1024), mem_new / (1024 * 1024), limit / (1024 * 1024));
+
+        if (mem_total + mem_new > limit) {
+            res++;
         }
     }
 
-    return count_memory_exceeded;
+    return res;
 }
 
-void server_models::unload_lru(const model_memory_map & new_model_memory_per_device) {
-    const bool check_memory = base_params.models_memory_margin > 0 && !available_memory_per_device.empty();
+void server_models::unload_lru(const device_memory_map & dmm_req) {
+    const bool check_active = base_params.models_max > 0;
+    const bool check_memory = base_params.models_memory_margin > 0;
 
-    if (base_params.models_max <= 0 && !check_memory) {
+    if (!check_active && !check_memory) {
         return; // no limit
     }
 
+    if (check_memory) {
+        GGML_ASSERT(!dmm_available.empty());
+    }
+
     while (true) {
-        std::string lru_model_name = "";
+        std::string lru_model_name;
         int64_t lru_last_used = ggml_time_ms();
-        size_t count_active = 0;
-        size_t count_memory_exceeded = 0;
+
+        int count_active = 0;
+        int count_exceed = 0;
         {
             std::unique_lock<std::mutex> lk(mutex);
             for (const auto & m : mapping) {
@@ -575,14 +579,17 @@ void server_models::unload_lru(const model_memory_map & new_model_memory_per_dev
                     }
                 }
             }
-            count_memory_exceeded = get_memory_exceeded(new_model_memory_per_device);
+            if (check_memory) {
+                count_exceed = can_fit(dmm_req);
+            }
         }
-        bool count_exceeded = base_params.models_max > 0 &&
-                              (count_active + 1) > (size_t)base_params.models_max;
 
-        if (!lru_model_name.empty() && (count_exceeded || count_memory_exceeded > 0)) {
-            SRV_INF("limits reached (count=%zu, memory margin exceeded on %zu device(s)), removing LRU name=%s\n",
-                    count_active, count_memory_exceeded,  lru_model_name.c_str());
+        const bool active_exceeded = check_active && count_active >= base_params.models_max;
+        const bool memory_exceeded = check_memory && count_exceed > 0;
+
+        if (!lru_model_name.empty() && (active_exceeded || memory_exceeded)) {
+            SRV_INF("limits reached (count=%d, memory margin exceeded on %d device(s)), removing LRU name=%s\n",
+                    count_active, count_exceed, lru_model_name.c_str());
             unload(lru_model_name);
             // wait for unload to complete
             {
@@ -597,11 +604,11 @@ void server_models::unload_lru(const model_memory_map & new_model_memory_per_dev
     }
 }
 
-static model_memory_map get_model_memory_per_device(const common_preset& preset) {
+static device_memory_map get_model_memory_per_device(const common_preset & preset) {
     common_params params;
     preset.apply_to_params(params);
 
-    if(params.model.path.empty()) {
+    if (params.model.path.empty()) {
         return {};
     }
 
@@ -641,7 +648,7 @@ static model_memory_map get_model_memory_per_device(const common_preset& preset)
         return {};
     }
 
-    model_memory_map result;
+    device_memory_map result;
     const size_t n_devs = ggml_backend_dev_count();
     for (size_t i = 0; i < n_devs; i++) {
         ggml_backend_dev_t dev = ggml_backend_dev_get(i);
@@ -659,18 +666,19 @@ void server_models::load(const std::string & name) {
         throw std::runtime_error("model name=" + name + " is not found");
     }
 
-    model_memory_map new_model_memory_per_device;
+    device_memory_map dmm_req;
     if (base_params.models_memory_margin > 0) {
+        // determine the required memory by the model upon its first load
         std::lock_guard<std::mutex> lk(mutex);
         auto & meta = mapping[name].meta;
-        if (meta.memory_usage_per_device.empty()) {
-            meta.memory_usage_per_device = get_model_memory_per_device(meta.preset);
+        if (meta.dmm_req.empty()) {
+            meta.dmm_req = get_model_memory_per_device(meta.preset);
         }
 
-        new_model_memory_per_device = meta.memory_usage_per_device;
+        dmm_req = meta.dmm_req;
     }
 
-    unload_lru(new_model_memory_per_device);
+    unload_lru(dmm_req);
 
     std::lock_guard<std::mutex> lk(mutex);
 
@@ -684,17 +692,24 @@ void server_models::load(const std::string & name) {
     // exceeding models_max. Without this, the window between unload_lru()
     // releasing its lock and this lock_guard acquiring allows multiple
     // threads to each observe capacity and all proceed to load.
-    if (base_params.models_max > 0 || base_params.models_memory_margin > 0) {
-        size_t count_active = 0;
-        for (const auto & m : mapping) {
-            if (m.second.meta.is_running()) {
-                count_active++;
+    {
+        const bool check_active = base_params.models_max > 0;
+        const bool check_memory = base_params.models_memory_margin > 0;
+
+        if (check_active || check_memory) {
+            int count_active = 0;
+            for (const auto & m : mapping) {
+                if (m.second.meta.is_running()) {
+                    count_active++;
+                }
+            }
+
+            const bool active_exceeded = check_active && count_active >= base_params.models_max;
+            const bool memory_exceeded = check_memory && can_fit(dmm_req) > 0;
+
+            if (active_exceeded || memory_exceeded) {
+                throw std::runtime_error("model limit reached, try again later");
             }
-        }
-        bool count_exceeded = base_params.models_max > 0 && count_active >= (size_t)base_params.models_max;
-        bool memory_exceeded = get_memory_exceeded(new_model_memory_per_device) > 0;
-        if (count_exceeded || memory_exceeded) {
-            throw std::runtime_error("model limit reached, try again later");
         }
     }
 
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index f86cc0b2cc4..567e716bce0 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -54,7 +54,7 @@ static std::string server_model_status_to_string(server_model_status status) {
     }
 }
 
-using model_memory_map = std::map<ggml_backend_dev_t, uint64_t>;
+using device_memory_map = std::map<ggml_backend_dev_t, size_t>;
 
 struct server_model_meta {
     common_preset preset;
@@ -64,7 +64,7 @@ struct server_model_meta {
     int port = 0;
     server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
     int64_t last_used = 0; // for LRU unloading
-    model_memory_map memory_usage_per_device; // bytes used per device
+    device_memory_map dmm_req; // bytes required per device
     std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
     int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
     int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
@@ -111,18 +111,20 @@ struct server_models {
     common_preset base_preset; // base preset from llama-server CLI args
 
     // available memory per device
-    model_memory_map available_memory_per_device;
+    device_memory_map dmm_available;
 
     void update_meta(const std::string & name, const server_model_meta & meta);
 
     // unload least recently used models if the limit is reached
-    void unload_lru(const model_memory_map & new_model_memory_per_device);
+    void unload_lru(const device_memory_map & dmm_req);
 
     // not thread-safe, caller must hold mutex
     void add_model(server_model_meta && meta);
 
+    // return number of devices where the memory limit would be exceeded
+    // return 0 if the new model would fit on all devices
     // not thread-safe, caller must hold mutex
-    uint64_t get_memory_exceeded(const model_memory_map & new_model_memory_per_device) const;
+    int can_fit(const device_memory_map & dmm_req) const;
 
 public:
     server_models(const common_params & params, int argc, char ** argv);

From b1623a614c682bad576ab7dc19cf613b2af94e6d Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Mon, 20 Apr 2026 14:48:55 +0200
Subject: [PATCH 13/15] handle models that need to be downloaded before
 estimation

---
 common/arg.cpp                 |   7 ++
 common/common.h                |   1 +
 tools/server/server-models.cpp | 122 ++++++++++++++++++++++++++++++++-
 tools/server/server-models.h   |  25 +++++--
 tools/server/server.cpp        |   5 ++
 5 files changed, 151 insertions(+), 9 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 7ba0f2fc256..710955a86fb 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3308,6 +3308,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.offline = true;
         }
     ).set_env("LLAMA_OFFLINE"));
+    add_opt(common_arg(
+        {"--download-only"},
+        "Download the model file(s) and exit",
+        [](common_params & params) {
+            params.download_only = true;
+        }
+    ));
     add_opt(common_arg(
         {"-lv", "--verbosity", "--log-verbosity"}, "N",
         string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
diff --git a/common/common.h b/common/common.h
index 2996d354049..066e5766502 100644
--- a/common/common.h
+++ b/common/common.h
@@ -482,6 +482,7 @@ struct common_params {
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end   = -1; // layer range for control vector
     bool    offline                    = false;
+    bool    download_only              = false; // only download the model if required, don't start the server
 
     int32_t ppl_stride      = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int32_t ppl_output_type = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 96a291854d6..9f34a8cbc18 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -604,12 +604,33 @@ void server_models::unload_lru(const device_memory_map & dmm_req) {
     }
 }
 
+static std::string resolve_model_path(const common_preset & preset) {
+    // returns the model path from the preset, or whatever common_download_model() reports
+    // with opts.offline set; empty string when the preset names no path, HF repo or URL
+    common_params params;
+    preset.apply_to_params(params);
+    if (!params.model.path.empty()) {
+        return params.model.path;
+    }
+
+    const bool has_remote = !params.model.hf_repo.empty() || !params.model.url.empty();
+    if (!has_remote) {
+        return "";
+    }
+    common_download_opts opts;
+    opts.offline = true;
+    return common_download_model(params.model, opts).model_path;
+}
+
 static device_memory_map get_model_memory_per_device(const common_preset & preset) {
     common_params params;
     preset.apply_to_params(params);
 
-    if (params.model.path.empty()) {
-        return {};
+    if (params.model.path.empty()) {
+        params.model.path = resolve_model_path(preset); // no explicit path: ask resolve_model_path()
+        if (params.model.path.empty()) {
+            return {};
+        }
     }
 
     struct log_ud_t {
@@ -661,11 +682,98 @@ static device_memory_map get_model_memory_per_device(const common_preset & prese
     return result;
 }
 
+bool server_models::download_model(const std::string & name) {
+    // blocking: runs a child with the model's preset args plus --download-only; true only on exit code 0
+    std::vector<std::string> child_args;
+    std::vector<std::string> child_env;
+    {
+        std::lock_guard<std::mutex> lk(mutex);
+        auto & meta = mapping[name].meta;
+        child_args = meta.preset.to_args(bin_path);
+        child_env  = base_env;
+    }
+    child_args.push_back("--download-only");
+    SRV_INF("downloading model name=%s\n", name.c_str());
+    std::vector<char *> argv = to_char_ptr_array(child_args);
+    std::vector<char *> envp = to_char_ptr_array(child_env);
+
+    subprocess_s proc;
+    int options = subprocess_option_no_window | subprocess_option_combined_stdout_stderr;
+    if (subprocess_create_ex(argv.data(), options, envp.data(), &proc) != 0) {
+        SRV_ERR("failed to spawn download process for model name=%s\n", name.c_str());
+        return false;
+    }
+
+    FILE * out = subprocess_stdout(&proc);
+    if (out) {
+        char buffer[4096];
+        while (fgets(buffer, sizeof(buffer), out) != nullptr) {
+            LOG("[dl:%s] %s", name.c_str(), buffer);
+        }
+    }
+
+    int exit_code = 0;
+    if (subprocess_join(&proc, &exit_code) != 0) {
+        exit_code = -1; // join failed: exit_code may be untouched, do not read it as success
+    }
+    subprocess_destroy(&proc);
+    if (exit_code != 0) {
+        SRV_ERR("download process for model name=%s exited with code %d\n", name.c_str(), exit_code);
+        return false;
+    }
+
+    SRV_INF("download complete for model name=%s\n", name.c_str());
+    return true;
+}
+
 void server_models::load(const std::string & name) {
     if (!has_model(name)) {
         throw std::runtime_error("model name=" + name + " is not found");
     }
 
+    {
+        common_preset preset_copy;
+        {
+            std::lock_guard<std::mutex> lk(mutex);
+            preset_copy = mapping[name].meta.preset;
+        }
+        if (resolve_model_path(preset_copy).empty()) {
+            {
+                std::lock_guard<std::mutex> lk(mutex);
+                auto & meta = mapping[name].meta;
+                if (meta.status != SERVER_MODEL_STATUS_UNLOADED) {
+                    return;
+                }
+                meta.status = SERVER_MODEL_STATUS_DOWNLOADING;
+                cv.notify_all();
+            }
+            std::thread([this, name]() {
+                if (!download_model(name)) {
+                    update_status(name, SERVER_MODEL_STATUS_UNLOADED, 1);
+                    return;
+                }
+                device_memory_map mem;
+                if (base_params.models_memory_margin > 0) {
+                    std::lock_guard<std::mutex> lk(mutex);
+                    auto & meta = mapping[name].meta;
+                    meta.dmm_req = get_model_memory_per_device(meta.preset);
+                    if (meta.dmm_req.empty()) {
+                        SRV_WRN("failed to estimate memory for model %s, memory limits will not apply\n", name.c_str());
+                    }
+                    mem = meta.dmm_req;
+                }
+                update_status(name, SERVER_MODEL_STATUS_UNLOADED, 0);
+                try {
+                    _load(name, mem);
+                } catch (const std::exception & e) {
+                    SRV_ERR("failed to load model %s after download: %s\n", name.c_str(), e.what());
+                    update_status(name, SERVER_MODEL_STATUS_UNLOADED, 1);
+                }
+            }).detach();
+            return;
+        }
+    }
+
     device_memory_map dmm_req;
     if (base_params.models_memory_margin > 0) {
         // determine the required memory by the model upon its first load
@@ -673,11 +781,18 @@ void server_models::load(const std::string & name) {
         auto & meta = mapping[name].meta;
         if (meta.dmm_req.empty()) {
             meta.dmm_req = get_model_memory_per_device(meta.preset);
+            if (meta.dmm_req.empty()) {
+                SRV_WRN("failed to estimate memory for model %s, memory limits will not apply\n", name.c_str());
+            }
         }
 
         dmm_req = meta.dmm_req;
     }
 
+    _load(name, dmm_req);
+}
+
+void server_models::_load(const std::string & name, const device_memory_map & dmm_req) {
     unload_lru(dmm_req);
 
     std::lock_guard<std::mutex> lk(mutex);
@@ -913,7 +1028,8 @@ void server_models::wait_until_loading_finished(const std::string & name) {
     cv.wait(lk, [this, &name]() {
         auto it = mapping.find(name);
         if (it != mapping.end()) {
-            return it->second.meta.status != SERVER_MODEL_STATUS_LOADING;
+            return it->second.meta.status != SERVER_MODEL_STATUS_LOADING &&
+                   it->second.meta.status != SERVER_MODEL_STATUS_DOWNLOADING;
         }
         return false;
     });
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index 567e716bce0..aa6abf7cac7 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -14,6 +14,9 @@
 /**
  * state diagram:
  *
+ *
+ *  ┌► DOWNLOADING ─┐
+ *  │               ▼
  * UNLOADED ──► LOADING ──► LOADED ◄──── SLEEPING
  *  ▲            │            │               ▲
  *  └───failed───┘            │               │
@@ -21,8 +24,8 @@
  *  └────────unloaded─────────┘
  */
 enum server_model_status {
-    // TODO: also add downloading state when the logic is added
     SERVER_MODEL_STATUS_UNLOADED,
+    SERVER_MODEL_STATUS_DOWNLOADING,
     SERVER_MODEL_STATUS_LOADING,
     SERVER_MODEL_STATUS_LOADED,
     SERVER_MODEL_STATUS_SLEEPING
@@ -32,6 +35,9 @@ static server_model_status server_model_status_from_string(const std::string & s
     if (status_str == "unloaded") {
         return SERVER_MODEL_STATUS_UNLOADED;
     }
+    if (status_str == "downloading") {
+        return SERVER_MODEL_STATUS_DOWNLOADING;
+    }
     if (status_str == "loading") {
         return SERVER_MODEL_STATUS_LOADING;
     }
@@ -46,11 +52,12 @@ static server_model_status server_model_status_from_string(const std::string & s
 
 static std::string server_model_status_to_string(server_model_status status) {
     switch (status) {
-        case SERVER_MODEL_STATUS_UNLOADED: return "unloaded";
-        case SERVER_MODEL_STATUS_LOADING:  return "loading";
-        case SERVER_MODEL_STATUS_LOADED:   return "loaded";
-        case SERVER_MODEL_STATUS_SLEEPING: return "sleeping";
-        default:                           return "unknown";
+        case SERVER_MODEL_STATUS_UNLOADED:     return "unloaded";
+        case SERVER_MODEL_STATUS_DOWNLOADING:  return "downloading";
+        case SERVER_MODEL_STATUS_LOADING:      return "loading";
+        case SERVER_MODEL_STATUS_LOADED:       return "loaded";
+        case SERVER_MODEL_STATUS_SLEEPING:     return "sleeping";
+        default:                               return "unknown";
     }
 }
 
@@ -126,6 +133,12 @@ struct server_models {
     // not thread-safe, caller must hold mutex
     int can_fit(const device_memory_map & dmm_req) const;
 
+    // download model files, blocking call (caller must NOT hold mutex)
+    bool download_model(const std::string & name);
+
+    // Internal helper for model loading
+    void _load(const std::string & name, const device_memory_map & dmm_req);
+
 public:
     server_models(const common_params & params, int argc, char ** argv);
 
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 6566949edf1..4ff962b89fc 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -83,6 +83,11 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    if (params.download_only) {
+        LOG_INF("%s: model downloaded successfully, exiting\n", __func__);
+        return 0;
+    }
+
     // validate batch size for embeddings
     // embeddings require all tokens to be processed in a single ubatch
     // see https://github.com/ggml-org/llama.cpp/issues/12836

From cf0ebc4e643155710c10f341ae7752b6f78ac456 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Tue, 21 Apr 2026 13:22:50 +0200
Subject: [PATCH 14/15] load directly from downloaded state

---
 tools/server/server-models.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 9f34a8cbc18..23ba9c944a1 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -762,7 +762,6 @@ void server_models::load(const std::string & name) {
                     }
                     mem = meta.dmm_req;
                 }
-                update_status(name, SERVER_MODEL_STATUS_UNLOADED, 0);
                 try {
                     _load(name, mem);
                 } catch (const std::exception & e) {
@@ -798,7 +797,7 @@ void server_models::_load(const std::string & name, const device_memory_map & dm
     std::lock_guard<std::mutex> lk(mutex);
 
     auto meta = mapping[name].meta;
-    if (meta.status != SERVER_MODEL_STATUS_UNLOADED) {
+    if (meta.status != SERVER_MODEL_STATUS_UNLOADED && meta.status != SERVER_MODEL_STATUS_DOWNLOADING) {
         SRV_INF("model %s is not ready\n", name.c_str());
         return;
     }

From a5355a02269570c4810105168e1bf779d3912a84 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Thu, 16 Apr 2026 13:40:13 +0200
Subject: [PATCH 15/15] server: keep router model refcount to avoid unloading
 models that have running requests

This avoids a livelock when models A and B don't fit
together but both have in-flight requests: without the
refcount the server loops unloading A, loading B,
unloading B, loading A again, and so on.
---
 tools/server/server-http.h             |   8 +-
 tools/server/server-models.cpp         |  66 ++++++++++++-
 tools/server/server-models.h           |   7 ++
 tools/server/tests/unit/test_router.py | 124 +++++++++++++++++++++++++
 4 files changed, 200 insertions(+), 5 deletions(-)

diff --git a/tools/server/server-http.h b/tools/server/server-http.h
index 68ae2170cf6..42ea8a8e992 100644
--- a/tools/server/server-http.h
+++ b/tools/server/server-http.h
@@ -28,7 +28,13 @@ struct server_http_res {
         return next != nullptr;
     }
 
-    virtual ~server_http_res() = default;
+    std::function<void()> on_destroy = nullptr;
+
+    virtual ~server_http_res() {
+        if (on_destroy) {
+            on_destroy();
+        }
+    }
 };
 
 // unique pointer, used by set_chunked_content_provider
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 23ba9c944a1..379b01a4f03 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -517,6 +517,19 @@ std::vector<server_model_meta> server_models::get_all_meta() {
     return result;
 }
 
+void server_models::inc_refs(const std::string & name) {
+    std::lock_guard<std::mutex> lk(mutex);
+    mapping[name].active_refs++;
+}
+
+void server_models::dec_refs(const std::string & name) {
+    { // find() never inserts an entry for an unknown name; the > 0 guard keeps the unsigned counter from wrapping
+        std::lock_guard<std::mutex> lk(mutex);
+        if (auto it = mapping.find(name); it != mapping.end() && it->second.active_refs > 0) { it->second.active_refs--; }
+    }
+    cv.notify_all(); // wake unload_lru(), which waits on cv for a running model with active_refs == 0
+}
+
 int server_models::can_fit(const device_memory_map & dmm_req) const {
     device_memory_map dmm_total;
     for (const auto & m : mapping) {
@@ -573,7 +586,8 @@ void server_models::unload_lru(const device_memory_map & dmm_req) {
             for (const auto & m : mapping) {
                 if (m.second.meta.is_running()) {
                     count_active++;
-                    if (m.second.meta.last_used < lru_last_used) {
+                    // Only consider idle models
+                    if (m.second.active_refs == 0 && m.second.meta.last_used < lru_last_used) {
                         lru_model_name = m.first;
                         lru_last_used = m.second.meta.last_used;
                     }
@@ -598,6 +612,21 @@ void server_models::unload_lru(const device_memory_map & dmm_req) {
                     return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED;
                 });
             }
+        } else if (count_active > 0 && (active_exceeded || memory_exceeded)) {
+            // no running model has active_refs == 0: wait (bounded) until one becomes idle
+            std::unique_lock<std::mutex> lk(mutex);
+            bool drained = cv.wait_for(lk, std::chrono::seconds(DEFAULT_STOP_TIMEOUT), [this]() {
+                for (const auto & m : mapping) {
+                    if (m.second.meta.is_running() && m.second.active_refs == 0) {
+                        return true;
+                    }
+                }
+                return false;
+            });
+            if (!drained) {
+                SRV_WRN("%s", "drain timeout, no idle model to evict\n");
+                break;
+            }
         } else {
             break;
         }
@@ -833,6 +862,7 @@ void server_models::_load(const std::string & name, const device_memory_map & dm
     inst.meta.port      = get_free_port();
     inst.meta.status    = SERVER_MODEL_STATUS_LOADING;
     inst.meta.last_used = ggml_time_ms();
+    inst.active_refs = mapping[name].active_refs;
 
     if (inst.meta.port <= 0) {
         throw std::runtime_error("failed to get a port number");
@@ -1168,10 +1198,18 @@ static bool router_validate_model(std::string & name, server_models & models, bo
     }
     // resolve alias to canonical model name
     name = meta->name;
+    // To avoid unloading a model before it is loaded, protect with increased ref count before it starts loading
+    models.inc_refs(name);
     if (models_autoload) {
-        models.ensure_model_ready(name);
+        try {
+            models.ensure_model_ready(name);
+        } catch (...) {
+            models.dec_refs(name);
+            throw;
+        }
     } else {
         if (!meta->is_running()) {
+            models.dec_refs(name);
             res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
             return false;
         }
@@ -1222,7 +1260,17 @@ void server_models_routes::init_routes() {
         if (!router_validate_model(name, models, autoload, error_res)) {
             return error_res;
         }
-        return models.proxy_request(req, method, name, false);
+        server_http_res_ptr proxy;
+        try {
+            proxy = models.proxy_request(req, method, name, false);
+        } catch(...) {
+            models.dec_refs(name);
+            throw;
+        }
+        proxy->on_destroy = [this, name]() {
+            this->models.dec_refs(name);
+        };
+        return proxy;
     };
 
     this->proxy_post = [this](const server_http_req & req) {
@@ -1234,7 +1282,17 @@ void server_models_routes::init_routes() {
         if (!router_validate_model(name, models, autoload, error_res)) {
             return error_res;
         }
-        return models.proxy_request(req, method, name, true); // update last usage for POST request only
+        server_http_res_ptr proxy;
+        try {
+            proxy = models.proxy_request(req, method, name, true); // update last usage for POST request only
+        } catch(...) {
+            models.dec_refs(name);
+            throw;
+        }
+        proxy->on_destroy = [this, name]() {
+            this->models.dec_refs(name);
+        };
+        return proxy;
     };
 
     this->post_router_models_load = [this](const server_http_req & req) {
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index aa6abf7cac7..36cd0296f60 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -100,6 +100,7 @@ struct server_models {
         std::thread th;
         server_model_meta meta;
         FILE * stdin_file = nullptr;
+        uint64_t active_refs = 0;
     };
 
     std::mutex mutex;
@@ -174,6 +175,12 @@ struct server_models {
     // proxy an HTTP request to the model instance
     server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used);
 
+    // Increase instance ref counter
+    void inc_refs(const std::string & name);
+
+    // Decrease instance ref counter
+    void dec_refs(const std::string & name);
+
     // return true if the current process is a child server instance
     static bool is_child_server();
 
diff --git a/tools/server/tests/unit/test_router.py b/tools/server/tests/unit/test_router.py
index 79e60db4083..d471ff88b55 100644
--- a/tools/server/tests/unit/test_router.py
+++ b/tools/server/tests/unit/test_router.py
@@ -1,4 +1,5 @@
 import pytest
+import threading
 from utils import *
 
 server: ServerProcess
@@ -205,3 +206,126 @@ def test_router_api_key_required():
     )
     assert authed.status_code == 200
     assert "error" not in authed.body
+
+
+# --- Drain-aware eviction tests ---
+
+
+def _make_completion(model_id: str, max_tokens: int = 16) -> dict:
+    """POST one non-streaming chat completion and report the outcome without raising.
+
+    Returns {"content": str, "error": str | None}; "error" is set on a non-200
+    status or on any exception raised while sending or reading the response.
+    """
+    payload = {"model": model_id, "max_tokens": max_tokens, "messages": [{"role": "user", "content": "hi"}]}
+    try:
+        res = server.make_request("POST", "/v1/chat/completions", data=payload)
+        if res.status_code != 200:
+            return {"content": "", "error": f"status {res.status_code}: {res.body}"}
+        # A 200 reply without choices is reported as empty content, not as an error.
+        choices = res.body.get("choices", [])
+        text = choices[0].get("message", {}).get("content", "") if choices else ""
+        return {"content": text, "error": None}
+    except Exception as exc:
+        # Broad catch on purpose: callers inspect the "error" field instead of handling exceptions.
+        return {"content": "", "error": str(exc)}
+
+
+def test_router_concurrent_no_thrashing():
+    """Concurrent requests for different models should all succeed, not thrash."""
+    global server
+    server = ServerPreset.router()
+    # with a limit of one loaded model, the two models below cannot both stay loaded
+    server.models_max = 1
+    server.start()
+
+    model_a = "ggml-org/tinygemma3-GGUF:Q8_0"
+    model_b = "ggml-org/test-model-stories260K:F32"
+    n_per_model = 3
+    expected = [(m, i) for i in range(n_per_model) for m in (model_a, model_b)]
+    results = {}
+
+    def send_request(model_id, idx):
+        results[(model_id, idx)] = _make_completion(model_id)
+
+    threads = [threading.Thread(target=send_request, args=key) for key in expected]
+    for t in threads:
+        t.start()
+    for t in threads:
+        t.join(timeout=300)
+
+    # join() returns silently on timeout, so a request that stored no result must count as a failure
+    outcome = {key: results.get(key, {"error": "no result within timeout"}) for key in expected}
+    failures = [f"{m} #{i}: {r['error']}" for (m, i), r in outcome.items() if r["error"] is not None]
+    assert len(failures) == 0, f"{len(failures)} request(s) failed:\n" + "\n".join(failures)
+
+
+def test_router_concurrent_partial_capacity():
+    """With models_max=2 and 3 models, concurrent requests should all succeed."""
+    global server
+    server = ServerPreset.router()
+    server.models_max = 2
+    server.start()
+
+    models = [
+        "ggml-org/tinygemma3-GGUF:Q8_0",
+        "ggml-org/test-model-stories260K:F32",
+        "ggml-org/test-model-stories260K-infill:F32",
+    ]
+    # two requests per model, started in the same order as the models are listed
+    expected = [(model, i) for model in models for i in range(2)]
+    results = {}
+
+    def send_request(model_id, idx):
+        results[(model_id, idx)] = _make_completion(model_id)
+
+    threads = [threading.Thread(target=send_request, args=key) for key in expected]
+    for t in threads:
+        t.start()
+    for t in threads:
+        t.join(timeout=300)
+
+    # join() returns silently on timeout, so a request that stored no result must count as a failure
+    outcome = {key: results.get(key, {"error": "no result within timeout"}) for key in expected}
+    failures = [f"{m} #{i}: {r['error']}" for (m, i), r in outcome.items() if r["error"] is not None]
+    assert len(failures) == 0, f"{len(failures)} request(s) failed:\n" + "\n".join(failures)
+
+
+def test_router_alternating_requests():
+    """Switch back and forth between two models for three rounds; every request must succeed."""
+    global server
+    server = ServerPreset.router()
+    server.models_max = 1
+    server.start()
+
+    # (label, model id) pairs, requested in this order on every round
+    labelled = (("A", "ggml-org/tinygemma3-GGUF:Q8_0"), ("B", "ggml-org/test-model-stories260K:F32"))
+
+    for round_no in range(3):
+        for label, model_id in labelled:
+            outcome = _make_completion(model_id)
+            err = outcome["error"]
+            assert err is None, f"Round {round_no} model {label} failed: {err}"
+
+
+def test_router_concurrent_same_model():
+    """Concurrent requests for the same model should all succeed."""
+    global server
+    server = ServerPreset.router()
+    server.models_max = 1
+    server.start()
+
+    model_id = "ggml-org/tinygemma3-GGUF:Q8_0"
+    results = {}
+
+    def send_request(idx):
+        results[idx] = _make_completion(model_id)
+
+    threads = [threading.Thread(target=send_request, args=(i,)) for i in range(6)]
+    for t in threads:
+        t.start()
+    for t in threads:
+        t.join(timeout=300)  # returns silently on timeout, so missing results are turned into failures below
+    outcome = {i: results.get(i, {"error": "no result within timeout"}) for i in range(len(threads))}
+    failures = [f"#{i}: {r['error']}" for i, r in outcome.items() if r["error"] is not None]
+    assert len(failures) == 0, f"{len(failures)} request(s) failed:\n" + "\n".join(failures)