diff --git a/common/arg.cpp b/common/arg.cpp
index e0f6c606608..91ce3a95f5b 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3127,6 +3127,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.models_max = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
+    add_opt(common_arg(
+        {"--models-memory-margin"}, "N",
+        string_format("for router server, MiB of memory to leave free, per device (default: %d, 0 = disabled)", params.models_memory_margin),
+        [](common_params & params, int value) {
+            params.models_memory_margin = value; // in MiB; 0 turns the per-device memory check off
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MEMORY_MARGIN"));
     add_opt(common_arg(
         {"--models-autoload"},
         {"--no-models-autoload"},
@@ -3356,6 +3363,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.offline = true;
         }
     ).set_env("LLAMA_ARG_OFFLINE"));
+    add_opt(common_arg(
+        {"--download-only"},
+        "Download the model file(s) and exit",
+        [](common_params & params) {
+            params.download_only = true;
+        }
+    ));
+    add_opt(common_arg(
+        {"--measure-only"},
+        "Load the model to measure memory requirements, print to stdout, then exit",
+        [](common_params & params) {
+            params.measure_only = true;
+        }
+    ));
     add_opt(common_arg(
         {"-lv", "--verbosity", "--log-verbosity"}, "N",
         string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
diff --git a/common/common.h b/common/common.h
index 99898800d1d..34dcc9dd2dc 100644
--- a/common/common.h
+++ b/common/common.h
@@ -508,6 +508,8 @@ struct common_params {
     int32_t control_vector_layer_end   = -1; // layer range for control vector
     bool    offline                    = false;
     bool    skip_download              = false; // skip model file downloading
+    bool    download_only              = false; // only download the model if required, don't start the server
+    bool    measure_only               = false; // load model with no_alloc to measure memory, print to stdout, then exit
 
     int32_t ppl_stride      = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int32_t ppl_output_type = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@@ -637,10 +639,11 @@ struct common_params {
     std::vector<std::string> server_tools;
 
     // router server configs
-    std::string models_dir    = ""; // directory containing models for the router server
-    std::string models_preset = ""; // directory containing model presets for the router server
-    int models_max = 4;             // maximum number of models to load simultaneously
-    bool models_autoload = true;    // automatically load models when requested via the router server
+    std::string models_dir    = "";  // directory containing models for the router server
+    std::string models_preset = "";  // directory containing model presets for the router server
+    int models_max = 4;              // maximum number of models to load simultaneously
+    int models_memory_margin = 1024; // MiB of free memory to preserve per device (0 = disabled)
+    bool models_autoload = true;     // automatically load models when requested via the router server
 
     bool log_json = false;
 
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 49b0e423f46..ea6dd2d4014 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -98,6 +98,7 @@ static void unset_reserved_args(common_preset & preset, bool unset_model_args) {
     preset.unset_option("LLAMA_API_KEY");
     preset.unset_option("LLAMA_ARG_MODELS_DIR");
     preset.unset_option("LLAMA_ARG_MODELS_MAX");
+    preset.unset_option("LLAMA_ARG_MODELS_MEMORY_MARGIN");
     preset.unset_option("LLAMA_ARG_MODELS_PRESET");
     preset.unset_option("LLAMA_ARG_MODELS_AUTOLOAD");
     if (unset_model_args) {
@@ -213,9 +214,39 @@ server_models::server_models(
         bin_path = get_server_exec_path().string();
     } catch (const std::exception & e) {
         bin_path = argv[0];
-        LOG_WRN("failed to get server executable path: %s\n", e.what());
-        LOG_WRN("using original argv[0] as fallback: %s\n", argv[0]);
+        SRV_WRN("failed to get server executable path: %s\n", e.what());
+        SRV_WRN("using original argv[0] as fallback: %s\n", argv[0]);
     }
+
+    const size_t memory_margin = (size_t) base_params.models_memory_margin * 1024 * 1024;
+
+    if (memory_margin > 0) {
+        ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        ggml_backend_buffer_type_t cpu_buft = cpu_dev ? ggml_backend_dev_buffer_type(cpu_dev) : nullptr;
+
+        const size_t n_devs = ggml_backend_dev_count();
+        for (size_t i = 0; i < n_devs; i++) {
+            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+            ggml_backend_buffer_type_t dev_buft = ggml_backend_dev_buffer_type(dev);
+            if (dev_buft) {
+                buft_by_name[ggml_backend_buft_name(dev_buft)] = dev_buft;
+            }
+            ggml_backend_buffer_type_t host_buft = ggml_backend_dev_host_buffer_type(dev);
+            if (host_buft && cpu_buft) {
+                buft_by_name[ggml_backend_buft_name(host_buft)] = cpu_buft;
+            }
+
+            size_t free, total;
+            ggml_backend_dev_memory(dev, &free, &total);
+            if (total > 0 && dev_buft) {
+                const size_t available = (free > memory_margin) ? free - memory_margin : 0;
+                bmm_available[dev_buft] = available;
+                SRV_DBG("buft %s: available memory after margin=%zu MiB\n",
+                    ggml_backend_buft_name(dev_buft), available / (1024 * 1024));
+            }
+        }
+    }
+
     load_models();
 }
 
@@ -379,6 +410,7 @@ void server_models::load_models() {
                 /* port          */ 0,
                 /* status        */ SERVER_MODEL_STATUS_UNLOADED,
                 /* last_used     */ 0,
+                /* bmm_req       */ {},
                 /* args          */ std::vector<std::string>(),
                 /* loaded_info   */ {},
                 /* exit_code     */ 0,
@@ -533,6 +565,7 @@ void server_models::load_models() {
                     /* port          */ 0,
                     /* status        */ SERVER_MODEL_STATUS_UNLOADED,
                     /* last_used     */ 0,
+                    /* bmm_req       */ {},
                     /* args          */ std::vector<std::string>(),
                     /* loaded_info   */ {},
                     /* exit_code     */ 0,
@@ -692,30 +725,87 @@ std::vector<server_model_meta> server_models::get_all_meta() {
     return result;
 }
 
-void server_models::unload_lru() {
-    if (base_params.models_max <= 0) {
-        return; // no limit
+int server_models::can_fit(const buft_memory_map & bmm_req) const {
+    buft_memory_map in_use; // bytes claimed per buffer type by running instances
+    for (const auto & entry : mapping) {
+        const auto & meta = entry.second.meta;
+        if (!meta.is_running()) {
+            continue;
+        }
+        for (const auto & [buft, bytes] : meta.bmm_req) {
+            in_use[buft] += bytes;
+        }
     }
-    // remove one of the servers if we passed the models_max (least recently used - LRU)
-    std::string lru_model_name = "";
-    int64_t lru_last_used = ggml_time_ms();
-    size_t count_active = 0;
-    {
-        std::unique_lock<std::mutex> lk(mutex);
-        for (const auto & m : mapping) {
-            if (m.second.meta.is_running()) {
-                count_active++;
-                if (m.second.meta.last_used < lru_last_used) {
+
+    // bytes recorded for a buffer type, 0 when the map has no entry for it
+    auto bytes_for = [](const buft_memory_map & bmm, ggml_backend_buffer_type_t buft) -> size_t {
+        const auto found = bmm.find(buft);
+        return found == bmm.end() ? 0 : found->second;
+    };
+
+    int n_over = 0;
+    for (const auto & [buft, limit] : bmm_available) {
+        const size_t used      = bytes_for(in_use,  buft);
+        const size_t requested = bytes_for(bmm_req, buft);
+        SRV_DBG("buft %s: total=%zu MiB, new=%zu MiB, limit=%zu MiB\n",
+            ggml_backend_buft_name(buft),
+            used / (1024 * 1024), requested / (1024 * 1024), limit / (1024 * 1024));
+        if (used + requested > limit) {
+            n_over++;
+        }
+    }
+
+    return n_over;
+}
+
+bool server_models::limits_exceeded(const buft_memory_map & bmm_req) const {
+    const int  max_models = base_params.models_max;
+    const bool cap_count  = max_models > 0;
+    const bool cap_memory = base_params.models_memory_margin > 0;
+    if (!(cap_count || cap_memory)) {
+        return false; // neither limit is configured
+    }
+
+    // count the instances for which is_running() holds
+    int n_running = 0;
+    for (const auto & entry : mapping) {
+        n_running += entry.second.meta.is_running() ? 1 : 0;
+    }
+
+    // evaluate both checks unconditionally, as can_fit() also emits debug logs
+    const bool too_many  = cap_count  && n_running >= max_models;
+    const bool too_large = cap_memory && can_fit(bmm_req) > 0;
+
+    return too_many || too_large;
+}
+
+void server_models::unload_lru(const buft_memory_map & bmm_req) {
+    if (base_params.models_memory_margin > 0) {
+        GGML_ASSERT(!bmm_available.empty());
+    }
+
+    while (true) {
+        std::string lru_model_name;
+        {
+            std::unique_lock<std::mutex> lk(mutex);
+            if (!limits_exceeded(bmm_req)) {
+                break;
+            }
+            int64_t lru_last_used = ggml_time_ms();
+            for (const auto & m : mapping) {
+                if (m.second.meta.is_running() && m.second.meta.last_used < lru_last_used) {
                     lru_model_name = m.first;
-                    lru_last_used = m.second.meta.last_used;
+                    lru_last_used  = m.second.meta.last_used;
                 }
             }
         }
-    }
-    if (!lru_model_name.empty() && count_active >= (size_t)base_params.models_max) {
-        SRV_INF("models_max limit reached, removing LRU name=%s\n", lru_model_name.c_str());
+
+        if (lru_model_name.empty()) {
+            break;
+        }
+
+        SRV_INF("limits exceeded, removing LRU name=%s\n", lru_model_name.c_str());
         unload(lru_model_name);
-        // wait for unload to complete
         {
             std::unique_lock<std::mutex> lk(mutex);
             cv.wait(lk, [this, &lru_model_name]() {
@@ -725,11 +815,238 @@ void server_models::unload_lru() {
     }
 }
 
+// Local model file for a preset, or "" when none can be determined.
+static std::string resolve_model_path(const common_preset & preset) {
+    common_params resolved;
+    preset.apply_to_params(resolved);
+    auto & model = resolved.model;
+
+    if (!model.path.empty()) {
+        return model.path; // an explicit path is used as-is
+    }
+    if (model.hf_repo.empty() && model.url.empty()) {
+        return ""; // no remote source to look up either
+    }
+    // remote source: query the download helper with opts.offline set
+    common_download_opts dl_opts;
+    dl_opts.offline = true;
+    return common_download_model(model, dl_opts).model_path;
+}
+
+// Run a child `<server binary> ... --download-only` for model `name`, forwarding its output to the log.
+// Returns true only when the child was spawned, could be joined and exited with code 0.
+bool server_models::download_model(const std::string & name) {
+    std::vector<std::string> child_args;
+    std::vector<std::string> child_env;
+    {
+        std::lock_guard<std::mutex> lk(mutex);
+        child_args = mapping[name].meta.preset.to_args(bin_path);
+        child_env  = base_env;
+    }
+    child_args.push_back("--download-only");
+    SRV_INF("downloading model name=%s\n", name.c_str());
+
+    std::vector<char *> argv = to_char_ptr_array(child_args);
+    std::vector<char *> envp = to_char_ptr_array(child_env);
+    subprocess_s proc;
+    int options = subprocess_option_no_window | subprocess_option_combined_stdout_stderr;
+    if (subprocess_create_ex(argv.data(), options, envp.data(), &proc) != 0) {
+        SRV_ERR("failed to spawn download process for model name=%s\n", name.c_str());
+        return false;
+    }
+
+    // forward the child's combined stdout/stderr line by line until the stream ends
+    FILE * out = subprocess_stdout(&proc);
+    if (out) {
+        char buffer[4096];
+        while (fgets(buffer, sizeof(buffer), out) != nullptr) {
+            LOG("[dl:%s] %s", name.c_str(), buffer);
+        }
+    }
+
+    // a failed join must not be mistaken for a clean exit: exit_code may still hold its initial 0
+    int exit_code = 0;
+    const bool joined = subprocess_join(&proc, &exit_code) == 0;
+    subprocess_destroy(&proc);
+    if (!joined || exit_code != 0) {
+        SRV_ERR("download process for model name=%s exited with code %d\n", name.c_str(), joined ? exit_code : -1);
+        return false;
+    }
+
+    SRV_INF("download complete for model name=%s\n", name.c_str());
+    return true;
+}
+
+// Spawn a child with `--measure-only --offline` for model `name` and sum the sizes from the
+// "measure:<buft name> <bytes>" lines in its output. Returns an empty map if the child fails.
+buft_memory_map server_models::estimate_model_memory(const std::string & name) {
+    std::vector<std::string> child_args;
+    std::vector<std::string> child_env;
+    {
+        std::lock_guard<std::mutex> lk(mutex);
+        child_args = mapping[name].meta.preset.to_args(bin_path);
+        child_env  = base_env;
+    }
+    child_args.push_back("--measure-only");
+    child_args.push_back("--offline");
+    SRV_INF("estimating memory for model name=%s\n", name.c_str());
+
+    std::vector<char *> argv = to_char_ptr_array(child_args);
+    std::vector<char *> envp = to_char_ptr_array(child_env);
+    subprocess_s proc;
+    int options = subprocess_option_no_window | subprocess_option_combined_stdout_stderr;
+    if (subprocess_create_ex(argv.data(), options, envp.data(), &proc) != 0) {
+        SRV_ERR("failed to spawn measure process for model name=%s\n", name.c_str());
+        return {};
+    }
+
+    buft_memory_map result;
+    FILE * out = subprocess_stdout(&proc);
+    if (out) {
+        char buffer[4096];
+        while (fgets(buffer, sizeof(buffer), out) != nullptr) {
+            LOG("[measure:%s] %s", name.c_str(), buffer);
+            std::string line(buffer);
+            if (!string_starts_with(line, "measure:")) {
+                continue; // regular output of the child
+            }
+            std::istringstream iss(line.substr(strlen("measure:")));
+            std::string buft_name;
+            size_t size = 0;
+            if (!(iss >> buft_name >> size)) {
+                continue; // malformed measure line, ignored
+            }
+            auto it = buft_by_name.find(buft_name);
+            if (it != buft_by_name.end()) {
+                result[it->second] += size;
+            } else {
+                SRV_WRN("unknown buft name '%s' from measure child for model name=%s\n",
+                    buft_name.c_str(), name.c_str());
+            }
+        }
+    }
+
+    // a failed join must not be mistaken for a clean exit: exit_code may still hold its initial 0
+    int exit_code = 0;
+    const bool joined = subprocess_join(&proc, &exit_code) == 0;
+    subprocess_destroy(&proc);
+    if (!joined || exit_code != 0) {
+        SRV_ERR("measure process for model name=%s exited with code %d\n", name.c_str(), joined ? exit_code : -1);
+        return {};
+    }
+    SRV_INF("memory estimation complete for model name=%s\n", name.c_str());
+    return result;
+}
+
+void server_models::join_completed_bg_tasks() {
+    // finished tasks are moved out under the lock and joined after it is released
+    std::vector<std::unique_ptr<bg_task>> finished;
+    {
+        std::lock_guard<std::mutex> lk(mutex);
+        for (auto it = bg_tasks.begin(); it != bg_tasks.end(); ) {
+            const auto cur = it++; // advance first: erase() below only invalidates cur
+            if (cur->second->done.load()) {
+                finished.push_back(std::move(cur->second));
+                bg_tasks.erase(cur);
+            }
+        }
+    }
+    for (auto & task : finished) {
+        if (task->th.joinable()) {
+            task->th.join();
+        }
+    }
+}
+
 void server_models::load(const std::string & name) {
     if (!has_model(name)) {
         throw std::runtime_error("model name=" + name + " is not found");
     }
-    unload_lru();
+    // reap background pipelines that already finished
+    join_completed_bg_tasks();
+
+    common_preset preset_copy;
+    {
+        std::lock_guard<std::mutex> lk(mutex);
+        preset_copy = mapping[name].meta.preset;
+    }
+    // no local model file: download, measure and load on a background thread
+    if (resolve_model_path(preset_copy).empty()) {
+        std::unique_ptr<bg_task> stale; // finished task of this model, not joined yet
+        {
+            std::lock_guard<std::mutex> lk(mutex);
+            auto & meta = mapping[name].meta;
+            if (meta.status != SERVER_MODEL_STATUS_UNLOADED) {
+                return;
+            }
+            // skip if a bg_task is already running for this model
+            auto bg_it = bg_tasks.find(name);
+            if (bg_it != bg_tasks.end()) {
+                if (!bg_it->second->done.load()) {
+                    return;
+                }
+                // take it out now: overwriting the slot would destroy a joinable thread
+                stale = std::move(bg_it->second);
+                bg_tasks.erase(bg_it);
+            }
+            meta.status = SERVER_MODEL_STATUS_DOWNLOADING;
+            cv.notify_all();
+        }
+        if (stale && stale->th.joinable()) {
+            stale->th.join(); // joined outside the lock
+        }
+
+        auto task = std::make_unique<bg_task>();
+        auto * task_ptr = task.get();
+        task->th = std::thread([this, name, task_ptr]() {
+            if (!download_model(name)) {
+                update_status(name, SERVER_MODEL_STATUS_UNLOADED, 1);
+                task_ptr->done.store(true);
+                return;
+            }
+            buft_memory_map mem;
+            if (base_params.models_memory_margin > 0) {
+                mem = estimate_model_memory(name);
+                if (mem.empty()) {
+                    SRV_WRN("failed to estimate memory for model %s, memory limits will not apply\n", name.c_str());
+                }
+                std::lock_guard<std::mutex> lk(mutex);
+                mapping[name].meta.bmm_req = mem;
+            }
+            try {
+                _load(name, mem);
+            } catch (const std::exception & e) {
+                SRV_ERR("failed to load model %s after download: %s\n", name.c_str(), e.what());
+                update_status(name, SERVER_MODEL_STATUS_UNLOADED, 1);
+            }
+            task_ptr->done.store(true);
+        });
+        std::lock_guard<std::mutex> lk(mutex);
+        bg_tasks[name] = std::move(task);
+        return;
+    }
+
+    buft_memory_map bmm_req;
+    if (base_params.models_memory_margin > 0) {
+        {
+            std::lock_guard<std::mutex> lk(mutex);
+            bmm_req = mapping[name].meta.bmm_req;
+        }
+        if (bmm_req.empty()) {
+            bmm_req = estimate_model_memory(name);
+            if (bmm_req.empty()) {
+                SRV_WRN("failed to estimate memory for model %s, memory limits will not apply\n", name.c_str());
+            }
+            std::lock_guard<std::mutex> lk(mutex);
+            mapping[name].meta.bmm_req = bmm_req;
+        }
+    }
+
+    _load(name, bmm_req);
+}
+
+void server_models::_load(const std::string & name, const buft_memory_map & bmm_req) {
+    unload_lru(bmm_req);
 
     std::unique_lock<std::mutex> lk(mutex);
     // edge case: block until any in-progress reload has finished so we always load
@@ -737,7 +1054,7 @@ void server_models::load(const std::string & name) {
     cv.wait(lk, [this]() { return !is_reloading; });
 
     auto meta = mapping[name].meta;
-    if (meta.status != SERVER_MODEL_STATUS_UNLOADED) {
+    if (meta.status != SERVER_MODEL_STATUS_UNLOADED && meta.status != SERVER_MODEL_STATUS_DOWNLOADING) {
         SRV_INF("model %s is not ready\n", name.c_str());
         return;
     }
@@ -746,16 +1063,8 @@ void server_models::load(const std::string & name) {
     // exceeding models_max. Without this, the window between unload_lru()
     // releasing its lock and this lock_guard acquiring allows multiple
     // threads to each observe capacity and all proceed to load.
-    if (base_params.models_max > 0) {
-        size_t count_active = 0;
-        for (const auto & m : mapping) {
-            if (m.second.meta.is_running()) {
-                count_active++;
-            }
-        }
-        if (count_active >= (size_t)base_params.models_max) {
-            throw std::runtime_error("model limit reached, try again later");
-        }
+    if (limits_exceeded(bmm_req)) {
+        throw std::runtime_error("model limit reached, try again later");
     }
 
     // prepare new instance info
@@ -931,6 +1240,7 @@ void server_models::unload(const std::string & name) {
 
 void server_models::unload_all() {
     std::vector<std::thread> to_join;
+    std::vector<std::unique_ptr<bg_task>> bg_to_join;
     {
         std::lock_guard<std::mutex> lk(mutex);
         for (auto & [name, inst] : mapping) {
@@ -943,15 +1253,26 @@ void server_models::unload_all() {
             // moving the thread to join list to avoid deadlock
             to_join.push_back(std::move(inst.th));
         }
+        for (auto & [name, task] : bg_tasks) {
+            bg_to_join.push_back(std::move(task));
+        }
+        bg_tasks.clear();
     }
     for (auto & th : to_join) {
         if (th.joinable()) {
             th.join();
         }
     }
+    for (auto & task : bg_to_join) {
+        if (task && task->th.joinable()) {
+            task->th.join();
+        }
+    }
 }
 
 void server_models::update_status(const std::string & name, server_model_status status, int exit_code) {
+    join_completed_bg_tasks();
+
     std::unique_lock<std::mutex> lk(mutex);
     auto it = mapping.find(name);
     if (it != mapping.end()) {
@@ -990,7 +1311,8 @@ void server_models::wait_until_loading_finished(const std::string & name) {
     cv.wait(lk, [this, &name]() {
         auto it = mapping.find(name);
         if (it != mapping.end()) {
-            return it->second.meta.status != SERVER_MODEL_STATUS_LOADING;
+            return it->second.meta.status != SERVER_MODEL_STATUS_LOADING &&
+                   it->second.meta.status != SERVER_MODEL_STATUS_DOWNLOADING;
         }
         return false;
     });
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index 2198589a7aa..69bbf87bdc2 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -5,15 +5,20 @@
 #include "server-common.h"
 #include "server-http.h"
 
+#include <atomic>
 #include <mutex>
 #include <condition_variable>
 #include <functional>
 #include <memory>
 #include <set>
+#include <unordered_map>
 
 /**
  * state diagram:
  *
+ *
+ *  ┌► DOWNLOADING ─┐
+ *  │               ▼
  * UNLOADED ──► LOADING ──► LOADED ◄──── SLEEPING
  *  ▲            │            │               ▲
  *  └───failed───┘            │               │
@@ -21,8 +26,8 @@
  *  └────────unloaded─────────┘
  */
 enum server_model_status {
-    // TODO: also add downloading state when the logic is added
     SERVER_MODEL_STATUS_UNLOADED,
+    SERVER_MODEL_STATUS_DOWNLOADING,
     SERVER_MODEL_STATUS_LOADING,
     SERVER_MODEL_STATUS_LOADED,
     SERVER_MODEL_STATUS_SLEEPING
@@ -32,6 +37,9 @@ static server_model_status server_model_status_from_string(const std::string & s
     if (status_str == "unloaded") {
         return SERVER_MODEL_STATUS_UNLOADED;
     }
+    if (status_str == "downloading") {
+        return SERVER_MODEL_STATUS_DOWNLOADING;
+    }
     if (status_str == "loading") {
         return SERVER_MODEL_STATUS_LOADING;
     }
@@ -46,14 +54,17 @@ static server_model_status server_model_status_from_string(const std::string & s
 
 static std::string server_model_status_to_string(server_model_status status) {
     switch (status) {
-        case SERVER_MODEL_STATUS_UNLOADED: return "unloaded";
-        case SERVER_MODEL_STATUS_LOADING:  return "loading";
-        case SERVER_MODEL_STATUS_LOADED:   return "loaded";
-        case SERVER_MODEL_STATUS_SLEEPING: return "sleeping";
-        default:                           return "unknown";
+        case SERVER_MODEL_STATUS_UNLOADED:    return "unloaded";
+        case SERVER_MODEL_STATUS_DOWNLOADING: return "downloading";
+        case SERVER_MODEL_STATUS_LOADING:     return "loading";
+        case SERVER_MODEL_STATUS_LOADED:      return "loaded";
+        case SERVER_MODEL_STATUS_SLEEPING:    return "sleeping";
+        default:                              return "unknown"; // value outside the enum
     }
 }
 
+using buft_memory_map = std::map<ggml_backend_buffer_type_t, size_t>;
+
 struct server_model_meta {
     common_preset preset;
     std::string name;
@@ -62,6 +73,7 @@ struct server_model_meta {
     int port = 0;
     server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
     int64_t last_used = 0; // for LRU unloading
+    buft_memory_map bmm_req; // bytes required per buffer type
     std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
     json loaded_info; // info to be reflected via /v1/models endpoint
     int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
@@ -104,6 +116,13 @@ struct server_models {
     std::condition_variable cv_stop;
     std::set<std::string> stopping_models;
 
+    // background tasks for download/estimate/load pipelines, keyed by model name
+    struct bg_task {
+        std::thread th;
+        std::atomic<bool> done{false};
+    };
+    std::map<std::string, std::unique_ptr<bg_task>> bg_tasks;
+
     // set to true while load_models() is executing a reload; load() will wait until clear
     bool is_reloading = false;
 
@@ -114,14 +133,42 @@ struct server_models {
     std::vector<std::string> base_env;
     common_preset base_preset; // base preset from llama-server CLI args
 
+    // available memory per buffer type
+    buft_memory_map bmm_available;
+
+    // buft name -> buft lookup (host buffer types map to CPU buft)
+    std::unordered_map<std::string, ggml_backend_buffer_type_t> buft_by_name;
+
     void update_meta(const std::string & name, const server_model_meta & meta);
 
     // unload least recently used models if the limit is reached
-    void unload_lru();
+    void unload_lru(const buft_memory_map & bmm_req);
 
     // not thread-safe, caller must hold mutex
     void add_model(server_model_meta && meta);
 
+    // return number of buffer types where the memory limit would be exceeded
+    // return 0 if the new model would fit
+    // not thread-safe, caller must hold mutex
+    int can_fit(const buft_memory_map & bmm_req) const;
+
+    // check if active model count or memory limits would be exceeded
+    // not thread-safe, caller must hold mutex
+    bool limits_exceeded(const buft_memory_map & bmm_req) const;
+
+    // download model files, blocking call (caller must NOT hold mutex)
+    bool download_model(const std::string & name);
+
+    // estimate model memory by spawning a child process with --measure-only
+    // returns the buft memory map, or empty map on failure (caller must NOT hold mutex)
+    buft_memory_map estimate_model_memory(const std::string & name);
+
+    // join and remove completed background tasks
+    void join_completed_bg_tasks();
+
+    // Internal helper for model loading
+    void _load(const std::string & name, const buft_memory_map & bmm_req);
+
 public:
     server_models(const common_params & params, int argc, char ** argv);
 
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 4d56d45e83c..eae50776fb1 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -11,6 +11,8 @@
 #include "llama.h"
 #include "log.h"
 
+#include "../../src/llama-ext.h"
+
 #include <atomic>
 #include <clocale>
 #include <exception>
@@ -94,6 +96,11 @@ int llama_server(int argc, char ** argv) {
     const bool is_router_server = params.model.path.empty();
     common_params_print_info(params, !is_router_server);
 
+    if (params.download_only) {
+        LOG_INF("%s: model downloaded successfully, exiting\n", __func__);
+        return 0;
+    }
+
     // validate batch size for embeddings
     // embeddings require all tokens to be processed in a single ubatch
     // see https://github.com/ggml-org/llama.cpp/issues/12836
@@ -118,6 +125,44 @@ int llama_server(int argc, char ** argv) {
     // struct that contains llama context and inference
     server_context ctx_server;
 
+    if (params.measure_only) {
+        llama_model_params mparams = common_model_params_to_llama(params);
+        mparams.no_alloc  = true;
+        mparams.use_mmap  = false;
+        mparams.use_mlock = false;
+
+        llama_model_ptr model{llama_model_load_from_file(params.model.path.c_str(), mparams)};
+        if (!model) {
+            LOG_ERR("%s: failed to load model for measurement\n", __func__);
+            llama_backend_free();
+            return 1;
+        }
+
+        llama_context_params cparams = common_context_params_to_llama(params);
+        llama_context_ptr ctx{llama_init_from_model(model.get(), cparams)};
+        if (!ctx) {
+            LOG_ERR("%s: failed to create context for measurement\n", __func__);
+            llama_backend_free();
+            return 1;
+        }
+
+        common_log_pause(common_log_main());
+        for (const auto & [buft, data] : llama_get_memory_breakdown(ctx.get())) {
+            size_t total = data.total();
+            if (total > 0) {
+                fprintf(stdout, "measure:%s %zu\n", ggml_backend_buft_name(buft), total);
+            }
+        }
+        fflush(stdout);
+        common_log_resume(common_log_main());
+
+        llama_backend_free();
+        return 0;
+    }
+
+    LOG_INF("build_info: %s\n", llama_build_info());
+    LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+
     server_http_context ctx_http;
     if (!ctx_http.init(params)) {
         SRV_ERR("%s", "failed to initialize HTTP server\n");