From 5bc7a7409fc39d2801b3eeddc9e22fdd4bda718d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Tue, 27 May 2025 21:28:22 +0200
Subject: [PATCH] llama: automatically set runtime pars to fit VRAM

---
 common/common.cpp                   | 71 ++++++++++++++++++++++++++++-
 ggml/include/ggml-alloc.h           |  6 ++-
 ggml/include/ggml-backend.h         |  1 +
 ggml/src/ggml-alloc.c               | 60 ++++++++++++++++++++----
 ggml/src/ggml-backend.cpp           | 22 ++++++++-
 include/llama.h                     |  7 +++
 src/llama-context.cpp               | 67 ++++++++++++++++++++-------
 src/llama-context.h                 | 10 +++-
 src/llama-impl.cpp                  | 18 ++++++++
 src/llama-impl.h                    | 12 +++++
 src/llama-kv-cache-recurrent.cpp    | 26 ++++++-----
 src/llama-kv-cache-recurrent.h      |  7 +--
 src/llama-kv-cache-unified-iswa.cpp | 11 +++--
 src/llama-kv-cache-unified-iswa.h   |  5 +-
 src/llama-kv-cache-unified.cpp      | 27 ++++++-----
 src/llama-kv-cache-unified.h        |  7 +--
 src/llama-memory.h                  |  6 +++
 src/llama-model.cpp                 | 46 +++++++++++++++----
 src/llama-model.h                   |  6 ++-
 src/llama.cpp                       | 45 +++++++++++++++---
 20 files changed, 376 insertions(+), 84 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 218f1e1dc0e4d..a83e9d0737a76 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -885,9 +885,78 @@ std::string fs_get_cache_file(const std::string & filename) {
 // Model utils
 //
 
+static void common_fit_to_free_memory(
+        const std::string & path_model, llama_model_params & mparams, llama_context_params & cparams, const size_t margin) {
+
+    std::vector<ggml_backend_dev_t> devices(ggml_backend_dev_count());
+    for (size_t i = 0; i < devices.size(); i++) {
+        devices[i] = ggml_backend_dev_get(i);
+    }
+
+    std::vector<size_t> memory_total(devices.size());
+    std::vector<size_t> memory_free(devices.size());
+    for (size_t i = 0; i < devices.size(); i++) {
+        ggml_backend_dev_memory(devices[i], memory_free.data() + i, memory_total.data() + i);
+    }
+
+    auto get_min_margin = [path_model, memory_free](const llama_model_params & mparams_test, const llama_context_params & cparams_test) {
+        std::vector<size_t> memory_expect(memory_free.size());
+        GGML_ASSERT(llama_expected_memory_use(path_model.c_str(), mparams_test, cparams_test, memory_expect.data()));
+
+        int64_t min_margin = INT64_MAX;
+        for (size_t i = 0; i < memory_free.size(); i++) {
+            min_margin = std::min(min_margin, int64_t(memory_free[i]) - int64_t(memory_expect[i]));
+        }
+        return min_margin;
+    };
+    auto test_ngl = [mparams, cparams, get_min_margin](const int ngl) {
+        llama_model_params mparams_test = mparams;
+        mparams_test.n_gpu_layers = ngl;
+        return get_min_margin(mparams_test, cparams);
+    };
+
+    int ngl_low = 0;
+    int64_t margin_low = test_ngl(ngl_low);
+    if (margin_low < int64_t(margin)) {
+        mparams.n_gpu_layers = ngl_low;
+        return;
+    }
+
+    int ngl_high = 128; // FIXME
+    int64_t margin_high = test_ngl(ngl_high);
+    if (margin_high >= int64_t(margin)) {
+        mparams.n_gpu_layers = ngl_high;
+        return;
+    }
+
+    // TODO bisection is inefficient, better to interpolate if max ngl value is known
+    while (ngl_high - ngl_low > 1) {
+        const int ngl_test = (ngl_high + ngl_low) / 2;
+        const int64_t margin_test = test_ngl(ngl_test);
+
+        if (margin_test < int64_t(margin)) {
+            ngl_high = ngl_test;
+            margin_high = margin_test;
+        } else {
+            ngl_low = ngl_test;
+            margin_low = margin_test;
+        }
+    }
+
+    if (margin_high >= int64_t(margin)) {
+        mparams.n_gpu_layers = ngl_high;
+    } else {
+        mparams.n_gpu_layers = ngl_low;
+    }
+}
+
 struct common_init_result common_init_from_params(common_params & params) {
     common_init_result iparams;
     auto mparams = common_model_params_to_llama(params);
+    auto cparams = 
common_context_params_to_llama(params); + + constexpr size_t margin = 1024*1024*1024; + common_fit_to_free_memory(params.model.path, mparams, cparams, margin); llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); if (model == NULL) { @@ -925,8 +994,6 @@ struct common_init_result common_init_from_params(common_params & params) { } } - auto cparams = common_context_params_to_llama(params); - llama_context * lctx = llama_init_from_model(model, cparams); if (lctx == NULL) { LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str()); diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h index 2cb150fd2a313..3300cc52d9abe 100644 --- a/ggml/include/ggml-alloc.h +++ b/ggml/include/ggml-alloc.h @@ -9,6 +9,7 @@ extern "C" { typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; typedef struct ggml_backend_buffer * ggml_backend_buffer_t; typedef struct ggml_backend * ggml_backend_t; +typedef struct ggml_backend_device * ggml_backend_dev_t; // Tensor allocator struct ggml_tallocr { @@ -58,16 +59,19 @@ GGML_API bool ggml_gallocr_reserve_n( ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, - const int * leaf_buffer_ids); + const int * leaf_buffer_ids, + bool dry_run); // automatic reallocation if the topology changes when using a single buffer // returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers) GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph); GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id); +size_t ggml_gallocr_get_max_size(ggml_gallocr_t galloc, ggml_backend_dev_t dev); // Utils // Create a buffer and allocate all the tensors in a ggml_context +GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend); diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 778927f68217a..ab99400664606 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -293,6 +293,7 @@ extern "C" { GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); // Initialize backend buffers from a measure graph + GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes); // result per backend is written to sizes GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched); diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index 5fd379f6a9461..54138ba6bf002 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -150,6 +150,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs } #endif +// returns the offset for the allocation static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) { size = aligned_offset(NULL, size, alloc->alignment); @@ -472,7 +473,9 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) { } static bool 
ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) { - return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated; + return t->data != NULL // tensor data already set externally + || t->buffer // tensor on external buffer (but may not yet be allocated) + || ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc } static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) { @@ -670,7 +673,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr } } -bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { +bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, + const int * node_buffer_ids, const int * leaf_buffer_ids, bool dry_run) { size_t min_hash_size = graph->n_nodes + graph->n_leafs; // add 25% margin to avoid hash collisions min_hash_size += min_hash_size / 4; @@ -768,7 +772,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c #endif ggml_backend_buffer_free(galloc->buffers[i]); - galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size); + galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], dry_run ? 0 : new_size); if (galloc->buffers[i] == NULL) { GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); return false; @@ -781,7 +785,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c } bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { - return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL); + return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL, /*dry_run =*/ false); } static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) { @@ -934,6 +938,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) { return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]); } +size_t ggml_gallocr_get_max_size(ggml_gallocr_t galloc, ggml_backend_dev_t dev) { + for (int i = 0; i < galloc->n_buffers; i++) { + if (ggml_backend_buft_get_device(galloc->bufts[i]) == dev) { + return ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]); + } + } + return 0; +} + // utils static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) { @@ -984,7 +997,8 @@ static bool alloc_tensor_range(struct ggml_context * ctx, return true; } -ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) { +static ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_impl( + struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool dry_run) { GGML_ASSERT(ggml_get_no_alloc(ctx) == true); size_t alignment = ggml_backend_buft_get_alignment(buft); @@ -992,6 +1006,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte ggml_backend_buffer_t * buffers = NULL; size_t n_buffers = 0; + *nbytes_total = 0; size_t cur_buf_size = 0; struct ggml_tensor * first = ggml_get_first_tensor(ctx); @@ -1003,10 +1018,13 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) { // allocate tensors in the current buffer - if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, 
&n_buffers)) { - return NULL; + if (!dry_run) { + if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) { + return NULL; + } } first = t; + *nbytes_total += cur_buf_size; cur_buf_size = this_size; } else { cur_buf_size += this_size; @@ -1015,15 +1033,23 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte // allocate remaining tensors if (cur_buf_size > 0) { - if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) { - return NULL; + *nbytes_total += cur_buf_size; + if (!dry_run) { + if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) { + return NULL; + } } } + if (dry_run) { + return NULL; + } + if (n_buffers == 0) { #ifndef NDEBUG GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__); #endif + GGML_ASSERT(!buffers); return NULL; } @@ -1033,10 +1059,24 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte } else { buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers); } - free(buffers); + if (buffers) { + free(buffers); // can be NULL if dry_run or context is empty + } return buffer; } +size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) { + size_t nbytes_total = 0; + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*dry_run =*/ true); + GGML_ASSERT(!buf); + return nbytes_total; +} + +ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) { + size_t nbytes_total = 0; + return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*dry_run =*/ false); +} + ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) { return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend)); } diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index b1050ad59c26a..da5579e6ebb9a 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1347,7 +1347,8 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { #ifndef NDEBUG GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed); #endif - ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids); + ggml_gallocr_reserve_n(sched->galloc, &sched->graph, + sched->node_backend_ids, sched->leaf_backend_ids, /*dry_run =*/ false); if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) { GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__); return false; @@ -1546,6 +1547,22 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) { sched->is_alloc = false; } +void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes) { + GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs); + + ggml_backend_sched_split_graph(sched, measure_graph); + + ggml_backend_sched_synchronize(sched); + + GGML_ASSERT(ggml_gallocr_reserve_n(sched->galloc, &sched->graph, + sched->node_backend_ids, sched->leaf_backend_ids, /*dry_run =*/ true)); + for (int ib = 0; ib < sched->n_backends; ib++) { + sizes[ib] = ggml_gallocr_get_max_size(sched->galloc, ggml_backend_get_device(sched->backends[ib])); + } + + ggml_backend_sched_reset(sched); +} + bool 
ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs); @@ -1553,7 +1570,8 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * ggml_backend_sched_synchronize(sched); - if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) { + if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, + sched->node_backend_ids, sched->leaf_backend_ids, /*dry_run =*/ false)) { return false; } diff --git a/include/llama.h b/include/llama.h index 015a57898e22d..08f092d1382e3 100644 --- a/include/llama.h +++ b/include/llama.h @@ -414,6 +414,13 @@ extern "C" { LLAMA_API struct llama_sampler_chain_params llama_sampler_chain_default_params(void); LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void); + // returns success + LLAMA_API bool llama_expected_memory_use( + const char * path_model, + struct llama_model_params mparams, + struct llama_context_params cparams, + size_t * nbytes_expect); + // Initialize the llama + ggml backend // If numa is true, use NUMA optimizations // Call once at the start of the program diff --git a/src/llama-context.cpp b/src/llama-context.cpp index b130b484bcf6f..ff52c867646fa 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -17,7 +17,8 @@ llama_context::llama_context( const llama_model & model, - llama_context_params params) : + llama_context_params params, + bool dry_run) : model(model) { LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__); @@ -192,7 +193,7 @@ llama_context::llama_context( /*.swa_full =*/ params.swa_full, }; - memory.reset(model.create_memory(params_mem, cparams)); + memory.reset(model.create_memory(params_mem, cparams, dry_run)); } // init backends @@ -265,6 +266,8 @@ llama_context::llama_context( // reserve worst-case graph if (!hparams.vocab_only && memory) { + backends_exp_max_size.resize(backend_ptrs.size()); + const uint32_t n_seqs = cparams.n_seq_max; const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); @@ -287,7 +290,7 @@ llama_context::llama_context( // reserve pp graph first so that buffers are only allocated once { - auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mstate.get()); + auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mstate.get(), dry_run); if (!gf) { throw std::runtime_error("failed to allocate compute pp buffers"); } @@ -298,7 +301,7 @@ llama_context::llama_context( // reserve with tg graph to get the number of splits and nodes { - auto * gf = graph_reserve(1, 1, 1, mstate.get()); + auto * gf = graph_reserve(1, 1, 1, mstate.get(), dry_run); if (!gf) { throw std::runtime_error("failed to allocate compute tg buffers"); } @@ -309,16 +312,21 @@ llama_context::llama_context( // reserve again with pp graph to avoid ggml-alloc reallocations during inference { - auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mstate.get()); + auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mstate.get(), dry_run); if (!gf) { throw std::runtime_error("failed to allocate compute pp buffers"); } } + if (!dry_run) { + for (size_t i = 0; i < backend_ptrs.size(); ++i) { + backends_exp_max_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend_ptrs[i]); + } + } + for (size_t i = 0; i < backend_ptrs.size(); ++i) { - ggml_backend_t backend = backend_ptrs[i]; - ggml_backend_buffer_type_t buft = backend_buft[i]; - size_t size = 
ggml_backend_sched_get_buffer_size(sched.get(), backend); + ggml_backend_buffer_type_t buft = backend_buft[i]; + const size_t size = backends_exp_max_size[i]; if (size > 1) { LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, ggml_backend_buft_name(buft), @@ -418,6 +426,10 @@ uint32_t llama_context::n_threads_batch() const { return cparams.n_threads_batch; } +size_t llama_context::total_size(ggml_backend_dev_t dev) const { + return memory->total_size(dev); +} + llama_memory_t llama_context::get_memory() const { return memory.get(); } @@ -476,7 +488,7 @@ bool llama_context::kv_self_update(bool optimize) { const uint32_t n_seqs = cparams.n_seq_max; const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mstate.get()); + auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mstate.get(), /*dry_run =*/ false); if (!gf) { LLAMA_LOG_ERROR("%s: failed to reserve graph after the memory update\n", __func__); } @@ -1232,6 +1244,15 @@ int llama_context::decode(llama_batch & inp_batch) { return 0; } +size_t llama_context::get_expected_max_size(ggml_backend_dev_t dev) const { + for (size_t i = 0; i < backend_buft.size(); i++) { + if (ggml_backend_buft_get_device(backend_buft[i]) == dev) { + return backends_exp_max_size[i]; + } + } + return 0; +} + // // output // @@ -1328,7 +1349,7 @@ ggml_cgraph * llama_context::graph_init() { return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false); } -ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_state_i * mstate) { +ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_state_i * mstate, bool dry_run) { LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs); if (n_tokens % n_seqs != 0) { @@ -1360,9 +1381,17 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u ggml_backend_sched_reset(sched.get()); // initialize scheduler with the specified graph - if (!ggml_backend_sched_reserve(sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - return nullptr; + if (dry_run) { + std::vector tmp(backend_ptrs.size()); + ggml_backend_sched_reserve_size(sched.get(), gf, tmp.data()); + for (size_t i = 0; i < backend_ptrs.size(); i++) { + backends_exp_max_size[i] = std::max(backends_exp_max_size[i], tmp[i]); + } + } else { + if (!ggml_backend_sched_reserve(sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + return nullptr; + } } return gf; @@ -2229,9 +2258,10 @@ llama_context_params llama_context_default_params() { return result; } -llama_context * llama_init_from_model( +llama_context * llama_init_from_model_impl( llama_model * model, - llama_context_params params) { + llama_context_params params, + bool dry_run) { if (!model) { LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__); return nullptr; @@ -2258,7 +2288,7 @@ llama_context * llama_init_from_model( } try { - auto * ctx = new llama_context(*model, params); + auto * ctx = new llama_context(*model, params, dry_run); return ctx; } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to initialize the context: %s\n", __func__, err.what()); @@ -2267,6 +2297,11 @@ llama_context * llama_init_from_model( return nullptr; } +llama_context * llama_init_from_model( + llama_model * 
model, + llama_context_params params) { + return llama_init_from_model_impl(model, params, /*dry_run =*/ false); +} // deprecated llama_context * llama_new_context_with_model( llama_model * model, diff --git a/src/llama-context.h b/src/llama-context.h index 2e0da8c83bd59..07593d1993881 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -24,7 +24,8 @@ struct llama_context { // init scheduler and compute buffers, reserve worst-case graphs llama_context( const llama_model & model, - llama_context_params params); + llama_context_params params, + bool dry_run); ~llama_context(); @@ -46,6 +47,8 @@ struct llama_context { uint32_t n_threads() const; uint32_t n_threads_batch() const; + size_t total_size(ggml_backend_dev_t dev = nullptr) const; + llama_memory_t get_memory() const; // return true of the KV cache was updated @@ -105,6 +108,8 @@ struct llama_context { int encode(llama_batch & inp_batch); int decode(llama_batch & inp_batch); + size_t get_expected_max_size(ggml_backend_dev_t dev) const; + // // state save/load // @@ -197,7 +202,7 @@ struct llama_context { ggml_status graph_compute(ggml_cgraph * gf, bool batched); // reserve a graph with a dummy ubatch of the specified size - ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_state_i * mstate); + ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_state_i * mstate, bool dry_run); private: llm_graph_result_ptr graph_build( @@ -255,6 +260,7 @@ struct llama_context { ggml_backend_t backend_cpu = nullptr; std::vector backends; + std::vector backends_exp_max_size; ggml_context_ptr ctx_compute; diff --git a/src/llama-impl.cpp b/src/llama-impl.cpp index 6ec709dd323a6..109bdaf4fd33f 100644 --- a/src/llama-impl.cpp +++ b/src/llama-impl.cpp @@ -165,3 +165,21 @@ std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0); } } + +size_t ctxs_total_size(const std::vector & ctxs, ggml_backend_dev_t dev) { + ggml_backend_buffer_type_t dev_buft = dev ? ggml_backend_dev_buffer_type(dev) : nullptr; + + size_t total_size = 0; + for (const ggml_context_ptr & ctx : ctxs) { + ggml_tensor * t = ggml_get_first_tensor(ctx.get()); + if (!t || !t->buffer) { + continue; + } + ggml_backend_buffer_type_t ctx_buft = ggml_backend_buffer_get_type(t->buffer); + if (dev_buft && ctx_buft != dev_buft) { + continue; + } + total_size += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), ctx_buft); + } + return total_size; +} diff --git a/src/llama-impl.h b/src/llama-impl.h index 02b1d07f8400d..b34cf7ff55700 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -1,6 +1,7 @@ #pragma once #include "ggml.h" // for ggml_log_level +#include "ggml-cpp.h" #include #include @@ -15,6 +16,11 @@ # define LLAMA_ATTRIBUTE_FORMAT(...) 
#endif +struct llama_context; +struct llama_model; +struct llama_context_params; +llama_context * llama_init_from_model_impl(llama_model * model, llama_context_params params, bool dry_run); + // // logging // @@ -59,3 +65,9 @@ std::string llama_format_tensor_shape(const std::vector & ne); std::string llama_format_tensor_shape(const struct ggml_tensor * t); std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i); + +// calculate the total number of bytes needed to allocate a vector of ggml contexts +// skips ggml contexts without ggml_backend buffers (dummy buffers are ok) +// assumes that all tensors in a context are on the same buffer +// if the optional device dev is set, return the number of bytes needed on that device only +size_t ctxs_total_size(const std::vector & ctxs, ggml_backend_dev_t dev = nullptr); diff --git a/src/llama-kv-cache-recurrent.cpp b/src/llama-kv-cache-recurrent.cpp index f5c6dcd66ce9e..3e601cd2f2c4c 100644 --- a/src/llama-kv-cache-recurrent.cpp +++ b/src/llama-kv-cache-recurrent.cpp @@ -21,7 +21,8 @@ llama_kv_cache_recurrent::llama_kv_cache_recurrent( ggml_type type_v, bool offload, uint32_t kv_size, - uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) { + uint32_t n_seq_max, + bool dry_run) : hparams(model.hparams), n_seq_max(n_seq_max) { const int32_t n_layer = hparams.n_layer; LLAMA_LOG_INFO("%s: kv_size = %u, n_seq_max = %u, type_k = '%s', type_v = '%s', n_layer = %d\n", @@ -97,9 +98,17 @@ llama_kv_cache_recurrent::llama_kv_cache_recurrent( auto * buft = it.first; auto * ctx = it.second; - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); - if (!buf) { - throw std::runtime_error("failed to allocate buffer for kv cache"); + ggml_backend_buffer_t buf; + if (dry_run) { + buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + t->buffer = buf; + } + } else { + buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + throw std::runtime_error("failed to allocate buffer for kv cache"); + } } ggml_backend_buffer_clear(buf, 0); LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); @@ -648,13 +657,8 @@ float llama_kv_cache_recurrent::s_mask(int i) const { return res; } -size_t llama_kv_cache_recurrent::total_size() const { - size_t size = 0; - for (const auto & buf : bufs) { - size += ggml_backend_buffer_get_size(buf.get()); - } - - return size; +size_t llama_kv_cache_recurrent::total_size(ggml_backend_dev_t dev) const { + return ctxs_total_size(ctxs, dev); } size_t llama_kv_cache_recurrent::size_k_bytes() const { diff --git a/src/llama-kv-cache-recurrent.h b/src/llama-kv-cache-recurrent.h index d1da1225655fa..1b0144cb2ec3c 100644 --- a/src/llama-kv-cache-recurrent.h +++ b/src/llama-kv-cache-recurrent.h @@ -21,7 +21,8 @@ class llama_kv_cache_recurrent : public llama_memory_i { ggml_type type_v, bool offload, uint32_t kv_size, - uint32_t n_seq_max); + uint32_t n_seq_max, + bool dry_run); ~llama_kv_cache_recurrent() = default; @@ -50,6 +51,8 @@ class llama_kv_cache_recurrent : public llama_memory_i { llama_pos seq_pos_min(llama_seq_id seq_id) const override; llama_pos seq_pos_max(llama_seq_id seq_id) const override; + size_t total_size(ggml_backend_dev_t dev = nullptr) const override; + bool prepare(const std::vector & ubatches); // find a contiguous slot of kv cells and emplace the ubatch there 
@@ -108,8 +111,6 @@ class llama_kv_cache_recurrent : public llama_memory_i { std::vector ctxs; std::vector bufs; - size_t total_size() const; - size_t size_k_bytes() const; size_t size_v_bytes() const; diff --git a/src/llama-kv-cache-unified-iswa.cpp b/src/llama-kv-cache-unified-iswa.cpp index 28d1826547649..7248ed446453a 100644 --- a/src/llama-kv-cache-unified-iswa.cpp +++ b/src/llama-kv-cache-unified-iswa.cpp @@ -21,7 +21,8 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa( uint32_t kv_size, uint32_t n_seq_max, uint32_t n_ubatch, - uint32_t n_pad) : hparams(model.hparams) { + uint32_t n_pad, + bool dry_run) : hparams(model.hparams) { llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); }; llama_kv_cache_unified::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); }; @@ -42,14 +43,14 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa( kv_base = std::make_unique( model, std::move(filter_base), type_k, type_v, v_trans, offload, size_base, n_seq_max, n_pad, - 0, LLAMA_SWA_TYPE_NONE); + 0, LLAMA_SWA_TYPE_NONE, dry_run); LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa); kv_swa = std::make_unique( model, std::move(filter_swa), type_k, type_v, v_trans, offload, size_swa, n_seq_max, n_pad, - hparams.n_swa, hparams.swa_type); + hparams.n_swa, hparams.swa_type, dry_run); } void llama_kv_cache_unified_iswa::clear(bool data) { @@ -95,6 +96,10 @@ llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const { return kv_swa->seq_pos_max(seq_id); } +size_t llama_kv_cache_unified_iswa::total_size(ggml_backend_dev_t dev) const { + return get_base()->total_size(dev) + get_swa()->total_size(dev); +} + llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) { GGML_UNUSED(embd_pooled); diff --git a/src/llama-kv-cache-unified-iswa.h b/src/llama-kv-cache-unified-iswa.h index 3dbf33ed7b960..bc9b43abc40b3 100644 --- a/src/llama-kv-cache-unified-iswa.h +++ b/src/llama-kv-cache-unified-iswa.h @@ -23,7 +23,8 @@ class llama_kv_cache_unified_iswa : public llama_memory_i { uint32_t kv_size, uint32_t n_seq_max, uint32_t n_ubatch, - uint32_t n_pad); + uint32_t n_pad, + bool dry_run); ~llama_kv_cache_unified_iswa() = default; @@ -54,6 +55,8 @@ class llama_kv_cache_unified_iswa : public llama_memory_i { llama_pos seq_pos_min(llama_seq_id seq_id) const override; llama_pos seq_pos_max(llama_seq_id seq_id) const override; + size_t total_size(ggml_backend_dev_t dev = nullptr) const override; + // state write/load void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override; diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp index 3a40463fd29ca..1e9f36358ca0c 100644 --- a/src/llama-kv-cache-unified.cpp +++ b/src/llama-kv-cache-unified.cpp @@ -27,7 +27,8 @@ llama_kv_cache_unified::llama_kv_cache_unified( uint32_t n_seq_max, uint32_t n_pad, uint32_t n_swa, - llama_swa_type swa_type) : + llama_swa_type swa_type, + bool dry_run) : model(model), hparams(model.hparams), v_trans(v_trans), n_seq_max(n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) { @@ -107,9 +108,17 @@ llama_kv_cache_unified::llama_kv_cache_unified( auto * buft = it.first; auto * ctx = it.second; - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); - if (!buf) { - throw std::runtime_error("failed to allocate buffer for kv cache"); + 
ggml_backend_buffer_t buf; + if (dry_run) { + buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + t->buffer = buf; + } + } else { + buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + throw std::runtime_error("failed to allocate buffer for kv cache"); + } } LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); @@ -835,14 +844,8 @@ void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama } } -size_t llama_kv_cache_unified::total_size() const { - size_t size = 0; - - for (const auto & buf : bufs) { - size += ggml_backend_buffer_get_size(buf.get()); - } - - return size; +size_t llama_kv_cache_unified::total_size(ggml_backend_dev_t dev) const { + return ctxs_total_size(ctxs, dev); } size_t llama_kv_cache_unified::size_k_bytes() const { diff --git a/src/llama-kv-cache-unified.h b/src/llama-kv-cache-unified.h index 49f410ef6ecab..2ff8f7d3258df 100644 --- a/src/llama-kv-cache-unified.h +++ b/src/llama-kv-cache-unified.h @@ -48,7 +48,8 @@ class llama_kv_cache_unified : public llama_memory_i { uint32_t n_seq_max, uint32_t n_pad, uint32_t n_swa, - llama_swa_type swa_type); + llama_swa_type swa_type, + bool dry_run); ~llama_kv_cache_unified() = default; @@ -79,6 +80,8 @@ class llama_kv_cache_unified : public llama_memory_i { llama_pos seq_pos_min(llama_seq_id seq_id) const override; llama_pos seq_pos_max(llama_seq_id seq_id) const override; + size_t total_size(ggml_backend_dev_t dev = nullptr) const override; + // state write/load void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override; @@ -173,8 +176,6 @@ class llama_kv_cache_unified : public llama_memory_i { // return non-empty vector if cells have been moved defrag_info defrag_prepare(int32_t n_max_nodes) const; - size_t total_size() const; - size_t size_k_bytes() const; size_t size_v_bytes() const; diff --git a/src/llama-memory.h b/src/llama-memory.h index 991aae781ba57..6946e0a61b2f8 100644 --- a/src/llama-memory.h +++ b/src/llama-memory.h @@ -108,6 +108,12 @@ struct llama_memory_i { virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0; virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0; + + // + // TODO + // + + virtual size_t total_size(ggml_backend_dev_t dev) const = 0; }; using llama_memory_ptr = std::unique_ptr; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index c41ee24507fca..e0a2702165e63 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1464,7 +1464,7 @@ void llama_model::load_vocab(llama_model_loader & ml) { vocab.load(ml, kv); } -bool llama_model::load_tensors(llama_model_loader & ml) { +bool llama_model::load_tensors(llama_model_loader & ml, bool dry_run) { const auto & split_mode = params.split_mode; const auto & n_gpu_layers = params.n_gpu_layers; const auto & use_mlock = params.use_mlock; @@ -4192,11 +4192,18 @@ bool llama_model::load_tensors(llama_model_loader & ml) { pimpl->bufs.emplace_back(buf); buf_map.emplace(idx, buf); } - } - else { - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); - if (buf == nullptr) { - throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft))); + } else { + ggml_backend_buffer_t buf; + if (dry_run) { + buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // alloc dummy buffer + 
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + t->buffer = buf; + } + } else { + buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (buf == nullptr) { + throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft))); + } } pimpl->bufs.emplace_back(buf); if (use_mlock && ggml_backend_buffer_is_host(buf)) { @@ -4249,6 +4256,18 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } + if (dry_run) { + for (auto & it : ctx_bufs) { + ggml_context * ctx = it.first; + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + if (!t->buffer) { + t->data = (void *) 0x12345678; // so that e.g. tensors which would normally be memory-mapped are treated as allocated + } + } + } + return true; + } + // load tensor data for (auto & it : ctx_bufs) { ggml_context * ctx = it.first; @@ -4291,6 +4310,10 @@ size_t llama_model::n_devices() const { return devices.size(); } +size_t llama_model::total_size(ggml_backend_dev_t dev) const { + return ctxs_total_size(pimpl->ctxs, dev); +} + uint64_t llama_model::n_elements() const { return pimpl->n_elements; } @@ -13203,7 +13226,7 @@ struct llm_build_bailingmoe : public llm_graph_context { } }; -llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const { +llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams, bool dry_run) const { llama_memory_i * res; switch (arch) { @@ -13227,7 +13250,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, GGML_TYPE_F32, cparams.offload_kqv, std::max((uint32_t) 1, cparams.n_seq_max), - cparams.n_seq_max); + cparams.n_seq_max, + dry_run); } break; default: { @@ -13250,7 +13274,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, cparams.n_ctx, cparams.n_seq_max, cparams.n_ubatch, - padding); + padding, + dry_run); } else { GGML_ASSERT(!hparams.is_swa_any()); @@ -13265,7 +13290,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, cparams.n_seq_max, padding, hparams.n_swa, - hparams.swa_type); + hparams.swa_type, + dry_run); } } } diff --git a/src/llama-model.h b/src/llama-model.h index 18b714620bbcf..8772b5f8c20b5 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -376,7 +376,7 @@ struct llama_model { void load_arch (llama_model_loader & ml); void load_hparams(llama_model_loader & ml); void load_vocab (llama_model_loader & ml); - bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback + bool load_tensors(llama_model_loader & ml, bool dry_run); // returns false if cancelled by progress_callback std::string arch_name() const; std::string type_name() const; @@ -387,6 +387,8 @@ struct llama_model { size_t n_tensors() const; size_t n_devices() const; + size_t total_size(ggml_backend_dev_t dev = nullptr) const; + // total number of parameters in the model uint64_t n_elements() const; @@ -408,7 +410,7 @@ struct llama_model { // note: can mutate `cparams` // TODO: move this to new llm_arch_model_i interface - llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const; + llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams, bool dry_run) const; // TODO: move this to new llm_arch_model_i interface llm_graph_result_ptr build_graph( diff --git a/src/llama.cpp b/src/llama.cpp index 
2f06e0f8ce12d..af92ccec4d6ef 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1,6 +1,7 @@ #include "llama-impl.h" #include "llama-chat.h" +#include "llama-context.h" #include "llama-mmap.h" #include "llama-vocab.h" #include "llama-model-loader.h" @@ -84,7 +85,8 @@ int64_t llama_time_us(void) { } // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback -static int llama_model_load(const std::string & fname, std::vector & splits, llama_model & model, llama_model_params & params) { +static int llama_model_load(const std::string & fname, std::vector & splits, llama_model & model, + llama_model_params & params, bool dry_run) { // loading time will be recalculated after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = 0; @@ -123,7 +125,7 @@ static int llama_model_load(const std::string & fname, std::vector return 0; } - if (!model.load_tensors(ml)) { + if (!model.load_tensors(ml, dry_run)) { return -2; } } catch (const std::exception & err) { @@ -137,7 +139,8 @@ static int llama_model_load(const std::string & fname, std::vector static struct llama_model * llama_model_load_from_file_impl( const std::string & path_model, std::vector & splits, - struct llama_model_params params) { + struct llama_model_params params, + bool dry_run) { ggml_time_init(); if (!params.vocab_only && ggml_backend_reg_count() == 0) { @@ -214,7 +217,7 @@ static struct llama_model * llama_model_load_from_file_impl( LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024); } - const int status = llama_model_load(path_model, splits, *model, params); + const int status = llama_model_load(path_model, splits, *model, params, dry_run); GGML_ASSERT(status <= 0); if (status < 0) { if (status == -1) { @@ -230,6 +233,36 @@ static struct llama_model * llama_model_load_from_file_impl( return model; } +bool llama_expected_memory_use(const char * path_model, struct llama_model_params mparams, + struct llama_context_params cparams, size_t * nbytes_expect) { + mparams.use_mmap = false; // FIXME very slow otherwise + + std::vector splits = {}; + llama_model * model = llama_model_load_from_file_impl(path_model, splits, mparams, /*dry_run =*/ true); + if (model == NULL) { + LLAMA_LOG_ERROR("%s: failed to load model '%s'\n", __func__, path_model); + return false; + } + + llama_context * lctx = llama_init_from_model_impl(model, cparams, /*dry_run =*/ true); + if (lctx == NULL) { + LLAMA_LOG_ERROR("%s: failed to create context with model '%s'\n", __func__, path_model); + llama_model_free(model); + return false; + } + + for (size_t i = 0; i < model->n_devices(); i++) { + ggml_backend_dev_t dev = model->devices[i]; + const size_t nbytes_static = model->total_size(dev) + lctx->total_size(dev); + const size_t nbytes_compute = lctx->get_expected_max_size(dev); + nbytes_expect[i] = nbytes_static + nbytes_compute; + LLAMA_LOG_DEBUG("%s: %s: %zu + %zu = %zu MiB\n", __func__, ggml_backend_dev_name(dev), + nbytes_static/(1024*1024), nbytes_compute/(1024*1024), nbytes_expect[i]/(1024*1024)); + } + return true; +} + + // deprecated struct llama_model * llama_load_model_from_file( const char * path_model, @@ -241,7 +274,7 @@ struct llama_model * llama_model_load_from_file( const char * path_model, struct llama_model_params params) { std::vector splits = {}; - return llama_model_load_from_file_impl(path_model, splits, params); + return llama_model_load_from_file_impl(path_model, splits, 
params, /*dry_run =*/ false);
 }
 
 struct llama_model * llama_model_load_from_splits(
@@ -256,7 +289,7 @@ struct llama_model * llama_model_load_from_splits(
     for (size_t i = 0; i < n_paths; ++i) {
         splits.push_back(paths[i]);
     }
-    return llama_model_load_from_file_impl(splits.front(), splits, params);
+    return llama_model_load_from_file_impl(splits.front(), splits, params, /*dry_run =*/ false);
 }
 
 void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
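
For reference, the sketch below shows how a client could use the new llama_expected_memory_use() entry point to check whether a given n_gpu_layers setting is expected to fit into free device memory with a safety margin, mirroring the get_min_margin logic of common_fit_to_free_memory() above. This is a minimal, hypothetical example and not part of the patch: the helper fits_in_free_memory() and the main() driver are illustrative, n_gpu_layers = 99 is an arbitrary candidate value, the 1 GiB margin copies the default hard-coded in common_init_from_params(), and the per-device indexing of nbytes_expect is assumed to follow the same convention as the common.cpp caller in this patch.

// sketch only: assumes the llama_expected_memory_use() declaration added in include/llama.h by this patch
#include "llama.h"
#include "ggml-backend.h"

#include <cstdint>
#include <cstdio>
#include <vector>

// returns true if the expected memory use of (mparams, cparams) leaves at least
// `margin` bytes free on every backend device
static bool fits_in_free_memory(const char * path_model,
        const llama_model_params & mparams, const llama_context_params & cparams, size_t margin) {
    const size_t n_dev = ggml_backend_dev_count();

    // free/total memory currently reported by each backend device
    std::vector<size_t> memory_free(n_dev);
    std::vector<size_t> memory_total(n_dev);
    for (size_t i = 0; i < n_dev; i++) {
        ggml_backend_dev_memory(ggml_backend_dev_get(i), &memory_free[i], &memory_total[i]);
    }

    // dry-run load: expected static + compute bytes per device, no real allocations
    std::vector<size_t> memory_expect(n_dev, 0);
    if (!llama_expected_memory_use(path_model, mparams, cparams, memory_expect.data())) {
        return false; // estimation failed, be conservative
    }

    for (size_t i = 0; i < n_dev; i++) {
        const int64_t headroom = (int64_t) memory_free[i] - (int64_t) memory_expect[i];
        if (headroom < (int64_t) margin) {
            return false;
        }
    }
    return true;
}

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    llama_backend_init();

    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();
    mparams.n_gpu_layers = 99; // candidate value to test

    const size_t margin = 1024u*1024u*1024u; // 1 GiB, same as the default in common_init_from_params()

    const bool ok = fits_in_free_memory(argv[1], mparams, cparams, margin);
    printf("n_gpu_layers = %d is%s expected to fit with a %zu MiB margin\n",
            (int) mparams.n_gpu_layers, ok ? "" : " not", margin/(1024*1024));
    return 0;
}

Note that the patch itself still marks the upper bound of 128 GPU layers and the use_mmap workaround as FIXMEs, so the exact probing strategy is expected to change.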