ggml-org · nonml · May 20, 2026 · May 24, 2026
@@ -76,6 +76,7 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i
 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
 // ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
+// ggml_backend_alloc_ctx_tensors_from_buft returns NULL on failure or if all tensors in ctx are already allocated or zero-sized
 GGML_API size_t                       ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);

@@ -394,15 +394,46 @@ static ggml_backend_buffer_type_t ggml_backend_meta_device_get_host_buffer_type(
 
 struct ggml_backend_meta_buffer_context {
     static constexpr size_t nbtc = GGML_TENSOR_SIZE - sizeof(ggml_tensor::padding);
+    static constexpr size_t initial_ctx_size = 4 * 1024 * 1024;  // 4MB small enough to not waste RAM
 
     std::map<std::pair<const ggml_tensor *, bool>, std::pair<ggml_backend_meta_split_state, char[nbtc]>> split_state_cache;
-    std::map<          const ggml_tensor *,        std::vector<ggml_tensor *>>                           simple_tensors;
+    std::map<const ggml_tensor *, std::vector<ggml_tensor *>> simple_tensors;
 
     struct buffer_config {
-        ggml_context          * ctx;
-        ggml_backend_buffer_t   buf;
+        ggml_backend_buffer_t buf;  // backend buffer for this config; NOT freed by ~buffer_config (see ggml_backend_meta_buffer_free_buffer)
+        ggml_context * ctx = nullptr;  // context created by the most recent add_chunk(); earlier ones remain reachable via ctxs
+        std::vector<ggml_context *> ctxs;  // every context created so far; all are passed to ggml_free in the destructor
+        std::vector<std::unique_ptr<uint8_t[]>> ctx_mems;  // memory handed to ggml_init as mem_buffer, one entry per context
+        size_t chunk_size;  // size in bytes of the next chunk that add_chunk() will allocate
 
-        buffer_config(ggml_context * ctx, ggml_backend_buffer_t buf) : ctx(ctx), buf(buf) {}
+        buffer_config() : buf(nullptr), chunk_size(initial_ctx_size) {}  // creates no context; call init() before using ctx
+        buffer_config(buffer_config&&) noexcept = default;  // move-only type: the destructor frees every context in ctxs
+        buffer_config& operator=(buffer_config&&) noexcept = default;  // NOTE(review): overwrites ctxs without ggml_free on the previously held contexts - confirm it is never used on a non-empty config
+        buffer_config(const buffer_config&) = delete;  // a copy would lead to the same contexts being freed twice
+        buffer_config& operator=(const buffer_config&) = delete;
+
+        void init(size_t mem_size) {  // sets the first chunk size to mem_size bytes and creates the first context
+            chunk_size = mem_size;
+            add_chunk();
+        }
+
+        void add_chunk() {  // allocates chunk_size bytes, creates a new context on them and makes it the current ctx
+            ctx_mems.push_back(std::unique_ptr<uint8_t[]>(new uint8_t[chunk_size]));
+            ggml_init_params params = {
+                /*.mem_size   =*/ chunk_size,
+                /*.mem_buffer =*/ ctx_mems.back().get(),
+                /*.no_alloc   =*/ true,
+            };
+            ctx = ggml_init(params);  // NOTE(review): result is stored without a null check - confirm ggml_init cannot return NULL here
+            ctxs.push_back(ctx);
+            chunk_size = (chunk_size * 3) / 2;  // the next chunk will be 1.5x the size of the one just created
+        }
+
+        ~buffer_config() {  // frees the contexts only; ctx_mems releases their memory afterwards, buf is left untouched
+            for (auto * ctx : ctxs) {  // NOTE(review): this loop variable shadows the member named ctx
+                ggml_free(ctx);
+            }
+        }
     };
     std::vector<buffer_config> buf_configs;
 
@@ -417,9 +448,10 @@ struct ggml_backend_meta_buffer_context {
 static void ggml_backend_meta_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
     ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
-    for (auto & [ctx, buf] : buf_ctx->buf_configs) {
-        ggml_backend_buffer_free(buf);
-        ggml_free(ctx);
+    for (auto & config : buf_ctx->buf_configs) {  // only the backend buffers are freed here; ~buffer_config frees the ggml contexts when buf_ctx is deleted below
+        if (config.buf) {  // buf stays nullptr after default construction until a buffer is assigned
+            ggml_backend_buffer_free(config.buf);
+        }
     }
     delete buf_ctx;
 }
@@ -1103,74 +1135,84 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
 
     std::vector<ggml_tensor *> simple_tensors;
     simple_tensors.reserve(n_simple_bufs);
-    for (size_t j = 0; j < n_simple_bufs; j++) {
-        ggml_context          * simple_ctx = buf_ctx->buf_configs[j].ctx;
-        ggml_backend_buffer_t   simple_buf = buf_ctx->buf_configs[j].buf;
-
-        if (split_dim >= 0 && split_dim < GGML_MAX_DIMS) {
-            // TODO: the following assert fails for llama-parallel even though the results are correct:
-            // GGML_ASSERT(ggml_is_contiguously_allocated(tensor));
-            ne[split_dim] = 0;
-            for (size_t s = 0; s < split_state.n_segments; s++) {
-                ne[split_dim] += split_state.ne[s*n_simple_bufs + j];
-            }
-            for (int i = 0; i < GGML_MAX_DIMS; i++) {
-                if (tensor->nb[i] > tensor->nb[split_dim]) {
-                    nb[i] = tensor->nb[i] * ne[split_dim]/tensor->ne[split_dim];
+    bool retry_alloc = true;
+    while (retry_alloc) {
+        retry_alloc = false;
+        simple_tensors.clear();
+        for (size_t j = 0; j < n_simple_bufs; j++) {
+            ggml_context          * simple_ctx = buf_ctx->buf_configs[j].ctx;
+            ggml_backend_buffer_t   simple_buf = buf_ctx->buf_configs[j].buf;
+
+            if (split_dim >= 0 && split_dim < GGML_MAX_DIMS) {
+                // TODO: the following assert fails for llama-parallel even though the results are correct:
+                // GGML_ASSERT(ggml_is_contiguously_allocated(tensor));
+                ne[split_dim] = 0;
+                for (size_t s = 0; s < split_state.n_segments; s++) {
+                    ne[split_dim] += split_state.ne[s*n_simple_bufs + j];
                 }
-            }
-        }
-
-        ggml_tensor * t_ij = ggml_new_tensor(simple_ctx, tensor->type, GGML_MAX_DIMS, ne);
-        t_ij->op = tensor->op;
-        for (int i = 0; i < GGML_MAX_DIMS; i++) {
-            t_ij->nb[i] = nb[i];
-        }
-        t_ij->flags = tensor->flags;
-        memcpy(t_ij->op_params, tensor->op_params, sizeof(tensor->op_params));
-        ggml_set_name(t_ij, tensor->name);
-        t_ij->buffer = simple_buf;
-        t_ij->view_src = tensor->view_src;
-        t_ij->view_offs = tensor->view_offs;
-        if (t_ij->view_src != nullptr && ggml_backend_buffer_is_meta(t_ij->view_src->buffer)) {
-            t_ij->view_src = ggml_backend_meta_buffer_simple_tensor(tensor->view_src, j);
-            if (t_ij->view_offs > 0 && split_dim >= 0 && split_dim < GGML_MAX_DIMS) {
-                GGML_ASSERT(tensor->ne[split_dim] != 0);
-                const int split_dim_view_src = ggml_backend_meta_get_split_state(tensor->view_src, /*assume_sync =*/ true).axis;
-                GGML_ASSERT(split_dim_view_src >= 0 && split_dim_view_src < GGML_MAX_DIMS);
-
-                // The offset can be internal to the data split, in those cases the view offset should not be scaled.
-                // If however, the offset is larger than the data split then it needs to be scaled proportionally.
-                bool split_internal_offset = t_ij->view_offs <= tensor->view_src->nb[split_dim_view_src];
                 for (int i = 0; i < GGML_MAX_DIMS; i++) {
-                    const size_t dim_size = tensor->ne[i] * tensor->nb[i];
-                    if (tensor->view_offs <= dim_size && dim_size < tensor->nb[split_dim]) {
-                        split_internal_offset = true;
-                        break;
+                    if (tensor->nb[i] > tensor->nb[split_dim]) {
+                        nb[i] = tensor->nb[i] * ne[split_dim]/tensor->ne[split_dim];
                     }
                 }
-                if (!split_internal_offset) {
-                    t_ij->view_offs = t_ij->view_offs * ne[split_dim]/tensor->ne[split_dim];
+            }
+
+            ggml_tensor * t_ij = ggml_new_tensor(simple_ctx, tensor->type, GGML_MAX_DIMS, ne);
+            if (t_ij == nullptr) {
+                buf_ctx->buf_configs[j].add_chunk();
+                retry_alloc = true;
+                break;
+            }
+            t_ij->op = tensor->op;
+            for (int i = 0; i < GGML_MAX_DIMS; i++) {
+                t_ij->nb[i] = nb[i];
+            }
+            t_ij->flags = tensor->flags;
+            memcpy(t_ij->op_params, tensor->op_params, sizeof(tensor->op_params));
+            ggml_set_name(t_ij, tensor->name);
+            t_ij->buffer = simple_buf;
+            t_ij->view_src = tensor->view_src;
+            t_ij->view_offs = tensor->view_offs;
+            if (t_ij->view_src != nullptr && ggml_backend_buffer_is_meta(t_ij->view_src->buffer)) {
+                t_ij->view_src = ggml_backend_meta_buffer_simple_tensor(tensor->view_src, j);
+                if (t_ij->view_offs > 0 && split_dim >= 0 && split_dim < GGML_MAX_DIMS) {
+                    GGML_ASSERT(tensor->ne[split_dim] != 0);
+                    const int split_dim_view_src = ggml_backend_meta_get_split_state(tensor->view_src, /*assume_sync =*/ true).axis;
+                    GGML_ASSERT(split_dim_view_src >= 0 && split_dim_view_src < GGML_MAX_DIMS);
+
+                    // The offset can be internal to the data split, in those cases the view offset should not be scaled.
+                    // If however, the offset is larger than the data split then it needs to be scaled proportionally.
+                    bool split_internal_offset = t_ij->view_offs <= tensor->view_src->nb[split_dim_view_src];
+                    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+                        const size_t dim_size = tensor->ne[i] * tensor->nb[i];
+                        if (tensor->view_offs <= dim_size && dim_size < tensor->nb[split_dim]) {
+                            split_internal_offset = true;
+                            break;
+                        }
+                    }
+                    if (!split_internal_offset) {
+                        t_ij->view_offs = t_ij->view_offs * ne[split_dim]/tensor->ne[split_dim];
+                    }
                 }
             }
-        }
-        if (t_ij->view_src != nullptr) {
-            t_ij->data = (char *) t_ij->view_src->data + t_ij->view_offs;
-        } else if (simple_buf != nullptr) {
-            t_ij->data = (char *) ggml_backend_buffer_get_base(simple_buf)
-                + size_t(tensor->data) - size_t(ggml_backend_buffer_get_base(buffer));
-        }
-        t_ij->extra = tensor->extra;
-        for (int i = 0; i < GGML_MAX_SRC; i++) {
-            t_ij->src[i] = tensor->src[i];
-            if (tensor->src[i] == tensor) {
-                t_ij->src[i] = t_ij;
-            } else if (t_ij->src[i] != nullptr && ggml_backend_buffer_is_meta(t_ij->src[i]->buffer)) {
-                t_ij->src[i] = ggml_backend_meta_buffer_simple_tensor(tensor->src[i], j);
+            if (t_ij->view_src != nullptr) {
+                t_ij->data = (char *) t_ij->view_src->data + t_ij->view_offs;
+            } else if (simple_buf != nullptr) {
+                t_ij->data = (char *) ggml_backend_buffer_get_base(simple_buf)
+                    + size_t(tensor->data) - size_t(ggml_backend_buffer_get_base(buffer));
+            }
+            t_ij->extra = tensor->extra;
+            for (int i = 0; i < GGML_MAX_SRC; i++) {
+                t_ij->src[i] = tensor->src[i];
+                if (tensor->src[i] == tensor) {
+                    t_ij->src[i] = t_ij;
+                } else if (t_ij->src[i] != nullptr && ggml_backend_buffer_is_meta(t_ij->src[i]->buffer)) {
+                    t_ij->src[i] = ggml_backend_meta_buffer_simple_tensor(tensor->src[i], j);
+                }
             }
-        }
 
-        simple_tensors.push_back(t_ij);
+            simple_tensors.push_back(t_ij);
+        }
     }
 
     // If one of the sources has a zero-sized slice, disable the computation:
@@ -1275,6 +1317,9 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
             for (size_t j = 0; j < n_bufs; j++) {
                 ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
                 const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
+                if (chunk_size_j == 0) {
+                    continue;
+                }
                 const size_t simple_offset = i_start * chunk_size_j;
                 ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_j, simple_offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full);
                 offset_j += chunk_size_j;
@@ -1382,6 +1427,9 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
             for (size_t j = 0; j < n_bufs; j++){
                 const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
                 const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
+                if (chunk_size_j == 0) {
+                    continue;
+                }
                 const size_t simple_offset = i_start * chunk_size_j;
                 ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_j, simple_offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full);
                 offset_j += chunk_size_j;
@@ -1411,6 +1459,14 @@ static void ggml_backend_meta_buffer_reset(ggml_backend_buffer_t buffer) {
     for (size_t i = 0; i < n_buffers; i++) {
         ggml_backend_buffer_reset(ggml_backend_meta_buffer_simple_buffer(buffer, i));
     }
+
+    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
+    buf_ctx->simple_tensors.clear();
+    for (auto & config : buf_ctx->buf_configs) {
+        for (auto * ctx : config.ctxs) {
+            ggml_reset(ctx);
+        }
+    }
 }
 
 static const ggml_backend_buffer_i ggml_backend_meta_buffer_iface = {
@@ -1434,19 +1490,16 @@ bool ggml_backend_buffer_is_meta(ggml_backend_buffer_t buf) {
 static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft);
 
-    ggml_init_params params = {
-        /*.mem_size   =*/ 1024*1024*1024, // FIXME
-        /*.mem_buffer =*/ nullptr,
-        /*.no_alloc   =*/ true,
-    };
-
     ggml_backend_meta_buffer_context * buf_ctx = new ggml_backend_meta_buffer_context();
     size_t max_size = 0;
     buf_ctx->buf_configs.reserve(n_simple_bufts);
     for (size_t i = 0; i < n_simple_bufts; i++) {
+        buf_ctx->buf_configs.emplace_back();
+        buf_ctx->buf_configs.back().init(ggml_backend_meta_buffer_context::initial_ctx_size);
         ggml_backend_buffer_t simple_buf = ggml_backend_buft_alloc_buffer(ggml_backend_meta_buft_simple_buft(buft, i), size);
+        GGML_ASSERT(simple_buf != nullptr);
         max_size = std::max(max_size, ggml_backend_buffer_get_size(simple_buf));
-        buf_ctx->buf_configs.emplace_back(ggml_init(params), simple_buf);
+        buf_ctx->buf_configs.back().buf = simple_buf;
     }
 
     return ggml_backend_buffer_init(buft, ggml_backend_meta_buffer_iface, buf_ctx, max_size);
@@ -1455,16 +1508,11 @@ static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_bac
 struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
     const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft);
 
-    ggml_init_params params = {
-        /*.mem_size   =*/ 1024*1024*1024, // FIXME
-        /*.mem_buffer =*/ nullptr,
-        /*.no_alloc   =*/ true,
-    };
-
     ggml_backend_meta_buffer_context * meta_buf_ctx = new ggml_backend_meta_buffer_context();
     meta_buf_ctx->buf_configs.reserve(n_simple_bufts);
     for (size_t i = 0; i < n_simple_bufts; i++) {
-        meta_buf_ctx->buf_configs.emplace_back(ggml_init(params), nullptr);
+        meta_buf_ctx->buf_configs.emplace_back();
+        meta_buf_ctx->buf_configs.back().init(ggml_backend_meta_buffer_context::initial_ctx_size);
     }
 
     ggml_backend_buffer_t meta_buf = ggml_backend_buffer_init(buft, ggml_backend_meta_buffer_iface, meta_buf_ctx, 0);
@@ -1474,8 +1522,27 @@ struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struc
         t->data = (void *) 0x2000000000000000; // FIXME
     }
     for (size_t i = 0; i < n_simple_bufts; i++) {
-        meta_buf_ctx->buf_configs[i].buf = ggml_backend_alloc_ctx_tensors_from_buft(
-            meta_buf_ctx->buf_configs[i].ctx, ggml_backend_meta_buft_simple_buft(buft, i));
+        ggml_context * ctx = meta_buf_ctx->buf_configs[i].ctx;
+        ggml_backend_buffer_type_t simple_buft = ggml_backend_meta_buft_simple_buft(buft, i);
+
+        // If a ggml_context only has zero-sized tensors, ggml_backend_alloc_ctx_tensors_from_buft returns NULL.
+        // For those edge cases, allocate a dummy buffer instead.
+        bool any_nonzero_slice = false;
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+            if (ggml_nelements(t) != 0) {
+                any_nonzero_slice = true;
+                break;
+            }
+        }
+        if (any_nonzero_slice) {
+            meta_buf_ctx->buf_configs[i].buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, simple_buft);
+        } else {
+            meta_buf_ctx->buf_configs[i].buf = ggml_backend_buft_alloc_buffer(simple_buft, 0);
+            for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+                t->buffer = meta_buf_ctx->buf_configs[i].buf;
+            }
+        }
+        GGML_ASSERT(meta_buf_ctx->buf_configs[i].buf != nullptr);
         meta_buf->size = std::max(meta_buf->size, ggml_backend_buffer_get_size(meta_buf_ctx->buf_configs[i].buf));
     }
     return meta_buf;
@@ -1605,6 +1672,9 @@ static void ggml_backend_meta_set_tensor_async(ggml_backend_t backend, ggml_tens
                 ggml_backend_t simple_backend = ggml_backend_meta_simple_backend(backend, j);
                 ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
                 const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
+                if (chunk_size_j == 0) {
+                    continue;
+                }
                 ggml_backend_tensor_set_2d_async(simple_backend, simple_tensor, (const char *) data + offset_j, offset, chunk_size_j,
                     i_stop - i_start, chunk_size_j, chunk_size_full);
                 offset_j += chunk_size_j;
@@ -1646,6 +1716,9 @@ static void ggml_backend_meta_get_tensor_async(ggml_backend_t backend, const ggm
                 ggml_backend_t simple_backend = ggml_backend_meta_simple_backend(backend, j);
                 const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
                 const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
+                if (chunk_size_j == 0) {
+                    continue;
+                }
                 ggml_backend_tensor_get_2d_async(simple_backend, simple_tensor, (char *) data + offset_j, offset, chunk_size_j,
                     i_stop - i_start, chunk_size_j, chunk_size_full);
                 offset_j += chunk_size_j;

@@ -1693,11 +1693,8 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
     }
 
     if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
-        GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
+        GGML_LOG_DEBUG("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
                 __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
-#ifndef NDEBUG
-        GGML_ABORT("not enough space in the context's memory pool");
-#endif
         return NULL;
     }
 
@@ -1763,7 +1760,9 @@ static struct ggml_tensor * ggml_new_tensor_impl(
     GGML_ASSERT(GGML_TENSOR_SIZE <= SIZE_MAX - obj_alloc_size);
 
     struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TYPE_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
-    GGML_ASSERT(obj_new);
+    if (obj_new == NULL) {
+        return NULL;
+    }
 
     struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);