From f91e96681a1bd69cdd57a8d2dc321e682b60d40a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 2 May 2026 15:01:08 +0200 Subject: [PATCH 1/5] TP: fix ggml context size calculation, memory leak --- ggml/src/ggml-alloc.c | 6 + ggml/src/ggml-backend-impl.h | 3 + ggml/src/ggml-backend-meta.cpp | 223 +++++++++++++++++++++++---------- 3 files changed, 168 insertions(+), 64 deletions(-) diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index a4b01ccf8a1..05d08bd240a 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -823,6 +823,12 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr static bool ggml_gallocr_reserve_n_impl( ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, bool no_alloc) { + // FIXME long-term this should be replaced with better logic in ggml-alloc.c + for (int i = 0; i < galloc->n_buffers; i++) { + if (ggml_backend_buft_is_meta(galloc->bufts[i])) { + ggml_backend_meta_buft_update_max_n_tensors(galloc->bufts[i], graph->n_leafs + graph->n_nodes); + } + } size_t min_hash_size = graph->n_nodes + graph->n_leafs; // add 25% margin to avoid hash collisions min_hash_size += min_hash_size / 4; diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index 9c56ec30c5f..0630319fed3 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -98,6 +98,9 @@ extern "C" { // temporary workaround to statically allocate tensors from a context in a deduplicated way: GGML_API struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); + // another temporary workaround + GGML_API void ggml_backend_meta_buft_update_max_n_tensors(ggml_backend_buffer_type_t buft, size_t n_tensors); + // // Backend (stream) // diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp index 5f9ae9c1bc5..a7213d6cad4 100644 --- a/ggml/src/ggml-backend-meta.cpp +++ b/ggml/src/ggml-backend-meta.cpp @@ -6,6 +6,7 @@ #include "ggml-cpp.h" #include +#include #include #include #include @@ -13,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -250,6 +252,8 @@ struct ggml_backend_meta_buffer_type_context { std::string name; + std::atomic max_n_tensors = 1024; // FIXME replace with better handling in ggml-alloc.c + ggml_backend_meta_buffer_type_context(std::vector simple_bufts) : simple_bufts(std::move(simple_bufts)) { name = "Meta("; for (size_t i = 0; i < simple_bufts.size(); i++) { @@ -266,6 +270,18 @@ struct ggml_backend_meta_buffer_type_context { } }; +void ggml_backend_meta_buft_update_max_n_tensors(struct ggml_backend_buffer_type * buft, size_t n_tensors) { + GGML_ASSERT(ggml_backend_buft_is_meta(buft)); + ggml_backend_meta_buffer_type_context * ctx = (ggml_backend_meta_buffer_type_context *) buft->context; + size_t max_n_tensors_cur = ctx->max_n_tensors.load(); + while (max_n_tensors_cur < n_tensors) { + // If max_n_tensors_cur has not changed, it is written to the atomic (true), otherwise the new value is fetched (false). + if (ctx->max_n_tensors.compare_exchange_weak(/*expected =*/ max_n_tensors_cur, /*desired =*/n_tensors)) { + break; + } + } +} + static size_t ggml_backend_meta_buft_n_bufts(ggml_backend_buffer_type_t meta_buft) { GGML_ASSERT(ggml_backend_buft_is_meta(meta_buft)); const ggml_backend_meta_buffer_type_context * meta_buft_ctx = (const ggml_backend_meta_buffer_type_context *) meta_buft->context; @@ -392,64 +408,97 @@ static ggml_backend_buffer_type_t ggml_backend_meta_device_get_host_buffer_type( // meta backend buffer // -struct ggml_backend_meta_buffer_context { - static constexpr size_t nbtc = GGML_TENSOR_SIZE - sizeof(ggml_tensor::padding); +// Container to hold the tensor slices per simple ggml backend buffer. +struct ggml_backend_meta_simple_tensor_container { + std::vector ctxs; + std::map> simple_tensors; + static constexpr size_t nbtc = GGML_TENSOR_SIZE - sizeof(ggml_tensor::padding); std::map, std::pair> split_state_cache; - std::map< const ggml_tensor *, std::vector> simple_tensors; - struct buffer_config { - ggml_context * ctx; - ggml_backend_buffer_t buf; + ggml_backend_meta_simple_tensor_container(const ggml_init_params & params, const int n_simple) { + ctxs.reserve(n_simple); + for (int i = 0; i < n_simple; i++) { + ctxs.emplace_back(ggml_init(params)); + } + } + ggml_backend_meta_simple_tensor_container() {} +}; - buffer_config(ggml_context * ctx, ggml_backend_buffer_t buf) : ctx(ctx), buf(buf) {} - }; - std::vector buf_configs; +struct ggml_backend_meta_buffer_context { + // FIXME + // Most tensors can simply be stored statically in their own buffer. + // Externally created views however also need a mapping to simple tensors but they use the buffer of the view source. + // If external views are simply using that buffer they will slowly deplete its memory. + // Current solution: rotating set of 2 "compute" containers to hold external views, works correctly for llama.cpp. + // Long-term: tie the lifetime of external views to the meta backend executing the graph instead, + // currently not possible due to graph-external operations in the backend scheduler. + ggml_backend_meta_simple_tensor_container stc_static; + ggml_backend_meta_simple_tensor_container stc_compute[2]; + int stc_compute_index = 0; + int stc_compute_index_next = 0; + std::vector bufs; int debug; - ggml_backend_meta_buffer_context() { + ggml_backend_meta_buffer_context( + ggml_backend_meta_simple_tensor_container & stc_static, + ggml_backend_meta_simple_tensor_container & stc_compute_0, + ggml_backend_meta_simple_tensor_container & stc_compute_1, + const std::vector & bufs) + : stc_static(std::move(stc_static)), stc_compute{std::move(stc_compute_0), std::move(stc_compute_1)} { + this->bufs.reserve(bufs.size()); + for (ggml_backend_buffer_t buf : bufs) { + this->bufs.emplace_back(buf); + } const char * GGML_META_DEBUG = getenv("GGML_META_DEBUG"); debug = GGML_META_DEBUG ? atoi(GGML_META_DEBUG) : 0; } + + ggml_backend_meta_simple_tensor_container & get_simple_tensor_container(const ggml_tensor * tensor) { + if (stc_static.simple_tensors.find(tensor) != stc_static.simple_tensors.end()) { + return stc_static; + } + return stc_compute[stc_compute_index]; + } }; static void ggml_backend_meta_buffer_free_buffer(ggml_backend_buffer_t buffer) { GGML_ASSERT(ggml_backend_buffer_is_meta(buffer)); ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context; - for (auto & [ctx, buf] : buf_ctx->buf_configs) { - ggml_backend_buffer_free(buf); - ggml_free(ctx); - } delete buf_ctx; } static size_t ggml_backend_meta_buffer_n_bufs(ggml_backend_buffer_t meta_buf) { GGML_ASSERT(ggml_backend_buffer_is_meta(meta_buf)); ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) meta_buf->context; - return buf_ctx->buf_configs.size(); + return buf_ctx->bufs.size(); } static ggml_backend_buffer_t ggml_backend_meta_buffer_simple_buffer(ggml_backend_buffer_t meta_buf, size_t index) { GGML_ASSERT(ggml_backend_buffer_is_meta(meta_buf)); ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) meta_buf->context; - GGML_ASSERT(index < buf_ctx->buf_configs.size()); - return buf_ctx->buf_configs[index].buf; + GGML_ASSERT(index < buf_ctx->bufs.size()); + return buf_ctx->bufs[index].get(); } static struct ggml_tensor * ggml_backend_meta_buffer_simple_tensor(const struct ggml_tensor * tensor, size_t index) { GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer)); ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context; - GGML_ASSERT(index < buf_ctx->buf_configs.size()); + GGML_ASSERT(index < buf_ctx->bufs.size()); - auto it = buf_ctx->simple_tensors.find(tensor); - if (it == buf_ctx->simple_tensors.end()) { + ggml_backend_meta_simple_tensor_container & stc = buf_ctx->get_simple_tensor_container(tensor); + auto it = stc.simple_tensors.find(tensor); + if (it == stc.simple_tensors.end()) { return nullptr; } return it->second[index]; } -static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync) { +static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync); + +static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state( + ggml_backend_meta_simple_tensor_container & stc, const struct ggml_tensor * tensor, bool assume_sync) { const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer); ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context; @@ -785,7 +834,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co src_ss[i] = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1}; continue; } - src_ss[i] = ggml_backend_meta_get_split_state(tensor->src[i], /*assume_sync =*/ true); + src_ss[i] = ggml_backend_meta_get_split_state(stc, tensor->src[i], /*assume_sync =*/ true); GGML_ASSERT(src_ss[i].axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN); } @@ -1024,15 +1073,15 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co }; const std::pair key = std::make_pair(tensor, assume_sync); - auto it = buf_ctx->split_state_cache.find(key); - if (it != buf_ctx->split_state_cache.end() && memcmp(it->second.second, (const char *) tensor, sizeof(it->second.second)) != 0) { - buf_ctx->split_state_cache.clear(); - it = buf_ctx->split_state_cache.end(); + auto it = stc.split_state_cache.find(key); + if (it != stc.split_state_cache.end() && memcmp(it->second.second, (const char *) tensor, sizeof(it->second.second)) != 0) { + stc.split_state_cache.clear(); + it = stc.split_state_cache.end(); } - if (it == buf_ctx->split_state_cache.end()) { - buf_ctx->split_state_cache[key].first = calculate_split_state(); - memcpy(buf_ctx->split_state_cache[key].second, tensor, sizeof(buf_ctx->split_state_cache[key].second)); + if (it == stc.split_state_cache.end()) { + stc.split_state_cache[key].first = calculate_split_state(); + memcpy(stc.split_state_cache[key].second, tensor, sizeof(stc.split_state_cache[key].second)); if (buf_ctx->debug > 0) { std::string srcs_info; for (size_t i = 0; i < GGML_MAX_SRC; i++) { @@ -1058,14 +1107,14 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co if (!ne_info.empty()) { ne_info += ", "; } - ne_info += std::to_string(buf_ctx->split_state_cache[key].first.ne[j]); + ne_info += std::to_string(stc.split_state_cache[key].first.ne[j]); } GGML_LOG_DEBUG("SPLIT_STATE: {%s} -> %s[%s, %s, {%s}]\n", srcs_info.c_str(), tensor->name, ggml_op_name(tensor->op), - ggml_backend_meta_split_axis_name(buf_ctx->split_state_cache[key].first.axis), ne_info.c_str()); + ggml_backend_meta_split_axis_name(stc.split_state_cache[key].first.axis), ne_info.c_str()); } } - ggml_backend_meta_split_state ret = buf_ctx->split_state_cache[key].first; + ggml_backend_meta_split_state ret = stc.split_state_cache[key].first; GGML_ASSERT(ret.axis != GGML_BACKEND_SPLIT_AXIS_NONE); #ifndef NDEBUG if (ret.axis >= 0 && ret.axis < GGML_MAX_DIMS) { @@ -1079,17 +1128,23 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co return ret; } +static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync) { + GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer)); + ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context; + return ggml_backend_meta_get_split_state(buf_ctx->get_simple_tensor_container(tensor), tensor, assume_sync); +} + static void * ggml_backend_meta_buffer_get_base(ggml_backend_buffer_t buffer) { GGML_UNUSED(buffer); return (void *) 0x1000000000000000; // FIXME } -static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - GGML_ASSERT(ggml_backend_buffer_is_meta(buffer)); - ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context; - const size_t n_simple_bufs = ggml_backend_meta_buffer_n_bufs(buffer); +static enum ggml_status ggml_backend_meta_buffer_init_tensor_impl(ggml_backend_meta_simple_tensor_container & stc, ggml_tensor * tensor) { + GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer)); + ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context; + const size_t n_simple_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer); - const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ true); + const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(stc, tensor, /*assume_sync =*/ true); GGML_ASSERT(ggml_nelements(tensor) == 0 || split_state.axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN); GGML_ASSERT(split_state.n_segments <= 16); @@ -1104,8 +1159,8 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer std::vector simple_tensors; simple_tensors.reserve(n_simple_bufs); for (size_t j = 0; j < n_simple_bufs; j++) { - ggml_context * simple_ctx = buf_ctx->buf_configs[j].ctx; - ggml_backend_buffer_t simple_buf = buf_ctx->buf_configs[j].buf; + ggml_context * simple_ctx = stc.ctxs[j].get(); + ggml_backend_buffer_t simple_buf = buf_ctx->bufs[j].get(); if (split_dim >= 0 && split_dim < GGML_MAX_DIMS) { // TODO: the following assert fails for llama-parallel even though the results are correct: @@ -1158,7 +1213,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer t_ij->data = (char *) t_ij->view_src->data + t_ij->view_offs; } else if (simple_buf != nullptr) { t_ij->data = (char *) ggml_backend_buffer_get_base(simple_buf) - + size_t(tensor->data) - size_t(ggml_backend_buffer_get_base(buffer)); + + size_t(tensor->data) - size_t(ggml_backend_buffer_get_base(tensor->buffer)); } t_ij->extra = tensor->extra; for (int i = 0; i < GGML_MAX_SRC; i++) { @@ -1194,11 +1249,18 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer } } - buf_ctx->simple_tensors[tensor] = simple_tensors; + stc.simple_tensors[tensor] = simple_tensors; return GGML_STATUS_SUCCESS; } +static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + GGML_ASSERT(ggml_backend_buffer_is_meta(buffer)); + ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context; + buf_ctx->stc_compute_index = buf_ctx->stc_compute_index_next; + return ggml_backend_meta_buffer_init_tensor_impl(buf_ctx->get_simple_tensor_container(tensor), tensor); +} + static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(buffer); GGML_ASSERT(ggml_is_contiguous(tensor)); @@ -1413,8 +1475,9 @@ static void ggml_backend_meta_buffer_clear(ggml_backend_buffer_t buffer, uint8_t } static void ggml_backend_meta_buffer_reset(ggml_backend_buffer_t buffer) { - const size_t n_buffers = ggml_backend_meta_buffer_n_bufs(buffer); - for (size_t i = 0; i < n_buffers; i++) { + GGML_ASSERT(ggml_backend_buffer_is_meta(buffer)); + ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context; + for (size_t i = 0; i < buf_ctx->bufs.size(); i++) { ggml_backend_buffer_reset(ggml_backend_meta_buffer_simple_buffer(buffer, i)); } } @@ -1440,21 +1503,26 @@ bool ggml_backend_buffer_is_meta(ggml_backend_buffer_t buf) { static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft); + const ggml_backend_meta_buffer_type_context * buft_ctx = (const ggml_backend_meta_buffer_type_context *) buft->context; + ggml_init_params params = { - /*.mem_size =*/ 1024*1024*1024, // FIXME + /*.mem_size =*/ buft_ctx->max_n_tensors.load()*ggml_tensor_overhead(), /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; + ggml_backend_meta_simple_tensor_container stc_static; + ggml_backend_meta_simple_tensor_container stc_compute_0(params, n_simple_bufts); + ggml_backend_meta_simple_tensor_container stc_compute_1(params, n_simple_bufts); - ggml_backend_meta_buffer_context * buf_ctx = new ggml_backend_meta_buffer_context(); size_t max_size = 0; - buf_ctx->buf_configs.reserve(n_simple_bufts); + std::vector bufs; + bufs.reserve(n_simple_bufts); for (size_t i = 0; i < n_simple_bufts; i++) { - ggml_backend_buffer_t simple_buf = ggml_backend_buft_alloc_buffer(ggml_backend_meta_buft_simple_buft(buft, i), size); - GGML_ASSERT(simple_buf != nullptr); - max_size = std::max(max_size, ggml_backend_buffer_get_size(simple_buf)); - buf_ctx->buf_configs.emplace_back(ggml_init(params), simple_buf); + bufs.push_back(ggml_backend_buft_alloc_buffer(ggml_backend_meta_buft_simple_buft(buft, i), size)); + GGML_ASSERT(bufs.back() != nullptr); + max_size = std::max(max_size, ggml_backend_buffer_get_size(bufs.back())); } + ggml_backend_meta_buffer_context * buf_ctx = new ggml_backend_meta_buffer_context(stc_static, stc_compute_0, stc_compute_1, bufs); return ggml_backend_buffer_init(buft, ggml_backend_meta_buffer_iface, buf_ctx, max_size); } @@ -1462,26 +1530,32 @@ static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_bac struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) { const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft); - ggml_init_params params = { - /*.mem_size =*/ 1024*1024*1024, // FIXME + constexpr size_t compute_headroom = 8; + ggml_init_params params_static = { + /*.mem_size =*/ ggml_get_mem_size(ctx), /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; + ggml_init_params params_compute = { + /*.mem_size =*/ compute_headroom*ggml_get_mem_size(ctx), + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; + ggml_backend_meta_simple_tensor_container stc_static (params_static, n_simple_bufts); + ggml_backend_meta_simple_tensor_container stc_compute_0(params_compute, n_simple_bufts); + ggml_backend_meta_simple_tensor_container stc_compute_1(params_compute, n_simple_bufts); - ggml_backend_meta_buffer_context * meta_buf_ctx = new ggml_backend_meta_buffer_context(); - meta_buf_ctx->buf_configs.reserve(n_simple_bufts); - for (size_t i = 0; i < n_simple_bufts; i++) { - meta_buf_ctx->buf_configs.emplace_back(ggml_init(params), nullptr); - } + std::vector bufs(n_simple_bufts, nullptr); + ggml_backend_meta_buffer_context * meta_buf_ctx = new ggml_backend_meta_buffer_context(stc_static, stc_compute_0, stc_compute_1, bufs); ggml_backend_buffer_t meta_buf = ggml_backend_buffer_init(buft, ggml_backend_meta_buffer_iface, meta_buf_ctx, 0); for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { t->buffer = meta_buf; - ggml_backend_meta_buffer_init_tensor(meta_buf, t); + ggml_backend_meta_buffer_init_tensor_impl(meta_buf_ctx->stc_static, t); t->data = (void *) 0x2000000000000000; // FIXME } for (size_t i = 0; i < n_simple_bufts; i++) { - ggml_context * ctx = meta_buf_ctx->buf_configs[i].ctx; + ggml_context * ctx = meta_buf_ctx->stc_static.ctxs[i].get(); ggml_backend_buffer_type_t simple_buft = ggml_backend_meta_buft_simple_buft(buft, i); // If a ggml_context only has zero-sized tensors, ggml_backend_alloc_ctx_tensors_from_buft returns NULL. @@ -1494,15 +1568,15 @@ struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struc } } if (any_nonzero_slice) { - meta_buf_ctx->buf_configs[i].buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, simple_buft); + meta_buf_ctx->bufs[i].reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx, simple_buft)); } else { - meta_buf_ctx->buf_configs[i].buf = ggml_backend_buft_alloc_buffer(simple_buft, 0); + meta_buf_ctx->bufs[i].reset(ggml_backend_buft_alloc_buffer(simple_buft, 0)); for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { - t->buffer = meta_buf_ctx->buf_configs[i].buf; + t->buffer = meta_buf_ctx->bufs[i].get(); } } - GGML_ASSERT(meta_buf_ctx->buf_configs[i].buf != nullptr); - meta_buf->size = std::max(meta_buf->size, ggml_backend_buffer_get_size(meta_buf_ctx->buf_configs[i].buf)); + GGML_ASSERT(meta_buf_ctx->bufs[i]); + meta_buf->size = std::max(meta_buf->size, ggml_backend_buffer_get_size(meta_buf_ctx->bufs[i].get())); } return meta_buf; } @@ -1724,6 +1798,27 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend, } if (needs_rebuild) { + std::set used_buffers; + for (int i = 0; i < cgraph->n_leafs; i++) { + if (ggml_backend_buffer_is_meta(cgraph->leafs[i]->buffer)) { + used_buffers.emplace(cgraph->leafs[i]->buffer); + } + } + for (int i = 0; i < cgraph->n_nodes; i++) { + if (ggml_backend_buffer_is_meta(cgraph->nodes[i]->buffer)) { + used_buffers.emplace(cgraph->nodes[i]->buffer); + } + } + for (ggml_backend_buffer_t buf : used_buffers) { + ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buf->context; + buf_ctx->stc_compute_index_next = buf_ctx->stc_compute_index ^ 1; + ggml_backend_meta_simple_tensor_container & stc = buf_ctx->stc_compute[buf_ctx->stc_compute_index_next]; + for (ggml_context_ptr & ctx : stc.ctxs) { + ggml_reset(ctx.get()); + } + stc.simple_tensors.clear(); + stc.split_state_cache.clear(); + } size_t n_subgraphs = 0; size_t max_tmp_size = 0; From 794e33d35c21933490a71fe9cfc219abe6407306 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 4 May 2026 09:16:52 +0200 Subject: [PATCH 2/5] move split state cache back into the context --- ggml/src/ggml-backend-meta.cpp | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp index a7213d6cad4..e13f5a8b85e 100644 --- a/ggml/src/ggml-backend-meta.cpp +++ b/ggml/src/ggml-backend-meta.cpp @@ -413,9 +413,6 @@ struct ggml_backend_meta_simple_tensor_container { std::vector ctxs; std::map> simple_tensors; - static constexpr size_t nbtc = GGML_TENSOR_SIZE - sizeof(ggml_tensor::padding); - std::map, std::pair> split_state_cache; - ggml_backend_meta_simple_tensor_container(const ggml_init_params & params, const int n_simple) { ctxs.reserve(n_simple); for (int i = 0; i < n_simple; i++) { @@ -439,6 +436,12 @@ struct ggml_backend_meta_buffer_context { int stc_compute_index_next = 0; std::vector bufs; + // FIXME + // The size of the split state cache is unbounded and can theoretically grow infinitely large. + // However, it is also expensive to build and clearing it on every rebuild in ggml_backend_meta_graph_compute is too expensive. + static constexpr size_t nbtc = GGML_TENSOR_SIZE - sizeof(ggml_tensor::padding); + std::map, std::pair> split_state_cache; + int debug; ggml_backend_meta_buffer_context( @@ -1073,15 +1076,15 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state( }; const std::pair key = std::make_pair(tensor, assume_sync); - auto it = stc.split_state_cache.find(key); - if (it != stc.split_state_cache.end() && memcmp(it->second.second, (const char *) tensor, sizeof(it->second.second)) != 0) { - stc.split_state_cache.clear(); - it = stc.split_state_cache.end(); + auto it = buf_ctx->split_state_cache.find(key); + if (it != buf_ctx->split_state_cache.end() && memcmp(it->second.second, (const char *) tensor, sizeof(it->second.second)) != 0) { + buf_ctx->split_state_cache.clear(); + it = buf_ctx->split_state_cache.end(); } - if (it == stc.split_state_cache.end()) { - stc.split_state_cache[key].first = calculate_split_state(); - memcpy(stc.split_state_cache[key].second, tensor, sizeof(stc.split_state_cache[key].second)); + if (it == buf_ctx->split_state_cache.end()) { + buf_ctx->split_state_cache[key].first = calculate_split_state(); + memcpy(buf_ctx->split_state_cache[key].second, tensor, sizeof(buf_ctx->split_state_cache[key].second)); if (buf_ctx->debug > 0) { std::string srcs_info; for (size_t i = 0; i < GGML_MAX_SRC; i++) { @@ -1107,14 +1110,14 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state( if (!ne_info.empty()) { ne_info += ", "; } - ne_info += std::to_string(stc.split_state_cache[key].first.ne[j]); + ne_info += std::to_string(buf_ctx->split_state_cache[key].first.ne[j]); } GGML_LOG_DEBUG("SPLIT_STATE: {%s} -> %s[%s, %s, {%s}]\n", srcs_info.c_str(), tensor->name, ggml_op_name(tensor->op), - ggml_backend_meta_split_axis_name(stc.split_state_cache[key].first.axis), ne_info.c_str()); + ggml_backend_meta_split_axis_name(buf_ctx->split_state_cache[key].first.axis), ne_info.c_str()); } } - ggml_backend_meta_split_state ret = stc.split_state_cache[key].first; + ggml_backend_meta_split_state ret = buf_ctx->split_state_cache[key].first; GGML_ASSERT(ret.axis != GGML_BACKEND_SPLIT_AXIS_NONE); #ifndef NDEBUG if (ret.axis >= 0 && ret.axis < GGML_MAX_DIMS) { @@ -1817,7 +1820,6 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend, ggml_reset(ctx.get()); } stc.simple_tensors.clear(); - stc.split_state_cache.clear(); } size_t n_subgraphs = 0; size_t max_tmp_size = 0; From 33bad2ffed891f755e173e0d3c040fa9dea765ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Tue, 19 May 2026 22:14:33 +0200 Subject: [PATCH 3/5] revert to constant ggml context size for cgraphs --- ggml/src/ggml-alloc.c | 6 ------ ggml/src/ggml-backend-impl.h | 3 --- ggml/src/ggml-backend-meta.cpp | 26 +++++--------------------- 3 files changed, 5 insertions(+), 30 deletions(-) diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index 05d08bd240a..a4b01ccf8a1 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -823,12 +823,6 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr static bool ggml_gallocr_reserve_n_impl( ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, bool no_alloc) { - // FIXME long-term this should be replaced with better logic in ggml-alloc.c - for (int i = 0; i < galloc->n_buffers; i++) { - if (ggml_backend_buft_is_meta(galloc->bufts[i])) { - ggml_backend_meta_buft_update_max_n_tensors(galloc->bufts[i], graph->n_leafs + graph->n_nodes); - } - } size_t min_hash_size = graph->n_nodes + graph->n_leafs; // add 25% margin to avoid hash collisions min_hash_size += min_hash_size / 4; diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index 0630319fed3..9c56ec30c5f 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -98,9 +98,6 @@ extern "C" { // temporary workaround to statically allocate tensors from a context in a deduplicated way: GGML_API struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); - // another temporary workaround - GGML_API void ggml_backend_meta_buft_update_max_n_tensors(ggml_backend_buffer_type_t buft, size_t n_tensors); - // // Backend (stream) // diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp index e13f5a8b85e..2cad50578e0 100644 --- a/ggml/src/ggml-backend-meta.cpp +++ b/ggml/src/ggml-backend-meta.cpp @@ -252,8 +252,6 @@ struct ggml_backend_meta_buffer_type_context { std::string name; - std::atomic max_n_tensors = 1024; // FIXME replace with better handling in ggml-alloc.c - ggml_backend_meta_buffer_type_context(std::vector simple_bufts) : simple_bufts(std::move(simple_bufts)) { name = "Meta("; for (size_t i = 0; i < simple_bufts.size(); i++) { @@ -270,18 +268,6 @@ struct ggml_backend_meta_buffer_type_context { } }; -void ggml_backend_meta_buft_update_max_n_tensors(struct ggml_backend_buffer_type * buft, size_t n_tensors) { - GGML_ASSERT(ggml_backend_buft_is_meta(buft)); - ggml_backend_meta_buffer_type_context * ctx = (ggml_backend_meta_buffer_type_context *) buft->context; - size_t max_n_tensors_cur = ctx->max_n_tensors.load(); - while (max_n_tensors_cur < n_tensors) { - // If max_n_tensors_cur has not changed, it is written to the atomic (true), otherwise the new value is fetched (false). - if (ctx->max_n_tensors.compare_exchange_weak(/*expected =*/ max_n_tensors_cur, /*desired =*/n_tensors)) { - break; - } - } -} - static size_t ggml_backend_meta_buft_n_bufts(ggml_backend_buffer_type_t meta_buft) { GGML_ASSERT(ggml_backend_buft_is_meta(meta_buft)); const ggml_backend_meta_buffer_type_context * meta_buft_ctx = (const ggml_backend_meta_buffer_type_context *) meta_buft->context; @@ -1506,10 +1492,8 @@ bool ggml_backend_buffer_is_meta(ggml_backend_buffer_t buf) { static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft); - const ggml_backend_meta_buffer_type_context * buft_ctx = (const ggml_backend_meta_buffer_type_context *) buft->context; - - ggml_init_params params = { - /*.mem_size =*/ buft_ctx->max_n_tensors.load()*ggml_tensor_overhead(), + const ggml_init_params params = { + /*.mem_size =*/ 1024*1024*ggml_tensor_overhead(), // FIXME /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; @@ -1534,12 +1518,12 @@ struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struc const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft); constexpr size_t compute_headroom = 8; - ggml_init_params params_static = { + const ggml_init_params params_static = { /*.mem_size =*/ ggml_get_mem_size(ctx), /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; - ggml_init_params params_compute = { + const ggml_init_params params_compute = { /*.mem_size =*/ compute_headroom*ggml_get_mem_size(ctx), /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, @@ -2006,7 +1990,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend, const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads); const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads); const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead(); - ggml_init_params params = { + const ggml_init_params params = { /*.mem_size =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux), /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, From bcf2c7c080b255514106fc05693da3bb24b2349e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sun, 24 May 2026 11:00:04 +0200 Subject: [PATCH 4/5] increase headroom for statically allocated tensors --- ggml/src/ggml-backend-meta.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp index 2cad50578e0..829f29b293f 100644 --- a/ggml/src/ggml-backend-meta.cpp +++ b/ggml/src/ggml-backend-meta.cpp @@ -1517,7 +1517,7 @@ static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_bac struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) { const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft); - constexpr size_t compute_headroom = 8; + constexpr size_t compute_headroom = 16; // Maximum number of views per statically allocated tensor that can be created between evals. const ggml_init_params params_static = { /*.mem_size =*/ ggml_get_mem_size(ctx), /*.mem_buffer =*/ nullptr, From 10792f8036c1c395fc43b0dc73776642d5344963 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sun, 24 May 2026 11:02:09 +0200 Subject: [PATCH 5/5] remove obsolete include --- ggml/src/ggml-backend-meta.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp index 829f29b293f..d0d64523b4a 100644 --- a/ggml/src/ggml-backend-meta.cpp +++ b/ggml/src/ggml-backend-meta.cpp @@ -6,7 +6,6 @@ #include "ggml-cpp.h" #include -#include #include #include #include