From b62c30514389d776d951c6d86b5de0f6311504a0 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 10 Jun 2026 01:00:14 +0200 Subject: [PATCH 01/13] mtmd: add batching API --- tools/mtmd/mtmd-helper.h | 28 ++++++-- tools/mtmd/mtmd.cpp | 82 +++++++++++++++++++++- tools/mtmd/mtmd.h | 35 +++++++-- tools/server/server-common.cpp | 8 +++ tools/server/server-common.h | 4 ++ tools/server/server-context.cpp | 121 ++++++++++++++++++++++++++++---- 6 files changed, 252 insertions(+), 26 deletions(-) diff --git a/tools/mtmd/mtmd-helper.h b/tools/mtmd/mtmd-helper.h index 164b7c6689d9..8a473b01206f 100644 --- a/tools/mtmd/mtmd-helper.h +++ b/tools/mtmd/mtmd-helper.h @@ -67,8 +67,8 @@ MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image, // helper function that automatically: // 1. run llama_decode() on text chunks -// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode() -// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error +// 2. run mtmd_encode_chunk() on image chunks, then mtmd_get_output_embd() and then llama_decode() +// if any of the mtmd_encode_chunk() or llama_decode() calls return non-zero, stop and forward the error // otherwise, returns 0 on success // this function is NOT thread-safe MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, @@ -157,12 +157,30 @@ MTMD_API int32_t mtmd_helper_video_read_next(mtmd_helper_video * ctx, } // extern "C" #endif +#ifdef __cplusplus +#include +#include + +namespace mtmd_helper { + // -// C++ wrappers +// batching helpers (C++ only for now) // -#ifdef __cplusplus -namespace mtmd_helper { +MTMD_API + +MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, + struct llama_context * lctx, + const mtmd_input_chunk * chunk, + llama_pos n_past, + llama_seq_id seq_id, + int32_t n_batch, + bool logits_last, + llama_pos * new_n_past); + +// +// C++ wrappers +// // video-related C++ wrappers struct mtmd_helper_video_deleter { diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 4140a3c4aa03..96bb7e91327b 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -159,6 +159,13 @@ struct mtmd_input_chunks { std::vector entries; }; +struct mtmd_batch { + mtmd_context * ctx; + std::vector entries; + std::vector output_embd; // aggregated output embedding for the whole batch + mtmd_batch(mtmd_context * ctx): ctx(ctx) {} +}; + // slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings // models not having it (llava-1.6) will process embeddings without any special tokens in-between enum mtmd_slice_tmpl { @@ -1327,6 +1334,9 @@ int32_t mtmd_tokenize(mtmd_context * ctx, } } +// forward declaration +int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens); + int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) { if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) { LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n"); @@ -1344,7 +1354,7 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) { LOG_ERR("%s: image tokens batch is placeholder\n", __func__); return 1; } - return mtmd_encode(ctx, chunk->tokens_image.get()); + return mtmd_encode_impl(ctx, chunk->tokens_image.get()); } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) { if (!ctx->ctx_a) { LOG_ERR("%s: model does not support audio input\n", __func__); @@ -1372,7 +1382,7 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) { return 1; } -int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) { +int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) { clip_ctx * ctx_clip = ctx->ctx_v; if (!ctx_clip) { LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__); @@ -1422,10 +1432,78 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) return ok ? 0 : 1; } +int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) { + try { + return mtmd_encode_impl(ctx, image_tokens); + } catch (const std::exception & e) { + LOG_ERR("%s: error: %s\n", __func__, e.what()); + return 1; + } +} + float * mtmd_get_output_embd(mtmd_context * ctx) { return ctx->image_embd_v.data(); } +mtmd_batch * mtmd_batch_init(mtmd_context * ctx) { + return new mtmd_batch(ctx); +} + +void mtmd_batch_free(mtmd_batch * batch) { + if (batch) { + delete batch; + } +} + +int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk) { + batch->entries.push_back(chunk); + if (batch->entries.size() > 4) { + return 1; // DEMO ONLY + } + return 0; +} + +int32_t mtmd_batch_encode(mtmd_batch * batch) { + // allocate output_embd + size_t n_embd = 0; + for (const auto * chunk : batch->entries) { + n_embd += mtmd_input_chunk_get_n_tokens(chunk) * batch->ctx->n_embd_text; + } + batch->output_embd.resize(n_embd); + + // TODO @ngxson : this is just for testing if the public API works; it is not true batching + size_t offset = 0; + for (const auto * chunk : batch->entries) { + int32_t res = mtmd_encode_chunk(batch->ctx, chunk); + if (res != 0) { + return res; + } + size_t len = mtmd_input_chunk_get_n_tokens(chunk) * batch->ctx->n_embd_text; + memcpy( + batch->output_embd.data() + offset, + mtmd_get_output_embd(batch->ctx), + len * sizeof(float)); + offset += len; + } + + return 0; +} + +float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk) { + size_t offset = 0; + for (const auto * c : batch->entries) { + size_t offset_prev = offset; + size_t n_tokens = mtmd_input_chunk_get_n_tokens(c); + offset += n_tokens * batch->ctx->n_embd_text; + GGML_ASSERT(offset_prev < batch->output_embd.size()); + GGML_ASSERT(offset <= batch->output_embd.size()); + if (c == chunk) { + return &batch->output_embd.data()[offset_prev]; + } + } + return nullptr; +} + bool mtmd_decode_use_non_causal(const mtmd_context * ctx, const mtmd_input_chunk * chunk) { auto proj_type = ctx->proj_type_v(); if (chunk && chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) { diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index a76a6ec2b882..927b2d28dbd0 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -63,6 +63,7 @@ struct mtmd_bitmap; struct mtmd_image_tokens; struct mtmd_input_chunk; struct mtmd_input_chunks; +struct mtmd_batch; struct mtmd_input_text { const char * text; @@ -80,6 +81,7 @@ typedef struct mtmd_image_tokens mtmd_image_tokens; typedef struct mtmd_input_chunk mtmd_input_chunk; typedef struct mtmd_input_chunks mtmd_input_chunks; typedef struct mtmd_input_text mtmd_input_text; +typedef struct mtmd_batch mtmd_batch; struct mtmd_context_params { bool use_gpu; @@ -265,12 +267,12 @@ MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx, const mtmd_bitmap ** bitmaps, size_t n_bitmaps); -// returns 0 on success -// TODO: deprecate -MTMD_API int32_t mtmd_encode(mtmd_context * ctx, - const mtmd_image_tokens * image_tokens); +DEPRECATED(MTMD_API int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens), + "use mtmd_encode_chunk() instead"); +// text chunk will be ignored silently, only media chunk will be encoded // returns 0 on success +// returns 1 on generic error MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk); @@ -279,6 +281,26 @@ MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx, // llama_model_n_embd_inp(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float) MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); + +// batch encoding API +// chunks are not owned by the batch, they will not be freed by mtmd_batch_free() +// batch is valid for a given context, cannot be shared across contexts +MTMD_API mtmd_batch * mtmd_batch_init(mtmd_context * ctx); +MTMD_API void mtmd_batch_free(mtmd_batch * batch); + +// only media chunks are allowed, text chunks will be rejected +// returns 0 on success +// returns 1 on generic error +// returns 2 if the batch is too large (chunk won't be added) +// returns 3 if it cannot be batched with the existing chunks in the batch +MTMD_API int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk); + +// returns 0 on success +// returns 1 on generic error +MTMD_API int32_t mtmd_batch_encode(mtmd_batch * batch); +MTMD_API float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk); + + // Set callback for all future logging events. // If this is not called, or NULL is supplied, everything is output on stderr. MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data); @@ -336,6 +358,11 @@ struct mtmd_input_chunk_deleter { }; using input_chunk_ptr = std::unique_ptr; +struct mtmd_batch_deleter { + void operator()(mtmd_batch * val) { mtmd_batch_free(val); } +}; +using batch_ptr = std::unique_ptr; + struct bitmap { bitmap_ptr ptr; bitmap() : ptr(nullptr) {} diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 9f3caac8f723..4162d52098f9 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -344,6 +344,14 @@ const mtmd::input_chunk_ptr & server_tokens::find_chunk(size_t idx) const { throw std::runtime_error("Chunk not found"); } +std::pair server_tokens::find_next_media_chunk(size_t idx) const { + auto it = map_idx_to_media.upper_bound(idx); + if (it != map_idx_to_media.end()) { + return { &it->second, it->first }; + } + return { nullptr, 0 }; +} + void server_tokens::push_back(llama_token tok) { if (tok == LLAMA_TOKEN_NULL) { throw std::runtime_error("Invalid token"); diff --git a/tools/server/server-common.h b/tools/server/server-common.h index 249b97c2fadb..857ffe14795f 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -180,6 +180,10 @@ struct server_tokens { const mtmd::input_chunk_ptr & find_chunk(size_t idx) const; + // find next media chunk after idx + // returns a pair of pointer to the chunk (nullptr if not found) and its start index in tokens + std::pair find_next_media_chunk(size_t idx) const; + void push_back(llama_token tok); // will create a copy of the chunk if it contains non-text data diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index bdfa51718080..a7cb45dcb987 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -80,6 +80,8 @@ struct server_slot { // multimodal mtmd_context * mctx = nullptr; + mtmd::batch_ptr mbatch = nullptr; + std::array mtgt = {nullptr, nullptr}; // [0] for main context, [1] for optional draft context // speculative decoding common_speculative * spec; @@ -239,6 +241,18 @@ struct server_slot { // clear alora start alora_invocation_start = -1; + + // clear multimodal state + mbatch.reset(); + mtgt[0] = ctx_tgt; + mtgt[1] = nullptr; + if (ctx_dft && llama_get_ctx_other(ctx_dft) != ctx_tgt) { + // TODO: in the future, figure out how to infuse target embeddings to the images + // for now, we re-decode the same chunk in both ctx_tgt and ctx_dft + // maybe we simply need to call `common_speculative_process()` ? + // [TAG_MTMD_DRAFT_PROCESSING] + mtgt[1] = ctx_dft; + } } void init_sampler() const { @@ -578,6 +592,86 @@ struct server_slot { other.prompt = prompt.clone(); other.init_sampler(); } + + // returns 0 on success + // caller need to update prompt.tokens after a successful call to keep track of the processing progress + int process_mtmd_chunk(size_t idx, size_t & n_tokens_out) { + GGML_ASSERT(mctx); + const auto & input_tokens = task->tokens; + auto & chunk = input_tokens.find_chunk(idx); + int32_t res = 0; + + auto try_decode = [&]() -> int32_t { + if (mbatch) { + float * embd = mtmd_batch_get_output_embd(mbatch.get(), chunk.get()); + if (embd) { + for (auto * lctx : mtgt) { + if (lctx == nullptr) { + continue; + } + llama_pos new_n_past; // unused for now + res = mtmd_helper_decode_image_chunk( + mctx, + ctx_tgt, + chunk.get(), + embd, + prompt.tokens.pos_next(), + id, + llama_n_batch(ctx_tgt), + &new_n_past + ); + if (res != 0) { + SLT_ERR(*this, "failed to decode mtmd chunk, idx = %zu, res = %d\n", idx, res); + return -1; + } + } + n_tokens_out = mtmd_input_chunk_get_n_tokens(chunk.get()); + return 0; // success + } + } + return 1; // (non-error) need to create & encode batch + }; + + // if the batch is already exist, try searching & encode + res = try_decode(); + if (res == 0) { + return 0; + } else if (res < 0) { + // fatal error + return res; + } + + // otherwise, the batch is either uninitialized or is used up + // we need to create & encode a new batch + mbatch.reset(mtmd_batch_init(mctx)); + res = mtmd_batch_add_chunk(mbatch.get(), chunk.get()); + GGML_ASSERT(res == 0); // we should never have an empty batch + + // try batching as much as possible + int n_added = 1; + size_t idx_cur = idx; + while (res == 0) { + auto [next_chunk, next_idx] = input_tokens.find_next_media_chunk(idx_cur); + if (next_chunk == nullptr) { + break; + } + res = mtmd_batch_add_chunk(mbatch.get(), next_chunk->get()); + n_added += (res == 0 ? 1 : 0); + idx_cur = next_idx; + SLT_INF(*this, "try adding chunk idx = %zu to batch, res = %d\n", next_idx, res); + // if res != 0, batch is full or chunk is not compatible -> this loop breaks + } + + SLT_INF(*this, "encoding mtmd batch from idx = %zu, n_chunks = %d\n", idx, n_added); + + res = mtmd_batch_encode(mbatch.get()); + if (res != 0) { + SLT_ERR(*this, "failed to encode mtmd batch for chunk idx = %zu, res = %d\n", idx, res); + return -1; + } + + return try_decode(); + } }; @@ -2921,7 +3015,7 @@ struct server_context_impl { send_partial_response(slot, {}, false, true); } } - } + } // end of SLOT_STATE_STARTED if (!slot.can_split()) { // cannot fit the prompt in the current batch - will try next iter @@ -2976,10 +3070,18 @@ struct server_context_impl { bool has_mtmd = false; // check if we should process the image - while (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) { + while (true) { + auto cur_token_idx = slot.prompt.n_tokens(); + if ( + cur_token_idx >= slot.task->n_tokens() || + input_tokens[cur_token_idx] != LLAMA_TOKEN_NULL // encountered a text token + ) { + break; + } + // process the image size_t n_tokens_out = 0; - int32_t res = input_tokens.process_chunk(ctx_tgt, mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out); + int32_t res = slot.process_mtmd_chunk(cur_token_idx, n_tokens_out); if (res != 0) { SLT_ERR(slot, "failed to process image, res = %d\n", res); send_error(slot, "failed to process image", ERROR_TYPE_SERVER); @@ -2987,22 +3089,11 @@ struct server_context_impl { continue; } - if (ctx_dft && llama_get_ctx_other(ctx_dft.get()) != ctx_tgt) { - // TODO: in the future, figure out how to infuse target embeddings to the images - // for now, we skip this for simplicity - // maybe we simply need to call `common_speculative_process()` on the mtmd batches in the `process_chunk` above? - // [TAG_MTMD_DRAFT_PROCESSING] - res = input_tokens.process_chunk(ctx_dft.get(), mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out); - if (res != 0) { - GGML_ABORT("failed to process multi-modal data on draft context\n"); - } - } - slot.n_prompt_tokens_processed += n_tokens_out; // add the image chunk to cache { - const auto & chunk = input_tokens.find_chunk(slot.prompt.n_tokens()); + const auto & chunk = input_tokens.find_chunk(cur_token_idx); slot.prompt.tokens.push_back(chunk.get()); // copy } From 111d3f17360f147b5676b046d9cde52c1521bd40 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 11 Jun 2026 13:24:33 +0200 Subject: [PATCH 02/13] wip --- tools/mtmd/clip.cpp | 13 ++-- tools/mtmd/models/gemma4v.cpp | 22 +++--- tools/mtmd/mtmd.cpp | 140 +++++++++++++++++++++++++++------- tools/mtmd/mtmd.h | 5 ++ tools/server/server-http.cpp | 2 +- 5 files changed, 136 insertions(+), 46 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index bd33f430625a..adbb6efa60bd 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3482,14 +3482,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const clip_image_f32_batch & imgs = *imgs_c_ptr; int n_batch_cur = imgs.entries.size(); - // maximum supported batch size, usually == 2 for qwen-vl-based models - int n_batch_max = clip_model_n_batch_max(ctx); - - // TODO @ngxson : implement batch size > 1 as a loop - // we don't need true batching support because the cgraph will gonna be big anyway - if (n_batch_cur > n_batch_max) { - return false; - } + // TODO: check batching condition // if buffers are not allocated, we need to do a warmup run to allocate them if (!ctx->is_allocated) { @@ -3567,6 +3560,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const int n = nx * ny; for (int b = 0; b < n_batch_cur; b++) { + LOG_DBG("%s: copying image %d/%d to input buffer (nx=%d, ny=%d)\n", __func__, b+1, n_batch_cur, nx, ny); const auto & buf = imgs.entries[b]->get_ro_buf(); float * batch_entry = inp_raw.data() + b * (3*n); for (int y = 0; y < ny; y++) { @@ -4555,6 +4549,9 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) { return ctx->model.modality == CLIP_MODALITY_AUDIO; } +// TODO @ngxson : this is no longer true with mtmd_batch API +// this was only meant to be used by qwen-vl-based models, to fuse 2 input images into one (qwen-vl video support) +// this logic should be refactored in near future to distinctly handle "merge frames" and "batching" int clip_model_n_batch_max(const struct clip_ctx * ctx) { switch (ctx->proj_type()) { case PROJECTOR_TYPE_QWEN2VL: diff --git a/tools/mtmd/models/gemma4v.cpp b/tools/mtmd/models/gemma4v.cpp index 3570d6da1351..b4aadc4e16b9 100644 --- a/tools/mtmd/models/gemma4v.cpp +++ b/tools/mtmd/models/gemma4v.cpp @@ -10,7 +10,7 @@ ggml_cgraph * clip_graph_gemma4v::build() { ggml_set_name(inp_raw, "inp_raw_scaled"); ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd); + inp = ggml_reshape_3d(ctx0, inp, n_patches, n_embd, n_batch); inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); ggml_set_name(inp, "inp"); // note: no patch bias @@ -51,10 +51,11 @@ ggml_cgraph * clip_graph_gemma4v::build() { // first half ggml_tensor * first; { - first = ggml_view_3d(ctx0, cur, - n_dim/2, n_head, n_pos, + first = ggml_view_4d(ctx0, cur, + n_dim/2, n_head, n_pos, n_batch, cur->nb[1], cur->nb[2], + cur->nb[3], 0); first = ggml_rope_ext( ctx0, @@ -70,10 +71,11 @@ ggml_cgraph * clip_graph_gemma4v::build() { // second half ggml_tensor * second; { - second = ggml_view_3d(ctx0, cur, - n_dim/2, n_head, n_pos, + second = ggml_view_4d(ctx0, cur, + n_dim/2, n_head, n_pos, n_batch, cur->nb[1], cur->nb[2], + cur->nb[3], n_dim/2 * ggml_element_size(cur)); second = ggml_rope_ext( ctx0, @@ -103,14 +105,16 @@ ggml_cgraph * clip_graph_gemma4v::build() { const int kernel_size = hparams.n_merge; GGML_ASSERT(kernel_size > 0); - // [n_embd, n_patches] -> [n_patches_x, n_patches_y, n_embd, 1] - cur = ggml_cont_4d(ctx0, ggml_transpose(ctx0, cur), n_patches_x, n_patches_y, n_embd, 1); + // [n_embd, n_patches] -> [n_patches_x, n_patches_y, n_embd, n_batch] + cur = ggml_cont_4d(ctx0, ggml_transpose(ctx0, cur), n_patches_x, n_patches_y, n_embd, n_batch); cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0); const int out_x = n_patches_x / kernel_size; const int out_y = n_patches_y / kernel_size; - // [out_x, out_y, n_embd, 1] -> [n_embd, out_x * out_y] - cur = ggml_reshape_3d(ctx0, cur, out_x * out_y, n_embd, 1); + printf("before pooling: n_patches_x=%d, n_patches_y=%d, after pooling: out_x=%d, out_y=%d\n", n_patches_x, n_patches_y, out_x, out_y); + printf("after pooling: ne[0]=%lld, ne[1]=%lld, ne[2]=%lld, ne[3]=%lld\n", cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]); + // [out_x, out_y, n_embd, n_batch] -> [n_embd, out_x * out_y, n_batch] + cur = ggml_reshape_3d(ctx0, cur, out_x * out_y, n_embd, n_batch); cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); cur = ggml_scale(ctx0, cur, sqrtf((float)n_embd)); cb(cur, "pooled", -1); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 96bb7e91327b..87f02dbeb937 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -110,6 +110,10 @@ struct mtmd_image_tokens { return false; } + bool can_batch_with(const mtmd_image_tokens & other) { + return nx == other.nx && ny == other.ny && pos == other.pos; + } + mtmd_image_tokens clone() { return mtmd_image_tokens{ nx, @@ -153,6 +157,29 @@ struct mtmd_input_chunk { std::vector tokens_text; mtmd_image_tokens_ptr tokens_image; mtmd_audio_tokens_ptr tokens_audio; + + bool can_batch_with(const mtmd_input_chunk & other) const { + if (type != other.type) { + return false; + } + + if (tokens_image && other.tokens_image) { + return tokens_image->can_batch_with(*other.tokens_image); + } + + // TODO: allow batching audio chunks of the same size + + return false; + } + + bool is_placeholder() const { + if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { + return tokens_image->is_placeholder(); + } else if (type == MTMD_INPUT_CHUNK_TYPE_AUDIO) { + return tokens_audio->is_placeholder(); + } + return false; + } }; struct mtmd_input_chunks { @@ -164,6 +191,13 @@ struct mtmd_batch { std::vector entries; std::vector output_embd; // aggregated output embedding for the whole batch mtmd_batch(mtmd_context * ctx): ctx(ctx) {} + int32_t n_tokens() const { + int32_t n = 0; + for (const auto * chunk : entries) { + n += mtmd_input_chunk_get_n_tokens(chunk); + } + return n; + } }; // slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings @@ -204,6 +238,7 @@ mtmd_context_params mtmd_context_params_default() { /* image_max_tokens */ -1, /* cb_eval */ nullptr, /* cb_eval_user_data */ nullptr, + /* batch_max_tokens */ 2048, }; return params; } @@ -211,7 +246,7 @@ mtmd_context_params mtmd_context_params_default() { struct mtmd_context { struct clip_ctx * ctx_v; // vision struct clip_ctx * ctx_a; // audio - std::vector image_embd_v; // image embedding vector + std::vector out_embd; // image embedding vector bool print_timings; int n_threads; @@ -246,17 +281,21 @@ struct mtmd_context { std::unique_ptr audio_preproc; std::unique_ptr image_preproc; + // batching + int32_t batch_max_tokens; + // TODO @ngxson : add timings mtmd_context(const char * mmproj_fname, const llama_model * text_model, const mtmd_context_params & ctx_params, bool no_alloc = false) : - print_timings(ctx_params.print_timings), - n_threads (ctx_params.n_threads), - media_marker (ctx_params.media_marker), - n_embd_text (text_model ? llama_model_n_embd_inp(text_model) : -1), - vocab (text_model ? llama_model_get_vocab(text_model) : nullptr) + print_timings (ctx_params.print_timings), + n_threads (ctx_params.n_threads), + media_marker (ctx_params.media_marker), + n_embd_text (text_model ? llama_model_n_embd_inp(text_model) : -1), + vocab (text_model ? llama_model_get_vocab(text_model) : nullptr), + batch_max_tokens(ctx_params.batch_max_tokens) { if (ctx_params.image_marker != nullptr) { throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead"); @@ -1369,12 +1408,12 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) { return 1; } int n_mmproj_embd = ctx->n_embd_text; - ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd); + ctx->out_embd.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd); bool ok = clip_image_batch_encode( ctx->ctx_a, ctx->n_threads, &chunk->tokens_audio->batch_f32, - ctx->image_embd_v.data()); + ctx->out_embd.data()); return ok ? 0 : 1; } @@ -1390,7 +1429,7 @@ int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tok } auto proj_type = clip_get_projector_type(ctx_clip); int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip); - ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd); + ctx->out_embd.resize(image_tokens->n_tokens() * n_mmproj_embd); bool ok = false; if (clip_is_llava(ctx_clip) @@ -1414,7 +1453,7 @@ int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tok ctx_clip, ctx->n_threads, entries[i].get(), - ctx->image_embd_v.data() + offset); + ctx->out_embd.data() + offset); offset += static_cast(n_mmproj_embd) * n_tokens_per_image; } } else { @@ -1426,7 +1465,7 @@ int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tok ctx_clip, ctx->n_threads, &image_tokens->batch_f32, - ctx->image_embd_v.data()); + ctx->out_embd.data()); } return ok ? 0 : 1; @@ -1442,7 +1481,7 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) } float * mtmd_get_output_embd(mtmd_context * ctx) { - return ctx->image_embd_v.data(); + return ctx->out_embd.data(); } mtmd_batch * mtmd_batch_init(mtmd_context * ctx) { @@ -1456,36 +1495,81 @@ void mtmd_batch_free(mtmd_batch * batch) { } int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk) { - batch->entries.push_back(chunk); - if (batch->entries.size() > 4) { - return 1; // DEMO ONLY + if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) { + LOG_ERR("%s: text chunk is not supported in batch\n", __func__); + return 1; } - return 0; + + if (batch->entries.empty()) { + // batch must have at least one chunk + batch->entries.push_back(chunk); + return 0; + } + + int32_t new_n_tokens = batch->n_tokens() + (int32_t)mtmd_input_chunk_get_n_tokens(chunk); + if (new_n_tokens > batch->ctx->batch_max_tokens) { + return 2; // "batch too large" error code + } + + auto & first_chunk = batch->entries[0]; + if (first_chunk->can_batch_with(*chunk)) { + batch->entries.push_back(chunk); + return 0; + } + + return 3; // "cannot batch" error code } int32_t mtmd_batch_encode(mtmd_batch * batch) { + if (batch->entries.empty()) { + LOG_ERR("%s: batch is empty\n", __func__); + return 1; + } + // allocate output_embd size_t n_embd = 0; for (const auto * chunk : batch->entries) { + if (chunk->is_placeholder()) { + LOG_ERR("%s: chunk is placeholder\n", __func__); + return 1; + } n_embd += mtmd_input_chunk_get_n_tokens(chunk) * batch->ctx->n_embd_text; } batch->output_embd.resize(n_embd); - // TODO @ngxson : this is just for testing if the public API works; it is not true batching - size_t offset = 0; - for (const auto * chunk : batch->entries) { - int32_t res = mtmd_encode_chunk(batch->ctx, chunk); - if (res != 0) { - return res; + // represent the whole batch as one single chunk + mtmd::input_chunk_ptr batch_chunk(mtmd_input_chunk_copy(batch->entries[0])); + if (batch_chunk->tokens_image) { + auto & b0_f32 = batch_chunk->tokens_image->batch_f32; + for (const auto * chunk : batch->entries) { + auto b1_f32 = chunk->tokens_image->batch_f32.clone(); + for (size_t i = 0; i < b1_f32.entries.size(); i++) { + b0_f32.entries.push_back(std::move(b1_f32.entries[i])); + } } - size_t len = mtmd_input_chunk_get_n_tokens(chunk) * batch->ctx->n_embd_text; - memcpy( - batch->output_embd.data() + offset, - mtmd_get_output_embd(batch->ctx), - len * sizeof(float)); - offset += len; + } else { + LOG_ERR("%s: unsupported chunk type\n", __func__); + return 1; } + LOG_DBG("%s: encoding batch with %zu entries and total %zu tokens\n", + __func__, batch->entries.size(), mtmd_input_chunk_get_n_tokens(batch_chunk.get())); + int32_t res = mtmd_encode_chunk(batch->ctx, batch_chunk.get()); + if (res != 0) { + return res; + } + + if (batch->ctx->out_embd.size() != batch->output_embd.size()) { + LOG_ERR("%s: output embedding size mismatch: expected %zu, got %zu\n", + __func__, batch->output_embd.size(), batch->ctx->out_embd.size()); + return 1; + } + + memcpy( + batch->output_embd.data(), + batch->ctx->out_embd.data(), + batch->output_embd.size() * sizeof(float)); + return 0; } diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index 927b2d28dbd0..66dda62e9d51 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -99,6 +99,11 @@ struct mtmd_context_params { // callback function passed over to mtmd proper ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; + + // batching params + int32_t batch_max_tokens; // maximum number of output tokens in a batch + // (note: this is not a hard-limit, the first image will always be added even if it exceeds this limit) + // (default: 2048) }; MTMD_API const char * mtmd_default_marker(void); diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index 34a20c9d22de..ee54f2695b6e 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -113,7 +113,7 @@ bool server_http_context::init(const common_params & params) { #endif srv->set_default_headers({{"Server", "llama.cpp"}}); - srv->set_logger(log_server_request); + // srv->set_logger(log_server_request); // TODO @ngxson : this is too spamy, no very useful; improve it in the future srv->set_exception_handler([](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) { // this is fail-safe; exceptions should already handled by `ex_wrapper` From f77cfd73a2db764554941e652d24267ccea8cbba Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 11 Jun 2026 16:10:38 +0200 Subject: [PATCH 03/13] first working version (gemma4v) --- tools/mtmd/clip.cpp | 22 ++++++++---- tools/mtmd/clip.h | 4 +-- tools/mtmd/models/gemma4v.cpp | 2 -- tools/mtmd/mtmd.cpp | 64 ++++++++++++++++++++--------------- 4 files changed, 53 insertions(+), 39 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 45a90f2f0d33..8ca66ac5c46b 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3484,16 +3484,16 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im return n_patches; } -bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { +bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, std::vector & out_vec) { clip_image_f32_batch imgs; clip_image_f32_ptr img_copy(clip_image_f32_init()); *img_copy = *img; imgs.entries.push_back(std::move(img_copy)); - return clip_image_batch_encode(ctx, n_threads, &imgs, vec); + return clip_image_batch_encode(ctx, n_threads, &imgs, out_vec); } -bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) { +bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, std::vector & out_batch_embd) { const clip_image_f32_batch & imgs = *imgs_c_ptr; int n_batch_cur = imgs.entries.size(); @@ -4410,7 +4410,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // the last node is the embedding tensor ggml_tensor * embeddings = ggml_graph_node(gf, -1); - // sanity check (only support batch size of 1 for now) + // sanity check (assuming that all images in batch have the same number of tokens, so we only check the first one) const int n_tokens_out = embeddings->ne[1]; const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get()); if (n_tokens_out != expected_n_tokens_out) { @@ -4418,9 +4418,17 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima GGML_ABORT("Invalid number of output tokens"); } - // copy the embeddings to the location passed by the user - if (vec != nullptr) { - ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); + LOG_DBG("%s: output embedding shape [%d, %d, %d]\n", __func__, + (int)embeddings->ne[0], (int)embeddings->ne[1], (int)embeddings->ne[2]); + + // copy output to user buffer if provided + // if output is empty, skip the copy + if (!out_batch_embd.empty()) { + if (out_batch_embd.size() != (size_t)ggml_nelements(embeddings)) { + LOG_ERR("%s: output buffer has %zu elements but expected %zu\n", __func__, out_batch_embd.size(), (size_t)ggml_nelements(embeddings)); + GGML_ABORT("Output buffer size mismatch"); + } + ggml_backend_tensor_get(embeddings, out_batch_embd.data(), 0, ggml_nbytes(embeddings)); } // Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 18c7a1d1a7c4..e88d9ab202f6 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -97,8 +97,8 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data -bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); -bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); +bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, std::vector & out_vec); +bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, std::vector & out_batch_embd); bool clip_is_llava(const struct clip_ctx * ctx); // note for contributor: this clip_is_(model) pattern is deprecated diff --git a/tools/mtmd/models/gemma4v.cpp b/tools/mtmd/models/gemma4v.cpp index b4aadc4e16b9..87cbd43fc5fd 100644 --- a/tools/mtmd/models/gemma4v.cpp +++ b/tools/mtmd/models/gemma4v.cpp @@ -111,8 +111,6 @@ ggml_cgraph * clip_graph_gemma4v::build() { kernel_size, kernel_size, kernel_size, kernel_size, 0, 0); const int out_x = n_patches_x / kernel_size; const int out_y = n_patches_y / kernel_size; - printf("before pooling: n_patches_x=%d, n_patches_y=%d, after pooling: out_x=%d, out_y=%d\n", n_patches_x, n_patches_y, out_x, out_y); - printf("after pooling: ne[0]=%lld, ne[1]=%lld, ne[2]=%lld, ne[3]=%lld\n", cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]); // [out_x, out_y, n_embd, n_batch] -> [n_embd, out_x * out_y, n_batch] cur = ggml_reshape_3d(ctx0, cur, out_x * out_y, n_embd, n_batch); cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 87f02dbeb937..7acdc17959d1 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -726,6 +726,16 @@ struct mtmd_context { return ctx_a ? clip_get_projector_type(ctx_a) : PROJECTOR_TYPE_UNKNOWN; } + int64_t n_embd_out() const { + if (ctx_v) { + return clip_n_mmproj_embd(ctx_v); + } else if (ctx_a) { + return clip_n_mmproj_embd(ctx_a); + } else { + throw std::runtime_error("no CLIP model loaded"); + } + } + ~mtmd_context() { clip_free(ctx_a); clip_free(ctx_v); @@ -1374,7 +1384,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx, } // forward declaration -int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens); +int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector * out_batch_embd = nullptr); int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) { if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) { @@ -1407,13 +1417,13 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) { LOG_ERR("%s: audio tokens batch is placeholder\n", __func__); return 1; } - int n_mmproj_embd = ctx->n_embd_text; + int n_mmproj_embd = ctx->n_embd_out(); ctx->out_embd.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd); bool ok = clip_image_batch_encode( ctx->ctx_a, ctx->n_threads, &chunk->tokens_audio->batch_f32, - ctx->out_embd.data()); + ctx->out_embd); return ok ? 0 : 1; } @@ -1421,7 +1431,7 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) { return 1; } -int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) { +int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector * out_batch_embd) { clip_ctx * ctx_clip = ctx->ctx_v; if (!ctx_clip) { LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__); @@ -1429,7 +1439,14 @@ int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tok } auto proj_type = clip_get_projector_type(ctx_clip); int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip); - ctx->out_embd.resize(image_tokens->n_tokens() * n_mmproj_embd); + std::vector & out_embd = ctx->out_embd; + if (out_batch_embd) { + // caller need to resize out_batch_embd + out_embd = *out_batch_embd; + } else { + ctx->out_embd.resize(image_tokens->n_tokens() * n_mmproj_embd); + out_embd = ctx->out_embd; + } bool ok = false; if (clip_is_llava(ctx_clip) @@ -1449,11 +1466,13 @@ int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tok return 1; } int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get()); + std::vector tmp_embd(n_tokens_per_image * n_mmproj_embd); ok = clip_image_encode( ctx_clip, ctx->n_threads, entries[i].get(), - ctx->out_embd.data() + offset); + tmp_embd); + std::copy(tmp_embd.begin(), tmp_embd.end(), out_embd.begin() + offset); offset += static_cast(n_mmproj_embd) * n_tokens_per_image; } } else { @@ -1465,7 +1484,7 @@ int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tok ctx_clip, ctx->n_threads, &image_tokens->batch_f32, - ctx->out_embd.data()); + out_embd); } return ok ? 0 : 1; @@ -1533,7 +1552,7 @@ int32_t mtmd_batch_encode(mtmd_batch * batch) { LOG_ERR("%s: chunk is placeholder\n", __func__); return 1; } - n_embd += mtmd_input_chunk_get_n_tokens(chunk) * batch->ctx->n_embd_text; + n_embd += mtmd_input_chunk_get_n_tokens(chunk) * batch->ctx->n_embd_out(); } batch->output_embd.resize(n_embd); @@ -1541,7 +1560,11 @@ int32_t mtmd_batch_encode(mtmd_batch * batch) { mtmd::input_chunk_ptr batch_chunk(mtmd_input_chunk_copy(batch->entries[0])); if (batch_chunk->tokens_image) { auto & b0_f32 = batch_chunk->tokens_image->batch_f32; - for (const auto * chunk : batch->entries) { + // copy all entries from other chunks into the first chunk's batch_f32 + // note: skip first entry because it's already in batch_chunk + for (size_t ic = 1; ic < batch->entries.size(); ic++) { + auto & chunk = batch->entries[ic]; + GGML_ASSERT(chunk->tokens_image); auto b1_f32 = chunk->tokens_image->batch_f32.clone(); for (size_t i = 0; i < b1_f32.entries.size(); i++) { b0_f32.entries.push_back(std::move(b1_f32.entries[i])); @@ -1554,23 +1577,8 @@ int32_t mtmd_batch_encode(mtmd_batch * batch) { LOG_DBG("%s: encoding batch with %zu entries and total %zu tokens\n", __func__, batch->entries.size(), mtmd_input_chunk_get_n_tokens(batch_chunk.get())); - int32_t res = mtmd_encode_chunk(batch->ctx, batch_chunk.get()); - if (res != 0) { - return res; - } - - if (batch->ctx->out_embd.size() != batch->output_embd.size()) { - LOG_ERR("%s: output embedding size mismatch: expected %zu, got %zu\n", - __func__, batch->output_embd.size(), batch->ctx->out_embd.size()); - return 1; - } - - memcpy( - batch->output_embd.data(), - batch->ctx->out_embd.data(), - batch->output_embd.size() * sizeof(float)); - - return 0; + int32_t res = mtmd_encode_impl(batch->ctx, batch_chunk->tokens_image.get(), &batch->output_embd); + return res; } float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk) { @@ -1578,7 +1586,7 @@ float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * for (const auto * c : batch->entries) { size_t offset_prev = offset; size_t n_tokens = mtmd_input_chunk_get_n_tokens(c); - offset += n_tokens * batch->ctx->n_embd_text; + offset += n_tokens * batch->ctx->n_embd_out(); GGML_ASSERT(offset_prev < batch->output_embd.size()); GGML_ASSERT(offset <= batch->output_embd.size()); if (c == chunk) { @@ -1963,7 +1971,7 @@ static void mtmd_debug_encode_impl(mtmd_context * ctx, clip_ctx * ctx_clip, clip ctx_clip, ctx->n_threads, &image, - embd_output.data()); + embd_output); if (!ok) { LOG_ERR("%s: failed to encode image\n", __func__); } From 190bef3b9b78bd698b1b7fdc14b18fc64849f2ae Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 11 Jun 2026 16:56:32 +0200 Subject: [PATCH 04/13] add arg --- common/arg.cpp | 7 +++++++ common/common.h | 1 + tools/mtmd/mtmd.cpp | 2 +- tools/mtmd/mtmd.h | 2 +- tools/server/server-context.cpp | 1 + 5 files changed, 11 insertions(+), 2 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 55795d357d90..8382c1d85dde 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2243,6 +2243,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.image_max_tokens = value; } ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS")); + add_opt(common_arg( + {"--mtmd-batch-max-tokens"}, "N", + string_format("maximum number of image tokens per batch when encoding images (default: %d)", params.mtmd_batch_max_tokens), + [](common_params & params, int value) { + params.mtmd_batch_max_tokens = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MTMD_BATCH_MAX_TOKENS")); if (llama_supports_rpc()) { add_opt(common_arg( {"--rpc"}, "SERVERS", diff --git a/common/common.h b/common/common.h index 4864186f6287..0b284cbb36c7 100644 --- a/common/common.h +++ b/common/common.h @@ -575,6 +575,7 @@ struct common_params { std::vector image; // path to image file(s) ; TODO: change the name to "media" int image_min_tokens = -1; int image_max_tokens = -1; + int mtmd_batch_max_tokens = 1024; // finetune struct lr_opt lr; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 7acdc17959d1..568431e5f518 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -238,7 +238,7 @@ mtmd_context_params mtmd_context_params_default() { /* image_max_tokens */ -1, /* cb_eval */ nullptr, /* cb_eval_user_data */ nullptr, - /* batch_max_tokens */ 2048, + /* batch_max_tokens */ 1024, }; return params; } diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index 66dda62e9d51..2fd149e48069 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -103,7 +103,7 @@ struct mtmd_context_params { // batching params int32_t batch_max_tokens; // maximum number of output tokens in a batch // (note: this is not a hard-limit, the first image will always be added even if it exceeds this limit) - // (default: 2048) + // (default: 1024) }; MTMD_API const char * mtmd_default_marker(void); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index bed047a920bb..d5aa22b5f697 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -875,6 +875,7 @@ struct server_context_impl { mparams.warmup = params_base.warmup; mparams.image_min_tokens = params_base.image_min_tokens; mparams.image_max_tokens = params_base.image_max_tokens; + mparams.batch_max_tokens = params_base.mtmd_batch_max_tokens; mparams.media_marker = get_media_marker(); } From a773d7b84be411e079d20abe5acb24a2e7b72537 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 11 Jun 2026 16:57:07 +0200 Subject: [PATCH 05/13] nits --- tools/mtmd/mtmd-helper.h | 15 --------------- tools/server/server-context.cpp | 2 +- 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/tools/mtmd/mtmd-helper.h b/tools/mtmd/mtmd-helper.h index 8a473b01206f..719aae988568 100644 --- a/tools/mtmd/mtmd-helper.h +++ b/tools/mtmd/mtmd-helper.h @@ -163,21 +163,6 @@ MTMD_API int32_t mtmd_helper_video_read_next(mtmd_helper_video * ctx, namespace mtmd_helper { -// -// batching helpers (C++ only for now) -// - -MTMD_API - -MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, - struct llama_context * lctx, - const mtmd_input_chunk * chunk, - llama_pos n_past, - llama_seq_id seq_id, - int32_t n_batch, - bool logits_last, - llama_pos * new_n_past); - // // C++ wrappers // diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index d5aa22b5f697..c7bf73813bd3 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -614,7 +614,7 @@ struct server_slot { mctx, ctx_tgt, chunk.get(), - embd, + embd, prompt.tokens.pos_next(), id, llama_n_batch(ctx_tgt), From 3eecd674d890a4cd11c74280b1050c629193a66a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 11 Jun 2026 17:42:39 +0200 Subject: [PATCH 06/13] wire up support_batch() --- tools/mtmd/clip-graph.h | 4 ++++ tools/mtmd/clip.cpp | 34 ++++++++++++++++++++++++++------- tools/mtmd/clip.h | 4 +++- tools/mtmd/models/models.h | 1 + tools/mtmd/mtmd.cpp | 11 +++++++++++ tools/server/server-context.cpp | 3 ++- 6 files changed, 48 insertions(+), 9 deletions(-) diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h index 7d10586217b8..c84b32880b5d 100644 --- a/tools/mtmd/clip-graph.h +++ b/tools/mtmd/clip-graph.h @@ -54,6 +54,10 @@ struct clip_graph { virtual ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const; // TODO: build_mm(w, b, x) to support bias + virtual bool support_batch() const { + return false; + } + // // utility functions // diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 8ca66ac5c46b..f3d4f81e44b5 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -171,6 +171,8 @@ struct clip_ctx { std::map mem_usage; std::map mem_compute; + bool support_batch = false; + clip_ctx(clip_context_params & ctx_params) { flash_attn_type = ctx_params.flash_attn_type; no_alloc = ctx_params.no_alloc; @@ -862,7 +864,7 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale return cur; } -static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) { +static std::unique_ptr clip_get_graph_builder(clip_ctx * ctx, const clip_image_f32_batch & imgs) { const clip_image_f32 & img = *imgs.entries[0]; std::unique_ptr builder; @@ -1025,7 +1027,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // TODO [QWEN_VIDEO]: improve this in the future builder->n_batch = imgs.entries.size(); - return builder->build(); + return builder; } // @@ -2819,7 +2821,7 @@ struct clip_model_loader { std::vector ops; }; - static void warmup(clip_ctx & ctx_clip) { + static clip_image_f32_batch get_dummy_batch(clip_ctx & ctx_clip) { // create a fake batch const auto & hparams = ctx_clip.model.hparams; clip_image_f32_batch batch; @@ -2833,6 +2835,20 @@ struct clip_model_loader { LOG_INF("%s: warmup with audio size = %d\n", __func__, hparams.warmup_audio_size); } batch.entries.push_back(std::move(img)); + return batch; + } + + static void init_ctx(clip_ctx & ctx_clip) { + ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); + + // check batching support + auto batch = get_dummy_batch(ctx_clip); + auto builder = clip_get_graph_builder(&ctx_clip, batch); + ctx_clip.support_batch = builder->support_batch(); + } + + static void warmup(clip_ctx & ctx_clip) { + auto batch = get_dummy_batch(ctx_clip); warmup(ctx_clip, batch); } @@ -2905,9 +2921,7 @@ struct clip_model_loader { // only initialize backend buffers, but do not allocate them yet static support_info_graph reserve_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) { - ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); - - ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch); + ggml_cgraph * gf = clip_get_graph_builder(&ctx_clip, batch)->build(); ggml_backend_sched_reserve(ctx_clip.sched.get(), gf); ctx_clip.mem_compute.clear(); @@ -3070,6 +3084,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_vision = new clip_ctx(ctx_params); loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION); loader.load_tensors(*ctx_vision); + loader.init_ctx(*ctx_vision); if (ctx_params.warmup) { loader.warmup(*ctx_vision); } @@ -3083,6 +3098,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_audio = new clip_ctx(ctx_params); loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO); loader.load_tensors(*ctx_audio); + loader.init_ctx(*ctx_audio); if (ctx_params.warmup) { loader.warmup(*ctx_audio); } @@ -3506,7 +3522,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // build the inference graph ggml_backend_sched_reset(ctx->sched.get()); - ggml_cgraph * gf = clip_image_build_graph(ctx, imgs); + ggml_cgraph * gf = clip_get_graph_builder(ctx, imgs)->build(); ggml_backend_sched_alloc_graph(ctx->sched.get(), gf); // set inputs @@ -4572,6 +4588,10 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) { return ctx->model.modality == CLIP_MODALITY_AUDIO; } +bool clip_support_batch(const struct clip_ctx * ctx) { + return ctx->support_batch; +} + // TODO @ngxson : this is no longer true with mtmd_batch API // this was only meant to be used by qwen-vl-based models, to fuse 2 input images into one (qwen-vl video support) // this logic should be refactored in near future to distinctly handle "merge frames" and "batching" diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index e88d9ab202f6..03cf649f41cb 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -107,7 +107,9 @@ bool clip_is_llava(const struct clip_ctx * ctx); bool clip_has_vision_encoder(const struct clip_ctx * ctx); bool clip_has_audio_encoder(const struct clip_ctx * ctx); -int clip_model_n_batch_max(const struct clip_ctx * ctx); +bool clip_support_batch(const struct clip_ctx * ctx); + +int clip_model_n_batch_max(const struct clip_ctx * ctx); // TODO @ngxson : remove, refactor this std::map clip_get_mem_usage(const struct clip_ctx * ctx); diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index 12082a5280a8..3a15f76829b9 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -16,6 +16,7 @@ struct clip_graph_gemma4v : clip_graph { clip_graph_gemma4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override; + bool support_batch() const override { return true; } }; struct clip_graph_gemma4uv : clip_graph { diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 568431e5f518..c947bf085b9c 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -1519,12 +1519,23 @@ int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk) return 1; } + auto * ctx = batch->ctx->get_clip_ctx(chunk); + if (!ctx) { + LOG_ERR("%s: model does not support input chunk type %d\n", __func__, (int)chunk->type); + return 1; + } + if (batch->entries.empty()) { // batch must have at least one chunk batch->entries.push_back(chunk); return 0; } + if (!clip_support_batch(ctx)) { + // if no batching support, batch can only have one single chunk + return 2; // "batch too large" error code + } + int32_t new_n_tokens = batch->n_tokens() + (int32_t)mtmd_input_chunk_get_n_tokens(chunk); if (new_n_tokens > batch->ctx->batch_max_tokens) { return 2; // "batch too large" error code diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index c7bf73813bd3..112500b09f75 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -658,10 +658,11 @@ struct server_slot { res = mtmd_batch_add_chunk(mbatch.get(), next_chunk->get()); n_added += (res == 0 ? 1 : 0); idx_cur = next_idx; - SLT_INF(*this, "try adding chunk idx = %zu to batch, res = %d\n", next_idx, res); + SLT_DBG(*this, "try adding media chunk idx = %zu to batch, res = %d\n", next_idx, res); // if res != 0, batch is full or chunk is not compatible -> this loop breaks } + // TODO @ngxson : move this log line to debug when it become more stable SLT_INF(*this, "encoding mtmd batch from idx = %zu, n_chunks = %d\n", idx, n_added); res = mtmd_batch_encode(mbatch.get()); From 7a22484c58b6033feaa84a6751358667bc00a9a8 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 11 Jun 2026 18:03:39 +0200 Subject: [PATCH 07/13] fix 0.0 output embd --- tools/mtmd/clip.cpp | 4 +++- tools/mtmd/mtmd.cpp | 9 +++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index f3d4f81e44b5..b08589831fc2 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -4445,13 +4445,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima GGML_ABORT("Output buffer size mismatch"); } ggml_backend_tensor_get(embeddings, out_batch_embd.data(), 0, ggml_nbytes(embeddings)); + } else { + LOG_WRN("%s: output buffer is empty, skipping copy\n", __func__); } // Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set if (ctx->debug_output_embeddings) { const int64_t n_embd = embeddings->ne[0]; const int64_t n_tokens = embeddings->ne[1]; - std::vector emb_data(n_embd * n_tokens); + std::vector emb_data(ggml_nelements(embeddings)); ggml_backend_tensor_get(embeddings, emb_data.data(), 0, ggml_nbytes(embeddings)); LOG_INF("\n=== MTMD_DEBUG_EMBEDDINGS ===\n"); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index c947bf085b9c..d8844fb84f7d 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -1439,14 +1439,15 @@ int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tok } auto proj_type = clip_get_projector_type(ctx_clip); int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip); - std::vector & out_embd = ctx->out_embd; + std::vector * out_embd_ptr; if (out_batch_embd) { - // caller need to resize out_batch_embd - out_embd = *out_batch_embd; + // IMPORTANT: caller must ensure out_batch_embd has enough capacity + out_embd_ptr = out_batch_embd; } else { ctx->out_embd.resize(image_tokens->n_tokens() * n_mmproj_embd); - out_embd = ctx->out_embd; + out_embd_ptr = &ctx->out_embd; } + std::vector & out_embd = *out_embd_ptr; bool ok = false; if (clip_is_llava(ctx_clip) From 2dd581ae5b91e2e334190ec51718cc54f59f0b63 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 11 Jun 2026 18:14:12 +0200 Subject: [PATCH 08/13] fix audio --- tools/mtmd/mtmd.cpp | 124 ++++++++++++++++++++++++++------------------ 1 file changed, 73 insertions(+), 51 deletions(-) diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index d8844fb84f7d..f99c774d4dcf 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -1383,55 +1383,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx, } } -// forward declaration -int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector * out_batch_embd = nullptr); - -int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) { - if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) { - LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n"); - return 0; - } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { - if (!ctx->ctx_v) { - LOG_ERR("%s: model does not support vision input\n", __func__); - return 1; - } - if (chunk->tokens_image == nullptr) { - LOG_ERR("%s: image tokens are null\n", __func__); - return 1; - } - if (chunk->tokens_image->is_placeholder()) { - LOG_ERR("%s: image tokens batch is placeholder\n", __func__); - return 1; - } - return mtmd_encode_impl(ctx, chunk->tokens_image.get()); - } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) { - if (!ctx->ctx_a) { - LOG_ERR("%s: model does not support audio input\n", __func__); - return 1; - } - if (chunk->tokens_audio == nullptr) { - LOG_ERR("%s: audio tokens are null\n", __func__); - return 1; - } - if (chunk->tokens_audio->is_placeholder()) { - LOG_ERR("%s: audio tokens batch is placeholder\n", __func__); - return 1; - } - int n_mmproj_embd = ctx->n_embd_out(); - ctx->out_embd.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd); - bool ok = clip_image_batch_encode( - ctx->ctx_a, - ctx->n_threads, - &chunk->tokens_audio->batch_f32, - ctx->out_embd); - return ok ? 0 : 1; - } - - LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type); - return 1; -} - -int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector * out_batch_embd) { +static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector * out_batch_embd) { clip_ctx * ctx_clip = ctx->ctx_v; if (!ctx_clip) { LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__); @@ -1491,9 +1443,64 @@ int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tok return ok ? 0 : 1; } +static int32_t mtmd_encode_chunk_impl(mtmd_context * ctx, const mtmd_input_chunk * chunk, std::vector * out_batch_embd = nullptr) { + if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) { + LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n"); + return 0; + } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { + if (!ctx->ctx_v) { + LOG_ERR("%s: model does not support vision input\n", __func__); + return 1; + } + if (chunk->tokens_image == nullptr) { + LOG_ERR("%s: image tokens are null\n", __func__); + return 1; + } + if (chunk->tokens_image->is_placeholder()) { + LOG_ERR("%s: image tokens batch is placeholder\n", __func__); + return 1; + } + return mtmd_encode_impl(ctx, chunk->tokens_image.get(), out_batch_embd); + } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) { + if (!ctx->ctx_a) { + LOG_ERR("%s: model does not support audio input\n", __func__); + return 1; + } + if (chunk->tokens_audio == nullptr) { + LOG_ERR("%s: audio tokens are null\n", __func__); + return 1; + } + if (chunk->tokens_audio->is_placeholder()) { + LOG_ERR("%s: audio tokens batch is placeholder\n", __func__); + return 1; + } + int n_mmproj_embd = ctx->n_embd_out(); + ctx->out_embd.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd); + bool ok = clip_image_batch_encode( + ctx->ctx_a, + ctx->n_threads, + &chunk->tokens_audio->batch_f32, + out_batch_embd ? *out_batch_embd : ctx->out_embd); + return ok ? 0 : 1; + } + + LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type); + return 1; +} + +int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) { + // this is the non-batching version + try { + return mtmd_encode_chunk_impl(ctx, chunk, &ctx->out_embd); + } catch (const std::exception & e) { + LOG_ERR("%s: error: %s\n", __func__, e.what()); + return 1; + } +} + int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) { try { - return mtmd_encode_impl(ctx, image_tokens); + return mtmd_encode_impl(ctx, image_tokens, &ctx->out_embd); } catch (const std::exception & e) { LOG_ERR("%s: error: %s\n", __func__, e.what()); return 1; @@ -1582,6 +1589,18 @@ int32_t mtmd_batch_encode(mtmd_batch * batch) { b0_f32.entries.push_back(std::move(b1_f32.entries[i])); } } + } else if (batch_chunk->tokens_audio) { + auto & b0_f32 = batch_chunk->tokens_audio->batch_f32; + // copy all entries from other chunks into the first chunk's batch_f32 + // note: skip first entry because it's already in batch_chunk + for (size_t ic = 1; ic < batch->entries.size(); ic++) { + auto & chunk = batch->entries[ic]; + GGML_ASSERT(chunk->tokens_audio); + auto b1_f32 = chunk->tokens_audio->batch_f32.clone(); + for (size_t i = 0; i < b1_f32.entries.size(); i++) { + b0_f32.entries.push_back(std::move(b1_f32.entries[i])); + } + } } else { LOG_ERR("%s: unsupported chunk type\n", __func__); return 1; @@ -1589,7 +1608,10 @@ int32_t mtmd_batch_encode(mtmd_batch * batch) { LOG_DBG("%s: encoding batch with %zu entries and total %zu tokens\n", __func__, batch->entries.size(), mtmd_input_chunk_get_n_tokens(batch_chunk.get())); - int32_t res = mtmd_encode_impl(batch->ctx, batch_chunk->tokens_image.get(), &batch->output_embd); + int32_t res = mtmd_encode_chunk_impl( + batch->ctx, + batch_chunk.get(), + &batch->output_embd); return res; } From de656cc356135612a6c6719d14b3f07d93351aa1 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 11 Jun 2026 18:43:06 +0200 Subject: [PATCH 09/13] nits --- tools/mtmd/mtmd.cpp | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index f99c774d4dcf..c8850951c484 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -174,9 +174,9 @@ struct mtmd_input_chunk { bool is_placeholder() const { if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { - return tokens_image->is_placeholder(); + return tokens_image && tokens_image->is_placeholder(); } else if (type == MTMD_INPUT_CHUNK_TYPE_AUDIO) { - return tokens_audio->is_placeholder(); + return tokens_audio && tokens_audio->is_placeholder(); } return false; } @@ -1393,7 +1393,7 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip); std::vector * out_embd_ptr; if (out_batch_embd) { - // IMPORTANT: caller must ensure out_batch_embd has enough capacity + // IMPORTANT: caller must ensure out_batch_embd has enough capacity; clip_image_encode will check for it out_embd_ptr = out_batch_embd; } else { ctx->out_embd.resize(image_tokens->n_tokens() * n_mmproj_embd); @@ -1420,11 +1420,16 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im } int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get()); std::vector tmp_embd(n_tokens_per_image * n_mmproj_embd); - ok = clip_image_encode( + bool ok_i = clip_image_encode( ctx_clip, ctx->n_threads, entries[i].get(), tmp_embd); + if (!ok_i) { + LOG_ERR("%s: failed to encode image %zu\n", __func__, i); + return 1; + } + ok = true; std::copy(tmp_embd.begin(), tmp_embd.end(), out_embd.begin() + offset); offset += static_cast(n_mmproj_embd) * n_tokens_per_image; } @@ -1558,7 +1563,7 @@ int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk) return 3; // "cannot batch" error code } -int32_t mtmd_batch_encode(mtmd_batch * batch) { +static int32_t mtmd_batch_encode_impl(mtmd_batch * batch) { if (batch->entries.empty()) { LOG_ERR("%s: batch is empty\n", __func__); return 1; @@ -1615,19 +1620,33 @@ int32_t mtmd_batch_encode(mtmd_batch * batch) { return res; } +int32_t mtmd_batch_encode(mtmd_batch * batch) { + try { + return mtmd_batch_encode_impl(batch); + } catch (const std::exception & e) { + LOG_ERR("%s: error: %s\n", __func__, e.what()); + return 1; + } +} + float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk) { + if (batch->output_embd.empty()) { + LOG_ERR("%s: batch has not been encoded yet\n", __func__); + return nullptr; + } size_t offset = 0; + const size_t n_embd = batch->ctx->n_embd_out(); for (const auto * c : batch->entries) { size_t offset_prev = offset; size_t n_tokens = mtmd_input_chunk_get_n_tokens(c); - offset += n_tokens * batch->ctx->n_embd_out(); + offset += n_tokens * n_embd; GGML_ASSERT(offset_prev < batch->output_embd.size()); GGML_ASSERT(offset <= batch->output_embd.size()); if (c == chunk) { return &batch->output_embd.data()[offset_prev]; } } - return nullptr; + return nullptr; // not found } bool mtmd_decode_use_non_causal(const mtmd_context * ctx, const mtmd_input_chunk * chunk) { From 67d433505da770e70ed137d4a95771a0b5725a45 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 11 Jun 2026 19:09:36 +0200 Subject: [PATCH 10/13] refactor a bit --- tools/mtmd/clip.cpp | 4 +-- tools/mtmd/clip.h | 2 +- tools/mtmd/mtmd.cpp | 69 +++++++++++++++++++++++++-------------------- 3 files changed, 41 insertions(+), 34 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index b08589831fc2..1028006505ba 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -4594,10 +4594,10 @@ bool clip_support_batch(const struct clip_ctx * ctx) { return ctx->support_batch; } -// TODO @ngxson : this is no longer true with mtmd_batch API +// TODO @ngxson : this is no longer correct with mtmd_batch API // this was only meant to be used by qwen-vl-based models, to fuse 2 input images into one (qwen-vl video support) // this logic should be refactored in near future to distinctly handle "merge frames" and "batching" -int clip_model_n_batch_max(const struct clip_ctx * ctx) { +int clip_model_n_temporal_merge(const struct clip_ctx * ctx) { switch (ctx->proj_type()) { case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 03cf649f41cb..7197af8569e7 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -109,7 +109,7 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx); bool clip_support_batch(const struct clip_ctx * ctx); -int clip_model_n_batch_max(const struct clip_ctx * ctx); // TODO @ngxson : remove, refactor this +int clip_model_n_temporal_merge(const struct clip_ctx * ctx); // TODO @ngxson : remove, refactor this std::map clip_get_mem_usage(const struct clip_ctx * ctx); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index c8850951c484..9131012d9841 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -69,8 +69,8 @@ struct mtmd_bitmap { return data.size(); } - bool can_batch_with(const mtmd_bitmap & other) const { - // [QWEN_VIDEO] can batch if both are images with same size + bool can_merge_with(const mtmd_bitmap & other) const { + // [QWEN_VIDEO] can (temporal) merge if both are images with same size return !is_audio && !other.is_audio && nx == other.nx && ny == other.ny; } @@ -90,12 +90,24 @@ struct mtmd_image_tokens { uint32_t ny = 0; // number of tokens in y direction mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL; uint32_t image_idx = 0; // 0-based position of this image among image chunks in the prompt(used by pos == MTMD_POS_TYPE_HUNYUANVL) + uint32_t n_temporal_merge = 1; // for qwen-vl style temporal merge uint32_t n_tokens() const { if (pos == MTMD_POS_TYPE_HUNYUANVL) { // [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI] return (nx + 1) * ny + 2; } - return nx * ny; + // [QWEN_VIDEO] this logic is quite ugly, it's mostly to make qwen-vl temporal merge work, can be improved in the future + if (batch_f32.entries.size() == 1) { + return nx * ny; + } + uint32_t nz = batch_f32.entries.size(); + // TODO: simplify this by repeating the last frame until it fits the temporal merge + if (nz % n_temporal_merge != 0) { + nz = nz / n_temporal_merge + 1; + } else { + nz = nz / n_temporal_merge; + } + return nx * ny * nz; } clip_image_f32_batch batch_f32; // preprocessed image patches std::string id; // optional user-defined ID, useful for KV cache tracking @@ -120,6 +132,7 @@ struct mtmd_image_tokens { ny, pos, image_idx, + n_temporal_merge, batch_f32.clone(), id }; @@ -901,7 +914,7 @@ struct mtmd_tokenizer { // [QWEN_VIDEO] handle frame merging for models that support it (i.e. qwen-vl) int n_merge_frames = 1; if (ctx->ctx_v) { - n_merge_frames = clip_model_n_batch_max(ctx->ctx_v); + n_merge_frames = clip_model_n_temporal_merge(ctx->ctx_v); GGML_ASSERT(n_merge_frames <= 2 && "we only support merging maximum 2 images for now; open an issue if this model supports merging more"); } @@ -916,7 +929,7 @@ struct mtmd_tokenizer { if (i + 1 < parts.size() && parts[i + 1].bitmap != nullptr) { const mtmd_bitmap * bm_a = parts[i].bitmap; const mtmd_bitmap * bm_b = parts[i + 1].bitmap; - if (bm_a->can_batch_with(*bm_b)) { + if (bm_a->can_merge_with(*bm_b)) { LOG_DBG("%s: merging 2 frames at part index %zu and %zu\n", __func__, i, i + 1); merged_bitmaps.push_back({bm_a, bm_b}); parts.erase(parts.begin() + i + 1); // collapse the second bitmap part @@ -1159,13 +1172,17 @@ struct mtmd_tokenizer { size_t n_tokens = 0; for (const auto & e : batch_f32.entries) { n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get()); - if (clip_model_n_batch_max(ctx->ctx_v) == 2) { + if (clip_model_n_temporal_merge(ctx->ctx_v) == 2) { // [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image break; } } mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); + + // [QWEN_VIDEO] improve this in the future + image_tokens->n_temporal_merge = clip_model_n_temporal_merge(ctx->ctx_v); + if (mtmd_decode_use_mrope(ctx)) { // for Qwen2VL, we need this information for M-RoPE decoding positions image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get()); @@ -1383,23 +1400,18 @@ int32_t mtmd_tokenize(mtmd_context * ctx, } } -static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector * out_batch_embd) { +static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector & out_embd) { clip_ctx * ctx_clip = ctx->ctx_v; if (!ctx_clip) { LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__); return 1; } auto proj_type = clip_get_projector_type(ctx_clip); - int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip); - std::vector * out_embd_ptr; - if (out_batch_embd) { - // IMPORTANT: caller must ensure out_batch_embd has enough capacity; clip_image_encode will check for it - out_embd_ptr = out_batch_embd; - } else { - ctx->out_embd.resize(image_tokens->n_tokens() * n_mmproj_embd); - out_embd_ptr = &ctx->out_embd; - } - std::vector & out_embd = *out_embd_ptr; + + int n_embd_out = ctx->n_embd_out(); + auto n_tokens_out = image_tokens->n_tokens(); + out_embd.resize((size_t)n_embd_out * n_tokens_out); + bool ok = false; if (clip_is_llava(ctx_clip) @@ -1419,7 +1431,7 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im return 1; } int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get()); - std::vector tmp_embd(n_tokens_per_image * n_mmproj_embd); + std::vector tmp_embd(n_tokens_per_image * n_embd_out); bool ok_i = clip_image_encode( ctx_clip, ctx->n_threads, @@ -1431,7 +1443,7 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im } ok = true; std::copy(tmp_embd.begin(), tmp_embd.end(), out_embd.begin() + offset); - offset += static_cast(n_mmproj_embd) * n_tokens_per_image; + offset += static_cast(n_embd_out) * n_tokens_per_image; } } else { if (image_tokens->is_placeholder()) { @@ -1448,7 +1460,7 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im return ok ? 0 : 1; } -static int32_t mtmd_encode_chunk_impl(mtmd_context * ctx, const mtmd_input_chunk * chunk, std::vector * out_batch_embd = nullptr) { +static int32_t mtmd_encode_chunk_impl(mtmd_context * ctx, const mtmd_input_chunk * chunk, std::vector & out_embd) { if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) { LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n"); return 0; @@ -1465,7 +1477,7 @@ static int32_t mtmd_encode_chunk_impl(mtmd_context * ctx, const mtmd_input_chunk LOG_ERR("%s: image tokens batch is placeholder\n", __func__); return 1; } - return mtmd_encode_impl(ctx, chunk->tokens_image.get(), out_batch_embd); + return mtmd_encode_impl(ctx, chunk->tokens_image.get(), out_embd); } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) { if (!ctx->ctx_a) { LOG_ERR("%s: model does not support audio input\n", __func__); @@ -1480,12 +1492,12 @@ static int32_t mtmd_encode_chunk_impl(mtmd_context * ctx, const mtmd_input_chunk return 1; } int n_mmproj_embd = ctx->n_embd_out(); - ctx->out_embd.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd); + out_embd.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd); bool ok = clip_image_batch_encode( ctx->ctx_a, ctx->n_threads, &chunk->tokens_audio->batch_f32, - out_batch_embd ? *out_batch_embd : ctx->out_embd); + out_embd); return ok ? 0 : 1; } @@ -1496,7 +1508,7 @@ static int32_t mtmd_encode_chunk_impl(mtmd_context * ctx, const mtmd_input_chunk int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) { // this is the non-batching version try { - return mtmd_encode_chunk_impl(ctx, chunk, &ctx->out_embd); + return mtmd_encode_chunk_impl(ctx, chunk, ctx->out_embd); } catch (const std::exception & e) { LOG_ERR("%s: error: %s\n", __func__, e.what()); return 1; @@ -1505,7 +1517,7 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) { int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) { try { - return mtmd_encode_impl(ctx, image_tokens, &ctx->out_embd); + return mtmd_encode_impl(ctx, image_tokens, ctx->out_embd); } catch (const std::exception & e) { LOG_ERR("%s: error: %s\n", __func__, e.what()); return 1; @@ -1568,17 +1580,12 @@ static int32_t mtmd_batch_encode_impl(mtmd_batch * batch) { LOG_ERR("%s: batch is empty\n", __func__); return 1; } - - // allocate output_embd - size_t n_embd = 0; for (const auto * chunk : batch->entries) { if (chunk->is_placeholder()) { LOG_ERR("%s: chunk is placeholder\n", __func__); return 1; } - n_embd += mtmd_input_chunk_get_n_tokens(chunk) * batch->ctx->n_embd_out(); } - batch->output_embd.resize(n_embd); // represent the whole batch as one single chunk mtmd::input_chunk_ptr batch_chunk(mtmd_input_chunk_copy(batch->entries[0])); @@ -1616,7 +1623,7 @@ static int32_t mtmd_batch_encode_impl(mtmd_batch * batch) { int32_t res = mtmd_encode_chunk_impl( batch->ctx, batch_chunk.get(), - &batch->output_embd); + batch->output_embd); return res; } From 0d6bc77df748223357aa50f56f485a361515bb67 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 11 Jun 2026 19:32:42 +0200 Subject: [PATCH 11/13] nits --- tools/mtmd/clip.cpp | 6 +++++- tools/mtmd/mtmd.cpp | 4 ++-- tools/server/server-context.cpp | 4 ++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 1028006505ba..603f95d11723 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3513,7 +3513,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const clip_image_f32_batch & imgs = *imgs_c_ptr; int n_batch_cur = imgs.entries.size(); - // TODO: check batching condition + // [QWEN_VIDEO] for video models, the batch dimension is used as temporal dimension for merged frames + if (!ctx->support_batch && n_batch_cur > clip_model_n_temporal_merge(ctx)) { + LOG_ERR("%s: batch size %d exceeds maximum supported batch/temporal-merge size %d\n", __func__, n_batch_cur, clip_model_n_temporal_merge(ctx)); + return false; + } // if buffers are not allocated, we need to do a warmup run to allocate them if (!ctx->is_allocated) { diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 9131012d9841..5a0bb982346c 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -1431,7 +1431,7 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im return 1; } int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get()); - std::vector tmp_embd(n_tokens_per_image * n_embd_out); + std::vector tmp_embd((size_t)n_tokens_per_image * n_embd_out); bool ok_i = clip_image_encode( ctx_clip, ctx->n_threads, @@ -1492,7 +1492,7 @@ static int32_t mtmd_encode_chunk_impl(mtmd_context * ctx, const mtmd_input_chunk return 1; } int n_mmproj_embd = ctx->n_embd_out(); - out_embd.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd); + out_embd.resize((size_t)chunk->tokens_audio->n_tokens * n_mmproj_embd); bool ok = clip_image_batch_encode( ctx->ctx_a, ctx->n_threads, diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 112500b09f75..595fc219b28c 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -612,12 +612,12 @@ struct server_slot { llama_pos new_n_past; // unused for now res = mtmd_helper_decode_image_chunk( mctx, - ctx_tgt, + lctx, chunk.get(), embd, prompt.tokens.pos_next(), id, - llama_n_batch(ctx_tgt), + llama_n_batch(lctx), &new_n_past ); if (res != 0) { From b3a5ca93c6e7b8f2ddbc9e2b033b79b60faa1567 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 12 Jun 2026 19:11:58 +0200 Subject: [PATCH 12/13] fix non-batching case --- tools/mtmd/mtmd.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 5a0bb982346c..8e839ef8f46a 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -97,7 +97,7 @@ struct mtmd_image_tokens { return (nx + 1) * ny + 2; } // [QWEN_VIDEO] this logic is quite ugly, it's mostly to make qwen-vl temporal merge work, can be improved in the future - if (batch_f32.entries.size() == 1) { + if (batch_f32.entries.size() == 1 || n_temporal_merge == 1) { return nx * ny; } uint32_t nz = batch_f32.entries.size(); From 4cf7759921e8ad89b84450d0a3e2f9e3d71ea02c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 12 Jun 2026 20:52:22 +0200 Subject: [PATCH 13/13] fix comment --- tools/mtmd/clip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 603f95d11723..208486fd153b 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -316,7 +316,7 @@ ggml_tensor * clip_graph::build_vit( std::function add_pos, const build_vit_opts & opts ) { - // batch dim: inp is [n_embd, n_pos] (B==1) or [n_embd, n_pos, B] (multi-tile encode) + // batch dim: inp is [n_embd, n_pos, B] const int64_t B = inp->ne[2]; if (learned_pos_embd) {