From b62c30514389d776d951c6d86b5de0f6311504a0 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Wed, 10 Jun 2026 01:00:14 +0200
Subject: [PATCH 01/13] mtmd: add batching API

---
 tools/mtmd/mtmd-helper.h        |  28 ++++++--
 tools/mtmd/mtmd.cpp             |  82 +++++++++++++++++++++-
 tools/mtmd/mtmd.h               |  35 +++++++--
 tools/server/server-common.cpp  |   8 +++
 tools/server/server-common.h    |   4 ++
 tools/server/server-context.cpp | 121 ++++++++++++++++++++++++++++----
 6 files changed, 252 insertions(+), 26 deletions(-)
diff --git a/tools/mtmd/mtmd-helper.h b/tools/mtmd/mtmd-helper.h
index 164b7c6689d9..8a473b01206f 100644
--- a/tools/mtmd/mtmd-helper.h
+++ b/tools/mtmd/mtmd-helper.h
@@ -67,8 +67,8 @@ MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image,
 
 // helper function that automatically:
 // 1. run llama_decode() on text chunks
-// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
-// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
+// 2. run mtmd_encode_chunk() on image chunks, then mtmd_get_output_embd() and then llama_decode()
+// if any of the mtmd_encode_chunk() or llama_decode() calls return non-zero, stop and forward the error
 // otherwise, returns 0 on success
 // this function is NOT thread-safe
 MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
@@ -157,12 +157,30 @@ MTMD_API int32_t mtmd_helper_video_read_next(mtmd_helper_video * ctx,
 } // extern "C"
 #endif
 
+#ifdef __cplusplus
+#include <set>
+#include <memory>
+
+namespace mtmd_helper {
+
 //
-// C++ wrappers
+// batching helpers (C++ only for now)
 //
 
-#ifdef __cplusplus
-namespace mtmd_helper {
+// presumably the single-chunk counterpart of mtmd_helper_eval_chunks() - TODO confirm
+// new_n_past is an output pointer (assumed to receive the position after the chunk - TODO confirm)
+MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
+        struct llama_context * lctx,
+        const mtmd_input_chunk * chunk,
+        llama_pos n_past,
+        llama_seq_id seq_id,
+        int32_t n_batch,
+        bool logits_last,
+        llama_pos * new_n_past);
+
+//
+// C++ wrappers
+//
 
 // video-related C++ wrappers
 struct mtmd_helper_video_deleter {
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 4140a3c4aa03..96bb7e91327b 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -159,6 +159,13 @@ struct mtmd_input_chunks {
     std::vector<mtmd_input_chunk> entries;
 };
 
+struct mtmd_batch {
+    mtmd_context * ctx;
+    std::vector<const mtmd_input_chunk *> entries;
+    std::vector<float> output_embd; // aggregated output embedding for the whole batch
+    mtmd_batch(mtmd_context * ctx): ctx(ctx) {}
+};
+
 // slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings
 // models not having it (llava-1.6) will process embeddings without any special tokens in-between
 enum mtmd_slice_tmpl {
@@ -1327,6 +1334,9 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
     }
 }
 
+// forward declaration
+int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens);
+
 int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
     if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
         LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
@@ -1344,7 +1354,7 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
             LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
             return 1;
         }
-        return mtmd_encode(ctx, chunk->tokens_image.get());
+        return mtmd_encode_impl(ctx, chunk->tokens_image.get());
     } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
         if (!ctx->ctx_a) {
             LOG_ERR("%s: model does not support audio input\n", __func__);
@@ -1372,7 +1382,7 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
     return 1;
 }
 
-int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
+int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
     clip_ctx * ctx_clip = ctx->ctx_v;
     if (!ctx_clip) {
         LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
@@ -1422,10 +1432,78 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
     return ok ? 0 : 1;
 }
 
+int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
+    try {
+        return mtmd_encode_impl(ctx, image_tokens);
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: error: %s\n", __func__, e.what());
+        return 1;
+    }
+}
+
 float * mtmd_get_output_embd(mtmd_context * ctx) {
     return ctx->image_embd_v.data();
 }
 
+mtmd_batch * mtmd_batch_init(mtmd_context * ctx) {
+    return new mtmd_batch(ctx);
+}
+
+void mtmd_batch_free(mtmd_batch * batch) {
+    if (batch) {
+        delete batch;
+    }
+}
+
+int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk) {
+    batch->entries.push_back(chunk);
+    if (batch->entries.size() > 4) {
+        return 1; // DEMO ONLY
+    }
+    return 0;
+}
+
+int32_t mtmd_batch_encode(mtmd_batch * batch) {
+    // allocate output_embd
+    size_t n_embd = 0;
+    for (const auto * chunk : batch->entries) {
+        n_embd += mtmd_input_chunk_get_n_tokens(chunk) * batch->ctx->n_embd_text;
+    }
+    batch->output_embd.resize(n_embd);
+
+    // TODO @ngxson : this is just for testing if the public API works; it is not true batching
+    size_t offset = 0;
+    for (const auto * chunk : batch->entries) {
+        int32_t res = mtmd_encode_chunk(batch->ctx, chunk);
+        if (res != 0) {
+            return res;
+        }
+        size_t len = mtmd_input_chunk_get_n_tokens(chunk) * batch->ctx->n_embd_text;
+        memcpy(
+            batch->output_embd.data() + offset,
+            mtmd_get_output_embd(batch->ctx),
+            len * sizeof(float));
+        offset += len;
+    }
+
+    return 0;
+}
+
+float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk) {
+    size_t offset = 0;
+    for (const auto * c : batch->entries) {
+        size_t offset_prev = offset;
+        size_t n_tokens = mtmd_input_chunk_get_n_tokens(c);
+        offset += n_tokens * batch->ctx->n_embd_text;
+        GGML_ASSERT(offset_prev <  batch->output_embd.size());
+        GGML_ASSERT(offset      <= batch->output_embd.size());
+        if (c == chunk) {
+            return &batch->output_embd.data()[offset_prev];
+        }
+    }
+    return nullptr;
+}
+
 bool mtmd_decode_use_non_causal(const mtmd_context * ctx, const mtmd_input_chunk * chunk) {
     auto proj_type = ctx->proj_type_v();
     if (chunk && chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index a76a6ec2b882..927b2d28dbd0 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -63,6 +63,7 @@ struct mtmd_bitmap;
 struct mtmd_image_tokens;
 struct mtmd_input_chunk;
 struct mtmd_input_chunks;
+struct mtmd_batch;
 
 struct mtmd_input_text {
     const char * text;
@@ -80,6 +81,7 @@ typedef struct mtmd_image_tokens mtmd_image_tokens;
 typedef struct mtmd_input_chunk  mtmd_input_chunk;
 typedef struct mtmd_input_chunks mtmd_input_chunks;
 typedef struct mtmd_input_text   mtmd_input_text;
+typedef struct mtmd_batch        mtmd_batch;
 
 struct mtmd_context_params {
     bool use_gpu;
@@ -265,12 +267,12 @@ MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
                                const mtmd_bitmap ** bitmaps,
                                size_t n_bitmaps);
 
-// returns 0 on success
-// TODO: deprecate
-MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
-                             const mtmd_image_tokens * image_tokens);
+DEPRECATED(MTMD_API int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens),
+           "use mtmd_encode_chunk() instead");
 
+// text chunks are skipped with a warning, only media chunks are encoded
 // returns 0 on success
+// returns 1 on generic error
 MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
                                    const mtmd_input_chunk * chunk);
 
@@ -279,6 +281,26 @@ MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
 // llama_model_n_embd_inp(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float)
 MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 
+
+// batch encoding API
+// chunks are not owned by the batch, they will not be freed by mtmd_batch_free()
+// batch is valid for a given context, cannot be shared across contexts
+MTMD_API mtmd_batch * mtmd_batch_init(mtmd_context * ctx);
+MTMD_API void         mtmd_batch_free(mtmd_batch * batch);
+
+// only media chunks are allowed, text chunks will be rejected
+// returns 0 on success
+// returns 1 on generic error
+// returns 2 if the batch is too large (chunk won't be added)
+// returns 3 if it cannot be batched with the existing chunks in the batch
+MTMD_API int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk);
+
+// returns 0 on success
+// returns 1 on generic error
+MTMD_API int32_t mtmd_batch_encode(mtmd_batch * batch);
+MTMD_API float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk);
+
+
 // Set callback for all future logging events.
 // If this is not called, or NULL is supplied, everything is output on stderr.
 MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
@@ -336,6 +358,11 @@ struct mtmd_input_chunk_deleter {
 };
 using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>;
 
+struct mtmd_batch_deleter {
+    void operator()(mtmd_batch * val) { mtmd_batch_free(val); }
+};
+using batch_ptr = std::unique_ptr<mtmd_batch, mtmd_batch_deleter>;
+
 struct bitmap {
     bitmap_ptr ptr;
     bitmap() : ptr(nullptr) {}
diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index 9f3caac8f723..4162d52098f9 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -344,6 +344,14 @@ const mtmd::input_chunk_ptr & server_tokens::find_chunk(size_t idx) const {
     throw std::runtime_error("Chunk not found");
 }
 
+std::pair<const mtmd::input_chunk_ptr *, size_t> server_tokens::find_next_media_chunk(size_t idx) const {
+    auto it = map_idx_to_media.upper_bound(idx);
+    if (it != map_idx_to_media.end()) {
+        return { &it->second, it->first };
+    }
+    return { nullptr, 0 };
+}
+
 void server_tokens::push_back(llama_token tok) {
     if (tok == LLAMA_TOKEN_NULL) {
         throw std::runtime_error("Invalid token");
diff --git a/tools/server/server-common.h b/tools/server/server-common.h
index 249b97c2fadb..857ffe14795f 100644
--- a/tools/server/server-common.h
+++ b/tools/server/server-common.h
@@ -180,6 +180,10 @@ struct server_tokens {
 
     const mtmd::input_chunk_ptr & find_chunk(size_t idx) const;
 
+    // find the next media chunk whose start index is strictly greater than idx
+    // returns a pair of pointer to the chunk (nullptr if not found) and its start index in tokens
+    std::pair<const mtmd::input_chunk_ptr *, size_t> find_next_media_chunk(size_t idx) const;
+
     void push_back(llama_token tok);
 
     // will create a copy of the chunk if it contains non-text data
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index bdfa51718080..a7cb45dcb987 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -80,6 +80,8 @@ struct server_slot {
 
     // multimodal
     mtmd_context * mctx = nullptr;
+    mtmd::batch_ptr mbatch = nullptr;
+    std::array<llama_context *, 2> mtgt = {nullptr, nullptr}; // [0] for main context, [1] for optional draft context
 
     // speculative decoding
     common_speculative * spec;
@@ -239,6 +241,18 @@ struct server_slot {
 
         // clear alora start
         alora_invocation_start = -1;
+
+        // clear multimodal state
+        mbatch.reset();
+        mtgt[0] = ctx_tgt;
+        mtgt[1] = nullptr;
+        if (ctx_dft && llama_get_ctx_other(ctx_dft) != ctx_tgt) {
+            // TODO: in the future, figure out how to infuse target embeddings to the images
+            //       for now, we re-decode the same chunk in both ctx_tgt and ctx_dft
+            //       maybe we simply need to call `common_speculative_process()` ?
+            //       [TAG_MTMD_DRAFT_PROCESSING]
+            mtgt[1] = ctx_dft;
+        }
     }
 
     void init_sampler() const {
@@ -578,6 +592,86 @@ struct server_slot {
         other.prompt = prompt.clone();
         other.init_sampler();
     }
+
+    // decode the media chunk starting at token index idx on every context in mtgt, encoding a new mtmd batch if needed
+    // returns 0 on success, non-zero on failure; on success the caller needs to update prompt.tokens to keep track of the processing progress
+    int process_mtmd_chunk(size_t idx, size_t & n_tokens_out) {
+        GGML_ASSERT(mctx);
+        const auto & input_tokens = task->tokens;
+        auto & chunk = input_tokens.find_chunk(idx);
+        int32_t res = 0;
+
+        auto try_decode = [&]() -> int32_t {
+            if (mbatch) {
+                float * embd = mtmd_batch_get_output_embd(mbatch.get(), chunk.get());
+                if (embd) {
+                    for (auto * lctx : mtgt) {
+                        if (lctx == nullptr) {
+                            continue;
+                        }
+                        llama_pos new_n_past; // unused for now
+                        res = mtmd_helper_decode_image_chunk(
+                            mctx,
+                            lctx, // decode on the context being iterated (main, then optional draft)
+                            chunk.get(),
+                            embd,
+                            prompt.tokens.pos_next(),
+                            id,
+                            llama_n_batch(lctx),
+                            &new_n_past
+                        );
+                        if (res != 0) {
+                            SLT_ERR(*this, "failed to decode mtmd chunk, idx = %zu, res = %d\n", idx, res);
+                            return -1;
+                        }
+                    }
+                    n_tokens_out = mtmd_input_chunk_get_n_tokens(chunk.get());
+                    return 0; // success
+                }
+            }
+            return 1; // (non-error) need to create & encode batch
+        };
+
+        // if the batch already exists, look the chunk up in it and decode
+        res = try_decode();
+        if (res == 0) {
+            return 0;
+        } else if (res < 0) {
+            // fatal error
+            return res;
+        }
+
+        // otherwise, the batch is either uninitialized or is used up
+        // we need to create & encode a new batch
+        mbatch.reset(mtmd_batch_init(mctx));
+        res = mtmd_batch_add_chunk(mbatch.get(), chunk.get());
+        GGML_ASSERT(res == 0); // we should never have an empty batch
+
+        // try batching as much as possible
+        int n_added = 1;
+        size_t idx_cur = idx;
+        while (res == 0) {
+            auto [next_chunk, next_idx] = input_tokens.find_next_media_chunk(idx_cur);
+            if (next_chunk == nullptr) {
+                break;
+            }
+            res = mtmd_batch_add_chunk(mbatch.get(), next_chunk->get());
+            n_added += (res == 0 ? 1 : 0);
+            idx_cur = next_idx;
+            SLT_INF(*this, "try adding chunk idx = %zu to batch, res = %d\n", next_idx, res);
+            // if res != 0, batch is full or chunk is not compatible -> this loop breaks
+        }
+
+        SLT_INF(*this, "encoding mtmd batch from idx = %zu, n_chunks = %d\n", idx, n_added);
+
+        res = mtmd_batch_encode(mbatch.get());
+        if (res != 0) {
+            SLT_ERR(*this, "failed to encode mtmd batch for chunk idx = %zu, res = %d\n", idx, res);
+            return -1;
+        }
+
+        return try_decode();
+    }
 };
 
 
@@ -2921,7 +3015,7 @@ struct server_context_impl {
                                 send_partial_response(slot, {}, false, true);
                             }
                         }
-                    }
+                    } // end of SLOT_STATE_STARTED
 
                     if (!slot.can_split()) {
                         // cannot fit the prompt in the current batch - will try next iter
@@ -2976,10 +3070,18 @@ struct server_context_impl {
                     bool has_mtmd = false;
 
                     // check if we should process the image
-                    while (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) {
+                    while (true) {
+                        auto cur_token_idx = slot.prompt.n_tokens();
+                        if (
+                            cur_token_idx >= slot.task->n_tokens() ||
+                            input_tokens[cur_token_idx] != LLAMA_TOKEN_NULL // encountered a text token
+                        ) {
+                            break;
+                        }
+
                         // process the image
                         size_t n_tokens_out = 0;
-                        int32_t res = input_tokens.process_chunk(ctx_tgt, mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out);
+                        int32_t res = slot.process_mtmd_chunk(cur_token_idx, n_tokens_out);
                         if (res != 0) {
                             SLT_ERR(slot, "failed to process image, res = %d\n", res);
                             send_error(slot, "failed to process image", ERROR_TYPE_SERVER);
@@ -2987,22 +3089,11 @@ struct server_context_impl {
                             continue;
                         }
 
-                        if (ctx_dft && llama_get_ctx_other(ctx_dft.get()) != ctx_tgt) {
-                            // TODO: in the future, figure out how to infuse target embeddings to the images
-                            //       for now, we skip this for simplicity
-                            //       maybe we simply need to call `common_speculative_process()` on the mtmd batches in the `process_chunk` above?
-                            //       [TAG_MTMD_DRAFT_PROCESSING]
-                            res = input_tokens.process_chunk(ctx_dft.get(), mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out);
-                            if (res != 0) {
-                                GGML_ABORT("failed to process multi-modal data on draft context\n");
-                            }
-                        }
-
                         slot.n_prompt_tokens_processed += n_tokens_out;
 
                         // add the image chunk to cache
                         {
-                            const auto & chunk = input_tokens.find_chunk(slot.prompt.n_tokens());
+                            const auto & chunk = input_tokens.find_chunk(cur_token_idx);
                             slot.prompt.tokens.push_back(chunk.get()); // copy
                         }
 

From 111d3f17360f147b5676b046d9cde52c1521bd40 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 11 Jun 2026 13:24:33 +0200
Subject: [PATCH 02/13] wip

---
 tools/mtmd/clip.cpp           |  13 ++--
 tools/mtmd/models/gemma4v.cpp |  22 +++---
 tools/mtmd/mtmd.cpp           | 140 +++++++++++++++++++++++++++-------
 tools/mtmd/mtmd.h             |   5 ++
 tools/server/server-http.cpp  |   2 +-
 5 files changed, 136 insertions(+), 46 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index bd33f430625a..adbb6efa60bd 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -3482,14 +3482,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const clip_image_f32_batch & imgs = *imgs_c_ptr;
     int n_batch_cur = imgs.entries.size();
 
-    // maximum supported batch size, usually == 2 for qwen-vl-based models
-    int n_batch_max = clip_model_n_batch_max(ctx);
-
-    // TODO @ngxson : implement batch size > 1 as a loop
-    //                we don't need true batching support because the cgraph will gonna be big anyway
-    if (n_batch_cur > n_batch_max) {
-        return false;
-    }
+    // TODO: check batching condition
 
     // if buffers are not allocated, we need to do a warmup run to allocate them
     if (!ctx->is_allocated) {
@@ -3567,6 +3560,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             const int n  = nx * ny;
 
             for (int b = 0; b < n_batch_cur; b++) {
+                LOG_DBG("%s: copying image %d/%d to input buffer (nx=%d, ny=%d)\n", __func__, b+1, n_batch_cur, nx, ny);
                 const auto & buf = imgs.entries[b]->get_ro_buf();
                 float * batch_entry = inp_raw.data() + b * (3*n);
                 for (int y = 0; y < ny; y++) {
@@ -4555,6 +4549,9 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
     return ctx->model.modality == CLIP_MODALITY_AUDIO;
 }
 
+// TODO @ngxson : this is no longer true with mtmd_batch API
+// this was only meant to be used by qwen-vl-based models, to fuse 2 input images into one (qwen-vl video support)
+// this logic should be refactored in near future to distinctly handle "merge frames" and "batching"
 int clip_model_n_batch_max(const struct clip_ctx * ctx) {
     switch (ctx->proj_type()) {
         case PROJECTOR_TYPE_QWEN2VL:
diff --git a/tools/mtmd/models/gemma4v.cpp b/tools/mtmd/models/gemma4v.cpp
index 3570d6da1351..b4aadc4e16b9 100644
--- a/tools/mtmd/models/gemma4v.cpp
+++ b/tools/mtmd/models/gemma4v.cpp
@@ -10,7 +10,7 @@ ggml_cgraph * clip_graph_gemma4v::build() {
     ggml_set_name(inp_raw, "inp_raw_scaled");
 
     ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
-    inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd);
+    inp = ggml_reshape_3d(ctx0, inp, n_patches, n_embd, n_batch);
     inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
     ggml_set_name(inp, "inp");
     // note: no patch bias
@@ -51,10 +51,11 @@ ggml_cgraph * clip_graph_gemma4v::build() {
         // first half
         ggml_tensor * first;
         {
-            first = ggml_view_3d(ctx0, cur,
-                n_dim/2, n_head, n_pos,
+            first = ggml_view_4d(ctx0, cur,
+                n_dim/2, n_head, n_pos, n_batch,
                 cur->nb[1],
                 cur->nb[2],
+                cur->nb[3],
                 0);
             first = ggml_rope_ext(
                 ctx0,
@@ -70,10 +71,11 @@ ggml_cgraph * clip_graph_gemma4v::build() {
         // second half
         ggml_tensor * second;
         {
-            second = ggml_view_3d(ctx0, cur,
-                n_dim/2, n_head, n_pos,
+            second = ggml_view_4d(ctx0, cur,
+                n_dim/2, n_head, n_pos, n_batch,
                 cur->nb[1],
                 cur->nb[2],
+                cur->nb[3],
                 n_dim/2 * ggml_element_size(cur));
             second = ggml_rope_ext(
                 ctx0,
@@ -103,14 +105,16 @@ ggml_cgraph * clip_graph_gemma4v::build() {
         const int kernel_size = hparams.n_merge;
         GGML_ASSERT(kernel_size > 0);
 
-        // [n_embd, n_patches] -> [n_patches_x, n_patches_y, n_embd, 1]
-        cur = ggml_cont_4d(ctx0, ggml_transpose(ctx0, cur), n_patches_x, n_patches_y, n_embd, 1);
+        // [n_embd, n_patches] -> [n_patches_x, n_patches_y, n_embd, n_batch]
+        cur = ggml_cont_4d(ctx0, ggml_transpose(ctx0, cur), n_patches_x, n_patches_y, n_embd, n_batch);
         cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG,
                            kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
         const int out_x = n_patches_x / kernel_size;
         const int out_y = n_patches_y / kernel_size;
-        // [out_x, out_y, n_embd, 1] -> [n_embd, out_x * out_y]
-        cur = ggml_reshape_3d(ctx0, cur, out_x * out_y, n_embd, 1);
+        printf("before pooling: n_patches_x=%d, n_patches_y=%d, after pooling: out_x=%d, out_y=%d\n", n_patches_x, n_patches_y, out_x, out_y);
+        printf("after pooling: ne[0]=%lld, ne[1]=%lld, ne[2]=%lld, ne[3]=%lld\n", cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]);
+        // [out_x, out_y, n_embd, n_batch] -> [n_embd, out_x * out_y, n_batch]
+        cur = ggml_reshape_3d(ctx0, cur, out_x * out_y, n_embd, n_batch);
         cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
         cur = ggml_scale(ctx0, cur, sqrtf((float)n_embd));
         cb(cur, "pooled", -1);
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 96bb7e91327b..87f02dbeb937 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -110,6 +110,10 @@ struct mtmd_image_tokens {
         return false;
     }
 
+    bool can_batch_with(const mtmd_image_tokens & other) {
+        return nx == other.nx && ny == other.ny && pos == other.pos;
+    }
+
     mtmd_image_tokens clone() {
         return mtmd_image_tokens{
             nx,
@@ -153,6 +157,29 @@ struct mtmd_input_chunk {
     std::vector<llama_token> tokens_text;
     mtmd_image_tokens_ptr tokens_image;
     mtmd_audio_tokens_ptr tokens_audio;
+
+    bool can_batch_with(const mtmd_input_chunk & other) const {
+        if (type != other.type) {
+            return false;
+        }
+
+        if (tokens_image && other.tokens_image) {
+            return tokens_image->can_batch_with(*other.tokens_image);
+        }
+
+        // TODO: allow batching audio chunks of the same size
+
+        return false;
+    }
+
+    bool is_placeholder() const {
+        if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            return tokens_image->is_placeholder();
+        } else if (type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+            return tokens_audio->is_placeholder();
+        }
+        return false;
+    }
 };
 
 struct mtmd_input_chunks {
@@ -164,6 +191,13 @@ struct mtmd_batch {
     std::vector<const mtmd_input_chunk *> entries;
     std::vector<float> output_embd; // aggregated output embedding for the whole batch
     mtmd_batch(mtmd_context * ctx): ctx(ctx) {}
+    int32_t n_tokens() const {
+        int32_t n = 0;
+        for (const auto * chunk : entries) {
+            n += mtmd_input_chunk_get_n_tokens(chunk);
+        }
+        return n;
+    }
 };
 
 // slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings
@@ -204,6 +238,7 @@ mtmd_context_params mtmd_context_params_default() {
         /* image_max_tokens  */ -1,
         /* cb_eval           */ nullptr,
         /* cb_eval_user_data */ nullptr,
+        /* batch_max_tokens  */ 2048,
     };
     return params;
 }
@@ -211,7 +246,7 @@ mtmd_context_params mtmd_context_params_default() {
 struct mtmd_context {
     struct clip_ctx * ctx_v; // vision
     struct clip_ctx * ctx_a; // audio
-    std::vector<float> image_embd_v; // image embedding vector
+    std::vector<float> out_embd; // image embedding vector
 
     bool print_timings;
     int n_threads;
@@ -246,17 +281,21 @@ struct mtmd_context {
     std::unique_ptr<mtmd_audio_preprocessor> audio_preproc;
     std::unique_ptr<mtmd_image_preprocessor> image_preproc;
 
+    // batching
+    int32_t batch_max_tokens;
+
     // TODO @ngxson : add timings
 
     mtmd_context(const char * mmproj_fname,
                    const llama_model * text_model,
                    const mtmd_context_params & ctx_params,
                    bool no_alloc = false) :
-        print_timings(ctx_params.print_timings),
-        n_threads    (ctx_params.n_threads),
-        media_marker (ctx_params.media_marker),
-        n_embd_text  (text_model ? llama_model_n_embd_inp(text_model) : -1),
-        vocab        (text_model ? llama_model_get_vocab(text_model) : nullptr)
+        print_timings   (ctx_params.print_timings),
+        n_threads       (ctx_params.n_threads),
+        media_marker    (ctx_params.media_marker),
+        n_embd_text     (text_model ? llama_model_n_embd_inp(text_model) : -1),
+        vocab           (text_model ? llama_model_get_vocab(text_model) : nullptr),
+        batch_max_tokens(ctx_params.batch_max_tokens)
     {
         if (ctx_params.image_marker != nullptr) {
             throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
@@ -1369,12 +1408,12 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
             return 1;
         }
         int n_mmproj_embd = ctx->n_embd_text;
-        ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
+        ctx->out_embd.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
         bool ok = clip_image_batch_encode(
             ctx->ctx_a,
             ctx->n_threads,
             &chunk->tokens_audio->batch_f32,
-            ctx->image_embd_v.data());
+            ctx->out_embd.data());
         return ok ? 0 : 1;
     }
 
@@ -1390,7 +1429,7 @@ int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tok
     }
     auto proj_type = clip_get_projector_type(ctx_clip);
     int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
-    ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
+    ctx->out_embd.resize(image_tokens->n_tokens() * n_mmproj_embd);
     bool ok = false;
 
     if (clip_is_llava(ctx_clip)
@@ -1414,7 +1453,7 @@ int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tok
                 ctx_clip,
                 ctx->n_threads,
                 entries[i].get(),
-                ctx->image_embd_v.data() + offset);
+                ctx->out_embd.data() + offset);
             offset += static_cast<size_t>(n_mmproj_embd) * n_tokens_per_image;
         }
     } else {
@@ -1426,7 +1465,7 @@ int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tok
             ctx_clip,
             ctx->n_threads,
             &image_tokens->batch_f32,
-            ctx->image_embd_v.data());
+            ctx->out_embd.data());
     }
 
     return ok ? 0 : 1;
@@ -1442,7 +1481,7 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
 }
 
 float * mtmd_get_output_embd(mtmd_context * ctx) {
-    return ctx->image_embd_v.data();
+    return ctx->out_embd.data();
 }
 
 mtmd_batch * mtmd_batch_init(mtmd_context * ctx) {
@@ -1456,36 +1495,81 @@ void mtmd_batch_free(mtmd_batch * batch) {
 }
 
 int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk) {
-    batch->entries.push_back(chunk);
-    if (batch->entries.size() > 4) {
-        return 1; // DEMO ONLY
+    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+        LOG_ERR("%s: text chunk is not supported in batch\n", __func__);
+        return 1;
     }
-    return 0;
+
+    if (batch->entries.empty()) {
+        // batch must have at least one chunk
+        batch->entries.push_back(chunk);
+        return 0;
+    }
+
+    int32_t new_n_tokens = batch->n_tokens() + (int32_t)mtmd_input_chunk_get_n_tokens(chunk);
+    if (new_n_tokens > batch->ctx->batch_max_tokens) {
+        return 2; // "batch too large" error code
+    }
+
+    auto & first_chunk = batch->entries[0];
+    if (first_chunk->can_batch_with(*chunk)) {
+        batch->entries.push_back(chunk);
+        return 0;
+    }
+
+    return 3; // "cannot batch" error code
 }
 
 int32_t mtmd_batch_encode(mtmd_batch * batch) {
+    if (batch->entries.empty()) {
+        LOG_ERR("%s: batch is empty\n", __func__);
+        return 1;
+    }
+
     // allocate output_embd
     size_t n_embd = 0;
     for (const auto * chunk : batch->entries) {
+        if (chunk->is_placeholder()) {
+            LOG_ERR("%s: chunk is placeholder\n", __func__);
+            return 1;
+        }
         n_embd += mtmd_input_chunk_get_n_tokens(chunk) * batch->ctx->n_embd_text;
     }
     batch->output_embd.resize(n_embd);
 
-    // TODO @ngxson : this is just for testing if the public API works; it is not true batching
-    size_t offset = 0;
-    for (const auto * chunk : batch->entries) {
-        int32_t res = mtmd_encode_chunk(batch->ctx, chunk);
-        if (res != 0) {
-            return res;
+    // represent the whole batch as one single chunk
+    mtmd::input_chunk_ptr batch_chunk(mtmd_input_chunk_copy(batch->entries[0]));
+    if (batch_chunk->tokens_image) {
+        auto & b0_f32 = batch_chunk->tokens_image->batch_f32;
+        for (const auto * chunk : batch->entries) {
+            auto b1_f32 = chunk->tokens_image->batch_f32.clone();
+            for (size_t i = 0; i < b1_f32.entries.size(); i++) {
+                b0_f32.entries.push_back(std::move(b1_f32.entries[i]));
+            }
         }
-        size_t len = mtmd_input_chunk_get_n_tokens(chunk) * batch->ctx->n_embd_text;
-        memcpy(
-            batch->output_embd.data() + offset,
-            mtmd_get_output_embd(batch->ctx),
-            len * sizeof(float));
-        offset += len;
+    } else {
+        LOG_ERR("%s: unsupported chunk type\n", __func__);
+        return 1;
     }
 
+    LOG_DBG("%s: encoding batch with %zu entries and total %zu tokens\n",
+            __func__, batch->entries.size(), mtmd_input_chunk_get_n_tokens(batch_chunk.get()));
+    int32_t res = mtmd_encode_chunk(batch->ctx, batch_chunk.get());
+    if (res != 0) {
+        return res;
+    }
+
+    if (batch->ctx->out_embd.size() != batch->output_embd.size()) {
+        LOG_ERR("%s: output embedding size mismatch: expected %zu, got %zu\n",
+                __func__, batch->output_embd.size(), batch->ctx->out_embd.size());
+        return 1;
+    }
+
+    memcpy(
+        batch->output_embd.data(),
+        batch->ctx->out_embd.data(),
+        batch->output_embd.size() * sizeof(float));
+
     return 0;
 }
 
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index 927b2d28dbd0..66dda62e9d51 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -99,6 +99,11 @@ struct mtmd_context_params {
     // callback function passed over to mtmd proper
     ggml_backend_sched_eval_callback cb_eval;
     void * cb_eval_user_data;
+
+    // batching params
+    int32_t batch_max_tokens; // maximum number of output tokens in a batch
+                              // (note: this is not a hard-limit, the first image will always be added even if it exceeds this limit)
+                              // (default: 2048)
 };
 
 MTMD_API const char * mtmd_default_marker(void);
diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp
index 34a20c9d22de..ee54f2695b6e 100644
--- a/tools/server/server-http.cpp
+++ b/tools/server/server-http.cpp
@@ -113,7 +113,7 @@ bool server_http_context::init(const common_params & params) {
 #endif
 
     srv->set_default_headers({{"Server", "llama.cpp"}});
-    srv->set_logger(log_server_request);
+    // srv->set_logger(log_server_request); // TODO @ngxson : this is too spammy and not very useful; improve it in the future
     srv->set_exception_handler([](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) {
         // this is fail-safe; exceptions should already handled by `ex_wrapper`
 

From f77cfd73a2db764554941e652d24267ccea8cbba Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 11 Jun 2026 16:10:38 +0200
Subject: [PATCH 03/13] first working version (gemma4v)

---
 tools/mtmd/clip.cpp           | 22 ++++++++----
 tools/mtmd/clip.h             |  4 +--
 tools/mtmd/models/gemma4v.cpp |  2 --
 tools/mtmd/mtmd.cpp           | 64 ++++++++++++++++++++---------------
 4 files changed, 53 insertions(+), 39 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 45a90f2f0d33..8ca66ac5c46b 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -3484,16 +3484,16 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
     return n_patches;
 }
 
-bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
+bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, std::vector<float> & out_vec) {
     clip_image_f32_batch imgs;
     clip_image_f32_ptr img_copy(clip_image_f32_init());
     *img_copy = *img;
     imgs.entries.push_back(std::move(img_copy));
 
-    return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
+    return clip_image_batch_encode(ctx, n_threads, &imgs, out_vec);
 }
 
-bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
+bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, std::vector<float> & out_batch_embd) {
     const clip_image_f32_batch & imgs = *imgs_c_ptr;
     int n_batch_cur = imgs.entries.size();
 
@@ -4410,7 +4410,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     // the last node is the embedding tensor
     ggml_tensor * embeddings = ggml_graph_node(gf, -1);
 
-    // sanity check (only support batch size of 1 for now)
+    // sanity check (assuming that all images in batch have the same number of tokens, so we only check the first one)
     const int n_tokens_out = embeddings->ne[1];
     const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
     if (n_tokens_out != expected_n_tokens_out) {
@@ -4418,9 +4418,17 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         GGML_ABORT("Invalid number of output tokens");
     }
 
-    // copy the embeddings to the location passed by the user
-    if (vec != nullptr) {
-        ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
+    LOG_DBG("%s: output embedding shape [%d, %d, %d]\n", __func__,
+        (int)embeddings->ne[0], (int)embeddings->ne[1], (int)embeddings->ne[2]);
+
+    // copy output to user buffer if provided
+    // if output is empty, skip the copy
+    if (!out_batch_embd.empty()) {
+        if (out_batch_embd.size() != (size_t)ggml_nelements(embeddings)) {
+            LOG_ERR("%s: output buffer has %zu elements but expected %zu\n", __func__, out_batch_embd.size(), (size_t)ggml_nelements(embeddings));
+            GGML_ABORT("Output buffer size mismatch");
+        }
+        ggml_backend_tensor_get(embeddings, out_batch_embd.data(), 0, ggml_nbytes(embeddings));
     }
 
     // Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index 18c7a1d1a7c4..e88d9ab202f6 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -97,8 +97,8 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id
 size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
 struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
 
-bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
-bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
+bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, std::vector<float> & out_vec);
+bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, std::vector<float> & out_batch_embd);
 
 bool clip_is_llava(const struct clip_ctx * ctx);
 // note for contributor: this clip_is_(model) pattern is deprecated
diff --git a/tools/mtmd/models/gemma4v.cpp b/tools/mtmd/models/gemma4v.cpp
index b4aadc4e16b9..87cbd43fc5fd 100644
--- a/tools/mtmd/models/gemma4v.cpp
+++ b/tools/mtmd/models/gemma4v.cpp
@@ -111,8 +111,6 @@ ggml_cgraph * clip_graph_gemma4v::build() {
                            kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
         const int out_x = n_patches_x / kernel_size;
         const int out_y = n_patches_y / kernel_size;
-        printf("before pooling: n_patches_x=%d, n_patches_y=%d, after pooling: out_x=%d, out_y=%d\n", n_patches_x, n_patches_y, out_x, out_y);
-        printf("after pooling: ne[0]=%lld, ne[1]=%lld, ne[2]=%lld, ne[3]=%lld\n", cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]);
         // [out_x, out_y, n_embd, n_batch] -> [n_embd, out_x * out_y, n_batch]
         cur = ggml_reshape_3d(ctx0, cur, out_x * out_y, n_embd, n_batch);
         cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 87f02dbeb937..7acdc17959d1 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -726,6 +726,16 @@ struct mtmd_context {
         return ctx_a ? clip_get_projector_type(ctx_a) : PROJECTOR_TYPE_UNKNOWN;
     }
 
+    int64_t n_embd_out() const {
+        if (ctx_v) {
+            return clip_n_mmproj_embd(ctx_v);
+        } else if (ctx_a) {
+            return clip_n_mmproj_embd(ctx_a);
+        } else {
+            throw std::runtime_error("no CLIP model loaded");
+        }
+    }
+
     ~mtmd_context() {
         clip_free(ctx_a);
         clip_free(ctx_v);
@@ -1374,7 +1384,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
 }
 
 // forward declaration
-int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens);
+int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector<float> * out_batch_embd = nullptr);
 
 int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
     if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
@@ -1407,13 +1417,13 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
             LOG_ERR("%s: audio tokens batch is placeholder\n", __func__);
             return 1;
         }
-        int n_mmproj_embd = ctx->n_embd_text;
+        int n_mmproj_embd = ctx->n_embd_out();
         ctx->out_embd.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
         bool ok = clip_image_batch_encode(
             ctx->ctx_a,
             ctx->n_threads,
             &chunk->tokens_audio->batch_f32,
-            ctx->out_embd.data());
+            ctx->out_embd);
         return ok ? 0 : 1;
     }
 
@@ -1421,7 +1431,7 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
     return 1;
 }
 
-int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
+int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector<float> * out_batch_embd) {
     clip_ctx * ctx_clip = ctx->ctx_v;
     if (!ctx_clip) {
         LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
@@ -1429,7 +1439,14 @@ int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tok
     }
     auto proj_type = clip_get_projector_type(ctx_clip);
     int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
-    ctx->out_embd.resize(image_tokens->n_tokens() * n_mmproj_embd);
+    std::vector<float> & out_embd = ctx->out_embd;
+    if (out_batch_embd) {
+        // caller need to resize out_batch_embd
+        out_embd = *out_batch_embd;
+    } else {
+        ctx->out_embd.resize(image_tokens->n_tokens() * n_mmproj_embd);
+        out_embd = ctx->out_embd;
+    }
     bool ok = false;
 
     if (clip_is_llava(ctx_clip)
@@ -1449,11 +1466,13 @@ int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tok
                 return 1;
             }
             int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
+            std::vector<float> tmp_embd(n_tokens_per_image * n_mmproj_embd);
             ok = clip_image_encode(
                 ctx_clip,
                 ctx->n_threads,
                 entries[i].get(),
-                ctx->out_embd.data() + offset);
+                tmp_embd);
+            std::copy(tmp_embd.begin(), tmp_embd.end(), out_embd.begin() + offset);
             offset += static_cast<size_t>(n_mmproj_embd) * n_tokens_per_image;
         }
     } else {
@@ -1465,7 +1484,7 @@ int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tok
             ctx_clip,
             ctx->n_threads,
             &image_tokens->batch_f32,
-            ctx->out_embd.data());
+            out_embd);
     }
 
     return ok ? 0 : 1;
@@ -1533,7 +1552,7 @@ int32_t mtmd_batch_encode(mtmd_batch * batch) {
             LOG_ERR("%s: chunk is placeholder\n", __func__);
             return 1;
         }
-        n_embd += mtmd_input_chunk_get_n_tokens(chunk) * batch->ctx->n_embd_text;
+        n_embd += mtmd_input_chunk_get_n_tokens(chunk) * batch->ctx->n_embd_out();
     }
     batch->output_embd.resize(n_embd);
 
@@ -1541,7 +1560,11 @@ int32_t mtmd_batch_encode(mtmd_batch * batch) {
     mtmd::input_chunk_ptr batch_chunk(mtmd_input_chunk_copy(batch->entries[0]));
     if (batch_chunk->tokens_image) {
         auto & b0_f32 = batch_chunk->tokens_image->batch_f32;
-        for (const auto * chunk : batch->entries) {
+        // copy all entries from other chunks into the first chunk's batch_f32
+        // note: skip first entry because it's already in batch_chunk
+        for (size_t ic = 1; ic < batch->entries.size(); ic++) {
+            auto & chunk = batch->entries[ic];
+            GGML_ASSERT(chunk->tokens_image);
             auto b1_f32 = chunk->tokens_image->batch_f32.clone();
             for (size_t i = 0; i < b1_f32.entries.size(); i++) {
                 b0_f32.entries.push_back(std::move(b1_f32.entries[i]));
@@ -1554,23 +1577,8 @@ int32_t mtmd_batch_encode(mtmd_batch * batch) {
 
     LOG_DBG("%s: encoding batch with %zu entries and total %zu tokens\n",
             __func__, batch->entries.size(), mtmd_input_chunk_get_n_tokens(batch_chunk.get()));
-    int32_t res = mtmd_encode_chunk(batch->ctx, batch_chunk.get());
-    if (res != 0) {
-        return res;
-    }
-
-    if (batch->ctx->out_embd.size() != batch->output_embd.size()) {
-        LOG_ERR("%s: output embedding size mismatch: expected %zu, got %zu\n",
-                __func__, batch->output_embd.size(), batch->ctx->out_embd.size());
-        return 1;
-    }
-
-    memcpy(
-        batch->output_embd.data(),
-        batch->ctx->out_embd.data(),
-        batch->output_embd.size() * sizeof(float));
-
-    return 0;
+    int32_t res = mtmd_encode_impl(batch->ctx, batch_chunk->tokens_image.get(), &batch->output_embd);
+    return res;
 }
 
 float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk) {
@@ -1578,7 +1586,7 @@ float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk *
     for (const auto * c : batch->entries) {
         size_t offset_prev = offset;
         size_t n_tokens = mtmd_input_chunk_get_n_tokens(c);
-        offset += n_tokens * batch->ctx->n_embd_text;
+        offset += n_tokens * batch->ctx->n_embd_out();
         GGML_ASSERT(offset_prev <  batch->output_embd.size());
         GGML_ASSERT(offset      <= batch->output_embd.size());
         if (c == chunk) {
@@ -1963,7 +1971,7 @@ static void mtmd_debug_encode_impl(mtmd_context * ctx, clip_ctx * ctx_clip, clip
         ctx_clip,
         ctx->n_threads,
         &image,
-        embd_output.data());
+        embd_output);
     if (!ok) {
         LOG_ERR("%s: failed to encode image\n", __func__);
     }

From 190bef3b9b78bd698b1b7fdc14b18fc64849f2ae Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 11 Jun 2026 16:56:32 +0200
Subject: [PATCH 04/13] add arg

---
 common/arg.cpp                  | 7 +++++++
 common/common.h                 | 1 +
 tools/mtmd/mtmd.cpp             | 2 +-
 tools/mtmd/mtmd.h               | 2 +-
 tools/server/server-context.cpp | 1 +
 5 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 55795d357d90..8382c1d85dde 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2243,6 +2243,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.image_max_tokens = value;
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
+    add_opt(common_arg(
+        {"--mtmd-batch-max-tokens"}, "N",
+        string_format("maximum number of image tokens per batch when encoding images (default: %d)", params.mtmd_batch_max_tokens),
+        [](common_params & params, int value) {
+            params.mtmd_batch_max_tokens = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MTMD_BATCH_MAX_TOKENS"));
     if (llama_supports_rpc()) {
         add_opt(common_arg(
             {"--rpc"}, "SERVERS",
diff --git a/common/common.h b/common/common.h
index 4864186f6287..0b284cbb36c7 100644
--- a/common/common.h
+++ b/common/common.h
@@ -575,6 +575,7 @@ struct common_params {
     std::vector<std::string> image; // path to image file(s) ; TODO: change the name to "media"
     int image_min_tokens = -1;
     int image_max_tokens = -1;
+    int mtmd_batch_max_tokens = 1024;
 
     // finetune
     struct lr_opt lr;
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 7acdc17959d1..568431e5f518 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -238,7 +238,7 @@ mtmd_context_params mtmd_context_params_default() {
         /* image_max_tokens  */ -1,
         /* cb_eval           */ nullptr,
         /* cb_eval_user_data */ nullptr,
-        /* batch_max_tokens  */ 2048,
+        /* batch_max_tokens  */ 1024,
     };
     return params;
 }
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index 66dda62e9d51..2fd149e48069 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -103,7 +103,7 @@ struct mtmd_context_params {
     // batching params
     int32_t batch_max_tokens; // maximum number of output tokens in a batch
                               // (note: this is not a hard-limit, the first image will always be added even if it exceeds this limit)
-                              // (default: 2048)
+                              // (default: 1024)
 };
 
 MTMD_API const char * mtmd_default_marker(void);
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index bed047a920bb..d5aa22b5f697 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -875,6 +875,7 @@ struct server_context_impl {
             mparams.warmup           = params_base.warmup;
             mparams.image_min_tokens = params_base.image_min_tokens;
             mparams.image_max_tokens = params_base.image_max_tokens;
+            mparams.batch_max_tokens = params_base.mtmd_batch_max_tokens;
             mparams.media_marker     = get_media_marker();
         }
 

From a773d7b84be411e079d20abe5acb24a2e7b72537 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 11 Jun 2026 16:57:07 +0200
Subject: [PATCH 05/13] nits

---
 tools/mtmd/mtmd-helper.h        | 15 ---------------
 tools/server/server-context.cpp |  2 +-
 2 files changed, 1 insertion(+), 16 deletions(-)

diff --git a/tools/mtmd/mtmd-helper.h b/tools/mtmd/mtmd-helper.h
index 8a473b01206f..719aae988568 100644
--- a/tools/mtmd/mtmd-helper.h
+++ b/tools/mtmd/mtmd-helper.h
@@ -163,21 +163,6 @@ MTMD_API int32_t mtmd_helper_video_read_next(mtmd_helper_video * ctx,
 
 namespace mtmd_helper {
 
-//
-// batching helpers (C++ only for now)
-//
-
-MTMD_API 
-
-MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
-        struct llama_context * lctx,
-        const mtmd_input_chunk * chunk,
-        llama_pos n_past,
-        llama_seq_id seq_id,
-        int32_t n_batch,
-        bool logits_last,
-        llama_pos * new_n_past);
-
 //
 // C++ wrappers
 //
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index d5aa22b5f697..c7bf73813bd3 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -614,7 +614,7 @@ struct server_slot {
                             mctx,
                             ctx_tgt,
                             chunk.get(),
-                            embd, 
+                            embd,
                             prompt.tokens.pos_next(),
                             id,
                             llama_n_batch(ctx_tgt),

From 3eecd674d890a4cd11c74280b1050c629193a66a Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 11 Jun 2026 17:42:39 +0200
Subject: [PATCH 06/13] wire up support_batch()

---
 tools/mtmd/clip-graph.h         |  4 ++++
 tools/mtmd/clip.cpp             | 34 ++++++++++++++++++++++++++-------
 tools/mtmd/clip.h               |  4 +++-
 tools/mtmd/models/models.h      |  1 +
 tools/mtmd/mtmd.cpp             | 11 +++++++++++
 tools/server/server-context.cpp |  3 ++-
 6 files changed, 48 insertions(+), 9 deletions(-)

diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h
index 7d10586217b8..c84b32880b5d 100644
--- a/tools/mtmd/clip-graph.h
+++ b/tools/mtmd/clip-graph.h
@@ -54,6 +54,10 @@ struct clip_graph {
     virtual ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const;
     // TODO: build_mm(w, b, x) to support bias
 
+    virtual bool support_batch() const {
+        return false;
+    }
+
     //
     // utility functions
     //
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 8ca66ac5c46b..f3d4f81e44b5 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -171,6 +171,8 @@ struct clip_ctx {
     std::map<ggml_backend_dev_t, size_t> mem_usage;
     std::map<ggml_backend_dev_t, size_t> mem_compute;
 
+    bool support_batch = false;
+
     clip_ctx(clip_context_params & ctx_params) {
         flash_attn_type = ctx_params.flash_attn_type;
         no_alloc = ctx_params.no_alloc;
@@ -862,7 +864,7 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale
     return cur;
 }
 
-static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
+static std::unique_ptr<clip_graph> clip_get_graph_builder(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
     const clip_image_f32 & img = *imgs.entries[0];
     std::unique_ptr<clip_graph> builder;
 
@@ -1025,7 +1027,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     // TODO [QWEN_VIDEO]: improve this in the future
     builder->n_batch = imgs.entries.size();
 
-    return builder->build();
+    return builder;
 }
 
 //
@@ -2819,7 +2821,7 @@ struct clip_model_loader {
         std::vector<support_info_op> ops;
     };
 
-    static void warmup(clip_ctx & ctx_clip) {
+    static clip_image_f32_batch get_dummy_batch(clip_ctx & ctx_clip) {
         // create a fake batch
         const auto & hparams = ctx_clip.model.hparams;
         clip_image_f32_batch batch;
@@ -2833,6 +2835,20 @@ struct clip_model_loader {
             LOG_INF("%s: warmup with audio size = %d\n", __func__, hparams.warmup_audio_size);
         }
         batch.entries.push_back(std::move(img));
+        return batch;
+    }
+
+    static void init_ctx(clip_ctx & ctx_clip) {
+        ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
+
+        // check batching support
+        auto batch = get_dummy_batch(ctx_clip);
+        auto builder = clip_get_graph_builder(&ctx_clip, batch);
+        ctx_clip.support_batch = builder->support_batch();
+    }
+
+    static void warmup(clip_ctx & ctx_clip) {
+        auto batch = get_dummy_batch(ctx_clip);
         warmup(ctx_clip, batch);
     }
 
@@ -2905,9 +2921,7 @@ struct clip_model_loader {
 
     // only initialize backend buffers, but do not allocate them yet
     static support_info_graph reserve_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
-        ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
-
-        ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch);
+        ggml_cgraph * gf = clip_get_graph_builder(&ctx_clip, batch)->build();
         ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);
 
         ctx_clip.mem_compute.clear();
@@ -3070,6 +3084,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
             ctx_vision = new clip_ctx(ctx_params);
             loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION);
             loader.load_tensors(*ctx_vision);
+            loader.init_ctx(*ctx_vision);
             if (ctx_params.warmup) {
                 loader.warmup(*ctx_vision);
             }
@@ -3083,6 +3098,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
             ctx_audio = new clip_ctx(ctx_params);
             loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
             loader.load_tensors(*ctx_audio);
+            loader.init_ctx(*ctx_audio);
             if (ctx_params.warmup) {
                 loader.warmup(*ctx_audio);
             }
@@ -3506,7 +3522,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
 
     // build the inference graph
     ggml_backend_sched_reset(ctx->sched.get());
-    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
+    ggml_cgraph * gf = clip_get_graph_builder(ctx, imgs)->build();
     ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
 
     // set inputs
@@ -4572,6 +4588,10 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
     return ctx->model.modality == CLIP_MODALITY_AUDIO;
 }
 
+bool clip_support_batch(const struct clip_ctx * ctx) {
+    return ctx->support_batch;
+}
+
 // TODO @ngxson : this is no longer true with mtmd_batch API
 // this was only meant to be used by qwen-vl-based models, to fuse 2 input images into one (qwen-vl video support)
 // this logic should be refactored in near future to distinctly handle "merge frames" and "batching"
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index e88d9ab202f6..03cf649f41cb 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -107,7 +107,9 @@ bool clip_is_llava(const struct clip_ctx * ctx);
 bool clip_has_vision_encoder(const struct clip_ctx * ctx);
 bool clip_has_audio_encoder(const struct clip_ctx * ctx);
 
-int clip_model_n_batch_max(const struct clip_ctx * ctx);
+bool clip_support_batch(const struct clip_ctx * ctx);
+
+int clip_model_n_batch_max(const struct clip_ctx * ctx); // TODO @ngxson : remove, refactor this
 
 std::map<ggml_backend_dev_t, size_t> clip_get_mem_usage(const struct clip_ctx * ctx);
 
diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h
index 12082a5280a8..3a15f76829b9 100644
--- a/tools/mtmd/models/models.h
+++ b/tools/mtmd/models/models.h
@@ -16,6 +16,7 @@ struct clip_graph_gemma4v : clip_graph {
     clip_graph_gemma4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
     ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override;
+    bool support_batch() const override { return true; }
 };
 
 struct clip_graph_gemma4uv : clip_graph {
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 568431e5f518..c947bf085b9c 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -1519,12 +1519,23 @@ int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk)
         return 1;
     }
 
+    auto * ctx = batch->ctx->get_clip_ctx(chunk);
+    if (!ctx) {
+        LOG_ERR("%s: model does not support input chunk type %d\n", __func__, (int)chunk->type);
+        return 1;
+    }
+
     if (batch->entries.empty()) {
         // batch must have at least one chunk
         batch->entries.push_back(chunk);
         return 0;
     }
 
+    if (!clip_support_batch(ctx)) {
+        // if no batching support, batch can only have one single chunk
+        return 2; // "batch too large" error code
+    }
+
     int32_t new_n_tokens = batch->n_tokens() + (int32_t)mtmd_input_chunk_get_n_tokens(chunk);
     if (new_n_tokens > batch->ctx->batch_max_tokens) {
         return 2; // "batch too large" error code
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index c7bf73813bd3..112500b09f75 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -658,10 +658,11 @@ struct server_slot {
             res = mtmd_batch_add_chunk(mbatch.get(), next_chunk->get());
             n_added += (res == 0 ? 1 : 0);
             idx_cur = next_idx;
-            SLT_INF(*this, "try adding chunk idx = %zu to batch, res = %d\n", next_idx, res);
+            SLT_DBG(*this, "try adding media chunk idx = %zu to batch, res = %d\n", next_idx, res);
             // if res != 0, batch is full or chunk is not compatible -> this loop breaks
         }
 
+        // TODO @ngxson : move this log line to debug when it becomes more stable
         SLT_INF(*this, "encoding mtmd batch from idx = %zu, n_chunks = %d\n", idx, n_added);
 
         res = mtmd_batch_encode(mbatch.get());

From 7a22484c58b6033feaa84a6751358667bc00a9a8 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 11 Jun 2026 18:03:39 +0200
Subject: [PATCH 07/13] fix 0.0 output embd

---
 tools/mtmd/clip.cpp | 4 +++-
 tools/mtmd/mtmd.cpp | 9 +++++----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index f3d4f81e44b5..b08589831fc2 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -4445,13 +4445,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             GGML_ABORT("Output buffer size mismatch");
         }
         ggml_backend_tensor_get(embeddings, out_batch_embd.data(), 0, ggml_nbytes(embeddings));
+    } else {
+        LOG_WRN("%s: output buffer is empty, skipping copy\n", __func__);
     }
 
     // Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set
     if (ctx->debug_output_embeddings) {
         const int64_t n_embd = embeddings->ne[0];
         const int64_t n_tokens = embeddings->ne[1];
-        std::vector<float> emb_data(n_embd * n_tokens);
+        std::vector<float> emb_data(ggml_nelements(embeddings));
         ggml_backend_tensor_get(embeddings, emb_data.data(), 0, ggml_nbytes(embeddings));
 
         LOG_INF("\n=== MTMD_DEBUG_EMBEDDINGS ===\n");
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index c947bf085b9c..d8844fb84f7d 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -1439,14 +1439,15 @@ int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tok
     }
     auto proj_type = clip_get_projector_type(ctx_clip);
     int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
-    std::vector<float> & out_embd = ctx->out_embd;
+    std::vector<float> * out_embd_ptr;
     if (out_batch_embd) {
-        // caller need to resize out_batch_embd
-        out_embd = *out_batch_embd;
+        // IMPORTANT: caller must resize out_batch_embd to the exact output size beforehand (size, not just capacity; it is not resized here)
+        out_embd_ptr = out_batch_embd;
     } else {
         ctx->out_embd.resize(image_tokens->n_tokens() * n_mmproj_embd);
-        out_embd = ctx->out_embd;
+        out_embd_ptr = &ctx->out_embd;
     }
+    std::vector<float> & out_embd = *out_embd_ptr;
     bool ok = false;
 
     if (clip_is_llava(ctx_clip)

From 2dd581ae5b91e2e334190ec51718cc54f59f0b63 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 11 Jun 2026 18:14:12 +0200
Subject: [PATCH 08/13] fix audio

---
 tools/mtmd/mtmd.cpp | 124 ++++++++++++++++++++++++++------------------
 1 file changed, 73 insertions(+), 51 deletions(-)

diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index d8844fb84f7d..f99c774d4dcf 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -1383,55 +1383,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
     }
 }
 
-// forward declaration
-int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector<float> * out_batch_embd = nullptr);
-
-int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
-    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
-        LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
-        return 0;
-    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-        if (!ctx->ctx_v) {
-            LOG_ERR("%s: model does not support vision input\n", __func__);
-            return 1;
-        }
-        if (chunk->tokens_image == nullptr) {
-            LOG_ERR("%s: image tokens are null\n", __func__);
-            return 1;
-        }
-        if (chunk->tokens_image->is_placeholder()) {
-            LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
-            return 1;
-        }
-        return mtmd_encode_impl(ctx, chunk->tokens_image.get());
-    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
-        if (!ctx->ctx_a) {
-            LOG_ERR("%s: model does not support audio input\n", __func__);
-            return 1;
-        }
-        if (chunk->tokens_audio == nullptr) {
-            LOG_ERR("%s: audio tokens are null\n", __func__);
-            return 1;
-        }
-        if (chunk->tokens_audio->is_placeholder()) {
-            LOG_ERR("%s: audio tokens batch is placeholder\n", __func__);
-            return 1;
-        }
-        int n_mmproj_embd = ctx->n_embd_out();
-        ctx->out_embd.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
-        bool ok = clip_image_batch_encode(
-            ctx->ctx_a,
-            ctx->n_threads,
-            &chunk->tokens_audio->batch_f32,
-            ctx->out_embd);
-        return ok ? 0 : 1;
-    }
-
-    LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type);
-    return 1;
-}
-
-int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector<float> * out_batch_embd) {
+static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector<float> * out_batch_embd) {
     clip_ctx * ctx_clip = ctx->ctx_v;
     if (!ctx_clip) {
         LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
@@ -1491,9 +1443,64 @@ int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tok
     return ok ? 0 : 1;
 }
 
+static int32_t mtmd_encode_chunk_impl(mtmd_context * ctx, const mtmd_input_chunk * chunk, std::vector<float> * out_batch_embd = nullptr) {
+    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+        LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
+        return 0;
+    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+        if (!ctx->ctx_v) {
+            LOG_ERR("%s: model does not support vision input\n", __func__);
+            return 1;
+        }
+        if (chunk->tokens_image == nullptr) {
+            LOG_ERR("%s: image tokens are null\n", __func__);
+            return 1;
+        }
+        if (chunk->tokens_image->is_placeholder()) {
+            LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
+            return 1;
+        }
+        return mtmd_encode_impl(ctx, chunk->tokens_image.get(), out_batch_embd);
+    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+        if (!ctx->ctx_a) {
+            LOG_ERR("%s: model does not support audio input\n", __func__);
+            return 1;
+        }
+        if (chunk->tokens_audio == nullptr) {
+            LOG_ERR("%s: audio tokens are null\n", __func__);
+            return 1;
+        }
+        if (chunk->tokens_audio->is_placeholder()) {
+            LOG_ERR("%s: audio tokens batch is placeholder\n", __func__);
+            return 1;
+        }
+        int n_mmproj_embd = ctx->n_embd_out();
+        ctx->out_embd.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
+        bool ok = clip_image_batch_encode(
+            ctx->ctx_a,
+            ctx->n_threads,
+            &chunk->tokens_audio->batch_f32,
+            out_batch_embd ? *out_batch_embd : ctx->out_embd);
+        return ok ? 0 : 1;
+    }
+
+    LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type);
+    return 1;
+}
+
+int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
+    // this is the non-batching version
+    try {
+        return mtmd_encode_chunk_impl(ctx, chunk, &ctx->out_embd);
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: error: %s\n", __func__, e.what());
+        return 1;
+    }
+}
+
 int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
     try {
-        return mtmd_encode_impl(ctx, image_tokens);
+        return mtmd_encode_impl(ctx, image_tokens, &ctx->out_embd);
     } catch (const std::exception & e) {
         LOG_ERR("%s: error: %s\n", __func__, e.what());
         return 1;
@@ -1582,6 +1589,18 @@ int32_t mtmd_batch_encode(mtmd_batch * batch) {
                 b0_f32.entries.push_back(std::move(b1_f32.entries[i]));
             }
         }
+    } else if (batch_chunk->tokens_audio) {
+        auto & b0_f32 = batch_chunk->tokens_audio->batch_f32;
+        // copy all entries from other chunks into the first chunk's batch_f32
+        // note: skip first entry because it's already in batch_chunk
+        for (size_t ic = 1; ic < batch->entries.size(); ic++) {
+            auto & chunk = batch->entries[ic];
+            GGML_ASSERT(chunk->tokens_audio);
+            auto b1_f32 = chunk->tokens_audio->batch_f32.clone();
+            for (size_t i = 0; i < b1_f32.entries.size(); i++) {
+                b0_f32.entries.push_back(std::move(b1_f32.entries[i]));
+            }
+        }
     } else {
         LOG_ERR("%s: unsupported chunk type\n", __func__);
         return 1;
@@ -1589,7 +1608,10 @@ int32_t mtmd_batch_encode(mtmd_batch * batch) {
 
     LOG_DBG("%s: encoding batch with %zu entries and total %zu tokens\n",
             __func__, batch->entries.size(), mtmd_input_chunk_get_n_tokens(batch_chunk.get()));
-    int32_t res = mtmd_encode_impl(batch->ctx, batch_chunk->tokens_image.get(), &batch->output_embd);
+    int32_t res = mtmd_encode_chunk_impl(
+        batch->ctx,
+        batch_chunk.get(),
+        &batch->output_embd);
     return res;
 }
 

From de656cc356135612a6c6719d14b3f07d93351aa1 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 11 Jun 2026 18:43:06 +0200
Subject: [PATCH 09/13] mtmd: harden batch encode error handling and null checks

---
 tools/mtmd/mtmd.cpp | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index f99c774d4dcf..c8850951c484 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -174,9 +174,9 @@ struct mtmd_input_chunk {
 
     bool is_placeholder() const {
         if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-            return tokens_image->is_placeholder();
+            return tokens_image && tokens_image->is_placeholder();
         } else if (type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
-            return tokens_audio->is_placeholder();
+            return tokens_audio && tokens_audio->is_placeholder();
         }
         return false;
     }
@@ -1393,7 +1393,7 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im
     int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
     std::vector<float> * out_embd_ptr;
     if (out_batch_embd) {
-        // IMPORTANT: caller must ensure out_batch_embd has enough capacity
+        // IMPORTANT: caller must ensure out_batch_embd has enough capacity; clip_image_encode will check for it
         out_embd_ptr = out_batch_embd;
     } else {
         ctx->out_embd.resize(image_tokens->n_tokens() * n_mmproj_embd);
@@ -1420,11 +1420,16 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im
             }
             int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
             std::vector<float> tmp_embd(n_tokens_per_image * n_mmproj_embd);
-            ok = clip_image_encode(
+            bool ok_i = clip_image_encode(
                 ctx_clip,
                 ctx->n_threads,
                 entries[i].get(),
                 tmp_embd);
+            if (!ok_i) {
+                LOG_ERR("%s: failed to encode image %zu\n", __func__, i);
+                return 1;
+            }
+            ok = true;
             std::copy(tmp_embd.begin(), tmp_embd.end(), out_embd.begin() + offset);
             offset += static_cast<size_t>(n_mmproj_embd) * n_tokens_per_image;
         }
@@ -1558,7 +1563,7 @@ int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk)
     return 3; // "cannot batch" error code
 }
 
-int32_t mtmd_batch_encode(mtmd_batch * batch) {
+static int32_t mtmd_batch_encode_impl(mtmd_batch * batch) {
     if (batch->entries.empty()) {
         LOG_ERR("%s: batch is empty\n", __func__);
         return 1;
@@ -1615,19 +1620,33 @@ int32_t mtmd_batch_encode(mtmd_batch * batch) {
     return res;
 }
 
+int32_t mtmd_batch_encode(mtmd_batch * batch) {
+    try {
+        return mtmd_batch_encode_impl(batch);
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: error: %s\n", __func__, e.what());
+        return 1;
+    }
+}
+
 float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk) {
+    if (batch->output_embd.empty()) {
+        LOG_ERR("%s: batch has not been encoded yet\n", __func__);
+        return nullptr;
+    }
     size_t offset = 0;
+    const size_t n_embd = batch->ctx->n_embd_out();
     for (const auto * c : batch->entries) {
         size_t offset_prev = offset;
         size_t n_tokens = mtmd_input_chunk_get_n_tokens(c);
-        offset += n_tokens * batch->ctx->n_embd_out();
+        offset += n_tokens * n_embd;
         GGML_ASSERT(offset_prev <  batch->output_embd.size());
         GGML_ASSERT(offset      <= batch->output_embd.size());
         if (c == chunk) {
             return &batch->output_embd.data()[offset_prev];
         }
     }
-    return nullptr;
+    return nullptr; // not found
 }
 
 bool mtmd_decode_use_non_causal(const mtmd_context * ctx, const mtmd_input_chunk * chunk) {

From 67d433505da770e70ed137d4a95771a0b5725a45 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 11 Jun 2026 19:09:36 +0200
Subject: [PATCH 10/13] mtmd: rename batch_max to temporal_merge, pass out_embd by reference

---
 tools/mtmd/clip.cpp |  4 +--
 tools/mtmd/clip.h   |  2 +-
 tools/mtmd/mtmd.cpp | 69 +++++++++++++++++++++++++--------------------
 3 files changed, 41 insertions(+), 34 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index b08589831fc2..1028006505ba 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -4594,10 +4594,10 @@ bool clip_support_batch(const struct clip_ctx * ctx) {
     return ctx->support_batch;
 }
 
-// TODO @ngxson : this is no longer true with mtmd_batch API
+// TODO @ngxson : this is no longer correct with mtmd_batch API
 // this was only meant to be used by qwen-vl-based models, to fuse 2 input images into one (qwen-vl video support)
 // this logic should be refactored in near future to distinctly handle "merge frames" and "batching"
-int clip_model_n_batch_max(const struct clip_ctx * ctx) {
+int clip_model_n_temporal_merge(const struct clip_ctx * ctx) {
     switch (ctx->proj_type()) {
         case PROJECTOR_TYPE_QWEN2VL:
         case PROJECTOR_TYPE_QWEN25VL:
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index 03cf649f41cb..7197af8569e7 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -109,7 +109,7 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx);
 
 bool clip_support_batch(const struct clip_ctx * ctx);
 
-int clip_model_n_batch_max(const struct clip_ctx * ctx); // TODO @ngxson : remove, refactor this
+int clip_model_n_temporal_merge(const struct clip_ctx * ctx); // TODO @ngxson : remove, refactor this
 
 std::map<ggml_backend_dev_t, size_t> clip_get_mem_usage(const struct clip_ctx * ctx);
 
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index c8850951c484..9131012d9841 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -69,8 +69,8 @@ struct mtmd_bitmap {
         return data.size();
     }
 
-    bool can_batch_with(const mtmd_bitmap & other) const {
-        // [QWEN_VIDEO] can batch if both are images with same size
+    bool can_merge_with(const mtmd_bitmap & other) const {
+        // [QWEN_VIDEO] can (temporal) merge if both are images with same size
         return !is_audio && !other.is_audio && nx == other.nx && ny == other.ny;
     }
 
@@ -90,12 +90,24 @@ struct mtmd_image_tokens {
     uint32_t ny = 0; // number of tokens in y direction
     mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL;
     uint32_t image_idx = 0; // 0-based position of this image among image chunks in the prompt(used by pos == MTMD_POS_TYPE_HUNYUANVL)
+    uint32_t n_temporal_merge = 1; // for qwen-vl style temporal merge
     uint32_t n_tokens() const {
         if (pos == MTMD_POS_TYPE_HUNYUANVL) {
             // [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
             return (nx + 1) * ny + 2;
         }
-        return nx * ny;
+        // [QWEN_VIDEO] this logic is quite ugly, it's mostly to make qwen-vl temporal merge work, can be improved in the future
+        if (batch_f32.entries.size() == 1) {
+            return nx * ny;
+        }
+        uint32_t nz = batch_f32.entries.size();
+        // TODO: simplify this by repeating the last frame until it fits the temporal merge
+        if (nz % n_temporal_merge != 0) {
+            nz = nz / n_temporal_merge + 1;
+        } else {
+            nz = nz / n_temporal_merge;
+        }
+        return nx * ny * nz;
     }
     clip_image_f32_batch batch_f32; // preprocessed image patches
     std::string id; // optional user-defined ID, useful for KV cache tracking
@@ -120,6 +132,7 @@ struct mtmd_image_tokens {
             ny,
             pos,
             image_idx,
+            n_temporal_merge,
             batch_f32.clone(),
             id
         };
@@ -901,7 +914,7 @@ struct mtmd_tokenizer {
         // [QWEN_VIDEO] handle frame merging for models that support it (i.e. qwen-vl)
         int n_merge_frames = 1;
         if (ctx->ctx_v) {
-            n_merge_frames = clip_model_n_batch_max(ctx->ctx_v);
+            n_merge_frames = clip_model_n_temporal_merge(ctx->ctx_v);
             GGML_ASSERT(n_merge_frames <= 2 && "we only support merging maximum 2 images for now; open an issue if this model supports merging more");
         }
 
@@ -916,7 +929,7 @@ struct mtmd_tokenizer {
                 if (i + 1 < parts.size() && parts[i + 1].bitmap != nullptr) {
                     const mtmd_bitmap * bm_a = parts[i].bitmap;
                     const mtmd_bitmap * bm_b = parts[i + 1].bitmap;
-                    if (bm_a->can_batch_with(*bm_b)) {
+                    if (bm_a->can_merge_with(*bm_b)) {
                         LOG_DBG("%s: merging 2 frames at part index %zu and %zu\n", __func__, i, i + 1);
                         merged_bitmaps.push_back({bm_a, bm_b});
                         parts.erase(parts.begin() + i + 1); // collapse the second bitmap part
@@ -1159,13 +1172,17 @@ struct mtmd_tokenizer {
                 size_t n_tokens = 0;
                 for (const auto & e : batch_f32.entries) {
                     n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get());
-                    if (clip_model_n_batch_max(ctx->ctx_v) == 2) {
+                    if (clip_model_n_temporal_merge(ctx->ctx_v) == 2) {
                         // [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image
                         break;
                     }
                 }
 
                 mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
+
+                // [QWEN_VIDEO] improve this in the future
+                image_tokens->n_temporal_merge = clip_model_n_temporal_merge(ctx->ctx_v);
+
                 if (mtmd_decode_use_mrope(ctx)) {
                     // for Qwen2VL, we need this information for M-RoPE decoding positions
                     image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
@@ -1383,23 +1400,18 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
     }
 }
 
-static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector<float> * out_batch_embd) {
+static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector<float> & out_embd) {
     clip_ctx * ctx_clip = ctx->ctx_v;
     if (!ctx_clip) {
         LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
         return 1;
     }
     auto proj_type = clip_get_projector_type(ctx_clip);
-    int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
-    std::vector<float> * out_embd_ptr;
-    if (out_batch_embd) {
-        // IMPORTANT: caller must ensure out_batch_embd has enough capacity; clip_image_encode will check for it
-        out_embd_ptr = out_batch_embd;
-    } else {
-        ctx->out_embd.resize(image_tokens->n_tokens() * n_mmproj_embd);
-        out_embd_ptr = &ctx->out_embd;
-    }
-    std::vector<float> & out_embd = *out_embd_ptr;
+
+    int n_embd_out = ctx->n_embd_out();
+    auto n_tokens_out = image_tokens->n_tokens();
+    out_embd.resize((size_t)n_embd_out * n_tokens_out);
+
     bool ok = false;
 
     if (clip_is_llava(ctx_clip)
@@ -1419,7 +1431,7 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im
                 return 1;
             }
             int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
-            std::vector<float> tmp_embd(n_tokens_per_image * n_mmproj_embd);
+            std::vector<float> tmp_embd(n_tokens_per_image * n_embd_out);
             bool ok_i = clip_image_encode(
                 ctx_clip,
                 ctx->n_threads,
@@ -1431,7 +1443,7 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im
             }
             ok = true;
             std::copy(tmp_embd.begin(), tmp_embd.end(), out_embd.begin() + offset);
-            offset += static_cast<size_t>(n_mmproj_embd) * n_tokens_per_image;
+            offset += static_cast<size_t>(n_embd_out) * n_tokens_per_image;
         }
     } else {
         if (image_tokens->is_placeholder()) {
@@ -1448,7 +1460,7 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im
     return ok ? 0 : 1;
 }
 
-static int32_t mtmd_encode_chunk_impl(mtmd_context * ctx, const mtmd_input_chunk * chunk, std::vector<float> * out_batch_embd = nullptr) {
+static int32_t mtmd_encode_chunk_impl(mtmd_context * ctx, const mtmd_input_chunk * chunk, std::vector<float> & out_embd) {
     if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
         LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
         return 0;
@@ -1465,7 +1477,7 @@ static int32_t mtmd_encode_chunk_impl(mtmd_context * ctx, const mtmd_input_chunk
             LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
             return 1;
         }
-        return mtmd_encode_impl(ctx, chunk->tokens_image.get(), out_batch_embd);
+        return mtmd_encode_impl(ctx, chunk->tokens_image.get(), out_embd);
     } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
         if (!ctx->ctx_a) {
             LOG_ERR("%s: model does not support audio input\n", __func__);
@@ -1480,12 +1492,12 @@ static int32_t mtmd_encode_chunk_impl(mtmd_context * ctx, const mtmd_input_chunk
             return 1;
         }
         int n_mmproj_embd = ctx->n_embd_out();
-        ctx->out_embd.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
+        out_embd.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
         bool ok = clip_image_batch_encode(
             ctx->ctx_a,
             ctx->n_threads,
             &chunk->tokens_audio->batch_f32,
-            out_batch_embd ? *out_batch_embd : ctx->out_embd);
+            out_embd);
         return ok ? 0 : 1;
     }
 
@@ -1496,7 +1508,7 @@ static int32_t mtmd_encode_chunk_impl(mtmd_context * ctx, const mtmd_input_chunk
 int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
     // this is the non-batching version
     try {
-        return mtmd_encode_chunk_impl(ctx, chunk, &ctx->out_embd);
+        return mtmd_encode_chunk_impl(ctx, chunk, ctx->out_embd);
     } catch (const std::exception & e) {
         LOG_ERR("%s: error: %s\n", __func__, e.what());
         return 1;
@@ -1505,7 +1517,7 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
 
 int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
     try {
-        return mtmd_encode_impl(ctx, image_tokens, &ctx->out_embd);
+        return mtmd_encode_impl(ctx, image_tokens, ctx->out_embd);
     } catch (const std::exception & e) {
         LOG_ERR("%s: error: %s\n", __func__, e.what());
         return 1;
@@ -1568,17 +1580,12 @@ static int32_t mtmd_batch_encode_impl(mtmd_batch * batch) {
         LOG_ERR("%s: batch is empty\n", __func__);
         return 1;
     }
-
-    // allocate output_embd
-    size_t n_embd = 0;
     for (const auto * chunk : batch->entries) {
         if (chunk->is_placeholder()) {
             LOG_ERR("%s: chunk is placeholder\n", __func__);
             return 1;
         }
-        n_embd += mtmd_input_chunk_get_n_tokens(chunk) * batch->ctx->n_embd_out();
     }
-    batch->output_embd.resize(n_embd);
 
     // represent the whole batch as one single chunk
     mtmd::input_chunk_ptr batch_chunk(mtmd_input_chunk_copy(batch->entries[0]));
@@ -1616,7 +1623,7 @@ static int32_t mtmd_batch_encode_impl(mtmd_batch * batch) {
     int32_t res = mtmd_encode_chunk_impl(
         batch->ctx,
         batch_chunk.get(),
-        &batch->output_embd);
+        batch->output_embd);
     return res;
 }
 

From 0d6bc77df748223357aa50f56f485a361515bb67 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 11 Jun 2026 19:32:42 +0200
Subject: [PATCH 11/13] clip: check batch size on encode; mtmd: size_t casts; server: use lctx

---
 tools/mtmd/clip.cpp             | 6 +++++-
 tools/mtmd/mtmd.cpp             | 4 ++--
 tools/server/server-context.cpp | 4 ++--
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 1028006505ba..603f95d11723 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -3513,7 +3513,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const clip_image_f32_batch & imgs = *imgs_c_ptr;
     int n_batch_cur = imgs.entries.size();
 
-    // TODO: check batching condition
+    // [QWEN_VIDEO] for video models, the batch dimension is used as temporal dimension for merged frames
+    if (!ctx->support_batch && n_batch_cur > clip_model_n_temporal_merge(ctx)) {
+        LOG_ERR("%s: batch size %d exceeds maximum supported batch/temporal-merge size %d\n", __func__, n_batch_cur, clip_model_n_temporal_merge(ctx));
+        return false;
+    }
 
     // if buffers are not allocated, we need to do a warmup run to allocate them
     if (!ctx->is_allocated) {
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 9131012d9841..5a0bb982346c 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -1431,7 +1431,7 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im
                 return 1;
             }
             int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
-            std::vector<float> tmp_embd(n_tokens_per_image * n_embd_out);
+            std::vector<float> tmp_embd((size_t)n_tokens_per_image * n_embd_out);
             bool ok_i = clip_image_encode(
                 ctx_clip,
                 ctx->n_threads,
@@ -1492,7 +1492,7 @@ static int32_t mtmd_encode_chunk_impl(mtmd_context * ctx, const mtmd_input_chunk
             return 1;
         }
         int n_mmproj_embd = ctx->n_embd_out();
-        out_embd.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
+        out_embd.resize((size_t)chunk->tokens_audio->n_tokens * n_mmproj_embd);
         bool ok = clip_image_batch_encode(
             ctx->ctx_a,
             ctx->n_threads,
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 112500b09f75..595fc219b28c 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -612,12 +612,12 @@ struct server_slot {
                         llama_pos new_n_past; // unused for now
                         res = mtmd_helper_decode_image_chunk(
                             mctx,
-                            ctx_tgt,
+                            lctx,
                             chunk.get(),
                             embd,
                             prompt.tokens.pos_next(),
                             id,
-                            llama_n_batch(ctx_tgt),
+                            llama_n_batch(lctx),
                             &new_n_past
                         );
                         if (res != 0) {

From b3a5ca93c6e7b8f2ddbc9e2b033b79b60faa1567 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Fri, 12 Jun 2026 19:11:58 +0200
Subject: [PATCH 12/13] fix non-batching case

---
 tools/mtmd/mtmd.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 5a0bb982346c..8e839ef8f46a 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -97,7 +97,7 @@ struct mtmd_image_tokens {
             return (nx + 1) * ny + 2;
         }
         // [QWEN_VIDEO] this logic is quite ugly, it's mostly to make qwen-vl temporal merge work, can be improved in the future
-        if (batch_f32.entries.size() == 1) {
+        if (batch_f32.entries.size() == 1 || n_temporal_merge == 1) {
             return nx * ny;
         }
         uint32_t nz = batch_f32.entries.size();

From 4cf7759921e8ad89b84450d0a3e2f9e3d71ea02c Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Fri, 12 Jun 2026 20:52:22 +0200
Subject: [PATCH 13/13] fix comment

---
 tools/mtmd/clip.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 603f95d11723..208486fd153b 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -316,7 +316,7 @@ ggml_tensor * clip_graph::build_vit(
             std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos,
             const build_vit_opts & opts
         ) {
-    // batch dim: inp is [n_embd, n_pos] (B==1) or [n_embd, n_pos, B] (multi-tile encode)
+    // batch dim: inp is [n_embd, n_pos, B]
     const int64_t B = inp->ne[2];
 
     if (learned_pos_embd) {