Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2243,6 +2243,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.image_max_tokens = value;
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
// --mtmd-batch-max-tokens N
// Registers the option that sets params.mtmd_batch_max_tokens, described in the
// help text as the maximum number of image tokens per batch when encoding images.
// The help string embeds the current value of params.mtmd_batch_max_tokens as the default.
// The handler stores the parsed integer as-is (no range validation is done here).
// The option is attached to LLAMA_EXAMPLE_SERVER only and is associated with the
// LLAMA_ARG_MTMD_BATCH_MAX_TOKENS environment variable via set_env().
add_opt(common_arg(
{"--mtmd-batch-max-tokens"}, "N",
string_format("maximum number of image tokens per batch when encoding images (default: %d)", params.mtmd_batch_max_tokens),
[](common_params & params, int value) {
params.mtmd_batch_max_tokens = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MTMD_BATCH_MAX_TOKENS"));
if (llama_supports_rpc()) {
add_opt(common_arg(
{"--rpc"}, "SERVERS",
Expand Down
1 change: 1 addition & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,7 @@ struct common_params {
std::vector<std::string> image; // path to image file(s) ; TODO: change the name to "media"
int image_min_tokens = -1;
int image_max_tokens = -1;
int mtmd_batch_max_tokens = 1024;

// finetune
struct lr_opt lr;
Expand Down
4 changes: 4 additions & 0 deletions tools/mtmd/clip-graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ struct clip_graph {
virtual ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const;
// TODO: build_mm(w, b, x) to support bias

// Reports whether this graph builder supports batching.
// The base implementation returns false; a derived builder opts in by
// overriding this to return true.
virtual bool support_batch() const {
return false;
}

//
// utility functions
//
Expand Down
75 changes: 53 additions & 22 deletions tools/mtmd/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,8 @@ struct clip_ctx {
std::map<ggml_backend_dev_t, size_t> mem_usage;
std::map<ggml_backend_dev_t, size_t> mem_compute;

bool support_batch = false;

clip_ctx(clip_context_params & ctx_params) {
flash_attn_type = ctx_params.flash_attn_type;
no_alloc = ctx_params.no_alloc;
Expand Down Expand Up @@ -862,7 +864,7 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale
return cur;
}

static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
static std::unique_ptr<clip_graph> clip_get_graph_builder(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
const clip_image_f32 & img = *imgs.entries[0];
std::unique_ptr<clip_graph> builder;

Expand Down Expand Up @@ -1025,7 +1027,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
// TODO [QWEN_VIDEO]: improve this in the future
builder->n_batch = imgs.entries.size();

return builder->build();
return builder;
}

//
Expand Down Expand Up @@ -2819,7 +2821,7 @@ struct clip_model_loader {
std::vector<support_info_op> ops;
};

static void warmup(clip_ctx & ctx_clip) {
static clip_image_f32_batch get_dummy_batch(clip_ctx & ctx_clip) {
// create a fake batch
const auto & hparams = ctx_clip.model.hparams;
clip_image_f32_batch batch;
Expand All @@ -2833,6 +2835,20 @@ struct clip_model_loader {
LOG_INF("%s: warmup with audio size = %d\n", __func__, hparams.warmup_audio_size);
}
batch.entries.push_back(std::move(img));
return batch;
}

// One-time context setup performed after the tensors are loaded:
//  1. size buf_compute_meta to hold max_nodes tensor overheads plus one graph overhead
//  2. record in ctx_clip.support_batch whether the graph builder selected for this
//     model reports batching support
static void init_ctx(clip_ctx & ctx_clip) {
    // the metadata buffer is resized before any graph builder is created
    // (presumably the builder relies on it - keep this ordering)
    const auto meta_size = ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead();
    ctx_clip.buf_compute_meta.resize(meta_size);

    // probe the builder with a dummy batch; only its support_batch() answer is kept
    const auto dummy_batch = get_dummy_batch(ctx_clip);
    ctx_clip.support_batch = clip_get_graph_builder(&ctx_clip, dummy_batch)->support_batch();
}

// Convenience overload: builds the dummy batch for this context and forwards
// it to the two-argument warmup().
static void warmup(clip_ctx & ctx_clip) {
    clip_image_f32_batch dummy_batch = get_dummy_batch(ctx_clip);
    warmup(ctx_clip, dummy_batch);
}

Expand Down Expand Up @@ -2905,9 +2921,7 @@ struct clip_model_loader {

// only initialize backend buffers, but do not allocate them yet
static support_info_graph reserve_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());

ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch);
ggml_cgraph * gf = clip_get_graph_builder(&ctx_clip, batch)->build();
ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);

ctx_clip.mem_compute.clear();
Expand Down Expand Up @@ -3070,6 +3084,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
ctx_vision = new clip_ctx(ctx_params);
loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION);
loader.load_tensors(*ctx_vision);
loader.init_ctx(*ctx_vision);
if (ctx_params.warmup) {
loader.warmup(*ctx_vision);
}
Expand All @@ -3083,6 +3098,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
ctx_audio = new clip_ctx(ctx_params);
loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
loader.load_tensors(*ctx_audio);
loader.init_ctx(*ctx_audio);
if (ctx_params.warmup) {
loader.warmup(*ctx_audio);
}
Expand Down Expand Up @@ -3484,25 +3500,22 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
return n_patches;
}

bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, std::vector<float> & out_vec) {
clip_image_f32_batch imgs;
clip_image_f32_ptr img_copy(clip_image_f32_init());
*img_copy = *img;
imgs.entries.push_back(std::move(img_copy));

return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
return clip_image_batch_encode(ctx, n_threads, &imgs, out_vec);
}

bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, std::vector<float> & out_batch_embd) {
const clip_image_f32_batch & imgs = *imgs_c_ptr;
int n_batch_cur = imgs.entries.size();

// maximum supported batch size, usually == 2 for qwen-vl-based models
int n_batch_max = clip_model_n_batch_max(ctx);

// TODO @ngxson : implement batch size > 1 as a loop
// we don't need true batching support because the cgraph will gonna be big anyway
if (n_batch_cur > n_batch_max) {
// [QWEN_VIDEO] for video models, the batch dimension is used as temporal dimension for merged frames
if (!ctx->support_batch && n_batch_cur > clip_model_n_temporal_merge(ctx)) {
LOG_ERR("%s: batch size %d exceeds maximum supported batch/temporal-merge size %d\n", __func__, n_batch_cur, clip_model_n_temporal_merge(ctx));
return false;
}

Expand All @@ -3513,7 +3526,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima

// build the inference graph
ggml_backend_sched_reset(ctx->sched.get());
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
ggml_cgraph * gf = clip_get_graph_builder(ctx, imgs)->build();
ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);

// set inputs
Expand Down Expand Up @@ -3582,6 +3595,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
const int n = nx * ny;

for (int b = 0; b < n_batch_cur; b++) {
LOG_DBG("%s: copying image %d/%d to input buffer (nx=%d, ny=%d)\n", __func__, b+1, n_batch_cur, nx, ny);
const auto & buf = imgs.entries[b]->get_ro_buf();
float * batch_entry = inp_raw.data() + b * (3*n);
for (int y = 0; y < ny; y++) {
Expand Down Expand Up @@ -4416,24 +4430,34 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// the last node is the embedding tensor
ggml_tensor * embeddings = ggml_graph_node(gf, -1);

// sanity check (only support batch size of 1 for now)
// sanity check (assuming that all images in batch have the same number of tokens, so we only check the first one)
const int n_tokens_out = embeddings->ne[1];
const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
if (n_tokens_out != expected_n_tokens_out) {
LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
GGML_ABORT("Invalid number of output tokens");
}

// copy the embeddings to the location passed by the user
if (vec != nullptr) {
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
LOG_DBG("%s: output embedding shape [%d, %d, %d]\n", __func__,
(int)embeddings->ne[0], (int)embeddings->ne[1], (int)embeddings->ne[2]);

// copy output to user buffer if provided
// if output is empty, skip the copy
if (!out_batch_embd.empty()) {
if (out_batch_embd.size() != (size_t)ggml_nelements(embeddings)) {
LOG_ERR("%s: output buffer has %zu elements but expected %zu\n", __func__, out_batch_embd.size(), (size_t)ggml_nelements(embeddings));
GGML_ABORT("Output buffer size mismatch");
}
ggml_backend_tensor_get(embeddings, out_batch_embd.data(), 0, ggml_nbytes(embeddings));
} else {
LOG_WRN("%s: output buffer is empty, skipping copy\n", __func__);
}

// Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set
if (ctx->debug_output_embeddings) {
const int64_t n_embd = embeddings->ne[0];
const int64_t n_tokens = embeddings->ne[1];
std::vector<float> emb_data(n_embd * n_tokens);
std::vector<float> emb_data(ggml_nelements(embeddings));
ggml_backend_tensor_get(embeddings, emb_data.data(), 0, ggml_nbytes(embeddings));

LOG_INF("\n=== MTMD_DEBUG_EMBEDDINGS ===\n");
Expand Down Expand Up @@ -4570,7 +4594,14 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
return ctx->model.modality == CLIP_MODALITY_AUDIO;
}

int clip_model_n_batch_max(const struct clip_ctx * ctx) {
// Returns the support_batch flag stored on the context.
// ctx must not be null (it is dereferenced unconditionally).
bool clip_support_batch(const struct clip_ctx * ctx) {
return ctx->support_batch;
}

// TODO @ngxson : this is no longer correct with the mtmd_batch API
// this was only meant to be used by qwen-vl-based models, to fuse 2 input images into one (qwen-vl video support)
// this logic should be refactored in the near future to handle "merge frames" and "batching" as distinct cases
int clip_model_n_temporal_merge(const struct clip_ctx * ctx) {
switch (ctx->proj_type()) {
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
Expand Down
8 changes: 5 additions & 3 deletions tools/mtmd/clip.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,8 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id
size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data

bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, std::vector<float> & out_vec);
bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, std::vector<float> & out_batch_embd);

bool clip_is_llava(const struct clip_ctx * ctx);
// note for contributor: this clip_is_(model) pattern is deprecated
Expand All @@ -107,7 +107,9 @@ bool clip_is_llava(const struct clip_ctx * ctx);
bool clip_has_vision_encoder(const struct clip_ctx * ctx);
bool clip_has_audio_encoder(const struct clip_ctx * ctx);

int clip_model_n_batch_max(const struct clip_ctx * ctx);
bool clip_support_batch(const struct clip_ctx * ctx);

int clip_model_n_temporal_merge(const struct clip_ctx * ctx); // TODO @ngxson : remove, refactor this

std::map<ggml_backend_dev_t, size_t> clip_get_mem_usage(const struct clip_ctx * ctx);

Expand Down
20 changes: 11 additions & 9 deletions tools/mtmd/models/gemma4v.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ ggml_cgraph * clip_graph_gemma4v::build() {
ggml_set_name(inp_raw, "inp_raw_scaled");

ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd);
inp = ggml_reshape_3d(ctx0, inp, n_patches, n_embd, n_batch);
inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
ggml_set_name(inp, "inp");
// note: no patch bias
Expand Down Expand Up @@ -51,10 +51,11 @@ ggml_cgraph * clip_graph_gemma4v::build() {
// first half
ggml_tensor * first;
{
first = ggml_view_3d(ctx0, cur,
n_dim/2, n_head, n_pos,
first = ggml_view_4d(ctx0, cur,
n_dim/2, n_head, n_pos, n_batch,
cur->nb[1],
cur->nb[2],
cur->nb[3],
0);
first = ggml_rope_ext(
ctx0,
Expand All @@ -70,10 +71,11 @@ ggml_cgraph * clip_graph_gemma4v::build() {
// second half
ggml_tensor * second;
{
second = ggml_view_3d(ctx0, cur,
n_dim/2, n_head, n_pos,
second = ggml_view_4d(ctx0, cur,
n_dim/2, n_head, n_pos, n_batch,
cur->nb[1],
cur->nb[2],
cur->nb[3],
n_dim/2 * ggml_element_size(cur));
second = ggml_rope_ext(
ctx0,
Expand Down Expand Up @@ -103,14 +105,14 @@ ggml_cgraph * clip_graph_gemma4v::build() {
const int kernel_size = hparams.n_merge;
GGML_ASSERT(kernel_size > 0);

// [n_embd, n_patches] -> [n_patches_x, n_patches_y, n_embd, 1]
cur = ggml_cont_4d(ctx0, ggml_transpose(ctx0, cur), n_patches_x, n_patches_y, n_embd, 1);
// [n_embd, n_patches] -> [n_patches_x, n_patches_y, n_embd, n_batch]
cur = ggml_cont_4d(ctx0, ggml_transpose(ctx0, cur), n_patches_x, n_patches_y, n_embd, n_batch);
cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG,
kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
const int out_x = n_patches_x / kernel_size;
const int out_y = n_patches_y / kernel_size;
// [out_x, out_y, n_embd, 1] -> [n_embd, out_x * out_y]
cur = ggml_reshape_3d(ctx0, cur, out_x * out_y, n_embd, 1);
// [out_x, out_y, n_embd, n_batch] -> [n_embd, out_x * out_y, n_batch]
cur = ggml_reshape_3d(ctx0, cur, out_x * out_y, n_embd, n_batch);
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
cur = ggml_scale(ctx0, cur, sqrtf((float)n_embd));
cb(cur, "pooled", -1);
Expand Down
1 change: 1 addition & 0 deletions tools/mtmd/models/models.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ struct clip_graph_gemma4v : clip_graph {
clip_graph_gemma4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override;
bool support_batch() const override { return true; }
};

struct clip_graph_gemma4uv : clip_graph {
Expand Down
13 changes: 8 additions & 5 deletions tools/mtmd/mtmd-helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@ MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image,

// helper function that automatically:
// 1. run llama_decode() on text chunks
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
// 2. run mtmd_encode_chunk() on image chunks, then mtmd_get_output_embd() and then llama_decode()
// if any of the mtmd_encode_chunk() or llama_decode() calls return non-zero, stop and forward the error
// otherwise, returns 0 on success
// this function is NOT thread-safe
MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
Expand Down Expand Up @@ -157,13 +157,16 @@ MTMD_API int32_t mtmd_helper_video_read_next(mtmd_helper_video * ctx,
} // extern "C"
#endif

#ifdef __cplusplus
#include <set>
#include <memory>

namespace mtmd_helper {

//
// C++ wrappers
//

#ifdef __cplusplus
namespace mtmd_helper {

// video-related C++ wrappers
struct mtmd_helper_video_deleter {
void operator()(mtmd_helper_video * val) { mtmd_helper_video_free(val); }
Expand Down
Loading
Loading