Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 24 additions & 16 deletions tools/mtmd/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3596,9 +3596,17 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima

// set input pixel values
if (!imgs.is_audio) {
// detect number of channels from the buffer size
const int nx = imgs.entries[0]->nx;
const int ny = imgs.entries[0]->ny;
const int n = nx * ny;
const size_t buf_size = imgs.entries[0]->buf.size();
const int n_channels = (int)(buf_size / n);
GGML_ASSERT(n_channels == 3 || n_channels == 6);

size_t nelem = 0;
for (const auto & img : imgs.entries) {
nelem += img->nx * img->ny * 3;
nelem += img->nx * img->ny * n_channels;
}
std::vector<float> inp_raw(nelem);

Expand All @@ -3612,21 +3620,21 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// │ H │ channel = B
// └─────┘ │
// ──────┘ x B

for (size_t i = 0; i < imgs.entries.size(); i++) {
const int nx = imgs.entries[i]->nx;
const int ny = imgs.entries[i]->ny;
const int n = nx * ny;

for (int b = 0; b < batch_size; b++) {
float * batch_entry = inp_raw.data() + b * (3*n);
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
size_t base_src = 3*(y * nx + x); // idx of the first channel
size_t base_dst = y * nx + x; // idx of the first channel
batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ];
batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
//
// for 6-channel video input, same layout but with 6 planar channels

for (int b = 0; b < batch_size; b++) {
const int cur_nx = imgs.entries[b]->nx;
const int cur_ny = imgs.entries[b]->ny;
const int cur_n = cur_nx * cur_ny;

float * batch_entry = inp_raw.data() + b * (n_channels * cur_n);
for (int y = 0; y < cur_ny; y++) {
for (int x = 0; x < cur_nx; x++) {
size_t base_src = n_channels * (y * cur_nx + x);
size_t base_dst = y * cur_nx + x;
for (int c = 0; c < n_channels; c++) {
batch_entry[c * cur_n + base_dst] = imgs.entries[b]->buf[base_src + c];
}
Comment on lines +3623 to 3638

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

probably better to enter multiple images via the batch dimension, rather than using 6 channels

}
}
Expand Down
28 changes: 23 additions & 5 deletions tools/mtmd/models/qwen3vl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,34 @@ ggml_cgraph * clip_graph_qwen3vl::build() {

int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
// detect video: 6-channel input means interleaved frame pairs (even_rgb + odd_rgb)
// for images (3ch), both Conv2Ds receive the same input (original behavior)
// for video (6ch), Conv2D_0 gets even frames (ch 0-2), Conv2D_1 gets odd frames (ch 3-5)
const bool is_video = (img.buf.size() == (size_t)img.nx * img.ny * 6);
const int n_channels = is_video ? 6 : 3;

ggml_tensor * inp_raw = build_inp_raw(n_channels);

ggml_tensor * inp;
if (is_video) {
const size_t nb1 = ggml_row_size(inp_raw->type, img.nx);
const size_t nb2 = nb1 * img.ny;
ggml_tensor * inp_even = ggml_view_3d(ctx0, inp_raw, img.nx, img.ny, 3, nb1, nb2, 0);
ggml_tensor * inp_odd = ggml_view_3d(ctx0, inp_raw, img.nx, img.ny, 3, nb1, nb2, nb2 * 3);
inp = ggml_add(ctx0,
ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_even, patch_size, patch_size, 0, 0, 1, 1),
ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_odd, patch_size, patch_size, 0, 0, 1, 1));
} else {
inp = ggml_add(ctx0,
ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1),
ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1));
}
Comment on lines +25 to +37

@ngxson ngxson Apr 13, 2026

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I read this correctly, the number of output tokens stays unchanged whether we input a single image or 2 frames.


GGML_ASSERT(img.nx % (patch_size * 2) == 0);
GGML_ASSERT(img.ny % (patch_size * 2) == 0);

// second conv dimension
// spatial merge
{
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);

inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_cont_4d(
Expand Down
29 changes: 28 additions & 1 deletion tools/mtmd/mtmd-helper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,28 @@ struct decode_embd_batch {
}
}

// M-RoPE for video: 3D positions [temporal, height, width]
// tokens are laid out frame-major: token (t, y, x) sits at flat index (t * ny + y) * nx + x
// the four position planes are stored back to back, each batch.n_tokens entries long
void set_position_mrope_3d(llama_pos pos_0, int nx, int ny, int nt, llama_seq_id seq_id) {
    GGML_ASSERT(n_pos_per_embd == 4);
    seq_id_0[0] = seq_id;

    const int n_tok = batch.n_tokens;

    // idx walks the flat token index in the same order as the nested loops
    int idx = 0;
    for (int t = 0; t < nt; t++) {
        for (int y = 0; y < ny; y++) {
            for (int x = 0; x < nx; x++, idx++) {
                pos[idx            ] = pos_0 + t; // temporal plane
                pos[idx + n_tok    ] = pos_0 + y; // height plane
                pos[idx + n_tok * 2] = pos_0 + x; // width plane
                pos[idx + n_tok * 3] = 0;         // fourth plane is always written as zero
            }
        }
    }

    // every token belongs to the single sequence seq_id and requests no logits
    for (int k = 0; k < n_tok; k++) {
        batch.n_seq_id[k] = 1;
        batch.seq_id  [k] = seq_id_0.data();
        batch.logits  [k] = false;
    }
}
Comment on lines +178 to +197

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

note: to be replaced by #21851


// M-RoPE for audio
void set_position_mrope_1d(llama_pos pos_0, llama_seq_id seq_id) {
GGML_ASSERT(n_pos_per_embd == 4);
Expand Down Expand Up @@ -260,7 +282,12 @@ int32_t mtmd_helper_decode_image_chunk(
}
const int nx = mtmd_image_tokens_get_nx(image_tokens);
const int ny = mtmd_image_tokens_get_ny(image_tokens);
batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id);
const int nt = mtmd_image_tokens_get_nt(image_tokens);
if (nt > 1) {
batch_embd.set_position_mrope_3d(n_past, nx, ny, nt, seq_id);
} else {
batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id);
}
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
batch_embd.set_position_mrope_1d(n_past, seq_id);
} else {
Expand Down
145 changes: 139 additions & 6 deletions tools/mtmd/mtmd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,11 @@

// represents raw image data, layout is RGBRGBRGB...
// length of data must be nx * ny * 3
// for video: data is n_frames sequential RGB frames, each nx * ny * 3 bytes
struct mtmd_bitmap {
uint32_t nx;
uint32_t ny;
uint32_t n_frames = 0; // 0 for single images, >= 2 (even) for video
std::vector<unsigned char> data;
std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
bool is_audio = false; // true if the bitmap is audio
Expand All @@ -34,15 +36,17 @@ struct mtmd_bitmap {
struct mtmd_image_tokens {
uint32_t nx; // number of tokens in x direction
uint32_t ny; // number of tokens in y direction
uint32_t nt = 1; // number of temporal positions (1 for images, > 1 for video)
bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
uint32_t n_tokens() const { return nx * ny; }
uint32_t n_tokens() const { return nt * nx * ny; }
Comment on lines +39 to +41

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems to be wrong for Qwen: it merges 2 frames into one output, so the output token count should stay nx * ny.

clip_image_f32_batch batch_f32; // preprocessed image patches
std::string id; // optional user-defined ID, useful for KV cache tracking

mtmd_image_tokens clone() {
return mtmd_image_tokens{
nx,
ny,
nt,
use_mrope_pos,
batch_f32.clone(),
id
Expand Down Expand Up @@ -549,6 +553,10 @@ struct mtmd_tokenizer {
}

int32_t add_media(const mtmd_bitmap * bitmap) {
if (bitmap->n_frames >= 2) {
return add_video(bitmap);
}

if (!bitmap->is_audio) {
// handle image

Expand Down Expand Up @@ -739,6 +747,102 @@ struct mtmd_tokenizer {
return 0;
}

// preprocess video frames and create an image chunk with temporal dimension
// frames are paired (even+odd), each pair becomes one 6-channel image
// each pair is encoded independently through the ViT (per-frame attention)
int32_t add_video(const mtmd_bitmap * bitmap) {
if (!ctx->ctx_v) {
LOG_ERR("%s: error: model does not support vision input\n", __func__);
return 2;
}

const uint32_t n_frames = bitmap->n_frames;
const uint32_t n_pairs = n_frames / 2;
const size_t frame_bytes = (size_t)bitmap->nx * bitmap->ny * 3;

if (!ctx->img_beg.empty()) {
add_text(ctx->img_beg, true);
}

// preprocess each frame individually
clip_image_f32_batch all_frames;
for (uint32_t f = 0; f < n_frames; f++) {
clip_image_u8_ptr img_u8(clip_image_u8_init());
img_u8->nx = bitmap->nx;
img_u8->ny = bitmap->ny;
img_u8->buf.resize(frame_bytes);
std::memcpy(img_u8->buf.data(), bitmap->data.data() + f * frame_bytes, frame_bytes);

clip_image_f32_batch frame_batch;
bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &frame_batch);
if (!ok) {
LOG_ERR("Unable to preprocess video frame %u\n", f);
return 2;
}
GGML_ASSERT(frame_batch.entries.size() == 1);
all_frames.entries.push_back(std::move(frame_batch.entries[0]));
}

const int frame_nx = all_frames.entries[0]->nx;
const int frame_ny = all_frames.entries[0]->ny;
const int n_pixels = frame_nx * frame_ny;

// interleave frame pairs into 6-channel images (even_rgb + odd_rgb)
// each pair is a separate batch entry, encoded independently
clip_image_f32_batch pair_batch;
for (uint32_t p = 0; p < n_pairs; p++) {
const auto & even = all_frames.entries[p * 2];
const auto & odd = all_frames.entries[p * 2 + 1];
GGML_ASSERT(even->nx == frame_nx && even->ny == frame_ny);
GGML_ASSERT(odd->nx == frame_nx && odd->ny == frame_ny);

clip_image_f32_ptr pair(clip_image_f32_init());
pair->nx = frame_nx;
pair->ny = frame_ny;
pair->buf.resize((size_t)n_pixels * 6);

for (int i = 0; i < n_pixels; i++) {
const int dst = i * 6;
const int src = i * 3;
pair->buf[dst + 0] = even->buf[src + 0];
pair->buf[dst + 1] = even->buf[src + 1];
pair->buf[dst + 2] = even->buf[src + 2];
pair->buf[dst + 3] = odd->buf[src + 0];
pair->buf[dst + 4] = odd->buf[src + 1];
pair->buf[dst + 5] = odd->buf[src + 2];
}
pair_batch.entries.push_back(std::move(pair));
}

const uint32_t tokens_x = clip_n_output_tokens_x(ctx->ctx_v, pair_batch.entries[0].get());
const uint32_t tokens_y = clip_n_output_tokens_y(ctx->ctx_v, pair_batch.entries[0].get());

mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
image_tokens->nx = tokens_x;
image_tokens->ny = tokens_y;
image_tokens->nt = n_pairs;
image_tokens->use_mrope_pos = true;
image_tokens->batch_f32 = std::move(pair_batch);
image_tokens->id = bitmap->id;

LOG_DBG("video: nt=%u, nx=%u, ny=%u, n_tokens=%u\n",
image_tokens->nt, image_tokens->nx, image_tokens->ny, image_tokens->n_tokens());

mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_IMAGE,
{}, // text tokens
std::move(image_tokens),
nullptr, // audio tokens
};
cur.entries.emplace_back(std::move(chunk));

if (!ctx->img_end.empty()) {
add_text(ctx->img_end, true);
}

return 0;
}

std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
std::vector<mtmd_input_chunk> chunks;

Expand Down Expand Up @@ -851,10 +955,13 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
bool ok = false;

if (clip_is_llava(ctx_clip)
if (image_tokens->nt > 1
|| clip_is_llava(ctx_clip)
|| clip_is_minicpmv(ctx_clip)
|| clip_is_glm(ctx_clip)) {
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
// encode each batch entry independently
// video: each entry is one frame pair, encoded with per-frame attention
// llava/minicpmv/glm: does not support batched encoding
const auto & entries = image_tokens->batch_f32.entries;
for (size_t i = 0; i < entries.size(); i++) {
int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
Expand Down Expand Up @@ -934,6 +1041,21 @@ mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
return bitmap;
}

// Create a video bitmap by copying n_frames tightly packed RGB frames
// (nx * ny * 3 bytes each) out of `data`.
// n_frames must be even and at least 2 (asserted).
// The returned bitmap is allocated with `new`.
mtmd_bitmap * mtmd_bitmap_init_from_video(uint32_t nx,
                                          uint32_t ny,
                                          uint32_t n_frames,
                                          const unsigned char * data) {
    GGML_ASSERT(n_frames >= 2 && n_frames % 2 == 0);

    const size_t bytes_per_frame = (size_t)nx * ny * 3;
    const size_t total_bytes     = bytes_per_frame * n_frames;

    mtmd_bitmap * video = new mtmd_bitmap;
    video->nx       = nx;
    video->ny       = ny;
    video->n_frames = n_frames;
    // copy all frames in one go
    video->data.assign(data, data + total_bytes);
    return video;
}

mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
const float * data) {
mtmd_bitmap * bitmap = new mtmd_bitmap;
Expand Down Expand Up @@ -966,6 +1088,14 @@ bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
return bitmap->is_audio;
}

// Returns true when the bitmap carries at least two frames, i.e. it is treated as video
// (n_frames is 0 for single images).
bool mtmd_bitmap_is_video(const mtmd_bitmap * bitmap) {
return bitmap->n_frames >= 2;
}

// Returns the bitmap's stored frame count (0 for single images, >= 2 for video).
uint32_t mtmd_bitmap_get_n_frames(const mtmd_bitmap * bitmap) {
return bitmap->n_frames;
}

// Returns the bitmap's optional user-defined id as a C string.
// The pointer refers to the bitmap's own string storage, not a copy.
const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
return bitmap->id.c_str();
}
Expand Down Expand Up @@ -1102,15 +1232,18 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
return image_tokens->ny;
}

// Returns the number of temporal positions of the image tokens
// (1 for images, > 1 for video).
size_t mtmd_image_tokens_get_nt(const mtmd_image_tokens * image_tokens) {
return image_tokens->nt;
}

// Returns the optional user-defined id of the image tokens as a C string.
// The pointer refers to the object's own string storage, not a copy.
const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
return image_tokens->id.c_str();
}

// Returns the position count for the given image tokens:
//   - with M-RoPE positions: max(nt, nx, ny)
//   - otherwise: the total token count
llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
    if (image_tokens->use_mrope_pos) {
        // for M-RoPE, n_pos = max(t, h, w)
        // nt is 1 for still images, so this reduces to max(nx, ny) in that case
        return (llama_pos)std::max({image_tokens->nt, image_tokens->nx, image_tokens->ny});
    }
    return image_tokens->n_tokens();
}
Expand Down
Loading
Loading