-
Notifications
You must be signed in to change notification settings - Fork 19.8k
feat: add video support to mtmd #20224
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,16 +13,34 @@ ggml_cgraph * clip_graph_qwen3vl::build() { | |
|
|
||
| int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; | ||
|
|
||
| ggml_tensor * inp_raw = build_inp_raw(); | ||
| ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); | ||
| // detect video: 6-channel input means interleaved frame pairs (even_rgb + odd_rgb) | ||
| // for images (3ch), both Conv2Ds receive the same input (original behavior) | ||
| // for video (6ch), Conv2D_0 gets even frames (ch 0-2), Conv2D_1 gets odd frames (ch 3-5) | ||
| const bool is_video = (img.buf.size() == (size_t)img.nx * img.ny * 6); | ||
| const int n_channels = is_video ? 6 : 3; | ||
|
|
||
| ggml_tensor * inp_raw = build_inp_raw(n_channels); | ||
|
|
||
| ggml_tensor * inp; | ||
| if (is_video) { | ||
| const size_t nb1 = ggml_row_size(inp_raw->type, img.nx); | ||
| const size_t nb2 = nb1 * img.ny; | ||
| ggml_tensor * inp_even = ggml_view_3d(ctx0, inp_raw, img.nx, img.ny, 3, nb1, nb2, 0); | ||
| ggml_tensor * inp_odd = ggml_view_3d(ctx0, inp_raw, img.nx, img.ny, 3, nb1, nb2, nb2 * 3); | ||
| inp = ggml_add(ctx0, | ||
| ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_even, patch_size, patch_size, 0, 0, 1, 1), | ||
| ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_odd, patch_size, patch_size, 0, 0, 1, 1)); | ||
| } else { | ||
| inp = ggml_add(ctx0, | ||
| ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1), | ||
| ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1)); | ||
| } | ||
|
Comment on lines
+25
to
+37
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If I read this correctly, that means the number of output tokens stays unchanged whether we input a single image or 2 frames. |
||
|
|
||
| GGML_ASSERT(img.nx % (patch_size * 2) == 0); | ||
| GGML_ASSERT(img.ny % (patch_size * 2) == 0); | ||
|
|
||
| // second conv dimension | ||
| // spatial merge | ||
| { | ||
| auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); | ||
| inp = ggml_add(ctx0, inp, inp_1); | ||
|
|
||
| inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] | ||
| inp = ggml_cont_4d( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -174,6 +174,28 @@ struct decode_embd_batch { | |
| } | ||
| } | ||
|
|
||
| // M-RoPE for video: 3D positions [temporal, height, width] | ||
| void set_position_mrope_3d(llama_pos pos_0, int nx, int ny, int nt, llama_seq_id seq_id) { | ||
| // Writes four position planes into `pos`, spaced batch.n_tokens apart, for an nt x ny x nx token grid: | ||
| // plane 0 = pos_0 + t, plane 1 = pos_0 + y, plane 2 = pos_0 + x, plane 3 = 0. | ||
| // Afterwards every token of the batch is bound to `seq_id` and has its logits flag cleared. | ||
| GGML_ASSERT(n_pos_per_embd == 4); | ||
| seq_id_0[0] = seq_id; | ||
| const auto plane = batch.n_tokens; // distance between two consecutive position planes | ||
| int idx = 0; // flat token index: t varies slowest, x fastest | ||
| for (int t = 0; t < nt; ++t) { | ||
| for (int y = 0; y < ny; ++y) { | ||
| for (int x = 0; x < nx; ++x, ++idx) { | ||
| pos[idx ] = pos_0 + t; | ||
| pos[idx + plane ] = pos_0 + y; | ||
| pos[idx + plane * 2] = pos_0 + x; | ||
| pos[idx + plane * 3] = 0; | ||
| } | ||
| } | ||
| } | ||
| // per-token bookkeeping covers the whole batch, independent of the grid size | ||
| for (int tok = 0; tok < plane; ++tok) { | ||
| batch.n_seq_id[tok] = 1; | ||
| batch.seq_id [tok] = seq_id_0.data(); | ||
| batch.logits [tok] = false; | ||
| } | ||
| } | ||
|
Comment on lines
+178
to
+197
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Note: to be replaced by #21851. |
||
|
|
||
| // M-RoPE for audio | ||
| void set_position_mrope_1d(llama_pos pos_0, llama_seq_id seq_id) { | ||
| GGML_ASSERT(n_pos_per_embd == 4); | ||
|
|
@@ -260,7 +282,12 @@ int32_t mtmd_helper_decode_image_chunk( | |
| } | ||
| const int nx = mtmd_image_tokens_get_nx(image_tokens); | ||
| const int ny = mtmd_image_tokens_get_ny(image_tokens); | ||
| batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id); | ||
| const int nt = mtmd_image_tokens_get_nt(image_tokens); | ||
| if (nt > 1) { | ||
| batch_embd.set_position_mrope_3d(n_past, nx, ny, nt, seq_id); | ||
| } else { | ||
| batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id); | ||
| } | ||
| } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) { | ||
| batch_embd.set_position_mrope_1d(n_past, seq_id); | ||
| } else { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,9 +23,11 @@ | |
|
|
||
| // represents raw image data, layout is RGBRGBRGB... | ||
| // length of data must be nx * ny * 3 | ||
| // for video: data is n_frames sequential RGB frames, each nx * ny * 3 bytes | ||
| struct mtmd_bitmap { | ||
| uint32_t nx; | ||
| uint32_t ny; | ||
| uint32_t n_frames = 0; // 0 for single images, >= 2 (even) for video | ||
| std::vector<unsigned char> data; | ||
| std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking | ||
| bool is_audio = false; // true if the bitmap is audio | ||
|
|
@@ -34,15 +36,17 @@ struct mtmd_bitmap { | |
| struct mtmd_image_tokens { | ||
| uint32_t nx; // number of tokens in x direction | ||
| uint32_t ny; // number of tokens in y direction | ||
| uint32_t nt = 1; // number of temporal positions (1 for images, > 1 for video) | ||
| bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position) | ||
| uint32_t n_tokens() const { return nx * ny; } | ||
| uint32_t n_tokens() const { return nt * nx * ny; } | ||
|
Comment on lines
+39
to
+41
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This seems to be wrong for Qwen: it merges 2 frames into one output, so the output token count should stay unchanged.
||
| clip_image_f32_batch batch_f32; // preprocessed image patches | ||
| std::string id; // optional user-defined ID, useful for KV cache tracking | ||
|
|
||
| mtmd_image_tokens clone() { | ||
| return mtmd_image_tokens{ | ||
| nx, | ||
| ny, | ||
| nt, | ||
| use_mrope_pos, | ||
| batch_f32.clone(), | ||
| id | ||
|
|
@@ -549,6 +553,10 @@ struct mtmd_tokenizer { | |
| } | ||
|
|
||
| int32_t add_media(const mtmd_bitmap * bitmap) { | ||
| if (bitmap->n_frames >= 2) { | ||
| return add_video(bitmap); | ||
| } | ||
|
|
||
| if (!bitmap->is_audio) { | ||
| // handle image | ||
|
|
||
|
|
@@ -739,6 +747,102 @@ struct mtmd_tokenizer { | |
| return 0; | ||
| } | ||
|
|
||
| // preprocess video frames and create an image chunk with temporal dimension | ||
| // frames are paired (even+odd), each pair becomes one 6-channel image | ||
| // each pair is encoded independently through the ViT (per-frame attention) | ||
| int32_t add_video(const mtmd_bitmap * bitmap) { | ||
| // Builds one image chunk from a multi-frame bitmap and appends it to cur.entries, | ||
| // wrapped by ctx->img_beg / ctx->img_end text when those are non-empty. | ||
| // Each frame is preprocessed on its own; frames (2p, 2p+1) are then merged pixel by pixel | ||
| // into a buffer holding 6 values per pixel (even RGB followed by odd RGB), one batch entry per pair. | ||
| // A trailing unpaired frame (odd n_frames) is preprocessed but not used. | ||
| // Returns 0 on success and 2 when vision is unavailable, the bitmap is malformed, | ||
| // or a frame fails preprocessing. | ||
| if (!ctx->ctx_v) { | ||
| LOG_ERR("%s: error: model does not support vision input\n", __func__); | ||
| return 2; | ||
| } | ||
|
|
||
| const uint32_t n_frames = bitmap->n_frames; | ||
| const uint32_t n_pairs = n_frames / 2; | ||
| const size_t frame_bytes = (size_t)bitmap->nx * bitmap->ny * 3; | ||
|
|
||
| // validate before anything is appended to the chunk list, so a rejected bitmap leaves no partial output | ||
| // without at least one pair, entries[0] below would be read from an empty batch | ||
| if (n_pairs == 0) { | ||
| LOG_ERR("%s: error: video needs at least 2 frames, got %u\n", __func__, n_frames); | ||
| return 2; | ||
| } | ||
| // the per-frame memcpy below reads n_frames * frame_bytes bytes from bitmap->data | ||
| const size_t required_bytes = (size_t)n_frames * frame_bytes; | ||
| if (bitmap->data.size() < required_bytes) { | ||
| LOG_ERR("%s: error: video data too short (%zu bytes, expected %zu)\n", __func__, bitmap->data.size(), required_bytes); | ||
| return 2; | ||
| } | ||
|
|
||
| if (!ctx->img_beg.empty()) { | ||
| add_text(ctx->img_beg, true); | ||
| } | ||
|
|
||
| // preprocess each frame individually | ||
| clip_image_f32_batch all_frames; | ||
| for (uint32_t f = 0; f < n_frames; f++) { | ||
| clip_image_u8_ptr img_u8(clip_image_u8_init()); | ||
| img_u8->nx = bitmap->nx; | ||
| img_u8->ny = bitmap->ny; | ||
| img_u8->buf.resize(frame_bytes); | ||
| std::memcpy(img_u8->buf.data(), bitmap->data.data() + f * frame_bytes, frame_bytes); | ||
|
|
||
| clip_image_f32_batch frame_batch; | ||
| bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &frame_batch); | ||
| if (!ok) { | ||
| LOG_ERR("Unable to preprocess video frame %u\n", f); | ||
| return 2; | ||
| } | ||
| // exactly one preprocessed entry per frame is expected here | ||
| GGML_ASSERT(frame_batch.entries.size() == 1); | ||
| all_frames.entries.push_back(std::move(frame_batch.entries[0])); | ||
| } | ||
|
|
||
| // every frame must share the geometry of the first one | ||
| const int frame_nx = all_frames.entries[0]->nx; | ||
| const int frame_ny = all_frames.entries[0]->ny; | ||
| // size_t keeps the i * 6 indexing below free of int overflow | ||
| const size_t n_pixels = (size_t)frame_nx * frame_ny; | ||
|
|
||
| // interleave frame pairs into 6-channel images (even_rgb + odd_rgb) | ||
| // each pair is a separate batch entry, encoded independently | ||
| clip_image_f32_batch pair_batch; | ||
| for (uint32_t p = 0; p < n_pairs; p++) { | ||
| const auto & even = all_frames.entries[p * 2]; | ||
| const auto & odd = all_frames.entries[p * 2 + 1]; | ||
| GGML_ASSERT(even->nx == frame_nx && even->ny == frame_ny); | ||
| GGML_ASSERT(odd->nx == frame_nx && odd->ny == frame_ny); | ||
| // the copy loop reads 3 values per pixel from each source buffer | ||
| GGML_ASSERT(even->buf.size() >= n_pixels * 3); | ||
| GGML_ASSERT(odd->buf.size() >= n_pixels * 3); | ||
|
|
||
| clip_image_f32_ptr pair(clip_image_f32_init()); | ||
| pair->nx = frame_nx; | ||
| pair->ny = frame_ny; | ||
| pair->buf.resize(n_pixels * 6); | ||
|
|
||
| for (size_t i = 0; i < n_pixels; i++) { | ||
| const size_t dst = i * 6; | ||
| const size_t src = i * 3; | ||
| pair->buf[dst + 0] = even->buf[src + 0]; | ||
| pair->buf[dst + 1] = even->buf[src + 1]; | ||
| pair->buf[dst + 2] = even->buf[src + 2]; | ||
| pair->buf[dst + 3] = odd->buf[src + 0]; | ||
| pair->buf[dst + 4] = odd->buf[src + 1]; | ||
| pair->buf[dst + 5] = odd->buf[src + 2]; | ||
| } | ||
| pair_batch.entries.push_back(std::move(pair)); | ||
| } | ||
|
|
||
| // token grid size is taken from the first pair; all pairs share its geometry (asserted above) | ||
| const uint32_t tokens_x = clip_n_output_tokens_x(ctx->ctx_v, pair_batch.entries[0].get()); | ||
| const uint32_t tokens_y = clip_n_output_tokens_y(ctx->ctx_v, pair_batch.entries[0].get()); | ||
|
|
||
| mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); | ||
| image_tokens->nx = tokens_x; | ||
| image_tokens->ny = tokens_y; | ||
| image_tokens->nt = n_pairs; | ||
| image_tokens->use_mrope_pos = true; | ||
| image_tokens->batch_f32 = std::move(pair_batch); | ||
| image_tokens->id = bitmap->id; | ||
|
|
||
| LOG_DBG("video: nt=%u, nx=%u, ny=%u, n_tokens=%u\n", | ||
| image_tokens->nt, image_tokens->nx, image_tokens->ny, image_tokens->n_tokens()); | ||
|
|
||
| mtmd_input_chunk chunk{ | ||
| MTMD_INPUT_CHUNK_TYPE_IMAGE, | ||
| {}, // text tokens | ||
| std::move(image_tokens), | ||
| nullptr, // audio tokens | ||
| }; | ||
| cur.entries.emplace_back(std::move(chunk)); | ||
|
|
||
| if (!ctx->img_end.empty()) { | ||
| add_text(ctx->img_end, true); | ||
| } | ||
|
|
||
| return 0; | ||
| } | ||
|
|
||
| std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) { | ||
| std::vector<mtmd_input_chunk> chunks; | ||
|
|
||
|
|
@@ -851,10 +955,13 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) | |
| ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd); | ||
| bool ok = false; | ||
|
|
||
| if (clip_is_llava(ctx_clip) | ||
| if (image_tokens->nt > 1 | ||
| || clip_is_llava(ctx_clip) | ||
| || clip_is_minicpmv(ctx_clip) | ||
| || clip_is_glm(ctx_clip)) { | ||
| // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode() | ||
| // encode each batch entry independently | ||
| // video: each entry is one frame pair, encoded with per-frame attention | ||
| // llava/minicpmv/glm: does not support batched encoding | ||
| const auto & entries = image_tokens->batch_f32.entries; | ||
| for (size_t i = 0; i < entries.size(); i++) { | ||
| int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get()); | ||
|
|
@@ -934,6 +1041,21 @@ mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, | |
| return bitmap; | ||
| } | ||
|
|
||
| mtmd_bitmap * mtmd_bitmap_init_from_video(uint32_t nx, | ||
| uint32_t ny, | ||
| uint32_t n_frames, | ||
| const unsigned char * data) { | ||
| // Creates a heap-allocated video bitmap holding a private copy of the frame data. | ||
| // `data` must provide nx * ny * 3 * n_frames bytes; n_frames must be even and at least 2. | ||
| GGML_ASSERT(n_frames >= 2 && n_frames % 2 == 0); | ||
| const size_t total_bytes = (size_t)nx * ny * 3 * n_frames; | ||
| mtmd_bitmap * video = new mtmd_bitmap; | ||
| video->nx = nx; | ||
| video->ny = ny; | ||
| video->n_frames = n_frames; | ||
| // copy all frames in one go | ||
| video->data.assign(data, data + total_bytes); | ||
| return video; | ||
| } | ||
|
|
||
| mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, | ||
| const float * data) { | ||
| mtmd_bitmap * bitmap = new mtmd_bitmap; | ||
|
|
@@ -966,6 +1088,14 @@ bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) { | |
| return bitmap->is_audio; | ||
| } | ||
|
|
||
| bool mtmd_bitmap_is_video(const mtmd_bitmap * bitmap) { | ||
| // a bitmap is treated as video once it carries more than one frame | ||
| const uint32_t frame_count = bitmap->n_frames; | ||
| return frame_count > 1; | ||
| } | ||
|
|
||
| uint32_t mtmd_bitmap_get_n_frames(const mtmd_bitmap * bitmap) { | ||
| // returns the frame count stored on the bitmap as-is | ||
| const mtmd_bitmap & bmp = *bitmap; | ||
| return bmp.n_frames; | ||
| } | ||
|
|
||
| const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) { | ||
| // exposes the bitmap's id string as a C string | ||
| const std::string & ident = bitmap->id; | ||
| return ident.c_str(); | ||
| } | ||
|
|
@@ -1102,15 +1232,18 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) { | |
| return image_tokens->ny; | ||
| } | ||
|
|
||
| size_t mtmd_image_tokens_get_nt(const mtmd_image_tokens * image_tokens) { | ||
| // returns the nt field of the token grid, widened to size_t | ||
| return static_cast<size_t>(image_tokens->nt); | ||
| } | ||
|
|
||
| const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) { | ||
| // exposes the id string stored on the image tokens as a C string | ||
| const std::string & ident = image_tokens->id; | ||
| return ident.c_str(); | ||
| } | ||
|
|
||
| llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) { | ||
| if (image_tokens->use_mrope_pos) { | ||
| // for M-RoPE, temporal dimension = max(t,h,w) | ||
| // t is omitted as we don't support video input | ||
| return std::max(image_tokens->nx, image_tokens->ny); | ||
| // for M-RoPE, n_pos = max(t, h, w) | ||
| return (llama_pos)std::max({image_tokens->nt, image_tokens->nx, image_tokens->ny}); | ||
| } | ||
| return image_tokens->n_tokens(); | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is probably better to pass multiple images via the batch dimension rather than using 6 channels.