ggml-org · andrewmd5 · Mar 6, 2026 · ngxson · Apr 13, 2026 · ngxson
@@ -3596,9 +3596,17 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
 
     // set input pixel values
     if (!imgs.is_audio) {
+        // detect number of channels from the buffer size
+        const int nx = imgs.entries[0]->nx;
+        const int ny = imgs.entries[0]->ny;
+        const int n  = nx * ny;
+        const size_t buf_size = imgs.entries[0]->buf.size();
+        const int n_channels = (int)(buf_size / n);
+        GGML_ASSERT(n_channels == 3 || n_channels == 6);
+
         size_t nelem = 0;
         for (const auto & img : imgs.entries) {
-            nelem += img->nx * img->ny * 3;
+            nelem += img->nx * img->ny * n_channels;
         }
         std::vector<float> inp_raw(nelem);
 
@@ -3612,21 +3620,21 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         // │     H │  channel = B
         // └─────┘ │
         //   ──────┘ x B
-
-        for (size_t i = 0; i < imgs.entries.size(); i++) {
-            const int nx = imgs.entries[i]->nx;
-            const int ny = imgs.entries[i]->ny;
-            const int n = nx * ny;
-
-            for (int b = 0; b < batch_size; b++) {
-                float * batch_entry = inp_raw.data() + b * (3*n);
-                for (int y = 0; y < ny; y++) {
-                    for (int x = 0; x < nx; x++) {
-                        size_t base_src = 3*(y * nx + x); // idx of the first channel
-                        size_t base_dst =    y * nx + x;  // idx of the first channel
-                        batch_entry[      base_dst] = imgs.entries[b]->buf[base_src    ];
-                        batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
-                        batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
+        //
+        // for 6-channel video input, same layout but with 6 planar channels
+
+        for (int b = 0; b < batch_size; b++) {
+            const int cur_nx = imgs.entries[b]->nx;
+            const int cur_ny = imgs.entries[b]->ny;
+            const int cur_n  = cur_nx * cur_ny;
+
+            float * batch_entry = inp_raw.data() + b * (n_channels * cur_n);
+            for (int y = 0; y < cur_ny; y++) {
+                for (int x = 0; x < cur_nx; x++) {
+                    size_t base_src = n_channels * (y * cur_nx + x);
+                    size_t base_dst =              y * cur_nx + x;
+                    for (int c = 0; c < n_channels; c++) {
+                        batch_entry[c * cur_n + base_dst] = imgs.entries[b]->buf[base_src + c];
                     }
                 }
             }

@@ -13,16 +13,34 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
 
     int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
 
-    ggml_tensor * inp_raw = build_inp_raw();
-    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+    // detect video: 6-channel input means interleaved frame pairs (even_rgb + odd_rgb)
+    // for images (3ch), both Conv2Ds receive the same input (original behavior)
+    // for video (6ch), Conv2D_0 gets even frames (ch 0-2), Conv2D_1 gets odd frames (ch 3-5)
+    const bool is_video = (img.buf.size() == (size_t)img.nx * img.ny * 6);
+    const int  n_channels = is_video ? 6 : 3;
+
+    ggml_tensor * inp_raw = build_inp_raw(n_channels);
+
+    ggml_tensor * inp;
+    if (is_video) {
+        const size_t nb1 = ggml_row_size(inp_raw->type, img.nx);
+        const size_t nb2 = nb1 * img.ny;
+        ggml_tensor * inp_even = ggml_view_3d(ctx0, inp_raw, img.nx, img.ny, 3, nb1, nb2, 0);
+        ggml_tensor * inp_odd  = ggml_view_3d(ctx0, inp_raw, img.nx, img.ny, 3, nb1, nb2, nb2 * 3);
+        inp = ggml_add(ctx0,
+            ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_even, patch_size, patch_size, 0, 0, 1, 1),
+            ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_odd,  patch_size, patch_size, 0, 0, 1, 1));
+    } else {
+        inp = ggml_add(ctx0,
+            ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1),
+            ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1));
+    }
 
     GGML_ASSERT(img.nx % (patch_size * 2) == 0);
     GGML_ASSERT(img.ny % (patch_size * 2) == 0);
 
-    // second conv dimension
+    // spatial merge
     {
-        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
-        inp = ggml_add(ctx0, inp, inp_1);
 
         inp = ggml_permute(ctx0, inp, 1, 2, 0, 3);  // [w, h, c, b] -> [c, w, h, b]
         inp = ggml_cont_4d(

@@ -174,6 +174,28 @@ struct decode_embd_batch {
         }
     }
 
+    // M-RoPE for video: 3D positions [temporal, height, width]
+    //
+    // fills `pos` for a grid of nt x ny x nx embeddings, laid out t-major, then y, then x
+    // `pos` is addressed as 4 planes that are batch.n_tokens entries apart:
+    //   plane 0 = pos_0 + t, plane 1 = pos_0 + y, plane 2 = pos_0 + x, plane 3 = 0
+    // afterwards every token of the batch is assigned to `seq_id` with logits disabled
+    void set_position_mrope_3d(llama_pos pos_0, int nx, int ny, int nt, llama_seq_id seq_id) {
+        GGML_ASSERT(n_pos_per_embd == 4);
+        // the planes are batch.n_tokens apart: a grid larger than the batch would make
+        // each plane overwrite the start of the next one
+        GGML_ASSERT((long long) nt * ny * nx <= (long long) batch.n_tokens);
+        seq_id_0[0] = seq_id;
+        int i = 0; // flat token index, equals t * ny * nx + y * nx + x
+        for (int t = 0; t < nt; t++) {
+            for (int y = 0; y < ny; y++) {
+                for (int x = 0; x < nx; x++, i++) {
+                    pos[i                     ] = pos_0 + t;
+                    pos[i + batch.n_tokens    ] = pos_0 + y;
+                    pos[i + batch.n_tokens * 2] = pos_0 + x;
+                    pos[i + batch.n_tokens * 3] = 0;
+                }
+            }
+        }
+        for (int i_tok = 0; i_tok < batch.n_tokens; i_tok++) {
+            batch.n_seq_id[i_tok] = 1;
+            batch.seq_id  [i_tok] = seq_id_0.data();
+            batch.logits  [i_tok] = false;
+        }
+    }
+
     // M-RoPE for audio
     void set_position_mrope_1d(llama_pos pos_0, llama_seq_id seq_id) {
         GGML_ASSERT(n_pos_per_embd == 4);
@@ -260,7 +282,12 @@ int32_t mtmd_helper_decode_image_chunk(
             }
             const int nx = mtmd_image_tokens_get_nx(image_tokens);
             const int ny = mtmd_image_tokens_get_ny(image_tokens);
-            batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id);
+            const int nt = mtmd_image_tokens_get_nt(image_tokens);
+            if (nt > 1) {
+                batch_embd.set_position_mrope_3d(n_past, nx, ny, nt, seq_id);
+            } else {
+                batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id);
+            }
         } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
             batch_embd.set_position_mrope_1d(n_past, seq_id);
         } else {

@@ -23,9 +23,11 @@
 
 // represents raw image data, layout is RGBRGBRGB...
 // length of data must be nx * ny * 3
+// for video: data holds n_frames consecutive RGB frames (each nx * ny * 3 bytes), so its length must be nx * ny * 3 * n_frames
 struct mtmd_bitmap {
     uint32_t nx;
     uint32_t ny;
+    uint32_t n_frames = 0; // 0 for single images, >= 2 (even) for video
     std::vector<unsigned char> data;
     std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
     bool is_audio = false; // true if the bitmap is audio
@@ -34,15 +36,17 @@ struct mtmd_bitmap {
 struct mtmd_image_tokens {
     uint32_t nx; // number of tokens in x direction
     uint32_t ny; // number of tokens in y direction
+    uint32_t nt = 1; // number of temporal positions (1 for images, > 1 for video)
     bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
-    uint32_t n_tokens() const { return nx * ny; }
+    uint32_t n_tokens() const { return nt * nx * ny; }
     clip_image_f32_batch batch_f32; // preprocessed image patches
     std::string id; // optional user-defined ID, useful for KV cache tracking
 
     mtmd_image_tokens clone() {
         return mtmd_image_tokens{
             nx,
             ny,
+            nt,
             use_mrope_pos,
             batch_f32.clone(),
             id
@@ -549,6 +553,10 @@ struct mtmd_tokenizer {
     }
 
     int32_t add_media(const mtmd_bitmap * bitmap) {
+        if (bitmap->n_frames >= 2) {
+            return add_video(bitmap);
+        }
+
         if (!bitmap->is_audio) {
             // handle image
 
@@ -739,6 +747,102 @@ struct mtmd_tokenizer {
         return 0;
     }
 
+    // preprocess video frames and create an image chunk with temporal dimension
+    // frames are paired (even+odd), each pair becomes one 6-channel image
+    // each pair is encoded independently through the ViT (per-frame attention)
+    //
+    // bitmap->data is read as bitmap->n_frames consecutive RGB frames of nx * ny * 3 bytes each
+    // on success: emits the optional img_beg text, one image chunk with nt = n_frames / 2,
+    // then the optional img_end text, and returns 0
+    // returns 2 before emitting any text or chunk if the model has no vision encoder,
+    // the frame count or buffer size is invalid, or a frame cannot be preprocessed into one image
+    int32_t add_video(const mtmd_bitmap * bitmap) {
+        if (!ctx->ctx_v) {
+            LOG_ERR("%s: error: model does not support vision input\n", __func__);
+            return 2;
+        }
+
+        const uint32_t n_frames    = bitmap->n_frames;
+        const uint32_t n_pairs     = n_frames / 2;
+        const size_t   frame_bytes = (size_t)bitmap->nx * bitmap->ny * 3;
+
+        // frames are consumed in pairs: an odd count would silently drop the last frame
+        if (n_frames < 2 || n_frames % 2 != 0) {
+            LOG_ERR("%s: error: video needs an even number of frames (>= 2), got %u\n", __func__, n_frames);
+            return 2;
+        }
+
+        // the per-frame memcpy below reads n_frames * frame_bytes bytes from bitmap->data
+        // (compared via division so the product cannot overflow)
+        if (frame_bytes == 0 || bitmap->data.size() / frame_bytes < n_frames) {
+            LOG_ERR("%s: error: video buffer has %zu bytes, too small for %u frames of %ux%u RGB\n",
+                    __func__, bitmap->data.size(), n_frames, bitmap->nx, bitmap->ny);
+            return 2;
+        }
+
+        // preprocess each frame individually
+        clip_image_f32_batch all_frames;
+        for (uint32_t f = 0; f < n_frames; f++) {
+            clip_image_u8_ptr img_u8(clip_image_u8_init());
+            img_u8->nx = bitmap->nx;
+            img_u8->ny = bitmap->ny;
+            img_u8->buf.resize(frame_bytes);
+            std::memcpy(img_u8->buf.data(), bitmap->data.data() + f * frame_bytes, frame_bytes);
+
+            clip_image_f32_batch frame_batch;
+            bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &frame_batch);
+            if (!ok) {
+                LOG_ERR("Unable to preprocess video frame %u\n", f);
+                return 2;
+            }
+            // pairing needs exactly one preprocessed image per frame; report instead of aborting
+            if (frame_batch.entries.size() != 1) {
+                LOG_ERR("%s: error: video frame %u was preprocessed into %zu images, expected 1\n",
+                        __func__, f, frame_batch.entries.size());
+                return 2;
+            }
+            all_frames.entries.push_back(std::move(frame_batch.entries[0]));
+        }
+
+        const int    frame_nx = all_frames.entries[0]->nx;
+        const int    frame_ny = all_frames.entries[0]->ny;
+        const size_t n_pixels = (size_t)frame_nx * frame_ny;
+
+        // interleave frame pairs into 6-channel images (even_rgb + odd_rgb)
+        // each pair is a separate batch entry, encoded independently
+        clip_image_f32_batch pair_batch;
+        for (uint32_t p = 0; p < n_pairs; p++) {
+            const auto & even = all_frames.entries[p * 2];
+            const auto & odd  = all_frames.entries[p * 2 + 1];
+            GGML_ASSERT(even->nx == frame_nx && even->ny == frame_ny);
+            GGML_ASSERT(odd->nx  == frame_nx && odd->ny  == frame_ny);
+            // 3 interleaved channels are read per pixel from each source frame
+            GGML_ASSERT(even->buf.size() >= n_pixels * 3);
+            GGML_ASSERT(odd->buf.size()  >= n_pixels * 3);
+
+            clip_image_f32_ptr pair(clip_image_f32_init());
+            pair->nx = frame_nx;
+            pair->ny = frame_ny;
+            pair->buf.resize(n_pixels * 6);
+
+            // per pixel: channels 0-2 come from the even frame, channels 3-5 from the odd frame
+            for (size_t i = 0; i < n_pixels; i++) {
+                const size_t dst = i * 6;
+                const size_t src = i * 3;
+                for (size_t c = 0; c < 3; c++) {
+                    pair->buf[dst + c    ] = even->buf[src + c];
+                    pair->buf[dst + c + 3] = odd->buf[src + c];
+                }
+            }
+            pair_batch.entries.push_back(std::move(pair));
+        }
+
+        const uint32_t tokens_x = clip_n_output_tokens_x(ctx->ctx_v, pair_batch.entries[0].get());
+        const uint32_t tokens_y = clip_n_output_tokens_y(ctx->ctx_v, pair_batch.entries[0].get());
+
+        mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
+        image_tokens->nx = tokens_x;
+        image_tokens->ny = tokens_y;
+        image_tokens->nt = n_pairs;
+        image_tokens->use_mrope_pos = true;
+        image_tokens->batch_f32 = std::move(pair_batch);
+        image_tokens->id = bitmap->id;
+
+        LOG_DBG("video: nt=%u, nx=%u, ny=%u, n_tokens=%u\n",
+                image_tokens->nt, image_tokens->nx, image_tokens->ny, image_tokens->n_tokens());
+
+        // nothing can fail past this point, so the begin text is only emitted
+        // when the image chunk and the end text are guaranteed to follow
+        if (!ctx->img_beg.empty()) {
+            add_text(ctx->img_beg, true);
+        }
+
+        mtmd_input_chunk chunk{
+            MTMD_INPUT_CHUNK_TYPE_IMAGE,
+            {}, // text tokens
+            std::move(image_tokens),
+            nullptr, // audio tokens
+        };
+        cur.entries.emplace_back(std::move(chunk));
+
+        if (!ctx->img_end.empty()) {
+            add_text(ctx->img_end, true);
+        }
+
+        return 0;
+    }
+
     std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
         std::vector<mtmd_input_chunk> chunks;
 
@@ -851,10 +955,13 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
     ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
     bool ok = false;
 
-    if (clip_is_llava(ctx_clip)
+    if (image_tokens->nt > 1
+        || clip_is_llava(ctx_clip)
         || clip_is_minicpmv(ctx_clip)
         || clip_is_glm(ctx_clip)) {
-        // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
+        // encode each batch entry independently
+        // video: each entry is one frame pair, encoded with per-frame attention
+        // llava/minicpmv/glm: does not support batched encoding
         const auto & entries = image_tokens->batch_f32.entries;
         for (size_t i = 0; i < entries.size(); i++) {
             int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
@@ -934,6 +1041,21 @@ mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
     return bitmap;
 }
 
+// create a video bitmap by copying n_frames consecutive RGB frames (nx * ny * 3 bytes each) from data
+// n_frames must be even and >= 2 (asserted); data must point to at least nx * ny * 3 * n_frames bytes
+// the bitmap is allocated with new and returned as a raw pointer that the caller must release
+mtmd_bitmap * mtmd_bitmap_init_from_video(uint32_t nx,
+                                          uint32_t ny,
+                                          uint32_t n_frames,
+                                          const unsigned char * data) {
+    GGML_ASSERT(n_frames >= 2 && n_frames % 2 == 0);
+    const size_t data_size = (size_t)nx * ny * 3 * n_frames;
+    // a null source is only acceptable when there is nothing to copy
+    GGML_ASSERT(data != nullptr || data_size == 0);
+    mtmd_bitmap * bitmap = new mtmd_bitmap;
+    bitmap->nx = nx;
+    bitmap->ny = ny;
+    bitmap->n_frames = n_frames;
+    // assign copies straight from the source instead of zero-filling and then overwriting
+    bitmap->data.assign(data, data + data_size);
+    return bitmap;
+}
+
 mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
                                           const float * data) {
     mtmd_bitmap * bitmap = new mtmd_bitmap;
@@ -966,6 +1088,14 @@ bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
     return bitmap->is_audio;
 }
 
+// true when the bitmap carries at least two frames
+bool mtmd_bitmap_is_video(const mtmd_bitmap * bitmap) {
+    const uint32_t frame_count = bitmap->n_frames;
+    return frame_count >= 2;
+}
+
+// frame count stored in the bitmap (the field defaults to 0 for single images)
+uint32_t mtmd_bitmap_get_n_frames(const mtmd_bitmap * bitmap) {
+    const mtmd_bitmap & bmp = *bitmap;
+    return bmp.n_frames;
+}
+
 // returns the bitmap's id as a C string; the pointer comes from id.c_str(),
 // so it refers to the std::string stored inside the bitmap, not to a copy
 const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
     return bitmap->id.c_str();
 }
@@ -1102,15 +1232,18 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
     return image_tokens->ny;
 }
 
+// number of temporal positions of the image tokens (1 for images, > 1 for video)
+size_t mtmd_image_tokens_get_nt(const mtmd_image_tokens * image_tokens) {
+    const size_t n_temporal = image_tokens->nt;
+    return n_temporal;
+}
+
 // returns the image tokens' id as a C string; the pointer comes from id.c_str(),
 // so it refers to the std::string stored inside image_tokens, not to a copy
 const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
     return image_tokens->id.c_str();
 }
 
 // returns the position count reported for these image tokens:
 //   - use_mrope_pos set: the largest of nt, nx and ny (after this change; previously only nx and ny)
 //   - otherwise:         n_tokens()
 llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
     if (image_tokens->use_mrope_pos) {
-        // for M-RoPE, temporal dimension = max(t,h,w)
-        // t is omitted as we don't support video input
-        return std::max(image_tokens->nx, image_tokens->ny);
+        // for M-RoPE, n_pos = max(t, h, w)
+        return (llama_pos)std::max({image_tokens->nt, image_tokens->nx, image_tokens->ny});
     }
     return image_tokens->n_tokens();
 }