ggml-org · sfallah · Jun 8, 2026
@@ -62,6 +62,9 @@ struct clip_graph {
     // build vision transformer (ViT) cgraph
     // this function should cover most of the models
     // if your model has specific features, you should probably duplicate this function
+    //
+    // inp is 2D [n_embd, n_pos] or 3D [n_embd, n_pos, B] (batched multi-tile encode);
+    // returns the same rank with the batch dim preserved (B==1 -> trailing 1)
     ggml_tensor * build_vit(
                 ggml_tensor * inp,
                 int64_t n_pos,
@@ -75,7 +78,7 @@ struct clip_graph {
     // returns tensor with shape [n_embd, n_patches]
     ggml_tensor * build_inp();
 
-    ggml_tensor * build_inp_raw(int channels = 3);
+    ggml_tensor * build_inp_raw(int channels = 3, int batch = 1);
 
     ggml_tensor * build_norm(
             ggml_tensor * cur,

@@ -523,8 +523,9 @@ struct clip_image_f32_batch {
 
     // for llava-uhd style models, we need to know the grid size
     // note: entries.size() == grid_x * grid_y + 1 (one overview image)
-    int grid_x = 0;
-    int grid_y = 0;
+    // 1x1 = no tiling; llava-uhd preprocessors always overwrite grid before has_tiling_grid reads it
+    int grid_x = 1;
+    int grid_y = 1;
 
     clip_image_f32_batch clone() const {
         clip_image_f32_batch new_batch{

@@ -69,6 +69,7 @@ struct clip_hparams {
     std::vector<clip_image_size> image_res_candidates;
     int32_t preproc_min_tiles = 0;
     int32_t preproc_max_tiles = 0;
+    int32_t preproc_tile_size = 0;
     resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
     resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
     pad_style image_pad_rf = PAD_CEIL;  // padding style for the refined image (e.g. llava-1.6)

@@ -64,7 +64,8 @@ int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
 // TODO: should be enum, not string
 const char * clip_patch_merge_type(const struct clip_ctx * ctx);
 
-int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+// grid_x/grid_y of the tile grid; leave at 1 for a single image
+int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img, int grid_x = 1, int grid_y = 1);
 
 // for M-RoPE, this will be the number of token positions in X and Y directions
 // for other models, X will be the total number of tokens and Y will be 1

@@ -96,6 +96,8 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
     const int n_heads = hparams.sam_n_head;
     const int d_heads = n_embd / n_heads;
     const int window  = hparams.attn_window_size;
+    // SAM stage runs its layernorms at 1e-6
+    const float sam_eps = 1e-6f;
 
     ggml_tensor * inpL;
 
@@ -134,7 +136,7 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
         ggml_tensor * shortcut = cur;
 
         // layernorm1
-        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
+        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, sam_eps, il);
 
         const int64_t w0 = cur->ne[1];
         const int64_t h0 = cur->ne[2];
@@ -214,7 +216,7 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
         ggml_tensor * inpFF = cur;
 
         // layernorm2
-        cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
+        cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, sam_eps, il);
 
         // ffn
         cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b,
@@ -229,12 +231,12 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
 
     cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1);
     cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
-    cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1);
+    cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, sam_eps, -1);
     cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
 
     cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1);
     cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
-    cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1);
+    cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, sam_eps, -1);
     cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
 
     cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1);
@@ -246,23 +248,23 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
 }
 
 ggml_cgraph * clip_graph_deepseekocr::build() {
-    // patch embedding
-    ggml_tensor * inp_raw = build_inp_raw();
+    const int64_t B = grid_x * grid_y;
+
+    ggml_tensor * inp_raw = build_inp_raw(3, grid_x * grid_y);
+
+    // sam_out: [16, 16, n_embd_clip, B]
     ggml_tensor * sam_out = build_sam(inp_raw);
 
     const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1];
 
     ggml_tensor * clip_out;
-    // Building DS-OCR CLIP
     {
-        ggml_tensor * inp;
-
-        inp = ggml_reshape_2d(ctx0, sam_out, clip_n_patches, sam_out->ne[2]);
+        // [16, 16, n_embd, B] -> [n_patches, n_embd, B]
+        ggml_tensor * inp = ggml_reshape_3d(ctx0, sam_out, clip_n_patches, sam_out->ne[2], B);
         inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
 
         ggml_tensor * new_pos_embd = model.position_embeddings;
-
-        int        n_pos    = new_pos_embd->ne[1];  // +1 for [CLS]
+        int         n_pos    = new_pos_embd->ne[1];   // already counts the [CLS] slot (n_patches + 1)
         const auto tgt_size = static_cast<int>(std::sqrt(inp->ne[1]));
         const auto src_size = static_cast<int>(std::sqrt(n_pos - 1));
 
@@ -281,41 +283,52 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
             n_pos        = tgt_size * tgt_size + 1;
         }
 
-        // add CLS token
-        inp = ggml_concat(ctx0, model.class_embedding, inp, 1);
+        // CLS token, broadcast across the batch
+        ggml_tensor * cls_b = ggml_repeat_4d(ctx0, model.class_embedding, n_embd, 1, B, 1);
+        inp = ggml_concat(ctx0, cls_b, inp, 1); // [n_embd, n_pos, B]
 
         // for selecting learned pos embd, used by ViT
         ggml_tensor * positions        = ggml_cast(ctx0, ggml_arange(ctx0, 0, n_pos, 1), GGML_TYPE_I32);
         ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, new_pos_embd, positions);
 
-        ggml_tensor * cur = build_vit(inp, n_pos, NORM_TYPE_NORMAL, FFN_GELU_QUICK, learned_pos_embd, nullptr);
+        // one builder for all B (B==1 is just the single global view)
+        clip_out = build_vit(inp, n_pos, NORM_TYPE_NORMAL, FFN_GELU_QUICK, learned_pos_embd, nullptr);  // [n_embd, n_pos, B]
 
-        ggml_build_forward_expand(gf, cur);
-        clip_out = cur;
+        ggml_build_forward_expand(gf, clip_out);
     }
 
     sam_out  = ggml_cont(ctx0, ggml_permute(ctx0, sam_out, 1, 2, 0, 3));
-    sam_out  = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0], clip_n_patches);
-    clip_out = ggml_view_2d(ctx0, clip_out, n_embd, clip_n_patches, clip_out->nb[1], clip_out->nb[1]);
+    sam_out  = ggml_reshape_3d(ctx0, sam_out, sam_out->ne[0], clip_n_patches, B);
+    clip_out = ggml_view_3d(ctx0, clip_out, n_embd, clip_n_patches, B,
+                            clip_out->nb[1], clip_out->nb[2], clip_out->nb[1]);
 
     ggml_tensor * cur;
     cur = ggml_concat(ctx0, clip_out, sam_out, 0);
     cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
     cur = ggml_add(ctx0, cur, model.mm_fc_b);
 
-    const auto h     = static_cast<int>(std::sqrt(static_cast<float>(cur->ne[1])));
-    const auto w     = h;
-    const auto n_dim = cur->ne[0];
+    const int tile_tokens = cur->ne[1];
+    const int tile_w      = static_cast<int>(std::sqrt(static_cast<float>(tile_tokens))); // tiles are square
+    const int gw          = tile_w * grid_x;
+    const int gh          = tile_w * grid_y;
+    const int n_dim       = cur->ne[0];
 
-    ggml_tensor * imgnl;
+    cur = ggml_reshape_4d(ctx0, cur, n_dim * tile_w, tile_w, grid_x, grid_y); // [n_dim*tile_w, tile_w, grid_x, grid_y]
+    cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));
 
-    imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1);
-    cur   = ggml_reshape_3d(ctx0, cur, n_dim, w, h);
-    cur   = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h);
-    cur   = ggml_concat(ctx0, cur, model.view_seperator, 1);  // (n_dim, h*(w+1) + 1)
+    ggml_tensor * nl;
 
-    cb(cur, "dsocr_output", -1);
+    // append image_newline after each of the gh token rows of the stitched tile grid
+    nl  = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, gh, 1);
+    cur = ggml_reshape_3d(ctx0, cur, n_dim, gw, gh); // [n_dim, gw, gh]
+    cur = ggml_concat(ctx0, cur, nl, 1);
+    cur = ggml_reshape_2d(ctx0, cur, n_dim, (gw + 1) * gh);
+
+    if (img.add_viewsep) {
+        cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, (gw+1)*gh + 1)
+    }
 
+    cb(cur, "dsocr_output", -1);
     ggml_build_forward_expand(gf, cur);
     return gf;
 }
@@ -4,17 +4,17 @@ ggml_cgraph * clip_graph_deepseekocr2::build() {
     GGML_ASSERT(hparams.n_head_kv > 0);
     GGML_ASSERT(n_head % hparams.n_head_kv == 0);
 
-    // patch embedding
-    ggml_tensor * inp_raw = build_inp_raw();
+    const int64_t B = grid_x * grid_y;
+
+    ggml_tensor * inp_raw = build_inp_raw(3, grid_x * grid_y);
 
     ggml_tensor * sam_out = build_sam(inp_raw);
 
     ggml_tensor * qwen2_out;
     // Building Qwen2 encoder
     {
-        ggml_tensor * inp;
-
-        inp = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0] * sam_out->ne[1], sam_out->ne[2]); // H*W, C
+        // [W, H, C, B] -> [H*W, C, B] -> [C, H*W, B]
+        ggml_tensor * inp = ggml_reshape_3d(ctx0, sam_out, sam_out->ne[0] * sam_out->ne[1], sam_out->ne[2], B);
         inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
 
         auto num_image_tokens = inp->ne[1]; // H*W
@@ -32,8 +32,10 @@ ggml_cgraph * clip_graph_deepseekocr2::build() {
             num_queries = 144;
         }
 
-        // (B, num_image_tokens + num_queries, C)
-        inp = ggml_concat(ctx0, inp, ggml_cast(ctx0, query_embed, inp->type), 1);
+        // query_embed [C, num_queries]; broadcast across the batch and append:
+        ggml_tensor * query_b = ggml_repeat_4d(ctx0, ggml_cast(ctx0, query_embed, inp->type),
+                                               inp->ne[0], num_queries, B, 1);
+        inp = ggml_concat(ctx0, inp, query_b, 1);
 
         auto seq_len = inp->ne[1];
 
@@ -54,26 +56,30 @@ ggml_cgraph * clip_graph_deepseekocr2::build() {
 
         // build_vit applies model.post_ln_w internally; do not re-apply
         ggml_tensor * cur = build_vit(inp, seq_len, NORM_TYPE_RMS, FFN_SILU,
-                                      /* learned_pos_embd */ nullptr, add_rope, vit_opts);
+                                      /* learned_pos_embd */ nullptr, add_rope, vit_opts); // [C, seq_len, B]
 
+        // only keep the query tokens; [C, num_queries, B]
         cur = ggml_cont(ctx0,
-                        ggml_view_2d(ctx0, cur, cur->ne[0], num_queries, cur->nb[1],
-                                     cur->nb[1] * (cur->ne[1] - num_queries))); // only take query tokens for output
+                        ggml_view_3d(ctx0, cur, cur->ne[0], num_queries, B,
+                                     cur->nb[1], cur->nb[2], cur->nb[1] * (cur->ne[1] - num_queries)));
 
         ggml_build_forward_expand(gf, cur);
         qwen2_out = cur;
     }
 
     ggml_tensor * cur;
 
-    cur = ggml_mul_mat(ctx0, model.mm_fc_w, qwen2_out);
+    cur = ggml_mul_mat(ctx0, model.mm_fc_w, qwen2_out); // [n_dim, num_queries, B]
     cur = ggml_add(ctx0, cur, model.mm_fc_b);
 
     // view_seperator only after the global view
     if (img.add_viewsep) {
+        GGML_ASSERT(B == 1);
         cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, 257)
     }
 
+    // flatten the batch
+    cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
     cb(cur, "dsocr2_output", -1);
 
     ggml_build_forward_expand(gf, cur);

@@ -120,14 +120,24 @@ struct clip_graph_whisper_enc : clip_graph {
     ggml_cgraph * build() override;
 };
 
+// one graph for both the global view (grid 1x1)
+// and multi-tile batch; batch dim is grid_x * grid_y
 struct clip_graph_deepseekocr : clip_graph {
-    clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    int grid_x;
+    int grid_y;
+
+    clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img,
+                           const int grid_x = 1, const int grid_y = 1)
+        : clip_graph(ctx, img), grid_x(grid_x), grid_y(grid_y) {}
+
     ggml_cgraph * build() override;
     ggml_tensor * build_sam(ggml_tensor * inp); // build the SAM model
 };
 
 struct clip_graph_deepseekocr2 : clip_graph_deepseekocr {
-    clip_graph_deepseekocr2(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph_deepseekocr(ctx, img) {}
+    clip_graph_deepseekocr2(clip_ctx * ctx, const clip_image_f32 & img,
+                            const int grid_x = 1, const int grid_y = 1)
+        : clip_graph_deepseekocr(ctx, img, grid_x, grid_y) {}
     ggml_cgraph * build() override; // reuses build_sam() from base
 };