diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h
index 1d9f6a136a96..655d361b1ff2 100644
--- a/tools/mtmd/clip-graph.h
+++ b/tools/mtmd/clip-graph.h
@@ -62,6 +62,9 @@ struct clip_graph {
     // build vision transformer (ViT) cgraph
     // this function should cover most of the models
     // if your model has specific features, you should probably duplicate this function
+    //
+    // inp is 2D [n_embd, n_pos] or 3D [n_embd, n_pos, B] (batched multi-tile encode);
+    // the output is reshaped back to 3D with the batch dim restored (B == 1 -> trailing dim of 1)
     ggml_tensor * build_vit(
                 ggml_tensor * inp,
                 int64_t n_pos,
@@ -75,7 +78,7 @@ struct clip_graph {
     // returns tensor with shape [n_embd, n_patches]
     ggml_tensor * build_inp();
 
-    ggml_tensor * build_inp_raw(int channels = 3);
+    ggml_tensor * build_inp_raw(int channels = 3, int batch = 1);
 
     ggml_tensor * build_norm(
             ggml_tensor * cur,
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index c055cfb75419..cb517812f189 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -523,8 +523,9 @@ struct clip_image_f32_batch {
 
     // for llava-uhd style models, we need to know the grid size
     // note: entries.size() == grid_x * grid_y + 1 (one overview image)
-    int grid_x = 0;
-    int grid_y = 0;
+    // 1x1 = no tiling; llava-uhd preprocessors always overwrite grid before has_tiling_grid reads it
+    int grid_x = 1;
+    int grid_y = 1;
 
     clip_image_f32_batch clone() const {
         clip_image_f32_batch new_batch{
diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h
index 238f805a9aae..848ad4da7801 100644
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -69,6 +69,7 @@ struct clip_hparams {
     std::vector<clip_image_size> image_res_candidates;
     int32_t preproc_min_tiles = 0;
     int32_t preproc_max_tiles = 0;
+    int32_t preproc_tile_size = 0;
     resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
     resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
     pad_style image_pad_rf = PAD_CEIL;  // padding style for the refined image (e.g. llava-1.6)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 80136ed86672..5d7ed82b6e8f 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -310,11 +310,17 @@ ggml_tensor * clip_graph::build_vit(
             std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos,
             const build_vit_opts & opts
         ) {
+    // batch dim: inp is [n_embd, n_pos] (B==1) or [n_embd, n_pos, B] (multi-tile encode)
+    const int64_t B = inp->ne[2];
+
     if (learned_pos_embd) {
         inp = ggml_add(ctx0, inp, learned_pos_embd);
         cb(inp, "pos_embed", -1);
     }
 
+    // flatten batch; unflatten again in attention
+    inp = ggml_reshape_2d(ctx0, inp, n_embd, n_pos * B);
+
     ggml_tensor * inpL = inp;
 
     // pre-layernorm
@@ -344,20 +350,24 @@ ggml_tensor * clip_graph::build_vit(
                     cur = ggml_add(ctx0, cur, layer.qkv_b);
                 }
 
-                Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
-                    /* nb1    */ ggml_row_size(cur->type, d_head),
-                    /* nb2    */ cur->nb[1],
-                    /* offset */ 0);
-
-                Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
-                    /* nb1    */ ggml_row_size(cur->type, d_head),
-                    /* nb2    */ cur->nb[1],
-                    /* offset */ ggml_row_size(cur->type, n_embd));
-
-                Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
-                    /* nb1    */ ggml_row_size(cur->type, d_head),
-                    /* nb2    */ cur->nb[1],
-                    /* offset */ ggml_row_size(cur->type, 2 * n_embd));
+                // Q/K/V as [d_head, n_head, n_pos, B]; the batch stride (nb3) is cur->nb[1] * n_pos
+                Qcur = ggml_view_4d(ctx0, cur, d_head, n_head, n_pos, B,
+                /* nb1    */ ggml_row_size(cur->type, d_head),
+                /* nb2    */ cur->nb[1],
+                /* nb3    */ cur->nb[1] * n_pos,
+                /* offset */ 0);
+
+                Kcur = ggml_view_4d(ctx0, cur, d_head, n_head, n_pos, B,
+                /* nb1    */ ggml_row_size(cur->type, d_head),
+                /* nb2    */ cur->nb[1],
+                /* nb3    */ cur->nb[1] * n_pos,
+                /* offset */ ggml_row_size(cur->type, n_embd));
+
+                Vcur = ggml_view_4d(ctx0, cur, d_head, n_head, n_pos, B,
+                /* nb1    */ ggml_row_size(cur->type, d_head),
+                /* nb2    */ cur->nb[1],
+                /* nb3    */ cur->nb[1] * n_pos,
+                /* offset */ ggml_row_size(cur->type, 2 * n_embd));
 
                 if (layer.q_norm) {
                     GGML_ASSERT(layer.q_norm->ne[0] == Qcur->ne[0]);
@@ -402,9 +412,9 @@ ggml_tensor * clip_graph::build_vit(
                     }
                 }
 
-                Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head,    n_pos);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head_kv, n_pos);
-                Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head_kv, n_pos);
+                Qcur = ggml_reshape_4d(ctx0, Qcur, d_head, n_head,    n_pos, B);
+                Kcur = ggml_reshape_4d(ctx0, Kcur, d_head, n_head_kv, n_pos, B);
+                Vcur = ggml_reshape_4d(ctx0, Vcur, d_head, n_head_kv, n_pos, B);
 
                 if (norm_per_head) {
                     if (layer.q_norm) {
@@ -434,6 +444,7 @@ ggml_tensor * clip_graph::build_vit(
                 cb(Vcur, "Vcur_normed", il);
             }
 
+            // build_attn returns a flat 2D [n_embd, n_pos*B]
             cur = build_attn(layer.o_w, layer.o_b,
                 Qcur, Kcur, Vcur, opts.attn_mask, kq_scale, il);
             cb(cur, "attn_out", il);
@@ -505,6 +516,10 @@ ggml_tensor * clip_graph::build_vit(
     if (model.post_ln_w) {
         inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1);
     }
+
+    // restore the batch dim
+    GGML_ASSERT(inpL->ne[1] % B == 0);
+    inpL = ggml_reshape_3d(ctx0, inpL, n_embd, inpL->ne[1] / B, B);
     return inpL;
 }
 
@@ -522,8 +537,10 @@ ggml_tensor * clip_graph::build_inp() {
     return inp;
 }
 
-ggml_tensor * clip_graph::build_inp_raw(int channels) {
-    ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels);
+ggml_tensor * clip_graph::build_inp_raw(int channels, int batch) {
+    ggml_tensor * inp_raw = batch > 1
+        ? ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels, batch)
+        : ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels);
     ggml_set_name(inp_raw, "inp_raw");
     ggml_set_input(inp_raw);
     return inp_raw;
@@ -844,7 +861,9 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale
 }
 
 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
-    GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported");
+    const bool is_dsocr_tiles = (ctx->proj_type() == PROJECTOR_TYPE_DEEPSEEKOCR
+                                 || ctx->proj_type() == PROJECTOR_TYPE_DEEPSEEKOCR2) && imgs.entries.size() > 1;
+    GGML_ASSERT((imgs.entries.size() == 1 || is_dsocr_tiles) && "n_batch > 1 is not supported");
 
     const clip_image_f32 & img = *imgs.entries[0];
     std::unique_ptr<clip_graph> builder;
@@ -959,11 +978,13 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             } break;
         case PROJECTOR_TYPE_DEEPSEEKOCR:
             {
-                builder = std::make_unique<clip_graph_deepseekocr>(ctx, img);
+                // same builder for single image (grid 1x1) and batched tiles (grid_x*grid_y of them)
+                builder = std::make_unique<clip_graph_deepseekocr>(ctx, img, imgs.grid_x, imgs.grid_y);
             } break;
         case PROJECTOR_TYPE_DEEPSEEKOCR2:
              {
-                builder = std::make_unique<clip_graph_deepseekocr2>(ctx, img);
+                // same builder for single image (grid 1x1) and batched tiles (grid_x*grid_y of them)
+                builder = std::make_unique<clip_graph_deepseekocr2>(ctx, img, imgs.grid_x, imgs.grid_y);
             } break;
         case PROJECTOR_TYPE_LFM2A:
             {
@@ -1540,7 +1561,7 @@ struct clip_model_loader {
                     {
                         hparams.patch_size = 16;
                         hparams.image_size = 1024;
-                        hparams.warmup_image_size = 1024;
+                        hparams.warmup_image_size = hparams.image_size; // global view is fixed at image_size
                         hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
                         hparams.image_pad_color = {127, 127, 127};
 
@@ -1548,7 +1569,17 @@ struct clip_model_loader {
                         get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true);
                         get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true);
                         get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
+                        // dynamic-resolution tiling config
+                        hparams.preproc_min_tiles = 2;
+                        if (model.proj_type == PROJECTOR_TYPE_DEEPSEEKOCR) {
+                            hparams.preproc_max_tiles = 9;
+                            hparams.preproc_tile_size = 640;
+                            // the CLIP/ViT body runs at 1e-5
+                            hparams.eps = 1e-5f;
+                        }
                         if (model.proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) {
+                            hparams.preproc_max_tiles = 6;
+                            hparams.preproc_tile_size = 768;
                             // qwen2 encoder is GQA, requires KEY_N_HEAD_KV
                             get_u32(string_format(KEY_N_HEAD_KV, "vision"), hparams.n_head_kv);
                         }
@@ -2724,6 +2755,38 @@ struct clip_model_loader {
         std::vector<support_info_op> ops;
     };
 
+    // reserve compute meta for the worst-case DeepSeek-OCR (v1+v2) tile batch: preproc_max_tiles square tiles of preproc_tile_size; no-op for other projectors
+    static void reserve_dsocr_max_tiles(clip_ctx & ctx_clip) {
+        const auto proj = ctx_clip.proj_type();
+        if (proj != PROJECTOR_TYPE_DEEPSEEKOCR && proj != PROJECTOR_TYPE_DEEPSEEKOCR2) {
+            return;
+        }
+        const auto & hparams   = ctx_clip.model.hparams;
+        const int    max_tiles = hparams.preproc_max_tiles;
+        const int    tile_size = hparams.preproc_tile_size;
+        if (max_tiles <= 1 || tile_size <= 0) {
+            return;
+        }
+
+        // 1 x max_tiles grid: v1 weaves one newline per grid row, so a single column gives the most rows (worst case)
+        const int grid_x = 1;
+        const int grid_y = max_tiles;
+
+        clip_image_f32_batch tiles;
+        for (int i = 0; i < max_tiles; i++) {
+            clip_image_f32_ptr tile(clip_image_f32_init());
+            tile->nx = tile_size;
+            tile->ny = tile_size;
+            tiles.entries.push_back(std::move(tile));
+        }
+        tiles.grid_x = grid_x;
+        tiles.grid_y = grid_y;
+
+        LOG_INF("%s: reserving worst-case tile batch: %d tiles (%dx%d grid) @ %dx%d\n",
+                __func__, max_tiles, grid_x, grid_y, tile_size, tile_size);
+        reserve_compute_meta(ctx_clip, tiles);
+    }
+
     static void warmup(clip_ctx & ctx_clip) {
         // create a fake batch
         const auto & hparams = ctx_clip.model.hparams;
@@ -2740,6 +2803,9 @@ struct clip_model_loader {
         }
         batch.entries.push_back(std::move(img));
         warmup(ctx_clip, batch);
+
+        // DeepSeek-OCR v1+v2: in addition to the single-image warmup above, reserve for the worst-case max-tiles batch
+        reserve_dsocr_max_tiles(ctx_clip);
     }
 
     static void warmup(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
@@ -3171,7 +3237,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
     return 1;
 }
 
-int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img, int grid_x, int grid_y) {
     const auto & params = ctx->model.hparams;
 
     // for models with fixed size image, the input image is already pre-processed and resized to square
@@ -3346,17 +3412,25 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                 n_patches += 2; // for BOI and EOI token embeddings
             } break;
         case PROJECTOR_TYPE_DEEPSEEKOCR:
-        {
-            // SAM encoder applies two stride-2 convolutions (net_2 and net_3)
-            // that reduce spatial dimensions by 4x in each direction (16x total)
-            // E.g., 64x64 -> 16x16 patches
-            n_patches /= 16;
-
-            // build_global_local_features adds image newlines and view separator
-            // Formula: h*(w+1) + 1 where h = w = sqrt(n_patches)
-            int h = static_cast<int>(std::sqrt(static_cast<float>(n_patches)));
-            n_patches = h * (h + 1) + 1;
-        } break;
+            {
+                // SAM encoder applies two stride-2 convolutions (net_2 and net_3)
+                // that reduce spatial dimensions by 4x in each direction (16x total)
+                // E.g., 64x64 -> 16x16 patches
+                n_patches /= 16;
+
+                // global view (add_viewsep) is encoded single-image, never with a tile grid
+                GGML_ASSERT(!(img->add_viewsep && (grid_x > 1 || grid_y > 1)));
+
+                const int h = static_cast<int>(std::sqrt(static_cast<float>(n_patches)));
+                if (grid_x > 1 || grid_y > 1) {
+                    // tiles: the batched graph lays them out on the grid_x x grid_y grid
+                    // and weaves one newline per row, emitting a single combined output
+                    n_patches = (h * grid_x + 1) * (h * grid_y);
+                } else if (img->add_viewsep) {
+                    // global view: weave one newline per row + trailing view separator
+                    n_patches = h * (h + 1) + 1;
+                }
+            } break;
         case PROJECTOR_TYPE_HUNYUANVL:
             {
                 int merge = ctx->model.hparams.n_merge;
@@ -3365,14 +3439,20 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                 n_patches = (ow + 1) * oh + 2;
             } break;
         case PROJECTOR_TYPE_DEEPSEEKOCR2:
-        {
-            // 1024 global view -> 256 query tokens + 1 view separator = 257;
-            // 768 local tile   -> 144 query tokens, no separator.
-            n_patches /= 16;
-            if (img->add_viewsep) {
-                n_patches += 1; // view separator, appended only after the global view
-            }
-        } break;
+            {
+                // 1024 global view -> 256 query tokens + 1 view separator = 257;
+                // 768 local tile   -> 144 query tokens, no separator.
+                n_patches /= 16;
+
+                // global view (add_viewsep) is encoded single-image, never with a tile grid
+                GGML_ASSERT(!(img->add_viewsep && (grid_x > 1 || grid_y > 1)));
+                if (img->add_viewsep) {
+                    n_patches += 1; // view separator, appended only after the global view
+                } else if (grid_x > 1 || grid_y > 1) {
+                    // tiles concatenate their per-tile query tokens (grid_x*grid_y of them); no in-graph weave
+                    n_patches = n_patches * grid_x * grid_y;
+                }
+            } break;
         case PROJECTOR_TYPE_LFM2A:
             {
                 n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
@@ -3417,9 +3497,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const clip_image_f32_batch & imgs = *imgs_c_ptr;
     int batch_size = imgs.entries.size();
 
+    // DSOCR (v1 and v2) encode their tiles in one batched graph; every other path is single-image
+    const bool is_dsocr_tiles = (ctx->proj_type() == PROJECTOR_TYPE_DEEPSEEKOCR
+                                 || ctx->proj_type() == PROJECTOR_TYPE_DEEPSEEKOCR2) && batch_size > 1;
+
     // TODO @ngxson : implement batch size > 1 as a loop
     //                we don't need true batching support because the cgraph will gonna be big anyway
-    if (batch_size != 1) {
+    if (batch_size != 1 && !is_dsocr_tiles) {
         return false; // only support batch size of 1
     }
 
@@ -3491,23 +3575,23 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         // └─────┘ │
         //   ──────┘ x B
 
-        for (size_t i = 0; i < imgs.entries.size(); i++) {
-            const int nx = imgs.entries[i]->nx;
-            const int ny = imgs.entries[i]->ny;
-            const int n = nx * ny;
-
-            for (int b = 0; b < batch_size; b++) {
-                float * batch_entry = inp_raw.data() + b * (3*n);
-                for (int y = 0; y < ny; y++) {
-                    for (int x = 0; x < nx; x++) {
-                        size_t base_src = 3*(y * nx + x); // idx of the first channel
-                        size_t base_dst =    y * nx + x;  // idx of the first channel
-                        batch_entry[      base_dst] = imgs.entries[b]->buf[base_src    ];
-                        batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
-                        batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
-                    }
+        size_t off = 0; // running offset into inp_raw; each entry is a contiguous [W, H, 3] block
+        for (size_t b = 0; b < imgs.entries.size(); b++) {
+            const int nx = imgs.entries[b]->nx;
+            const int ny = imgs.entries[b]->ny;
+            const int n  = nx * ny;
+
+            float * batch_entry = inp_raw.data() + off;
+            for (int y = 0; y < ny; y++) {
+                for (int x = 0; x < nx; x++) {
+                    size_t base_src = 3*(y * nx + x); // idx of the first channel
+                    size_t base_dst =    y * nx + x;  // idx of the first channel
+                    batch_entry[      base_dst] = imgs.entries[b]->buf[base_src    ];
+                    batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
+                    batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
                 }
             }
+            off += 3 * n;
         }
         set_input_f32("inp_raw", inp_raw);
 
@@ -4252,9 +4336,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     // the last node is the embedding tensor
     ggml_tensor * embeddings = ggml_graph_node(gf, -1);
 
-    // sanity check (only support batch size of 1 for now)
+    // sanity check
     const int n_tokens_out = embeddings->ne[1];
-    const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
+    const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get(), imgs.grid_x, imgs.grid_y);
     if (n_tokens_out != expected_n_tokens_out) {
         LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
         GGML_ABORT("Invalid number of output tokens");
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index 9b807ffa77b3..f83cd24e777c 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -64,7 +64,8 @@ int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
 // TODO: should be enum, not string
 const char * clip_patch_merge_type(const struct clip_ctx * ctx);
 
-int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+// grid_x/grid_y of the tile grid; leave at 1 for a single image
+int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img, int grid_x = 1, int grid_y = 1);
 
 // for M-RoPE, this will be the number of token positions in X and Y directions
 // for other models, X will be the total number of tokens and Y will be 1
diff --git a/tools/mtmd/models/deepseekocr.cpp b/tools/mtmd/models/deepseekocr.cpp
index c3c22d0a4bac..22badf70298d 100644
--- a/tools/mtmd/models/deepseekocr.cpp
+++ b/tools/mtmd/models/deepseekocr.cpp
@@ -96,6 +96,8 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
     const int n_heads = hparams.sam_n_head;
     const int d_heads = n_embd / n_heads;
     const int window  = hparams.attn_window_size;
+    // SAM stage runs its layernorms at 1e-6
+    const float sam_eps = 1e-6f;
 
     ggml_tensor * inpL;
 
@@ -134,7 +136,7 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
         ggml_tensor * shortcut = cur;
 
         // layernorm1
-        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
+        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, sam_eps, il);
 
         const int64_t w0 = cur->ne[1];
         const int64_t h0 = cur->ne[2];
@@ -214,7 +216,7 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
         ggml_tensor * inpFF = cur;
 
         // layernorm2
-        cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
+        cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, sam_eps, il);
 
         // ffn
         cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b,
@@ -229,12 +231,12 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
 
     cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1);
     cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
-    cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1);
+    cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, sam_eps, -1);
     cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
 
     cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1);
     cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
-    cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1);
+    cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, sam_eps, -1);
     cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
 
     cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1);
@@ -246,23 +248,23 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
 }
 
 ggml_cgraph * clip_graph_deepseekocr::build() {
-    // patch embedding
-    ggml_tensor * inp_raw = build_inp_raw();
+    const int64_t B = grid_x * grid_y;
+
+    ggml_tensor * inp_raw = build_inp_raw(3, grid_x * grid_y);
+
+    // sam_out: [W, H, C, B] (e.g. 16x16 spatial for a 1024 input)
     ggml_tensor * sam_out = build_sam(inp_raw);
 
     const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1];
 
     ggml_tensor * clip_out;
-    // Building DS-OCR CLIP
     {
-        ggml_tensor * inp;
-
-        inp = ggml_reshape_2d(ctx0, sam_out, clip_n_patches, sam_out->ne[2]);
+        // [W, H, C, B] -> [W*H, C, B] (permuted to [C, W*H, B] on the next line)
+        ggml_tensor * inp = ggml_reshape_3d(ctx0, sam_out, clip_n_patches, sam_out->ne[2], B);
         inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
 
         ggml_tensor * new_pos_embd = model.position_embeddings;
-
-        int        n_pos    = new_pos_embd->ne[1];  // +1 for [CLS]
+        int         n_pos    = new_pos_embd->ne[1];   // +1 for [CLS]
         const auto tgt_size = static_cast<int>(std::sqrt(inp->ne[1]));
         const auto src_size = static_cast<int>(std::sqrt(n_pos - 1));
 
@@ -281,41 +283,52 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
             n_pos        = tgt_size * tgt_size + 1;
         }
 
-        // add CLS token
-        inp = ggml_concat(ctx0, model.class_embedding, inp, 1);
+        // CLS token, broadcast across the batch
+        ggml_tensor * cls_b = ggml_repeat_4d(ctx0, model.class_embedding, n_embd, 1, B, 1);
+        inp = ggml_concat(ctx0, cls_b, inp, 1); // [n_embd, n_pos, B]
 
         // for selecting learned pos embd, used by ViT
         ggml_tensor * positions        = ggml_cast(ctx0, ggml_arange(ctx0, 0, n_pos, 1), GGML_TYPE_I32);
         ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, new_pos_embd, positions);
 
-        ggml_tensor * cur = build_vit(inp, n_pos, NORM_TYPE_NORMAL, FFN_GELU_QUICK, learned_pos_embd, nullptr);
+        // one builder for all B (B==1 is just the single global view)
+        clip_out = build_vit(inp, n_pos, NORM_TYPE_NORMAL, FFN_GELU_QUICK, learned_pos_embd, nullptr);  // [n_embd, n_pos, B]
 
-        ggml_build_forward_expand(gf, cur);
-        clip_out = cur;
+        ggml_build_forward_expand(gf, clip_out);
     }
 
     sam_out  = ggml_cont(ctx0, ggml_permute(ctx0, sam_out, 1, 2, 0, 3));
-    sam_out  = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0], clip_n_patches);
-    clip_out = ggml_view_2d(ctx0, clip_out, n_embd, clip_n_patches, clip_out->nb[1], clip_out->nb[1]);
+    sam_out  = ggml_reshape_3d(ctx0, sam_out, sam_out->ne[0], clip_n_patches, B);
+    clip_out = ggml_view_3d(ctx0, clip_out, n_embd, clip_n_patches, B,
+                            clip_out->nb[1], clip_out->nb[2], clip_out->nb[1]);
 
     ggml_tensor * cur;
     cur = ggml_concat(ctx0, clip_out, sam_out, 0);
     cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
     cur = ggml_add(ctx0, cur, model.mm_fc_b);
 
-    const auto h     = static_cast<int>(std::sqrt(static_cast<float>(cur->ne[1])));
-    const auto w     = h;
-    const auto n_dim = cur->ne[0];
+    const int tile_tokens = cur->ne[1];
+    const int tile_w      = static_cast<int>(std::sqrt(static_cast<float>(tile_tokens))); // tiles are square
+    const int gw          = tile_w * grid_x;
+    const int gh          = tile_w * grid_y;
+    const int n_dim       = cur->ne[0];
 
-    ggml_tensor * imgnl;
+    cur = ggml_reshape_4d(ctx0, cur, n_dim * tile_w, tile_w, grid_x, grid_y); // [n_dim*tile_w, tile_w, grid_x, grid_y]
+    cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));
 
-    imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1);
-    cur   = ggml_reshape_3d(ctx0, cur, n_dim, w, h);
-    cur   = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h);
-    cur   = ggml_concat(ctx0, cur, model.view_seperator, 1);  // (n_dim, h*(w+1) + 1)
+    ggml_tensor * nl;
 
-    cb(cur, "dsocr_output", -1);
+    // weave newline at the end of every grid row
+    nl  = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, gh, 1);
+    cur = ggml_reshape_3d(ctx0, cur, n_dim, gw, gh); // [n_dim, gw, gh]
+    cur = ggml_concat(ctx0, cur, nl, 1);
+    cur = ggml_reshape_2d(ctx0, cur, n_dim, (gw + 1) * gh);
+
+    if (img.add_viewsep) {
+        cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, (gw+1)*gh + 1)
+    }
 
+    cb(cur, "dsocr_output", -1);
     ggml_build_forward_expand(gf, cur);
     return gf;
 }
diff --git a/tools/mtmd/models/deepseekocr2.cpp b/tools/mtmd/models/deepseekocr2.cpp
index 056bb81807f3..8fe47d580401 100644
--- a/tools/mtmd/models/deepseekocr2.cpp
+++ b/tools/mtmd/models/deepseekocr2.cpp
@@ -4,17 +4,17 @@ ggml_cgraph * clip_graph_deepseekocr2::build() {
     GGML_ASSERT(hparams.n_head_kv > 0);
     GGML_ASSERT(n_head % hparams.n_head_kv == 0);
 
-    // patch embedding
-    ggml_tensor * inp_raw = build_inp_raw();
+    const int64_t B = grid_x * grid_y;
+
+    ggml_tensor * inp_raw = build_inp_raw(3, grid_x * grid_y);
 
     ggml_tensor * sam_out = build_sam(inp_raw);
 
     ggml_tensor * qwen2_out;
     // Building Qwen2 encoder
     {
-        ggml_tensor * inp;
-
-        inp = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0] * sam_out->ne[1], sam_out->ne[2]); // H*W, C
+        // [W, H, C, B] -> [H*W, C, B] -> [C, H*W, B]
+        ggml_tensor * inp = ggml_reshape_3d(ctx0, sam_out, sam_out->ne[0] * sam_out->ne[1], sam_out->ne[2], B);
         inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
 
         auto num_image_tokens = inp->ne[1]; // H*W
@@ -32,8 +32,10 @@ ggml_cgraph * clip_graph_deepseekocr2::build() {
             num_queries = 144;
         }
 
-        // (B, num_image_tokens + num_queries, C)
-        inp = ggml_concat(ctx0, inp, ggml_cast(ctx0, query_embed, inp->type), 1);
+        // query_embed is [C, num_queries]; broadcast it across the batch and append it after the image tokens
+        ggml_tensor * query_b = ggml_repeat_4d(ctx0, ggml_cast(ctx0, query_embed, inp->type),
+                                               inp->ne[0], num_queries, B, 1);
+        inp = ggml_concat(ctx0, inp, query_b, 1);
 
         auto seq_len = inp->ne[1];
 
@@ -54,11 +56,12 @@ ggml_cgraph * clip_graph_deepseekocr2::build() {
 
         // build_vit applies model.post_ln_w internally; do not re-apply
         ggml_tensor * cur = build_vit(inp, seq_len, NORM_TYPE_RMS, FFN_SILU,
-                                      /* learned_pos_embd */ nullptr, add_rope, vit_opts);
+                                      /* learned_pos_embd */ nullptr, add_rope, vit_opts); // [C, seq_len, B]
 
+        // only keep the query tokens; [C, num_queries, B]
         cur = ggml_cont(ctx0,
-                        ggml_view_2d(ctx0, cur, cur->ne[0], num_queries, cur->nb[1],
-                                     cur->nb[1] * (cur->ne[1] - num_queries))); // only take query tokens for output
+                        ggml_view_3d(ctx0, cur, cur->ne[0], num_queries, B,
+                                     cur->nb[1], cur->nb[2], cur->nb[1] * (cur->ne[1] - num_queries)));
 
         ggml_build_forward_expand(gf, cur);
         qwen2_out = cur;
@@ -66,14 +69,17 @@ ggml_cgraph * clip_graph_deepseekocr2::build() {
 
     ggml_tensor * cur;
 
-    cur = ggml_mul_mat(ctx0, model.mm_fc_w, qwen2_out);
+    cur = ggml_mul_mat(ctx0, model.mm_fc_w, qwen2_out); // [n_dim, num_queries, B]
     cur = ggml_add(ctx0, cur, model.mm_fc_b);
 
     // view_seperator only after the global view
     if (img.add_viewsep) {
+        GGML_ASSERT(B == 1);
         cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, 257)
     }
 
+    // flatten the batch
+    cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
     cb(cur, "dsocr2_output", -1);
 
     ggml_build_forward_expand(gf, cur);
diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h
index b882f800dd77..c7db362f30b7 100644
--- a/tools/mtmd/models/models.h
+++ b/tools/mtmd/models/models.h
@@ -120,14 +120,24 @@ struct clip_graph_whisper_enc : clip_graph {
     ggml_cgraph * build() override;
 };
 
+// one graph builder for both the single global view (grid 1x1, the default)
+// and a multi-tile batch; the batch dim is B = grid_x * grid_y
 struct clip_graph_deepseekocr : clip_graph {
-    clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    int grid_x;
+    int grid_y;
+
+    clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img,
+                           const int grid_x = 1, const int grid_y = 1)
+        : clip_graph(ctx, img), grid_x(grid_x), grid_y(grid_y) {}
+
     ggml_cgraph * build() override;
     ggml_tensor * build_sam(ggml_tensor * inp); // build the SAM model
 };
 
 struct clip_graph_deepseekocr2 : clip_graph_deepseekocr {
-    clip_graph_deepseekocr2(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph_deepseekocr(ctx, img) {}
+    clip_graph_deepseekocr2(clip_ctx * ctx, const clip_image_f32 & img,
+                            const int grid_x = 1, const int grid_y = 1)
+        : clip_graph_deepseekocr(ctx, img, grid_x, grid_y) {}
     ggml_cgraph * build() override; // reuses build_sam() from base
 };
 
diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp
index caf72d536214..371fa0357ab4 100644
--- a/tools/mtmd/mtmd-image.cpp
+++ b/tools/mtmd/mtmd-image.cpp
@@ -1104,46 +1104,7 @@ bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, cli
 // mtmd_image_preprocessor_deepseekocr
 //
 
-bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
-    static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ };
-    // TODO: support 512 (tiny) and 640 (small) once we have eval data for them
-
-    const int64_t orig_area = static_cast<int64_t>(img.nx) * img.ny;
-
-    size_t  mode_i   = 0;
-    int64_t min_diff = std::numeric_limits<int64_t>::max();
-    for (size_t i = 0; i < std::size(native_resolutions); i++) {
-        const int64_t r    = native_resolutions[i];
-        const int64_t diff = std::abs(orig_area - r * r);
-        if (diff < min_diff) {
-            mode_i   = i;
-            min_diff = diff;
-        }
-    }
-    const int image_size = native_resolutions[mode_i];
-
-    // Aspect-preserving fit-and-pad. Pillow bicubic + PAD_NEAREST for
-    // byte-parity with the upstream deepseek-ai/DeepSeek-OCR HF preprocessor.
-    clip_image_u8 padded;
-    img_tool::resize(img, padded, {image_size, image_size}, RESIZE_ALGO_BICUBIC_PILLOW,
-                     PAD_NEAREST, hparams.image_pad_color);
-
-    clip_image_f32_ptr res(clip_image_f32_init());
-    img_u8_to_f32(padded, *res, hparams.image_mean, hparams.image_std);
-    output.entries.push_back(std::move(res));
-
-    output.grid_x = 1;
-    output.grid_y = 1;
-    return true;
-}
-
-//
-// mtmd_image_preprocessor_deepseekocr2
-//
-
-// candidate tile grids (cols, rows) with min_tiles <= cols*rows <= max_tiles
-// sorted by tile count
-std::vector<clip_image_size> mtmd_image_preprocessor_deepseekocr2::get_target_ratios() {
+std::vector<clip_image_size> mtmd_image_preprocessor_deepseekocr::get_target_ratios() const {
     std::vector<clip_image_size> ratios;
     for (int n = min_tiles; n <= max_tiles; n++) {
         for (int w = 1; w <= n; w++) {
@@ -1159,7 +1120,7 @@ std::vector<clip_image_size> mtmd_image_preprocessor_deepseekocr2::get_target_ra
                     }
                 }
                 if (!found) {
-                    ratios.push_back({ w, h });
+                    ratios.push_back({w, h});
                 }
             }
         }
@@ -1170,23 +1131,20 @@ std::vector<clip_image_size> mtmd_image_preprocessor_deepseekocr2::get_target_ra
     return ratios;
 }
 
-// pick the grid whose aspect ratio is closest to the image
-// on a tie, prefer the larger grid when the image fits
-clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio(
-    float                                aspect_ratio,
-    const std::vector<clip_image_size> & target_ratios,
-    int                                  width,
-    int                                  height) {
-    float           best_ratio_diff = std::numeric_limits<float>::max();
-    clip_image_size best_ratio      = { 1, 1 };
-    const float     area            = static_cast<float>(width * height);
+clip_image_size mtmd_image_preprocessor_deepseekocr::find_closest_aspect_ratio(
+        float aspect_ratio,
+        const std::vector<clip_image_size> & target_ratios,
+        int width, int height) const {
+    float best_ratio_diff = std::numeric_limits<float>::max();
+    clip_image_size best_ratio = {1, 1};
+    const float area = static_cast<float>(width * height);
 
     for (const auto & ratio : target_ratios) {
         const float target_aspect_ratio = static_cast<float>(ratio.width) / ratio.height;
-        const float ratio_diff          = std::abs(aspect_ratio - target_aspect_ratio);
+        const float ratio_diff = std::abs(aspect_ratio - target_aspect_ratio);
         if (ratio_diff < best_ratio_diff) {
             best_ratio_diff = ratio_diff;
-            best_ratio      = ratio;
+            best_ratio = ratio;
         } else if (ratio_diff == best_ratio_diff) {
             const float target_area = static_cast<float>(tile_size * tile_size * ratio.width * ratio.height);
             if (area > 0.5f * target_area) {
@@ -1197,22 +1155,24 @@ clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio(
     return best_ratio;
 }
 
-bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
-    // emit 768x768 local tiles when the image is larger than a tile in either
-    // dimension, then always a 1024x1024 global view. order: [tiles..., global].
+bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+    // output order: [local tiles..., global]
 
+    int grid_w = 1;
+    int grid_h = 1;
     if (img.nx > tile_size || img.ny > tile_size) {
-        const float           aspect_ratio  = static_cast<float>(img.nx) / img.ny;
-        const auto            target_ratios = get_target_ratios();
-        const clip_image_size grid          = find_closest_aspect_ratio(aspect_ratio, target_ratios, img.nx, img.ny);
+        const float aspect_ratio   = static_cast<float>(img.nx) / img.ny;
+        const auto  target_ratios  = get_target_ratios();
+        const clip_image_size grid = find_closest_aspect_ratio(aspect_ratio, target_ratios, img.nx, img.ny);
+        grid_w = grid.width;
+        grid_h = grid.height;
 
-        // stretch onto the grid (no aspect preserve), then crop tiles row-major.
         clip_image_u8 refined;
-        img_tool::resize(img, refined, { tile_size * grid.width, tile_size * grid.height },
+        img_tool::resize(img, refined, {tile_size * grid_w, tile_size * grid_h},
                          RESIZE_ALGO_BICUBIC_PILLOW, PAD_NONE);
 
-        for (int row = 0; row < grid.height; row++) {
-            for (int col = 0; col < grid.width; col++) {
+        for (int row = 0; row < grid_h; row++) {
+            for (int col = 0; col < grid_w; col++) {
                 clip_image_u8 tile;
                 img_tool::crop(refined, tile, col * tile_size, row * tile_size, tile_size, tile_size);
                 clip_image_f32_ptr res(clip_image_f32_init());
@@ -1222,20 +1182,19 @@ bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img,
         }
     }
 
-    // global view: aspect-preserving fit-and-pad to base_size.
+    // global view: aspect-preserving fit-and-pad to base_size
     clip_image_u8 padded;
-    img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW,
+    img_tool::resize(img, padded, {base_size, base_size}, RESIZE_ALGO_BICUBIC_PILLOW,
                      PAD_NEAREST, hparams.image_pad_color);
     clip_image_f32_ptr global(clip_image_f32_init());
     img_u8_to_f32(padded, *global, hparams.image_mean, hparams.image_std);
     global->add_viewsep = true;
     output.entries.push_back(std::move(global));
-
-    output.grid_x = 1;
-    output.grid_y = 1;
+    output.grid_x = grid_w;
+    output.grid_y = grid_h;
+    LOG_DBG("%s: grid size: %d x %d (%d tiles) + global view\n", __func__, grid_w, grid_h, grid_w * grid_h);
     return true;
 }
-
 //
 // mtmd_image_preprocessor_step3vl
 //
diff --git a/tools/mtmd/mtmd-image.h b/tools/mtmd/mtmd-image.h
index 91a5bc253ef8..71170089422c 100644
--- a/tools/mtmd/mtmd-image.h
+++ b/tools/mtmd/mtmd-image.h
@@ -139,29 +139,28 @@ struct mtmd_image_preprocessor_internvl : mtmd_image_preprocessor_llava_uhd {
     bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
 };
 
+// DeepSeek-OCR (v1/v2) global view + optional local tile grid
 struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
-    mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
-    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
-};
-
-// DeepSeek-OCR-2: a 1024x1024 global view, plus InternVL-style 768x768 local
-// tiles when the image is larger than a tile in either dimension.
-struct mtmd_image_preprocessor_deepseekocr2 : mtmd_image_preprocessor {
-    static constexpr int base_size = 1024; // global view
-    static constexpr int tile_size = 768;  // local tile
-    static constexpr int min_tiles = 2;
-    static constexpr int max_tiles = 6;
-
-    mtmd_image_preprocessor_deepseekocr2(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
+    // sizes and tile limits come from clip_hparams (see load_hparams); the global view side length is hparams.image_size
+    mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx)
+        : mtmd_image_preprocessor(ctx),
+          base_size(hparams.image_size),
+          tile_size(hparams.preproc_tile_size),
+          min_tiles(hparams.preproc_min_tiles),
+          max_tiles(hparams.preproc_max_tiles) {}
     bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
 
 private:
-    static std::vector<clip_image_size> get_target_ratios();
-    static clip_image_size              find_closest_aspect_ratio(
-        float                                aspect_ratio,
-        const std::vector<clip_image_size> & target_ratios,
-        int                                  width,
-        int                                  height);
+    int base_size; // global view
+    int tile_size; // each tile
+    int min_tiles;
+    int max_tiles;
+
+    std::vector<clip_image_size> get_target_ratios() const;
+    clip_image_size find_closest_aspect_ratio(
+            float aspect_ratio,
+            const std::vector<clip_image_size> & target_ratios,
+            int width, int height) const;
 };
 
 // custom image preprocessing for Step3VL
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 0b5caa6cb5c1..e5ab806156fe 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -120,6 +120,46 @@ static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_
     return CLIP_FLASH_ATTN_TYPE_AUTO;
 }
 
+// DeepSeek-OCR (v1 and v2) encode of a preprocessed batch laid out as [tiles..., global view]:
+// the tile grid (if any) is encoded as one batch, then the global view is encoded on its own into
+// `out` right after the tiles. Returns false if the tile encode fails, else the global encode's result.
+//
+// v1 weaves newlines onto the grid in-graph;
+// v2 just concatenates the per-tile query tokens.
+static bool encode_deepseekocr(clip_ctx * ctx_clip,
+                               int n_threads,
+                               const clip_image_f32_batch & batch, // entries must be non-empty: back() is read below
+                               float * out) {
+    const auto & entries       = batch.entries;
+    const int    n_tiles       = static_cast<int>(entries.size()) - 1; // global view is last
+    const int    n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
+
+    if (n_tiles > 0) {
+        GGML_ASSERT(n_tiles == batch.grid_x * batch.grid_y);
+        const size_t tiles_sz = static_cast<size_t>( // number of floats the tile batch takes up in `out`
+            clip_n_output_tokens(ctx_clip, entries[0].get(), batch.grid_x, batch.grid_y)) * n_mmproj_embd;
+        clip_image_f32_batch tile_batch; // aliases the tile pointers of `batch` (presumably the owner) -- see release() below
+        tile_batch.grid_x = batch.grid_x;
+        tile_batch.grid_y = batch.grid_y;
+        tile_batch.entries.reserve(n_tiles);
+        for (int i = 0; i < n_tiles; i++) {
+            tile_batch.entries.emplace_back(entries[i].get());
+        }
+
+        const bool ok = clip_image_batch_encode(ctx_clip, n_threads, &tile_batch, out);
+
+        for (auto & tile : tile_batch.entries) {
+            (void) tile.release(); // detach the aliased pointer before any early return so tile_batch never frees it
+        }
+        if (!ok) {
+            return false;
+        }
+        out += tiles_sz; // the global view's embeddings are written right after the tiles'
+    }
+
+    return clip_image_encode(ctx_clip, n_threads, entries.back().get(), out);
+}
+
 mtmd_context_params mtmd_context_params_default() {
     mtmd_context_params params {
         /* use_gpu           */ true,
@@ -490,14 +530,10 @@ struct mtmd_context {
                     image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
                 } break;
             case PROJECTOR_TYPE_DEEPSEEKOCR:
-                {
-                    img_end = "\n"; // prevent empty batch on llama-server
-                    image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
-                } break;
             case PROJECTOR_TYPE_DEEPSEEKOCR2:
                 {
                     img_end = "\n"; // prevent empty batch on llama-server
-                    image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr2>(ctx_v);
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
                 } break;
             case PROJECTOR_TYPE_HUNYUANVL:
                 {
@@ -873,8 +909,18 @@ struct mtmd_tokenizer {
 
             } else {
                 size_t n_tokens = 0;
-                for (const auto & entry : batch_f32.entries) {
-                    n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get());
+                if (ctx->proj_type_v() == PROJECTOR_TYPE_DEEPSEEKOCR
+                    || ctx->proj_type_v() == PROJECTOR_TYPE_DEEPSEEKOCR2) {
+                    // tiles run as one batched image grid; the global view is a separate single image
+                    if (batch_f32.entries.size() > 1) {
+                        n_tokens += clip_n_output_tokens(ctx->ctx_v, batch_f32.entries[0].get(),
+                                                         batch_f32.grid_x, batch_f32.grid_y);
+                    }
+                    n_tokens += clip_n_output_tokens(ctx->ctx_v, batch_f32.entries.back().get());
+                } else {
+                    for (const auto & entry : batch_f32.entries) {
+                        n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get());
+                    }
                 }
 
                 mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
@@ -1107,15 +1153,15 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
     ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
     bool ok = false;
 
-    if (clip_is_llava(ctx_clip)
+    if (proj_type == PROJECTOR_TYPE_DEEPSEEKOCR || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) {
+        ok = encode_deepseekocr(ctx_clip, ctx->n_threads, image_tokens->batch_f32, ctx->image_embd_v.data());
+    } else if (clip_is_llava(ctx_clip)
         || proj_type == PROJECTOR_TYPE_MINICPMV
         || proj_type == PROJECTOR_TYPE_GLM_EDGE
-        || proj_type == PROJECTOR_TYPE_INTERNVL
-        || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) {
+        || proj_type == PROJECTOR_TYPE_INTERNVL) {
         // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
         const auto & entries = image_tokens->batch_f32.entries;
         // entries may have different token counts
-        // e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view
         size_t offset = 0;
         for (size_t i = 0; i < entries.size(); i++) {
             int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
diff --git a/tools/mtmd/tests/test-1-positive.png b/tools/mtmd/tests/test-1-positive.png
new file mode 100644
index 000000000000..007614594ef5
Binary files /dev/null and b/tools/mtmd/tests/test-1-positive.png differ
diff --git a/tools/mtmd/tests/test-deepseek-ocr.py b/tools/mtmd/tests/test-deepseek-ocr.py
index 5f5fef765a62..f64104535558 100644
--- a/tools/mtmd/tests/test-deepseek-ocr.py
+++ b/tools/mtmd/tests/test-deepseek-ocr.py
@@ -82,6 +82,24 @@ def chrf_min(self) -> float:
         # is one pixel off and lands at ~0.69 instead.
         hf_cer=0.7761, hf_chrf=28.70, cer_tol=0.12, chrf_tol=8.0,
     ),
+    TestCase(
+        model_key="v1", label="multi-tile (dynamic resolution)",
+        image="tools/mtmd/tests/test-1-positive.png",
+        ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
+        # 429x806 -- 806 > 640 triggers the v1 "Gundam" path: (1,2) grid ->
+        # 2 local 640 tiles + 1 global 1024 view. Regression guard for the
+        # tiling preprocessor -- a broken tile path craters the score.
+        # hf_cer/hf_chrf are HF v1's measured scores -- it reads this clean crop exactly.
+        hf_cer=0.0000, hf_chrf=100.00, cer_tol=0.03, chrf_tol=3.0,
+    ),
+    TestCase(
+        model_key="v2", label="multi-tile (dynamic resolution)",
+        image="tools/mtmd/tests/test-1-positive.png",
+        ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
+        # 429x806 -- 806 > 768 triggers the v2 path: (1,2) grid ->
+        # 2 local 768 tiles + 1 global 1024 view = 545 image tokens.
+        hf_cer=0.0236, hf_chrf=97.05, cer_tol=0.03, chrf_tol=3.0,
+    ),
 ]