diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h index 1d9f6a136a96..655d361b1ff2 100644 --- a/tools/mtmd/clip-graph.h +++ b/tools/mtmd/clip-graph.h @@ -62,6 +62,9 @@ struct clip_graph { // build vision transformer (ViT) cgraph // this function should cover most of the models // if your model has specific features, you should probably duplicate this function + // + // inp is 2D [n_embd, n_pos] or 3D [n_embd, n_pos, B] (batched multi-tile encode); + // returns the same rank with the batch dim preserved (B==1 -> trailing 1) ggml_tensor * build_vit( ggml_tensor * inp, int64_t n_pos, @@ -75,7 +78,7 @@ struct clip_graph { // returns tensor with shape [n_embd, n_patches] ggml_tensor * build_inp(); - ggml_tensor * build_inp_raw(int channels = 3); + ggml_tensor * build_inp_raw(int channels = 3, int batch = 1); ggml_tensor * build_norm( ggml_tensor * cur, diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index c055cfb75419..cb517812f189 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -523,8 +523,9 @@ struct clip_image_f32_batch { // for llava-uhd style models, we need to know the grid size // note: entries.size() == grid_x * grid_y + 1 (one overview image) - int grid_x = 0; - int grid_y = 0; + // 1x1 = no tiling; llava-uhd preprocessors always overwrite grid before has_tiling_grid reads it + int grid_x = 1; + int grid_y = 1; clip_image_f32_batch clone() const { clip_image_f32_batch new_batch{ diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index 238f805a9aae..848ad4da7801 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -69,6 +69,7 @@ struct clip_hparams { std::vector image_res_candidates; int32_t preproc_min_tiles = 0; int32_t preproc_max_tiles = 0; + int32_t preproc_tile_size = 0; resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC; resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR; pad_style image_pad_rf = PAD_CEIL; // padding style for the refined image (e.g. 
llava-1.6) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 80136ed86672..5d7ed82b6e8f 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -310,11 +310,17 @@ ggml_tensor * clip_graph::build_vit( std::function add_pos, const build_vit_opts & opts ) { + // batch dim: inp is [n_embd, n_pos] (B==1) or [n_embd, n_pos, B] (multi-tile encode) + const int64_t B = inp->ne[2]; + if (learned_pos_embd) { inp = ggml_add(ctx0, inp, learned_pos_embd); cb(inp, "pos_embed", -1); } + // flatten batch; unflatten again in attention + inp = ggml_reshape_2d(ctx0, inp, n_embd, n_pos * B); + ggml_tensor * inpL = inp; // pre-layernorm @@ -344,20 +350,24 @@ ggml_tensor * clip_graph::build_vit( cur = ggml_add(ctx0, cur, layer.qkv_b); } - Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ 0); - - Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ ggml_row_size(cur->type, n_embd)); - - Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ ggml_row_size(cur->type, 2 * n_embd)); + // Q/K/V as [d_head, n_head, n_pos, B], the batch stride is cur->nb[1]*n_pos. 
+ Qcur = ggml_view_4d(ctx0, cur, d_head, n_head, n_pos, B, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* nb3 */ cur->nb[1] * n_pos, + /* offset */ 0); + + Kcur = ggml_view_4d(ctx0, cur, d_head, n_head, n_pos, B, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* nb3 */ cur->nb[1] * n_pos, + /* offset */ ggml_row_size(cur->type, n_embd)); + + Vcur = ggml_view_4d(ctx0, cur, d_head, n_head, n_pos, B, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* nb3 */ cur->nb[1] * n_pos, + /* offset */ ggml_row_size(cur->type, 2 * n_embd)); if (layer.q_norm) { GGML_ASSERT(layer.q_norm->ne[0] == Qcur->ne[0]); @@ -402,9 +412,9 @@ ggml_tensor * clip_graph::build_vit( } } - Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); - Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head_kv, n_pos); - Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head_kv, n_pos); + Qcur = ggml_reshape_4d(ctx0, Qcur, d_head, n_head, n_pos, B); + Kcur = ggml_reshape_4d(ctx0, Kcur, d_head, n_head_kv, n_pos, B); + Vcur = ggml_reshape_4d(ctx0, Vcur, d_head, n_head_kv, n_pos, B); if (norm_per_head) { if (layer.q_norm) { @@ -434,6 +444,7 @@ ggml_tensor * clip_graph::build_vit( cb(Vcur, "Vcur_normed", il); } + // build_attn returns a flat 2D [n_embd, n_pos*B] cur = build_attn(layer.o_w, layer.o_b, Qcur, Kcur, Vcur, opts.attn_mask, kq_scale, il); cb(cur, "attn_out", il); @@ -505,6 +516,10 @@ ggml_tensor * clip_graph::build_vit( if (model.post_ln_w) { inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1); } + + // restore the batch dim + GGML_ASSERT(inpL->ne[1] % B == 0); + inpL = ggml_reshape_3d(ctx0, inpL, n_embd, inpL->ne[1] / B, B); return inpL; } @@ -522,8 +537,10 @@ ggml_tensor * clip_graph::build_inp() { return inp; } -ggml_tensor * clip_graph::build_inp_raw(int channels) { - ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels); +ggml_tensor * clip_graph::build_inp_raw(int 
channels, int batch) { + ggml_tensor * inp_raw = batch > 1 + ? ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels, batch) + : ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels); ggml_set_name(inp_raw, "inp_raw"); ggml_set_input(inp_raw); return inp_raw; @@ -844,7 +861,9 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale } static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) { - GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported"); + const bool is_dsocr_tiles = (ctx->proj_type() == PROJECTOR_TYPE_DEEPSEEKOCR + || ctx->proj_type() == PROJECTOR_TYPE_DEEPSEEKOCR2) && imgs.entries.size() > 1; + GGML_ASSERT((imgs.entries.size() == 1 || is_dsocr_tiles) && "n_batch > 1 is not supported"); const clip_image_f32 & img = *imgs.entries[0]; std::unique_ptr builder; @@ -959,11 +978,13 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } break; case PROJECTOR_TYPE_DEEPSEEKOCR: { - builder = std::make_unique(ctx, img); + // same builder for single image (grid 1x1) and batched tiles (grid_x*grid_y of them) + builder = std::make_unique(ctx, img, imgs.grid_x, imgs.grid_y); } break; case PROJECTOR_TYPE_DEEPSEEKOCR2: { - builder = std::make_unique(ctx, img); + // same builder for single image (grid 1x1) and batched tiles (grid_x*grid_y of them); + builder = std::make_unique(ctx, img, imgs.grid_x, imgs.grid_y); } break; case PROJECTOR_TYPE_LFM2A: { @@ -1540,7 +1561,7 @@ struct clip_model_loader { { hparams.patch_size = 16; hparams.image_size = 1024; - hparams.warmup_image_size = 1024; + hparams.warmup_image_size = hparams.image_size; // global view is fixed at image_size hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW; hparams.image_pad_color = {127, 127, 127}; @@ -1548,7 +1569,17 @@ struct clip_model_loader { get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true); get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true); 
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true); + // dynamic-resolution tiling config + hparams.preproc_min_tiles = 2; + if (model.proj_type == PROJECTOR_TYPE_DEEPSEEKOCR) { + hparams.preproc_max_tiles = 9; + hparams.preproc_tile_size = 640; + // the CLIP/ViT body runs at 1e-5 + hparams.eps = 1e-5f; + } if (model.proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) { + hparams.preproc_max_tiles = 6; + hparams.preproc_tile_size = 768; // qwen2 encoder is GQA, requires KEY_N_HEAD_KV get_u32(string_format(KEY_N_HEAD_KV, "vision"), hparams.n_head_kv); } @@ -2724,6 +2755,38 @@ struct clip_model_loader { std::vector ops; }; + // reserve for the worst-case DeepSeek-OCR (v1+v2) tile batch + static void reserve_dsocr_max_tiles(clip_ctx & ctx_clip) { + const auto proj = ctx_clip.proj_type(); + if (proj != PROJECTOR_TYPE_DEEPSEEKOCR && proj != PROJECTOR_TYPE_DEEPSEEKOCR2) { + return; + } + const auto & hparams = ctx_clip.model.hparams; + const int max_tiles = hparams.preproc_max_tiles; + const int tile_size = hparams.preproc_tile_size; + if (max_tiles <= 1 || tile_size <= 0) { + return; + } + + // v1 weaves a newline per grid row + const int grid_x = 1; + const int grid_y = max_tiles; + + clip_image_f32_batch tiles; + for (int i = 0; i < max_tiles; i++) { + clip_image_f32_ptr tile(clip_image_f32_init()); + tile->nx = tile_size; + tile->ny = tile_size; + tiles.entries.push_back(std::move(tile)); + } + tiles.grid_x = grid_x; + tiles.grid_y = grid_y; + + LOG_INF("%s: reserving worst-case tile batch: %d tiles (%dx%d grid) @ %dx%d\n", + __func__, max_tiles, grid_x, grid_y, tile_size, tile_size); + reserve_compute_meta(ctx_clip, tiles); + } + static void warmup(clip_ctx & ctx_clip) { // create a fake batch const auto & hparams = ctx_clip.model.hparams; @@ -2740,6 +2803,9 @@ struct clip_model_loader { } batch.entries.push_back(std::move(img)); warmup(ctx_clip, batch); + + // DeepSeek-OCR v1+v2: warmup's (worst-case) max tiles batch + global view; + 
reserve_dsocr_max_tiles(ctx_clip); } static void warmup(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) { @@ -3171,7 +3237,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * return 1; } -int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) { +int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img, int grid_x, int grid_y) { const auto & params = ctx->model.hparams; // for models with fixed size image, the input image is already pre-processed and resized to square @@ -3346,17 +3412,25 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im n_patches += 2; // for BOI and EOI token embeddings } break; case PROJECTOR_TYPE_DEEPSEEKOCR: - { - // SAM encoder applies two stride-2 convolutions (net_2 and net_3) - // that reduce spatial dimensions by 4x in each direction (16x total) - // E.g., 64x64 -> 16x16 patches - n_patches /= 16; - - // build_global_local_features adds image newlines and view separator - // Formula: h*(w+1) + 1 where h = w = sqrt(n_patches) - int h = static_cast(std::sqrt(static_cast(n_patches))); - n_patches = h * (h + 1) + 1; - } break; + { + // SAM encoder applies two stride-2 convolutions (net_2 and net_3) + // that reduce spatial dimensions by 4x in each direction (16x total) + // E.g., 64x64 -> 16x16 patches + n_patches /= 16; + + // global view (add_viewsep) is encoded single-image, never with a tile grid + GGML_ASSERT(!(img->add_viewsep && (grid_x > 1 || grid_y > 1))); + + const int h = static_cast(std::sqrt(static_cast(n_patches))); + if (grid_x > 1 || grid_y > 1) { + // tiles: the batched graph lays them out on the grid_x x grid_y grid + // and weaves one newline per row, emitting a single combined output + n_patches = (h * grid_x + 1) * (h * grid_y); + } else { + // single view: the graph always weaves one newline per row; only the global view (add_viewsep) appends the trailing view separator + n_patches = h * (h + 1) + (img->add_viewsep ? 1 : 0); + } + } break; case 
PROJECTOR_TYPE_HUNYUANVL: { int merge = ctx->model.hparams.n_merge; @@ -3365,14 +3439,20 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im n_patches = (ow + 1) * oh + 2; } break; case PROJECTOR_TYPE_DEEPSEEKOCR2: - { - // 1024 global view -> 256 query tokens + 1 view separator = 257; - // 768 local tile -> 144 query tokens, no separator. - n_patches /= 16; - if (img->add_viewsep) { - n_patches += 1; // view separator, appended only after the global view - } - } break; + { + // 1024 global view -> 256 query tokens + 1 view separator = 257; + // 768 local tile -> 144 query tokens, no separator. + n_patches /= 16; + + // global view (add_viewsep) is encoded single-image, never with a tile grid + GGML_ASSERT(!(img->add_viewsep && (grid_x > 1 || grid_y > 1))); + if (img->add_viewsep) { + n_patches += 1; // view separator, appended only after the global view + } else if (grid_x > 1 || grid_y > 1) { + // tiles concatenate their per-tile query tokens (grid_x*grid_y of them); no in-graph weave + n_patches = n_patches * grid_x * grid_y; + } + } break; case PROJECTOR_TYPE_LFM2A: { n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2; @@ -3417,9 +3497,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const clip_image_f32_batch & imgs = *imgs_c_ptr; int batch_size = imgs.entries.size(); + // DSOCR (v1 and v2) encode their tiles in one batched graph; every other path is single-image + const bool is_dsocr_tiles = (ctx->proj_type() == PROJECTOR_TYPE_DEEPSEEKOCR + || ctx->proj_type() == PROJECTOR_TYPE_DEEPSEEKOCR2) && batch_size > 1; + // TODO @ngxson : implement batch size > 1 as a loop // we don't need true batching support because the cgraph will gonna be big anyway - if (batch_size != 1) { + if (batch_size != 1 && !is_dsocr_tiles) { return false; // only support batch size of 1 } @@ -3491,23 +3575,23 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // └─────┘ │ // ──────┘ x 
B - for (size_t i = 0; i < imgs.entries.size(); i++) { - const int nx = imgs.entries[i]->nx; - const int ny = imgs.entries[i]->ny; - const int n = nx * ny; - - for (int b = 0; b < batch_size; b++) { - float * batch_entry = inp_raw.data() + b * (3*n); - for (int y = 0; y < ny; y++) { - for (int x = 0; x < nx; x++) { - size_t base_src = 3*(y * nx + x); // idx of the first channel - size_t base_dst = y * nx + x; // idx of the first channel - batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ]; - batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1]; - batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2]; - } + size_t off = 0; // running offset into inp_raw; each entry is a contiguous [W, H, 3] block + for (size_t b = 0; b < imgs.entries.size(); b++) { + const int nx = imgs.entries[b]->nx; + const int ny = imgs.entries[b]->ny; + const int n = nx * ny; + + float * batch_entry = inp_raw.data() + off; + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { + size_t base_src = 3*(y * nx + x); // idx of the first channel + size_t base_dst = y * nx + x; // idx of the first channel + batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ]; + batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1]; + batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2]; } } + off += 3 * n; } set_input_f32("inp_raw", inp_raw); @@ -4252,9 +4336,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // the last node is the embedding tensor ggml_tensor * embeddings = ggml_graph_node(gf, -1); - // sanity check (only support batch size of 1 for now) + // sanity check const int n_tokens_out = embeddings->ne[1]; - const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get()); + const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get(), imgs.grid_x, imgs.grid_y); if (n_tokens_out != expected_n_tokens_out) { LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, 
expected_n_tokens_out, n_tokens_out); GGML_ABORT("Invalid number of output tokens"); diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 9b807ffa77b3..f83cd24e777c 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -64,7 +64,8 @@ int32_t clip_get_hidden_size(const struct clip_ctx * ctx); // TODO: should be enum, not string const char * clip_patch_merge_type(const struct clip_ctx * ctx); -int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img); +// grid_x/grid_y of the tile grid; leave at 1 for a single image +int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img, int grid_x = 1, int grid_y = 1); // for M-RoPE, this will be the number of token positions in X and Y directions // for other models, X will be the total number of tokens and Y will be 1 diff --git a/tools/mtmd/models/deepseekocr.cpp b/tools/mtmd/models/deepseekocr.cpp index c3c22d0a4bac..22badf70298d 100644 --- a/tools/mtmd/models/deepseekocr.cpp +++ b/tools/mtmd/models/deepseekocr.cpp @@ -96,6 +96,8 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) { const int n_heads = hparams.sam_n_head; const int d_heads = n_embd / n_heads; const int window = hparams.attn_window_size; + // SAM stage runs its layernorms at 1e-6 + const float sam_eps = 1e-6f; ggml_tensor * inpL; @@ -134,7 +136,7 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) { ggml_tensor * shortcut = cur; // layernorm1 - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, sam_eps, il); const int64_t w0 = cur->ne[1]; const int64_t h0 = cur->ne[2]; @@ -214,7 +216,7 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) { ggml_tensor * inpFF = cur; // layernorm2 - cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, sam_eps, il); 
// ffn cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, @@ -229,12 +231,12 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) { cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1); cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); - cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1); + cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, sam_eps, -1); cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1); cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); - cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1); + cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, sam_eps, -1); cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1); @@ -246,23 +248,23 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) { } ggml_cgraph * clip_graph_deepseekocr::build() { - // patch embedding - ggml_tensor * inp_raw = build_inp_raw(); + const int64_t B = grid_x * grid_y; + + ggml_tensor * inp_raw = build_inp_raw(3, grid_x * grid_y); + + //sam_out: [16, 16, n_embd_clip, B] ggml_tensor * sam_out = build_sam(inp_raw); const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1]; ggml_tensor * clip_out; - // Building DS-OCR CLIP { - ggml_tensor * inp; - - inp = ggml_reshape_2d(ctx0, sam_out, clip_n_patches, sam_out->ne[2]); + // [16, 16, n_embd, B] -> [n_patches, n_embd, B] + ggml_tensor * inp = ggml_reshape_3d(ctx0, sam_out, clip_n_patches, sam_out->ne[2], B); inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); ggml_tensor * new_pos_embd = model.position_embeddings; - - int n_pos = new_pos_embd->ne[1]; // +1 for [CLS] + int n_pos = new_pos_embd->ne[1]; // +1 for [CLS] const auto tgt_size = 
static_cast(std::sqrt(inp->ne[1])); const auto src_size = static_cast(std::sqrt(n_pos - 1)); @@ -281,41 +283,52 @@ ggml_cgraph * clip_graph_deepseekocr::build() { n_pos = tgt_size * tgt_size + 1; } - // add CLS token - inp = ggml_concat(ctx0, model.class_embedding, inp, 1); + // CLS token, broadcast across the batch + ggml_tensor * cls_b = ggml_repeat_4d(ctx0, model.class_embedding, n_embd, 1, B, 1); + inp = ggml_concat(ctx0, cls_b, inp, 1); // [n_embd, n_pos, B] // for selecting learned pos embd, used by ViT ggml_tensor * positions = ggml_cast(ctx0, ggml_arange(ctx0, 0, n_pos, 1), GGML_TYPE_I32); ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, new_pos_embd, positions); - ggml_tensor * cur = build_vit(inp, n_pos, NORM_TYPE_NORMAL, FFN_GELU_QUICK, learned_pos_embd, nullptr); + // one builder for all B (B==1 is just the single global view) + clip_out = build_vit(inp, n_pos, NORM_TYPE_NORMAL, FFN_GELU_QUICK, learned_pos_embd, nullptr); // [n_embd, n_pos, B] - ggml_build_forward_expand(gf, cur); - clip_out = cur; + ggml_build_forward_expand(gf, clip_out); } sam_out = ggml_cont(ctx0, ggml_permute(ctx0, sam_out, 1, 2, 0, 3)); - sam_out = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0], clip_n_patches); - clip_out = ggml_view_2d(ctx0, clip_out, n_embd, clip_n_patches, clip_out->nb[1], clip_out->nb[1]); + sam_out = ggml_reshape_3d(ctx0, sam_out, sam_out->ne[0], clip_n_patches, B); + clip_out = ggml_view_3d(ctx0, clip_out, n_embd, clip_n_patches, B, + clip_out->nb[1], clip_out->nb[2], clip_out->nb[1]); ggml_tensor * cur; cur = ggml_concat(ctx0, clip_out, sam_out, 0); cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur); cur = ggml_add(ctx0, cur, model.mm_fc_b); - const auto h = static_cast(std::sqrt(static_cast(cur->ne[1]))); - const auto w = h; - const auto n_dim = cur->ne[0]; + const int tile_tokens = cur->ne[1]; + const int tile_w = static_cast(std::sqrt(static_cast(tile_tokens))); // tiles are square + const int gw = tile_w * grid_x; + const int gh = tile_w * grid_y; + const 
int n_dim = cur->ne[0]; - ggml_tensor * imgnl; + cur = ggml_reshape_4d(ctx0, cur, n_dim * tile_w, tile_w, grid_x, grid_y); // [n_dim*tile_w, tile_w, grid_x, grid_y] + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3)); - imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1); - cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h); - cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h); - cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, h*(w+1) + 1) + ggml_tensor * nl; - cb(cur, "dsocr_output", -1); + // weave newline at the end of every grid row + nl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, gh, 1); + cur = ggml_reshape_3d(ctx0, cur, n_dim, gw, gh); //[n_dim, gw, gh] + cur = ggml_concat(ctx0, cur, nl, 1); + cur = ggml_reshape_2d(ctx0, cur, n_dim, (gw + 1) * gh); + + if (img.add_viewsep) { + cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, (gw+1)*gh + 1) + } + cb(cur, "dsocr_output", -1); ggml_build_forward_expand(gf, cur); return gf; } diff --git a/tools/mtmd/models/deepseekocr2.cpp b/tools/mtmd/models/deepseekocr2.cpp index 056bb81807f3..8fe47d580401 100644 --- a/tools/mtmd/models/deepseekocr2.cpp +++ b/tools/mtmd/models/deepseekocr2.cpp @@ -4,17 +4,17 @@ ggml_cgraph * clip_graph_deepseekocr2::build() { GGML_ASSERT(hparams.n_head_kv > 0); GGML_ASSERT(n_head % hparams.n_head_kv == 0); - // patch embedding - ggml_tensor * inp_raw = build_inp_raw(); + const int64_t B = grid_x * grid_y; + + ggml_tensor * inp_raw = build_inp_raw(3, grid_x * grid_y); ggml_tensor * sam_out = build_sam(inp_raw); ggml_tensor * qwen2_out; // Building Qwen2 encoder { - ggml_tensor * inp; - - inp = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0] * sam_out->ne[1], sam_out->ne[2]); // H*W, C + // [W, H, C, B] -> [H*W, C, B] -> [C, H*W, B] + ggml_tensor * inp = ggml_reshape_3d(ctx0, sam_out, sam_out->ne[0] * sam_out->ne[1], sam_out->ne[2], B); inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); auto 
num_image_tokens = inp->ne[1]; // H*W @@ -32,8 +32,10 @@ ggml_cgraph * clip_graph_deepseekocr2::build() { num_queries = 144; } - // (B, num_image_tokens + num_queries, C) - inp = ggml_concat(ctx0, inp, ggml_cast(ctx0, query_embed, inp->type), 1); + // query_embed [C, num_queries]; broadcast across the batch and append: + ggml_tensor * query_b = ggml_repeat_4d(ctx0, ggml_cast(ctx0, query_embed, inp->type), + inp->ne[0], num_queries, B, 1); + inp = ggml_concat(ctx0, inp, query_b, 1); auto seq_len = inp->ne[1]; @@ -54,11 +56,12 @@ ggml_cgraph * clip_graph_deepseekocr2::build() { // build_vit applies model.post_ln_w internally; do not re-apply ggml_tensor * cur = build_vit(inp, seq_len, NORM_TYPE_RMS, FFN_SILU, - /* learned_pos_embd */ nullptr, add_rope, vit_opts); + /* learned_pos_embd */ nullptr, add_rope, vit_opts); // [C, seq_len, B] + // only keep the query tokens; [C, num_queries, B] cur = ggml_cont(ctx0, - ggml_view_2d(ctx0, cur, cur->ne[0], num_queries, cur->nb[1], - cur->nb[1] * (cur->ne[1] - num_queries))); // only take query tokens for output + ggml_view_3d(ctx0, cur, cur->ne[0], num_queries, B, + cur->nb[1], cur->nb[2], cur->nb[1] * (cur->ne[1] - num_queries))); ggml_build_forward_expand(gf, cur); qwen2_out = cur; @@ -66,14 +69,17 @@ ggml_cgraph * clip_graph_deepseekocr2::build() { ggml_tensor * cur; - cur = ggml_mul_mat(ctx0, model.mm_fc_w, qwen2_out); + cur = ggml_mul_mat(ctx0, model.mm_fc_w, qwen2_out); // [n_dim, num_queries, B] cur = ggml_add(ctx0, cur, model.mm_fc_b); // view_seperator only after the global view if (img.add_viewsep) { + GGML_ASSERT(B == 1); cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, 257) } + // flatten the batch + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); cb(cur, "dsocr2_output", -1); ggml_build_forward_expand(gf, cur); diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index b882f800dd77..c7db362f30b7 100644 --- a/tools/mtmd/models/models.h +++ 
b/tools/mtmd/models/models.h @@ -120,14 +120,24 @@ struct clip_graph_whisper_enc : clip_graph { ggml_cgraph * build() override; }; +// one graph for both the global view (grid 1x1) +// and multi-tile batch; batch dim is grid_x * grid_y struct clip_graph_deepseekocr : clip_graph { - clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + int grid_x; + int grid_y; + + clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img, + const int grid_x = 1, const int grid_y = 1) + : clip_graph(ctx, img), grid_x(grid_x), grid_y(grid_y) {} + ggml_cgraph * build() override; ggml_tensor * build_sam(ggml_tensor * inp); // build the SAM model }; struct clip_graph_deepseekocr2 : clip_graph_deepseekocr { - clip_graph_deepseekocr2(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph_deepseekocr(ctx, img) {} + clip_graph_deepseekocr2(clip_ctx * ctx, const clip_image_f32 & img, + const int grid_x = 1, const int grid_y = 1) + : clip_graph_deepseekocr(ctx, img, grid_x, grid_y) {} ggml_cgraph * build() override; // reuses build_sam() from base }; diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp index caf72d536214..371fa0357ab4 100644 --- a/tools/mtmd/mtmd-image.cpp +++ b/tools/mtmd/mtmd-image.cpp @@ -1104,46 +1104,7 @@ bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, cli // mtmd_image_preprocessor_deepseekocr // -bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { - static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ }; - // TODO: support 512 (tiny) and 640 (small) once we have eval data for them - - const int64_t orig_area = static_cast(img.nx) * img.ny; - - size_t mode_i = 0; - int64_t min_diff = std::numeric_limits::max(); - for (size_t i = 0; i < std::size(native_resolutions); i++) { - const int64_t r = native_resolutions[i]; - const int64_t diff = std::abs(orig_area - r * r); - if (diff < min_diff) { 
- mode_i = i; - min_diff = diff; - } - } - const int image_size = native_resolutions[mode_i]; - - // Aspect-preserving fit-and-pad. Pillow bicubic + PAD_NEAREST for - // byte-parity with the upstream deepseek-ai/DeepSeek-OCR HF preprocessor. - clip_image_u8 padded; - img_tool::resize(img, padded, {image_size, image_size}, RESIZE_ALGO_BICUBIC_PILLOW, - PAD_NEAREST, hparams.image_pad_color); - - clip_image_f32_ptr res(clip_image_f32_init()); - img_u8_to_f32(padded, *res, hparams.image_mean, hparams.image_std); - output.entries.push_back(std::move(res)); - - output.grid_x = 1; - output.grid_y = 1; - return true; -} - -// -// mtmd_image_preprocessor_deepseekocr2 -// - -// candidate tile grids (cols, rows) with min_tiles <= cols*rows <= max_tiles -// sorted by tile count -std::vector mtmd_image_preprocessor_deepseekocr2::get_target_ratios() { +std::vector mtmd_image_preprocessor_deepseekocr::get_target_ratios() const { std::vector ratios; for (int n = min_tiles; n <= max_tiles; n++) { for (int w = 1; w <= n; w++) { @@ -1159,7 +1120,7 @@ std::vector mtmd_image_preprocessor_deepseekocr2::get_target_ra } } if (!found) { - ratios.push_back({ w, h }); + ratios.push_back({w, h}); } } } @@ -1170,23 +1131,20 @@ std::vector mtmd_image_preprocessor_deepseekocr2::get_target_ra return ratios; } -// pick the grid whose aspect ratio is closest to the image -// on a tie, prefer the larger grid when the image fits -clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio( - float aspect_ratio, - const std::vector & target_ratios, - int width, - int height) { - float best_ratio_diff = std::numeric_limits::max(); - clip_image_size best_ratio = { 1, 1 }; - const float area = static_cast(width * height); +clip_image_size mtmd_image_preprocessor_deepseekocr::find_closest_aspect_ratio( + float aspect_ratio, + const std::vector & target_ratios, + int width, int height) const { + float best_ratio_diff = std::numeric_limits::max(); + clip_image_size best_ratio = {1, 1}; + 
const float area = static_cast(width * height); for (const auto & ratio : target_ratios) { const float target_aspect_ratio = static_cast(ratio.width) / ratio.height; - const float ratio_diff = std::abs(aspect_ratio - target_aspect_ratio); + const float ratio_diff = std::abs(aspect_ratio - target_aspect_ratio); if (ratio_diff < best_ratio_diff) { best_ratio_diff = ratio_diff; - best_ratio = ratio; + best_ratio = ratio; } else if (ratio_diff == best_ratio_diff) { const float target_area = static_cast(tile_size * tile_size * ratio.width * ratio.height); if (area > 0.5f * target_area) { @@ -1197,22 +1155,24 @@ clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio( return best_ratio; } -bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { - // emit 768x768 local tiles when the image is larger than a tile in either - // dimension, then always a 1024x1024 global view. order: [tiles..., global]. +bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { + // output order: [local tiles..., global] + int grid_w = 1; + int grid_h = 1; if (img.nx > tile_size || img.ny > tile_size) { - const float aspect_ratio = static_cast(img.nx) / img.ny; - const auto target_ratios = get_target_ratios(); - const clip_image_size grid = find_closest_aspect_ratio(aspect_ratio, target_ratios, img.nx, img.ny); + const float aspect_ratio = static_cast(img.nx) / img.ny; + const auto target_ratios = get_target_ratios(); + const clip_image_size grid = find_closest_aspect_ratio(aspect_ratio, target_ratios, img.nx, img.ny); + grid_w = grid.width; + grid_h = grid.height; - // stretch onto the grid (no aspect preserve), then crop tiles row-major. 
clip_image_u8 refined; - img_tool::resize(img, refined, { tile_size * grid.width, tile_size * grid.height }, + img_tool::resize(img, refined, {tile_size * grid_w, tile_size * grid_h}, RESIZE_ALGO_BICUBIC_PILLOW, PAD_NONE); - for (int row = 0; row < grid.height; row++) { - for (int col = 0; col < grid.width; col++) { + for (int row = 0; row < grid_h; row++) { + for (int col = 0; col < grid_w; col++) { clip_image_u8 tile; img_tool::crop(refined, tile, col * tile_size, row * tile_size, tile_size, tile_size); clip_image_f32_ptr res(clip_image_f32_init()); @@ -1222,20 +1182,19 @@ bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, } } - // global view: aspect-preserving fit-and-pad to base_size. + // global view: aspect-preserving fit-and-pad to base_size clip_image_u8 padded; - img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW, + img_tool::resize(img, padded, {base_size, base_size}, RESIZE_ALGO_BICUBIC_PILLOW, PAD_NEAREST, hparams.image_pad_color); clip_image_f32_ptr global(clip_image_f32_init()); img_u8_to_f32(padded, *global, hparams.image_mean, hparams.image_std); global->add_viewsep = true; output.entries.push_back(std::move(global)); - - output.grid_x = 1; - output.grid_y = 1; + output.grid_x = grid_w; + output.grid_y = grid_h; + LOG_DBG("%s: grid size: %d x %d (%d tiles) + global view\n", __func__, grid_w, grid_h, grid_w * grid_h); return true; } - // // mtmd_image_preprocessor_step3vl // diff --git a/tools/mtmd/mtmd-image.h b/tools/mtmd/mtmd-image.h index 91a5bc253ef8..71170089422c 100644 --- a/tools/mtmd/mtmd-image.h +++ b/tools/mtmd/mtmd-image.h @@ -139,29 +139,28 @@ struct mtmd_image_preprocessor_internvl : mtmd_image_preprocessor_llava_uhd { bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; }; +// DeepSeek-OCR (v1/v2) global view + optional local tile grid struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor { - 
mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} - bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; -}; - -// DeepSeek-OCR-2: a 1024x1024 global view, plus InternVL-style 768x768 local -// tiles when the image is larger than a tile in either dimension. -struct mtmd_image_preprocessor_deepseekocr2 : mtmd_image_preprocessor { - static constexpr int base_size = 1024; // global view - static constexpr int tile_size = 768; // local tile - static constexpr int min_tiles = 2; - static constexpr int max_tiles = 6; - - mtmd_image_preprocessor_deepseekocr2(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} + // config is loaded into clip_hparams (see load_hparams); global view side is image_size + mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx) + : mtmd_image_preprocessor(ctx), + base_size(hparams.image_size), + tile_size(hparams.preproc_tile_size), + min_tiles(hparams.preproc_min_tiles), + max_tiles(hparams.preproc_max_tiles) {} bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; private: - static std::vector get_target_ratios(); - static clip_image_size find_closest_aspect_ratio( - float aspect_ratio, - const std::vector & target_ratios, - int width, - int height); + int base_size; // global view + int tile_size; // each tile + int min_tiles; + int max_tiles; + + std::vector get_target_ratios() const; + clip_image_size find_closest_aspect_ratio( + float aspect_ratio, + const std::vector & target_ratios, + int width, int height) const; }; // custom image preprocessing for Step3VL diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 0b5caa6cb5c1..e5ab806156fe 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -120,6 +120,46 @@ static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_ return CLIP_FLASH_ATTN_TYPE_AUTO; } +// DeepSeek-OCR multi-tile batched encode: +// tile-grid is encoded as one batch, +// then 
the global view is encoded and appended. +// +// v1 weaves newlines onto the grid in-graph; +// v2 just concatenates the per-tile query tokens. +static bool encode_deepseekocr(clip_ctx * ctx_clip, + int n_threads, + const clip_image_f32_batch & batch, + float * out) { + const auto & entries = batch.entries; + const int n_tiles = static_cast(entries.size()) - 1; // global view is last + const int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip); + + if (n_tiles > 0) { + GGML_ASSERT(n_tiles == batch.grid_x * batch.grid_y); + const size_t tiles_sz = static_cast( + clip_n_output_tokens(ctx_clip, entries[0].get(), batch.grid_x, batch.grid_y)) * n_mmproj_embd; + clip_image_f32_batch tile_batch; + tile_batch.grid_x = batch.grid_x; + tile_batch.grid_y = batch.grid_y; + tile_batch.entries.reserve(n_tiles); + for (int i = 0; i < n_tiles; i++) { + tile_batch.entries.emplace_back(entries[i].get()); + } + + const bool ok = clip_image_batch_encode(ctx_clip, n_threads, &tile_batch, out); + + for (auto & tile : tile_batch.entries) { + (void) tile.release(); + } + if (!ok) { + return false; + } + out += tiles_sz; + } + + return clip_image_encode(ctx_clip, n_threads, entries.back().get(), out); +} + mtmd_context_params mtmd_context_params_default() { mtmd_context_params params { /* use_gpu */ true, @@ -490,14 +530,10 @@ struct mtmd_context { image_preproc = std::make_unique(ctx_v); } break; case PROJECTOR_TYPE_DEEPSEEKOCR: - { - img_end = "\n"; // prevent empty batch on llama-server - image_preproc = std::make_unique(ctx_v); - } break; case PROJECTOR_TYPE_DEEPSEEKOCR2: { img_end = "\n"; // prevent empty batch on llama-server - image_preproc = std::make_unique(ctx_v); + image_preproc = std::make_unique(ctx_v); } break; case PROJECTOR_TYPE_HUNYUANVL: { @@ -873,8 +909,18 @@ struct mtmd_tokenizer { } else { size_t n_tokens = 0; - for (const auto & entry : batch_f32.entries) { - n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get()); + if (ctx->proj_type_v() == PROJECTOR_TYPE_DEEPSEEKOCR 
+ || ctx->proj_type_v() == PROJECTOR_TYPE_DEEPSEEKOCR2) { + // tiles run as one batched image grid; the global view is separate single image + if (batch_f32.entries.size() > 1) { + n_tokens += clip_n_output_tokens(ctx->ctx_v, batch_f32.entries[0].get(), + batch_f32.grid_x, batch_f32.grid_y); + } + n_tokens += clip_n_output_tokens(ctx->ctx_v, batch_f32.entries.back().get()); + } else { + for (const auto & entry : batch_f32.entries) { + n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get()); + } } mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); @@ -1107,15 +1153,15 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd); bool ok = false; - if (clip_is_llava(ctx_clip) + if (proj_type == PROJECTOR_TYPE_DEEPSEEKOCR || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) { + ok = encode_deepseekocr(ctx_clip, ctx->n_threads, image_tokens->batch_f32, ctx->image_embd_v.data()); + } else if (clip_is_llava(ctx_clip) || proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE - || proj_type == PROJECTOR_TYPE_INTERNVL - || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) { + || proj_type == PROJECTOR_TYPE_INTERNVL) { // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode() const auto & entries = image_tokens->batch_f32.entries; // entries may have different token counts - // e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view size_t offset = 0; for (size_t i = 0; i < entries.size(); i++) { int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get()); diff --git a/tools/mtmd/tests/test-1-positive.png b/tools/mtmd/tests/test-1-positive.png new file mode 100644 index 000000000000..007614594ef5 Binary files /dev/null and b/tools/mtmd/tests/test-1-positive.png differ diff --git a/tools/mtmd/tests/test-deepseek-ocr.py b/tools/mtmd/tests/test-deepseek-ocr.py index 5f5fef765a62..f64104535558 
100644 --- a/tools/mtmd/tests/test-deepseek-ocr.py +++ b/tools/mtmd/tests/test-deepseek-ocr.py @@ -82,6 +82,24 @@ def chrf_min(self) -> float: # is one pixel off and lands at ~0.69 instead. hf_cer=0.7761, hf_chrf=28.70, cer_tol=0.12, chrf_tol=8.0, ), + TestCase( + model_key="v1", label="multi-tile (dynamic resolution)", + image="tools/mtmd/tests/test-1-positive.png", + ground_truth="tools/mtmd/tests/test-1-ground-truth.txt", + # 429x806 -- 806 > 640 triggers the v1 "Gundam" path: (1,2) grid -> + # 2 local 640 tiles + 1 global 1024 view. Regression guard for the + # tiling preprocessor -- a broken tile path craters the score. + # hf_cer/hf_chrf are HF v1's measured scores -- it reads this clean crop exactly. + hf_cer=0.0000, hf_chrf=100.00, cer_tol=0.03, chrf_tol=3.0, + ), + TestCase( + model_key="v2", label="multi-tile (dynamic resolution)", + image="tools/mtmd/tests/test-1-positive.png", + ground_truth="tools/mtmd/tests/test-1-ground-truth.txt", + # 429x806 -- 806 > 768 triggers the v2 path: (1,2) grid -> + # 2 local 768 tiles + 1 global 1024 view = 545 image tokens. + hf_cer=0.0236, hf_chrf=97.05, cer_tol=0.03, chrf_tol=3.0, + ), ]