Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion tools/mtmd/clip-graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ struct clip_graph {
// build vision transformer (ViT) cgraph
// this function should cover most of the models
// if your model has specific features, you should probably duplicate this function
//
// inp is 2D [n_embd, n_pos] or 3D [n_embd, n_pos, B] (batched multi-tile encode);
// returns the same rank with the batch dim preserved (B==1 -> trailing 1)
ggml_tensor * build_vit(
ggml_tensor * inp,
int64_t n_pos,
Expand All @@ -75,7 +78,7 @@ struct clip_graph {
// returns tensor with shape [n_embd, n_patches]
ggml_tensor * build_inp();

ggml_tensor * build_inp_raw(int channels = 3);
ggml_tensor * build_inp_raw(int channels = 3, int batch = 1);

ggml_tensor * build_norm(
ggml_tensor * cur,
Expand Down
5 changes: 3 additions & 2 deletions tools/mtmd/clip-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -523,8 +523,9 @@ struct clip_image_f32_batch {

// for llava-uhd style models, we need to know the grid size
// note: entries.size() == grid_x * grid_y + 1 (one overview image)
int grid_x = 0;
int grid_y = 0;
// 1x1 = no tiling; llava-uhd preprocessors always overwrite grid before has_tiling_grid reads it
int grid_x = 1;
int grid_y = 1;

clip_image_f32_batch clone() const {
clip_image_f32_batch new_batch{
Expand Down
1 change: 1 addition & 0 deletions tools/mtmd/clip-model.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ struct clip_hparams {
std::vector<clip_image_size> image_res_candidates;
int32_t preproc_min_tiles = 0;
int32_t preproc_max_tiles = 0;
int32_t preproc_tile_size = 0;
resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
pad_style image_pad_rf = PAD_CEIL; // padding style for the refined image (e.g. llava-1.6)
Expand Down
206 changes: 145 additions & 61 deletions tools/mtmd/clip.cpp

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion tools/mtmd/clip.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
// TODO: should be enum, not string
const char * clip_patch_merge_type(const struct clip_ctx * ctx);

int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
// grid_x/grid_y of the tile grid; leave at 1 for a single image
int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img, int grid_x = 1, int grid_y = 1);

// for M-RoPE, this will be the number of token positions in X and Y directions
// for other models, X will be the total number of tokens and Y will be 1
Expand Down
69 changes: 41 additions & 28 deletions tools/mtmd/models/deepseekocr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
const int n_heads = hparams.sam_n_head;
const int d_heads = n_embd / n_heads;
const int window = hparams.attn_window_size;
// SAM stage runs its layernorms at 1e-6
const float sam_eps = 1e-6f;

ggml_tensor * inpL;

Expand Down Expand Up @@ -134,7 +136,7 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
ggml_tensor * shortcut = cur;

// layernorm1
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, sam_eps, il);

const int64_t w0 = cur->ne[1];
const int64_t h0 = cur->ne[2];
Expand Down Expand Up @@ -214,7 +216,7 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
ggml_tensor * inpFF = cur;

// layernorm2
cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, sam_eps, il);

// ffn
cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b,
Expand All @@ -229,12 +231,12 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {

cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1);
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1);
cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, sam_eps, -1);
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));

cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1);
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1);
cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, sam_eps, -1);
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));

cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1);
Expand All @@ -246,23 +248,23 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
}

ggml_cgraph * clip_graph_deepseekocr::build() {
// patch embedding
ggml_tensor * inp_raw = build_inp_raw();
const int64_t B = grid_x * grid_y;

ggml_tensor * inp_raw = build_inp_raw(3, grid_x * grid_y);

// sam_out: [16, 16, n_embd_clip, B]
ggml_tensor * sam_out = build_sam(inp_raw);

const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1];

ggml_tensor * clip_out;
// Building DS-OCR CLIP
{
ggml_tensor * inp;

inp = ggml_reshape_2d(ctx0, sam_out, clip_n_patches, sam_out->ne[2]);
// [16, 16, n_embd, B] -> [n_patches, n_embd, B]
ggml_tensor * inp = ggml_reshape_3d(ctx0, sam_out, clip_n_patches, sam_out->ne[2], B);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

ggml_tensor * new_pos_embd = model.position_embeddings;

int n_pos = new_pos_embd->ne[1]; // +1 for [CLS]
int n_pos = new_pos_embd->ne[1]; // +1 for [CLS]
const auto tgt_size = static_cast<int>(std::sqrt(inp->ne[1]));
const auto src_size = static_cast<int>(std::sqrt(n_pos - 1));

Expand All @@ -281,41 +283,52 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
n_pos = tgt_size * tgt_size + 1;
}

// add CLS token
inp = ggml_concat(ctx0, model.class_embedding, inp, 1);
// CLS token, broadcast across the batch
ggml_tensor * cls_b = ggml_repeat_4d(ctx0, model.class_embedding, n_embd, 1, B, 1);
inp = ggml_concat(ctx0, cls_b, inp, 1); // [n_embd, n_pos, B]

// for selecting learned pos embd, used by ViT
ggml_tensor * positions = ggml_cast(ctx0, ggml_arange(ctx0, 0, n_pos, 1), GGML_TYPE_I32);
ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, new_pos_embd, positions);

ggml_tensor * cur = build_vit(inp, n_pos, NORM_TYPE_NORMAL, FFN_GELU_QUICK, learned_pos_embd, nullptr);
// one builder for all B (B==1 is just the single global view)
clip_out = build_vit(inp, n_pos, NORM_TYPE_NORMAL, FFN_GELU_QUICK, learned_pos_embd, nullptr); // [n_embd, n_pos, B]

ggml_build_forward_expand(gf, cur);
clip_out = cur;
ggml_build_forward_expand(gf, clip_out);
}

sam_out = ggml_cont(ctx0, ggml_permute(ctx0, sam_out, 1, 2, 0, 3));
sam_out = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0], clip_n_patches);
clip_out = ggml_view_2d(ctx0, clip_out, n_embd, clip_n_patches, clip_out->nb[1], clip_out->nb[1]);
sam_out = ggml_reshape_3d(ctx0, sam_out, sam_out->ne[0], clip_n_patches, B);
clip_out = ggml_view_3d(ctx0, clip_out, n_embd, clip_n_patches, B,
clip_out->nb[1], clip_out->nb[2], clip_out->nb[1]);

ggml_tensor * cur;
cur = ggml_concat(ctx0, clip_out, sam_out, 0);
cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
cur = ggml_add(ctx0, cur, model.mm_fc_b);

const auto h = static_cast<int>(std::sqrt(static_cast<float>(cur->ne[1])));
const auto w = h;
const auto n_dim = cur->ne[0];
const int tile_tokens = cur->ne[1];
const int tile_w = static_cast<int>(std::sqrt(static_cast<float>(tile_tokens))); // tiles are square
const int gw = tile_w * grid_x;
const int gh = tile_w * grid_y;
const int n_dim = cur->ne[0];

ggml_tensor * imgnl;
cur = ggml_reshape_4d(ctx0, cur, n_dim * tile_w, tile_w, grid_x, grid_y); // [n_dim*tile_w, tile_w, grid_x, grid_y]
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));

imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1);
cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h);
cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h);
cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, h*(w+1) + 1)
ggml_tensor * nl;

cb(cur, "dsocr_output", -1);
// append one image_newline column at the end of each of the gh token rows: [n_dim, gw, gh] -> [n_dim, gw + 1, gh]
nl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, gh, 1);
cur = ggml_reshape_3d(ctx0, cur, n_dim, gw, gh); // [n_dim, gw, gh]
cur = ggml_concat(ctx0, cur, nl, 1);
cur = ggml_reshape_2d(ctx0, cur, n_dim, (gw + 1) * gh);

if (img.add_viewsep) {
cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, (gw+1)*gh + 1)
}

cb(cur, "dsocr_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
28 changes: 17 additions & 11 deletions tools/mtmd/models/deepseekocr2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,17 @@ ggml_cgraph * clip_graph_deepseekocr2::build() {
GGML_ASSERT(hparams.n_head_kv > 0);
GGML_ASSERT(n_head % hparams.n_head_kv == 0);

// patch embedding
ggml_tensor * inp_raw = build_inp_raw();
const int64_t B = grid_x * grid_y;

ggml_tensor * inp_raw = build_inp_raw(3, grid_x * grid_y);

ggml_tensor * sam_out = build_sam(inp_raw);

ggml_tensor * qwen2_out;
// Building Qwen2 encoder
{
ggml_tensor * inp;

inp = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0] * sam_out->ne[1], sam_out->ne[2]); // H*W, C
// [W, H, C, B] -> [H*W, C, B] -> [C, H*W, B]
ggml_tensor * inp = ggml_reshape_3d(ctx0, sam_out, sam_out->ne[0] * sam_out->ne[1], sam_out->ne[2], B);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

auto num_image_tokens = inp->ne[1]; // H*W
Expand All @@ -32,8 +32,10 @@ ggml_cgraph * clip_graph_deepseekocr2::build() {
num_queries = 144;
}

// (B, num_image_tokens + num_queries, C)
inp = ggml_concat(ctx0, inp, ggml_cast(ctx0, query_embed, inp->type), 1);
// query_embed [C, num_queries]; broadcast across the batch and append:
ggml_tensor * query_b = ggml_repeat_4d(ctx0, ggml_cast(ctx0, query_embed, inp->type),
inp->ne[0], num_queries, B, 1);
inp = ggml_concat(ctx0, inp, query_b, 1);

auto seq_len = inp->ne[1];

Expand All @@ -54,26 +56,30 @@ ggml_cgraph * clip_graph_deepseekocr2::build() {

// build_vit applies model.post_ln_w internally; do not re-apply
ggml_tensor * cur = build_vit(inp, seq_len, NORM_TYPE_RMS, FFN_SILU,
/* learned_pos_embd */ nullptr, add_rope, vit_opts);
/* learned_pos_embd */ nullptr, add_rope, vit_opts); // [C, seq_len, B]

// only keep the query tokens; [C, num_queries, B]
cur = ggml_cont(ctx0,
ggml_view_2d(ctx0, cur, cur->ne[0], num_queries, cur->nb[1],
cur->nb[1] * (cur->ne[1] - num_queries))); // only take query tokens for output
ggml_view_3d(ctx0, cur, cur->ne[0], num_queries, B,
cur->nb[1], cur->nb[2], cur->nb[1] * (cur->ne[1] - num_queries)));

ggml_build_forward_expand(gf, cur);
qwen2_out = cur;
}

ggml_tensor * cur;

cur = ggml_mul_mat(ctx0, model.mm_fc_w, qwen2_out);
cur = ggml_mul_mat(ctx0, model.mm_fc_w, qwen2_out); // [n_dim, num_queries, B]
cur = ggml_add(ctx0, cur, model.mm_fc_b);

// view_seperator only after the global view
if (img.add_viewsep) {
GGML_ASSERT(B == 1);
cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, num_queries + 1), e.g. 257 for 256 queries
}

// flatten the batch
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
cb(cur, "dsocr2_output", -1);

ggml_build_forward_expand(gf, cur);
Expand Down
14 changes: 12 additions & 2 deletions tools/mtmd/models/models.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,14 +120,24 @@ struct clip_graph_whisper_enc : clip_graph {
ggml_cgraph * build() override;
};

// one graph for both the global view (grid 1x1)
// and multi-tile batch; batch dim is grid_x * grid_y
struct clip_graph_deepseekocr : clip_graph {
clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
int grid_x;
int grid_y;

clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img,
const int grid_x = 1, const int grid_y = 1)
: clip_graph(ctx, img), grid_x(grid_x), grid_y(grid_y) {}

ggml_cgraph * build() override;
ggml_tensor * build_sam(ggml_tensor * inp); // build the SAM model
};

struct clip_graph_deepseekocr2 : clip_graph_deepseekocr {
clip_graph_deepseekocr2(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph_deepseekocr(ctx, img) {}
clip_graph_deepseekocr2(clip_ctx * ctx, const clip_image_f32 & img,
const int grid_x = 1, const int grid_y = 1)
: clip_graph_deepseekocr(ctx, img, grid_x, grid_y) {}
ggml_cgraph * build() override; // reuses build_sam() from base
};

Expand Down
Loading