Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions tools/mtmd/clip-graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ struct clip_graph {
float kq_scale; // TODO: maybe move this to hparams
const clip_flash_attn_type flash_attn_type;

// TODO [QWEN_VIDEO]: improve this in the future
int n_batch = 1;

ggml_context_ptr ctx0_ptr;
ggml_context * ctx0;
ggml_cgraph * gf;
Expand Down
19 changes: 11 additions & 8 deletions tools/mtmd/clip-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -480,10 +480,6 @@ struct clip_image_u8 {
buf[idx + 2] = rgb[2];
}

size_t n_pixels() const {
return (size_t) nx * (size_t) ny;
}

// Number of scalar values for this image: three per pixel.
size_t n_elements() const {
    constexpr size_t values_per_pixel = 3;
    return n_pixels() * values_per_pixel;
}
Expand All @@ -492,10 +488,16 @@ struct clip_image_u8 {
std::vector<uint8_t> buf;
int nx = 0;
int ny = 0;

// Pixel count (nx * ny). Each dimension is widened to size_t before the
// multiplication so the product is not computed in int.
size_t n_pixels() const {
    const size_t width  = static_cast<size_t>(nx);
    const size_t height = static_cast<size_t>(ny);
    return width * height;
}
};

// For images, buf.size() == nx*ny*3
// Memory layout: RGBRGBRGB...
// For seq, buf.size() == nx*ny*3*nt
// Memory layout: RGBRGB...RGBRGB... (nt times)
// For audio, only one channel is used, buf.size() == nx*ny
// nx will be n_frames and ny will be n_mel
struct clip_image_f32 {
Expand Down Expand Up @@ -544,10 +546,6 @@ struct clip_image_f32 {
}
}

size_t n_pixels() const {
return (size_t) nx_ * (size_t) ny_;
}

// Number of scalar values computed as three per pixel.
size_t n_elements() const {
    constexpr size_t values_per_pixel = 3;
    return n_pixels() * values_per_pixel;
}
Expand Down Expand Up @@ -580,6 +578,10 @@ struct clip_image_f32 {
std::vector<float> buf;
int nx_ = 0;
int ny_ = 0;

// Pixel count (nx_ * ny_). Each dimension is widened to size_t before the
// multiplication so the product is not computed in int.
size_t n_pixels() const {
    const size_t width  = static_cast<size_t>(nx_);
    const size_t height = static_cast<size_t>(ny_);
    return width * height;
}
};

//
Expand Down Expand Up @@ -627,6 +629,7 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, ..
va_end(args);
}

// Logging shorthands: each forwards its format string and variadic arguments
// to clip_log_internal() with a fixed log level.
// NOTE: LOG_TRC is emitted at GGML_LOG_LEVEL_DEBUG, i.e. the same level as LOG_DBG.
#define LOG_TRC(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LOG_DBG(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LOG_INF(...) clip_log_internal(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
#define LOG_WRN(...) clip_log_internal(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
Expand Down
43 changes: 30 additions & 13 deletions tools/mtmd/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,7 @@ ggml_tensor * clip_graph::build_inp() {
}

ggml_tensor * clip_graph::build_inp_raw(int channels) {
ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx(), img.ny(), channels);
ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, img.nx(), img.ny(), channels, n_batch);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
return inp_raw;
Expand Down Expand Up @@ -848,8 +848,6 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale
}

static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported");

const clip_image_f32 & img = *imgs.entries[0];
std::unique_ptr<clip_graph> builder;

Expand Down Expand Up @@ -1009,6 +1007,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
GGML_ABORT("missing cgraph builder");
}

// TODO [QWEN_VIDEO]: improve this in the future
builder->n_batch = imgs.entries.size();

return builder->build();
}

Expand Down Expand Up @@ -3479,12 +3480,15 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3

bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
const clip_image_f32_batch & imgs = *imgs_c_ptr;
int batch_size = imgs.entries.size();
int n_batch_cur = imgs.entries.size();

// maximum supported batch size, usually == 2 for qwen-vl-based models
int n_batch_max = clip_model_n_batch_max(ctx);

// TODO @ngxson : implement batch size > 1 as a loop
// we don't need true batching support because the cgraph is going to be big anyway
if (batch_size != 1) {
return false; // only support batch size of 1
if (n_batch_cur > n_batch_max) {
return false;
}

// if buffers are not allocated, we need to do a warmup run to allocate them
Expand Down Expand Up @@ -3555,18 +3559,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// └─────┘ │
// ──────┘ x B

for (size_t i = 0; i < imgs.entries.size(); i++) {
const int nx = imgs.entries[i]->nx();
const int ny = imgs.entries[i]->ny();
const int n = nx * ny;
// IMPORTANT: [QWEN_VIDEO] the batch dim is currently used for temporal dim in Qwen-VL models
// All entries must have the same spatial size (enforced by can_batch_with() during merging)
{
const int nx = imgs.entries[0]->nx();
const int ny = imgs.entries[0]->ny();
const int n = nx * ny;

for (int b = 0; b < batch_size; b++) {
for (int b = 0; b < n_batch_cur; b++) {
const auto & buf = imgs.entries[b]->get_ro_buf();
float * batch_entry = inp_raw.data() + b * (3*n);
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
size_t base_src = 3*(y * nx + x); // idx of the first channel
size_t base_dst = y * nx + x; // idx of the first channel
size_t base_src = 3*(y * nx + x);
size_t base_dst = y * nx + x;
batch_entry[ base_dst] = buf[base_src ];
batch_entry[1*n + base_dst] = buf[base_src + 1];
batch_entry[2*n + base_dst] = buf[base_src + 2];
Expand Down Expand Up @@ -4549,6 +4555,17 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
return ctx->model.modality == CLIP_MODALITY_AUDIO;
}

// Returns the maximum number of batch entries accepted for the model's
// projector type: 2 for the Qwen2-VL / Qwen2.5-VL / Qwen3-VL projectors,
// 1 for every other projector type.
int clip_model_n_batch_max(const struct clip_ctx * ctx) {
    const auto proj = ctx->proj_type();

    const bool is_qwen_vl =
        proj == PROJECTOR_TYPE_QWEN2VL  ||
        proj == PROJECTOR_TYPE_QWEN25VL ||
        proj == PROJECTOR_TYPE_QWEN3VL;

    return is_qwen_vl ? 2 : 1;
}

//
// API used internally with mtmd
//
Expand Down
8 changes: 8 additions & 0 deletions tools/mtmd/clip.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ struct clip_image_size {
bool operator==(const clip_image_size & other) const {
return width == other.width && height == other.height;
}
// Two sizes differ when either dimension differs (negation of operator==).
bool operator!=(const clip_image_size & other) const {
    return width != other.width || height != other.height;
}
// Product of width and height, returned as int.
int area() const {
    const auto pixel_count = width * height;
    return pixel_count;
}
};

struct clip_image_f32;
Expand Down Expand Up @@ -101,6 +107,8 @@ bool clip_is_llava(const struct clip_ctx * ctx);
bool clip_has_vision_encoder(const struct clip_ctx * ctx);
bool clip_has_audio_encoder(const struct clip_ctx * ctx);

int clip_model_n_batch_max(const struct clip_ctx * ctx);

std::map<ggml_backend_dev_t, size_t> clip_get_mem_usage(const struct clip_ctx * ctx);

struct clip_cap {
Expand Down
5 changes: 3 additions & 2 deletions tools/mtmd/models/models.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,11 @@ struct clip_graph_pixtral : clip_graph {
// Graph builder for the Qwen2-VL vision encoder.
struct clip_graph_qwen2vl : clip_graph {
// Forwards the context and the image to the clip_graph base constructor.
clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
// Builds the compute graph for this model (overrides the base-class build()).
ggml_cgraph * build() override;
// Builds the patch-embedding input tensor; handles both a single image
// (n_batch == 1) and a pair of frames (n_batch == 2).
ggml_tensor * build_inp_with_temporal_merge();
};

struct clip_graph_qwen3vl : clip_graph {
clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
struct clip_graph_qwen3vl : clip_graph_qwen2vl {
clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph_qwen2vl(ctx, img) {}
ggml_cgraph * build() override;
};

Expand Down
38 changes: 30 additions & 8 deletions tools/mtmd/models/qwen2vl.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,34 @@
#include "models.h"

// Builds the patch-embedding input: the sum of two 2D convolutions, one with
// model.patch_embeddings_0 and one with model.patch_embeddings_1.
//
//  - n_batch == 1 (still image): both convolutions read the same raw input.
//  - n_batch == 2 (two video frames): patch_embeddings_0 is applied to the
//    first frame and patch_embeddings_1 to the second frame.
//
// Any other n_batch value aborts via GGML_ASSERT. The image width and height
// must both be multiples of (patch_size * 2).
//
// Returns the summed convolution output tensor.
ggml_tensor * clip_graph_qwen2vl::build_inp_with_temporal_merge() {
    ggml_tensor * inp_raw = build_inp_raw();

    GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
    GGML_ASSERT(img.ny() % (patch_size * 2) == 0);

    // validate the batch size once, so that every path below returns a tensor
    // (the function never falls off its end without a return value)
    GGML_ASSERT((n_batch == 1 || n_batch == 2) && "n_batch > 2 is not supported");

    // still image input: both convolutions consume the full raw input
    ggml_tensor * inp_0 = inp_raw;
    ggml_tensor * inp_1 = inp_raw;

    if (n_batch == 2) {
        // 2 frames input (video input): split inp_raw into one view per frame
        const size_t nb1 = ggml_row_size(inp_raw->type, img.nx());             // bytes per row
        const size_t nb2 = ggml_row_size(inp_raw->type, img.nx() * img.ny()); // bytes per channel plane

        // NOTE(review): assumes build_inp_raw() produces 3 channels per frame,
        // matching the literal 3 used for the views below - confirm against its default argument
        const size_t frame_offset = nb2 * 3; // move to the second frame

        inp_0 = ggml_view_3d(ctx0, inp_raw,
                img.nx(), img.ny(), 3, nb1, nb2, 0);
        inp_1 = ggml_view_3d(ctx0, inp_raw,
                img.nx(), img.ny(), 3, nb1, nb2, frame_offset);
    }

    return ggml_add(ctx0,
        ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_0, patch_size, patch_size, 0, 0, 1, 1),
        ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_1, patch_size, patch_size, 0, 0, 1, 1));
}

ggml_cgraph * clip_graph_qwen2vl::build() {
GGML_ASSERT(model.patch_bias == nullptr);
GGML_ASSERT(model.class_embedding == nullptr);
Expand All @@ -16,17 +45,10 @@ ggml_cgraph * clip_graph_qwen2vl::build() {

int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);

GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
ggml_tensor * inp = build_inp_with_temporal_merge();

// second conv dimension
{
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);

inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_cont_4d(
ctx0, inp,
Expand Down
11 changes: 2 additions & 9 deletions tools/mtmd/models/qwen3vl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,10 @@ ggml_cgraph * clip_graph_qwen3vl::build() {

int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
ggml_tensor * inp = build_inp_with_temporal_merge();

GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
GGML_ASSERT(img.ny() % (patch_size * 2) == 0);

// second conv dimension
// spatial merge
{
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);

inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_cont_4d(
ctx0, inp,
Expand Down
2 changes: 1 addition & 1 deletion tools/mtmd/mtmd-image.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1116,7 +1116,7 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img,
static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ };
// TODO: support 512 (tiny) and 640 (small) once we have eval data for them

const int64_t orig_area = static_cast<int64_t>(img.n_pixels());
const int64_t orig_area = static_cast<int64_t>(img.get_size().area());

size_t mode_i = 0;
int64_t min_diff = std::numeric_limits<int64_t>::max();
Expand Down
Loading
Loading