Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions tools/mtmd/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ struct clip_hparams {
int32_t n_head;
int32_t n_layer;
// idefics3
int32_t preproc_image_size = 0;
int32_t preproc_image_size = 0; // aka max_dimension
int32_t proj_scale_factor = 0;

float image_mean[3];
Expand Down Expand Up @@ -3221,8 +3221,8 @@ struct image_manipulation {
return {0, 0};
}

float scale = std::min(1.0f, std::min(static_cast<float>(max_dimension) / inp_size.width,
static_cast<float>(max_dimension) / inp_size.height));
float scale = std::min(static_cast<float>(max_dimension) / inp_size.width,
static_cast<float>(max_dimension) / inp_size.height);

float target_width_f = static_cast<float>(inp_size.width) * scale;
float target_height_f = static_cast<float>(inp_size.height) * scale;
Expand Down Expand Up @@ -3385,7 +3385,7 @@ struct llava_uhd {

// resize to overview size
clip_image_u8_ptr resized_img(clip_image_u8_init());
image_manipulation::bicubic_resize(*img, *resized_img, inst.overview_size.width, inst.overview_size.height);
image_manipulation::resize_and_pad_image(*img, *resized_img, inst.overview_size);
Copy link
Collaborator Author

@ngxson ngxson Oct 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also cc @gabe-l-hart for visibility: before this change, slice_image resized the overview image without padding or preserving the aspect ratio. This should be fixed now.

output.push_back(std::move(resized_img));
if (inst.slices.empty()) {
// no slices, just return the resized image
Expand Down Expand Up @@ -3587,6 +3587,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
// CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
const clip_image_size refined_size = image_manipulation::calc_size_preserved_ratio(
original_size, params.image_size, params.preproc_image_size);
// LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n",
// __func__, original_size.width, original_size.height,
// refined_size.width, refined_size.height);

llava_uhd::slice_instructions instructions;
instructions.overview_size = clip_image_size{params.image_size, params.image_size};
Expand All @@ -3597,6 +3600,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
};
for (int y = 0; y < refined_size.height; y += params.image_size) {
for (int x = 0; x < refined_size.width; x += params.image_size) {
// LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y);
instructions.slices.push_back(llava_uhd::slice_coordinates{
/* x */x,
/* y */y,
Expand Down
5 changes: 4 additions & 1 deletion tools/mtmd/tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,10 @@ for i in "${!arr_hf[@]}"; do

echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log

if echo "$output" | grep -iq "new york"; then
# either contains "new york" or both "men" and "walk"
if echo "$output" | grep -iq "new york" \
|| (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
then
result="$prefix \033[32mOK\033[0m: $bin $hf"
else
result="$prefix \033[31mFAIL\033[0m: $bin $hf"
Expand Down
Loading