Skip to content

Commit 1bb4f43

Browse files
authored
mtmd : support home-cooked Mistral Small Omni (#14928)
1 parent 683fa6b commit 1bb4f43

File tree

2 files changed

+17
-3
lines changed

2 files changed

+17
-3
lines changed

tools/mtmd/clip-impl.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,7 @@
3030
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
3131

3232
// vision-specific
33+
#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities
3334
#define KEY_IMAGE_SIZE "clip.vision.image_size"
3435
#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
3536
#define KEY_PATCH_SIZE "clip.vision.patch_size"
@@ -48,6 +49,7 @@
4849
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
4950

5051
// audio-specific
52+
#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
5153
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
5254
#define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"
5355

tools/mtmd/clip.cpp

Lines changed: 15 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2221,15 +2221,27 @@ struct clip_model_loader {
22212221
// projector type
22222222
std::string proj_type;
22232223
{
2224+
// default key
22242225
get_string(KEY_PROJ_TYPE, proj_type, false);
2225-
if (!proj_type.empty()) {
2226-
model.proj_type = clip_projector_type_from_string(proj_type);
2226+
2227+
// for models with mixed modalities
2228+
if (proj_type.empty()) {
2229+
if (modality == CLIP_MODALITY_VISION) {
2230+
get_string(KEY_VISION_PROJ_TYPE, proj_type, false);
2231+
} else if (modality == CLIP_MODALITY_AUDIO) {
2232+
get_string(KEY_AUDIO_PROJ_TYPE, proj_type, false);
2233+
} else {
2234+
GGML_ABORT("unknown modality");
2235+
}
22272236
}
2237+
2238+
model.proj_type = clip_projector_type_from_string(proj_type);
2239+
22282240
if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) {
22292241
throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
22302242
}
22312243

2232-
// correct arch for multimodal models
2244+
// correct arch for multimodal models (legacy method)
22332245
if (model.proj_type == PROJECTOR_TYPE_QWEN25O) {
22342246
model.proj_type = modality == CLIP_MODALITY_VISION
22352247
? PROJECTOR_TYPE_QWEN25VL

0 commit comments

Comments
 (0)