2 files changed: +17 -3 lines changed
lines changed Original file line number Diff line number Diff line change 3030#define KEY_LAYER_NORM_EPS " clip.%s.attention.layer_norm_epsilon"
3131
3232// vision-specific
33+ #define KEY_VISION_PROJ_TYPE " clip.vision.projector_type" // for models with mixed modalities
3334#define KEY_IMAGE_SIZE " clip.vision.image_size"
3435#define KEY_PREPROC_IMAGE_SIZE " clip.vision.preproc_image_size"
3536#define KEY_PATCH_SIZE " clip.vision.patch_size"
4849#define KEY_MINICPMV_QUERY_NUM " clip.minicpmv_query_num"
4950
5051// audio-specific
52+ #define KEY_AUDIO_PROJ_TYPE " clip.audio.projector_type" // for models with mixed modalities
5153#define KEY_A_NUM_MEL_BINS " clip.audio.num_mel_bins"
5254#define KEY_A_PROJ_STACK_FACTOR " clip.audio.projector.stack_factor"
5355
@@ -2221,15 +2221,27 @@ struct clip_model_loader {
22212221 // projector type
22222222 std::string proj_type;
22232223 {
2224+ // default key
22242225 get_string (KEY_PROJ_TYPE, proj_type, false );
2225- if (!proj_type.empty ()) {
2226- model.proj_type = clip_projector_type_from_string (proj_type);
2226+
2227+ // for models with mixed modalities
2228+ if (proj_type.empty ()) {
2229+ if (modality == CLIP_MODALITY_VISION) {
2230+ get_string (KEY_VISION_PROJ_TYPE, proj_type, false );
2231+ } else if (modality == CLIP_MODALITY_AUDIO) {
2232+ get_string (KEY_AUDIO_PROJ_TYPE, proj_type, false );
2233+ } else {
2234+ GGML_ABORT (" unknown modality" );
2235+ }
22272236 }
2237+
2238+ model.proj_type = clip_projector_type_from_string (proj_type);
2239+
22282240 if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) {
22292241 throw std::runtime_error (string_format (" %s: unknown projector type: %s\n " , __func__, proj_type.c_str ()));
22302242 }
22312243
2232- // correct arch for multimodal models
2244+ // correct arch for multimodal models (legacy method)
22332245 if (model.proj_type == PROJECTOR_TYPE_QWEN25O) {
22342246 model.proj_type = modality == CLIP_MODALITY_VISION
22352247 ? PROJECTOR_TYPE_QWEN25VL
You can’t perform that action at this time.
0 commit comments