From 5b30cd467b177d42b8642209da77c66626eeaacc Mon Sep 17 00:00:00 2001 From: roipony Date: Thu, 14 Aug 2025 14:32:01 +0300 Subject: [PATCH 1/6] Add files via upload --- .../models/granite_vision_embedding_models.py | 158 ++++++++++++++++++ mteb/models/overview.py | 14 +- 2 files changed, 162 insertions(+), 10 deletions(-) create mode 100644 mteb/models/granite_vision_embedding_models.py diff --git a/mteb/models/granite_vision_embedding_models.py b/mteb/models/granite_vision_embedding_models.py new file mode 100644 index 0000000000..f53be35be8 --- /dev/null +++ b/mteb/models/granite_vision_embedding_models.py @@ -0,0 +1,158 @@ +from __future__ import annotations + +import logging +from functools import partial +from typing import Any + +import torch +from PIL import Image +from torch.utils.data import DataLoader +from transformers import AutoProcessor, AutoModel +from transformers.utils.import_utils import is_flash_attn_2_available + + +from mteb.encoder_interface import PromptType +from mteb.model_meta import ModelMeta +from mteb.requires_package import ( + requires_image_dependencies, + requires_package, +) + +logger = logging.getLogger(__name__) + + +class GraniteVisionEmbeddingWrapper: + + def __init__( + self, + model_name: str, + revision: str | None = None, + device: str | None = None, + **kwargs, + ): + requires_image_dependencies() + + self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + self.model_name = model_name + + # Load model + self.mdl = AutoModel.from_pretrained( + model_name, revision=revision, device_map=self.device,trust_remote_code=True, **kwargs + ) + + self.mdl.eval() + + # Load processor + self.processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, revision=revision) + + def encode(self, sentences, **kwargs): + return self.get_text_embeddings(texts=sentences, **kwargs) + + def encode_input(self, inputs): + return self.mdl(**inputs) + + def get_image_embeddings( + self, + images, + batch_size: int = 16, + **kwargs, + ): + import torchvision.transforms.functional as F + + all_embeds = [] + + if isinstance(images, DataLoader): + iterator = images + else: + iterator = DataLoader(images, batch_size=batch_size) + + with torch.no_grad(): + for batch in iterator: + # batch may be list of tensors or PIL + imgs = [ + F.to_pil_image(b.to("cpu")) if not isinstance(b, Image.Image) else b + for b in batch + ] + inputs = self.processor.process_images(imgs) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + outs = self.encode_input(inputs) + all_embeds.extend(outs.cpu().to(torch.float32)) + + padded = torch.nn.utils.rnn.pad_sequence( + all_embeds, batch_first=True, padding_value=0 + ) + return padded + + def get_text_embeddings( + self, + texts, + batch_size: int = 32, + **kwargs, + ): + all_embeds = [] + with torch.no_grad(): + for i in range(0, len(texts), batch_size): + batch = texts[i : i + batch_size] + inputs = self.processor.process_queries(batch) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + outs = self.encode_input(inputs) + all_embeds.extend(outs.cpu().to(torch.float32)) + + padded = torch.nn.utils.rnn.pad_sequence( + all_embeds, batch_first=True, padding_value=0 + ) + return padded + + def get_fused_embeddings( + self, + texts: list[str] | None = None, + images: list[Image.Image] | DataLoader | None = None, + *, + task_name: str | None = None, + prompt_type: PromptType | None = None, + batch_size: int = 32, + fusion_mode="sum", + **kwargs: Any, + ): + raise NotImplementedError( + "Fused embeddings are not supported yet. Please use get_text_embeddings or get_image_embeddings." + ) + + def calculate_probs(self, text_embeddings, image_embeddings): + scores = self.similarity(text_embeddings, image_embeddings) + return (scores * 100).softmax(dim=-1) + + def similarity(self, a, b): + return self.processor.score_multi_vector(a, b) + + + + +granite_vision_embedding = ModelMeta( + loader=partial( + GraniteVisionEmbeddingWrapper, + model_name="ibm-granite/granite-vision-3.3-2b-embedding", + torch_dtype=torch.float16, + attn_implementation="flash_attention_2" + if is_flash_attn_2_available() + else None, + ), + name="ibm-granite/granite-vision-3.3-2b-embedding", + languages=["eng-Latn"], + revision="cee615db64d89d1552a4ee39c50f25c0fc5c66ca", + release_date="2025-06-11", + modalities=["image", "text"], + n_parameters=2_980_000_000, + memory_usage_mb=None, + max_tokens=128000, + embed_dim=128, + license="apache-2.0", + open_weights=True, + public_training_code="", + public_training_data="", + framework=["PyTorch"], + reference="https://huggingface.co/ibm-granite/granite-vision-3.3-2b-embedding", + similarity_fn_name="max_sim", + use_instructions=True, + training_datasets={}, +) diff --git a/mteb/models/overview.py b/mteb/models/overview.py index e2be5fc532..963801d9d4 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -41,22 +41,21 @@ geogpt_models, gme_v_models, google_models, + granite_vision_embedding_models, gritlm_models, gte_models, hinvec_models, + hit_tmg_models, ibm_granite_models, inf_models, jasper_models, jina_clip, jina_models, - kalm_models, lens_models, lgai_embedding_models, linq_models, - listconranker, llm2clip_models, llm2vec_models, - mcinext_models, misc_models, moco_models, model2vec_models, @@ -70,14 +69,12 @@ nvidia_models, openai_models, openclip_models, - opensearch_neural_sparse_models, ops_moa_models, piccolo_models, promptriever_models, qodo_models, qtack_models, qwen3_models, - qzhou_models, repllama_models, rerankers_custom, rerankers_monot5_based, @@ -128,19 +125,19 @@ e5_v, evaclip_models, google_models, + granite_vision_embedding_models, gritlm_models, gte_models, hinvec_models, + hit_tmg_models, ibm_granite_models, inf_models, jasper_models, jina_models, jina_clip, - kalm_models, lens_models, lgai_embedding_models, linq_models, - listconranker, llm2clip_models, llm2vec_models, misc_models, @@ -155,7 +152,6 @@ nvidia_llama_nemoretriever_colemb, openai_models, openclip_models, - opensearch_neural_sparse_models, ops_moa_models, piccolo_models, gme_v_models, @@ -163,7 +159,6 @@ qodo_models, qtack_models, qwen3_models, - qzhou_models, repllama_models, rerankers_custom, rerankers_monot5_based, @@ -194,7 +189,6 @@ colqwen_models, colsmol_models, geogpt_models, - mcinext_models, ] MODEL_REGISTRY = {} From 381751ba89d68ec68994ee5794155a5ece41a470 Mon Sep 17 00:00:00 2001 From: roipony Date: Fri, 15 Aug 2025 11:02:38 +0300 Subject: [PATCH 2/6] Address review comments --- mteb/models/overview.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mteb/models/overview.py b/mteb/models/overview.py index 963801d9d4..7b0f73e60c 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -45,17 +45,19 @@ gritlm_models, gte_models, hinvec_models, - hit_tmg_models, ibm_granite_models, inf_models, jasper_models, jina_clip, jina_models, + kalm_models, lens_models, lgai_embedding_models, linq_models, + listconranker, llm2clip_models, llm2vec_models, + mcinext_models, misc_models, moco_models, model2vec_models, @@ -69,12 +71,14 @@ nvidia_models, openai_models, openclip_models, + opensearch_neural_sparse_models, ops_moa_models, piccolo_models, promptriever_models, qodo_models, qtack_models, qwen3_models, + qzhou_models, repllama_models, rerankers_custom, rerankers_monot5_based, @@ -129,15 +133,16 @@ gritlm_models, gte_models, hinvec_models, - hit_tmg_models, ibm_granite_models, inf_models, jasper_models, jina_models, jina_clip, + kalm_models, lens_models, lgai_embedding_models, linq_models, + listconranker, llm2clip_models, llm2vec_models, misc_models, @@ -152,6 +157,7 @@ nvidia_llama_nemoretriever_colemb, openai_models, openclip_models, + opensearch_neural_sparse_models, ops_moa_models, piccolo_models, gme_v_models, @@ -159,6 +165,7 @@ qodo_models, qtack_models, qwen3_models, + qzhou_models, repllama_models, rerankers_custom, rerankers_monot5_based, @@ -189,6 +196,7 @@ colqwen_models, colsmol_models, geogpt_models, + mcinext_models, ] MODEL_REGISTRY = {} From c6b146545ca03e96e66328df92be1bfc2c54b95d Mon Sep 17 00:00:00 2001 From: roipony Date: Fri, 15 Aug 2025 11:17:37 +0300 Subject: [PATCH 3/6] Address review comments --- mteb/models/granite_vision_embedding_models.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mteb/models/granite_vision_embedding_models.py b/mteb/models/granite_vision_embedding_models.py index f53be35be8..c1bdafcf21 100644 --- a/mteb/models/granite_vision_embedding_models.py +++ b/mteb/models/granite_vision_embedding_models.py @@ -136,6 +136,7 @@ def similarity(self, a, b): attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None, + revision ="cee615db64d89d1552a4ee39c50f25c0fc5c66ca" ), name="ibm-granite/granite-vision-3.3-2b-embedding", languages=["eng-Latn"], @@ -143,16 +144,16 @@ def similarity(self, a, b): release_date="2025-06-11", modalities=["image", "text"], n_parameters=2_980_000_000, - memory_usage_mb=None, + memory_usage_mb=11351, max_tokens=128000, embed_dim=128, license="apache-2.0", open_weights=True, - public_training_code="", - public_training_data="", + public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/ibm-granite/granite-vision-3.3-2b-embedding", similarity_fn_name="max_sim", use_instructions=True, - training_datasets={}, + training_datasets=None, ) From 737e1212d8ec58d14e79d0271f3483092c265b27 Mon Sep 17 00:00:00 2001 From: roipony Date: Fri, 15 Aug 2025 21:08:13 +0300 Subject: [PATCH 4/6] ruff format --- mteb/models/granite_vision_embedding_models.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/mteb/models/granite_vision_embedding_models.py b/mteb/models/granite_vision_embedding_models.py index c1bdafcf21..1e15f7ec71 100644 --- a/mteb/models/granite_vision_embedding_models.py +++ b/mteb/models/granite_vision_embedding_models.py @@ -22,7 +22,6 @@ class GraniteVisionEmbeddingWrapper: - def __init__( self, model_name: str, @@ -34,16 +33,22 @@ def __init__( self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") self.model_name = model_name - + # Load model self.mdl = AutoModel.from_pretrained( - model_name, revision=revision, device_map=self.device,trust_remote_code=True, **kwargs + model_name, + revision=revision, + device_map=self.device, + trust_remote_code=True, + **kwargs, ) self.mdl.eval() # Load processor - self.processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, revision=revision) + self.processor = AutoProcessor.from_pretrained( + model_name, trust_remote_code=True, revision=revision + ) def encode(self, sentences, **kwargs): return self.get_text_embeddings(texts=sentences, **kwargs) @@ -126,8 +131,6 @@ def similarity(self, a, b): return self.processor.score_multi_vector(a, b) - - granite_vision_embedding = ModelMeta( loader=partial( GraniteVisionEmbeddingWrapper, @@ -136,7 +139,7 @@ def similarity(self, a, b): attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None, - revision ="cee615db64d89d1552a4ee39c50f25c0fc5c66ca" + revision="cee615db64d89d1552a4ee39c50f25c0fc5c66ca", ), name="ibm-granite/granite-vision-3.3-2b-embedding", languages=["eng-Latn"], From 5e9bc829dc915f80fbd8cd7cbe17d6beb65026fd Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 16 Aug 2025 15:02:50 +0200 Subject: [PATCH 5/6] Update mteb/models/granite_vision_embedding_models.py --- mteb/models/granite_vision_embedding_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/models/granite_vision_embedding_models.py b/mteb/models/granite_vision_embedding_models.py index 1e15f7ec71..17a0496919 100644 --- a/mteb/models/granite_vision_embedding_models.py +++ b/mteb/models/granite_vision_embedding_models.py @@ -158,5 +158,5 @@ def similarity(self, a, b): reference="https://huggingface.co/ibm-granite/granite-vision-3.3-2b-embedding", similarity_fn_name="max_sim", use_instructions=True, - training_datasets=None, + training_datasets=None, # proprietary, not public ) From 0053d3d2f29eeb4c3c38cfd6b23dfb7437946039 Mon Sep 17 00:00:00 2001 From: roipony Date: Sat, 16 Aug 2025 21:32:37 +0300 Subject: [PATCH 6/6] lint error fix --- mteb/models/granite_vision_embedding_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/models/granite_vision_embedding_models.py b/mteb/models/granite_vision_embedding_models.py index 17a0496919..23862f626e 100644 --- a/mteb/models/granite_vision_embedding_models.py +++ b/mteb/models/granite_vision_embedding_models.py @@ -158,5 +158,5 @@ def similarity(self, a, b): reference="https://huggingface.co/ibm-granite/granite-vision-3.3-2b-embedding", similarity_fn_name="max_sim", use_instructions=True, - training_datasets=None, # proprietary, not public + training_datasets=None, # proprietary, not public )