From 1e505570549a78740e53ae8337c7b2dfce555f7d Mon Sep 17 00:00:00 2001 From: Jamie-Stirling Date: Fri, 6 Sep 2024 16:20:57 +0100 Subject: [PATCH 01/17] wip: start adding BLIP models --- mteb/models/__init__.py | 2 + mteb/models/blip_models.py | 183 +++++++++++++++++++++++++++++++++++++ 2 files changed, 185 insertions(+) create mode 100644 mteb/models/blip_models.py diff --git a/mteb/models/__init__.py b/mteb/models/__init__.py index 8e96542925..94358143c1 100644 --- a/mteb/models/__init__.py +++ b/mteb/models/__init__.py @@ -10,6 +10,7 @@ from mteb.models import ( align_models, bge_models, + blip_models, bm25, clip_models, cohere_models, @@ -130,6 +131,7 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe model_modules = [ align_models, bge_models, + blip_models, bm25, cohere_models, dino_models, diff --git a/mteb/models/blip_models.py b/mteb/models/blip_models.py new file mode 100644 index 0000000000..89b7f7d204 --- /dev/null +++ b/mteb/models/blip_models.py @@ -0,0 +1,183 @@ +from __future__ import annotations + +from functools import partial +from typing import Any + +import torch +from PIL import Image +from torch.utils.data import DataLoader +from tqdm import tqdm +from transformers import AutoModel, AutoProcessor + +from mteb.model_meta import ModelMeta + + +class BLIPModelWrapper: + def __init__( + self, + model_name: str, + device: str = "cuda" if torch.cuda.is_available() else "cpu", + **kwargs: Any, + ): + self.model_name = model_name + self.device = device + self.model = AutoModel.from_pretrained(model_name).to(self.device) + self.processor = AutoProcessor.from_pretrained(model_name) + + def preprocess( + self, + texts: list[str], + images: list[Image.Image], + ): + return self.processor( + text=texts, images=images, return_tensors="pt", padding=True + ) + + def get_text_embeddings(self, texts: list[str], batch_size: int = 32): + all_text_embeddings = [] + + with torch.no_grad(): + for i in tqdm(range(0, len(texts), batch_size)): + batch_texts = texts[i : i + batch_size] + inputs = self.processor( + text=batch_texts, return_tensors="pt", padding=True, truncation=True + ) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + text_outputs = self.model.get_text_features(**inputs) + all_text_embeddings.append(text_outputs.cpu()) + + all_text_embeddings = torch.cat(all_text_embeddings, dim=0) + return all_text_embeddings + + def get_image_embeddings( + self, images: list[Image.Image] | DataLoader, batch_size: int = 32 + ): + all_image_embeddings = [] + + if isinstance(images, DataLoader): + with torch.no_grad(): + for batch in tqdm(images): + inputs = self.processor( + images=batch, return_tensors="pt", padding=True + ) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + image_outputs = self.model.get_image_features(**inputs) + all_image_embeddings.append(image_outputs.cpu()) + else: + with torch.no_grad(): + for i in tqdm(range(0, len(images), batch_size)): + batch_images = images[i : i + batch_size] + inputs = self.processor( + images=batch_images, return_tensors="pt", padding=True + ) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + image_outputs = self.model.get_image_features(**inputs) + all_image_embeddings.append(image_outputs.cpu()) + + all_image_embeddings = torch.cat(all_image_embeddings, dim=0) + return all_image_embeddings + + def calculate_probs(self, text_embeddings, image_embeddings): + text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True) + image_embeddings = image_embeddings / image_embeddings.norm( + dim=-1, keepdim=True + ) + logits = torch.matmul(image_embeddings, text_embeddings.T) + probs = (logits * 100).softmax(dim=-1) + return probs + + def get_fused_embeddings( + self, + texts: list[str] = None, + images: list[Image.Image] | DataLoader = None, + fusion_mode="sum", + batch_size: int = 32, + ): + # TODO: find out if BLIP has a prescribed way of fusing text and image embeddings + if texts is None and images is None: + raise ValueError("Either texts or images must be provided") + + text_embeddings = None + image_embeddings = None + + if texts is not None: + text_embeddings = self.get_text_embeddings(texts, batch_size) + + if images is not None: + image_embeddings = self.get_image_embeddings(images, batch_size) + + if text_embeddings is not None and image_embeddings is not None: + if len(text_embeddings) != len(image_embeddings): + raise ValueError( + "The number of texts and images must have the same length" + ) + if fusion_mode == "sum": + fused_embeddings = text_embeddings + image_embeddings + else: + # to do: add other fusion mode + raise ValueError(f"fusion mode {fusion_mode} hasn't been implemented") + return fused_embeddings + elif text_embeddings is not None: + return text_embeddings + elif image_embeddings is not None: + return image_embeddings + + +""" +TODO: implement all model variants + +Salesforce/blip-image-captioning-large +Image-to-Text • Updated Dec 7, 2023 • +1.16M • +• +1.04k +Salesforce/blip-image-captioning-base +Image-to-Text • Updated Aug 1, 2023 • +857k • +• +475 +Salesforce/blip-vqa-base +Visual Question Answering • Updated Dec 7, 2023 • +168k • +119 +Salesforce/blip-vqa-capfilt-large +Visual Question Answering • Updated Jan 22 • +90.6k • +44 +Salesforce/blip-itm-base-coco +Updated Aug 1, 2023 • +12.8k • +16 +Salesforce/blip-itm-large-coco +Updated Aug 1, 2023 • +9.9k +Salesforce/blip-itm-base-flickr +Updated Aug 1, 2023 • +65 +Salesforce/blip-itm-large-flickr +Updated Aug 1, 2023 • +459 • +2 +""" + +blip_image_captioning_base = ModelMeta( + loader=partial( + BLIPModelWrapper, + model_name="Salesforce/blip-image-captioning-base", + ), + name="Salesforce/blip-image-captioning-base", + languages=["eng_Latn"], + open_source=True, + revision="89b09ea1789f7addf2f6d6f0dfc4ce10ab58ef84", + release_date="2023-08-01", +) + + +if __name__ == "__main__": + import mteb + + mdl = mteb.get_model( + blip_image_captioning_base.name, blip_image_captioning_base.revision + ) + emb = mdl.get_text_embeddings(["Hello, world!"]) + print(emb.shape) From 8f8e05cb3e3f1d4a773e3e6a1136d7d307c872d9 Mon Sep 17 00:00:00 2001 From: Jamie-Stirling Date: Mon, 9 Sep 2024 16:17:43 +0100 Subject: [PATCH 02/17] add other blip variants --- mteb/models/blip_models.py | 85 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/mteb/models/blip_models.py b/mteb/models/blip_models.py index 89b7f7d204..ead46b63e7 100644 --- a/mteb/models/blip_models.py +++ b/mteb/models/blip_models.py @@ -159,6 +159,18 @@ def get_fused_embeddings( 459 • 2 """ +# in descending order of usage (downloads from huggingface) +blip_image_captioning_large = ModelMeta( + loader=partial( + BLIPModelWrapper, + model_name="Salesforce/blip-image-captioning-large", + ), + name="Salesforce/blip-image-captioning-large", + languages=["eng_Latn"], + open_source=True, + revision="2227ac38c9f16105cb0412e7cab4759978a8fd90", + release_date="2023-12-07", +) blip_image_captioning_base = ModelMeta( loader=partial( @@ -173,6 +185,79 @@ def get_fused_embeddings( ) +blip_vqa_base = ModelMeta( + loader=partial( + BLIPModelWrapper, + model_name="Salesforce/blip-vqa-base", + ), + name="Salesforce/blip-vqa-base", + languages=["eng_Latn"], + open_source=True, + revision="c7df8e7cd7aa2ee9af18f56e2b29e59a92651b64", + release_date="2023-12-07", +) + +blip_vqa_capfilt_large = ModelMeta( + loader=partial( + BLIPModelWrapper, + model_name="Salesforce/blip-vqa-capfilt-large", + ), + name="Salesforce/blip-vqa-capfilt-large", + languages=["eng_Latn"], + open_source=True, + revision="e53f95265aeab69013fabb5380500ab984adbbb4", + release_date="2023-01-22", +) + +blip_itm_base_coco = ModelMeta( + loader=partial( + BLIPModelWrapper, + model_name="Salesforce/blip-itm-base-coco", + ), + name="Salesforce/blip-itm-base-coco", + languages=["eng_Latn"], + open_source=True, + revision="7eaa90c11850c0b17fc38c6a11e7d88bd6ac231f", + release_date="2023-08-01", +) + +blip_itm_large_coco = ModelMeta( + loader=partial( + BLIPModelWrapper, + model_name="Salesforce/blip-itm-large-coco", + ), + name="Salesforce/blip-itm-large-coco", + languages=["eng_Latn"], + open_source=True, + revision="fef05cafc05298067cbbca00b125749394a77a6f", + release_date="2023-08-01", +) + +blip_itm_base_flickr = ModelMeta( + loader=partial( + BLIPModelWrapper, + model_name="Salesforce/blip-itm-base-flickr", + ), + name="Salesforce/blip-itm-base-flickr", + languages=["eng_Latn"], + open_source=True, + revision="1de29e660d91ae1786c1876212ea805a22eab251", + release_date="2023-08-01", +) + +blip_itm_large_flickr = ModelMeta( + loader=partial( + BLIPModelWrapper, + model_name="Salesforce/blip-itm-large-flickr", + ), + name="Salesforce/blip-itm-large-flickr", + languages=["eng_Latn"], + open_source=True, + revision="bda12e6506758f54261b5ab174b2c55a3ba143fb", + release_date="2023-08-01", +) + + if __name__ == "__main__": import mteb From be8b4bbd007e274cc622c7291a24b2cb23c080c8 Mon Sep 17 00:00:00 2001 From: Jamie-Stirling Date: Wed, 11 Sep 2024 15:50:31 +0100 Subject: [PATCH 03/17] wip: add blip2_models.py --- mteb/models/blip2_models.py | 235 ++++++++++++++++++++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 mteb/models/blip2_models.py diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py new file mode 100644 index 0000000000..5db3d01c37 --- /dev/null +++ b/mteb/models/blip2_models.py @@ -0,0 +1,235 @@ +from __future__ import annotations + +from functools import partial +from typing import Any + +import torch +from torch.nn.functional import normalize +from PIL import Image +from torch.utils.data import DataLoader +from tqdm import tqdm +from transformers import BlipForImageTextRetrieval, BlipProcessor + +from mteb.model_meta import ModelMeta + + +class BLIP2ModelWrapper: + def __init__( + self, + model_name: str, + device: str = "cuda" if torch.cuda.is_available() else "cpu", + **kwargs: Any, + ): + self.model_name = model_name + self.device = device + self.model = BlipForImageTextRetrieval.from_pretrained(model_name).to(self.device) + self.processor = BlipProcessor.from_pretrained(model_name) + + def preprocess( + self, + texts: list[str], + images: list[Image.Image], + ): + return self.processor( + text=texts, images=images, return_tensors="pt", padding=True + ) + + def get_text_embeddings(self, texts: list[str], batch_size: int = 32): + all_text_embeddings = [] + + with torch.no_grad(): + for i in tqdm(range(0, len(texts), batch_size)): + batch_texts = texts[i : i + batch_size] + inputs = self.processor( + text=batch_texts, return_tensors="pt", padding=True, truncation=True + ) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + # different to CLIPModelWrapper: text_encoder instead of get_text_features and apply projection and normalization + text_outputs = self.model.text_encoder(**inputs) + text_outputs = text_outputs[0] + text_outputs = normalize(self.model.text_proj(text_outputs[:,0,:]), dim=-1) + all_text_embeddings.append(text_outputs.cpu()) + + all_text_embeddings = torch.cat(all_text_embeddings, dim=0) + return all_text_embeddings + + def get_image_embeddings( + self, images: list[Image.Image] | DataLoader, batch_size: int = 32 + ): + all_image_embeddings = [] + + if isinstance(images, DataLoader): + with torch.no_grad(): + for batch in tqdm(images): + inputs = self.processor( + images=batch, return_tensors="pt", padding=True + ) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + image_outputs = self.model.vision_model(**inputs) + image_outputs = image_outputs[0] + image_outputs = normalize(self.model.vision_proj(image_outputs[:,0,:]), dim=-1) + all_image_embeddings.append(image_outputs.cpu()) + else: + with torch.no_grad(): + for i in tqdm(range(0, len(images), batch_size)): + batch_images = images[i : i + batch_size] + inputs = self.processor( + images=batch_images, return_tensors="pt", padding=True + ) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + image_outputs = self.model.get_image_features(**inputs) + image_outputs = self.model.vision_model(**inputs) + image_outputs = image_outputs[0] + image_outputs = normalize(self.model.vision_proj(image_outputs[:,0,:]), dim=-1) + all_image_embeddings.append(image_outputs.cpu()) + + all_image_embeddings = torch.cat(all_image_embeddings, dim=0) + return all_image_embeddings + + def calculate_probs(self, text_embeddings, image_embeddings): + text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True) + image_embeddings = image_embeddings / image_embeddings.norm( + dim=-1, keepdim=True + ) + logits = torch.matmul(image_embeddings, text_embeddings.T) + probs = (logits * 100).softmax(dim=-1) + return probs + + def get_fused_embeddings( + self, + texts: list[str] = None, + images: list[Image.Image] | DataLoader = None, + fusion_mode="sum", + batch_size: int = 32, + ): + # TODO: find out if BLIP has a prescribed way of fusing text and image embeddings + if texts is None and images is None: + raise ValueError("Either texts or images must be provided") + + text_embeddings = None + image_embeddings = None + + if texts is not None: + text_embeddings = self.get_text_embeddings(texts, batch_size) + + if images is not None: + image_embeddings = self.get_image_embeddings(images, batch_size) + + if text_embeddings is not None and image_embeddings is not None: + if len(text_embeddings) != len(image_embeddings): + raise ValueError( + "The number of texts and images must have the same length" + ) + if fusion_mode == "sum": + fused_embeddings = text_embeddings + image_embeddings + else: + # to do: add other fusion mode + raise ValueError(f"fusion mode {fusion_mode} hasn't been implemented") + return fused_embeddings + elif text_embeddings is not None: + return text_embeddings + elif image_embeddings is not None: + return image_embeddings + + +""" + +Salesforce/blip2-opt-2.7b +Image-to-Text • Updated Mar 22 • +588k • +296 +Salesforce/blip2-flan-t5-xxl +Image-to-Text • Updated Mar 29 • +9.23k • +84 +Salesforce/blip2-opt-6.7b-coco +Image-to-Text • Updated Mar 31 • +1.51k • +28 +Salesforce/blip2-opt-6.7b +Image-to-Text • Updated Mar 27 • +4.93k • +71 +Salesforce/blip2-flan-t5-xl +Image-to-Text • Updated Dec 13, 2023 • +95.9k • +56 +""" +# in descending order of usage (downloads from huggingface) + +blip2_opt_2_7b = ModelMeta( + loader=partial( + BLIP2ModelWrapper, + model_name="Salesforce/blip2-opt-2.7b", + ), + name="Salesforce/blip2-opt-2.7b", + languages=["eng_Latn"], + open_source=True, + revision="51572668da0eb669e01a189dc22abe6088589a24", + release_date="2024-03-22", +) + +blip2_flan_t5_xxl = ModelMeta( + loader=partial( + BLIP2ModelWrapper, + model_name="Salesforce/blip2-flan-t5-xxl", + ), + name="Salesforce/blip2-flan-t5-xxl", + languages=["eng_Latn"], + open_source=True, + revision="43206cbc865b9d5b3dd7d080e5d94b4143ca8e74", + release_date="2024-03-29", +) + +blip2_opt_6_7b_coco = ModelMeta( + loader=partial( + BLIP2ModelWrapper, + model_name="Salesforce/blip2-opt-6.7b-coco", + ), + name="Salesforce/blip2-opt-6.7b-coco", + languages=["eng_Latn"], + open_source=True, + revision="0d580de59320a25a4d2c386387bcef310d5f286e", + release_date="2024-03-31", +) + +blip2_opt_6_7b = ModelMeta( + loader=partial( + BLIP2ModelWrapper, + model_name="Salesforce/blip2-opt-6.7b", + ), + name="Salesforce/blip2-opt-6.7b", + languages=["eng_Latn"], + open_source=True, + revision="1d33d60155fd1323b97556e0f1dd5148a9749f5b", + release_date="2024-03-27", +) + +blip2_flan_t5_xl = ModelMeta( + loader=partial( + BLIP2ModelWrapper, + model_name="Salesforce/blip2-flan-t5-xl", + ), + name="Salesforce/blip2-flan-t5-xl", + languages=["eng_Latn"], + open_source=True, + revision="e5025a34e3e769e72e2aab7f7bfd00bc84d5fd77", + release_date="2023-12-13", +) + +if __name__ == "__main__": + import mteb + + mdl = mteb.get_model( + blip2_opt_2_7b.name, blip2_opt_2_7b.revision + ) + emb = mdl.get_text_embeddings(["Hello, world!"]) + emb2 = mdl.get_text_embeddings(["Hello there, world!"]) + emb3 = mdl.get_text_embeddings(["Goodbye, person!"]) + + sim = torch.nn.functional.cosine_similarity(emb, emb2) + print(sim) + + sim = torch.nn.functional.cosine_similarity(emb, emb3) + print(sim) + From b57a395d5e103d0677c4547ebbb8f9f35564a202 Mon Sep 17 00:00:00 2001 From: Jamie-Stirling Date: Wed, 11 Sep 2024 16:26:45 +0100 Subject: [PATCH 04/17] make lint --- .../abstasks/Image/AbsTaskAny2AnyRetrieval.py | 2 +- .../Image/AbsTaskImageClassification.py | 2 +- mteb/abstasks/Image/AbsTaskImageClustering.py | 2 +- .../AbsTaskImageMultilabelClassification.py | 2 +- .../AbsTaskImageTextPairClassification.py | 2 +- .../Image/AbsTaskZeroshotClassification.py | 2 +- mteb/models/blip2_models.py | 25 +++--- mteb/models/blip_models.py | 77 ++++++++----------- mteb/models/instructions.py | 2 - mteb/models/ru_sentence_models.py | 2 - mteb/models/sentence_transformers_models.py | 2 - .../Any2AnyRetrieval/eng/CIRRIT2IRetrieval.py | 3 +- .../eng/FashionIQIT2IRetrieval.py | 3 +- .../eng/HatefulMemesI2TRetrieval.py | 3 +- .../eng/HatefulMemesT2IRetrieval.py | 3 +- .../eng/InfoSeekIT2ITRetrieval.py | 3 +- .../eng/InfoSeekIT2TRetrieval.py | 3 +- .../eng/MemotionI2TRetrieval.py | 3 +- .../eng/MemotionT2IRetrieval.py | 3 +- .../eng/NIGHTSI2IRetrieval.py | 3 +- .../eng/OVENIT2ITRetrieval.py | 3 +- .../Any2AnyRetrieval/eng/OVENIT2TRetrieval.py | 3 +- .../eng/SciMMIRI2TRetrieval.py | 3 +- .../eng/SciMMIRT2IRetrieval.py | 3 +- .../eng/TUBerlinT2IRetrieval.py | 3 +- .../eng/VisualNewsI2TRetrieval.py | 3 +- .../eng/VisualNewsT2IRetrieval.py | 3 +- .../eng/WebQAT2ITRetrieval.py | 3 +- .../Any2AnyRetrieval/eng/WebQAT2TRetrieval.py | 3 +- .../multilingual/WITT2IRetrieval.py | 3 +- .../multilingual/XFlickr30kCoT2IRetrieval.py | 3 +- .../multilingual/XM3600T2IRetrieval.py | 3 +- mteb/tasks/Image/Clustering/eng/CIFAR.py | 3 +- .../eng/BirdsnapClassification.py | 3 +- .../Image/ImageClassification/eng/CIFAR.py | 3 +- .../eng/Caltech101Classification.py | 3 +- .../eng/DTDClassification.py | 3 +- .../eng/EuroSATClassification.py | 3 +- .../eng/FER2013Classification.py | 3 +- .../eng/FGVCAircraftClassification.py | 3 +- .../eng/Food101Classification.py | 3 +- .../eng/MNISTClassification.py | 3 +- .../eng/OxfordFlowersClassification.py | 3 +- .../eng/OxfordPetsClassification.py | 3 +- .../eng/RESISC45Classification.py | 3 +- .../eng/STL10Classification.py | 3 +- .../eng/SUN397Classification.py | 3 +- .../eng/StanfordCarsClassification.py | 3 +- .../ZeroshotClassification/eng/Birdsnap.py | 3 +- .../Image/ZeroshotClassification/eng/CIFAR.py | 3 +- .../ZeroshotClassification/eng/Caltech101.py | 3 +- .../Image/ZeroshotClassification/eng/DTD.py | 3 +- .../ZeroshotClassification/eng/EuroSAT.py | 3 +- .../ZeroshotClassification/eng/FER2013.py | 3 +- .../eng/FGVCAircraft.py | 3 +- .../ZeroshotClassification/eng/Food101.py | 3 +- .../Image/ZeroshotClassification/eng/MNIST.py | 3 +- .../ZeroshotClassification/eng/OxfordPets.py | 3 +- .../ZeroshotClassification/eng/RESISC45.py | 3 +- .../Image/ZeroshotClassification/eng/STL10.py | 3 +- .../ZeroshotClassification/eng/SUN397.py | 3 +- .../eng/StanfordCars.py | 3 +- 62 files changed, 103 insertions(+), 170 deletions(-) diff --git a/mteb/abstasks/Image/AbsTaskAny2AnyRetrieval.py b/mteb/abstasks/Image/AbsTaskAny2AnyRetrieval.py index 9c5987f4b1..c640988e91 100644 --- a/mteb/abstasks/Image/AbsTaskAny2AnyRetrieval.py +++ b/mteb/abstasks/Image/AbsTaskAny2AnyRetrieval.py @@ -12,9 +12,9 @@ from datasets import Features, Value, load_dataset from PIL import Image -from ..AbsTask import AbsTask from ...evaluation.evaluators import Any2AnyRetrievalEvaluator from ...load_results.mteb_results import ScoresDict +from ..AbsTask import AbsTask logger = logging.getLogger(__name__) diff --git a/mteb/abstasks/Image/AbsTaskImageClassification.py b/mteb/abstasks/Image/AbsTaskImageClassification.py index 3a95f2bd29..715f007e10 100644 --- a/mteb/abstasks/Image/AbsTaskImageClassification.py +++ b/mteb/abstasks/Image/AbsTaskImageClassification.py @@ -6,7 +6,6 @@ import numpy as np -from ..AbsTask import AbsTask from ...encoder_interface import Encoder from ...evaluation.evaluators import ( ImagekNNClassificationEvaluator, @@ -14,6 +13,7 @@ ImagelogRegClassificationEvaluator, ) from ...load_results.mteb_results import HFSubset, ScoresDict +from ..AbsTask import AbsTask logger = logging.getLogger(__name__) diff --git a/mteb/abstasks/Image/AbsTaskImageClustering.py b/mteb/abstasks/Image/AbsTaskImageClustering.py index 5370b16b15..3d6f7e88d2 100644 --- a/mteb/abstasks/Image/AbsTaskImageClustering.py +++ b/mteb/abstasks/Image/AbsTaskImageClustering.py @@ -5,10 +5,10 @@ from datasets import Dataset -from ..AbsTask import AbsTask from ...encoder_interface import Encoder, EncoderWithQueryCorpusEncode from ...evaluation.evaluators import ImageClusteringEvaluator from ...load_results.mteb_results import HFSubset, ScoresDict +from ..AbsTask import AbsTask logger = logging.getLogger(__name__) diff --git a/mteb/abstasks/Image/AbsTaskImageMultilabelClassification.py b/mteb/abstasks/Image/AbsTaskImageMultilabelClassification.py index 5669575a18..6a0d649f10 100644 --- a/mteb/abstasks/Image/AbsTaskImageMultilabelClassification.py +++ b/mteb/abstasks/Image/AbsTaskImageMultilabelClassification.py @@ -12,9 +12,9 @@ from sklearn.neighbors import KNeighborsClassifier from sklearn.preprocessing import MultiLabelBinarizer -from ..AbsTask import AbsTask from ...encoder_interface import Encoder from ...load_results.mteb_results import HFSubset, ScoresDict +from ..AbsTask import AbsTask logger = logging.getLogger(__name__) diff --git a/mteb/abstasks/Image/AbsTaskImageTextPairClassification.py b/mteb/abstasks/Image/AbsTaskImageTextPairClassification.py index 492de11659..81f3094b5c 100644 --- a/mteb/abstasks/Image/AbsTaskImageTextPairClassification.py +++ b/mteb/abstasks/Image/AbsTaskImageTextPairClassification.py @@ -6,10 +6,10 @@ from datasets import Dataset from tqdm import tqdm -from ..AbsTask import AbsTask from ...encoder_interface import Encoder, EncoderWithQueryCorpusEncode from ...evaluation.evaluators import ImageTextPairClassificationEvaluator from ...load_results.mteb_results import ScoresDict +from ..AbsTask import AbsTask logger = logging.getLogger(__name__) diff --git a/mteb/abstasks/Image/AbsTaskZeroshotClassification.py b/mteb/abstasks/Image/AbsTaskZeroshotClassification.py index 9d5a55e235..4f23bb46b4 100644 --- a/mteb/abstasks/Image/AbsTaskZeroshotClassification.py +++ b/mteb/abstasks/Image/AbsTaskZeroshotClassification.py @@ -5,10 +5,10 @@ from datasets import Dataset -from ..AbsTask import AbsTask from ...encoder_interface import Encoder, EncoderWithQueryCorpusEncode from ...evaluation.evaluators import ZeroshotClassificationEvaluator from ...load_results.mteb_results import ScoresDict +from ..AbsTask import AbsTask logger = logging.getLogger(__name__) diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py index 5db3d01c37..3181c5f5ac 100644 --- a/mteb/models/blip2_models.py +++ b/mteb/models/blip2_models.py @@ -4,8 +4,8 @@ from typing import Any import torch -from torch.nn.functional import normalize from PIL import Image +from torch.nn.functional import normalize from torch.utils.data import DataLoader from tqdm import tqdm from transformers import BlipForImageTextRetrieval, BlipProcessor @@ -22,7 +22,9 @@ def __init__( ): self.model_name = model_name self.device = device - self.model = BlipForImageTextRetrieval.from_pretrained(model_name).to(self.device) + self.model = BlipForImageTextRetrieval.from_pretrained(model_name).to( + self.device + ) self.processor = BlipProcessor.from_pretrained(model_name) def preprocess( @@ -47,7 +49,9 @@ def get_text_embeddings(self, texts: list[str], batch_size: int = 32): # different to CLIPModelWrapper: text_encoder instead of get_text_features and apply projection and normalization text_outputs = self.model.text_encoder(**inputs) text_outputs = text_outputs[0] - text_outputs = normalize(self.model.text_proj(text_outputs[:,0,:]), dim=-1) + text_outputs = normalize( + self.model.text_proj(text_outputs[:, 0, :]), dim=-1 + ) all_text_embeddings.append(text_outputs.cpu()) all_text_embeddings = torch.cat(all_text_embeddings, dim=0) @@ -67,7 +71,9 @@ def get_image_embeddings( inputs = {k: v.to(self.device) for k, v in inputs.items()} image_outputs = self.model.vision_model(**inputs) image_outputs = image_outputs[0] - image_outputs = normalize(self.model.vision_proj(image_outputs[:,0,:]), dim=-1) + image_outputs = normalize( + self.model.vision_proj(image_outputs[:, 0, :]), dim=-1 + ) all_image_embeddings.append(image_outputs.cpu()) else: with torch.no_grad(): @@ -80,7 +86,9 @@ def get_image_embeddings( image_outputs = self.model.get_image_features(**inputs) image_outputs = self.model.vision_model(**inputs) image_outputs = image_outputs[0] - image_outputs = normalize(self.model.vision_proj(image_outputs[:,0,:]), dim=-1) + image_outputs = normalize( + self.model.vision_proj(image_outputs[:, 0, :]), dim=-1 + ) all_image_embeddings.append(image_outputs.cpu()) all_image_embeddings = torch.cat(all_image_embeddings, dim=0) @@ -220,16 +228,13 @@ def get_fused_embeddings( if __name__ == "__main__": import mteb - mdl = mteb.get_model( - blip2_opt_2_7b.name, blip2_opt_2_7b.revision - ) + mdl = mteb.get_model(blip2_opt_2_7b.name, blip2_opt_2_7b.revision) emb = mdl.get_text_embeddings(["Hello, world!"]) emb2 = mdl.get_text_embeddings(["Hello there, world!"]) emb3 = mdl.get_text_embeddings(["Goodbye, person!"]) - + sim = torch.nn.functional.cosine_similarity(emb, emb2) print(sim) sim = torch.nn.functional.cosine_similarity(emb, emb3) print(sim) - diff --git a/mteb/models/blip_models.py b/mteb/models/blip_models.py index ead46b63e7..dff6014246 100644 --- a/mteb/models/blip_models.py +++ b/mteb/models/blip_models.py @@ -5,9 +5,10 @@ import torch from PIL import Image +from torch.nn.functional import normalize from torch.utils.data import DataLoader from tqdm import tqdm -from transformers import AutoModel, AutoProcessor +from transformers import BlipForImageTextRetrieval, BlipProcessor from mteb.model_meta import ModelMeta @@ -21,8 +22,10 @@ def __init__( ): self.model_name = model_name self.device = device - self.model = AutoModel.from_pretrained(model_name).to(self.device) - self.processor = AutoProcessor.from_pretrained(model_name) + self.model = BlipForImageTextRetrieval.from_pretrained(model_name).to( + self.device + ) + self.processor = BlipProcessor.from_pretrained(model_name) def preprocess( self, @@ -43,7 +46,12 @@ def get_text_embeddings(self, texts: list[str], batch_size: int = 32): text=batch_texts, return_tensors="pt", padding=True, truncation=True ) inputs = {k: v.to(self.device) for k, v in inputs.items()} - text_outputs = self.model.get_text_features(**inputs) + # different to CLIPModelWrapper: text_encoder instead of get_text_features and apply projection and normalization + text_outputs = self.model.text_encoder(**inputs) + text_outputs = text_outputs[0] + text_outputs = normalize( + self.model.text_proj(text_outputs[:, 0, :]), dim=-1 + ) all_text_embeddings.append(text_outputs.cpu()) all_text_embeddings = torch.cat(all_text_embeddings, dim=0) @@ -61,7 +69,11 @@ def get_image_embeddings( images=batch, return_tensors="pt", padding=True ) inputs = {k: v.to(self.device) for k, v in inputs.items()} - image_outputs = self.model.get_image_features(**inputs) + image_outputs = self.model.vision_model(**inputs) + image_outputs = image_outputs[0] + image_outputs = normalize( + self.model.vision_proj(image_outputs[:, 0, :]), dim=-1 + ) all_image_embeddings.append(image_outputs.cpu()) else: with torch.no_grad(): @@ -72,6 +84,11 @@ def get_image_embeddings( ) inputs = {k: v.to(self.device) for k, v in inputs.items()} image_outputs = self.model.get_image_features(**inputs) + image_outputs = self.model.vision_model(**inputs) + image_outputs = image_outputs[0] + image_outputs = normalize( + self.model.vision_proj(image_outputs[:, 0, :]), dim=-1 + ) all_image_embeddings.append(image_outputs.cpu()) all_image_embeddings = torch.cat(all_image_embeddings, dim=0) @@ -93,7 +110,6 @@ def get_fused_embeddings( fusion_mode="sum", batch_size: int = 32, ): - # TODO: find out if BLIP has a prescribed way of fusing text and image embeddings if texts is None and images is None: raise ValueError("Either texts or images must be provided") @@ -123,42 +139,6 @@ def get_fused_embeddings( return image_embeddings -""" -TODO: implement all model variants - -Salesforce/blip-image-captioning-large -Image-to-Text • Updated Dec 7, 2023 • -1.16M • -• -1.04k -Salesforce/blip-image-captioning-base -Image-to-Text • Updated Aug 1, 2023 • -857k • -• -475 -Salesforce/blip-vqa-base -Visual Question Answering • Updated Dec 7, 2023 • -168k • -119 -Salesforce/blip-vqa-capfilt-large -Visual Question Answering • Updated Jan 22 • -90.6k • -44 -Salesforce/blip-itm-base-coco -Updated Aug 1, 2023 • -12.8k • -16 -Salesforce/blip-itm-large-coco -Updated Aug 1, 2023 • -9.9k -Salesforce/blip-itm-base-flickr -Updated Aug 1, 2023 • -65 -Salesforce/blip-itm-large-flickr -Updated Aug 1, 2023 • -459 • -2 -""" # in descending order of usage (downloads from huggingface) blip_image_captioning_large = ModelMeta( loader=partial( @@ -261,8 +241,13 @@ def get_fused_embeddings( if __name__ == "__main__": import mteb - mdl = mteb.get_model( - blip_image_captioning_base.name, blip_image_captioning_base.revision - ) + mdl = mteb.get_model(blip_itm_base_coco.name, blip_itm_base_coco.revision) emb = mdl.get_text_embeddings(["Hello, world!"]) - print(emb.shape) + emb2 = mdl.get_text_embeddings(["Hello there, world!"]) + emb3 = mdl.get_text_embeddings(["Goodbye, person!"]) + + sim = torch.nn.functional.cosine_similarity(emb, emb2) + print(sim) + + sim = torch.nn.functional.cosine_similarity(emb, emb3) + print(sim) diff --git a/mteb/models/instructions.py b/mteb/models/instructions.py index 99054e41d7..4a31f8da02 100644 --- a/mteb/models/instructions.py +++ b/mteb/models/instructions.py @@ -2,8 +2,6 @@ from __future__ import annotations -from __future__ import annotations - import mteb # Prompts from diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index cffe7f7be4..30214c21f2 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -2,8 +2,6 @@ from __future__ import annotations -from __future__ import annotations - from functools import partial from mteb.model_meta import ModelMeta diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index 33ea17b165..a3603d9eb3 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -2,8 +2,6 @@ from __future__ import annotations -from __future__ import annotations - from mteb.model_meta import ModelMeta paraphrase_langs = [ diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/CIRRIT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/CIRRIT2IRetrieval.py index eb65b82e79..417e5d6caa 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/CIRRIT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/CIRRIT2IRetrieval.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata class CIRRIT2IRetrieval(AbsTaskAny2AnyRetrieval): diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/FashionIQIT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/FashionIQIT2IRetrieval.py index b336549557..a58ed15dd5 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/FashionIQIT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/FashionIQIT2IRetrieval.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata class FashionIQIT2IRetrieval(AbsTaskAny2AnyRetrieval): diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesI2TRetrieval.py index 1fcf9f0cb9..817ea1c674 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesI2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesI2TRetrieval.py @@ -2,9 +2,8 @@ from datasets import concatenate_datasets, load_dataset -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = None): diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesT2IRetrieval.py index 5b2b9bcaef..0a55e446ed 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesT2IRetrieval.py @@ -2,9 +2,8 @@ from datasets import concatenate_datasets, load_dataset -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = None): diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2ITRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2ITRetrieval.py index 5029c51ec9..f7cb041bcb 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2ITRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2ITRetrieval.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata class InfoSeekIT2ITRetrieval(AbsTaskAny2AnyRetrieval): diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2TRetrieval.py index cd08aa77b2..cc2b23ea88 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2TRetrieval.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata class InfoSeekIT2TRetrieval(AbsTaskAny2AnyRetrieval): diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py index af68e278b9..9247a12f88 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py @@ -2,9 +2,8 @@ from datasets import concatenate_datasets, load_dataset -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = None): diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py index 7478ddddeb..f214bd2ea5 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py @@ -2,9 +2,8 @@ from datasets import concatenate_datasets, load_dataset -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = None): diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/NIGHTSI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/NIGHTSI2IRetrieval.py index 82dcf0894a..73d3f7c280 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/NIGHTSI2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/NIGHTSI2IRetrieval.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata class NIGHTSI2IRetrieval(AbsTaskAny2AnyRetrieval): diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2ITRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2ITRetrieval.py index 51d031241c..0f53eb7e6a 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2ITRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2ITRetrieval.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata class OVENIT2ITRetrieval(AbsTaskAny2AnyRetrieval): diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2TRetrieval.py index cfa07350ba..3df5b92625 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2TRetrieval.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata class OVENIT2TRetrieval(AbsTaskAny2AnyRetrieval): diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRI2TRetrieval.py index fa0f5b5707..eb2c24aeb2 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRI2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRI2TRetrieval.py @@ -2,9 +2,8 @@ from datasets import load_dataset -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = None): diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRT2IRetrieval.py index c6004e7840..e92bd637f5 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRT2IRetrieval.py @@ -2,9 +2,8 @@ from datasets import load_dataset -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = None): diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/TUBerlinT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/TUBerlinT2IRetrieval.py index 018f708ce5..7c7bddfe4c 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/TUBerlinT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/TUBerlinT2IRetrieval.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata class TUBerlinT2IRetrieval(AbsTaskAny2AnyRetrieval): diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsI2TRetrieval.py index c1f1b306ca..2de1713097 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsI2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsI2TRetrieval.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata class VisualNewsI2TRetrieval(AbsTaskAny2AnyRetrieval): diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsT2IRetrieval.py index 7457f00d03..091d7a7f00 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsT2IRetrieval.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata class VisualNewsT2IRetrieval(AbsTaskAny2AnyRetrieval): diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2ITRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2ITRetrieval.py index 7086c1c205..50725b79b9 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2ITRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2ITRetrieval.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata class WebQAT2ITRetrieval(AbsTaskAny2AnyRetrieval): diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2TRetrieval.py index 6a4efb261a..14c9c02148 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2TRetrieval.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata class WebQAT2TRetrieval(AbsTaskAny2AnyRetrieval): diff --git a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/WITT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/WITT2IRetrieval.py index a0395594a2..5de06b937f 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/WITT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/WITT2IRetrieval.py @@ -2,10 +2,9 @@ from datasets import Dataset, DatasetDict, load_dataset -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata _LANGUAGES = { "ar": ["ara-Arab"], diff --git a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XFlickr30kCoT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XFlickr30kCoT2IRetrieval.py index 92f4a9c2c0..65c886f314 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XFlickr30kCoT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XFlickr30kCoT2IRetrieval.py @@ -2,10 +2,9 @@ from datasets import DatasetDict, load_dataset -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata _LANGUAGES = { "de": ["deu-Latn"], diff --git a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XM3600T2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XM3600T2IRetrieval.py index 8cb7f0e9d1..687c9f0446 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XM3600T2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XM3600T2IRetrieval.py @@ -2,10 +2,9 @@ from datasets import Dataset, DatasetDict, load_dataset -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata _LANGUAGES = { "ar": ["ara-Arab"], diff --git a/mteb/tasks/Image/Clustering/eng/CIFAR.py b/mteb/tasks/Image/Clustering/eng/CIFAR.py index 01b493233c..e7f7a1d633 100644 --- a/mteb/tasks/Image/Clustering/eng/CIFAR.py +++ b/mteb/tasks/Image/Clustering/eng/CIFAR.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskImageClustering import AbsTaskImageClustering +from mteb.abstasks.TaskMetadata import TaskMetadata class CIFAR10Clustering(AbsTaskImageClustering): diff --git a/mteb/tasks/Image/ImageClassification/eng/BirdsnapClassification.py b/mteb/tasks/Image/ImageClassification/eng/BirdsnapClassification.py index a104d51e13..38016e5e79 100644 --- a/mteb/tasks/Image/ImageClassification/eng/BirdsnapClassification.py +++ b/mteb/tasks/Image/ImageClassification/eng/BirdsnapClassification.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class BirdsnapClassification(AbsTaskImageClassification): diff --git a/mteb/tasks/Image/ImageClassification/eng/CIFAR.py b/mteb/tasks/Image/ImageClassification/eng/CIFAR.py index 75e3cdf6fc..9b4f45e387 100644 --- a/mteb/tasks/Image/ImageClassification/eng/CIFAR.py +++ b/mteb/tasks/Image/ImageClassification/eng/CIFAR.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class CIFAR10Classification(AbsTaskImageClassification): diff --git a/mteb/tasks/Image/ImageClassification/eng/Caltech101Classification.py b/mteb/tasks/Image/ImageClassification/eng/Caltech101Classification.py index 0175cd8663..fe62f955b3 100644 --- a/mteb/tasks/Image/ImageClassification/eng/Caltech101Classification.py +++ b/mteb/tasks/Image/ImageClassification/eng/Caltech101Classification.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class Caltech101Classification(AbsTaskImageClassification): diff --git a/mteb/tasks/Image/ImageClassification/eng/DTDClassification.py b/mteb/tasks/Image/ImageClassification/eng/DTDClassification.py index 2f921e5587..25f6ba0401 100644 --- a/mteb/tasks/Image/ImageClassification/eng/DTDClassification.py +++ b/mteb/tasks/Image/ImageClassification/eng/DTDClassification.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class DTDClassification(AbsTaskImageClassification): diff --git a/mteb/tasks/Image/ImageClassification/eng/EuroSATClassification.py b/mteb/tasks/Image/ImageClassification/eng/EuroSATClassification.py index b849d93c0b..4930c13d1b 100644 --- a/mteb/tasks/Image/ImageClassification/eng/EuroSATClassification.py +++ b/mteb/tasks/Image/ImageClassification/eng/EuroSATClassification.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class EuroSATClassification(AbsTaskImageClassification): diff --git a/mteb/tasks/Image/ImageClassification/eng/FER2013Classification.py b/mteb/tasks/Image/ImageClassification/eng/FER2013Classification.py index 2081683154..9db8b017f7 100644 --- a/mteb/tasks/Image/ImageClassification/eng/FER2013Classification.py +++ b/mteb/tasks/Image/ImageClassification/eng/FER2013Classification.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class FER2013Classification(AbsTaskImageClassification): diff --git a/mteb/tasks/Image/ImageClassification/eng/FGVCAircraftClassification.py b/mteb/tasks/Image/ImageClassification/eng/FGVCAircraftClassification.py index bb09f32426..9b061e6dd1 100644 --- a/mteb/tasks/Image/ImageClassification/eng/FGVCAircraftClassification.py +++ b/mteb/tasks/Image/ImageClassification/eng/FGVCAircraftClassification.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class FGVCAircraftClassification(AbsTaskImageClassification): diff --git a/mteb/tasks/Image/ImageClassification/eng/Food101Classification.py b/mteb/tasks/Image/ImageClassification/eng/Food101Classification.py index 533b2c2145..04389db8f1 100644 --- a/mteb/tasks/Image/ImageClassification/eng/Food101Classification.py +++ b/mteb/tasks/Image/ImageClassification/eng/Food101Classification.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class Food101Classification(AbsTaskImageClassification): diff --git a/mteb/tasks/Image/ImageClassification/eng/MNISTClassification.py b/mteb/tasks/Image/ImageClassification/eng/MNISTClassification.py index 82de6fab16..f3831abdb4 100644 --- a/mteb/tasks/Image/ImageClassification/eng/MNISTClassification.py +++ b/mteb/tasks/Image/ImageClassification/eng/MNISTClassification.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class MNISTClassification(AbsTaskImageClassification): diff --git a/mteb/tasks/Image/ImageClassification/eng/OxfordFlowersClassification.py b/mteb/tasks/Image/ImageClassification/eng/OxfordFlowersClassification.py index dce55d9362..c0a10de48d 100644 --- a/mteb/tasks/Image/ImageClassification/eng/OxfordFlowersClassification.py +++ b/mteb/tasks/Image/ImageClassification/eng/OxfordFlowersClassification.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class OxfordFlowersClassification(AbsTaskImageClassification): diff --git a/mteb/tasks/Image/ImageClassification/eng/OxfordPetsClassification.py b/mteb/tasks/Image/ImageClassification/eng/OxfordPetsClassification.py index 0277098d64..cf537648ed 100644 --- a/mteb/tasks/Image/ImageClassification/eng/OxfordPetsClassification.py +++ b/mteb/tasks/Image/ImageClassification/eng/OxfordPetsClassification.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class OxfordPetsClassification(AbsTaskImageClassification): diff --git a/mteb/tasks/Image/ImageClassification/eng/RESISC45Classification.py b/mteb/tasks/Image/ImageClassification/eng/RESISC45Classification.py index e883db4c6e..afbc8fe1da 100644 --- a/mteb/tasks/Image/ImageClassification/eng/RESISC45Classification.py +++ b/mteb/tasks/Image/ImageClassification/eng/RESISC45Classification.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class RESISC45Classification(AbsTaskImageClassification): diff --git a/mteb/tasks/Image/ImageClassification/eng/STL10Classification.py b/mteb/tasks/Image/ImageClassification/eng/STL10Classification.py index 9b9fcf3ef4..9531e1c1f6 100644 --- a/mteb/tasks/Image/ImageClassification/eng/STL10Classification.py +++ b/mteb/tasks/Image/ImageClassification/eng/STL10Classification.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class STL10Classification(AbsTaskImageClassification): diff --git a/mteb/tasks/Image/ImageClassification/eng/SUN397Classification.py b/mteb/tasks/Image/ImageClassification/eng/SUN397Classification.py index 414f3560e6..eef0ccbfcb 100644 --- a/mteb/tasks/Image/ImageClassification/eng/SUN397Classification.py +++ b/mteb/tasks/Image/ImageClassification/eng/SUN397Classification.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class SUN397Classification(AbsTaskImageClassification): diff --git a/mteb/tasks/Image/ImageClassification/eng/StanfordCarsClassification.py b/mteb/tasks/Image/ImageClassification/eng/StanfordCarsClassification.py index 1fa4f64af2..e4561b2165 100644 --- a/mteb/tasks/Image/ImageClassification/eng/StanfordCarsClassification.py +++ b/mteb/tasks/Image/ImageClassification/eng/StanfordCarsClassification.py @@ -1,8 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification +from mteb.abstasks.TaskMetadata import TaskMetadata class StanfordCarsClassification(AbsTaskImageClassification): diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/Birdsnap.py b/mteb/tasks/Image/ZeroshotClassification/eng/Birdsnap.py index 9273b66add..ed31e3f89f 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/Birdsnap.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/Birdsnap.py @@ -1,10 +1,9 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( AbsTaskZeroshotClassification, ) +from mteb.abstasks.TaskMetadata import TaskMetadata class BirdsnapClassification(AbsTaskZeroshotClassification): diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/CIFAR.py b/mteb/tasks/Image/ZeroshotClassification/eng/CIFAR.py index 517bf565cc..81103a0f1d 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/CIFAR.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/CIFAR.py @@ -1,10 +1,9 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( AbsTaskZeroshotClassification, ) +from mteb.abstasks.TaskMetadata import TaskMetadata class CIFAR10ZeroShotClassification(AbsTaskZeroshotClassification): diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/Caltech101.py b/mteb/tasks/Image/ZeroshotClassification/eng/Caltech101.py index f07c423939..ab7ca141cb 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/Caltech101.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/Caltech101.py @@ -1,10 +1,9 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( AbsTaskZeroshotClassification, ) +from mteb.abstasks.TaskMetadata import TaskMetadata class Caltech101Classification(AbsTaskZeroshotClassification): diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/DTD.py b/mteb/tasks/Image/ZeroshotClassification/eng/DTD.py index caea933534..27ef0a6f3d 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/DTD.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/DTD.py @@ -1,10 +1,9 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( AbsTaskZeroshotClassification, ) +from mteb.abstasks.TaskMetadata import TaskMetadata class DTDClassification(AbsTaskZeroshotClassification): diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/EuroSAT.py b/mteb/tasks/Image/ZeroshotClassification/eng/EuroSAT.py index 275487580d..de6fb4c434 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/EuroSAT.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/EuroSAT.py @@ -1,10 +1,9 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( AbsTaskZeroshotClassification, ) +from mteb.abstasks.TaskMetadata import TaskMetadata class EuroSATClassification(AbsTaskZeroshotClassification): diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/FER2013.py b/mteb/tasks/Image/ZeroshotClassification/eng/FER2013.py index febbb27e5e..9cfa0dd3e9 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/FER2013.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/FER2013.py @@ -1,10 +1,9 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( AbsTaskZeroshotClassification, ) +from mteb.abstasks.TaskMetadata import TaskMetadata class FER2013Classification(AbsTaskZeroshotClassification): diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/FGVCAircraft.py b/mteb/tasks/Image/ZeroshotClassification/eng/FGVCAircraft.py index 833afde477..c15e0b6d4b 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/FGVCAircraft.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/FGVCAircraft.py @@ -1,10 +1,9 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( AbsTaskZeroshotClassification, ) +from mteb.abstasks.TaskMetadata import TaskMetadata class FGVCAircraftClassification(AbsTaskZeroshotClassification): diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/Food101.py b/mteb/tasks/Image/ZeroshotClassification/eng/Food101.py index a2b93c2471..fd073ac412 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/Food101.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/Food101.py @@ -1,10 +1,9 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( AbsTaskZeroshotClassification, ) +from mteb.abstasks.TaskMetadata import TaskMetadata class Food101Classification(AbsTaskZeroshotClassification): diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/MNIST.py b/mteb/tasks/Image/ZeroshotClassification/eng/MNIST.py index f343cb9211..253fa938ac 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/MNIST.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/MNIST.py @@ -1,10 +1,9 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( AbsTaskZeroshotClassification, ) +from mteb.abstasks.TaskMetadata import TaskMetadata class MNISTClassification(AbsTaskZeroshotClassification): diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/OxfordPets.py b/mteb/tasks/Image/ZeroshotClassification/eng/OxfordPets.py index 2145fe8bff..3da580af1b 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/OxfordPets.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/OxfordPets.py @@ -1,10 +1,9 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( AbsTaskZeroshotClassification, ) +from mteb.abstasks.TaskMetadata import TaskMetadata class OxfordPetsClassification(AbsTaskZeroshotClassification): diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/RESISC45.py b/mteb/tasks/Image/ZeroshotClassification/eng/RESISC45.py index 7ba9824455..d6fb98ba6c 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/RESISC45.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/RESISC45.py @@ -1,10 +1,9 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( AbsTaskZeroshotClassification, ) +from mteb.abstasks.TaskMetadata import TaskMetadata class RESISC45Classification(AbsTaskZeroshotClassification): diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/STL10.py b/mteb/tasks/Image/ZeroshotClassification/eng/STL10.py index 11c53d5032..8b0f42d08d 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/STL10.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/STL10.py @@ -1,10 +1,9 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( AbsTaskZeroshotClassification, ) +from mteb.abstasks.TaskMetadata import TaskMetadata class STL10Classification(AbsTaskZeroshotClassification): diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/SUN397.py b/mteb/tasks/Image/ZeroshotClassification/eng/SUN397.py index c3e67879b0..64252584b8 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/SUN397.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/SUN397.py @@ -1,10 +1,9 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( AbsTaskZeroshotClassification, ) +from mteb.abstasks.TaskMetadata import TaskMetadata class SUN397Classification(AbsTaskZeroshotClassification): diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/StanfordCars.py b/mteb/tasks/Image/ZeroshotClassification/eng/StanfordCars.py index 0e881b65f0..c8cc639a4e 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/StanfordCars.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/StanfordCars.py @@ -1,10 +1,9 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata - from mteb.abstasks.Image.AbsTaskZeroshotClassification import ( AbsTaskZeroshotClassification, ) +from mteb.abstasks.TaskMetadata import TaskMetadata class StanfordCarsClassification(AbsTaskZeroshotClassification): From 236a94f67aa630e4cd2295f481f886e08c98d310 Mon Sep 17 00:00:00 2001 From: Jamie-Stirling Date: Fri, 13 Sep 2024 16:39:39 +0100 Subject: [PATCH 05/17] wip: implement blip2 wrapper --- mteb/models/__init__.py | 2 + mteb/models/blip2_models.py | 257 ++++++++++++++++++------------------ 2 files changed, 133 insertions(+), 126 deletions(-) diff --git a/mteb/models/__init__.py b/mteb/models/__init__.py index 94358143c1..2229b70239 100644 --- a/mteb/models/__init__.py +++ b/mteb/models/__init__.py @@ -11,6 +11,7 @@ align_models, bge_models, blip_models, + blip2_models, bm25, clip_models, cohere_models, @@ -132,6 +133,7 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe align_models, bge_models, blip_models, + blip2_models, bm25, cohere_models, dino_models, diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py index 3181c5f5ac..16acabc0ef 100644 --- a/mteb/models/blip2_models.py +++ b/mteb/models/blip2_models.py @@ -8,140 +8,145 @@ from torch.nn.functional import normalize from torch.utils.data import DataLoader from tqdm import tqdm -from transformers import BlipForImageTextRetrieval, BlipProcessor +from transformers import Blip2Processor from mteb.model_meta import ModelMeta +def blip2_loader(**kwargs): + try: # a temporal fix for the dependency issues of vista models. + from lavis.models import load_model_and_preprocess -class BLIP2ModelWrapper: - def __init__( - self, - model_name: str, - device: str = "cuda" if torch.cuda.is_available() else "cpu", - **kwargs: Any, - ): - self.model_name = model_name - self.device = device - self.model = BlipForImageTextRetrieval.from_pretrained(model_name).to( - self.device - ) - self.processor = BlipProcessor.from_pretrained(model_name) - - def preprocess( - self, - texts: list[str], - images: list[Image.Image], - ): - return self.processor( - text=texts, images=images, return_tensors="pt", padding=True + except ImportError: + raise ImportError( + "Please install `pip install salesforce-lavis` to use BLIP-2 models." ) + + class BLIP2ModelWrapper: + def __init__( + self, + model_name: str, + device: str = "cuda" if torch.cuda.is_available() else "cpu", + **kwargs: Any, + ): + self.model_name = model_name + self.device = device + self.model, self.vis_processors, self.txt_processors = load_model_and_preprocess(name="blip2-opt-2.7b", model_type="base") + self.model = self.model.to(self.device) + self.processor = Blip2Processor.from_pretrained(model_name) + + def preprocess( + self, + texts: list[str], + images: list[Image.Image], + ): + return self.processor( + text=texts, images=images, return_tensors="pt", padding=True + ) + + def get_text_embeddings(self, texts: list[str], batch_size: int = 32): + all_text_embeddings = [] - def get_text_embeddings(self, texts: list[str], batch_size: int = 32): - all_text_embeddings = [] - - with torch.no_grad(): - for i in tqdm(range(0, len(texts), batch_size)): - batch_texts = texts[i : i + batch_size] - inputs = self.processor( - text=batch_texts, return_tensors="pt", padding=True, truncation=True - ) - inputs = {k: v.to(self.device) for k, v in inputs.items()} - # different to CLIPModelWrapper: text_encoder instead of get_text_features and apply projection and normalization - text_outputs = self.model.text_encoder(**inputs) - text_outputs = text_outputs[0] - text_outputs = normalize( - self.model.text_proj(text_outputs[:, 0, :]), dim=-1 - ) - all_text_embeddings.append(text_outputs.cpu()) - - all_text_embeddings = torch.cat(all_text_embeddings, dim=0) - return all_text_embeddings - - def get_image_embeddings( - self, images: list[Image.Image] | DataLoader, batch_size: int = 32 - ): - all_image_embeddings = [] - - if isinstance(images, DataLoader): - with torch.no_grad(): - for batch in tqdm(images): - inputs = self.processor( - images=batch, return_tensors="pt", padding=True - ) - inputs = {k: v.to(self.device) for k, v in inputs.items()} - image_outputs = self.model.vision_model(**inputs) - image_outputs = image_outputs[0] - image_outputs = normalize( - self.model.vision_proj(image_outputs[:, 0, :]), dim=-1 - ) - all_image_embeddings.append(image_outputs.cpu()) - else: with torch.no_grad(): - for i in tqdm(range(0, len(images), batch_size)): - batch_images = images[i : i + batch_size] + for i in tqdm(range(0, len(texts), batch_size)): + batch_texts = texts[i : i + batch_size] inputs = self.processor( - images=batch_images, return_tensors="pt", padding=True + text=batch_texts, return_tensors="pt", padding=True, truncation=True ) inputs = {k: v.to(self.device) for k, v in inputs.items()} - image_outputs = self.model.get_image_features(**inputs) - image_outputs = self.model.vision_model(**inputs) - image_outputs = image_outputs[0] - image_outputs = normalize( - self.model.vision_proj(image_outputs[:, 0, :]), dim=-1 - ) - all_image_embeddings.append(image_outputs.cpu()) - - all_image_embeddings = torch.cat(all_image_embeddings, dim=0) - return all_image_embeddings - - def calculate_probs(self, text_embeddings, image_embeddings): - text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True) - image_embeddings = image_embeddings / image_embeddings.norm( - dim=-1, keepdim=True - ) - logits = torch.matmul(image_embeddings, text_embeddings.T) - probs = (logits * 100).softmax(dim=-1) - return probs - - def get_fused_embeddings( - self, - texts: list[str] = None, - images: list[Image.Image] | DataLoader = None, - fusion_mode="sum", - batch_size: int = 32, - ): - # TODO: find out if BLIP has a prescribed way of fusing text and image embeddings - if texts is None and images is None: - raise ValueError("Either texts or images must be provided") - - text_embeddings = None - image_embeddings = None - - if texts is not None: - text_embeddings = self.get_text_embeddings(texts, batch_size) - - if images is not None: - image_embeddings = self.get_image_embeddings(images, batch_size) - - if text_embeddings is not None and image_embeddings is not None: - if len(text_embeddings) != len(image_embeddings): - raise ValueError( - "The number of texts and images must have the same length" - ) - if fusion_mode == "sum": - fused_embeddings = text_embeddings + image_embeddings + + text_outputs = self.model.forward_text(**inputs) + text_outputs = torch.functional.normalize(self.model.text_proj(text_outputs)) + all_text_embeddings.append(text_outputs.cpu()) + + all_text_embeddings = torch.cat(all_text_embeddings, dim=0) + return all_text_embeddings + + def get_image_embeddings( + self, images: list[Image.Image] | DataLoader, batch_size: int = 32 + ): + all_image_embeddings = [] + + if isinstance(images, DataLoader): + with torch.no_grad(): + for batch in tqdm(images): + inputs = self.processor( + images=batch, return_tensors="pt", padding=True + ) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + image_outputs = self.model.vision_model(**inputs) + image_outputs = image_outputs[0] + image_outputs = normalize( + self.model.vision_proj(image_outputs[:, 0, :]), dim=-1 + ) + all_image_embeddings.append(image_outputs.cpu()) else: - # to do: add other fusion mode - raise ValueError(f"fusion mode {fusion_mode} hasn't been implemented") - return fused_embeddings - elif text_embeddings is not None: - return text_embeddings - elif image_embeddings is not None: - return image_embeddings + with torch.no_grad(): + for i in tqdm(range(0, len(images), batch_size)): + batch_images = images[i : i + batch_size] + inputs = self.processor( + images=batch_images, return_tensors="pt", padding=True + ) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + image_outputs = self.model.get_image_features(**inputs) + image_outputs = self.model.vision_model(**inputs) + image_outputs = image_outputs[0] + image_outputs = normalize( + self.model.vision_proj(image_outputs[:, 0, :]), dim=-1 + ) + all_image_embeddings.append(image_outputs.cpu()) + + all_image_embeddings = torch.cat(all_image_embeddings, dim=0) + return all_image_embeddings + + def calculate_probs(self, text_embeddings, image_embeddings): + text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True) + image_embeddings = image_embeddings / image_embeddings.norm( + dim=-1, keepdim=True + ) + logits = torch.matmul(image_embeddings, text_embeddings.T) + probs = (logits * 100).softmax(dim=-1) + return probs + + def get_fused_embeddings( + self, + texts: list[str] = None, + images: list[Image.Image] | DataLoader = None, + fusion_mode="sum", + batch_size: int = 32, + ): + # TODO: find out if BLIP has a prescribed way of fusing text and image embeddings + if texts is None and images is None: + raise ValueError("Either texts or images must be provided") + + text_embeddings = None + image_embeddings = None + + if texts is not None: + text_embeddings = self.get_text_embeddings(texts, batch_size) + + if images is not None: + image_embeddings = self.get_image_embeddings(images, batch_size) + + if text_embeddings is not None and image_embeddings is not None: + if len(text_embeddings) != len(image_embeddings): + raise ValueError( + "The number of texts and images must have the same length" + ) + if fusion_mode == "sum": + fused_embeddings = text_embeddings + image_embeddings + else: + # to do: add other fusion mode + raise ValueError(f"fusion mode {fusion_mode} hasn't been implemented") + return fused_embeddings + elif text_embeddings is not None: + return text_embeddings + elif image_embeddings is not None: + return image_embeddings + + return BLIP2ModelWrapper(**kwargs) """ - Salesforce/blip2-opt-2.7b Image-to-Text • Updated Mar 22 • 588k • @@ -167,7 +172,7 @@ def get_fused_embeddings( blip2_opt_2_7b = ModelMeta( loader=partial( - BLIP2ModelWrapper, + blip2_loader, model_name="Salesforce/blip2-opt-2.7b", ), name="Salesforce/blip2-opt-2.7b", @@ -179,7 +184,7 @@ def get_fused_embeddings( blip2_flan_t5_xxl = ModelMeta( loader=partial( - BLIP2ModelWrapper, + blip2_loader, model_name="Salesforce/blip2-flan-t5-xxl", ), name="Salesforce/blip2-flan-t5-xxl", @@ -191,7 +196,7 @@ def get_fused_embeddings( blip2_opt_6_7b_coco = ModelMeta( loader=partial( - BLIP2ModelWrapper, + blip2_loader, model_name="Salesforce/blip2-opt-6.7b-coco", ), name="Salesforce/blip2-opt-6.7b-coco", @@ -203,7 +208,7 @@ def get_fused_embeddings( blip2_opt_6_7b = ModelMeta( loader=partial( - BLIP2ModelWrapper, + blip2_loader, model_name="Salesforce/blip2-opt-6.7b", ), name="Salesforce/blip2-opt-6.7b", @@ -215,7 +220,7 @@ def get_fused_embeddings( blip2_flan_t5_xl = ModelMeta( loader=partial( - BLIP2ModelWrapper, + blip2_loader, model_name="Salesforce/blip2-flan-t5-xl", ), name="Salesforce/blip2-flan-t5-xl", @@ -228,7 +233,7 @@ def get_fused_embeddings( if __name__ == "__main__": import mteb - mdl = mteb.get_model(blip2_opt_2_7b.name, blip2_opt_2_7b.revision) + mdl = mteb.get_model(blip2_opt_2_7b.name, blip2_opt_2_7b.revision, device="cpu") emb = mdl.get_text_embeddings(["Hello, world!"]) emb2 = mdl.get_text_embeddings(["Hello there, world!"]) emb3 = mdl.get_text_embeddings(["Goodbye, person!"]) From 1f2f8c3d0960cd0af18620060262a53bfbe91f5c Mon Sep 17 00:00:00 2001 From: Jamie-Stirling Date: Sun, 15 Sep 2024 21:18:05 +0100 Subject: [PATCH 06/17] feat: add blip2 models, still mismatched names --- mteb/models/blip2_models.py | 133 ++++++++++++++++++------------------ 1 file changed, 66 insertions(+), 67 deletions(-) diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py index 16acabc0ef..12dc1cfa51 100644 --- a/mteb/models/blip2_models.py +++ b/mteb/models/blip2_models.py @@ -2,20 +2,21 @@ from functools import partial from typing import Any +from types import SimpleNamespace import torch from PIL import Image from torch.nn.functional import normalize from torch.utils.data import DataLoader from tqdm import tqdm -from transformers import Blip2Processor +from transformers import Blip2Processor, BertTokenizer from mteb.model_meta import ModelMeta def blip2_loader(**kwargs): try: # a temporal fix for the dependency issues of vista models. from lavis.models import load_model_and_preprocess - + from lavis.models.blip2_models.blip2_image_text_matching import Blip2ITM, Blip2Qformer except ImportError: raise ImportError( "Please install `pip install salesforce-lavis` to use BLIP-2 models." @@ -30,8 +31,7 @@ def __init__( ): self.model_name = model_name self.device = device - self.model, self.vis_processors, self.txt_processors = load_model_and_preprocess(name="blip2-opt-2.7b", model_type="base") - self.model = self.model.to(self.device) + self.model = Blip2ITM.from_pretrained("pretrain").to(self.device).float() self.processor = Blip2Processor.from_pretrained(model_name) def preprocess( @@ -49,13 +49,15 @@ def get_text_embeddings(self, texts: list[str], batch_size: int = 32): with torch.no_grad(): for i in tqdm(range(0, len(texts), batch_size)): batch_texts = texts[i : i + batch_size] - inputs = self.processor( - text=batch_texts, return_tensors="pt", padding=True, truncation=True - ) - inputs = {k: v.to(self.device) for k, v in inputs.items()} - - text_outputs = self.model.forward_text(**inputs) - text_outputs = torch.functional.normalize(self.model.text_proj(text_outputs)) + text_tokens = self.model.tokenizer( + batch_texts, + padding="max_length", + truncation=True, + max_length=self.model.max_txt_len, + return_tensors="pt", + ).to(self.device) + text_outputs = self.model.forward_text(text_tokens) + text_outputs = normalize(self.model.text_proj(text_outputs)) all_text_embeddings.append(text_outputs.cpu()) all_text_embeddings = torch.cat(all_text_embeddings, dim=0) @@ -72,8 +74,7 @@ def get_image_embeddings( inputs = self.processor( images=batch, return_tensors="pt", padding=True ) - inputs = {k: v.to(self.device) for k, v in inputs.items()} - image_outputs = self.model.vision_model(**inputs) + image_outputs = self.model.forward_image(inputs["pixel_values"].to(self.device)) image_outputs = image_outputs[0] image_outputs = normalize( self.model.vision_proj(image_outputs[:, 0, :]), dim=-1 @@ -85,10 +86,8 @@ def get_image_embeddings( batch_images = images[i : i + batch_size] inputs = self.processor( images=batch_images, return_tensors="pt", padding=True - ) - inputs = {k: v.to(self.device) for k, v in inputs.items()} - image_outputs = self.model.get_image_features(**inputs) - image_outputs = self.model.vision_model(**inputs) + )["pixel_values"].to(self.device) + image_outputs = self.model.forward_image(inputs) image_outputs = image_outputs[0] image_outputs = normalize( self.model.vision_proj(image_outputs[:, 0, :]), dim=-1 @@ -98,6 +97,43 @@ def get_image_embeddings( all_image_embeddings = torch.cat(all_image_embeddings, dim=0) return all_image_embeddings + def get_multimodal_embeddings( + self, texts, images, batch_size + ): + all_multimodal_embeddings = [] + + with torch.no_grad(): + if isinstance(images, DataLoader): + for batch_images, i in tqdm(zip(images, range(0, len(texts), batch_size))): + batch_texts = texts[i : i + batch_size] + + image_inputs = self.processor( + images=batch_images, return_tensors="pt", padding=True + )["pixel_values"].to(self.device) + multimodal_outputs = self.model.extract_features({ + "text_input": batch_texts, + "image": image_inputs + }).multimodal_embeds + + all_multimodal_embeddings.append(multimodal_outputs.cpu()) + else: + for i in tqdm(range(0, len(texts), batch_size)): + batch_images = images[i : i + batch_size] + batch_texts = texts[i : i + batch_size] + + image_inputs = self.processor( + images=batch_images, return_tensors="pt", padding=True + )["pixel_values"].to(self.device) + multimodal_outputs = self.model.extract_features({ + "text_input": batch_texts, + "image": image_inputs + }).multimodal_embeds + + all_multimodal_embeddings.append(multimodal_outputs.cpu()) + + + return torch.cat(all_multimodal_embeddings, dim=0) + def calculate_probs(self, text_embeddings, image_embeddings): text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True) image_embeddings = image_embeddings / image_embeddings.norm( @@ -111,7 +147,7 @@ def get_fused_embeddings( self, texts: list[str] = None, images: list[Image.Image] | DataLoader = None, - fusion_mode="sum", + fusion_mode="multimodal", batch_size: int = 32, ): # TODO: find out if BLIP has a prescribed way of fusing text and image embeddings @@ -134,6 +170,8 @@ def get_fused_embeddings( ) if fusion_mode == "sum": fused_embeddings = text_embeddings + image_embeddings + if fusion_mode == "multimodal": + fused_embeddings = self.get_multimodal_embeddings(texts, images, batch_size) else: # to do: add other fusion mode raise ValueError(f"fusion mode {fusion_mode} hasn't been implemented") @@ -170,7 +208,7 @@ def get_fused_embeddings( """ # in descending order of usage (downloads from huggingface) -blip2_opt_2_7b = ModelMeta( +blip2_image_text_matching = ModelMeta( loader=partial( blip2_loader, model_name="Salesforce/blip2-opt-2.7b", @@ -182,58 +220,12 @@ def get_fused_embeddings( release_date="2024-03-22", ) -blip2_flan_t5_xxl = ModelMeta( - loader=partial( - blip2_loader, - model_name="Salesforce/blip2-flan-t5-xxl", - ), - name="Salesforce/blip2-flan-t5-xxl", - languages=["eng_Latn"], - open_source=True, - revision="43206cbc865b9d5b3dd7d080e5d94b4143ca8e74", - release_date="2024-03-29", -) - -blip2_opt_6_7b_coco = ModelMeta( - loader=partial( - blip2_loader, - model_name="Salesforce/blip2-opt-6.7b-coco", - ), - name="Salesforce/blip2-opt-6.7b-coco", - languages=["eng_Latn"], - open_source=True, - revision="0d580de59320a25a4d2c386387bcef310d5f286e", - release_date="2024-03-31", -) - -blip2_opt_6_7b = ModelMeta( - loader=partial( - blip2_loader, - model_name="Salesforce/blip2-opt-6.7b", - ), - name="Salesforce/blip2-opt-6.7b", - languages=["eng_Latn"], - open_source=True, - revision="1d33d60155fd1323b97556e0f1dd5148a9749f5b", - release_date="2024-03-27", -) - -blip2_flan_t5_xl = ModelMeta( - loader=partial( - blip2_loader, - model_name="Salesforce/blip2-flan-t5-xl", - ), - name="Salesforce/blip2-flan-t5-xl", - languages=["eng_Latn"], - open_source=True, - revision="e5025a34e3e769e72e2aab7f7bfd00bc84d5fd77", - release_date="2023-12-13", -) if __name__ == "__main__": import mteb + import PIL.Image - mdl = mteb.get_model(blip2_opt_2_7b.name, blip2_opt_2_7b.revision, device="cpu") + mdl = mteb.get_model(blip2_image_text_matching.name, blip2_image_text_matching.revision, device="cpu") emb = mdl.get_text_embeddings(["Hello, world!"]) emb2 = mdl.get_text_embeddings(["Hello there, world!"]) emb3 = mdl.get_text_embeddings(["Goodbye, person!"]) @@ -243,3 +235,10 @@ def get_fused_embeddings( sim = torch.nn.functional.cosine_similarity(emb, emb3) print(sim) + + cat_img = Image.open("cat.jpg") + cat_text = "An image of a cat" + + multi_emv = mdl.get_multimodal_embeddings([cat_text], [cat_img], 32) + + From 8c6486087ad2790b724c03110b149e337f77b9b0 Mon Sep 17 00:00:00 2001 From: Jamie-Stirling Date: Sun, 15 Sep 2024 21:28:31 +0100 Subject: [PATCH 07/17] fix: remove projections from image and text embeddings --- mteb/models/blip2_models.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py index 12dc1cfa51..86f0676b0e 100644 --- a/mteb/models/blip2_models.py +++ b/mteb/models/blip2_models.py @@ -57,7 +57,7 @@ def get_text_embeddings(self, texts: list[str], batch_size: int = 32): return_tensors="pt", ).to(self.device) text_outputs = self.model.forward_text(text_tokens) - text_outputs = normalize(self.model.text_proj(text_outputs)) + #text_outputs = normalize(self.model.text_proj(text_outputs)) all_text_embeddings.append(text_outputs.cpu()) all_text_embeddings = torch.cat(all_text_embeddings, dim=0) @@ -75,10 +75,8 @@ def get_image_embeddings( images=batch, return_tensors="pt", padding=True ) image_outputs = self.model.forward_image(inputs["pixel_values"].to(self.device)) - image_outputs = image_outputs[0] - image_outputs = normalize( - self.model.vision_proj(image_outputs[:, 0, :]), dim=-1 - ) + image_outputs = image_outputs[0][:, 0, :] + #image_outputs = normalize(self.model.vision_proj(image_outputs), dim=-1) all_image_embeddings.append(image_outputs.cpu()) else: with torch.no_grad(): @@ -98,7 +96,7 @@ def get_image_embeddings( return all_image_embeddings def get_multimodal_embeddings( - self, texts, images, batch_size + self, texts, images, batch_size=32 ): all_multimodal_embeddings = [] @@ -113,7 +111,7 @@ def get_multimodal_embeddings( multimodal_outputs = self.model.extract_features({ "text_input": batch_texts, "image": image_inputs - }).multimodal_embeds + }).multimodal_embeds[:,0,:] all_multimodal_embeddings.append(multimodal_outputs.cpu()) else: @@ -127,7 +125,7 @@ def get_multimodal_embeddings( multimodal_outputs = self.model.extract_features({ "text_input": batch_texts, "image": image_inputs - }).multimodal_embeds + }).multimodal_embeds[:,0,:] all_multimodal_embeddings.append(multimodal_outputs.cpu()) @@ -239,6 +237,16 @@ def get_fused_embeddings( cat_img = Image.open("cat.jpg") cat_text = "An image of a cat" - multi_emv = mdl.get_multimodal_embeddings([cat_text], [cat_img], 32) + multi_cat_emb = mdl.get_multimodal_embeddings([cat_text], [cat_img]) + text_cat_emb = mdl.get_text_embeddings(["An photo of a cat"]) + text_dog_emb = mdl.get_text_embeddings(["An image of a dog"]) + + print(multi_cat_emb.shape) + + sim1 = torch.nn.functional.cosine_similarity(multi_cat_emb, text_cat_emb) + sim2 = torch.nn.functional.cosine_similarity(multi_cat_emb, text_dog_emb) + + print(sim1, sim2) + From 20839ca93b114bf4a5011aeaa41846a9f0f32482 Mon Sep 17 00:00:00 2001 From: Jamie-Stirling Date: Sun, 15 Sep 2024 21:52:30 +0100 Subject: [PATCH 08/17] make lint --- mteb/models/__init__.py | 2 +- mteb/models/blip2_models.py | 95 +++++++++++++++---------------------- 2 files changed, 40 insertions(+), 57 deletions(-) diff --git a/mteb/models/__init__.py b/mteb/models/__init__.py index 2229b70239..eabe5a2d3f 100644 --- a/mteb/models/__init__.py +++ b/mteb/models/__init__.py @@ -10,8 +10,8 @@ from mteb.models import ( align_models, bge_models, - blip_models, blip2_models, + blip_models, bm25, clip_models, cohere_models, diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py index 86f0676b0e..b735b65cf3 100644 --- a/mteb/models/blip2_models.py +++ b/mteb/models/blip2_models.py @@ -2,26 +2,29 @@ from functools import partial from typing import Any -from types import SimpleNamespace import torch from PIL import Image from torch.nn.functional import normalize from torch.utils.data import DataLoader from tqdm import tqdm -from transformers import Blip2Processor, BertTokenizer +from transformers import Blip2Processor from mteb.model_meta import ModelMeta + def blip2_loader(**kwargs): try: # a temporal fix for the dependency issues of vista models. from lavis.models import load_model_and_preprocess - from lavis.models.blip2_models.blip2_image_text_matching import Blip2ITM, Blip2Qformer + from lavis.models.blip2_models.blip2_image_text_matching import ( + Blip2ITM, + Blip2Qformer, + ) except ImportError: raise ImportError( "Please install `pip install salesforce-lavis` to use BLIP-2 models." ) - + class BLIP2ModelWrapper: def __init__( self, @@ -57,7 +60,7 @@ def get_text_embeddings(self, texts: list[str], batch_size: int = 32): return_tensors="pt", ).to(self.device) text_outputs = self.model.forward_text(text_tokens) - #text_outputs = normalize(self.model.text_proj(text_outputs)) + # text_outputs = normalize(self.model.text_proj(text_outputs)) all_text_embeddings.append(text_outputs.cpu()) all_text_embeddings = torch.cat(all_text_embeddings, dim=0) @@ -74,9 +77,11 @@ def get_image_embeddings( inputs = self.processor( images=batch, return_tensors="pt", padding=True ) - image_outputs = self.model.forward_image(inputs["pixel_values"].to(self.device)) + image_outputs = self.model.forward_image( + inputs["pixel_values"].to(self.device) + ) image_outputs = image_outputs[0][:, 0, :] - #image_outputs = normalize(self.model.vision_proj(image_outputs), dim=-1) + # image_outputs = normalize(self.model.vision_proj(image_outputs), dim=-1) all_image_embeddings.append(image_outputs.cpu()) else: with torch.no_grad(): @@ -95,23 +100,22 @@ def get_image_embeddings( all_image_embeddings = torch.cat(all_image_embeddings, dim=0) return all_image_embeddings - def get_multimodal_embeddings( - self, texts, images, batch_size=32 - ): + def get_multimodal_embeddings(self, texts, images, batch_size=32): all_multimodal_embeddings = [] with torch.no_grad(): if isinstance(images, DataLoader): - for batch_images, i in tqdm(zip(images, range(0, len(texts), batch_size))): + for batch_images, i in tqdm( + zip(images, range(0, len(texts), batch_size)) + ): batch_texts = texts[i : i + batch_size] - - image_inputs = self.processor( + + image_inputs = self.processor( images=batch_images, return_tensors="pt", padding=True )["pixel_values"].to(self.device) - multimodal_outputs = self.model.extract_features({ - "text_input": batch_texts, - "image": image_inputs - }).multimodal_embeds[:,0,:] + multimodal_outputs = self.model.extract_features( + {"text_input": batch_texts, "image": image_inputs} + ).multimodal_embeds[:, 0, :] all_multimodal_embeddings.append(multimodal_outputs.cpu()) else: @@ -119,21 +123,21 @@ def get_multimodal_embeddings( batch_images = images[i : i + batch_size] batch_texts = texts[i : i + batch_size] - image_inputs = self.processor( + image_inputs = self.processor( images=batch_images, return_tensors="pt", padding=True )["pixel_values"].to(self.device) - multimodal_outputs = self.model.extract_features({ - "text_input": batch_texts, - "image": image_inputs - }).multimodal_embeds[:,0,:] + multimodal_outputs = self.model.extract_features( + {"text_input": batch_texts, "image": image_inputs} + ).multimodal_embeds[:, 0, :] all_multimodal_embeddings.append(multimodal_outputs.cpu()) - return torch.cat(all_multimodal_embeddings, dim=0) def calculate_probs(self, text_embeddings, image_embeddings): - text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True) + text_embeddings = text_embeddings / text_embeddings.norm( + dim=-1, keepdim=True + ) image_embeddings = image_embeddings / image_embeddings.norm( dim=-1, keepdim=True ) @@ -169,42 +173,22 @@ def get_fused_embeddings( if fusion_mode == "sum": fused_embeddings = text_embeddings + image_embeddings if fusion_mode == "multimodal": - fused_embeddings = self.get_multimodal_embeddings(texts, images, batch_size) + fused_embeddings = self.get_multimodal_embeddings( + texts, images, batch_size + ) else: # to do: add other fusion mode - raise ValueError(f"fusion mode {fusion_mode} hasn't been implemented") + raise ValueError( + f"fusion mode {fusion_mode} hasn't been implemented" + ) return fused_embeddings elif text_embeddings is not None: return text_embeddings elif image_embeddings is not None: return image_embeddings - - return BLIP2ModelWrapper(**kwargs) + return BLIP2ModelWrapper(**kwargs) -""" -Salesforce/blip2-opt-2.7b -Image-to-Text • Updated Mar 22 • -588k • -296 -Salesforce/blip2-flan-t5-xxl -Image-to-Text • Updated Mar 29 • -9.23k • -84 -Salesforce/blip2-opt-6.7b-coco -Image-to-Text • Updated Mar 31 • -1.51k • -28 -Salesforce/blip2-opt-6.7b -Image-to-Text • Updated Mar 27 • -4.93k • -71 -Salesforce/blip2-flan-t5-xl -Image-to-Text • Updated Dec 13, 2023 • -95.9k • -56 -""" -# in descending order of usage (downloads from huggingface) blip2_image_text_matching = ModelMeta( loader=partial( @@ -220,10 +204,12 @@ def get_fused_embeddings( if __name__ == "__main__": + import mteb - import PIL.Image - mdl = mteb.get_model(blip2_image_text_matching.name, blip2_image_text_matching.revision, device="cpu") + mdl = mteb.get_model( + blip2_image_text_matching.name, blip2_image_text_matching.revision, device="cpu" + ) emb = mdl.get_text_embeddings(["Hello, world!"]) emb2 = mdl.get_text_embeddings(["Hello there, world!"]) emb3 = mdl.get_text_embeddings(["Goodbye, person!"]) @@ -247,6 +233,3 @@ def get_fused_embeddings( sim2 = torch.nn.functional.cosine_similarity(multi_cat_emb, text_dog_emb) print(sim1, sim2) - - - From ec47c690261169ac8d197af476eccb0ae32a187d Mon Sep 17 00:00:00 2001 From: Jamie-Stirling Date: Sun, 15 Sep 2024 22:17:35 +0100 Subject: [PATCH 09/17] wip: add coco BLIP2 --- mteb/models/blip2_models.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py index b735b65cf3..aedb03e24f 100644 --- a/mteb/models/blip2_models.py +++ b/mteb/models/blip2_models.py @@ -18,7 +18,6 @@ def blip2_loader(**kwargs): from lavis.models import load_model_and_preprocess from lavis.models.blip2_models.blip2_image_text_matching import ( Blip2ITM, - Blip2Qformer, ) except ImportError: raise ImportError( @@ -34,7 +33,8 @@ def __init__( ): self.model_name = model_name self.device = device - self.model = Blip2ITM.from_pretrained("pretrain").to(self.device).float() + model_type = "coco" if "coco" in model_name else "pretrain" + self.model = Blip2ITM.from_pretrained(model_type).to(self.device).float() self.processor = Blip2Processor.from_pretrained(model_name) def preprocess( @@ -190,7 +190,7 @@ def get_fused_embeddings( return BLIP2ModelWrapper(**kwargs) -blip2_image_text_matching = ModelMeta( +blip2_opt_2_7b = ModelMeta( loader=partial( blip2_loader, model_name="Salesforce/blip2-opt-2.7b", @@ -202,13 +202,25 @@ def get_fused_embeddings( release_date="2024-03-22", ) +blip2_opt_6_7b_coco = ModelMeta( + loader=partial( + blip2_loader, + model_name="Salesforce/blip2-opt-6.7b-coco", + ), + name="Salesforce/blip2-opt-6.7b-coco", + languages=["eng_Latn"], + open_source=True, + revision="0d580de59320a25a4d2c386387bcef310d5f286e", + release_date="2024-03-31", +) + if __name__ == "__main__": import mteb mdl = mteb.get_model( - blip2_image_text_matching.name, blip2_image_text_matching.revision, device="cpu" + blip2_opt_2_7b.name, blip2_opt_2_7b.revision, device="cpu" ) emb = mdl.get_text_embeddings(["Hello, world!"]) emb2 = mdl.get_text_embeddings(["Hello there, world!"]) From e8f4ae1b6cdc455c7ac06d69bf6433936fac1ef4 Mon Sep 17 00:00:00 2001 From: Jamie-Stirling Date: Mon, 16 Sep 2024 12:06:47 +0100 Subject: [PATCH 10/17] fix: BLIP2 better zero-shot classification without text_proj and vision_proj --- mteb/models/blip2_models.py | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py index aedb03e24f..9cac90d6f0 100644 --- a/mteb/models/blip2_models.py +++ b/mteb/models/blip2_models.py @@ -35,6 +35,8 @@ def __init__( self.device = device model_type = "coco" if "coco" in model_name else "pretrain" self.model = Blip2ITM.from_pretrained(model_type).to(self.device).float() + # print numbr of parameters + print(f"Number of parameters: {sum(p.numel() for p in self.model.parameters())}") self.processor = Blip2Processor.from_pretrained(model_name) def preprocess( @@ -60,7 +62,7 @@ def get_text_embeddings(self, texts: list[str], batch_size: int = 32): return_tensors="pt", ).to(self.device) text_outputs = self.model.forward_text(text_tokens) - # text_outputs = normalize(self.model.text_proj(text_outputs)) + #text_outputs = normalize(self.model.text_proj(text_outputs)) all_text_embeddings.append(text_outputs.cpu()) all_text_embeddings = torch.cat(all_text_embeddings, dim=0) @@ -81,7 +83,7 @@ def get_image_embeddings( inputs["pixel_values"].to(self.device) ) image_outputs = image_outputs[0][:, 0, :] - # image_outputs = normalize(self.model.vision_proj(image_outputs), dim=-1) + #image_outputs = normalize(self.model.vision_proj(image_outputs), dim=-1) all_image_embeddings.append(image_outputs.cpu()) else: with torch.no_grad(): @@ -91,10 +93,8 @@ def get_image_embeddings( images=batch_images, return_tensors="pt", padding=True )["pixel_values"].to(self.device) image_outputs = self.model.forward_image(inputs) - image_outputs = image_outputs[0] - image_outputs = normalize( - self.model.vision_proj(image_outputs[:, 0, :]), dim=-1 - ) + image_outputs = image_outputs[0][:, 0, :] + #image_outputs = normalize(self.model.vision_proj(image_outputs), dim=-1) all_image_embeddings.append(image_outputs.cpu()) all_image_embeddings = torch.cat(all_image_embeddings, dim=0) @@ -105,6 +105,11 @@ def get_multimodal_embeddings(self, texts, images, batch_size=32): with torch.no_grad(): if isinstance(images, DataLoader): + # check dataloader batch size is the same as batch size + if images.batch_size != batch_size: + raise ValueError( + "Image DataLoader batch size must be the same as the given batch size: " + str(batch_size) + ) for batch_images, i in tqdm( zip(images, range(0, len(texts), batch_size)) ): @@ -117,6 +122,8 @@ def get_multimodal_embeddings(self, texts, images, batch_size=32): {"text_input": batch_texts, "image": image_inputs} ).multimodal_embeds[:, 0, :] + #multimodal_outputs = normalize(self.model.text_proj(multimodal_outputs), dim=-1) + all_multimodal_embeddings.append(multimodal_outputs.cpu()) else: for i in tqdm(range(0, len(texts), batch_size)): @@ -130,6 +137,8 @@ def get_multimodal_embeddings(self, texts, images, batch_size=32): {"text_input": batch_texts, "image": image_inputs} ).multimodal_embeds[:, 0, :] + #multimodal_outputs = normalize(self.model.text_proj(multimodal_outputs), dim=-1) + all_multimodal_embeddings.append(multimodal_outputs.cpu()) return torch.cat(all_multimodal_embeddings, dim=0) @@ -172,7 +181,7 @@ def get_fused_embeddings( ) if fusion_mode == "sum": fused_embeddings = text_embeddings + image_embeddings - if fusion_mode == "multimodal": + elif fusion_mode == "multimodal": fused_embeddings = self.get_multimodal_embeddings( texts, images, batch_size ) @@ -235,13 +244,21 @@ def get_fused_embeddings( cat_img = Image.open("cat.jpg") cat_text = "An image of a cat" - multi_cat_emb = mdl.get_multimodal_embeddings([cat_text], [cat_img]) + multi_cat_emb = mdl.get_fused_embeddings(["A photo of an animal"], [cat_img], fusion_mode="multimodal") + multi_conflicting_emb = mdl.get_fused_embeddings(["A photo of a dog"], [cat_img], fusion_mode="multimodal") + image_cat_emb = mdl.get_image_embeddings([cat_img]) text_cat_emb = mdl.get_text_embeddings(["An photo of a cat"]) text_dog_emb = mdl.get_text_embeddings(["An image of a dog"]) print(multi_cat_emb.shape) - sim1 = torch.nn.functional.cosine_similarity(multi_cat_emb, text_cat_emb) - sim2 = torch.nn.functional.cosine_similarity(multi_cat_emb, text_dog_emb) + sim1 = torch.nn.functional.cosine_similarity(image_cat_emb, text_cat_emb) + sim2 = torch.nn.functional.cosine_similarity(image_cat_emb, text_dog_emb) + sim3 = torch.nn.functional.cosine_similarity(multi_cat_emb, text_cat_emb) + sim4 = torch.nn.functional.cosine_similarity(multi_cat_emb, text_dog_emb) + sim5 = torch.nn.functional.cosine_similarity(multi_conflicting_emb, text_cat_emb) + print(sim1, sim2) + + print(sim3, sim4, sim5) From 57bc3b8d4e98e4d29c2e44fb12e1bdce1c263cc1 Mon Sep 17 00:00:00 2001 From: Jamie-Stirling Date: Thu, 19 Sep 2024 16:06:02 +0100 Subject: [PATCH 11/17] tidy blip2 --- mteb/models/blip2_models.py | 2 +- mteb/tasks/Image/Clustering/__init__.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py index 9cac90d6f0..2195f42c12 100644 --- a/mteb/models/blip2_models.py +++ b/mteb/models/blip2_models.py @@ -158,7 +158,7 @@ def get_fused_embeddings( self, texts: list[str] = None, images: list[Image.Image] | DataLoader = None, - fusion_mode="multimodal", + fusion_mode="sum", batch_size: int = 32, ): # TODO: find out if BLIP has a prescribed way of fusing text and image embeddings diff --git a/mteb/tasks/Image/Clustering/__init__.py b/mteb/tasks/Image/Clustering/__init__.py index fd9a71ec19..9ce1b567e6 100644 --- a/mteb/tasks/Image/Clustering/__init__.py +++ b/mteb/tasks/Image/Clustering/__init__.py @@ -2,3 +2,4 @@ from .eng.CIFAR import * from .eng.TinyImageNet import * +from .eng.ImageNet import * From 4cbec1bc6957fcc74f35888191ee2cf620bdaa4c Mon Sep 17 00:00:00 2001 From: Jamie-Stirling Date: Thu, 19 Sep 2024 16:09:32 +0100 Subject: [PATCH 12/17] add imagenet-dog-15 dataset --- mteb/tasks/Image/Clustering/eng/ImageNet.py | 71 +++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 mteb/tasks/Image/Clustering/eng/ImageNet.py diff --git a/mteb/tasks/Image/Clustering/eng/ImageNet.py b/mteb/tasks/Image/Clustering/eng/ImageNet.py new file mode 100644 index 0000000000..0efe69f844 --- /dev/null +++ b/mteb/tasks/Image/Clustering/eng/ImageNet.py @@ -0,0 +1,71 @@ +from __future__ import annotations + +import io +import PIL.Image as Image +from mteb.abstasks.Image.AbsTaskImageClustering import AbsTaskImageClustering +from mteb.abstasks.TaskMetadata import TaskMetadata + +""" +Classes: +1.MALTESE DOG +2.BLENHEIM SPANIEL +3.BASSET +4.NORWEGIAN ELKHOUND +5.GIANT SCHNAUZER +6.GOLDEN RETRIEVER +7.BRITTANY SPANIEL +8.CLUMBER +9.WELSH SPRINGER SPANIEL +10.GROENENDAEL +11.KELPIE +12.SHETLAND SHEEPDOG +13.DOBERMAN +14.PUG +15.CHOW +""" + +class ImageNetDog15Clustering(AbsTaskImageClustering): + metadata = TaskMetadata( + name="ImageNetDog15Clustering", + description="Clustering images from a 15-class dogs-only subset of the dog classes in ImageNet.", + reference="http://vision.stanford.edu/aditya86/ImageNetDogs/main.html", + dataset={ + "path": "JamieSJS/imagenet-dog-15", + "revision": "bfb6ad3b2109d26c9daddf14f98d315daa35ee72", + }, + type="Clustering", + category="i2t", + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=( + "2009-06-20", + "2009-06-20" + ), # Conference date + domains=["Web"], + task_subtypes=["Object recognition"], + license="Not specified", + socioeconomic_status="mixed", + annotations_creators="derived", + dialect=[], + modalities=["image"], + sample_creation="created", + bibtex_citation=""" @INPROCEEDINGS{5206848, + author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Kai Li and Li Fei-Fei}, + booktitle={2009 IEEE Conference on Computer Vision and Pattern Recognition}, + title={ImageNet: A large-scale hierarchical image database}, + year={2009}, + volume={}, + number={}, + pages={248-255}, + keywords={Large-scale systems;Image databases;Explosions;Internet;Robustness;Information retrieval;Image retrieval;Multimedia databases;Ontologies;Spine}, + doi={10.1109/CVPR.2009.5206848}} + """, + descriptive_stats={ + "n_samples": {"test": 1076, "train":1500}, + #"avg_character_length": {"test": 431.4}, + }, + ) + + + From 35be38d82a6b2f43245e7094d0b62b037ceb13e6 Mon Sep 17 00:00:00 2001 From: Jamie-Stirling Date: Thu, 19 Sep 2024 16:26:57 +0100 Subject: [PATCH 13/17] tidy and lint --- .../evaluators/Image/VisualSTSEvaluator.py | 6 ++-- mteb/models/blip2_models.py | 32 +++++++++--------- mteb/tasks/Image/Clustering/__init__.py | 2 +- mteb/tasks/Image/Clustering/eng/ImageNet.py | 33 ++----------------- mteb/tasks/Image/VisualSTS/__init__.py | 2 ++ .../Image/VisualSTS/en/STS12VisualSTS.py | 2 +- .../Image/VisualSTS/en/STS13VisualSTS.py | 2 +- .../Image/VisualSTS/en/STS14VisualSTS.py | 2 +- .../Image/VisualSTS/en/STS15VisualSTS.py | 2 +- .../Image/VisualSTS/en/STS16VisualSTS.py | 2 +- 10 files changed, 31 insertions(+), 54 deletions(-) diff --git a/mteb/evaluation/evaluators/Image/VisualSTSEvaluator.py b/mteb/evaluation/evaluators/Image/VisualSTSEvaluator.py index d47e060e75..a442eb6a9a 100644 --- a/mteb/evaluation/evaluators/Image/VisualSTSEvaluator.py +++ b/mteb/evaluation/evaluators/Image/VisualSTSEvaluator.py @@ -1,18 +1,18 @@ from __future__ import annotations import logging -from typing import Any +import math import os +from typing import Any import numpy as np +import torch from scipy.stats import pearsonr, spearmanr from sklearn.metrics.pairwise import ( paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances, ) -import math -import torch from torch.utils.data import DataLoader from torchvision import transforms diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py index 2195f42c12..aa92452ba9 100644 --- a/mteb/models/blip2_models.py +++ b/mteb/models/blip2_models.py @@ -5,7 +5,6 @@ import torch from PIL import Image -from torch.nn.functional import normalize from torch.utils.data import DataLoader from tqdm import tqdm from transformers import Blip2Processor @@ -36,7 +35,9 @@ def __init__( model_type = "coco" if "coco" in model_name else "pretrain" self.model = Blip2ITM.from_pretrained(model_type).to(self.device).float() # print numbr of parameters - print(f"Number of parameters: {sum(p.numel() for p in self.model.parameters())}") + print( + f"Number of parameters: {sum(p.numel() for p in self.model.parameters())}" + ) self.processor = Blip2Processor.from_pretrained(model_name) def preprocess( @@ -62,7 +63,7 @@ def get_text_embeddings(self, texts: list[str], batch_size: int = 32): return_tensors="pt", ).to(self.device) text_outputs = self.model.forward_text(text_tokens) - #text_outputs = normalize(self.model.text_proj(text_outputs)) + # text_outputs = normalize(self.model.text_proj(text_outputs)) all_text_embeddings.append(text_outputs.cpu()) all_text_embeddings = torch.cat(all_text_embeddings, dim=0) @@ -83,7 +84,7 @@ def get_image_embeddings( inputs["pixel_values"].to(self.device) ) image_outputs = image_outputs[0][:, 0, :] - #image_outputs = normalize(self.model.vision_proj(image_outputs), dim=-1) + # image_outputs = normalize(self.model.vision_proj(image_outputs), dim=-1) all_image_embeddings.append(image_outputs.cpu()) else: with torch.no_grad(): @@ -94,7 +95,7 @@ def get_image_embeddings( )["pixel_values"].to(self.device) image_outputs = self.model.forward_image(inputs) image_outputs = image_outputs[0][:, 0, :] - #image_outputs = normalize(self.model.vision_proj(image_outputs), dim=-1) + # image_outputs = normalize(self.model.vision_proj(image_outputs), dim=-1) all_image_embeddings.append(image_outputs.cpu()) all_image_embeddings = torch.cat(all_image_embeddings, dim=0) @@ -108,7 +109,8 @@ def get_multimodal_embeddings(self, texts, images, batch_size=32): # check dataloader batch size is the same as batch size if images.batch_size != batch_size: raise ValueError( - "Image DataLoader batch size must be the same as the given batch size: " + str(batch_size) + "Image DataLoader batch size must be the same as the given batch size: " + + str(batch_size) ) for batch_images, i in tqdm( zip(images, range(0, len(texts), batch_size)) @@ -122,7 +124,7 @@ def get_multimodal_embeddings(self, texts, images, batch_size=32): {"text_input": batch_texts, "image": image_inputs} ).multimodal_embeds[:, 0, :] - #multimodal_outputs = normalize(self.model.text_proj(multimodal_outputs), dim=-1) + # multimodal_outputs = normalize(self.model.text_proj(multimodal_outputs), dim=-1) all_multimodal_embeddings.append(multimodal_outputs.cpu()) else: @@ -137,7 +139,7 @@ def get_multimodal_embeddings(self, texts, images, batch_size=32): {"text_input": batch_texts, "image": image_inputs} ).multimodal_embeds[:, 0, :] - #multimodal_outputs = normalize(self.model.text_proj(multimodal_outputs), dim=-1) + # multimodal_outputs = normalize(self.model.text_proj(multimodal_outputs), dim=-1) all_multimodal_embeddings.append(multimodal_outputs.cpu()) @@ -225,12 +227,9 @@ def get_fused_embeddings( if __name__ == "__main__": - import mteb - mdl = mteb.get_model( - blip2_opt_2_7b.name, blip2_opt_2_7b.revision, device="cpu" - ) + mdl = mteb.get_model(blip2_opt_2_7b.name, blip2_opt_2_7b.revision, device="cpu") emb = mdl.get_text_embeddings(["Hello, world!"]) emb2 = mdl.get_text_embeddings(["Hello there, world!"]) emb3 = mdl.get_text_embeddings(["Goodbye, person!"]) @@ -244,8 +243,12 @@ def get_fused_embeddings( cat_img = Image.open("cat.jpg") cat_text = "An image of a cat" - multi_cat_emb = mdl.get_fused_embeddings(["A photo of an animal"], [cat_img], fusion_mode="multimodal") - multi_conflicting_emb = mdl.get_fused_embeddings(["A photo of a dog"], [cat_img], fusion_mode="multimodal") + multi_cat_emb = mdl.get_fused_embeddings( + ["A photo of an animal"], [cat_img], fusion_mode="multimodal" + ) + multi_conflicting_emb = mdl.get_fused_embeddings( + ["A photo of a dog"], [cat_img], fusion_mode="multimodal" + ) image_cat_emb = mdl.get_image_embeddings([cat_img]) text_cat_emb = mdl.get_text_embeddings(["An photo of a cat"]) text_dog_emb = mdl.get_text_embeddings(["An image of a dog"]) @@ -258,7 +261,6 @@ def get_fused_embeddings( sim4 = torch.nn.functional.cosine_similarity(multi_cat_emb, text_dog_emb) sim5 = torch.nn.functional.cosine_similarity(multi_conflicting_emb, text_cat_emb) - print(sim1, sim2) print(sim3, sim4, sim5) diff --git a/mteb/tasks/Image/Clustering/__init__.py b/mteb/tasks/Image/Clustering/__init__.py index 9ce1b567e6..804870ebeb 100644 --- a/mteb/tasks/Image/Clustering/__init__.py +++ b/mteb/tasks/Image/Clustering/__init__.py @@ -1,5 +1,5 @@ from __future__ import annotations from .eng.CIFAR import * -from .eng.TinyImageNet import * from .eng.ImageNet import * +from .eng.TinyImageNet import * diff --git a/mteb/tasks/Image/Clustering/eng/ImageNet.py b/mteb/tasks/Image/Clustering/eng/ImageNet.py index 0efe69f844..b45956cfe7 100644 --- a/mteb/tasks/Image/Clustering/eng/ImageNet.py +++ b/mteb/tasks/Image/Clustering/eng/ImageNet.py @@ -1,29 +1,8 @@ from __future__ import annotations -import io -import PIL.Image as Image from mteb.abstasks.Image.AbsTaskImageClustering import AbsTaskImageClustering from mteb.abstasks.TaskMetadata import TaskMetadata -""" -Classes: -1.MALTESE DOG -2.BLENHEIM SPANIEL -3.BASSET -4.NORWEGIAN ELKHOUND -5.GIANT SCHNAUZER -6.GOLDEN RETRIEVER -7.BRITTANY SPANIEL -8.CLUMBER -9.WELSH SPRINGER SPANIEL -10.GROENENDAEL -11.KELPIE -12.SHETLAND SHEEPDOG -13.DOBERMAN -14.PUG -15.CHOW -""" - class ImageNetDog15Clustering(AbsTaskImageClustering): metadata = TaskMetadata( name="ImageNetDog15Clustering", @@ -38,10 +17,7 @@ class ImageNetDog15Clustering(AbsTaskImageClustering): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="accuracy", - date=( - "2009-06-20", - "2009-06-20" - ), # Conference date + date=("2009-06-20", "2009-06-20"), # Conference date domains=["Web"], task_subtypes=["Object recognition"], license="Not specified", @@ -62,10 +38,7 @@ class ImageNetDog15Clustering(AbsTaskImageClustering): doi={10.1109/CVPR.2009.5206848}} """, descriptive_stats={ - "n_samples": {"test": 1076, "train":1500}, - #"avg_character_length": {"test": 431.4}, + "n_samples": {"test": 1076, "train": 1500}, + # "avg_character_length": {"test": 431.4}, }, ) - - - diff --git a/mteb/tasks/Image/VisualSTS/__init__.py b/mteb/tasks/Image/VisualSTS/__init__.py index cc7823118b..eb785d5d85 100644 --- a/mteb/tasks/Image/VisualSTS/__init__.py +++ b/mteb/tasks/Image/VisualSTS/__init__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from .en.STS12VisualSTS import * from .en.STS13VisualSTS import * from .en.STS14VisualSTS import * diff --git a/mteb/tasks/Image/VisualSTS/en/STS12VisualSTS.py b/mteb/tasks/Image/VisualSTS/en/STS12VisualSTS.py index 1f88b8045a..8d78bb7238 100644 --- a/mteb/tasks/Image/VisualSTS/en/STS12VisualSTS.py +++ b/mteb/tasks/Image/VisualSTS/en/STS12VisualSTS.py @@ -1,7 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS +from mteb.abstasks.TaskMetadata import TaskMetadata class STS12VisualSTS(AbsTaskVisualSTS): diff --git a/mteb/tasks/Image/VisualSTS/en/STS13VisualSTS.py b/mteb/tasks/Image/VisualSTS/en/STS13VisualSTS.py index 122a5d6d30..1b02248d35 100644 --- a/mteb/tasks/Image/VisualSTS/en/STS13VisualSTS.py +++ b/mteb/tasks/Image/VisualSTS/en/STS13VisualSTS.py @@ -1,7 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS +from mteb.abstasks.TaskMetadata import TaskMetadata class STS13VisualSTS(AbsTaskVisualSTS): diff --git a/mteb/tasks/Image/VisualSTS/en/STS14VisualSTS.py b/mteb/tasks/Image/VisualSTS/en/STS14VisualSTS.py index cbbcc94445..a427fdae0b 100644 --- a/mteb/tasks/Image/VisualSTS/en/STS14VisualSTS.py +++ b/mteb/tasks/Image/VisualSTS/en/STS14VisualSTS.py @@ -1,7 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS +from mteb.abstasks.TaskMetadata import TaskMetadata class STS14VisualSTS(AbsTaskVisualSTS): diff --git a/mteb/tasks/Image/VisualSTS/en/STS15VisualSTS.py b/mteb/tasks/Image/VisualSTS/en/STS15VisualSTS.py index 9eb99af506..12c9a74c81 100644 --- a/mteb/tasks/Image/VisualSTS/en/STS15VisualSTS.py +++ b/mteb/tasks/Image/VisualSTS/en/STS15VisualSTS.py @@ -1,7 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS +from mteb.abstasks.TaskMetadata import TaskMetadata class STS15VisualSTS(AbsTaskVisualSTS): diff --git a/mteb/tasks/Image/VisualSTS/en/STS16VisualSTS.py b/mteb/tasks/Image/VisualSTS/en/STS16VisualSTS.py index 7db7b4f906..ae1e2900dd 100644 --- a/mteb/tasks/Image/VisualSTS/en/STS16VisualSTS.py +++ b/mteb/tasks/Image/VisualSTS/en/STS16VisualSTS.py @@ -1,7 +1,7 @@ from __future__ import annotations -from mteb.abstasks.TaskMetadata import TaskMetadata from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS +from mteb.abstasks.TaskMetadata import TaskMetadata class STS16VisualSTS(AbsTaskVisualSTS): From 83d0f455d75dc904db7c83740f09277e62ad28e3 Mon Sep 17 00:00:00 2001 From: Jamie-Stirling Date: Thu, 19 Sep 2024 16:37:43 +0100 Subject: [PATCH 14/17] remove unused import --- mteb/models/blip2_models.py | 1 - mteb/tasks/Image/Clustering/eng/ImageNet.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py index aa92452ba9..cb289b3f96 100644 --- a/mteb/models/blip2_models.py +++ b/mteb/models/blip2_models.py @@ -14,7 +14,6 @@ def blip2_loader(**kwargs): try: # a temporal fix for the dependency issues of vista models. - from lavis.models import load_model_and_preprocess from lavis.models.blip2_models.blip2_image_text_matching import ( Blip2ITM, ) diff --git a/mteb/tasks/Image/Clustering/eng/ImageNet.py b/mteb/tasks/Image/Clustering/eng/ImageNet.py index b45956cfe7..1259808450 100644 --- a/mteb/tasks/Image/Clustering/eng/ImageNet.py +++ b/mteb/tasks/Image/Clustering/eng/ImageNet.py @@ -3,6 +3,7 @@ from mteb.abstasks.Image.AbsTaskImageClustering import AbsTaskImageClustering from mteb.abstasks.TaskMetadata import TaskMetadata + class ImageNetDog15Clustering(AbsTaskImageClustering): metadata = TaskMetadata( name="ImageNetDog15Clustering", From a309de512954b958ac005869c33534ec39985497 Mon Sep 17 00:00:00 2001 From: Jamie-Stirling Date: Thu, 19 Sep 2024 21:17:45 +0100 Subject: [PATCH 15/17] add cluster_accuracy, ari and nmi to Image.ClusteringEvaluator --- .../evaluators/Image/ClusteringEvaluator.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/mteb/evaluation/evaluators/Image/ClusteringEvaluator.py b/mteb/evaluation/evaluators/Image/ClusteringEvaluator.py index b006470416..31b5c26f1a 100644 --- a/mteb/evaluation/evaluators/Image/ClusteringEvaluator.py +++ b/mteb/evaluation/evaluators/Image/ClusteringEvaluator.py @@ -5,8 +5,10 @@ import sklearn import sklearn.cluster +import numpy as np from PIL import Image from sklearn import metrics +from scipy.optimize import linear_sum_assignment from mteb.encoder_interface import Encoder from mteb.evaluation.evaluators.Evaluator import Evaluator @@ -53,6 +55,16 @@ def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] = {}): logger.info("Evaluating...") v_measure = metrics.cluster.v_measure_score(self.labels, cluster_assignment) + nmi = metrics.cluster.normalized_mutual_info_score(self.labels, cluster_assignment) + ari = metrics.cluster.adjusted_rand_score(self.labels, cluster_assignment) + accuracy = metrics.accuracy_score(self.labels, cluster_assignment) + + matrix = metrics.confusion_matrix(self.labels, cluster_assignment) + + # get linear sum assignment + row_ind, col_ind = linear_sum_assignment(matrix, maximize=True) + total_correct = matrix[row_ind, col_ind].sum() + clustering_accuracy = total_correct / len(self.labels) - return {"v_measure": v_measure, "accuracy": accuracy} + return {"v_measure": v_measure, "accuracy": accuracy, "nmi": nmi, "ari": ari, "cluster_accuracy": clustering_accuracy} From 02c1f81b85e1dda106dace662af3905096ca1c31 Mon Sep 17 00:00:00 2001 From: Jamie-Stirling Date: Fri, 20 Sep 2024 12:33:26 +0100 Subject: [PATCH 16/17] add imagenet-10 clustering task --- mteb/tasks/Image/Clustering/eng/ImageNet.py | 40 +++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/mteb/tasks/Image/Clustering/eng/ImageNet.py b/mteb/tasks/Image/Clustering/eng/ImageNet.py index 1259808450..dd02d8e830 100644 --- a/mteb/tasks/Image/Clustering/eng/ImageNet.py +++ b/mteb/tasks/Image/Clustering/eng/ImageNet.py @@ -43,3 +43,43 @@ class ImageNetDog15Clustering(AbsTaskImageClustering): # "avg_character_length": {"test": 431.4}, }, ) + +class ImageNet10Clustering(AbsTaskImageClustering): + metadata = TaskMetadata( + name="ImageNet10Clustering", + description="Clustering images from an 10-class subset of ImageNet which are generally easy to distinguish.", + reference="https://www.kaggle.com/datasets/liusha249/imagenet10", + dataset={ + "path": "JamieSJS/imagenet-10", + "revision": "88f8a6d47c257895094c5ad81e67ba751771fc99", + }, + type="Clustering", + category="i2t", + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2009-06-20", "2009-06-20"), # Conference date + domains=["Web"], + task_subtypes=["Object recognition"], + license="Not specified", + socioeconomic_status="mixed", + annotations_creators="derived", + dialect=[], + modalities=["image"], + sample_creation="created", + bibtex_citation=""" @INPROCEEDINGS{5206848, + author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Kai Li and Li Fei-Fei}, + booktitle={2009 IEEE Conference on Computer Vision and Pattern Recognition}, + title={ImageNet: A large-scale hierarchical image database}, + year={2009}, + volume={}, + number={}, + pages={248-255}, + keywords={Large-scale systems;Image databases;Explosions;Internet;Robustness;Information retrieval;Image retrieval;Multimedia databases;Ontologies;Spine}, + doi={10.1109/CVPR.2009.5206848}} + """, + descriptive_stats={ + "n_samples": {"test": 13000}, + # "avg_character_length": {"test": 431.4}, + }, + ) From d226748b698d8ab7513fabf917caa8ade0e3f752 Mon Sep 17 00:00:00 2001 From: Jamie-Stirling Date: Fri, 20 Sep 2024 14:05:01 +0100 Subject: [PATCH 17/17] add results forclip on ImageNet10Clustering and ImageNetDog15Clustering --- .../evaluators/Image/ClusteringEvaluator.py | 19 ++++++++++----- mteb/tasks/Image/Clustering/eng/ImageNet.py | 1 + .../ImageNet10Clustering.json | 23 +++++++++++++++++++ .../ImageNetDog15Clustering.json | 23 +++++++++++++++++++ 4 files changed, 60 insertions(+), 6 deletions(-) create mode 100644 results-mieb/openai__clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/ImageNet10Clustering.json create mode 100644 results-mieb/openai__clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/ImageNetDog15Clustering.json diff --git a/mteb/evaluation/evaluators/Image/ClusteringEvaluator.py b/mteb/evaluation/evaluators/Image/ClusteringEvaluator.py index 31b5c26f1a..f53befe8ef 100644 --- a/mteb/evaluation/evaluators/Image/ClusteringEvaluator.py +++ b/mteb/evaluation/evaluators/Image/ClusteringEvaluator.py @@ -5,10 +5,9 @@ import sklearn import sklearn.cluster -import numpy as np from PIL import Image -from sklearn import metrics from scipy.optimize import linear_sum_assignment +from sklearn import metrics from mteb.encoder_interface import Encoder from mteb.evaluation.evaluators.Evaluator import Evaluator @@ -55,16 +54,24 @@ def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] = {}): logger.info("Evaluating...") v_measure = metrics.cluster.v_measure_score(self.labels, cluster_assignment) - nmi = metrics.cluster.normalized_mutual_info_score(self.labels, cluster_assignment) + nmi = metrics.cluster.normalized_mutual_info_score( + self.labels, cluster_assignment + ) ari = metrics.cluster.adjusted_rand_score(self.labels, cluster_assignment) accuracy = metrics.accuracy_score(self.labels, cluster_assignment) - + matrix = metrics.confusion_matrix(self.labels, cluster_assignment) - + # get linear sum assignment row_ind, col_ind = linear_sum_assignment(matrix, maximize=True) total_correct = matrix[row_ind, col_ind].sum() clustering_accuracy = total_correct / len(self.labels) - return {"v_measure": v_measure, "accuracy": accuracy, "nmi": nmi, "ari": ari, "cluster_accuracy": clustering_accuracy} + return { + "v_measure": v_measure, + "accuracy": accuracy, + "nmi": nmi, + "ari": ari, + "cluster_accuracy": clustering_accuracy, + } diff --git a/mteb/tasks/Image/Clustering/eng/ImageNet.py b/mteb/tasks/Image/Clustering/eng/ImageNet.py index dd02d8e830..dcf8587322 100644 --- a/mteb/tasks/Image/Clustering/eng/ImageNet.py +++ b/mteb/tasks/Image/Clustering/eng/ImageNet.py @@ -44,6 +44,7 @@ class ImageNetDog15Clustering(AbsTaskImageClustering): }, ) + class ImageNet10Clustering(AbsTaskImageClustering): metadata = TaskMetadata( name="ImageNet10Clustering", diff --git a/results-mieb/openai__clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/ImageNet10Clustering.json b/results-mieb/openai__clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/ImageNet10Clustering.json new file mode 100644 index 0000000000..d502635992 --- /dev/null +++ b/results-mieb/openai__clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/ImageNet10Clustering.json @@ -0,0 +1,23 @@ +{ + "dataset_revision": "88f8a6d47c257895094c5ad81e67ba751771fc99", + "evaluation_time": 33.32936453819275, + "kg_co2_emissions": null, + "mteb_version": "1.12.90", + "scores": { + "test": [ + { + "accuracy": 0.1993076923076923, + "ari": 0.9672782515730578, + "cluster_accuracy": 0.985, + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.1993076923076923, + "nmi": 0.9644473066207006, + "v_measure": 0.9644473066207006 + } + ] + }, + "task_name": "ImageNet10Clustering" +} \ No newline at end of file diff --git a/results-mieb/openai__clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/ImageNetDog15Clustering.json b/results-mieb/openai__clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/ImageNetDog15Clustering.json new file mode 100644 index 0000000000..fe53c8ed7e --- /dev/null +++ b/results-mieb/openai__clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/ImageNetDog15Clustering.json @@ -0,0 +1,23 @@ +{ + "dataset_revision": "bfb6ad3b2109d26c9daddf14f98d315daa35ee72", + "evaluation_time": 4.18316650390625, + "kg_co2_emissions": null, + "mteb_version": "1.12.90", + "scores": { + "test": [ + { + "accuracy": 0.026022304832713755, + "ari": 0.36465670607270784, + "cluster_accuracy": 0.4656133828996282, + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.026022304832713755, + "nmi": 0.5160500208664386, + "v_measure": 0.5160500208664386 + } + ] + }, + "task_name": "ImageNetDog15Clustering" +} \ No newline at end of file