From 1e505570549a78740e53ae8337c7b2dfce555f7d Mon Sep 17 00:00:00 2001
From: Jamie-Stirling <stirlingj00@gmail.com>
Date: Fri, 6 Sep 2024 16:20:57 +0100
Subject: [PATCH 01/17] wip: start adding BLIP models

---
 mteb/models/__init__.py    |   2 +
 mteb/models/blip_models.py | 183 +++++++++++++++++++++++++++++++++++++
 2 files changed, 185 insertions(+)
 create mode 100644 mteb/models/blip_models.py

diff --git a/mteb/models/__init__.py b/mteb/models/__init__.py
index 8e96542925..94358143c1 100644
--- a/mteb/models/__init__.py
+++ b/mteb/models/__init__.py
@@ -10,6 +10,7 @@
 from mteb.models import (
     align_models,
     bge_models,
+    blip_models,
     bm25,
     clip_models,
     cohere_models,
@@ -130,6 +131,7 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe
 model_modules = [
     align_models,
     bge_models,
+    blip_models,
     bm25,
     cohere_models,
     dino_models,
diff --git a/mteb/models/blip_models.py b/mteb/models/blip_models.py
new file mode 100644
index 0000000000..89b7f7d204
--- /dev/null
+++ b/mteb/models/blip_models.py
@@ -0,0 +1,183 @@
+from __future__ import annotations
+
+from functools import partial
+from typing import Any
+
+import torch
+from PIL import Image
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import AutoModel, AutoProcessor
+
+from mteb.model_meta import ModelMeta
+
+
+class BLIPModelWrapper:
+    def __init__(
+        self,
+        model_name: str,
+        device: str = "cuda" if torch.cuda.is_available() else "cpu",
+        **kwargs: Any,
+    ):
+        self.model_name = model_name
+        self.device = device
+        self.model = AutoModel.from_pretrained(model_name).to(self.device)
+        self.processor = AutoProcessor.from_pretrained(model_name)
+
+    def preprocess(
+        self,
+        texts: list[str],
+        images: list[Image.Image],
+    ):
+        return self.processor(
+            text=texts, images=images, return_tensors="pt", padding=True
+        )
+
+    def get_text_embeddings(self, texts: list[str], batch_size: int = 32):
+        all_text_embeddings = []
+
+        with torch.no_grad():
+            for i in tqdm(range(0, len(texts), batch_size)):
+                batch_texts = texts[i : i + batch_size]
+                inputs = self.processor(
+                    text=batch_texts, return_tensors="pt", padding=True, truncation=True
+                )
+                inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                text_outputs = self.model.get_text_features(**inputs)
+                all_text_embeddings.append(text_outputs.cpu())
+
+        all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
+        return all_text_embeddings
+
+    def get_image_embeddings(
+        self, images: list[Image.Image] | DataLoader, batch_size: int = 32
+    ):
+        all_image_embeddings = []
+
+        if isinstance(images, DataLoader):
+            with torch.no_grad():
+                for batch in tqdm(images):
+                    inputs = self.processor(
+                        images=batch, return_tensors="pt", padding=True
+                    )
+                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                    image_outputs = self.model.get_image_features(**inputs)
+                    all_image_embeddings.append(image_outputs.cpu())
+        else:
+            with torch.no_grad():
+                for i in tqdm(range(0, len(images), batch_size)):
+                    batch_images = images[i : i + batch_size]
+                    inputs = self.processor(
+                        images=batch_images, return_tensors="pt", padding=True
+                    )
+                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                    image_outputs = self.model.get_image_features(**inputs)
+                    all_image_embeddings.append(image_outputs.cpu())
+
+        all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
+        return all_image_embeddings
+
+    def calculate_probs(self, text_embeddings, image_embeddings):
+        text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
+        image_embeddings = image_embeddings / image_embeddings.norm(
+            dim=-1, keepdim=True
+        )
+        logits = torch.matmul(image_embeddings, text_embeddings.T)
+        probs = (logits * 100).softmax(dim=-1)
+        return probs
+
+    def get_fused_embeddings(
+        self,
+        texts: list[str] = None,
+        images: list[Image.Image] | DataLoader = None,
+        fusion_mode="sum",
+        batch_size: int = 32,
+    ):
+        # TODO: find out if BLIP has a prescribed way of fusing text and image embeddings
+        if texts is None and images is None:
+            raise ValueError("Either texts or images must be provided")
+
+        text_embeddings = None
+        image_embeddings = None
+
+        if texts is not None:
+            text_embeddings = self.get_text_embeddings(texts, batch_size)
+
+        if images is not None:
+            image_embeddings = self.get_image_embeddings(images, batch_size)
+
+        if text_embeddings is not None and image_embeddings is not None:
+            if len(text_embeddings) != len(image_embeddings):
+                raise ValueError(
+                    "The number of texts and images must have the same length"
+                )
+            if fusion_mode == "sum":
+                fused_embeddings = text_embeddings + image_embeddings
+            else:
+                # to do: add other fusion mode
+                raise ValueError(f"fusion mode {fusion_mode} hasn't been implemented")
+            return fused_embeddings
+        elif text_embeddings is not None:
+            return text_embeddings
+        elif image_embeddings is not None:
+            return image_embeddings
+
+
+"""
+TODO: implement all model variants
+
+Salesforce/blip-image-captioning-large
+Image-to-Text • Updated Dec 7, 2023 •
+1.16M •
+•
+1.04k
+Salesforce/blip-image-captioning-base
+Image-to-Text • Updated Aug 1, 2023 •
+857k •
+•
+475
+Salesforce/blip-vqa-base
+Visual Question Answering • Updated Dec 7, 2023 •
+168k •
+119
+Salesforce/blip-vqa-capfilt-large
+Visual Question Answering • Updated Jan 22 •
+90.6k •
+44
+Salesforce/blip-itm-base-coco
+Updated Aug 1, 2023 •
+12.8k •
+16
+Salesforce/blip-itm-large-coco
+Updated Aug 1, 2023 •
+9.9k
+Salesforce/blip-itm-base-flickr
+Updated Aug 1, 2023 •
+65
+Salesforce/blip-itm-large-flickr
+Updated Aug 1, 2023 •
+459 •
+2
+"""
+
+blip_image_captioning_base = ModelMeta(
+    loader=partial(
+        BLIPModelWrapper,
+        model_name="Salesforce/blip-image-captioning-base",
+    ),
+    name="Salesforce/blip-image-captioning-base",
+    languages=["eng_Latn"],
+    open_source=True,
+    revision="89b09ea1789f7addf2f6d6f0dfc4ce10ab58ef84",
+    release_date="2023-08-01",
+)
+
+
+if __name__ == "__main__":
+    import mteb
+
+    mdl = mteb.get_model(
+        blip_image_captioning_base.name, blip_image_captioning_base.revision
+    )
+    emb = mdl.get_text_embeddings(["Hello, world!"])
+    print(emb.shape)

From 8f8e05cb3e3f1d4a773e3e6a1136d7d307c872d9 Mon Sep 17 00:00:00 2001
From: Jamie-Stirling <stirlingj00@gmail.com>
Date: Mon, 9 Sep 2024 16:17:43 +0100
Subject: [PATCH 02/17] add other blip variants

---
 mteb/models/blip_models.py | 85 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)

diff --git a/mteb/models/blip_models.py b/mteb/models/blip_models.py
index 89b7f7d204..ead46b63e7 100644
--- a/mteb/models/blip_models.py
+++ b/mteb/models/blip_models.py
@@ -159,6 +159,18 @@ def get_fused_embeddings(
 459 •
 2
 """
+# in descending order of usage (downloads from huggingface)
+blip_image_captioning_large = ModelMeta(
+    loader=partial(
+        BLIPModelWrapper,
+        model_name="Salesforce/blip-image-captioning-large",
+    ),
+    name="Salesforce/blip-image-captioning-large",
+    languages=["eng_Latn"],
+    open_source=True,
+    revision="2227ac38c9f16105cb0412e7cab4759978a8fd90",
+    release_date="2023-12-07",
+)
 
 blip_image_captioning_base = ModelMeta(
     loader=partial(
@@ -173,6 +185,79 @@ def get_fused_embeddings(
 )
 
 
+blip_vqa_base = ModelMeta(
+    loader=partial(
+        BLIPModelWrapper,
+        model_name="Salesforce/blip-vqa-base",
+    ),
+    name="Salesforce/blip-vqa-base",
+    languages=["eng_Latn"],
+    open_source=True,
+    revision="c7df8e7cd7aa2ee9af18f56e2b29e59a92651b64",
+    release_date="2023-12-07",
+)
+
+blip_vqa_capfilt_large = ModelMeta(
+    loader=partial(
+        BLIPModelWrapper,
+        model_name="Salesforce/blip-vqa-capfilt-large",
+    ),
+    name="Salesforce/blip-vqa-capfilt-large",
+    languages=["eng_Latn"],
+    open_source=True,
+    revision="e53f95265aeab69013fabb5380500ab984adbbb4",
+    release_date="2024-01-22",  # hub listing: "Updated Jan 22" (no year => 2024)
+)
+
+blip_itm_base_coco = ModelMeta(
+    loader=partial(
+        BLIPModelWrapper,
+        model_name="Salesforce/blip-itm-base-coco",
+    ),
+    name="Salesforce/blip-itm-base-coco",
+    languages=["eng_Latn"],
+    open_source=True,
+    revision="7eaa90c11850c0b17fc38c6a11e7d88bd6ac231f",
+    release_date="2023-08-01",
+)
+
+blip_itm_large_coco = ModelMeta(
+    loader=partial(
+        BLIPModelWrapper,
+        model_name="Salesforce/blip-itm-large-coco",
+    ),
+    name="Salesforce/blip-itm-large-coco",
+    languages=["eng_Latn"],
+    open_source=True,
+    revision="fef05cafc05298067cbbca00b125749394a77a6f",
+    release_date="2023-08-01",
+)
+
+blip_itm_base_flickr = ModelMeta(
+    loader=partial(
+        BLIPModelWrapper,
+        model_name="Salesforce/blip-itm-base-flickr",
+    ),
+    name="Salesforce/blip-itm-base-flickr",
+    languages=["eng_Latn"],
+    open_source=True,
+    revision="1de29e660d91ae1786c1876212ea805a22eab251",
+    release_date="2023-08-01",
+)
+
+blip_itm_large_flickr = ModelMeta(
+    loader=partial(
+        BLIPModelWrapper,
+        model_name="Salesforce/blip-itm-large-flickr",
+    ),
+    name="Salesforce/blip-itm-large-flickr",
+    languages=["eng_Latn"],
+    open_source=True,
+    revision="bda12e6506758f54261b5ab174b2c55a3ba143fb",
+    release_date="2023-08-01",
+)
+
+
 if __name__ == "__main__":
     import mteb
 

From be8b4bbd007e274cc622c7291a24b2cb23c080c8 Mon Sep 17 00:00:00 2001
From: Jamie-Stirling <stirlingj00@gmail.com>
Date: Wed, 11 Sep 2024 15:50:31 +0100
Subject: [PATCH 03/17] wip: add blip2_models.py

---
 mteb/models/blip2_models.py | 235 ++++++++++++++++++++++++++++++++++++
 1 file changed, 235 insertions(+)
 create mode 100644 mteb/models/blip2_models.py

diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py
new file mode 100644
index 0000000000..5db3d01c37
--- /dev/null
+++ b/mteb/models/blip2_models.py
@@ -0,0 +1,235 @@
+from __future__ import annotations
+
+from functools import partial
+from typing import Any
+
+import torch
+from torch.nn.functional import normalize
+from PIL import Image
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import BlipForImageTextRetrieval, BlipProcessor
+
+from mteb.model_meta import ModelMeta
+
+
+class BLIP2ModelWrapper:
+    def __init__(
+        self,
+        model_name: str,
+        device: str = "cuda" if torch.cuda.is_available() else "cpu",
+        **kwargs: Any,
+    ):
+        self.model_name = model_name
+        self.device = device
+        self.model = BlipForImageTextRetrieval.from_pretrained(model_name).to(self.device)
+        self.processor = BlipProcessor.from_pretrained(model_name)
+
+    def preprocess(
+        self,
+        texts: list[str],
+        images: list[Image.Image],
+    ):
+        return self.processor(
+            text=texts, images=images, return_tensors="pt", padding=True
+        )
+
+    def get_text_embeddings(self, texts: list[str], batch_size: int = 32):
+        all_text_embeddings = []
+
+        with torch.no_grad():
+            for i in tqdm(range(0, len(texts), batch_size)):
+                batch_texts = texts[i : i + batch_size]
+                inputs = self.processor(
+                    text=batch_texts, return_tensors="pt", padding=True, truncation=True
+                )
+                inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                # different to CLIPModelWrapper: text_encoder instead of get_text_features and apply projection and normalization
+                text_outputs = self.model.text_encoder(**inputs)
+                text_outputs = text_outputs[0]
+                text_outputs = normalize(self.model.text_proj(text_outputs[:,0,:]), dim=-1)
+                all_text_embeddings.append(text_outputs.cpu())
+
+        all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
+        return all_text_embeddings
+
+    def get_image_embeddings(
+        self, images: list[Image.Image] | DataLoader, batch_size: int = 32
+    ):
+        all_image_embeddings = []
+
+        if isinstance(images, DataLoader):
+            with torch.no_grad():
+                for batch in tqdm(images):
+                    inputs = self.processor(
+                        images=batch, return_tensors="pt", padding=True
+                    )
+                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                    image_outputs = self.model.vision_model(**inputs)
+                    image_outputs = image_outputs[0]
+                    image_outputs = normalize(self.model.vision_proj(image_outputs[:,0,:]), dim=-1)
+                    all_image_embeddings.append(image_outputs.cpu())
+        else:
+            with torch.no_grad():
+                for i in tqdm(range(0, len(images), batch_size)):
+                    batch_images = images[i : i + batch_size]
+                    inputs = self.processor(
+                        images=batch_images, return_tensors="pt", padding=True
+                    )
+                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                    image_outputs = self.model.get_image_features(**inputs)
+                    image_outputs = self.model.vision_model(**inputs)
+                    image_outputs = image_outputs[0]
+                    image_outputs = normalize(self.model.vision_proj(image_outputs[:,0,:]), dim=-1)
+                    all_image_embeddings.append(image_outputs.cpu())
+
+        all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
+        return all_image_embeddings
+
+    def calculate_probs(self, text_embeddings, image_embeddings):
+        text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
+        image_embeddings = image_embeddings / image_embeddings.norm(
+            dim=-1, keepdim=True
+        )
+        logits = torch.matmul(image_embeddings, text_embeddings.T)
+        probs = (logits * 100).softmax(dim=-1)
+        return probs
+
+    def get_fused_embeddings(
+        self,
+        texts: list[str] = None,
+        images: list[Image.Image] | DataLoader = None,
+        fusion_mode="sum",
+        batch_size: int = 32,
+    ):
+        # TODO: find out if BLIP has a prescribed way of fusing text and image embeddings
+        if texts is None and images is None:
+            raise ValueError("Either texts or images must be provided")
+
+        text_embeddings = None
+        image_embeddings = None
+
+        if texts is not None:
+            text_embeddings = self.get_text_embeddings(texts, batch_size)
+
+        if images is not None:
+            image_embeddings = self.get_image_embeddings(images, batch_size)
+
+        if text_embeddings is not None and image_embeddings is not None:
+            if len(text_embeddings) != len(image_embeddings):
+                raise ValueError(
+                    "The number of texts and images must have the same length"
+                )
+            if fusion_mode == "sum":
+                fused_embeddings = text_embeddings + image_embeddings
+            else:
+                # to do: add other fusion mode
+                raise ValueError(f"fusion mode {fusion_mode} hasn't been implemented")
+            return fused_embeddings
+        elif text_embeddings is not None:
+            return text_embeddings
+        elif image_embeddings is not None:
+            return image_embeddings
+
+
+"""
+
+Salesforce/blip2-opt-2.7b
+Image-to-Text • Updated Mar 22 •
+588k •
+296
+Salesforce/blip2-flan-t5-xxl
+Image-to-Text • Updated Mar 29 •
+9.23k •
+84
+Salesforce/blip2-opt-6.7b-coco
+Image-to-Text • Updated Mar 31 •
+1.51k •
+28
+Salesforce/blip2-opt-6.7b
+Image-to-Text • Updated Mar 27 •
+4.93k •
+71
+Salesforce/blip2-flan-t5-xl
+Image-to-Text • Updated Dec 13, 2023 •
+95.9k •
+56
+"""
+# same order as the hub listing above (not strictly sorted by downloads)
+
+blip2_opt_2_7b = ModelMeta(
+    loader=partial(
+        BLIP2ModelWrapper,
+        model_name="Salesforce/blip2-opt-2.7b",
+    ),
+    name="Salesforce/blip2-opt-2.7b",
+    languages=["eng_Latn"],
+    open_source=True,
+    revision="51572668da0eb669e01a189dc22abe6088589a24",
+    release_date="2024-03-22",
+)
+
+blip2_flan_t5_xxl = ModelMeta(
+    loader=partial(
+        BLIP2ModelWrapper,
+        model_name="Salesforce/blip2-flan-t5-xxl",
+    ),
+    name="Salesforce/blip2-flan-t5-xxl",
+    languages=["eng_Latn"],
+    open_source=True,
+    revision="43206cbc865b9d5b3dd7d080e5d94b4143ca8e74",
+    release_date="2024-03-29",
+)
+
+blip2_opt_6_7b_coco = ModelMeta(
+    loader=partial(
+        BLIP2ModelWrapper,
+        model_name="Salesforce/blip2-opt-6.7b-coco",
+    ),
+    name="Salesforce/blip2-opt-6.7b-coco",
+    languages=["eng_Latn"],
+    open_source=True,
+    revision="0d580de59320a25a4d2c386387bcef310d5f286e",
+    release_date="2024-03-31",
+)
+
+blip2_opt_6_7b = ModelMeta(
+    loader=partial(
+        BLIP2ModelWrapper,
+        model_name="Salesforce/blip2-opt-6.7b",
+    ),
+    name="Salesforce/blip2-opt-6.7b",
+    languages=["eng_Latn"],
+    open_source=True,
+    revision="1d33d60155fd1323b97556e0f1dd5148a9749f5b",
+    release_date="2024-03-27",
+)
+
+blip2_flan_t5_xl = ModelMeta(
+    loader=partial(
+        BLIP2ModelWrapper,
+        model_name="Salesforce/blip2-flan-t5-xl",
+    ),
+    name="Salesforce/blip2-flan-t5-xl",
+    languages=["eng_Latn"],
+    open_source=True,
+    revision="e5025a34e3e769e72e2aab7f7bfd00bc84d5fd77",
+    release_date="2023-12-13",
+)
+
+if __name__ == "__main__":
+    import mteb
+
+    mdl = mteb.get_model(
+        blip2_opt_2_7b.name, blip2_opt_2_7b.revision
+    )
+    emb = mdl.get_text_embeddings(["Hello, world!"])
+    emb2 = mdl.get_text_embeddings(["Hello there, world!"])
+    emb3 = mdl.get_text_embeddings(["Goodbye, person!"])
+    
+    sim = torch.nn.functional.cosine_similarity(emb, emb2)
+    print(sim)
+
+    sim = torch.nn.functional.cosine_similarity(emb, emb3)
+    print(sim)
+    

From b57a395d5e103d0677c4547ebbb8f9f35564a202 Mon Sep 17 00:00:00 2001
From: Jamie-Stirling <stirlingj00@gmail.com>
Date: Wed, 11 Sep 2024 16:26:45 +0100
Subject: [PATCH 04/17] make lint

---
 .../abstasks/Image/AbsTaskAny2AnyRetrieval.py |  2 +-
 .../Image/AbsTaskImageClassification.py       |  2 +-
 mteb/abstasks/Image/AbsTaskImageClustering.py |  2 +-
 .../AbsTaskImageMultilabelClassification.py   |  2 +-
 .../AbsTaskImageTextPairClassification.py     |  2 +-
 .../Image/AbsTaskZeroshotClassification.py    |  2 +-
 mteb/models/blip2_models.py                   | 25 +++---
 mteb/models/blip_models.py                    | 77 ++++++++-----------
 mteb/models/instructions.py                   |  2 -
 mteb/models/ru_sentence_models.py             |  2 -
 mteb/models/sentence_transformers_models.py   |  2 -
 .../Any2AnyRetrieval/eng/CIRRIT2IRetrieval.py |  3 +-
 .../eng/FashionIQIT2IRetrieval.py             |  3 +-
 .../eng/HatefulMemesI2TRetrieval.py           |  3 +-
 .../eng/HatefulMemesT2IRetrieval.py           |  3 +-
 .../eng/InfoSeekIT2ITRetrieval.py             |  3 +-
 .../eng/InfoSeekIT2TRetrieval.py              |  3 +-
 .../eng/MemotionI2TRetrieval.py               |  3 +-
 .../eng/MemotionT2IRetrieval.py               |  3 +-
 .../eng/NIGHTSI2IRetrieval.py                 |  3 +-
 .../eng/OVENIT2ITRetrieval.py                 |  3 +-
 .../Any2AnyRetrieval/eng/OVENIT2TRetrieval.py |  3 +-
 .../eng/SciMMIRI2TRetrieval.py                |  3 +-
 .../eng/SciMMIRT2IRetrieval.py                |  3 +-
 .../eng/TUBerlinT2IRetrieval.py               |  3 +-
 .../eng/VisualNewsI2TRetrieval.py             |  3 +-
 .../eng/VisualNewsT2IRetrieval.py             |  3 +-
 .../eng/WebQAT2ITRetrieval.py                 |  3 +-
 .../Any2AnyRetrieval/eng/WebQAT2TRetrieval.py |  3 +-
 .../multilingual/WITT2IRetrieval.py           |  3 +-
 .../multilingual/XFlickr30kCoT2IRetrieval.py  |  3 +-
 .../multilingual/XM3600T2IRetrieval.py        |  3 +-
 mteb/tasks/Image/Clustering/eng/CIFAR.py      |  3 +-
 .../eng/BirdsnapClassification.py             |  3 +-
 .../Image/ImageClassification/eng/CIFAR.py    |  3 +-
 .../eng/Caltech101Classification.py           |  3 +-
 .../eng/DTDClassification.py                  |  3 +-
 .../eng/EuroSATClassification.py              |  3 +-
 .../eng/FER2013Classification.py              |  3 +-
 .../eng/FGVCAircraftClassification.py         |  3 +-
 .../eng/Food101Classification.py              |  3 +-
 .../eng/MNISTClassification.py                |  3 +-
 .../eng/OxfordFlowersClassification.py        |  3 +-
 .../eng/OxfordPetsClassification.py           |  3 +-
 .../eng/RESISC45Classification.py             |  3 +-
 .../eng/STL10Classification.py                |  3 +-
 .../eng/SUN397Classification.py               |  3 +-
 .../eng/StanfordCarsClassification.py         |  3 +-
 .../ZeroshotClassification/eng/Birdsnap.py    |  3 +-
 .../Image/ZeroshotClassification/eng/CIFAR.py |  3 +-
 .../ZeroshotClassification/eng/Caltech101.py  |  3 +-
 .../Image/ZeroshotClassification/eng/DTD.py   |  3 +-
 .../ZeroshotClassification/eng/EuroSAT.py     |  3 +-
 .../ZeroshotClassification/eng/FER2013.py     |  3 +-
 .../eng/FGVCAircraft.py                       |  3 +-
 .../ZeroshotClassification/eng/Food101.py     |  3 +-
 .../Image/ZeroshotClassification/eng/MNIST.py |  3 +-
 .../ZeroshotClassification/eng/OxfordPets.py  |  3 +-
 .../ZeroshotClassification/eng/RESISC45.py    |  3 +-
 .../Image/ZeroshotClassification/eng/STL10.py |  3 +-
 .../ZeroshotClassification/eng/SUN397.py      |  3 +-
 .../eng/StanfordCars.py                       |  3 +-
 62 files changed, 103 insertions(+), 170 deletions(-)

diff --git a/mteb/abstasks/Image/AbsTaskAny2AnyRetrieval.py b/mteb/abstasks/Image/AbsTaskAny2AnyRetrieval.py
index 9c5987f4b1..c640988e91 100644
--- a/mteb/abstasks/Image/AbsTaskAny2AnyRetrieval.py
+++ b/mteb/abstasks/Image/AbsTaskAny2AnyRetrieval.py
@@ -12,9 +12,9 @@
 from datasets import Features, Value, load_dataset
 from PIL import Image
 
-from ..AbsTask import AbsTask
 from ...evaluation.evaluators import Any2AnyRetrievalEvaluator
 from ...load_results.mteb_results import ScoresDict
+from ..AbsTask import AbsTask
 
 logger = logging.getLogger(__name__)
 
diff --git a/mteb/abstasks/Image/AbsTaskImageClassification.py b/mteb/abstasks/Image/AbsTaskImageClassification.py
index 3a95f2bd29..715f007e10 100644
--- a/mteb/abstasks/Image/AbsTaskImageClassification.py
+++ b/mteb/abstasks/Image/AbsTaskImageClassification.py
@@ -6,7 +6,6 @@
 
 import numpy as np
 
-from ..AbsTask import AbsTask
 from ...encoder_interface import Encoder
 from ...evaluation.evaluators import (
     ImagekNNClassificationEvaluator,
@@ -14,6 +13,7 @@
     ImagelogRegClassificationEvaluator,
 )
 from ...load_results.mteb_results import HFSubset, ScoresDict
+from ..AbsTask import AbsTask
 
 logger = logging.getLogger(__name__)
 
diff --git a/mteb/abstasks/Image/AbsTaskImageClustering.py b/mteb/abstasks/Image/AbsTaskImageClustering.py
index 5370b16b15..3d6f7e88d2 100644
--- a/mteb/abstasks/Image/AbsTaskImageClustering.py
+++ b/mteb/abstasks/Image/AbsTaskImageClustering.py
@@ -5,10 +5,10 @@
 
 from datasets import Dataset
 
-from ..AbsTask import AbsTask
 from ...encoder_interface import Encoder, EncoderWithQueryCorpusEncode
 from ...evaluation.evaluators import ImageClusteringEvaluator
 from ...load_results.mteb_results import HFSubset, ScoresDict
+from ..AbsTask import AbsTask
 
 logger = logging.getLogger(__name__)
 
diff --git a/mteb/abstasks/Image/AbsTaskImageMultilabelClassification.py b/mteb/abstasks/Image/AbsTaskImageMultilabelClassification.py
index 5669575a18..6a0d649f10 100644
--- a/mteb/abstasks/Image/AbsTaskImageMultilabelClassification.py
+++ b/mteb/abstasks/Image/AbsTaskImageMultilabelClassification.py
@@ -12,9 +12,9 @@
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.preprocessing import MultiLabelBinarizer
 
-from ..AbsTask import AbsTask
 from ...encoder_interface import Encoder
 from ...load_results.mteb_results import HFSubset, ScoresDict
+from ..AbsTask import AbsTask
 
 logger = logging.getLogger(__name__)
 
diff --git a/mteb/abstasks/Image/AbsTaskImageTextPairClassification.py b/mteb/abstasks/Image/AbsTaskImageTextPairClassification.py
index 492de11659..81f3094b5c 100644
--- a/mteb/abstasks/Image/AbsTaskImageTextPairClassification.py
+++ b/mteb/abstasks/Image/AbsTaskImageTextPairClassification.py
@@ -6,10 +6,10 @@
 from datasets import Dataset
 from tqdm import tqdm
 
-from ..AbsTask import AbsTask
 from ...encoder_interface import Encoder, EncoderWithQueryCorpusEncode
 from ...evaluation.evaluators import ImageTextPairClassificationEvaluator
 from ...load_results.mteb_results import ScoresDict
+from ..AbsTask import AbsTask
 
 logger = logging.getLogger(__name__)
 
diff --git a/mteb/abstasks/Image/AbsTaskZeroshotClassification.py b/mteb/abstasks/Image/AbsTaskZeroshotClassification.py
index 9d5a55e235..4f23bb46b4 100644
--- a/mteb/abstasks/Image/AbsTaskZeroshotClassification.py
+++ b/mteb/abstasks/Image/AbsTaskZeroshotClassification.py
@@ -5,10 +5,10 @@
 
 from datasets import Dataset
 
-from ..AbsTask import AbsTask
 from ...encoder_interface import Encoder, EncoderWithQueryCorpusEncode
 from ...evaluation.evaluators import ZeroshotClassificationEvaluator
 from ...load_results.mteb_results import ScoresDict
+from ..AbsTask import AbsTask
 
 logger = logging.getLogger(__name__)
 
diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py
index 5db3d01c37..3181c5f5ac 100644
--- a/mteb/models/blip2_models.py
+++ b/mteb/models/blip2_models.py
@@ -4,8 +4,8 @@
 from typing import Any
 
 import torch
-from torch.nn.functional import normalize
 from PIL import Image
+from torch.nn.functional import normalize
 from torch.utils.data import DataLoader
 from tqdm import tqdm
 from transformers import BlipForImageTextRetrieval, BlipProcessor
@@ -22,7 +22,9 @@ def __init__(
     ):
         self.model_name = model_name
         self.device = device
-        self.model = BlipForImageTextRetrieval.from_pretrained(model_name).to(self.device)
+        self.model = BlipForImageTextRetrieval.from_pretrained(model_name).to(
+            self.device
+        )
         self.processor = BlipProcessor.from_pretrained(model_name)
 
     def preprocess(
@@ -47,7 +49,9 @@ def get_text_embeddings(self, texts: list[str], batch_size: int = 32):
                 # different to CLIPModelWrapper: text_encoder instead of get_text_features and apply projection and normalization
                 text_outputs = self.model.text_encoder(**inputs)
                 text_outputs = text_outputs[0]
-                text_outputs = normalize(self.model.text_proj(text_outputs[:,0,:]), dim=-1)
+                text_outputs = normalize(
+                    self.model.text_proj(text_outputs[:, 0, :]), dim=-1
+                )
                 all_text_embeddings.append(text_outputs.cpu())
 
         all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
@@ -67,7 +71,9 @@ def get_image_embeddings(
                     inputs = {k: v.to(self.device) for k, v in inputs.items()}
                     image_outputs = self.model.vision_model(**inputs)
                     image_outputs = image_outputs[0]
-                    image_outputs = normalize(self.model.vision_proj(image_outputs[:,0,:]), dim=-1)
+                    image_outputs = normalize(
+                        self.model.vision_proj(image_outputs[:, 0, :]), dim=-1
+                    )
                     all_image_embeddings.append(image_outputs.cpu())
         else:
             with torch.no_grad():
@@ -80,7 +86,9 @@ def get_image_embeddings(
                     image_outputs = self.model.get_image_features(**inputs)
                     image_outputs = self.model.vision_model(**inputs)
                     image_outputs = image_outputs[0]
-                    image_outputs = normalize(self.model.vision_proj(image_outputs[:,0,:]), dim=-1)
+                    image_outputs = normalize(
+                        self.model.vision_proj(image_outputs[:, 0, :]), dim=-1
+                    )
                     all_image_embeddings.append(image_outputs.cpu())
 
         all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
@@ -220,16 +228,13 @@ def get_fused_embeddings(
 if __name__ == "__main__":
     import mteb
 
-    mdl = mteb.get_model(
-        blip2_opt_2_7b.name, blip2_opt_2_7b.revision
-    )
+    mdl = mteb.get_model(blip2_opt_2_7b.name, blip2_opt_2_7b.revision)
     emb = mdl.get_text_embeddings(["Hello, world!"])
     emb2 = mdl.get_text_embeddings(["Hello there, world!"])
     emb3 = mdl.get_text_embeddings(["Goodbye, person!"])
-    
+
     sim = torch.nn.functional.cosine_similarity(emb, emb2)
     print(sim)
 
     sim = torch.nn.functional.cosine_similarity(emb, emb3)
     print(sim)
-    
diff --git a/mteb/models/blip_models.py b/mteb/models/blip_models.py
index ead46b63e7..dff6014246 100644
--- a/mteb/models/blip_models.py
+++ b/mteb/models/blip_models.py
@@ -5,9 +5,10 @@
 
 import torch
 from PIL import Image
+from torch.nn.functional import normalize
 from torch.utils.data import DataLoader
 from tqdm import tqdm
-from transformers import AutoModel, AutoProcessor
+from transformers import BlipForImageTextRetrieval, BlipProcessor
 
 from mteb.model_meta import ModelMeta
 
@@ -21,8 +22,10 @@ def __init__(
     ):
         self.model_name = model_name
         self.device = device
-        self.model = AutoModel.from_pretrained(model_name).to(self.device)
-        self.processor = AutoProcessor.from_pretrained(model_name)
+        self.model = BlipForImageTextRetrieval.from_pretrained(model_name).to(
+            self.device
+        )
+        self.processor = BlipProcessor.from_pretrained(model_name)
 
     def preprocess(
         self,
@@ -43,7 +46,12 @@ def get_text_embeddings(self, texts: list[str], batch_size: int = 32):
                     text=batch_texts, return_tensors="pt", padding=True, truncation=True
                 )
                 inputs = {k: v.to(self.device) for k, v in inputs.items()}
-                text_outputs = self.model.get_text_features(**inputs)
+                # different to CLIPModelWrapper: text_encoder instead of get_text_features and apply projection and normalization
+                text_outputs = self.model.text_encoder(**inputs)
+                text_outputs = text_outputs[0]
+                text_outputs = normalize(
+                    self.model.text_proj(text_outputs[:, 0, :]), dim=-1
+                )
                 all_text_embeddings.append(text_outputs.cpu())
 
         all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
@@ -61,7 +69,11 @@ def get_image_embeddings(
                         images=batch, return_tensors="pt", padding=True
                     )
                     inputs = {k: v.to(self.device) for k, v in inputs.items()}
-                    image_outputs = self.model.get_image_features(**inputs)
+                    image_outputs = self.model.vision_model(**inputs)
+                    image_outputs = image_outputs[0]
+                    image_outputs = normalize(
+                        self.model.vision_proj(image_outputs[:, 0, :]), dim=-1
+                    )
                     all_image_embeddings.append(image_outputs.cpu())
         else:
             with torch.no_grad():
@@ -72,6 +84,11 @@ def get_image_embeddings(
                     )
                     inputs = {k: v.to(self.device) for k, v in inputs.items()}
                     image_outputs = self.model.get_image_features(**inputs)
+                    image_outputs = self.model.vision_model(**inputs)
+                    image_outputs = image_outputs[0]
+                    image_outputs = normalize(
+                        self.model.vision_proj(image_outputs[:, 0, :]), dim=-1
+                    )
                     all_image_embeddings.append(image_outputs.cpu())
 
         all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
@@ -93,7 +110,6 @@ def get_fused_embeddings(
         fusion_mode="sum",
         batch_size: int = 32,
     ):
-        # TODO: find out if BLIP has a prescribed way of fusing text and image embeddings
         if texts is None and images is None:
             raise ValueError("Either texts or images must be provided")
 
@@ -123,42 +139,6 @@ def get_fused_embeddings(
             return image_embeddings
 
 
-"""
-TODO: implement all model variants
-
-Salesforce/blip-image-captioning-large
-Image-to-Text • Updated Dec 7, 2023 •
-1.16M •
-•
-1.04k
-Salesforce/blip-image-captioning-base
-Image-to-Text • Updated Aug 1, 2023 •
-857k •
-•
-475
-Salesforce/blip-vqa-base
-Visual Question Answering • Updated Dec 7, 2023 •
-168k •
-119
-Salesforce/blip-vqa-capfilt-large
-Visual Question Answering • Updated Jan 22 •
-90.6k •
-44
-Salesforce/blip-itm-base-coco
-Updated Aug 1, 2023 •
-12.8k •
-16
-Salesforce/blip-itm-large-coco
-Updated Aug 1, 2023 •
-9.9k
-Salesforce/blip-itm-base-flickr
-Updated Aug 1, 2023 •
-65
-Salesforce/blip-itm-large-flickr
-Updated Aug 1, 2023 •
-459 •
-2
-"""
 # in descending order of usage (downloads from huggingface)
 blip_image_captioning_large = ModelMeta(
     loader=partial(
@@ -261,8 +241,13 @@ def get_fused_embeddings(
 if __name__ == "__main__":
     import mteb
 
-    mdl = mteb.get_model(
-        blip_image_captioning_base.name, blip_image_captioning_base.revision
-    )
+    mdl = mteb.get_model(blip_itm_base_coco.name, blip_itm_base_coco.revision)
     emb = mdl.get_text_embeddings(["Hello, world!"])
-    print(emb.shape)
+    emb2 = mdl.get_text_embeddings(["Hello there, world!"])
+    emb3 = mdl.get_text_embeddings(["Goodbye, person!"])
+
+    sim = torch.nn.functional.cosine_similarity(emb, emb2)
+    print(sim)
+
+    sim = torch.nn.functional.cosine_similarity(emb, emb3)
+    print(sim)
diff --git a/mteb/models/instructions.py b/mteb/models/instructions.py
index 99054e41d7..4a31f8da02 100644
--- a/mteb/models/instructions.py
+++ b/mteb/models/instructions.py
@@ -2,8 +2,6 @@
 
 from __future__ import annotations
 
-from __future__ import annotations
-
 import mteb
 
 # Prompts from
diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py
index cffe7f7be4..30214c21f2 100644
--- a/mteb/models/ru_sentence_models.py
+++ b/mteb/models/ru_sentence_models.py
@@ -2,8 +2,6 @@
 
 from __future__ import annotations
 
-from __future__ import annotations
-
 from functools import partial
 
 from mteb.model_meta import ModelMeta
diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py
index 33ea17b165..a3603d9eb3 100644
--- a/mteb/models/sentence_transformers_models.py
+++ b/mteb/models/sentence_transformers_models.py
@@ -2,8 +2,6 @@
 
 from __future__ import annotations
 
-from __future__ import annotations
-
 from mteb.model_meta import ModelMeta
 
 paraphrase_langs = [
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/CIRRIT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/CIRRIT2IRetrieval.py
index eb65b82e79..417e5d6caa 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/eng/CIRRIT2IRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/CIRRIT2IRetrieval.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class CIRRIT2IRetrieval(AbsTaskAny2AnyRetrieval):
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/FashionIQIT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/FashionIQIT2IRetrieval.py
index b336549557..a58ed15dd5 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/eng/FashionIQIT2IRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/FashionIQIT2IRetrieval.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class FashionIQIT2IRetrieval(AbsTaskAny2AnyRetrieval):
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesI2TRetrieval.py
index 1fcf9f0cb9..817ea1c674 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesI2TRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesI2TRetrieval.py
@@ -2,9 +2,8 @@
 
 from datasets import concatenate_datasets, load_dataset
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = None):
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesT2IRetrieval.py
index 5b2b9bcaef..0a55e446ed 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesT2IRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesT2IRetrieval.py
@@ -2,9 +2,8 @@
 
 from datasets import concatenate_datasets, load_dataset
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = None):
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2ITRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2ITRetrieval.py
index 5029c51ec9..f7cb041bcb 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2ITRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2ITRetrieval.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class InfoSeekIT2ITRetrieval(AbsTaskAny2AnyRetrieval):
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2TRetrieval.py
index cd08aa77b2..cc2b23ea88 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2TRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2TRetrieval.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class InfoSeekIT2TRetrieval(AbsTaskAny2AnyRetrieval):
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py
index af68e278b9..9247a12f88 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py
@@ -2,9 +2,8 @@
 
 from datasets import concatenate_datasets, load_dataset
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = None):
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py
index 7478ddddeb..f214bd2ea5 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py
@@ -2,9 +2,8 @@
 
 from datasets import concatenate_datasets, load_dataset
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = None):
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/NIGHTSI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/NIGHTSI2IRetrieval.py
index 82dcf0894a..73d3f7c280 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/eng/NIGHTSI2IRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/NIGHTSI2IRetrieval.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class NIGHTSI2IRetrieval(AbsTaskAny2AnyRetrieval):
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2ITRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2ITRetrieval.py
index 51d031241c..0f53eb7e6a 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2ITRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2ITRetrieval.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class OVENIT2ITRetrieval(AbsTaskAny2AnyRetrieval):
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2TRetrieval.py
index cfa07350ba..3df5b92625 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2TRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2TRetrieval.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class OVENIT2TRetrieval(AbsTaskAny2AnyRetrieval):
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRI2TRetrieval.py
index fa0f5b5707..eb2c24aeb2 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRI2TRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRI2TRetrieval.py
@@ -2,9 +2,8 @@
 
 from datasets import load_dataset
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = None):
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRT2IRetrieval.py
index c6004e7840..e92bd637f5 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRT2IRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRT2IRetrieval.py
@@ -2,9 +2,8 @@
 
 from datasets import load_dataset
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = None):
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/TUBerlinT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/TUBerlinT2IRetrieval.py
index 018f708ce5..7c7bddfe4c 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/eng/TUBerlinT2IRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/TUBerlinT2IRetrieval.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class TUBerlinT2IRetrieval(AbsTaskAny2AnyRetrieval):
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsI2TRetrieval.py
index c1f1b306ca..2de1713097 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsI2TRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsI2TRetrieval.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class VisualNewsI2TRetrieval(AbsTaskAny2AnyRetrieval):
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsT2IRetrieval.py
index 7457f00d03..091d7a7f00 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsT2IRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsT2IRetrieval.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class VisualNewsT2IRetrieval(AbsTaskAny2AnyRetrieval):
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2ITRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2ITRetrieval.py
index 7086c1c205..50725b79b9 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2ITRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2ITRetrieval.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class WebQAT2ITRetrieval(AbsTaskAny2AnyRetrieval):
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2TRetrieval.py
index 6a4efb261a..14c9c02148 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2TRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2TRetrieval.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class WebQAT2TRetrieval(AbsTaskAny2AnyRetrieval):
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/WITT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/WITT2IRetrieval.py
index a0395594a2..5de06b937f 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/WITT2IRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/WITT2IRetrieval.py
@@ -2,10 +2,9 @@
 
 from datasets import Dataset, DatasetDict, load_dataset
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
 from mteb.abstasks.MultilingualTask import MultilingualTask
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 _LANGUAGES = {
     "ar": ["ara-Arab"],
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XFlickr30kCoT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XFlickr30kCoT2IRetrieval.py
index 92f4a9c2c0..65c886f314 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XFlickr30kCoT2IRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XFlickr30kCoT2IRetrieval.py
@@ -2,10 +2,9 @@
 
 from datasets import DatasetDict, load_dataset
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
 from mteb.abstasks.MultilingualTask import MultilingualTask
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 _LANGUAGES = {
     "de": ["deu-Latn"],
diff --git a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XM3600T2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XM3600T2IRetrieval.py
index 8cb7f0e9d1..687c9f0446 100644
--- a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XM3600T2IRetrieval.py
+++ b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XM3600T2IRetrieval.py
@@ -2,10 +2,9 @@
 
 from datasets import Dataset, DatasetDict, load_dataset
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
 from mteb.abstasks.MultilingualTask import MultilingualTask
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 _LANGUAGES = {
     "ar": ["ara-Arab"],
diff --git a/mteb/tasks/Image/Clustering/eng/CIFAR.py b/mteb/tasks/Image/Clustering/eng/CIFAR.py
index 01b493233c..e7f7a1d633 100644
--- a/mteb/tasks/Image/Clustering/eng/CIFAR.py
+++ b/mteb/tasks/Image/Clustering/eng/CIFAR.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskImageClustering import AbsTaskImageClustering
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class CIFAR10Clustering(AbsTaskImageClustering):
diff --git a/mteb/tasks/Image/ImageClassification/eng/BirdsnapClassification.py b/mteb/tasks/Image/ImageClassification/eng/BirdsnapClassification.py
index a104d51e13..38016e5e79 100644
--- a/mteb/tasks/Image/ImageClassification/eng/BirdsnapClassification.py
+++ b/mteb/tasks/Image/ImageClassification/eng/BirdsnapClassification.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class BirdsnapClassification(AbsTaskImageClassification):
diff --git a/mteb/tasks/Image/ImageClassification/eng/CIFAR.py b/mteb/tasks/Image/ImageClassification/eng/CIFAR.py
index 75e3cdf6fc..9b4f45e387 100644
--- a/mteb/tasks/Image/ImageClassification/eng/CIFAR.py
+++ b/mteb/tasks/Image/ImageClassification/eng/CIFAR.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class CIFAR10Classification(AbsTaskImageClassification):
diff --git a/mteb/tasks/Image/ImageClassification/eng/Caltech101Classification.py b/mteb/tasks/Image/ImageClassification/eng/Caltech101Classification.py
index 0175cd8663..fe62f955b3 100644
--- a/mteb/tasks/Image/ImageClassification/eng/Caltech101Classification.py
+++ b/mteb/tasks/Image/ImageClassification/eng/Caltech101Classification.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class Caltech101Classification(AbsTaskImageClassification):
diff --git a/mteb/tasks/Image/ImageClassification/eng/DTDClassification.py b/mteb/tasks/Image/ImageClassification/eng/DTDClassification.py
index 2f921e5587..25f6ba0401 100644
--- a/mteb/tasks/Image/ImageClassification/eng/DTDClassification.py
+++ b/mteb/tasks/Image/ImageClassification/eng/DTDClassification.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class DTDClassification(AbsTaskImageClassification):
diff --git a/mteb/tasks/Image/ImageClassification/eng/EuroSATClassification.py b/mteb/tasks/Image/ImageClassification/eng/EuroSATClassification.py
index b849d93c0b..4930c13d1b 100644
--- a/mteb/tasks/Image/ImageClassification/eng/EuroSATClassification.py
+++ b/mteb/tasks/Image/ImageClassification/eng/EuroSATClassification.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class EuroSATClassification(AbsTaskImageClassification):
diff --git a/mteb/tasks/Image/ImageClassification/eng/FER2013Classification.py b/mteb/tasks/Image/ImageClassification/eng/FER2013Classification.py
index 2081683154..9db8b017f7 100644
--- a/mteb/tasks/Image/ImageClassification/eng/FER2013Classification.py
+++ b/mteb/tasks/Image/ImageClassification/eng/FER2013Classification.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class FER2013Classification(AbsTaskImageClassification):
diff --git a/mteb/tasks/Image/ImageClassification/eng/FGVCAircraftClassification.py b/mteb/tasks/Image/ImageClassification/eng/FGVCAircraftClassification.py
index bb09f32426..9b061e6dd1 100644
--- a/mteb/tasks/Image/ImageClassification/eng/FGVCAircraftClassification.py
+++ b/mteb/tasks/Image/ImageClassification/eng/FGVCAircraftClassification.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class FGVCAircraftClassification(AbsTaskImageClassification):
diff --git a/mteb/tasks/Image/ImageClassification/eng/Food101Classification.py b/mteb/tasks/Image/ImageClassification/eng/Food101Classification.py
index 533b2c2145..04389db8f1 100644
--- a/mteb/tasks/Image/ImageClassification/eng/Food101Classification.py
+++ b/mteb/tasks/Image/ImageClassification/eng/Food101Classification.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class Food101Classification(AbsTaskImageClassification):
diff --git a/mteb/tasks/Image/ImageClassification/eng/MNISTClassification.py b/mteb/tasks/Image/ImageClassification/eng/MNISTClassification.py
index 82de6fab16..f3831abdb4 100644
--- a/mteb/tasks/Image/ImageClassification/eng/MNISTClassification.py
+++ b/mteb/tasks/Image/ImageClassification/eng/MNISTClassification.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class MNISTClassification(AbsTaskImageClassification):
diff --git a/mteb/tasks/Image/ImageClassification/eng/OxfordFlowersClassification.py b/mteb/tasks/Image/ImageClassification/eng/OxfordFlowersClassification.py
index dce55d9362..c0a10de48d 100644
--- a/mteb/tasks/Image/ImageClassification/eng/OxfordFlowersClassification.py
+++ b/mteb/tasks/Image/ImageClassification/eng/OxfordFlowersClassification.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class OxfordFlowersClassification(AbsTaskImageClassification):
diff --git a/mteb/tasks/Image/ImageClassification/eng/OxfordPetsClassification.py b/mteb/tasks/Image/ImageClassification/eng/OxfordPetsClassification.py
index 0277098d64..cf537648ed 100644
--- a/mteb/tasks/Image/ImageClassification/eng/OxfordPetsClassification.py
+++ b/mteb/tasks/Image/ImageClassification/eng/OxfordPetsClassification.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class OxfordPetsClassification(AbsTaskImageClassification):
diff --git a/mteb/tasks/Image/ImageClassification/eng/RESISC45Classification.py b/mteb/tasks/Image/ImageClassification/eng/RESISC45Classification.py
index e883db4c6e..afbc8fe1da 100644
--- a/mteb/tasks/Image/ImageClassification/eng/RESISC45Classification.py
+++ b/mteb/tasks/Image/ImageClassification/eng/RESISC45Classification.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class RESISC45Classification(AbsTaskImageClassification):
diff --git a/mteb/tasks/Image/ImageClassification/eng/STL10Classification.py b/mteb/tasks/Image/ImageClassification/eng/STL10Classification.py
index 9b9fcf3ef4..9531e1c1f6 100644
--- a/mteb/tasks/Image/ImageClassification/eng/STL10Classification.py
+++ b/mteb/tasks/Image/ImageClassification/eng/STL10Classification.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class STL10Classification(AbsTaskImageClassification):
diff --git a/mteb/tasks/Image/ImageClassification/eng/SUN397Classification.py b/mteb/tasks/Image/ImageClassification/eng/SUN397Classification.py
index 414f3560e6..eef0ccbfcb 100644
--- a/mteb/tasks/Image/ImageClassification/eng/SUN397Classification.py
+++ b/mteb/tasks/Image/ImageClassification/eng/SUN397Classification.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class SUN397Classification(AbsTaskImageClassification):
diff --git a/mteb/tasks/Image/ImageClassification/eng/StanfordCarsClassification.py b/mteb/tasks/Image/ImageClassification/eng/StanfordCarsClassification.py
index 1fa4f64af2..e4561b2165 100644
--- a/mteb/tasks/Image/ImageClassification/eng/StanfordCarsClassification.py
+++ b/mteb/tasks/Image/ImageClassification/eng/StanfordCarsClassification.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskImageClassification import AbsTaskImageClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class StanfordCarsClassification(AbsTaskImageClassification):
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/Birdsnap.py b/mteb/tasks/Image/ZeroshotClassification/eng/Birdsnap.py
index 9273b66add..ed31e3f89f 100644
--- a/mteb/tasks/Image/ZeroshotClassification/eng/Birdsnap.py
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/Birdsnap.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
     AbsTaskZeroshotClassification,
 )
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class BirdsnapClassification(AbsTaskZeroshotClassification):
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/CIFAR.py b/mteb/tasks/Image/ZeroshotClassification/eng/CIFAR.py
index 517bf565cc..81103a0f1d 100644
--- a/mteb/tasks/Image/ZeroshotClassification/eng/CIFAR.py
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/CIFAR.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
     AbsTaskZeroshotClassification,
 )
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class CIFAR10ZeroShotClassification(AbsTaskZeroshotClassification):
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/Caltech101.py b/mteb/tasks/Image/ZeroshotClassification/eng/Caltech101.py
index f07c423939..ab7ca141cb 100644
--- a/mteb/tasks/Image/ZeroshotClassification/eng/Caltech101.py
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/Caltech101.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
     AbsTaskZeroshotClassification,
 )
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class Caltech101Classification(AbsTaskZeroshotClassification):
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/DTD.py b/mteb/tasks/Image/ZeroshotClassification/eng/DTD.py
index caea933534..27ef0a6f3d 100644
--- a/mteb/tasks/Image/ZeroshotClassification/eng/DTD.py
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/DTD.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
     AbsTaskZeroshotClassification,
 )
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class DTDClassification(AbsTaskZeroshotClassification):
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/EuroSAT.py b/mteb/tasks/Image/ZeroshotClassification/eng/EuroSAT.py
index 275487580d..de6fb4c434 100644
--- a/mteb/tasks/Image/ZeroshotClassification/eng/EuroSAT.py
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/EuroSAT.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
     AbsTaskZeroshotClassification,
 )
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class EuroSATClassification(AbsTaskZeroshotClassification):
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/FER2013.py b/mteb/tasks/Image/ZeroshotClassification/eng/FER2013.py
index febbb27e5e..9cfa0dd3e9 100644
--- a/mteb/tasks/Image/ZeroshotClassification/eng/FER2013.py
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/FER2013.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
     AbsTaskZeroshotClassification,
 )
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class FER2013Classification(AbsTaskZeroshotClassification):
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/FGVCAircraft.py b/mteb/tasks/Image/ZeroshotClassification/eng/FGVCAircraft.py
index 833afde477..c15e0b6d4b 100644
--- a/mteb/tasks/Image/ZeroshotClassification/eng/FGVCAircraft.py
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/FGVCAircraft.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
     AbsTaskZeroshotClassification,
 )
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class FGVCAircraftClassification(AbsTaskZeroshotClassification):
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/Food101.py b/mteb/tasks/Image/ZeroshotClassification/eng/Food101.py
index a2b93c2471..fd073ac412 100644
--- a/mteb/tasks/Image/ZeroshotClassification/eng/Food101.py
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/Food101.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
     AbsTaskZeroshotClassification,
 )
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class Food101Classification(AbsTaskZeroshotClassification):
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/MNIST.py b/mteb/tasks/Image/ZeroshotClassification/eng/MNIST.py
index f343cb9211..253fa938ac 100644
--- a/mteb/tasks/Image/ZeroshotClassification/eng/MNIST.py
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/MNIST.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
     AbsTaskZeroshotClassification,
 )
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class MNISTClassification(AbsTaskZeroshotClassification):
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/OxfordPets.py b/mteb/tasks/Image/ZeroshotClassification/eng/OxfordPets.py
index 2145fe8bff..3da580af1b 100644
--- a/mteb/tasks/Image/ZeroshotClassification/eng/OxfordPets.py
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/OxfordPets.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
     AbsTaskZeroshotClassification,
 )
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class OxfordPetsClassification(AbsTaskZeroshotClassification):
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/RESISC45.py b/mteb/tasks/Image/ZeroshotClassification/eng/RESISC45.py
index 7ba9824455..d6fb98ba6c 100644
--- a/mteb/tasks/Image/ZeroshotClassification/eng/RESISC45.py
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/RESISC45.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
     AbsTaskZeroshotClassification,
 )
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class RESISC45Classification(AbsTaskZeroshotClassification):
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/STL10.py b/mteb/tasks/Image/ZeroshotClassification/eng/STL10.py
index 11c53d5032..8b0f42d08d 100644
--- a/mteb/tasks/Image/ZeroshotClassification/eng/STL10.py
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/STL10.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
     AbsTaskZeroshotClassification,
 )
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class STL10Classification(AbsTaskZeroshotClassification):
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/SUN397.py b/mteb/tasks/Image/ZeroshotClassification/eng/SUN397.py
index c3e67879b0..64252584b8 100644
--- a/mteb/tasks/Image/ZeroshotClassification/eng/SUN397.py
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/SUN397.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
     AbsTaskZeroshotClassification,
 )
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class SUN397Classification(AbsTaskZeroshotClassification):
diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/StanfordCars.py b/mteb/tasks/Image/ZeroshotClassification/eng/StanfordCars.py
index 0e881b65f0..c8cc639a4e 100644
--- a/mteb/tasks/Image/ZeroshotClassification/eng/StanfordCars.py
+++ b/mteb/tasks/Image/ZeroshotClassification/eng/StanfordCars.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from mteb.abstasks.Image.AbsTaskZeroshotClassification import (
     AbsTaskZeroshotClassification,
 )
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class StanfordCarsClassification(AbsTaskZeroshotClassification):

From 236a94f67aa630e4cd2295f481f886e08c98d310 Mon Sep 17 00:00:00 2001
From: Jamie-Stirling <stirlingj00@gmail.com>
Date: Fri, 13 Sep 2024 16:39:39 +0100
Subject: [PATCH 05/17] wip: implement blip2 wrapper

---
 mteb/models/__init__.py     |   2 +
 mteb/models/blip2_models.py | 257 ++++++++++++++++++------------------
 2 files changed, 133 insertions(+), 126 deletions(-)

diff --git a/mteb/models/__init__.py b/mteb/models/__init__.py
index 94358143c1..2229b70239 100644
--- a/mteb/models/__init__.py
+++ b/mteb/models/__init__.py
@@ -11,6 +11,7 @@
     align_models,
     bge_models,
     blip_models,
+    blip2_models,
     bm25,
     clip_models,
     cohere_models,
@@ -132,6 +133,7 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe
     align_models,
     bge_models,
     blip_models,
+    blip2_models,
     bm25,
     cohere_models,
     dino_models,
diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py
index 3181c5f5ac..16acabc0ef 100644
--- a/mteb/models/blip2_models.py
+++ b/mteb/models/blip2_models.py
@@ -8,140 +8,145 @@
 from torch.nn.functional import normalize
 from torch.utils.data import DataLoader
 from tqdm import tqdm
-from transformers import BlipForImageTextRetrieval, BlipProcessor
+from transformers import Blip2Processor
 
 from mteb.model_meta import ModelMeta
 
+def blip2_loader(**kwargs):
+    try:  # a temporal fix for the dependency issues of vista models.
+        from lavis.models import load_model_and_preprocess
 
-class BLIP2ModelWrapper:
-    def __init__(
-        self,
-        model_name: str,
-        device: str = "cuda" if torch.cuda.is_available() else "cpu",
-        **kwargs: Any,
-    ):
-        self.model_name = model_name
-        self.device = device
-        self.model = BlipForImageTextRetrieval.from_pretrained(model_name).to(
-            self.device
-        )
-        self.processor = BlipProcessor.from_pretrained(model_name)
-
-    def preprocess(
-        self,
-        texts: list[str],
-        images: list[Image.Image],
-    ):
-        return self.processor(
-            text=texts, images=images, return_tensors="pt", padding=True
+    except ImportError:
+        raise ImportError(
+            "Please install `pip install salesforce-lavis` to use BLIP-2 models."
         )
+    
+    class BLIP2ModelWrapper:
+        def __init__(
+            self,
+            model_name: str,
+            device: str = "cuda" if torch.cuda.is_available() else "cpu",
+            **kwargs: Any,
+        ):
+            self.model_name = model_name
+            self.device = device
+            self.model, self.vis_processors, self.txt_processors = load_model_and_preprocess(name="blip2-opt-2.7b", model_type="base")
+            self.model = self.model.to(self.device)
+            self.processor = Blip2Processor.from_pretrained(model_name)
+
+        def preprocess(
+            self,
+            texts: list[str],
+            images: list[Image.Image],
+        ):
+            return self.processor(
+                text=texts, images=images, return_tensors="pt", padding=True
+            )
+
+        def get_text_embeddings(self, texts: list[str], batch_size: int = 32):
+            all_text_embeddings = []
 
-    def get_text_embeddings(self, texts: list[str], batch_size: int = 32):
-        all_text_embeddings = []
-
-        with torch.no_grad():
-            for i in tqdm(range(0, len(texts), batch_size)):
-                batch_texts = texts[i : i + batch_size]
-                inputs = self.processor(
-                    text=batch_texts, return_tensors="pt", padding=True, truncation=True
-                )
-                inputs = {k: v.to(self.device) for k, v in inputs.items()}
-                # different to CLIPModelWrapper: text_encoder instead of get_text_features and apply projection and normalization
-                text_outputs = self.model.text_encoder(**inputs)
-                text_outputs = text_outputs[0]
-                text_outputs = normalize(
-                    self.model.text_proj(text_outputs[:, 0, :]), dim=-1
-                )
-                all_text_embeddings.append(text_outputs.cpu())
-
-        all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
-        return all_text_embeddings
-
-    def get_image_embeddings(
-        self, images: list[Image.Image] | DataLoader, batch_size: int = 32
-    ):
-        all_image_embeddings = []
-
-        if isinstance(images, DataLoader):
-            with torch.no_grad():
-                for batch in tqdm(images):
-                    inputs = self.processor(
-                        images=batch, return_tensors="pt", padding=True
-                    )
-                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
-                    image_outputs = self.model.vision_model(**inputs)
-                    image_outputs = image_outputs[0]
-                    image_outputs = normalize(
-                        self.model.vision_proj(image_outputs[:, 0, :]), dim=-1
-                    )
-                    all_image_embeddings.append(image_outputs.cpu())
-        else:
             with torch.no_grad():
-                for i in tqdm(range(0, len(images), batch_size)):
-                    batch_images = images[i : i + batch_size]
+                for i in tqdm(range(0, len(texts), batch_size)):
+                    batch_texts = texts[i : i + batch_size]
                     inputs = self.processor(
-                        images=batch_images, return_tensors="pt", padding=True
+                        text=batch_texts, return_tensors="pt", padding=True, truncation=True
                     )
                     inputs = {k: v.to(self.device) for k, v in inputs.items()}
-                    image_outputs = self.model.get_image_features(**inputs)
-                    image_outputs = self.model.vision_model(**inputs)
-                    image_outputs = image_outputs[0]
-                    image_outputs = normalize(
-                        self.model.vision_proj(image_outputs[:, 0, :]), dim=-1
-                    )
-                    all_image_embeddings.append(image_outputs.cpu())
-
-        all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
-        return all_image_embeddings
-
-    def calculate_probs(self, text_embeddings, image_embeddings):
-        text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
-        image_embeddings = image_embeddings / image_embeddings.norm(
-            dim=-1, keepdim=True
-        )
-        logits = torch.matmul(image_embeddings, text_embeddings.T)
-        probs = (logits * 100).softmax(dim=-1)
-        return probs
-
-    def get_fused_embeddings(
-        self,
-        texts: list[str] = None,
-        images: list[Image.Image] | DataLoader = None,
-        fusion_mode="sum",
-        batch_size: int = 32,
-    ):
-        # TODO: find out if BLIP has a prescribed way of fusing text and image embeddings
-        if texts is None and images is None:
-            raise ValueError("Either texts or images must be provided")
-
-        text_embeddings = None
-        image_embeddings = None
-
-        if texts is not None:
-            text_embeddings = self.get_text_embeddings(texts, batch_size)
-
-        if images is not None:
-            image_embeddings = self.get_image_embeddings(images, batch_size)
-
-        if text_embeddings is not None and image_embeddings is not None:
-            if len(text_embeddings) != len(image_embeddings):
-                raise ValueError(
-                    "The number of texts and images must have the same length"
-                )
-            if fusion_mode == "sum":
-                fused_embeddings = text_embeddings + image_embeddings
+        
+                    text_outputs = self.model.forward_text(**inputs)
+                    text_outputs = torch.functional.normalize(self.model.text_proj(text_outputs))
+                    all_text_embeddings.append(text_outputs.cpu())
+
+            all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
+            return all_text_embeddings
+
+        def get_image_embeddings(
+            self, images: list[Image.Image] | DataLoader, batch_size: int = 32
+        ):
+            all_image_embeddings = []
+
+            if isinstance(images, DataLoader):
+                with torch.no_grad():
+                    for batch in tqdm(images):
+                        inputs = self.processor(
+                            images=batch, return_tensors="pt", padding=True
+                        )
+                        inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                        image_outputs = self.model.vision_model(**inputs)
+                        image_outputs = image_outputs[0]
+                        image_outputs = normalize(
+                            self.model.vision_proj(image_outputs[:, 0, :]), dim=-1
+                        )
+                        all_image_embeddings.append(image_outputs.cpu())
             else:
-                # to do: add other fusion mode
-                raise ValueError(f"fusion mode {fusion_mode} hasn't been implemented")
-            return fused_embeddings
-        elif text_embeddings is not None:
-            return text_embeddings
-        elif image_embeddings is not None:
-            return image_embeddings
+                with torch.no_grad():
+                    for i in tqdm(range(0, len(images), batch_size)):
+                        batch_images = images[i : i + batch_size]
+                        inputs = self.processor(
+                            images=batch_images, return_tensors="pt", padding=True
+                        )
+                        inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                        image_outputs = self.model.get_image_features(**inputs)
+                        image_outputs = self.model.vision_model(**inputs)
+                        image_outputs = image_outputs[0]
+                        image_outputs = normalize(
+                            self.model.vision_proj(image_outputs[:, 0, :]), dim=-1
+                        )
+                        all_image_embeddings.append(image_outputs.cpu())
+
+            all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
+            return all_image_embeddings
+
+        def calculate_probs(self, text_embeddings, image_embeddings):
+            text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
+            image_embeddings = image_embeddings / image_embeddings.norm(
+                dim=-1, keepdim=True
+            )
+            logits = torch.matmul(image_embeddings, text_embeddings.T)
+            probs = (logits * 100).softmax(dim=-1)
+            return probs
+
+        def get_fused_embeddings(
+            self,
+            texts: list[str] = None,
+            images: list[Image.Image] | DataLoader = None,
+            fusion_mode="sum",
+            batch_size: int = 32,
+        ):
+            # TODO: find out if BLIP has a prescribed way of fusing text and image embeddings
+            if texts is None and images is None:
+                raise ValueError("Either texts or images must be provided")
+
+            text_embeddings = None
+            image_embeddings = None
+
+            if texts is not None:
+                text_embeddings = self.get_text_embeddings(texts, batch_size)
+
+            if images is not None:
+                image_embeddings = self.get_image_embeddings(images, batch_size)
+
+            if text_embeddings is not None and image_embeddings is not None:
+                if len(text_embeddings) != len(image_embeddings):
+                    raise ValueError(
+                        "The number of texts and images must have the same length"
+                    )
+                if fusion_mode == "sum":
+                    fused_embeddings = text_embeddings + image_embeddings
+                else:
+                    # to do: add other fusion mode
+                    raise ValueError(f"fusion mode {fusion_mode} hasn't been implemented")
+                return fused_embeddings
+            elif text_embeddings is not None:
+                return text_embeddings
+            elif image_embeddings is not None:
+                return image_embeddings
+    
+    return BLIP2ModelWrapper(**kwargs)
 
 
 """
-
 Salesforce/blip2-opt-2.7b
 Image-to-Text • Updated Mar 22 •
 588k •
@@ -167,7 +172,7 @@ def get_fused_embeddings(
 
 blip2_opt_2_7b = ModelMeta(
     loader=partial(
-        BLIP2ModelWrapper,
+        blip2_loader,
         model_name="Salesforce/blip2-opt-2.7b",
     ),
     name="Salesforce/blip2-opt-2.7b",
@@ -179,7 +184,7 @@ def get_fused_embeddings(
 
 blip2_flan_t5_xxl = ModelMeta(
     loader=partial(
-        BLIP2ModelWrapper,
+        blip2_loader,
         model_name="Salesforce/blip2-flan-t5-xxl",
     ),
     name="Salesforce/blip2-flan-t5-xxl",
@@ -191,7 +196,7 @@ def get_fused_embeddings(
 
 blip2_opt_6_7b_coco = ModelMeta(
     loader=partial(
-        BLIP2ModelWrapper,
+        blip2_loader,
         model_name="Salesforce/blip2-opt-6.7b-coco",
     ),
     name="Salesforce/blip2-opt-6.7b-coco",
@@ -203,7 +208,7 @@ def get_fused_embeddings(
 
 blip2_opt_6_7b = ModelMeta(
     loader=partial(
-        BLIP2ModelWrapper,
+        blip2_loader,
         model_name="Salesforce/blip2-opt-6.7b",
     ),
     name="Salesforce/blip2-opt-6.7b",
@@ -215,7 +220,7 @@ def get_fused_embeddings(
 
 blip2_flan_t5_xl = ModelMeta(
     loader=partial(
-        BLIP2ModelWrapper,
+        blip2_loader,
         model_name="Salesforce/blip2-flan-t5-xl",
     ),
     name="Salesforce/blip2-flan-t5-xl",
@@ -228,7 +233,7 @@ def get_fused_embeddings(
 if __name__ == "__main__":
     import mteb
 
-    mdl = mteb.get_model(blip2_opt_2_7b.name, blip2_opt_2_7b.revision)
+    mdl = mteb.get_model(blip2_opt_2_7b.name, blip2_opt_2_7b.revision, device="cpu")
     emb = mdl.get_text_embeddings(["Hello, world!"])
     emb2 = mdl.get_text_embeddings(["Hello there, world!"])
     emb3 = mdl.get_text_embeddings(["Goodbye, person!"])

From 1f2f8c3d0960cd0af18620060262a53bfbe91f5c Mon Sep 17 00:00:00 2001
From: Jamie-Stirling <stirlingj00@gmail.com>
Date: Sun, 15 Sep 2024 21:18:05 +0100
Subject: [PATCH 06/17] feat: add blip2 models, still mismatched names

---
 mteb/models/blip2_models.py | 133 ++++++++++++++++++------------------
 1 file changed, 66 insertions(+), 67 deletions(-)

diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py
index 16acabc0ef..12dc1cfa51 100644
--- a/mteb/models/blip2_models.py
+++ b/mteb/models/blip2_models.py
@@ -2,20 +2,21 @@
 
 from functools import partial
 from typing import Any
+from types import SimpleNamespace
 
 import torch
 from PIL import Image
 from torch.nn.functional import normalize
 from torch.utils.data import DataLoader
 from tqdm import tqdm
-from transformers import Blip2Processor
+from transformers import Blip2Processor, BertTokenizer
 
 from mteb.model_meta import ModelMeta
 
 def blip2_loader(**kwargs):
     try:  # a temporal fix for the dependency issues of vista models.
         from lavis.models import load_model_and_preprocess
-
+        from lavis.models.blip2_models.blip2_image_text_matching import Blip2ITM, Blip2Qformer
     except ImportError:
         raise ImportError(
             "Please install `pip install salesforce-lavis` to use BLIP-2 models."
@@ -30,8 +31,7 @@ def __init__(
         ):
             self.model_name = model_name
             self.device = device
-            self.model, self.vis_processors, self.txt_processors = load_model_and_preprocess(name="blip2-opt-2.7b", model_type="base")
-            self.model = self.model.to(self.device)
+            self.model = Blip2ITM.from_pretrained("pretrain").to(self.device).float()
             self.processor = Blip2Processor.from_pretrained(model_name)
 
         def preprocess(
@@ -49,13 +49,15 @@ def get_text_embeddings(self, texts: list[str], batch_size: int = 32):
             with torch.no_grad():
                 for i in tqdm(range(0, len(texts), batch_size)):
                     batch_texts = texts[i : i + batch_size]
-                    inputs = self.processor(
-                        text=batch_texts, return_tensors="pt", padding=True, truncation=True
-                    )
-                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
-        
-                    text_outputs = self.model.forward_text(**inputs)
-                    text_outputs = torch.functional.normalize(self.model.text_proj(text_outputs))
+                    text_tokens = self.model.tokenizer(
+                        batch_texts,
+                        padding="max_length",
+                        truncation=True,
+                        max_length=self.model.max_txt_len,
+                        return_tensors="pt",
+                    ).to(self.device)
+                    text_outputs = self.model.forward_text(text_tokens)
+                    text_outputs = normalize(self.model.text_proj(text_outputs))
                     all_text_embeddings.append(text_outputs.cpu())
 
             all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
@@ -72,8 +74,7 @@ def get_image_embeddings(
                         inputs = self.processor(
                             images=batch, return_tensors="pt", padding=True
                         )
-                        inputs = {k: v.to(self.device) for k, v in inputs.items()}
-                        image_outputs = self.model.vision_model(**inputs)
+                        image_outputs = self.model.forward_image(inputs["pixel_values"].to(self.device))
                         image_outputs = image_outputs[0]
                         image_outputs = normalize(
                             self.model.vision_proj(image_outputs[:, 0, :]), dim=-1
@@ -85,10 +86,8 @@ def get_image_embeddings(
                         batch_images = images[i : i + batch_size]
                         inputs = self.processor(
                             images=batch_images, return_tensors="pt", padding=True
-                        )
-                        inputs = {k: v.to(self.device) for k, v in inputs.items()}
-                        image_outputs = self.model.get_image_features(**inputs)
-                        image_outputs = self.model.vision_model(**inputs)
+                        )["pixel_values"].to(self.device)
+                        image_outputs = self.model.forward_image(inputs)
                         image_outputs = image_outputs[0]
                         image_outputs = normalize(
                             self.model.vision_proj(image_outputs[:, 0, :]), dim=-1
@@ -98,6 +97,43 @@ def get_image_embeddings(
             all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
             return all_image_embeddings
 
+        def get_multimodal_embeddings(
+            self, texts, images, batch_size
+        ):
+            all_multimodal_embeddings = []
+
+            with torch.no_grad():
+                if isinstance(images, DataLoader):
+                    for batch_images, i in tqdm(zip(images, range(0, len(texts), batch_size))):
+                        batch_texts = texts[i : i + batch_size]
+                        
+                        image_inputs  = self.processor(
+                            images=batch_images, return_tensors="pt", padding=True
+                        )["pixel_values"].to(self.device)
+                        multimodal_outputs = self.model.extract_features({
+                            "text_input": batch_texts,
+                            "image": image_inputs
+                        }).multimodal_embeds
+
+                        all_multimodal_embeddings.append(multimodal_outputs.cpu())
+                else:
+                    for i in tqdm(range(0, len(texts), batch_size)):
+                        batch_images = images[i : i + batch_size]
+                        batch_texts = texts[i : i + batch_size]
+
+                        image_inputs  = self.processor(
+                            images=batch_images, return_tensors="pt", padding=True
+                        )["pixel_values"].to(self.device)
+                        multimodal_outputs = self.model.extract_features({
+                            "text_input": batch_texts,
+                            "image": image_inputs
+                        }).multimodal_embeds
+
+                        all_multimodal_embeddings.append(multimodal_outputs.cpu())
+                        
+
+            return torch.cat(all_multimodal_embeddings, dim=0)
+
         def calculate_probs(self, text_embeddings, image_embeddings):
             text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
             image_embeddings = image_embeddings / image_embeddings.norm(
@@ -111,7 +147,7 @@ def get_fused_embeddings(
             self,
             texts: list[str] = None,
             images: list[Image.Image] | DataLoader = None,
-            fusion_mode="sum",
+            fusion_mode="multimodal",
             batch_size: int = 32,
         ):
             # TODO: find out if BLIP has a prescribed way of fusing text and image embeddings
@@ -134,6 +170,8 @@ def get_fused_embeddings(
                     )
                 if fusion_mode == "sum":
                     fused_embeddings = text_embeddings + image_embeddings
+                if fusion_mode == "multimodal":
+                    fused_embeddings = self.get_multimodal_embeddings(texts, images, batch_size)
                 else:
                     # to do: add other fusion mode
                     raise ValueError(f"fusion mode {fusion_mode} hasn't been implemented")
@@ -170,7 +208,7 @@ def get_fused_embeddings(
 """
 # in descending order of usage (downloads from huggingface)
 
-blip2_opt_2_7b = ModelMeta(
+blip2_image_text_matching = ModelMeta(
     loader=partial(
         blip2_loader,
         model_name="Salesforce/blip2-opt-2.7b",
@@ -182,58 +220,12 @@ def get_fused_embeddings(
     release_date="2024-03-22",
 )
 
-blip2_flan_t5_xxl = ModelMeta(
-    loader=partial(
-        blip2_loader,
-        model_name="Salesforce/blip2-flan-t5-xxl",
-    ),
-    name="Salesforce/blip2-flan-t5-xxl",
-    languages=["eng_Latn"],
-    open_source=True,
-    revision="43206cbc865b9d5b3dd7d080e5d94b4143ca8e74",
-    release_date="2024-03-29",
-)
-
-blip2_opt_6_7b_coco = ModelMeta(
-    loader=partial(
-        blip2_loader,
-        model_name="Salesforce/blip2-opt-6.7b-coco",
-    ),
-    name="Salesforce/blip2-opt-6.7b-coco",
-    languages=["eng_Latn"],
-    open_source=True,
-    revision="0d580de59320a25a4d2c386387bcef310d5f286e",
-    release_date="2024-03-31",
-)
-
-blip2_opt_6_7b = ModelMeta(
-    loader=partial(
-        blip2_loader,
-        model_name="Salesforce/blip2-opt-6.7b",
-    ),
-    name="Salesforce/blip2-opt-6.7b",
-    languages=["eng_Latn"],
-    open_source=True,
-    revision="1d33d60155fd1323b97556e0f1dd5148a9749f5b",
-    release_date="2024-03-27",
-)
-
-blip2_flan_t5_xl = ModelMeta(
-    loader=partial(
-        blip2_loader,
-        model_name="Salesforce/blip2-flan-t5-xl",
-    ),
-    name="Salesforce/blip2-flan-t5-xl",
-    languages=["eng_Latn"],
-    open_source=True,
-    revision="e5025a34e3e769e72e2aab7f7bfd00bc84d5fd77",
-    release_date="2023-12-13",
-)
 
 if __name__ == "__main__":
     import mteb
+    import PIL.Image
 
-    mdl = mteb.get_model(blip2_opt_2_7b.name, blip2_opt_2_7b.revision, device="cpu")
+    mdl = mteb.get_model(blip2_image_text_matching.name, blip2_image_text_matching.revision, device="cpu")
     emb = mdl.get_text_embeddings(["Hello, world!"])
     emb2 = mdl.get_text_embeddings(["Hello there, world!"])
     emb3 = mdl.get_text_embeddings(["Goodbye, person!"])
@@ -243,3 +235,10 @@ def get_fused_embeddings(
 
     sim = torch.nn.functional.cosine_similarity(emb, emb3)
     print(sim)
+
+    cat_img = Image.open("cat.jpg")
+    cat_text = "An image of a cat"
+
+    multi_emv = mdl.get_multimodal_embeddings([cat_text], [cat_img], 32)
+
+

From 8c6486087ad2790b724c03110b149e337f77b9b0 Mon Sep 17 00:00:00 2001
From: Jamie-Stirling <stirlingj00@gmail.com>
Date: Sun, 15 Sep 2024 21:28:31 +0100
Subject: [PATCH 07/17] fix: remove projections from image and text embeddings

---
 mteb/models/blip2_models.py | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py
index 12dc1cfa51..86f0676b0e 100644
--- a/mteb/models/blip2_models.py
+++ b/mteb/models/blip2_models.py
@@ -57,7 +57,7 @@ def get_text_embeddings(self, texts: list[str], batch_size: int = 32):
                         return_tensors="pt",
                     ).to(self.device)
                     text_outputs = self.model.forward_text(text_tokens)
-                    text_outputs = normalize(self.model.text_proj(text_outputs))
+                    #text_outputs = normalize(self.model.text_proj(text_outputs))
                     all_text_embeddings.append(text_outputs.cpu())
 
             all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
@@ -75,10 +75,8 @@ def get_image_embeddings(
                             images=batch, return_tensors="pt", padding=True
                         )
                         image_outputs = self.model.forward_image(inputs["pixel_values"].to(self.device))
-                        image_outputs = image_outputs[0]
-                        image_outputs = normalize(
-                            self.model.vision_proj(image_outputs[:, 0, :]), dim=-1
-                        )
+                        image_outputs = image_outputs[0][:, 0, :]
+                        #image_outputs = normalize(self.model.vision_proj(image_outputs), dim=-1)
                         all_image_embeddings.append(image_outputs.cpu())
             else:
                 with torch.no_grad():
@@ -98,7 +96,7 @@ def get_image_embeddings(
             return all_image_embeddings
 
         def get_multimodal_embeddings(
-            self, texts, images, batch_size
+            self, texts, images, batch_size=32
         ):
             all_multimodal_embeddings = []
 
@@ -113,7 +111,7 @@ def get_multimodal_embeddings(
                         multimodal_outputs = self.model.extract_features({
                             "text_input": batch_texts,
                             "image": image_inputs
-                        }).multimodal_embeds
+                        }).multimodal_embeds[:,0,:]
 
                         all_multimodal_embeddings.append(multimodal_outputs.cpu())
                 else:
@@ -127,7 +125,7 @@ def get_multimodal_embeddings(
                         multimodal_outputs = self.model.extract_features({
                             "text_input": batch_texts,
                             "image": image_inputs
-                        }).multimodal_embeds
+                        }).multimodal_embeds[:,0,:]
 
                         all_multimodal_embeddings.append(multimodal_outputs.cpu())
                         
@@ -239,6 +237,16 @@ def get_fused_embeddings(
     cat_img = Image.open("cat.jpg")
     cat_text = "An image of a cat"
 
-    multi_emv = mdl.get_multimodal_embeddings([cat_text], [cat_img], 32)
+    multi_cat_emb = mdl.get_multimodal_embeddings([cat_text], [cat_img])
+    text_cat_emb = mdl.get_text_embeddings(["An photo of a cat"])
+    text_dog_emb = mdl.get_text_embeddings(["An image of a dog"])
+
+    print(multi_cat_emb.shape)
+
+    sim1 = torch.nn.functional.cosine_similarity(multi_cat_emb, text_cat_emb)
+    sim2 = torch.nn.functional.cosine_similarity(multi_cat_emb, text_dog_emb)
+
+    print(sim1, sim2)
+
 
 

From 20839ca93b114bf4a5011aeaa41846a9f0f32482 Mon Sep 17 00:00:00 2001
From: Jamie-Stirling <stirlingj00@gmail.com>
Date: Sun, 15 Sep 2024 21:52:30 +0100
Subject: [PATCH 08/17] make lint

---
 mteb/models/__init__.py     |  2 +-
 mteb/models/blip2_models.py | 95 +++++++++++++++----------------------
 2 files changed, 40 insertions(+), 57 deletions(-)

diff --git a/mteb/models/__init__.py b/mteb/models/__init__.py
index 2229b70239..eabe5a2d3f 100644
--- a/mteb/models/__init__.py
+++ b/mteb/models/__init__.py
@@ -10,8 +10,8 @@
 from mteb.models import (
     align_models,
     bge_models,
-    blip_models,
     blip2_models,
+    blip_models,
     bm25,
     clip_models,
     cohere_models,
diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py
index 86f0676b0e..b735b65cf3 100644
--- a/mteb/models/blip2_models.py
+++ b/mteb/models/blip2_models.py
@@ -2,26 +2,29 @@
 
 from functools import partial
 from typing import Any
-from types import SimpleNamespace
 
 import torch
 from PIL import Image
 from torch.nn.functional import normalize
 from torch.utils.data import DataLoader
 from tqdm import tqdm
-from transformers import Blip2Processor, BertTokenizer
+from transformers import Blip2Processor
 
 from mteb.model_meta import ModelMeta
 
+
 def blip2_loader(**kwargs):
     try:  # a temporal fix for the dependency issues of vista models.
         from lavis.models import load_model_and_preprocess
-        from lavis.models.blip2_models.blip2_image_text_matching import Blip2ITM, Blip2Qformer
+        from lavis.models.blip2_models.blip2_image_text_matching import (
+            Blip2ITM,
+            Blip2Qformer,
+        )
     except ImportError:
         raise ImportError(
             "Please install `pip install salesforce-lavis` to use BLIP-2 models."
         )
-    
+
     class BLIP2ModelWrapper:
         def __init__(
             self,
@@ -57,7 +60,7 @@ def get_text_embeddings(self, texts: list[str], batch_size: int = 32):
                         return_tensors="pt",
                     ).to(self.device)
                     text_outputs = self.model.forward_text(text_tokens)
-                    #text_outputs = normalize(self.model.text_proj(text_outputs))
+                    # text_outputs = normalize(self.model.text_proj(text_outputs))
                     all_text_embeddings.append(text_outputs.cpu())
 
             all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
@@ -74,9 +77,11 @@ def get_image_embeddings(
                         inputs = self.processor(
                             images=batch, return_tensors="pt", padding=True
                         )
-                        image_outputs = self.model.forward_image(inputs["pixel_values"].to(self.device))
+                        image_outputs = self.model.forward_image(
+                            inputs["pixel_values"].to(self.device)
+                        )
                         image_outputs = image_outputs[0][:, 0, :]
-                        #image_outputs = normalize(self.model.vision_proj(image_outputs), dim=-1)
+                        # image_outputs = normalize(self.model.vision_proj(image_outputs), dim=-1)
                         all_image_embeddings.append(image_outputs.cpu())
             else:
                 with torch.no_grad():
@@ -95,23 +100,22 @@ def get_image_embeddings(
             all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
             return all_image_embeddings
 
-        def get_multimodal_embeddings(
-            self, texts, images, batch_size=32
-        ):
+        def get_multimodal_embeddings(self, texts, images, batch_size=32):
             all_multimodal_embeddings = []
 
             with torch.no_grad():
                 if isinstance(images, DataLoader):
-                    for batch_images, i in tqdm(zip(images, range(0, len(texts), batch_size))):
+                    for batch_images, i in tqdm(
+                        zip(images, range(0, len(texts), batch_size))
+                    ):
                         batch_texts = texts[i : i + batch_size]
-                        
-                        image_inputs  = self.processor(
+
+                        image_inputs = self.processor(
                             images=batch_images, return_tensors="pt", padding=True
                         )["pixel_values"].to(self.device)
-                        multimodal_outputs = self.model.extract_features({
-                            "text_input": batch_texts,
-                            "image": image_inputs
-                        }).multimodal_embeds[:,0,:]
+                        multimodal_outputs = self.model.extract_features(
+                            {"text_input": batch_texts, "image": image_inputs}
+                        ).multimodal_embeds[:, 0, :]
 
                         all_multimodal_embeddings.append(multimodal_outputs.cpu())
                 else:
@@ -119,21 +123,21 @@ def get_multimodal_embeddings(
                         batch_images = images[i : i + batch_size]
                         batch_texts = texts[i : i + batch_size]
 
-                        image_inputs  = self.processor(
+                        image_inputs = self.processor(
                             images=batch_images, return_tensors="pt", padding=True
                         )["pixel_values"].to(self.device)
-                        multimodal_outputs = self.model.extract_features({
-                            "text_input": batch_texts,
-                            "image": image_inputs
-                        }).multimodal_embeds[:,0,:]
+                        multimodal_outputs = self.model.extract_features(
+                            {"text_input": batch_texts, "image": image_inputs}
+                        ).multimodal_embeds[:, 0, :]
 
                         all_multimodal_embeddings.append(multimodal_outputs.cpu())
-                        
 
             return torch.cat(all_multimodal_embeddings, dim=0)
 
         def calculate_probs(self, text_embeddings, image_embeddings):
-            text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
+            text_embeddings = text_embeddings / text_embeddings.norm(
+                dim=-1, keepdim=True
+            )
             image_embeddings = image_embeddings / image_embeddings.norm(
                 dim=-1, keepdim=True
             )
@@ -169,42 +173,22 @@ def get_fused_embeddings(
                 if fusion_mode == "sum":
                     fused_embeddings = text_embeddings + image_embeddings
                 if fusion_mode == "multimodal":
-                    fused_embeddings = self.get_multimodal_embeddings(texts, images, batch_size)
+                    fused_embeddings = self.get_multimodal_embeddings(
+                        texts, images, batch_size
+                    )
                 else:
                     # to do: add other fusion mode
-                    raise ValueError(f"fusion mode {fusion_mode} hasn't been implemented")
+                    raise ValueError(
+                        f"fusion mode {fusion_mode} hasn't been implemented"
+                    )
                 return fused_embeddings
             elif text_embeddings is not None:
                 return text_embeddings
             elif image_embeddings is not None:
                 return image_embeddings
-    
-    return BLIP2ModelWrapper(**kwargs)
 
+    return BLIP2ModelWrapper(**kwargs)
 
-"""
-Salesforce/blip2-opt-2.7b
-Image-to-Text • Updated Mar 22 •
-588k •
-296
-Salesforce/blip2-flan-t5-xxl
-Image-to-Text • Updated Mar 29 •
-9.23k •
-84
-Salesforce/blip2-opt-6.7b-coco
-Image-to-Text • Updated Mar 31 •
-1.51k •
-28
-Salesforce/blip2-opt-6.7b
-Image-to-Text • Updated Mar 27 •
-4.93k •
-71
-Salesforce/blip2-flan-t5-xl
-Image-to-Text • Updated Dec 13, 2023 •
-95.9k •
-56
-"""
-# in descending order of usage (downloads from huggingface)
 
 blip2_image_text_matching = ModelMeta(
     loader=partial(
@@ -220,10 +204,12 @@ def get_fused_embeddings(
 
 
 if __name__ == "__main__":
+
     import mteb
-    import PIL.Image
 
-    mdl = mteb.get_model(blip2_image_text_matching.name, blip2_image_text_matching.revision, device="cpu")
+    mdl = mteb.get_model(
+        blip2_image_text_matching.name, blip2_image_text_matching.revision, device="cpu"
+    )
     emb = mdl.get_text_embeddings(["Hello, world!"])
     emb2 = mdl.get_text_embeddings(["Hello there, world!"])
     emb3 = mdl.get_text_embeddings(["Goodbye, person!"])
@@ -247,6 +233,3 @@ def get_fused_embeddings(
     sim2 = torch.nn.functional.cosine_similarity(multi_cat_emb, text_dog_emb)
 
     print(sim1, sim2)
-
-
-

From ec47c690261169ac8d197af476eccb0ae32a187d Mon Sep 17 00:00:00 2001
From: Jamie-Stirling <stirlingj00@gmail.com>
Date: Sun, 15 Sep 2024 22:17:35 +0100
Subject: [PATCH 09/17] wip: add coco BLIP2

---
 mteb/models/blip2_models.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py
index b735b65cf3..aedb03e24f 100644
--- a/mteb/models/blip2_models.py
+++ b/mteb/models/blip2_models.py
@@ -18,7 +18,6 @@ def blip2_loader(**kwargs):
         from lavis.models import load_model_and_preprocess
         from lavis.models.blip2_models.blip2_image_text_matching import (
             Blip2ITM,
-            Blip2Qformer,
         )
     except ImportError:
         raise ImportError(
@@ -34,7 +33,8 @@ def __init__(
         ):
             self.model_name = model_name
             self.device = device
-            self.model = Blip2ITM.from_pretrained("pretrain").to(self.device).float()
+            model_type = "coco" if "coco" in model_name else "pretrain"
+            self.model = Blip2ITM.from_pretrained(model_type).to(self.device).float()
             self.processor = Blip2Processor.from_pretrained(model_name)
 
         def preprocess(
@@ -190,7 +190,7 @@ def get_fused_embeddings(
     return BLIP2ModelWrapper(**kwargs)
 
 
-blip2_image_text_matching = ModelMeta(
+blip2_opt_2_7b = ModelMeta(
     loader=partial(
         blip2_loader,
         model_name="Salesforce/blip2-opt-2.7b",
@@ -202,13 +202,25 @@ def get_fused_embeddings(
     release_date="2024-03-22",
 )
 
+blip2_opt_6_7b_coco = ModelMeta(
+    loader=partial(
+        blip2_loader,
+        model_name="Salesforce/blip2-opt-6.7b-coco",
+    ),
+    name="Salesforce/blip2-opt-6.7b-coco",
+    languages=["eng_Latn"],
+    open_source=True,
+    revision="0d580de59320a25a4d2c386387bcef310d5f286e",
+    release_date="2024-03-31",
+)
+
 
 if __name__ == "__main__":
 
     import mteb
 
     mdl = mteb.get_model(
-        blip2_image_text_matching.name, blip2_image_text_matching.revision, device="cpu"
+        blip2_opt_2_7b.name, blip2_opt_2_7b.revision, device="cpu"
     )
     emb = mdl.get_text_embeddings(["Hello, world!"])
     emb2 = mdl.get_text_embeddings(["Hello there, world!"])

From e8f4ae1b6cdc455c7ac06d69bf6433936fac1ef4 Mon Sep 17 00:00:00 2001
From: Jamie-Stirling <stirlingj00@gmail.com>
Date: Mon, 16 Sep 2024 12:06:47 +0100
Subject: [PATCH 10/17] fix: BLIP2 better zero-shot classification without
 text_proj and vision_proj

---
 mteb/models/blip2_models.py | 37 +++++++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py
index aedb03e24f..9cac90d6f0 100644
--- a/mteb/models/blip2_models.py
+++ b/mteb/models/blip2_models.py
@@ -35,6 +35,8 @@ def __init__(
             self.device = device
             model_type = "coco" if "coco" in model_name else "pretrain"
             self.model = Blip2ITM.from_pretrained(model_type).to(self.device).float()
+            # print numbr of parameters
+            print(f"Number of parameters: {sum(p.numel() for p in self.model.parameters())}")
             self.processor = Blip2Processor.from_pretrained(model_name)
 
         def preprocess(
@@ -60,7 +62,7 @@ def get_text_embeddings(self, texts: list[str], batch_size: int = 32):
                         return_tensors="pt",
                     ).to(self.device)
                     text_outputs = self.model.forward_text(text_tokens)
-                    # text_outputs = normalize(self.model.text_proj(text_outputs))
+                    #text_outputs = normalize(self.model.text_proj(text_outputs))
                     all_text_embeddings.append(text_outputs.cpu())
 
             all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
@@ -81,7 +83,7 @@ def get_image_embeddings(
                             inputs["pixel_values"].to(self.device)
                         )
                         image_outputs = image_outputs[0][:, 0, :]
-                        # image_outputs = normalize(self.model.vision_proj(image_outputs), dim=-1)
+                        #image_outputs = normalize(self.model.vision_proj(image_outputs), dim=-1)
                         all_image_embeddings.append(image_outputs.cpu())
             else:
                 with torch.no_grad():
@@ -91,10 +93,8 @@ def get_image_embeddings(
                             images=batch_images, return_tensors="pt", padding=True
                         )["pixel_values"].to(self.device)
                         image_outputs = self.model.forward_image(inputs)
-                        image_outputs = image_outputs[0]
-                        image_outputs = normalize(
-                            self.model.vision_proj(image_outputs[:, 0, :]), dim=-1
-                        )
+                        image_outputs = image_outputs[0][:, 0, :]
+                        #image_outputs = normalize(self.model.vision_proj(image_outputs), dim=-1)
                         all_image_embeddings.append(image_outputs.cpu())
 
             all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
@@ -105,6 +105,11 @@ def get_multimodal_embeddings(self, texts, images, batch_size=32):
 
             with torch.no_grad():
                 if isinstance(images, DataLoader):
+                    # check dataloader batch size is the same as batch size
+                    if images.batch_size != batch_size:
+                        raise ValueError(
+                            "Image DataLoader batch size must be the same as the given batch size: " + str(batch_size)
+                        )
                     for batch_images, i in tqdm(
                         zip(images, range(0, len(texts), batch_size))
                     ):
@@ -117,6 +122,8 @@ def get_multimodal_embeddings(self, texts, images, batch_size=32):
                             {"text_input": batch_texts, "image": image_inputs}
                         ).multimodal_embeds[:, 0, :]
 
+                        #multimodal_outputs = normalize(self.model.text_proj(multimodal_outputs), dim=-1)
+
                         all_multimodal_embeddings.append(multimodal_outputs.cpu())
                 else:
                     for i in tqdm(range(0, len(texts), batch_size)):
@@ -130,6 +137,8 @@ def get_multimodal_embeddings(self, texts, images, batch_size=32):
                             {"text_input": batch_texts, "image": image_inputs}
                         ).multimodal_embeds[:, 0, :]
 
+                        #multimodal_outputs = normalize(self.model.text_proj(multimodal_outputs), dim=-1)
+
                         all_multimodal_embeddings.append(multimodal_outputs.cpu())
 
             return torch.cat(all_multimodal_embeddings, dim=0)
@@ -172,7 +181,7 @@ def get_fused_embeddings(
                     )
                 if fusion_mode == "sum":
                     fused_embeddings = text_embeddings + image_embeddings
-                if fusion_mode == "multimodal":
+                elif fusion_mode == "multimodal":
                     fused_embeddings = self.get_multimodal_embeddings(
                         texts, images, batch_size
                     )
@@ -235,13 +244,21 @@ def get_fused_embeddings(
     cat_img = Image.open("cat.jpg")
     cat_text = "An image of a cat"
 
-    multi_cat_emb = mdl.get_multimodal_embeddings([cat_text], [cat_img])
+    multi_cat_emb = mdl.get_fused_embeddings(["A photo of an animal"], [cat_img], fusion_mode="multimodal")
+    multi_conflicting_emb = mdl.get_fused_embeddings(["A photo of a dog"], [cat_img], fusion_mode="multimodal")
+    image_cat_emb = mdl.get_image_embeddings([cat_img])
     text_cat_emb = mdl.get_text_embeddings(["An photo of a cat"])
     text_dog_emb = mdl.get_text_embeddings(["An image of a dog"])
 
     print(multi_cat_emb.shape)
 
-    sim1 = torch.nn.functional.cosine_similarity(multi_cat_emb, text_cat_emb)
-    sim2 = torch.nn.functional.cosine_similarity(multi_cat_emb, text_dog_emb)
+    sim1 = torch.nn.functional.cosine_similarity(image_cat_emb, text_cat_emb)
+    sim2 = torch.nn.functional.cosine_similarity(image_cat_emb, text_dog_emb)
+    sim3 = torch.nn.functional.cosine_similarity(multi_cat_emb, text_cat_emb)
+    sim4 = torch.nn.functional.cosine_similarity(multi_cat_emb, text_dog_emb)
+    sim5 = torch.nn.functional.cosine_similarity(multi_conflicting_emb, text_cat_emb)
+
 
     print(sim1, sim2)
+
+    print(sim3, sim4, sim5)

From 57bc3b8d4e98e4d29c2e44fb12e1bdce1c263cc1 Mon Sep 17 00:00:00 2001
From: Jamie-Stirling <stirlingj00@gmail.com>
Date: Thu, 19 Sep 2024 16:06:02 +0100
Subject: [PATCH 11/17] tidy blip2

---
 mteb/models/blip2_models.py             | 2 +-
 mteb/tasks/Image/Clustering/__init__.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py
index 9cac90d6f0..2195f42c12 100644
--- a/mteb/models/blip2_models.py
+++ b/mteb/models/blip2_models.py
@@ -158,7 +158,7 @@ def get_fused_embeddings(
             self,
             texts: list[str] = None,
             images: list[Image.Image] | DataLoader = None,
-            fusion_mode="multimodal",
+            fusion_mode="sum",
             batch_size: int = 32,
         ):
             # TODO: find out if BLIP has a prescribed way of fusing text and image embeddings
diff --git a/mteb/tasks/Image/Clustering/__init__.py b/mteb/tasks/Image/Clustering/__init__.py
index fd9a71ec19..9ce1b567e6 100644
--- a/mteb/tasks/Image/Clustering/__init__.py
+++ b/mteb/tasks/Image/Clustering/__init__.py
@@ -2,3 +2,4 @@
 
 from .eng.CIFAR import *
 from .eng.TinyImageNet import *
+from .eng.ImageNet import *

From 4cbec1bc6957fcc74f35888191ee2cf620bdaa4c Mon Sep 17 00:00:00 2001
From: Jamie-Stirling <stirlingj00@gmail.com>
Date: Thu, 19 Sep 2024 16:09:32 +0100
Subject: [PATCH 12/17] add imagenet-dog-15 dataset

---
 mteb/tasks/Image/Clustering/eng/ImageNet.py | 71 +++++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 mteb/tasks/Image/Clustering/eng/ImageNet.py

diff --git a/mteb/tasks/Image/Clustering/eng/ImageNet.py b/mteb/tasks/Image/Clustering/eng/ImageNet.py
new file mode 100644
index 0000000000..0efe69f844
--- /dev/null
+++ b/mteb/tasks/Image/Clustering/eng/ImageNet.py
@@ -0,0 +1,71 @@
+from __future__ import annotations
+
+import io
+import PIL.Image as Image
+from mteb.abstasks.Image.AbsTaskImageClustering import AbsTaskImageClustering
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+"""
+Classes:
+1.MALTESE DOG
+2.BLENHEIM SPANIEL
+3.BASSET
+4.NORWEGIAN ELKHOUND
+5.GIANT SCHNAUZER
+6.GOLDEN RETRIEVER
+7.BRITTANY SPANIEL
+8.CLUMBER
+9.WELSH SPRINGER SPANIEL
+10.GROENENDAEL
+11.KELPIE
+12.SHETLAND SHEEPDOG
+13.DOBERMAN
+14.PUG
+15.CHOW
+"""
+
+class ImageNetDog15Clustering(AbsTaskImageClustering):
+    metadata = TaskMetadata(
+        name="ImageNetDog15Clustering",
+        description="Clustering images from a 15-class dogs-only subset of the dog classes in ImageNet.",
+        reference="http://vision.stanford.edu/aditya86/ImageNetDogs/main.html",
+        dataset={
+            "path": "JamieSJS/imagenet-dog-15",
+            "revision": "bfb6ad3b2109d26c9daddf14f98d315daa35ee72",
+        },
+        type="Clustering",
+        category="i2t",
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=(
+            "2009-06-20",
+            "2009-06-20"
+        ),  # Conference date
+        domains=["Web"],
+        task_subtypes=["Object recognition"],
+        license="Not specified",
+        socioeconomic_status="mixed",
+        annotations_creators="derived",
+        dialect=[],
+        modalities=["image"],
+        sample_creation="created",
+        bibtex_citation=""" @INPROCEEDINGS{5206848,
+  author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Kai Li and Li Fei-Fei},
+  booktitle={2009 IEEE Conference on Computer Vision and Pattern Recognition}, 
+  title={ImageNet: A large-scale hierarchical image database}, 
+  year={2009},
+  volume={},
+  number={},
+  pages={248-255},
+  keywords={Large-scale systems;Image databases;Explosions;Internet;Robustness;Information retrieval;Image retrieval;Multimedia databases;Ontologies;Spine},
+  doi={10.1109/CVPR.2009.5206848}}
+        """,
+        descriptive_stats={
+            "n_samples": {"test": 1076, "train":1500},
+            #"avg_character_length": {"test": 431.4},
+        },
+    )
+
+    
+

From 35be38d82a6b2f43245e7094d0b62b037ceb13e6 Mon Sep 17 00:00:00 2001
From: Jamie-Stirling <stirlingj00@gmail.com>
Date: Thu, 19 Sep 2024 16:26:57 +0100
Subject: [PATCH 13/17] tidy and lint

---
 .../evaluators/Image/VisualSTSEvaluator.py    |  6 ++--
 mteb/models/blip2_models.py                   | 32 +++++++++---------
 mteb/tasks/Image/Clustering/__init__.py       |  2 +-
 mteb/tasks/Image/Clustering/eng/ImageNet.py   | 33 ++-----------------
 mteb/tasks/Image/VisualSTS/__init__.py        |  2 ++
 .../Image/VisualSTS/en/STS12VisualSTS.py      |  2 +-
 .../Image/VisualSTS/en/STS13VisualSTS.py      |  2 +-
 .../Image/VisualSTS/en/STS14VisualSTS.py      |  2 +-
 .../Image/VisualSTS/en/STS15VisualSTS.py      |  2 +-
 .../Image/VisualSTS/en/STS16VisualSTS.py      |  2 +-
 10 files changed, 31 insertions(+), 54 deletions(-)

diff --git a/mteb/evaluation/evaluators/Image/VisualSTSEvaluator.py b/mteb/evaluation/evaluators/Image/VisualSTSEvaluator.py
index d47e060e75..a442eb6a9a 100644
--- a/mteb/evaluation/evaluators/Image/VisualSTSEvaluator.py
+++ b/mteb/evaluation/evaluators/Image/VisualSTSEvaluator.py
@@ -1,18 +1,18 @@
 from __future__ import annotations
 
 import logging
-from typing import Any
+import math
 import os
+from typing import Any
 
 import numpy as np
+import torch
 from scipy.stats import pearsonr, spearmanr
 from sklearn.metrics.pairwise import (
     paired_cosine_distances,
     paired_euclidean_distances,
     paired_manhattan_distances,
 )
-import math
-import torch
 from torch.utils.data import DataLoader
 from torchvision import transforms
 
diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py
index 2195f42c12..aa92452ba9 100644
--- a/mteb/models/blip2_models.py
+++ b/mteb/models/blip2_models.py
@@ -5,7 +5,6 @@
 
 import torch
 from PIL import Image
-from torch.nn.functional import normalize
 from torch.utils.data import DataLoader
 from tqdm import tqdm
 from transformers import Blip2Processor
@@ -36,7 +35,9 @@ def __init__(
             model_type = "coco" if "coco" in model_name else "pretrain"
             self.model = Blip2ITM.from_pretrained(model_type).to(self.device).float()
             # print numbr of parameters
-            print(f"Number of parameters: {sum(p.numel() for p in self.model.parameters())}")
+            print(
+                f"Number of parameters: {sum(p.numel() for p in self.model.parameters())}"
+            )
             self.processor = Blip2Processor.from_pretrained(model_name)
 
         def preprocess(
@@ -62,7 +63,7 @@ def get_text_embeddings(self, texts: list[str], batch_size: int = 32):
                         return_tensors="pt",
                     ).to(self.device)
                     text_outputs = self.model.forward_text(text_tokens)
-                    #text_outputs = normalize(self.model.text_proj(text_outputs))
+                    # text_outputs = normalize(self.model.text_proj(text_outputs))
                     all_text_embeddings.append(text_outputs.cpu())
 
             all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
@@ -83,7 +84,7 @@ def get_image_embeddings(
                             inputs["pixel_values"].to(self.device)
                         )
                         image_outputs = image_outputs[0][:, 0, :]
-                        #image_outputs = normalize(self.model.vision_proj(image_outputs), dim=-1)
+                        # image_outputs = normalize(self.model.vision_proj(image_outputs), dim=-1)
                         all_image_embeddings.append(image_outputs.cpu())
             else:
                 with torch.no_grad():
@@ -94,7 +95,7 @@ def get_image_embeddings(
                         )["pixel_values"].to(self.device)
                         image_outputs = self.model.forward_image(inputs)
                         image_outputs = image_outputs[0][:, 0, :]
-                        #image_outputs = normalize(self.model.vision_proj(image_outputs), dim=-1)
+                        # image_outputs = normalize(self.model.vision_proj(image_outputs), dim=-1)
                         all_image_embeddings.append(image_outputs.cpu())
 
             all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
@@ -108,7 +109,8 @@ def get_multimodal_embeddings(self, texts, images, batch_size=32):
                     # check dataloader batch size is the same as batch size
                     if images.batch_size != batch_size:
                         raise ValueError(
-                            "Image DataLoader batch size must be the same as the given batch size: " + str(batch_size)
+                            "Image DataLoader batch size must be the same as the given batch size: "
+                            + str(batch_size)
                         )
                     for batch_images, i in tqdm(
                         zip(images, range(0, len(texts), batch_size))
@@ -122,7 +124,7 @@ def get_multimodal_embeddings(self, texts, images, batch_size=32):
                             {"text_input": batch_texts, "image": image_inputs}
                         ).multimodal_embeds[:, 0, :]
 
-                        #multimodal_outputs = normalize(self.model.text_proj(multimodal_outputs), dim=-1)
+                        # multimodal_outputs = normalize(self.model.text_proj(multimodal_outputs), dim=-1)
 
                         all_multimodal_embeddings.append(multimodal_outputs.cpu())
                 else:
@@ -137,7 +139,7 @@ def get_multimodal_embeddings(self, texts, images, batch_size=32):
                             {"text_input": batch_texts, "image": image_inputs}
                         ).multimodal_embeds[:, 0, :]
 
-                        #multimodal_outputs = normalize(self.model.text_proj(multimodal_outputs), dim=-1)
+                        # multimodal_outputs = normalize(self.model.text_proj(multimodal_outputs), dim=-1)
 
                         all_multimodal_embeddings.append(multimodal_outputs.cpu())
 
@@ -225,12 +227,9 @@ def get_fused_embeddings(
 
 
 if __name__ == "__main__":
-
     import mteb
 
-    mdl = mteb.get_model(
-        blip2_opt_2_7b.name, blip2_opt_2_7b.revision, device="cpu"
-    )
+    mdl = mteb.get_model(blip2_opt_2_7b.name, blip2_opt_2_7b.revision, device="cpu")
     emb = mdl.get_text_embeddings(["Hello, world!"])
     emb2 = mdl.get_text_embeddings(["Hello there, world!"])
     emb3 = mdl.get_text_embeddings(["Goodbye, person!"])
@@ -244,8 +243,12 @@ def get_fused_embeddings(
     cat_img = Image.open("cat.jpg")
     cat_text = "An image of a cat"
 
-    multi_cat_emb = mdl.get_fused_embeddings(["A photo of an animal"], [cat_img], fusion_mode="multimodal")
-    multi_conflicting_emb = mdl.get_fused_embeddings(["A photo of a dog"], [cat_img], fusion_mode="multimodal")
+    multi_cat_emb = mdl.get_fused_embeddings(
+        ["A photo of an animal"], [cat_img], fusion_mode="multimodal"
+    )
+    multi_conflicting_emb = mdl.get_fused_embeddings(
+        ["A photo of a dog"], [cat_img], fusion_mode="multimodal"
+    )
     image_cat_emb = mdl.get_image_embeddings([cat_img])
     text_cat_emb = mdl.get_text_embeddings(["An photo of a cat"])
     text_dog_emb = mdl.get_text_embeddings(["An image of a dog"])
@@ -258,7 +261,6 @@ def get_fused_embeddings(
     sim4 = torch.nn.functional.cosine_similarity(multi_cat_emb, text_dog_emb)
     sim5 = torch.nn.functional.cosine_similarity(multi_conflicting_emb, text_cat_emb)
 
-
     print(sim1, sim2)
 
     print(sim3, sim4, sim5)
diff --git a/mteb/tasks/Image/Clustering/__init__.py b/mteb/tasks/Image/Clustering/__init__.py
index 9ce1b567e6..804870ebeb 100644
--- a/mteb/tasks/Image/Clustering/__init__.py
+++ b/mteb/tasks/Image/Clustering/__init__.py
@@ -1,5 +1,5 @@
 from __future__ import annotations
 
 from .eng.CIFAR import *
-from .eng.TinyImageNet import *
 from .eng.ImageNet import *
+from .eng.TinyImageNet import *
diff --git a/mteb/tasks/Image/Clustering/eng/ImageNet.py b/mteb/tasks/Image/Clustering/eng/ImageNet.py
index 0efe69f844..b45956cfe7 100644
--- a/mteb/tasks/Image/Clustering/eng/ImageNet.py
+++ b/mteb/tasks/Image/Clustering/eng/ImageNet.py
@@ -1,29 +1,8 @@
 from __future__ import annotations
 
-import io
-import PIL.Image as Image
 from mteb.abstasks.Image.AbsTaskImageClustering import AbsTaskImageClustering
 from mteb.abstasks.TaskMetadata import TaskMetadata
 
-"""
-Classes:
-1.MALTESE DOG
-2.BLENHEIM SPANIEL
-3.BASSET
-4.NORWEGIAN ELKHOUND
-5.GIANT SCHNAUZER
-6.GOLDEN RETRIEVER
-7.BRITTANY SPANIEL
-8.CLUMBER
-9.WELSH SPRINGER SPANIEL
-10.GROENENDAEL
-11.KELPIE
-12.SHETLAND SHEEPDOG
-13.DOBERMAN
-14.PUG
-15.CHOW
-"""
-
 class ImageNetDog15Clustering(AbsTaskImageClustering):
     metadata = TaskMetadata(
         name="ImageNetDog15Clustering",
@@ -38,10 +17,7 @@ class ImageNetDog15Clustering(AbsTaskImageClustering):
         eval_splits=["test"],
         eval_langs=["eng-Latn"],
         main_score="accuracy",
-        date=(
-            "2009-06-20",
-            "2009-06-20"
-        ),  # Conference date
+        date=("2009-06-20", "2009-06-20"),  # Conference date
         domains=["Web"],
         task_subtypes=["Object recognition"],
         license="Not specified",
@@ -62,10 +38,7 @@ class ImageNetDog15Clustering(AbsTaskImageClustering):
   doi={10.1109/CVPR.2009.5206848}}
         """,
         descriptive_stats={
-            "n_samples": {"test": 1076, "train":1500},
-            #"avg_character_length": {"test": 431.4},
+            "n_samples": {"test": 1076, "train": 1500},
+            # "avg_character_length": {"test": 431.4},
         },
     )
-
-    
-
diff --git a/mteb/tasks/Image/VisualSTS/__init__.py b/mteb/tasks/Image/VisualSTS/__init__.py
index cc7823118b..eb785d5d85 100644
--- a/mteb/tasks/Image/VisualSTS/__init__.py
+++ b/mteb/tasks/Image/VisualSTS/__init__.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from .en.STS12VisualSTS import *
 from .en.STS13VisualSTS import *
 from .en.STS14VisualSTS import *
diff --git a/mteb/tasks/Image/VisualSTS/en/STS12VisualSTS.py b/mteb/tasks/Image/VisualSTS/en/STS12VisualSTS.py
index 1f88b8045a..8d78bb7238 100644
--- a/mteb/tasks/Image/VisualSTS/en/STS12VisualSTS.py
+++ b/mteb/tasks/Image/VisualSTS/en/STS12VisualSTS.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
 from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class STS12VisualSTS(AbsTaskVisualSTS):
diff --git a/mteb/tasks/Image/VisualSTS/en/STS13VisualSTS.py b/mteb/tasks/Image/VisualSTS/en/STS13VisualSTS.py
index 122a5d6d30..1b02248d35 100644
--- a/mteb/tasks/Image/VisualSTS/en/STS13VisualSTS.py
+++ b/mteb/tasks/Image/VisualSTS/en/STS13VisualSTS.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
 from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class STS13VisualSTS(AbsTaskVisualSTS):
diff --git a/mteb/tasks/Image/VisualSTS/en/STS14VisualSTS.py b/mteb/tasks/Image/VisualSTS/en/STS14VisualSTS.py
index cbbcc94445..a427fdae0b 100644
--- a/mteb/tasks/Image/VisualSTS/en/STS14VisualSTS.py
+++ b/mteb/tasks/Image/VisualSTS/en/STS14VisualSTS.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
 from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class STS14VisualSTS(AbsTaskVisualSTS):
diff --git a/mteb/tasks/Image/VisualSTS/en/STS15VisualSTS.py b/mteb/tasks/Image/VisualSTS/en/STS15VisualSTS.py
index 9eb99af506..12c9a74c81 100644
--- a/mteb/tasks/Image/VisualSTS/en/STS15VisualSTS.py
+++ b/mteb/tasks/Image/VisualSTS/en/STS15VisualSTS.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
 from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class STS15VisualSTS(AbsTaskVisualSTS):
diff --git a/mteb/tasks/Image/VisualSTS/en/STS16VisualSTS.py b/mteb/tasks/Image/VisualSTS/en/STS16VisualSTS.py
index 7db7b4f906..ae1e2900dd 100644
--- a/mteb/tasks/Image/VisualSTS/en/STS16VisualSTS.py
+++ b/mteb/tasks/Image/VisualSTS/en/STS16VisualSTS.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
-from mteb.abstasks.TaskMetadata import TaskMetadata
 from mteb.abstasks.Image.AbsTaskVisualSTS import AbsTaskVisualSTS
+from mteb.abstasks.TaskMetadata import TaskMetadata
 
 
 class STS16VisualSTS(AbsTaskVisualSTS):

From 83d0f455d75dc904db7c83740f09277e62ad28e3 Mon Sep 17 00:00:00 2001
From: Jamie-Stirling <stirlingj00@gmail.com>
Date: Thu, 19 Sep 2024 16:37:43 +0100
Subject: [PATCH 14/17] remove unused import

---
 mteb/models/blip2_models.py                 | 1 -
 mteb/tasks/Image/Clustering/eng/ImageNet.py | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py
index aa92452ba9..cb289b3f96 100644
--- a/mteb/models/blip2_models.py
+++ b/mteb/models/blip2_models.py
@@ -14,7 +14,6 @@
 
 def blip2_loader(**kwargs):
     try:  # a temporal fix for the dependency issues of vista models.
-        from lavis.models import load_model_and_preprocess
         from lavis.models.blip2_models.blip2_image_text_matching import (
             Blip2ITM,
         )
diff --git a/mteb/tasks/Image/Clustering/eng/ImageNet.py b/mteb/tasks/Image/Clustering/eng/ImageNet.py
index b45956cfe7..1259808450 100644
--- a/mteb/tasks/Image/Clustering/eng/ImageNet.py
+++ b/mteb/tasks/Image/Clustering/eng/ImageNet.py
@@ -3,6 +3,7 @@
 from mteb.abstasks.Image.AbsTaskImageClustering import AbsTaskImageClustering
 from mteb.abstasks.TaskMetadata import TaskMetadata
 
+
 class ImageNetDog15Clustering(AbsTaskImageClustering):
     metadata = TaskMetadata(
         name="ImageNetDog15Clustering",

From a309de512954b958ac005869c33534ec39985497 Mon Sep 17 00:00:00 2001
From: Jamie-Stirling <stirlingj00@gmail.com>
Date: Thu, 19 Sep 2024 21:17:45 +0100
Subject: [PATCH 15/17] add cluster_accuracy, ari and nmi to
 Image.ClusteringEvaluator

---
 .../evaluators/Image/ClusteringEvaluator.py        | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/mteb/evaluation/evaluators/Image/ClusteringEvaluator.py b/mteb/evaluation/evaluators/Image/ClusteringEvaluator.py
index b006470416..31b5c26f1a 100644
--- a/mteb/evaluation/evaluators/Image/ClusteringEvaluator.py
+++ b/mteb/evaluation/evaluators/Image/ClusteringEvaluator.py
@@ -5,8 +5,10 @@
 
 import sklearn
 import sklearn.cluster
+import numpy as np
 from PIL import Image
 from sklearn import metrics
+from scipy.optimize import linear_sum_assignment
 
 from mteb.encoder_interface import Encoder
 from mteb.evaluation.evaluators.Evaluator import Evaluator
@@ -53,6 +55,16 @@ def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] = {}):
 
         logger.info("Evaluating...")
         v_measure = metrics.cluster.v_measure_score(self.labels, cluster_assignment)
+        nmi = metrics.cluster.normalized_mutual_info_score(self.labels, cluster_assignment)
+        ari = metrics.cluster.adjusted_rand_score(self.labels, cluster_assignment)
+
         accuracy = metrics.accuracy_score(self.labels, cluster_assignment)
+        
+        matrix = metrics.confusion_matrix(self.labels, cluster_assignment)
+        
+        # get linear sum assignment
+        row_ind, col_ind = linear_sum_assignment(matrix, maximize=True)
+        total_correct = matrix[row_ind, col_ind].sum()
+        clustering_accuracy = total_correct / len(self.labels)
 
-        return {"v_measure": v_measure, "accuracy": accuracy}
+        return {"v_measure": v_measure, "accuracy": accuracy, "nmi": nmi, "ari": ari, "cluster_accuracy": clustering_accuracy}

From 02c1f81b85e1dda106dace662af3905096ca1c31 Mon Sep 17 00:00:00 2001
From: Jamie-Stirling <stirlingj00@gmail.com>
Date: Fri, 20 Sep 2024 12:33:26 +0100
Subject: [PATCH 16/17] add imagenet-10 clustering task

---
 mteb/tasks/Image/Clustering/eng/ImageNet.py | 40 +++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/mteb/tasks/Image/Clustering/eng/ImageNet.py b/mteb/tasks/Image/Clustering/eng/ImageNet.py
index 1259808450..dd02d8e830 100644
--- a/mteb/tasks/Image/Clustering/eng/ImageNet.py
+++ b/mteb/tasks/Image/Clustering/eng/ImageNet.py
@@ -43,3 +43,43 @@ class ImageNetDog15Clustering(AbsTaskImageClustering):
             # "avg_character_length": {"test": 431.4},
         },
     )
+
+class ImageNet10Clustering(AbsTaskImageClustering):
+    metadata = TaskMetadata(
+        name="ImageNet10Clustering",
+        description="Clustering images from a 10-class subset of ImageNet which are generally easy to distinguish.",
+        reference="https://www.kaggle.com/datasets/liusha249/imagenet10",
+        dataset={
+            "path": "JamieSJS/imagenet-10",
+            "revision": "88f8a6d47c257895094c5ad81e67ba751771fc99",
+        },
+        type="Clustering",
+        category="i2t",
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",  # NOTE(review): evaluator's "accuracy" compares labels to unaligned cluster ids; "nmi" or "cluster_accuracy" may be intended - confirm
+        date=("2009-06-20", "2009-06-20"),  # Conference date
+        domains=["Web"],
+        task_subtypes=["Object recognition"],
+        license="Not specified",
+        socioeconomic_status="mixed",
+        annotations_creators="derived",
+        dialect=[],
+        modalities=["image"],
+        sample_creation="created",
+        bibtex_citation=""" @INPROCEEDINGS{5206848,
+  author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Kai Li and Li Fei-Fei},
+  booktitle={2009 IEEE Conference on Computer Vision and Pattern Recognition}, 
+  title={ImageNet: A large-scale hierarchical image database}, 
+  year={2009},
+  volume={},
+  number={},
+  pages={248-255},
+  keywords={Large-scale systems;Image databases;Explosions;Internet;Robustness;Information retrieval;Image retrieval;Multimedia databases;Ontologies;Spine},
+  doi={10.1109/CVPR.2009.5206848}}
+        """,
+        descriptive_stats={
+            "n_samples": {"test": 13000},
+            # "avg_character_length": {"test": 431.4},
+        },
+    )

From d226748b698d8ab7513fabf917caa8ade0e3f752 Mon Sep 17 00:00:00 2001
From: Jamie-Stirling <stirlingj00@gmail.com>
Date: Fri, 20 Sep 2024 14:05:01 +0100
Subject: [PATCH 17/17] add results for clip on ImageNet10Clustering and
 ImageNetDog15Clustering

---
 .../evaluators/Image/ClusteringEvaluator.py   | 19 ++++++++++-----
 mteb/tasks/Image/Clustering/eng/ImageNet.py   |  1 +
 .../ImageNet10Clustering.json                 | 23 +++++++++++++++++++
 .../ImageNetDog15Clustering.json              | 23 +++++++++++++++++++
 4 files changed, 60 insertions(+), 6 deletions(-)
 create mode 100644 results-mieb/openai__clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/ImageNet10Clustering.json
 create mode 100644 results-mieb/openai__clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/ImageNetDog15Clustering.json

diff --git a/mteb/evaluation/evaluators/Image/ClusteringEvaluator.py b/mteb/evaluation/evaluators/Image/ClusteringEvaluator.py
index 31b5c26f1a..f53befe8ef 100644
--- a/mteb/evaluation/evaluators/Image/ClusteringEvaluator.py
+++ b/mteb/evaluation/evaluators/Image/ClusteringEvaluator.py
@@ -5,10 +5,9 @@
 
 import sklearn
 import sklearn.cluster
-import numpy as np
 from PIL import Image
-from sklearn import metrics
 from scipy.optimize import linear_sum_assignment
+from sklearn import metrics
 
 from mteb.encoder_interface import Encoder
 from mteb.evaluation.evaluators.Evaluator import Evaluator
@@ -55,16 +54,24 @@ def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] = {}):
 
         logger.info("Evaluating...")
         v_measure = metrics.cluster.v_measure_score(self.labels, cluster_assignment)
-        nmi = metrics.cluster.normalized_mutual_info_score(self.labels, cluster_assignment)
+        nmi = metrics.cluster.normalized_mutual_info_score(
+            self.labels, cluster_assignment
+        )
         ari = metrics.cluster.adjusted_rand_score(self.labels, cluster_assignment)
 
         accuracy = metrics.accuracy_score(self.labels, cluster_assignment)
-        
+
         matrix = metrics.confusion_matrix(self.labels, cluster_assignment)
-        
+
         # get linear sum assignment
         row_ind, col_ind = linear_sum_assignment(matrix, maximize=True)
         total_correct = matrix[row_ind, col_ind].sum()
         clustering_accuracy = total_correct / len(self.labels)
 
-        return {"v_measure": v_measure, "accuracy": accuracy, "nmi": nmi, "ari": ari, "cluster_accuracy": clustering_accuracy}
+        return {
+            "v_measure": v_measure,
+            "accuracy": accuracy,
+            "nmi": nmi,
+            "ari": ari,
+            "cluster_accuracy": clustering_accuracy,
+        }
diff --git a/mteb/tasks/Image/Clustering/eng/ImageNet.py b/mteb/tasks/Image/Clustering/eng/ImageNet.py
index dd02d8e830..dcf8587322 100644
--- a/mteb/tasks/Image/Clustering/eng/ImageNet.py
+++ b/mteb/tasks/Image/Clustering/eng/ImageNet.py
@@ -44,6 +44,7 @@ class ImageNetDog15Clustering(AbsTaskImageClustering):
         },
     )
 
+
 class ImageNet10Clustering(AbsTaskImageClustering):
     metadata = TaskMetadata(
         name="ImageNet10Clustering",
diff --git a/results-mieb/openai__clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/ImageNet10Clustering.json b/results-mieb/openai__clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/ImageNet10Clustering.json
new file mode 100644
index 0000000000..d502635992
--- /dev/null
+++ b/results-mieb/openai__clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/ImageNet10Clustering.json
@@ -0,0 +1,23 @@
+{
+  "dataset_revision": "88f8a6d47c257895094c5ad81e67ba751771fc99",
+  "evaluation_time": 33.32936453819275,
+  "kg_co2_emissions": null,
+  "mteb_version": "1.12.90",
+  "scores": {
+    "test": [
+      {
+        "accuracy": 0.1993076923076923,
+        "ari": 0.9672782515730578,
+        "cluster_accuracy": 0.985,
+        "hf_subset": "default",
+        "languages": [
+          "eng-Latn"
+        ],
+        "main_score": 0.1993076923076923,
+        "nmi": 0.9644473066207006,
+        "v_measure": 0.9644473066207006
+      }
+    ]
+  },
+  "task_name": "ImageNet10Clustering"
+}
\ No newline at end of file
diff --git a/results-mieb/openai__clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/ImageNetDog15Clustering.json b/results-mieb/openai__clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/ImageNetDog15Clustering.json
new file mode 100644
index 0000000000..fe53c8ed7e
--- /dev/null
+++ b/results-mieb/openai__clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/ImageNetDog15Clustering.json
@@ -0,0 +1,23 @@
+{
+  "dataset_revision": "bfb6ad3b2109d26c9daddf14f98d315daa35ee72",
+  "evaluation_time": 4.18316650390625,
+  "kg_co2_emissions": null,
+  "mteb_version": "1.12.90",
+  "scores": {
+    "test": [
+      {
+        "accuracy": 0.026022304832713755,
+        "ari": 0.36465670607270784,
+        "cluster_accuracy": 0.4656133828996282,
+        "hf_subset": "default",
+        "languages": [
+          "eng-Latn"
+        ],
+        "main_score": 0.026022304832713755,
+        "nmi": 0.5160500208664386,
+        "v_measure": 0.5160500208664386
+      }
+    ]
+  },
+  "task_name": "ImageNetDog15Clustering"
+}
\ No newline at end of file