diff --git a/docs/mieb-docs/run_vista.md b/docs/mieb-docs/run_vista.md
new file mode 100644
index 0000000000..038b44bb92
--- /dev/null
+++ b/docs/mieb-docs/run_vista.md
@@ -0,0 +1,48 @@
+## Set up VISTA
+
+The latest FlagEmbedding repo no longer supports VISTA, so we pin an older revision:
+```
+git clone --no-checkout https://github.com/FlagOpen/FlagEmbedding.git
+cd FlagEmbedding
+git checkout 5c9260277977f8f8e256e56a8e12387552693af9
+pip install -e .
+pip install torchvision timm einops ftfy
+```
+Download the vision tower for bge-base:
+```
+wget https://huggingface.co/BAAI/bge-visualized/resolve/main/Visualized_base_en_v1.5.pth?download=true
+```
+Rename it to `visualized_base_en_V1.5.pth`:
+```
+mv Visualized_base_en_v1.5.pth?download=true visualized_base_en_V1.5.pth
+```
+Download the vision tower for bge-m3:
+```
+wget https://huggingface.co/BAAI/bge-visualized/resolve/main/Visualized_m3.pth?download=true
+```
+Rename it to `visualized_m3.pth`:
+```
+mv Visualized_m3.pth?download=true visualized_m3.pth
+```
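+
+To verify the setup, here is a minimal sketch based on the BAAI/bge-visualized
+model card; the `Visualized_BGE` import path and constructor arguments are
+assumptions to double-check against the pinned revision.
+```
+import torch
+from FlagEmbedding.visual.modeling import Visualized_BGE
+
+# Pair the bge-base text backbone with the downloaded vision tower.
+# For bge-m3, swap in model_name_bge="BAAI/bge-m3" and visualized_m3.pth.
+model = Visualized_BGE(
+    model_name_bge="BAAI/bge-base-en-v1.5",
+    model_weight="visualized_base_en_V1.5.pth",
+)
+model.eval()
+
+with torch.no_grad():
+    # Compose an image+text query (any local image path) vs. a text-only candidate.
+    query_emb = model.encode(image="example.jpg", text="a photo of a cat")
+    cand_emb = model.encode(text="a cat sitting on a sofa")
+    print(query_emb @ cand_emb.T)
+```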
diff --git a/mteb/evaluation/evaluators/Image/Any2AnyMultiChoiceEvaluator.py b/mteb/evaluation/evaluators/Image/Any2AnyMultiChoiceEvaluator.py
index a391af472b..5fdbb112f3 100644
--- a/mteb/evaluation/evaluators/Image/Any2AnyMultiChoiceEvaluator.py
+++ b/mteb/evaluation/evaluators/Image/Any2AnyMultiChoiceEvaluator.py
@@ -4,6 +4,7 @@
 import io
 import json
 import logging
+import math
 import os
 from collections import defaultdict
 from typing import Any
@@ -132,7 +133,7 @@ def search(
             batch_size=self.encode_kwargs["batch_size"],
             shuffle=False,
             collate_fn=custom_collate_fn,
-            num_workers=max(1, os.cpu_count() // 2),
+            num_workers=min(math.floor(os.cpu_count() / 2), 16),
         )
         if q_modality == "image":
             query_embeddings = self.model.get_image_embeddings(
@@ -182,7 +183,7 @@ def search(
             batch_size=self.encode_kwargs["batch_size"],
             shuffle=False,
             collate_fn=custom_collate_fn,
-            num_workers=max(1, os.cpu_count() // 2),
+            num_workers=min(math.floor(os.cpu_count() / 2), 16),
         )
         if corpus_modality == "image":
             sub_corpus_embeddings = self.model.get_image_embeddings(
diff --git a/mteb/evaluation/evaluators/Image/Any2AnyRetrievalEvaluator.py b/mteb/evaluation/evaluators/Image/Any2AnyRetrievalEvaluator.py
index cb0119ea6e..a321979d26 100644
--- a/mteb/evaluation/evaluators/Image/Any2AnyRetrievalEvaluator.py
+++ b/mteb/evaluation/evaluators/Image/Any2AnyRetrievalEvaluator.py
@@ -4,6 +4,7 @@
 import io
 import json
 import logging
+import math
 import os
 from collections import defaultdict
 from typing import Any
@@ -131,7 +132,7 @@ def search(
             batch_size=self.encode_kwargs["batch_size"],
             shuffle=False,
             collate_fn=custom_collate_fn,
-            num_workers=max(1, os.cpu_count() // 2),
+            num_workers=min(math.floor(os.cpu_count() / 2), 16),
         )
         if q_modality == "image":
             query_embeddings = self.model.get_image_embeddings(
@@ -181,7 +182,7 @@ def search(
             batch_size=self.encode_kwargs["batch_size"],
             shuffle=False,
             collate_fn=custom_collate_fn,
-            num_workers=max(1, os.cpu_count() // 2),
+            num_workers=min(math.floor(os.cpu_count() / 2), 16),
         )
         if corpus_modality == "image":
             sub_corpus_embeddings = self.model.get_image_embeddings(
diff --git a/mteb/evaluation/evaluators/Image/ImageTextPairClassificationEvaluator.py b/mteb/evaluation/evaluators/Image/ImageTextPairClassificationEvaluator.py
index b548da365e..7e3d84bb87 100644
--- a/mteb/evaluation/evaluators/Image/ImageTextPairClassificationEvaluator.py
+++ b/mteb/evaluation/evaluators/Image/ImageTextPairClassificationEvaluator.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 import logging
+import math
+import os
 from typing import Any
 
 import torch
@@ -106,7 +108,7 @@ def __call__(
             shuffle=False,
             # collate_fn=lambda x: x,  # Identity collate function
             collate_fn=custom_collate_fn,
-            num_workers=4,
+            num_workers=min(math.floor(os.cpu_count() / 2), 16),
         )
 
         num_images_per_sample = (
diff --git a/mteb/evaluation/evaluators/Image/VisualSTSEvaluator.py b/mteb/evaluation/evaluators/Image/VisualSTSEvaluator.py
index a442eb6a9a..a042d22f5a 100644
--- a/mteb/evaluation/evaluators/Image/VisualSTSEvaluator.py
+++ b/mteb/evaluation/evaluators/Image/VisualSTSEvaluator.py
@@ -78,14 +78,14 @@ def __call__(
             batch_size=encode_kwargs["batch_size"],
             shuffle=False,
             collate_fn=custom_collate_fn,
-            num_workers=math.floor(os.cpu_count() / 2),
+            num_workers=min(math.floor(os.cpu_count() / 2), 16),
         )
         sentence2_dataloader = DataLoader(
             self.sentence2_dataset,
             batch_size=encode_kwargs["batch_size"],
             shuffle=False,
             collate_fn=custom_collate_fn,
-            num_workers=math.floor(os.cpu_count() / 2),
+            num_workers=min(math.floor(os.cpu_count() / 2), 16),
         )
 
         embeddings1 = model.get_image_embeddings(
diff --git a/mteb/evaluation/evaluators/Image/ZeroshotClassificationEvaluator.py b/mteb/evaluation/evaluators/Image/ZeroshotClassificationEvaluator.py
index d082f13517..0b3b7f8f67 100644
--- a/mteb/evaluation/evaluators/Image/ZeroshotClassificationEvaluator.py
+++ b/mteb/evaluation/evaluators/Image/ZeroshotClassificationEvaluator.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 import logging
+import math
+import os
 from typing import Any
 
 import torch
@@ -66,7 +68,7 @@ def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] = {}):
             batch_size=encode_kwargs["batch_size"],
             shuffle=False,
             collate_fn=custom_collate_fn,
-            num_workers=16,
+            num_workers=min(math.floor(os.cpu_count() / 2), 16),
         )
 
         text_embeddings = model.get_text_embeddings(
diff --git a/mteb/models/cohere_v.py b/mteb/models/cohere_v.py
index cf98f7dc3c..d53fd662e8 100644
--- a/mteb/models/cohere_v.py
+++ b/mteb/models/cohere_v.py
@@ -1,5 +1,9 @@
 from __future__ import annotations
 
+import base64
+import io
+import os
+import time
 from functools import partial
 from typing import Any
 
@@ -8,11 +12,8 @@
 from torch.utils.data import DataLoader
 from torchvision import transforms
 from tqdm import tqdm
-import os
-import io
-import base64
+
 import mteb
-import time
 from mteb.model_meta import ModelMeta
 
 api_key = os.getenv("COHERE_API_KEY")
diff --git a/mteb/models/e5_v.py b/mteb/models/e5_v.py
index 70bc20cabf..5647bee380 100644
--- a/mteb/models/e5_v.py
+++ b/mteb/models/e5_v.py
@@ -21,6 +21,8 @@ def __init__(
     ):
         self.model_name = model_name
         self.processor = LlavaNextProcessor.from_pretrained(model_name)
+        if "device" in kwargs:
+            self.device = kwargs.pop("device")
         self.model = LlavaNextForConditionalGeneration.from_pretrained(
             model_name, **kwargs
         )
diff --git a/mteb/models/evaclip_models.py b/mteb/models/evaclip_models.py
index 1e123096ca..015c965c07 100644
--- a/mteb/models/evaclip_models.py
+++ b/mteb/models/evaclip_models.py
@@ -13,8 +13,8 @@
 def evaclip_loader(**kwargs):
     try:
-        import sys
         import os
+        import sys
 
         sys.path.insert(0, os.path.join(os.getcwd(), "EVA/EVA-CLIP/rei"))
diff --git a/scripts/run_mieb.py b/scripts/run_mieb.py
index 3fd3e3c81b..676fd966a7 100644
--- a/scripts/run_mieb.py
+++ b/scripts/run_mieb.py
@@ -43,6 +43,14 @@
     "laion/CLIP-ViT-B-32-laion2B-s34B-b79K",
     "TIGER-Lab/VLM2Vec-LoRA",
     "TIGER-Lab/VLM2Vec-Full",
+    "Salesforce/blip-itm-base-coco",
+    "Salesforce/blip-itm-large-coco",
"Salesforce/blip-itm-base-flickr", + "Salesforce/blip-itm-large-flickr", + "EVA02-CLIP-B-16", + "EVA02-CLIP-L-14", + "EVA02-CLIP-bigE-14", + "EVA02-CLIP-bigE-14-plus", # "embed-english-v3.0-v", # not feasible to run due to the 40 images/min constraint ]: model = mteb.get_model(model_name)