Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions mteb/evaluation/evaluators/Image/ClusteringEvaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,6 @@ def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] = {}):
)
ari = metrics.cluster.adjusted_rand_score(self.labels, cluster_assignment)

accuracy = metrics.accuracy_score(self.labels, cluster_assignment)

matrix = metrics.confusion_matrix(self.labels, cluster_assignment)

# get linear sum assignment
Expand All @@ -70,7 +68,6 @@ def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any] = {}):

return {
"v_measure": v_measure,
"accuracy": accuracy,
"nmi": nmi,
"ari": ari,
"cluster_accuracy": clustering_accuracy,
Expand Down
202 changes: 202 additions & 0 deletions mteb/models/cohere_v.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
from __future__ import annotations

from functools import partial
from typing import Any

import torch
from PIL import Image
from torch.utils.data import DataLoader
from torchvision import transforms
from tqdm import tqdm
import os
import io
import base64
import mteb
import time
from mteb.model_meta import ModelMeta

# Cohere credential, read once at import time from the COHERE_API_KEY
# environment variable; this is None when the variable is not set.
api_key = os.environ.get("COHERE_API_KEY")

# Turns a tensor yielded by a DataLoader back into a PIL image so it can be
# serialized for the API request.
tensor_to_image = transforms.Compose([transforms.ToPILImage()])


def cohere_v_loader(**kwargs):
    """Build a wrapper around the Cohere multimodal embedding API.

    The ``cohere`` SDK is imported lazily, so this module can be imported
    without the optional dependency installed.

    Args:
        **kwargs: Forwarded to ``CohereMultiModalModelWrapper``; must contain
            ``model_name``.

    Returns:
        A ``CohereMultiModalModelWrapper`` instance.

    Raises:
        ImportError: If the ``cohere`` package is not installed.
    """
    try:
        import cohere
    except ImportError as err:
        raise ImportError(
            "To use cohere models, please run `pip install cohere`."
        ) from err

    class CohereMultiModalModelWrapper:
        """Wrapper for Cohere multimodal embedding model.

        Do `export COHERE_API_KEY=<Your_Cohere_API_KEY>` before running eval
        scripts.

        Cohere currently supports 40 images/min, thus a pause of
        ``image_request_delay`` seconds (1.5 by default) is applied after each
        image request. Adjust it once the Cohere API changes capacity.
        """

        # Pillow modes that the JPEG encoder can write directly; any other
        # mode (e.g. RGBA, P) is converted to RGB before saving.
        _JPEG_WRITABLE_MODES = frozenset(
            {"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"}
        )

        def __init__(
            self,
            model_name: str,
            *,
            image_request_delay: float = 1.5,
            **kwargs: Any,
        ):
            """Create the API client.

            Args:
                model_name: Cohere model identifier sent with every request.
                image_request_delay: Seconds to sleep after each image
                    request; values <= 0 disable the pause.
                **kwargs: Accepted for loader compatibility and ignored.
            """
            self.model_name = model_name
            self.client = cohere.ClientV2(api_key)
            self.image_format = "JPEG"
            self.image_request_delay = image_request_delay

        def get_text_embeddings(
            self, texts: list[str], batch_size: int = 32
        ) -> torch.Tensor:
            """Embed ``texts`` in batches of ``batch_size``.

            Returns:
                The per-batch embeddings concatenated along dim 0.
            """
            all_text_embeddings = []

            for i in tqdm(range(0, len(texts), batch_size)):
                batch_texts = texts[i : i + batch_size]
                # NOTE(review): unlike the image request, no embedding_types
                # is passed here - confirm the SDK returns float embeddings
                # by default.
                response = self.client.embed(
                    texts=batch_texts,
                    model=self.model_name,
                    input_type="search_document",
                )
                all_text_embeddings.append(
                    torch.tensor(response.embeddings.float)
                )

            return torch.cat(all_text_embeddings, dim=0)

        def _image_to_data_uri(self, image: Image.Image) -> str:
            """Serialize a PIL image as a base64 ``data:`` URI."""
            if (
                self.image_format == "JPEG"
                and image.mode not in self._JPEG_WRITABLE_MODES
            ):
                # JPEG cannot store alpha/palette data; saving such an image
                # unchanged raises OSError.
                image = image.convert("RGB")
            buffered = io.BytesIO()
            image.save(buffered, format=self.image_format)
            stringified_buffer = base64.b64encode(buffered.getvalue()).decode(
                "utf-8"
            )
            content_type = f"image/{self.image_format.lower()}"
            return f"data:{content_type};base64,{stringified_buffer}"

        def _embed_image(self, image: Image.Image) -> torch.Tensor:
            """Embed one PIL image, then pause to respect the rate limit."""
            response = self.client.embed(
                model=self.model_name,
                input_type="image",
                embedding_types=["float"],
                images=[self._image_to_data_uri(image)],
            )
            embedding = torch.tensor(response.embeddings.float)
            if self.image_request_delay > 0:
                time.sleep(self.image_request_delay)
            return embedding

        def get_image_embeddings(
            self, images: list[Image.Image] | DataLoader, batch_size: int = 32
        ) -> torch.Tensor:
            """Embed images one request at a time.

            Args:
                images: Either a list of PIL images or a DataLoader whose
                    batches yield image tensors.
                batch_size: Slice size used when walking a list; it only
                    affects progress reporting, since each image is sent in
                    its own request.

            Returns:
                The per-image embeddings concatenated along dim 0.
            """
            all_image_embeddings = []

            # cohere only supports 1 image per call
            if isinstance(images, DataLoader):
                for batch in tqdm(images):
                    all_image_embeddings.extend(
                        self._embed_image(tensor_to_image(image))
                        for image in batch
                    )
            else:
                for i in tqdm(range(0, len(images), batch_size)):
                    all_image_embeddings.extend(
                        self._embed_image(image)
                        for image in images[i : i + batch_size]
                    )

            return torch.cat(all_image_embeddings, dim=0)

        def calculate_probs(self, text_embeddings, image_embeddings):
            """Return softmax over texts of 100 * cosine(image, text)."""
            text_embeddings = text_embeddings / text_embeddings.norm(
                dim=-1, keepdim=True
            )
            image_embeddings = image_embeddings / image_embeddings.norm(
                dim=-1, keepdim=True
            )
            logits = torch.matmul(image_embeddings, text_embeddings.T)
            return (logits * 100).softmax(dim=-1)

        def get_fused_embeddings(
            self,
            texts: list[str] | None = None,
            images: list[Image.Image] | DataLoader | None = None,
            fusion_mode="sum",
            batch_size: int = 32,
        ):
            """Embed texts and/or images, fusing them when both are given.

            Args:
                texts: Texts to embed, or None.
                images: Images to embed, or None.
                fusion_mode: How to combine text and image embeddings; only
                    ``"sum"`` is implemented.
                batch_size: Forwarded to the text and image embedding calls.

            Returns:
                The text embeddings, the image embeddings, or their
                element-wise sum when both inputs are provided.

            Raises:
                ValueError: If neither input is given, if the two inputs
                    produce a different number of embeddings, or if
                    ``fusion_mode`` is not implemented.
            """
            if texts is None and images is None:
                raise ValueError("Either texts or images must be provided")

            text_embeddings = None
            image_embeddings = None

            if texts is not None:
                text_embeddings = self.get_text_embeddings(texts, batch_size)

            if images is not None:
                image_embeddings = self.get_image_embeddings(images, batch_size)

            # Single-modality requests need no fusion.
            if text_embeddings is None:
                return image_embeddings
            if image_embeddings is None:
                return text_embeddings

            if len(text_embeddings) != len(image_embeddings):
                raise ValueError(
                    "The number of texts and images must have the same length"
                )
            if fusion_mode != "sum":
                # to do: add other fusion mode
                raise ValueError(
                    f"fusion mode {fusion_mode} hasn't been implemented"
                )
            return text_embeddings + image_embeddings

    return CohereMultiModalModelWrapper(**kwargs)


# Registry entry for the multilingual Cohere Embed v3 model served through the
# multimodal wrapper built by cohere_v_loader. The registered name carries a
# "-v" suffix on top of the API model name.
cohere_mult_3 = ModelMeta(
    # Identity
    name="embed-multilingual-v3.0-v",
    revision="1",
    release_date="2024-10-24",
    # How to instantiate the model
    loader=partial(cohere_v_loader, model_name="embed-multilingual-v3.0"),
    # Capabilities
    languages=[],  # Unknown, but support >100 languages
    embed_dim=1024,
    similarity_fn_name="cosine",
    max_tokens=None,
    # Model details that are not available for this entry
    open_source=False,
    license=None,
    n_parameters=None,
    memory_usage=None,
    framework=[],
)

# Registry entry for the English Cohere Embed v3 model served through the
# multimodal wrapper built by cohere_v_loader. The registered name carries a
# "-v" suffix on top of the API model name.
cohere_eng_3 = ModelMeta(
    # Identity
    name="embed-english-v3.0-v",
    revision="1",
    release_date="2024-10-24",
    # How to instantiate the model
    loader=partial(cohere_v_loader, model_name="embed-english-v3.0"),
    # Capabilities
    languages=["eng-Latn"],
    embed_dim=1024,
    similarity_fn_name="cosine",
    max_tokens=None,
    # Model details that are not available for this entry
    open_source=False,
    license=None,
    n_parameters=None,
    memory_usage=None,
    framework=[],
)

if __name__ == "__main__":
    # Manual smoke test: needs COHERE_API_KEY and network access.
    mdl = mteb.get_model(cohere_mult_3.name, cohere_mult_3.revision)
    # The wrapper defined in this file exposes get_text_embeddings; it has no
    # `encode` method, so calling `mdl.encode` would raise AttributeError.
    emb = mdl.get_text_embeddings(["Hello, world!"])
4 changes: 4 additions & 0 deletions mteb/models/overview.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
bm25,
clip_models,
cohere_models,
cohere_v,
dino_models,
e5_instruct,
e5_models,
Expand All @@ -38,6 +39,7 @@
ru_sentence_models,
salesforce_models,
sentence_transformers_models,
siglip_models,
vista_models,
vlm2vec_models,
voyage_models,
Expand All @@ -53,6 +55,7 @@
bm25,
clip_models,
cohere_models,
cohere_v,
dino_models,
e5_instruct,
e5_models,
Expand All @@ -73,6 +76,7 @@
ru_sentence_models,
salesforce_models,
sentence_transformers_models,
siglip_models,
vista_models,
voyage_models,
vlm2vec_models,
Expand Down
Loading