-
Notifications
You must be signed in to change notification settings - Fork 559
Add Voyage's multimodal embedding #1555
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,179 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import os | ||
| from functools import partial | ||
| from typing import Any | ||
|
|
||
| import torch | ||
| from PIL import Image | ||
| from torch.utils.data import DataLoader | ||
| from torchvision import transforms | ||
| from tqdm import tqdm | ||
|
|
||
| import mteb | ||
| from mteb.model_meta import ModelMeta | ||
|
|
||
# NOTE(review): voyageai.Client() reads VOYAGE_API_KEY from the environment
# itself; this module-level read is never referenced below — presumably kept
# for documentation/debugging. TODO confirm and consider removing.
api_key = os.getenv("VOYAGE_API_KEY")
# Converts a DataLoader tensor back to a PIL image for the Voyage API
# (assumes CHW image tensors from torchvision — TODO confirm with callers).
tensor_to_image = transforms.Compose([transforms.ToPILImage()])
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Move inside class? |
||
|
|
||
|
|
||
def voyage_v_loader(**kwargs):
    """Build a Voyage multimodal embedding model wrapper.

    The wrapper class is defined inside this loader so the optional
    third-party ``voyageai`` dependency is only imported when the model is
    actually requested.

    Args:
        **kwargs: Forwarded to ``VoyageMultiModalModelWrapper``; must include
            ``model_name``.

    Returns:
        An initialized ``VoyageMultiModalModelWrapper`` instance.

    Raises:
        ImportError: If the ``voyageai`` package is not installed.
    """
    try:
        import voyageai
    except ImportError:
        raise ImportError(
            "To use voyage models, please run `pip install -U voyageai`."
        )

    class VoyageMultiModalModelWrapper:
        """Thin wrapper around the Voyage multimodal embedding API for mteb."""

        def __init__(
            self,
            model_name: str,
            **kwargs: Any,
        ):
            self.model_name = model_name
            # The client reads VOYAGE_API_KEY from the environment.
            self.vo = voyageai.Client()

        def get_text_embeddings(
            self, texts: list[str], batch_size: int = 32, input_type=None
        ):
            """Embed texts in batches and return a stacked 2-D float tensor."""
            all_text_embeddings = []

            for i in tqdm(range(0, len(texts), batch_size)):
                # The API expects each document as a list of content pieces.
                batch_texts = [[text] for text in texts[i : i + batch_size]]
                all_text_embeddings += torch.tensor(
                    self.vo.multimodal_embed(
                        batch_texts, model=self.model_name, input_type=input_type
                    ).embeddings
                )
            return torch.vstack(all_text_embeddings)

        def get_image_embeddings(
            self,
            images: list[Image.Image] | DataLoader,
            batch_size: int = 32,
            input_type=None,
        ):
            """Embed images (PIL list or tensor DataLoader); returns a 2-D tensor."""
            all_image_embeddings = []

            if isinstance(images, DataLoader):
                for index, batch in enumerate(tqdm(images)):
                    if index == 0:
                        # Sanity-check only: the first (or only) batch may be
                        # smaller than batch_size for small datasets, so we
                        # assert <= rather than strict equality.
                        assert len(batch) <= batch_size
                    batch_images = [[tensor_to_image(image)] for image in batch]
                    all_image_embeddings += torch.tensor(
                        self.vo.multimodal_embed(
                            batch_images, model=self.model_name, input_type=input_type
                        ).embeddings
                    )
            else:
                for i in tqdm(range(0, len(images), batch_size)):
                    batch_images = [[image] for image in images[i : i + batch_size]]
                    all_image_embeddings += torch.tensor(
                        self.vo.multimodal_embed(
                            batch_images, model=self.model_name, input_type=input_type
                        ).embeddings
                    )
            return torch.vstack(all_image_embeddings)

        def calculate_probs(self, text_embeddings, image_embeddings):
            """Return softmaxed image-to-text similarity probabilities."""
            # L2-normalize both sides so the matmul yields cosine similarities.
            text_embeddings = text_embeddings / text_embeddings.norm(
                dim=-1, keepdim=True
            )
            image_embeddings = image_embeddings / image_embeddings.norm(
                dim=-1, keepdim=True
            )
            logits = torch.matmul(image_embeddings, text_embeddings.T)
            # Fixed temperature of 100, matching the CLIP convention.
            probs = (logits * 100).softmax(dim=-1)
            return probs

        def get_fused_embeddings(
            self,
            texts: list[str] | None = None,
            images: list[Image.Image] | DataLoader | None = None,
            batch_size: int = 32,
            input_type=None,
        ):
            """Embed texts, images, or positionally-paired (text, image) inputs.

            When both modalities are given, texts[i] is fused with images[i]
            into a single interleaved embedding per pair.

            Raises:
                ValueError: If neither ``texts`` nor ``images`` is provided.
            """
            if texts is None and images is None:
                raise ValueError("Either texts or images must be provided")

            if texts is not None and images is not None:
                interleaved_embeddings = []
                if isinstance(images, DataLoader):
                    for index, batch in enumerate(tqdm(images)):
                        if index == 0:
                            # See get_image_embeddings: last-batch sizes vary.
                            assert len(batch) <= batch_size
                        batch_images = [tensor_to_image(image) for image in batch]
                        batch_texts = texts[
                            index * batch_size : (index + 1) * batch_size
                        ]
                        interleaved_inputs = [
                            [text, image]
                            for image, text in zip(batch_images, batch_texts)
                        ]
                        interleaved_embeddings += torch.tensor(
                            self.vo.multimodal_embed(
                                interleaved_inputs,
                                model=self.model_name,
                                input_type=input_type,
                            ).embeddings
                        )
                else:
                    for i in tqdm(range(0, len(images), batch_size)):
                        batch_images = images[i : i + batch_size]
                        batch_texts = texts[i : i + batch_size]
                        interleaved_inputs = [
                            [text, image]
                            for image, text in zip(batch_images, batch_texts)
                        ]
                        interleaved_embeddings += torch.tensor(
                            self.vo.multimodal_embed(
                                interleaved_inputs,
                                model=self.model_name,
                                input_type=input_type,
                            ).embeddings
                        )
                return torch.vstack(interleaved_embeddings)

            if texts is not None:
                return self.get_text_embeddings(texts, batch_size)
            return self.get_image_embeddings(images, batch_size)

    return VoyageMultiModalModelWrapper(**kwargs)
|
|
||
|
|
||
# Metadata for Voyage's multimodal-3 embedding model (API-only, closed source).
# Renamed from the copy-paste slip `cohere_mult_3`; the old name is kept below
# as a backward-compatible alias for existing callers.
voyage_mult_3 = ModelMeta(
    loader=partial(voyage_v_loader, model_name="voyage-multimodal-3"),
    name="voyage-multimodal-3",
    languages=[],  # Unknown
    open_source=False,
    revision="1",
    release_date="2024-11-10",
    n_parameters=None,
    memory_usage=None,
    max_tokens=None,
    embed_dim=1024,
    license=None,
    similarity_fn_name="cosine",
    framework=[],
)

# Deprecated alias; prefer `voyage_mult_3`.
cohere_mult_3 = voyage_mult_3
|
|
||
if __name__ == "__main__":
    # Manual smoke test: resolve the registered model via mteb and embed one string.
    model = mteb.get_model(cohere_mult_3.name, cohere_mult_3.revision)
    embeddings = model.encode(["Hello, world!"])
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,22 @@ | ||
| { | ||
| "dataset_revision": "aadb3af77e9048adbea6b47c21a81e47dd092ae5", | ||
| "evaluation_time": 231.98164081573486, | ||
| "kg_co2_emissions": null, | ||
| "mteb_version": "1.14.15", | ||
| "scores": { | ||
| "test": [ | ||
| { | ||
| "ari": 0.46531392836213886, | ||
| "cluster_accuracy": 0.6042, | ||
| "hf_subset": "default", | ||
| "languages": [ | ||
| "eng-Latn" | ||
| ], | ||
| "main_score": 0.7515067668506068, | ||
| "nmi": 0.7515067668506068, | ||
| "v_measure": 0.7515067668506068 | ||
| } | ||
| ] | ||
| }, | ||
| "task_name": "CIFAR100Clustering" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,22 @@ | ||
| { | ||
| "dataset_revision": "0b2714987fa478483af9968de7c934580d0bb9a2", | ||
| "evaluation_time": 236.98327565193176, | ||
| "kg_co2_emissions": null, | ||
| "mteb_version": "1.14.15", | ||
| "scores": { | ||
| "test": [ | ||
| { | ||
| "ari": 0.7986159784876974, | ||
| "cluster_accuracy": 0.8244, | ||
| "hf_subset": "default", | ||
| "languages": [ | ||
| "eng-Latn" | ||
| ], | ||
| "main_score": 0.8622309717386565, | ||
| "nmi": 0.8622309717386565, | ||
| "v_measure": 0.8622309717386566 | ||
| } | ||
| ] | ||
| }, | ||
| "task_name": "CIFAR10Clustering" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
| { | ||
| "dataset_revision": "22409a927ab5cf68e3655023d51694587455fc99", | ||
| "evaluation_time": 444.4619266986847, | ||
| "kg_co2_emissions": null, | ||
| "mteb_version": "1.14.15", | ||
| "scores": { | ||
| "test": [ | ||
| { | ||
| "accuracy": 0.2639593908629442, | ||
| "hf_subset": "default", | ||
| "languages": [ | ||
| "eng-Latn" | ||
| ], | ||
| "main_score": 0.2639593908629442 | ||
| } | ||
| ] | ||
| }, | ||
| "task_name": "CVBenchCount" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
| { | ||
| "dataset_revision": "22409a927ab5cf68e3655023d51694587455fc99", | ||
| "evaluation_time": 413.3635849952698, | ||
| "kg_co2_emissions": null, | ||
| "mteb_version": "1.14.15", | ||
| "scores": { | ||
| "test": [ | ||
| { | ||
| "accuracy": 0.5316666666666666, | ||
| "hf_subset": "default", | ||
| "languages": [ | ||
| "eng-Latn" | ||
| ], | ||
| "main_score": 0.5316666666666666 | ||
| } | ||
| ] | ||
| }, | ||
| "task_name": "CVBenchDepth" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
| { | ||
| "dataset_revision": "22409a927ab5cf68e3655023d51694587455fc99", | ||
| "evaluation_time": 402.95056080818176, | ||
| "kg_co2_emissions": null, | ||
| "mteb_version": "1.14.15", | ||
| "scores": { | ||
| "test": [ | ||
| { | ||
| "accuracy": 0.475, | ||
| "hf_subset": "default", | ||
| "languages": [ | ||
| "eng-Latn" | ||
| ], | ||
| "main_score": 0.475 | ||
| } | ||
| ] | ||
| }, | ||
| "task_name": "CVBenchDistance" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
| { | ||
| "dataset_revision": "22409a927ab5cf68e3655023d51694587455fc99", | ||
| "evaluation_time": 201.59757685661316, | ||
| "kg_co2_emissions": null, | ||
| "mteb_version": "1.14.15", | ||
| "scores": { | ||
| "test": [ | ||
| { | ||
| "accuracy": 0.5353846153846153, | ||
| "hf_subset": "default", | ||
| "languages": [ | ||
| "eng-Latn" | ||
| ], | ||
| "main_score": 0.5353846153846153 | ||
| } | ||
| ] | ||
| }, | ||
| "task_name": "CVBenchRelation" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,22 @@ | ||
| { | ||
| "dataset_revision": "88f8a6d47c257895094c5ad81e67ba751771fc99", | ||
| "evaluation_time": 914.053968667984, | ||
| "kg_co2_emissions": null, | ||
| "mteb_version": "1.14.15", | ||
| "scores": { | ||
| "test": [ | ||
| { | ||
| "ari": 0.9808613040028806, | ||
| "cluster_accuracy": 0.9913076923076923, | ||
| "hf_subset": "default", | ||
| "languages": [ | ||
| "eng-Latn" | ||
| ], | ||
| "main_score": 0.9758070118084365, | ||
| "nmi": 0.9758070118084365, | ||
| "v_measure": 0.9758070118084365 | ||
| } | ||
| ] | ||
| }, | ||
| "task_name": "ImageNet10Clustering" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,22 @@ | ||
| { | ||
| "dataset_revision": "bfb6ad3b2109d26c9daddf14f98d315daa35ee72", | ||
| "evaluation_time": 78.63793706893921, | ||
| "kg_co2_emissions": null, | ||
| "mteb_version": "1.14.15", | ||
| "scores": { | ||
| "test": [ | ||
| { | ||
| "ari": 0.8119121646758497, | ||
| "cluster_accuracy": 0.820631970260223, | ||
| "hf_subset": "default", | ||
| "languages": [ | ||
| "eng-Latn" | ||
| ], | ||
| "main_score": 0.8381839891894807, | ||
| "nmi": 0.8381839891894807, | ||
| "v_measure": 0.8381839891894807 | ||
| } | ||
| ] | ||
| }, | ||
| "task_name": "ImageNetDog15Clustering" | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
delete