From 9556f99d6a2a12e665aadfe282708a7401de09a0 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Thu, 13 Feb 2025 23:35:14 +0900 Subject: [PATCH 001/233] misc: Add image classification descriptive stats implementation (#2045) * add ImageClassificationDescriptiveStatistics * add MNIST descriptive stats * use tuples instead * add label count and update docstrings * update MNIST example --- .../Image/AbsTaskImageClassification.py | 74 ++++++++++++++++++- mteb/abstasks/TaskMetadata.py | 32 +++++--- .../Image/ImageClassification/MNIST.json | 44 +++++++++++ 3 files changed, 135 insertions(+), 15 deletions(-) create mode 100644 mteb/descriptive_stats/Image/ImageClassification/MNIST.json diff --git a/mteb/abstasks/Image/AbsTaskImageClassification.py b/mteb/abstasks/Image/AbsTaskImageClassification.py index c7afbe1136..d3ba2dabf0 100644 --- a/mteb/abstasks/Image/AbsTaskImageClassification.py +++ b/mteb/abstasks/Image/AbsTaskImageClassification.py @@ -1,7 +1,7 @@ from __future__ import annotations import logging -from collections import defaultdict +from collections import Counter, defaultdict from typing import Any import numpy as np @@ -16,12 +16,45 @@ ImagelogRegClassificationEvaluator, ) from ..AbsTask import AbsTask, ScoresDict +from ..TaskMetadata import DescriptiveStatistics ImageFile.LOAD_TRUNCATED_IMAGES = True logger = logging.getLogger(__name__) +class ImageClassificationDescriptiveStatistics(DescriptiveStatistics): + """Descriptive statistics for ImageClassification + + Attributes: + num_samples: number of samples in the dataset. 
+ + min_image_width: Minimum width of images + average_image_width: Average width of images + max_image_width: Maximum width of images + + min_image_height: Minimum height of images + average_image_height: Average height of images + max_image_height: Maximum height of images + + unique_labels: Number of unique labels + labels: dict of label frequencies + """ + + num_samples: int + + min_image_width: float + average_image_width: float + max_image_width: float + + min_image_height: float + average_image_height: float + max_image_height: float + + unique_num_labels: int + labels: dict[str, dict[str, int]] + + class AbsTaskImageClassification(AbsTask): """Abstract class for kNN classification tasks The similarity is computed between pairs and the results are ranked. @@ -73,8 +106,43 @@ def _add_main_score(self, scores: dict[HFSubset, ScoresDict]) -> None: def _calculate_metrics_from_split( self, split: str, hf_subset: str | None = None, compute_overall: bool = False - ): - pass + ) -> ImageClassificationDescriptiveStatistics: + if hf_subset: + imgs = self.dataset[hf_subset][split][self.image_column_name] + labels = self.dataset[hf_subset][split][self.label_column_name] + elif compute_overall: + imgs = [] + labels = [] + for hf_subset in self.metadata.eval_langs: + imgs.extend(self.dataset[hf_subset][split][self.image_column_name]) + labels.extend(self.dataset[hf_subset][split][self.label_column_name]) + else: + imgs = self.dataset[split][self.image_column_name] + labels = self.dataset[split][self.label_column_name] + + num_samples = len(labels) + unique_num_labels = len(set(labels)) + label_count = Counter(labels) + + img_widths, img_heights = [], [] + for img in imgs: + width, height = img.size + img_heights.append(height) + img_widths.append(width) + + return ImageClassificationDescriptiveStatistics( + num_samples=num_samples, + unique_num_labels=unique_num_labels, + min_image_width=min(img_widths), + average_image_width=sum(img_widths) / len(img_widths), + 
max_image_width=max(img_widths), + min_image_height=min(img_heights), + average_image_height=sum(img_heights) / len(img_heights), + max_image_height=max(img_heights), + labels={ + str(label): {"count": count} for label, count in label_count.items() + }, + ) def evaluate( self, diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index 24b3c9fa23..db28dc1915 100644 --- a/mteb/abstasks/TaskMetadata.py +++ b/mteb/abstasks/TaskMetadata.py @@ -100,7 +100,20 @@ "rendered", "multiple", ] -TASK_TYPE = Literal[ + +MIEB_TASK_TYPE = ( + "Any2AnyMultiChoice", + "Any2AnyRetrieval", + "Any2TextMutipleChoice", + "ImageClustering", + "ImageClassification", + "ImageMultilabelClassification", + "ImageTextPairClassification", + "VisualSTS", + "ZeroShotClassification", +) + +TASK_TYPE = ( "BitextMining", "Classification", "MultilabelClassification", @@ -112,16 +125,9 @@ "Summarization", "InstructionRetrieval", "Speed", - "Any2AnyMultiChoice", - "Any2AnyRetrieval", - "Any2TextMutipleChoice", - "ImageClustering", - "ImageClassification", - "ImageMultilabelClassification", - "ImageTextPairClassification", - "VisualSTS", - "ZeroShotClassification", -] +) + MIEB_TASK_TYPE + +TASK_TYPE = Literal[TASK_TYPE] TASK_CATEGORY = Literal[ @@ -455,9 +461,11 @@ def descriptive_stats(self) -> dict[str, DescriptiveStatistics] | None: def descriptive_stat_path(self) -> Path: """Return the path to the descriptive statistics file.""" descriptive_stat_base_dir = Path(__file__).parent.parent / "descriptive_stats" + if self.type in MIEB_TASK_TYPE: + descriptive_stat_base_dir = descriptive_stat_base_dir / "Image" + task_type_dir = descriptive_stat_base_dir / self.type if not descriptive_stat_base_dir.exists(): descriptive_stat_base_dir.mkdir() - task_type_dir = descriptive_stat_base_dir / self.type if not task_type_dir.exists(): task_type_dir.mkdir() return task_type_dir / f"{self.name}.json" diff --git a/mteb/descriptive_stats/Image/ImageClassification/MNIST.json 
b/mteb/descriptive_stats/Image/ImageClassification/MNIST.json new file mode 100644 index 0000000000..c028ae135d --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClassification/MNIST.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 10000, + "unique_num_labels": 10, + "min_image_width": 28, + "average_image_width": 28.0, + "max_image_width": 28, + "min_image_height": 28, + "average_image_height": 28.0, + "max_image_height": 28, + "labels": { + "7": { + "count": 1028 + }, + "2": { + "count": 1032 + }, + "1": { + "count": 1135 + }, + "0": { + "count": 980 + }, + "4": { + "count": 982 + }, + "9": { + "count": 1009 + }, + "5": { + "count": 892 + }, + "6": { + "count": 958 + }, + "3": { + "count": 1010 + }, + "8": { + "count": 974 + } + } + } +} \ No newline at end of file From fadba483340e12cd1ccf15acd47b038f844f2b9b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 13 Feb 2025 14:37:20 +0000 Subject: [PATCH 002/233] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index ea5c742463..553cdd5acd 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -430,7 +430,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [MLSUMClusteringS2S.v2](https://huggingface.co/datasets/mteb/mlsum) (Scialom et al., 2020) | ['deu', 'fra', 'rus', 'spa'] | Clustering | s2s | [News, Written] | None | None | | [MMarcoReranking](https://github.com/unicamp-dl/mMARCO) (Luiz Henrique Bonifacio, 2021) | ['cmn'] | Reranking | s2s | | None | None | | [MMarcoRetrieval](https://arxiv.org/abs/2309.07597) (Shitao Xiao, 2024) | ['cmn'] | Retrieval | s2p | | None | None | -| [MNIST](https://en.wikipedia.org/wiki/MNIST_database) (LeCun et al., 2010) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [MNIST](https://en.wikipedia.org/wiki/MNIST_database) (LeCun et al., 2010) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 10000} | {'test': {'num_samples': 10000, 'unique_num_labels': 10, 'min_image_width': 28, 'average_image_width': 28.0, 'max_image_width': 28, 'min_image_height': 28, 'average_image_height': 28.0, 'max_image_height': 28, 'labels': {'7': {'count': 1028}, '2': {'count': 1032}, '1': {'count': 1135}, '0': {'count': 980}, '4': {'count': 982}, '9': {'count': 1009}, '5': {'count': 892}, '6': {'count': 958}, '3': {'count': 1010}, '8': {'count': 974}}}} | | [MNISTZeroShot](https://en.wikipedia.org/wiki/MNIST_database) (LeCun et al., 2010) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | | [MSCOCOI2TRetrieval](https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48) (Lin et al., 2014) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | None | None | | [MSCOCOT2IRetrieval](https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48) (Lin et al., 2014) | ['eng'] | Any2AnyRetrieval | t2i | [Encyclopaedic] | None | None | From 01fd6fbb2a7a2f54543a4b2a41ac96fd90cc61b2 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Thu, 13 Feb 2025 15:42:17 +0100 Subject: [PATCH 003/233] fix: Add column descriptions to leaderboard (#2039) * fix: Add column descriptions to leaderboard * removed existing overlap --- mteb/leaderboard/app.py 
| 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index caee5c2e28..8e8b40edfb 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -428,6 +428,18 @@ def filter_models( Based on community feedback and research findings, This definition could change in the future. """ ) + with gr.Accordion( + "What does the other columns mean?", + open=False, + ): + gr.Markdown( + """ +- **Number of Parameters**: This is the total number of parameters in the model including embedding parameters. A higher value means the model requires more CPU/GPU memory to run; thus, less is generally desirable. +- **Embedding Dimension**: This is the vector dimension of the embeddings that the model produces. When saving embeddings to disk, a higher dimension will require more space, thus less is usually desirable. +- **Max tokens**: This refers to how many tokens (=word pieces) the model can process. Generally, a larger value is desirable. +- **Zero-shot**: This indicates if the model is zero-shot on the benchmark. For more information on zero-shot see the info-box below. 
+ """ + ) with gr.Accordion( "Why is a model missing or not showing up?", open=False, From 35372238b1f345ecf1422cb967186d8059213d07 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Thu, 13 Feb 2025 15:45:55 +0100 Subject: [PATCH 004/233] fix: Add BRIGHT (long) and fix bug in TaskResult.filter_and_validate() (#2041) * fix: Add BRIGHT Long Fixes #1978 * fix: Add BRIGHT(long) * fix bug in task results * updated bright * updated tests for TaskResults --- mteb/benchmarks/benchmarks.py | 26 ++++++- mteb/load_results/task_results.py | 2 +- mteb/tasks/Retrieval/eng/BrightRetrieval.py | 1 - ...t_mteb_results.py => test_task_results.py} | 68 ++++++++++++++++--- 4 files changed, 83 insertions(+), 14 deletions(-) rename tests/test_load_results/{test_mteb_results.py => test_task_results.py} (60%) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 0a97261bb6..e0a62f08bf 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -1060,9 +1060,7 @@ BRIGHT = Benchmark( name="BRIGHT", - tasks=get_tasks( - tasks=["BrightRetrieval"], - ), + tasks=get_tasks(tasks=["BrightRetrieval"], eval_splits=["standard"]), description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval. BRIGHT is the first text retrieval benchmark that requires intensive reasoning to retrieve relevant documents with @@ -1079,6 +1077,28 @@ }""", ) + +BRIGHT_LONG = Benchmark( + name="BRIGHT (long)", + tasks=get_tasks(tasks=["BrightRetrieval"], eval_splits=["long"]), + description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval. +BRIGHT is the first text retrieval +benchmark that requires intensive reasoning to retrieve relevant documents with +a dataset consisting of 1,384 real-world queries spanning diverse domains, such as +economics, psychology, mathematics, and coding. These queries are drawn from +naturally occurring and carefully curated human data. 
+ +This is the long version of the benchmark, which only filter longer documents. + """, + reference="https://brightbenchmark.github.io/", + citation="""@article{su2024bright, + title={Bright: A realistic and challenging benchmark for reasoning-intensive retrieval}, + author={Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others}, + journal={arXiv preprint arXiv:2407.12883}, + year={2024} +}""", +) + CODE_RAG = Benchmark( name="CodeRAG", tasks=get_tasks( diff --git a/mteb/load_results/task_results.py b/mteb/load_results/task_results.py index 4ff2406934..2eabeeab02 100644 --- a/mteb/load_results/task_results.py +++ b/mteb/load_results/task_results.py @@ -525,7 +525,7 @@ def validate_and_filter_scores(self, task: AbsTask | None = None) -> TaskResult: if task is None: task = get_task(self.task_name) - splits = task.metadata.eval_splits + splits = task.eval_splits hf_subsets = task.hf_subsets hf_subsets = set(hf_subsets) diff --git a/mteb/tasks/Retrieval/eng/BrightRetrieval.py b/mteb/tasks/Retrieval/eng/BrightRetrieval.py index 393b121f3f..5ad887d476 100644 --- a/mteb/tasks/Retrieval/eng/BrightRetrieval.py +++ b/mteb/tasks/Retrieval/eng/BrightRetrieval.py @@ -50,7 +50,6 @@ class BrightRetrieval(MultilingualTask, AbsTaskRetrieval): domains=["Non-fiction", "Written"], task_subtypes=["Article retrieval"], license="cc-by-4.0", - socioeconomic_status="low", annotations_creators="derived", dialect=[], sample_creation="found", diff --git a/tests/test_load_results/test_mteb_results.py b/tests/test_load_results/test_task_results.py similarity index 60% rename from tests/test_load_results/test_mteb_results.py rename to tests/test_load_results/test_task_results.py index 6c22b390f3..75f8c6153c 100644 --- a/tests/test_load_results/test_mteb_results.py +++ b/tests/test_load_results/test_task_results.py @@ -34,7 +34,6 @@ class DummyTask(AbsTask): 
annotations_creators="derived", dialect=[], bibtex_citation="", - descriptive_stats={}, modalities=["text"], sample_creation="created", ) @@ -48,11 +47,11 @@ def _evaluate_subset(self, **kwargs): def _calculate_metrics_from_split( self, split: str, hf_subset: str | None = None, compute_overall=False ) -> dict[str, float]: - pass + return {} -def test_mteb_results(): - """Test TaskResult class (this is the same as the example in the docstring)""" +@pytest.fixture() +def task_result(): scores = { "train": { "en-de": { @@ -66,13 +65,19 @@ def test_mteb_results(): evaluation_time = 100 - mteb_results = TaskResult.from_task_results( + return TaskResult.from_task_results( task=DummyTask(), scores=scores, evaluation_time=evaluation_time ) - assert mteb_results.get_score() == 0.55 - assert mteb_results.get_score(languages=["eng"]) == 0.55 - assert mteb_results.get_score(languages=["fra"]) == 0.6 + +def test_task_results_get_score(task_result: TaskResult): + """Test TaskResult class (this is the same as the example in the docstring)""" + assert task_result.get_score() == 0.55 + assert task_result.get_score(languages=["eng"]) == 0.55 + assert task_result.get_score(languages=["fra"]) == 0.6 + + +def test_task_results_to_dict(task_result: TaskResult): dict_repr = { "dataset_revision": "1.0", "task_name": "dummy_task", @@ -94,7 +99,52 @@ def test_mteb_results(): ] }, } - assert mteb_results.to_dict() == dict_repr + assert task_result.to_dict() == dict_repr + + +def test_task_results_validate_and_filter(): + scores = { + "train": { + "en-de": { + "main_score": 0.5, + }, + "en-fr": { + "main_score": 0.6, + }, + }, + "test": { + "en-de": { + "main_score": 0.3, + }, + "en-fr": { + "main_score": 0.4, + }, + }, + } + + evaluation_time = 100 + + res = TaskResult.from_task_results( + task=DummyTask(), scores=scores, evaluation_time=evaluation_time + ) + + task = DummyTask() + task._eval_splits = ["train", "test"] + res1 = res.validate_and_filter_scores(task=task) + + assert 
res1.scores.keys() == {"train", "test"} + assert res1.get_score() == (0.5 + 0.6 + 0.3 + 0.4) / 4 + + task._eval_splits = ["test"] + res2 = res.validate_and_filter_scores(task=task) + assert res2.scores.keys() == {"test"} + assert res2.get_score() == (0.3 + 0.4) / 2 # only test scores + + task.hf_subsets = ["en-de"] + task._eval_splits = ["train", "test"] + res3 = res.validate_and_filter_scores(task=task) + assert res3.scores.keys() == {"train", "test"} + assert res3.get_score() == (0.5 + 0.3) / 2 # only en-de scores @pytest.mark.parametrize( From 68ff565004f39a6b91c1a3e0c9c1789b4d6ceec9 Mon Sep 17 00:00:00 2001 From: github-actions Date: Thu, 13 Feb 2025 14:58:30 +0000 Subject: [PATCH 005/233] 1.34.12 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1086a34e73..72641be32f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.34.11" +version = "1.34.12" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From eb32719210a05ab33cb0d33319b9aeafb6455a91 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Fri, 14 Feb 2025 01:29:50 +0900 Subject: [PATCH 006/233] misc: Add image clustering descriptive stats implementation (#2057) * add image clustering descirptive stats and run * finish off last one * remove script --- mteb/abstasks/Image/AbsTaskImageClustering.py | 73 ++- .../ImageClustering/CIFAR100Clustering.json | 314 +++++++++ .../ImageClustering/CIFAR10Clustering.json | 44 ++ .../ImageClustering/ImageNet10Clustering.json | 44 ++ .../ImageNetDog15Clustering.json | 59 ++ .../TinyImageNetClustering.json | 614 ++++++++++++++++++ 6 files changed, 1146 insertions(+), 2 deletions(-) create mode 100644 mteb/descriptive_stats/Image/ImageClustering/CIFAR100Clustering.json create mode 100644 
mteb/descriptive_stats/Image/ImageClustering/CIFAR10Clustering.json create mode 100644 mteb/descriptive_stats/Image/ImageClustering/ImageNet10Clustering.json create mode 100644 mteb/descriptive_stats/Image/ImageClustering/ImageNetDog15Clustering.json create mode 100644 mteb/descriptive_stats/Image/ImageClustering/TinyImageNetClustering.json diff --git a/mteb/abstasks/Image/AbsTaskImageClustering.py b/mteb/abstasks/Image/AbsTaskImageClustering.py index 8152bf10f7..585bf4ea5b 100644 --- a/mteb/abstasks/Image/AbsTaskImageClustering.py +++ b/mteb/abstasks/Image/AbsTaskImageClustering.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +from collections import Counter from typing import Any from datasets import Dataset @@ -10,10 +11,43 @@ from ...encoder_interface import Encoder from ...evaluation.evaluators import ImageClusteringEvaluator from ..AbsTask import AbsTask, ScoresDict +from ..TaskMetadata import DescriptiveStatistics logger = logging.getLogger(__name__) +class ImageClusteringDescriptiveStatistics(DescriptiveStatistics): + """Descriptive statistics for ImageClustering + + Attributes: + num_samples: number of samples in the dataset. + + min_image_width: Minimum width of images + average_image_width: Average width of images + max_image_width: Maximum width of images + + min_image_height: Minimum height of images + average_image_height: Average height of images + max_image_height: Maximum height of images + + unique_labels: Number of unique labels + labels: dict of label frequencies + """ + + num_samples: int + + min_image_width: float + average_image_width: float + max_image_width: float + + min_image_height: float + average_image_height: float + max_image_height: float + + unique_num_labels: int + labels: dict[str, dict[str, int]] + + class AbsTaskImageClustering(AbsTask): """Abstract class for Clustering tasks The similarity is computed between pairs and the results are ranked. 
@@ -34,8 +68,43 @@ def _add_main_score(self, scores: dict[HFSubset, ScoresDict]) -> None: def _calculate_metrics_from_split( self, split: str, hf_subset: str | None = None, compute_overall: bool = False - ): - pass + ) -> ImageClusteringDescriptiveStatistics: + if hf_subset: + imgs = self.dataset[hf_subset][split][self.image_column_name] + labels = self.dataset[hf_subset][split][self.label_column_name] + elif compute_overall: + imgs = [] + labels = [] + for hf_subset in self.metadata.eval_langs: + imgs.extend(self.dataset[hf_subset][split][self.image_column_name]) + labels.extend(self.dataset[hf_subset][split][self.label_column_name]) + else: + imgs = self.dataset[split][self.image_column_name] + labels = self.dataset[split][self.label_column_name] + + num_samples = len(labels) + unique_num_labels = len(set(labels)) + label_count = Counter(labels) + + img_widths, img_heights = [], [] + for img in imgs: + width, height = img.size + img_heights.append(height) + img_widths.append(width) + + return ImageClusteringDescriptiveStatistics( + num_samples=num_samples, + unique_num_labels=unique_num_labels, + min_image_width=min(img_widths), + average_image_width=sum(img_widths) / len(img_widths), + max_image_width=max(img_widths), + min_image_height=min(img_heights), + average_image_height=sum(img_heights) / len(img_heights), + max_image_height=max(img_heights), + labels={ + str(label): {"count": count} for label, count in label_count.items() + }, + ) def _evaluate_subset( self, diff --git a/mteb/descriptive_stats/Image/ImageClustering/CIFAR100Clustering.json b/mteb/descriptive_stats/Image/ImageClustering/CIFAR100Clustering.json new file mode 100644 index 0000000000..e8a282bc67 --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClustering/CIFAR100Clustering.json @@ -0,0 +1,314 @@ +{ + "test": { + "num_samples": 10000, + "unique_num_labels": 100, + "min_image_width": 32, + "average_image_width": 32.0, + "max_image_width": 32, + "min_image_height": 32, + 
"average_image_height": 32.0, + "max_image_height": 32, + "labels": { + "49": { + "count": 100 + }, + "33": { + "count": 100 + }, + "72": { + "count": 100 + }, + "51": { + "count": 100 + }, + "71": { + "count": 100 + }, + "92": { + "count": 100 + }, + "15": { + "count": 100 + }, + "14": { + "count": 100 + }, + "23": { + "count": 100 + }, + "0": { + "count": 100 + }, + "75": { + "count": 100 + }, + "81": { + "count": 100 + }, + "69": { + "count": 100 + }, + "40": { + "count": 100 + }, + "43": { + "count": 100 + }, + "97": { + "count": 100 + }, + "70": { + "count": 100 + }, + "53": { + "count": 100 + }, + "29": { + "count": 100 + }, + "21": { + "count": 100 + }, + "16": { + "count": 100 + }, + "39": { + "count": 100 + }, + "8": { + "count": 100 + }, + "20": { + "count": 100 + }, + "61": { + "count": 100 + }, + "41": { + "count": 100 + }, + "93": { + "count": 100 + }, + "56": { + "count": 100 + }, + "73": { + "count": 100 + }, + "58": { + "count": 100 + }, + "11": { + "count": 100 + }, + "25": { + "count": 100 + }, + "37": { + "count": 100 + }, + "63": { + "count": 100 + }, + "24": { + "count": 100 + }, + "22": { + "count": 100 + }, + "17": { + "count": 100 + }, + "4": { + "count": 100 + }, + "6": { + "count": 100 + }, + "9": { + "count": 100 + }, + "57": { + "count": 100 + }, + "2": { + "count": 100 + }, + "32": { + "count": 100 + }, + "52": { + "count": 100 + }, + "42": { + "count": 100 + }, + "77": { + "count": 100 + }, + "27": { + "count": 100 + }, + "65": { + "count": 100 + }, + "7": { + "count": 100 + }, + "35": { + "count": 100 + }, + "82": { + "count": 100 + }, + "66": { + "count": 100 + }, + "90": { + "count": 100 + }, + "67": { + "count": 100 + }, + "91": { + "count": 100 + }, + "10": { + "count": 100 + }, + "78": { + "count": 100 + }, + "54": { + "count": 100 + }, + "89": { + "count": 100 + }, + "18": { + "count": 100 + }, + "13": { + "count": 100 + }, + "50": { + "count": 100 + }, + "26": { + "count": 100 + }, + "83": { + "count": 100 + }, + "47": { + 
"count": 100 + }, + "95": { + "count": 100 + }, + "76": { + "count": 100 + }, + "59": { + "count": 100 + }, + "85": { + "count": 100 + }, + "19": { + "count": 100 + }, + "46": { + "count": 100 + }, + "1": { + "count": 100 + }, + "74": { + "count": 100 + }, + "60": { + "count": 100 + }, + "64": { + "count": 100 + }, + "45": { + "count": 100 + }, + "36": { + "count": 100 + }, + "87": { + "count": 100 + }, + "30": { + "count": 100 + }, + "99": { + "count": 100 + }, + "80": { + "count": 100 + }, + "28": { + "count": 100 + }, + "98": { + "count": 100 + }, + "12": { + "count": 100 + }, + "94": { + "count": 100 + }, + "68": { + "count": 100 + }, + "44": { + "count": 100 + }, + "31": { + "count": 100 + }, + "79": { + "count": 100 + }, + "34": { + "count": 100 + }, + "55": { + "count": 100 + }, + "62": { + "count": 100 + }, + "96": { + "count": 100 + }, + "84": { + "count": 100 + }, + "38": { + "count": 100 + }, + "86": { + "count": 100 + }, + "5": { + "count": 100 + }, + "48": { + "count": 100 + }, + "3": { + "count": 100 + }, + "88": { + "count": 100 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClustering/CIFAR10Clustering.json b/mteb/descriptive_stats/Image/ImageClustering/CIFAR10Clustering.json new file mode 100644 index 0000000000..34ff70e050 --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClustering/CIFAR10Clustering.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 10000, + "unique_num_labels": 10, + "min_image_width": 32, + "average_image_width": 32.0, + "max_image_width": 32, + "min_image_height": 32, + "average_image_height": 32.0, + "max_image_height": 32, + "labels": { + "3": { + "count": 1000 + }, + "8": { + "count": 1000 + }, + "0": { + "count": 1000 + }, + "6": { + "count": 1000 + }, + "1": { + "count": 1000 + }, + "9": { + "count": 1000 + }, + "5": { + "count": 1000 + }, + "7": { + "count": 1000 + }, + "4": { + "count": 1000 + }, + "2": { + "count": 1000 + } + } + } +} \ No newline at end of file diff 
--git a/mteb/descriptive_stats/Image/ImageClustering/ImageNet10Clustering.json b/mteb/descriptive_stats/Image/ImageClustering/ImageNet10Clustering.json new file mode 100644 index 0000000000..8d367e6406 --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClustering/ImageNet10Clustering.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 13000, + "unique_num_labels": 10, + "min_image_width": 224, + "average_image_width": 224.0, + "max_image_width": 224, + "min_image_height": 224, + "average_image_height": 224.0, + "max_image_height": 224, + "labels": { + "0": { + "count": 1300 + }, + "1": { + "count": 1300 + }, + "2": { + "count": 1300 + }, + "3": { + "count": 1300 + }, + "4": { + "count": 1300 + }, + "5": { + "count": 1300 + }, + "6": { + "count": 1300 + }, + "7": { + "count": 1300 + }, + "8": { + "count": 1300 + }, + "10": { + "count": 1300 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClustering/ImageNetDog15Clustering.json b/mteb/descriptive_stats/Image/ImageClustering/ImageNetDog15Clustering.json new file mode 100644 index 0000000000..7719d70e57 --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClustering/ImageNetDog15Clustering.json @@ -0,0 +1,59 @@ +{ + "test": { + "num_samples": 1076, + "unique_num_labels": 15, + "min_image_width": 224, + "average_image_width": 224.0, + "max_image_width": 224, + "min_image_height": 224, + "average_image_height": 224.0, + "max_image_height": 224, + "labels": { + "0": { + "count": 152 + }, + "1": { + "count": 88 + }, + "2": { + "count": 75 + }, + "3": { + "count": 96 + }, + "4": { + "count": 57 + }, + "5": { + "count": 50 + }, + "6": { + "count": 52 + }, + "7": { + "count": 50 + }, + "8": { + "count": 50 + }, + "9": { + "count": 50 + }, + "10": { + "count": 53 + }, + "11": { + "count": 57 + }, + "12": { + "count": 50 + }, + "13": { + "count": 100 + }, + "14": { + "count": 96 + } + } + } +} \ No newline at end of file diff --git 
a/mteb/descriptive_stats/Image/ImageClustering/TinyImageNetClustering.json b/mteb/descriptive_stats/Image/ImageClustering/TinyImageNetClustering.json new file mode 100644 index 0000000000..777c0bda4c --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClustering/TinyImageNetClustering.json @@ -0,0 +1,614 @@ +{ + "valid": { + "num_samples": 10000, + "unique_num_labels": 200, + "min_image_width": 64, + "average_image_width": 64.0, + "max_image_width": 64, + "min_image_height": 64, + "average_image_height": 64.0, + "max_image_height": 64, + "labels": { + "0": { + "count": 50 + }, + "1": { + "count": 50 + }, + "2": { + "count": 50 + }, + "3": { + "count": 50 + }, + "4": { + "count": 50 + }, + "5": { + "count": 50 + }, + "6": { + "count": 50 + }, + "7": { + "count": 50 + }, + "8": { + "count": 50 + }, + "9": { + "count": 50 + }, + "10": { + "count": 50 + }, + "11": { + "count": 50 + }, + "12": { + "count": 50 + }, + "13": { + "count": 50 + }, + "14": { + "count": 50 + }, + "15": { + "count": 50 + }, + "16": { + "count": 50 + }, + "17": { + "count": 50 + }, + "18": { + "count": 50 + }, + "19": { + "count": 50 + }, + "20": { + "count": 50 + }, + "21": { + "count": 50 + }, + "22": { + "count": 50 + }, + "23": { + "count": 50 + }, + "24": { + "count": 50 + }, + "25": { + "count": 50 + }, + "26": { + "count": 50 + }, + "27": { + "count": 50 + }, + "28": { + "count": 50 + }, + "29": { + "count": 50 + }, + "30": { + "count": 50 + }, + "31": { + "count": 50 + }, + "32": { + "count": 50 + }, + "33": { + "count": 50 + }, + "34": { + "count": 50 + }, + "35": { + "count": 50 + }, + "36": { + "count": 50 + }, + "37": { + "count": 50 + }, + "38": { + "count": 50 + }, + "39": { + "count": 50 + }, + "40": { + "count": 50 + }, + "41": { + "count": 50 + }, + "42": { + "count": 50 + }, + "43": { + "count": 50 + }, + "44": { + "count": 50 + }, + "45": { + "count": 50 + }, + "46": { + "count": 50 + }, + "47": { + "count": 50 + }, + "48": { + "count": 50 + }, + "49": { + "count": 50 + }, + 
"50": { + "count": 50 + }, + "51": { + "count": 50 + }, + "52": { + "count": 50 + }, + "53": { + "count": 50 + }, + "54": { + "count": 50 + }, + "55": { + "count": 50 + }, + "56": { + "count": 50 + }, + "57": { + "count": 50 + }, + "58": { + "count": 50 + }, + "59": { + "count": 50 + }, + "60": { + "count": 50 + }, + "61": { + "count": 50 + }, + "62": { + "count": 50 + }, + "63": { + "count": 50 + }, + "64": { + "count": 50 + }, + "65": { + "count": 50 + }, + "66": { + "count": 50 + }, + "67": { + "count": 50 + }, + "68": { + "count": 50 + }, + "69": { + "count": 50 + }, + "70": { + "count": 50 + }, + "71": { + "count": 50 + }, + "72": { + "count": 50 + }, + "73": { + "count": 50 + }, + "74": { + "count": 50 + }, + "75": { + "count": 50 + }, + "76": { + "count": 50 + }, + "77": { + "count": 50 + }, + "78": { + "count": 50 + }, + "79": { + "count": 50 + }, + "80": { + "count": 50 + }, + "81": { + "count": 50 + }, + "82": { + "count": 50 + }, + "83": { + "count": 50 + }, + "84": { + "count": 50 + }, + "85": { + "count": 50 + }, + "86": { + "count": 50 + }, + "87": { + "count": 50 + }, + "88": { + "count": 50 + }, + "89": { + "count": 50 + }, + "90": { + "count": 50 + }, + "91": { + "count": 50 + }, + "92": { + "count": 50 + }, + "93": { + "count": 50 + }, + "94": { + "count": 50 + }, + "95": { + "count": 50 + }, + "96": { + "count": 50 + }, + "97": { + "count": 50 + }, + "98": { + "count": 50 + }, + "99": { + "count": 50 + }, + "100": { + "count": 50 + }, + "101": { + "count": 50 + }, + "102": { + "count": 50 + }, + "103": { + "count": 50 + }, + "104": { + "count": 50 + }, + "105": { + "count": 50 + }, + "106": { + "count": 50 + }, + "107": { + "count": 50 + }, + "108": { + "count": 50 + }, + "109": { + "count": 50 + }, + "110": { + "count": 50 + }, + "111": { + "count": 50 + }, + "112": { + "count": 50 + }, + "113": { + "count": 50 + }, + "114": { + "count": 50 + }, + "115": { + "count": 50 + }, + "116": { + "count": 50 + }, + "117": { + "count": 50 + }, + "118": { 
+ "count": 50 + }, + "119": { + "count": 50 + }, + "120": { + "count": 50 + }, + "121": { + "count": 50 + }, + "122": { + "count": 50 + }, + "123": { + "count": 50 + }, + "124": { + "count": 50 + }, + "125": { + "count": 50 + }, + "126": { + "count": 50 + }, + "127": { + "count": 50 + }, + "128": { + "count": 50 + }, + "129": { + "count": 50 + }, + "130": { + "count": 50 + }, + "131": { + "count": 50 + }, + "132": { + "count": 50 + }, + "133": { + "count": 50 + }, + "134": { + "count": 50 + }, + "135": { + "count": 50 + }, + "136": { + "count": 50 + }, + "137": { + "count": 50 + }, + "138": { + "count": 50 + }, + "139": { + "count": 50 + }, + "140": { + "count": 50 + }, + "141": { + "count": 50 + }, + "142": { + "count": 50 + }, + "143": { + "count": 50 + }, + "144": { + "count": 50 + }, + "145": { + "count": 50 + }, + "146": { + "count": 50 + }, + "147": { + "count": 50 + }, + "148": { + "count": 50 + }, + "149": { + "count": 50 + }, + "150": { + "count": 50 + }, + "151": { + "count": 50 + }, + "152": { + "count": 50 + }, + "153": { + "count": 50 + }, + "154": { + "count": 50 + }, + "155": { + "count": 50 + }, + "156": { + "count": 50 + }, + "157": { + "count": 50 + }, + "158": { + "count": 50 + }, + "159": { + "count": 50 + }, + "160": { + "count": 50 + }, + "161": { + "count": 50 + }, + "162": { + "count": 50 + }, + "163": { + "count": 50 + }, + "164": { + "count": 50 + }, + "165": { + "count": 50 + }, + "166": { + "count": 50 + }, + "167": { + "count": 50 + }, + "168": { + "count": 50 + }, + "169": { + "count": 50 + }, + "170": { + "count": 50 + }, + "171": { + "count": 50 + }, + "172": { + "count": 50 + }, + "173": { + "count": 50 + }, + "174": { + "count": 50 + }, + "175": { + "count": 50 + }, + "176": { + "count": 50 + }, + "177": { + "count": 50 + }, + "178": { + "count": 50 + }, + "179": { + "count": 50 + }, + "180": { + "count": 50 + }, + "181": { + "count": 50 + }, + "182": { + "count": 50 + }, + "183": { + "count": 50 + }, + "184": { + "count": 50 + }, 
+ "185": { + "count": 50 + }, + "186": { + "count": 50 + }, + "187": { + "count": 50 + }, + "188": { + "count": 50 + }, + "189": { + "count": 50 + }, + "190": { + "count": 50 + }, + "191": { + "count": 50 + }, + "192": { + "count": 50 + }, + "193": { + "count": 50 + }, + "194": { + "count": 50 + }, + "195": { + "count": 50 + }, + "196": { + "count": 50 + }, + "197": { + "count": 50 + }, + "198": { + "count": 50 + }, + "199": { + "count": 50 + } + } + } +} \ No newline at end of file From 50b8e7ba10c9a33d2febbc25be8b69893d0b50e6 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Thu, 13 Feb 2025 17:30:27 +0100 Subject: [PATCH 007/233] fix: Update embed_dim for jina models (#2058) see https://github.com/embeddings-benchmark/results/pull/117 --- mteb/models/jina_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py index 01b10b1318..8d832c17f1 100644 --- a/mteb/models/jina_models.py +++ b/mteb/models/jina_models.py @@ -216,7 +216,7 @@ def encode( n_parameters=int(572 * 1e6), memory_usage_mb=1092, max_tokens=8194, - embed_dim=4096, + embed_dim=1024, license="cc-by-nc-4.0", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], From 48ef6f4d2351042b1f5397ea1d2e28f58df2ecc0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 13 Feb 2025 16:34:39 +0000 Subject: [PATCH 008/233] Update tasks table --- docs/tasks.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/tasks.md b/docs/tasks.md index 553cdd5acd..c210fbf0c5 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -76,9 +76,9 @@ The following tables give you an overview of the tasks in MTEB. 
| [CExaPPC](https://github.com/exaco/exappc) | ['fas'] | PairClassification | s2s | [Social, Web] | None | None | | [CIFAR10](https://huggingface.co/datasets/uoft-cs/cifar10) (Alex Krizhevsky, 2009) | ['eng'] | ImageClassification | i2i | [Web] | None | None | | [CIFAR100](https://huggingface.co/datasets/uoft-cs/cifar100) (Alex Krizhevsky, 2009) | ['eng'] | ImageClassification | i2t | [Web] | None | None | -| [CIFAR100Clustering](https://huggingface.co/datasets/uoft-cs/cifar100) (Alex Krizhevsky, 2009) | ['eng'] | ImageClustering | i2t | [Web] | None | None | +| [CIFAR100Clustering](https://huggingface.co/datasets/uoft-cs/cifar100) (Alex Krizhevsky, 2009) | ['eng'] | ImageClustering | i2t | [Web] | {'test': 10000} | {'test': {'num_samples': 10000, 'unique_num_labels': 100, 'min_image_width': 32, 'average_image_width': 32.0, 'max_image_width': 32, 'min_image_height': 32, 'average_image_height': 32.0, 'max_image_height': 32, 'labels': {'49': {'count': 100}, '33': {'count': 100}, '72': {'count': 100}, '51': {'count': 100}, '71': {'count': 100}, '92': {'count': 100}, '15': {'count': 100}, '14': {'count': 100}, '23': {'count': 100}, '0': {'count': 100}, '75': {'count': 100}, '81': {'count': 100}, '69': {'count': 100}, '40': {'count': 100}, '43': {'count': 100}, '97': {'count': 100}, '70': {'count': 100}, '53': {'count': 100}, '29': {'count': 100}, '21': {'count': 100}, '16': {'count': 100}, '39': {'count': 100}, '8': {'count': 100}, '20': {'count': 100}, '61': {'count': 100}, '41': {'count': 100}, '93': {'count': 100}, '56': {'count': 100}, '73': {'count': 100}, '58': {'count': 100}, '11': {'count': 100}, '25': {'count': 100}, '37': {'count': 100}, '63': {'count': 100}, '24': {'count': 100}, '22': {'count': 100}, '17': {'count': 100}, '4': {'count': 100}, '6': {'count': 100}, '9': {'count': 100}, '57': {'count': 100}, '2': {'count': 100}, '32': {'count': 100}, '52': {'count': 100}, '42': {'count': 100}, '77': {'count': 100}, '27': {'count': 100}, '65': {'count': 100}, 
'7': {'count': 100}, '35': {'count': 100}, '82': {'count': 100}, '66': {'count': 100}, '90': {'count': 100}, '67': {'count': 100}, '91': {'count': 100}, '10': {'count': 100}, '78': {'count': 100}, '54': {'count': 100}, '89': {'count': 100}, '18': {'count': 100}, '13': {'count': 100}, '50': {'count': 100}, '26': {'count': 100}, '83': {'count': 100}, '47': {'count': 100}, '95': {'count': 100}, '76': {'count': 100}, '59': {'count': 100}, '85': {'count': 100}, '19': {'count': 100}, '46': {'count': 100}, '1': {'count': 100}, '74': {'count': 100}, '60': {'count': 100}, '64': {'count': 100}, '45': {'count': 100}, '36': {'count': 100}, '87': {'count': 100}, '30': {'count': 100}, '99': {'count': 100}, '80': {'count': 100}, '28': {'count': 100}, '98': {'count': 100}, '12': {'count': 100}, '94': {'count': 100}, '68': {'count': 100}, '44': {'count': 100}, '31': {'count': 100}, '79': {'count': 100}, '34': {'count': 100}, '55': {'count': 100}, '62': {'count': 100}, '96': {'count': 100}, '84': {'count': 100}, '38': {'count': 100}, '86': {'count': 100}, '5': {'count': 100}, '48': {'count': 100}, '3': {'count': 100}, '88': {'count': 100}}}} | | [CIFAR100ZeroShot](https://huggingface.co/datasets/uoft-cs/cifar100) (Alex Krizhevsky, 2009) | ['eng'] | ZeroShotClassification | i2t | [Web] | None | None | -| [CIFAR10Clustering](https://huggingface.co/datasets/uoft-cs/cifar10) (Alex Krizhevsky, 2009) | ['eng'] | ImageClustering | i2i | [Web] | None | None | +| [CIFAR10Clustering](https://huggingface.co/datasets/uoft-cs/cifar10) (Alex Krizhevsky, 2009) | ['eng'] | ImageClustering | i2i | [Web] | {'test': 10000} | {'test': {'num_samples': 10000, 'unique_num_labels': 10, 'min_image_width': 32, 'average_image_width': 32.0, 'max_image_width': 32, 'min_image_height': 32, 'average_image_height': 32.0, 'max_image_height': 32, 'labels': {'3': {'count': 1000}, '8': {'count': 1000}, '0': {'count': 1000}, '6': {'count': 1000}, '1': {'count': 1000}, '9': {'count': 1000}, '5': {'count': 1000}, '7': 
{'count': 1000}, '4': {'count': 1000}, '2': {'count': 1000}}}} | | [CIFAR10ZeroShot](https://huggingface.co/datasets/uoft-cs/cifar10) (Alex Krizhevsky, 2009) | ['eng'] | ZeroShotClassification | i2t | [Web] | None | None | | [CIRRIT2IRetrieval](https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Image_Retrieval_on_Real-Life_Images_With_Pre-Trained_Vision-and-Language_Models_ICCV_2021_paper.html) (Liu et al., 2021) | ['eng'] | Any2AnyRetrieval | it2i | [Encyclopaedic] | None | None | | [CLEVRCountZeroShot](https://openaccess.thecvf.com/content_cvpr_2017/html/Johnson_CLEVR_A_Diagnostic_CVPR_2017_paper.html) (Johnson et al., 2017) | ['eng'] | ZeroShotClassification | i2t | [Constructed] | None | None | @@ -341,8 +341,8 @@ The following tables give you an overview of the tasks in MTEB. | [IWSLT2017BitextMining](https://aclanthology.org/2017.iwslt-1.1/) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'ita', 'jpn', 'kor', 'nld', 'ron'] | BitextMining | s2s | [Fiction, Non-fiction, Written] | {'validation': 21938} | {'validation': {'num_samples': 21938, 'number_of_characters': 4256244, 'unique_pairs': 21840, 'min_sentence1_length': 2, 'average_sentence1_length': 97.01, 'max_sentence1_length': 521, 'unique_sentence1': 11563, 'min_sentence2_length': 2, 'average_sentence2_length': 97.01, 'max_sentence2_length': 521, 'unique_sentence2': 11563, 'hf_subset_descriptive_stats': {'ar-en': {'num_samples': 888, 'number_of_characters': 172499, 'unique_pairs': 887, 'min_sentence1_length': 4, 'average_sentence1_length': 85.49, 'max_sentence1_length': 369, 'unique_sentence1': 887, 'min_sentence2_length': 10, 'average_sentence2_length': 108.77, 'max_sentence2_length': 462, 'unique_sentence2': 881}, 'de-en': {'num_samples': 888, 'number_of_characters': 202336, 'unique_pairs': 883, 'min_sentence1_length': 6, 'average_sentence1_length': 119.03, 'max_sentence1_length': 521, 'unique_sentence1': 881, 'min_sentence2_length': 10, 'average_sentence2_length': 108.83, 'max_sentence2_length': 462, 
'unique_sentence2': 881}, 'en-ar': {'num_samples': 888, 'number_of_characters': 172499, 'unique_pairs': 887, 'min_sentence1_length': 10, 'average_sentence1_length': 108.77, 'max_sentence1_length': 462, 'unique_sentence1': 881, 'min_sentence2_length': 4, 'average_sentence2_length': 85.49, 'max_sentence2_length': 369, 'unique_sentence2': 887}, 'en-de': {'num_samples': 888, 'number_of_characters': 202336, 'unique_pairs': 883, 'min_sentence1_length': 10, 'average_sentence1_length': 108.83, 'max_sentence1_length': 462, 'unique_sentence1': 881, 'min_sentence2_length': 6, 'average_sentence2_length': 119.03, 'max_sentence2_length': 521, 'unique_sentence2': 881}, 'en-fr': {'num_samples': 890, 'number_of_characters': 197619, 'unique_pairs': 883, 'min_sentence1_length': 10, 'average_sentence1_length': 108.41, 'max_sentence1_length': 462, 'unique_sentence1': 883, 'min_sentence2_length': 6, 'average_sentence2_length': 113.63, 'max_sentence2_length': 493, 'unique_sentence2': 881}, 'en-it': {'num_samples': 929, 'number_of_characters': 191803, 'unique_pairs': 924, 'min_sentence1_length': 10, 'average_sentence1_length': 103.0, 'max_sentence1_length': 433, 'unique_sentence1': 922, 'min_sentence2_length': 7, 'average_sentence2_length': 103.46, 'max_sentence2_length': 444, 'unique_sentence2': 918}, 'en-ja': {'num_samples': 871, 'number_of_characters': 132742, 'unique_pairs': 867, 'min_sentence1_length': 10, 'average_sentence1_length': 109.81, 'max_sentence1_length': 462, 'unique_sentence1': 864, 'min_sentence2_length': 5, 'average_sentence2_length': 42.59, 'max_sentence2_length': 225, 'unique_sentence2': 866}, 'en-ko': {'num_samples': 879, 'number_of_characters': 142659, 'unique_pairs': 874, 'min_sentence1_length': 10, 'average_sentence1_length': 107.74, 'max_sentence1_length': 462, 'unique_sentence1': 872, 'min_sentence2_length': 3, 'average_sentence2_length': 54.56, 'max_sentence2_length': 250, 'unique_sentence2': 872}, 'en-nl': {'num_samples': 1003, 'number_of_characters': 189637, 
'unique_pairs': 1000, 'min_sentence1_length': 10, 'average_sentence1_length': 95.27, 'max_sentence1_length': 433, 'unique_sentence1': 996, 'min_sentence2_length': 4, 'average_sentence2_length': 93.8, 'max_sentence2_length': 477, 'unique_sentence2': 1000}, 'en-ro': {'num_samples': 914, 'number_of_characters': 194128, 'unique_pairs': 910, 'min_sentence1_length': 10, 'average_sentence1_length': 104.72, 'max_sentence1_length': 433, 'unique_sentence1': 907, 'min_sentence2_length': 9, 'average_sentence2_length': 107.67, 'max_sentence2_length': 448, 'unique_sentence2': 910}, 'en-zh': {'num_samples': 879, 'number_of_characters': 131126, 'unique_pairs': 877, 'min_sentence1_length': 10, 'average_sentence1_length': 109.37, 'max_sentence1_length': 462, 'unique_sentence1': 872, 'min_sentence2_length': 2, 'average_sentence2_length': 39.81, 'max_sentence2_length': 230, 'unique_sentence2': 867}, 'fr-en': {'num_samples': 890, 'number_of_characters': 197619, 'unique_pairs': 883, 'min_sentence1_length': 6, 'average_sentence1_length': 113.63, 'max_sentence1_length': 493, 'unique_sentence1': 881, 'min_sentence2_length': 10, 'average_sentence2_length': 108.41, 'max_sentence2_length': 462, 'unique_sentence2': 883}, 'it-en': {'num_samples': 929, 'number_of_characters': 191803, 'unique_pairs': 924, 'min_sentence1_length': 7, 'average_sentence1_length': 103.46, 'max_sentence1_length': 444, 'unique_sentence1': 918, 'min_sentence2_length': 10, 'average_sentence2_length': 103.0, 'max_sentence2_length': 433, 'unique_sentence2': 922}, 'it-nl': {'num_samples': 1001, 'number_of_characters': 188858, 'unique_pairs': 998, 'min_sentence1_length': 7, 'average_sentence1_length': 94.64, 'max_sentence1_length': 459, 'unique_sentence1': 994, 'min_sentence2_length': 7, 'average_sentence2_length': 94.03, 'max_sentence2_length': 505, 'unique_sentence2': 998}, 'it-ro': {'num_samples': 914, 'number_of_characters': 193339, 'unique_pairs': 911, 'min_sentence1_length': 7, 'average_sentence1_length': 103.91, 
'max_sentence1_length': 435, 'unique_sentence1': 907, 'min_sentence2_length': 9, 'average_sentence2_length': 107.62, 'max_sentence2_length': 448, 'unique_sentence2': 910}, 'ja-en': {'num_samples': 871, 'number_of_characters': 132742, 'unique_pairs': 867, 'min_sentence1_length': 5, 'average_sentence1_length': 42.59, 'max_sentence1_length': 225, 'unique_sentence1': 866, 'min_sentence2_length': 10, 'average_sentence2_length': 109.81, 'max_sentence2_length': 462, 'unique_sentence2': 864}, 'ko-en': {'num_samples': 879, 'number_of_characters': 142659, 'unique_pairs': 874, 'min_sentence1_length': 3, 'average_sentence1_length': 54.56, 'max_sentence1_length': 250, 'unique_sentence1': 872, 'min_sentence2_length': 10, 'average_sentence2_length': 107.74, 'max_sentence2_length': 462, 'unique_sentence2': 872}, 'nl-en': {'num_samples': 1003, 'number_of_characters': 189637, 'unique_pairs': 1000, 'min_sentence1_length': 4, 'average_sentence1_length': 93.8, 'max_sentence1_length': 477, 'unique_sentence1': 1000, 'min_sentence2_length': 10, 'average_sentence2_length': 95.27, 'max_sentence2_length': 433, 'unique_sentence2': 996}, 'nl-it': {'num_samples': 1001, 'number_of_characters': 188858, 'unique_pairs': 998, 'min_sentence1_length': 7, 'average_sentence1_length': 94.03, 'max_sentence1_length': 505, 'unique_sentence1': 998, 'min_sentence2_length': 7, 'average_sentence2_length': 94.64, 'max_sentence2_length': 459, 'unique_sentence2': 994}, 'nl-ro': {'num_samples': 913, 'number_of_characters': 191376, 'unique_pairs': 911, 'min_sentence1_length': 7, 'average_sentence1_length': 102.02, 'max_sentence1_length': 478, 'unique_sentence1': 909, 'min_sentence2_length': 9, 'average_sentence2_length': 107.59, 'max_sentence2_length': 515, 'unique_sentence2': 909}, 'ro-en': {'num_samples': 914, 'number_of_characters': 194128, 'unique_pairs': 910, 'min_sentence1_length': 9, 'average_sentence1_length': 107.67, 'max_sentence1_length': 448, 'unique_sentence1': 910, 'min_sentence2_length': 10, 
'average_sentence2_length': 104.72, 'max_sentence2_length': 433, 'unique_sentence2': 907}, 'ro-it': {'num_samples': 914, 'number_of_characters': 193339, 'unique_pairs': 911, 'min_sentence1_length': 9, 'average_sentence1_length': 107.62, 'max_sentence1_length': 448, 'unique_sentence1': 910, 'min_sentence2_length': 7, 'average_sentence2_length': 103.91, 'max_sentence2_length': 435, 'unique_sentence2': 907}, 'ro-nl': {'num_samples': 913, 'number_of_characters': 191376, 'unique_pairs': 911, 'min_sentence1_length': 9, 'average_sentence1_length': 107.59, 'max_sentence1_length': 515, 'unique_sentence1': 909, 'min_sentence2_length': 7, 'average_sentence2_length': 102.02, 'max_sentence2_length': 478, 'unique_sentence2': 909}, 'zh-en': {'num_samples': 879, 'number_of_characters': 131126, 'unique_pairs': 877, 'min_sentence1_length': 2, 'average_sentence1_length': 39.81, 'max_sentence1_length': 230, 'unique_sentence1': 867, 'min_sentence2_length': 10, 'average_sentence2_length': 109.37, 'max_sentence2_length': 462, 'unique_sentence2': 872}}}} | | [ImageCoDeT2IMultiChoice](https://aclanthology.org/2022.acl-long.241.pdf) (Krojer et al., 2022) | ['eng'] | Any2AnyMultiChoice | it2i | [Web, Written] | None | None | | [ImageCoDeT2IRetrieval](https://aclanthology.org/2022.acl-long.241.pdf) (Krojer et al., 2022) | ['eng'] | Any2AnyRetrieval | t2i | [Web, Written] | None | None | -| [ImageNet10Clustering](https://www.kaggle.com/datasets/liusha249/imagenet10) (Deng et al., 2009) | ['eng'] | ImageClustering | i2t | [Web] | None | None | -| [ImageNetDog15Clustering](http://vision.stanford.edu/aditya86/ImageNetDogs/main.html) (Deng et al., 2009) | ['eng'] | ImageClustering | i2i | [Web] | None | None | +| [ImageNet10Clustering](https://www.kaggle.com/datasets/liusha249/imagenet10) (Deng et al., 2009) | ['eng'] | ImageClustering | i2t | [Web] | {'test': 13000} | {'test': {'num_samples': 13000, 'unique_num_labels': 10, 'min_image_width': 224, 'average_image_width': 224.0, 'max_image_width': 
224, 'min_image_height': 224, 'average_image_height': 224.0, 'max_image_height': 224, 'labels': {'0': {'count': 1300}, '1': {'count': 1300}, '2': {'count': 1300}, '3': {'count': 1300}, '4': {'count': 1300}, '5': {'count': 1300}, '6': {'count': 1300}, '7': {'count': 1300}, '8': {'count': 1300}, '10': {'count': 1300}}}} | +| [ImageNetDog15Clustering](http://vision.stanford.edu/aditya86/ImageNetDogs/main.html) (Deng et al., 2009) | ['eng'] | ImageClustering | i2i | [Web] | {'test': 1076} | {'test': {'num_samples': 1076, 'unique_num_labels': 15, 'min_image_width': 224, 'average_image_width': 224.0, 'max_image_width': 224, 'min_image_height': 224, 'average_image_height': 224.0, 'max_image_height': 224, 'labels': {'0': {'count': 152}, '1': {'count': 88}, '2': {'count': 75}, '3': {'count': 96}, '4': {'count': 57}, '5': {'count': 50}, '6': {'count': 52}, '7': {'count': 50}, '8': {'count': 50}, '9': {'count': 50}, '10': {'count': 53}, '11': {'count': 57}, '12': {'count': 50}, '13': {'count': 100}, '14': {'count': 96}}}} | | [Imagenet1k](https://ieeexplore.ieee.org/document/5206848) (Deng et al., 2009) | ['eng'] | ImageClassification | i2i | [Scene] | None | None | | [Imagenet1kZeroShot](https://ieeexplore.ieee.org/document/5206848) (Deng et al., 2009) | ['eng'] | ZeroShotClassification | i2t | [Scene] | None | None | | [ImdbClassification](http://www.aclweb.org/anthology/P11-1015) | ['eng'] | Classification | p2p | [Reviews, Written] | None | None | @@ -787,7 +787,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [TextualismToolPlainLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [ThuNewsClusteringP2P.v2](http://thuctc.thunlp.org/) (Sun et al., 2016) | ['cmn'] | Clustering | p2p | [News, Written] | None | None | | [ThuNewsClusteringS2S.v2](http://thuctc.thunlp.org/) (Sun et al., 2016) | ['cmn'] | Clustering | s2s | [News, Written] | None | None | -| [TinyImageNetClustering](https://huggingface.co/datasets/zh-plus/tiny-imagenet/viewer/default/valid) | ['eng'] | ImageClustering | i2i | [Reviews] | None | None | +| [TinyImageNetClustering](https://huggingface.co/datasets/zh-plus/tiny-imagenet/viewer/default/valid) | ['eng'] | ImageClustering | i2i | [Reviews] | {'valid': 10000} | {'valid': {'num_samples': 10000, 'unique_num_labels': 200, 'min_image_width': 64, 'average_image_width': 64.0, 'max_image_width': 64, 'min_image_height': 64, 'average_image_height': 64.0, 'max_image_height': 64, 'labels': {'0': {'count': 50}, '1': {'count': 50}, '2': {'count': 50}, '3': {'count': 50}, '4': {'count': 50}, '5': {'count': 50}, '6': {'count': 50}, '7': {'count': 50}, '8': {'count': 50}, '9': {'count': 50}, '10': {'count': 50}, '11': {'count': 50}, '12': {'count': 50}, '13': {'count': 50}, '14': {'count': 50}, '15': {'count': 50}, '16': {'count': 50}, '17': {'count': 50}, '18': {'count': 50}, '19': {'count': 50}, '20': {'count': 50}, '21': {'count': 50}, '22': {'count': 50}, '23': {'count': 50}, '24': {'count': 50}, '25': {'count': 50}, '26': {'count': 50}, '27': {'count': 50}, '28': {'count': 50}, '29': {'count': 50}, '30': {'count': 50}, '31': {'count': 50}, '32': {'count': 50}, '33': {'count': 50}, '34': {'count': 50}, '35': {'count': 50}, '36': {'count': 50}, '37': {'count': 50}, '38': {'count': 50}, '39': {'count': 50}, '40': {'count': 50}, '41': {'count': 50}, '42': {'count': 50}, '43': {'count': 50}, '44': {'count': 50}, '45': {'count': 50}, '46': {'count': 50}, 
'47': {'count': 50}, '48': {'count': 50}, '49': {'count': 50}, '50': {'count': 50}, '51': {'count': 50}, '52': {'count': 50}, '53': {'count': 50}, '54': {'count': 50}, '55': {'count': 50}, '56': {'count': 50}, '57': {'count': 50}, '58': {'count': 50}, '59': {'count': 50}, '60': {'count': 50}, '61': {'count': 50}, '62': {'count': 50}, '63': {'count': 50}, '64': {'count': 50}, '65': {'count': 50}, '66': {'count': 50}, '67': {'count': 50}, '68': {'count': 50}, '69': {'count': 50}, '70': {'count': 50}, '71': {'count': 50}, '72': {'count': 50}, '73': {'count': 50}, '74': {'count': 50}, '75': {'count': 50}, '76': {'count': 50}, '77': {'count': 50}, '78': {'count': 50}, '79': {'count': 50}, '80': {'count': 50}, '81': {'count': 50}, '82': {'count': 50}, '83': {'count': 50}, '84': {'count': 50}, '85': {'count': 50}, '86': {'count': 50}, '87': {'count': 50}, '88': {'count': 50}, '89': {'count': 50}, '90': {'count': 50}, '91': {'count': 50}, '92': {'count': 50}, '93': {'count': 50}, '94': {'count': 50}, '95': {'count': 50}, '96': {'count': 50}, '97': {'count': 50}, '98': {'count': 50}, '99': {'count': 50}, '100': {'count': 50}, '101': {'count': 50}, '102': {'count': 50}, '103': {'count': 50}, '104': {'count': 50}, '105': {'count': 50}, '106': {'count': 50}, '107': {'count': 50}, '108': {'count': 50}, '109': {'count': 50}, '110': {'count': 50}, '111': {'count': 50}, '112': {'count': 50}, '113': {'count': 50}, '114': {'count': 50}, '115': {'count': 50}, '116': {'count': 50}, '117': {'count': 50}, '118': {'count': 50}, '119': {'count': 50}, '120': {'count': 50}, '121': {'count': 50}, '122': {'count': 50}, '123': {'count': 50}, '124': {'count': 50}, '125': {'count': 50}, '126': {'count': 50}, '127': {'count': 50}, '128': {'count': 50}, '129': {'count': 50}, '130': {'count': 50}, '131': {'count': 50}, '132': {'count': 50}, '133': {'count': 50}, '134': {'count': 50}, '135': {'count': 50}, '136': {'count': 50}, '137': {'count': 50}, '138': {'count': 50}, '139': {'count': 50}, '140': 
{'count': 50}, '141': {'count': 50}, '142': {'count': 50}, '143': {'count': 50}, '144': {'count': 50}, '145': {'count': 50}, '146': {'count': 50}, '147': {'count': 50}, '148': {'count': 50}, '149': {'count': 50}, '150': {'count': 50}, '151': {'count': 50}, '152': {'count': 50}, '153': {'count': 50}, '154': {'count': 50}, '155': {'count': 50}, '156': {'count': 50}, '157': {'count': 50}, '158': {'count': 50}, '159': {'count': 50}, '160': {'count': 50}, '161': {'count': 50}, '162': {'count': 50}, '163': {'count': 50}, '164': {'count': 50}, '165': {'count': 50}, '166': {'count': 50}, '167': {'count': 50}, '168': {'count': 50}, '169': {'count': 50}, '170': {'count': 50}, '171': {'count': 50}, '172': {'count': 50}, '173': {'count': 50}, '174': {'count': 50}, '175': {'count': 50}, '176': {'count': 50}, '177': {'count': 50}, '178': {'count': 50}, '179': {'count': 50}, '180': {'count': 50}, '181': {'count': 50}, '182': {'count': 50}, '183': {'count': 50}, '184': {'count': 50}, '185': {'count': 50}, '186': {'count': 50}, '187': {'count': 50}, '188': {'count': 50}, '189': {'count': 50}, '190': {'count': 50}, '191': {'count': 50}, '192': {'count': 50}, '193': {'count': 50}, '194': {'count': 50}, '195': {'count': 50}, '196': {'count': 50}, '197': {'count': 50}, '198': {'count': 50}, '199': {'count': 50}}}} | | [TopiOCQA](https://mcgill-nlp.github.io/topiocqa) (Vaibhav Adlakha, 2022) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [TopiOCQAHardNegatives](https://mcgill-nlp.github.io/topiocqa) (Vaibhav Adlakha, 2022) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [Touche2020-Fa](https://huggingface.co/datasets/MCINext/touche2020-fa) | ['fas'] | Retrieval | s2p | [Spoken] | None | None | From 8b7f2f8511259a24a24e9d48fba2943f98c37329 Mon Sep 17 00:00:00 2001 From: github-actions Date: Thu, 13 Feb 2025 16:44:51 +0000 Subject: [PATCH 009/233] 1.34.13 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 72641be32f..f7ea7c6103 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.34.12" +version = "1.34.13" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 02d258307099782d233c92e7764e490a98c62903 Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Fri, 14 Feb 2025 00:33:08 +0300 Subject: [PATCH 010/233] Add giga embeddings (#1741) * add gigaembeddings * use jasper * fix name * create sentence_transformer instruct wrapper * apply instruction template * fix jasper * update meta --- mteb/models/jasper_models.py | 2 +- mteb/models/ru_sentence_models.py | 34 +++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/mteb/models/jasper_models.py b/mteb/models/jasper_models.py index 1cf0b53a54..47f2c0bd56 100644 --- a/mteb/models/jasper_models.py +++ b/mteb/models/jasper_models.py @@ -44,7 +44,7 @@ def encode( instruction = self.get_task_instruction(task_name, prompt_type) # to passage prompts won't be applied to passages - if prompt_type == PromptType.passage and task.metadata.type == "s2p": + if prompt_type == PromptType.passage and task.metadata.category == "s2p": instruction = None embeddings = self.model.encode( diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index ac468f47d2..d9a8bd1041 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -4,9 +4,12 @@ from functools import partial +import torch + from mteb.encoder_interface import PromptType from mteb.model_meta import ModelMeta, sentence_transformers_loader from mteb.models.bge_models import bge_m3_training_data +from mteb.models.instruct_wrapper import InstructSentenceTransformerWrapper rubert_tiny = ModelMeta( name="cointegrated/rubert-tiny", @@ -559,3 +562,34 @@ public_training_code=None, 
framework=["Sentence Transformers", "PyTorch"], ) + +giga_embeddings = ModelMeta( + loader=partial( + InstructSentenceTransformerWrapper, + model_name="ai-sage/Giga-Embeddings-instruct", + revision="646f5ff3587e74a18141c8d6b60d1cffd5897b92", + trust_remote_code=True, + instruction_template="Instruct: {instruction}\nQuery: ", + apply_instruction_to_passages=False, + model_kwargs={ + "torch_dtype": torch.bfloat16, + }, + ), + name="ai-sage/Giga-Embeddings-instruct", + languages=["eng_Latn", "rus_Cyrl"], + open_weights=True, + revision="646f5ff3587e74a18141c8d6b60d1cffd5897b92", + release_date="2024-12-13", + n_parameters=2_530_000_000, + memory_usage_mb=9649, + embed_dim=2048, + license="mit", + max_tokens=32768, + reference="https://huggingface.co/ai-sage/Giga-Embeddings-instruct", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + public_training_code=None, + public_training_data=None, + training_datasets=None, +) From 20df284af80d1623c0feb546e70a95d858e32dd8 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Fri, 14 Feb 2025 12:01:42 +0900 Subject: [PATCH 011/233] misc: Add ZS and multilabel image classification descriptive stats implementation (#2059) * add image clustering descirptive stats and run * finish off last one * remove script * add ImageMultilabelClassificationDescriptiveStatistics * add VOC2007 * add zeroshot and mnist example --- .../AbsTaskImageMultilabelClassification.py | 88 ++++++++++++++++++- .../Image/AbsTaskZeroshotClassification.py | 88 ++++++++++++++++++- .../VOC2007.json | 77 ++++++++++++++++ .../ZeroShotClassification/MNISTZeroShot.json | 47 ++++++++++ 4 files changed, 295 insertions(+), 5 deletions(-) create mode 100644 mteb/descriptive_stats/Image/ImageMultilabelClassification/VOC2007.json create mode 100644 mteb/descriptive_stats/Image/ZeroShotClassification/MNISTZeroShot.json diff --git a/mteb/abstasks/Image/AbsTaskImageMultilabelClassification.py 
b/mteb/abstasks/Image/AbsTaskImageMultilabelClassification.py index 26fd799d56..1e66e30cdc 100644 --- a/mteb/abstasks/Image/AbsTaskImageMultilabelClassification.py +++ b/mteb/abstasks/Image/AbsTaskImageMultilabelClassification.py @@ -2,7 +2,7 @@ import itertools import logging -from collections import defaultdict +from collections import Counter, defaultdict from typing import Any import numpy as np @@ -17,10 +17,49 @@ from ...encoder_interface import Encoder from ..AbsTask import AbsTask, ScoresDict +from ..TaskMetadata import DescriptiveStatistics logger = logging.getLogger(__name__) +class ImageMultilabelClassificationDescriptiveStatistics(DescriptiveStatistics): + """Descriptive statistics for ImageMultilabelClassification + + Attributes: + num_samples: number of samples in the dataset. + + min_image_width: Minimum width of images + average_image_width: Average width of images + max_image_width: Maximum width of images + + min_image_height: Minimum height of images + average_image_height: Average height of images + max_image_height: Maximum height of images + + min_labels_per_sample: Minimum number of labels per sample + average_label_per_sample: Average number of labels per sample + max_labels_per_sample: Maximum number of labels per sample + unique_labels: Number of unique labels + labels: dict of label frequencies + """ + + num_samples: int + + min_image_width: float + average_image_width: float + max_image_width: float + + min_image_height: float + average_image_height: float + max_image_height: float + + min_labels_per_sample: int + average_label_per_sample: float + max_labels_per_sample: int + unique_num_labels: int + labels: dict[str, dict[str, int]] + + def evaluate_classifier( embeddings_train: np.ndarray, y_train: np.ndarray, @@ -88,8 +127,51 @@ def _add_main_score(self, scores): def _calculate_metrics_from_split( self, split: str, hf_subset: str | None = None, compute_overall: bool = False - ): - pass + ) -> 
ImageMultilabelClassificationDescriptiveStatistics: + if hf_subset: + imgs = self.dataset[hf_subset][split][self.image_column_name] + labels = self.dataset[hf_subset][split][self.label_column_name] + elif compute_overall: + imgs = [] + labels = [] + for hf_subset in self.metadata.eval_langs: + imgs.extend(self.dataset[hf_subset][split][self.image_column_name]) + labels.extend(self.dataset[hf_subset][split][self.label_column_name]) + else: + imgs = self.dataset[split][self.image_column_name] + labels = self.dataset[split][self.label_column_name] + + num_samples = len(labels) + + label_len = [len(l) for l in labels] + total_label_len = sum(label_len) + total_labels = [] + for l in labels: + total_labels.extend(l if len(l) > 0 else [None]) + label_count = Counter(total_labels) + + img_widths, img_heights = [], [] + for img in imgs: + width, height = img.size + img_heights.append(height) + img_widths.append(width) + + return ImageMultilabelClassificationDescriptiveStatistics( + num_samples=num_samples, + min_image_width=min(img_widths), + average_image_width=sum(img_widths) / len(img_widths), + max_image_width=max(img_widths), + min_image_height=min(img_heights), + average_image_height=sum(img_heights) / len(img_heights), + max_image_height=max(img_heights), + min_labels_per_sample=min(label_len), + average_label_per_sample=total_label_len / len(labels), + max_labels_per_sample=max(label_len), + unique_num_labels=len(label_count), + labels={ + str(label): {"count": count} for label, count in label_count.items() + }, + ) def evaluate( self, diff --git a/mteb/abstasks/Image/AbsTaskZeroshotClassification.py b/mteb/abstasks/Image/AbsTaskZeroshotClassification.py index 36bdd27103..b69dbfe8d2 100644 --- a/mteb/abstasks/Image/AbsTaskZeroshotClassification.py +++ b/mteb/abstasks/Image/AbsTaskZeroshotClassification.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +from collections import Counter from typing import Any from datasets import Dataset @@ -8,10 
+9,51 @@ from ...encoder_interface import Encoder from ...evaluation.evaluators import ZeroshotClassificationEvaluator from ..AbsTask import AbsTask, ScoresDict +from ..TaskMetadata import DescriptiveStatistics logger = logging.getLogger(__name__) +class ZeroshotClassificationDescriptiveStatistics(DescriptiveStatistics): + """Descriptive statistics for ZeroshotClassification + + Attributes: + num_samples: number of samples in the dataset. + + min_image_width: Minimum width of images + average_image_width: Average width of images + max_image_width: Maximum width of images + + min_image_height: Minimum height of images + average_image_height: Average height of images + max_image_height: Maximum height of images + + unique_labels: Number of unique labels + labels: dict of label frequencies + + min_label_text_length: Minimum length of candidate label text + average_label_text_length: Average length of candidate label text + max_label_text_length: Maximum length of candidate label text + """ + + num_samples: int + + min_image_width: float + average_image_width: float + max_image_width: float + + min_image_height: float + average_image_height: float + max_image_height: float + + unique_num_labels: int + labels: dict[str, dict[str, int]] + + min_label_text_length: int + average_label_text_length: float + max_label_text_length: int + + class AbsTaskZeroshotClassification(AbsTask): """Abstract class for ZeroshotClassification tasks The similarity between an images and candidate text prompts, such as this is a dog/this is a cat. 
@@ -32,8 +74,50 @@ def _add_main_score(self, scores) -> None: def _calculate_metrics_from_split( self, split: str, hf_subset: str | None = None, compute_overall: bool = False - ): - pass + ) -> ZeroshotClassificationDescriptiveStatistics: + if hf_subset: + imgs = self.dataset[hf_subset][split][self.image_column_name] + labels = self.dataset[hf_subset][split][self.label_column_name] + elif compute_overall: + imgs = [] + labels = [] + for hf_subset in self.metadata.eval_langs: + imgs.extend(self.dataset[hf_subset][split][self.image_column_name]) + labels.extend(self.dataset[hf_subset][split][self.label_column_name]) + else: + imgs = self.dataset[split][self.image_column_name] + labels = self.dataset[split][self.label_column_name] + + num_samples = len(labels) + unique_num_labels = len(set(labels)) + label_count = Counter(labels) + + img_widths, img_heights = [], [] + for img in imgs: + width, height = img.size + img_heights.append(height) + img_widths.append(width) + + candidate_labels = self.get_candidate_labels() + candidate_labels_len = [len(c) for c in candidate_labels] + + return ZeroshotClassificationDescriptiveStatistics( + num_samples=num_samples, + unique_num_labels=unique_num_labels, + min_image_width=min(img_widths), + average_image_width=sum(img_widths) / len(img_widths), + max_image_width=max(img_widths), + min_image_height=min(img_heights), + average_image_height=sum(img_heights) / len(img_heights), + max_image_height=max(img_heights), + min_label_text_length=min(candidate_labels_len), + average_label_text_length=sum(candidate_labels_len) + / len(candidate_labels_len), + max_label_text_length=max(candidate_labels_len), + labels={ + str(label): {"count": count} for label, count in label_count.items() + }, + ) def _evaluate_subset( self, diff --git a/mteb/descriptive_stats/Image/ImageMultilabelClassification/VOC2007.json b/mteb/descriptive_stats/Image/ImageMultilabelClassification/VOC2007.json new file mode 100644 index 0000000000..3a001114c0 --- 
/dev/null +++ b/mteb/descriptive_stats/Image/ImageMultilabelClassification/VOC2007.json @@ -0,0 +1,77 @@ +{ + "test": { + "num_samples": 4952, + "min_image_width": 148, + "average_image_width": 471.24656704361877, + "max_image_width": 500, + "min_image_height": 139, + "average_image_height": 381.53776252019384, + "max_image_height": 500, + "min_labels_per_sample": 1, + "average_label_per_sample": 1.4161954765751212, + "max_labels_per_sample": 5, + "unique_num_labels": 20, + "labels": { + "14": { + "count": 2007 + }, + "11": { + "count": 418 + }, + "18": { + "count": 259 + }, + "17": { + "count": 223 + }, + "8": { + "count": 417 + }, + "6": { + "count": 721 + }, + "10": { + "count": 190 + }, + "15": { + "count": 224 + }, + "12": { + "count": 274 + }, + "7": { + "count": 322 + }, + "9": { + "count": 127 + }, + "5": { + "count": 174 + }, + "1": { + "count": 239 + }, + "13": { + "count": 222 + }, + "2": { + "count": 282 + }, + "19": { + "count": 229 + }, + "16": { + "count": 97 + }, + "0": { + "count": 204 + }, + "3": { + "count": 172 + }, + "4": { + "count": 212 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ZeroShotClassification/MNISTZeroShot.json b/mteb/descriptive_stats/Image/ZeroShotClassification/MNISTZeroShot.json new file mode 100644 index 0000000000..e6300993ff --- /dev/null +++ b/mteb/descriptive_stats/Image/ZeroShotClassification/MNISTZeroShot.json @@ -0,0 +1,47 @@ +{ + "test": { + "num_samples": 10000, + "unique_num_labels": 10, + "min_image_width": 28, + "average_image_width": 28.0, + "max_image_width": 28, + "min_image_height": 28, + "average_image_height": 28.0, + "max_image_height": 28, + "min_label_text_length": 27, + "average_label_text_length": 27.0, + "max_label_text_length": 27, + "labels": { + "7": { + "count": 1028 + }, + "2": { + "count": 1032 + }, + "1": { + "count": 1135 + }, + "0": { + "count": 980 + }, + "4": { + "count": 982 + }, + "9": { + "count": 1009 + }, + "5": { + "count": 892 + }, + "6": { + 
"count": 958 + }, + "3": { + "count": 1010 + }, + "8": { + "count": 974 + } + } + } +} \ No newline at end of file From e090330a6bbcf595d13ff682703cad524007b65e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 14 Feb 2025 03:04:04 +0000 Subject: [PATCH 012/233] Update tasks table --- docs/tasks.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/tasks.md b/docs/tasks.md index c210fbf0c5..6cba72c0c6 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -431,7 +431,7 @@ The following tables give you an overview of the tasks in MTEB. | [MMarcoReranking](https://github.com/unicamp-dl/mMARCO) (Luiz Henrique Bonifacio, 2021) | ['cmn'] | Reranking | s2s | | None | None | | [MMarcoRetrieval](https://arxiv.org/abs/2309.07597) (Shitao Xiao, 2024) | ['cmn'] | Retrieval | s2p | | None | None | | [MNIST](https://en.wikipedia.org/wiki/MNIST_database) (LeCun et al., 2010) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 10000} | {'test': {'num_samples': 10000, 'unique_num_labels': 10, 'min_image_width': 28, 'average_image_width': 28.0, 'max_image_width': 28, 'min_image_height': 28, 'average_image_height': 28.0, 'max_image_height': 28, 'labels': {'7': {'count': 1028}, '2': {'count': 1032}, '1': {'count': 1135}, '0': {'count': 980}, '4': {'count': 982}, '9': {'count': 1009}, '5': {'count': 892}, '6': {'count': 958}, '3': {'count': 1010}, '8': {'count': 974}}}} | -| [MNISTZeroShot](https://en.wikipedia.org/wiki/MNIST_database) (LeCun et al., 2010) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | +| [MNISTZeroShot](https://en.wikipedia.org/wiki/MNIST_database) (LeCun et al., 2010) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | {'test': 10000} | {'test': {'num_samples': 10000, 'unique_num_labels': 10, 'min_image_width': 28, 'average_image_width': 28.0, 'max_image_width': 28, 'min_image_height': 28, 'average_image_height': 28.0, 'max_image_height': 28, 'min_label_text_length': 27, 
'average_label_text_length': 27.0, 'max_label_text_length': 27, 'labels': {'7': {'count': 1028}, '2': {'count': 1032}, '1': {'count': 1135}, '0': {'count': 980}, '4': {'count': 982}, '9': {'count': 1009}, '5': {'count': 892}, '6': {'count': 958}, '3': {'count': 1010}, '8': {'count': 974}}}} | | [MSCOCOI2TRetrieval](https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48) (Lin et al., 2014) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | None | None | | [MSCOCOT2IRetrieval](https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48) (Lin et al., 2014) | ['eng'] | Any2AnyRetrieval | t2i | [Encyclopaedic] | None | None | | [MSMARCO](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Academic, Blog, Encyclopaedic, Government, Medical, News, Non-fiction, Reviews, Social, Web] | None | None | @@ -817,7 +817,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [UrduRomanSentimentClassification](https://archive.ics.uci.edu/dataset/458/roman+urdu+data+set) (Sharf,Zareen, 2018) | ['urd'] | Classification | s2s | [Social, Written] | None | None | | [VGHierarchicalClusteringP2P](https://huggingface.co/datasets/navjordj/VG_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | p2p | [News, Non-fiction, Written] | None | None | | [VGHierarchicalClusteringS2S](https://huggingface.co/datasets/navjordj/VG_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | p2p | [News, Non-fiction, Written] | None | None | -| [VOC2007](http://host.robots.ox.ac.uk/pascal/VOC/) | ['eng'] | ImageMultilabelClassification | i2i | [Encyclopaedic] | None | None | +| [VOC2007](http://host.robots.ox.ac.uk/pascal/VOC/) | ['eng'] | ImageMultilabelClassification | i2i | [Encyclopaedic] | {'test': 4952} | {'test': {'num_samples': 4952, 'min_image_width': 148, 'average_image_width': 471.25, 'max_image_width': 500, 'min_image_height': 139, 'average_image_height': 381.54, 'max_image_height': 500, 'min_labels_per_sample': 1, 'average_label_per_sample': 1.42, 'max_labels_per_sample': 5, 'unique_num_labels': 20, 'labels': {'14': {'count': 2007}, '11': {'count': 418}, '18': {'count': 259}, '17': {'count': 223}, '8': {'count': 417}, '6': {'count': 721}, '10': {'count': 190}, '15': {'count': 224}, '12': {'count': 274}, '7': {'count': 322}, '9': {'count': 127}, '5': {'count': 174}, '1': {'count': 239}, '13': {'count': 222}, '2': {'count': 282}, '19': {'count': 229}, '16': {'count': 97}, '0': {'count': 204}, '3': {'count': 172}, '4': {'count': 212}}}} | | [VQA2IT2TRetrieval](https://openaccess.thecvf.com/content_cvpr_2017/html/Goyal_Making_the_v_CVPR_2017_paper.html) (Goyal et al., 2017) | ['eng'] | Any2AnyRetrieval | it2t | [Web] | None | None | | [VideoRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | None | | [VidoreArxivQARetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | 
Any2AnyRetrieval | t2i | [Academic] | None | None | From bef4046b51fc24b3e0648d30307efb2ea0d89163 Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Fri, 14 Feb 2025 07:09:18 +0300 Subject: [PATCH 013/233] Rename MIEB task classes with duplicated names (#2061) fix class names --- mteb/tasks/Image/ZeroshotClassification/eng/Birdsnap.py | 2 +- mteb/tasks/Image/ZeroshotClassification/eng/Caltech101.py | 2 +- mteb/tasks/Image/ZeroshotClassification/eng/Country211.py | 2 +- mteb/tasks/Image/ZeroshotClassification/eng/DTD.py | 2 +- mteb/tasks/Image/ZeroshotClassification/eng/EuroSAT.py | 2 +- mteb/tasks/Image/ZeroshotClassification/eng/FER2013.py | 2 +- mteb/tasks/Image/ZeroshotClassification/eng/FGVCAircraft.py | 2 +- mteb/tasks/Image/ZeroshotClassification/eng/Food101.py | 2 +- mteb/tasks/Image/ZeroshotClassification/eng/GTSRB.py | 2 +- mteb/tasks/Image/ZeroshotClassification/eng/Imagenet1k.py | 2 +- mteb/tasks/Image/ZeroshotClassification/eng/MNIST.py | 2 +- mteb/tasks/Image/ZeroshotClassification/eng/OxfordPets.py | 2 +- mteb/tasks/Image/ZeroshotClassification/eng/PatchCamelyon.py | 2 +- mteb/tasks/Image/ZeroshotClassification/eng/RESISC45.py | 2 +- mteb/tasks/Image/ZeroshotClassification/eng/STL10.py | 2 +- mteb/tasks/Image/ZeroshotClassification/eng/SUN397.py | 2 +- mteb/tasks/Image/ZeroshotClassification/eng/StanfordCars.py | 2 +- mteb/tasks/Image/ZeroshotClassification/eng/UCF101.py | 2 +- 18 files changed, 18 insertions(+), 18 deletions(-) diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/Birdsnap.py b/mteb/tasks/Image/ZeroshotClassification/eng/Birdsnap.py index 14609d08a6..0b97ecd7ee 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/Birdsnap.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/Birdsnap.py @@ -6,7 +6,7 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class BirdsnapClassification(AbsTaskZeroshotClassification): +class BirdsnapZeroshotClassification(AbsTaskZeroshotClassification): metadata = TaskMetadata( 
name="BirdsnapZeroShot", description="Classifying bird images from 500 species.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/Caltech101.py b/mteb/tasks/Image/ZeroshotClassification/eng/Caltech101.py index 749ac71273..610e677b0d 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/Caltech101.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/Caltech101.py @@ -6,7 +6,7 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class Caltech101Classification(AbsTaskZeroshotClassification): +class Caltech101ZeroshotClassification(AbsTaskZeroshotClassification): metadata = TaskMetadata( name="Caltech101ZeroShot", description="Classifying images of 101 widely varied objects.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/Country211.py b/mteb/tasks/Image/ZeroshotClassification/eng/Country211.py index eb0dd5158b..6e30a6c6ad 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/Country211.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/Country211.py @@ -8,7 +8,7 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class Country211Classification(AbsTaskZeroshotClassification): +class Country211ZeroshotClassification(AbsTaskZeroshotClassification): metadata = TaskMetadata( name="Country211ZeroShot", description="Classifying images of 211 countries.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/DTD.py b/mteb/tasks/Image/ZeroshotClassification/eng/DTD.py index 2d182e0854..8e60139718 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/DTD.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/DTD.py @@ -6,7 +6,7 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class DTDClassification(AbsTaskZeroshotClassification): +class DTDZeroshotClassification(AbsTaskZeroshotClassification): metadata = TaskMetadata( name="DTDZeroShot", description="Describable Textures Dataset in 47 categories.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/EuroSAT.py b/mteb/tasks/Image/ZeroshotClassification/eng/EuroSAT.py index 
85a1b13e5d..02d4955879 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/EuroSAT.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/EuroSAT.py @@ -6,7 +6,7 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class EuroSATClassification(AbsTaskZeroshotClassification): +class EuroSATZeroshotClassification(AbsTaskZeroshotClassification): metadata = TaskMetadata( name="EuroSATZeroShot", description="Classifying satellite images.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/FER2013.py b/mteb/tasks/Image/ZeroshotClassification/eng/FER2013.py index a0a391e235..dc17edeac5 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/FER2013.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/FER2013.py @@ -6,7 +6,7 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class FER2013Classification(AbsTaskZeroshotClassification): +class FER2013ZeroshotClassification(AbsTaskZeroshotClassification): metadata = TaskMetadata( name="FER2013ZeroShot", description="Classifying facial emotions.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/FGVCAircraft.py b/mteb/tasks/Image/ZeroshotClassification/eng/FGVCAircraft.py index 65af473d3f..0fe1a10738 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/FGVCAircraft.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/FGVCAircraft.py @@ -6,7 +6,7 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class FGVCAircraftClassification(AbsTaskZeroshotClassification): +class FGVCAircraftZeroShotClassification(AbsTaskZeroshotClassification): metadata = TaskMetadata( name="FGVCAircraftZeroShot", description="Classifying aircraft images from 41 manufacturers and 102 variants.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/Food101.py b/mteb/tasks/Image/ZeroshotClassification/eng/Food101.py index cc64484e65..a925df384c 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/Food101.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/Food101.py @@ -6,7 +6,7 @@ from mteb.abstasks.TaskMetadata 
import TaskMetadata -class Food101Classification(AbsTaskZeroshotClassification): +class Food101ZeroShotClassification(AbsTaskZeroshotClassification): metadata = TaskMetadata( name="Food101ZeroShot", description="Classifying food.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/GTSRB.py b/mteb/tasks/Image/ZeroshotClassification/eng/GTSRB.py index e08866b6bd..05292b2f40 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/GTSRB.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/GTSRB.py @@ -8,7 +8,7 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class GTSRBClassification(AbsTaskZeroshotClassification): +class GTSRBZeroshotClassification(AbsTaskZeroshotClassification): metadata = TaskMetadata( name="GTSRBZeroShot", description="""The German Traffic Sign Recognition Benchmark (GTSRB) is a multi-class classification dataset for traffic signs. It consists of dataset of more than 50,000 traffic sign images. The dataset comprises 43 classes with unbalanced class frequencies.""", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/Imagenet1k.py b/mteb/tasks/Image/ZeroshotClassification/eng/Imagenet1k.py index 53dce7feb1..3de153e634 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/Imagenet1k.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/Imagenet1k.py @@ -8,7 +8,7 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class Imagenet1kClassification(AbsTaskZeroshotClassification): +class Imagenet1kZeroshotClassification(AbsTaskZeroshotClassification): metadata = TaskMetadata( name="Imagenet1kZeroShot", description="ImageNet, a large-scale ontology of images built upon the backbone of the WordNet structure.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/MNIST.py b/mteb/tasks/Image/ZeroshotClassification/eng/MNIST.py index 6433104c90..df99dfda33 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/MNIST.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/MNIST.py @@ -6,7 +6,7 @@ from mteb.abstasks.TaskMetadata 
import TaskMetadata -class MNISTClassification(AbsTaskZeroshotClassification): +class MNISTZeroshotClassification(AbsTaskZeroshotClassification): metadata = TaskMetadata( name="MNISTZeroShot", description="Classifying handwritten digits.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/OxfordPets.py b/mteb/tasks/Image/ZeroshotClassification/eng/OxfordPets.py index 372d2fa7bf..043215d337 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/OxfordPets.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/OxfordPets.py @@ -6,7 +6,7 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class OxfordPetsClassification(AbsTaskZeroshotClassification): +class OxfordPetsZeroshotClassification(AbsTaskZeroshotClassification): metadata = TaskMetadata( name="OxfordPetsZeroShot", description="Classifying animal images.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/PatchCamelyon.py b/mteb/tasks/Image/ZeroshotClassification/eng/PatchCamelyon.py index 24b3e7a4b1..4621286079 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/PatchCamelyon.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/PatchCamelyon.py @@ -8,7 +8,7 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class PatchCamelyonClassification(AbsTaskZeroshotClassification): +class PatchCamelyonZeroshotClassification(AbsTaskZeroshotClassification): metadata = TaskMetadata( name="PatchCamelyonZeroShot", description="""Histopathology diagnosis classification dataset.""", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/RESISC45.py b/mteb/tasks/Image/ZeroshotClassification/eng/RESISC45.py index e58da7863e..98223093bd 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/RESISC45.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/RESISC45.py @@ -6,7 +6,7 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class RESISC45Classification(AbsTaskZeroshotClassification): +class RESISC45ZeroshotClassification(AbsTaskZeroshotClassification): metadata = TaskMetadata( 
name="RESISC45ZeroShot", description="Remote Sensing Image Scene Classification by Northwestern Polytechnical University (NWPU).", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/STL10.py b/mteb/tasks/Image/ZeroshotClassification/eng/STL10.py index 67357adc88..4a8d145ec3 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/STL10.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/STL10.py @@ -6,7 +6,7 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class STL10Classification(AbsTaskZeroshotClassification): +class STL10ZeroshotClassification(AbsTaskZeroshotClassification): metadata = TaskMetadata( name="STL10ZeroShot", description="Classifying 96x96 images from 10 classes.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/SUN397.py b/mteb/tasks/Image/ZeroshotClassification/eng/SUN397.py index c28bf146f1..fe01c38d89 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/SUN397.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/SUN397.py @@ -6,7 +6,7 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class SUN397Classification(AbsTaskZeroshotClassification): +class SUN397ZeroshotClassification(AbsTaskZeroshotClassification): metadata = TaskMetadata( name="SUN397ZeroShot", description="Large scale scene recognition in 397 categories.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/StanfordCars.py b/mteb/tasks/Image/ZeroshotClassification/eng/StanfordCars.py index d3e01a34ca..5b48b0f2c7 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/StanfordCars.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/StanfordCars.py @@ -6,7 +6,7 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class StanfordCarsClassification(AbsTaskZeroshotClassification): +class StanfordCarsZeroshotClassification(AbsTaskZeroshotClassification): metadata = TaskMetadata( name="StanfordCarsZeroShot", description="Classifying car images from 96 makes.", diff --git a/mteb/tasks/Image/ZeroshotClassification/eng/UCF101.py 
b/mteb/tasks/Image/ZeroshotClassification/eng/UCF101.py index b0d5293632..4ad52e52f6 100644 --- a/mteb/tasks/Image/ZeroshotClassification/eng/UCF101.py +++ b/mteb/tasks/Image/ZeroshotClassification/eng/UCF101.py @@ -6,7 +6,7 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -class UCF101Classification(AbsTaskZeroshotClassification): +class UCF101ZeroshotClassification(AbsTaskZeroshotClassification): metadata = TaskMetadata( name="UCF101ZeroShot", description="""UCF101 is an action recognition data set of realistic From 3cf7b158047498b516c46197a25c06d0d7004fa1 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Fri, 14 Feb 2025 19:21:12 +0900 Subject: [PATCH 014/233] misc: Add VisualSTS descriptive stats (#2062) * add visualsts stats * add last dataset --- mteb/abstasks/Image/AbsTaskVisualSTS.py | 81 +++- .../Image/VisualSTS/STS12VisualSTS.json | 20 + .../Image/VisualSTS/STS13VisualSTS.json | 20 + .../Image/VisualSTS/STS14VisualSTS.json | 20 + .../Image/VisualSTS/STS15VisualSTS.json | 20 + .../Image/VisualSTS/STS16VisualSTS.json | 20 + .../VisualSTS/STS17MultilingualVisualSTS.json | 220 ++++++++++ .../STSBenchmarkMultilingualVisualSTS.json | 402 ++++++++++++++++++ 8 files changed, 798 insertions(+), 5 deletions(-) create mode 100644 mteb/descriptive_stats/Image/VisualSTS/STS12VisualSTS.json create mode 100644 mteb/descriptive_stats/Image/VisualSTS/STS13VisualSTS.json create mode 100644 mteb/descriptive_stats/Image/VisualSTS/STS14VisualSTS.json create mode 100644 mteb/descriptive_stats/Image/VisualSTS/STS15VisualSTS.json create mode 100644 mteb/descriptive_stats/Image/VisualSTS/STS16VisualSTS.json create mode 100644 mteb/descriptive_stats/Image/VisualSTS/STS17MultilingualVisualSTS.json create mode 100644 mteb/descriptive_stats/Image/VisualSTS/STSBenchmarkMultilingualVisualSTS.json diff --git a/mteb/abstasks/Image/AbsTaskVisualSTS.py b/mteb/abstasks/Image/AbsTaskVisualSTS.py index 45de465eac..85a7306bfb 100644 --- a/mteb/abstasks/Image/AbsTaskVisualSTS.py +++ 
b/mteb/abstasks/Image/AbsTaskVisualSTS.py @@ -14,14 +14,49 @@ class VisualSTSDescriptiveStatistics(DescriptiveStatistics): Attributes: num_samples: number of samples in the dataset + + min_image1_width: Minimum width of images1 + average_image1_width: Average width of images1 + max_image1_width: Maximum width of images1 + + min_image1_height: Minimum height of images1 + average_image1_height: Average height of images1 + max_image1_height: Maximum height of images1 + + min_image2_width: Minimum width of images2 + average_image2_width: Average width of images2 + max_image2_width: Maximum width of images2 + + min_image2_height: Minimum height of images2 + average_image2_height: Average height of images2 + max_image2_height: Maximum height of images2 + + min_score: Minimum score avg_score: Average score + max_score: Maximum score """ - # TODO: what are useful stats for visual STS tasks? - # average_pixel_width; average_pixel_height; average non-white boxes? - num_samples: int + + min_image1_width: float + average_image1_width: float + max_image1_width: float + + min_image1_height: float + average_image1_height: float + max_image1_height: float + + min_image2_width: float + average_image2_width: float + max_image2_width: float + + min_image2_height: float + average_image2_height: float + max_image2_height: float + + min_score: float avg_score: float + max_score: float class AbsTaskVisualSTS(AbsTask): @@ -72,16 +107,52 @@ def _calculate_metrics_from_split( self, split: str, hf_subset: str | None = None, compute_overall: bool = False ) -> VisualSTSDescriptiveStatistics: if hf_subset: + images1 = self.dataset[hf_subset][split][self.sentences_column_names[0]] + images2 = self.dataset[hf_subset][split][self.sentences_column_names[1]] score = self.dataset[hf_subset][split]["score"] elif compute_overall: + images1, images2 = [], [] score = [] for hf_subset in self.metadata.eval_langs: + images1.extend( + self.dataset[hf_subset][split][self.sentences_column_names[0]] + ) + 
images2.extend( + self.dataset[hf_subset][split][self.sentences_column_names[1]] + ) score.extend(self.dataset[hf_subset][split]["score"]) else: + images1 = self.dataset[split][self.sentences_column_names[0]] + images2 = self.dataset[split][self.sentences_column_names[1]] score = self.dataset[split]["score"] - avg_score = sum(score) / len(score) + img_widths1, img_heights1 = [], [] + for img in images1: + width, height = img.size + img_heights1.append(height) + img_widths1.append(width) + + img_widths2, img_heights2 = [], [] + for img in images2: + width, height = img.size + img_heights2.append(height) + img_widths2.append(width) + return VisualSTSDescriptiveStatistics( num_samples=len(score), - avg_score=avg_score, + min_image1_width=min(img_widths1), + average_image1_width=sum(img_widths1) / len(img_widths1), + max_image1_width=max(img_widths1), + min_image1_height=min(img_heights1), + average_image1_height=sum(img_heights1) / len(img_heights1), + max_image1_height=max(img_heights1), + min_image2_width=min(img_widths2), + average_image2_width=sum(img_widths2) / len(img_widths2), + max_image2_width=max(img_widths2), + min_image2_height=min(img_heights2), + average_image2_height=sum(img_heights2) / len(img_heights2), + max_image2_height=max(img_heights2), + min_score=min(score), + avg_score=sum(score) / len(score), + max_score=max(score), ) diff --git a/mteb/descriptive_stats/Image/VisualSTS/STS12VisualSTS.json b/mteb/descriptive_stats/Image/VisualSTS/STS12VisualSTS.json new file mode 100644 index 0000000000..cde98bd69a --- /dev/null +++ b/mteb/descriptive_stats/Image/VisualSTS/STS12VisualSTS.json @@ -0,0 +1,20 @@ +{ + "test": { + "num_samples": 3108, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 
448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 3.5060643500643507, + "max_score": 5.0 + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/VisualSTS/STS13VisualSTS.json b/mteb/descriptive_stats/Image/VisualSTS/STS13VisualSTS.json new file mode 100644 index 0000000000..984259653a --- /dev/null +++ b/mteb/descriptive_stats/Image/VisualSTS/STS13VisualSTS.json @@ -0,0 +1,20 @@ +{ + "test": { + "num_samples": 1500, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.3361888888888864, + "max_score": 5.0 + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/VisualSTS/STS14VisualSTS.json b/mteb/descriptive_stats/Image/VisualSTS/STS14VisualSTS.json new file mode 100644 index 0000000000..34da6d586c --- /dev/null +++ b/mteb/descriptive_stats/Image/VisualSTS/STS14VisualSTS.json @@ -0,0 +1,20 @@ +{ + "test": { + "num_samples": 3750, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.8114334391534355, + "max_score": 5.0 + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/VisualSTS/STS15VisualSTS.json b/mteb/descriptive_stats/Image/VisualSTS/STS15VisualSTS.json new file mode 100644 index 0000000000..abc058e064 --- /dev/null +++ b/mteb/descriptive_stats/Image/VisualSTS/STS15VisualSTS.json @@ -0,0 +1,20 @@ +{ + "test": { + 
"num_samples": 3000, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.40591333333333, + "max_score": 5.0 + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/VisualSTS/STS16VisualSTS.json b/mteb/descriptive_stats/Image/VisualSTS/STS16VisualSTS.json new file mode 100644 index 0000000000..30ac86138f --- /dev/null +++ b/mteb/descriptive_stats/Image/VisualSTS/STS16VisualSTS.json @@ -0,0 +1,20 @@ +{ + "test": { + "num_samples": 1186, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.4131534569983137, + "max_score": 5.0 + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/VisualSTS/STS17MultilingualVisualSTS.json b/mteb/descriptive_stats/Image/VisualSTS/STS17MultilingualVisualSTS.json new file mode 100644 index 0000000000..1f5e666018 --- /dev/null +++ b/mteb/descriptive_stats/Image/VisualSTS/STS17MultilingualVisualSTS.json @@ -0,0 +1,220 @@ +{ + "test": { + "num_samples": 5346, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + 
"avg_score": 2.3554804214989464, + "max_score": 5.0, + "hf_subset_descriptive_stats": { + "ko-ko": { + "num_samples": 2846, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.469359920356055, + "max_score": 5.0 + }, + "ar-ar": { + "num_samples": 250, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.216800000000001, + "max_score": 5.0 + }, + "en-ar": { + "num_samples": 250, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.1423999999999994, + "max_score": 5.0 + }, + "en-de": { + "num_samples": 250, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.2776000000000014, + "max_score": 5.0 + }, + "en-en": { + "num_samples": 250, + 
"min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.2776000000000014, + "max_score": 5.0 + }, + "en-tr": { + "num_samples": 250, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.1335999999999986, + "max_score": 5.0 + }, + "es-en": { + "num_samples": 250, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.1464000000000003, + "max_score": 5.0 + }, + "es-es": { + "num_samples": 250, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.2312000000000007, + "max_score": 5.0 + }, + "fr-en": { + "num_samples": 250, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + 
"average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.2776000000000014, + "max_score": 5.0 + }, + "it-en": { + "num_samples": 250, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.2776000000000014, + "max_score": 5.0 + }, + "nl-en": { + "num_samples": 250, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.2776000000000014, + "max_score": 5.0 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/VisualSTS/STSBenchmarkMultilingualVisualSTS.json b/mteb/descriptive_stats/Image/VisualSTS/STSBenchmarkMultilingualVisualSTS.json new file mode 100644 index 0000000000..e8b961db0e --- /dev/null +++ b/mteb/descriptive_stats/Image/VisualSTS/STSBenchmarkMultilingualVisualSTS.json @@ -0,0 +1,402 @@ +{ + "dev": { + "num_samples": 15000, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + 
"max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.3639075540602206, + "max_score": 5.0, + "hf_subset_descriptive_stats": { + "en": { + "num_samples": 1500, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.3639075540602206, + "max_score": 5.0 + }, + "de": { + "num_samples": 1500, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.3639075540602206, + "max_score": 5.0 + }, + "es": { + "num_samples": 1500, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.3639075540602206, + "max_score": 5.0 + }, + "fr": { + "num_samples": 1500, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.3639075540602206, + "max_score": 5.0 + }, 
+ "it": { + "num_samples": 1500, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.3639075540602206, + "max_score": 5.0 + }, + "nl": { + "num_samples": 1500, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.3639075540602206, + "max_score": 5.0 + }, + "pl": { + "num_samples": 1500, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.3639075540602206, + "max_score": 5.0 + }, + "pt": { + "num_samples": 1500, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.3639075540602206, + "max_score": 5.0 + }, + "ru": { + "num_samples": 1500, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + 
"min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.3639075540602206, + "max_score": 5.0 + }, + "zh": { + "num_samples": 1500, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.3639075540602206, + "max_score": 5.0 + } + } + }, + "test": { + "num_samples": 13790, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.6079166059890806, + "max_score": 5.0, + "hf_subset_descriptive_stats": { + "en": { + "num_samples": 1379, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.6079166059890806, + "max_score": 5.0 + }, + "de": { + "num_samples": 1379, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + 
"min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.6079166059890806, + "max_score": 5.0 + }, + "es": { + "num_samples": 1379, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.6079166059890806, + "max_score": 5.0 + }, + "fr": { + "num_samples": 1379, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.6079166059890806, + "max_score": 5.0 + }, + "it": { + "num_samples": 1379, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.6079166059890806, + "max_score": 5.0 + }, + "nl": { + "num_samples": 1379, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + 
"average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.6079166059890806, + "max_score": 5.0 + }, + "pl": { + "num_samples": 1379, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.6079166059890806, + "max_score": 5.0 + }, + "pt": { + "num_samples": 1379, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.6079166059890806, + "max_score": 5.0 + }, + "ru": { + "num_samples": 1379, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.6079166059890806, + "max_score": 5.0 + }, + "zh": { + "num_samples": 1379, + "min_image1_width": 448, + "average_image1_width": 448.0, + "max_image1_width": 448, + "min_image1_height": 448, + "average_image1_height": 448.0, + "max_image1_height": 448, + "min_image2_width": 448, + "average_image2_width": 448.0, + "max_image2_width": 448, + "min_image2_height": 448, + "average_image2_height": 448.0, + "max_image2_height": 448, + "min_score": 0.0, + "avg_score": 2.6079166059890806, + "max_score": 5.0 + 
} + } + } +} \ No newline at end of file From 479fa206a9d2a1c3401fccd93786ac493769db56 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 14 Feb 2025 10:24:32 +0000 Subject: [PATCH 015/233] Update tasks table --- docs/tasks.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/tasks.md b/docs/tasks.md index 6cba72c0c6..cbd995961b 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -660,22 +660,22 @@ The following tables give you an overview of the tasks in MTEB. | [STL10](https://cs.stanford.edu/~acoates/stl10/) (Coates et al., 2011) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | | [STL10ZeroShot](https://cs.stanford.edu/~acoates/stl10/) (Coates et al., 2011) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | | [STS12](https://www.aclweb.org/anthology/S12-1051.pdf) (Agirre et al., 2012) | ['eng'] | STS | s2s | [Encyclopaedic, News, Written] | {'test': 3108} | {'test': {'num_samples': 3108, 'number_of_characters': 402118, 'min_sentence1_length': 3, 'average_sentence1_len': 63.79, 'max_sentence1_length': 220, 'unique_sentence1': 2236, 'min_sentence2_length': 7, 'average_sentence2_len': 65.59, 'max_sentence2_length': 204, 'unique_sentence2': 2797, 'min_score': 0.0, 'avg_score': 3.51, 'max_score': 5.0}} | -| [STS12VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS | i2i | [Encyclopaedic, News, Written] | None | None | +| [STS12VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS | i2i | [Encyclopaedic, News, Written] | {'test': 3108} | {'test': {'num_samples': 3108, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 
0.0, 'avg_score': 3.51, 'max_score': 5.0}} | | [STS13](https://www.aclweb.org/anthology/S13-1004/) (Eneko Agirre, 2013) | ['eng'] | STS | s2s | [News, Non-fiction, Web, Written] | None | None | -| [STS13VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS | i2i | [News, Non-fiction, Web, Written] | None | None | +| [STS13VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS | i2i | [News, Non-fiction, Web, Written] | {'test': 1500} | {'test': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.34, 'max_score': 5.0}} | | [STS14](https://www.aclweb.org/anthology/S14-1002) | ['eng'] | STS | s2s | [Blog, Spoken, Web] | None | None | -| [STS14VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS | i2i | [Blog, Spoken, Web] | None | None | +| [STS14VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS | i2i | [Blog, Spoken, Web] | {'test': 3750} | {'test': {'num_samples': 3750, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.81, 'max_score': 5.0}} | | [STS15](https://www.aclweb.org/anthology/S15-2010) | ['eng'] | STS | s2s | [Blog, News, Spoken, Web, Written] | None | None | -| [STS15VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS | i2i | [Blog, News, Spoken, 
Web, Written] | None | None | +| [STS15VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS | i2i | [Blog, News, Spoken, Web, Written] | {'test': 3000} | {'test': {'num_samples': 3000, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.41, 'max_score': 5.0}} | | [STS16](https://www.aclweb.org/anthology/S16-1001) | ['eng'] | STS | s2s | [Blog, Spoken, Web] | None | None | -| [STS16VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS | i2i | [Blog, Spoken, Web] | None | None | +| [STS16VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS | i2i | [Blog, Spoken, Web] | {'test': 1186} | {'test': {'num_samples': 1186, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.41, 'max_score': 5.0}} | | [STS17](https://alt.qcri.org/semeval2017/task1/) | ['ara', 'deu', 'eng', 'fra', 'ita', 'kor', 'nld', 'spa', 'tur'] | STS | s2s | [News, Web, Written] | {'test': 5346} | {'test': {'num_samples': 5346, 'number_of_characters': 400264, 'min_sentence1_length': 6, 'average_sentence1_len': 38.15, 'max_sentence1_length': 976, 'unique_sentence1': 4900, 'min_sentence2_length': 6, 'average_sentence2_len': 36.73, 'max_sentence2_length': 1007, 'unique_sentence2': 4470, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0, 'hf_subset_descriptive_stats': {'ko-ko': {'num_samples': 
2846, 'number_of_characters': 183387, 'min_sentence1_length': 6, 'average_sentence1_len': 31.99, 'max_sentence1_length': 976, 'unique_sentence1': 2650, 'min_sentence2_length': 6, 'average_sentence2_len': 32.44, 'max_sentence2_length': 1007, 'unique_sentence2': 2720, 'min_score': 0.0, 'avg_score': 2.47, 'max_score': 5.0}, 'ar-ar': {'num_samples': 250, 'number_of_characters': 16247, 'min_sentence1_length': 11, 'average_sentence1_len': 32.21, 'max_sentence1_length': 99, 'unique_sentence1': 250, 'min_sentence2_length': 9, 'average_sentence2_len': 32.78, 'max_sentence2_length': 83, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.22, 'max_score': 5.0}, 'en-ar': {'num_samples': 250, 'number_of_characters': 18764, 'min_sentence1_length': 13, 'average_sentence1_len': 42.36, 'max_sentence1_length': 105, 'unique_sentence1': 250, 'min_sentence2_length': 10, 'average_sentence2_len': 32.7, 'max_sentence2_length': 104, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.14, 'max_score': 5.0}, 'en-de': {'num_samples': 250, 'number_of_characters': 22177, 'min_sentence1_length': 12, 'average_sentence1_len': 43.95, 'max_sentence1_length': 94, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 44.76, 'max_sentence2_length': 104, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'en-en': {'num_samples': 250, 'number_of_characters': 21669, 'min_sentence1_length': 12, 'average_sentence1_len': 43.95, 'max_sentence1_length': 94, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'en-tr': {'num_samples': 250, 'number_of_characters': 20879, 'min_sentence1_length': 15, 'average_sentence1_len': 41.92, 'max_sentence1_length': 101, 'unique_sentence1': 250, 'min_sentence2_length': 10, 'average_sentence2_len': 41.6, 'max_sentence2_length': 107, 'unique_sentence2': 250, 'min_score': 
0.0, 'avg_score': 2.13, 'max_score': 5.0}, 'es-en': {'num_samples': 250, 'number_of_characters': 23216, 'min_sentence1_length': 12, 'average_sentence1_len': 50.84, 'max_sentence1_length': 160, 'unique_sentence1': 250, 'min_sentence2_length': 14, 'average_sentence2_len': 42.02, 'max_sentence2_length': 117, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.15, 'max_score': 5.0}, 'es-es': {'num_samples': 250, 'number_of_characters': 25265, 'min_sentence1_length': 18, 'average_sentence1_len': 49.84, 'max_sentence1_length': 136, 'unique_sentence1': 250, 'min_sentence2_length': 13, 'average_sentence2_len': 51.22, 'max_sentence2_length': 129, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.23, 'max_score': 5.0}, 'fr-en': {'num_samples': 250, 'number_of_characters': 23087, 'min_sentence1_length': 19, 'average_sentence1_len': 49.62, 'max_sentence1_length': 115, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'it-en': {'num_samples': 250, 'number_of_characters': 23188, 'min_sentence1_length': 15, 'average_sentence1_len': 50.03, 'max_sentence1_length': 113, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'nl-en': {'num_samples': 250, 'number_of_characters': 22385, 'min_sentence1_length': 14, 'average_sentence1_len': 46.82, 'max_sentence1_length': 123, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}}}} | -| [STS17MultilingualVisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['ara', 'deu', 'eng', 'fra', 'ita', 'kor', 'nld', 'spa', 'tur'] | VisualSTS | i2i | [News, Social, Spoken, Web, Written] | None | None | 
+| [STS17MultilingualVisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['ara', 'deu', 'eng', 'fra', 'ita', 'kor', 'nld', 'spa', 'tur'] | VisualSTS | i2i | [News, Social, Spoken, Web, Written] | {'test': 5346} | {'test': {'num_samples': 5346, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0, 'hf_subset_descriptive_stats': {'ko-ko': {'num_samples': 2846, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.47, 'max_score': 5.0}, 'ar-ar': {'num_samples': 250, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.22, 'max_score': 5.0}, 'en-ar': {'num_samples': 250, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.14, 'max_score': 5.0}, 'en-de': {'num_samples': 250, 'min_image1_width': 448, 
'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'en-en': {'num_samples': 250, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'en-tr': {'num_samples': 250, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.13, 'max_score': 5.0}, 'es-en': {'num_samples': 250, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.15, 'max_score': 5.0}, 'es-es': {'num_samples': 250, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 
2.23, 'max_score': 5.0}, 'fr-en': {'num_samples': 250, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'it-en': {'num_samples': 250, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'nl-en': {'num_samples': 250, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}}}} | | [STS22.v2](https://competitions.codalab.org/competitions/33835) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'ita', 'pol', 'rus', 'spa', 'tur'] | STS | p2p | [News, Written] | None | None | | [STSB](https://aclanthology.org/2021.emnlp-main.357) (Shitao Xiao, 2024) | ['cmn'] | STS | s2s | | None | None | | [STSBenchmark](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['eng'] | STS | s2s | [Blog, News, Written] | None | None | | [STSBenchmarkMultilingualSTS](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['cmn', 'deu', 'eng', 'fra', 'ita', 'nld', 'pol', 'por', 'rus', 'spa'] | STS | s2s | [News, Social, Spoken, Web, Written] | None | None | -| 
[STSBenchmarkMultilingualVisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['cmn', 'deu', 'eng', 'fra', 'ita', 'nld', 'pol', 'por', 'rus', 'spa'] | VisualSTS | i2i | [News, Social, Spoken, Web, Written] | None | None | +| [STSBenchmarkMultilingualVisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['cmn', 'deu', 'eng', 'fra', 'ita', 'nld', 'pol', 'por', 'rus', 'spa'] | VisualSTS | i2i | [News, Social, Spoken, Web, Written] | {'dev': 15000, 'test': 13790} | {'dev': {'num_samples': 15000, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0, 'hf_subset_descriptive_stats': {'en': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'de': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'es': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 
'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'fr': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'it': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'nl': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'pl': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'pt': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 
'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'ru': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'zh': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}}}, 'test': {'num_samples': 13790, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0, 'hf_subset_descriptive_stats': {'en': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'de': {'num_samples': 1379, 
'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'es': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'fr': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'it': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'nl': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 
0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'pl': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'pt': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'ru': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'zh': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}}}} | | [STSES](https://huggingface.co/datasets/PlanTL-GOB-ES/sts-es) (Agirre et al., 2015) | ['spa'] | STS | s2s | [Written] | None | None | | [SUN397](https://ieeexplore.ieee.org/abstract/document/5539970) (Xiao et al., 2010) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | | 
[SUN397ZeroShot](https://ieeexplore.ieee.org/abstract/document/5539970) (Xiao et al., 2010) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | From 76e05ddb0006620eaf0b8850c5fb37bb74b943e1 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 14 Feb 2025 15:50:05 +0100 Subject: [PATCH 016/233] fix: Added gte models (#1539) * fix: Added gte models * fix: Add mixbai models (#1540) for #1515 --- mteb/models/gte_models.py | 33 ++++++++++++++++++++++---- mteb/models/mxbai_models.py | 47 +++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 5 deletions(-) diff --git a/mteb/models/gte_models.py b/mteb/models/gte_models.py index 6ebf2548f0..8e681b0a31 100644 --- a/mteb/models/gte_models.py +++ b/mteb/models/gte_models.py @@ -125,7 +125,7 @@ def instruction_template( open_weights=True, revision="af7bd46fbb00b3a6963c8dd7f1786ddfbfbe973a", release_date="2023-11-08", # initial commit of hf model. - n_parameters=30.3 * 1e6, + n_parameters=int(30.3 * 1e6), memory_usage_mb=58, embed_dim=1024, license="mit", @@ -150,7 +150,7 @@ def instruction_template( open_weights=True, revision="71ab7947d6fac5b64aa299e6e40e6c2b2e85976c", release_date="2023-11-08", # initial commit of hf model. - n_parameters=102 * 1e6, + n_parameters=int(102 * 1e6), memory_usage_mb=195, embed_dim=1024, license="mit", @@ -175,7 +175,7 @@ def instruction_template( open_weights=True, revision="64c364e579de308104a9b2c170ca009502f4f545", release_date="2023-11-08", # initial commit of hf model. - n_parameters=326 * 1e6, + n_parameters=int(326 * 1e6), memory_usage_mb=621, embed_dim=1024, license="mit", @@ -297,7 +297,7 @@ def instruction_template( open_weights=True, revision="ca1791e0bcc104f6db161f27de1340241b13c5a4", release_date="2024-07-20", # initial commit of hf model. 
- n_parameters=305 * 1e6, + n_parameters=int(305 * 1e6), memory_usage_mb=582, embed_dim=1024, license="apache-2", @@ -322,7 +322,7 @@ def instruction_template( open_weights=True, revision="7ca8b4ca700621b67618669f5378fe5f5820b8e4", release_date="2025-01-21", # initial commit of hf model. - n_parameters=149 * 1e6, + n_parameters=int(149 * 1e6), memory_usage_mb=284, embed_dim=768, license="apache-2", @@ -335,3 +335,26 @@ def instruction_template( public_training_data=None, training_datasets=gte_multi_training_data, # English part of gte_multi_training_data, ) + + +gte_base_en_v15 = ModelMeta( + name="Alibaba-NLP/gte-base-en-v1.5", + languages=["eng-Latn"], + open_weights=True, + revision="a829fd0e060bb84554da0dfd354d0de0f7712b7f", # can be any + release_date="2024-06-20", # initial commit of hf model + n_parameters=137_000_000, + memory_usage_mb=None, + embed_dim=768, + license="apache-2.0", + max_tokens=8192, + reference="https://huggingface.co/Alibaba-NLP/gte-base-en-v1.5", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=None, + public_training_data=None, + training_datasets=None, +) diff --git a/mteb/models/mxbai_models.py b/mteb/models/mxbai_models.py index 133bbbed7f..b70d498ff4 100644 --- a/mteb/models/mxbai_models.py +++ b/mteb/models/mxbai_models.py @@ -31,3 +31,50 @@ public_training_data=None, training_datasets=None, ) + +mxbai_embed_2d_large_v1 = ModelMeta( + loader=None, + name="mixedbread-ai/mxbai-embed-2d-large-v1", + languages=["eng_Latn"], + open_weights=True, + revision="7e639ca8e344af398876ead3b19ec3c0b9068f49", + release_date="2024-03-04", # initial commit of hf model. 
+ n_parameters=335_000_000, + memory_usage_mb=None, + max_tokens=512, + embed_dim=768, + license="apache-2.0", + reference="https://huggingface.co/mixedbread-ai/mxbai-embed-2d-large-v1", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + adapted_from=None, + superseded_by=None, + public_training_code=None, + public_training_data=None, + training_datasets=None, +) + + +mxbai_embed_xsmall_v1 = ModelMeta( + loader=None, + name="mixedbread-ai/mxbai-embed-xsmall-v1", + languages=["eng_Latn"], + open_weights=True, + revision="2f741ec33328bb57e4704e1238fc59a4a5745705", + release_date="2024-08-13", # initial commit of hf model. + n_parameters=24_100_000, + memory_usage_mb=None, + max_tokens=512, + embed_dim=384, + license="apache-2.0", + reference="https://huggingface.co/mixedbread-ai/mxbai-embed-xsmall-v1", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + adapted_from="sentence-transformers/all-MiniLM-L6-v2", + superseded_by=None, + public_training_code=None, + public_training_data=None, + training_datasets=None, +) From 8604e079fd5bc6adac2d2050713dcceaeee5932d Mon Sep 17 00:00:00 2001 From: Mina Parham <36207068+mina-parham@users.noreply.github.com> Date: Fri, 14 Feb 2025 09:55:55 -0500 Subject: [PATCH 017/233] fix: Add climate fever v2 (#1873) * Updated ClimateFEVER dataset with new version * Adds Fill in the empty metadata. 
* Updates the date tuple * Update class name Co-authored-by: Kenneth Enevoldsen * Update domains Co-authored-by: Kenneth Enevoldsen * Update task_subtypes * Update annotations_creators for the first version * Update date Co-authored-by: Kenneth Enevoldsen * Update task subtypes * Update path * Update description --------- Co-authored-by: Kenneth Enevoldsen Co-authored-by: Mina Parham --- mteb/models/lens_models.py | 1 + .../Retrieval/eng/ClimateFEVERRetrieval.py | 40 ++++++++++++++++++- 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/mteb/models/lens_models.py b/mteb/models/lens_models.py index c83bf2a3d0..46bc25c3de 100644 --- a/mteb/models/lens_models.py +++ b/mteb/models/lens_models.py @@ -3,6 +3,7 @@ from mteb.model_meta import ModelMeta from mteb.models.bge_models import bge_full_data + lens_d4000 = ModelMeta( loader=None, # TODO: implement this in the future name="yibinlei/LENS-d4000", diff --git a/mteb/tasks/Retrieval/eng/ClimateFEVERRetrieval.py b/mteb/tasks/Retrieval/eng/ClimateFEVERRetrieval.py index b87e5223e0..36910ef518 100644 --- a/mteb/tasks/Retrieval/eng/ClimateFEVERRetrieval.py +++ b/mteb/tasks/Retrieval/eng/ClimateFEVERRetrieval.py @@ -20,7 +20,7 @@ class ClimateFEVER(AbsTaskRetrieval): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="ndcg_at_10", - date=None, + date=("2001-01-01", "2020-12-31"), # launch of wiki -> paper publication domains=["Encyclopaedic", "Written"], task_subtypes=["Claim verification"], license="cc-by-sa-4.0", @@ -56,7 +56,7 @@ class ClimateFEVERHardNegatives(AbsTaskRetrieval): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="ndcg_at_10", - date=None, + date=("2001-01-01", "2020-12-31"), # launch of wiki -> paper publication domains=["Encyclopaedic", "Written"], task_subtypes=["Claim verification"], license="cc-by-sa-4.0", @@ -72,3 +72,39 @@ class ClimateFEVERHardNegatives(AbsTaskRetrieval): primaryClass={cs.CL} }""", ) + + +class ClimateFEVERRetrievalv2(AbsTaskRetrieval): + metadata = 
TaskMetadata( + name="ClimateFEVER.v2", + description="CLIMATE-FEVER is a dataset following the FEVER methodology, containing 1,535 real-world climate change claims. This updated version addresses corpus mismatches and qrel inconsistencies in MTEB, restoring labels while refining corpus-query alignment for better accuracy. ", + reference="https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html", + dataset={ + "path": "mteb/climate-fever-v2", + "revision": "e438c9586767800aeb10dbe8a245c41dbea4e5f4", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2001-01-01", "2020-12-31"), # launch of wiki -> paper publication + domains=["Academic", "Written"], + task_subtypes=["Claim verification"], + license="cc-by-sa-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation="""@misc{diggelmann2021climatefever, + title={CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims}, + author={Thomas Diggelmann and Jordan Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold}, + year={2021}, + eprint={2012.00614}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +}""", + prompt={ + "query": "Given a claim about climate change, retrieve documents that support or refute the claim" + }, + ) From 11ced79009a190f02a285743476745ca40d3d987 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 14 Feb 2025 15:03:06 +0000 Subject: [PATCH 018/233] Update tasks table --- docs/tasks.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/tasks.md b/docs/tasks.md index cbd995961b..c92e45c19f 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -184,6 +184,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [ClimateFEVER](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [ClimateFEVER-Fa](https://huggingface.co/datasets/MCINext/climate-fever-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [ClimateFEVER-NL](https://huggingface.co/datasets/clips/beir-nl-climate-fever) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [ClimateFEVER.v2](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | [Academic, Written] | None | None | | [ClimateFEVERHardNegatives](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [CmedqaRetrieval](https://aclanthology.org/2022.emnlp-main.357.pdf) | ['cmn'] | Retrieval | s2p | [Medical, Written] | None | None | | [Cmnli](https://huggingface.co/datasets/clue/viewer/cmnli) | ['cmn'] | PairClassification | s2s | | None | None | @@ -1174,7 +1175,7 @@ The following tables give you an overview of the tasks in MTEB. 
| ell | Modern Greek (1453-) | Indo-European | 0 | 2 | 0 | 3 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 18 | | emi | Mussau-Emira | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | emp | Northern Emberá | Chocoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eng | English | Indo-European | 9 | 62 | 4 | 17 | 160 | 18 | 21 | 5 | 1 | 6 | 3 | 1 | 13 | 8 | 108 | 13 | 2 | 1 | 7 | 24 | 483 | +| eng | English | Indo-European | 9 | 62 | 4 | 17 | 160 | 18 | 21 | 5 | 1 | 6 | 3 | 1 | 13 | 8 | 109 | 13 | 2 | 1 | 7 | 24 | 484 | | enq | Enga | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | epo | Esperanto | Artificial Language | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | | eri | Ogea | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1947,7 +1948,7 @@ The following tables give you an overview of the tasks in MTEB. 
| zty | Yatee Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zul | Zulu | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 7 | | zyp | Zyphe Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| Total | None | None | None | 9 | 114 | 4 | 1398 | 836 | 311 | 21 | 5 | 1 | 6 | 3 | 28 | 91 | 55 | 534 | 88 | 2 | 2 | 24 | 24 | +| Total | None | None | None | 9 | 114 | 4 | 1398 | 836 | 311 | 21 | 5 | 1 | 6 | 3 | 28 | 91 | 55 | 535 | 88 | 2 | 2 | 24 | 24 | From c6829d34d7a324bb1f3754d39dd52756921d6a9f Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 14 Feb 2025 16:03:25 +0100 Subject: [PATCH 019/233] fix: Updating paper scripts (#1958) * change reference revisions to align with paper * Update author list * Added code for main results table * updated minor changes * added external as a "no_revision_available" case * revert unintended changes * format --- mteb/benchmarks/benchmarks.py | 73 +- mteb/load_results/benchmark_results.py | 5 + scripts/mmteb_create_author_list.ipynb | 479 +-- .../create_main_results_table.ipynb | 2738 +++++++++++++++++ 4 files changed, 3029 insertions(+), 266 deletions(-) create mode 100644 scripts/task_selection/create_main_results_table.ipynb diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index e0a62f08bf..e254dcff22 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -859,41 +859,44 @@ MTEB_INDIC = Benchmark( name="MTEB(Indic, v1)", - tasks=get_tasks( - tasks=[ - # Bitext - "IN22ConvBitextMining", - "IN22GenBitextMining", - "IndicGenBenchFloresBitextMining", - "LinceMTBitextMining", - # clustering - "SIB200ClusteringS2S", - # classification - "BengaliSentimentAnalysis", - "GujaratiNewsClassification", - "HindiDiscourseClassification", - "SentimentAnalysisHindi", - "MalayalamNewsClassification", - 
"IndicLangClassification", - "MTOPIntentClassification", - "MultiHateClassification", - "TweetSentimentClassification", - "NepaliNewsClassification", - "PunjabiNewsClassification", - "SanskritShlokasClassification", - "UrduRomanSentimentClassification", - # STS - "IndicCrosslingualSTS", - # pair classification - "XNLI", - # retrieval - "BelebeleRetrieval", - "XQuADRetrieval", - # reranking - "WikipediaRerankingMultilingual", - ], - languages=indic_languages, - exclusive_language_filter=True, + tasks=MTEBTasks( + get_tasks( + tasks=[ + # Bitext + "IN22ConvBitextMining", + "IN22GenBitextMining", + "IndicGenBenchFloresBitextMining", + "LinceMTBitextMining", + # clustering + "SIB200ClusteringS2S", + # classification + "BengaliSentimentAnalysis", + "GujaratiNewsClassification", + "HindiDiscourseClassification", + "SentimentAnalysisHindi", + "MalayalamNewsClassification", + "IndicLangClassification", + "MTOPIntentClassification", + "MultiHateClassification", + "TweetSentimentClassification", + "NepaliNewsClassification", + "PunjabiNewsClassification", + "SanskritShlokasClassification", + "UrduRomanSentimentClassification", + # pair classification + "XNLI", + # retrieval + "BelebeleRetrieval", + "XQuADRetrieval", + # reranking + "WikipediaRerankingMultilingual", + ], + languages=indic_languages, + exclusive_language_filter=True, + ) + + + # STS + (get_task("IndicCrosslingualSTS"),) ), description="A regional geopolitical text embedding benchmark targetting embedding performance on Indic languages.", reference=None, diff --git a/mteb/load_results/benchmark_results.py b/mteb/load_results/benchmark_results.py index 4d69da284c..7fe104825e 100644 --- a/mteb/load_results/benchmark_results.py +++ b/mteb/load_results/benchmark_results.py @@ -266,6 +266,11 @@ def keep_best(group: pd.DataFrame) -> pd.DataFrame: if is_main_revision.sum() > 0: return group[is_main_revision].head(n=1) unique_revisions = group["revision"].unique() + + # ensure None/NA/"external" revisions is filtered 
out + group["revision"][group["revision"].isna()] = "no_revision_available" + group["revision"][group["revision"] == "external"] = "no_revision_available" + # Filtering out no_revision_available if other revisions are present if (len(unique_revisions) > 1) and ( "no_revision_available" in unique_revisions diff --git a/scripts/mmteb_create_author_list.ipynb b/scripts/mmteb_create_author_list.ipynb index 3c9d99c2ed..bf2d3d77b3 100644 --- a/scripts/mmteb_create_author_list.ipynb +++ b/scripts/mmteb_create_author_list.ipynb @@ -87,8 +87,8 @@ " New dataset\n", " Dataset annotations\n", " Paper writing\n", - " New task\n", " Coordination\n", + " New task\n", " Running Models\n", " \n", " \n", @@ -107,15 +107,15 @@ " \n", " \n", " KennethEnevoldsen\n", - " 593\n", - " 85\n", - " 324\n", + " 597\n", + " 87\n", + " 326\n", " 68\n", " 35\n", " 0\n", - " 0\n", " 81\n", " 0\n", + " 0\n", " \n", " \n", " isaac-chung\n", @@ -125,8 +125,8 @@ " 120\n", " 1\n", " 12\n", - " 2\n", " 54\n", + " 2\n", " 0\n", " \n", " \n", @@ -137,9 +137,9 @@ " 120\n", " 0\n", " 0\n", - " 0\n", " 70\n", " 0\n", + " 0\n", " \n", " \n", " awinml\n", @@ -161,8 +161,8 @@ " 144\n", " 0\n", " 0\n", - " 12\n", " 41\n", + " 12\n", " 0\n", " \n", " \n", @@ -178,10 +178,10 @@ " ...\n", " \n", " \n", - " antoniolanza1996\n", - " 2\n", + " PhilipMay\n", " 2\n", " 0\n", + " 2\n", " 0\n", " 0\n", " 0\n", @@ -190,7 +190,7 @@ " 0\n", " \n", " \n", - " cslizc\n", + " achibb\n", " 2\n", " 0\n", " 0\n", @@ -202,11 +202,11 @@ " 0\n", " \n", " \n", - " hanhainebula\n", + " antoniolanza1996\n", + " 2\n", " 2\n", " 0\n", " 0\n", - " 2\n", " 0\n", " 0\n", " 0\n", @@ -214,11 +214,11 @@ " 0\n", " \n", " \n", - " hongjin-su\n", + " cslizc\n", " 2\n", " 0\n", - " 2\n", " 0\n", + " 2\n", " 0\n", " 0\n", " 0\n", @@ -226,7 +226,7 @@ " 0\n", " \n", " \n", - " bakrianoo\n", + " hanhainebula\n", " 2\n", " 0\n", " 0\n", @@ -239,37 +239,37 @@ " \n", " \n", "\n", - "

96 rows × 9 columns

\n", + "

98 rows × 9 columns

\n", "" ], "text/plain": [ " Total Bug fixes Review PR New dataset \\\n", "GitHub \n", - "KennethEnevoldsen 593 85 324 68 \n", + "KennethEnevoldsen 597 87 326 68 \n", "isaac-chung 433 50 194 120 \n", "imenelydiaker 358 24 144 120 \n", "awinml 302 0 2 300 \n", "x-tabdeveloping 239 10 32 144 \n", "... ... ... ... ... \n", + "PhilipMay 2 0 2 0 \n", + "achibb 2 0 0 2 \n", "antoniolanza1996 2 2 0 0 \n", "cslizc 2 0 0 2 \n", "hanhainebula 2 0 0 2 \n", - "hongjin-su 2 0 2 0 \n", - "bakrianoo 2 0 0 2 \n", "\n", - " Dataset annotations Paper writing New task Coordination \\\n", + " Dataset annotations Paper writing Coordination New task \\\n", "GitHub \n", - "KennethEnevoldsen 35 0 0 81 \n", - "isaac-chung 1 12 2 54 \n", - "imenelydiaker 0 0 0 70 \n", - "awinml 0 0 0 0 \n", - "x-tabdeveloping 0 0 12 41 \n", - "... ... ... ... ... \n", - "antoniolanza1996 0 0 0 0 \n", - "cslizc 0 0 0 0 \n", - "hanhainebula 0 0 0 0 \n", - "hongjin-su 0 0 0 0 \n", - "bakrianoo 0 0 0 0 \n", + "KennethEnevoldsen 35 0 81 0 \n", + "isaac-chung 1 12 54 2 \n", + "imenelydiaker 0 0 70 0 \n", + "awinml 0 0 0 0 \n", + "x-tabdeveloping 0 0 41 12 \n", + "... ... ... ... ... \n", + "PhilipMay 0 0 0 0 \n", + "achibb 0 0 0 0 \n", + "antoniolanza1996 0 0 0 0 \n", + "cslizc 0 0 0 0 \n", + "hanhainebula 0 0 0 0 \n", "\n", " Running Models \n", "GitHub \n", @@ -279,13 +279,13 @@ "awinml 0 \n", "x-tabdeveloping 0 \n", "... ... \n", + "PhilipMay 0 \n", + "achibb 0 \n", "antoniolanza1996 0 \n", "cslizc 0 \n", "hanhainebula 0 \n", - "hongjin-su 0 \n", - "bakrianoo 0 \n", "\n", - "[96 rows x 9 columns]" + "[98 rows x 9 columns]" ] }, "execution_count": 4, @@ -309,13 +309,13 @@ "\\begin{longtable}{lrrrrrrrrr}\n", "\\caption{Contributions by GitHub users. 
See \u0007utoref{tab:authors} for the mapping between authors and GitHub handles.} \\label{tab:contributions} \\\\\n", "\\toprule\n", - " & Total & Bug fixes & Review PR & New dataset & Dataset annotations & Paper writing & New task & Coordination & Running Models \\\\\n", + " & Total & Bug fixes & Review PR & New dataset & Dataset annotations & Paper writing & Coordination & New task & Running Models \\\\\n", "GitHub & & & & & & & & & \\\\\n", "\\midrule\n", "\\endfirsthead\n", "\\caption[]{Contributions by GitHub users. See \u0007utoref{tab:authors} for the mapping between authors and GitHub handles.} \\\\\n", "\\toprule\n", - " & Total & Bug fixes & Review PR & New dataset & Dataset annotations & Paper writing & New task & Coordination & Running Models \\\\\n", + " & Total & Bug fixes & Review PR & New dataset & Dataset annotations & Paper writing & Coordination & New task & Running Models \\\\\n", "GitHub & & & & & & & & & \\\\\n", "\\midrule\n", "\\endhead\n", @@ -325,33 +325,33 @@ "\\endfoot\n", "\\bottomrule\n", "\\endlastfoot\n", - "KennethEnevoldsen & 593 & 85 & 324 & 68 & 35 & 0 & 0 & 81 & 0 \\\\\n", - "isaac-chung & 433 & 50 & 194 & 120 & 1 & 12 & 2 & 54 & 0 \\\\\n", - "imenelydiaker & 358 & 24 & 144 & 120 & 0 & 0 & 0 & 70 & 0 \\\\\n", + "KennethEnevoldsen & 597 & 87 & 326 & 68 & 35 & 0 & 81 & 0 & 0 \\\\\n", + "isaac-chung & 433 & 50 & 194 & 120 & 1 & 12 & 54 & 2 & 0 \\\\\n", + "imenelydiaker & 358 & 24 & 144 & 120 & 0 & 0 & 70 & 0 & 0 \\\\\n", "awinml & 302 & 0 & 2 & 300 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "x-tabdeveloping & 239 & 10 & 32 & 144 & 0 & 0 & 12 & 41 & 0 \\\\\n", + "x-tabdeveloping & 239 & 10 & 32 & 144 & 0 & 0 & 41 & 12 & 0 \\\\\n", "davidstap & 176 & 0 & 0 & 176 & 0 & 0 & 0 & 0 & 0 \\\\\n", "jaygala24 & 149 & 0 & 0 & 149 & 0 & 0 & 0 & 0 & 0 \\\\\n", "wissam-sib & 144 & 4 & 6 & 134 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "Muennighoff & 142 & 0 & 48 & 0 & 0 & 0 & 0 & 70 & 24 \\\\\n", + "Muennighoff & 142 & 0 & 48 & 0 & 0 & 0 & 70 & 0 & 24 \\\\\n", + 
"orionw & 125 & 20 & 20 & 0 & 0 & 0 & 75 & 10 & 0 \\\\\n", "dokato & 112 & 12 & 6 & 94 & 0 & 0 & 0 & 0 & 0 \\\\\n", "gentaiscool & 110 & 0 & 0 & 110 & 0 & 0 & 0 & 0 & 0 \\\\\n", "jupyterjazz & 108 & 0 & 0 & 108 & 0 & 0 & 0 & 0 & 0 \\\\\n", "SaitejaUtpala & 102 & 0 & 0 & 102 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "orionw & 100 & 20 & 20 & 0 & 0 & 0 & 10 & 50 & 0 \\\\\n", - "schmarion & 88 & 0 & 0 & 88 & 0 & 0 & 0 & 0 & 0 \\\\\n", + "vaibhavad & 93 & 8 & 4 & 6 & 0 & 0 & 75 & 0 & 0 \\\\\n", "MathieuCiancone & 88 & 0 & 0 & 88 & 0 & 0 & 0 & 0 & 0 \\\\\n", + "schmarion & 88 & 0 & 0 & 88 & 0 & 0 & 0 & 0 & 0 \\\\\n", "GabrielSequeira & 88 & 0 & 0 & 88 & 0 & 0 & 0 & 0 & 0 \\\\\n", "digantamisra98 & 71 & 0 & 0 & 71 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "vaibhavad & 68 & 8 & 4 & 6 & 0 & 0 & 0 & 50 & 0 \\\\\n", "shreeya-dhakal & 62 & 0 & 8 & 54 & 0 & 0 & 0 & 0 & 0 \\\\\n", "Rysias & 58 & 0 & 0 & 58 & 0 & 0 & 0 & 0 & 0 \\\\\n", "Samoed & 51 & 22 & 2 & 18 & 0 & 0 & 0 & 0 & 9 \\\\\n", - "sivareddyg & 50 & 0 & 0 & 0 & 0 & 0 & 0 & 50 & 0 \\\\\n", "gowitheflow-1998 & 50 & 0 & 0 & 50 & 0 & 0 & 0 & 0 & 0 \\\\\n", + "sivareddyg & 50 & 0 & 0 & 0 & 0 & 0 & 50 & 0 & 0 \\\\\n", "asparius & 48 & 0 & 14 & 34 & 0 & 0 & 0 & 0 & 0 \\\\\n", "Akash190104 & 46 & 0 & 0 & 46 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "MartinBernstorff & 43 & 13 & 8 & 2 & 0 & 0 & 0 & 20 & 0 \\\\\n", + "MartinBernstorff & 43 & 13 & 8 & 2 & 0 & 0 & 20 & 0 & 0 \\\\\n", "staoxiao & 40 & 0 & 0 & 40 & 0 & 0 & 0 & 0 & 0 \\\\\n", "akshita-sukhlecha & 40 & 4 & 0 & 36 & 0 & 0 & 0 & 0 & 0 \\\\\n", "rafalposwiata & 36 & 0 & 0 & 36 & 0 & 0 & 0 & 0 & 0 \\\\\n", @@ -363,64 +363,66 @@ "jphme & 28 & 0 & 0 & 28 & 0 & 0 & 0 & 0 & 0 \\\\\n", "ShawonAshraf & 28 & 0 & 0 & 28 & 0 & 0 & 0 & 0 & 0 \\\\\n", "violenil & 26 & 0 & 0 & 26 & 0 & 0 & 0 & 0 & 0 \\\\\n", + "mariyahendriksen & 24 & 0 & 0 & 0 & 0 & 24 & 0 & 0 & 0 \\\\\n", "dwzhu-pku & 24 & 0 & 0 & 24 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "hgissbkh & 23 & 13 & 2 & 0 & 0 & 3 & 5 & 0 & 0 \\\\\n", - "taeminlee & 22 & 0 & 0 
& 22 & 0 & 0 & 0 & 0 & 0 \\\\\n", + "hgissbkh & 23 & 13 & 2 & 0 & 0 & 3 & 0 & 5 & 0 \\\\\n", "jankounchained & 22 & 8 & 0 & 14 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "tomaarsen & 22 & 0 & 2 & 0 & 0 & 0 & 0 & 20 & 0 \\\\\n", + "taeminlee & 22 & 0 & 0 & 22 & 0 & 0 & 0 & 0 & 0 \\\\\n", + "tomaarsen & 22 & 0 & 2 & 0 & 0 & 0 & 20 & 0 & 0 \\\\\n", "kwojtasi & 22 & 0 & 0 & 22 & 0 & 0 & 0 & 0 & 0 \\\\\n", "mrshu & 21 & 0 & 4 & 16 & 1 & 0 & 0 & 0 & 0 \\\\\n", "crystina-z & 21 & 0 & 0 & 21 & 0 & 0 & 0 & 0 & 0 \\\\\n", + "ManuelFay & 20 & 13 & 0 & 2 & 0 & 0 & 0 & 5 & 0 \\\\\n", "AlexeyVatolin & 20 & 20 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "john-b-yang & 20 & 0 & 0 & 0 & 0 & 20 & 0 & 0 & 0 \\\\\n", "Andrian0s & 20 & 2 & 4 & 14 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "mmhamdy & 20 & 0 & 0 & 20 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "ManuelFay & 20 & 13 & 0 & 2 & 0 & 0 & 5 & 0 & 0 \\\\\n", "rbroc & 20 & 0 & 0 & 20 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "thakur-nandan & 18 & 0 & 0 & 18 & 0 & 0 & 0 & 0 & 0 \\\\\n", + "john-b-yang & 20 & 0 & 0 & 0 & 0 & 20 & 0 & 0 & 0 \\\\\n", + "mmhamdy & 20 & 0 & 0 & 20 & 0 & 0 & 0 & 0 & 0 \\\\\n", "manandey & 18 & 0 & 0 & 18 & 0 & 0 & 0 & 0 & 0 \\\\\n", + "thakur-nandan & 18 & 0 & 0 & 18 & 0 & 0 & 0 & 0 & 0 \\\\\n", "PranjalChitale & 16 & 0 & 0 & 16 & 0 & 0 & 0 & 0 & 0 \\\\\n", "Sakshamrzt & 16 & 0 & 4 & 12 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "dipam7 & 16 & 0 & 2 & 14 & 0 & 0 & 0 & 0 & 0 \\\\\n", "sted97 & 16 & 0 & 0 & 16 & 0 & 0 & 0 & 0 & 0 \\\\\n", + "dipam7 & 16 & 0 & 2 & 14 & 0 & 0 & 0 & 0 & 0 \\\\\n", "artemsnegirev & 14 & 0 & 0 & 12 & 2 & 0 & 0 & 0 & 0 \\\\\n", "taidnguyen & 14 & 0 & 0 & 14 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "mariyahendriksen & 12 & 0 & 0 & 0 & 0 & 12 & 0 & 0 & 0 \\\\\n", - "guenthermi & 12 & 0 & 0 & 12 & 0 & 0 & 0 & 0 & 0 \\\\\n", "jordiclive & 12 & 10 & 0 & 2 & 0 & 0 & 0 & 0 & 0 \\\\\n", + "guenthermi & 12 & 0 & 0 & 12 & 0 & 0 & 0 & 0 & 0 \\\\\n", "slvnwhrl & 12 & 0 & 0 & 12 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "anpalmak2003 & 12 & 0 & 0 & 9 & 3 & 0 & 0 & 0 & 0 
\\\\\n", - "xhluca & 12 & 4 & 2 & 6 & 0 & 0 & 0 & 0 & 0 \\\\\n", "Art3mis07 & 12 & 0 & 0 & 12 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "xiamengzhou & 12 & 0 & 0 & 12 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "swj0419 & 12 & 0 & 0 & 12 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "henilp105 & 11 & 2 & 0 & 0 & 9 & 0 & 0 & 0 & 0 \\\\\n", + "xhluca & 12 & 4 & 2 & 6 & 0 & 0 & 0 & 0 & 0 \\\\\n", + "anpalmak2003 & 12 & 0 & 0 & 9 & 3 & 0 & 0 & 0 & 0 \\\\\n", "ab1992ao & 11 & 0 & 0 & 8 & 3 & 0 & 0 & 0 & 0 \\\\\n", "MariyaTikhonova & 11 & 0 & 0 & 7 & 4 & 0 & 0 & 0 & 0 \\\\\n", + "henilp105 & 11 & 2 & 0 & 0 & 9 & 0 & 0 & 0 & 0 \\\\\n", "simon-clematide & 10 & 0 & 0 & 10 & 0 & 0 & 0 & 0 & 0 \\\\\n", + "tmp_handle & 10 & 0 & 0 & 0 & 0 & 0 & 10 & 0 & 0 \\\\\n", "sarahooker & 10 & 0 & 0 & 0 & 0 & 10 & 0 & 0 & 0 \\\\\n", + "swj0419 & 10 & 0 & 0 & 10 & 0 & 0 & 0 & 0 & 0 \\\\\n", + "xiamengzhou & 10 & 0 & 0 & 10 & 0 & 0 & 0 & 0 & 0 \\\\\n", "ABorghini & 10 & 0 & 0 & 10 & 0 & 0 & 0 & 0 & 0 \\\\\n", "xu3kev & 10 & 0 & 0 & 10 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "Ruqyai & 10 & 0 & 8 & 2 & 0 & 0 & 0 & 0 & 0 \\\\\n", "malteos & 10 & 0 & 0 & 10 & 0 & 0 & 0 & 0 & 0 \\\\\n", "ljvmiranda921 & 10 & 0 & 0 & 10 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "Alenush & 10 & 0 & 0 & 6 & 4 & 0 & 0 & 0 & 0 \\\\\n", - "HLasse & 10 & 5 & 0 & 0 & 5 & 0 & 0 & 0 & 0 \\\\\n", + "howard-yen & 10 & 0 & 0 & 10 & 0 & 0 & 0 & 0 & 0 \\\\\n", + "hongjin-su & 10 & 0 & 0 & 10 & 0 & 0 & 0 & 0 & 0 \\\\\n", "guangyusong & 10 & 0 & 0 & 10 & 0 & 0 & 0 & 0 & 0 \\\\\n", + "Alenush & 10 & 0 & 0 & 6 & 4 & 0 & 0 & 0 & 0 \\\\\n", "cassanof & 10 & 1 & 0 & 8 & 0 & 0 & 0 & 0 & 1 \\\\\n", + "HLasse & 10 & 5 & 0 & 0 & 5 & 0 & 0 & 0 & 0 \\\\\n", "ZhengLiu101 & 10 & 0 & 0 & 10 & 0 & 0 & 0 & 0 & 0 \\\\\n", + "Ruqyai & 10 & 0 & 8 & 2 & 0 & 0 & 0 & 0 & 0 \\\\\n", "izhx & 6 & 0 & 0 & 6 & 0 & 0 & 0 & 0 & 0 \\\\\n", "marcobellagente93 & 6 & 0 & 0 & 6 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "MexicanLemonade & 2 & 0 & 0 & 2 & 0 & 0 & 0 & 0 & 0 \\\\\n", + "monikernemo & 2 & 0 & 0 & 2 & 0 & 0 & 0 & 0 & 0 
\\\\\n", "NouamaneTazi & 2 & 0 & 2 & 0 & 0 & 0 & 0 & 0 & 0 \\\\\n", + "MexicanLemonade & 2 & 0 & 0 & 2 & 0 & 0 & 0 & 0 & 0 \\\\\n", + "bakrianoo & 2 & 0 & 0 & 2 & 0 & 0 & 0 & 0 & 0 \\\\\n", "PhilipMay & 2 & 0 & 2 & 0 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "monikernemo & 2 & 0 & 0 & 2 & 0 & 0 & 0 & 0 & 0 \\\\\n", "achibb & 2 & 0 & 0 & 2 & 0 & 0 & 0 & 0 & 0 \\\\\n", "antoniolanza1996 & 2 & 2 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\\\\n", "cslizc & 2 & 0 & 0 & 2 & 0 & 0 & 0 & 0 & 0 \\\\\n", "hanhainebula & 2 & 0 & 0 & 2 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "hongjin-su & 2 & 0 & 2 & 0 & 0 & 0 & 0 & 0 & 0 \\\\\n", - "bakrianoo & 2 & 0 & 0 & 2 & 0 & 0 & 0 & 0 & 0 \\\\\n", "\\end{longtable}\n", "\n" ] @@ -575,33 +577,6 @@ " ...\n", " \n", " \n", - " 80\n", - " john-b-yang\n", - " John\n", - " Yang\n", - " johnby@stanford.edu\n", - " ~John_Yang3\n", - " Stanford University\n", - " \n", - " \n", - " 81\n", - " thakur-nandan\n", - " Nandan\n", - " Thakur\n", - " \n", - " ~Nandan_Thakur1\n", - " University of Waterloo\n", - " \n", - " \n", - " 82\n", - " loicmagne\n", - " Loic\n", - " Magne\n", - " ~Loïc_Magne1\n", - " Individual Contributor\n", - " None\n", - " \n", - " \n", " 83\n", " sarahooker\n", " Sara\n", @@ -615,13 +590,40 @@ " kwojtasi\n", " Konrad\n", " Wojtasik\n", + " \n", " ~Konrad_Wojtasik1\n", " Wrocław University of Science and Technology\n", - " None\n", + " \n", + " \n", + " 85\n", + " tmp_handle\n", + " Jimmy\n", + " Lin\n", + " \n", + " ~Jimmy_Lin2\n", + " University of Waterloo\n", + " \n", + " \n", + " 86\n", + " hongjin-su\n", + " Hongjin\n", + " Su\n", + " \n", + " ~Hongjin_SU1\n", + " University of Hong Kong\n", + " \n", + " \n", + " 87\n", + " howard-yen\n", + " Howard\n", + " Yen\n", + " \n", + " ~Howard_Yen1\n", + " Princeton University\n", " \n", " \n", "\n", - "

85 rows × 6 columns

\n", + "

88 rows × 6 columns

\n", "" ], "text/plain": [ @@ -632,26 +634,26 @@ "3 wissam-sib Wissam Siblini wissam.siblini92@gmail.com \n", "4 GabrielSequeira Gabriel Sequeira \n", ".. ... ... ... ... \n", - "80 john-b-yang John Yang johnby@stanford.edu \n", - "81 thakur-nandan Nandan Thakur \n", - "82 loicmagne Loic Magne ~Loïc_Magne1 \n", "83 sarahooker Sara Hooker \n", - "84 kwojtasi Konrad Wojtasik ~Konrad_Wojtasik1 \n", + "84 kwojtasi Konrad Wojtasik \n", + "85 tmp_handle Jimmy Lin \n", + "86 hongjin-su Hongjin Su \n", + "87 howard-yen Howard Yen \n", "\n", - " User on openreview Affiliations \n", - "0 ~Kenneth_Enevoldsen1 Aarhus University \n", - "1 ~Márton_Kardos1 Aarhus University \n", - "2 ~Imene_Kerboua1 INSA Lyon, LIRIS \n", - "3 ~Wissam_Siblini1 Individual Contributor \n", - "4 Individual Contributor \n", - ".. ... ... \n", - "80 ~John_Yang3 Stanford University \n", - "81 ~Nandan_Thakur1 University of Waterloo \n", - "82 Individual Contributor None \n", - "83 ~Sara_Hooker2 Cohere For AI \n", - "84 Wrocław University of Science and Technology None \n", + " User on openreview Affiliations \n", + "0 ~Kenneth_Enevoldsen1 Aarhus University \n", + "1 ~Márton_Kardos1 Aarhus University \n", + "2 ~Imene_Kerboua1 INSA Lyon, LIRIS \n", + "3 ~Wissam_Siblini1 Individual Contributor \n", + "4 Individual Contributor \n", + ".. ... ... 
\n", + "83 ~Sara_Hooker2 Cohere For AI \n", + "84 ~Konrad_Wojtasik1 Wrocław University of Science and Technology \n", + "85 ~Jimmy_Lin2 University of Waterloo \n", + "86 ~Hongjin_SU1 University of Hong Kong \n", + "87 ~Howard_Yen1 Princeton University \n", "\n", - "[85 rows x 6 columns]" + "[88 rows x 6 columns]" ] }, "execution_count": 8, @@ -737,7 +739,7 @@ "gowitheflow-1998 & Chenghao & Xiao & Durham University \\\\\n", "mariyahendriksen & Mariya & Hendriksen & University of Amsterdam \\\\\n", "dokato & Dominik & Krzemiński & Cohere For AI Community \\\\\n", - "Samoed & Roman & Solomatin & ITMO \\\\\n", + "Samoed & Roman & Solomatin & AI Talent Hub && ITMO University \\\\\n", "Alenush & Alena & Fenogenova & SaluteDevices \\\\\n", "ab1992ao & Aleksandr & Abramov & SaluteDevices \\\\\n", "artemsnegirev & Artem & Snegirev & SaluteDevices \\\\\n", @@ -758,9 +760,12 @@ "xiamengzhou & Mengzhou & Xia & Princeton University \\\\\n", "john-b-yang & John & Yang & Stanford University \\\\\n", "thakur-nandan & Nandan & Thakur & University of Waterloo \\\\\n", - "loicmagne & Loic & Magne & NaN \\\\\n", + "loicmagne & Loic & Magne & Individual Contributor \\\\\n", "sarahooker & Sara & Hooker & Cohere For AI \\\\\n", - "kwojtasi & Konrad & Wojtasik & NaN \\\\\n", + "kwojtasi & Konrad & Wojtasik & Wrocław University of Science and Technology \\\\\n", + "tmp_handle & Jimmy & Lin & University of Waterloo \\\\\n", + "hongjin-su & Hongjin & Su & University of Hong Kong \\\\\n", + "howard-yen & Howard & Yen & Princeton University \\\\\n", "\\bottomrule\n", "\\end{tabular}\n", "\n" @@ -820,7 +825,7 @@ { "data": { "text/plain": [ - "'guangyusong'" + "'ShawonAshraf'" ] }, "execution_count": 11, @@ -998,102 +1003,106 @@ "\\textbf{Jay Gala\\textsuperscript{5}}, \n", "\\\\\n", "\\textbf{Wissam Siblini\\textsuperscript{2}}, \n", - "\\textbf{Dominik Krzemiński\\textsuperscript{8}}, \n", + "\\textbf{Orion Weller\\textsuperscript{8}}, \n", + "\\textbf{Dominik Krzemiński\\textsuperscript{9}}, 
\n", + "\\\\\n", "\\textbf{Genta Indra Winata\\textsuperscript{2}}, \n", + "\\textbf{Saba Sturua\\textsuperscript{10}}, \n", + "\\textbf{Saiteja Utpala\\textsuperscript{11}}, \n", "\\\\\n", - "\\textbf{Saba Sturua\\textsuperscript{9}}, \n", - "\\textbf{Saiteja Utpala\\textsuperscript{10}}, \n", - "\\textbf{Orion Weller\\textsuperscript{11}}, \n", - "\\textbf{Mathieu Ciancone\\textsuperscript{12}}, \n", + "\\textbf{Vaibhav Adlakha\\textsuperscript{12,13}}, \n", + "\\textbf{Mathieu Ciancone\\textsuperscript{14}}, \n", + "\\textbf{Marion Schaeffer\\textsuperscript{14}}, \n", "\\\\\n", - "\\textbf{Marion Schaeffer\\textsuperscript{12}}, \n", "\\textbf{Gabriel Sequeira\\textsuperscript{2}}, \n", - "\\textbf{Diganta Misra\\textsuperscript{13,14}}, \n", - "\\\\\n", - "\\textbf{Vaibhav Adlakha\\textsuperscript{15,16}}, \n", + "\\textbf{Diganta Misra\\textsuperscript{15,16}}, \n", "\\textbf{Shreeya Dhakal\\textsuperscript{2}}, \n", - "\\textbf{Jonathan Rystrøm\\textsuperscript{17}}, \n", "\\\\\n", - "\\textbf{Roman Solomatin\\textsuperscript{18}}, \n", - "\\textbf{Chenghao Xiao\\textsuperscript{19}}, \n", - "\\textbf{Ömer Çağatan\\textsuperscript{20}}, \n", - "\\textbf{Akash Kundu\\textsuperscript{21,22}}, \n", + "\\textbf{Jonathan Rystrøm\\textsuperscript{17}}, \n", + "\\textbf{Roman Solomatin\\textsuperscript{18,19}}, \n", + "\\textbf{Chenghao Xiao\\textsuperscript{20}}, \n", "\\\\\n", + "\\textbf{Ömer Çağatan\\textsuperscript{21}}, \n", + "\\textbf{Akash Kundu\\textsuperscript{22,23}}, \n", "\\textbf{Martin Bernstorff\\textsuperscript{1}}, \n", - "\\textbf{Shitao Xiao\\textsuperscript{23}}, \n", + "\\\\\n", "\\textbf{Akshita Sukhlecha\\textsuperscript{2}}, \n", + "\\textbf{Shitao Xiao\\textsuperscript{24}}, \n", + "\\textbf{Bhavish Pahwa\\textsuperscript{11}}, \n", "\\\\\n", - "\\textbf{Bhavish Pahwa\\textsuperscript{10}}, \n", - "\\textbf{Rafał Poświata\\textsuperscript{24}}, \n", - "\\textbf{Kranthi Kiran GV\\textsuperscript{25}}, \n", + "\\textbf{Rafał 
Poświata\\textsuperscript{25}}, \n", + "\\textbf{Kranthi Kiran GV\\textsuperscript{26}}, \n", + "\\textbf{Björn Plüster\\textsuperscript{27}}, \n", "\\\\\n", - "\\textbf{Shawon Ashraf\\textsuperscript{26}}, \n", - "\\textbf{Daniel Auras\\textsuperscript{26}}, \n", - "\\textbf{Björn Plüster\\textsuperscript{26}}, \n", + "\\textbf{Daniel Auras\\textsuperscript{27}}, \n", + "\\textbf{Shawon Ashraf\\textsuperscript{27}}, \n", + "\\textbf{Jan Philipp Harries\\textsuperscript{27}}, \n", "\\\\\n", - "\\textbf{Jan Philipp Harries\\textsuperscript{26}}, \n", - "\\textbf{Loic Magne}, \n", - "\\textbf{Isabelle Mohr\\textsuperscript{9}}, \n", - "\\textbf{Dawei Zhu\\textsuperscript{27}}, \n", + "\\textbf{Loic Magne\\textsuperscript{2}}, \n", + "\\textbf{Isabelle Mohr\\textsuperscript{10}}, \n", + "\\textbf{Dawei Zhu\\textsuperscript{28}}, \n", + "\\textbf{Mariya Hendriksen\\textsuperscript{4}}, \n", "\\\\\n", - "\\textbf{Hippolyte Gisserot-Boukhlef\\textsuperscript{28,29}}, \n", - "\\textbf{Tom Aarsen\\textsuperscript{30}}, \n", - "\\textbf{Jan Kostkan\\textsuperscript{1}}, \n", + "\\textbf{Hippolyte Gisserot-Boukhlef\\textsuperscript{29,30}}, \n", + "\\textbf{Konrad Wojtasik\\textsuperscript{31}}, \n", + "\\textbf{Tom Aarsen\\textsuperscript{32}}, \n", "\\\\\n", - "\\textbf{Konrad Wojtasik}, \n", - "\\textbf{Taemin Lee\\textsuperscript{31}}, \n", - "\\textbf{Marek Suppa\\textsuperscript{32,33}}, \n", - "\\textbf{Xinyu Zhang\\textsuperscript{34}}, \n", + "\\textbf{Jan Kostkan\\textsuperscript{1}}, \n", + "\\textbf{Taemin Lee\\textsuperscript{33}}, \n", + "\\textbf{Marek Suppa\\textsuperscript{34,35}}, \n", + "\\textbf{Xinyu Zhang\\textsuperscript{36}}, \n", "\\\\\n", - "\\textbf{Roberta Rocca\\textsuperscript{1}}, \n", - "\\textbf{Mohammed Hamdy\\textsuperscript{8}}, \n", - "\\textbf{Andrianos Michail\\textsuperscript{35}}, \n", + "\\textbf{Aleksei Vatolin\\textsuperscript{37}}, \n", + "\\textbf{Mohammed Hamdy\\textsuperscript{9}}, \n", "\\textbf{John Yang\\textsuperscript{6}}, 
\n", "\\\\\n", - "\\textbf{Manuel Faysse\\textsuperscript{28,36}}, \n", - "\\textbf{Aleksei Vatolin\\textsuperscript{37}}, \n", - "\\textbf{Nandan Thakur\\textsuperscript{34}}, \n", - "\\textbf{Manan Dey\\textsuperscript{38}}, \n", + "\\textbf{Andrianos Michail\\textsuperscript{38}}, \n", + "\\textbf{Manuel Faysse\\textsuperscript{29,39}}, \n", + "\\textbf{Roberta Rocca\\textsuperscript{1}}, \n", + "\\textbf{Manan Dey\\textsuperscript{40}}, \n", "\\\\\n", + "\\textbf{Nandan Thakur\\textsuperscript{36}}, \n", + "\\textbf{Simone Tedeschi\\textsuperscript{41}}, \n", "\\textbf{Dipam Vasani\\textsuperscript{2}}, \n", - "\\textbf{Pranjal Chitale\\textsuperscript{39}}, \n", - "\\textbf{Simone Tedeschi\\textsuperscript{40}}, \n", - "\\textbf{Nguyen Tai\\textsuperscript{41}}, \n", - "\\\\\n", - "\\textbf{Artem Snegirev\\textsuperscript{42}}, \n", - "\\textbf{Mariya Hendriksen\\textsuperscript{4}}, \n", - "\\textbf{Michael Günther\\textsuperscript{9}}, \n", "\\\\\n", - "\\textbf{Mengzhou Xia\\textsuperscript{43}}, \n", - "\\textbf{Weijia Shi\\textsuperscript{44}}, \n", - "\\textbf{Xing Han Lù\\textsuperscript{15}}, \n", - "\\textbf{Jordan Clive\\textsuperscript{45}}, \n", + "\\textbf{Pranjal Chitale\\textsuperscript{42}}, \n", + "\\textbf{Artem Snegirev\\textsuperscript{43}}, \n", + "\\textbf{Nguyen Tai\\textsuperscript{44}}, \n", + "\\textbf{Silvan Wehrli\\textsuperscript{45}}, \n", "\\\\\n", "\\textbf{Gayatri K\\textsuperscript{46}}, \n", - "\\textbf{Anna Maksimova\\textsuperscript{42}}, \n", - "\\textbf{Silvan Wehrli\\textsuperscript{47}}, \n", - "\\textbf{Maria Tikhonova\\textsuperscript{42,48}}, \n", - "\\\\\n", - "\\textbf{Henil Panchal\\textsuperscript{49}}, \n", - "\\textbf{Aleksandr Abramov\\textsuperscript{42}}, \n", - "\\textbf{Malte Ostendorff\\textsuperscript{50}}, \n", + "\\textbf{Xing Han Lù\\textsuperscript{12}}, \n", + "\\textbf{Michael Günther\\textsuperscript{10}}, \n", + "\\textbf{Jordan Clive\\textsuperscript{47}}, \n", "\\\\\n", - "\\textbf{Sara 
Hooker\\textsuperscript{51}}, \n", - "\\textbf{Zheng Liu\\textsuperscript{23}}, \n", - "\\textbf{Simon Clematide\\textsuperscript{35}}, \n", + "\\textbf{Anna Maksimova\\textsuperscript{43}}, \n", + "\\textbf{Maria Tikhonova\\textsuperscript{43,48}}, \n", + "\\textbf{Aleksandr Abramov\\textsuperscript{43}}, \n", "\\\\\n", - "\\textbf{Lester James Miranda\\textsuperscript{52}}, \n", - "\\textbf{Alena Fenogenova\\textsuperscript{42}}, \n", - "\\textbf{Lasse Hansen\\textsuperscript{1}}, \n", + "\\textbf{Henil Panchal\\textsuperscript{49}}, \n", + "\\textbf{Weijia Shi\\textsuperscript{50}}, \n", + "\\textbf{Hongjin Su\\textsuperscript{51}}, \n", + "\\textbf{Jimmy Lin\\textsuperscript{36}}, \n", "\\\\\n", - "\\textbf{Guangyu Song\\textsuperscript{53}}, \n", + "\\textbf{Zheng Liu\\textsuperscript{24}}, \n", + "\\textbf{Sara Hooker\\textsuperscript{52}}, \n", "\\textbf{Ruqiya Bin Safi}, \n", - "\\textbf{Wen-Ding Li\\textsuperscript{54}}, \n", + "\\textbf{Simon Clematide\\textsuperscript{38}}, \n", "\\\\\n", - "\\textbf{Alessia Borghini\\textsuperscript{40}}, \n", + "\\textbf{Mengzhou Xia\\textsuperscript{53}}, \n", + "\\textbf{Malte Ostendorff\\textsuperscript{54}}, \n", "\\textbf{Federico Cassano\\textsuperscript{55,56}}, \n", - "\\textbf{Siva Reddy\\textsuperscript{15,16}}, \n", "\\\\\n", + "\\textbf{Lester James Miranda\\textsuperscript{57}}, \n", + "\\textbf{Alessia Borghini\\textsuperscript{41}}, \n", + "\\textbf{Lasse Hansen\\textsuperscript{1}}, \n", + "\\\\\n", + "\\textbf{Wen-Ding Li\\textsuperscript{58}}, \n", + "\\textbf{Guangyu Song\\textsuperscript{59}}, \n", + "\\textbf{Alena Fenogenova\\textsuperscript{43}}, \n", + "\\textbf{Howard Yen\\textsuperscript{53}}, \n", + "\\\\\n", + "\\textbf{Siva Reddy\\textsuperscript{12,13}}, \n", "\\textbf{Niklas Muennighoff\\textsuperscript{6,7}}, \n", "\\\\\n", "\\\\\n", @@ -1105,69 +1114,72 @@ "\\\\\n", "\\textsuperscript{6}Stanford University, \n", "\\textsuperscript{7}Contextual AI, \n", - "\\textsuperscript{8}Cohere For 
AI Community, \n", - "\\textsuperscript{9}Jina AI, \n", - "\\textsuperscript{10}Microsoft Research, \n", + "\\textsuperscript{8}Johns Hopkins University, \n", + "\\textsuperscript{9}Cohere For AI Community, \n", "\\\\\n", - "\\textsuperscript{11}Johns Hopkins University, \n", - "\\textsuperscript{12}Wikit, \n", - "\\textsuperscript{13}Max Planck Institute for Intelligent Systems, \n", + "\\textsuperscript{10}Jina AI, \n", + "\\textsuperscript{11}Microsoft Research, \n", + "\\textsuperscript{12}Mila, McGill University, \n", + "\\textsuperscript{13}ServiceNow Research, \n", + "\\textsuperscript{14}Wikit, \n", "\\\\\n", - "\\textsuperscript{14}ELLIS Institute Tübingen, \n", - "\\textsuperscript{15}Mila, McGill University, \n", - "\\textsuperscript{16}ServiceNow Research, \n", + "\\textsuperscript{15}Max Planck Institute for Intelligent Systems, \n", + "\\textsuperscript{16}ELLIS Institute Tübingen, \n", "\\\\\n", "\\textsuperscript{17}University of Oxford, \n", - "\\textsuperscript{18}ITMO, \n", - "\\textsuperscript{19}Durham University, \n", - "\\textsuperscript{20}Koç University,Turkey, \n", + "\\textsuperscript{18}AI Talent Hub, \n", + "\\textsuperscript{19}ITMO University, \n", + "\\textsuperscript{20}Durham University, \n", "\\\\\n", - "\\textsuperscript{21}Heritage Institute of Technology, \n", - "\\textsuperscript{22}Apart Research, \n", + "\\textsuperscript{21}Koç University,Turkey, \n", + "\\textsuperscript{22}Heritage Institute of Technology, \n", + "\\textsuperscript{23}Apart Research, \n", "\\\\\n", - "\\textsuperscript{23}Beijing Academy of Artificial Intelligence, \n", - "\\textsuperscript{24}National Information Processing Institute, \n", + "\\textsuperscript{24}Beijing Academy of Artificial Intelligence, \n", + "\\textsuperscript{25}National Information Processing Institute, \n", "\\\\\n", - "\\textsuperscript{25}New York University, \n", - "\\textsuperscript{26}ellamind, Germany, \n", - "\\textsuperscript{27}Peking University, \n", - 
"\\textsuperscript{28}CentraleSupélec, \n", + "\\textsuperscript{26}New York University, \n", + "\\textsuperscript{27}ellamind, Germany, \n", + "\\textsuperscript{28}Peking University, \n", + "\\textsuperscript{29}CentraleSupélec, \n", "\\\\\n", - "\\textsuperscript{29}Artefact Research Center, \n", - "\\textsuperscript{30}Hugging Face, \n", - "\\textsuperscript{31}Korea University Human-Inspired AI Research, \n", + "\\textsuperscript{30}Artefact Research Center, \n", + "\\textsuperscript{31}Wrocław University of Science and Technology, \n", + "\\textsuperscript{32}Hugging Face, \n", "\\\\\n", - "\\textsuperscript{32}Comenius University Bratislava, \n", - "\\textsuperscript{33}Cisco Systems, \n", - "\\textsuperscript{34}University of Waterloo, \n", - "\\textsuperscript{35}University of Zurich, \n", + "\\textsuperscript{33}Korea University Human-Inspired AI Research, \n", + "\\textsuperscript{34}Comenius University Bratislava, \n", "\\\\\n", - "\\textsuperscript{36}Illuin Technology, \n", + "\\textsuperscript{35}Cisco Systems, \n", + "\\textsuperscript{36}University of Waterloo, \n", "\\textsuperscript{37}FRC CSC RAS, \n", - "\\textsuperscript{38}Salesforce, \n", - "\\textsuperscript{39}Indian Institute of Technology, \n", - "\\\\\n", - "\\textsuperscript{40}Sapienza University of Rome, \n", - "\\textsuperscript{41}University of Pennsylvania, \n", - "\\textsuperscript{42}SaluteDevices, \n", + "\\textsuperscript{38}University of Zurich, \n", + "\\textsuperscript{39}Illuin Technology, \n", "\\\\\n", - "\\textsuperscript{43}Princeton University, \n", - "\\textsuperscript{44}University of Washington, \n", - "\\textsuperscript{45}Imperial College London, \n", + "\\textsuperscript{40}Salesforce, \n", + "\\textsuperscript{41}Sapienza University of Rome, \n", + "\\textsuperscript{42}Indian Institute of Technology, \n", + "\\textsuperscript{43}SaluteDevices, \n", "\\\\\n", + "\\textsuperscript{44}University of Pennsylvania, \n", + "\\textsuperscript{45}Robert Koch Institute, 
\n", "\\textsuperscript{46}R. V. College of Engineering, \n", - "\\textsuperscript{47}Robert Koch Institute, \n", + "\\\\\n", + "\\textsuperscript{47}Imperial College London, \n", "\\textsuperscript{48}HSE University, \n", "\\textsuperscript{49}Nirma University, \n", + "\\textsuperscript{50}University of Washington, \n", "\\\\\n", - "\\textsuperscript{50}Occiglot, \n", - "\\textsuperscript{51}Cohere For AI, \n", - "\\textsuperscript{52}Allen Institute for AI, \n", - "\\textsuperscript{53}Tano Labs, \n", - "\\textsuperscript{54}Cornell University, \n", + "\\textsuperscript{51}University of Hong Kong, \n", + "\\textsuperscript{52}Cohere For AI, \n", + "\\textsuperscript{53}Princeton University, \n", + "\\textsuperscript{54}Occiglot, \n", "\\\\\n", "\\textsuperscript{55}Northeastern University, \n", "\\textsuperscript{56}Cursor AI, \n", + "\\textsuperscript{57}Allen Institute for AI, \n", + "\\textsuperscript{58}Cornell University, \n", + "\\textsuperscript{59}Tano Labs, \n", "\n" ] } @@ -1176,6 +1188,11 @@ "print(latex)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, { "cell_type": "code", "execution_count": 19, @@ -1424,7 +1441,7 @@ ], "metadata": { "kernelspec": { - "display_name": "mteb", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -1438,7 +1455,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.19" + "version": "3.9.20" } }, "nbformat": 4, diff --git a/scripts/task_selection/create_main_results_table.ipynb b/scripts/task_selection/create_main_results_table.ipynb new file mode 100644 index 0000000000..a22c2c4c5c --- /dev/null +++ b/scripts/task_selection/create_main_results_table.ipynb @@ -0,0 +1,2738 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating data for main results table" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": 
"stream", + "text": [ + "/Users/au561649/Github/mteb/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from __future__ import annotations\n", + "\n", + "import pandas as pd\n", + "\n", + "import mteb\n", + "\n", + "mdl_names = [\n", + " \"sentence-transformers/all-MiniLM-L6-v2\",\n", + " \"sentence-transformers/all-MiniLM-L12-v2\",\n", + " \"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2\",\n", + " \"sentence-transformers/paraphrase-multilingual-mpnet-base-v2\",\n", + " \"sentence-transformers/all-mpnet-base-v2\",\n", + " \"sentence-transformers/LaBSE\",\n", + " \"intfloat/multilingual-e5-large-instruct\",\n", + " \"intfloat/e5-mistral-7b-instruct\",\n", + " \"GritLM/GritLM-7B\",\n", + " \"intfloat/multilingual-e5-small\",\n", + " \"intfloat/multilingual-e5-base\",\n", + " \"intfloat/multilingual-e5-large\",\n", + "]\n", + "model_metas = [mteb.get_model_meta(name) for name in mdl_names]" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def add_aggregate_columns(results):\n", + " task_names = results.columns[2:]\n", + "\n", + " # convert to 100 scale\n", + " results[task_names] = results[task_names] * 100\n", + "\n", + " borda = results[task_names].rank(ascending=True, method=\"min\").sum(axis=1)\n", + " results[\"Borda Count\"] = borda\n", + " results = results.sort_values(\"Borda Count\", ascending=False)\n", + " # borda str: 1 ({borda count}) 2 ({borda count}) 3 ({borda count}) ...\n", + " results[\"Borda str\"] = [\n", + " f\"{i+1} ({int(borda_count)})\"\n", + " for i, borda_count in enumerate(results[\"Borda Count\"].to_list())\n", + " ]\n", + "\n", + " # add mean across tasks\n", + " results[\"Mean\"] = results[task_names].mean(axis=1)\n", + "\n", + " # add mean 
pr. task type\n", + " task_types = [\n", + " \"BitextMining\",\n", + " \"PairClassification\",\n", + " \"Classification\",\n", + " \"STS\",\n", + " \"Retrieval\",\n", + " \"MultilabelClassification\",\n", + " \"Clustering\",\n", + " \"Reranking\",\n", + " ]\n", + "\n", + " tasks = [mteb.get_task(name) for name in task_names]\n", + " tasktype_to_tasks = {\n", + " task_type: [t for t in tasks if t.metadata.type == task_type]\n", + " for task_type in task_types\n", + " }\n", + "\n", + " for task_type, tasks in tasktype_to_tasks.items():\n", + " task_names = [t.metadata.name for t in tasks]\n", + " results[f\"Mean {task_type}\"] = results[task_names].mean(axis=1)\n", + "\n", + " # add mean pr. task type\n", + " cols = [f\"Mean {task_type}\" for task_type in task_types]\n", + " results[\"mean pr. task type\"] = results[cols].mean(axis=1)\n", + " return results" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "mult_tasks = mteb.get_benchmark(\"MTEB(Indic)\").tasks\n", + "\n", + "# load task results for the specified models from mteb/results repository\n", + "mteb_results = mteb.load_results(\n", + " models=model_metas,\n", + " tasks=mult_tasks,\n", + " download_latest=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "mteb_results = mteb_results.join_revisions()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Indic\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/au561649/Github/mteb/mteb/load_results/benchmark_results.py:120: UserWarning: Couldn't get scores for IndicGenBenchFloresBitextMining due to No splits had scores for the specified languages..\n", + " warnings.warn(\n", + "/Users/au561649/Github/mteb/mteb/load_results/benchmark_results.py:120: UserWarning: Couldn't get scores 
for IndicLangClassification due to No splits had scores for the specified languages..\n", + " warnings.warn(\n", + "/Users/au561649/Github/mteb/mteb/load_results/benchmark_results.py:120: UserWarning: Couldn't get scores for LinceMTBitextMining due to No splits had scores for the specified languages..\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "mult_tasks = mteb.get_benchmark(\"MTEB(Indic)\").tasks\n", + "\n", + "# load task results for the specified models from mteb/results repository\n", + "mteb_results = mteb.load_results(\n", + " models=model_metas,\n", + " tasks=mult_tasks,\n", + " download_latest=False,\n", + ")\n", + "\n", + "mteb_results = mteb_results.join_revisions().filter_models()\n", + "\n", + "# manual check that everything is there\n", + "# pd.DataFrame(mteb_results.get_scores()).to_csv(\"tmp.csv\")\n", + "\n", + "results = pd.DataFrame(mteb_results.get_scores())\n", + "results = add_aggregate_columns(results=results)\n", + "\n", + "\n", + "# create latex table\n", + "# column order\n", + "cols = [\n", + " \"model\",\n", + " \"Borda str\",\n", + " \"Mean\",\n", + " \"mean pr. task type\",\n", + " \"Mean BitextMining\",\n", + " \"Mean PairClassification\",\n", + " \"Mean Classification\",\n", + " \"Mean STS\",\n", + " \"Mean Retrieval\",\n", + " \"Mean MultilabelClassification\",\n", + " \"Mean Clustering\",\n", + " \"Mean Reranking\",\n", + "]\n", + "\n", + "latex_df = results[cols]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
modelrevisionBelebeleRetrievalBengaliSentimentAnalysisGujaratiNewsClassificationHindiDiscourseClassificationIN22ConvBitextMiningIN22GenBitextMiningIndicCrosslingualSTSMTOPIntentClassification...MeanMean BitextMiningMean PairClassificationMean ClassificationMean STSMean RetrievalMean MultilabelClassificationMean ClusteringMean Rerankingmean pr. task type
4intfloat/multilingual-e5-large-instructbaa7be480a7de1539afce709c8f13f833a510e0a73.75457783.98492787.51896835.23437571.87354988.87541753.68853362.952996...70.18608580.37448376.31431167.00924953.68853384.862788NaN51.67125487.46206371.626097
3intfloat/multilingual-e5-large4dc6d853a804b9c8886ede6dda8a073b7dc08a8168.19926983.07070676.73748138.74023467.78459087.69666443.86616259.198891...66.35486977.74062775.05757564.65915543.86616282.604635NaN25.60267585.97059565.071632
2intfloat/multilingual-e5-based13f1b27baf31030b7fd040960d60d909913633f60.37165479.64311174.90895339.03808663.13074785.29001241.11389054.049160...64.57153274.21037972.79577163.75385141.11389077.842327NaN24.60790483.76145562.583654
5intfloat/multilingual-e5-smalle4ce9877abf3edfe10b0d82785e83bdcb973e22e58.19353883.43044474.39302039.33593862.73917484.66321240.76108052.118503...64.72015673.70119373.79510463.78219640.76108076.817269NaN29.05408884.36957763.182930
0GritLM/GritLM-7B13f00a0e36500c80ce12870ea513846a066004af70.06365472.10045969.85584237.08984442.13757474.66753027.24562063.660123...60.20417358.40255267.83817060.04395327.24562079.496827NaN27.97833184.69514657.957228
1intfloat/e5-mistral-7b-instruct07163b72af1488142a360786df853f237b1a3ca166.28942372.07481673.08801232.00683644.30278173.79988722.98129759.232093...60.02282759.05133472.95109959.56386922.98129777.266212NaN32.70254684.42007958.419491
6sentence-transformers/LaBSEe34fab64a3011d2176c99545a93d5cbddc9a91b747.51553880.41541876.35811838.39843863.45925284.66884052.75801862.863971...61.85511774.06404664.58336661.90647452.75801864.334769NaN21.10516978.98047659.676046
11sentence-transformers/paraphrase-multilingual-...79f2382ceacceacdf38563d7c5d16b9ff8d725d636.10169274.88259481.93475038.69140633.58841454.80244334.09687461.699834...58.50288744.19542882.03614261.94326634.09687457.910346NaN32.06169774.33227555.225147
10sentence-transformers/paraphrase-multilingual-...bf3bf13ab40c3157080a7ab344c831b9ad18b5eb19.39507760.61352276.87405237.43164111.86723518.70964919.78897159.196945...49.67263215.28844277.84952057.64542419.78897148.779038NaN16.67540359.25869042.183641
9sentence-transformers/all-mpnet-base-v284f2bcc00d77236f9e89c8a360a00fb1139bf47d9.62761554.18827344.10470434.3164062.1103605.353099-2.50933218.150352...33.6302933.73173052.63453345.224600-2.50933212.853808NaN4.01262542.60177222.649962
7sentence-transformers/all-MiniLM-L12-v2a05860a77cef7b37e0048a7864658139bc18a8549.92765458.97598241.63884728.9111331.9225094.993733-5.33711617.615309...33.1216293.45812155.03429943.891361-5.33711613.923827NaN3.68798147.58719623.177952
8sentence-transformers/all-MiniLM-L6-v28b3219a92973c328a8e22fadcfa821b5dc75636a7.66165458.54163142.57966632.0361331.4159653.573556-6.27576818.495051...31.8423422.49476053.66748744.145491-6.2757686.217327NaN3.10327839.18178620.362052
\n", + "

12 rows × 34 columns

\n", + "
" + ], + "text/plain": [ + " model \\\n", + "4 intfloat/multilingual-e5-large-instruct \n", + "3 intfloat/multilingual-e5-large \n", + "2 intfloat/multilingual-e5-base \n", + "5 intfloat/multilingual-e5-small \n", + "0 GritLM/GritLM-7B \n", + "1 intfloat/e5-mistral-7b-instruct \n", + "6 sentence-transformers/LaBSE \n", + "11 sentence-transformers/paraphrase-multilingual-... \n", + "10 sentence-transformers/paraphrase-multilingual-... \n", + "9 sentence-transformers/all-mpnet-base-v2 \n", + "7 sentence-transformers/all-MiniLM-L12-v2 \n", + "8 sentence-transformers/all-MiniLM-L6-v2 \n", + "\n", + " revision BelebeleRetrieval \\\n", + "4 baa7be480a7de1539afce709c8f13f833a510e0a 73.754577 \n", + "3 4dc6d853a804b9c8886ede6dda8a073b7dc08a81 68.199269 \n", + "2 d13f1b27baf31030b7fd040960d60d909913633f 60.371654 \n", + "5 e4ce9877abf3edfe10b0d82785e83bdcb973e22e 58.193538 \n", + "0 13f00a0e36500c80ce12870ea513846a066004af 70.063654 \n", + "1 07163b72af1488142a360786df853f237b1a3ca1 66.289423 \n", + "6 e34fab64a3011d2176c99545a93d5cbddc9a91b7 47.515538 \n", + "11 79f2382ceacceacdf38563d7c5d16b9ff8d725d6 36.101692 \n", + "10 bf3bf13ab40c3157080a7ab344c831b9ad18b5eb 19.395077 \n", + "9 84f2bcc00d77236f9e89c8a360a00fb1139bf47d 9.627615 \n", + "7 a05860a77cef7b37e0048a7864658139bc18a854 9.927654 \n", + "8 8b3219a92973c328a8e22fadcfa821b5dc75636a 7.661654 \n", + "\n", + " BengaliSentimentAnalysis GujaratiNewsClassification \\\n", + "4 83.984927 87.518968 \n", + "3 83.070706 76.737481 \n", + "2 79.643111 74.908953 \n", + "5 83.430444 74.393020 \n", + "0 72.100459 69.855842 \n", + "1 72.074816 73.088012 \n", + "6 80.415418 76.358118 \n", + "11 74.882594 81.934750 \n", + "10 60.613522 76.874052 \n", + "9 54.188273 44.104704 \n", + "7 58.975982 41.638847 \n", + "8 58.541631 42.579666 \n", + "\n", + " HindiDiscourseClassification IN22ConvBitextMining IN22GenBitextMining \\\n", + "4 35.234375 71.873549 88.875417 \n", + "3 38.740234 67.784590 87.696664 \n", + "2 39.038086 63.130747 
85.290012 \n", + "5 39.335938 62.739174 84.663212 \n", + "0 37.089844 42.137574 74.667530 \n", + "1 32.006836 44.302781 73.799887 \n", + "6 38.398438 63.459252 84.668840 \n", + "11 38.691406 33.588414 54.802443 \n", + "10 37.431641 11.867235 18.709649 \n", + "9 34.316406 2.110360 5.353099 \n", + "7 28.911133 1.922509 4.993733 \n", + "8 32.036133 1.415965 3.573556 \n", + "\n", + " IndicCrosslingualSTS MTOPIntentClassification ... Mean \\\n", + "4 53.688533 62.952996 ... 70.186085 \n", + "3 43.866162 59.198891 ... 66.354869 \n", + "2 41.113890 54.049160 ... 64.571532 \n", + "5 40.761080 52.118503 ... 64.720156 \n", + "0 27.245620 63.660123 ... 60.204173 \n", + "1 22.981297 59.232093 ... 60.022827 \n", + "6 52.758018 62.863971 ... 61.855117 \n", + "11 34.096874 61.699834 ... 58.502887 \n", + "10 19.788971 59.196945 ... 49.672632 \n", + "9 -2.509332 18.150352 ... 33.630293 \n", + "7 -5.337116 17.615309 ... 33.121629 \n", + "8 -6.275768 18.495051 ... 31.842342 \n", + "\n", + " Mean BitextMining Mean PairClassification Mean Classification \\\n", + "4 80.374483 76.314311 67.009249 \n", + "3 77.740627 75.057575 64.659155 \n", + "2 74.210379 72.795771 63.753851 \n", + "5 73.701193 73.795104 63.782196 \n", + "0 58.402552 67.838170 60.043953 \n", + "1 59.051334 72.951099 59.563869 \n", + "6 74.064046 64.583366 61.906474 \n", + "11 44.195428 82.036142 61.943266 \n", + "10 15.288442 77.849520 57.645424 \n", + "9 3.731730 52.634533 45.224600 \n", + "7 3.458121 55.034299 43.891361 \n", + "8 2.494760 53.667487 44.145491 \n", + "\n", + " Mean STS Mean Retrieval Mean MultilabelClassification Mean Clustering \\\n", + "4 53.688533 84.862788 NaN 51.671254 \n", + "3 43.866162 82.604635 NaN 25.602675 \n", + "2 41.113890 77.842327 NaN 24.607904 \n", + "5 40.761080 76.817269 NaN 29.054088 \n", + "0 27.245620 79.496827 NaN 27.978331 \n", + "1 22.981297 77.266212 NaN 32.702546 \n", + "6 52.758018 64.334769 NaN 21.105169 \n", + "11 34.096874 57.910346 NaN 32.061697 \n", + "10 19.788971 
48.779038 NaN 16.675403 \n", + "9 -2.509332 12.853808 NaN 4.012625 \n", + "7 -5.337116 13.923827 NaN 3.687981 \n", + "8 -6.275768 6.217327 NaN 3.103278 \n", + "\n", + " Mean Reranking mean pr. task type \n", + "4 87.462063 71.626097 \n", + "3 85.970595 65.071632 \n", + "2 83.761455 62.583654 \n", + "5 84.369577 63.182930 \n", + "0 84.695146 57.957228 \n", + "1 84.420079 58.419491 \n", + "6 78.980476 59.676046 \n", + "11 74.332275 55.225147 \n", + "10 59.258690 42.183641 \n", + "9 42.601772 22.649962 \n", + "7 47.587196 23.177952 \n", + "8 39.181786 20.362052 \n", + "\n", + "[12 rows x 34 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
modelBorda strMeanmean pr. task typeMean BitextMiningMean PairClassificationMean ClassificationMean STSMean RetrievalMean MultilabelClassificationMean ClusteringMean Reranking
4intfloat/multilingual-e5-large-instruct1 (209)70.18608571.62609780.37448376.31431167.00924953.68853384.862788NaN51.67125487.462063
3intfloat/multilingual-e5-large2 (188)66.35486965.07163277.74062775.05757564.65915543.86616282.604635NaN25.60267585.970595
2intfloat/multilingual-e5-base3 (173)64.57153262.58365474.21037972.79577163.75385141.11389077.842327NaN24.60790483.761455
5intfloat/multilingual-e5-small4 (164)64.72015663.18293073.70119373.79510463.78219640.76108076.817269NaN29.05408884.369577
0GritLM/GritLM-7B5 (151)60.20417357.95722858.40255267.83817060.04395327.24562079.496827NaN27.97833184.695146
1intfloat/e5-mistral-7b-instruct6 (144)60.02282758.41949159.05133472.95109959.56386922.98129777.266212NaN32.70254684.420079
6sentence-transformers/LaBSE7 (139)61.85511759.67604674.06404664.58336661.90647452.75801864.334769NaN21.10516978.980476
11sentence-transformers/paraphrase-multilingual-...8 (137)58.50288755.22514744.19542882.03614261.94326634.09687457.910346NaN32.06169774.332275
10sentence-transformers/paraphrase-multilingual-...9 (98)49.67263242.18364115.28844277.84952057.64542419.78897148.779038NaN16.67540359.258690
9sentence-transformers/all-mpnet-base-v210 (68)33.63029322.6499623.73173052.63453345.224600-2.50933212.853808NaN4.01262542.601772
7sentence-transformers/all-MiniLM-L12-v211 (49)33.12162923.1779523.45812155.03429943.891361-5.33711613.923827NaN3.68798147.587196
8sentence-transformers/all-MiniLM-L6-v212 (40)31.84234220.3620522.49476053.66748744.145491-6.2757686.217327NaN3.10327839.181786
\n", + "
" + ], + "text/plain": [ + " model Borda str Mean \\\n", + "4 intfloat/multilingual-e5-large-instruct 1 (209) 70.186085 \n", + "3 intfloat/multilingual-e5-large 2 (188) 66.354869 \n", + "2 intfloat/multilingual-e5-base 3 (173) 64.571532 \n", + "5 intfloat/multilingual-e5-small 4 (164) 64.720156 \n", + "0 GritLM/GritLM-7B 5 (151) 60.204173 \n", + "1 intfloat/e5-mistral-7b-instruct 6 (144) 60.022827 \n", + "6 sentence-transformers/LaBSE 7 (139) 61.855117 \n", + "11 sentence-transformers/paraphrase-multilingual-... 8 (137) 58.502887 \n", + "10 sentence-transformers/paraphrase-multilingual-... 9 (98) 49.672632 \n", + "9 sentence-transformers/all-mpnet-base-v2 10 (68) 33.630293 \n", + "7 sentence-transformers/all-MiniLM-L12-v2 11 (49) 33.121629 \n", + "8 sentence-transformers/all-MiniLM-L6-v2 12 (40) 31.842342 \n", + "\n", + " mean pr. task type Mean BitextMining Mean PairClassification \\\n", + "4 71.626097 80.374483 76.314311 \n", + "3 65.071632 77.740627 75.057575 \n", + "2 62.583654 74.210379 72.795771 \n", + "5 63.182930 73.701193 73.795104 \n", + "0 57.957228 58.402552 67.838170 \n", + "1 58.419491 59.051334 72.951099 \n", + "6 59.676046 74.064046 64.583366 \n", + "11 55.225147 44.195428 82.036142 \n", + "10 42.183641 15.288442 77.849520 \n", + "9 22.649962 3.731730 52.634533 \n", + "7 23.177952 3.458121 55.034299 \n", + "8 20.362052 2.494760 53.667487 \n", + "\n", + " Mean Classification Mean STS Mean Retrieval \\\n", + "4 67.009249 53.688533 84.862788 \n", + "3 64.659155 43.866162 82.604635 \n", + "2 63.753851 41.113890 77.842327 \n", + "5 63.782196 40.761080 76.817269 \n", + "0 60.043953 27.245620 79.496827 \n", + "1 59.563869 22.981297 77.266212 \n", + "6 61.906474 52.758018 64.334769 \n", + "11 61.943266 34.096874 57.910346 \n", + "10 57.645424 19.788971 48.779038 \n", + "9 45.224600 -2.509332 12.853808 \n", + "7 43.891361 -5.337116 13.923827 \n", + "8 44.145491 -6.275768 6.217327 \n", + "\n", + " Mean MultilabelClassification Mean Clustering Mean Reranking 
\n", + "4 NaN 51.671254 87.462063 \n", + "3 NaN 25.602675 85.970595 \n", + "2 NaN 24.607904 83.761455 \n", + "5 NaN 29.054088 84.369577 \n", + "0 NaN 27.978331 84.695146 \n", + "1 NaN 32.702546 84.420079 \n", + "6 NaN 21.105169 78.980476 \n", + "11 NaN 32.061697 74.332275 \n", + "10 NaN 16.675403 59.258690 \n", + "9 NaN 4.012625 42.601772 \n", + "7 NaN 3.687981 47.587196 \n", + "8 NaN 3.103278 39.181786 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "latex_df" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\\begin{tabular}{llrrrrrrrrrr}\n", + "\\toprule\n", + "model & Borda str & Mean & mean pr. task type & Mean BitextMining & Mean PairClassification & Mean Classification & Mean STS & Mean Retrieval & Mean MultilabelClassification & Mean Clustering & Mean Reranking \\\\\n", + "\\midrule\n", + "intfloat/multilingual-e5-large-instruct & 1 (209) & 70.2 & 71.6 & 80.4 & 76.3 & 67.0 & 53.7 & 84.9 & NaN & 51.7 & 87.5 \\\\\n", + "intfloat/multilingual-e5-large & 2 (188) & 66.4 & 65.1 & 77.7 & 75.1 & 64.7 & 43.9 & 82.6 & NaN & 25.6 & 86.0 \\\\\n", + "intfloat/multilingual-e5-base & 3 (173) & 64.6 & 62.6 & 74.2 & 72.8 & 63.8 & 41.1 & 77.8 & NaN & 24.6 & 83.8 \\\\\n", + "intfloat/multilingual-e5-small & 4 (164) & 64.7 & 63.2 & 73.7 & 73.8 & 63.8 & 40.8 & 76.8 & NaN & 29.1 & 84.4 \\\\\n", + "GritLM/GritLM-7B & 5 (151) & 60.2 & 58.0 & 58.4 & 67.8 & 60.0 & 27.2 & 79.5 & NaN & 28.0 & 84.7 \\\\\n", + "intfloat/e5-mistral-7b-instruct & 6 (144) & 60.0 & 58.4 & 59.1 & 73.0 & 59.6 & 23.0 & 77.3 & NaN & 32.7 & 84.4 \\\\\n", + "sentence-transformers/LaBSE & 7 (139) & 61.9 & 59.7 & 74.1 & 64.6 & 61.9 & 52.8 & 64.3 & NaN & 21.1 & 79.0 \\\\\n", + "sentence-transformers/paraphrase-multilingual-mpnet-base-v2 & 8 (137) & 58.5 & 55.2 & 44.2 & 82.0 & 61.9 & 34.1 & 57.9 & NaN & 32.1 & 74.3 \\\\\n", + 
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 & 9 (98) & 49.7 & 42.2 & 15.3 & 77.8 & 57.6 & 19.8 & 48.8 & NaN & 16.7 & 59.3 \\\\\n", + "sentence-transformers/all-mpnet-base-v2 & 10 (68) & 33.6 & 22.6 & 3.7 & 52.6 & 45.2 & -2.5 & 12.9 & NaN & 4.0 & 42.6 \\\\\n", + "sentence-transformers/all-MiniLM-L12-v2 & 11 (49) & 33.1 & 23.2 & 3.5 & 55.0 & 43.9 & -5.3 & 13.9 & NaN & 3.7 & 47.6 \\\\\n", + "sentence-transformers/all-MiniLM-L6-v2 & 12 (40) & 31.8 & 20.4 & 2.5 & 53.7 & 44.1 & -6.3 & 6.2 & NaN & 3.1 & 39.2 \\\\\n", + "\\bottomrule\n", + "\\end{tabular}\n", + "\n" + ] + } + ], + "source": [ + "print(latex_df.to_latex(index=False, float_format=\"%.1f\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Europe" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/au561649/Github/mteb/mteb/load_results/benchmark_results.py:120: UserWarning: Couldn't get scores for NordicLangClassification due to No splits had scores for the specified languages..\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "mult_tasks = mteb.get_benchmark(\"MTEB(Europe)\").tasks\n", + "\n", + "# load task results for the specified models from mteb/results repository\n", + "mteb_results = mteb.load_results(\n", + " models=model_metas,\n", + " tasks=mult_tasks,\n", + " download_latest=False,\n", + ")\n", + "\n", + "mteb_results = mteb_results.join_revisions().filter_models()\n", + "\n", + "# manual check that everything is there\n", + "pd.DataFrame(mteb_results.get_scores()).to_csv(\"tmp.csv\")\n", + "\n", + "results = pd.DataFrame(mteb_results.get_scores())\n", + "results = add_aggregate_columns(results=results)\n", + "\n", + "\n", + "# create latex table\n", + "# column order\n", + "cols = [\n", + " \"model\",\n", + " \"Borda str\",\n", + " \"Mean\",\n", + " \"mean pr. 
task type\",\n", + " \"Mean BitextMining\",\n", + " \"Mean PairClassification\",\n", + " \"Mean Classification\",\n", + " \"Mean STS\",\n", + " \"Mean Retrieval\",\n", + " \"Mean MultilabelClassification\",\n", + " \"Mean Clustering\",\n", + " \"Mean Reranking\",\n", + "]\n", + "\n", + "latex_df = results[cols]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
modelrevisionAlloProfClusteringS2S.v2AlloprofRerankingAlloprofRetrievalAmazonCounterfactualClassificationArguAnaBUCC.v2BelebeleRetrievalBibleNLPBitextMining...MeanMean BitextMiningMean PairClassificationMean ClassificationMean STSMean RetrievalMean MultilabelClassificationMean ClusteringMean Rerankingmean pr. task type
0GritLM/GritLM-7B13f00a0e36500c80ce12870ea513846a066004af56.41182177.92616055.42278.45278763.17199.50242991.39311097.337705...62.97081990.42070589.93944064.73689676.05024757.10526317.55120045.28110560.26964462.669312
4intfloat/multilingual-e5-large-instructbaa7be480a7de1539afce709c8f13f833a510e0a56.46565574.67773052.11867.61980258.47699.47383692.24012397.916667...62.19292990.38384689.98587963.24131977.42865754.80256617.26588946.89558358.42010862.302981
1intfloat/e5-mistral-7b-instruct07163b72af1488142a360786df853f237b1a3ca157.11200078.31766454.61973.90146661.65399.38457988.39387796.731306...61.72911989.58018791.15418262.94671576.48128753.64409515.46454546.47332859.81530061.944955
3intfloat/multilingual-e5-large4dc6d853a804b9c8886ede6dda8a073b7dc08a8135.15076469.44288039.34175.11673254.35799.02254792.72843894.571508...58.49154384.45640488.75347760.38512575.75905650.81435814.98004038.23591755.91058058.661870
2intfloat/multilingual-e5-based13f1b27baf31030b7fd040960d60d909913633f34.11319065.89715234.44775.09807944.20698.69987887.64986394.283655...57.18673884.11074287.35257257.85409073.66949850.20156714.86227238.16056053.85446857.508221
11sentence-transformers/paraphrase-multilingual-...79f2382ceacceacdf38563d7c5d16b9ff8d725d641.80627467.20432230.79973.98368748.90898.33017179.73872695.205078...54.41045379.46865990.72551256.59934974.25350741.1603126.89784135.78320952.33668054.653134
5intfloat/multilingual-e5-smalle4ce9877abf3edfe10b0d82785e83bdcb973e22e35.39326164.41002327.38071.74662539.08896.35298482.59243885.072037...55.03810580.94879986.37405256.10888571.63607646.07227913.96744836.49808054.10891855.714317
6sentence-transformers/LaBSEe34fab64a3011d2176c99545a93d5cbddc9a91b730.20893055.37476619.77574.48545534.17899.18915072.62969997.470989...51.84371888.77928485.18018255.10093265.68381834.35170916.29811434.25307948.66005153.538396
10sentence-transformers/paraphrase-multilingual-...bf3bf13ab40c3157080a7ab344c831b9ad18b5eb40.45121362.42438226.63469.77266844.87897.16738674.56121993.669550...51.73191276.98896888.92509252.67843072.53639337.5987835.68795334.44316350.19811952.382113
9sentence-transformers/all-mpnet-base-v284f2bcc00d77236f9e89c8a360a00fb1139bf47d35.21515469.63005634.27062.19371346.52126.35764439.2880416.588554...44.68741029.80746980.51957949.24922063.88359237.30784710.87212436.19005449.60848744.679796
7sentence-transformers/all-MiniLM-L12-v2a05860a77cef7b37e0048a7864658139bc18a85431.97653767.01369633.19663.05826347.12828.50381938.2651646.486622...44.38913832.06117181.52094649.24410664.19240236.2432327.57419432.51339649.19630444.068219
8sentence-transformers/all-MiniLM-L6-v28b3219a92973c328a8e22fadcfa821b5dc75636a31.10665462.62172628.41361.66063850.16720.29339534.4848634.975517...43.44798927.24429180.18741247.75747462.65085937.3465278.77571933.55543547.72914043.155857
\n", + "

12 rows × 87 columns

\n", + "
" + ], + "text/plain": [ + " model \\\n", + "0 GritLM/GritLM-7B \n", + "4 intfloat/multilingual-e5-large-instruct \n", + "1 intfloat/e5-mistral-7b-instruct \n", + "3 intfloat/multilingual-e5-large \n", + "2 intfloat/multilingual-e5-base \n", + "11 sentence-transformers/paraphrase-multilingual-... \n", + "5 intfloat/multilingual-e5-small \n", + "6 sentence-transformers/LaBSE \n", + "10 sentence-transformers/paraphrase-multilingual-... \n", + "9 sentence-transformers/all-mpnet-base-v2 \n", + "7 sentence-transformers/all-MiniLM-L12-v2 \n", + "8 sentence-transformers/all-MiniLM-L6-v2 \n", + "\n", + " revision AlloProfClusteringS2S.v2 \\\n", + "0 13f00a0e36500c80ce12870ea513846a066004af 56.411821 \n", + "4 baa7be480a7de1539afce709c8f13f833a510e0a 56.465655 \n", + "1 07163b72af1488142a360786df853f237b1a3ca1 57.112000 \n", + "3 4dc6d853a804b9c8886ede6dda8a073b7dc08a81 35.150764 \n", + "2 d13f1b27baf31030b7fd040960d60d909913633f 34.113190 \n", + "11 79f2382ceacceacdf38563d7c5d16b9ff8d725d6 41.806274 \n", + "5 e4ce9877abf3edfe10b0d82785e83bdcb973e22e 35.393261 \n", + "6 e34fab64a3011d2176c99545a93d5cbddc9a91b7 30.208930 \n", + "10 bf3bf13ab40c3157080a7ab344c831b9ad18b5eb 40.451213 \n", + "9 84f2bcc00d77236f9e89c8a360a00fb1139bf47d 35.215154 \n", + "7 a05860a77cef7b37e0048a7864658139bc18a854 31.976537 \n", + "8 8b3219a92973c328a8e22fadcfa821b5dc75636a 31.106654 \n", + "\n", + " AlloprofReranking AlloprofRetrieval AmazonCounterfactualClassification \\\n", + "0 77.926160 55.422 78.452787 \n", + "4 74.677730 52.118 67.619802 \n", + "1 78.317664 54.619 73.901466 \n", + "3 69.442880 39.341 75.116732 \n", + "2 65.897152 34.447 75.098079 \n", + "11 67.204322 30.799 73.983687 \n", + "5 64.410023 27.380 71.746625 \n", + "6 55.374766 19.775 74.485455 \n", + "10 62.424382 26.634 69.772668 \n", + "9 69.630056 34.270 62.193713 \n", + "7 67.013696 33.196 63.058263 \n", + "8 62.621726 28.413 61.660638 \n", + "\n", + " ArguAna BUCC.v2 BelebeleRetrieval BibleNLPBitextMining ... 
\\\n", + "0 63.171 99.502429 91.393110 97.337705 ... \n", + "4 58.476 99.473836 92.240123 97.916667 ... \n", + "1 61.653 99.384579 88.393877 96.731306 ... \n", + "3 54.357 99.022547 92.728438 94.571508 ... \n", + "2 44.206 98.699878 87.649863 94.283655 ... \n", + "11 48.908 98.330171 79.738726 95.205078 ... \n", + "5 39.088 96.352984 82.592438 85.072037 ... \n", + "6 34.178 99.189150 72.629699 97.470989 ... \n", + "10 44.878 97.167386 74.561219 93.669550 ... \n", + "9 46.521 26.357644 39.288041 6.588554 ... \n", + "7 47.128 28.503819 38.265164 6.486622 ... \n", + "8 50.167 20.293395 34.484863 4.975517 ... \n", + "\n", + " Mean Mean BitextMining Mean PairClassification \\\n", + "0 62.970819 90.420705 89.939440 \n", + "4 62.192929 90.383846 89.985879 \n", + "1 61.729119 89.580187 91.154182 \n", + "3 58.491543 84.456404 88.753477 \n", + "2 57.186738 84.110742 87.352572 \n", + "11 54.410453 79.468659 90.725512 \n", + "5 55.038105 80.948799 86.374052 \n", + "6 51.843718 88.779284 85.180182 \n", + "10 51.731912 76.988968 88.925092 \n", + "9 44.687410 29.807469 80.519579 \n", + "7 44.389138 32.061171 81.520946 \n", + "8 43.447989 27.244291 80.187412 \n", + "\n", + " Mean Classification Mean STS Mean Retrieval \\\n", + "0 64.736896 76.050247 57.105263 \n", + "4 63.241319 77.428657 54.802566 \n", + "1 62.946715 76.481287 53.644095 \n", + "3 60.385125 75.759056 50.814358 \n", + "2 57.854090 73.669498 50.201567 \n", + "11 56.599349 74.253507 41.160312 \n", + "5 56.108885 71.636076 46.072279 \n", + "6 55.100932 65.683818 34.351709 \n", + "10 52.678430 72.536393 37.598783 \n", + "9 49.249220 63.883592 37.307847 \n", + "7 49.244106 64.192402 36.243232 \n", + "8 47.757474 62.650859 37.346527 \n", + "\n", + " Mean MultilabelClassification Mean Clustering Mean Reranking \\\n", + "0 17.551200 45.281105 60.269644 \n", + "4 17.265889 46.895583 58.420108 \n", + "1 15.464545 46.473328 59.815300 \n", + "3 14.980040 38.235917 55.910580 \n", + "2 14.862272 38.160560 53.854468 \n", + "11 
6.897841 35.783209 52.336680 \n", + "5 13.967448 36.498080 54.108918 \n", + "6 16.298114 34.253079 48.660051 \n", + "10 5.687953 34.443163 50.198119 \n", + "9 10.872124 36.190054 49.608487 \n", + "7 7.574194 32.513396 49.196304 \n", + "8 8.775719 33.555435 47.729140 \n", + "\n", + " mean pr. task type \n", + "0 62.669312 \n", + "4 62.302981 \n", + "1 61.944955 \n", + "3 58.661870 \n", + "2 57.508221 \n", + "11 54.653134 \n", + "5 55.714317 \n", + "6 53.538396 \n", + "10 52.382113 \n", + "9 44.679796 \n", + "7 44.068219 \n", + "8 43.155857 \n", + "\n", + "[12 rows x 87 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
modelBorda strMeanmean pr. task typeMean BitextMiningMean PairClassificationMean ClassificationMean STSMean RetrievalMean MultilabelClassificationMean ClusteringMean Reranking
0GritLM/GritLM-7B1 (757)62.97081962.66931290.42070589.93944064.73689676.05024757.10526317.55120045.28110560.269644
4intfloat/multilingual-e5-large-instruct2 (732)62.19292962.30298190.38384689.98587963.24131977.42865754.80256617.26588946.89558358.420108
1intfloat/e5-mistral-7b-instruct3 (725)61.72911961.94495589.58018791.15418262.94671576.48128753.64409515.46454546.47332859.815300
3intfloat/multilingual-e5-large4 (586)58.49154358.66187084.45640488.75347760.38512575.75905650.81435814.98004038.23591755.910580
2intfloat/multilingual-e5-base5 (499)57.18673857.50822184.11074287.35257257.85409073.66949850.20156714.86227238.16056053.854468
11sentence-transformers/paraphrase-multilingual-...6 (463)54.41045354.65313479.46865990.72551256.59934974.25350741.1603126.89784135.78320952.336680
5intfloat/multilingual-e5-small7 (399)55.03810555.71431780.94879986.37405256.10888571.63607646.07227913.96744836.49808054.108918
6sentence-transformers/LaBSE8 (358)51.84371853.53839688.77928485.18018255.10093265.68381834.35170916.29811434.25307948.660051
10sentence-transformers/paraphrase-multilingual-...9 (328)51.73191252.38211376.98896888.92509252.67843072.53639337.5987835.68795334.44316350.198119
9sentence-transformers/all-mpnet-base-v210 (310)44.68741044.67979629.80746980.51957949.24922063.88359237.30784710.87212436.19005449.608487
7sentence-transformers/all-MiniLM-L12-v211 (292)44.38913844.06821932.06117181.52094649.24410664.19240236.2432327.57419432.51339649.196304
8sentence-transformers/all-MiniLM-L6-v212 (237)43.44798943.15585727.24429180.18741247.75747462.65085937.3465278.77571933.55543547.729140
\n", + "
" + ], + "text/plain": [ + " model Borda str Mean \\\n", + "0 GritLM/GritLM-7B 1 (757) 62.970819 \n", + "4 intfloat/multilingual-e5-large-instruct 2 (732) 62.192929 \n", + "1 intfloat/e5-mistral-7b-instruct 3 (725) 61.729119 \n", + "3 intfloat/multilingual-e5-large 4 (586) 58.491543 \n", + "2 intfloat/multilingual-e5-base 5 (499) 57.186738 \n", + "11 sentence-transformers/paraphrase-multilingual-... 6 (463) 54.410453 \n", + "5 intfloat/multilingual-e5-small 7 (399) 55.038105 \n", + "6 sentence-transformers/LaBSE 8 (358) 51.843718 \n", + "10 sentence-transformers/paraphrase-multilingual-... 9 (328) 51.731912 \n", + "9 sentence-transformers/all-mpnet-base-v2 10 (310) 44.687410 \n", + "7 sentence-transformers/all-MiniLM-L12-v2 11 (292) 44.389138 \n", + "8 sentence-transformers/all-MiniLM-L6-v2 12 (237) 43.447989 \n", + "\n", + " mean pr. task type Mean BitextMining Mean PairClassification \\\n", + "0 62.669312 90.420705 89.939440 \n", + "4 62.302981 90.383846 89.985879 \n", + "1 61.944955 89.580187 91.154182 \n", + "3 58.661870 84.456404 88.753477 \n", + "2 57.508221 84.110742 87.352572 \n", + "11 54.653134 79.468659 90.725512 \n", + "5 55.714317 80.948799 86.374052 \n", + "6 53.538396 88.779284 85.180182 \n", + "10 52.382113 76.988968 88.925092 \n", + "9 44.679796 29.807469 80.519579 \n", + "7 44.068219 32.061171 81.520946 \n", + "8 43.155857 27.244291 80.187412 \n", + "\n", + " Mean Classification Mean STS Mean Retrieval \\\n", + "0 64.736896 76.050247 57.105263 \n", + "4 63.241319 77.428657 54.802566 \n", + "1 62.946715 76.481287 53.644095 \n", + "3 60.385125 75.759056 50.814358 \n", + "2 57.854090 73.669498 50.201567 \n", + "11 56.599349 74.253507 41.160312 \n", + "5 56.108885 71.636076 46.072279 \n", + "6 55.100932 65.683818 34.351709 \n", + "10 52.678430 72.536393 37.598783 \n", + "9 49.249220 63.883592 37.307847 \n", + "7 49.244106 64.192402 36.243232 \n", + "8 47.757474 62.650859 37.346527 \n", + "\n", + " Mean MultilabelClassification Mean Clustering Mean 
Reranking \n", + "0 17.551200 45.281105 60.269644 \n", + "4 17.265889 46.895583 58.420108 \n", + "1 15.464545 46.473328 59.815300 \n", + "3 14.980040 38.235917 55.910580 \n", + "2 14.862272 38.160560 53.854468 \n", + "11 6.897841 35.783209 52.336680 \n", + "5 13.967448 36.498080 54.108918 \n", + "6 16.298114 34.253079 48.660051 \n", + "10 5.687953 34.443163 50.198119 \n", + "9 10.872124 36.190054 49.608487 \n", + "7 7.574194 32.513396 49.196304 \n", + "8 8.775719 33.555435 47.729140 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "latex_df" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\\begin{tabular}{llrrrrrrrrrr}\n", + "\\toprule\n", + "model & Borda str & Mean & mean pr. task type & Mean BitextMining & Mean PairClassification & Mean Classification & Mean STS & Mean Retrieval & Mean MultilabelClassification & Mean Clustering & Mean Reranking \\\\\n", + "\\midrule\n", + "GritLM/GritLM-7B & 1 (757) & 63.0 & 62.7 & 90.4 & 89.9 & 64.7 & 76.1 & 57.1 & 17.6 & 45.3 & 60.3 \\\\\n", + "intfloat/multilingual-e5-large-instruct & 2 (732) & 62.2 & 62.3 & 90.4 & 90.0 & 63.2 & 77.4 & 54.8 & 17.3 & 46.9 & 58.4 \\\\\n", + "intfloat/e5-mistral-7b-instruct & 3 (725) & 61.7 & 61.9 & 89.6 & 91.2 & 62.9 & 76.5 & 53.6 & 15.5 & 46.5 & 59.8 \\\\\n", + "intfloat/multilingual-e5-large & 4 (586) & 58.5 & 58.7 & 84.5 & 88.8 & 60.4 & 75.8 & 50.8 & 15.0 & 38.2 & 55.9 \\\\\n", + "intfloat/multilingual-e5-base & 5 (499) & 57.2 & 57.5 & 84.1 & 87.4 & 57.9 & 73.7 & 50.2 & 14.9 & 38.2 & 53.9 \\\\\n", + "sentence-transformers/paraphrase-multilingual-mpnet-base-v2 & 6 (463) & 54.4 & 54.7 & 79.5 & 90.7 & 56.6 & 74.3 & 41.2 & 6.9 & 35.8 & 52.3 \\\\\n", + "intfloat/multilingual-e5-small & 7 (399) & 55.0 & 55.7 & 80.9 & 86.4 & 56.1 & 71.6 & 46.1 & 14.0 & 36.5 & 54.1 \\\\\n", + "sentence-transformers/LaBSE & 8 (358) & 51.8 & 
53.5 & 88.8 & 85.2 & 55.1 & 65.7 & 34.4 & 16.3 & 34.3 & 48.7 \\\\\n", + "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 & 9 (328) & 51.7 & 52.4 & 77.0 & 88.9 & 52.7 & 72.5 & 37.6 & 5.7 & 34.4 & 50.2 \\\\\n", + "sentence-transformers/all-mpnet-base-v2 & 10 (310) & 44.7 & 44.7 & 29.8 & 80.5 & 49.2 & 63.9 & 37.3 & 10.9 & 36.2 & 49.6 \\\\\n", + "sentence-transformers/all-MiniLM-L12-v2 & 11 (292) & 44.4 & 44.1 & 32.1 & 81.5 & 49.2 & 64.2 & 36.2 & 7.6 & 32.5 & 49.2 \\\\\n", + "sentence-transformers/all-MiniLM-L6-v2 & 12 (237) & 43.4 & 43.2 & 27.2 & 80.2 & 47.8 & 62.7 & 37.3 & 8.8 & 33.6 & 47.7 \\\\\n", + "\\bottomrule\n", + "\\end{tabular}\n", + "\n" + ] + } + ], + "source": [ + "print(latex_df.to_latex(index=False, float_format=\"%.1f\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Multilingual" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/bq/3m2kv2_535q0c9ld2jmmz774yj4nph/T/ipykernel_22431/597027223.py:8: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", + " results[\"Borda Count\"] = borda\n", + "/var/folders/bq/3m2kv2_535q0c9ld2jmmz774yj4nph/T/ipykernel_22431/597027223.py:11: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", + " results[\"Borda str\"] = [\n", + "/var/folders/bq/3m2kv2_535q0c9ld2jmmz774yj4nph/T/ipykernel_22431/597027223.py:17: PerformanceWarning: DataFrame is highly fragmented. 
This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", + " results[\"Mean\"] = results[task_names].mean(axis=1)\n", + "/var/folders/bq/3m2kv2_535q0c9ld2jmmz774yj4nph/T/ipykernel_22431/597027223.py:39: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", + " results[f\"Mean {task_type}\"] = results[task_names].mean(axis=1)\n", + "/var/folders/bq/3m2kv2_535q0c9ld2jmmz774yj4nph/T/ipykernel_22431/597027223.py:39: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", + " results[f\"Mean {task_type}\"] = results[task_names].mean(axis=1)\n", + "/var/folders/bq/3m2kv2_535q0c9ld2jmmz774yj4nph/T/ipykernel_22431/597027223.py:39: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", + " results[f\"Mean {task_type}\"] = results[task_names].mean(axis=1)\n", + "/var/folders/bq/3m2kv2_535q0c9ld2jmmz774yj4nph/T/ipykernel_22431/597027223.py:39: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. 
To get a de-fragmented frame, use `newframe = frame.copy()`\n", + " results[f\"Mean {task_type}\"] = results[task_names].mean(axis=1)\n", + "/var/folders/bq/3m2kv2_535q0c9ld2jmmz774yj4nph/T/ipykernel_22431/597027223.py:39: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", + " results[f\"Mean {task_type}\"] = results[task_names].mean(axis=1)\n", + "/var/folders/bq/3m2kv2_535q0c9ld2jmmz774yj4nph/T/ipykernel_22431/597027223.py:39: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", + " results[f\"Mean {task_type}\"] = results[task_names].mean(axis=1)\n", + "/var/folders/bq/3m2kv2_535q0c9ld2jmmz774yj4nph/T/ipykernel_22431/597027223.py:39: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", + " results[f\"Mean {task_type}\"] = results[task_names].mean(axis=1)\n", + "/var/folders/bq/3m2kv2_535q0c9ld2jmmz774yj4nph/T/ipykernel_22431/597027223.py:39: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. 
To get a de-fragmented frame, use `newframe = frame.copy()`\n", + " results[f\"Mean {task_type}\"] = results[task_names].mean(axis=1)\n", + "/var/folders/bq/3m2kv2_535q0c9ld2jmmz774yj4nph/T/ipykernel_22431/597027223.py:43: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", + " results[\"mean pr. task type\"] = results[cols].mean(axis=1)\n" + ] + } + ], + "source": [ + "mult_tasks = mteb.get_benchmark(\"MTEB(Multilingual)\").tasks\n", + "\n", + "# load task results for the specified models from mteb/results repository\n", + "mteb_results = mteb.load_results(\n", + " models=model_metas,\n", + " tasks=mult_tasks,\n", + " download_latest=False,\n", + ")\n", + "\n", + "mteb_results = mteb_results.join_revisions().filter_models()\n", + "\n", + "# manual check that everything is there\n", + "pd.DataFrame(mteb_results.get_scores()).to_csv(\"tmp.csv\")\n", + "\n", + "results = pd.DataFrame(mteb_results.get_scores())\n", + "results = add_aggregate_columns(results=results)\n", + "\n", + "\n", + "# create latex table\n", + "# column order\n", + "cols = [\n", + " \"model\",\n", + " \"Borda str\",\n", + " \"Mean\",\n", + " \"mean pr. task type\",\n", + " \"Mean BitextMining\",\n", + " \"Mean PairClassification\",\n", + " \"Mean Classification\",\n", + " \"Mean STS\",\n", + " \"Mean Retrieval\",\n", + " \"Mean MultilabelClassification\",\n", + " \"Mean Clustering\",\n", + " \"Mean Reranking\",\n", + "]\n", + "\n", + "latex_df = results[cols]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
modelrevisionAILAStatutesAfriSentiClassificationAlloProfClusteringS2S.v2AlloprofRerankingAmazonCounterfactualClassificationArXivHierarchicalClusteringP2PArXivHierarchicalClusteringS2SArguAna...MeanMean BitextMiningMean PairClassificationMean ClassificationMean STSMean RetrievalMean MultilabelClassificationMean ClusteringMean Rerankingmean pr. task type
4intfloat/multilingual-e5-large-instructbaa7be480a7de1539afce709c8f13f833a510e0a29.65945.38743256.46565574.67773068.60635662.53499461.28405858.476...63.22716980.12647080.86358464.94214476.81467157.11668622.91350451.53801662.61327362.116044
0GritLM/GritLM-7B13f00a0e36500c80ce12870ea513846a066004af41.80045.07858956.41182177.92616079.29651259.76004662.28324363.171...60.93085570.53172679.94441161.83024773.32796058.30671122.77377150.48252063.77875460.122013
1intfloat/e5-mistral-7b-instruct07163b72af1488142a360786df853f237b1a3ca134.53544.47633557.11200078.31766473.55583965.28373561.27812361.653...60.27974670.57990581.12221160.31429574.02160055.75010122.19682151.39009563.81918359.899276
3intfloat/multilingual-e5-large4dc6d853a804b9c8886ede6dda8a073b7dc08a8120.84245.50050735.15076469.44288076.16351455.57211456.21221754.357...58.57057271.66625079.02839059.91697573.48837254.11117821.30237342.92375562.84046658.159720
2intfloat/multilingual-e5-based13f1b27baf31030b7fd040960d60d909913633f20.37143.80231534.11319065.89715274.33401456.68313756.11505644.206...57.01295969.43794577.15438558.20571871.44424952.72182320.16206042.67446160.17636056.497125
11sentence-transformers/paraphrase-multilingual-...79f2382ceacceacdf38563d7c5d16b9ff8d725d622.23642.44547141.80627467.20432272.76652855.34276755.16032848.908...52.00525952.06293781.15439255.06435469.66104039.75775216.39803441.08066553.37467751.069231
5intfloat/multilingual-e5-smalle4ce9877abf3edfe10b0d82785e83bdcb973e22e19.01142.35811835.39326164.41002369.16365554.27623554.19874939.088...55.45670167.47292276.32905356.50092870.36081749.34501919.09643041.73544660.39101055.153953
6sentence-transformers/LaBSEe34fab64a3011d2176c99545a93d5cbddc9a91b716.71743.17065730.20893055.37476674.98791053.44270249.98606434.178...52.10005676.35100875.96907554.60084465.34976333.16911320.12211739.15919550.19756251.864835
10sentence-transformers/paraphrase-multilingual-...bf3bf13ab40c3157080a7ab344c831b9ad18b5eb20.52537.67274040.45121362.42438268.07564653.61794452.24573644.878...48.78152044.56339078.99322951.65688966.58195336.61497114.93032939.33737150.97238747.956315
9sentence-transformers/all-mpnet-base-v284f2bcc00d77236f9e89c8a360a00fb1139bf47d21.27537.26774135.21515469.63005661.84628161.47339256.45931246.521...42.47004921.16131770.89351346.98548857.59966032.80855716.28050840.76591342.23441041.091171
7sentence-transformers/all-MiniLM-L12-v2a05860a77cef7b37e0048a7864658139bc18a85420.71437.28835031.97653767.01369662.11399357.44453055.06172847.128...42.15156422.90816271.67931346.84855857.20296132.50419014.58640736.83992744.32662840.862018
8sentence-transformers/all-MiniLM-L6-v28b3219a92973c328a8e22fadcfa821b5dc75636a20.51639.80778531.10665462.62172661.28010959.10688954.54234050.167...41.43210520.09367371.23465646.19891156.08406532.51344515.05433138.03755440.28457339.937651
\n", + "

12 rows × 146 columns

\n", + "
" + ], + "text/plain": [ + " model \\\n", + "4 intfloat/multilingual-e5-large-instruct \n", + "0 GritLM/GritLM-7B \n", + "1 intfloat/e5-mistral-7b-instruct \n", + "3 intfloat/multilingual-e5-large \n", + "2 intfloat/multilingual-e5-base \n", + "11 sentence-transformers/paraphrase-multilingual-... \n", + "5 intfloat/multilingual-e5-small \n", + "6 sentence-transformers/LaBSE \n", + "10 sentence-transformers/paraphrase-multilingual-... \n", + "9 sentence-transformers/all-mpnet-base-v2 \n", + "7 sentence-transformers/all-MiniLM-L12-v2 \n", + "8 sentence-transformers/all-MiniLM-L6-v2 \n", + "\n", + " revision AILAStatutes \\\n", + "4 baa7be480a7de1539afce709c8f13f833a510e0a 29.659 \n", + "0 13f00a0e36500c80ce12870ea513846a066004af 41.800 \n", + "1 07163b72af1488142a360786df853f237b1a3ca1 34.535 \n", + "3 4dc6d853a804b9c8886ede6dda8a073b7dc08a81 20.842 \n", + "2 d13f1b27baf31030b7fd040960d60d909913633f 20.371 \n", + "11 79f2382ceacceacdf38563d7c5d16b9ff8d725d6 22.236 \n", + "5 e4ce9877abf3edfe10b0d82785e83bdcb973e22e 19.011 \n", + "6 e34fab64a3011d2176c99545a93d5cbddc9a91b7 16.717 \n", + "10 bf3bf13ab40c3157080a7ab344c831b9ad18b5eb 20.525 \n", + "9 84f2bcc00d77236f9e89c8a360a00fb1139bf47d 21.275 \n", + "7 a05860a77cef7b37e0048a7864658139bc18a854 20.714 \n", + "8 8b3219a92973c328a8e22fadcfa821b5dc75636a 20.516 \n", + "\n", + " AfriSentiClassification AlloProfClusteringS2S.v2 AlloprofReranking \\\n", + "4 45.387432 56.465655 74.677730 \n", + "0 45.078589 56.411821 77.926160 \n", + "1 44.476335 57.112000 78.317664 \n", + "3 45.500507 35.150764 69.442880 \n", + "2 43.802315 34.113190 65.897152 \n", + "11 42.445471 41.806274 67.204322 \n", + "5 42.358118 35.393261 64.410023 \n", + "6 43.170657 30.208930 55.374766 \n", + "10 37.672740 40.451213 62.424382 \n", + "9 37.267741 35.215154 69.630056 \n", + "7 37.288350 31.976537 67.013696 \n", + "8 39.807785 31.106654 62.621726 \n", + "\n", + " AmazonCounterfactualClassification ArXivHierarchicalClusteringP2P \\\n", + "4 68.606356 
62.534994 \n", + "0 79.296512 59.760046 \n", + "1 73.555839 65.283735 \n", + "3 76.163514 55.572114 \n", + "2 74.334014 56.683137 \n", + "11 72.766528 55.342767 \n", + "5 69.163655 54.276235 \n", + "6 74.987910 53.442702 \n", + "10 68.075646 53.617944 \n", + "9 61.846281 61.473392 \n", + "7 62.113993 57.444530 \n", + "8 61.280109 59.106889 \n", + "\n", + " ArXivHierarchicalClusteringS2S ArguAna ... Mean \\\n", + "4 61.284058 58.476 ... 63.227169 \n", + "0 62.283243 63.171 ... 60.930855 \n", + "1 61.278123 61.653 ... 60.279746 \n", + "3 56.212217 54.357 ... 58.570572 \n", + "2 56.115056 44.206 ... 57.012959 \n", + "11 55.160328 48.908 ... 52.005259 \n", + "5 54.198749 39.088 ... 55.456701 \n", + "6 49.986064 34.178 ... 52.100056 \n", + "10 52.245736 44.878 ... 48.781520 \n", + "9 56.459312 46.521 ... 42.470049 \n", + "7 55.061728 47.128 ... 42.151564 \n", + "8 54.542340 50.167 ... 41.432105 \n", + "\n", + " Mean BitextMining Mean PairClassification Mean Classification \\\n", + "4 80.126470 80.863584 64.942144 \n", + "0 70.531726 79.944411 61.830247 \n", + "1 70.579905 81.122211 60.314295 \n", + "3 71.666250 79.028390 59.916975 \n", + "2 69.437945 77.154385 58.205718 \n", + "11 52.062937 81.154392 55.064354 \n", + "5 67.472922 76.329053 56.500928 \n", + "6 76.351008 75.969075 54.600844 \n", + "10 44.563390 78.993229 51.656889 \n", + "9 21.161317 70.893513 46.985488 \n", + "7 22.908162 71.679313 46.848558 \n", + "8 20.093673 71.234656 46.198911 \n", + "\n", + " Mean STS Mean Retrieval Mean MultilabelClassification Mean Clustering \\\n", + "4 76.814671 57.116686 22.913504 51.538016 \n", + "0 73.327960 58.306711 22.773771 50.482520 \n", + "1 74.021600 55.750101 22.196821 51.390095 \n", + "3 73.488372 54.111178 21.302373 42.923755 \n", + "2 71.444249 52.721823 20.162060 42.674461 \n", + "11 69.661040 39.757752 16.398034 41.080665 \n", + "5 70.360817 49.345019 19.096430 41.735446 \n", + "6 65.349763 33.169113 20.122117 39.159195 \n", + "10 66.581953 36.614971 14.930329 
39.337371 \n", + "9 57.599660 32.808557 16.280508 40.765913 \n", + "7 57.202961 32.504190 14.586407 36.839927 \n", + "8 56.084065 32.513445 15.054331 38.037554 \n", + "\n", + " Mean Reranking mean pr. task type \n", + "4 62.613273 62.116044 \n", + "0 63.778754 60.122013 \n", + "1 63.819183 59.899276 \n", + "3 62.840466 58.159720 \n", + "2 60.176360 56.497125 \n", + "11 53.374677 51.069231 \n", + "5 60.391010 55.153953 \n", + "6 50.197562 51.864835 \n", + "10 50.972387 47.956315 \n", + "9 42.234410 41.091171 \n", + "7 44.326628 40.862018 \n", + "8 40.284573 39.937651 \n", + "\n", + "[12 rows x 146 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
modelBorda strMeanmean pr. task typeMean BitextMiningMean PairClassificationMean ClassificationMean STSMean RetrievalMean MultilabelClassificationMean ClusteringMean Reranking
4intfloat/multilingual-e5-large-instruct1 (1375)63.22716962.11604480.12647080.86358464.94214476.81467157.11668622.91350451.53801662.613273
0GritLM/GritLM-7B2 (1258)60.93085560.12201370.53172679.94441161.83024773.32796058.30671122.77377150.48252063.778754
1intfloat/e5-mistral-7b-instruct3 (1233)60.27974659.89927670.57990581.12221160.31429574.02160055.75010122.19682151.39009563.819183
3intfloat/multilingual-e5-large4 (1109)58.57057258.15972071.66625079.02839059.91697573.48837254.11117821.30237342.92375562.840466
2intfloat/multilingual-e5-base5 (944)57.01295956.49712569.43794577.15438558.20571871.44424952.72182320.16206042.67446160.176360
11sentence-transformers/paraphrase-multilingual-...6 (830)52.00525951.06923152.06293781.15439255.06435469.66104039.75775216.39803441.08066553.374677
5intfloat/multilingual-e5-small7 (784)55.45670155.15395367.47292276.32905356.50092870.36081749.34501919.09643041.73544660.391010
6sentence-transformers/LaBSE8 (719)52.10005651.86483576.35100875.96907554.60084465.34976333.16911320.12211739.15919550.197562
10sentence-transformers/paraphrase-multilingual-...9 (603)48.78152047.95631544.56339078.99322951.65688966.58195336.61497114.93032939.33737150.972387
9sentence-transformers/all-mpnet-base-v210 (526)42.47004941.09117121.16131770.89351346.98548857.59966032.80855716.28050840.76591342.234410
7sentence-transformers/all-MiniLM-L12-v211 (490)42.15156440.86201822.90816271.67931346.84855857.20296132.50419014.58640736.83992744.326628
8sentence-transformers/all-MiniLM-L6-v212 (418)41.43210539.93765120.09367371.23465646.19891156.08406532.51344515.05433138.03755440.284573
\n", + "
" + ], + "text/plain": [ + " model Borda str Mean \\\n", + "4 intfloat/multilingual-e5-large-instruct 1 (1375) 63.227169 \n", + "0 GritLM/GritLM-7B 2 (1258) 60.930855 \n", + "1 intfloat/e5-mistral-7b-instruct 3 (1233) 60.279746 \n", + "3 intfloat/multilingual-e5-large 4 (1109) 58.570572 \n", + "2 intfloat/multilingual-e5-base 5 (944) 57.012959 \n", + "11 sentence-transformers/paraphrase-multilingual-... 6 (830) 52.005259 \n", + "5 intfloat/multilingual-e5-small 7 (784) 55.456701 \n", + "6 sentence-transformers/LaBSE 8 (719) 52.100056 \n", + "10 sentence-transformers/paraphrase-multilingual-... 9 (603) 48.781520 \n", + "9 sentence-transformers/all-mpnet-base-v2 10 (526) 42.470049 \n", + "7 sentence-transformers/all-MiniLM-L12-v2 11 (490) 42.151564 \n", + "8 sentence-transformers/all-MiniLM-L6-v2 12 (418) 41.432105 \n", + "\n", + " mean pr. task type Mean BitextMining Mean PairClassification \\\n", + "4 62.116044 80.126470 80.863584 \n", + "0 60.122013 70.531726 79.944411 \n", + "1 59.899276 70.579905 81.122211 \n", + "3 58.159720 71.666250 79.028390 \n", + "2 56.497125 69.437945 77.154385 \n", + "11 51.069231 52.062937 81.154392 \n", + "5 55.153953 67.472922 76.329053 \n", + "6 51.864835 76.351008 75.969075 \n", + "10 47.956315 44.563390 78.993229 \n", + "9 41.091171 21.161317 70.893513 \n", + "7 40.862018 22.908162 71.679313 \n", + "8 39.937651 20.093673 71.234656 \n", + "\n", + " Mean Classification Mean STS Mean Retrieval \\\n", + "4 64.942144 76.814671 57.116686 \n", + "0 61.830247 73.327960 58.306711 \n", + "1 60.314295 74.021600 55.750101 \n", + "3 59.916975 73.488372 54.111178 \n", + "2 58.205718 71.444249 52.721823 \n", + "11 55.064354 69.661040 39.757752 \n", + "5 56.500928 70.360817 49.345019 \n", + "6 54.600844 65.349763 33.169113 \n", + "10 51.656889 66.581953 36.614971 \n", + "9 46.985488 57.599660 32.808557 \n", + "7 46.848558 57.202961 32.504190 \n", + "8 46.198911 56.084065 32.513445 \n", + "\n", + " Mean MultilabelClassification Mean Clustering Mean 
Reranking \n", + "4 22.913504 51.538016 62.613273 \n", + "0 22.773771 50.482520 63.778754 \n", + "1 22.196821 51.390095 63.819183 \n", + "3 21.302373 42.923755 62.840466 \n", + "2 20.162060 42.674461 60.176360 \n", + "11 16.398034 41.080665 53.374677 \n", + "5 19.096430 41.735446 60.391010 \n", + "6 20.122117 39.159195 50.197562 \n", + "10 14.930329 39.337371 50.972387 \n", + "9 16.280508 40.765913 42.234410 \n", + "7 14.586407 36.839927 44.326628 \n", + "8 15.054331 38.037554 40.284573 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "latex_df" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\\begin{tabular}{llrrrrrrrrrr}\n", + "\\toprule\n", + "model & Borda str & Mean & mean pr. task type & Mean BitextMining & Mean PairClassification & Mean Classification & Mean STS & Mean Retrieval & Mean MultilabelClassification & Mean Clustering & Mean Reranking \\\\\n", + "\\midrule\n", + "intfloat/multilingual-e5-large-instruct & 1 (1375) & 63.2 & 62.1 & 80.1 & 80.9 & 64.9 & 76.8 & 57.1 & 22.9 & 51.5 & 62.6 \\\\\n", + "GritLM/GritLM-7B & 2 (1258) & 60.9 & 60.1 & 70.5 & 79.9 & 61.8 & 73.3 & 58.3 & 22.8 & 50.5 & 63.8 \\\\\n", + "intfloat/e5-mistral-7b-instruct & 3 (1233) & 60.3 & 59.9 & 70.6 & 81.1 & 60.3 & 74.0 & 55.8 & 22.2 & 51.4 & 63.8 \\\\\n", + "intfloat/multilingual-e5-large & 4 (1109) & 58.6 & 58.2 & 71.7 & 79.0 & 59.9 & 73.5 & 54.1 & 21.3 & 42.9 & 62.8 \\\\\n", + "intfloat/multilingual-e5-base & 5 (944) & 57.0 & 56.5 & 69.4 & 77.2 & 58.2 & 71.4 & 52.7 & 20.2 & 42.7 & 60.2 \\\\\n", + "sentence-transformers/paraphrase-multilingual-mpnet-base-v2 & 6 (830) & 52.0 & 51.1 & 52.1 & 81.2 & 55.1 & 69.7 & 39.8 & 16.4 & 41.1 & 53.4 \\\\\n", + "intfloat/multilingual-e5-small & 7 (784) & 55.5 & 55.2 & 67.5 & 76.3 & 56.5 & 70.4 & 49.3 & 19.1 & 41.7 & 60.4 \\\\\n", + "sentence-transformers/LaBSE & 8 (719) & 
52.1 & 51.9 & 76.4 & 76.0 & 54.6 & 65.3 & 33.2 & 20.1 & 39.2 & 50.2 \\\\\n", + "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 & 9 (603) & 48.8 & 48.0 & 44.6 & 79.0 & 51.7 & 66.6 & 36.6 & 14.9 & 39.3 & 51.0 \\\\\n", + "sentence-transformers/all-mpnet-base-v2 & 10 (526) & 42.5 & 41.1 & 21.2 & 70.9 & 47.0 & 57.6 & 32.8 & 16.3 & 40.8 & 42.2 \\\\\n", + "sentence-transformers/all-MiniLM-L12-v2 & 11 (490) & 42.2 & 40.9 & 22.9 & 71.7 & 46.8 & 57.2 & 32.5 & 14.6 & 36.8 & 44.3 \\\\\n", + "sentence-transformers/all-MiniLM-L6-v2 & 12 (418) & 41.4 & 39.9 & 20.1 & 71.2 & 46.2 & 56.1 & 32.5 & 15.1 & 38.0 & 40.3 \\\\\n", + "\\bottomrule\n", + "\\end{tabular}\n", + "\n" + ] + } + ], + "source": [ + "print(latex_df.to_latex(index=False, float_format=\"%.1f\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.20" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 26708c564b3f173d2f4fc532a33134742723f4e5 Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 14 Feb 2025 15:12:53 +0000 Subject: [PATCH 020/233] 1.34.14 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f7ea7c6103..277667e73a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.34.13" +version = "1.34.14" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 5f4b593416ca8615f128a04844b50eb2dd4c09b9 Mon Sep 17 00:00:00 2001 From: Mehrzad Shahin-Moghadam <42153677+mehrzadshm@users.noreply.github.com> Date: Sat, 15 Feb 2025 16:11:11 -0500 
Subject: [PATCH 021/233] Add datasets for a benchmark newly introduced for "Engineering" domain (#1911) * adding clustering tasks (built-bench-clustering S2S & P2P) * updated built-bench-clustering tasks * Updated BuiltBenchClustering tasks * Added "Engineering" as new domain to TaskMetadata.py * Updated tasks table in docs * Updated task metadata for BuiltBenchClustering S2S and P2P * updated metadata for clustering tasks * Add/update BuiltBench tasks - Add BuiltBenchRetrieval task - Add BuiltBenchReranking task - Update metadata for BuiltBenchClusterinP2P - Update metadata for BuiltBenchClusterinS2S * update BuiltBench benchmark * Update mteb/benchmarks/benchmarks.py Co-authored-by: Roman Solomatin * Update mteb/tasks/Clustering/eng/BuiltBenchClusteringS2S.py Co-authored-by: Roman Solomatin * Update mteb/tasks/Clustering/eng/BuiltBenchClusteringP2P.py Co-authored-by: Roman Solomatin * Update mteb/benchmarks/benchmarks.py Co-authored-by: Isaac Chung * Fix formatting via ruff --------- Co-authored-by: Roman Solomatin Co-authored-by: Isaac Chung --- mteb/abstasks/TaskMetadata.py | 2 + mteb/benchmarks/benchmarks.py | 21 ++++++++++ mteb/tasks/Clustering/__init__.py | 2 + .../Clustering/eng/BuiltBenchClusteringP2P.py | 36 +++++++++++++++++ .../Clustering/eng/BuiltBenchClusteringS2S.py | 36 +++++++++++++++++ mteb/tasks/Reranking/__init__.py | 1 + .../Reranking/eng/BuiltBenchReranking.py | 39 +++++++++++++++++++ mteb/tasks/Retrieval/__init__.py | 1 + .../Retrieval/eng/BuiltBenchRetrieval.py | 39 +++++++++++++++++++ 9 files changed, 177 insertions(+) create mode 100644 mteb/tasks/Clustering/eng/BuiltBenchClusteringP2P.py create mode 100644 mteb/tasks/Clustering/eng/BuiltBenchClusteringS2S.py create mode 100644 mteb/tasks/Reranking/eng/BuiltBenchReranking.py create mode 100644 mteb/tasks/Retrieval/eng/BuiltBenchRetrieval.py diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index db28dc1915..1f6971d0e5 100644 --- a/mteb/abstasks/TaskMetadata.py +++ 
b/mteb/abstasks/TaskMetadata.py @@ -66,6 +66,7 @@ "Blog", "Constructed", "Encyclopaedic", + "Engineering", "Fiction", "Government", "Legal", @@ -199,6 +200,7 @@ "cc-by-nc-sa-3.0", "cc-by-nc-sa-4.0", "cc-by-nc-nd-4.0", + "cc-by-nd-4.0", "openrail", "openrail++", "odc-by", diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index e254dcff22..6c9dbaafcb 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -1395,3 +1395,24 @@ url={https://arxiv.org/abs/2412.08329}, }""", ) + +BUILT_MTEB = Benchmark( + name="BuiltBench(eng)", + tasks=get_tasks( + tasks=[ + "BuiltBenchClusteringP2P", + "BuiltBenchClusteringS2S", + "BuiltBenchRetrieval", + "BuiltBenchReranking", + ], + ), + description='"Built-Bench" is an ongoing effort aimed at evaluating text embedding models in the context of built asset management, spanning over various disciplines such as architecture, engineering, construction, and operations management of the built environment.', + reference="https://arxiv.org/abs/2411.12056", + citation="""@article{shahinmoghadam2024benchmarking, + title={Benchmarking pre-trained text embedding models in aligning built asset information}, + author={Shahinmoghadam, Mehrzad and Motamedi, Ali}, + journal={arXiv preprint arXiv:2411.12056}, + year={2024} +}""", + contacts=["mehrzadshm"], +) diff --git a/mteb/tasks/Clustering/__init__.py b/mteb/tasks/Clustering/__init__.py index 65d8b01246..c70d722011 100644 --- a/mteb/tasks/Clustering/__init__.py +++ b/mteb/tasks/Clustering/__init__.py @@ -10,6 +10,8 @@ from .eng.BigPatentClustering import * from .eng.BiorxivClusteringP2P import * from .eng.BiorxivClusteringS2S import * +from .eng.BuiltBenchClusteringP2P import * +from .eng.BuiltBenchClusteringS2S import * from .eng.MedrxivClusteringP2P import * from .eng.MedrxivClusteringS2S import * from .eng.RedditClustering import * diff --git a/mteb/tasks/Clustering/eng/BuiltBenchClusteringP2P.py
b/mteb/tasks/Clustering/eng/BuiltBenchClusteringP2P.py new file mode 100644 index 0000000000..a7739a11da --- /dev/null +++ b/mteb/tasks/Clustering/eng/BuiltBenchClusteringP2P.py @@ -0,0 +1,36 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClustering import AbsTaskClustering +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class BuiltBenchClusteringP2P(AbsTaskClustering): + metadata = TaskMetadata( + name="BuiltBenchClusteringP2P", + description="Clustering of built asset item descriptions based on categories identified within industry classification systems such as IFC, Uniclass, etc.", + reference="https://arxiv.org/abs/2411.12056", + dataset={ + "path": "mehrzad-shahin/BuiltBench-clustering-p2p", + "revision": "919bb71053e9de62a68998161ce4f0cee8f786fb", + }, + type="Clustering", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="v_measure", + date=("2024-06-01", "2024-11-30"), + domains=["Engineering", "Written"], + task_subtypes=["Thematic clustering"], + license="cc-by-nd-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation="""@article{shahinmoghadam2024benchmarking, + title={Benchmarking pre-trained text embedding models in aligning built asset information}, + author={Shahinmoghadam, Mehrzad and Motamedi, Ali}, + journal={arXiv preprint arXiv:2411.12056}, + year={2024} +}""", + prompt="Identify the category of the built asset entities based on the entity description", + ) diff --git a/mteb/tasks/Clustering/eng/BuiltBenchClusteringS2S.py b/mteb/tasks/Clustering/eng/BuiltBenchClusteringS2S.py new file mode 100644 index 0000000000..58b53a476d --- /dev/null +++ b/mteb/tasks/Clustering/eng/BuiltBenchClusteringS2S.py @@ -0,0 +1,36 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClustering import AbsTaskClustering +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class 
BuiltBenchClusteringS2S(AbsTaskClustering): + metadata = TaskMetadata( + name="BuiltBenchClusteringS2S", + description="Clustering of built asset names/titles based on categories identified within industry classification systems such as IFC, Uniclass, etc.", + reference="https://arxiv.org/abs/2411.12056", + dataset={ + "path": "mehrzad-shahin/BuiltBench-clustering-s2s", + "revision": "1aaeb2ece89ea0a8c64e215c95c4cfaf7e891149", + }, + type="Clustering", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="v_measure", + date=("2024-06-01", "2024-11-30"), + domains=["Engineering", "Written"], + task_subtypes=["Thematic clustering"], + license="cc-by-nd-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation="""@article{shahinmoghadam2024benchmarking, + title={Benchmarking pre-trained text embedding models in aligning built asset information}, + author={Shahinmoghadam, Mehrzad and Motamedi, Ali}, + journal={arXiv preprint arXiv:2411.12056}, + year={2024} +}""", + prompt="Identify the category of the built asset entities based on the names or titles", + ) diff --git a/mteb/tasks/Reranking/__init__.py b/mteb/tasks/Reranking/__init__.py index 2c3a27919a..4fa8ed73cf 100644 --- a/mteb/tasks/Reranking/__init__.py +++ b/mteb/tasks/Reranking/__init__.py @@ -2,6 +2,7 @@ from .ara.NamaaMrTydiReranking import * from .eng.AskUbuntuDupQuestions import * +from .eng.BuiltBenchReranking import * from .eng.MindSmallReranking import * from .eng.SciDocsReranking import * from .eng.StackOverflowDupQuestions import * diff --git a/mteb/tasks/Reranking/eng/BuiltBenchReranking.py b/mteb/tasks/Reranking/eng/BuiltBenchReranking.py new file mode 100644 index 0000000000..890978fbf9 --- /dev/null +++ b/mteb/tasks/Reranking/eng/BuiltBenchReranking.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskReranking import 
AbsTaskReranking + + +class BuiltBenchReranking(AbsTaskReranking): + metadata = TaskMetadata( + name="BuiltBenchReranking", + description="Reranking of built asset entity type/class descriptions given a query describing an entity as represented in well-established industry classification systems such as Uniclass, IFC, etc.", + reference="https://arxiv.org/abs/2411.12056", + dataset={ + "path": "mehrzad-shahin/BuiltBench-reranking", + "revision": "fd33b0b3454deb256be06a57e8147b32ba078ff9", + }, + type="Reranking", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="map", + date=("2024-06-01", "2024-11-30"), + domains=["Engineering", "Written"], + task_subtypes=[], + license="cc-by-nd-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation="""@article{shahinmoghadam2024benchmarking, + title={Benchmarking pre-trained text embedding models in aligning built asset information}, + author={Shahinmoghadam, Mehrzad and Motamedi, Ali}, + journal={arXiv preprint arXiv:2411.12056}, + year={2024} +}""", + prompt={ + "query": "Given a query, retrieve relevant entity descriptions from buit asset classification systems such as IFC and Uniclass" + }, + ) diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index bc4b20b248..96cd664696 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -30,6 +30,7 @@ from .eng.ARCChallengeRetrieval import * from .eng.ArguAnaRetrieval import * from .eng.BrightRetrieval import * +from .eng.BuiltBenchRetrieval import * from .eng.ChemHotpotQARetrieval import * from .eng.ChemNQRetrieval import * from .eng.ClimateFEVERRetrieval import * diff --git a/mteb/tasks/Retrieval/eng/BuiltBenchRetrieval.py b/mteb/tasks/Retrieval/eng/BuiltBenchRetrieval.py new file mode 100644 index 0000000000..5d36f219a7 --- /dev/null +++ b/mteb/tasks/Retrieval/eng/BuiltBenchRetrieval.py @@ -0,0 +1,39 @@ +from __future__ 
import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class BuiltBenchRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="BuiltBenchRetrieval", + description="Retrieval of built asset entity type/class descriptions given a query describing an entity as represented in well-established industry classification systems such as Uniclass, IFC, etc.", + reference="https://arxiv.org/abs/2411.12056", + dataset={ + "path": "mehrzad-shahin/BuiltBench-retrieval", + "revision": "ae611238a58dae85f3130563fe9f9e995444a8d6", + }, + type="Retrieval", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2024-06-01", "2024-11-30"), + domains=["Engineering", "Written"], + task_subtypes=["Question answering"], + license="cc-by-nd-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation="""@article{shahinmoghadam2024benchmarking, + title={Benchmarking pre-trained text embedding models in aligning built asset information}, + author={Shahinmoghadam, Mehrzad and Motamedi, Ali}, + journal={arXiv preprint arXiv:2411.12056}, + year={2024} +}""", + prompt={ + "query": "Given a query, retrieve relevant entity descriptions from buit asset classification systems such as IFC and Uniclass" + }, + ) From dbda3c59024c4042c2d3e37c61a5e2ee16e4e8ed Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 15 Feb 2025 21:13:25 +0000 Subject: [PATCH 022/233] Update tasks table --- docs/tasks.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/tasks.md b/docs/tasks.md index c92e45c19f..1ac3e5b666 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -68,6 +68,10 @@ The following tables give you an overview of the tasks in MTEB. 
| [BornholmBitextMining](https://aclanthology.org/W19-6138/) | ['dan'] | BitextMining | s2s | [Fiction, Social, Web, Written] | {'test': 500} | {'test': {'num_samples': 500, 'number_of_characters': 44361, 'unique_pairs': 500, 'min_sentence1_length': 1, 'average_sentence1_length': 49.83, 'max_sentence1_length': 555, 'unique_sentence1': 497, 'min_sentence2_length': 5, 'average_sentence2_length': 38.89, 'max_sentence2_length': 453, 'unique_sentence2': 491}} | | [BrazilianToxicTweetsClassification](https://paperswithcode.com/dataset/told-br) (Joao Augusto Leite and Diego F. Silva and Kalina Bontcheva and Carolina Scarton, 2020) | ['por'] | MultilabelClassification | s2s | [Constructed, Written] | None | None | | [BrightRetrieval](https://huggingface.co/datasets/xlangai/BRIGHT) (Hongjin Su, 2024) | ['eng'] | Retrieval | s2p | [Non-fiction, Written] | None | None | +| [BuiltBenchClusteringP2P](https://arxiv.org/abs/2411.12056) (Shahinmoghadam et al., 2024) | ['eng'] | Clustering | p2p | [Engineering, Written] | None | None | +| [BuiltBenchClusteringS2S](https://arxiv.org/abs/2411.12056) (Shahinmoghadam et al., 2024) | ['eng'] | Clustering | s2s | [Engineering, Written] | None | None | +| [BuiltBenchReranking](https://arxiv.org/abs/2411.12056) (Shahinmoghadam et al., 2024) | ['eng'] | Reranking | p2p | [Engineering, Written] | None | None | +| [BuiltBenchRetrieval](https://arxiv.org/abs/2411.12056) (Shahinmoghadam et al., 2024) | ['eng'] | Retrieval | p2p | [Engineering, Written] | None | None | | [BulgarianStoreReviewSentimentClassfication](https://doi.org/10.7910/DVN/TXIK9P) (Georgieva-Trifonova et al., 2018) | ['bul'] | Classification | s2s | [Reviews, Written] | None | None | | [CBD](http://2019.poleval.pl/files/poleval2019.pdf) | ['pol'] | Classification | s2s | [Social, Written] | None | None | | [CDSC-E](https://aclanthology.org/P17-1073.pdf) | ['pol'] | PairClassification | s2s | [Written] | None | None | @@ -1175,7 +1179,7 @@ The following tables give you an 
overview of the tasks in MTEB. | ell | Modern Greek (1453-) | Indo-European | 0 | 2 | 0 | 3 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 18 | | emi | Mussau-Emira | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | emp | Northern Emberá | Chocoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eng | English | Indo-European | 9 | 62 | 4 | 17 | 160 | 18 | 21 | 5 | 1 | 6 | 3 | 1 | 13 | 8 | 109 | 13 | 2 | 1 | 7 | 24 | 484 | +| eng | English | Indo-European | 9 | 62 | 4 | 17 | 160 | 20 | 21 | 5 | 1 | 6 | 3 | 1 | 13 | 9 | 110 | 13 | 2 | 1 | 7 | 24 | 488 | | enq | Enga | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | epo | Esperanto | Artificial Language | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | | eri | Ogea | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1948,7 +1952,7 @@ The following tables give you an overview of the tasks in MTEB. 
| zty | Yatee Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zul | Zulu | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 7 | | zyp | Zyphe Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| Total | None | None | None | 9 | 114 | 4 | 1398 | 836 | 311 | 21 | 5 | 1 | 6 | 3 | 28 | 91 | 55 | 535 | 88 | 2 | 2 | 24 | 24 | +| Total | None | None | None | 9 | 114 | 4 | 1398 | 836 | 313 | 21 | 5 | 1 | 6 | 3 | 28 | 91 | 56 | 536 | 88 | 2 | 2 | 24 | 24 | From 50cc1c995765dacc820d19f632bdea2f5fbef260 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Sun, 16 Feb 2025 23:03:53 +0300 Subject: [PATCH 023/233] misc: update model names to adjust for adding to results repo (#2074) * update model names to adjust for adding to results repo * update model meta script --- mteb/models/evaclip_models.py | 8 +++--- mteb/models/lens_models.py | 1 - mteb/models/voyage_v.py | 2 +- scripts/refill_mieb_model_meta.py | 42 +++++++++++++++++++++++++++++++ 4 files changed, 47 insertions(+), 6 deletions(-) create mode 100644 scripts/refill_mieb_model_meta.py diff --git a/mteb/models/evaclip_models.py b/mteb/models/evaclip_models.py index 545a06027f..0b9e0e19bc 100644 --- a/mteb/models/evaclip_models.py +++ b/mteb/models/evaclip_models.py @@ -179,7 +179,7 @@ def get_fused_embeddings( evaclip_loader, model_name="EVA02-CLIP-B-16", ), - name="EVA02-CLIP-B-16", + name="QuanSun/EVA02-CLIP-B-16", languages=["eng_Latn"], revision="11afd202f2ae80869d6cef18b1ec775e79bd8d12", release_date="2023-04-26", @@ -204,7 +204,7 @@ def get_fused_embeddings( evaclip_loader, model_name="EVA02-CLIP-L-14", ), - name="EVA02-CLIP-L-14", + name="QuanSun/EVA02-CLIP-L-14", languages=["eng_Latn"], revision="11afd202f2ae80869d6cef18b1ec775e79bd8d12", release_date="2023-04-26", @@ -229,7 +229,7 @@ def get_fused_embeddings( evaclip_loader, 
model_name="EVA02-CLIP-bigE-14", ), - name="EVA02-CLIP-bigE-14", + name="QuanSun/EVA02-CLIP-bigE-14", languages=["eng_Latn"], revision="11afd202f2ae80869d6cef18b1ec775e79bd8d12", release_date="2023-04-26", @@ -255,7 +255,7 @@ def get_fused_embeddings( evaclip_loader, model_name="EVA02-CLIP-bigE-14-plus", ), - name="EVA02-CLIP-bigE-14-plus", + name="QuanSun/EVA02-CLIP-bigE-14-plus", languages=["eng_Latn"], revision="11afd202f2ae80869d6cef18b1ec775e79bd8d12", release_date="2023-04-26", diff --git a/mteb/models/lens_models.py b/mteb/models/lens_models.py index 46bc25c3de..c83bf2a3d0 100644 --- a/mteb/models/lens_models.py +++ b/mteb/models/lens_models.py @@ -3,7 +3,6 @@ from mteb.model_meta import ModelMeta from mteb.models.bge_models import bge_full_data - lens_d4000 = ModelMeta( loader=None, # TODO: implement this in the future name="yibinlei/LENS-d4000", diff --git a/mteb/models/voyage_v.py b/mteb/models/voyage_v.py index 025165cd7d..d607d809f0 100644 --- a/mteb/models/voyage_v.py +++ b/mteb/models/voyage_v.py @@ -242,7 +242,7 @@ def get_fused_embeddings( voyage_v = ModelMeta( loader=partial(voyage_v_loader, model_name="voyage-multimodal-3"), - name="voyage-multimodal-3", + name="voyageai/voyage-multimodal-3", languages=[], # Unknown revision="1", release_date="2024-11-10", diff --git a/scripts/refill_mieb_model_meta.py b/scripts/refill_mieb_model_meta.py new file mode 100644 index 0000000000..017ee5da34 --- /dev/null +++ b/scripts/refill_mieb_model_meta.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +import json +from pathlib import Path + +import mteb + +path = Path("/home/tmp") +model_name_folders = [ + f + for f in path.iterdir() + if f.is_dir() and "linear" not in f.name and "." not in f.name +] +revision_folders = [ + sf + for folder in model_name_folders + for sf in folder.iterdir() + if sf.is_dir() and "." 
not in sf.name +] +model_names = [f.name for f in model_name_folders] +revisions = [f.name for f in revision_folders] + +models = [] +for m in model_names: + if "EVA" in m: + models.append(f"QuanSun/{m}") + elif "voyage" in m: + models.append(f"voyageai/{m}") + else: + models.append(m.replace("__", "/")) + + +base_results_path = Path("/home/results/results") +for m, r in zip(models, revisions): + print(m) + mm = mteb.get_model_meta(model_name=m, revision=r) + print(mm.to_dict()) + + target_path = base_results_path / m.replace("/", "__") / r / "model_meta.json" + + with open(target_path, "w") as f: + json.dump(mm.to_dict(), f) From 04c9993f3c5478f76253141ec1b0170522d821a2 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Mon, 17 Feb 2025 06:21:48 +0300 Subject: [PATCH 024/233] misc: Add all image classification descriptive stats (#2073) * add most image classification descr stats * revert changes to encoder * add stats --------- Co-authored-by: Roman Solomatin <36135455+Samoed@users.noreply.github.com> --- .../Image/ImageClassification/Birdsnap.json | 1 + .../Image/ImageClassification/CIFAR10.json | 44 ++ .../Image/ImageClassification/CIFAR100.json | 314 +++++++++ .../Image/ImageClassification/Caltech101.json | 320 +++++++++ .../Image/ImageClassification/Country211.json | 647 ++++++++++++++++++ .../Image/ImageClassification/DTD.json | 155 +++++ .../Image/ImageClassification/EuroSAT.json | 44 ++ .../Image/ImageClassification/FER2013.json | 35 + .../ImageClassification/FGVCAircraft.json | 314 +++++++++ .../Food101Classification.json | 317 +++++++++ .../Image/ImageClassification/GTSRB.json | 143 ++++ .../OxfordFlowersClassification.json | 320 +++++++++ .../Image/ImageClassification/OxfordPets.json | 125 ++++ .../ImageClassification/PatchCamelyon.json | 20 + .../Image/ImageClassification/RESISC45.json | 149 ++++ .../Image/ImageClassification/STL10.json | 44 ++ .../Image/ImageClassification/SUN397.json | 1 + .../ImageClassification/StanfordCars.json | 602 ++++++++++++++++ 
.../Image/ImageClassification/UCF101.json | 1 + 19 files changed, 3596 insertions(+) create mode 100644 mteb/descriptive_stats/Image/ImageClassification/Birdsnap.json create mode 100644 mteb/descriptive_stats/Image/ImageClassification/CIFAR10.json create mode 100644 mteb/descriptive_stats/Image/ImageClassification/CIFAR100.json create mode 100644 mteb/descriptive_stats/Image/ImageClassification/Caltech101.json create mode 100644 mteb/descriptive_stats/Image/ImageClassification/Country211.json create mode 100644 mteb/descriptive_stats/Image/ImageClassification/DTD.json create mode 100644 mteb/descriptive_stats/Image/ImageClassification/EuroSAT.json create mode 100644 mteb/descriptive_stats/Image/ImageClassification/FER2013.json create mode 100644 mteb/descriptive_stats/Image/ImageClassification/FGVCAircraft.json create mode 100644 mteb/descriptive_stats/Image/ImageClassification/Food101Classification.json create mode 100644 mteb/descriptive_stats/Image/ImageClassification/GTSRB.json create mode 100644 mteb/descriptive_stats/Image/ImageClassification/OxfordFlowersClassification.json create mode 100644 mteb/descriptive_stats/Image/ImageClassification/OxfordPets.json create mode 100644 mteb/descriptive_stats/Image/ImageClassification/PatchCamelyon.json create mode 100644 mteb/descriptive_stats/Image/ImageClassification/RESISC45.json create mode 100644 mteb/descriptive_stats/Image/ImageClassification/STL10.json create mode 100644 mteb/descriptive_stats/Image/ImageClassification/SUN397.json create mode 100644 mteb/descriptive_stats/Image/ImageClassification/StanfordCars.json create mode 100644 mteb/descriptive_stats/Image/ImageClassification/UCF101.json diff --git a/mteb/descriptive_stats/Image/ImageClassification/Birdsnap.json b/mteb/descriptive_stats/Image/ImageClassification/Birdsnap.json new file mode 100644 index 0000000000..83d7d504c2 --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClassification/Birdsnap.json @@ -0,0 +1 @@ +{"test": {"num_samples": 1851, 
"unique_num_labels": 490, "min_image_width": 267, "average_image_width": 2081.5569962182603, "max_image_width": 6400, "min_image_height": 200, "average_image_height": 1609.192868719611, "max_image_height": 5400, "labels": {"0": {"count": 4}, "1": {"count": 5}, "2": {"count": 4}, "3": {"count": 4}, "4": {"count": 4}, "5": {"count": 2}, "6": {"count": 3}, "7": {"count": 5}, "8": {"count": 4}, "9": {"count": 5}, "11": {"count": 3}, "12": {"count": 4}, "13": {"count": 5}, "14": {"count": 4}, "15": {"count": 5}, "16": {"count": 4}, "17": {"count": 3}, "18": {"count": 2}, "19": {"count": 5}, "20": {"count": 4}, "21": {"count": 4}, "22": {"count": 5}, "23": {"count": 2}, "24": {"count": 4}, "25": {"count": 3}, "26": {"count": 4}, "27": {"count": 4}, "28": {"count": 2}, "29": {"count": 5}, "30": {"count": 3}, "31": {"count": 3}, "32": {"count": 3}, "33": {"count": 4}, "34": {"count": 4}, "35": {"count": 4}, "36": {"count": 3}, "37": {"count": 3}, "38": {"count": 4}, "39": {"count": 3}, "40": {"count": 4}, "41": {"count": 3}, "42": {"count": 3}, "43": {"count": 4}, "44": {"count": 2}, "45": {"count": 3}, "47": {"count": 5}, "48": {"count": 2}, "49": {"count": 5}, "50": {"count": 4}, "51": {"count": 5}, "52": {"count": 3}, "53": {"count": 3}, "54": {"count": 4}, "55": {"count": 2}, "56": {"count": 2}, "57": {"count": 5}, "58": {"count": 2}, "59": {"count": 1}, "60": {"count": 1}, "61": {"count": 3}, "62": {"count": 3}, "63": {"count": 5}, "64": {"count": 5}, "65": {"count": 4}, "67": {"count": 2}, "68": {"count": 3}, "69": {"count": 4}, "70": {"count": 5}, "71": {"count": 5}, "72": {"count": 5}, "73": {"count": 4}, "74": {"count": 5}, "75": {"count": 4}, "76": {"count": 4}, "80": {"count": 3}, "81": {"count": 5}, "82": {"count": 3}, "83": {"count": 5}, "84": {"count": 3}, "85": {"count": 4}, "86": {"count": 4}, "87": {"count": 5}, "88": {"count": 4}, "89": {"count": 5}, "90": {"count": 4}, "91": {"count": 4}, "92": {"count": 5}, "93": {"count": 4}, "94": {"count": 4}, "95": 
{"count": 5}, "96": {"count": 5}, "97": {"count": 5}, "98": {"count": 3}, "99": {"count": 5}, "100": {"count": 4}, "101": {"count": 5}, "102": {"count": 4}, "103": {"count": 3}, "105": {"count": 4}, "108": {"count": 4}, "109": {"count": 5}, "110": {"count": 3}, "111": {"count": 3}, "112": {"count": 4}, "113": {"count": 4}, "114": {"count": 5}, "115": {"count": 4}, "116": {"count": 5}, "117": {"count": 4}, "118": {"count": 4}, "119": {"count": 5}, "120": {"count": 5}, "121": {"count": 4}, "122": {"count": 3}, "124": {"count": 3}, "125": {"count": 4}, "126": {"count": 2}, "127": {"count": 3}, "128": {"count": 5}, "129": {"count": 5}, "130": {"count": 5}, "131": {"count": 3}, "132": {"count": 4}, "133": {"count": 4}, "134": {"count": 2}, "135": {"count": 5}, "136": {"count": 5}, "137": {"count": 3}, "138": {"count": 4}, "139": {"count": 3}, "140": {"count": 3}, "141": {"count": 2}, "142": {"count": 3}, "143": {"count": 5}, "144": {"count": 4}, "145": {"count": 5}, "146": {"count": 5}, "147": {"count": 5}, "148": {"count": 4}, "149": {"count": 4}, "150": {"count": 5}, "151": {"count": 5}, "152": {"count": 5}, "153": {"count": 3}, "154": {"count": 4}, "155": {"count": 3}, "156": {"count": 3}, "157": {"count": 3}, "159": {"count": 3}, "160": {"count": 4}, "161": {"count": 4}, "162": {"count": 4}, "163": {"count": 4}, "164": {"count": 3}, "165": {"count": 3}, "166": {"count": 3}, "167": {"count": 4}, "168": {"count": 4}, "169": {"count": 4}, "170": {"count": 4}, "171": {"count": 5}, "172": {"count": 4}, "173": {"count": 4}, "174": {"count": 5}, "175": {"count": 4}, "176": {"count": 2}, "177": {"count": 5}, "178": {"count": 5}, "179": {"count": 5}, "180": {"count": 5}, "181": {"count": 4}, "183": {"count": 2}, "184": {"count": 3}, "185": {"count": 2}, "186": {"count": 5}, "187": {"count": 2}, "188": {"count": 3}, "189": {"count": 2}, "190": {"count": 5}, "191": {"count": 4}, "192": {"count": 3}, "193": {"count": 3}, "194": {"count": 4}, "195": {"count": 3}, "196": 
{"count": 4}, "197": {"count": 3}, "198": {"count": 4}, "199": {"count": 5}, "200": {"count": 5}, "201": {"count": 1}, "204": {"count": 4}, "205": {"count": 5}, "206": {"count": 4}, "207": {"count": 3}, "208": {"count": 4}, "209": {"count": 4}, "210": {"count": 4}, "211": {"count": 4}, "212": {"count": 5}, "213": {"count": 4}, "214": {"count": 5}, "215": {"count": 3}, "216": {"count": 1}, "217": {"count": 5}, "218": {"count": 2}, "219": {"count": 5}, "220": {"count": 4}, "221": {"count": 5}, "222": {"count": 5}, "223": {"count": 3}, "224": {"count": 4}, "225": {"count": 5}, "226": {"count": 3}, "227": {"count": 4}, "228": {"count": 3}, "229": {"count": 4}, "230": {"count": 4}, "231": {"count": 5}, "232": {"count": 5}, "233": {"count": 5}, "234": {"count": 4}, "235": {"count": 4}, "236": {"count": 5}, "237": {"count": 5}, "238": {"count": 5}, "239": {"count": 4}, "240": {"count": 3}, "241": {"count": 3}, "242": {"count": 4}, "243": {"count": 5}, "244": {"count": 2}, "245": {"count": 4}, "246": {"count": 5}, "247": {"count": 3}, "248": {"count": 3}, "249": {"count": 5}, "250": {"count": 5}, "251": {"count": 4}, "252": {"count": 2}, "253": {"count": 5}, "254": {"count": 5}, "255": {"count": 5}, "256": {"count": 4}, "257": {"count": 4}, "258": {"count": 4}, "259": {"count": 3}, "260": {"count": 5}, "261": {"count": 4}, "262": {"count": 4}, "264": {"count": 4}, "265": {"count": 3}, "266": {"count": 5}, "267": {"count": 5}, "268": {"count": 3}, "269": {"count": 2}, "270": {"count": 3}, "271": {"count": 4}, "272": {"count": 4}, "273": {"count": 5}, "274": {"count": 5}, "275": {"count": 5}, "276": {"count": 2}, "277": {"count": 3}, "278": {"count": 5}, "279": {"count": 5}, "280": {"count": 4}, "281": {"count": 5}, "282": {"count": 5}, "283": {"count": 3}, "284": {"count": 5}, "285": {"count": 3}, "286": {"count": 5}, "287": {"count": 5}, "288": {"count": 4}, "289": {"count": 4}, "290": {"count": 5}, "291": {"count": 3}, "292": {"count": 2}, "293": {"count": 1}, "294": 
{"count": 1}, "295": {"count": 2}, "296": {"count": 4}, "297": {"count": 5}, "298": {"count": 4}, "300": {"count": 3}, "301": {"count": 3}, "303": {"count": 4}, "304": {"count": 4}, "305": {"count": 4}, "306": {"count": 2}, "307": {"count": 5}, "308": {"count": 4}, "309": {"count": 2}, "310": {"count": 3}, "311": {"count": 3}, "312": {"count": 4}, "313": {"count": 3}, "314": {"count": 3}, "315": {"count": 3}, "316": {"count": 5}, "317": {"count": 4}, "318": {"count": 5}, "319": {"count": 4}, "320": {"count": 4}, "321": {"count": 3}, "322": {"count": 5}, "323": {"count": 4}, "324": {"count": 2}, "325": {"count": 1}, "326": {"count": 3}, "327": {"count": 4}, "328": {"count": 3}, "330": {"count": 4}, "331": {"count": 4}, "332": {"count": 2}, "333": {"count": 5}, "334": {"count": 5}, "335": {"count": 5}, "336": {"count": 4}, "337": {"count": 4}, "338": {"count": 5}, "339": {"count": 3}, "340": {"count": 5}, "341": {"count": 5}, "342": {"count": 5}, "343": {"count": 2}, "344": {"count": 2}, "345": {"count": 3}, "346": {"count": 3}, "347": {"count": 5}, "348": {"count": 3}, "349": {"count": 2}, "350": {"count": 4}, "352": {"count": 5}, "353": {"count": 3}, "354": {"count": 5}, "355": {"count": 5}, "356": {"count": 4}, "357": {"count": 3}, "358": {"count": 3}, "359": {"count": 4}, "360": {"count": 5}, "361": {"count": 5}, "362": {"count": 4}, "363": {"count": 3}, "364": {"count": 4}, "365": {"count": 1}, "366": {"count": 4}, "367": {"count": 3}, "368": {"count": 4}, "369": {"count": 3}, "370": {"count": 5}, "371": {"count": 3}, "372": {"count": 5}, "373": {"count": 4}, "374": {"count": 4}, "375": {"count": 3}, "376": {"count": 4}, "377": {"count": 4}, "378": {"count": 4}, "379": {"count": 4}, "380": {"count": 4}, "381": {"count": 4}, "382": {"count": 1}, "383": {"count": 4}, "384": {"count": 4}, "385": {"count": 4}, "386": {"count": 2}, "387": {"count": 4}, "388": {"count": 2}, "389": {"count": 5}, "390": {"count": 4}, "391": {"count": 5}, "392": {"count": 4}, "394": 
{"count": 4}, "395": {"count": 4}, "396": {"count": 4}, "397": {"count": 4}, "398": {"count": 5}, "399": {"count": 4}, "400": {"count": 5}, "401": {"count": 4}, "402": {"count": 4}, "404": {"count": 5}, "405": {"count": 5}, "406": {"count": 5}, "407": {"count": 4}, "408": {"count": 2}, "409": {"count": 4}, "410": {"count": 3}, "411": {"count": 5}, "412": {"count": 4}, "413": {"count": 3}, "414": {"count": 4}, "415": {"count": 4}, "416": {"count": 4}, "417": {"count": 5}, "418": {"count": 3}, "419": {"count": 5}, "421": {"count": 4}, "422": {"count": 3}, "423": {"count": 5}, "424": {"count": 5}, "425": {"count": 2}, "426": {"count": 5}, "427": {"count": 4}, "428": {"count": 5}, "429": {"count": 3}, "430": {"count": 2}, "431": {"count": 3}, "432": {"count": 5}, "433": {"count": 4}, "434": {"count": 3}, "435": {"count": 3}, "437": {"count": 3}, "438": {"count": 5}, "439": {"count": 2}, "440": {"count": 4}, "441": {"count": 4}, "442": {"count": 5}, "443": {"count": 2}, "444": {"count": 3}, "445": {"count": 3}, "446": {"count": 5}, "447": {"count": 3}, "448": {"count": 2}, "449": {"count": 1}, "450": {"count": 3}, "451": {"count": 3}, "452": {"count": 4}, "453": {"count": 2}, "454": {"count": 4}, "455": {"count": 4}, "456": {"count": 5}, "458": {"count": 4}, "459": {"count": 4}, "460": {"count": 5}, "461": {"count": 4}, "462": {"count": 4}, "463": {"count": 5}, "464": {"count": 5}, "466": {"count": 2}, "467": {"count": 4}, "468": {"count": 3}, "469": {"count": 5}, "470": {"count": 5}, "471": {"count": 2}, "472": {"count": 4}, "473": {"count": 3}, "474": {"count": 5}, "475": {"count": 5}, "476": {"count": 5}, "477": {"count": 4}, "478": {"count": 2}, "479": {"count": 4}, "480": {"count": 4}, "481": {"count": 5}, "482": {"count": 4}, "483": {"count": 3}, "484": {"count": 5}, "485": {"count": 5}, "486": {"count": 4}, "487": {"count": 3}, "488": {"count": 3}, "489": {"count": 1}, "490": {"count": 1}, "491": {"count": 2}, "492": {"count": 4}, "493": {"count": 4}, "494": 
{"count": 3}, "495": {"count": 4}, "496": {"count": 5}, "497": {"count": 5}, "498": {"count": 5}, "499": {"count": 4}, "79": {"count": 4}, "106": {"count": 4}, "107": {"count": 4}, "202": {"count": 1}, "203": {"count": 1}, "457": {"count": 3}, "77": {"count": 2}, "78": {"count": 4}, "182": {"count": 2}, "263": {"count": 4}, "104": {"count": 1}, "158": {"count": 5}, "329": {"count": 1}, "393": {"count": 2}, "420": {"count": 2}}}} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClassification/CIFAR10.json b/mteb/descriptive_stats/Image/ImageClassification/CIFAR10.json new file mode 100644 index 0000000000..34ff70e050 --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClassification/CIFAR10.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 10000, + "unique_num_labels": 10, + "min_image_width": 32, + "average_image_width": 32.0, + "max_image_width": 32, + "min_image_height": 32, + "average_image_height": 32.0, + "max_image_height": 32, + "labels": { + "3": { + "count": 1000 + }, + "8": { + "count": 1000 + }, + "0": { + "count": 1000 + }, + "6": { + "count": 1000 + }, + "1": { + "count": 1000 + }, + "9": { + "count": 1000 + }, + "5": { + "count": 1000 + }, + "7": { + "count": 1000 + }, + "4": { + "count": 1000 + }, + "2": { + "count": 1000 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClassification/CIFAR100.json b/mteb/descriptive_stats/Image/ImageClassification/CIFAR100.json new file mode 100644 index 0000000000..e8a282bc67 --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClassification/CIFAR100.json @@ -0,0 +1,314 @@ +{ + "test": { + "num_samples": 10000, + "unique_num_labels": 100, + "min_image_width": 32, + "average_image_width": 32.0, + "max_image_width": 32, + "min_image_height": 32, + "average_image_height": 32.0, + "max_image_height": 32, + "labels": { + "49": { + "count": 100 + }, + "33": { + "count": 100 + }, + "72": { + "count": 100 + }, + "51": { + "count": 100 + }, + "71": { 
+ "count": 100 + }, + "92": { + "count": 100 + }, + "15": { + "count": 100 + }, + "14": { + "count": 100 + }, + "23": { + "count": 100 + }, + "0": { + "count": 100 + }, + "75": { + "count": 100 + }, + "81": { + "count": 100 + }, + "69": { + "count": 100 + }, + "40": { + "count": 100 + }, + "43": { + "count": 100 + }, + "97": { + "count": 100 + }, + "70": { + "count": 100 + }, + "53": { + "count": 100 + }, + "29": { + "count": 100 + }, + "21": { + "count": 100 + }, + "16": { + "count": 100 + }, + "39": { + "count": 100 + }, + "8": { + "count": 100 + }, + "20": { + "count": 100 + }, + "61": { + "count": 100 + }, + "41": { + "count": 100 + }, + "93": { + "count": 100 + }, + "56": { + "count": 100 + }, + "73": { + "count": 100 + }, + "58": { + "count": 100 + }, + "11": { + "count": 100 + }, + "25": { + "count": 100 + }, + "37": { + "count": 100 + }, + "63": { + "count": 100 + }, + "24": { + "count": 100 + }, + "22": { + "count": 100 + }, + "17": { + "count": 100 + }, + "4": { + "count": 100 + }, + "6": { + "count": 100 + }, + "9": { + "count": 100 + }, + "57": { + "count": 100 + }, + "2": { + "count": 100 + }, + "32": { + "count": 100 + }, + "52": { + "count": 100 + }, + "42": { + "count": 100 + }, + "77": { + "count": 100 + }, + "27": { + "count": 100 + }, + "65": { + "count": 100 + }, + "7": { + "count": 100 + }, + "35": { + "count": 100 + }, + "82": { + "count": 100 + }, + "66": { + "count": 100 + }, + "90": { + "count": 100 + }, + "67": { + "count": 100 + }, + "91": { + "count": 100 + }, + "10": { + "count": 100 + }, + "78": { + "count": 100 + }, + "54": { + "count": 100 + }, + "89": { + "count": 100 + }, + "18": { + "count": 100 + }, + "13": { + "count": 100 + }, + "50": { + "count": 100 + }, + "26": { + "count": 100 + }, + "83": { + "count": 100 + }, + "47": { + "count": 100 + }, + "95": { + "count": 100 + }, + "76": { + "count": 100 + }, + "59": { + "count": 100 + }, + "85": { + "count": 100 + }, + "19": { + "count": 100 + }, + "46": { + "count": 100 + }, + "1": 
{ + "count": 100 + }, + "74": { + "count": 100 + }, + "60": { + "count": 100 + }, + "64": { + "count": 100 + }, + "45": { + "count": 100 + }, + "36": { + "count": 100 + }, + "87": { + "count": 100 + }, + "30": { + "count": 100 + }, + "99": { + "count": 100 + }, + "80": { + "count": 100 + }, + "28": { + "count": 100 + }, + "98": { + "count": 100 + }, + "12": { + "count": 100 + }, + "94": { + "count": 100 + }, + "68": { + "count": 100 + }, + "44": { + "count": 100 + }, + "31": { + "count": 100 + }, + "79": { + "count": 100 + }, + "34": { + "count": 100 + }, + "55": { + "count": 100 + }, + "62": { + "count": 100 + }, + "96": { + "count": 100 + }, + "84": { + "count": 100 + }, + "38": { + "count": 100 + }, + "86": { + "count": 100 + }, + "5": { + "count": 100 + }, + "48": { + "count": 100 + }, + "3": { + "count": 100 + }, + "88": { + "count": 100 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClassification/Caltech101.json b/mteb/descriptive_stats/Image/ImageClassification/Caltech101.json new file mode 100644 index 0000000000..3d94eab4ca --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClassification/Caltech101.json @@ -0,0 +1,320 @@ +{ + "test": { + "num_samples": 6084, + "unique_num_labels": 102, + "min_image_width": 80, + "average_image_width": 311.7217291255753, + "max_image_width": 3481, + "min_image_height": 101, + "average_image_height": 241.84418145956607, + "max_image_height": 3999, + "labels": { + "4": { + "count": 437 + }, + "37": { + "count": 405 + }, + "38": { + "count": 405 + }, + "57": { + "count": 170 + }, + "66": { + "count": 768 + }, + "0": { + "count": 25 + }, + "1": { + "count": 770 + }, + "2": { + "count": 12 + }, + "3": { + "count": 12 + }, + "5": { + "count": 17 + }, + "6": { + "count": 24 + }, + "7": { + "count": 16 + }, + "8": { + "count": 3 + }, + "9": { + "count": 98 + }, + "10": { + "count": 68 + }, + "11": { + "count": 13 + }, + "12": { + "count": 55 + }, + "13": { + "count": 61 + }, + "14": { 
+ "count": 20 + }, + "15": { + "count": 13 + }, + "16": { + "count": 93 + }, + "17": { + "count": 17 + }, + "18": { + "count": 29 + }, + "19": { + "count": 32 + }, + "20": { + "count": 77 + }, + "22": { + "count": 39 + }, + "23": { + "count": 43 + }, + "24": { + "count": 40 + }, + "25": { + "count": 20 + }, + "26": { + "count": 21 + }, + "27": { + "count": 27 + }, + "28": { + "count": 37 + }, + "29": { + "count": 22 + }, + "30": { + "count": 35 + }, + "31": { + "count": 38 + }, + "32": { + "count": 45 + }, + "33": { + "count": 34 + }, + "34": { + "count": 23 + }, + "35": { + "count": 34 + }, + "36": { + "count": 55 + }, + "39": { + "count": 37 + }, + "40": { + "count": 37 + }, + "41": { + "count": 15 + }, + "42": { + "count": 4 + }, + "43": { + "count": 4 + }, + "44": { + "count": 21 + }, + "45": { + "count": 69 + }, + "46": { + "count": 70 + }, + "47": { + "count": 12 + }, + "48": { + "count": 24 + }, + "49": { + "count": 58 + }, + "50": { + "count": 50 + }, + "51": { + "count": 1 + }, + "52": { + "count": 34 + }, + "53": { + "count": 56 + }, + "54": { + "count": 84 + }, + "55": { + "count": 31 + }, + "56": { + "count": 51 + }, + "58": { + "count": 48 + }, + "59": { + "count": 11 + }, + "60": { + "count": 36 + }, + "61": { + "count": 13 + }, + "62": { + "count": 10 + }, + "63": { + "count": 57 + }, + "64": { + "count": 2 + }, + "65": { + "count": 46 + }, + "67": { + "count": 25 + }, + "68": { + "count": 5 + }, + "69": { + "count": 9 + }, + "70": { + "count": 17 + }, + "71": { + "count": 8 + }, + "72": { + "count": 15 + }, + "73": { + "count": 23 + }, + "74": { + "count": 4 + }, + "75": { + "count": 27 + }, + "76": { + "count": 52 + }, + "77": { + "count": 29 + }, + "78": { + "count": 19 + }, + "79": { + "count": 10 + }, + "80": { + "count": 33 + }, + "81": { + "count": 9 + }, + "82": { + "count": 54 + }, + "83": { + "count": 27 + }, + "84": { + "count": 5 + }, + "85": { + "count": 34 + }, + "86": { + "count": 15 + }, + "87": { + "count": 56 + }, + "88": { + 
"count": 29 + }, + "89": { + "count": 34 + }, + "90": { + "count": 5 + }, + "91": { + "count": 55 + }, + "92": { + "count": 19 + }, + "93": { + "count": 56 + }, + "94": { + "count": 45 + }, + "95": { + "count": 209 + }, + "96": { + "count": 7 + }, + "97": { + "count": 29 + }, + "98": { + "count": 4 + }, + "99": { + "count": 26 + }, + "100": { + "count": 9 + }, + "101": { + "count": 30 + }, + "21": { + "count": 17 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClassification/Country211.json b/mteb/descriptive_stats/Image/ImageClassification/Country211.json new file mode 100644 index 0000000000..647aefea10 --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClassification/Country211.json @@ -0,0 +1,647 @@ +{ + "test": { + "num_samples": 21100, + "unique_num_labels": 211, + "min_image_width": 32, + "average_image_width": 468.5923222748815, + "max_image_width": 500, + "min_image_height": 37, + "average_image_height": 381.726682464455, + "max_image_height": 500, + "labels": { + "0": { + "count": 100 + }, + "1": { + "count": 100 + }, + "2": { + "count": 100 + }, + "3": { + "count": 100 + }, + "4": { + "count": 100 + }, + "5": { + "count": 100 + }, + "6": { + "count": 100 + }, + "7": { + "count": 100 + }, + "8": { + "count": 100 + }, + "9": { + "count": 100 + }, + "10": { + "count": 100 + }, + "11": { + "count": 100 + }, + "12": { + "count": 100 + }, + "13": { + "count": 100 + }, + "14": { + "count": 100 + }, + "15": { + "count": 100 + }, + "16": { + "count": 100 + }, + "17": { + "count": 100 + }, + "18": { + "count": 100 + }, + "19": { + "count": 100 + }, + "20": { + "count": 100 + }, + "21": { + "count": 100 + }, + "22": { + "count": 100 + }, + "23": { + "count": 100 + }, + "24": { + "count": 100 + }, + "25": { + "count": 100 + }, + "26": { + "count": 100 + }, + "27": { + "count": 100 + }, + "28": { + "count": 100 + }, + "29": { + "count": 100 + }, + "30": { + "count": 100 + }, + "31": { + "count": 100 + }, + "32": { + 
"count": 100 + }, + "33": { + "count": 100 + }, + "34": { + "count": 100 + }, + "35": { + "count": 100 + }, + "36": { + "count": 100 + }, + "37": { + "count": 100 + }, + "38": { + "count": 100 + }, + "39": { + "count": 100 + }, + "40": { + "count": 100 + }, + "41": { + "count": 100 + }, + "42": { + "count": 100 + }, + "43": { + "count": 100 + }, + "44": { + "count": 100 + }, + "45": { + "count": 100 + }, + "46": { + "count": 100 + }, + "47": { + "count": 100 + }, + "48": { + "count": 100 + }, + "49": { + "count": 100 + }, + "50": { + "count": 100 + }, + "51": { + "count": 100 + }, + "52": { + "count": 100 + }, + "53": { + "count": 100 + }, + "54": { + "count": 100 + }, + "55": { + "count": 100 + }, + "56": { + "count": 100 + }, + "57": { + "count": 100 + }, + "58": { + "count": 100 + }, + "59": { + "count": 100 + }, + "60": { + "count": 100 + }, + "61": { + "count": 100 + }, + "62": { + "count": 100 + }, + "63": { + "count": 100 + }, + "64": { + "count": 100 + }, + "65": { + "count": 100 + }, + "66": { + "count": 100 + }, + "67": { + "count": 100 + }, + "68": { + "count": 100 + }, + "69": { + "count": 100 + }, + "70": { + "count": 100 + }, + "71": { + "count": 100 + }, + "72": { + "count": 100 + }, + "73": { + "count": 100 + }, + "74": { + "count": 100 + }, + "75": { + "count": 100 + }, + "76": { + "count": 100 + }, + "77": { + "count": 100 + }, + "78": { + "count": 100 + }, + "79": { + "count": 100 + }, + "80": { + "count": 100 + }, + "81": { + "count": 100 + }, + "82": { + "count": 100 + }, + "83": { + "count": 100 + }, + "84": { + "count": 100 + }, + "85": { + "count": 100 + }, + "86": { + "count": 100 + }, + "87": { + "count": 100 + }, + "88": { + "count": 100 + }, + "89": { + "count": 100 + }, + "90": { + "count": 100 + }, + "91": { + "count": 100 + }, + "92": { + "count": 100 + }, + "93": { + "count": 100 + }, + "94": { + "count": 100 + }, + "95": { + "count": 100 + }, + "96": { + "count": 100 + }, + "97": { + "count": 100 + }, + "98": { + "count": 100 + }, + 
"99": { + "count": 100 + }, + "100": { + "count": 100 + }, + "101": { + "count": 100 + }, + "102": { + "count": 100 + }, + "103": { + "count": 100 + }, + "104": { + "count": 100 + }, + "105": { + "count": 100 + }, + "106": { + "count": 100 + }, + "107": { + "count": 100 + }, + "108": { + "count": 100 + }, + "109": { + "count": 100 + }, + "110": { + "count": 100 + }, + "111": { + "count": 100 + }, + "112": { + "count": 100 + }, + "113": { + "count": 100 + }, + "114": { + "count": 100 + }, + "115": { + "count": 100 + }, + "116": { + "count": 100 + }, + "117": { + "count": 100 + }, + "118": { + "count": 100 + }, + "119": { + "count": 100 + }, + "120": { + "count": 100 + }, + "121": { + "count": 100 + }, + "122": { + "count": 100 + }, + "123": { + "count": 100 + }, + "124": { + "count": 100 + }, + "125": { + "count": 100 + }, + "126": { + "count": 100 + }, + "127": { + "count": 100 + }, + "128": { + "count": 100 + }, + "129": { + "count": 100 + }, + "130": { + "count": 100 + }, + "131": { + "count": 100 + }, + "132": { + "count": 100 + }, + "133": { + "count": 100 + }, + "134": { + "count": 100 + }, + "135": { + "count": 100 + }, + "136": { + "count": 100 + }, + "137": { + "count": 100 + }, + "138": { + "count": 100 + }, + "139": { + "count": 100 + }, + "140": { + "count": 100 + }, + "141": { + "count": 100 + }, + "142": { + "count": 100 + }, + "143": { + "count": 100 + }, + "144": { + "count": 100 + }, + "145": { + "count": 100 + }, + "146": { + "count": 100 + }, + "147": { + "count": 100 + }, + "148": { + "count": 100 + }, + "149": { + "count": 100 + }, + "150": { + "count": 100 + }, + "151": { + "count": 100 + }, + "152": { + "count": 100 + }, + "153": { + "count": 100 + }, + "154": { + "count": 100 + }, + "155": { + "count": 100 + }, + "156": { + "count": 100 + }, + "157": { + "count": 100 + }, + "158": { + "count": 100 + }, + "159": { + "count": 100 + }, + "160": { + "count": 100 + }, + "161": { + "count": 100 + }, + "162": { + "count": 100 + }, + "163": { + 
"count": 100 + }, + "164": { + "count": 100 + }, + "165": { + "count": 100 + }, + "166": { + "count": 100 + }, + "167": { + "count": 100 + }, + "168": { + "count": 100 + }, + "169": { + "count": 100 + }, + "170": { + "count": 100 + }, + "171": { + "count": 100 + }, + "172": { + "count": 100 + }, + "173": { + "count": 100 + }, + "174": { + "count": 100 + }, + "175": { + "count": 100 + }, + "176": { + "count": 100 + }, + "177": { + "count": 100 + }, + "178": { + "count": 100 + }, + "179": { + "count": 100 + }, + "180": { + "count": 100 + }, + "181": { + "count": 100 + }, + "182": { + "count": 100 + }, + "183": { + "count": 100 + }, + "184": { + "count": 100 + }, + "185": { + "count": 100 + }, + "186": { + "count": 100 + }, + "187": { + "count": 100 + }, + "188": { + "count": 100 + }, + "189": { + "count": 100 + }, + "190": { + "count": 100 + }, + "191": { + "count": 100 + }, + "192": { + "count": 100 + }, + "193": { + "count": 100 + }, + "194": { + "count": 100 + }, + "195": { + "count": 100 + }, + "196": { + "count": 100 + }, + "197": { + "count": 100 + }, + "198": { + "count": 100 + }, + "199": { + "count": 100 + }, + "200": { + "count": 100 + }, + "201": { + "count": 100 + }, + "202": { + "count": 100 + }, + "203": { + "count": 100 + }, + "204": { + "count": 100 + }, + "205": { + "count": 100 + }, + "206": { + "count": 100 + }, + "207": { + "count": 100 + }, + "208": { + "count": 100 + }, + "209": { + "count": 100 + }, + "210": { + "count": 100 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClassification/DTD.json b/mteb/descriptive_stats/Image/ImageClassification/DTD.json new file mode 100644 index 0000000000..13a6fcf10c --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClassification/DTD.json @@ -0,0 +1,155 @@ +{ + "test": { + "num_samples": 1880, + "unique_num_labels": 47, + "min_image_width": 300, + "average_image_width": 488.9760638297872, + "max_image_width": 900, + "min_image_height": 300, + 
"average_image_height": 447.4962765957447, + "max_image_height": 778, + "labels": { + "0": { + "count": 40 + }, + "1": { + "count": 40 + }, + "10": { + "count": 40 + }, + "11": { + "count": 40 + }, + "12": { + "count": 40 + }, + "13": { + "count": 40 + }, + "14": { + "count": 40 + }, + "15": { + "count": 40 + }, + "16": { + "count": 40 + }, + "17": { + "count": 40 + }, + "18": { + "count": 40 + }, + "19": { + "count": 40 + }, + "2": { + "count": 40 + }, + "20": { + "count": 40 + }, + "21": { + "count": 40 + }, + "22": { + "count": 40 + }, + "23": { + "count": 40 + }, + "24": { + "count": 40 + }, + "25": { + "count": 40 + }, + "26": { + "count": 40 + }, + "27": { + "count": 40 + }, + "28": { + "count": 40 + }, + "29": { + "count": 40 + }, + "3": { + "count": 40 + }, + "30": { + "count": 40 + }, + "31": { + "count": 40 + }, + "32": { + "count": 40 + }, + "33": { + "count": 40 + }, + "34": { + "count": 40 + }, + "35": { + "count": 40 + }, + "36": { + "count": 40 + }, + "37": { + "count": 40 + }, + "38": { + "count": 40 + }, + "39": { + "count": 40 + }, + "4": { + "count": 40 + }, + "40": { + "count": 40 + }, + "41": { + "count": 40 + }, + "42": { + "count": 40 + }, + "43": { + "count": 40 + }, + "44": { + "count": 40 + }, + "45": { + "count": 40 + }, + "46": { + "count": 40 + }, + "5": { + "count": 40 + }, + "6": { + "count": 40 + }, + "7": { + "count": 40 + }, + "8": { + "count": 40 + }, + "9": { + "count": 40 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClassification/EuroSAT.json b/mteb/descriptive_stats/Image/ImageClassification/EuroSAT.json new file mode 100644 index 0000000000..45464c8286 --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClassification/EuroSAT.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 5400, + "unique_num_labels": 10, + "min_image_width": 64, + "average_image_width": 64.0, + "max_image_width": 64, + "min_image_height": 64, + "average_image_height": 64.0, + "max_image_height": 64, + 
"labels": { + "4": { + "count": 501 + }, + "3": { + "count": 496 + }, + "7": { + "count": 554 + }, + "2": { + "count": 573 + }, + "9": { + "count": 609 + }, + "0": { + "count": 596 + }, + "8": { + "count": 529 + }, + "1": { + "count": 608 + }, + "5": { + "count": 396 + }, + "6": { + "count": 538 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClassification/FER2013.json b/mteb/descriptive_stats/Image/ImageClassification/FER2013.json new file mode 100644 index 0000000000..a7238cb1a5 --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClassification/FER2013.json @@ -0,0 +1,35 @@ +{ + "test": { + "num_samples": 7178, + "unique_num_labels": 7, + "min_image_width": 48, + "average_image_width": 48.0, + "max_image_width": 48, + "min_image_height": 48, + "average_image_height": 48.0, + "max_image_height": 48, + "labels": { + "0": { + "count": 958 + }, + "1": { + "count": 111 + }, + "2": { + "count": 1024 + }, + "3": { + "count": 1774 + }, + "4": { + "count": 1233 + }, + "5": { + "count": 1247 + }, + "6": { + "count": 831 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClassification/FGVCAircraft.json b/mteb/descriptive_stats/Image/ImageClassification/FGVCAircraft.json new file mode 100644 index 0000000000..2b9f2d88b6 --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClassification/FGVCAircraft.json @@ -0,0 +1,314 @@ +{ + "test": { + "num_samples": 3333, + "unique_num_labels": 100, + "min_image_width": 800, + "average_image_width": 1098.5760576057605, + "max_image_width": 1600, + "min_image_height": 413, + "average_image_height": 746.996699669967, + "max_image_height": 1197, + "labels": { + "0": { + "count": 33 + }, + "1": { + "count": 33 + }, + "2": { + "count": 34 + }, + "3": { + "count": 33 + }, + "4": { + "count": 33 + }, + "5": { + "count": 34 + }, + "6": { + "count": 33 + }, + "7": { + "count": 33 + }, + "8": { + "count": 34 + }, + "9": { + "count": 33 + }, + "10": { + "count": 
33 + }, + "11": { + "count": 34 + }, + "12": { + "count": 33 + }, + "13": { + "count": 33 + }, + "14": { + "count": 34 + }, + "15": { + "count": 33 + }, + "16": { + "count": 33 + }, + "17": { + "count": 34 + }, + "18": { + "count": 33 + }, + "19": { + "count": 33 + }, + "20": { + "count": 34 + }, + "21": { + "count": 33 + }, + "22": { + "count": 33 + }, + "23": { + "count": 34 + }, + "24": { + "count": 33 + }, + "25": { + "count": 33 + }, + "26": { + "count": 34 + }, + "27": { + "count": 33 + }, + "28": { + "count": 33 + }, + "29": { + "count": 34 + }, + "30": { + "count": 33 + }, + "31": { + "count": 33 + }, + "32": { + "count": 34 + }, + "33": { + "count": 33 + }, + "34": { + "count": 33 + }, + "35": { + "count": 34 + }, + "36": { + "count": 33 + }, + "37": { + "count": 33 + }, + "38": { + "count": 34 + }, + "39": { + "count": 33 + }, + "40": { + "count": 33 + }, + "41": { + "count": 34 + }, + "42": { + "count": 33 + }, + "43": { + "count": 33 + }, + "44": { + "count": 34 + }, + "45": { + "count": 33 + }, + "46": { + "count": 33 + }, + "47": { + "count": 34 + }, + "48": { + "count": 33 + }, + "49": { + "count": 33 + }, + "50": { + "count": 34 + }, + "51": { + "count": 33 + }, + "52": { + "count": 33 + }, + "53": { + "count": 34 + }, + "54": { + "count": 33 + }, + "55": { + "count": 33 + }, + "56": { + "count": 34 + }, + "57": { + "count": 33 + }, + "58": { + "count": 33 + }, + "59": { + "count": 34 + }, + "60": { + "count": 33 + }, + "61": { + "count": 33 + }, + "62": { + "count": 34 + }, + "63": { + "count": 33 + }, + "64": { + "count": 33 + }, + "65": { + "count": 34 + }, + "66": { + "count": 33 + }, + "67": { + "count": 33 + }, + "68": { + "count": 34 + }, + "69": { + "count": 33 + }, + "70": { + "count": 33 + }, + "71": { + "count": 34 + }, + "72": { + "count": 33 + }, + "73": { + "count": 33 + }, + "74": { + "count": 34 + }, + "75": { + "count": 33 + }, + "76": { + "count": 33 + }, + "77": { + "count": 34 + }, + "78": { + "count": 33 + }, + "79": { + 
"count": 33 + }, + "80": { + "count": 34 + }, + "81": { + "count": 33 + }, + "82": { + "count": 33 + }, + "83": { + "count": 34 + }, + "84": { + "count": 33 + }, + "85": { + "count": 33 + }, + "86": { + "count": 34 + }, + "87": { + "count": 33 + }, + "88": { + "count": 33 + }, + "89": { + "count": 34 + }, + "90": { + "count": 33 + }, + "91": { + "count": 33 + }, + "92": { + "count": 34 + }, + "93": { + "count": 33 + }, + "94": { + "count": 33 + }, + "95": { + "count": 34 + }, + "96": { + "count": 33 + }, + "97": { + "count": 33 + }, + "98": { + "count": 34 + }, + "99": { + "count": 33 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClassification/Food101Classification.json b/mteb/descriptive_stats/Image/ImageClassification/Food101Classification.json new file mode 100644 index 0000000000..4137261f84 --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClassification/Food101Classification.json @@ -0,0 +1,317 @@ +{ + "validation": { + "num_samples": 25250, + "unique_num_labels": 101, + "min_image_width": 287, + "average_image_width": 495.818495049505, + "max_image_width": 512, + "min_image_height": 213, + "average_image_height": 475.08229702970294, + "max_image_height": 512, + "labels": { + "6": { + "count": 250 + }, + "79": { + "count": 250 + }, + "81": { + "count": 250 + }, + "53": { + "count": 250 + }, + "10": { + "count": 250 + }, + "20": { + "count": 250 + }, + "77": { + "count": 250 + }, + "48": { + "count": 250 + }, + "86": { + "count": 250 + }, + "84": { + "count": 250 + }, + "76": { + "count": 250 + }, + "34": { + "count": 250 + }, + "51": { + "count": 250 + }, + "21": { + "count": 250 + }, + "64": { + "count": 250 + }, + "0": { + "count": 250 + }, + "43": { + "count": 250 + }, + "44": { + "count": 250 + }, + "73": { + "count": 250 + }, + "57": { + "count": 250 + }, + "14": { + "count": 250 + }, + "5": { + "count": 250 + }, + "46": { + "count": 250 + }, + "55": { + "count": 250 + }, + "93": { + "count": 250 + }, + 
"98": { + "count": 250 + }, + "38": { + "count": 250 + }, + "11": { + "count": 250 + }, + "99": { + "count": 250 + }, + "72": { + "count": 250 + }, + "22": { + "count": 250 + }, + "59": { + "count": 250 + }, + "70": { + "count": 250 + }, + "16": { + "count": 250 + }, + "2": { + "count": 250 + }, + "58": { + "count": 250 + }, + "83": { + "count": 250 + }, + "96": { + "count": 250 + }, + "39": { + "count": 250 + }, + "49": { + "count": 250 + }, + "45": { + "count": 250 + }, + "88": { + "count": 250 + }, + "9": { + "count": 250 + }, + "26": { + "count": 250 + }, + "94": { + "count": 250 + }, + "4": { + "count": 250 + }, + "65": { + "count": 250 + }, + "32": { + "count": 250 + }, + "27": { + "count": 250 + }, + "36": { + "count": 250 + }, + "87": { + "count": 250 + }, + "69": { + "count": 250 + }, + "85": { + "count": 250 + }, + "25": { + "count": 250 + }, + "40": { + "count": 250 + }, + "19": { + "count": 250 + }, + "35": { + "count": 250 + }, + "56": { + "count": 250 + }, + "42": { + "count": 250 + }, + "60": { + "count": 250 + }, + "68": { + "count": 250 + }, + "100": { + "count": 250 + }, + "41": { + "count": 250 + }, + "92": { + "count": 250 + }, + "24": { + "count": 250 + }, + "3": { + "count": 250 + }, + "89": { + "count": 250 + }, + "75": { + "count": 250 + }, + "17": { + "count": 250 + }, + "97": { + "count": 250 + }, + "61": { + "count": 250 + }, + "33": { + "count": 250 + }, + "80": { + "count": 250 + }, + "30": { + "count": 250 + }, + "8": { + "count": 250 + }, + "74": { + "count": 250 + }, + "66": { + "count": 250 + }, + "31": { + "count": 250 + }, + "18": { + "count": 250 + }, + "67": { + "count": 250 + }, + "37": { + "count": 250 + }, + "13": { + "count": 250 + }, + "63": { + "count": 250 + }, + "28": { + "count": 250 + }, + "47": { + "count": 250 + }, + "52": { + "count": 250 + }, + "54": { + "count": 250 + }, + "1": { + "count": 250 + }, + "82": { + "count": 250 + }, + "91": { + "count": 250 + }, + "95": { + "count": 250 + }, + "7": { + "count": 250 + 
}, + "29": { + "count": 250 + }, + "78": { + "count": 250 + }, + "15": { + "count": 250 + }, + "23": { + "count": 250 + }, + "12": { + "count": 250 + }, + "62": { + "count": 250 + }, + "50": { + "count": 250 + }, + "71": { + "count": 250 + }, + "90": { + "count": 250 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClassification/GTSRB.json b/mteb/descriptive_stats/Image/ImageClassification/GTSRB.json new file mode 100644 index 0000000000..84a5216450 --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClassification/GTSRB.json @@ -0,0 +1,143 @@ +{ + "test": { + "num_samples": 12630, + "unique_num_labels": 43, + "min_image_width": 25, + "average_image_width": 50.50775930324624, + "max_image_width": 266, + "min_image_height": 25, + "average_image_height": 50.36444972288203, + "max_image_height": 232, + "labels": { + "16": { + "count": 150 + }, + "1": { + "count": 720 + }, + "38": { + "count": 690 + }, + "33": { + "count": 210 + }, + "11": { + "count": 420 + }, + "18": { + "count": 390 + }, + "12": { + "count": 690 + }, + "25": { + "count": 480 + }, + "35": { + "count": 390 + }, + "7": { + "count": 450 + }, + "23": { + "count": 150 + }, + "4": { + "count": 660 + }, + "9": { + "count": 480 + }, + "21": { + "count": 90 + }, + "20": { + "count": 90 + }, + "27": { + "count": 60 + }, + "3": { + "count": 450 + }, + "13": { + "count": 720 + }, + "10": { + "count": 660 + }, + "5": { + "count": 630 + }, + "17": { + "count": 360 + }, + "34": { + "count": 120 + }, + "2": { + "count": 750 + }, + "8": { + "count": 450 + }, + "30": { + "count": 150 + }, + "24": { + "count": 90 + }, + "15": { + "count": 210 + }, + "26": { + "count": 180 + }, + "28": { + "count": 150 + }, + "22": { + "count": 120 + }, + "14": { + "count": 270 + }, + "32": { + "count": 60 + }, + "29": { + "count": 90 + }, + "6": { + "count": 150 + }, + "36": { + "count": 120 + }, + "40": { + "count": 90 + }, + "41": { + "count": 60 + }, + "31": { + "count": 270 + }, + "19": { 
+ "count": 60 + }, + "0": { + "count": 60 + }, + "39": { + "count": 90 + }, + "42": { + "count": 90 + }, + "37": { + "count": 60 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClassification/OxfordFlowersClassification.json b/mteb/descriptive_stats/Image/ImageClassification/OxfordFlowersClassification.json new file mode 100644 index 0000000000..852aadc78b --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClassification/OxfordFlowersClassification.json @@ -0,0 +1,320 @@ +{ + "test": { + "num_samples": 1020, + "unique_num_labels": 102, + "min_image_width": 500, + "average_image_width": 618.0725490196079, + "max_image_width": 873, + "min_image_height": 500, + "average_image_height": 538.2607843137255, + "max_image_height": 928, + "labels": { + "0": { + "count": 9 + }, + "1": { + "count": 9 + }, + "2": { + "count": 10 + }, + "3": { + "count": 9 + }, + "4": { + "count": 11 + }, + "5": { + "count": 11 + }, + "6": { + "count": 10 + }, + "7": { + "count": 10 + }, + "8": { + "count": 11 + }, + "9": { + "count": 10 + }, + "10": { + "count": 10 + }, + "11": { + "count": 9 + }, + "12": { + "count": 10 + }, + "13": { + "count": 10 + }, + "14": { + "count": 10 + }, + "15": { + "count": 9 + }, + "16": { + "count": 11 + }, + "17": { + "count": 11 + }, + "18": { + "count": 10 + }, + "19": { + "count": 9 + }, + "20": { + "count": 9 + }, + "21": { + "count": 10 + }, + "22": { + "count": 11 + }, + "23": { + "count": 11 + }, + "24": { + "count": 10 + }, + "25": { + "count": 11 + }, + "26": { + "count": 10 + }, + "27": { + "count": 9 + }, + "28": { + "count": 11 + }, + "29": { + "count": 10 + }, + "30": { + "count": 10 + }, + "31": { + "count": 9 + }, + "32": { + "count": 10 + }, + "33": { + "count": 10 + }, + "34": { + "count": 10 + }, + "35": { + "count": 11 + }, + "36": { + "count": 9 + }, + "37": { + "count": 10 + }, + "38": { + "count": 10 + }, + "39": { + "count": 11 + }, + "40": { + "count": 10 + }, + "41": { + "count": 10 + }, + 
"42": { + "count": 11 + }, + "43": { + "count": 10 + }, + "44": { + "count": 10 + }, + "45": { + "count": 10 + }, + "46": { + "count": 10 + }, + "47": { + "count": 9 + }, + "48": { + "count": 10 + }, + "49": { + "count": 11 + }, + "50": { + "count": 10 + }, + "51": { + "count": 10 + }, + "52": { + "count": 10 + }, + "53": { + "count": 10 + }, + "54": { + "count": 10 + }, + "55": { + "count": 10 + }, + "56": { + "count": 10 + }, + "57": { + "count": 11 + }, + "58": { + "count": 10 + }, + "59": { + "count": 10 + }, + "60": { + "count": 10 + }, + "61": { + "count": 10 + }, + "62": { + "count": 9 + }, + "63": { + "count": 10 + }, + "64": { + "count": 10 + }, + "65": { + "count": 9 + }, + "66": { + "count": 11 + }, + "67": { + "count": 10 + }, + "68": { + "count": 11 + }, + "69": { + "count": 9 + }, + "70": { + "count": 9 + }, + "71": { + "count": 10 + }, + "72": { + "count": 10 + }, + "73": { + "count": 10 + }, + "74": { + "count": 10 + }, + "75": { + "count": 10 + }, + "76": { + "count": 10 + }, + "77": { + "count": 10 + }, + "78": { + "count": 9 + }, + "79": { + "count": 10 + }, + "80": { + "count": 10 + }, + "81": { + "count": 11 + }, + "82": { + "count": 10 + }, + "83": { + "count": 9 + }, + "84": { + "count": 11 + }, + "85": { + "count": 10 + }, + "86": { + "count": 10 + }, + "87": { + "count": 10 + }, + "88": { + "count": 10 + }, + "89": { + "count": 10 + }, + "90": { + "count": 10 + }, + "91": { + "count": 10 + }, + "92": { + "count": 10 + }, + "93": { + "count": 9 + }, + "94": { + "count": 9 + }, + "95": { + "count": 10 + }, + "96": { + "count": 10 + }, + "97": { + "count": 10 + }, + "98": { + "count": 10 + }, + "99": { + "count": 10 + }, + "100": { + "count": 10 + }, + "101": { + "count": 11 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClassification/OxfordPets.json b/mteb/descriptive_stats/Image/ImageClassification/OxfordPets.json new file mode 100644 index 0000000000..0aa19204a7 --- /dev/null +++ 
b/mteb/descriptive_stats/Image/ImageClassification/OxfordPets.json @@ -0,0 +1,125 @@ +{ + "test": { + "num_samples": 3669, + "unique_num_labels": 37, + "min_image_width": 137, + "average_image_width": 443.4600708639956, + "max_image_width": 1646, + "min_image_height": 103, + "average_image_height": 399.3780321613519, + "max_image_height": 2160, + "labels": { + "0": { + "count": 98 + }, + "1": { + "count": 100 + }, + "2": { + "count": 100 + }, + "3": { + "count": 100 + }, + "4": { + "count": 100 + }, + "5": { + "count": 100 + }, + "6": { + "count": 100 + }, + "7": { + "count": 88 + }, + "8": { + "count": 99 + }, + "9": { + "count": 100 + }, + "10": { + "count": 100 + }, + "11": { + "count": 97 + }, + "12": { + "count": 100 + }, + "13": { + "count": 100 + }, + "14": { + "count": 100 + }, + "15": { + "count": 100 + }, + "16": { + "count": 100 + }, + "17": { + "count": 100 + }, + "18": { + "count": 99 + }, + "19": { + "count": 100 + }, + "20": { + "count": 100 + }, + "21": { + "count": 100 + }, + "22": { + "count": 100 + }, + "23": { + "count": 100 + }, + "24": { + "count": 100 + }, + "25": { + "count": 100 + }, + "26": { + "count": 100 + }, + "27": { + "count": 100 + }, + "28": { + "count": 100 + }, + "29": { + "count": 100 + }, + "30": { + "count": 99 + }, + "31": { + "count": 100 + }, + "32": { + "count": 100 + }, + "33": { + "count": 100 + }, + "34": { + "count": 89 + }, + "35": { + "count": 100 + }, + "36": { + "count": 100 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClassification/PatchCamelyon.json b/mteb/descriptive_stats/Image/ImageClassification/PatchCamelyon.json new file mode 100644 index 0000000000..e6b13afe50 --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClassification/PatchCamelyon.json @@ -0,0 +1,20 @@ +{ + "test": { + "num_samples": 32768, + "unique_num_labels": 2, + "min_image_width": 96, + "average_image_width": 96.0, + "max_image_width": 96, + "min_image_height": 96, + "average_image_height": 
96.0, + "max_image_height": 96, + "labels": { + "0": { + "count": 16391 + }, + "1": { + "count": 16377 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClassification/RESISC45.json b/mteb/descriptive_stats/Image/ImageClassification/RESISC45.json new file mode 100644 index 0000000000..f4a2290189 --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClassification/RESISC45.json @@ -0,0 +1,149 @@ +{ + "test": { + "num_samples": 6300, + "unique_num_labels": 45, + "min_image_width": 256, + "average_image_width": 256.0, + "max_image_width": 256, + "min_image_height": 256, + "average_image_height": 256.0, + "max_image_height": 256, + "labels": { + "31": { + "count": 135 + }, + "11": { + "count": 144 + }, + "28": { + "count": 135 + }, + "43": { + "count": 154 + }, + "41": { + "count": 144 + }, + "33": { + "count": 134 + }, + "19": { + "count": 130 + }, + "16": { + "count": 127 + }, + "22": { + "count": 130 + }, + "34": { + "count": 143 + }, + "24": { + "count": 164 + }, + "0": { + "count": 169 + }, + "13": { + "count": 146 + }, + "25": { + "count": 115 + }, + "6": { + "count": 132 + }, + "36": { + "count": 135 + }, + "39": { + "count": 142 + }, + "18": { + "count": 140 + }, + "23": { + "count": 147 + }, + "37": { + "count": 159 + }, + "15": { + "count": 122 + }, + "29": { + "count": 140 + }, + "9": { + "count": 159 + }, + "27": { + "count": 140 + }, + "21": { + "count": 131 + }, + "3": { + "count": 134 + }, + "1": { + "count": 162 + }, + "32": { + "count": 153 + }, + "26": { + "count": 150 + }, + "35": { + "count": 151 + }, + "44": { + "count": 118 + }, + "30": { + "count": 154 + }, + "20": { + "count": 139 + }, + "4": { + "count": 130 + }, + "42": { + "count": 127 + }, + "40": { + "count": 137 + }, + "5": { + "count": 140 + }, + "17": { + "count": 142 + }, + "2": { + "count": 123 + }, + "38": { + "count": 130 + }, + "10": { + "count": 140 + }, + "12": { + "count": 146 + }, + "8": { + "count": 146 + }, + "7": { + "count": 143 + }, + 
"14": { + "count": 118 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClassification/STL10.json b/mteb/descriptive_stats/Image/ImageClassification/STL10.json new file mode 100644 index 0000000000..a647321290 --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClassification/STL10.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 8000, + "unique_num_labels": 10, + "min_image_width": 96, + "average_image_width": 96.0, + "max_image_width": 96, + "min_image_height": 96, + "average_image_height": 96.0, + "max_image_height": 96, + "labels": { + "0": { + "count": 800 + }, + "1": { + "count": 800 + }, + "2": { + "count": 800 + }, + "3": { + "count": 800 + }, + "4": { + "count": 800 + }, + "5": { + "count": 800 + }, + "6": { + "count": 800 + }, + "7": { + "count": 800 + }, + "8": { + "count": 800 + }, + "9": { + "count": 800 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClassification/SUN397.json b/mteb/descriptive_stats/Image/ImageClassification/SUN397.json new file mode 100644 index 0000000000..cd840269be --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClassification/SUN397.json @@ -0,0 +1 @@ +{"test": {"num_samples": 21750, "unique_num_labels": 397, "min_image_width": 125, "average_image_width": 354.21843678160917, "max_image_width": 696, "min_image_height": 94, "average_image_height": 291.1662988505747, "max_image_height": 595, "labels": {"227": {"count": 439}, "213": {"count": 335}, "53": {"count": 23}, "350": {"count": 40}, "73": {"count": 38}, "316": {"count": 63}, "177": {"count": 80}, "25": {"count": 39}, "275": {"count": 31}, "328": {"count": 33}, "263": {"count": 47}, "239": {"count": 26}, "41": {"count": 213}, "319": {"count": 51}, "91": {"count": 16}, "95": {"count": 183}, "396": {"count": 20}, "259": {"count": 36}, "107": {"count": 167}, "381": {"count": 164}, "174": {"count": 167}, "246": {"count": 44}, "67": {"count": 31}, "374": {"count": 28}, "354": {"count": 
22}, "72": {"count": 100}, "97": {"count": 32}, "256": {"count": 57}, "247": {"count": 57}, "159": {"count": 49}, "270": {"count": 135}, "133": {"count": 215}, "197": {"count": 40}, "12": {"count": 38}, "2": {"count": 226}, "115": {"count": 75}, "200": {"count": 93}, "47": {"count": 103}, "9": {"count": 37}, "22": {"count": 76}, "255": {"count": 34}, "267": {"count": 22}, "244": {"count": 93}, "85": {"count": 115}, "342": {"count": 87}, "55": {"count": 50}, "7": {"count": 41}, "337": {"count": 99}, "38": {"count": 28}, "269": {"count": 69}, "106": {"count": 15}, "298": {"count": 27}, "361": {"count": 53}, "8": {"count": 108}, "166": {"count": 47}, "280": {"count": 51}, "35": {"count": 61}, "147": {"count": 82}, "214": {"count": 26}, "284": {"count": 28}, "286": {"count": 66}, "113": {"count": 67}, "83": {"count": 38}, "82": {"count": 236}, "365": {"count": 17}, "242": {"count": 116}, "186": {"count": 38}, "87": {"count": 111}, "274": {"count": 48}, "27": {"count": 95}, "283": {"count": 22}, "4": {"count": 76}, "334": {"count": 139}, "364": {"count": 21}, "48": {"count": 408}, "311": {"count": 41}, "101": {"count": 64}, "131": {"count": 55}, "172": {"count": 31}, "355": {"count": 28}, "308": {"count": 56}, "5": {"count": 47}, "318": {"count": 155}, "86": {"count": 87}, "46": {"count": 230}, "111": {"count": 69}, "88": {"count": 54}, "23": {"count": 47}, "70": {"count": 61}, "217": {"count": 34}, "11": {"count": 76}, "193": {"count": 207}, "0": {"count": 99}, "303": {"count": 23}, "324": {"count": 47}, "377": {"count": 19}, "345": {"count": 39}, "154": {"count": 49}, "393": {"count": 68}, "152": {"count": 58}, "317": {"count": 27}, "384": {"count": 46}, "257": {"count": 38}, "294": {"count": 47}, "145": {"count": 23}, "289": {"count": 33}, "375": {"count": 19}, "57": {"count": 42}, "15": {"count": 62}, "109": {"count": 24}, "139": {"count": 24}, "66": {"count": 26}, "340": {"count": 32}, "150": {"count": 41}, "118": {"count": 105}, "333": {"count": 27}, "126": 
{"count": 55}, "366": {"count": 116}, "358": {"count": 151}, "251": {"count": 37}, "309": {"count": 35}, "54": {"count": 20}, "327": {"count": 38}, "3": {"count": 60}, "21": {"count": 56}, "17": {"count": 62}, "146": {"count": 84}, "94": {"count": 42}, "243": {"count": 48}, "335": {"count": 85}, "245": {"count": 141}, "279": {"count": 187}, "360": {"count": 25}, "192": {"count": 105}, "49": {"count": 31}, "230": {"count": 81}, "357": {"count": 22}, "64": {"count": 72}, "112": {"count": 26}, "338": {"count": 70}, "216": {"count": 99}, "234": {"count": 183}, "300": {"count": 153}, "188": {"count": 48}, "254": {"count": 41}, "184": {"count": 183}, "373": {"count": 47}, "221": {"count": 86}, "84": {"count": 49}, "81": {"count": 119}, "161": {"count": 97}, "352": {"count": 21}, "105": {"count": 43}, "39": {"count": 59}, "383": {"count": 40}, "341": {"count": 56}, "63": {"count": 158}, "125": {"count": 29}, "302": {"count": 83}, "262": {"count": 40}, "392": {"count": 51}, "326": {"count": 173}, "228": {"count": 93}, "339": {"count": 25}, "80": {"count": 73}, "30": {"count": 42}, "264": {"count": 112}, "56": {"count": 94}, "321": {"count": 16}, "395": {"count": 52}, "68": {"count": 45}, "211": {"count": 45}, "44": {"count": 26}, "299": {"count": 21}, "220": {"count": 35}, "61": {"count": 20}, "138": {"count": 55}, "108": {"count": 111}, "10": {"count": 35}, "386": {"count": 28}, "297": {"count": 49}, "210": {"count": 36}, "175": {"count": 77}, "260": {"count": 68}, "391": {"count": 69}, "102": {"count": 77}, "26": {"count": 44}, "232": {"count": 54}, "6": {"count": 158}, "124": {"count": 43}, "14": {"count": 23}, "201": {"count": 39}, "168": {"count": 18}, "202": {"count": 26}, "140": {"count": 31}, "261": {"count": 60}, "104": {"count": 27}, "356": {"count": 22}, "34": {"count": 147}, "225": {"count": 111}, "60": {"count": 84}, "156": {"count": 35}, "237": {"count": 45}, "268": {"count": 87}, "310": {"count": 31}, "249": {"count": 73}, "281": {"count": 46}, "75": 
{"count": 89}, "77": {"count": 53}, "132": {"count": 45}, "235": {"count": 42}, "336": {"count": 84}, "123": {"count": 27}, "349": {"count": 90}, "180": {"count": 49}, "378": {"count": 17}, "332": {"count": 30}, "185": {"count": 29}, "389": {"count": 60}, "382": {"count": 77}, "198": {"count": 54}, "74": {"count": 48}, "231": {"count": 85}, "76": {"count": 54}, "151": {"count": 64}, "182": {"count": 17}, "209": {"count": 39}, "344": {"count": 37}, "204": {"count": 67}, "329": {"count": 23}, "380": {"count": 91}, "388": {"count": 32}, "116": {"count": 29}, "24": {"count": 103}, "199": {"count": 33}, "369": {"count": 14}, "359": {"count": 77}, "325": {"count": 39}, "323": {"count": 34}, "162": {"count": 35}, "33": {"count": 46}, "129": {"count": 21}, "287": {"count": 30}, "155": {"count": 24}, "170": {"count": 157}, "296": {"count": 40}, "110": {"count": 102}, "304": {"count": 21}, "164": {"count": 37}, "278": {"count": 23}, "71": {"count": 18}, "194": {"count": 24}, "136": {"count": 117}, "103": {"count": 134}, "330": {"count": 26}, "347": {"count": 26}, "206": {"count": 50}, "178": {"count": 43}, "362": {"count": 26}, "119": {"count": 111}, "208": {"count": 33}, "165": {"count": 44}, "90": {"count": 36}, "167": {"count": 40}, "187": {"count": 26}, "99": {"count": 50}, "390": {"count": 64}, "205": {"count": 16}, "65": {"count": 30}, "293": {"count": 23}, "223": {"count": 19}, "96": {"count": 31}, "305": {"count": 44}, "100": {"count": 57}, "385": {"count": 18}, "78": {"count": 42}, "59": {"count": 20}, "37": {"count": 59}, "219": {"count": 76}, "212": {"count": 28}, "1": {"count": 26}, "122": {"count": 35}, "92": {"count": 62}, "43": {"count": 39}, "196": {"count": 56}, "19": {"count": 25}, "128": {"count": 35}, "376": {"count": 77}, "313": {"count": 30}, "114": {"count": 54}, "121": {"count": 31}, "169": {"count": 62}, "331": {"count": 55}, "238": {"count": 16}, "179": {"count": 31}, "127": {"count": 31}, "370": {"count": 98}, "149": {"count": 47}, "346": {"count": 
41}, "250": {"count": 22}, "276": {"count": 25}, "163": {"count": 43}, "18": {"count": 33}, "282": {"count": 23}, "215": {"count": 33}, "258": {"count": 60}, "240": {"count": 29}, "233": {"count": 14}, "93": {"count": 27}, "69": {"count": 23}, "266": {"count": 26}, "387": {"count": 55}, "141": {"count": 18}, "191": {"count": 26}, "183": {"count": 42}, "271": {"count": 22}, "120": {"count": 32}, "98": {"count": 53}, "29": {"count": 34}, "28": {"count": 21}, "144": {"count": 26}, "351": {"count": 50}, "368": {"count": 20}, "314": {"count": 27}, "45": {"count": 17}, "218": {"count": 50}, "348": {"count": 25}, "157": {"count": 35}, "117": {"count": 24}, "367": {"count": 24}, "13": {"count": 31}, "363": {"count": 22}, "79": {"count": 28}, "312": {"count": 27}, "372": {"count": 29}, "189": {"count": 21}, "50": {"count": 22}, "160": {"count": 35}, "16": {"count": 39}, "222": {"count": 21}, "58": {"count": 37}, "153": {"count": 64}, "62": {"count": 21}, "290": {"count": 25}, "292": {"count": 24}, "285": {"count": 25}, "343": {"count": 32}, "301": {"count": 19}, "190": {"count": 46}, "195": {"count": 24}, "135": {"count": 30}, "315": {"count": 25}, "203": {"count": 29}, "307": {"count": 18}, "142": {"count": 25}, "173": {"count": 28}, "236": {"count": 41}, "171": {"count": 23}, "371": {"count": 17}, "130": {"count": 15}, "277": {"count": 39}, "248": {"count": 22}, "181": {"count": 35}, "40": {"count": 20}, "322": {"count": 15}, "273": {"count": 23}, "148": {"count": 23}, "295": {"count": 25}, "32": {"count": 21}, "320": {"count": 25}, "137": {"count": 32}, "253": {"count": 36}, "31": {"count": 19}, "306": {"count": 27}, "51": {"count": 19}, "52": {"count": 29}, "176": {"count": 31}, "241": {"count": 23}, "265": {"count": 32}, "394": {"count": 26}, "158": {"count": 26}, "226": {"count": 28}, "288": {"count": 21}, "353": {"count": 19}, "291": {"count": 21}, "224": {"count": 26}, "36": {"count": 38}, "20": {"count": 22}, "252": {"count": 18}, "134": {"count": 24}, "143": 
{"count": 21}, "207": {"count": 28}, "89": {"count": 16}, "272": {"count": 23}, "379": {"count": 24}, "229": {"count": 20}, "42": {"count": 23}}}} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClassification/StanfordCars.json b/mteb/descriptive_stats/Image/ImageClassification/StanfordCars.json new file mode 100644 index 0000000000..b1d5a9afc5 --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClassification/StanfordCars.json @@ -0,0 +1,602 @@ +{ + "test": { + "num_samples": 8041, + "unique_num_labels": 196, + "min_image_width": 78, + "average_image_width": 701.1770924014426, + "max_image_width": 7800, + "min_image_height": 41, + "average_image_height": 483.749658002736, + "max_image_height": 5400, + "labels": { + "180": { + "count": 38 + }, + "102": { + "count": 39 + }, + "144": { + "count": 44 + }, + "186": { + "count": 43 + }, + "184": { + "count": 38 + }, + "77": { + "count": 37 + }, + "117": { + "count": 41 + }, + "164": { + "count": 44 + }, + "31": { + "count": 41 + }, + "59": { + "count": 36 + }, + "48": { + "count": 37 + }, + "107": { + "count": 44 + }, + "115": { + "count": 37 + }, + "134": { + "count": 42 + }, + "82": { + "count": 40 + }, + "50": { + "count": 43 + }, + "153": { + "count": 42 + }, + "32": { + "count": 42 + }, + "21": { + "count": 42 + }, + "150": { + "count": 43 + }, + "3": { + "count": 42 + }, + "80": { + "count": 45 + }, + "106": { + "count": 44 + }, + "190": { + "count": 46 + }, + "169": { + "count": 44 + }, + "194": { + "count": 43 + }, + "90": { + "count": 38 + }, + "4": { + "count": 40 + }, + "163": { + "count": 43 + }, + "147": { + "count": 45 + }, + "187": { + "count": 43 + }, + "43": { + "count": 44 + }, + "6": { + "count": 39 + }, + "30": { + "count": 44 + }, + "73": { + "count": 43 + }, + "29": { + "count": 41 + }, + "165": { + "count": 41 + }, + "179": { + "count": 42 + }, + "105": { + "count": 41 + }, + "2": { + "count": 43 + }, + "64": { + "count": 45 + }, + "34": { + "count": 41 + }, + "74": { 
+ "count": 44 + }, + "84": { + "count": 43 + }, + "24": { + "count": 39 + }, + "167": { + "count": 42 + }, + "136": { + "count": 43 + }, + "133": { + "count": 33 + }, + "155": { + "count": 39 + }, + "119": { + "count": 42 + }, + "129": { + "count": 41 + }, + "127": { + "count": 39 + }, + "35": { + "count": 41 + }, + "170": { + "count": 46 + }, + "36": { + "count": 38 + }, + "63": { + "count": 29 + }, + "182": { + "count": 42 + }, + "42": { + "count": 46 + }, + "17": { + "count": 42 + }, + "75": { + "count": 43 + }, + "0": { + "count": 44 + }, + "62": { + "count": 44 + }, + "173": { + "count": 41 + }, + "16": { + "count": 40 + }, + "104": { + "count": 43 + }, + "49": { + "count": 42 + }, + "122": { + "count": 44 + }, + "81": { + "count": 45 + }, + "191": { + "count": 42 + }, + "92": { + "count": 39 + }, + "145": { + "count": 43 + }, + "95": { + "count": 41 + }, + "54": { + "count": 39 + }, + "114": { + "count": 45 + }, + "112": { + "count": 42 + }, + "151": { + "count": 35 + }, + "91": { + "count": 40 + }, + "188": { + "count": 40 + }, + "20": { + "count": 42 + }, + "33": { + "count": 44 + }, + "86": { + "count": 44 + }, + "128": { + "count": 38 + }, + "142": { + "count": 40 + }, + "19": { + "count": 46 + }, + "177": { + "count": 41 + }, + "11": { + "count": 36 + }, + "45": { + "count": 43 + }, + "60": { + "count": 43 + }, + "8": { + "count": 41 + }, + "56": { + "count": 37 + }, + "28": { + "count": 42 + }, + "120": { + "count": 44 + }, + "5": { + "count": 44 + }, + "85": { + "count": 42 + }, + "68": { + "count": 38 + }, + "22": { + "count": 39 + }, + "108": { + "count": 44 + }, + "89": { + "count": 41 + }, + "132": { + "count": 42 + }, + "125": { + "count": 42 + }, + "137": { + "count": 39 + }, + "158": { + "count": 36 + }, + "58": { + "count": 44 + }, + "123": { + "count": 39 + }, + "52": { + "count": 44 + }, + "27": { + "count": 41 + }, + "13": { + "count": 42 + }, + "70": { + "count": 35 + }, + "25": { + "count": 34 + }, + "185": { + "count": 38 + }, + "171": { 
+ "count": 44 + }, + "9": { + "count": 33 + }, + "40": { + "count": 35 + }, + "178": { + "count": 45 + }, + "44": { + "count": 32 + }, + "97": { + "count": 46 + }, + "87": { + "count": 39 + }, + "159": { + "count": 44 + }, + "146": { + "count": 44 + }, + "51": { + "count": 41 + }, + "121": { + "count": 40 + }, + "1": { + "count": 32 + }, + "160": { + "count": 48 + }, + "78": { + "count": 48 + }, + "109": { + "count": 43 + }, + "103": { + "count": 42 + }, + "174": { + "count": 30 + }, + "181": { + "count": 46 + }, + "23": { + "count": 45 + }, + "111": { + "count": 45 + }, + "166": { + "count": 47 + }, + "172": { + "count": 43 + }, + "66": { + "count": 38 + }, + "192": { + "count": 41 + }, + "148": { + "count": 42 + }, + "72": { + "count": 44 + }, + "141": { + "count": 32 + }, + "71": { + "count": 45 + }, + "7": { + "count": 45 + }, + "152": { + "count": 44 + }, + "183": { + "count": 40 + }, + "98": { + "count": 27 + }, + "94": { + "count": 45 + }, + "126": { + "count": 41 + }, + "100": { + "count": 42 + }, + "131": { + "count": 43 + }, + "116": { + "count": 42 + }, + "39": { + "count": 39 + }, + "149": { + "count": 36 + }, + "101": { + "count": 39 + }, + "139": { + "count": 42 + }, + "69": { + "count": 42 + }, + "12": { + "count": 41 + }, + "14": { + "count": 43 + }, + "96": { + "count": 42 + }, + "41": { + "count": 34 + }, + "189": { + "count": 43 + }, + "10": { + "count": 38 + }, + "140": { + "count": 34 + }, + "26": { + "count": 35 + }, + "57": { + "count": 44 + }, + "88": { + "count": 44 + }, + "67": { + "count": 40 + }, + "93": { + "count": 43 + }, + "193": { + "count": 45 + }, + "161": { + "count": 45 + }, + "118": { + "count": 68 + }, + "110": { + "count": 42 + }, + "154": { + "count": 42 + }, + "138": { + "count": 42 + }, + "143": { + "count": 46 + }, + "61": { + "count": 37 + }, + "176": { + "count": 44 + }, + "113": { + "count": 45 + }, + "18": { + "count": 40 + }, + "53": { + "count": 40 + }, + "47": { + "count": 42 + }, + "157": { + "count": 29 + }, + 
"168": { + "count": 38 + }, + "124": { + "count": 43 + }, + "79": { + "count": 43 + }, + "130": { + "count": 42 + }, + "46": { + "count": 35 + }, + "55": { + "count": 46 + }, + "195": { + "count": 40 + }, + "38": { + "count": 36 + }, + "37": { + "count": 40 + }, + "99": { + "count": 33 + }, + "83": { + "count": 42 + }, + "162": { + "count": 36 + }, + "135": { + "count": 24 + }, + "175": { + "count": 38 + }, + "156": { + "count": 36 + }, + "15": { + "count": 43 + }, + "65": { + "count": 41 + }, + "76": { + "count": 40 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClassification/UCF101.json b/mteb/descriptive_stats/Image/ImageClassification/UCF101.json new file mode 100644 index 0000000000..be8f90403a --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClassification/UCF101.json @@ -0,0 +1 @@ +{"test": {"num_samples": 697222, "unique_num_labels": 101, "min_image_width": 320, "average_image_width": 320.1187570099624, "max_image_width": 400, "min_image_height": 226, "average_image_height": 239.97921752325658, "max_image_height": 240, "labels": {"0": {"count": 7475}, "1": {"count": 6341}, "2": {"count": 6181}, "3": {"count": 6320}, "4": {"count": 3708}, "5": {"count": 7296}, "6": {"count": 4004}, "7": {"count": 3923}, "8": {"count": 2267}, "9": {"count": 5587}, "10": {"count": 8946}, "11": {"count": 12714}, "12": {"count": 6053}, "13": {"count": 3191}, "14": {"count": 3696}, "15": {"count": 5468}, "16": {"count": 10032}, "17": {"count": 8346}, "18": {"count": 5098}, "19": {"count": 10811}, "20": {"count": 6378}, "21": {"count": 3385}, "22": {"count": 3974}, "23": {"count": 4781}, "24": {"count": 5867}, "25": {"count": 7904}, "26": {"count": 12181}, "27": {"count": 4511}, "28": {"count": 4402}, "29": {"count": 5513}, "30": {"count": 3236}, "31": {"count": 7160}, "32": {"count": 6455}, "33": {"count": 3766}, "34": {"count": 8362}, "35": {"count": 3521}, "36": {"count": 3263}, "37": {"count": 5112}, "38": {"count": 9685}, "39": 
{"count": 4598}, "40": {"count": 6682}, "41": {"count": 8690}, "42": {"count": 3591}, "43": {"count": 11432}, "44": {"count": 3458}, "45": {"count": 10080}, "46": {"count": 16507}, "47": {"count": 3001}, "48": {"count": 6524}, "49": {"count": 7786}, "50": {"count": 4657}, "51": {"count": 8795}, "52": {"count": 3992}, "53": {"count": 5668}, "54": {"count": 6575}, "55": {"count": 8662}, "56": {"count": 5253}, "57": {"count": 3761}, "58": {"count": 8679}, "59": {"count": 11986}, "60": {"count": 15720}, "61": {"count": 12080}, "62": {"count": 10634}, "63": {"count": 6161}, "64": {"count": 13934}, "65": {"count": 8393}, "66": {"count": 5452}, "67": {"count": 7905}, "68": {"count": 12354}, "69": {"count": 4060}, "70": {"count": 9075}, "71": {"count": 2689}, "72": {"count": 5435}, "73": {"count": 17655}, "74": {"count": 5693}, "75": {"count": 12572}, "76": {"count": 9543}, "77": {"count": 10793}, "78": {"count": 4134}, "79": {"count": 4832}, "80": {"count": 8977}, "81": {"count": 7381}, "82": {"count": 4927}, "83": {"count": 12469}, "84": {"count": 3843}, "85": {"count": 4945}, "86": {"count": 6724}, "87": {"count": 6582}, "88": {"count": 7046}, "89": {"count": 5874}, "90": {"count": 4878}, "91": {"count": 6417}, "92": {"count": 3762}, "93": {"count": 7349}, "94": {"count": 8149}, "95": {"count": 3925}, "96": {"count": 3378}, "97": {"count": 7721}, "98": {"count": 3671}, "99": {"count": 6292}, "100": {"count": 6508}}}} \ No newline at end of file From 3dbdeb1f2664e3502ef68bcc84bd05f8d7c0c342 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 17 Feb 2025 03:24:01 +0000 Subject: [PATCH 025/233] Update tasks table --- docs/tasks.md | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/docs/tasks.md b/docs/tasks.md index 1ac3e5b666..7dbd5935ae 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -61,7 +61,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [BigPatentClustering.v2](https://huggingface.co/datasets/NortheasternUniversity/big_patent) (Eva Sharma and Chen Li and Lu Wang, 2019) | ['eng'] | Clustering | p2p | [Legal, Written] | None | None | | [BiorxivClusteringP2P.v2](https://api.biorxiv.org/) | ['eng'] | Clustering | p2p | [Academic, Written] | None | None | | [BiorxivClusteringS2S.v2](https://api.biorxiv.org/) | ['eng'] | Clustering | s2s | [Academic, Written] | None | None | -| [Birdsnap](https://openaccess.thecvf.com/content_cvpr_2014/html/Berg_Birdsnap_Large-scale_Fine-grained_2014_CVPR_paper.html) (Berg et al., 2014) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [Birdsnap](https://openaccess.thecvf.com/content_cvpr_2014/html/Berg_Birdsnap_Large-scale_Fine-grained_2014_CVPR_paper.html) (Berg et al., 2014) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 1851} | {'test': {'num_samples': 1851, 'unique_num_labels': 490, 'min_image_width': 267, 'average_image_width': 2081.56, 'max_image_width': 6400, 'min_image_height': 200, 'average_image_height': 1609.19, 'max_image_height': 5400, 'labels': {'0': {'count': 4}, '1': {'count': 5}, '2': {'count': 4}, '3': {'count': 4}, '4': {'count': 4}, '5': {'count': 2}, '6': {'count': 3}, '7': {'count': 5}, '8': {'count': 4}, '9': {'count': 5}, '11': {'count': 3}, '12': {'count': 4}, '13': {'count': 5}, '14': {'count': 4}, '15': {'count': 5}, '16': {'count': 4}, '17': {'count': 3}, '18': {'count': 2}, '19': {'count': 5}, '20': {'count': 4}, '21': {'count': 4}, '22': {'count': 5}, '23': {'count': 2}, '24': {'count': 4}, '25': {'count': 3}, '26': {'count': 4}, '27': {'count': 4}, '28': {'count': 2}, '29': {'count': 5}, '30': {'count': 3}, '31': {'count': 3}, '32': {'count': 3}, '33': {'count': 4}, '34': {'count': 4}, '35': {'count': 4}, '36': {'count': 3}, '37': {'count': 3}, '38': {'count': 4}, '39': {'count': 3}, '40': {'count': 4}, '41': {'count': 3}, '42': {'count': 3}, '43': {'count': 4}, '44': {'count': 2}, '45': 
{'count': 3}, '47': {'count': 5}, '48': {'count': 2}, '49': {'count': 5}, '50': {'count': 4}, '51': {'count': 5}, '52': {'count': 3}, '53': {'count': 3}, '54': {'count': 4}, '55': {'count': 2}, '56': {'count': 2}, '57': {'count': 5}, '58': {'count': 2}, '59': {'count': 1}, '60': {'count': 1}, '61': {'count': 3}, '62': {'count': 3}, '63': {'count': 5}, '64': {'count': 5}, '65': {'count': 4}, '67': {'count': 2}, '68': {'count': 3}, '69': {'count': 4}, '70': {'count': 5}, '71': {'count': 5}, '72': {'count': 5}, '73': {'count': 4}, '74': {'count': 5}, '75': {'count': 4}, '76': {'count': 4}, '80': {'count': 3}, '81': {'count': 5}, '82': {'count': 3}, '83': {'count': 5}, '84': {'count': 3}, '85': {'count': 4}, '86': {'count': 4}, '87': {'count': 5}, '88': {'count': 4}, '89': {'count': 5}, '90': {'count': 4}, '91': {'count': 4}, '92': {'count': 5}, '93': {'count': 4}, '94': {'count': 4}, '95': {'count': 5}, '96': {'count': 5}, '97': {'count': 5}, '98': {'count': 3}, '99': {'count': 5}, '100': {'count': 4}, '101': {'count': 5}, '102': {'count': 4}, '103': {'count': 3}, '105': {'count': 4}, '108': {'count': 4}, '109': {'count': 5}, '110': {'count': 3}, '111': {'count': 3}, '112': {'count': 4}, '113': {'count': 4}, '114': {'count': 5}, '115': {'count': 4}, '116': {'count': 5}, '117': {'count': 4}, '118': {'count': 4}, '119': {'count': 5}, '120': {'count': 5}, '121': {'count': 4}, '122': {'count': 3}, '124': {'count': 3}, '125': {'count': 4}, '126': {'count': 2}, '127': {'count': 3}, '128': {'count': 5}, '129': {'count': 5}, '130': {'count': 5}, '131': {'count': 3}, '132': {'count': 4}, '133': {'count': 4}, '134': {'count': 2}, '135': {'count': 5}, '136': {'count': 5}, '137': {'count': 3}, '138': {'count': 4}, '139': {'count': 3}, '140': {'count': 3}, '141': {'count': 2}, '142': {'count': 3}, '143': {'count': 5}, '144': {'count': 4}, '145': {'count': 5}, '146': {'count': 5}, '147': {'count': 5}, '148': {'count': 4}, '149': {'count': 4}, '150': {'count': 5}, '151': {'count': 
5}, '152': {'count': 5}, '153': {'count': 3}, '154': {'count': 4}, '155': {'count': 3}, '156': {'count': 3}, '157': {'count': 3}, '159': {'count': 3}, '160': {'count': 4}, '161': {'count': 4}, '162': {'count': 4}, '163': {'count': 4}, '164': {'count': 3}, '165': {'count': 3}, '166': {'count': 3}, '167': {'count': 4}, '168': {'count': 4}, '169': {'count': 4}, '170': {'count': 4}, '171': {'count': 5}, '172': {'count': 4}, '173': {'count': 4}, '174': {'count': 5}, '175': {'count': 4}, '176': {'count': 2}, '177': {'count': 5}, '178': {'count': 5}, '179': {'count': 5}, '180': {'count': 5}, '181': {'count': 4}, '183': {'count': 2}, '184': {'count': 3}, '185': {'count': 2}, '186': {'count': 5}, '187': {'count': 2}, '188': {'count': 3}, '189': {'count': 2}, '190': {'count': 5}, '191': {'count': 4}, '192': {'count': 3}, '193': {'count': 3}, '194': {'count': 4}, '195': {'count': 3}, '196': {'count': 4}, '197': {'count': 3}, '198': {'count': 4}, '199': {'count': 5}, '200': {'count': 5}, '201': {'count': 1}, '204': {'count': 4}, '205': {'count': 5}, '206': {'count': 4}, '207': {'count': 3}, '208': {'count': 4}, '209': {'count': 4}, '210': {'count': 4}, '211': {'count': 4}, '212': {'count': 5}, '213': {'count': 4}, '214': {'count': 5}, '215': {'count': 3}, '216': {'count': 1}, '217': {'count': 5}, '218': {'count': 2}, '219': {'count': 5}, '220': {'count': 4}, '221': {'count': 5}, '222': {'count': 5}, '223': {'count': 3}, '224': {'count': 4}, '225': {'count': 5}, '226': {'count': 3}, '227': {'count': 4}, '228': {'count': 3}, '229': {'count': 4}, '230': {'count': 4}, '231': {'count': 5}, '232': {'count': 5}, '233': {'count': 5}, '234': {'count': 4}, '235': {'count': 4}, '236': {'count': 5}, '237': {'count': 5}, '238': {'count': 5}, '239': {'count': 4}, '240': {'count': 3}, '241': {'count': 3}, '242': {'count': 4}, '243': {'count': 5}, '244': {'count': 2}, '245': {'count': 4}, '246': {'count': 5}, '247': {'count': 3}, '248': {'count': 3}, '249': {'count': 5}, '250': {'count': 5}, 
'251': {'count': 4}, '252': {'count': 2}, '253': {'count': 5}, '254': {'count': 5}, '255': {'count': 5}, '256': {'count': 4}, '257': {'count': 4}, '258': {'count': 4}, '259': {'count': 3}, '260': {'count': 5}, '261': {'count': 4}, '262': {'count': 4}, '264': {'count': 4}, '265': {'count': 3}, '266': {'count': 5}, '267': {'count': 5}, '268': {'count': 3}, '269': {'count': 2}, '270': {'count': 3}, '271': {'count': 4}, '272': {'count': 4}, '273': {'count': 5}, '274': {'count': 5}, '275': {'count': 5}, '276': {'count': 2}, '277': {'count': 3}, '278': {'count': 5}, '279': {'count': 5}, '280': {'count': 4}, '281': {'count': 5}, '282': {'count': 5}, '283': {'count': 3}, '284': {'count': 5}, '285': {'count': 3}, '286': {'count': 5}, '287': {'count': 5}, '288': {'count': 4}, '289': {'count': 4}, '290': {'count': 5}, '291': {'count': 3}, '292': {'count': 2}, '293': {'count': 1}, '294': {'count': 1}, '295': {'count': 2}, '296': {'count': 4}, '297': {'count': 5}, '298': {'count': 4}, '300': {'count': 3}, '301': {'count': 3}, '303': {'count': 4}, '304': {'count': 4}, '305': {'count': 4}, '306': {'count': 2}, '307': {'count': 5}, '308': {'count': 4}, '309': {'count': 2}, '310': {'count': 3}, '311': {'count': 3}, '312': {'count': 4}, '313': {'count': 3}, '314': {'count': 3}, '315': {'count': 3}, '316': {'count': 5}, '317': {'count': 4}, '318': {'count': 5}, '319': {'count': 4}, '320': {'count': 4}, '321': {'count': 3}, '322': {'count': 5}, '323': {'count': 4}, '324': {'count': 2}, '325': {'count': 1}, '326': {'count': 3}, '327': {'count': 4}, '328': {'count': 3}, '330': {'count': 4}, '331': {'count': 4}, '332': {'count': 2}, '333': {'count': 5}, '334': {'count': 5}, '335': {'count': 5}, '336': {'count': 4}, '337': {'count': 4}, '338': {'count': 5}, '339': {'count': 3}, '340': {'count': 5}, '341': {'count': 5}, '342': {'count': 5}, '343': {'count': 2}, '344': {'count': 2}, '345': {'count': 3}, '346': {'count': 3}, '347': {'count': 5}, '348': {'count': 3}, '349': {'count': 2}, 
'350': {'count': 4}, '352': {'count': 5}, '353': {'count': 3}, '354': {'count': 5}, '355': {'count': 5}, '356': {'count': 4}, '357': {'count': 3}, '358': {'count': 3}, '359': {'count': 4}, '360': {'count': 5}, '361': {'count': 5}, '362': {'count': 4}, '363': {'count': 3}, '364': {'count': 4}, '365': {'count': 1}, '366': {'count': 4}, '367': {'count': 3}, '368': {'count': 4}, '369': {'count': 3}, '370': {'count': 5}, '371': {'count': 3}, '372': {'count': 5}, '373': {'count': 4}, '374': {'count': 4}, '375': {'count': 3}, '376': {'count': 4}, '377': {'count': 4}, '378': {'count': 4}, '379': {'count': 4}, '380': {'count': 4}, '381': {'count': 4}, '382': {'count': 1}, '383': {'count': 4}, '384': {'count': 4}, '385': {'count': 4}, '386': {'count': 2}, '387': {'count': 4}, '388': {'count': 2}, '389': {'count': 5}, '390': {'count': 4}, '391': {'count': 5}, '392': {'count': 4}, '394': {'count': 4}, '395': {'count': 4}, '396': {'count': 4}, '397': {'count': 4}, '398': {'count': 5}, '399': {'count': 4}, '400': {'count': 5}, '401': {'count': 4}, '402': {'count': 4}, '404': {'count': 5}, '405': {'count': 5}, '406': {'count': 5}, '407': {'count': 4}, '408': {'count': 2}, '409': {'count': 4}, '410': {'count': 3}, '411': {'count': 5}, '412': {'count': 4}, '413': {'count': 3}, '414': {'count': 4}, '415': {'count': 4}, '416': {'count': 4}, '417': {'count': 5}, '418': {'count': 3}, '419': {'count': 5}, '421': {'count': 4}, '422': {'count': 3}, '423': {'count': 5}, '424': {'count': 5}, '425': {'count': 2}, '426': {'count': 5}, '427': {'count': 4}, '428': {'count': 5}, '429': {'count': 3}, '430': {'count': 2}, '431': {'count': 3}, '432': {'count': 5}, '433': {'count': 4}, '434': {'count': 3}, '435': {'count': 3}, '437': {'count': 3}, '438': {'count': 5}, '439': {'count': 2}, '440': {'count': 4}, '441': {'count': 4}, '442': {'count': 5}, '443': {'count': 2}, '444': {'count': 3}, '445': {'count': 3}, '446': {'count': 5}, '447': {'count': 3}, '448': {'count': 2}, '449': {'count': 1}, 
'450': {'count': 3}, '451': {'count': 3}, '452': {'count': 4}, '453': {'count': 2}, '454': {'count': 4}, '455': {'count': 4}, '456': {'count': 5}, '458': {'count': 4}, '459': {'count': 4}, '460': {'count': 5}, '461': {'count': 4}, '462': {'count': 4}, '463': {'count': 5}, '464': {'count': 5}, '466': {'count': 2}, '467': {'count': 4}, '468': {'count': 3}, '469': {'count': 5}, '470': {'count': 5}, '471': {'count': 2}, '472': {'count': 4}, '473': {'count': 3}, '474': {'count': 5}, '475': {'count': 5}, '476': {'count': 5}, '477': {'count': 4}, '478': {'count': 2}, '479': {'count': 4}, '480': {'count': 4}, '481': {'count': 5}, '482': {'count': 4}, '483': {'count': 3}, '484': {'count': 5}, '485': {'count': 5}, '486': {'count': 4}, '487': {'count': 3}, '488': {'count': 3}, '489': {'count': 1}, '490': {'count': 1}, '491': {'count': 2}, '492': {'count': 4}, '493': {'count': 4}, '494': {'count': 3}, '495': {'count': 4}, '496': {'count': 5}, '497': {'count': 5}, '498': {'count': 5}, '499': {'count': 4}, '79': {'count': 4}, '106': {'count': 4}, '107': {'count': 4}, '202': {'count': 1}, '203': {'count': 1}, '457': {'count': 3}, '77': {'count': 2}, '78': {'count': 4}, '182': {'count': 2}, '263': {'count': 4}, '104': {'count': 1}, '158': {'count': 5}, '329': {'count': 1}, '393': {'count': 2}, '420': {'count': 2}}}} | | [BirdsnapZeroShot](https://openaccess.thecvf.com/content_cvpr_2014/html/Berg_Birdsnap_Large-scale_Fine-grained_2014_CVPR_paper.html) (Berg et al., 2014) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | | [BlurbsClusteringP2P.v2](https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html) (Steffen Remus, 2019) | ['deu'] | Clustering | p2p | [Fiction, Written] | None | None | | [BlurbsClusteringS2S.v2](https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html) (Steffen Remus, 2019) | ['deu'] | Clustering | s2s | [Fiction, Written] | None | None | @@ -78,8 +78,8 @@ The following tables give you 
an overview of the tasks in MTEB. | [CDSC-R](https://aclanthology.org/P17-1073.pdf) | ['pol'] | STS | s2s | [Web, Written] | None | None | | [CEDRClassification](https://www.sciencedirect.com/science/article/pii/S1877050921013247) (Sboev et al., 2021) | ['rus'] | MultilabelClassification | s2s | [Blog, Social, Web, Written] | {'test': 1882, 'train': 7528} | {'test': {'num_samples': 1882, 'number_of_characters': 171649, 'number_texts_in_train': 7, 'min_text_length': 6, 'average_text_length': 91.21, 'max_text_length': 220, 'unique_texts': 1875, 'min_labels_per_text': 0, 'average_label_per_text': 0.62, 'max_labels_per_text': 2, 'unique_labels': 6, 'labels': {'None': {'count': 734}, '3': {'count': 141}, '2': {'count': 170}, '1': {'count': 379}, '0': {'count': 353}, '4': {'count': 125}}}, 'train': {'num_samples': 7528, 'number_of_characters': 697322, 'number_texts_in_train': None, 'min_text_length': 5, 'average_text_length': 92.63, 'max_text_length': 280, 'unique_texts': 7500, 'min_labels_per_text': 0, 'average_label_per_text': 0.61, 'max_labels_per_text': 3, 'unique_labels': 6, 'labels': {'None': {'count': 3043}, '2': {'count': 607}, '0': {'count': 1569}, '3': {'count': 589}, '1': {'count': 1417}, '4': {'count': 411}}}} | | [CExaPPC](https://github.com/exaco/exappc) | ['fas'] | PairClassification | s2s | [Social, Web] | None | None | -| [CIFAR10](https://huggingface.co/datasets/uoft-cs/cifar10) (Alex Krizhevsky, 2009) | ['eng'] | ImageClassification | i2i | [Web] | None | None | -| [CIFAR100](https://huggingface.co/datasets/uoft-cs/cifar100) (Alex Krizhevsky, 2009) | ['eng'] | ImageClassification | i2t | [Web] | None | None | +| [CIFAR10](https://huggingface.co/datasets/uoft-cs/cifar10) (Alex Krizhevsky, 2009) | ['eng'] | ImageClassification | i2i | [Web] | {'test': 10000} | {'test': {'num_samples': 10000, 'unique_num_labels': 10, 'min_image_width': 32, 'average_image_width': 32.0, 'max_image_width': 32, 'min_image_height': 32, 'average_image_height': 32.0, 
'max_image_height': 32, 'labels': {'3': {'count': 1000}, '8': {'count': 1000}, '0': {'count': 1000}, '6': {'count': 1000}, '1': {'count': 1000}, '9': {'count': 1000}, '5': {'count': 1000}, '7': {'count': 1000}, '4': {'count': 1000}, '2': {'count': 1000}}}} | +| [CIFAR100](https://huggingface.co/datasets/uoft-cs/cifar100) (Alex Krizhevsky, 2009) | ['eng'] | ImageClassification | i2t | [Web] | {'test': 10000} | {'test': {'num_samples': 10000, 'unique_num_labels': 100, 'min_image_width': 32, 'average_image_width': 32.0, 'max_image_width': 32, 'min_image_height': 32, 'average_image_height': 32.0, 'max_image_height': 32, 'labels': {'49': {'count': 100}, '33': {'count': 100}, '72': {'count': 100}, '51': {'count': 100}, '71': {'count': 100}, '92': {'count': 100}, '15': {'count': 100}, '14': {'count': 100}, '23': {'count': 100}, '0': {'count': 100}, '75': {'count': 100}, '81': {'count': 100}, '69': {'count': 100}, '40': {'count': 100}, '43': {'count': 100}, '97': {'count': 100}, '70': {'count': 100}, '53': {'count': 100}, '29': {'count': 100}, '21': {'count': 100}, '16': {'count': 100}, '39': {'count': 100}, '8': {'count': 100}, '20': {'count': 100}, '61': {'count': 100}, '41': {'count': 100}, '93': {'count': 100}, '56': {'count': 100}, '73': {'count': 100}, '58': {'count': 100}, '11': {'count': 100}, '25': {'count': 100}, '37': {'count': 100}, '63': {'count': 100}, '24': {'count': 100}, '22': {'count': 100}, '17': {'count': 100}, '4': {'count': 100}, '6': {'count': 100}, '9': {'count': 100}, '57': {'count': 100}, '2': {'count': 100}, '32': {'count': 100}, '52': {'count': 100}, '42': {'count': 100}, '77': {'count': 100}, '27': {'count': 100}, '65': {'count': 100}, '7': {'count': 100}, '35': {'count': 100}, '82': {'count': 100}, '66': {'count': 100}, '90': {'count': 100}, '67': {'count': 100}, '91': {'count': 100}, '10': {'count': 100}, '78': {'count': 100}, '54': {'count': 100}, '89': {'count': 100}, '18': {'count': 100}, '13': {'count': 100}, '50': {'count': 100}, '26': 
{'count': 100}, '83': {'count': 100}, '47': {'count': 100}, '95': {'count': 100}, '76': {'count': 100}, '59': {'count': 100}, '85': {'count': 100}, '19': {'count': 100}, '46': {'count': 100}, '1': {'count': 100}, '74': {'count': 100}, '60': {'count': 100}, '64': {'count': 100}, '45': {'count': 100}, '36': {'count': 100}, '87': {'count': 100}, '30': {'count': 100}, '99': {'count': 100}, '80': {'count': 100}, '28': {'count': 100}, '98': {'count': 100}, '12': {'count': 100}, '94': {'count': 100}, '68': {'count': 100}, '44': {'count': 100}, '31': {'count': 100}, '79': {'count': 100}, '34': {'count': 100}, '55': {'count': 100}, '62': {'count': 100}, '96': {'count': 100}, '84': {'count': 100}, '38': {'count': 100}, '86': {'count': 100}, '5': {'count': 100}, '48': {'count': 100}, '3': {'count': 100}, '88': {'count': 100}}}} | | [CIFAR100Clustering](https://huggingface.co/datasets/uoft-cs/cifar100) (Alex Krizhevsky, 2009) | ['eng'] | ImageClustering | i2t | [Web] | {'test': 10000} | {'test': {'num_samples': 10000, 'unique_num_labels': 100, 'min_image_width': 32, 'average_image_width': 32.0, 'max_image_width': 32, 'min_image_height': 32, 'average_image_height': 32.0, 'max_image_height': 32, 'labels': {'49': {'count': 100}, '33': {'count': 100}, '72': {'count': 100}, '51': {'count': 100}, '71': {'count': 100}, '92': {'count': 100}, '15': {'count': 100}, '14': {'count': 100}, '23': {'count': 100}, '0': {'count': 100}, '75': {'count': 100}, '81': {'count': 100}, '69': {'count': 100}, '40': {'count': 100}, '43': {'count': 100}, '97': {'count': 100}, '70': {'count': 100}, '53': {'count': 100}, '29': {'count': 100}, '21': {'count': 100}, '16': {'count': 100}, '39': {'count': 100}, '8': {'count': 100}, '20': {'count': 100}, '61': {'count': 100}, '41': {'count': 100}, '93': {'count': 100}, '56': {'count': 100}, '73': {'count': 100}, '58': {'count': 100}, '11': {'count': 100}, '25': {'count': 100}, '37': {'count': 100}, '63': {'count': 100}, '24': {'count': 100}, '22': {'count': 
100}, '17': {'count': 100}, '4': {'count': 100}, '6': {'count': 100}, '9': {'count': 100}, '57': {'count': 100}, '2': {'count': 100}, '32': {'count': 100}, '52': {'count': 100}, '42': {'count': 100}, '77': {'count': 100}, '27': {'count': 100}, '65': {'count': 100}, '7': {'count': 100}, '35': {'count': 100}, '82': {'count': 100}, '66': {'count': 100}, '90': {'count': 100}, '67': {'count': 100}, '91': {'count': 100}, '10': {'count': 100}, '78': {'count': 100}, '54': {'count': 100}, '89': {'count': 100}, '18': {'count': 100}, '13': {'count': 100}, '50': {'count': 100}, '26': {'count': 100}, '83': {'count': 100}, '47': {'count': 100}, '95': {'count': 100}, '76': {'count': 100}, '59': {'count': 100}, '85': {'count': 100}, '19': {'count': 100}, '46': {'count': 100}, '1': {'count': 100}, '74': {'count': 100}, '60': {'count': 100}, '64': {'count': 100}, '45': {'count': 100}, '36': {'count': 100}, '87': {'count': 100}, '30': {'count': 100}, '99': {'count': 100}, '80': {'count': 100}, '28': {'count': 100}, '98': {'count': 100}, '12': {'count': 100}, '94': {'count': 100}, '68': {'count': 100}, '44': {'count': 100}, '31': {'count': 100}, '79': {'count': 100}, '34': {'count': 100}, '55': {'count': 100}, '62': {'count': 100}, '96': {'count': 100}, '84': {'count': 100}, '38': {'count': 100}, '86': {'count': 100}, '5': {'count': 100}, '48': {'count': 100}, '3': {'count': 100}, '88': {'count': 100}}}} | | [CIFAR100ZeroShot](https://huggingface.co/datasets/uoft-cs/cifar100) (Alex Krizhevsky, 2009) | ['eng'] | ZeroShotClassification | i2t | [Web] | None | None | | [CIFAR10Clustering](https://huggingface.co/datasets/uoft-cs/cifar10) (Alex Krizhevsky, 2009) | ['eng'] | ImageClustering | i2i | [Web] | {'test': 10000} | {'test': {'num_samples': 10000, 'unique_num_labels': 10, 'min_image_width': 32, 'average_image_width': 32.0, 'max_image_width': 32, 'min_image_height': 32, 'average_image_height': 32.0, 'max_image_height': 32, 'labels': {'3': {'count': 1000}, '8': {'count': 1000}, '0': 
{'count': 1000}, '6': {'count': 1000}, '1': {'count': 1000}, '9': {'count': 1000}, '5': {'count': 1000}, '7': {'count': 1000}, '4': {'count': 1000}, '2': {'count': 1000}}}} | @@ -179,7 +179,7 @@ The following tables give you an overview of the tasks in MTEB. | [CVBenchDepth](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | Any2TextMutipleChoice | it2t | [Academic] | None | None | | [CVBenchDistance](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | Any2TextMutipleChoice | it2t | [Academic] | None | None | | [CVBenchRelation](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | Any2TextMutipleChoice | it2t | [Academic] | None | None | -| [Caltech101](https://ieeexplore.ieee.org/document/1384978) (Li Fei-Fei, 2004) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [Caltech101](https://ieeexplore.ieee.org/document/1384978) (Li Fei-Fei, 2004) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 6084} | {'test': {'num_samples': 6084, 'unique_num_labels': 102, 'min_image_width': 80, 'average_image_width': 311.72, 'max_image_width': 3481, 'min_image_height': 101, 'average_image_height': 241.84, 'max_image_height': 3999, 'labels': {'4': {'count': 437}, '37': {'count': 405}, '38': {'count': 405}, '57': {'count': 170}, '66': {'count': 768}, '0': {'count': 25}, '1': {'count': 770}, '2': {'count': 12}, '3': {'count': 12}, '5': {'count': 17}, '6': {'count': 24}, '7': {'count': 16}, '8': {'count': 3}, '9': {'count': 98}, '10': {'count': 68}, '11': {'count': 13}, '12': {'count': 55}, '13': {'count': 61}, '14': {'count': 20}, '15': {'count': 13}, '16': {'count': 93}, '17': {'count': 17}, '18': {'count': 29}, '19': {'count': 32}, '20': {'count': 77}, '22': {'count': 39}, '23': {'count': 43}, '24': {'count': 40}, '25': {'count': 20}, '26': {'count': 21}, '27': {'count': 27}, '28': {'count': 37}, '29': {'count': 22}, '30': {'count': 35}, '31': {'count': 38}, '32': {'count': 45}, '33': {'count': 34}, 
'34': {'count': 23}, '35': {'count': 34}, '36': {'count': 55}, '39': {'count': 37}, '40': {'count': 37}, '41': {'count': 15}, '42': {'count': 4}, '43': {'count': 4}, '44': {'count': 21}, '45': {'count': 69}, '46': {'count': 70}, '47': {'count': 12}, '48': {'count': 24}, '49': {'count': 58}, '50': {'count': 50}, '51': {'count': 1}, '52': {'count': 34}, '53': {'count': 56}, '54': {'count': 84}, '55': {'count': 31}, '56': {'count': 51}, '58': {'count': 48}, '59': {'count': 11}, '60': {'count': 36}, '61': {'count': 13}, '62': {'count': 10}, '63': {'count': 57}, '64': {'count': 2}, '65': {'count': 46}, '67': {'count': 25}, '68': {'count': 5}, '69': {'count': 9}, '70': {'count': 17}, '71': {'count': 8}, '72': {'count': 15}, '73': {'count': 23}, '74': {'count': 4}, '75': {'count': 27}, '76': {'count': 52}, '77': {'count': 29}, '78': {'count': 19}, '79': {'count': 10}, '80': {'count': 33}, '81': {'count': 9}, '82': {'count': 54}, '83': {'count': 27}, '84': {'count': 5}, '85': {'count': 34}, '86': {'count': 15}, '87': {'count': 56}, '88': {'count': 29}, '89': {'count': 34}, '90': {'count': 5}, '91': {'count': 55}, '92': {'count': 19}, '93': {'count': 56}, '94': {'count': 45}, '95': {'count': 209}, '96': {'count': 7}, '97': {'count': 29}, '98': {'count': 4}, '99': {'count': 26}, '100': {'count': 9}, '101': {'count': 30}, '21': {'count': 17}}}} | | [Caltech101ZeroShot](https://ieeexplore.ieee.org/document/1384978) (Li Fei-Fei, 2004) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | | [CanadaTaxCourtOutcomesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [CataloniaTweetClassification](https://aclanthology.org/2020.lrec-1.171/) | ['cat', 'spa'] | Classification | s2s | [Government, Social, Written] | None | None | @@ -220,7 +220,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [Core17InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionRetrieval | s2p | [News, Written] | {'test': 19919} | {'test': {'num_samples': 19919, 'num_docs': 19899, 'num_queries': 20, 'number_of_characters': 44450333, 'min_document_length': 7, 'average_document_length': 2233.03, 'max_document_length': 2959, 'unique_docs': 19143, 'min_query_length': 55, 'average_query_length': 109.75, 'max_query_length': 278, 'unique_queries': 20, 'min_instruction_length': 102, 'average_instruction_length': 295.55, 'max_instruction_length': 811, 'unique_instructions': 20, 'min_changed_instruction_length': 151, 'average_changed_instruction_length': 355.2, 'max_changed_instruction_length': 837, 'unique_changed_instructions': 20, 'min_average_relevant_docs_per_query': 4, 'average_relevant_docs_per_query': 32.7, 'max_average_relevant_docs_per_query': 55, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000}} | | [CorporateLobbyingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [CosQA](https://arxiv.org/abs/2105.13239) (Junjie Huang, 2021) | ['eng', 'python'] | Retrieval | p2p | [Programming, Written] | {'test': 21104} | {'test': {'number_of_characters': 5728450, 'num_samples': 21104, 'num_queries': 500, 'num_documents': 20604, 'min_document_length': 18, 'average_document_length': 0.89, 'max_document_length': 83, 'unique_documents': 20604, 'min_query_length': 88, 'average_query_length': 11420.09, 'max_query_length': 6396, 'unique_queries': 500, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 500}} | -| [Country211](https://huggingface.co/datasets/clip-benchmark/wds_country211) (Radford et al., 2021) | ['eng'] | ImageClassification | i2i | [Scene] | None | None | +| 
[Country211](https://huggingface.co/datasets/clip-benchmark/wds_country211) (Radford et al., 2021) | ['eng'] | ImageClassification | i2i | [Scene] | {'test': 21100} | {'test': {'num_samples': 21100, 'unique_num_labels': 211, 'min_image_width': 32, 'average_image_width': 468.59, 'max_image_width': 500, 'min_image_height': 37, 'average_image_height': 381.73, 'max_image_height': 500, 'labels': {'0': {'count': 100}, '1': {'count': 100}, '2': {'count': 100}, '3': {'count': 100}, '4': {'count': 100}, '5': {'count': 100}, '6': {'count': 100}, '7': {'count': 100}, '8': {'count': 100}, '9': {'count': 100}, '10': {'count': 100}, '11': {'count': 100}, '12': {'count': 100}, '13': {'count': 100}, '14': {'count': 100}, '15': {'count': 100}, '16': {'count': 100}, '17': {'count': 100}, '18': {'count': 100}, '19': {'count': 100}, '20': {'count': 100}, '21': {'count': 100}, '22': {'count': 100}, '23': {'count': 100}, '24': {'count': 100}, '25': {'count': 100}, '26': {'count': 100}, '27': {'count': 100}, '28': {'count': 100}, '29': {'count': 100}, '30': {'count': 100}, '31': {'count': 100}, '32': {'count': 100}, '33': {'count': 100}, '34': {'count': 100}, '35': {'count': 100}, '36': {'count': 100}, '37': {'count': 100}, '38': {'count': 100}, '39': {'count': 100}, '40': {'count': 100}, '41': {'count': 100}, '42': {'count': 100}, '43': {'count': 100}, '44': {'count': 100}, '45': {'count': 100}, '46': {'count': 100}, '47': {'count': 100}, '48': {'count': 100}, '49': {'count': 100}, '50': {'count': 100}, '51': {'count': 100}, '52': {'count': 100}, '53': {'count': 100}, '54': {'count': 100}, '55': {'count': 100}, '56': {'count': 100}, '57': {'count': 100}, '58': {'count': 100}, '59': {'count': 100}, '60': {'count': 100}, '61': {'count': 100}, '62': {'count': 100}, '63': {'count': 100}, '64': {'count': 100}, '65': {'count': 100}, '66': {'count': 100}, '67': {'count': 100}, '68': {'count': 100}, '69': {'count': 100}, '70': {'count': 100}, '71': {'count': 100}, '72': {'count': 100}, '73': 
{'count': 100}, '74': {'count': 100}, '75': {'count': 100}, '76': {'count': 100}, '77': {'count': 100}, '78': {'count': 100}, '79': {'count': 100}, '80': {'count': 100}, '81': {'count': 100}, '82': {'count': 100}, '83': {'count': 100}, '84': {'count': 100}, '85': {'count': 100}, '86': {'count': 100}, '87': {'count': 100}, '88': {'count': 100}, '89': {'count': 100}, '90': {'count': 100}, '91': {'count': 100}, '92': {'count': 100}, '93': {'count': 100}, '94': {'count': 100}, '95': {'count': 100}, '96': {'count': 100}, '97': {'count': 100}, '98': {'count': 100}, '99': {'count': 100}, '100': {'count': 100}, '101': {'count': 100}, '102': {'count': 100}, '103': {'count': 100}, '104': {'count': 100}, '105': {'count': 100}, '106': {'count': 100}, '107': {'count': 100}, '108': {'count': 100}, '109': {'count': 100}, '110': {'count': 100}, '111': {'count': 100}, '112': {'count': 100}, '113': {'count': 100}, '114': {'count': 100}, '115': {'count': 100}, '116': {'count': 100}, '117': {'count': 100}, '118': {'count': 100}, '119': {'count': 100}, '120': {'count': 100}, '121': {'count': 100}, '122': {'count': 100}, '123': {'count': 100}, '124': {'count': 100}, '125': {'count': 100}, '126': {'count': 100}, '127': {'count': 100}, '128': {'count': 100}, '129': {'count': 100}, '130': {'count': 100}, '131': {'count': 100}, '132': {'count': 100}, '133': {'count': 100}, '134': {'count': 100}, '135': {'count': 100}, '136': {'count': 100}, '137': {'count': 100}, '138': {'count': 100}, '139': {'count': 100}, '140': {'count': 100}, '141': {'count': 100}, '142': {'count': 100}, '143': {'count': 100}, '144': {'count': 100}, '145': {'count': 100}, '146': {'count': 100}, '147': {'count': 100}, '148': {'count': 100}, '149': {'count': 100}, '150': {'count': 100}, '151': {'count': 100}, '152': {'count': 100}, '153': {'count': 100}, '154': {'count': 100}, '155': {'count': 100}, '156': {'count': 100}, '157': {'count': 100}, '158': {'count': 100}, '159': {'count': 100}, '160': {'count': 100}, '161': 
{'count': 100}, '162': {'count': 100}, '163': {'count': 100}, '164': {'count': 100}, '165': {'count': 100}, '166': {'count': 100}, '167': {'count': 100}, '168': {'count': 100}, '169': {'count': 100}, '170': {'count': 100}, '171': {'count': 100}, '172': {'count': 100}, '173': {'count': 100}, '174': {'count': 100}, '175': {'count': 100}, '176': {'count': 100}, '177': {'count': 100}, '178': {'count': 100}, '179': {'count': 100}, '180': {'count': 100}, '181': {'count': 100}, '182': {'count': 100}, '183': {'count': 100}, '184': {'count': 100}, '185': {'count': 100}, '186': {'count': 100}, '187': {'count': 100}, '188': {'count': 100}, '189': {'count': 100}, '190': {'count': 100}, '191': {'count': 100}, '192': {'count': 100}, '193': {'count': 100}, '194': {'count': 100}, '195': {'count': 100}, '196': {'count': 100}, '197': {'count': 100}, '198': {'count': 100}, '199': {'count': 100}, '200': {'count': 100}, '201': {'count': 100}, '202': {'count': 100}, '203': {'count': 100}, '204': {'count': 100}, '205': {'count': 100}, '206': {'count': 100}, '207': {'count': 100}, '208': {'count': 100}, '209': {'count': 100}, '210': {'count': 100}}}} | | [Country211ZeroShot](https://huggingface.co/datasets/clip-benchmark/wds_country211) (Radford et al., 2021) | ['eng'] | ZeroShotClassification | i2t | [Scene] | None | None | | [CovidRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | None | | [CrossLingualSemanticDiscriminationWMT19](https://huggingface.co/datasets/Andrianos/clsd_wmt19_21) | ['deu', 'fra'] | Retrieval | s2s | [News, Written] | None | None | @@ -237,7 +237,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [DBPediaHardNegatives](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [DBpediaClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Encyclopaedic, Written] | None | None | | [DKHateClassification](https://aclanthology.org/2020.lrec-1.430/) | ['dan'] | Classification | s2s | [Social, Written] | None | None | -| [DTD](https://www.robots.ox.ac.uk/~vgg/data/dtd/) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [DTD](https://www.robots.ox.ac.uk/~vgg/data/dtd/) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 1880} | {'test': {'num_samples': 1880, 'unique_num_labels': 47, 'min_image_width': 300, 'average_image_width': 488.98, 'max_image_width': 900, 'min_image_height': 300, 'average_image_height': 447.5, 'max_image_height': 778, 'labels': {'0': {'count': 40}, '1': {'count': 40}, '10': {'count': 40}, '11': {'count': 40}, '12': {'count': 40}, '13': {'count': 40}, '14': {'count': 40}, '15': {'count': 40}, '16': {'count': 40}, '17': {'count': 40}, '18': {'count': 40}, '19': {'count': 40}, '2': {'count': 40}, '20': {'count': 40}, '21': {'count': 40}, '22': {'count': 40}, '23': {'count': 40}, '24': {'count': 40}, '25': {'count': 40}, '26': {'count': 40}, '27': {'count': 40}, '28': {'count': 40}, '29': {'count': 40}, '3': {'count': 40}, '30': {'count': 40}, '31': {'count': 40}, '32': {'count': 40}, '33': {'count': 40}, '34': {'count': 40}, '35': {'count': 40}, '36': {'count': 40}, '37': {'count': 40}, '38': {'count': 40}, '39': {'count': 40}, '4': {'count': 40}, '40': {'count': 40}, '41': {'count': 40}, '42': {'count': 40}, '43': {'count': 40}, '44': {'count': 40}, '45': {'count': 40}, '46': {'count': 40}, '5': {'count': 40}, '6': {'count': 40}, '7': {'count': 40}, '8': {'count': 40}, '9': {'count': 40}}}} | | [DTDZeroShot](https://www.robots.ox.ac.uk/~vgg/data/dtd/) | ['eng'] | 
ZeroShotClassification | i2t | [Encyclopaedic] | None | None | | [DalajClassification](https://spraakbanken.gu.se/en/resources/superlim) | ['swe'] | Classification | s2s | [Non-fiction, Written] | None | None | | [DanFeverRetrieval](https://aclanthology.org/2021.nodalida-main.47/) | ['dan'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Spoken] | None | None | @@ -263,14 +263,14 @@ The following tables give you an overview of the tasks in MTEB. | [EncyclopediaVQAIT2ITRetrieval](https://github.com/google-research/google-research/tree/master/encyclopedic_vqa) (Mensink et al., 2023) | ['eng'] | Any2AnyRetrieval | it2it | [Encyclopaedic] | None | None | | [EstQA](https://www.semanticscholar.org/paper/Extractive-Question-Answering-for-Estonian-Language-182912IAPM-Alum%C3%A4e/ea4f60ab36cadca059c880678bc4c51e293a85d6?utm_source=direct_link) | ['est'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [EstonianValenceClassification](https://figshare.com/articles/dataset/Estonian_Valence_Corpus_Eesti_valentsikorpus/24517054) | ['est'] | Classification | s2s | [News, Written] | None | None | -| [EuroSAT](https://ieeexplore.ieee.org/document/8736785) (Helber et al., 2019) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [EuroSAT](https://ieeexplore.ieee.org/document/8736785) (Helber et al., 2019) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 5400} | {'test': {'num_samples': 5400, 'unique_num_labels': 10, 'min_image_width': 64, 'average_image_width': 64.0, 'max_image_width': 64, 'min_image_height': 64, 'average_image_height': 64.0, 'max_image_height': 64, 'labels': {'4': {'count': 501}, '3': {'count': 496}, '7': {'count': 554}, '2': {'count': 573}, '9': {'count': 609}, '0': {'count': 596}, '8': {'count': 529}, '1': {'count': 608}, '5': {'count': 396}, '6': {'count': 538}}}} | | [EuroSATZeroShot](https://ieeexplore.ieee.org/document/8736785) (Helber et al., 2019) | ['eng'] | ZeroShotClassification | i2t | 
[Encyclopaedic] | None | None | -| [FER2013](https://arxiv.org/abs/1412.6572) (Ian J. Goodfellow, 2015) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [FER2013](https://arxiv.org/abs/1412.6572) (Ian J. Goodfellow, 2015) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 7178} | {'test': {'num_samples': 7178, 'unique_num_labels': 7, 'min_image_width': 48, 'average_image_width': 48.0, 'max_image_width': 48, 'min_image_height': 48, 'average_image_height': 48.0, 'max_image_height': 48, 'labels': {'0': {'count': 958}, '1': {'count': 111}, '2': {'count': 1024}, '3': {'count': 1774}, '4': {'count': 1233}, '5': {'count': 1247}, '6': {'count': 831}}}} | | [FER2013ZeroShot](https://arxiv.org/abs/1412.6572) (Ian J. Goodfellow, 2015) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | | [FEVER](https://fever.ai/) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [FEVER-NL](https://huggingface.co/datasets/clips/beir-nl-fever) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [FEVERHardNegatives](https://fever.ai/) | ['eng'] | Retrieval | s2p | | None | None | -| [FGVCAircraft](https://arxiv.org/abs/1306.5151) (Subhransu Maji, 2013) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [FGVCAircraft](https://arxiv.org/abs/1306.5151) (Subhransu Maji, 2013) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 3333} | {'test': {'num_samples': 3333, 'unique_num_labels': 100, 'min_image_width': 800, 'average_image_width': 1098.58, 'max_image_width': 1600, 'min_image_height': 413, 'average_image_height': 747.0, 'max_image_height': 1197, 'labels': {'0': {'count': 33}, '1': {'count': 33}, '2': {'count': 34}, '3': {'count': 33}, '4': {'count': 33}, '5': {'count': 34}, '6': {'count': 33}, '7': {'count': 33}, '8': {'count': 34}, '9': {'count': 33}, '10': {'count': 33}, '11': {'count': 34}, '12': {'count': 33}, '13': 
{'count': 33}, '14': {'count': 34}, '15': {'count': 33}, '16': {'count': 33}, '17': {'count': 34}, '18': {'count': 33}, '19': {'count': 33}, '20': {'count': 34}, '21': {'count': 33}, '22': {'count': 33}, '23': {'count': 34}, '24': {'count': 33}, '25': {'count': 33}, '26': {'count': 34}, '27': {'count': 33}, '28': {'count': 33}, '29': {'count': 34}, '30': {'count': 33}, '31': {'count': 33}, '32': {'count': 34}, '33': {'count': 33}, '34': {'count': 33}, '35': {'count': 34}, '36': {'count': 33}, '37': {'count': 33}, '38': {'count': 34}, '39': {'count': 33}, '40': {'count': 33}, '41': {'count': 34}, '42': {'count': 33}, '43': {'count': 33}, '44': {'count': 34}, '45': {'count': 33}, '46': {'count': 33}, '47': {'count': 34}, '48': {'count': 33}, '49': {'count': 33}, '50': {'count': 34}, '51': {'count': 33}, '52': {'count': 33}, '53': {'count': 34}, '54': {'count': 33}, '55': {'count': 33}, '56': {'count': 34}, '57': {'count': 33}, '58': {'count': 33}, '59': {'count': 34}, '60': {'count': 33}, '61': {'count': 33}, '62': {'count': 34}, '63': {'count': 33}, '64': {'count': 33}, '65': {'count': 34}, '66': {'count': 33}, '67': {'count': 33}, '68': {'count': 34}, '69': {'count': 33}, '70': {'count': 33}, '71': {'count': 34}, '72': {'count': 33}, '73': {'count': 33}, '74': {'count': 34}, '75': {'count': 33}, '76': {'count': 33}, '77': {'count': 34}, '78': {'count': 33}, '79': {'count': 33}, '80': {'count': 34}, '81': {'count': 33}, '82': {'count': 33}, '83': {'count': 34}, '84': {'count': 33}, '85': {'count': 33}, '86': {'count': 34}, '87': {'count': 33}, '88': {'count': 33}, '89': {'count': 34}, '90': {'count': 33}, '91': {'count': 33}, '92': {'count': 34}, '93': {'count': 33}, '94': {'count': 33}, '95': {'count': 34}, '96': {'count': 33}, '97': {'count': 33}, '98': {'count': 34}, '99': {'count': 33}}}} | | [FGVCAircraftZeroShot](https://arxiv.org/abs/1306.5151) (Subhransu Maji, 2013) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | | 
[FORBI2IRetrieval](https://github.com/pxiangwu/FORB) (Pengxiang Wu, 2023) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | None | None | | [FQuADRetrieval](https://huggingface.co/datasets/manu/fquad2_test) | ['fra'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | @@ -296,7 +296,7 @@ The following tables give you an overview of the tasks in MTEB. | [Flickr30kI2TRetrieval](https://www.semanticscholar.org/paper/From-image-descriptions-to-visual-denotations%3A-New-Young-Lai/44040913380206991b1991daf1192942e038fe31) (Peter Young, 2014) | ['eng'] | Any2AnyRetrieval | i2t | [Web, Written] | None | None | | [Flickr30kT2IRetrieval](https://www.semanticscholar.org/paper/From-image-descriptions-to-visual-denotations%3A-New-Young-Lai/44040913380206991b1991daf1192942e038fe31) (Peter Young, 2014) | ['eng'] | Any2AnyRetrieval | t2i | [Web, Written] | None | None | | [FloresBitextMining](https://huggingface.co/datasets/facebook/flores) (Goyal et al., 2022) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 
'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | BitextMining | s2s | [Encyclopaedic, Non-fiction, Written] | None | None | -| [Food101Classification](https://huggingface.co/datasets/ethz/food101) (Bossard et al., 2014) | ['eng'] | ImageClassification | i2i | [Web] | None | None | +| [Food101Classification](https://huggingface.co/datasets/ethz/food101) (Bossard et al., 2014) | ['eng'] | ImageClassification | i2i | [Web] | {'validation': 25250} | {'validation': {'num_samples': 25250, 'unique_num_labels': 101, 'min_image_width': 287, 'average_image_width': 495.82, 'max_image_width': 512, 'min_image_height': 213, 'average_image_height': 475.08, 'max_image_height': 512, 'labels': {'6': {'count': 250}, '79': {'count': 250}, '81': {'count': 250}, '53': {'count': 250}, '10': {'count': 250}, '20': {'count': 250}, '77': {'count': 250}, '48': {'count': 250}, '86': {'count': 250}, '84': {'count': 250}, '76': {'count': 250}, '34': {'count': 250}, '51': {'count': 250}, '21': {'count': 250}, '64': {'count': 250}, '0': {'count': 250}, '43': {'count': 250}, '44': {'count': 250}, '73': {'count': 250}, '57': {'count': 250}, '14': {'count': 250}, '5': {'count': 250}, '46': {'count': 250}, '55': {'count': 250}, '93': {'count': 250}, '98': {'count': 250}, '38': {'count': 250}, '11': {'count': 250}, '99': {'count': 250}, '72': {'count': 250}, '22': {'count': 250}, '59': {'count': 250}, '70': {'count': 250}, '16': {'count': 250}, '2': {'count': 250}, '58': {'count': 250}, '83': {'count': 250}, '96': {'count': 250}, '39': {'count': 250}, '49': {'count': 250}, '45': {'count': 250}, '88': {'count': 250}, '9': {'count': 250}, '26': {'count': 250}, '94': {'count': 250}, '4': {'count': 250}, 
'65': {'count': 250}, '32': {'count': 250}, '27': {'count': 250}, '36': {'count': 250}, '87': {'count': 250}, '69': {'count': 250}, '85': {'count': 250}, '25': {'count': 250}, '40': {'count': 250}, '19': {'count': 250}, '35': {'count': 250}, '56': {'count': 250}, '42': {'count': 250}, '60': {'count': 250}, '68': {'count': 250}, '100': {'count': 250}, '41': {'count': 250}, '92': {'count': 250}, '24': {'count': 250}, '3': {'count': 250}, '89': {'count': 250}, '75': {'count': 250}, '17': {'count': 250}, '97': {'count': 250}, '61': {'count': 250}, '33': {'count': 250}, '80': {'count': 250}, '30': {'count': 250}, '8': {'count': 250}, '74': {'count': 250}, '66': {'count': 250}, '31': {'count': 250}, '18': {'count': 250}, '67': {'count': 250}, '37': {'count': 250}, '13': {'count': 250}, '63': {'count': 250}, '28': {'count': 250}, '47': {'count': 250}, '52': {'count': 250}, '54': {'count': 250}, '1': {'count': 250}, '82': {'count': 250}, '91': {'count': 250}, '95': {'count': 250}, '7': {'count': 250}, '29': {'count': 250}, '78': {'count': 250}, '15': {'count': 250}, '23': {'count': 250}, '12': {'count': 250}, '62': {'count': 250}, '50': {'count': 250}, '71': {'count': 250}, '90': {'count': 250}}}} | | [Food101ZeroShot](https://huggingface.co/datasets/ethz/food101) (Bossard et al., 2014) | ['eng'] | ZeroShotClassification | i2t | [Web] | None | None | | [FrenchBookReviews](https://huggingface.co/datasets/Abirate/french_book_reviews) | ['fra'] | Classification | s2s | [Reviews, Written] | None | None | | [FrenkEnClassification](https://arxiv.org/abs/1906.02045) (Nikola Ljubešić, 2019) | ['eng'] | Classification | s2s | [Social, Written] | None | None | @@ -306,7 +306,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [GLDv2I2IRetrieval](https://openaccess.thecvf.com/content_CVPR_2020/html/Weyand_Google_Landmarks_Dataset_v2_-_A_Large-Scale_Benchmark_for_Instance-Level_CVPR_2020_paper.html) (Weyand et al., 2020) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | None | None | | [GLDv2I2TRetrieval](https://openaccess.thecvf.com/content_CVPR_2020/html/Weyand_Google_Landmarks_Dataset_v2_-_A_Large-Scale_Benchmark_for_Instance-Level_CVPR_2020_paper.html) (Weyand et al., 2020) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | None | None | | [GPUSpeedTask](https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/blob/c8376f967d1294419be1d3eb41217d04cd3a65d3/src/seb/registered_tasks/speed.py#L83-L96) | ['eng'] | Speed | s2s | [Fiction, Written] | None | None | -| [GTSRB](https://benchmark.ini.rub.de/) (Stallkamp et al., 2011) | ['eng'] | ImageClassification | i2i | [Scene] | None | None | +| [GTSRB](https://benchmark.ini.rub.de/) (Stallkamp et al., 2011) | ['eng'] | ImageClassification | i2i | [Scene] | {'test': 12630} | {'test': {'num_samples': 12630, 'unique_num_labels': 43, 'min_image_width': 25, 'average_image_width': 50.51, 'max_image_width': 266, 'min_image_height': 25, 'average_image_height': 50.36, 'max_image_height': 232, 'labels': {'16': {'count': 150}, '1': {'count': 720}, '38': {'count': 690}, '33': {'count': 210}, '11': {'count': 420}, '18': {'count': 390}, '12': {'count': 690}, '25': {'count': 480}, '35': {'count': 390}, '7': {'count': 450}, '23': {'count': 150}, '4': {'count': 660}, '9': {'count': 480}, '21': {'count': 90}, '20': {'count': 90}, '27': {'count': 60}, '3': {'count': 450}, '13': {'count': 720}, '10': {'count': 660}, '5': {'count': 630}, '17': {'count': 360}, '34': {'count': 120}, '2': {'count': 750}, '8': {'count': 450}, '30': {'count': 150}, '24': {'count': 90}, '15': {'count': 210}, '26': {'count': 180}, '28': {'count': 150}, '22': {'count': 120}, '14': {'count': 270}, '32': {'count': 60}, '29': {'count': 90}, '6': {'count': 150}, 
'36': {'count': 120}, '40': {'count': 90}, '41': {'count': 60}, '31': {'count': 270}, '19': {'count': 60}, '0': {'count': 60}, '39': {'count': 90}, '42': {'count': 90}, '37': {'count': 60}}}} | | [GTSRBZeroShot](https://benchmark.ini.rub.de/) (Stallkamp et al., 2011) | ['eng'] | ZeroShotClassification | i2t | [Scene] | None | None | | [GeoreviewClassification](https://github.com/yandex/geo-reviews-dataset-2023) | ['rus'] | Classification | p2p | [Reviews, Written] | None | None | | [GeoreviewClusteringP2P](https://github.com/yandex/geo-reviews-dataset-2023) | ['rus'] | Clustering | p2p | [Reviews, Written] | None | None | @@ -545,8 +545,8 @@ The following tables give you an overview of the tasks in MTEB. | [OpusparcusPC](https://gem-benchmark.com/data_cards/opusparcus) (Mathias Creutz, 2018) | ['deu', 'eng', 'fin', 'fra', 'rus', 'swe'] | PairClassification | s2s | [Spoken, Spoken] | None | None | | [OralArgumentQuestionPurposeLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [OverrulingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [OxfordFlowersClassification](https://huggingface.co/datasets/nelorth/oxford-flowers/viewer/default/train) | ['eng'] | ImageClassification | i2i | [Reviews] | None | None | -| [OxfordPets](https://arxiv.org/abs/1306.5151) (Subhransu Maji, 2013) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [OxfordFlowersClassification](https://huggingface.co/datasets/nelorth/oxford-flowers/viewer/default/train) | ['eng'] | ImageClassification | i2i | [Reviews] | {'test': 1020} | {'test': {'num_samples': 1020, 'unique_num_labels': 102, 'min_image_width': 500, 'average_image_width': 618.07, 'max_image_width': 873, 'min_image_height': 500, 'average_image_height': 538.26, 'max_image_height': 928, 
'labels': {'0': {'count': 9}, '1': {'count': 9}, '2': {'count': 10}, '3': {'count': 9}, '4': {'count': 11}, '5': {'count': 11}, '6': {'count': 10}, '7': {'count': 10}, '8': {'count': 11}, '9': {'count': 10}, '10': {'count': 10}, '11': {'count': 9}, '12': {'count': 10}, '13': {'count': 10}, '14': {'count': 10}, '15': {'count': 9}, '16': {'count': 11}, '17': {'count': 11}, '18': {'count': 10}, '19': {'count': 9}, '20': {'count': 9}, '21': {'count': 10}, '22': {'count': 11}, '23': {'count': 11}, '24': {'count': 10}, '25': {'count': 11}, '26': {'count': 10}, '27': {'count': 9}, '28': {'count': 11}, '29': {'count': 10}, '30': {'count': 10}, '31': {'count': 9}, '32': {'count': 10}, '33': {'count': 10}, '34': {'count': 10}, '35': {'count': 11}, '36': {'count': 9}, '37': {'count': 10}, '38': {'count': 10}, '39': {'count': 11}, '40': {'count': 10}, '41': {'count': 10}, '42': {'count': 11}, '43': {'count': 10}, '44': {'count': 10}, '45': {'count': 10}, '46': {'count': 10}, '47': {'count': 9}, '48': {'count': 10}, '49': {'count': 11}, '50': {'count': 10}, '51': {'count': 10}, '52': {'count': 10}, '53': {'count': 10}, '54': {'count': 10}, '55': {'count': 10}, '56': {'count': 10}, '57': {'count': 11}, '58': {'count': 10}, '59': {'count': 10}, '60': {'count': 10}, '61': {'count': 10}, '62': {'count': 9}, '63': {'count': 10}, '64': {'count': 10}, '65': {'count': 9}, '66': {'count': 11}, '67': {'count': 10}, '68': {'count': 11}, '69': {'count': 9}, '70': {'count': 9}, '71': {'count': 10}, '72': {'count': 10}, '73': {'count': 10}, '74': {'count': 10}, '75': {'count': 10}, '76': {'count': 10}, '77': {'count': 10}, '78': {'count': 9}, '79': {'count': 10}, '80': {'count': 10}, '81': {'count': 11}, '82': {'count': 10}, '83': {'count': 9}, '84': {'count': 11}, '85': {'count': 10}, '86': {'count': 10}, '87': {'count': 10}, '88': {'count': 10}, '89': {'count': 10}, '90': {'count': 10}, '91': {'count': 10}, '92': {'count': 10}, '93': {'count': 9}, '94': {'count': 9}, '95': {'count': 10}, 
'96': {'count': 10}, '97': {'count': 10}, '98': {'count': 10}, '99': {'count': 10}, '100': {'count': 10}, '101': {'count': 11}}}} | +| [OxfordPets](https://arxiv.org/abs/1306.5151) (Subhransu Maji, 2013) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 3669} | {'test': {'num_samples': 3669, 'unique_num_labels': 37, 'min_image_width': 137, 'average_image_width': 443.46, 'max_image_width': 1646, 'min_image_height': 103, 'average_image_height': 399.38, 'max_image_height': 2160, 'labels': {'0': {'count': 98}, '1': {'count': 100}, '2': {'count': 100}, '3': {'count': 100}, '4': {'count': 100}, '5': {'count': 100}, '6': {'count': 100}, '7': {'count': 88}, '8': {'count': 99}, '9': {'count': 100}, '10': {'count': 100}, '11': {'count': 97}, '12': {'count': 100}, '13': {'count': 100}, '14': {'count': 100}, '15': {'count': 100}, '16': {'count': 100}, '17': {'count': 100}, '18': {'count': 99}, '19': {'count': 100}, '20': {'count': 100}, '21': {'count': 100}, '22': {'count': 100}, '23': {'count': 100}, '24': {'count': 100}, '25': {'count': 100}, '26': {'count': 100}, '27': {'count': 100}, '28': {'count': 100}, '29': {'count': 100}, '30': {'count': 99}, '31': {'count': 100}, '32': {'count': 100}, '33': {'count': 100}, '34': {'count': 89}, '35': {'count': 100}, '36': {'count': 100}}}} | | [OxfordPetsZeroShot](https://arxiv.org/abs/1306.5151) (Subhransu Maji, 2013) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | | [PAC](https://arxiv.org/pdf/2211.13112.pdf) (Łukasz Augustyniak, 2022) | ['pol'] | Classification | p2p | [Legal, Written] | None | None | | [PAWSX](https://aclanthology.org/2021.emnlp-main.357) (Shitao Xiao, 2024) | ['cmn'] | STS | s2s | | None | None | @@ -555,7 +555,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [PSC](http://www.lrec-conf.org/proceedings/lrec2014/pdf/1211_Paper.pdf) | ['pol'] | PairClassification | s2s | [News, Written] | None | None | | [ParsinluEntail](https://github.com/persiannlp/parsinlu) | ['fas'] | PairClassification | s2s | | None | None | | [ParsinluQueryParaphPC](https://huggingface.co/datasets/persiannlp/parsinlu_query_paraphrasing) | ['fas'] | PairClassification | s2s | | None | None | -| [PatchCamelyon](https://link.springer.com/chapter/10.1007/978-3-030-00934-2_24) | ['eng'] | ImageClassification | i2i | [Medical] | None | None | +| [PatchCamelyon](https://link.springer.com/chapter/10.1007/978-3-030-00934-2_24) | ['eng'] | ImageClassification | i2i | [Medical] | {'test': 32768} | {'test': {'num_samples': 32768, 'unique_num_labels': 2, 'min_image_width': 96, 'average_image_width': 96.0, 'max_image_width': 96, 'min_image_height': 96, 'average_image_height': 96.0, 'max_image_height': 96, 'labels': {'0': {'count': 16391}, '1': {'count': 16377}}}} | | [PatchCamelyonZeroShot](https://link.springer.com/chapter/10.1007/978-3-030-00934-2_24) | ['eng'] | ZeroShotClassification | i2t | [Medical] | None | None | | [PatentClassification](https://aclanthology.org/P19-1212.pdf) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [PawsXPairClassification](https://arxiv.org/abs/1908.11828) (Yinfei Yang, 2019) | ['cmn', 'deu', 'eng', 'fra', 'jpn', 'kor', 'spa'] | PairClassification | s2s | [Encyclopaedic, Web, Written] | {'test': 14000, 'validation': 14000} | {'test': {'num_samples': 14000, 'number_of_characters': 2551922, 'min_sentence1_length': 2, 'avg_sentence1_length': 91.18, 'max_sentence1_length': 268, 'unique_sentence1': 13404, 'min_sentence2_length': 2, 'avg_sentence2_length': 91.1, 'max_sentence2_length': 247, 'unique_sentence2': 13462, 'unique_labels': 2, 'labels': {'1': {'count': 6285}, '0': {'count': 7715}}, 'hf_subset_descriptive_stats': {'de': {'num_samples': 2000, 'number_of_characters': 478034, 'min_sentence1_length': 2, 
'avg_sentence1_length': 119.78, 'max_sentence1_length': 268, 'unique_sentence1': 1934, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.24, 'max_sentence2_length': 235, 'unique_sentence2': 1938, 'unique_labels': 2, 'labels': {'1': {'count': 895}, '0': {'count': 1105}}}, 'en': {'num_samples': 2000, 'number_of_characters': 454362, 'min_sentence1_length': 25, 'avg_sentence1_length': 113.76, 'max_sentence1_length': 209, 'unique_sentence1': 1761, 'min_sentence2_length': 25, 'avg_sentence2_length': 113.42, 'max_sentence2_length': 209, 'unique_sentence2': 1800, 'unique_labels': 2, 'labels': {'1': {'count': 907}, '0': {'count': 1093}}}, 'es': {'num_samples': 2000, 'number_of_characters': 471226, 'min_sentence1_length': 2, 'avg_sentence1_length': 117.81, 'max_sentence1_length': 226, 'unique_sentence1': 1955, 'min_sentence2_length': 22, 'avg_sentence2_length': 117.8, 'max_sentence2_length': 233, 'unique_sentence2': 1959, 'unique_labels': 2, 'labels': {'1': {'count': 907}, '0': {'count': 1093}}}, 'fr': {'num_samples': 2000, 'number_of_characters': 480033, 'min_sentence1_length': 2, 'avg_sentence1_length': 120.03, 'max_sentence1_length': 238, 'unique_sentence1': 1954, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.99, 'max_sentence2_length': 247, 'unique_sentence2': 1953, 'unique_labels': 2, 'labels': {'1': {'count': 903}, '0': {'count': 1097}}}, 'ja': {'num_samples': 2000, 'number_of_characters': 235106, 'min_sentence1_length': 2, 'avg_sentence1_length': 58.68, 'max_sentence1_length': 192, 'unique_sentence1': 1944, 'min_sentence2_length': 2, 'avg_sentence2_length': 58.88, 'max_sentence2_length': 198, 'unique_sentence2': 1941, 'unique_labels': 2, 'labels': {'1': {'count': 883}, '0': {'count': 1117}}}, 'ko': {'num_samples': 2000, 'number_of_characters': 260149, 'min_sentence1_length': 2, 'avg_sentence1_length': 64.96, 'max_sentence1_length': 153, 'unique_sentence1': 1954, 'min_sentence2_length': 2, 'avg_sentence2_length': 65.11, 'max_sentence2_length': 159, 
'unique_sentence2': 1969, 'unique_labels': 2, 'labels': {'1': {'count': 896}, '0': {'count': 1104}}}, 'zh': {'num_samples': 2000, 'number_of_characters': 173012, 'min_sentence1_length': 2, 'avg_sentence1_length': 43.23, 'max_sentence1_length': 120, 'unique_sentence1': 1909, 'min_sentence2_length': 2, 'avg_sentence2_length': 43.27, 'max_sentence2_length': 113, 'unique_sentence2': 1909, 'unique_labels': 2, 'labels': {'1': {'count': 894}, '0': {'count': 1106}}}}}, 'validation': {'num_samples': 14000, 'number_of_characters': 2524625, 'min_sentence1_length': 2, 'avg_sentence1_length': 90.13, 'max_sentence1_length': 248, 'unique_sentence1': 13357, 'min_sentence2_length': 2, 'avg_sentence2_length': 90.2, 'max_sentence2_length': 275, 'unique_sentence2': 13397, 'unique_labels': 2, 'labels': {'1': {'count': 5948}, '0': {'count': 8052}}, 'hf_subset_descriptive_stats': {'de': {'num_samples': 2000, 'number_of_characters': 467643, 'min_sentence1_length': 2, 'avg_sentence1_length': 116.82, 'max_sentence1_length': 248, 'unique_sentence1': 1914, 'min_sentence2_length': 2, 'avg_sentence2_length': 117.0, 'max_sentence2_length': 275, 'unique_sentence2': 1920, 'unique_labels': 2, 'labels': {'1': {'count': 831}, '0': {'count': 1169}}}, 'en': {'num_samples': 2000, 'number_of_characters': 451931, 'min_sentence1_length': 25, 'avg_sentence1_length': 113.11, 'max_sentence1_length': 213, 'unique_sentence1': 1758, 'min_sentence2_length': 25, 'avg_sentence2_length': 112.86, 'max_sentence2_length': 213, 'unique_sentence2': 1771, 'unique_labels': 2, 'labels': {'1': {'count': 863}, '0': {'count': 1137}}}, 'es': {'num_samples': 2000, 'number_of_characters': 466112, 'min_sentence1_length': 2, 'avg_sentence1_length': 116.33, 'max_sentence1_length': 240, 'unique_sentence1': 1938, 'min_sentence2_length': 2, 'avg_sentence2_length': 116.73, 'max_sentence2_length': 241, 'unique_sentence2': 1941, 'unique_labels': 2, 'labels': {'1': {'count': 847}, '0': {'count': 1153}}}, 'fr': {'num_samples': 2000, 
'number_of_characters': 478510, 'min_sentence1_length': 2, 'avg_sentence1_length': 119.5, 'max_sentence1_length': 233, 'unique_sentence1': 1933, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.75, 'max_sentence2_length': 246, 'unique_sentence2': 1939, 'unique_labels': 2, 'labels': {'1': {'count': 860}, '0': {'count': 1140}}}, 'ja': {'num_samples': 2000, 'number_of_characters': 229655, 'min_sentence1_length': 2, 'avg_sentence1_length': 57.51, 'max_sentence1_length': 126, 'unique_sentence1': 1957, 'min_sentence2_length': 2, 'avg_sentence2_length': 57.32, 'max_sentence2_length': 121, 'unique_sentence2': 1969, 'unique_labels': 2, 'labels': {'1': {'count': 854}, '0': {'count': 1146}}}, 'ko': {'num_samples': 2000, 'number_of_characters': 261355, 'min_sentence1_length': 2, 'avg_sentence1_length': 65.16, 'max_sentence1_length': 178, 'unique_sentence1': 1963, 'min_sentence2_length': 2, 'avg_sentence2_length': 65.52, 'max_sentence2_length': 174, 'unique_sentence2': 1968, 'unique_labels': 2, 'labels': {'1': {'count': 840}, '0': {'count': 1160}}}, 'zh': {'num_samples': 2000, 'number_of_characters': 169419, 'min_sentence1_length': 2, 'avg_sentence1_length': 42.45, 'max_sentence1_length': 101, 'unique_sentence1': 1899, 'min_sentence2_length': 2, 'avg_sentence2_length': 42.26, 'max_sentence2_length': 120, 'unique_sentence2': 1895, 'unique_labels': 2, 'labels': {'1': {'count': 853}, '0': {'count': 1147}}}}}} | @@ -590,7 +590,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [QuoraRetrievalHardNegatives](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | | None | None | | [RARbCode](https://arxiv.org/abs/2404.06347) (Xiao et al., 2024) | ['eng'] | Retrieval | s2p | [Programming, Written] | None | None | | [RARbMath](https://arxiv.org/abs/2404.06347) (Xiao et al., 2024) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | -| [RESISC45](https://ieeexplore.ieee.org/abstract/document/7891544) (Cheng et al., 2017) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [RESISC45](https://ieeexplore.ieee.org/abstract/document/7891544) (Cheng et al., 2017) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 6300} | {'test': {'num_samples': 6300, 'unique_num_labels': 45, 'min_image_width': 256, 'average_image_width': 256.0, 'max_image_width': 256, 'min_image_height': 256, 'average_image_height': 256.0, 'max_image_height': 256, 'labels': {'31': {'count': 135}, '11': {'count': 144}, '28': {'count': 135}, '43': {'count': 154}, '41': {'count': 144}, '33': {'count': 134}, '19': {'count': 130}, '16': {'count': 127}, '22': {'count': 130}, '34': {'count': 143}, '24': {'count': 164}, '0': {'count': 169}, '13': {'count': 146}, '25': {'count': 115}, '6': {'count': 132}, '36': {'count': 135}, '39': {'count': 142}, '18': {'count': 140}, '23': {'count': 147}, '37': {'count': 159}, '15': {'count': 122}, '29': {'count': 140}, '9': {'count': 159}, '27': {'count': 140}, '21': {'count': 131}, '3': {'count': 134}, '1': {'count': 162}, '32': {'count': 153}, '26': {'count': 150}, '35': {'count': 151}, '44': {'count': 118}, '30': {'count': 154}, '20': {'count': 139}, '4': {'count': 130}, '42': {'count': 127}, '40': {'count': 137}, '5': {'count': 140}, '17': {'count': 142}, '2': {'count': 123}, '38': {'count': 130}, '10': {'count': 140}, '12': {'count': 146}, '8': {'count': 146}, '7': {'count': 143}, '14': {'count': 118}}}} | | 
[RESISC45ZeroShot](https://ieeexplore.ieee.org/abstract/document/7891544) (Cheng et al., 2017) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | | [ROxfordEasyI2IMultiChoice](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyMultiChoice | i2i | [Web] | None | None | | [ROxfordEasyI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | None | None | @@ -662,7 +662,7 @@ The following tables give you an overview of the tasks in MTEB. | [SNLRetrieval](https://huggingface.co/datasets/navjordj/SNL_summarization) (Navjord et al., 2023) | ['nob'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Written] | None | None | | [SOPI2IRetrieval](https://paperswithcode.com/dataset/stanford-online-products) (Oh Song et al., 2016) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | None | None | | [SRNCorpusBitextMining](https://arxiv.org/abs/2212.06383) (Zwennicker et al., 2022) | ['nld', 'srn'] | BitextMining | s2s | [Social, Web, Written] | None | None | -| [STL10](https://cs.stanford.edu/~acoates/stl10/) (Coates et al., 2011) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [STL10](https://cs.stanford.edu/~acoates/stl10/) (Coates et al., 2011) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 8000} | {'test': {'num_samples': 8000, 'unique_num_labels': 10, 'min_image_width': 96, 'average_image_width': 96.0, 'max_image_width': 96, 'min_image_height': 96, 'average_image_height': 96.0, 'max_image_height': 96, 'labels': {'0': {'count': 800}, '1': {'count': 800}, '2': {'count': 800}, '3': {'count': 800}, '4': {'count': 800}, '5': {'count': 800}, '6': {'count': 800}, '7': {'count': 800}, '8': {'count': 800}, '9': {'count': 800}}}} | | [STL10ZeroShot](https://cs.stanford.edu/~acoates/stl10/) 
(Coates et al., 2011) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | | [STS12](https://www.aclweb.org/anthology/S12-1051.pdf) (Agirre et al., 2012) | ['eng'] | STS | s2s | [Encyclopaedic, News, Written] | {'test': 3108} | {'test': {'num_samples': 3108, 'number_of_characters': 402118, 'min_sentence1_length': 3, 'average_sentence1_len': 63.79, 'max_sentence1_length': 220, 'unique_sentence1': 2236, 'min_sentence2_length': 7, 'average_sentence2_len': 65.59, 'max_sentence2_length': 204, 'unique_sentence2': 2797, 'min_score': 0.0, 'avg_score': 3.51, 'max_score': 5.0}} | | [STS12VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS | i2i | [Encyclopaedic, News, Written] | {'test': 3108} | {'test': {'num_samples': 3108, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 3.51, 'max_score': 5.0}} | @@ -682,7 +682,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [STSBenchmarkMultilingualSTS](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['cmn', 'deu', 'eng', 'fra', 'ita', 'nld', 'pol', 'por', 'rus', 'spa'] | STS | s2s | [News, Social, Spoken, Web, Written] | None | None | | [STSBenchmarkMultilingualVisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['cmn', 'deu', 'eng', 'fra', 'ita', 'nld', 'pol', 'por', 'rus', 'spa'] | VisualSTS | i2i | [News, Social, Spoken, Web, Written] | {'dev': 15000, 'test': 13790} | {'dev': {'num_samples': 15000, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0, 'hf_subset_descriptive_stats': {'en': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'de': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'es': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 
'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'fr': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'it': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'nl': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'pl': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'pt': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 
'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'ru': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'zh': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}}}, 'test': {'num_samples': 13790, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0, 'hf_subset_descriptive_stats': {'en': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'de': {'num_samples': 1379, 
'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'es': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'fr': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'it': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'nl': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 
0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'pl': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'pt': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'ru': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'zh': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}}}} | | [STSES](https://huggingface.co/datasets/PlanTL-GOB-ES/sts-es) (Agirre et al., 2015) | ['spa'] | STS | s2s | [Written] | None | None | -| [SUN397](https://ieeexplore.ieee.org/abstract/document/5539970) (Xiao et al., 2010) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | 
+| [SUN397](https://ieeexplore.ieee.org/abstract/document/5539970) (Xiao et al., 2010) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 21750} | {'test': {'num_samples': 21750, 'unique_num_labels': 397, 'min_image_width': 125, 'average_image_width': 354.22, 'max_image_width': 696, 'min_image_height': 94, 'average_image_height': 291.17, 'max_image_height': 595, 'labels': {'227': {'count': 439}, '213': {'count': 335}, '53': {'count': 23}, '350': {'count': 40}, '73': {'count': 38}, '316': {'count': 63}, '177': {'count': 80}, '25': {'count': 39}, '275': {'count': 31}, '328': {'count': 33}, '263': {'count': 47}, '239': {'count': 26}, '41': {'count': 213}, '319': {'count': 51}, '91': {'count': 16}, '95': {'count': 183}, '396': {'count': 20}, '259': {'count': 36}, '107': {'count': 167}, '381': {'count': 164}, '174': {'count': 167}, '246': {'count': 44}, '67': {'count': 31}, '374': {'count': 28}, '354': {'count': 22}, '72': {'count': 100}, '97': {'count': 32}, '256': {'count': 57}, '247': {'count': 57}, '159': {'count': 49}, '270': {'count': 135}, '133': {'count': 215}, '197': {'count': 40}, '12': {'count': 38}, '2': {'count': 226}, '115': {'count': 75}, '200': {'count': 93}, '47': {'count': 103}, '9': {'count': 37}, '22': {'count': 76}, '255': {'count': 34}, '267': {'count': 22}, '244': {'count': 93}, '85': {'count': 115}, '342': {'count': 87}, '55': {'count': 50}, '7': {'count': 41}, '337': {'count': 99}, '38': {'count': 28}, '269': {'count': 69}, '106': {'count': 15}, '298': {'count': 27}, '361': {'count': 53}, '8': {'count': 108}, '166': {'count': 47}, '280': {'count': 51}, '35': {'count': 61}, '147': {'count': 82}, '214': {'count': 26}, '284': {'count': 28}, '286': {'count': 66}, '113': {'count': 67}, '83': {'count': 38}, '82': {'count': 236}, '365': {'count': 17}, '242': {'count': 116}, '186': {'count': 38}, '87': {'count': 111}, '274': {'count': 48}, '27': {'count': 95}, '283': {'count': 22}, '4': {'count': 76}, '334': {'count': 139}, '364': 
{'count': 21}, '48': {'count': 408}, '311': {'count': 41}, '101': {'count': 64}, '131': {'count': 55}, '172': {'count': 31}, '355': {'count': 28}, '308': {'count': 56}, '5': {'count': 47}, '318': {'count': 155}, '86': {'count': 87}, '46': {'count': 230}, '111': {'count': 69}, '88': {'count': 54}, '23': {'count': 47}, '70': {'count': 61}, '217': {'count': 34}, '11': {'count': 76}, '193': {'count': 207}, '0': {'count': 99}, '303': {'count': 23}, '324': {'count': 47}, '377': {'count': 19}, '345': {'count': 39}, '154': {'count': 49}, '393': {'count': 68}, '152': {'count': 58}, '317': {'count': 27}, '384': {'count': 46}, '257': {'count': 38}, '294': {'count': 47}, '145': {'count': 23}, '289': {'count': 33}, '375': {'count': 19}, '57': {'count': 42}, '15': {'count': 62}, '109': {'count': 24}, '139': {'count': 24}, '66': {'count': 26}, '340': {'count': 32}, '150': {'count': 41}, '118': {'count': 105}, '333': {'count': 27}, '126': {'count': 55}, '366': {'count': 116}, '358': {'count': 151}, '251': {'count': 37}, '309': {'count': 35}, '54': {'count': 20}, '327': {'count': 38}, '3': {'count': 60}, '21': {'count': 56}, '17': {'count': 62}, '146': {'count': 84}, '94': {'count': 42}, '243': {'count': 48}, '335': {'count': 85}, '245': {'count': 141}, '279': {'count': 187}, '360': {'count': 25}, '192': {'count': 105}, '49': {'count': 31}, '230': {'count': 81}, '357': {'count': 22}, '64': {'count': 72}, '112': {'count': 26}, '338': {'count': 70}, '216': {'count': 99}, '234': {'count': 183}, '300': {'count': 153}, '188': {'count': 48}, '254': {'count': 41}, '184': {'count': 183}, '373': {'count': 47}, '221': {'count': 86}, '84': {'count': 49}, '81': {'count': 119}, '161': {'count': 97}, '352': {'count': 21}, '105': {'count': 43}, '39': {'count': 59}, '383': {'count': 40}, '341': {'count': 56}, '63': {'count': 158}, '125': {'count': 29}, '302': {'count': 83}, '262': {'count': 40}, '392': {'count': 51}, '326': {'count': 173}, '228': {'count': 93}, '339': {'count': 25}, '80': 
{'count': 73}, '30': {'count': 42}, '264': {'count': 112}, '56': {'count': 94}, '321': {'count': 16}, '395': {'count': 52}, '68': {'count': 45}, '211': {'count': 45}, '44': {'count': 26}, '299': {'count': 21}, '220': {'count': 35}, '61': {'count': 20}, '138': {'count': 55}, '108': {'count': 111}, '10': {'count': 35}, '386': {'count': 28}, '297': {'count': 49}, '210': {'count': 36}, '175': {'count': 77}, '260': {'count': 68}, '391': {'count': 69}, '102': {'count': 77}, '26': {'count': 44}, '232': {'count': 54}, '6': {'count': 158}, '124': {'count': 43}, '14': {'count': 23}, '201': {'count': 39}, '168': {'count': 18}, '202': {'count': 26}, '140': {'count': 31}, '261': {'count': 60}, '104': {'count': 27}, '356': {'count': 22}, '34': {'count': 147}, '225': {'count': 111}, '60': {'count': 84}, '156': {'count': 35}, '237': {'count': 45}, '268': {'count': 87}, '310': {'count': 31}, '249': {'count': 73}, '281': {'count': 46}, '75': {'count': 89}, '77': {'count': 53}, '132': {'count': 45}, '235': {'count': 42}, '336': {'count': 84}, '123': {'count': 27}, '349': {'count': 90}, '180': {'count': 49}, '378': {'count': 17}, '332': {'count': 30}, '185': {'count': 29}, '389': {'count': 60}, '382': {'count': 77}, '198': {'count': 54}, '74': {'count': 48}, '231': {'count': 85}, '76': {'count': 54}, '151': {'count': 64}, '182': {'count': 17}, '209': {'count': 39}, '344': {'count': 37}, '204': {'count': 67}, '329': {'count': 23}, '380': {'count': 91}, '388': {'count': 32}, '116': {'count': 29}, '24': {'count': 103}, '199': {'count': 33}, '369': {'count': 14}, '359': {'count': 77}, '325': {'count': 39}, '323': {'count': 34}, '162': {'count': 35}, '33': {'count': 46}, '129': {'count': 21}, '287': {'count': 30}, '155': {'count': 24}, '170': {'count': 157}, '296': {'count': 40}, '110': {'count': 102}, '304': {'count': 21}, '164': {'count': 37}, '278': {'count': 23}, '71': {'count': 18}, '194': {'count': 24}, '136': {'count': 117}, '103': {'count': 134}, '330': {'count': 26}, '347': 
{'count': 26}, '206': {'count': 50}, '178': {'count': 43}, '362': {'count': 26}, '119': {'count': 111}, '208': {'count': 33}, '165': {'count': 44}, '90': {'count': 36}, '167': {'count': 40}, '187': {'count': 26}, '99': {'count': 50}, '390': {'count': 64}, '205': {'count': 16}, '65': {'count': 30}, '293': {'count': 23}, '223': {'count': 19}, '96': {'count': 31}, '305': {'count': 44}, '100': {'count': 57}, '385': {'count': 18}, '78': {'count': 42}, '59': {'count': 20}, '37': {'count': 59}, '219': {'count': 76}, '212': {'count': 28}, '1': {'count': 26}, '122': {'count': 35}, '92': {'count': 62}, '43': {'count': 39}, '196': {'count': 56}, '19': {'count': 25}, '128': {'count': 35}, '376': {'count': 77}, '313': {'count': 30}, '114': {'count': 54}, '121': {'count': 31}, '169': {'count': 62}, '331': {'count': 55}, '238': {'count': 16}, '179': {'count': 31}, '127': {'count': 31}, '370': {'count': 98}, '149': {'count': 47}, '346': {'count': 41}, '250': {'count': 22}, '276': {'count': 25}, '163': {'count': 43}, '18': {'count': 33}, '282': {'count': 23}, '215': {'count': 33}, '258': {'count': 60}, '240': {'count': 29}, '233': {'count': 14}, '93': {'count': 27}, '69': {'count': 23}, '266': {'count': 26}, '387': {'count': 55}, '141': {'count': 18}, '191': {'count': 26}, '183': {'count': 42}, '271': {'count': 22}, '120': {'count': 32}, '98': {'count': 53}, '29': {'count': 34}, '28': {'count': 21}, '144': {'count': 26}, '351': {'count': 50}, '368': {'count': 20}, '314': {'count': 27}, '45': {'count': 17}, '218': {'count': 50}, '348': {'count': 25}, '157': {'count': 35}, '117': {'count': 24}, '367': {'count': 24}, '13': {'count': 31}, '363': {'count': 22}, '79': {'count': 28}, '312': {'count': 27}, '372': {'count': 29}, '189': {'count': 21}, '50': {'count': 22}, '160': {'count': 35}, '16': {'count': 39}, '222': {'count': 21}, '58': {'count': 37}, '153': {'count': 64}, '62': {'count': 21}, '290': {'count': 25}, '292': {'count': 24}, '285': {'count': 25}, '343': {'count': 32}, '301': 
{'count': 19}, '190': {'count': 46}, '195': {'count': 24}, '135': {'count': 30}, '315': {'count': 25}, '203': {'count': 29}, '307': {'count': 18}, '142': {'count': 25}, '173': {'count': 28}, '236': {'count': 41}, '171': {'count': 23}, '371': {'count': 17}, '130': {'count': 15}, '277': {'count': 39}, '248': {'count': 22}, '181': {'count': 35}, '40': {'count': 20}, '322': {'count': 15}, '273': {'count': 23}, '148': {'count': 23}, '295': {'count': 25}, '32': {'count': 21}, '320': {'count': 25}, '137': {'count': 32}, '253': {'count': 36}, '31': {'count': 19}, '306': {'count': 27}, '51': {'count': 19}, '52': {'count': 29}, '176': {'count': 31}, '241': {'count': 23}, '265': {'count': 32}, '394': {'count': 26}, '158': {'count': 26}, '226': {'count': 28}, '288': {'count': 21}, '353': {'count': 19}, '291': {'count': 21}, '224': {'count': 26}, '36': {'count': 38}, '20': {'count': 22}, '252': {'count': 18}, '134': {'count': 24}, '143': {'count': 21}, '207': {'count': 28}, '89': {'count': 16}, '272': {'count': 23}, '379': {'count': 24}, '229': {'count': 20}, '42': {'count': 23}}}} | | [SUN397ZeroShot](https://ieeexplore.ieee.org/abstract/document/5539970) (Xiao et al., 2010) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | | [SadeemQuestionRetrieval](https://huggingface.co/datasets/sadeem-ai/sadeem-ar-eval-retrieval-questions) | ['ara'] | Retrieval | s2p | [Written, Written] | None | None | | [SanskritShlokasClassification](https://github.com/goru001/nlp-for-sanskrit) | ['san'] | Classification | s2s | [Religious, Written] | None | None | @@ -718,7 +718,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [StackExchangeClusteringP2P.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | p2p | [Web, Written] | None | None | | [StackOverflowDupQuestions](https://www.microsoft.com/en-us/research/uploads/prod/2019/03/nl4se18LinkSO.pdf) (Xueqing Liu, 2018) | ['eng'] | Reranking | s2s | [Blog, Programming, Written] | None | None | | [StackOverflowQA](https://arxiv.org/abs/2407.02883) (Xiangyang Li, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'test': 21925} | {'test': {'number_of_characters': 26584028, 'num_samples': 21925, 'num_queries': 1994, 'num_documents': 19931, 'min_document_length': 61, 'average_document_length': 130.32, 'max_document_length': 22234, 'unique_documents': 19931, 'min_query_length': 5, 'average_query_length': 12029.38, 'max_query_length': 46028, 'unique_queries': 1994, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1994}} | -| [StanfordCars](https://pure.mpg.de/rest/items/item_2029263/component/file_2029262/content) (Jonathan Krause, 2013) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | None | None | +| [StanfordCars](https://pure.mpg.de/rest/items/item_2029263/component/file_2029262/content) (Jonathan Krause, 2013) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 8041} | {'test': {'num_samples': 8041, 'unique_num_labels': 196, 'min_image_width': 78, 'average_image_width': 701.18, 'max_image_width': 7800, 'min_image_height': 41, 'average_image_height': 483.75, 'max_image_height': 5400, 'labels': {'180': {'count': 38}, '102': {'count': 39}, '144': {'count': 44}, '186': {'count': 43}, '184': {'count': 38}, '77': {'count': 37}, '117': {'count': 41}, '164': {'count': 44}, '31': {'count': 41}, '59': {'count': 36}, '48': {'count': 37}, '107': {'count': 44}, '115': {'count': 37}, '134': {'count': 42}, '82': {'count': 40}, '50': {'count': 43}, '153': {'count': 42}, '32': {'count': 42}, '21': 
{'count': 42}, '150': {'count': 43}, '3': {'count': 42}, '80': {'count': 45}, '106': {'count': 44}, '190': {'count': 46}, '169': {'count': 44}, '194': {'count': 43}, '90': {'count': 38}, '4': {'count': 40}, '163': {'count': 43}, '147': {'count': 45}, '187': {'count': 43}, '43': {'count': 44}, '6': {'count': 39}, '30': {'count': 44}, '73': {'count': 43}, '29': {'count': 41}, '165': {'count': 41}, '179': {'count': 42}, '105': {'count': 41}, '2': {'count': 43}, '64': {'count': 45}, '34': {'count': 41}, '74': {'count': 44}, '84': {'count': 43}, '24': {'count': 39}, '167': {'count': 42}, '136': {'count': 43}, '133': {'count': 33}, '155': {'count': 39}, '119': {'count': 42}, '129': {'count': 41}, '127': {'count': 39}, '35': {'count': 41}, '170': {'count': 46}, '36': {'count': 38}, '63': {'count': 29}, '182': {'count': 42}, '42': {'count': 46}, '17': {'count': 42}, '75': {'count': 43}, '0': {'count': 44}, '62': {'count': 44}, '173': {'count': 41}, '16': {'count': 40}, '104': {'count': 43}, '49': {'count': 42}, '122': {'count': 44}, '81': {'count': 45}, '191': {'count': 42}, '92': {'count': 39}, '145': {'count': 43}, '95': {'count': 41}, '54': {'count': 39}, '114': {'count': 45}, '112': {'count': 42}, '151': {'count': 35}, '91': {'count': 40}, '188': {'count': 40}, '20': {'count': 42}, '33': {'count': 44}, '86': {'count': 44}, '128': {'count': 38}, '142': {'count': 40}, '19': {'count': 46}, '177': {'count': 41}, '11': {'count': 36}, '45': {'count': 43}, '60': {'count': 43}, '8': {'count': 41}, '56': {'count': 37}, '28': {'count': 42}, '120': {'count': 44}, '5': {'count': 44}, '85': {'count': 42}, '68': {'count': 38}, '22': {'count': 39}, '108': {'count': 44}, '89': {'count': 41}, '132': {'count': 42}, '125': {'count': 42}, '137': {'count': 39}, '158': {'count': 36}, '58': {'count': 44}, '123': {'count': 39}, '52': {'count': 44}, '27': {'count': 41}, '13': {'count': 42}, '70': {'count': 35}, '25': {'count': 34}, '185': {'count': 38}, '171': {'count': 44}, '9': {'count': 
33}, '40': {'count': 35}, '178': {'count': 45}, '44': {'count': 32}, '97': {'count': 46}, '87': {'count': 39}, '159': {'count': 44}, '146': {'count': 44}, '51': {'count': 41}, '121': {'count': 40}, '1': {'count': 32}, '160': {'count': 48}, '78': {'count': 48}, '109': {'count': 43}, '103': {'count': 42}, '174': {'count': 30}, '181': {'count': 46}, '23': {'count': 45}, '111': {'count': 45}, '166': {'count': 47}, '172': {'count': 43}, '66': {'count': 38}, '192': {'count': 41}, '148': {'count': 42}, '72': {'count': 44}, '141': {'count': 32}, '71': {'count': 45}, '7': {'count': 45}, '152': {'count': 44}, '183': {'count': 40}, '98': {'count': 27}, '94': {'count': 45}, '126': {'count': 41}, '100': {'count': 42}, '131': {'count': 43}, '116': {'count': 42}, '39': {'count': 39}, '149': {'count': 36}, '101': {'count': 39}, '139': {'count': 42}, '69': {'count': 42}, '12': {'count': 41}, '14': {'count': 43}, '96': {'count': 42}, '41': {'count': 34}, '189': {'count': 43}, '10': {'count': 38}, '140': {'count': 34}, '26': {'count': 35}, '57': {'count': 44}, '88': {'count': 44}, '67': {'count': 40}, '93': {'count': 43}, '193': {'count': 45}, '161': {'count': 45}, '118': {'count': 68}, '110': {'count': 42}, '154': {'count': 42}, '138': {'count': 42}, '143': {'count': 46}, '61': {'count': 37}, '176': {'count': 44}, '113': {'count': 45}, '18': {'count': 40}, '53': {'count': 40}, '47': {'count': 42}, '157': {'count': 29}, '168': {'count': 38}, '124': {'count': 43}, '79': {'count': 43}, '130': {'count': 42}, '46': {'count': 35}, '55': {'count': 46}, '195': {'count': 40}, '38': {'count': 36}, '37': {'count': 40}, '99': {'count': 33}, '83': {'count': 42}, '162': {'count': 36}, '135': {'count': 24}, '175': {'count': 38}, '156': {'count': 36}, '15': {'count': 43}, '65': {'count': 41}, '76': {'count': 40}}}} | | [StanfordCarsI2IRetrieval](https://pure.mpg.de/rest/items/item_2029263/component/file_2029262/content) (Jonathan Krause, 2013) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | 
None | None | | [StanfordCarsZeroShot](https://pure.mpg.de/rest/items/item_2029263/component/file_2029262/content) (Jonathan Krause, 2013) | ['eng'] | ZeroShotClassification | i2t | [Scene] | None | None | | [StatcanDialogueDatasetRetrieval](https://mcgill-nlp.github.io/statcan-dialogue-dataset/) | ['eng', 'fra'] | Retrieval | s2p | [Government, Web, Written] | None | None | @@ -815,7 +815,7 @@ The following tables give you an overview of the tasks in MTEB. | [TwitterSemEval2015](https://alt.qcri.org/semeval2015/task1/) | ['eng'] | PairClassification | s2s | [Social, Written] | None | None | | [TwitterURLCorpus](https://languagenet.github.io/) | ['eng'] | PairClassification | s2s | [Social, Written] | {'test': 51534} | {'test': {'num_samples': 51534, 'number_of_characters': 8659940, 'min_sentence1_length': 24, 'avg_sentence1_length': 79.49, 'max_sentence1_length': 126, 'unique_sentence1': 4329, 'min_sentence2_length': 6, 'avg_sentence2_length': 88.55, 'max_sentence2_length': 608, 'unique_sentence2': 41304, 'unique_labels': 2, 'labels': {'0': {'count': 38546}, '1': {'count': 12988}}}} | | [UCCVCommonLawLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [UCF101](https://huggingface.co/datasets/flwrlabs/ucf101) (Khurram Soomro, 2012) | ['eng'] | ImageClassification | i2i | [Scene] | None | None | +| [UCF101](https://huggingface.co/datasets/flwrlabs/ucf101) (Khurram Soomro, 2012) | ['eng'] | ImageClassification | i2i | [Scene] | {'test': 697222} | {'test': {'num_samples': 697222, 'unique_num_labels': 101, 'min_image_width': 320, 'average_image_width': 320.12, 'max_image_width': 400, 'min_image_height': 226, 'average_image_height': 239.98, 'max_image_height': 240, 'labels': {'0': {'count': 7475}, '1': {'count': 6341}, '2': {'count': 6181}, '3': {'count': 6320}, '4': {'count': 3708}, '5': {'count': 7296}, '6': {'count': 4004}, '7': {'count': 3923}, '8': 
{'count': 2267}, '9': {'count': 5587}, '10': {'count': 8946}, '11': {'count': 12714}, '12': {'count': 6053}, '13': {'count': 3191}, '14': {'count': 3696}, '15': {'count': 5468}, '16': {'count': 10032}, '17': {'count': 8346}, '18': {'count': 5098}, '19': {'count': 10811}, '20': {'count': 6378}, '21': {'count': 3385}, '22': {'count': 3974}, '23': {'count': 4781}, '24': {'count': 5867}, '25': {'count': 7904}, '26': {'count': 12181}, '27': {'count': 4511}, '28': {'count': 4402}, '29': {'count': 5513}, '30': {'count': 3236}, '31': {'count': 7160}, '32': {'count': 6455}, '33': {'count': 3766}, '34': {'count': 8362}, '35': {'count': 3521}, '36': {'count': 3263}, '37': {'count': 5112}, '38': {'count': 9685}, '39': {'count': 4598}, '40': {'count': 6682}, '41': {'count': 8690}, '42': {'count': 3591}, '43': {'count': 11432}, '44': {'count': 3458}, '45': {'count': 10080}, '46': {'count': 16507}, '47': {'count': 3001}, '48': {'count': 6524}, '49': {'count': 7786}, '50': {'count': 4657}, '51': {'count': 8795}, '52': {'count': 3992}, '53': {'count': 5668}, '54': {'count': 6575}, '55': {'count': 8662}, '56': {'count': 5253}, '57': {'count': 3761}, '58': {'count': 8679}, '59': {'count': 11986}, '60': {'count': 15720}, '61': {'count': 12080}, '62': {'count': 10634}, '63': {'count': 6161}, '64': {'count': 13934}, '65': {'count': 8393}, '66': {'count': 5452}, '67': {'count': 7905}, '68': {'count': 12354}, '69': {'count': 4060}, '70': {'count': 9075}, '71': {'count': 2689}, '72': {'count': 5435}, '73': {'count': 17655}, '74': {'count': 5693}, '75': {'count': 12572}, '76': {'count': 9543}, '77': {'count': 10793}, '78': {'count': 4134}, '79': {'count': 4832}, '80': {'count': 8977}, '81': {'count': 7381}, '82': {'count': 4927}, '83': {'count': 12469}, '84': {'count': 3843}, '85': {'count': 4945}, '86': {'count': 6724}, '87': {'count': 6582}, '88': {'count': 7046}, '89': {'count': 5874}, '90': {'count': 4878}, '91': {'count': 6417}, '92': {'count': 3762}, '93': {'count': 7349}, '94': 
{'count': 8149}, '95': {'count': 3925}, '96': {'count': 3378}, '97': {'count': 7721}, '98': {'count': 3671}, '99': {'count': 6292}, '100': {'count': 6508}}}} | | [UCF101ZeroShot](https://huggingface.co/datasets/flwrlabs/ucf101) (Khurram Soomro, 2012) | ['eng'] | ZeroShotClassification | i2t | [Scene] | None | None | | [UkrFormalityClassification](https://huggingface.co/datasets/ukr-detect/ukr-formality-dataset-translated-gyafc) | ['ukr'] | Classification | s2s | [News, Written] | None | None | | [UnfairTOSLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | From efaa990b6c7a4916c3349bda4204b2b323927966 Mon Sep 17 00:00:00 2001 From: Sam <40773225+sam-hey@users.noreply.github.com> Date: Mon, 17 Feb 2025 08:19:55 +0100 Subject: [PATCH 026/233] ci: Rerun tests that fail due to networking issues. (#2029) * fix: rerun tests that fail - Networking * update tests to use tmp_path * set versions for dev dependencies * add pytest options to pyproject.toml * add rerun json.decoder.JSONDecodeError * remove JSONDecodeError from pyproject.toml * add huggingface_hub.errors.HfHubHTTPError * add huggingface_hub.errors.LocalEntryNotFoundError https://github.com/embeddings-benchmark/mteb/actions/runs/13298535701/job/37139767443?pr=2044 * FileNotFoundError https://github.com/embeddings-benchmark/mteb/actions/runs/13302915091/job/37147507251?pr=2029 * add doc to pytest rerun --------- Co-authored-by: sam021313 <40773225+sam021313@users.noreply.github.com> --- Makefile | 4 +-- pyproject.toml | 21 ++++++++++++- tests/test_benchmark/test_benchmark.py | 42 +++++++++++++++----------- tests/test_tasks/test_all_abstasks.py | 6 ++++ 4 files changed, 52 insertions(+), 21 deletions(-) diff --git a/Makefile b/Makefile index 9729d080ff..41888ca185 100644 --- a/Makefile +++ b/Makefile @@ -20,11 +20,11 @@ lint-check: test: @echo "--- 🧪 Running tests ---" - pytest -n auto --durations=5 + 
pytest -n auto test-with-coverage: @echo "--- 🧪 Running tests with coverage ---" - pytest -n auto --durations=5 --cov-report=term-missing --cov-config=pyproject.toml --cov=mteb + pytest -n auto --cov-report=term-missing --cov-config=pyproject.toml --cov=mteb pr: @echo "--- 🚀 Running requirements for a PR ---" diff --git a/pyproject.toml b/pyproject.toml index 277667e73a..d30874d3d3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,7 +54,7 @@ mteb = "mteb.cli:main" [project.optional-dependencies] dev = ["ruff==0.6.4", # locked so we don't get PRs which fail only due to a lint update -"pytest", "pytest-xdist", "pytest-coverage"] +"pytest>=8.3.4", "pytest-xdist>=3.6.1", "pytest-coverage>=0.0", "pytest-rerunfailures>=15.0"] codecarbon = ["codecarbon"] speedtask = ["GPUtil>=1.4.0", "psutil>=5.9.8"] peft = ["peft>=0.11.0"] @@ -155,3 +155,22 @@ tag_format = "{version}" major_types = ["breaking"] minor_types = ["feat"] patch_types = ["fix", "perf"] + + +[tool.pytest.ini_options] +addopts = """ + --reruns 3 + --only-rerun requests.exceptions.ReadTimeout + --only-rerun huggingface_hub.errors.HfHubHTTPError + --only-rerun huggingface_hub.errors.LocalEntryNotFoundError + --only-rerun FileNotFoundError + --durations=5 + --reruns-delay 10 + """ +# --reruns 3 -> # Retry failed tests 3 times +# requests.exceptions.ReadTimeout -> # HF Read timed out -> https://github.com/embeddings-benchmark/mteb/actions/runs/13275350693/job/37093688544 +# huggingface_hub.errors.HfHubHTTPError -> # HF is unavailable, e.g. seen here: https://github.com/embeddings-benchmark/mteb/actions/runs/13275350693/job/37093688544 +# huggingface_hub.errors.LocalEntryNotFoundError -> # Gateway Time-out from HF, e.g. 
seen here: https://github.com/embeddings-benchmark/mteb/actions/runs/13275350693/job/37093688544 +# FileNotFoundError -> HF Cache is broken: https://github.com/embeddings-benchmark/mteb/actions/runs/13302915091/job/37147507251?pr=2029 +# --durations=5 -> Show the 5 slowest tests +# --reruns-delay 10 -> Delay between reruns in seconds to avoid running into the same issue again \ No newline at end of file diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index 37b632f59a..d114900b1a 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -41,7 +41,7 @@ def test_mulitple_mteb_tasks( ): """Test that multiple tasks can be run""" eval = mteb.MTEB(tasks=tasks) - eval.run(model, output_folder=str(tmp_path), overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) # ensure that we can generate a readme from the output folder generate_readme(tmp_path) @@ -56,7 +56,9 @@ def test_mulitple_mteb_tasks( MockTorchbf16Encoder(), ], ) -def test_benchmark_encoders_on_task(task: str | mteb.AbsTask, model: mteb.Encoder): +def test_benchmark_encoders_on_task( + task: str | mteb.AbsTask, model: mteb.Encoder, tmp_path: Path +): """Test that a task can be fetched and run using a variety of encoders""" if isinstance(task, str): tasks = mteb.get_tasks(tasks=[task]) @@ -64,7 +66,7 @@ def test_benchmark_encoders_on_task(task: str | mteb.AbsTask, model: mteb.Encode tasks = [task] eval = mteb.MTEB(tasks=tasks) - eval.run(model, output_folder="tests/results", overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) @pytest.mark.parametrize("task", MOCK_TASK_TEST_GRID[:1]) @@ -90,7 +92,9 @@ def test_reload_results(task: str | mteb.AbsTask, model: mteb.Encoder, tmp_path: @pytest.mark.parametrize("task_name", MOCK_TASK_TEST_GRID) -def test_prompt_name_passed_to_all_encodes(task_name: str | mteb.AbsTask): +def 
test_prompt_name_passed_to_all_encodes( + task_name: str | mteb.AbsTask, tmp_path: Path +): """Test that all tasks correctly pass down the prompt_name to the encoder which supports it, and that the encoder which does not support it does not receive it. """ @@ -123,17 +127,19 @@ def encode(self, sentences, **kwargs): eval.run( model, - output_folder="tests/results", + output_folder=tmp_path.as_posix(), overwrite_results=True, ) # Test that the task_name is not passed down to the encoder model = EncoderWithoutInstructions("average_word_embeddings_levy_dependency") assert model.prompts == {}, "The encoder should not have any prompts" - eval.run(model, output_folder="tests/results", overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) @pytest.mark.parametrize("task_name", MOCK_TASK_TEST_GRID) -def test_encode_kwargs_passed_to_all_encodes(task_name: str | mteb.AbsTask): +def test_encode_kwargs_passed_to_all_encodes( + task_name: str | mteb.AbsTask, tmp_path: Path +): """Test that all tasks correctly pass down the encode_kwargs to the encoder.""" my_encode_kwargs = {"no_one_uses_this_args": "but_its_here"} @@ -157,14 +163,14 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): model = MockEncoderWithKwargs() eval.run( model, - output_folder="tests/results", + output_folder=tmp_path.as_posix(), overwrite_results=True, encode_kwargs=my_encode_kwargs, ) @pytest.mark.parametrize("model", [MockNumpyEncoder()]) -def test_run_using_benchmark(model: mteb.Encoder): +def test_run_using_benchmark(model: mteb.Encoder, tmp_path: Path): """Test that a benchmark object can be run using the MTEB class.""" bench = mteb.Benchmark( name="test_bench", tasks=mteb.get_tasks(tasks=["STS12", "SummEval"]) @@ -172,12 +178,12 @@ def test_run_using_benchmark(model: mteb.Encoder): eval = mteb.MTEB(tasks=bench) eval.run( - model, output_folder="tests/results", overwrite_results=True + model, output_folder=tmp_path.as_posix(), 
overwrite_results=True ) # we just want to test that it runs @pytest.mark.parametrize("model", [MockNumpyEncoder()]) -def test_run_using_list_of_benchmark(model: mteb.Encoder): +def test_run_using_list_of_benchmark(model: mteb.Encoder, tmp_path: Path): """Test that a list of benchmark objects can be run using the MTEB class.""" bench = [ mteb.Benchmark( @@ -187,7 +193,7 @@ def test_run_using_list_of_benchmark(model: mteb.Encoder): eval = mteb.MTEB(tasks=bench) eval.run( - model, output_folder="tests/results", overwrite_results=True + model, output_folder=tmp_path.as_posix(), overwrite_results=True ) # we just want to test that it runs @@ -213,7 +219,7 @@ def test_get_benchmark(name): @pytest.mark.parametrize("task", MOCK_TASK_TEST_GRID) @pytest.mark.parametrize("is_task_name", [True, False]) def test_prompt_name_passed_to_all_encodes_with_prompts( - task: mteb.AbsTask | str, is_task_name: bool + task: mteb.AbsTask | str, is_task_name: bool, tmp_path: Path ): """Test that all tasks and task_types correctly pass down the prompt_name to the encoder with prompts.""" _task_name = task.metadata.name if isinstance(task, mteb.AbsTask) else task @@ -242,7 +248,7 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): ) eval.run( model, - output_folder="tests/results", + output_folder=tmp_path.as_posix(), overwrite_results=True, ) @@ -259,7 +265,7 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): model = MockSentenceTransformerWrapper(MockEncoderWithExistingPrompts()) eval.run( model, - output_folder="tests/results", + output_folder=tmp_path.as_posix(), overwrite_results=True, ) @@ -277,7 +283,7 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): ) @pytest.mark.parametrize("is_task_name", [True, False]) def test_model_query_passage_prompts_task_type( - task: mteb.AbsTask | str, is_task_name: bool + task: mteb.AbsTask | str, is_task_name: bool, tmp_path: Path ): """Test that the model with prompts is correctly 
called.""" tasks = [task] @@ -317,7 +323,7 @@ def encode(self, sentences, prompt_name: str | None = None, *args, **kwargs): eval.run( model, model_prompts=prompt_list, - output_folder="tests/results", + output_folder=tmp_path.as_posix(), overwrite_results=True, ) model = MockSentenceTransformerWrapper( @@ -327,6 +333,6 @@ def encode(self, sentences, prompt_name: str | None = None, *args, **kwargs): eval.run( model, model_prompts=prompt_list, - output_folder="tests/results", + output_folder=tmp_path.as_posix(), overwrite_results=True, ) diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py index 7a87914f0a..ce1f8ab87e 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ -91,6 +91,12 @@ async def check_datasets_are_available_on_hf(tasks): assert False, f"Datasets not available on Hugging Face:\n{pretty_print}" +@pytest.mark.flaky( + reruns=3, + reruns_delay=5, + only_rerun=["AssertionError"], + reason="May fail due to network issues", +) def test_dataset_availability(): """Checks if the datasets are available on Hugging Face using both their name and revision.""" tasks = MTEB().tasks_cls From 26360a0b856d97cf8b589218365288bdf55ae791 Mon Sep 17 00:00:00 2001 From: Sam <40773225+sam-hey@users.noreply.github.com> Date: Mon, 17 Feb 2025 08:39:51 +0100 Subject: [PATCH 027/233] fix: generate metadata (#2063) * fix: generate metadata * use logging not print for script * lint * add iso639 to dev pyproject * fix import * add memory_usage_mb * set version for iso639 Co-authored-by: Kenneth Enevoldsen --------- Co-authored-by: sam021313 <40773225+sam021313@users.noreply.github.com> Co-authored-by: Kenneth Enevoldsen Co-authored-by: Roman Solomatin --- pyproject.toml | 10 ++- scripts/generate_metadata.py | 79 +++++++++++++++-------- tests/scripts/test_generate_model_meta.py | 48 ++++++++++++++ 3 files changed, 109 insertions(+), 28 deletions(-) create mode 100644 
tests/scripts/test_generate_model_meta.py diff --git a/pyproject.toml b/pyproject.toml index d30874d3d3..580ac8caac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,8 +53,14 @@ homepage = "https://github.com/embeddings-benchmark/mteb" mteb = "mteb.cli:main" [project.optional-dependencies] -dev = ["ruff==0.6.4", # locked so we don't get PRs which fail only due to a lint update -"pytest>=8.3.4", "pytest-xdist>=3.6.1", "pytest-coverage>=0.0", "pytest-rerunfailures>=15.0"] +dev = [ +"ruff==0.6.4", # locked so we don't get PRs which fail only due to a lint update +"pytest>=8.3.4", +"pytest-xdist>=3.6.1", +"pytest-coverage>=0.0", +"pytest-rerunfailures>=15.0", +"iso639>=0.1.4" # used for tests/scripts/test_generate_model_meta.py +] codecarbon = ["codecarbon"] speedtask = ["GPUtil>=1.4.0", "psutil>=5.9.8"] peft = ["peft>=0.11.0"] diff --git a/scripts/generate_metadata.py b/scripts/generate_metadata.py index 4ae87fdbca..5c181275e7 100644 --- a/scripts/generate_metadata.py +++ b/scripts/generate_metadata.py @@ -1,7 +1,7 @@ from __future__ import annotations import json -import warnings +import logging from pathlib import Path import iso639 @@ -137,7 +137,7 @@ def convert_code(code: str) -> str | None: script = lang_to_script[lang_code] return f"{lang_code}_{script}" except Exception as e: - print(f"Couldn't convert {code}, reason: {e}") + logging.warning(f"Couldn't convert {code}, reason: {e}") return None @@ -153,7 +153,7 @@ def get_embedding_dimensions(model_name: str) -> int | None: pooling_config = json.loads(in_file.read()) return pooling_config.get("word_embedding_dimension", None) except Exception as e: - print(f"Couldn't get embedding size for {model_name}, reason: {e}") + logging.warning(f"Couldn't get embedding size for {model_name}, reason: {e}") return None @@ -164,45 +164,66 @@ def get_max_token(model_name: str) -> int | None: config = json.loads(in_file.read()) return config.get("max_position_embeddings", None) except Exception as e: - print(f"Couldn't 
get embedding size for {model_name}, reason: {e}") + logging.warning(f"Couldn't get embedding size for {model_name}, reason: {e}") return None +BASE_MODEL_ERRORS = ["tmp/"] + + def get_base_model(model_name: str) -> str | None: try: file_path = hf_hub_download(repo_id=model_name, filename="config.json") with open(file_path) as in_file: config = json.loads(in_file.read()) base_model = config.get("_name_or_path", None) + if base_model in BASE_MODEL_ERRORS: + logging.warning( + f"Base model error for {model_name} with base model {base_model}" + ) + return None if base_model != model_name: return base_model else: return None except Exception as e: - print(f"Couldn't get base model for {model_name}, reason: {e}") + logging.warning(f"Couldn't get base model for {model_name}, reason: {e}") return None -def model_meta_from_hf_hub(model_name: str) -> ModelMeta: +def load_model_card(model_name: str) -> dict: + card = ModelCard.load(model_name) + return card.data.to_dict() + + +def get_language_from_card(card_data: dict) -> str | None: + languages = card_data.get("language", None) + if isinstance(languages, str): + languages = [languages] + if languages is not None: + languages = [convert_code(l) for l in languages] + languages = [l for l in languages if l is not None] + return languages + + +def model_meta_from_hf_hub_cross_encoder(model_name: str) -> ModelMeta: + pass + + +def model_meta_from_hf_hub_embedding(model_name: str) -> ModelMeta: try: - card = ModelCard.load(model_name) - card_data = card.data.to_dict() + card_data = load_model_card(model_name) frameworks = ["PyTorch"] if card_data.get("library_name", None) == "sentence-transformers": frameworks.append("Sentence Transformers") - languages = card_data.get("language", None) - if isinstance(languages, str): - languages = [languages] - if languages is not None: - languages = [convert_code(l) for l in languages] - languages = [l for l in languages if l is not None] + languages = get_language_from_card(card_data) 
repo_info = api.repo_info(model_name) revision = repo_info.sha release_date = repo_info.created_at.strftime("%Y-%m-%d") try: n_parameters = repo_info.safetensors.total except Exception as e: - print(f"Couldn't get model size for {model_name}, reason: {e}") + logging.warning(f"Couldn't get model size for {model_name}, reason: {e}") n_parameters = None n_dimensions = get_embedding_dimensions(model_name) datasets = card_data.get("datasets", None) @@ -223,14 +244,17 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: adapted_from=get_base_model(model_name), training_datasets=training_datasets, open_weights=True, - superseded_by=None, max_tokens=get_max_token(model_name), embed_dim=n_dimensions, similarity_fn_name="cosine", reference=f"https://huggingface.co/{model_name}", + public_training_code=None, + public_training_data=None, + use_instructions=None, + memory_usage_mb=None, ) except Exception as e: - warnings.warn(f"Failed to extract metadata from model: {e}.") + logging.error(f"Failed to extract metadata from model: {e}.") return ModelMeta( name=model_name, revision=None, @@ -241,12 +265,13 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: embed_dim=None, license=None, open_weights=True, - public_training_code=None, - public_training_data=None, similarity_fn_name=None, - use_instructions=None, training_datasets=None, - frameworks=[], + framework=[], + use_instructions=None, + public_training_data=None, + public_training_code=None, + memory_usage_mb=None, ) @@ -256,14 +281,16 @@ def code_from_meta(meta: ModelMeta) -> str: return template.format(variable_name=variable_name, meta=meta.__repr__()) -def main(): - out_path = Path("mteb/models/misc_models.py") +def main(out_path: Path, model_names: list[str] = to_keep): with open(out_path, "w") as out_file: out_file.write("from mteb.model_meta import ModelMeta\n\n") - for model in tqdm(to_keep, desc="Generating metadata for all models."): - meta = model_meta_from_hf_hub(model) + for model_name in 
tqdm(model_names, desc="Generating metadata for all models."): + meta = model_meta_from_hf_hub_embedding(model_name) + out_file.write(code_from_meta(meta)) if __name__ == "__main__": - main() + out_path = Path("mteb/models/new_tmp.py") + model_names = ["jinaai/jina-reranker-v2-base-multilingual"] + main(out_path, model_names) diff --git a/tests/scripts/test_generate_model_meta.py b/tests/scripts/test_generate_model_meta.py new file mode 100644 index 0000000000..c930eff497 --- /dev/null +++ b/tests/scripts/test_generate_model_meta.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +import importlib.util +from pathlib import Path + +import pytest + +from scripts.generate_metadata import get_base_model +from scripts.generate_metadata import main as generate_metadata_main + + +def test_create_model_meta_embedding_models_from_hf(tmp_path: Path): + models = ["intfloat/multilingual-e5-large", "intfloat/multilingual-e5-small"] + tmp_path = tmp_path / "new_models.py" + generate_metadata_main(tmp_path, models) + + assert tmp_path.exists() + assert tmp_path.read_text().startswith("from mteb.model_meta import ModelMeta") + + spec = importlib.util.spec_from_file_location("new_models", tmp_path) + new_models = importlib.util.module_from_spec(spec) + spec.loader.exec_module(new_models) + + assert hasattr(new_models, "intfloat__multilingual_e5_large") + assert hasattr(new_models, "intfloat__multilingual_e5_small") + + assert ( + new_models.intfloat__multilingual_e5_large.name + == "intfloat/multilingual-e5-large" + ) + assert ( + new_models.intfloat__multilingual_e5_small.name + == "intfloat/multilingual-e5-small" + ) + + +def test_get_base_model_name_is_the_same(): + model_name = "jinaai/jina-embeddings-v3" + model = get_base_model(model_name) + assert model is None + + +@pytest.mark.skip(reason="No support for cross-encoder models") +def test_create_model_meta_cross_encoder_models_from_hf(tmp_path: Path): + models = ["intfloat/multilingual-e5-cross-encoder"] + tmp_path = 
tmp_path / "new_models.py" + generate_metadata_main(tmp_path, models) + assert True From 8d4adbf4bcbb4fc8dfd52862f67e3dc3d4837b50 Mon Sep 17 00:00:00 2001 From: github-actions Date: Mon, 17 Feb 2025 07:45:04 +0000 Subject: [PATCH 028/233] 1.34.15 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 580ac8caac..31893bbde8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.34.14" +version = "1.34.15" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From efe2578c06265419d6ea613108d156bb4f124f8f Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Mon, 17 Feb 2025 12:52:48 +0300 Subject: [PATCH 029/233] fix: add missing `e5` training datasets (#2065) add missing training datasets --- mteb/models/e5_models.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py index 1814eacc89..a2d68ce266 100644 --- a/mteb/models/e5_models.py +++ b/mteb/models/e5_models.py @@ -137,6 +137,10 @@ "HotpotQAHardNegatives": ["train"], "HotpotQA-PL": ["train"], # translation not trained on "HotpotQA-NL": ["train"], # translation not trained on + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + "MrTidyRetrieval": ["train"], } e5_mult_small = ModelMeta( From 8ef26d019a29553df28b24a18fd5544c04a3179f Mon Sep 17 00:00:00 2001 From: github-actions Date: Mon, 17 Feb 2025 10:16:36 +0000 Subject: [PATCH 030/233] 1.34.16 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 31893bbde8..4f6f2f1bea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version 
= "1.34.15" +version = "1.34.16" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From b14963fe6ace93ca9c1e7e066577f7e44250f823 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Mon, 17 Feb 2025 11:51:50 +0100 Subject: [PATCH 031/233] fix: Ensure voyage model uses different naming scheme (#2083) * fix: Added make command for running leaderboard locally * fix: Ensure voyage models doesn't re-use the name --- Makefile | 6 +++++- mteb/models/voyage_models.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 41888ca185..e9efdf8307 100644 --- a/Makefile +++ b/Makefile @@ -42,4 +42,8 @@ model-load-test: @echo "--- 🚀 Running model load test ---" pip install ".[dev, speedtask, pylate,gritlm,xformers,model2vec]" python scripts/extract_model_names.py $(BASE_BRANCH) --return_one_model_name_per_file - python tests/test_models/model_loading.py --model_name_file scripts/model_names.txt \ No newline at end of file + python tests/test_models/model_loading.py --model_name_file scripts/model_names.txt + +run-leaderboard: + @echo "--- 🚀 Running leaderboard locally ---" + python -m mteb.leaderboard.app \ No newline at end of file diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index ebcd815f29..9c5860ba66 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -241,7 +241,7 @@ def _batched_encode( public_training_data=None, ) -voyage_code_2 = ModelMeta( +voyage_code_3 = ModelMeta( name="voyageai/voyage-code-3", revision="1", release_date="2024-12-04", From 2d1f10d102180d303b55cbc20f43881ffdee3be5 Mon Sep 17 00:00:00 2001 From: github-actions Date: Mon, 17 Feb 2025 11:01:04 +0000 Subject: [PATCH 032/233] 1.34.17 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4f6f2f1bea..c9999fda59 100644 --- a/pyproject.toml +++ b/pyproject.toml 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.34.16" +version = "1.34.17" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 07562f4d27c8319249c2a28ff9903ad5bf3b4173 Mon Sep 17 00:00:00 2001 From: Shikhar Shiromani Date: Mon, 17 Feb 2025 03:14:52 -0800 Subject: [PATCH 033/233] fix: Freeze model/rank columns in leaderboard (#2044) * fix: freeze model/rank columns in leaderboard * freezing zero-shot column * update min gradio version to 5.16.0 in pyproject.toml --------- Co-authored-by: Shikhar Shiromani --- mteb/leaderboard/table.py | 28 ++-------------------------- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 27 deletions(-) diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index ef28392cf7..237b7627c1 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -49,23 +49,6 @@ def split_on_capital(s: str) -> str: return " ".join(re.findall(r"[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)", s)) -def get_column_widths(df: pd.DataFrame) -> list[str]: - widths = [] - for column_name in df.columns: - column_word_lengths = [len(word) for word in column_name.split()] - if is_numeric_dtype(df[column_name]): - value_lengths = [len(f"{value:.2f}") for value in df[column_name]] - else: - value_lengths = [len(str(value)) for value in df[column_name]] - try: - max_length = max(max(column_word_lengths), max(value_lengths)) - n_pixels = 35 + (max_length * 12.5) - widths.append(f"{n_pixels}px") - except Exception: - widths.append("50px") - return widths - - def get_column_types(df: pd.DataFrame) -> list[str]: types = [] for column_name in df.columns: @@ -212,10 +195,6 @@ def scores_to_tables( } ) joint_table.insert(0, "Rank (Borda)", joint_table.pop("borda_rank")) - column_widths = get_column_widths(joint_table) - task_column_widths = get_column_widths(per_task) - # overriding for model name - column_widths[1] = "250px" column_types = get_column_types(joint_table) # 
setting model name column to markdown column_types[1] = "markdown" @@ -240,12 +219,9 @@ def scores_to_tables( return ( gr.DataFrame( joint_table_style, - column_widths=column_widths, datatype=column_types, interactive=False, - wrap=True, - ), - gr.DataFrame( - per_task_style, column_widths=task_column_widths, interactive=False + pinned_columns=3, ), + gr.DataFrame(per_task_style, interactive=False, pinned_columns=1), ) diff --git a/pyproject.toml b/pyproject.toml index c9999fda59..0872ea3d40 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,7 +64,7 @@ dev = [ codecarbon = ["codecarbon"] speedtask = ["GPUtil>=1.4.0", "psutil>=5.9.8"] peft = ["peft>=0.11.0"] -leaderboard = ["gradio>=5.7.1", "gradio_rangeslider>=0.0.8", "plotly>=5.24.0,<6.0.0"] +leaderboard = ["gradio>=5.16.0", "gradio_rangeslider>=0.0.8", "plotly>=5.24.0,<6.0.0"] flagembedding = ["FlagEmbedding"] jina = ["einops>=0.8.0"] flash_attention = ["flash-attn>=2.6.3"] From 879b243ba230cd7ea6a51a2459b5c5b70422323a Mon Sep 17 00:00:00 2001 From: github-actions Date: Mon, 17 Feb 2025 11:27:42 +0000 Subject: [PATCH 034/233] 1.34.18 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0872ea3d40..bbfea6e3e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.34.17" +version = "1.34.18" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 12d9b96842d64a159dba39013b2a121e7b436f9b Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Mon, 17 Feb 2025 13:03:21 +0100 Subject: [PATCH 035/233] fix: Fixed previous incorrect specification of splits for CMTEB ( MTEB(cmn, v1) ) (#2086) Fixes #2064 --- mteb/benchmarks/benchmarks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 
6c9dbaafcb..b491ed5651 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -1215,14 +1215,14 @@ "JDReview", ], ) - + get_tasks(tasks=["MultilingualSentiment"], eval_splits=["test"]) + get_tasks( tasks=[ + "MultilingualSentiment", "ATEC", "BQ", "STSB", ], - eval_splits=["validation"], + eval_splits=["test"], ) ), description="The Chinese Massive Text Embedding Benchmark (C-MTEB) is a comprehensive benchmark for Chinese text embeddings covering 6 tasks and 35 datasets.", From 72d454f9d34b652c03e8409c8dfafe3ecb6ecb09 Mon Sep 17 00:00:00 2001 From: github-actions Date: Mon, 17 Feb 2025 12:27:41 +0000 Subject: [PATCH 036/233] 1.34.19 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index bbfea6e3e7..b194ad9848 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.34.18" +version = "1.34.19" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From c6e51230a9a65b9e5c25bae54e8d4b2c8095c7f8 Mon Sep 17 00:00:00 2001 From: Ruslan Bel'kov Date: Mon, 17 Feb 2025 15:47:59 +0300 Subject: [PATCH 037/233] Remove duplicated string in docstring of TaskMetadata class (#2087) * Remove duplicated string in docstring of TaskMetadata class * Remove duplicated dataset field --- mteb/abstasks/TaskMetadata.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index 1f6971d0e5..2fed5c32f5 100644 --- a/mteb/abstasks/TaskMetadata.py +++ b/mteb/abstasks/TaskMetadata.py @@ -267,18 +267,15 @@ class TaskMetadata(BaseModel): "Government", "Legal", "Medical", "Poetry", "Religious", "Reviews", "Web", "Spoken", "Written". A dataset can belong to multiple domains. task_subtypes: The subtypes of the task. E.g. includes "Sentiment/Hate speech", "Thematic Clustering". 
Feel free to update the list as needed. license: The license of the data specified as lowercase, e.g. "cc-by-nc-4.0". If the license is not specified, use "not specified". For custom licenses a URL is used. - license: The license of the data specified as lowercase, e.g. "cc-by-nc-4.0". If the license is not specified, use "not specified". For custom licenses a URL is used. annotations_creators: The type of the annotators. Includes "expert-annotated" (annotated by experts), "human-annotated" (annotated e.g. by mturkers), "derived" (derived from structure in the data). dialect: The dialect of the data, if applicable. Ideally specified as a BCP-47 language tag. Empty list if no dialects are present. sample_creation: The method of text creation. Includes "found", "created", "machine-translated", "machine-translated and verified", and "machine-translated and localized". prompt: The prompt used for the task. Can be a string or a dictionary containing the query and passage prompts. - prompt: The prompt used for the task. Can be a string or a dictionary containing the query and passage prompts. bibtex_citation: The BibTeX citation for the dataset. Should be an empty string if no citation is available. 
""" - dataset: dict[str, Any] dataset: dict[str, Any] name: str From 1006770c098869d5c9db2e1b3b13c2c190c34a26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Mon, 17 Feb 2025 15:19:33 +0100 Subject: [PATCH 038/233] fix: Smarter leaderboard caching with cachetools (#2085) * Added smarter caching to callbacks * Added cachetools as a dependency * Ran linting * Removed debugging print statement * Bumped Gradio version * Dependency fixes * Dependency fixes --------- Co-authored-by: Kenneth Enevoldsen --- mteb/leaderboard/app.py | 121 ++++++++++++++++++++++++++++++++++------ pyproject.toml | 2 +- 2 files changed, 104 insertions(+), 19 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 8e8b40edfb..9a707160c4 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -9,13 +9,13 @@ from typing import Literal from urllib.parse import urlencode +import cachetools import gradio as gr import pandas as pd from gradio_rangeslider import RangeSlider import mteb from mteb.benchmarks.benchmarks import MTEB_multilingual -from mteb.caching import json_cache from mteb.leaderboard.figures import performance_size_plot, radar_chart from mteb.leaderboard.table import scores_to_tables @@ -470,7 +470,10 @@ def filter_models( # This sets the benchmark from the URL query parameters demo.load(set_benchmark_on_load, inputs=[], outputs=[benchmark_select]) - @json_cache + @cachetools.cached( + cache={}, + key=lambda benchmark_name: hash(benchmark_name), + ) def on_benchmark_select(benchmark_name): start_time = time.time() benchmark = mteb.get_benchmark(benchmark_name) @@ -495,7 +498,7 @@ def on_benchmark_select(benchmark_name): languages, domains, types, - [task.metadata.name for task in benchmark.tasks], + sorted([task.metadata.name for task in benchmark.tasks]), scores, ) @@ -505,7 +508,12 @@ def on_benchmark_select(benchmark_name): outputs=[lang_select, domain_select, type_select, task_select, scores], ) - @json_cache + 
@cachetools.cached( + cache={}, + key=lambda benchmark_name, languages: hash( + (hash(benchmark_name), hash(tuple(languages))) + ), + ) def update_scores_on_lang_change(benchmark_name, languages): start_time = time.time() benchmark_results = all_benchmark_results[benchmark_name] @@ -520,6 +528,17 @@ def update_scores_on_lang_change(benchmark_name, languages): outputs=[scores], ) + @cachetools.cached( + cache={}, + key=lambda benchmark_name, type_select, domain_select, lang_select: hash( + ( + hash(benchmark_name), + hash(tuple(type_select)), + hash(tuple(domain_select)), + hash(tuple(lang_select)), + ) + ), + ) def update_task_list(benchmark_name, type_select, domain_select, lang_select): start_time = time.time() tasks_to_keep = [] @@ -533,7 +552,7 @@ def update_task_list(benchmark_name, type_select, domain_select, lang_select): tasks_to_keep.append(task.metadata.name) elapsed = time.time() - start_time logger.info(f"update_task_list callback: {elapsed}s") - return tasks_to_keep + return sorted(tasks_to_keep) type_select.input( update_task_list, @@ -551,6 +570,26 @@ def update_task_list(benchmark_name, type_select, domain_select, lang_select): outputs=[task_select], ) + @cachetools.cached( + cache={}, + key=lambda scores, + tasks, + availability, + compatibility, + instructions, + model_size, + zero_shot: hash( + ( + id(scores), + hash(tuple(tasks)), + hash(availability), + hash(tuple(compatibility)), + hash(instructions), + hash(model_size), + hash(zero_shot), + ) + ), + ) def update_models( scores: list[dict], tasks: list[str], @@ -572,8 +611,11 @@ def update_models( zero_shot_setting=zero_shot, ) elapsed = time.time() - start_time + if model_names == filtered_models: + # This indicates that the models should not be filtered + return None logger.info(f"update_models callback: {elapsed}s") - return filtered_models + return sorted(filtered_models) scores.change( update_models, @@ -667,22 +709,41 @@ def update_models( outputs=[models], ) + @cachetools.cached( + 
cache={}, + key=lambda scores, search_query, tasks, models_to_keep, benchmark_name: hash( + ( + id(scores), + hash(search_query), + hash(tuple(tasks)), + id(models_to_keep), + hash(benchmark_name), + ) + ), + ) def update_tables( scores, search_query: str, tasks, models_to_keep, + benchmark_name: str, ): start_time = time.time() tasks = set(tasks) - models_to_keep = set(models_to_keep) - filtered_scores = [] - for entry in scores: - if entry["task_name"] not in tasks: - continue - if entry["model_name"] not in models_to_keep: - continue - filtered_scores.append(entry) + benchmark = mteb.get_benchmark(benchmark_name) + benchmark_tasks = {task.metadata.name for task in benchmark.tasks} + if (benchmark_tasks != tasks) or (models_to_keep is not None): + filtered_scores = [] + for entry in scores: + if entry["task_name"] not in tasks: + continue + if (models_to_keep is not None) and ( + entry["model_name"] not in models_to_keep + ): + continue + filtered_scores.append(entry) + else: + filtered_scores = scores summary, per_task = scores_to_tables(filtered_scores, search_query) elapsed = time.time() - start_time logger.info(f"update_tables callback: {elapsed}s") @@ -690,26 +751,50 @@ def update_tables( task_select.change( update_tables, - inputs=[scores, searchbar, task_select, models], + inputs=[scores, searchbar, task_select, models, benchmark_select], outputs=[summary_table, per_task_table], ) scores.change( update_tables, - inputs=[scores, searchbar, task_select, models], + inputs=[scores, searchbar, task_select, models, benchmark_select], outputs=[summary_table, per_task_table], ) models.change( update_tables, - inputs=[scores, searchbar, task_select, models], + inputs=[scores, searchbar, task_select, models, benchmark_select], outputs=[summary_table, per_task_table], ) searchbar.submit( update_tables, - inputs=[scores, searchbar, task_select, models], + inputs=[scores, searchbar, task_select, models, benchmark_select], outputs=[summary_table, per_task_table], ) 
gr.Markdown(acknowledgment_md, elem_id="ack_markdown") + +# Prerun on all benchmarks, so that results of callbacks get cached +for benchmark in benchmarks: + bench_languages, bench_domains, bench_types, bench_tasks, bench_scores = ( + on_benchmark_select(benchmark.name) + ) + filtered_models = update_models( + bench_scores, + bench_tasks, + availability=None, + compatibility=[], + instructions=None, + model_size=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), + zero_shot="soft", + ) + # We have to call this both on the filtered and unfiltered task, because the callbacks + # also gets called twice for some reason + update_tables(bench_scores, "", bench_tasks, filtered_models, benchmark.name) + filtered_tasks = update_task_list( + benchmark.name, bench_types, bench_domains, bench_languages + ) + update_tables(bench_scores, "", filtered_tasks, filtered_models, benchmark.name) + + if __name__ == "__main__": demo.launch() diff --git a/pyproject.toml b/pyproject.toml index b194ad9848..960fecb44b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,7 +64,7 @@ dev = [ codecarbon = ["codecarbon"] speedtask = ["GPUtil>=1.4.0", "psutil>=5.9.8"] peft = ["peft>=0.11.0"] -leaderboard = ["gradio>=5.16.0", "gradio_rangeslider>=0.0.8", "plotly>=5.24.0,<6.0.0"] +leaderboard = ["gradio>=5.16.0,<6.0.0", "gradio_rangeslider>=0.0.8", "plotly>=5.24.0,<6.0.0", "cachetools>=5.2.0"] flagembedding = ["FlagEmbedding"] jina = ["einops>=0.8.0"] flash_attention = ["flash-attn>=2.6.3"] From 6637ff95945b12a61594d09eafe88c82d3dfe4e4 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Mon, 17 Feb 2025 15:20:24 +0100 Subject: [PATCH 039/233] fix: Missing fixes for #2086 - change MultilingualSentiment split from test to validation in CMTEB (#2088) * fix: Fixed previous incorrect specification of splits for CMTEB ( MTEB(cmn, v1) ) Fixes #2064 * change MultilingualSentiment split from test to validation in CMTEB --- mteb/benchmarks/benchmarks.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git 
a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index b491ed5651..6e4f10b617 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -1224,6 +1224,12 @@ ], eval_splits=["test"], ) + + get_tasks( + tasks=[ + "MultilingualSentiment", + ], + eval_splits=["validation"], + ) ), description="The Chinese Massive Text Embedding Benchmark (C-MTEB) is a comprehensive benchmark for Chinese text embeddings covering 6 tasks and 35 datasets.", reference="https://github.com/FlagOpen/FlagEmbedding/tree/master/research/C_MTEB", From 1f9cfc80bcc7624d8493005bb4a970b618982246 Mon Sep 17 00:00:00 2001 From: github-actions Date: Mon, 17 Feb 2025 14:42:14 +0000 Subject: [PATCH 040/233] 1.34.20 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 960fecb44b..8f6cad6e9a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.34.19" +version = "1.34.20" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 1b1d327ee62eaea7a2ff44b9f1a16039a77e208d Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Mon, 17 Feb 2025 20:47:17 +0300 Subject: [PATCH 041/233] merge gme models (#2089) --- mteb/models/gme_models.py | 65 ------------------------------------- mteb/models/gme_v_models.py | 26 +++------------ mteb/models/overview.py | 5 ++- 3 files changed, 6 insertions(+), 90 deletions(-) delete mode 100644 mteb/models/gme_models.py diff --git a/mteb/models/gme_models.py b/mteb/models/gme_models.py deleted file mode 100644 index 42c0e48e14..0000000000 --- a/mteb/models/gme_models.py +++ /dev/null @@ -1,65 +0,0 @@ -from __future__ import annotations - -import logging - -from mteb.model_meta import ModelMeta - -logger = logging.getLogger(__name__) - -gme_qwen2_vl_2b_instruct = ModelMeta( - loader=None, - 
name="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", - languages=["eng_Latn"], - open_weights=True, - revision="cfeb66885b598de483cc04eb08c7d9da534d7afe", - release_date="2024-12-21", - n_parameters=int(2.21 * 1e9), - memory_usage_mb=8427, - max_tokens=32768, - embed_dim=1536, - license="mit", - similarity_fn_name="cosine", - framework=["PyTorch"], - reference="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", - use_instructions=True, - adapted_from=None, - superseded_by=None, - training_datasets={ - # Only annotating text data for now - # source: https://arxiv.org/pdf/2412.16855 - "MSMARCO": ["train"], - "MSMARCO.v2": ["train"], - "mMARCO-NL": ["train"], # translation not trained on - }, - public_training_code=None, - public_training_data=None, -) - -gme_qwen2_vl_7b_instruct = ModelMeta( - loader=None, - name="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", - languages=["eng_Latn"], - open_weights=True, - revision="d42eca5a540526cfa982a349724b24b25c12a95e", - release_date="2024-12-21", - n_parameters=int(8.29 * 1e9), - memory_usage_mb=8427, - max_tokens=32768, - embed_dim=3584, - license="mit", - similarity_fn_name="cosine", - framework=["PyTorch"], - reference="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct", - use_instructions=True, - adapted_from=None, - superseded_by=None, - training_datasets={ - # Only annotating text data for now - # source: https://arxiv.org/pdf/2412.16855 - "MSMARCO": ["train"], - "MSMARCO.v2": ["train"], - "mMARCO-NL": ["train"], # translation not trained on - }, - public_training_code=None, - public_training_data=None, -) diff --git a/mteb/models/gme_v_models.py b/mteb/models/gme_v_models.py index b12bd75eb3..19f6e4714a 100644 --- a/mteb/models/gme_v_models.py +++ b/mteb/models/gme_v_models.py @@ -12,9 +12,9 @@ from tqdm.autonotebook import tqdm from transformers import AutoModelForVision2Seq, AutoProcessor -import mteb from mteb.encoder_interface import PromptType from mteb.model_meta import ModelMeta +from mteb.models.wrapper 
import Wrapper logging.basicConfig(level=logging.WARNING) logger = logging.getLogger(__name__) @@ -130,7 +130,7 @@ def embed( return embeddings -class GmeQwen2VL: +class GmeQwen2VL(Wrapper): def __init__( self, model_name: str = HF_GME_QWEN2VL_2B, @@ -168,8 +168,7 @@ def encode( ) def encode_queries(self, queries: list[str], **kwargs): - kwargs.update(prompt_type=PromptType.query) - embeddings = self.encode(queries, **kwargs) + embeddings = self.encode(queries, prompt_type=PromptType.query, **kwargs) return embeddings def encode_corpus(self, corpus: list[dict[str, str]], **kwargs): @@ -187,8 +186,7 @@ def encode_corpus(self, corpus: list[dict[str, str]], **kwargs): else doc["text"].strip() for doc in corpus ] - kwargs.update(prompt_type=PromptType.passage) - embeddings = self.encode(sentences, is_query=False, **kwargs) + embeddings = self.encode(sentences, prompt_type=PromptType.passage, **kwargs) return embeddings def get_image_embeddings(self, images: list[Image.Image] | DataLoader, **kwargs): @@ -206,22 +204,6 @@ def calculate_probs(self, text_embeddings, image_embeddings): probs = (logits * 100).softmax(dim=-1) return probs - ## FIXME: Might want to subclass from Wrapper. - def get_instruction(task_name: str, prompt_type: PromptType | None) -> str: - """Get the instruction/prompt to be used for encoding sentences.""" - task = mteb.get_task(task_name=task_name) - task_metadata = task.metadata - if isinstance(task_metadata.prompt, dict) and prompt_type: - if task_metadata.prompt.get(prompt_type.value): - return task_metadata.prompt[prompt_type.value] - logger.warning( - f"Prompt type '{prompt_type}' not found in task metadata for task '{task_name}'."
- ) - return "" - if task_metadata.prompt: - return task_metadata.prompt - return task.abstask_prompt - def get_fused_embeddings( self, texts: list[str] | None = None, diff --git a/mteb/models/overview.py b/mteb/models/overview.py index ef41f79088..470b0fa72a 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -30,7 +30,7 @@ e5_v, evaclip_models, fa_models, - gme_models, + gme_v_models, google_models, gritlm_models, gte_models, @@ -93,11 +93,9 @@ e5_models, e5_v, evaclip_models, - gme_models, google_models, gritlm_models, gte_models, - gme_models, ibm_granite_models, inf_models, jasper_models, @@ -118,6 +116,7 @@ openai_models, openclip_models, piccolo_models, + gme_v_models, promptriever_models, qtack_models, repllama_models, From 3deb7eaf3d57752f625abf26d561b45ab0c47d98 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Tue, 18 Feb 2025 07:51:47 +0200 Subject: [PATCH 042/233] fix: Add back task filtering by modalities (#2080) * add back task filtering by modalities * add unit test * check if task modalities is a subset of model modalities and fix tests * add model_modalities_more_than_task_modalities case --- mteb/evaluation/MTEB.py | 12 +++++ tests/test_benchmark/mock_models.py | 62 ++++++++++++++++++++++++++ tests/test_benchmark/test_benchmark.py | 42 +++++++++++++++++ 3 files changed, 116 insertions(+) diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index e59940f013..96a1a97ad5 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -497,6 +497,18 @@ def run( del self.tasks[0] # empty memory continue + # NOTE: skip evaluation if the model does not support all of the task's modalities. + # If the model covers more than the task's modalities, evaluation will still be run. 
+ sorted_task_modalities = sorted(task.metadata.modalities) + if meta.modalities is not None and any( + m not in meta.modalities for m in sorted_task_modalities + ): + logger.info( + f"{meta.name} only supports {meta.modalities}, but the task modalities are {sorted_task_modalities}." + ) + del self.tasks[0] # empty memory + continue + task_eval_splits = ( eval_splits if eval_splits is not None else task.eval_splits ) diff --git a/tests/test_benchmark/mock_models.py b/tests/test_benchmark/mock_models.py index 1043c791f6..a0d35cf8fa 100644 --- a/tests/test_benchmark/mock_models.py +++ b/tests/test_benchmark/mock_models.py @@ -15,6 +15,7 @@ import mteb from mteb import SentenceTransformerWrapper from mteb.encoder_interface import PromptType +from mteb.model_meta import ModelMeta from tests.test_benchmark.task_grid import MOCK_TASK_TEST_GRID @@ -43,6 +44,28 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): class MockCLIPEncoder: + mteb_model_meta = ModelMeta( + name="MockCLIPModel", + languages=["eng_Latn"], + revision="3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268", + release_date="2021-02-06", + modalities=["image", "text"], + n_parameters=86_600_000, + memory_usage_mb=330, + max_tokens=None, + embed_dim=768, + license=None, + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/openai/clip-vit-base-patch32", + similarity_fn_name=None, + use_instructions=False, + training_datasets=None, + ) + model_card_data = mteb_model_meta + def __init__(self): pass @@ -66,6 +89,45 @@ def calculate_probs(self, text_embeddings, image_embeddings): return torch.randn(image_embeddings.shape[0], text_embeddings.shape[0]) +class MockMocoEncoder: + mteb_model_meta = ModelMeta( + name="MockMocoModel", + languages=["eng_Latn"], + revision="7d091cd70772c5c0ecf7f00b5f12ca609a99d69d", + release_date="2024-01-01", + modalities=["image"], + n_parameters=86_600_000, + memory_usage_mb=330, + 
max_tokens=None, + embed_dim=768, + license=None, + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch"], + reference="https://github.com/facebookresearch/moco-v3", + similarity_fn_name=None, + use_instructions=False, + training_datasets=None, + ) + model_card_data = mteb_model_meta + + def __init__(self): + pass + + def get_text_embeddings(self, texts, **kwargs): + pass + + def get_image_embeddings(self, images, **kwargs): + pass + + def get_fused_embeddings(self, texts, images, **kwargs): + pass + + def calculate_probs(self, text_embeddings, image_embeddings): + pass + + class MockSentenceTransformer(SentenceTransformer): """A mock implementation of the SentenceTransformer intended to implement just the encode, method using the same arguments.""" diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index d114900b1a..d7357664fe 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -4,6 +4,7 @@ import logging from pathlib import Path +from unittest.mock import patch import numpy as np import pytest @@ -13,8 +14,11 @@ import mteb import mteb.overview from mteb.create_meta import generate_readme +from mteb.evaluation.MTEB import logger from .mock_models import ( + MockCLIPEncoder, + MockMocoEncoder, MockNumpyEncoder, MockSentenceTransformer, MockSentenceTransformerWrapper, @@ -22,6 +26,8 @@ MockTorchEncoder, ) from .mock_tasks import ( + MockImageClusteringTask, + MockImageTextPairClassificationTask, MockInstructionRetrival, MockMultilingualInstructionRetrival, MockMultilingualRerankingTask, @@ -336,3 +342,39 @@ def encode(self, sentences, prompt_name: str | None = None, *args, **kwargs): output_folder=tmp_path.as_posix(), overwrite_results=True, ) + + +# NOTE: Covers image and image-text tasks. Can be extended to cover new mixed-modality task types. 
+@pytest.mark.parametrize( + "task", [MockImageTextPairClassificationTask(), MockRetrievalTask()] +) +@patch.object(logger, "info") +def test_task_modality_filtering(mock_logger, task): + eval = mteb.MTEB(tasks=[task]) + + # Run the evaluation + eval.run( + model=MockMocoEncoder(), + output_folder="tests/results", + overwrite_results=True, + ) + + # Check that the task was skipped and the correct log message was generated + task_modalities = ", ".join( + f"'{modality}'" for modality in sorted(task.metadata.modalities) + ) + mock_logger.assert_called_with( + f"MockMocoModel only supports ['image'], but the task modalities are [{task_modalities}]." + ) + + +@pytest.mark.parametrize("task", [MockImageClusteringTask()]) +def test_task_modality_filtering_model_modalities_more_than_task_modalities(task): + eval = mteb.MTEB(tasks=[task]) + + # Run the evaluation + eval.run( + model=MockCLIPEncoder(), + output_folder="tests/results", + overwrite_results=True, + ) From 544bcd1c9e78622dc45d7761a89d7fac07d26b71 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 18 Feb 2025 06:18:09 +0000 Subject: [PATCH 043/233] 1.34.21 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8f6cad6e9a..eb3401e0ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.34.20" +version = "1.34.21" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From bbfbc45a168cf7617d10c021d97b5beafde04dc1 Mon Sep 17 00:00:00 2001 From: sufen-f Date: Tue, 18 Feb 2025 01:46:57 -0800 Subject: [PATCH 044/233] Added gtr-t5-base/large/xl/xxl metadata to mteb (#2092) * Added GTR Models to codebase * Linted gtr models file. 
* Added gtr-base/large/xl/xxl to sentence_transformers_models.py * Added memory_usage_mb and training_datasets * Reformatted training dataset names * Reformatted training dataset names * Reformatted training dataset names --------- Co-authored-by: sufen --- mteb/models/sentence_transformers_models.py | 126 ++++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index 275224db61..add6689699 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -374,3 +374,129 @@ public_training_data=None, training_datasets={"SNLI": ["train"], "Community QA": ["train"]}, ) +gtr_t5_large = ModelMeta( + name="sentence-transformers/gtr-t5-large", + languages=["eng-Latn"], # in format eng-Latn + open_weights=True, + revision="a2c8ac47f998531948d4cbe32a0b577a7037a5e3", + release_date="2022-02-09", + n_parameters=335_000_000, + memory_usage_mb=639, + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/sentence-transformers/gtr-t5-large", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code=None, + public_training_data=None, + training_datasets={ + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "mMARCO-NL": ["train"], # translation not trained on + "NQ": ["train"], + "NQ-NL": ["train"], # translation not trained on + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "Community QA": ["train"], + }, +) + +gtr_t5_xl = ModelMeta( + name="sentence-transformers/gtr-t5-xl", + languages=["eng-Latn"], # in format eng-Latn + open_weights=True, + revision="23a8d667a1ad2578af181ce762867003c498d1bf", + release_date="2022-02-09", + n_parameters=1_240_000_000, + 
memory_usage_mb=2367, + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/sentence-transformers/gtr-t5-xl", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code=None, + public_training_data=None, + training_datasets={ + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "mMARCO-NL": ["train"], # translation not trained on + "NQ": ["train"], + "NQ-NL": ["train"], # translation not trained on + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "Community QA": ["train"], + }, +) +gtr_t5_xxl = ModelMeta( + name="sentence-transformers/gtr-t5-xxl", + languages=["eng-Latn"], # in format eng-Latn + open_weights=True, + revision="73f2a9156a3dcc2194dfdb2bf201cd7d17e17884", + release_date="2022-02-09", + n_parameters=4_860_000_000, + memory_usage_mb=9279, + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/sentence-transformers/gtr-t5-xxl", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code=None, + public_training_data=None, + training_datasets={ + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "mMARCO-NL": ["train"], # translation not trained on + "NQ": ["train"], + "NQ-NL": ["train"], # translation not trained on + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "Community QA": ["train"], + }, +) + +gtr_t5_base = ModelMeta( + name="sentence-transformers/gtr-t5-base", + languages=["eng-Latn"], # in format eng-Latn + open_weights=True, + revision="7027e9594267928589816394bdd295273ddc0739", + 
release_date="2022-02-09", + n_parameters=110_000_000, + memory_usage_mb=209, + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/sentence-transformers/gtr-t5-base", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code=None, + public_training_data=None, + training_datasets={ + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "mMARCO-NL": ["train"], # translation not trained on + "NQ": ["train"], + "NQ-NL": ["train"], # translation not trained on + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "Community QA": ["train"], + }, +) From 0371102c5288cf7246d990db42d3d129aa1af02a Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Tue, 18 Feb 2025 11:56:17 +0200 Subject: [PATCH 045/233] misc: Add Any2TextMutipleChoice Descriptive Statistics (#2095) * add Any2TextMutipleChoiceDescriptiveStatistics * run on all tasks --- .../Image/AbsTaskAny2TextMultipleChoice.py | 85 ++++++++++++++++++- .../Any2TextMutipleChoice/CVBenchCount.json | 37 ++++++++ .../Any2TextMutipleChoice/CVBenchDepth.json | 25 ++++++ .../CVBenchDistance.json | 25 ++++++ .../CVBenchRelation.json | 25 ++++++ 5 files changed, 195 insertions(+), 2 deletions(-) create mode 100644 mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchCount.json create mode 100644 mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDepth.json create mode 100644 mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDistance.json create mode 100644 mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchRelation.json diff --git a/mteb/abstasks/Image/AbsTaskAny2TextMultipleChoice.py b/mteb/abstasks/Image/AbsTaskAny2TextMultipleChoice.py index 50991b6aee..f3a9fc1ec5 100644 --- a/mteb/abstasks/Image/AbsTaskAny2TextMultipleChoice.py +++ 
b/mteb/abstasks/Image/AbsTaskAny2TextMultipleChoice.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +from collections import Counter from typing import Any from datasets import Dataset @@ -8,10 +9,57 @@ from ...encoder_interface import Encoder from ...evaluation.evaluators import Any2TextMultipleChoiceEvaluator from ..AbsTask import AbsTask, ScoresDict +from ..TaskMetadata import DescriptiveStatistics logger = logging.getLogger(__name__) +class Any2TextMutipleChoiceDescriptiveStatistics(DescriptiveStatistics): + """Descriptive statistics for Any2TextMutipleChoice + + Attributes: + num_samples: number of samples in the dataset. + + min_image_width: Minimum width of images + average_image_width: Average width of images + max_image_width: Maximum width of images + + min_image_height: Minimum height of images + average_image_height: Average height of images + max_image_height: Maximum height of images + + min_num_choices: Minimum number of choices + average_num_choices: Average number of choices + max_num_choices: Maximum number of choices + + answers: dict of answer frequencies + + min_question_length: Minimum length of questions + average_question_length: Average length of questions + max_question_length: Maximum length of questions + """ + + num_samples: int + + min_image_width: float + average_image_width: float + max_image_width: float + + min_image_height: float + average_image_height: float + max_image_height: float + + min_num_choices: int + average_num_choices: float + max_num_choices: int + + answers: dict[str, dict[str, int]] + + min_question_length: int + average_question_length: float + max_question_length: int + + class AbsTaskAny2TextMultipleChoice(AbsTask): """Abstract class for Any to Text Multiple Choice tasks, where the queries and be either text or image, or both. 
@@ -34,8 +82,41 @@ def _add_main_score(self, scores) -> None: def _calculate_metrics_from_split( self, split: str, hf_subset: str | None = None, compute_overall: bool = False - ): - pass + ) -> Any2TextMutipleChoiceDescriptiveStatistics: + imgs = self.dataset[split][self.query_column_names["image"]] + questions = self.dataset[split][self.query_column_names["text"]] + choices = self.dataset[split][self.choices_column_name] + answers = self.dataset[split][self.label_column_name] + + num_samples = len(answers) + answer_count = Counter(answers) + img_widths, img_heights = [], [] + for img in imgs: + width, height = img.size + img_heights.append(height) + img_widths.append(width) + + choices_len = [len(c) for c in choices] + questions_len = [len(q) for q in questions] + + return Any2TextMutipleChoiceDescriptiveStatistics( + num_samples=num_samples, + min_image_width=min(img_widths), + average_image_width=sum(img_widths) / len(img_widths), + max_image_width=max(img_widths), + min_image_height=min(img_heights), + average_image_height=sum(img_heights) / len(img_heights), + max_image_height=max(img_heights), + min_num_choices=min(choices_len), + average_num_choices=sum(choices_len) / len(choices_len), + max_num_choices=max(choices_len), + min_question_length=min(questions_len), + average_question_length=sum(questions_len) / len(questions_len), + max_question_length=max(questions_len), + answers={ + str(answer): {"count": count} for answer, count in answer_count.items() + }, + ) def _evaluate_subset( self, diff --git a/mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchCount.json b/mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchCount.json new file mode 100644 index 0000000000..9a4d8077c7 --- /dev/null +++ b/mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchCount.json @@ -0,0 +1,37 @@ +{ + "test": { + "num_samples": 788, + "min_image_width": 200, + "average_image_width": 757.6789340101523, + "max_image_width": 2200, + "min_image_height": 181, + 
"average_image_height": 631.3147208121827, + "max_image_height": 2200, + "min_num_choices": 4, + "average_num_choices": 4.550761421319797, + "max_num_choices": 6, + "min_question_length": 30, + "average_question_length": 34.35406091370558, + "max_question_length": 45, + "answers": { + "2": { + "count": 169 + }, + "4": { + "count": 63 + }, + "3": { + "count": 167 + }, + "1": { + "count": 184 + }, + "0": { + "count": 182 + }, + "5": { + "count": 23 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDepth.json b/mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDepth.json new file mode 100644 index 0000000000..1995597a46 --- /dev/null +++ b/mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDepth.json @@ -0,0 +1,25 @@ +{ + "test": { + "num_samples": 600, + "min_image_width": 561, + "average_image_width": 1090.9616666666666, + "max_image_width": 1600, + "min_image_height": 427, + "average_image_height": 715.985, + "max_image_height": 900, + "min_num_choices": 2, + "average_num_choices": 2.0, + "max_num_choices": 2, + "min_question_length": 130, + "average_question_length": 136.04333333333332, + "max_question_length": 147, + "answers": { + "0": { + "count": 300 + }, + "1": { + "count": 300 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDistance.json b/mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDistance.json new file mode 100644 index 0000000000..439aa253b6 --- /dev/null +++ b/mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDistance.json @@ -0,0 +1,25 @@ +{ + "test": { + "num_samples": 600, + "min_image_width": 561, + "average_image_width": 1099.2883333333334, + "max_image_width": 1600, + "min_image_height": 427, + "average_image_height": 720.9983333333333, + "max_image_height": 900, + "min_num_choices": 2, + "average_num_choices": 2.0, + "max_num_choices": 2, + "min_question_length": 204, + 
"average_question_length": 212.40333333333334, + "max_question_length": 223, + "answers": { + "0": { + "count": 303 + }, + "1": { + "count": 297 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchRelation.json b/mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchRelation.json new file mode 100644 index 0000000000..e0587321da --- /dev/null +++ b/mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchRelation.json @@ -0,0 +1,25 @@ +{ + "test": { + "num_samples": 650, + "min_image_width": 189, + "average_image_width": 546.3169230769231, + "max_image_width": 2200, + "min_image_height": 190, + "average_image_height": 448.4492307692308, + "max_image_height": 2200, + "min_num_choices": 2, + "average_num_choices": 2.0, + "max_num_choices": 2, + "min_question_length": 132, + "average_question_length": 181.45846153846153, + "max_question_length": 224, + "answers": { + "0": { + "count": 327 + }, + "1": { + "count": 323 + } + } + } +} \ No newline at end of file From 9ca55f055943c44cadf654d6b49cc67b88b45d3d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 18 Feb 2025 09:58:47 +0000 Subject: [PATCH 046/233] Update tasks table --- docs/tasks.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/tasks.md b/docs/tasks.md index 7dbd5935ae..31daafeb84 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -175,10 +175,10 @@ The following tables give you an overview of the tasks in MTEB. 
| [CUADWarrantyDurationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [CUB200I2IRetrieval](https://www.florian-schroff.de/publications/CUB-200.pdf) (Welinder et al., 2010) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | None | None | | [CUREv1](https://huggingface.co/datasets/clinia/CUREv1) | ['eng', 'fra', 'spa'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | -| [CVBenchCount](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | Any2TextMutipleChoice | it2t | [Academic] | None | None | -| [CVBenchDepth](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | Any2TextMutipleChoice | it2t | [Academic] | None | None | -| [CVBenchDistance](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | Any2TextMutipleChoice | it2t | [Academic] | None | None | -| [CVBenchRelation](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | Any2TextMutipleChoice | it2t | [Academic] | None | None | +| [CVBenchCount](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | Any2TextMutipleChoice | it2t | [Academic] | {'test': 788} | {'test': {'num_samples': 788, 'min_image_width': 200, 'average_image_width': 757.68, 'max_image_width': 2200, 'min_image_height': 181, 'average_image_height': 631.31, 'max_image_height': 2200, 'min_num_choices': 4, 'average_num_choices': 4.55, 'max_num_choices': 6, 'min_question_length': 30, 'average_question_length': 34.35, 'max_question_length': 45, 'answers': {'2': {'count': 169}, '4': {'count': 63}, '3': {'count': 167}, '1': {'count': 184}, '0': {'count': 182}, '5': {'count': 23}}}} | +| [CVBenchDepth](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | Any2TextMutipleChoice | it2t | [Academic] | {'test': 600} | {'test': {'num_samples': 600, 'min_image_width': 561, 'average_image_width': 1090.96, 'max_image_width': 1600, 'min_image_height': 427, 
'average_image_height': 715.99, 'max_image_height': 900, 'min_num_choices': 2, 'average_num_choices': 2.0, 'max_num_choices': 2, 'min_question_length': 130, 'average_question_length': 136.04, 'max_question_length': 147, 'answers': {'0': {'count': 300}, '1': {'count': 300}}}} | +| [CVBenchDistance](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | Any2TextMutipleChoice | it2t | [Academic] | {'test': 600} | {'test': {'num_samples': 600, 'min_image_width': 561, 'average_image_width': 1099.29, 'max_image_width': 1600, 'min_image_height': 427, 'average_image_height': 721.0, 'max_image_height': 900, 'min_num_choices': 2, 'average_num_choices': 2.0, 'max_num_choices': 2, 'min_question_length': 204, 'average_question_length': 212.4, 'max_question_length': 223, 'answers': {'0': {'count': 303}, '1': {'count': 297}}}} | +| [CVBenchRelation](https://arxiv.org/pdf/2406.16860) (Tong et al., 2024) | ['eng'] | Any2TextMutipleChoice | it2t | [Academic] | {'test': 650} | {'test': {'num_samples': 650, 'min_image_width': 189, 'average_image_width': 546.32, 'max_image_width': 2200, 'min_image_height': 190, 'average_image_height': 448.45, 'max_image_height': 2200, 'min_num_choices': 2, 'average_num_choices': 2.0, 'max_num_choices': 2, 'min_question_length': 132, 'average_question_length': 181.46, 'max_question_length': 224, 'answers': {'0': {'count': 327}, '1': {'count': 323}}}} | | [Caltech101](https://ieeexplore.ieee.org/document/1384978) (Li Fei-Fei, 2004) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 6084} | {'test': {'num_samples': 6084, 'unique_num_labels': 102, 'min_image_width': 80, 'average_image_width': 311.72, 'max_image_width': 3481, 'min_image_height': 101, 'average_image_height': 241.84, 'max_image_height': 3999, 'labels': {'4': {'count': 437}, '37': {'count': 405}, '38': {'count': 405}, '57': {'count': 170}, '66': {'count': 768}, '0': {'count': 25}, '1': {'count': 770}, '2': {'count': 12}, '3': {'count': 12}, '5': {'count': 17}, '6': 
{'count': 24}, '7': {'count': 16}, '8': {'count': 3}, '9': {'count': 98}, '10': {'count': 68}, '11': {'count': 13}, '12': {'count': 55}, '13': {'count': 61}, '14': {'count': 20}, '15': {'count': 13}, '16': {'count': 93}, '17': {'count': 17}, '18': {'count': 29}, '19': {'count': 32}, '20': {'count': 77}, '22': {'count': 39}, '23': {'count': 43}, '24': {'count': 40}, '25': {'count': 20}, '26': {'count': 21}, '27': {'count': 27}, '28': {'count': 37}, '29': {'count': 22}, '30': {'count': 35}, '31': {'count': 38}, '32': {'count': 45}, '33': {'count': 34}, '34': {'count': 23}, '35': {'count': 34}, '36': {'count': 55}, '39': {'count': 37}, '40': {'count': 37}, '41': {'count': 15}, '42': {'count': 4}, '43': {'count': 4}, '44': {'count': 21}, '45': {'count': 69}, '46': {'count': 70}, '47': {'count': 12}, '48': {'count': 24}, '49': {'count': 58}, '50': {'count': 50}, '51': {'count': 1}, '52': {'count': 34}, '53': {'count': 56}, '54': {'count': 84}, '55': {'count': 31}, '56': {'count': 51}, '58': {'count': 48}, '59': {'count': 11}, '60': {'count': 36}, '61': {'count': 13}, '62': {'count': 10}, '63': {'count': 57}, '64': {'count': 2}, '65': {'count': 46}, '67': {'count': 25}, '68': {'count': 5}, '69': {'count': 9}, '70': {'count': 17}, '71': {'count': 8}, '72': {'count': 15}, '73': {'count': 23}, '74': {'count': 4}, '75': {'count': 27}, '76': {'count': 52}, '77': {'count': 29}, '78': {'count': 19}, '79': {'count': 10}, '80': {'count': 33}, '81': {'count': 9}, '82': {'count': 54}, '83': {'count': 27}, '84': {'count': 5}, '85': {'count': 34}, '86': {'count': 15}, '87': {'count': 56}, '88': {'count': 29}, '89': {'count': 34}, '90': {'count': 5}, '91': {'count': 55}, '92': {'count': 19}, '93': {'count': 56}, '94': {'count': 45}, '95': {'count': 209}, '96': {'count': 7}, '97': {'count': 29}, '98': {'count': 4}, '99': {'count': 26}, '100': {'count': 9}, '101': {'count': 30}, '21': {'count': 17}}}} | | [Caltech101ZeroShot](https://ieeexplore.ieee.org/document/1384978) (Li Fei-Fei, 
2004) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | None | None | | [CanadaTaxCourtOutcomesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | From e0b364b5961e392e7662bfab9bf5ddb460b4943f Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Wed, 19 Feb 2025 15:36:36 +0100 Subject: [PATCH 047/233] fix: Updated model annotations for GTE, e5, gritlm, and SFR models (#2101) Reported with references to paper + qoutes. --- mteb/models/arctic_models.py | 246 +++++++------------------------ mteb/models/e5_instruct.py | 6 + mteb/models/gte_models.py | 6 +- mteb/models/salesforce_models.py | 5 + 4 files changed, 68 insertions(+), 195 deletions(-) diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py index 9009656dac..8397157f43 100644 --- a/mteb/models/arctic_models.py +++ b/mteb/models/arctic_models.py @@ -81,8 +81,44 @@ "zho_Hans", ] +arctic_v1_training_datasets = { + # source: https://arxiv.org/pdf/2405.05374 + # splits not specified to assuming everything + # in MTEB + "NQ": ["test"], + "NQ-NL": ["test"], # translated from NQ (not trained on) + "NQHardNegatives": ["test"], + "NQ-PL": ["test"], + "HotPotQA": ["test"], # translated, not trained on + "HotPotQAHardNegatives": ["test"], + "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) + "HotpotQA-NL": ["test"], # translated from hotpotQA (not trained on) + "FEVER": ["test"], + "FEVER-NL": ["test"], # translated from FEVER (not trained on) + "FEVERHardNegatives": ["test"], + # not in MTEB + # trained on stack exchange (title-body) + # "stackexchange": [], + # potentially means that: + # "StackExchangeClusteringP2P": ["test"], + # "StackExchangeClusteringP2P.v2": ["test"], + # "StackExchangeClustering": ["test"], + # "StackExchangeClustering.v2": ["test"], + # not in MTEB + # "paq": [], + # "s2orc": [], + # "other": [], # undisclosed including webdata +} # also 
use synthetic + +arctic_v2_training_datasets = { + **arctic_v1_training_datasets, + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], +} + arctic_embed_xs = ModelMeta( - loader=partial( + loader=partial( # type: ignore sentence_transformers_loader, model_name="Snowflake/snowflake-arctic-embed-xs", revision="742da4f66e1823b5b4dbe6c320a1375a1fd85f9e", @@ -105,39 +141,12 @@ superseded_by=None, public_training_code=None, public_training_data=None, - training_datasets={ - # source: https://arxiv.org/pdf/2405.05374 - # splits not specified to assuming everything - # in MTEB - "NQ": ["test"], - "NQ-NL": ["test"], # translated from NQ (not trained on) - "NQHardNegatives": ["test"], - "NQ-PL": ["test"], - "HotPotQA": ["test"], # translated, not trained on - "HotPotQAHardNegatives": ["test"], - "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) - "HotpotQA-NL": ["test"], # translated from hotpotQA (not trained on) - "FEVER": ["test"], - "FEVER-NL": ["test"], # translated from FEVER (not trained on) - "FEVERHardNegatives": ["test"], - # not in MTEB - # trained on stack exchange (title-body) - # "stackexchange": [], - # potentially means that: - # "StackExchangeClusteringP2P": ["test"], - # "StackExchangeClusteringP2P.v2": ["test"], - # "StackExchangeClustering": ["test"], - # "StackExchangeClustering.v2": ["test"], - # not in MTEB - # "paq": [], - # "s2orc": [], - # "other": [], # undisclosed including webdata - }, # also use synthetic + training_datasets=arctic_v1_training_datasets, ) arctic_embed_s = ModelMeta( - loader=partial( + loader=partial( # type: ignore sentence_transformers_loader, model_name="Snowflake/snowflake-arctic-embed-s", revision="d3c1d2d433dd0fdc8e9ca01331a5f225639e798f", @@ -160,38 +169,12 @@ superseded_by=None, public_training_code=None, public_training_data=None, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2405.05374 - # splits not specified to 
assuming everything - # in MTEB - "NQ": ["test"], - "NQ-NL": ["test"], # translated from NQ (not trained on) - "NQHardNegatives": ["test"], - "HotPotQA": ["test"], - "HotPotQAHardNegatives": ["test"], - "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) - "HotpotQA-NL": ["test"], # translated from hotpotQA (not trained on) - "FEVER": ["test"], - "FEVER-NL": ["test"], # translated from FEVER (not trained on) - "FEVERHardNegatives": ["test"], - # not in MTEB - # trained on stack exchange (title-body) - # "stackexchange": [], - # potentially means that: - # "StackExchangeClusteringP2P": ["test"], - # "StackExchangeClusteringP2P.v2": ["test"], - # "StackExchangeClustering": ["test"], - # "StackExchangeClustering.v2": ["test"], - # not in MTEB - # "paq": [], - # "s2orc": [], - # "other": [], # undisclosed including webdata - }, # also use synthetic + training_datasets=arctic_v1_training_datasets, ) arctic_embed_m = ModelMeta( - loader=partial( + loader=partial( # type: ignore sentence_transformers_loader, model_name="Snowflake/snowflake-arctic-embed-m", revision="cc17beacbac32366782584c8752220405a0f3f40", @@ -214,37 +197,11 @@ superseded_by="Snowflake/snowflake-arctic-embed-m-v1.5", public_training_code=None, public_training_data=None, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2405.05374 - # splits not specified to assuming everything - # in MTEB - "NQ": ["test"], - "NQ-NL": ["test"], # translated from NQ (not trained on) - "NQHardNegatives": ["test"], - "HotPotQA": ["test"], - "HotPotQAHardNegatives": ["test"], - "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) - "HotpotQA-NL": ["test"], # translated from hotpotQA (not trained on) - "FEVER": ["test"], - "FEVER-NL": ["test"], # translated from FEVER (not trained on) - "FEVERHardNegatives": ["test"], - # not in MTEB - # trained on stack exchange (title-body) - # "stackexchange": [], - # potentially means that: - # "StackExchangeClusteringP2P": ["test"], 
- # "StackExchangeClusteringP2P.v2": ["test"], - # "StackExchangeClustering": ["test"], - # "StackExchangeClustering.v2": ["test"], - # not in MTEB - # "paq": [], - # "s2orc": [], - # "other": [], # undisclosed including webdata - }, # also use synthetic + training_datasets=arctic_v1_training_datasets, ) arctic_embed_m_long = ModelMeta( - loader=partial( + loader=partial( # type: ignore sentence_transformers_loader, model_name="Snowflake/snowflake-arctic-embed-m-long", revision="89d0f6ab196eead40b90cb6f9fefec01a908d2d1", @@ -268,38 +225,11 @@ superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0", public_training_code=None, public_training_data=None, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2405.05374 - # splits not specified to assuming everything - # in MTEB - "NQ": ["test"], - "NQ-NL": ["test"], # translated from NQ (not trained on) - "NQHardNegatives": ["test"], - "HotPotQA": ["test"], - "HotPotQAHardNegatives": ["test"], - "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) - "HotpotQA-NL": ["test"], # translated from hotpotQA (not trained on) - "FEVER": ["test"], - "FEVER-NL": ["test"], # translated from FEVER (not trained on) - "FEVERHardNegatives": ["test"], - # trained on stack exchange, unsure if sources match - # not in MTEB - # trained on stack exchange (title-body) - # "stackexchange": [], - # potentially means that: - # "StackExchangeClusteringP2P": ["test"], - # "StackExchangeClusteringP2P.v2": ["test"], - # "StackExchangeClustering": ["test"], - # "StackExchangeClustering.v2": ["test"], - # not in MTEB - # "paq": [], - # "s2orc": [], - # "other": [], # undisclosed including webdata - }, # also use synthetic + training_datasets=arctic_v1_training_datasets, ) arctic_embed_l = ModelMeta( - loader=partial( + loader=partial( # type: ignore sentence_transformers_loader, model_name="Snowflake/snowflake-arctic-embed-l", revision="9a9e5834d2e89cdd8bb72b64111dde496e4fe78c", @@ -322,37 +252,11 @@ 
superseded_by="Snowflake/snowflake-arctic-embed-l-v2.0", public_training_code=None, public_training_data=None, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2405.05374 - # splits not specified to assuming everything - # in MTEB - "NQ": ["test"], - "NQ-NL": ["test"], # translated from NQ (not trained on) - "NQHardNegatives": ["test"], - "HotPotQA": ["test"], - "HotPotQAHardNegatives": ["test"], - "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) - "HotpotQA-NL": ["test"], # translated from hotpotQA (not trained on) - "FEVER": ["test"], - "FEVER-NL": ["test"], # translated from FEVER (not trained on) - "FEVERHardNegatives": ["test"], - # not in MTEB - # trained on stack exchange (title-body) - # "stackexchange": [], - # potentially means that: - # "StackExchangeClusteringP2P": ["test"], - # "StackExchangeClusteringP2P.v2": ["test"], - # "StackExchangeClustering": ["test"], - # "StackExchangeClustering.v2": ["test"], - # not in MTEB - # "paq": [], - # "s2orc": [], - # "other": [], # undisclosed including webdata - }, # also use synthetic + training_datasets=arctic_v1_training_datasets, ) arctic_embed_m_v1_5 = ModelMeta( - loader=partial( + loader=partial( # type: ignore sentence_transformers_loader, model_name="Snowflake/snowflake-arctic-embed-m-v1.5", revision="97eab2e17fcb7ccb8bb94d6e547898fa1a6a0f47", @@ -378,11 +282,11 @@ superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0", public_training_code=None, public_training_data=None, - training_datasets=None, + training_datasets=arctic_v1_training_datasets, ) arctic_embed_m_v2_0 = ModelMeta( - loader=partial( + loader=partial( # type: ignore sentence_transformers_loader, model_name="Snowflake/snowflake-arctic-embed-m-v2.0", revision="f2a7d59d80dfda5b1d14f096f3ce88bb6bf9ebdc", @@ -406,34 +310,11 @@ superseded_by=None, public_training_code=None, public_training_data=None, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2405.05374 - # splits not 
specified to assuming everything - # in MTEB - "NQ": ["test"], - "NQHardNegatives": ["test"], - "HotPotQA": ["test"], - "HotPotQAHardNegatives": ["test"], - "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) - "FEVER": ["test"], - "FEVERHardNegatives": ["test"], - # not in MTEB - # trained on stack exchange (title-body) - # "stackexchange": [], - # potentially means that: - # "StackExchangeClusteringP2P": ["test"], - # "StackExchangeClusteringP2P.v2": ["test"], - # "StackExchangeClustering": ["test"], - # "StackExchangeClustering.v2": ["test"], - # not in MTEB - # "paq": [], - # "s2orc": [], - # "other": [], # undisclosed including webdata - }, # also use synthetic + training_datasets=arctic_v2_training_datasets, ) arctic_embed_l_v2_0 = ModelMeta( - loader=partial( + loader=partial( # type: ignore sentence_transformers_loader, model_name="Snowflake/snowflake-arctic-embed-l-v2.0", revision="edc2df7b6c25794b340229ca082e7c78782e6374", @@ -456,28 +337,5 @@ superseded_by=None, public_training_code=None, public_training_data=None, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2405.05374 - # splits not specified to assuming everything - # in MTEB - "NQ": ["test"], - "NQHardNegatives": ["test"], - "HotPotQA": ["test"], - "HotPotQAHardNegatives": ["test"], - "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) - "FEVER": ["test"], - "FEVERHardNegatives": ["test"], - # not in MTEB - # trained on stack exchange (title-body) - # "stackexchange": [], - # potentially means that: - # "StackExchangeClusteringP2P": ["test"], - # "StackExchangeClusteringP2P.v2": ["test"], - # "StackExchangeClustering": ["test"], - # "StackExchangeClustering.v2": ["test"], - # not in MTEB - # "paq": [], - # "s2orc": [], - # "other": [], # undisclosed including webdata - }, # also use synthetic + training_datasets=arctic_v2_training_datasets, ) diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py index 87f75fdd16..e27fc99b77 
100644 --- a/mteb/models/e5_instruct.py +++ b/mteb/models/e5_instruct.py @@ -25,6 +25,9 @@ "HotpotQAHardNegatives": ["train"], "HotpotQA-PL": ["train"], # translation not trained on "HotpotQA-NL": ["train"], # translation not trained on + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], # https://arxiv.org/pdf/2402.09906, section M } e5_instruct = ModelMeta( @@ -163,6 +166,9 @@ "STS12": ["train"], "STS22": ["train"], "STSBenchmark": ["train"], + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], # https://arxiv.org/pdf/2402.05672, table 2 }, adapted_from="intfloat/e5-mistral-7b-instruct", superseded_by=None, diff --git a/mteb/models/gte_models.py b/mteb/models/gte_models.py index 8e681b0a31..ab4bbb09ae 100644 --- a/mteb/models/gte_models.py +++ b/mteb/models/gte_models.py @@ -276,9 +276,13 @@ def instruction_template( "HotpotQA-NL": ["train"], "FEVER": ["train"], "FEVER-NL": ["train"], - "MIRACLReranking": ["train"], "MrTidyRetrieval": ["train"], "MultiLongDocRetrieval": ["train"], + "MIRACLReranking": ["train"], + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": [ + "train" + ], # https://arxiv.org/pdf/2407.19669, Table 11 # not in MTEB: # - TriviaQA # - SQuAD diff --git a/mteb/models/salesforce_models.py b/mteb/models/salesforce_models.py index 6cb3af7ea0..d8a99c3559 100644 --- a/mteb/models/salesforce_models.py +++ b/mteb/models/salesforce_models.py @@ -30,6 +30,11 @@ def instruction_template( "HotpotQAHardNegatives": ["train"], "HotpotQA-PL": ["train"], # translation not trained on "HotpotQA-NL": ["train"], # translation not trained on + # source: https://github.com/embeddings-benchmark/leaderboard/issues/41 + # qoute: In the realm of Semantic Textual Similarity (STS), it is trained on STS12, STS22, and STSBenchmark + "STS12": ["train"], + "STS22": ["train"], + "STSBenchmark": ["train"], } SFR_Embedding_2_R = ModelMeta( From 
6b9f945183ceb01c7bc330fe9cddc132491012fb Mon Sep 17 00:00:00 2001 From: Niklas Muennighoff Date: Wed, 19 Feb 2025 06:48:50 -0800 Subject: [PATCH 048/233] fix: Update links (#2098) * Fix link * Fix link --- mteb/benchmarks/benchmarks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 6e4f10b617..fcf5ad8bd6 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -79,7 +79,7 @@ to tasks in the original MTEB, and contains tasks that are not as frequently used for model training. This way the new benchmark and leaderboard can give our users a more realistic expectation of models' generalization performance. -The original MTEB leaderboard is available under the [MTEB(eng, classic)](https://huggingface.co/spaces/mteb/leaderboard?benchmark_name=MTEB%28eng%2C+classic%29) tab. +The original MTEB leaderboard is available under the [MTEB(eng, v1)](http://mteb-leaderboard.hf.space/?benchmark_name=MTEB%28eng%2C+v1%29) tab. """, citation="", contacts=["KennethEnevoldsen", "Muennighoff"], @@ -159,7 +159,7 @@ ), description="""The original English benchmark by Muennighoff et al., (2023). This page is an adaptation of the [old MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard_legacy). -We recommend that you use [MTEB(eng)](https://huggingface.co/spaces/mteb/leaderboard?benchmark_name=MTEB%28eng%29) instead as it uses updated versions of the task making it notably faster to run and resolves [a known bug](https://github.com/embeddings-benchmark/mteb/issues/1156) in existing tasks. This benchmark also removes datasets common for fine-tuning such as MSMARCO, which makes model performance scores more comparable. However, generally, both benchmarks provide similar estimates. 
+We recommend that you use [MTEB(eng, v2)](http://mteb-leaderboard.hf.space/?benchmark_name=MTEB%28eng%2C+v2%29) instead as it uses updated versions of the task making it notably faster to run and resolves [a known bug](https://github.com/embeddings-benchmark/mteb/issues/1156) in existing tasks. This benchmark also removes datasets common for fine-tuning such as MSMARCO, which makes model performance scores more comparable. However, generally, both benchmarks provide similar estimates. """, citation="""@inproceedings{muennighoff-etal-2023-mteb, title = "{MTEB}: Massive Text Embedding Benchmark", From 06489abca007261c7e6b11f36d4844c5ed5efdcb Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 19 Feb 2025 15:00:51 +0000 Subject: [PATCH 049/233] 1.34.22 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index eb3401e0ff..da3efaac66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.34.21" +version = "1.34.22" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From c69b8c318390d294f979b1add1c68186ec4b8355 Mon Sep 17 00:00:00 2001 From: Samuel Yang Date: Thu, 20 Feb 2025 13:41:24 +0800 Subject: [PATCH 050/233] Add model inf-retriever-v1-1.5b (#2106) Add inf-retriever-v1-1.5b model --- mteb/models/inf_models.py | 60 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/mteb/models/inf_models.py b/mteb/models/inf_models.py index fbb14c93c0..f53d8c9bdb 100644 --- a/mteb/models/inf_models.py +++ b/mteb/models/inf_models.py @@ -4,6 +4,37 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader +inf_retreiver_v1_training_data = { + # eng_Latn + "ArguAna": ["train"], + "CQADupstackRetrieval": ["train"], + "ClimateFEVER": ["train"], + "DBPedia": ["train"], + "FEVER": ["train"], 
+ "FiQA2018": ["train"], + "HotpotQA": ["train"], + "MSMARCO": ["train"], + "NFCorpus": ["train"], + "NQ": ["train"], + "QuoraRetrieval": ["train"], + "SCIDOCS": ["train"], + "SciFact": ["train"], + "TRECCOVID": ["train"], + "Touche2020": ["train"], + ## and other private data of INF TECH (not in MTEB), + # + # zho_Hans + "CmedqaRetrieval": ["train"], + "CovidRetrieval": ["train"], + "DuRetrieval": ["train"], + "EcomRetrieval": ["train"], + "MMarcoRetrieval": ["train"], + "MedicalRetrieval": ["train"], + "T2Retrieval": ["train"], + "VideoRetrieval": ["train"], + ## and other private data of INF TECH (not in MTEB), +} + inf_retriever_v1 = ModelMeta( loader=partial( # type: ignore sentence_transformers_loader, @@ -28,5 +59,32 @@ adapted_from="Alibaba-NLP/gte-Qwen2-7B-instruct", public_training_code=None, public_training_data=None, - training_datasets=None, + training_datasets=inf_retreiver_v1_training_data, +) + +inf_retriever_v1_1_5B = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="infly/inf-retriever-v1-1.5b", + revision="c9c05c2dd50707a486966ba81703021ae2094a06", + trust_remote_code=True, + ), + name="infly/inf-retriever-v1-1.5b", + languages=["eng_Latn", "zho_Hans"], + open_weights=True, + revision="c9c05c2dd50707a486966ba81703021ae2094a06", + release_date="2025-02-08", # initial commit of hf model. 
+ n_parameters=1_543_268_864, + memory_usage_mb=2944, + embed_dim=1536, + license="apache-2.0", + max_tokens=32768, + reference="https://huggingface.co/infly/inf-retriever-v1-1.5b", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + adapted_from="Alibaba-NLP/gte-Qwen2-1.5B-instruct", + public_training_code=None, + public_training_data=None, + training_datasets=inf_retreiver_v1_training_data, ) From caa0b77e4c21310a64f480cbd710c62810deb134 Mon Sep 17 00:00:00 2001 From: Niklas Muennighoff Date: Wed, 19 Feb 2025 22:23:21 -0800 Subject: [PATCH 051/233] docs: Fix typos & refine text (#2102) * Update app.py * Fix typos --- mteb/leaderboard/app.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 9a707160c4..3966ffba03 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -23,9 +23,12 @@ acknowledgment_md = """ ### Acknowledgment -We thank [ServiceNow](https://www.servicenow.com/), [Contextual AI](https://contextual.ai/) and [Hugging Face](https://huggingface.co/) for their generous sponsorship. If you'd like to sponsor us, please get in [touch](mailto:n.muennighoff@gmail.com). +We thank [Google](https://cloud.google.com/), [ServiceNow](https://www.servicenow.com/), [Contextual AI](https://contextual.ai/) and [Hugging Face](https://huggingface.co/) for their generous sponsorship. If you'd like to sponsor us, please get in [touch](mailto:n.muennighoff@gmail.com).