diff --git a/README.md b/README.md
index e08545aec6..e2f7a523a5 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@ pip install mteb
## Usage
-* Using a python script (see [scripts/run_mteb_english.py](https://github.com/embeddings-benchmark/mteb/blob/main/scripts/run_mteb_english.py) and [mteb/mtebscripts](https://github.com/embeddings-benchmark/mtebscripts) for more):
+* Using a python script:
```python
import mteb
@@ -77,11 +77,11 @@ Click on each section below to see the details.
- Dataset selection
+ Task selection
-### Dataset selection
+### Task selection
-Datasets can be selected by providing the list of datasets, but also
+Tasks can be selected by providing the list of datasets, but also
* by their task (e.g. "Clustering" or "Classification")
@@ -121,11 +121,18 @@ evaluation = mteb.MTEB(tasks=[
# for an example of a HF subset see "Subset" in the dataset viewer at: https://huggingface.co/datasets/mteb/bucc-bitext-mining
```
-There are also presets available for certain task collections, e.g. to select the 56 English datasets that form the "Overall MTEB English leaderboard":
+
+
+
+ Running a benchmark
+
+`mteb` comes with a set of predefined benchmarks. These can be fetched using `get_benchmark` and run in a similar fashion to other sets of tasks.
+For instance, to select the 56 English datasets that form the "Overall MTEB English leaderboard":
```python
-from mteb import MTEB_MAIN_EN
-evaluation = mteb.MTEB(tasks=MTEB_MAIN_EN, task_langs=["en"])
+import mteb
+mteb_eng = mteb.get_benchmark("MTEB(eng)")
+evaluation = mteb.MTEB(tasks=mteb_eng, eval_splits=["test"])
```
diff --git a/mteb/__init__.py b/mteb/__init__.py
index be5edd97ed..2b98827014 100644
--- a/mteb/__init__.py
+++ b/mteb/__init__.py
@@ -2,7 +2,7 @@
from importlib.metadata import version
-from mteb.benchmarks import (
+from mteb.benchmarks.benchmarks import (
MTEB_MAIN_EN,
MTEB_MAIN_RU,
MTEB_RETRIEVAL_LAW,
@@ -14,7 +14,8 @@
from mteb.models import get_model, get_model_meta
from mteb.overview import TASKS_REGISTRY, get_task, get_tasks
-from .benchmarks import Benchmark
+from .benchmarks.benchmarks import Benchmark
+from .benchmarks.get_benchmark import get_benchmark
__version__ = version("mteb") # fetch version from install metadata
@@ -32,4 +33,5 @@
"get_model_meta",
"load_results",
"Benchmark",
+ "get_benchmark",
]
diff --git a/mteb/benchmarks/__init__.py b/mteb/benchmarks/__init__.py
new file mode 100644
index 0000000000..fb1d12a293
--- /dev/null
+++ b/mteb/benchmarks/__init__.py
@@ -0,0 +1,3 @@
+from __future__ import annotations
+
+from mteb.benchmarks.benchmarks import *
diff --git a/mteb/benchmarks.py b/mteb/benchmarks/benchmarks.py
similarity index 63%
rename from mteb/benchmarks.py
rename to mteb/benchmarks/benchmarks.py
index 9485230a62..048c74d75a 100644
--- a/mteb/benchmarks.py
+++ b/mteb/benchmarks/benchmarks.py
@@ -3,16 +3,44 @@
from dataclasses import dataclass
from typing import Sequence
+from pydantic import AnyUrl, BeforeValidator, TypeAdapter
+from typing_extensions import Annotated
+
from mteb.abstasks.AbsTask import AbsTask
from mteb.overview import get_tasks
+http_url_adapter = TypeAdapter(AnyUrl)
+UrlString = Annotated[
+ str, BeforeValidator(lambda value: str(http_url_adapter.validate_python(value)))
+] # Allows the type to be a string, but ensures that the string is a URL
+
@dataclass
class Benchmark:
+ """A benchmark object intended to run a certain benchmark within MTEB.
+
+ Args:
+ name: The name of the benchmark
+ tasks: The tasks within the benchmark.
+        description: A description of the benchmark; it should include the benchmark's intended goal and, where relevant, a description of its construction
+        reference: A reference link to a source containing additional information, typically a paper, leaderboard, or GitHub repository.
+ citation: A bibtex citation
+
+ Example:
+ >>> Benchmark(
+ ... name="MTEB(custom)",
+ ... tasks=mteb.get_tasks(
+ ... tasks=["AmazonCounterfactualClassification", "AmazonPolarityClassification"],
+ ... languages=["eng"],
+ ... ),
+ ... description="A custom benchmark"
+ ... )
+ """
+
name: str
- tasks: Sequence[str] | Sequence[AbsTask]
+ tasks: Sequence[AbsTask]
description: str | None = None
- reference: str | None = None
+ reference: UrlString | None = None
citation: str | None = None
def __iter__(self):
@@ -27,75 +55,78 @@ def __getitem__(self, index):
MTEB_MAIN_EN = Benchmark(
name="MTEB(eng)",
- tasks=[
- "AmazonCounterfactualClassification",
- "AmazonPolarityClassification",
- "AmazonReviewsClassification",
- "ArguAna",
- "ArxivClusteringP2P",
- "ArxivClusteringS2S",
- "AskUbuntuDupQuestions",
- "BIOSSES",
- "Banking77Classification",
- "BiorxivClusteringP2P",
- "BiorxivClusteringS2S",
- "CQADupstackAndroidRetrieval",
- "CQADupstackEnglishRetrieval",
- "CQADupstackGamingRetrieval",
- "CQADupstackGisRetrieval",
- "CQADupstackMathematicaRetrieval",
- "CQADupstackPhysicsRetrieval",
- "CQADupstackProgrammersRetrieval",
- "CQADupstackStatsRetrieval",
- "CQADupstackTexRetrieval",
- "CQADupstackUnixRetrieval",
- "CQADupstackWebmastersRetrieval",
- "CQADupstackWordpressRetrieval",
- "ClimateFEVER",
- "DBPedia",
- "EmotionClassification",
- "FEVER",
- "FiQA2018",
- "HotpotQA",
- "ImdbClassification",
- "MSMARCO",
- "MTOPDomainClassification",
- "MTOPIntentClassification",
- "MassiveIntentClassification",
- "MassiveScenarioClassification",
- "MedrxivClusteringP2P",
- "MedrxivClusteringS2S",
- "MindSmallReranking",
- "NFCorpus",
- "NQ",
- "QuoraRetrieval",
- "RedditClustering",
- "RedditClusteringP2P",
- "SCIDOCS",
- "SICK-R",
- "STS12",
- "STS13",
- "STS14",
- "STS15",
- "STS16",
- "STS17",
- "STS22",
- "STSBenchmark",
- "SciDocsRR",
- "SciFact",
- "SprintDuplicateQuestions",
- "StackExchangeClustering",
- "StackExchangeClusteringP2P",
- "StackOverflowDupQuestions",
- "SummEval",
- "TRECCOVID",
- "Touche2020",
- "ToxicConversationsClassification",
- "TweetSentimentExtractionClassification",
- "TwentyNewsgroupsClustering",
- "TwitterSemEval2015",
- "TwitterURLCorpus",
- ],
+ tasks=get_tasks(
+ tasks=[
+ "AmazonCounterfactualClassification",
+ "AmazonPolarityClassification",
+ "AmazonReviewsClassification",
+ "ArguAna",
+ "ArxivClusteringP2P",
+ "ArxivClusteringS2S",
+ "AskUbuntuDupQuestions",
+ "BIOSSES",
+ "Banking77Classification",
+ "BiorxivClusteringP2P",
+ "BiorxivClusteringS2S",
+ "CQADupstackAndroidRetrieval",
+ "CQADupstackEnglishRetrieval",
+ "CQADupstackGamingRetrieval",
+ "CQADupstackGisRetrieval",
+ "CQADupstackMathematicaRetrieval",
+ "CQADupstackPhysicsRetrieval",
+ "CQADupstackProgrammersRetrieval",
+ "CQADupstackStatsRetrieval",
+ "CQADupstackTexRetrieval",
+ "CQADupstackUnixRetrieval",
+ "CQADupstackWebmastersRetrieval",
+ "CQADupstackWordpressRetrieval",
+ "ClimateFEVER",
+ "DBPedia",
+ "EmotionClassification",
+ "FEVER",
+ "FiQA2018",
+ "HotpotQA",
+ "ImdbClassification",
+ "MSMARCO",
+ "MTOPDomainClassification",
+ "MTOPIntentClassification",
+ "MassiveIntentClassification",
+ "MassiveScenarioClassification",
+ "MedrxivClusteringP2P",
+ "MedrxivClusteringS2S",
+ "MindSmallReranking",
+ "NFCorpus",
+ "NQ",
+ "QuoraRetrieval",
+ "RedditClustering",
+ "RedditClusteringP2P",
+ "SCIDOCS",
+ "SICK-R",
+ "STS12",
+ "STS13",
+ "STS14",
+ "STS15",
+ "STS16",
+ "STS17",
+ "STS22",
+ "STSBenchmark",
+ "SciDocsRR",
+ "SciFact",
+ "SprintDuplicateQuestions",
+ "StackExchangeClustering",
+ "StackExchangeClusteringP2P",
+ "StackOverflowDupQuestions",
+ "SummEval",
+ "TRECCOVID",
+ "Touche2020",
+ "ToxicConversationsClassification",
+ "TweetSentimentExtractionClassification",
+ "TwentyNewsgroupsClustering",
+ "TwitterSemEval2015",
+ "TwitterURLCorpus",
+ ],
+ languages=["eng"],
+ ),
description="Main English benchmarks from MTEB",
citation="""@inproceedings{muennighoff-etal-2023-mteb,
title = "{MTEB}: Massive Text Embedding Benchmark",
@@ -170,11 +201,13 @@ def __getitem__(self, index):
MTEB_RETRIEVAL_WITH_INSTRUCTIONS = Benchmark(
name="MTEB(Retrieval w/Instructions)",
- tasks=[
- "Robust04InstructionRetrieval",
- "News21InstructionRetrieval",
- "Core17InstructionRetrieval",
- ],
+ tasks=get_tasks(
+ tasks=[
+ "Robust04InstructionRetrieval",
+ "News21InstructionRetrieval",
+ "Core17InstructionRetrieval",
+ ]
+ ),
description="Retrieval w/Instructions is the task of finding relevant documents for a query that has detailed instructions.",
reference="https://arxiv.org/abs/2403.15246",
citation="""@misc{weller2024followir,
@@ -188,33 +221,37 @@ def __getitem__(self, index):
)
MTEB_RETRIEVAL_LAW = Benchmark(
- name="MTEB(law)",
- tasks=[
- "LegalSummarization",
- "LegalBenchConsumerContractsQA",
- "LegalBenchCorporateLobbying",
- "AILACasedocs",
- "AILAStatutes",
- "LeCaRDv2",
- "LegalQuAD",
- "GerDaLIRSmall",
- ],
- description="Legal benchmarks from MTEB",
+ name="MTEB(law)", # This benchmark is likely in the need of an update
+ tasks=get_tasks(
+ tasks=[
+ "AILACasedocs",
+ "AILAStatutes",
+ "LegalSummarization",
+ "GerDaLIRSmall",
+ "LeCaRDv2",
+ "LegalBenchConsumerContractsQA",
+ "LegalBenchCorporateLobbying",
+ "LegalQuAD",
+ ]
+ ),
+ description="Legal benchmarks from MTEB.",
reference="https://aclanthology.org/2023.eacl-main.148/",
citation=None,
)
MTEB_MINERS_BITEXT_MINING = Benchmark(
name="MINERSBitextMining",
- tasks=[
- "BUCCBitextMining",
- "LinceMTBitextMining",
- "NollySentiBitextMining",
- "NusaXBitextMining",
- "NusaTranslationBitextMining",
- "PhincBitextMining",
- "TatoebaBitextMining",
- ],
+ tasks=get_tasks(
+ tasks=[
+ "BUCC",
+ "LinceMTBitextMining",
+ "NollySentiBitextMining",
+ "NusaXBitextMining",
+ "NusaTranslationBitextMining",
+ "PhincBitextMining",
+ "Tatoeba",
+ ]
+ ),
description="BitextMining benchmark from MINERS",
reference="https://arxiv.org/pdf/2406.07424",
citation="""
@@ -228,37 +265,43 @@ def __getitem__(self, index):
)
SEB = Benchmark(
name="MTEB(Scandinavian)",
- tasks=[
- "BornholmBitextMining",
- "NorwegianCourtsBitextMining",
- "AngryTweetsClassification",
- "DanishPoliticalCommentsClassification",
- "DKHateClassification",
- "LccSentimentClassification",
- "MassiveIntentClassification",
- "MassiveScenarioClassification",
- "NordicLangClassification",
- "ScalaClassification",
- "NoRecClassification",
- "NorwegianParliamentClassification",
- "DalajClassification",
- "SwedishSentimentClassification",
- "SweRecClassification",
- "DanFEVER",
- "TV2Nordretrieval",
- "TwitterHjerneRetrieval",
- "NorQuadRetrieval",
- "SNLRetrieval",
- "SwednRetrieval",
- "SweFaqRetrieval",
- "WikiClusteringP2P.v2",
- "SNLHierarchicalClusteringP2P",
- "SNLHierarchicalClusteringS2S",
- "VGHierarchicalClusteringP2P",
- "VGHierarchicalClusteringS2S",
- "SwednClusteringP2P",
- "SwednClusteringS2S",
- ],
+ tasks=get_tasks(
+ tasks=[
+ # Bitext
+ "BornholmBitextMining",
+ "NorwegianCourtsBitextMining",
+ # Classification
+ "AngryTweetsClassification",
+ "DanishPoliticalCommentsClassification",
+ "DalajClassification",
+ "DKHateClassification",
+ "LccSentimentClassification",
+ "MassiveIntentClassification",
+ "MassiveScenarioClassification",
+ "NordicLangClassification",
+ "NoRecClassification",
+ "NorwegianParliamentClassification",
+ "ScalaClassification",
+ "SwedishSentimentClassification",
+ "SweRecClassification",
+ # Retrieval
+ "DanFEVER",
+ "NorQuadRetrieval",
+ "SNLRetrieval",
+ "SwednRetrieval",
+ "SweFaqRetrieval",
+ "TV2Nordretrieval",
+ "TwitterHjerneRetrieval",
+ # Clustering
+ "SNLHierarchicalClusteringS2S",
+ "SNLHierarchicalClusteringP2P",
+ "SwednClusteringP2P",
+ "SwednClusteringS2S",
+ "VGHierarchicalClusteringS2S",
+ "VGHierarchicalClusteringP2P",
+ ],
+ languages=["dan", "swe", "nno", "nob"],
+ ),
description="A curated selection of tasks coverering the Scandinavian languages; Danish, Swedish and Norwegian, including Bokmål and Nynorsk.",
reference="https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/",
citation="""@misc{enevoldsen2024scandinavian,
@@ -273,18 +316,20 @@ def __getitem__(self, index):
CoIR = Benchmark(
name="CoIR",
- tasks=[
- "AppsRetrieval",
- "CosQA",
- "SyntheticText2SQL",
- "COIRCodeSearchNetRetrieval",
- "CodeSearchNetCCRetrieval",
- "CodeTransOceanDL",
- "CodeTransOceanContest",
- "StackOverflowQA",
- "CodeFeedbackMT",
- "CodeFeedbackST",
- ],
+ tasks=get_tasks(
+ tasks=[
+ "AppsRetrieval",
+ "CodeFeedbackMT",
+ "CodeFeedbackST",
+ "CodeSearchNetCCRetrieval",
+ "CodeTransOceanContest",
+ "CodeTransOceanDL",
+ "CosQA",
+ "COIRCodeSearchNetRetrieval",
+ "StackOverflowQA",
+ "SyntheticText2SQL",
+ ]
+ ),
description="CoIR: A Comprehensive Benchmark for Code Information Retrieval Models",
reference="https://github.com/CoIR-team/coir",
citation="""@misc{li2024coircomprehensivebenchmarkcode,
@@ -322,19 +367,19 @@ def __getitem__(self, index):
"OpusparcusPC",
"PawsXPairClassification",
# Reranking
- "SyntecReranking",
"AlloprofReranking",
+ "SyntecReranking",
# Retrieval
"AlloprofRetrieval",
"BSARDRetrieval",
+ "MintakaRetrieval",
"SyntecRetrieval",
"XPQARetrieval",
- "MintakaRetrieval",
# STS
- "SummEvalFr",
- "STSBenchmarkMultilingualSTS",
- "STS22",
"SICKFr",
+ "STS22",
+ "STSBenchmarkMultilingualSTS",
+ "SummEvalFr",
],
),
description="Main French benchmarks from MTEB",
@@ -426,27 +471,27 @@ def __getitem__(self, index):
languages=["pol"],
tasks=[
# Classification
+ "AllegroReviews",
"CBD",
+ "MassiveIntentClassification",
+ "MassiveScenarioClassification",
"PolEmo2.0-IN",
"PolEmo2.0-OUT",
- "AllegroReviews",
"PAC",
- "MassiveIntentClassification",
- "MassiveScenarioClassification",
# Clustering
"EightTagsClustering",
"PlscClusteringS2S",
"PlscClusteringP2P",
# Pair Classification
- "SICK-E-PL",
- "PpcPC",
"CDSC-E",
+ "PpcPC",
"PSC",
+ "SICK-E-PL",
# STS
- "SICK-R-PL",
"CDSC-R",
"STS22",
"STSBenchmarkMultilingualSTS",
+ "SICK-R-PL",
],
),
description="Main Polish benchmarks from MTEB",
diff --git a/mteb/benchmarks/get_benchmark.py b/mteb/benchmarks/get_benchmark.py
new file mode 100644
index 0000000000..169e3bcd50
--- /dev/null
+++ b/mteb/benchmarks/get_benchmark.py
@@ -0,0 +1,27 @@
+from __future__ import annotations
+
+import difflib
+
+import mteb.benchmarks.benchmarks as benchmark_module
+from mteb.benchmarks import Benchmark
+
+BENCHMARK_REGISTRY = {
+ inst.name: inst
+ for nam, inst in benchmark_module.__dict__.items()
+ if isinstance(inst, Benchmark)
+}
+
+
+def get_benchmark(
+ benchmark_name: str,
+) -> Benchmark:
+ if benchmark_name not in BENCHMARK_REGISTRY:
+ close_matches = difflib.get_close_matches(
+ benchmark_name, BENCHMARK_REGISTRY.keys()
+ )
+ if close_matches:
+ suggestion = f"KeyError: '{benchmark_name}' not found. Did you mean: {close_matches[0]}?"
+ else:
+ suggestion = f"KeyError: '{benchmark_name}' not found and no similar keys were found."
+ raise KeyError(suggestion)
+ return BENCHMARK_REGISTRY[benchmark_name]
diff --git a/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py b/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py
index a9cf4cea25..a01bda1d80 100644
--- a/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py
+++ b/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py
@@ -12,7 +12,7 @@ class FilipinoHateSpeechClassification(AbsTaskClassification):
description="Filipino Twitter dataset for sentiment classification.",
reference="https://pcj.csp.org.ph/index.php/pcj/issue/download/29/PCJ%20V14%20N1%20pp1-14%202019",
dataset={
- "path": "hate-speech-filipino/hate_speech_filipino",
+ "path": "legacy-datasets/hate_speech_filipino",
"revision": "1994e9bb7f3ec07518e3f0d9e870cb293e234686",
"trust_remote_code": True,
},
diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py
index 3d32d923bc..742c7930e9 100644
--- a/tests/test_benchmark/test_benchmark.py
+++ b/tests/test_benchmark/test_benchmark.py
@@ -10,7 +10,7 @@
from sentence_transformers import SentenceTransformer
import mteb
-from mteb.benchmarks import Benchmark
+from mteb.benchmarks.benchmarks import Benchmark
from mteb.create_meta import generate_readme
from .mock_models import (
@@ -127,9 +127,28 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs):
@pytest.mark.parametrize("model", [MockNumpyEncoder()])
def test_run_using_benchmark(model: mteb.Encoder):
"""Test that a benchmark object can be run using the MTEB class."""
- bench = Benchmark(name="test_bench", tasks=["STS12", "SummEval"])
+ bench = Benchmark(
+ name="test_bench", tasks=mteb.get_tasks(tasks=["STS12", "SummEval"])
+ )
eval = mteb.MTEB(tasks=bench)
eval.run(
model, output_folder="tests/results", overwrite_results=True
) # we just want to test that it runs
+
+
+def test_benchmark_names_must_be_unique():
+ import mteb.benchmarks.benchmarks as benchmark_module
+
+ names = [
+ inst.name
+ for nam, inst in benchmark_module.__dict__.items()
+ if isinstance(inst, Benchmark)
+ ]
+ assert len(names) == len(set(names))
+
+
+@pytest.mark.parametrize("name", ["MTEB(eng)", "MTEB(rus)", "MTEB(Scandinavian)"])
+def test_get_benchmarks(name):
+ benchmark = mteb.get_benchmark(benchmark_name=name)
+ assert isinstance(benchmark, mteb.Benchmark)