diff --git a/README.md b/README.md index e08545aec6..e2f7a523a5 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ pip install mteb ## Usage -* Using a python script (see [scripts/run_mteb_english.py](https://github.com/embeddings-benchmark/mteb/blob/main/scripts/run_mteb_english.py) and [mteb/mtebscripts](https://github.com/embeddings-benchmark/mtebscripts) for more): +* Using a python script: ```python import mteb @@ -77,11 +77,11 @@ Click on each section below to see the details.
- Dataset selection + Task selection -### Dataset selection +### Task selection -Datasets can be selected by providing the list of datasets, but also +Tasks can be selected by providing the list of tasks, but also * by their task (e.g. "Clustering" or "Classification") @@ -121,11 +121,18 @@ evaluation = mteb.MTEB(tasks=[ # for an example of a HF subset see "Subset" in the dataset viewer at: https://huggingface.co/datasets/mteb/bucc-bitext-mining ``` -There are also presets available for certain task collections, e.g. to select the 56 English datasets that form the "Overall MTEB English leaderboard": +
+ +
+ Running a benchmark + +`mteb` comes with a set of predefined benchmarks. These can be fetched using `get_benchmark` and run in a similar fashion to other sets of tasks. +For instance, to select the 56 English datasets that form the "Overall MTEB English leaderboard": ```python -from mteb import MTEB_MAIN_EN -evaluation = mteb.MTEB(tasks=MTEB_MAIN_EN, task_langs=["en"]) +import mteb +mteb_eng = mteb.get_benchmark("MTEB(eng)") +evaluation = mteb.MTEB(tasks=mteb_eng, eval_splits=["test"]) ```
diff --git a/mteb/__init__.py b/mteb/__init__.py index be5edd97ed..2b98827014 100644 --- a/mteb/__init__.py +++ b/mteb/__init__.py @@ -2,7 +2,7 @@ from importlib.metadata import version -from mteb.benchmarks import ( +from mteb.benchmarks.benchmarks import ( MTEB_MAIN_EN, MTEB_MAIN_RU, MTEB_RETRIEVAL_LAW, @@ -14,7 +14,8 @@ from mteb.models import get_model, get_model_meta from mteb.overview import TASKS_REGISTRY, get_task, get_tasks -from .benchmarks import Benchmark +from .benchmarks.benchmarks import Benchmark +from .benchmarks.get_benchmark import get_benchmark __version__ = version("mteb") # fetch version from install metadata @@ -32,4 +33,5 @@ "get_model_meta", "load_results", "Benchmark", + "get_benchmark", ] diff --git a/mteb/benchmarks/__init__.py b/mteb/benchmarks/__init__.py new file mode 100644 index 0000000000..fb1d12a293 --- /dev/null +++ b/mteb/benchmarks/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from mteb.benchmarks.benchmarks import * diff --git a/mteb/benchmarks.py b/mteb/benchmarks/benchmarks.py similarity index 63% rename from mteb/benchmarks.py rename to mteb/benchmarks/benchmarks.py index 9485230a62..048c74d75a 100644 --- a/mteb/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -3,16 +3,44 @@ from dataclasses import dataclass from typing import Sequence +from pydantic import AnyUrl, BeforeValidator, TypeAdapter +from typing_extensions import Annotated + from mteb.abstasks.AbsTask import AbsTask from mteb.overview import get_tasks +http_url_adapter = TypeAdapter(AnyUrl) +UrlString = Annotated[ + str, BeforeValidator(lambda value: str(http_url_adapter.validate_python(value))) +] # Allows the type to be a string, but ensures that the string is a URL + @dataclass class Benchmark: + """A benchmark object intended to run a certain benchmark within MTEB. + + Args: + name: The name of the benchmark + tasks: The tasks within the benchmark. 
+ description: A description of the benchmark, should include its intended goal and potentially a description of its construction + reference: A link reference, to a source containing additional information typically to a paper, leaderboard or github. + citation: A bibtex citation + + Example: + >>> Benchmark( + ... name="MTEB(custom)", + ... tasks=mteb.get_tasks( + ... tasks=["AmazonCounterfactualClassification", "AmazonPolarityClassification"], + ... languages=["eng"], + ... ), + ... description="A custom benchmark" + ... ) + """ + name: str - tasks: Sequence[str] | Sequence[AbsTask] + tasks: Sequence[AbsTask] description: str | None = None - reference: str | None = None + reference: UrlString | None = None citation: str | None = None def __iter__(self): @@ -27,75 +55,78 @@ def __getitem__(self, index): MTEB_MAIN_EN = Benchmark( name="MTEB(eng)", - tasks=[ - "AmazonCounterfactualClassification", - "AmazonPolarityClassification", - "AmazonReviewsClassification", - "ArguAna", - "ArxivClusteringP2P", - "ArxivClusteringS2S", - "AskUbuntuDupQuestions", - "BIOSSES", - "Banking77Classification", - "BiorxivClusteringP2P", - "BiorxivClusteringS2S", - "CQADupstackAndroidRetrieval", - "CQADupstackEnglishRetrieval", - "CQADupstackGamingRetrieval", - "CQADupstackGisRetrieval", - "CQADupstackMathematicaRetrieval", - "CQADupstackPhysicsRetrieval", - "CQADupstackProgrammersRetrieval", - "CQADupstackStatsRetrieval", - "CQADupstackTexRetrieval", - "CQADupstackUnixRetrieval", - "CQADupstackWebmastersRetrieval", - "CQADupstackWordpressRetrieval", - "ClimateFEVER", - "DBPedia", - "EmotionClassification", - "FEVER", - "FiQA2018", - "HotpotQA", - "ImdbClassification", - "MSMARCO", - "MTOPDomainClassification", - "MTOPIntentClassification", - "MassiveIntentClassification", - "MassiveScenarioClassification", - "MedrxivClusteringP2P", - "MedrxivClusteringS2S", - "MindSmallReranking", - "NFCorpus", - "NQ", - "QuoraRetrieval", - "RedditClustering", - "RedditClusteringP2P", - "SCIDOCS", - 
"SICK-R", - "STS12", - "STS13", - "STS14", - "STS15", - "STS16", - "STS17", - "STS22", - "STSBenchmark", - "SciDocsRR", - "SciFact", - "SprintDuplicateQuestions", - "StackExchangeClustering", - "StackExchangeClusteringP2P", - "StackOverflowDupQuestions", - "SummEval", - "TRECCOVID", - "Touche2020", - "ToxicConversationsClassification", - "TweetSentimentExtractionClassification", - "TwentyNewsgroupsClustering", - "TwitterSemEval2015", - "TwitterURLCorpus", - ], + tasks=get_tasks( + tasks=[ + "AmazonCounterfactualClassification", + "AmazonPolarityClassification", + "AmazonReviewsClassification", + "ArguAna", + "ArxivClusteringP2P", + "ArxivClusteringS2S", + "AskUbuntuDupQuestions", + "BIOSSES", + "Banking77Classification", + "BiorxivClusteringP2P", + "BiorxivClusteringS2S", + "CQADupstackAndroidRetrieval", + "CQADupstackEnglishRetrieval", + "CQADupstackGamingRetrieval", + "CQADupstackGisRetrieval", + "CQADupstackMathematicaRetrieval", + "CQADupstackPhysicsRetrieval", + "CQADupstackProgrammersRetrieval", + "CQADupstackStatsRetrieval", + "CQADupstackTexRetrieval", + "CQADupstackUnixRetrieval", + "CQADupstackWebmastersRetrieval", + "CQADupstackWordpressRetrieval", + "ClimateFEVER", + "DBPedia", + "EmotionClassification", + "FEVER", + "FiQA2018", + "HotpotQA", + "ImdbClassification", + "MSMARCO", + "MTOPDomainClassification", + "MTOPIntentClassification", + "MassiveIntentClassification", + "MassiveScenarioClassification", + "MedrxivClusteringP2P", + "MedrxivClusteringS2S", + "MindSmallReranking", + "NFCorpus", + "NQ", + "QuoraRetrieval", + "RedditClustering", + "RedditClusteringP2P", + "SCIDOCS", + "SICK-R", + "STS12", + "STS13", + "STS14", + "STS15", + "STS16", + "STS17", + "STS22", + "STSBenchmark", + "SciDocsRR", + "SciFact", + "SprintDuplicateQuestions", + "StackExchangeClustering", + "StackExchangeClusteringP2P", + "StackOverflowDupQuestions", + "SummEval", + "TRECCOVID", + "Touche2020", + "ToxicConversationsClassification", + 
"TweetSentimentExtractionClassification", + "TwentyNewsgroupsClustering", + "TwitterSemEval2015", + "TwitterURLCorpus", + ], + languages=["eng"], + ), description="Main English benchmarks from MTEB", citation="""@inproceedings{muennighoff-etal-2023-mteb, title = "{MTEB}: Massive Text Embedding Benchmark", @@ -170,11 +201,13 @@ def __getitem__(self, index): MTEB_RETRIEVAL_WITH_INSTRUCTIONS = Benchmark( name="MTEB(Retrieval w/Instructions)", - tasks=[ - "Robust04InstructionRetrieval", - "News21InstructionRetrieval", - "Core17InstructionRetrieval", - ], + tasks=get_tasks( + tasks=[ + "Robust04InstructionRetrieval", + "News21InstructionRetrieval", + "Core17InstructionRetrieval", + ] + ), description="Retrieval w/Instructions is the task of finding relevant documents for a query that has detailed instructions.", reference="https://arxiv.org/abs/2403.15246", citation="""@misc{weller2024followir, @@ -188,33 +221,37 @@ def __getitem__(self, index): ) MTEB_RETRIEVAL_LAW = Benchmark( - name="MTEB(law)", - tasks=[ - "LegalSummarization", - "LegalBenchConsumerContractsQA", - "LegalBenchCorporateLobbying", - "AILACasedocs", - "AILAStatutes", - "LeCaRDv2", - "LegalQuAD", - "GerDaLIRSmall", - ], - description="Legal benchmarks from MTEB", + name="MTEB(law)", # This benchmark is likely in the need of an update + tasks=get_tasks( + tasks=[ + "AILACasedocs", + "AILAStatutes", + "LegalSummarization", + "GerDaLIRSmall", + "LeCaRDv2", + "LegalBenchConsumerContractsQA", + "LegalBenchCorporateLobbying", + "LegalQuAD", + ] + ), + description="Legal benchmarks from MTEB.", reference="https://aclanthology.org/2023.eacl-main.148/", citation=None, ) MTEB_MINERS_BITEXT_MINING = Benchmark( name="MINERSBitextMining", - tasks=[ - "BUCCBitextMining", - "LinceMTBitextMining", - "NollySentiBitextMining", - "NusaXBitextMining", - "NusaTranslationBitextMining", - "PhincBitextMining", - "TatoebaBitextMining", - ], + tasks=get_tasks( + tasks=[ + "BUCC", + "LinceMTBitextMining", + 
"NollySentiBitextMining", + "NusaXBitextMining", + "NusaTranslationBitextMining", + "PhincBitextMining", + "Tatoeba", + ] + ), description="BitextMining benchmark from MINERS", reference="https://arxiv.org/pdf/2406.07424", citation=""" @@ -228,37 +265,43 @@ def __getitem__(self, index): ) SEB = Benchmark( name="MTEB(Scandinavian)", - tasks=[ - "BornholmBitextMining", - "NorwegianCourtsBitextMining", - "AngryTweetsClassification", - "DanishPoliticalCommentsClassification", - "DKHateClassification", - "LccSentimentClassification", - "MassiveIntentClassification", - "MassiveScenarioClassification", - "NordicLangClassification", - "ScalaClassification", - "NoRecClassification", - "NorwegianParliamentClassification", - "DalajClassification", - "SwedishSentimentClassification", - "SweRecClassification", - "DanFEVER", - "TV2Nordretrieval", - "TwitterHjerneRetrieval", - "NorQuadRetrieval", - "SNLRetrieval", - "SwednRetrieval", - "SweFaqRetrieval", - "WikiClusteringP2P.v2", - "SNLHierarchicalClusteringP2P", - "SNLHierarchicalClusteringS2S", - "VGHierarchicalClusteringP2P", - "VGHierarchicalClusteringS2S", - "SwednClusteringP2P", - "SwednClusteringS2S", - ], + tasks=get_tasks( + tasks=[ + # Bitext + "BornholmBitextMining", + "NorwegianCourtsBitextMining", + # Classification + "AngryTweetsClassification", + "DanishPoliticalCommentsClassification", + "DalajClassification", + "DKHateClassification", + "LccSentimentClassification", + "MassiveIntentClassification", + "MassiveScenarioClassification", + "NordicLangClassification", + "NoRecClassification", + "NorwegianParliamentClassification", + "ScalaClassification", + "SwedishSentimentClassification", + "SweRecClassification", + # Retrieval + "DanFEVER", + "NorQuadRetrieval", + "SNLRetrieval", + "SwednRetrieval", + "SweFaqRetrieval", + "TV2Nordretrieval", + "TwitterHjerneRetrieval", + # Clustering + "SNLHierarchicalClusteringS2S", + "SNLHierarchicalClusteringP2P", + "SwednClusteringP2P", + "SwednClusteringS2S", + 
"VGHierarchicalClusteringS2S", + "VGHierarchicalClusteringP2P", + ], + languages=["dan", "swe", "nno", "nob"], + ), description="A curated selection of tasks coverering the Scandinavian languages; Danish, Swedish and Norwegian, including Bokmål and Nynorsk.", reference="https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/", citation="""@misc{enevoldsen2024scandinavian, @@ -273,18 +316,20 @@ def __getitem__(self, index): CoIR = Benchmark( name="CoIR", - tasks=[ - "AppsRetrieval", - "CosQA", - "SyntheticText2SQL", - "COIRCodeSearchNetRetrieval", - "CodeSearchNetCCRetrieval", - "CodeTransOceanDL", - "CodeTransOceanContest", - "StackOverflowQA", - "CodeFeedbackMT", - "CodeFeedbackST", - ], + tasks=get_tasks( + tasks=[ + "AppsRetrieval", + "CodeFeedbackMT", + "CodeFeedbackST", + "CodeSearchNetCCRetrieval", + "CodeTransOceanContest", + "CodeTransOceanDL", + "CosQA", + "COIRCodeSearchNetRetrieval", + "StackOverflowQA", + "SyntheticText2SQL", + ] + ), description="CoIR: A Comprehensive Benchmark for Code Information Retrieval Models", reference="https://github.com/CoIR-team/coir", citation="""@misc{li2024coircomprehensivebenchmarkcode, @@ -322,19 +367,19 @@ def __getitem__(self, index): "OpusparcusPC", "PawsXPairClassification", # Reranking - "SyntecReranking", "AlloprofReranking", + "SyntecReranking", # Retrieval "AlloprofRetrieval", "BSARDRetrieval", + "MintakaRetrieval", "SyntecRetrieval", "XPQARetrieval", - "MintakaRetrieval", # STS - "SummEvalFr", - "STSBenchmarkMultilingualSTS", - "STS22", "SICKFr", + "STS22", + "STSBenchmarkMultilingualSTS", + "SummEvalFr", ], ), description="Main French benchmarks from MTEB", @@ -426,27 +471,27 @@ def __getitem__(self, index): languages=["pol"], tasks=[ # Classification + "AllegroReviews", "CBD", + "MassiveIntentClassification", + "MassiveScenarioClassification", "PolEmo2.0-IN", "PolEmo2.0-OUT", - "AllegroReviews", "PAC", - "MassiveIntentClassification", - "MassiveScenarioClassification", # Clustering 
"EightTagsClustering", "PlscClusteringS2S", "PlscClusteringP2P", # Pair Classification - "SICK-E-PL", - "PpcPC", "CDSC-E", + "PpcPC", "PSC", + "SICK-E-PL", # STS - "SICK-R-PL", "CDSC-R", "STS22", "STSBenchmarkMultilingualSTS", + "SICK-R-PL", ], ), description="Main Polish benchmarks from MTEB", diff --git a/mteb/benchmarks/get_benchmark.py b/mteb/benchmarks/get_benchmark.py new file mode 100644 index 0000000000..169e3bcd50 --- /dev/null +++ b/mteb/benchmarks/get_benchmark.py @@ -0,0 +1,27 @@ +from __future__ import annotations + +import difflib + +import mteb.benchmarks.benchmarks as benchmark_module +from mteb.benchmarks import Benchmark + +BENCHMARK_REGISTRY = { + inst.name: inst + for nam, inst in benchmark_module.__dict__.items() + if isinstance(inst, Benchmark) +} + + +def get_benchmark( + benchmark_name: str, +) -> Benchmark: + if benchmark_name not in BENCHMARK_REGISTRY: + close_matches = difflib.get_close_matches( + benchmark_name, BENCHMARK_REGISTRY.keys() + ) + if close_matches: + suggestion = f"KeyError: '{benchmark_name}' not found. Did you mean: {close_matches[0]}?" + else: + suggestion = f"KeyError: '{benchmark_name}' not found and no similar keys were found." 
+ raise KeyError(suggestion) + return BENCHMARK_REGISTRY[benchmark_name] diff --git a/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py b/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py index a9cf4cea25..a01bda1d80 100644 --- a/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py +++ b/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py @@ -12,7 +12,7 @@ class FilipinoHateSpeechClassification(AbsTaskClassification): description="Filipino Twitter dataset for sentiment classification.", reference="https://pcj.csp.org.ph/index.php/pcj/issue/download/29/PCJ%20V14%20N1%20pp1-14%202019", dataset={ - "path": "hate-speech-filipino/hate_speech_filipino", + "path": "legacy-datasets/hate_speech_filipino", "revision": "1994e9bb7f3ec07518e3f0d9e870cb293e234686", "trust_remote_code": True, }, diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index 3d32d923bc..742c7930e9 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -10,7 +10,7 @@ from sentence_transformers import SentenceTransformer import mteb -from mteb.benchmarks import Benchmark +from mteb.benchmarks.benchmarks import Benchmark from mteb.create_meta import generate_readme from .mock_models import ( @@ -127,9 +127,28 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): @pytest.mark.parametrize("model", [MockNumpyEncoder()]) def test_run_using_benchmark(model: mteb.Encoder): """Test that a benchmark object can be run using the MTEB class.""" - bench = Benchmark(name="test_bench", tasks=["STS12", "SummEval"]) + bench = Benchmark( + name="test_bench", tasks=mteb.get_tasks(tasks=["STS12", "SummEval"]) + ) eval = mteb.MTEB(tasks=bench) eval.run( model, output_folder="tests/results", overwrite_results=True ) # we just want to test that it runs + + +def test_benchmark_names_must_be_unique(): + import mteb.benchmarks.benchmarks as benchmark_module + + names 
= [ + inst.name + for nam, inst in benchmark_module.__dict__.items() + if isinstance(inst, Benchmark) + ] + assert len(names) == len(set(names)) + + +@pytest.mark.parametrize("name", ["MTEB(eng)", "MTEB(rus)", "MTEB(Scandinavian)"]) +def test_get_benchmarks(name): + benchmark = mteb.get_benchmark(benchmark_name=name) + assert isinstance(benchmark, mteb.Benchmark)