diff --git a/mteb/abstasks/AbsTask.py b/mteb/abstasks/AbsTask.py index 443725ec7f..1d2e4fcb05 100644 --- a/mteb/abstasks/AbsTask.py +++ b/mteb/abstasks/AbsTask.py @@ -5,6 +5,7 @@ import random from abc import ABC, abstractmethod from collections.abc import Sequence +from copy import copy from typing import Any import datasets @@ -62,6 +63,7 @@ class AbsTask(ABC): dataset: dict[HFSubset, DatasetDict] | None = None # type: ignore data_loaded: bool = False is_multilingual: bool = False + hf_subsets: list[HFSubset] | None = None def __init__(self, seed: int = 42, **kwargs: Any): self.save_suffix = kwargs.get("save_suffix", "") @@ -110,10 +112,13 @@ def evaluate( self.dataset: dict[HFSubset, DatasetDict] scores = {} - hf_subsets = list(self.dataset.keys()) if self.is_multilingual else ["default"] + if self.hf_subsets is None: + hf_subsets = list(self.dataset.keys()) + else: + hf_subsets = copy(self.hf_subsets) - if subsets_to_run is not None: - hf_subsets = [s for s in hf_subsets if s in subsets_to_run] + if subsets_to_run is not None: # allow overwrites of pre-filtering + hf_subsets = subsets_to_run for hf_subset in hf_subsets: logger.info( @@ -218,16 +223,13 @@ def calculate_metadata_metrics( ) descriptive_stats[split][hf_subset_stat] = {} - eval_langs = ( - list(self.metadata.eval_langs.keys()) - if isinstance(self.metadata.eval_langs, dict) - else self.metadata.eval_langs + pbar_subsets = tqdm.tqdm( + self.metadata.hf_subsets_to_langscripts, + desc="Processing Languages...", ) - - pbar_subsets = tqdm.tqdm(eval_langs, desc="Processing Languages...") for hf_subset in pbar_subsets: - pbar_subsets.set_postfix_str(f"Language: {hf_subset}") - logger.info(f"Processing metadata for language {hf_subset}") + pbar_subsets.set_postfix_str(f"Huggingface subset: {hf_subset}") + logger.info(f"Processing metadata for subset {hf_subset}") split_details = self._calculate_metrics_from_split(split, hf_subset) descriptive_stats[split][hf_subset_stat][hf_subset] = split_details else: @@ 
-252,12 +254,8 @@ def metadata_dict(self) -> dict[str, Any]: @property def languages(self) -> list[str]: """Returns the languages of the task""" - # check if self.hf_subsets is set - if self.is_multilingual and hasattr(self, "hf_subsets"): - assert isinstance( - self.metadata.eval_langs, dict - ), "eval_langs must be dict for multilingual tasks" - eval_langs = self.metadata.eval_langs + if self.hf_subsets: + eval_langs = self.metadata.hf_subsets_to_langscripts languages = [] for lang in self.hf_subsets: @@ -275,31 +273,43 @@ def filter_eval_splits(self, eval_splits: list[str] | None) -> AbsTask: return self def filter_languages( - self, languages: list[str] | None, script: list[str] | None = None + self, + languages: list[str] | None, + script: list[str] | None = None, + hf_subsets: list[HFSubset] | None = None, + exclusive_language_filter: bool = False, ) -> AbsTask: """Filter the languages of the task. Args: languages: list of languages to filter the task by can be either a 3-letter langauge code (e.g. "eng") or also include the script (e.g. "eng-Latn") - script: list of scripts to filter the task by. Will be ignored if language code specified the script. If None, all scripts are included. + script: A list of scripts to filter the task by. Will be ignored if language code specified the script. If None, all scripts are included. If the language code does not specify the script the intersection of the language and script will be used. + hf_subsets: A list of huggingface subsets to filter on. This is useful if a dataset have multiple subsets containing the desired language, + but you only want to test on one. An example is STS22 which e.g. have both "en" and "de-en" which both contains English. + exclusive_language_filter: Some datasets contains more than one language e.g. for STS22 the subset "de-en" contain eng and deu. 
If + exclusive_language_filter is set to False both of these will be kept, but if set to True only subsets whose languages are all among + the specified languages will be kept (if no languages are given, no subsets are kept). """ lang_scripts = LanguageScripts.from_languages_and_scripts(languages, script) subsets_to_keep = [] - if not isinstance(self.metadata.eval_langs, dict): - self.hf_subsets = self.metadata.eval_langs - return self - - for hf_subset, langs in self.metadata.eval_langs.items(): - for langscript in langs: - if lang_scripts.contains_language( - langscript - ) or lang_scripts.contains_script(langscript): + for hf_subset, langs in self.metadata.hf_subsets_to_langscripts.items(): + if (hf_subsets is not None) and (hf_subset not in hf_subsets): + continue + if exclusive_language_filter is False: + for langscript in langs: + if lang_scripts.contains_language( + langscript + ) or lang_scripts.contains_script(langscript): + subsets_to_keep.append(hf_subset) + break + + if exclusive_language_filter is True and languages: + if lang_scripts.contains_languages(langs): subsets_to_keep.append(hf_subset) - break self.hf_subsets = subsets_to_keep return self diff --git a/mteb/abstasks/MultilingualTask.py b/mteb/abstasks/MultilingualTask.py index 3fd007df6d..6516e74bd0 100644 --- a/mteb/abstasks/MultilingualTask.py +++ b/mteb/abstasks/MultilingualTask.py @@ -12,9 +12,7 @@ def __init__(self, hf_subsets: list[str] | None = None, **kwargs): lang for lang in hf_subsets if lang in self.metadata.eval_langs ] if hf_subsets is not None and len(hf_subsets) > 0: - self.hf_subsets = ( - hf_subsets # TODO: case where user provides langs not in the dataset - ) + self.hf_subsets = hf_subsets else: self.hf_subsets = self.metadata.eval_langs self.is_multilingual = True diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index edb4326cae..d5c249e008 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -9,7 +9,7 @@ from mteb.abstasks.AbsTask import AbsTask from 
mteb.load_results.benchmark_results import BenchmarkResults from mteb.load_results.load_results import load_results -from mteb.overview import MTEBTasks, get_tasks +from mteb.overview import MTEBTasks, get_task, get_tasks http_url_adapter = TypeAdapter(AnyUrl) UrlString = Annotated[ @@ -27,6 +27,7 @@ class Benchmark: description: A description of the benchmark, should include its intended goal and potentially a description of its construction reference: A link reference, to a source containing additional information typically to a paper, leaderboard or github. citation: A bibtex citation + contacts: The people to contact in case of a problem in the benchmark, preferably a GitHub handle. Example: >>> Benchmark( @@ -44,6 +45,7 @@ class Benchmark: description: str | None = None reference: UrlString | None = None citation: str | None = None + contacts: list[str] | None = None def __iter__(self): return iter(self.tasks) @@ -70,55 +72,65 @@ def load_results( MTEB_EN = Benchmark( name="MTEB(eng, beta)", - tasks=get_tasks( - tasks=[ - "AmazonCounterfactualClassification", - "ArguAna", - "ArXivHierarchicalClusteringP2P", - "ArXivHierarchicalClusteringS2S", - "AskUbuntuDupQuestions", - "BIOSSES", - "Banking77Classification", - "BiorxivClusteringP2P.v2", - "CQADupstackGamingRetrieval", - "CQADupstackUnixRetrieval", - "ClimateFEVERHardNegatives", - "FEVERHardNegatives", - "FiQA2018", - "HotpotQAHardNegatives", - "ImdbClassification", - "MTOPDomainClassification", - "MassiveIntentClassification", - "MassiveScenarioClassification", - "MedrxivClusteringP2P.v2", - "MedrxivClusteringS2S.v2", - "MindSmallReranking", - "SCIDOCS", - "SICK-R", - "STS12", - "STS13", - "STS14", - "STS15", - "STS17", - "STS22.v2", - "STSBenchmark", - "SprintDuplicateQuestions", - "StackExchangeClustering.v2", - "StackExchangeClusteringP2P.v2", - "TRECCOVID", - "Touche2020Retrieval.v3", - "ToxicConversationsClassification", - "TweetSentimentExtractionClassification", - "TwentyNewsgroupsClustering.v2", - 
"TwitterSemEval2015", - "TwitterURLCorpus", - "SummEvalSummarization.v2", - ], - languages=["eng"], - eval_splits=["test"], + tasks=MTEBTasks( + get_tasks( + tasks=[ + "ArguAna", + "ArXivHierarchicalClusteringP2P", + "ArXivHierarchicalClusteringS2S", + "AskUbuntuDupQuestions", + "BIOSSES", + "Banking77Classification", + "BiorxivClusteringP2P.v2", + "CQADupstackGamingRetrieval", + "CQADupstackUnixRetrieval", + "ClimateFEVERHardNegatives", + "FEVERHardNegatives", + "FiQA2018", + "HotpotQAHardNegatives", + "ImdbClassification", + "MTOPDomainClassification", + "MassiveIntentClassification", + "MassiveScenarioClassification", + "MedrxivClusteringP2P.v2", + "MedrxivClusteringS2S.v2", + "MindSmallReranking", + "SCIDOCS", + "SICK-R", + "STS12", + "STS13", + "STS14", + "STS15", + "STSBenchmark", + "SprintDuplicateQuestions", + "StackExchangeClustering.v2", + "StackExchangeClusteringP2P.v2", + "TRECCOVID", + "Touche2020Retrieval.v3", + "ToxicConversationsClassification", + "TweetSentimentExtractionClassification", + "TwentyNewsgroupsClustering.v2", + "TwitterSemEval2015", + "TwitterURLCorpus", + "SummEvalSummarization.v2", + ], + languages=["eng"], + eval_splits=["test"], + exclusive_language_filter=True, + ) + + ( + get_task( + "AmazonCounterfactualClassification", + eval_splits=["test"], + hf_subsets=["en"], + ), + get_task("STS17", eval_splits=["test"], hf_subsets=["en-en"]), + get_task("STS22.v2", eval_splits=["test"], hf_subsets=["en"]), + ), ), description="English benchmarks from MTEB", citation="", + contacts=["KennethEnevoldsen", "Muennighoff"], ) MTEB_ENG_CLASSIC = Benchmark( @@ -126,7 +138,6 @@ def load_results( tasks=MTEBTasks( get_tasks( tasks=[ - "AmazonCounterfactualClassification", "AmazonPolarityClassification", "AmazonReviewsClassification", "ArguAna", @@ -175,8 +186,6 @@ def load_results( "STS14", "STS15", "STS16", - "STS17", - "STS22", "STSBenchmark", "SciDocsRR", "SciFact", @@ -197,6 +206,15 @@ def load_results( eval_splits=["test"], ) + 
get_tasks(tasks=["MSMARCO"], languages=["eng"], eval_splits=["dev"]) + + ( + get_task( + "AmazonCounterfactualClassification", + eval_splits=["test"], + hf_subsets=["en"], + ), + get_task("STS17", eval_splits=["test"], hf_subsets=["en-en"]), + get_task("STS22", eval_splits=["test"], hf_subsets=["en"]), + ) ), description="The original English benchmark by Muennighoff et al., (2023).", citation="""@inproceedings{muennighoff-etal-2023-mteb, @@ -217,6 +235,7 @@ def load_results( pages = "2014--2037", } """, + contacts=["Muennighoff"], ) MTEB_MAIN_RU = Benchmark( @@ -407,6 +426,7 @@ def load_results( archivePrefix={arXiv}, primaryClass={cs.CL} }""", + contacts=["KennethEnevoldsen", "x-tabdeveloping", "Samoed"], ) CoIR = Benchmark( @@ -469,46 +489,49 @@ def load_results( journal={arXiv preprint arXiv:2404.06347}, year={2024} }""", + contacts=["gowitheflow-1998"], ) MTEB_FRA = Benchmark( name="MTEB(fra)", - tasks=get_tasks( - languages=["fra"], - tasks=[ - # Classification - "AmazonReviewsClassification", - "MasakhaNEWSClassification", - "MassiveIntentClassification", - "MassiveScenarioClassification", - "MTOPDomainClassification", - "MTOPIntentClassification", - # Clustering - "AlloProfClusteringP2P", - "AlloProfClusteringS2S", - "HALClusteringS2S", - "MasakhaNEWSClusteringP2P", - "MasakhaNEWSClusteringS2S", - "MLSUMClusteringP2P", - "MLSUMClusteringS2S", - # Pair Classification - "OpusparcusPC", - "PawsXPairClassification", - # Reranking - "AlloprofReranking", - "SyntecReranking", - # Retrieval - "AlloprofRetrieval", - "BSARDRetrieval", - "MintakaRetrieval", - "SyntecRetrieval", - "XPQARetrieval", - # STS - "SICKFr", - "STS22", - "STSBenchmarkMultilingualSTS", - "SummEvalFr", - ], + tasks=MTEBTasks( + get_tasks( + languages=["fra"], + tasks=[ + # Classification + "AmazonReviewsClassification", + "MasakhaNEWSClassification", + "MassiveIntentClassification", + "MassiveScenarioClassification", + "MTOPDomainClassification", + "MTOPIntentClassification", + # Clustering + 
"AlloProfClusteringP2P", + "AlloProfClusteringS2S", + "HALClusteringS2S", + "MasakhaNEWSClusteringP2P", + "MasakhaNEWSClusteringS2S", + "MLSUMClusteringP2P", + "MLSUMClusteringS2S", + # Pair Classification + "OpusparcusPC", + "PawsXPairClassification", + # Reranking + "AlloprofReranking", + "SyntecReranking", + # Retrieval + "AlloprofRetrieval", + "BSARDRetrieval", + "MintakaRetrieval", + "SyntecRetrieval", + "XPQARetrieval", + # STS + "SICKFr", + "STSBenchmarkMultilingualSTS", + "SummEvalFr", + ], + ) + + (get_task("STS22", eval_splits=["test"], hf_subsets=["fr"]),) ), description="Main French benchmarks from MTEB", reference="https://arxiv.org/abs/2405.20468", @@ -521,6 +544,7 @@ def load_results( primaryClass={cs.CL}, url={https://arxiv.org/abs/2405.20468}, }""", + contacts=["imenelydiaker"], ) @@ -528,6 +552,7 @@ def load_results( name="MTEB(deu)", tasks=get_tasks( languages=["deu"], + exclusive_language_filter=True, tasks=[ # Classification "AmazonCounterfactualClassification", @@ -595,32 +620,34 @@ def load_results( MTEB_POL = Benchmark( name="MTEB(pol)", - tasks=get_tasks( - languages=["pol"], - tasks=[ - # Classification - "AllegroReviews", - "CBD", - "MassiveIntentClassification", - "MassiveScenarioClassification", - "PolEmo2.0-IN", - "PolEmo2.0-OUT", - "PAC", - # Clustering - "EightTagsClustering", - "PlscClusteringS2S", - "PlscClusteringP2P", - # Pair Classification - "CDSC-E", - "PpcPC", - "PSC", - "SICK-E-PL", - # STS - "CDSC-R", - "STS22", - "STSBenchmarkMultilingualSTS", - "SICK-R-PL", - ], + tasks=MTEBTasks( + get_tasks( + languages=["pol"], + tasks=[ + # Classification + "AllegroReviews", + "CBD", + "MassiveIntentClassification", + "MassiveScenarioClassification", + "PolEmo2.0-IN", + "PolEmo2.0-OUT", + "PAC", + # Clustering + "EightTagsClustering", + "PlscClusteringS2S", + "PlscClusteringP2P", + # Pair Classification + "CDSC-E", + "PpcPC", + "PSC", + "SICK-E-PL", + # STS + "CDSC-R", + "STSBenchmarkMultilingualSTS", + "SICK-R-PL", + ], + ) + + 
(get_task("STS22", eval_splits=["test"], hf_subsets=["pl"]),), ), description="Main Polish benchmarks from MTEB", reference="https://arxiv.org/abs/2405.10138", @@ -813,6 +840,7 @@ def load_results( description="The Multilingual benchmarks from MMTEB. Currently under development.", reference=None, citation=None, + contacts=["KennethEnevoldsen"], ) MTEB_JPN = Benchmark( @@ -850,6 +878,39 @@ def load_results( ) +indic_languages = [ + "asm", + "awa", + "ben", + "bgc", + "bho", + "doi", + "gbm", + "gom", + "guj", + "hin", + "hne", + "kan", + "kas", + "mai", + "mal", + "mar", + "mni", + "mup", + "mwr", + "nep", + "npi", + "ori", + "ory", + "pan", + "raj", + "san", + "snd", + "tam", + "tel", + "urd", +] + MTEB_INDIC = Benchmark( name="MTEB(Indic, beta)", tasks=get_tasks( @@ -885,13 +946,59 @@ def load_results( # reranking "WikipediaRerankingMultilingual", ], + languages=indic_languages, + exclusive_language_filter=True, ), description="Main Indic benchmark from MMTEB", reference=None, citation=None, + contacts=["KennethEnevoldsen"], ) +eu_languages = [ + # official EU languages (56) - we could include the whole economic area e.g. Norway - additioanlly we could include minority languages (probably a good idea?) 
+ # germanic + "dan", + "eng", + "deu", + "nld", + "swe", + # romance + "fra", + "ita", + "por", + "spa", + "ron", + # slavic + "bul", + "hrv", + "ces", + "pol", + "slk", + "slv", + # baltic + "lav", + "lit", + "est", + # finno-ugric + "fin", + "hun", + # other indo european + "ell", + # non-indo european + "mlt", + "gle", + # Schengen Area + "nno", + "nob", + "isl", + "ron", + "eus", # Basque - recognized minority language + "ron", # Romanian - recognized minority language + "rom", # Romani - recognized minority language +] + MTEB_EU = Benchmark( name="MTEB(Europe, beta)", tasks=get_tasks( @@ -970,11 +1077,14 @@ def load_results( "STS17", "SICK-R-PL", "STSES", - ] + ], + languages=eu_languages, + exclusive_language_filter=True, ), description="Main European benchmark from MMTEB", reference=None, citation=None, + contacts=["KennethEnevoldsen"], ) LONG_EMBED = Benchmark( diff --git a/mteb/languages.py b/mteb/languages.py index 9b170a707f..e83dd308cd 100644 --- a/mteb/languages.py +++ b/mteb/languages.py @@ -5,6 +5,7 @@ from __future__ import annotations import json +from collections.abc import Iterable from dataclasses import dataclass from pathlib import Path @@ -81,5 +82,19 @@ def contains_language(self, language: str) -> bool: return True return False + def contains_languages(self, languages: Iterable[str]) -> bool: + """Whether it contains all of the languages""" + for l in languages: + if not self.contains_language(l): + return False + return True + def contains_script(self, script: str) -> bool: return script in self.scripts + + def contains_scripts(self, scripts: Iterable[str]) -> bool: + """Whether it contains all of the scripts""" + for s in scripts: + if not self.contains_script(s): + return False + return True diff --git a/mteb/overview.py b/mteb/overview.py index 43f8cebc1b..ced0e7729f 100644 --- a/mteb/overview.py +++ b/mteb/overview.py @@ -232,6 +232,7 @@ def get_tasks( tasks: list[str] | None = None, exclude_superseded: bool = True, eval_splits: 
list[str] | None = None, + exclusive_language_filter: bool = False, ) -> MTEBTasks: """Get a list of tasks based on the specified filters. @@ -247,6 +248,9 @@ def get_tasks( tasks: A list of task names to include. If None, all tasks which pass the filters are included. exclude_superseded: A boolean flag to exclude datasets which are superseded by another. eval_splits: A list of evaluation splits to include. If None, all splits are included. + exclusive_language_filter: Some datasets contain more than one language, e.g. for STS22 the subset "de-en" contains eng and deu. If + exclusive_language_filter is set to False, every subset with at least one matching language is kept; if set to True, only subsets whose + languages are all among the specified languages are kept. Returns: A list of all initialized tasks objects which pass all of the filters (AND operation). @@ -256,10 +260,18 @@ def get_tasks( >>> get_tasks(languages=["eng"], script=["Latn"], task_types=["Classification"]) >>> get_tasks(languages=["eng"], script=["Latn"], task_types=["Clustering"], exclude_superseded=False) >>> get_tasks(languages=["eng"], tasks=["WikipediaRetrievalMultilingual"], eval_splits=["test"]) + >>> get_tasks(tasks=["STS22"], languages=["eng"], exclusive_language_filter=True) # don't include multilingual subsets containing English """ if tasks: _tasks = [ - get_task(task, languages, script, eval_splits=eval_splits) for task in tasks + get_task( + task, + languages, + script, + eval_splits=eval_splits, + exclusive_language_filter=exclusive_language_filter, + ) + for task in tasks ] return MTEBTasks(_tasks) @@ -289,6 +301,8 @@ def get_task( languages: list[str] | None = None, script: list[str] | None = None, eval_splits: list[str] | None = None, + hf_subsets: list[str] | None = None, + exclusive_language_filter: bool = False, ) -> AbsTask: """Get a task by name. @@ -298,6 +312,10 @@ def get_task( "eng-Latn". For multilingual tasks this will also remove languages that are not in the specified list. 
script: A list of script codes (ISO 15924 codes). If None, all scripts are included. For multilingual tasks this will also remove scripts eval_splits: A list of evaluation splits to include. If None, all splits are included. + hf_subsets: A list of Huggingface subsets to evaluate on. + exclusive_language_filter: Some datasets contain more than one language, e.g. for STS22 the subset "de-en" contains eng and deu. If + exclusive_language_filter is set to False, every subset with at least one matching language is kept; if set to True, only subsets whose + languages are all among the specified languages are kept (this requires `languages`; without it no subsets are kept). Returns: An initialized task object. @@ -319,4 +337,9 @@ def get_task( task = TASKS_REGISTRY[task_name]() if eval_splits: task.filter_eval_splits(eval_splits=eval_splits) - return task.filter_languages(languages, script) + return task.filter_languages( + languages, + script, + hf_subsets=hf_subsets, + exclusive_language_filter=exclusive_language_filter, + )