From 0c5c3a544bea7dcb4c6e6d75d612638171cf0332 Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Mon, 13 Jan 2025 01:46:04 +0500 Subject: [PATCH 01/15] fix: update max tokens for OpenAI (#1772) update max tokens --- mteb/models/openai_models.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py index c187bfa317..619a4a747f 100644 --- a/mteb/models/openai_models.py +++ b/mteb/models/openai_models.py @@ -25,7 +25,7 @@ def __init__( **kwargs, ) -> None: """Wrapper for OpenAIs embedding API. - To handle documents larger than 8192 tokens, we truncate the document to the specified sequence length. + To handle documents larger than 8191 tokens, we truncate the document to the specified sequence length. """ requires_package(self, "openai", "Openai text embedding") from openai import OpenAI @@ -124,7 +124,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: OpenAIWrapper, model_name="text-embedding-3-small", tokenizer_name="cl100k_base", - max_tokens=8192, + max_tokens=8191, ), max_tokens=8191, embed_dim=1536, @@ -149,7 +149,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: OpenAIWrapper, model_name="text-embedding-3-large", tokenizer_name="cl100k_base", - max_tokens=8192, + max_tokens=8191, ), max_tokens=8191, embed_dim=3072, @@ -172,7 +172,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: OpenAIWrapper, model_name="text-embedding-ada-002", tokenizer_name="cl100k_base", - max_tokens=8192, + max_tokens=8191, ), reference="https://openai.com/index/new-and-improved-embedding-model/", max_tokens=8191, From 71dbd61c2b1b82e3d19ed0a4914f59886d4f0007 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Mon, 13 Jan 2025 12:46:21 +0200 Subject: [PATCH 02/15] ci: skip AfriSentiLID for now (#1785) * skip AfriSentiLID for now * skip relevant test case instead --------- Co-authored-by: Isaac Chung --- tests/test_tasks/test_all_abstasks.py | 9 ++++++++- 1 file changed, 8 
insertions(+), 1 deletion(-) diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py index 20eff8c434..e252293ea5 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ -86,7 +86,14 @@ async def check_datasets_are_available_on_hf(tasks): def test_dataset_availability(): """Checks if the datasets are available on Hugging Face using both their name and revision.""" tasks = MTEB().tasks_cls - tasks = [t for t in tasks if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING] + tasks = [ + t + for t in tasks + if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING + if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING + and t.metadata.name + != "AfriSentiLangClassification" # HOTFIX: Issue#1777. Remove this line when issue is resolved. + ] asyncio.run(check_datasets_are_available_on_hf(tasks)) From bad27a68cf28ae6bd8191a65598db3ad562c5955 Mon Sep 17 00:00:00 2001 From: github-actions Date: Mon, 13 Jan 2025 11:01:06 +0000 Subject: [PATCH 03/15] 1.28.7 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6dc6189822..13135cd8d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.28.6" +version = "1.28.7" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 9b117a8245a8c56470d99b8ca3d6b2f6b6819dd8 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Mon, 13 Jan 2025 15:46:32 +0200 Subject: [PATCH 04/15] ci: fix model loading test (#1775) * pass base branch into the make command as an arg * test a file that has custom wrapper * what about overview * just dont check overview * revert instance check * explicitly omit overview and init * remove test change * try on a lot of models * revert test model file --------- Co-authored-by: Isaac Chung --- 
.github/workflows/model_loading.yml | 2 +- Makefile | 2 +- scripts/extract_model_names.py | 7 ++++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/model_loading.yml b/.github/workflows/model_loading.yml index 8707a9c1d6..c139536321 100644 --- a/.github/workflows/model_loading.yml +++ b/.github/workflows/model_loading.yml @@ -21,4 +21,4 @@ jobs: - name: Install dependencies and run tests run: | - make model-load-test + make model-load-test BASE_BRANCH=${{ github.event.pull_request.base.ref }} diff --git a/Makefile b/Makefile index 6e8647a2ce..02d0ba2478 100644 --- a/Makefile +++ b/Makefile @@ -41,5 +41,5 @@ build-docs: model-load-test: @echo "--- 🚀 Running model load test ---" pip install ".[dev, speedtask, pylate,gritlm,xformers,model2vec]" - python scripts/extract_model_names.py + python scripts/extract_model_names.py $(BASE_BRANCH) python tests/test_models/model_loading.py --model_name_file scripts/model_names.txt \ No newline at end of file diff --git a/scripts/extract_model_names.py b/scripts/extract_model_names.py index dbe99a990e..ba1bc1a8b0 100644 --- a/scripts/extract_model_names.py +++ b/scripts/extract_model_names.py @@ -19,7 +19,12 @@ def get_changed_files(base_branch="main"): changed_files = diff.splitlines() return [ - f for f in changed_files if f.startswith("mteb/models/") and f.endswith(".py") + f + for f in changed_files + if f.startswith("mteb/models/") + and f.endswith(".py") + and "overview" not in f + and "init" not in f ] From 4a70e5d8996a341097c81782b463b1822f9708fe Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Mon, 13 Jan 2025 18:44:10 +0100 Subject: [PATCH 05/15] feat: Update task filtering, fixing bug which included cross-lingual tasks in overly many benchmarks (#1787) * feat: Update task filtering, fixing bug on MTEB - Updated task filtering adding exclusive_language_filter and hf_subset - fix bug in MTEB where cross-lingual splits were included - added missing language filtering to 
MTEB(europe, beta) and MTEB(indic, beta) The following code outlines the problems: ```py import mteb from mteb.benchmarks import MTEB_ENG_CLASSIC task = [t for t in MTEB_ENG_CLASSIC.tasks if t.metadata.name == "STS22"][0] # was eq. to: task = mteb.get_task("STS22", languages=["eng"]) task.hf_subsets # correct filtering to English datasets: # ['en', 'de-en', 'es-en', 'pl-en', 'zh-en'] # However it should be: # ['en'] # with the changes it is: task = [t for t in MTEB_ENG_CLASSIC.tasks if t.metadata.name == "STS22"][0] task.hf_subsets # ['en'] # eq. to task = mteb.get_task("STS22", hf_subsets=["en"]) # which you can also obtain using the exclusive_language_filter (though not if there was multiple english splits): task = mteb.get_task("STS22", languages=["eng"], exclusive_language_filter=True) ``` * format * remove "en-ext" from AmazonCounterfactualClassification * fixed mteb(deu) * fix: simplify in a few areas --- mteb/abstasks/AbsTask.py | 68 +++--- mteb/abstasks/MultilingualTask.py | 4 +- mteb/benchmarks/benchmarks.py | 336 ++++++++++++++++++++---------- mteb/languages.py | 15 ++ mteb/overview.py | 27 ++- 5 files changed, 303 insertions(+), 147 deletions(-) diff --git a/mteb/abstasks/AbsTask.py b/mteb/abstasks/AbsTask.py index 443725ec7f..1d2e4fcb05 100644 --- a/mteb/abstasks/AbsTask.py +++ b/mteb/abstasks/AbsTask.py @@ -5,6 +5,7 @@ import random from abc import ABC, abstractmethod from collections.abc import Sequence +from copy import copy from typing import Any import datasets @@ -62,6 +63,7 @@ class AbsTask(ABC): dataset: dict[HFSubset, DatasetDict] | None = None # type: ignore data_loaded: bool = False is_multilingual: bool = False + hf_subsets: list[HFSubset] | None = None def __init__(self, seed: int = 42, **kwargs: Any): self.save_suffix = kwargs.get("save_suffix", "") @@ -110,10 +112,13 @@ def evaluate( self.dataset: dict[HFSubset, DatasetDict] scores = {} - hf_subsets = list(self.dataset.keys()) if self.is_multilingual else ["default"] + if self.hf_subsets 
is None: + hf_subsets = list(self.dataset.keys()) + else: + hf_subsets = copy(self.hf_subsets) - if subsets_to_run is not None: - hf_subsets = [s for s in hf_subsets if s in subsets_to_run] + if subsets_to_run is not None: # allow overwrites of pre-filtering + hf_subsets = subsets_to_run for hf_subset in hf_subsets: logger.info( @@ -218,16 +223,13 @@ def calculate_metadata_metrics( ) descriptive_stats[split][hf_subset_stat] = {} - eval_langs = ( - list(self.metadata.eval_langs.keys()) - if isinstance(self.metadata.eval_langs, dict) - else self.metadata.eval_langs + pbar_subsets = tqdm.tqdm( + self.metadata.hf_subsets_to_langscripts, + desc="Processing Languages...", ) - - pbar_subsets = tqdm.tqdm(eval_langs, desc="Processing Languages...") for hf_subset in pbar_subsets: - pbar_subsets.set_postfix_str(f"Language: {hf_subset}") - logger.info(f"Processing metadata for language {hf_subset}") + pbar_subsets.set_postfix_str(f"Huggingface subset: {hf_subset}") + logger.info(f"Processing metadata for subset {hf_subset}") split_details = self._calculate_metrics_from_split(split, hf_subset) descriptive_stats[split][hf_subset_stat][hf_subset] = split_details else: @@ -252,12 +254,8 @@ def metadata_dict(self) -> dict[str, Any]: @property def languages(self) -> list[str]: """Returns the languages of the task""" - # check if self.hf_subsets is set - if self.is_multilingual and hasattr(self, "hf_subsets"): - assert isinstance( - self.metadata.eval_langs, dict - ), "eval_langs must be dict for multilingual tasks" - eval_langs = self.metadata.eval_langs + if self.hf_subsets: + eval_langs = self.metadata.hf_subsets_to_langscripts languages = [] for lang in self.hf_subsets: @@ -275,31 +273,43 @@ def filter_eval_splits(self, eval_splits: list[str] | None) -> AbsTask: return self def filter_languages( - self, languages: list[str] | None, script: list[str] | None = None + self, + languages: list[str] | None, + script: list[str] | None = None, + hf_subsets: list[HFSubset] | None = None, 
+ exclusive_language_filter: bool = False, ) -> AbsTask: """Filter the languages of the task. Args: languages: list of languages to filter the task by can be either a 3-letter langauge code (e.g. "eng") or also include the script (e.g. "eng-Latn") - script: list of scripts to filter the task by. Will be ignored if language code specified the script. If None, all scripts are included. + script: A list of scripts to filter the task by. Will be ignored if language code specified the script. If None, all scripts are included. If the language code does not specify the script the intersection of the language and script will be used. + hf_subsets: A list of huggingface subsets to filter on. This is useful if a dataset have multiple subsets containing the desired language, + but you only want to test on one. An example is STS22 which e.g. have both "en" and "de-en" which both contains English. + exclusive_language_filter: Some datasets contains more than one language e.g. for STS22 the subset "de-en" contain eng and deu. If + exclusive_language_filter is set to False both of these will be kept, but if set to True only those that contains all the languages + specified will be kept. 
""" lang_scripts = LanguageScripts.from_languages_and_scripts(languages, script) subsets_to_keep = [] - if not isinstance(self.metadata.eval_langs, dict): - self.hf_subsets = self.metadata.eval_langs - return self - - for hf_subset, langs in self.metadata.eval_langs.items(): - for langscript in langs: - if lang_scripts.contains_language( - langscript - ) or lang_scripts.contains_script(langscript): + for hf_subset, langs in self.metadata.hf_subsets_to_langscripts.items(): + if (hf_subsets is not None) and (hf_subset not in hf_subsets): + continue + if exclusive_language_filter is False: + for langscript in langs: + if lang_scripts.contains_language( + langscript + ) or lang_scripts.contains_script(langscript): + subsets_to_keep.append(hf_subset) + break + + if exclusive_language_filter is True and languages: + if lang_scripts.contains_languages(langs): subsets_to_keep.append(hf_subset) - break self.hf_subsets = subsets_to_keep return self diff --git a/mteb/abstasks/MultilingualTask.py b/mteb/abstasks/MultilingualTask.py index 3fd007df6d..6516e74bd0 100644 --- a/mteb/abstasks/MultilingualTask.py +++ b/mteb/abstasks/MultilingualTask.py @@ -12,9 +12,7 @@ def __init__(self, hf_subsets: list[str] | None = None, **kwargs): lang for lang in hf_subsets if lang in self.metadata.eval_langs ] if hf_subsets is not None and len(hf_subsets) > 0: - self.hf_subsets = ( - hf_subsets # TODO: case where user provides langs not in the dataset - ) + self.hf_subsets = hf_subsets else: self.hf_subsets = self.metadata.eval_langs self.is_multilingual = True diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index edb4326cae..d5c249e008 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -9,7 +9,7 @@ from mteb.abstasks.AbsTask import AbsTask from mteb.load_results.benchmark_results import BenchmarkResults from mteb.load_results.load_results import load_results -from mteb.overview import MTEBTasks, get_tasks +from mteb.overview import 
MTEBTasks, get_task, get_tasks http_url_adapter = TypeAdapter(AnyUrl) UrlString = Annotated[ @@ -27,6 +27,7 @@ class Benchmark: description: A description of the benchmark, should include its intended goal and potentially a description of its construction reference: A link reference, to a source containing additional information typically to a paper, leaderboard or github. citation: A bibtex citation + contacts: The people to contact in case of a problem in the benchmark, preferably a GitHub handle. Example: >>> Benchmark( @@ -44,6 +45,7 @@ class Benchmark: description: str | None = None reference: UrlString | None = None citation: str | None = None + contacts: list[str] | None = None def __iter__(self): return iter(self.tasks) @@ -70,55 +72,65 @@ def load_results( MTEB_EN = Benchmark( name="MTEB(eng, beta)", - tasks=get_tasks( - tasks=[ - "AmazonCounterfactualClassification", - "ArguAna", - "ArXivHierarchicalClusteringP2P", - "ArXivHierarchicalClusteringS2S", - "AskUbuntuDupQuestions", - "BIOSSES", - "Banking77Classification", - "BiorxivClusteringP2P.v2", - "CQADupstackGamingRetrieval", - "CQADupstackUnixRetrieval", - "ClimateFEVERHardNegatives", - "FEVERHardNegatives", - "FiQA2018", - "HotpotQAHardNegatives", - "ImdbClassification", - "MTOPDomainClassification", - "MassiveIntentClassification", - "MassiveScenarioClassification", - "MedrxivClusteringP2P.v2", - "MedrxivClusteringS2S.v2", - "MindSmallReranking", - "SCIDOCS", - "SICK-R", - "STS12", - "STS13", - "STS14", - "STS15", - "STS17", - "STS22.v2", - "STSBenchmark", - "SprintDuplicateQuestions", - "StackExchangeClustering.v2", - "StackExchangeClusteringP2P.v2", - "TRECCOVID", - "Touche2020Retrieval.v3", - "ToxicConversationsClassification", - "TweetSentimentExtractionClassification", - "TwentyNewsgroupsClustering.v2", - "TwitterSemEval2015", - "TwitterURLCorpus", - "SummEvalSummarization.v2", - ], - languages=["eng"], - eval_splits=["test"], + tasks=MTEBTasks( + get_tasks( + tasks=[ + "ArguAna", + 
"ArXivHierarchicalClusteringP2P", + "ArXivHierarchicalClusteringS2S", + "AskUbuntuDupQuestions", + "BIOSSES", + "Banking77Classification", + "BiorxivClusteringP2P.v2", + "CQADupstackGamingRetrieval", + "CQADupstackUnixRetrieval", + "ClimateFEVERHardNegatives", + "FEVERHardNegatives", + "FiQA2018", + "HotpotQAHardNegatives", + "ImdbClassification", + "MTOPDomainClassification", + "MassiveIntentClassification", + "MassiveScenarioClassification", + "MedrxivClusteringP2P.v2", + "MedrxivClusteringS2S.v2", + "MindSmallReranking", + "SCIDOCS", + "SICK-R", + "STS12", + "STS13", + "STS14", + "STS15", + "STSBenchmark", + "SprintDuplicateQuestions", + "StackExchangeClustering.v2", + "StackExchangeClusteringP2P.v2", + "TRECCOVID", + "Touche2020Retrieval.v3", + "ToxicConversationsClassification", + "TweetSentimentExtractionClassification", + "TwentyNewsgroupsClustering.v2", + "TwitterSemEval2015", + "TwitterURLCorpus", + "SummEvalSummarization.v2", + ], + languages=["eng"], + eval_splits=["test"], + exclusive_language_filter=True, + ) + + ( + get_task( + "AmazonCounterfactualClassification", + eval_splits=["test"], + hf_subsets=["en"], + ), + get_task("STS17", eval_splits=["test"], hf_subsets=["en-en"]), + get_task("STS22.v2", eval_splits=["test"], hf_subsets=["en"]), + ), ), description="English benchmarks from MTEB", citation="", + contacts=["KennethEnevoldsen", "Muennighoff"], ) MTEB_ENG_CLASSIC = Benchmark( @@ -126,7 +138,6 @@ def load_results( tasks=MTEBTasks( get_tasks( tasks=[ - "AmazonCounterfactualClassification", "AmazonPolarityClassification", "AmazonReviewsClassification", "ArguAna", @@ -175,8 +186,6 @@ def load_results( "STS14", "STS15", "STS16", - "STS17", - "STS22", "STSBenchmark", "SciDocsRR", "SciFact", @@ -197,6 +206,15 @@ def load_results( eval_splits=["test"], ) + get_tasks(tasks=["MSMARCO"], languages=["eng"], eval_splits=["dev"]) + + ( + get_task( + "AmazonCounterfactualClassification", + eval_splits=["test"], + hf_subsets=["en"], + ), + get_task("STS17", 
eval_splits=["test"], hf_subsets=["en-en"]), + get_task("STS22", eval_splits=["test"], hf_subsets=["en"]), + ) ), description="The original English benchmark by Muennighoff et al., (2023).", citation="""@inproceedings{muennighoff-etal-2023-mteb, @@ -217,6 +235,7 @@ def load_results( pages = "2014--2037", } """, + contacts=["Muennighoff"], ) MTEB_MAIN_RU = Benchmark( @@ -407,6 +426,7 @@ def load_results( archivePrefix={arXiv}, primaryClass={cs.CL} }""", + contacts=["KennethEnevoldsen", "x-tabdeveloping", "Samoed"], ) CoIR = Benchmark( @@ -469,46 +489,49 @@ def load_results( journal={arXiv preprint arXiv:2404.06347}, year={2024} }""", + contacts=["gowitheflow-1998"], ) MTEB_FRA = Benchmark( name="MTEB(fra)", - tasks=get_tasks( - languages=["fra"], - tasks=[ - # Classification - "AmazonReviewsClassification", - "MasakhaNEWSClassification", - "MassiveIntentClassification", - "MassiveScenarioClassification", - "MTOPDomainClassification", - "MTOPIntentClassification", - # Clustering - "AlloProfClusteringP2P", - "AlloProfClusteringS2S", - "HALClusteringS2S", - "MasakhaNEWSClusteringP2P", - "MasakhaNEWSClusteringS2S", - "MLSUMClusteringP2P", - "MLSUMClusteringS2S", - # Pair Classification - "OpusparcusPC", - "PawsXPairClassification", - # Reranking - "AlloprofReranking", - "SyntecReranking", - # Retrieval - "AlloprofRetrieval", - "BSARDRetrieval", - "MintakaRetrieval", - "SyntecRetrieval", - "XPQARetrieval", - # STS - "SICKFr", - "STS22", - "STSBenchmarkMultilingualSTS", - "SummEvalFr", - ], + tasks=MTEBTasks( + get_tasks( + languages=["fra"], + tasks=[ + # Classification + "AmazonReviewsClassification", + "MasakhaNEWSClassification", + "MassiveIntentClassification", + "MassiveScenarioClassification", + "MTOPDomainClassification", + "MTOPIntentClassification", + # Clustering + "AlloProfClusteringP2P", + "AlloProfClusteringS2S", + "HALClusteringS2S", + "MasakhaNEWSClusteringP2P", + "MasakhaNEWSClusteringS2S", + "MLSUMClusteringP2P", + "MLSUMClusteringS2S", + # Pair 
Classification + "OpusparcusPC", + "PawsXPairClassification", + # Reranking + "AlloprofReranking", + "SyntecReranking", + # Retrieval + "AlloprofRetrieval", + "BSARDRetrieval", + "MintakaRetrieval", + "SyntecRetrieval", + "XPQARetrieval", + # STS + "SICKFr", + "STSBenchmarkMultilingualSTS", + "SummEvalFr", + ], + ) + + (get_task("STS22", eval_splits=["test"], hf_subsets=["fr"]),) ), description="Main French benchmarks from MTEB", reference="https://arxiv.org/abs/2405.20468", @@ -521,6 +544,7 @@ def load_results( primaryClass={cs.CL}, url={https://arxiv.org/abs/2405.20468}, }""", + contacts=["imenelydiaker"], ) @@ -528,6 +552,7 @@ def load_results( name="MTEB(deu)", tasks=get_tasks( languages=["deu"], + exclusive_language_filter=True, tasks=[ # Classification "AmazonCounterfactualClassification", @@ -595,32 +620,34 @@ def load_results( MTEB_POL = Benchmark( name="MTEB(pol)", - tasks=get_tasks( - languages=["pol"], - tasks=[ - # Classification - "AllegroReviews", - "CBD", - "MassiveIntentClassification", - "MassiveScenarioClassification", - "PolEmo2.0-IN", - "PolEmo2.0-OUT", - "PAC", - # Clustering - "EightTagsClustering", - "PlscClusteringS2S", - "PlscClusteringP2P", - # Pair Classification - "CDSC-E", - "PpcPC", - "PSC", - "SICK-E-PL", - # STS - "CDSC-R", - "STS22", - "STSBenchmarkMultilingualSTS", - "SICK-R-PL", - ], + tasks=MTEBTasks( + get_tasks( + languages=["pol"], + tasks=[ + # Classification + "AllegroReviews", + "CBD", + "MassiveIntentClassification", + "MassiveScenarioClassification", + "PolEmo2.0-IN", + "PolEmo2.0-OUT", + "PAC", + # Clustering + "EightTagsClustering", + "PlscClusteringS2S", + "PlscClusteringP2P", + # Pair Classification + "CDSC-E", + "PpcPC", + "PSC", + "SICK-E-PL", + # STS + "CDSC-R", + "STSBenchmarkMultilingualSTS", + "SICK-R-PL", + ], + ) + + (get_task("STS22", eval_splits=["test"], hf_subsets=["pl"]),), ), description="Main Polish benchmarks from MTEB", reference="https://arxiv.org/abs/2405.10138", @@ -813,6 +840,7 @@ def 
load_results( description="The Multilingual benchmarks from MMTEB. Currently under development.", reference=None, citation=None, + contacts=["KennethEnevoldsen"], ) MTEB_JPN = Benchmark( @@ -850,6 +878,39 @@ def load_results( ) +indic_languages = [ + "asm", + "awa", + "ben", + "bgc", + "bho", + "doi", + "gbm", + "gom", + "guj", + "hin", + "hne", + "kan", + "kas", + "mai", + "mal", + "mar", + "mni", + "mup", + "mwr", + "nep", + "npi", + "ori", + "ory", + "pan", + "raj", + "san", + "snd", + "tam", + "tel", + "urd", +] + MTEB_INDIC = Benchmark( name="MTEB(Indic, beta)", tasks=get_tasks( @@ -885,13 +946,59 @@ def load_results( # reranking "WikipediaRerankingMultilingual", ], + languages=indic_languages, + exclusive_language_filter=True, ), description="Main Indic benchmark from MMTEB", reference=None, citation=None, + contacts=["KennethEnevoldsen"], ) +eu_languages = [ + # official EU languages (56) - we could include the whole economic area e.g. Norway - additioanlly we could include minority languages (probably a good idea?) 
+ # germanic + "dan", + "eng", + "deu", + "nld", + "swe", + # romance + "fra", + "ita", + "por", + "spa", + "ron", + # slavic + "bul", + "hrv", + "ces", + "pol", + "slk", + "slv", + # baltic + "lav", + "lit", + "est", + # finno-ugric + "fin", + "hun", + # other indo european + "ell", + # non-indo european + "mlt", + "gle", + # Schengen Area + "nno", + "nob", + "isl", + "ron", + "eus", # Basque - recognized minority language + "ron", # Romanian - recognized minority language + "rom", # Romani - recognized minority language +] + MTEB_EU = Benchmark( name="MTEB(Europe, beta)", tasks=get_tasks( @@ -970,11 +1077,14 @@ def load_results( "STS17", "SICK-R-PL", "STSES", - ] + ], + languages=eu_languages, + exclusive_language_filter=True, ), description="Main European benchmark from MMTEB", reference=None, citation=None, + contacts=["KennethEnevoldsen"], ) LONG_EMBED = Benchmark( diff --git a/mteb/languages.py b/mteb/languages.py index 9b170a707f..e83dd308cd 100644 --- a/mteb/languages.py +++ b/mteb/languages.py @@ -5,6 +5,7 @@ from __future__ import annotations import json +from collections.abc import Iterable from dataclasses import dataclass from pathlib import Path @@ -81,5 +82,19 @@ def contains_language(self, language: str) -> bool: return True return False + def contains_languages(self, languages: Iterable[str]) -> bool: + """Whether is contains all of the languages""" + for l in languages: + if not self.contains_language(l): + return False + return True + def contains_script(self, script: str) -> bool: return script in self.scripts + + def contains_scripts(self, scripts: Iterable[str]) -> bool: + """Whether is contains all of the scripts""" + for s in scripts: + if not self.contains_script(s): + return False + return True diff --git a/mteb/overview.py b/mteb/overview.py index 43f8cebc1b..ced0e7729f 100644 --- a/mteb/overview.py +++ b/mteb/overview.py @@ -232,6 +232,7 @@ def get_tasks( tasks: list[str] | None = None, exclude_superseded: bool = True, eval_splits: 
list[str] | None = None, + exclusive_language_filter: bool = False, ) -> MTEBTasks: """Get a list of tasks based on the specified filters. @@ -247,6 +248,9 @@ def get_tasks( tasks: A list of task names to include. If None, all tasks which pass the filters are included. exclude_superseded: A boolean flag to exclude datasets which are superseded by another. eval_splits: A list of evaluation splits to include. If None, all splits are included. + exclusive_language_filter: Some datasets contains more than one language e.g. for STS22 the subset "de-en" contain eng and deu. If + exclusive_language_filter is set to False both of these will be kept, but if set to True only those that contains all the languages + specified will be kept. Returns: A list of all initialized tasks objects which pass all of the filters (AND operation). @@ -256,10 +260,18 @@ def get_tasks( >>> get_tasks(languages=["eng"], script=["Latn"], task_types=["Classification"]) >>> get_tasks(languages=["eng"], script=["Latn"], task_types=["Clustering"], exclude_superseded=False) >>> get_tasks(languages=["eng"], tasks=["WikipediaRetrievalMultilingual"], eval_splits=["test"]) + >>> get_tasks(tasks=["STS22"], languages=["eng"], exclusive_language_filter=True) # don't include multilingual subsets containing English """ if tasks: _tasks = [ - get_task(task, languages, script, eval_splits=eval_splits) for task in tasks + get_task( + task, + languages, + script, + eval_splits=eval_splits, + exclusive_language_filter=exclusive_language_filter, + ) + for task in tasks ] return MTEBTasks(_tasks) @@ -289,6 +301,8 @@ def get_task( languages: list[str] | None = None, script: list[str] | None = None, eval_splits: list[str] | None = None, + hf_subsets: list[str] | None = None, + exclusive_language_filter: bool = False, ) -> AbsTask: """Get a task by name. @@ -298,6 +312,10 @@ def get_task( "eng-Latn". For multilingual tasks this will also remove languages that are not in the specified list. 
script: A list of script codes (ISO 15924 codes). If None, all scripts are included. For multilingual tasks this will also remove scripts eval_splits: A list of evaluation splits to include. If None, all splits are included. + hf_subsets: A list of Huggingface subsets to evaluate on. + exclusive_language_filter: Some datasets contains more than one language e.g. for STS22 the subset "de-en" contain eng and deu. If + exclusive_language_filter is set to False both of these will be kept, but if set to True only those that contains all the languages + specified will be kept. Returns: An initialized task object. @@ -319,4 +337,9 @@ def get_task( task = TASKS_REGISTRY[task_name]() if eval_splits: task.filter_eval_splits(eval_splits=eval_splits) - return task.filter_languages(languages, script) + return task.filter_languages( + languages, + script, + hf_subsets=hf_subsets, + exclusive_language_filter=exclusive_language_filter, + ) From cfdbecbf778fa7312fdae6591be043fdd0d30fa7 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Mon, 13 Jan 2025 18:44:28 +0100 Subject: [PATCH 06/15] fix: Add gritlm --- mteb/models/gritlm_models.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mteb/models/gritlm_models.py b/mteb/models/gritlm_models.py index 91acafa26e..a4f5befd19 100644 --- a/mteb/models/gritlm_models.py +++ b/mteb/models/gritlm_models.py @@ -5,6 +5,7 @@ from mteb.model_meta import ModelMeta +from .e5_models import E5_TRAINING_DATA from .instruct_wrapper import instruct_wrapper logger = logging.getLogger(__name__) @@ -29,7 +30,6 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: open_weights=True, revision="13f00a0e36500c80ce12870ea513846a066004af", release_date="2024-02-15", - training_datasets={"GritLM/tulu2": ["train"]}, n_parameters=7_240_000_000, memory_usage=None, embed_dim=4096, @@ -39,6 +39,10 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: similarity_fn_name="cosine", 
framework=["GritLM", "PyTorch"], use_instructions=True, + training_datasets=E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 + # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data + public_training_code=True, # https://github.com/ContextualAI/gritlm + public_training_data=False, ) gritlm8x7b = ModelMeta( loader=partial( # type: ignore @@ -50,7 +54,6 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: ), name="GritLM/GritLM-8x7B", languages=["eng_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "spa_Latn"], - training_datasets={"GritLM/tulu2": ["train"]}, open_weights=True, revision="7f089b13e3345510281733ca1e6ff871b5b4bc76", release_date="2024-02-15", @@ -63,4 +66,8 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: similarity_fn_name="cosine", framework=["GritLM", "PyTorch"], use_instructions=True, + training_datasets=E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 + # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data + public_training_code=True, # https://github.com/ContextualAI/gritlm + public_training_data=False, ) From 15a68121f58b7f80130487ab85f5751315d1f3b5 Mon Sep 17 00:00:00 2001 From: github-actions Date: Mon, 13 Jan 2025 17:51:27 +0000 Subject: [PATCH 07/15] 1.29.0 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 13135cd8d5..52bb150045 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.28.7" +version = "1.29.0" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 25150f9387b49862330ac3e6fd154bbfde51c60d Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Mon, 13 Jan 2025 
19:13:28 +0100 Subject: [PATCH 08/15] fix: Added more annotations! --- mteb/models/bge_models.py | 24 ++++++++++ mteb/models/misc_models.py | 79 +++++++++++++++++++------------ mteb/models/ru_sentence_models.py | 43 ++++++++++------- 3 files changed, 101 insertions(+), 45 deletions(-) diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index 5a395f014a..dc3679a8da 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -6,6 +6,30 @@ model_prompts = {"query": "Represent this sentence for searching relevant passages: "} +bge_m_training_data = { + # source: https://arxiv.org/pdf/2402.03216 + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + "LeCaRDv2": ["train"], + "CMedQAv1-reranking": ["train"], + "CMedQAv2-reranking": ["train"], + "MrTidyRetrieval": ["train"], + "T2Reranking": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + # + synthetic data +} + bge_training_data = { # source: https://data.baai.ac.cn/details/BAAI-MTP "NQ": ["test"], diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index d5734b448c..b2a661fe5a 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -5,6 +5,10 @@ import torch from mteb.model_meta import ModelMeta, sentence_transformers_loader +from mteb.models.e5_models import E5_TRAINING_DATA + +from .bge_models import bge_m_training_data, bge_training_data +from .sentence_transformers_models import sent_trf_training_dataset Haon_Chen__speed_embedding_7b_instruct = ModelMeta( name="Haon-Chen/speed-embedding-7b-instruct", @@ -198,7 +202,8 @@ 
reference="https://huggingface.co/BeastyZ/e5-R-mistral-7b", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"BeastyZ/E5-R": ["train"]}, + training_datasets=E5_TRAINING_DATA, + # not MTEB: {"BeastyZ/E5-R": ["train"]}, adapted_from="/ConRetriever/public_weight_mistral", superseded_by=None, ) @@ -295,13 +300,14 @@ embed_dim=384, license="mit", open_weights=True, - public_training_data=True, + public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Bulbasaur", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=None, # source model is GTE-tiny where training data is unknown + # {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/dwsdwass", superseded_by=None, ) @@ -317,13 +323,14 @@ embed_dim=384, license="mit", open_weights=True, - public_training_data=True, + public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Ivysaur", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=None, # source model is GTE-tiny where training data is unknown + # not MTEB: {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/jhjghjgh", superseded_by=None, ) @@ -345,7 +352,8 @@ reference="https://huggingface.co/Mihaiii/Squirtle", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=bge_training_data, # source model is bge-base-en-v1.5 + # not MTEB: {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/test21", superseded_by=None, ) @@ -367,7 +375,8 @@ reference="https://huggingface.co/Mihaiii/Venusaur", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=None, # source 
model is unknown + # {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/test14", superseded_by=None, ) @@ -389,7 +398,8 @@ reference="https://huggingface.co/Mihaiii/Wartortle", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=bge_training_data, # distilled from bge-base-en-v1.5 + # {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/test22", superseded_by=None, ) @@ -477,7 +487,7 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets={}, # not in MTEB: {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="aubmindlab/bert-base-arabertv02", superseded_by=None, ) @@ -499,7 +509,9 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=sent_trf_training_dataset, + # not in MTEB + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", superseded_by=None, ) @@ -521,7 +533,9 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=sent_trf_training_dataset, # derived from + # not in MTEB: + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2", superseded_by=None, ) @@ -543,7 +557,9 @@
reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-labse-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=None, # derived from labSE + # as well as: + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="sentence-transformers/LaBSE", superseded_by=None, ) @@ -565,7 +581,9 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=sent_trf_training_dataset, + # not in MTEB: + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="tomaarsen/mpnet-base-all-nli-triplet", superseded_by=None, ) @@ -587,7 +605,7 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets={}, # not in MTEB: "Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="UBC-NLP/MARBERTv2", superseded_by=None, ) @@ -719,7 +737,8 @@ reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.4", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"manu/embedding_data_v2_100k": ["train"]}, + training_datasets=None, + # Not in MTEB: {"manu/embedding_data_v2_100k": ["train"]}, adapted_from="croissantllm/CroissantCool-v0.2", superseded_by=None, ) @@ -1365,7 +1384,8 @@ reference="https://huggingface.co/aari1995/German_Semantic_STS_V2", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"stsb_multi_mt": ["train"]}, + training_datasets=None, # couldn't figure out the source model + # {"stsb_multi_mt": ["train"]}, 
adapted_from="/content/drive/MyDrive/Stanford_NLU/Project/false_friends/gbert_large_sts_only", superseded_by=None, ) @@ -1481,18 +1501,18 @@ reference="https://huggingface.co/deepvk/USER-bge-m3", similarity_fn_name="cosine", use_instructions=None, - training_datasets={ - "deepvk/ru-HNP": ["train"], - "deepvk/ru-WANLI": ["train"], - "Shitao/bge-m3-data": ["train"], - "RussianNLP/russian_super_glue": ["train"], - "reciTAL/mlsum": ["train"], - "Milana/russian_keywords": ["train"], - "IlyaGusev/gazeta": ["train"], - "d0rj/gsm8k-ru": ["train"], - "bragovo/dsum_ru": ["train"], - "CarlBrendt/Summ_Dialog_News": ["train"], - }, + training_datasets=bge_m_training_data, # derived from. + # not in MTEB: + # "deepvk/ru-HNP": ["train"], + # "deepvk/ru-WANLI": ["train"], + # "Shitao/bge-m3-data": ["train"], + # "RussianNLP/russian_super_glue": ["train"], + # "reciTAL/mlsum": ["train"], + # "Milana/russian_keywords": ["train"], + # "IlyaGusev/gazeta": ["train"], + # "d0rj/gsm8k-ru": ["train"], + # "bragovo/dsum_ru": ["train"], + # "CarlBrendt/Summ_Dialog_News": ["train"], adapted_from="USER-bge-m3", superseded_by=None, ) @@ -1622,7 +1642,8 @@ reference="https://huggingface.co/shibing624/text2vec-base-multilingual", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"shibing624/nli-zh-all": ["train"]}, + training_datasets=sent_trf_training_dataset, + # not MTEB: {"shibing624/nli-zh-all": ["train"]}, adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", superseded_by=None, ) diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index a520bdca11..6bca544b11 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -6,6 +6,8 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader +from .bge_models import bge_training_data + rubert_tiny2 = ModelMeta( name="cointegrated/rubert-tiny2", languages=["rus_Cyrl"], @@ -96,20 +98,27 @@ framework=["Sentence Transformers", 
"PyTorch"], use_instructions=True, training_datasets={ - "deepvk/ru-HNP": ["train"], - "deepvk/ru-WANLI": ["train"], - "Shitao/bge-m3-data": ["train"], - "RussianNLP/russian_super_glue": ["train"], - "reciTAL/mlsum": ["train"], - "Helsinki-NLP/opus-100": ["train"], - "Helsinki-NLP/bible_para": ["train"], - "d0rj/rudetoxifier_data_detox": ["train"], - "s-nlp/ru_paradetox": ["train"], - "Milana/russian_keywords": ["train"], - "IlyaGusev/gazeta": ["train"], - "d0rj/gsm8k-ru": ["train"], - "bragovo/dsum_ru": ["train"], - "CarlBrendt/Summ_Dialog_News": ["train"], + "BibleNLPBitextMining": ["train"], + "MLSUMClusteringP2P": ["train"], + "MLSUMClusteringP2P.v2": ["train"], + "MLSUMClusteringS2S": ["train"], + "MLSUMClusteringS2S.v2": ["train"], + **bge_training_data, + # not MTEB: + # "deepvk/ru-HNP": ["train"], + # "deepvk/ru-WANLI": ["train"], + # "Shitao/bge-m3-data": ["train"], + # "RussianNLP/russian_super_glue": ["train"], + # "reciTAL/mlsum": ["train"], + # "Helsinki-NLP/opus-100": ["train"], + # "Helsinki-NLP/bible_para": ["train"], + # "d0rj/rudetoxifier_data_detox": ["train"], + # "s-nlp/ru_paradetox": ["train"], + # "Milana/russian_keywords": ["train"], + # "IlyaGusev/gazeta": ["train"], + # "d0rj/gsm8k-ru": ["train"], + # "bragovo/dsum_ru": ["train"], + # "CarlBrendt/Summ_Dialog_News": ["train"], }, ) @@ -213,7 +222,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - training_datasets={"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, + training_datasets=None, # source model in unknown + # Not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, ) labse_ru_turbo = ModelMeta( @@ -231,7 +241,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - training_datasets={"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, + training_datasets=None, # source model in unknown + # not MTEB: {"IlyaGusev/gazeta": ["train"], 
"zloelias/lenta-ru": ["train"]}, ) From 3ba7e22d52320166ec003cbd04c5f09bc0eefe24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Mon, 13 Jan 2025 22:11:18 +0100 Subject: [PATCH 09/15] fix: Added C-MTEB (#1786) Added C-MTEB --- mteb/benchmarks/benchmarks.py | 49 +++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index d5c249e008..3478d48b25 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -1147,3 +1147,52 @@ def load_results( reference="https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6", citation=None, ) + +C_MTEB = Benchmark( + name="MTEB(Chinese)", + tasks=get_tasks( + tasks=[ + "T2Retrieval", + "MMarcoRetrieval", + "DuRetrieval", + "CovidRetrieval", + "CmedqaRetrieval", + "EcomRetrieval", + "MedicalRetrieval", + "VideoRetrieval", + "T2Reranking", + "MMarcoReranking", + "CMedQAv1-reranking", + "CMedQAv2-reranking", + "Ocnli", + "Cmnli", + "CLSClusteringS2S", + "CLSClusteringP2P", + "ThuNewsClusteringS2S", + "ThuNewsClusteringP2P", + "ATEC", + "BQ", + "LCQMC", + "PAWSX", + "STSB", + "AFQMC", + "QBQTC", + "TNews", + "IFlyTek", + "Waimai", + "OnlineShopping", + "MultilingualSentiment", + "JDReview", + ], + ), + description="The Chinese Massive Text Embedding Benchmark (C-MTEB) is a comprehensive benchmark for Chinese text embeddings covering 6 tasks and 35 datasets.", + reference="https://github.com/FlagOpen/FlagEmbedding/tree/master/research/C_MTEB", + citation="""@misc{c-pack, + title={C-Pack: Packaged Resources To Advance General Chinese Embedding}, + author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff}, + year={2023}, + eprint={2309.07597}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +}""", +) From 48370c7b94be22b98816c8410b2f792c1c499169 Mon Sep 17 00:00:00 2001 From: github-actions Date: Mon, 13 Jan 2025 21:27:26 +0000 Subject: [PATCH 10/15] 1.29.1 
Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 52bb150045..bf7b21eed2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.0" +version = "1.29.1" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From e9e9118b9bf6cbda678c70d6776a8f290833eff3 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Tue, 14 Jan 2025 12:53:58 +0900 Subject: [PATCH 11/15] docs: Add contact to MMTEB benchmarks (#1796) * Add myself to MMTEB benchmarks * lint --- mteb/benchmarks/benchmarks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 3478d48b25..0537c604f7 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -840,7 +840,7 @@ def load_results( description="The Multilingual benchmarks from MMTEB. 
Currently under development.", reference=None, citation=None, - contacts=["KennethEnevoldsen"], + contacts=["KennethEnevoldsen", "isaac-chung"], ) MTEB_JPN = Benchmark( @@ -952,7 +952,7 @@ def load_results( description="Main Indic benchmark from MMTEB", reference=None, citation=None, - contacts=["KennethEnevoldsen"], + contacts=["KennethEnevoldsen", "isaac-chung"], ) @@ -1084,7 +1084,7 @@ def load_results( description="Main European benchmark from MMTEB", reference=None, citation=None, - contacts=["KennethEnevoldsen"], + contacts=["KennethEnevoldsen", "isaac-chung"], ) LONG_EMBED = Benchmark( From 94103e6a2e8156678c3858045286cbd50b5d49c5 Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Tue, 14 Jan 2025 15:44:54 +0500 Subject: [PATCH 12/15] fix: loading pre 11 (#1798) * fix loading pre 11 * add similarity * lint * run all task types --- mteb/load_results/task_results.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/mteb/load_results/task_results.py b/mteb/load_results/task_results.py index e1b9b9d69d..72cae5a93d 100644 --- a/mteb/load_results/task_results.py +++ b/mteb/load_results/task_results.py @@ -387,15 +387,16 @@ def _convert_from_before_v1_11_0(cls, data: dict) -> TaskResult: main_score = task.metadata.main_score for split, split_score in scores.items(): for hf_subset, hf_subset_scores in split_score.items(): - if task.metadata.type == "STS": - for name, prev_name in [ - ("cosine", "cos_sim"), - ("manhattan", "manhattan"), - ("euclidean", "euclidean"), - ]: - prev_name_scores = hf_subset_scores.pop( - prev_name, {"spearman": "NaN"} - ) + for name, prev_name in [ + ("cosine", "cos_sim"), + ("manhattan", "manhattan"), + ("euclidean", "euclidean"), + ("dot", "dot"), + ("max", "max"), + ("similarity", "similarity"), + ]: + prev_name_scores = hf_subset_scores.pop(prev_name, None) + if prev_name_scores is not None: for k, v in prev_name_scores.items(): hf_subset_scores[f"{name}_{k}"] = v From 
b6fb5b8ca7285ec426e952dfbcb1805935f5cf12 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 14 Jan 2025 10:49:54 +0000 Subject: [PATCH 13/15] 1.29.2 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index bf7b21eed2..9f1e4deacb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.1" +version = "1.29.2" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From a2028840a6b4f77057761664edce8cae2edb64d1 Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Tue, 14 Jan 2025 17:46:26 +0500 Subject: [PATCH 14/15] fix: allow to load no revision available (#1801) * fix allow to load no revision available * lint * add require_model_meta to leaderboard * lint --- mteb/leaderboard/app.py | 4 +- mteb/load_results/load_results.py | 1 + pyproject.toml | 2 +- scripts/compare_leaderboard_results.py | 90 +++++++++++++++----------- 4 files changed, 57 insertions(+), 40 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index d1383cf1a7..e3c7d0aad2 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -24,7 +24,9 @@ def load_results(): results_cache_path = Path(__file__).parent.joinpath("__cached_results.json") if not results_cache_path.exists(): all_results = ( - mteb.load_results(only_main_score=True).join_revisions().filter_models() + mteb.load_results(only_main_score=True, require_model_meta=False) + .join_revisions() + .filter_models() ) all_results.to_disk(results_cache_path) return all_results diff --git a/mteb/load_results/load_results.py b/mteb/load_results/load_results.py index 03ec6fb308..ef851a1dc2 100644 --- a/mteb/load_results/load_results.py +++ b/mteb/load_results/load_results.py @@ -139,6 +139,7 @@ def load_results( continue model_name, revision = model_name_and_revision + model_name = 
model_name.replace("__", "/") if models_to_keep is not None and model_name not in models_to_keep: continue elif models_to_keep is not None and models_to_keep[model_name] is not None: diff --git a/pyproject.toml b/pyproject.toml index 9f1e4deacb..0f96f554d0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,7 +57,7 @@ dev = ["ruff==0.6.4", # locked so we don't get PRs which fail only due to a lint codecarbon = ["codecarbon"] speedtask = ["GPUtil>=1.4.0", "psutil>=5.9.8"] peft = ["peft>=0.11.0"] -leaderboard = ["gradio>=5.7.1", "gradio_rangeslider>=0.0.8"] +leaderboard = ["gradio>=5.7.1", "gradio_rangeslider>=0.0.8", "plotly>=5.24.0"] flagembedding = ["FlagEmbedding"] jina = ["einops>=0.8.0"] flash_attention = ["flash-attn>=2.6.3"] diff --git a/scripts/compare_leaderboard_results.py b/scripts/compare_leaderboard_results.py index bbeb912bb4..1fe9c3d766 100644 --- a/scripts/compare_leaderboard_results.py +++ b/scripts/compare_leaderboard_results.py @@ -2,70 +2,84 @@ import json import logging +from collections import defaultdict from pathlib import Path -from mteb import MTEB_ENG_CLASSIC, load_results +from mteb import get_benchmark, load_results logging.basicConfig(level=logging.INFO) models = [ - "dunzhang/stella_en_1.5B_v5", - "dunzhang/stella_en_400M_v5", + "intfloat/multilingual-e5-small", # Add other models here ] +benchmark = get_benchmark("MTEB(Chinese)") + +results = [] # in same folder as mteb repo # git clone https://github.com/embeddings-benchmark/leaderboard -data_tasks_path = Path("../../leaderboard/boards_data/en/data_tasks/") +# get path of current file +base_path = Path(__file__).parent.parent.parent / "leaderboard" / "boards_data" -results = [] for model_name_to_search in models: model_results = load_results( models=[model_name_to_search], - tasks=MTEB_ENG_CLASSIC.tasks, + tasks=benchmark.tasks, only_main_score=True, + require_model_meta=False, ) - cur_model = {} + cur_model = {task.metadata.name: defaultdict(dict) for task in benchmark.tasks} 
for model_res in model_results: for task_res in model_res.task_results: task_name = task_res.task.metadata.name - split = "test" if task_name != "MSMARCO" else "dev" - scores = [score["main_score"] for score in task_res.scores[split]] - # this tmp solution, because some tasks have multiple results - cur_model[task_name] = {"new": round((sum(scores) / len(scores)) * 100, 2)} - for task_dir in data_tasks_path.iterdir(): - if task_dir.is_dir(): - results_file_path = task_dir / "default.jsonl" - if results_file_path.exists(): - with open(results_file_path) as file: - for line in file: - data = json.loads(line) - model_name = data.get("Model", "") - if model_name_to_search in model_name: - for key, value in data.items(): - if key in [ - "index", - "Rank", - "Model", - "Model Size (Million Parameters)", - "Memory Usage (GB, fp32)", - "Embedding Dimensions", - "Max Tokens", - "Average", - ]: - continue - for benchmark_task in MTEB_ENG_CLASSIC.tasks: - if benchmark_task.metadata.name in key: - cur_model[benchmark_task.metadata.name][ - "old" - ] = value + split = ( + "test" + if "test" in task_res.task.metadata.eval_splits + else task_res.task.metadata.eval_splits[0] + ) + if split in task_res.scores: + scores = [score["main_score"] for score in task_res.scores[split]] + cur_model[task_name]["new"] = round( + (sum(scores) / len(scores)) * 100, 2 + ) + + for lang_path in base_path.iterdir(): + data_tasks_path = lang_path / "data_tasks" + + for task_dir in data_tasks_path.iterdir(): + if task_dir.is_dir(): + results_file_path = task_dir / "default.jsonl" + if results_file_path.exists(): + with open(results_file_path) as file: + for line in file: + data = json.loads(line) + model_name = data.get("Model", "") + if model_name_to_search in model_name: + for key, value in data.items(): + if key in [ + "index", + "Rank", + "Model", + "Model Size (Million Parameters)", + "Memory Usage (GB, fp32)", + "Embedding Dimensions", + "Max Tokens", + "Average", + ]: + continue + for 
benchmark_task in benchmark.tasks: + if benchmark_task.metadata.name in key: + cur_model[benchmark_task.metadata.name][ + "old" + ] = value sorted_cur_model = { task.metadata.name: cur_model[task.metadata.name] - for task in MTEB_ENG_CLASSIC.tasks + for task in benchmark.tasks if task.metadata.name in cur_model } results.append({"model": model_name_to_search, "results": sorted_cur_model}) From bcb2cd97c8afb80e11d636ea34689bd08f922b19 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 14 Jan 2025 13:03:43 +0000 Subject: [PATCH 15/15] 1.29.3 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0f96f554d0..441332dd73 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.2" +version = "1.29.3" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [