From a168496735329fbf9d3044b9bd2ae3e01046decc Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 17:06:40 +0100 Subject: [PATCH 01/17] fix: Leaderboard: `K` instead of `M` Fixes #1752 --- mteb/models/stella_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mteb/models/stella_models.py b/mteb/models/stella_models.py index 153ee6aa99..a738f4461e 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -21,7 +21,7 @@ use_instructions=True, revision="1bb50bc7bb726810eac2140e62155b88b0df198f", release_date="2024-07-12", - n_parameters=435_000, + n_parameters=435_000_000, max_tokens=8192, embed_dim=4096, license="mit", @@ -45,7 +45,7 @@ use_instructions=True, revision="d03be74b361d4eb24f42a2fe5bd2e29917df4604", release_date="2024-07-12", - n_parameters=1_540_000, + n_parameters=1_540_000_000, max_tokens=131072, embed_dim=8960, license="mit", From e61d7f2086c889ada85ab202d1672561e266675c Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 17:57:25 +0100 Subject: [PATCH 02/17] format --- mteb/leaderboard/app.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index ba336f8ea5..d1383cf1a7 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -5,7 +5,6 @@ import logging import tempfile import time -from collections import defaultdict from pathlib import Path from urllib.parse import urlencode @@ -17,7 +16,6 @@ from mteb.caching import json_cache from mteb.leaderboard.figures import performance_size_plot, radar_chart from mteb.leaderboard.table import scores_to_tables -from mteb.models.overview import get_model_meta logger = logging.getLogger(__name__) @@ -143,28 +141,28 @@ def update_task_info(task_names: str) -> gr.DataFrame: ) lang_select = gr.Dropdown( all_results.languages, - value=list(sorted(default_results.languages)), + value=sorted(default_results.languages), multiselect=True, label="Language", info="Select languages to include.", ) type_select = gr.Dropdown( all_results.task_types, - value=list(sorted(default_results.task_types)), + value=sorted(default_results.task_types), multiselect=True, label="Task Type", info="Select task types to include.", ) domain_select = gr.Dropdown( all_results.domains, - value=list(sorted(default_results.domains)), + value=sorted(default_results.domains), multiselect=True, label="Domain", info="Select domains to include.", ) task_select = gr.Dropdown( all_results.task_names, - value=list(sorted(default_results.task_names)), + value=sorted(default_results.task_names), allow_custom_value=True, multiselect=True, label="Task", @@ -330,16 +328,16 @@ def on_benchmark_select(benchmark_name): benchmark = mteb.get_benchmark(benchmark_name) languages = [task.languages for task in benchmark.tasks if task.languages] languages = set(itertools.chain.from_iterable(languages)) - languages = list(sorted(languages)) + languages = sorted(languages) domains = [ task.metadata.domains for task in benchmark.tasks if task.metadata.domains ] domains = set(itertools.chain.from_iterable(domains)) types = {task.metadata.type for task in benchmark.tasks if task.metadata.type} languages, domains, types = ( - list(sorted(languages)), - list(sorted(domains)), - list(sorted(types)), + sorted(languages), + sorted(domains), + sorted(types), ) elapsed = time.time() - start_time benchmark_results = all_benchmark_results[benchmark_name] From e1b89e30900eed9002687141122d0b9178cfef69 Mon Sep 17 00:00:00 
2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 17:57:57 +0100 Subject: [PATCH 03/17] fixed existing annotations to refer to task name instead of hf dataset --- mteb/models/misc_models.py | 73 +++++---- mteb/models/sentence_transformers_models.py | 166 ++++++++++++-------- 2 files changed, 140 insertions(+), 99 deletions(-) diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index d05461af17..d5734b448c 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -113,38 +113,47 @@ similarity_fn_name="cosine", use_instructions=None, training_datasets={ - "s2orc": ["train"], - "flax-sentence-embeddings/stackexchange_title_body_jsonl": ["train"], - "flax-sentence-embeddings/stackexchange_titlebody_best_voted_answer_jsonl": [ - "train" - ], - "flax-sentence-embeddings/stackexchange_title_best_voted_answer_jsonl": [ - "train" - ], - "flax-sentence-embeddings/stackexchange_titlebody_best_and_down_voted_answer_jsonl": [ - "train" - ], - "sentence-transformers/reddit-title-body": ["train"], - "msmarco": ["train"], - "gooaq": ["train"], - "yahoo_answers_topics": ["train"], - "code_search_net": ["train"], - "search_qa": ["train"], - "eli5": ["train"], - "snli": ["train"], - "multi_nli": ["train"], - "wikihow": ["train"], - "natural_questions": ["train"], - "trivia_qa": ["train"], - "embedding-data/sentence-compression": ["train"], - "embedding-data/flickr30k-captions": ["train"], - "embedding-data/altlex": ["train"], - "embedding-data/simple-wiki": ["train"], - "embedding-data/QQP": ["train"], - "embedding-data/SPECTER": ["train"], - "embedding-data/PAQ_pairs": ["train"], - "embedding-data/WikiAnswers": ["train"], - "sentence-transformers/embedding-training-data": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB + # "s2orc": ["train"], + # "flax-sentence-embeddings/stackexchange_title_body_jsonl": ["train"], + # "flax-sentence-embeddings/stackexchange_titlebody_best_voted_answer_jsonl": [ + # "train" + # ], + # "flax-sentence-embeddings/stackexchange_title_best_voted_answer_jsonl": [ + # "train" + # ], + # "flax-sentence-embeddings/stackexchange_titlebody_best_and_down_voted_answer_jsonl": [ + # "train" + # ], + # "sentence-transformers/reddit-title-body": ["train"], + # "msmarco": ["train"], + # "gooaq": ["train"], + # "yahoo_answers_topics": ["train"], + # "code_search_net": ["train"], + # "search_qa": ["train"], + # "eli5": ["train"], + # "snli": ["train"], + # "multi_nli": ["train"], + # "wikihow": ["train"], + # "natural_questions": ["train"], + # "trivia_qa": ["train"], + # "embedding-data/sentence-compression": ["train"], + # "embedding-data/flickr30k-captions": ["train"], + # "embedding-data/altlex": ["train"], + # "embedding-data/simple-wiki": ["train"], + # "embedding-data/QQP": ["train"], + # "embedding-data/SPECTER": ["train"], + # "embedding-data/PAQ_pairs": ["train"], + # "embedding-data/WikiAnswers": ["train"], + # "sentence-transformers/embedding-training-data": ["train"], }, adapted_from="hum-lodestone-v1", superseded_by=None, diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index ea02508c36..557f4f9a89 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -78,27 +78,36 @@ 
superseded_by=None, adapted_from=None, training_datasets={ - "s2orc": ["train"], - "flax-sentence-embeddings/stackexchange_xml": ["train"], - "ms_marco": ["train"], - "gooaq": ["train"], - "yahoo_answers_topics": ["train"], - "code_search_net": ["train"], - "search_qa": ["train"], - "eli5": ["train"], - "snli": ["train"], - "multi_nli": ["train"], - "wikihow": ["train"], - "natural_questions": ["train"], - "trivia_qa": ["train"], - "embedding-data/sentence-compression": ["train"], - "embedding-data/flickr30k-captions": ["train"], - "embedding-data/altlex": ["train"], - "embedding-data/simple-wiki": ["train"], - "embedding-data/QQP": ["train"], - "embedding-data/SPECTER": ["train"], - "embedding-data/PAQ_pairs": ["train"], - "embedding-data/WikiAnswers": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB + # "s2orc": ["train"], + # "flax-sentence-embeddings/stackexchange_xml": ["train"], + # "ms_marco": ["train"], + # "gooaq": ["train"], + # "yahoo_answers_topics": ["train"], + # "code_search_net": ["train"], + # "search_qa": ["train"], + # "eli5": ["train"], + # "snli": ["train"], + # "multi_nli": ["train"], + # "wikihow": ["train"], + # "natural_questions": ["train"], + # "trivia_qa": ["train"], + # "embedding-data/sentence-compression": ["train"], + # "embedding-data/flickr30k-captions": ["train"], + # "embedding-data/altlex": ["train"], + # "embedding-data/simple-wiki": ["train"], + # "embedding-data/QQP": ["train"], + # "embedding-data/SPECTER": ["train"], + # "embedding-data/PAQ_pairs": ["train"], + # "embedding-data/WikiAnswers": ["train"], }, ) @@ -196,27 +205,36 @@ superseded_by=None, adapted_from=None, training_datasets={ - "s2orc": ["train"], - "flax-sentence-embeddings/stackexchange_xml": ["train"], - "ms_marco": ["train"], - "gooaq": ["train"], - "yahoo_answers_topics": ["train"], - "code_search_net": ["train"], - "search_qa": ["train"], - "eli5": ["train"], - "snli": ["train"], - "multi_nli": ["train"], - "wikihow": ["train"], - "natural_questions": ["train"], - "trivia_qa": ["train"], - "embedding-data/sentence-compression": ["train"], - "embedding-data/flickr30k-captions": ["train"], - "embedding-data/altlex": ["train"], - "embedding-data/simple-wiki": ["train"], - "embedding-data/QQP": ["train"], - "embedding-data/SPECTER": ["train"], - "embedding-data/PAQ_pairs": ["train"], - "embedding-data/WikiAnswers": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB + # "s2orc": ["train"], + # "flax-sentence-embeddings/stackexchange_xml": ["train"], + # "ms_marco": ["train"], + # "gooaq": ["train"], + # "yahoo_answers_topics": ["train"], + # "code_search_net": ["train"], + # "search_qa": ["train"], + # "eli5": ["train"], + # "snli": ["train"], + # "multi_nli": ["train"], + # "wikihow": ["train"], + # "natural_questions": ["train"], + # "trivia_qa": ["train"], + # "embedding-data/sentence-compression": ["train"], + # "embedding-data/flickr30k-captions": ["train"], + # "embedding-data/altlex": ["train"], + # "embedding-data/simple-wiki": ["train"], + # 
"embedding-data/QQP": ["train"], + # "embedding-data/SPECTER": ["train"], + # "embedding-data/PAQ_pairs": ["train"], + # "embedding-data/WikiAnswers": ["train"], }, ) @@ -319,27 +337,36 @@ superseded_by=None, adapted_from=None, training_datasets={ - "s2orc": ["train"], - "flax-sentence-embeddings/stackexchange_xml": ["train"], - "ms_marco": ["train"], - "gooaq": ["train"], - "yahoo_answers_topics": ["train"], - "code_search_net": ["train"], - "search_qa": ["train"], - "eli5": ["train"], - "snli": ["train"], - "multi_nli": ["train"], - "wikihow": ["train"], - "natural_questions": ["train"], - "trivia_qa": ["train"], - "embedding-data/sentence-compression": ["train"], - "embedding-data/flickr30k-captions": ["train"], - "embedding-data/altlex": ["train"], - "embedding-data/simple-wiki": ["train"], - "embedding-data/QQP": ["train"], - "embedding-data/SPECTER": ["train"], - "embedding-data/PAQ_pairs": ["train"], - "embedding-data/WikiAnswers": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB + # "s2orc": ["train"], + # "flax-sentence-embeddings/stackexchange_xml": ["train"], + # "ms_marco": ["train"], + # "gooaq": ["train"], + # "yahoo_answers_topics": ["train"], + # "code_search_net": ["train"], + # "search_qa": ["train"], + # "eli5": ["train"], + # "snli": ["train"], + # "multi_nli": ["train"], + # "wikihow": ["train"], + # "natural_questions": ["train"], + # "trivia_qa": ["train"], + # "embedding-data/sentence-compression": ["train"], + # "embedding-data/flickr30k-captions": ["train"], + # "embedding-data/altlex": ["train"], + # "embedding-data/simple-wiki": ["train"], + # "embedding-data/QQP": ["train"], + # "embedding-data/SPECTER": ["train"], + # "embedding-data/PAQ_pairs": ["train"], + # "embedding-data/WikiAnswers": ["train"], }, ) @@ -361,9 +388,14 @@ superseded_by=None, adapted_from=None, training_datasets={ - "sentence-transformers/all-nli": ["train"], - "sentence-transformers/stsb": ["train"], - "sentence-transformers/quora-duplicates": ["train"], - "sentence-transformers/natural-questions": ["train"], + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB + # "sentence-transformers/all-nli": ["train"], + # "sentence-transformers/stsb": ["train"], + # "sentence-transformers/quora-duplicates": ["train"], + # "sentence-transformers/natural-questions": ["train"], }, ) From 9ffeae46885944cde69356672d2eb184afaeb491 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 17:58:05 +0100 Subject: [PATCH 04/17] added annotation to nvidia --- mteb/models/nvidia_models.py | 54 ++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/mteb/models/nvidia_models.py b/mteb/models/nvidia_models.py index 72274b41de..2af1be27ce 100644 --- a/mteb/models/nvidia_models.py +++ b/mteb/models/nvidia_models.py @@ -72,6 +72,54 @@ def encode( return embeddings +training_datasets = { + # source: https://arxiv.org/pdf/2405.17428 + "ArguAna": ["train"], + "ArguAna-PL": ["train"], + "NanoArguAnaRetrieval": ["train"], + "HotpotQA": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + 
"NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "NanoFEVERRetrieval": ["train"], + "FiQA2018": ["train"], + "FiQA2018-PL": ["train"], # translation not trained on + "STS12": ["train"], + "STS22": ["train"], + "AmazonReviewsClassification": ["train"], + "AmazonCounterfactualClassification": ["train"], + "Banking77Classification": ["train"], + "EmotionClassification": ["train"], + "ImdbClassification": ["train"], + "MTOPIntentClassification": ["train"], + "ToxicConversationsClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "ArxivClusteringP2P": ["train"], + "ArxivClusteringP2P.v2": ["train"], + "ArxivClusteringS2S": ["train"], + "ArxivClusteringS2S.v2": ["train"], + "BiorxivClusteringP2P": ["train"], + "BiorxivClusteringP2P.v2": ["train"], + "BiorxivClusteringS2S": ["train"], + "BiorxivClusteringS2S.v2": ["train"], + "MedrxivClusteringP2P": ["train"], + "MedrxivClusteringP2P.v2": ["train"], + "MedrxivClusteringS2S": ["train"], + "MedrxivClusteringS2S.v2": ["train"], + "TwentyNewsgroupsClustering": ["train"], + "TwentyNewsgroupsClustering.v2": ["train"], + "STSBenchmark": ["train"], + "STSBenchmarkMultilingualSTS": ["train"], # translated, not trained on +} NV_embed_v2 = ModelMeta( loader=partial( # type: ignore NvEmbedWrapper, @@ -92,6 +140,9 @@ def encode( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + training_datasets=training_datasets, + public_training_code=None, + public_training_data=True, ) NV_embed_v1 = ModelMeta( @@ -114,4 +165,7 @@ def encode( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + training_datasets=training_datasets, + public_training_code=None, + public_training_data=True, ) From 0495d323d5eb133174935d766a3fbac81e798799 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 18:00:50 +0100 Subject: [PATCH 05/17] added voyage --- mteb/models/voyage_models.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index 70f61e2c52..12925b235b 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -157,6 +157,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_finance_2 = ModelMeta( @@ -179,6 +182,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_law_2 = ModelMeta( @@ -201,6 +207,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_code_2 = ModelMeta( @@ -223,6 +232,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_large_2 = ModelMeta( @@ -245,6 +257,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + 
training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_2 = ModelMeta( @@ -267,6 +282,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, + public_training_code=False, ) voyage_multilingual_2 = ModelMeta( name="voyageai/voyage-multilingual-2", @@ -288,6 +306,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_3 = ModelMeta( @@ -310,6 +331,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_3_lite = ModelMeta( @@ -332,4 +356,7 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) From 5f7ef656a41d2cac755f52f2c66a93e8a8593525 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 18:05:09 +0100 Subject: [PATCH 06/17] added uae annotations --- mteb/models/uae_models.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mteb/models/uae_models.py b/mteb/models/uae_models.py index 5c47cba67d..ffdaa29f74 100644 --- a/mteb/models/uae_models.py +++ b/mteb/models/uae_models.py @@ -75,4 +75,13 @@ def encode( framework=["Sentence Transformers", "PyTorch"], reference="https://huggingface.co/WhereIsAI/UAE-Large-V1", use_instructions=True, + training_datasets={ + # source: https://arxiv.org/pdf/2309.12871 + # not in MTEB + "MNLI": [], + "NLI": [], + "SNLI": [], + }, + public_training_data=True, + public_training_code=True, ) From ac480127b8fef3464113a351cb63397a7383237c Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 18:07:13 +0100 Subject: [PATCH 07/17] Added stella annotations --- mteb/models/sentence_transformers_models.py | 3 +++ mteb/models/stella_models.py | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index 557f4f9a89..05ce11d8a7 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -128,6 +128,9 @@ use_instructions=False, superseded_by=None, adapted_from=None, + training_datasets=None, + public_training_data=False, # currently not release + public_training_code=False, ) paraphrase_multilingual_mpnet_base_v2 = ModelMeta( diff --git a/mteb/models/stella_models.py b/mteb/models/stella_models.py index a738f4461e..8709196319 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -28,6 +28,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_400M_v5", + training_datasets=None, + public_training_data=False, # currently not released + public_training_code=False, ) stella_en_1_5b = ModelMeta( @@ -52,4 +55,7 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_1.5B_v5", + training_datasets=None, + public_training_data=False, # currently not released + public_training_code=False, ) From c1c7eb6fe95f25d67654b938b06707e60b829ae8 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 
18:22:09 +0100 Subject: [PATCH 08/17] sentence trf models --- mteb/models/sentence_transformers_models.py | 204 ++++++++------------ 1 file changed, 82 insertions(+), 122 deletions(-) diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index 05ce11d8a7..7878d6ac6c 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -60,6 +60,40 @@ "zho_Hant", ] +sent_trf_training_dataset = { + # derived from datasheets + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB + # "s2orc": ["train"], + # "flax-sentence-embeddings/stackexchange_xml": ["train"], + # "ms_marco": ["train"], + # "gooaq": ["train"], + # "yahoo_answers_topics": ["train"], + # "code_search_net": ["train"], + # "search_qa": ["train"], + # "eli5": ["train"], + # "snli": ["train"], + # "multi_nli": ["train"], + # "wikihow": ["train"], + # "natural_questions": ["train"], + # "trivia_qa": ["train"], + # "embedding-data/sentence-compression": ["train"], + # "embedding-data/flickr30k-captions": ["train"], + # "embedding-data/altlex": ["train"], + # "embedding-data/simple-wiki": ["train"], + # "embedding-data/QQP": ["train"], + # "embedding-data/SPECTER": ["train"], + # "embedding-data/PAQ_pairs": ["train"], + # "embedding-data/WikiAnswers": ["train"], +} + all_MiniLM_L6_v2 = ModelMeta( name="sentence-transformers/all-MiniLM-L6-v2", languages=["eng-Latn"], @@ -77,38 +111,29 @@ use_instructions=False, superseded_by=None, adapted_from=None, - training_datasets={ - "MSMARCO": ["train"], - "MSMARCOHardNegatives": ["train"], - "NanoMSMARCORetrieval": ["train"], - "MSMARCO-PL": ["train"], # translation not trained on - "NQ": ["train"], - "NQHardNegatives": ["train"], - "NanoNQRetrieval": ["train"], - "NQ-PL": ["train"], # translation not trained on - # not in MTEB - # "s2orc": ["train"], - # "flax-sentence-embeddings/stackexchange_xml": ["train"], - # "ms_marco": ["train"], - # "gooaq": ["train"], - # "yahoo_answers_topics": ["train"], - # "code_search_net": ["train"], - # "search_qa": ["train"], - # "eli5": ["train"], - # "snli": ["train"], - # "multi_nli": ["train"], - # "wikihow": ["train"], - # "natural_questions": ["train"], - # "trivia_qa": ["train"], - # "embedding-data/sentence-compression": ["train"], - # "embedding-data/flickr30k-captions": ["train"], - # "embedding-data/altlex": ["train"], - # "embedding-data/simple-wiki": ["train"], - # "embedding-data/QQP": ["train"], - # "embedding-data/SPECTER": ["train"], - # "embedding-data/PAQ_pairs": ["train"], - # "embedding-data/WikiAnswers": ["train"], - }, + training_datasets=sent_trf_training_dataset, +) + +all_MiniLM_L12_v2 = ModelMeta( + name="sentence-transformers/all-MiniLM-L12-v2", + languages=["eng-Latn"], + open_weights=True, + revision="364dd28d28dcd3359b537f3cf1f5348ba679da62", + release_date="2021-08-30", + n_parameters=33_400_000, + memory_usage=None, + embed_dim=384, + license="apache-2.0", + max_tokens=256, + reference="https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + training_datasets=sent_trf_training_dataset, + public_training_code=False, + 
public_training_data=True, ) paraphrase_multilingual_MiniLM_L12_v2 = ModelMeta( @@ -128,9 +153,9 @@ use_instructions=False, superseded_by=None, adapted_from=None, - training_datasets=None, - public_training_data=False, # currently not release + training_datasets=sent_trf_training_dataset, # assumed (probably some parallel as well) public_training_code=False, + public_training_data=True, ) paraphrase_multilingual_mpnet_base_v2 = ModelMeta( @@ -150,6 +175,9 @@ use_instructions=False, superseded_by=None, adapted_from=None, + training_datasets=sent_trf_training_dataset, # assumed (probably some parallel as well) + public_training_code=False, + public_training_data=True, ) labse = ModelMeta( @@ -169,6 +197,9 @@ use_instructions=False, superseded_by=None, adapted_from=None, + training_datasets=None, # scraped and mined webdata including CC, wiki, see section 3.1 https://aclanthology.org/2022.acl-long.62.pdf + public_training_code=True, # https://www.kaggle.com/models/google/labse/tensorFlow2/labse/2?tfhub-redirect=true + public_training_data=False, ) multi_qa_MiniLM_L6_cos_v1 = ModelMeta( @@ -187,7 +218,10 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, superseded_by=None, - adapted_from=None, + adapted_from="nreimers/MiniLM-L6-H384-uncased", + training_datasets=sent_trf_training_dataset, # assumed + public_training_code=None, + public_training_data=None, ) all_mpnet_base_v2 = ModelMeta( @@ -207,38 +241,7 @@ use_instructions=False, superseded_by=None, adapted_from=None, - training_datasets={ - "MSMARCO": ["train"], - "MSMARCOHardNegatives": ["train"], - "NanoMSMARCORetrieval": ["train"], - "MSMARCO-PL": ["train"], # translation not trained on - "NQ": ["train"], - "NQHardNegatives": ["train"], - "NanoNQRetrieval": ["train"], - "NQ-PL": ["train"], # translation not trained on - # not in MTEB - # "s2orc": ["train"], - # "flax-sentence-embeddings/stackexchange_xml": ["train"], - # "ms_marco": ["train"], - # "gooaq": ["train"], - # "yahoo_answers_topics": ["train"], - # "code_search_net": ["train"], - # "search_qa": ["train"], - # "eli5": ["train"], - # "snli": ["train"], - # "multi_nli": ["train"], - # "wikihow": ["train"], - # "natural_questions": ["train"], - # "trivia_qa": ["train"], - # "embedding-data/sentence-compression": ["train"], - # "embedding-data/flickr30k-captions": ["train"], - # "embedding-data/altlex": ["train"], - # "embedding-data/simple-wiki": ["train"], - # "embedding-data/QQP": ["train"], - # "embedding-data/SPECTER": ["train"], - # "embedding-data/PAQ_pairs": ["train"], - # "embedding-data/WikiAnswers": ["train"], - }, + training_datasets=sent_trf_training_dataset, ) jina_embeddings_v2_base_en = ModelMeta( @@ -258,7 +261,9 @@ use_instructions=False, superseded_by=None, adapted_from=None, - training_datasets={"allenai/c4": ["train"]}, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. CC ) jina_embeddings_v2_small_en = ModelMeta( @@ -278,7 +283,9 @@ use_instructions=False, superseded_by=None, adapted_from=None, - training_datasets={"jinaai/negation-dataset": ["train"]}, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. 
CC and {"jinaai/negation-dataset": ["train"]} ) jina_embedding_b_en_v1 = ModelMeta( @@ -298,7 +305,9 @@ use_instructions=False, superseded_by="jinaai/jina-embeddings-v2-base-en", adapted_from=None, - training_datasets={"jinaai/negation-dataset": ["train"]}, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. CC and {"jinaai/negation-dataset": ["train"]} ) jina_embedding_s_en_v1 = ModelMeta( @@ -318,61 +327,12 @@ use_instructions=False, superseded_by="jinaai/jina-embeddings-v2-small-en", adapted_from=None, - training_datasets={"jinaai/negation-dataset": ["train"]}, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. CC and {"jinaai/negation-dataset": ["train"]} ) -all_MiniLM_L12_v2 = ModelMeta( - name="sentence-transformers/all-MiniLM-L12-v2", - languages=["eng-Latn"], - open_weights=True, - revision="364dd28d28dcd3359b537f3cf1f5348ba679da62", - release_date="2021-08-30", - n_parameters=33_400_000, - memory_usage=None, - embed_dim=384, - license="apache-2.0", - max_tokens=256, - reference="https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from=None, - training_datasets={ - "MSMARCO": ["train"], - "MSMARCOHardNegatives": ["train"], - "NanoMSMARCORetrieval": ["train"], - "MSMARCO-PL": ["train"], # translation not trained on - "NQ": ["train"], - "NQHardNegatives": ["train"], - "NanoNQRetrieval": ["train"], - "NQ-PL": ["train"], # translation not trained on - # not in MTEB - # "s2orc": ["train"], - # "flax-sentence-embeddings/stackexchange_xml": ["train"], - # "ms_marco": ["train"], - # "gooaq": ["train"], - # "yahoo_answers_topics": ["train"], - # "code_search_net": ["train"], - # "search_qa": ["train"], - # "eli5": ["train"], - # "snli": ["train"], - # "multi_nli": ["train"], - # "wikihow": ["train"], - # "natural_questions": ["train"], - # "trivia_qa": ["train"], - # "embedding-data/sentence-compression": ["train"], - # "embedding-data/flickr30k-captions": ["train"], - # "embedding-data/altlex": ["train"], - # "embedding-data/simple-wiki": ["train"], - # "embedding-data/QQP": ["train"], - # "embedding-data/SPECTER": ["train"], - # "embedding-data/PAQ_pairs": ["train"], - # "embedding-data/WikiAnswers": ["train"], - }, -) - microllama_text_embedding = ModelMeta( name="keeeeenw/MicroLlama-text-embedding", languages=["eng-Latn"], From 4ec91216393ac9fafceffdeae15f4dbcf9b0d807 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 18:28:18 +0100 Subject: [PATCH 09/17] added salesforce and e5 --- mteb/models/e5_instruct.py | 8 ++++++- mteb/models/e5_models.py | 40 ++++++++++++++++++++++++++++++++ mteb/models/salesforce_models.py | 25 ++++++++++++++++++++ 3 files changed, 72 insertions(+), 1 deletion(-) diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py index f26d78ed6d..182a6ea4b2 100644 --- a/mteb/models/e5_instruct.py +++ b/mteb/models/e5_instruct.py @@ -6,7 +6,7 @@ from mteb.model_meta import ModelMeta -from .e5_models import E5_PAPER_RELEASE_DATE, XLMR_LANGUAGES +from .e5_models import E5_PAPER_RELEASE_DATE, E5_TRAINING_DATA, XLMR_LANGUAGES from .instruct_wrapper import instruct_wrapper MISTRAL_LANGUAGES = ["eng_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "spa_Latn"] @@ -40,6 +40,9 @@ embed_dim=1024, license="mit", max_tokens=514, + public_training_data=False, + public_training_code=False, + 
training_datasets=E5_TRAINING_DATA, ) e5_mistral = ModelMeta( @@ -69,4 +72,7 @@ embed_dim=4096, license="mit", max_tokens=32768, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py index 4fee54de79..9537824e59 100644 --- a/mteb/models/e5_models.py +++ b/mteb/models/e5_models.py @@ -113,6 +113,19 @@ PromptType.passage.value: "passage: ", } +E5_TRAINING_DATA = { + # from 4.2 in https://arxiv.org/pdf/2212.03533 + # also pre-training data from a variety of sources (stackexchange, semantic scholar, reddit, CC, ...) + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on +} + e5_mult_small = ModelMeta( loader=partial( # type: ignore sentence_transformers_loader, @@ -134,6 +147,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_mult_base = ModelMeta( @@ -156,6 +172,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_mult_large = ModelMeta( @@ -179,6 +198,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_eng_small_v2 = ModelMeta( @@ -201,6 +223,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_eng_small = ModelMeta( @@ -224,6 +249,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_eng_base_v2 = ModelMeta( @@ -249,6 +277,9 @@ use_instructions=True, superseded_by=None, adapted_from=None, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_eng_large_v2 = ModelMeta( @@ -274,6 +305,9 @@ use_instructions=True, superseded_by=None, adapted_from=None, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_large = ModelMeta( @@ -299,6 +333,9 @@ use_instructions=True, superseded_by="intfloat/e5-large-v2", adapted_from=None, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_base = ModelMeta( @@ -324,4 +361,7 @@ use_instructions=True, superseded_by="intfloat/e5-base-v2", adapted_from=None, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) diff --git a/mteb/models/salesforce_models.py b/mteb/models/salesforce_models.py index b1d45b949c..18db09a2b5 100644 --- a/mteb/models/salesforce_models.py +++ b/mteb/models/salesforce_models.py @@ -40,6 +40,19 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + adapted_from="intfloat/e5-mistral-7b-instruct", + public_training_code=False, + public_training_data=False, + training_datasets={ # 
inherits from e5 + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + }, ) @@ -68,4 +81,16 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_code=False, + public_training_data=False, + training_datasets={ # inherits from e5 + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + }, ) From c54859d72454dbe41a74d920ad09193337cfa258 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 18:29:30 +0100 Subject: [PATCH 10/17] jina --- mteb/models/jina_models.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py index 122f190657..f1a05d210d 100644 --- a/mteb/models/jina_models.py +++ b/mteb/models/jina_models.py @@ -214,7 +214,7 @@ def encode( open_weights=True, revision="215a6e121fa0183376388ac6b1ae230326bfeaed", release_date="2024-09-18", # official release date - n_parameters=572 * 1e6, + n_parameters=int(572 * 1e6), max_tokens=8194, embed_dim=4096, license="cc-by-nc-4.0", @@ -222,4 +222,7 @@ def encode( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, reference="https://huggingface.co/jinaai/jina-embeddings-v3", + training_datasets=None, + public_training_code=False, + public_training_data=False, ) From d7f5684ac29c86877942c529ca70cdd5237baa48 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 18:36:42 +0100 Subject: [PATCH 11/17] bge + model2vec --- mteb/models/bge_models.py | 9 +++++++++ mteb/models/model2vec_models.py | 23 ++++++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index cc183374c6..734f71262d 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -27,6 +27,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + training_datasets=None, # https://github.com/staoxiao/RetroMAE, includes wikipedia and bookcorpus and contrastive pairs (unknown) + public_training_code=False, + public_training_data=False, ) bge_base_en_v1_5 = ModelMeta( @@ -50,6 +53,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + training_datasets=None, # https://github.com/staoxiao/RetroMAE, includes wikipedia and bookcorpus and contrastive pairs (unknown) + public_training_code=False, + public_training_data=False, ) bge_large_en_v1_5 = ModelMeta( @@ -73,4 +79,7 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + training_datasets=None, # https://github.com/staoxiao/RetroMAE, includes wikipedia and bookcorpus and contrastive pairs (unknown) + public_training_code=False, + public_training_data=False, ) diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index 1541d3ca3d..55968f104c 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -63,7 +63,7 @@ def encode( open_weights=True, 
revision="5f4f5ca159b7321a8b39739bba0794fa0debddf4", release_date="2024-09-21", - n_parameters=103 * 1e6, + n_parameters=int(103 * 1e6), max_tokens=np.inf, # Theoretically infinite embed_dim=256, license="mit", @@ -73,6 +73,9 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, + training_datasets=None, # source is unkown + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) @@ -96,6 +99,9 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, + training_datasets=None, # source is unkown + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) m2v_base_output = ModelMeta( @@ -118,6 +124,9 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, + training_datasets=None, # source is unkown + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) m2v_multilingual_output = ModelMeta( @@ -140,6 +149,9 @@ def encode( use_instructions=False, adapted_from="sentence-transformers/LaBSE", superseded_by=None, + training_datasets=None, # source is unkown + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) potion_base_2m = ModelMeta( @@ -162,6 +174,9 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, + training_datasets=None, # source is unkown + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) potion_base_4m = ModelMeta( @@ -184,6 +199,9 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, + training_datasets=None, # source is unkown + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) potion_base_8m = ModelMeta( @@ -206,4 +224,7 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, + training_datasets=None, # source is unkown + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) From 9ea60ff5c1718592b1e364264ffe59f7ba8f2c6f Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 18:45:52 +0100 Subject: [PATCH 12/17] added llm2vec annotations --- mteb/models/llm2vec_models.py | 49 +++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/mteb/models/llm2vec_models.py b/mteb/models/llm2vec_models.py index e962289aac..cbc42fe5ed 100644 --- a/mteb/models/llm2vec_models.py +++ b/mteb/models/llm2vec_models.py @@ -20,6 +20,31 @@ def llm2vec_instruction(instruction): return instruction +llm2vec_supervised_training_data = { + # source, section g1: https://arxiv.org/pdf/2404.05961 + # splits assumed but unkown + "HotpotQA": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "NanoFEVERRetrieval": ["train"], + "MrTidyRetrieval": ["train"], + "T2Reranking": ["train"], +} + + class 
LLM2VecWrapper(Wrapper): def __init__( self, @@ -100,6 +125,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets=llm2vec_supervised_training_data, ) llm2vec_llama3_8b_unsupervised = ModelMeta( @@ -124,6 +152,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets={}, ) @@ -149,6 +180,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets=llm2vec_supervised_training_data, ) llm2vec_mistral7b_unsupervised = ModelMeta( @@ -173,6 +207,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets={}, ) llm2vec_llama2_7b_supervised = ModelMeta( @@ -197,6 +234,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets=llm2vec_supervised_training_data, ) llm2vec_llama2_7b_unsupervised = ModelMeta( @@ -221,6 +261,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets={}, ) llm2vec_sheared_llama_supervised = ModelMeta( @@ -245,6 +288,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets=llm2vec_supervised_training_data, ) llm2vec_sheared_llama_unsupervised = ModelMeta( @@ -269,4 +315,7 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets={}, ) From b123d9203047fb043e46b5ac14f2bc43438ccce6 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 18:47:59 +0100 Subject: [PATCH 13/17] add jasper --- mteb/models/jasper_models.py | 8 +++++--- mteb/models/nvidia_models.py | 6 +++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/mteb/models/jasper_models.py b/mteb/models/jasper_models.py index 60fa4f6975..bb9533c155 100644 --- a/mteb/models/jasper_models.py +++ b/mteb/models/jasper_models.py @@ -16,6 +16,7 @@ from .wrapper import Wrapper logger = logging.getLogger(__name__) +from .nvidia_models import nvidia_training_datasets class JasperWrapper(Wrapper): @@ -90,7 +91,8 @@ def encode( use_instructions=True, adapted_from=None, superseded_by=None, - training_datasets={ - "non_mteb": ["BAAI/Infinity-MM", "HuggingFaceFW/fineweb-edu"], - }, + training_datasets=nvidia_training_datasets, # "In jasper model the teacher model is nvidia/NV-Embed-v2", source https://huggingface.co/infgrad/jasper_en_vision_language_v1 + # "non_mteb": ["BAAI/Infinity-MM", "HuggingFaceFW/fineweb-edu"], + public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/nvidia_models.py b/mteb/models/nvidia_models.py index 2af1be27ce..6bf4e041aa 100644 --- 
a/mteb/models/nvidia_models.py +++ b/mteb/models/nvidia_models.py @@ -72,7 +72,7 @@ def encode( return embeddings -training_datasets = { +nvidia_training_datasets = { # source: https://arxiv.org/pdf/2405.17428 "ArguAna": ["train"], "ArguAna-PL": ["train"], @@ -140,7 +140,7 @@ def encode( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - training_datasets=training_datasets, + training_datasets=nvidia_training_datasets, public_training_code=None, public_training_data=True, ) @@ -165,7 +165,7 @@ def encode( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - training_datasets=training_datasets, + training_datasets=nvidia_training_datasets, public_training_code=None, public_training_data=True, ) From aa728d1ab858003b5430cdf2de4a6f3e1ea95b7a Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 18:49:02 +0100 Subject: [PATCH 14/17] format --- mteb/models/jasper_models.py | 2 +- mteb/models/stella_models.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mteb/models/jasper_models.py b/mteb/models/jasper_models.py index bb9533c155..0062df2acc 100644 --- a/mteb/models/jasper_models.py +++ b/mteb/models/jasper_models.py @@ -13,10 +13,10 @@ from mteb.encoder_interface import PromptType from mteb.model_meta import ModelMeta +from .nvidia_models import nvidia_training_datasets from .wrapper import Wrapper logger = logging.getLogger(__name__) -from .nvidia_models import nvidia_training_datasets class JasperWrapper(Wrapper): diff --git a/mteb/models/stella_models.py b/mteb/models/stella_models.py index 8709196319..c7a1a0f347 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -28,7 +28,7 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_400M_v5", - training_datasets=None, + training_datasets=None, public_training_data=False, # currently not released public_training_code=False, ) @@ -55,7 +55,7 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_1.5B_v5", - training_datasets=None, + training_datasets=None, public_training_data=False, # currently not released public_training_code=False, ) From 121bf0ea25e0f92b29c17a4a1ec1996698ca16d8 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sun, 12 Jan 2025 21:10:30 +0100 Subject: [PATCH 15/17] format --- mteb/models/model2vec_models.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index 8f1621fcb2..8430b8aa58 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -9,11 +9,11 @@ from mteb.model_meta import ModelMeta +from .bge_models import bge_training_data from .wrapper import Wrapper logger = logging.getLogger(__name__) -from .bge_models import bge_training_data class Model2VecWrapper(Wrapper): def __init__( @@ -74,7 +74,7 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - training_datasets=bge_training_data, # distilled + training_datasets=bge_training_data, # distilled public_training_code=True, # https://github.com/MinishLab/model2vec public_training_data=False, ) @@ -100,7 +100,7 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - training_datasets=bge_training_data, # 
distilled + training_datasets=bge_training_data, # distilled public_training_code=True, # https://github.com/MinishLab/model2vec public_training_data=False, ) @@ -125,7 +125,7 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - training_datasets=bge_training_data, # distilled + training_datasets=bge_training_data, # distilled public_training_code=True, # https://github.com/MinishLab/model2vec public_training_data=False, ) @@ -150,7 +150,7 @@ def encode( use_instructions=False, adapted_from="sentence-transformers/LaBSE", superseded_by=None, - training_datasets=bge_training_data, # distilled + training_datasets=bge_training_data, # distilled public_training_code=True, # https://github.com/MinishLab/model2vec public_training_data=False, ) @@ -175,7 +175,7 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - training_datasets=bge_training_data, # distilled + training_datasets=bge_training_data, # distilled public_training_code=True, # https://github.com/MinishLab/model2vec public_training_data=False, ) @@ -200,7 +200,7 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - training_datasets=bge_training_data, # distilled + training_datasets=bge_training_data, # distilled public_training_code=True, # https://github.com/MinishLab/model2vec public_training_data=False, ) @@ -225,7 +225,7 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - training_datasets=bge_training_data, # distilled + training_datasets=bge_training_data, # distilled public_training_code=True, # https://github.com/MinishLab/model2vec public_training_data=False, ) From b2b9ccaa9889593de67c833f019c2bdddf0f5cdf Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Mon, 13 Jan 2025 15:03:53 +0100 Subject: [PATCH 16/17] Updated annotations and moved jina models --- mteb/models/jina_models.py | 89 ++++++++++++++++ mteb/models/model2vec_models.py | 2 +- mteb/models/sentence_transformers_models.py | 111 ++++---------------- 3 files changed, 109 insertions(+), 93 deletions(-) diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py index f1a05d210d..728ffaa98f 100644 --- a/mteb/models/jina_models.py +++ b/mteb/models/jina_models.py @@ -226,3 +226,92 @@ def encode( public_training_code=False, public_training_data=False, ) + + +jina_embeddings_v2_base_en = ModelMeta( + name="jinaai/jina-embeddings-v2-base-en", + languages=["eng-Latn"], + open_weights=True, + revision="6e85f575bc273f1fd840a658067d0157933c83f0", + release_date="2023-09-27", + n_parameters=137_000_000, + memory_usage=None, + embed_dim=768, + license="apache-2.0", + max_tokens=8192, + reference="https://huggingface.co/jinaai/jina-embeddings-v2-base-en", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. 
CC +) + +jina_embeddings_v2_small_en = ModelMeta( + name="jinaai/jina-embeddings-v2-small-en", + languages=["eng-Latn"], + open_weights=True, + revision="796cff318cdd4e5fbe8b7303a1ef8cbec36996ef", + release_date="2023-09-27", + n_parameters=32_700_000, + memory_usage=None, + embed_dim=512, + license="apache-2.0", + max_tokens=8192, + reference="https://huggingface.co/jinaai/jina-embeddings-v2-small-en", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. CC and {"jinaai/negation-dataset": ["train"]} +) + +jina_embedding_b_en_v1 = ModelMeta( + name="jinaai/jina-embedding-b-en-v1", + languages=["eng-Latn"], + open_weights=True, + revision="aa0645035294a8c0607ce5bb700aba982cdff32c", + release_date="2023-07-07", + n_parameters=110_000_000, + memory_usage=None, + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/jinaai/jina-embedding-b-en-v1", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by="jinaai/jina-embeddings-v2-base-en", + adapted_from=None, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. CC and {"jinaai/negation-dataset": ["train"]} +) + +jina_embedding_s_en_v1 = ModelMeta( + name="jinaai/jina-embedding-s-en-v1", + languages=["eng-Latn"], + open_weights=True, + revision="c1fed70aa4823a640f1a7150a276e4d3b08dce08", + release_date="2023-07-07", + n_parameters=35_000_000, + memory_usage=None, + embed_dim=512, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/jinaai/jina-embedding-s-en-v1", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by="jinaai/jina-embeddings-v2-small-en", + adapted_from=None, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. 
CC and {"jinaai/negation-dataset": ["train"]} +) diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index 8430b8aa58..1a58bbf8e3 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -150,7 +150,7 @@ def encode( use_instructions=False, adapted_from="sentence-transformers/LaBSE", superseded_by=None, - training_datasets=bge_training_data, # distilled + training_datasets=None, public_training_code=True, # https://github.com/MinishLab/model2vec public_training_data=False, ) diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index b4cd80f9c5..f8b01c6eaf 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -112,6 +112,8 @@ superseded_by=None, adapted_from=None, training_datasets=sent_trf_training_dataset, + public_training_code=True, + public_training_data=True, ) all_MiniLM_L12_v2 = ModelMeta( @@ -132,7 +134,7 @@ superseded_by=None, adapted_from=None, training_datasets=sent_trf_training_dataset, - public_training_code=False, + public_training_code=True, public_training_data=True, ) @@ -154,7 +156,7 @@ superseded_by=None, adapted_from=None, training_datasets=sent_trf_training_dataset, # assumed (probably some parallel as well) - public_training_code=False, + public_training_code=True, public_training_data=True, ) @@ -175,8 +177,19 @@ use_instructions=False, superseded_by=None, adapted_from=None, - training_datasets=sent_trf_training_dataset, # assumed (probably some parallel as well) - public_training_code=False, + training_datasets=sent_trf_training_dataset, + # + https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/paraphrases/training.py + # which include (not in MTEB): + # "all-nli": all_nli_train_dataset, + # "sentence-compression": sentence_compression_train_dataset, + # "simple-wiki": simple_wiki_train_dataset, + # "altlex": altlex_train_dataset, + # "quora-duplicates": quora_train_dataset, + # "coco-captions": coco_train_dataset, + # "flickr30k-captions": flickr_train_dataset, + # "yahoo-answers": yahoo_answers_train_dataset, + # "stack-exchange": stack_exchange_train_dataset, + public_training_code=True, public_training_data=True, ) @@ -242,94 +255,8 @@ superseded_by=None, adapted_from=None, training_datasets=sent_trf_training_dataset, -) - -jina_embeddings_v2_base_en = ModelMeta( - name="jinaai/jina-embeddings-v2-base-en", - languages=["eng-Latn"], - open_weights=True, - revision="6e85f575bc273f1fd840a658067d0157933c83f0", - release_date="2023-09-27", - n_parameters=137_000_000, - memory_usage=None, - embed_dim=768, - license="apache-2.0", - max_tokens=8192, - reference="https://huggingface.co/jinaai/jina-embeddings-v2-base-en", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from=None, - training_datasets=None, - public_training_code=False, - public_training_data=False, # uses scrapes e.g. 
CC -) - -jina_embeddings_v2_small_en = ModelMeta( - name="jinaai/jina-embeddings-v2-small-en", - languages=["eng-Latn"], - open_weights=True, - revision="796cff318cdd4e5fbe8b7303a1ef8cbec36996ef", - release_date="2023-09-27", - n_parameters=32_700_000, - memory_usage=None, - embed_dim=512, - license="apache-2.0", - max_tokens=8192, - reference="https://huggingface.co/jinaai/jina-embeddings-v2-small-en", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from=None, - training_datasets=None, - public_training_code=False, - public_training_data=False, # uses scrapes e.g. CC and {"jinaai/negation-dataset": ["train"]} -) - -jina_embedding_b_en_v1 = ModelMeta( - name="jinaai/jina-embedding-b-en-v1", - languages=["eng-Latn"], - open_weights=True, - revision="aa0645035294a8c0607ce5bb700aba982cdff32c", - release_date="2023-07-07", - n_parameters=110_000_000, - memory_usage=None, - embed_dim=768, - license="apache-2.0", - max_tokens=512, - reference="https://huggingface.co/jinaai/jina-embedding-b-en-v1", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by="jinaai/jina-embeddings-v2-base-en", - adapted_from=None, - training_datasets=None, - public_training_code=False, - public_training_data=False, # uses scrapes e.g. CC and {"jinaai/negation-dataset": ["train"]} -) - -jina_embedding_s_en_v1 = ModelMeta( - name="jinaai/jina-embedding-s-en-v1", - languages=["eng-Latn"], - open_weights=True, - revision="c1fed70aa4823a640f1a7150a276e4d3b08dce08", - release_date="2023-07-07", - n_parameters=35_000_000, - memory_usage=None, - embed_dim=512, - license="apache-2.0", - max_tokens=512, - reference="https://huggingface.co/jinaai/jina-embedding-s-en-v1", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by="jinaai/jina-embeddings-v2-small-en", - adapted_from=None, - training_datasets=None, - public_training_code=False, - public_training_data=False, # uses scrapes e.g. 
CC and {"jinaai/negation-dataset": ["train"]} + public_training_code=True, + public_training_data=True, ) From 3aab7ecf44e5a65e0284303f16df863926a72269 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Wed, 15 Jan 2025 16:34:18 +0100 Subject: [PATCH 17/17] fix: add even more training dataset annotations (#1793) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: update max tokens for OpenAI (#1772) update max tokens * ci: skip AfriSentiLID for now (#1785) * skip AfriSentiLID for now * skip relevant test case instead --------- Co-authored-by: Isaac Chung * 1.28.7 Automatically generated by python-semantic-release * ci: fix model loading test (#1775) * pass base branch into the make command as an arg * test a file that has custom wrapper * what about overview * just dont check overview * revert instance check * explicitly omit overview and init * remove test change * try on a lot of models * revert test model file --------- Co-authored-by: Isaac Chung * feat: Update task filtering, fixing bug which included cross-lingual tasks in overly many benchmarks (#1787) * feat: Update task filtering, fixing bug on MTEB - Updated task filtering adding exclusive_language_filter and hf_subset - fix bug in MTEB where cross-lingual splits were included - added missing language filtering to MTEB(europe, beta) and MTEB(indic, beta) The following code outlines the problems: ```py import mteb from mteb.benchmarks import MTEB_ENG_CLASSIC task = [t for t in MTEB_ENG_CLASSIC.tasks if t.metadata.name == "STS22"][0] # was eq. to: task = mteb.get_task("STS22", languages=["eng"]) task.hf_subsets # correct filtering to English datasets: # ['en', 'de-en', 'es-en', 'pl-en', 'zh-en'] # However it should be: # ['en'] # with the changes it is: task = [t for t in MTEB_ENG_CLASSIC.tasks if t.metadata.name == "STS22"][0] task.hf_subsets # ['en'] # eq. to task = mteb.get_task("STS22", hf_subsets=["en"]) # which you can also obtain using the exclusive_language_filter (though not if there was multiple english splits): task = mteb.get_task("STS22", languages=["eng"], exclusive_language_filter=True) ``` * format * remove "en-ext" from AmazonCounterfactualClassification * fixed mteb(deu) * fix: simplify in a few areas * fix: Add gritlm * 1.29.0 Automatically generated by python-semantic-release * fix: Added more annotations! 
* fix: Added C-MTEB (#1786) Added C-MTEB * 1.29.1 Automatically generated by python-semantic-release * docs: Add contact to MMTEB benchmarks (#1796) * Add myself to MMTEB benchmarks * lint * fix: loading pre 11 (#1798) * fix loading pre 11 * add similarity * lint * run all task types * 1.29.2 Automatically generated by python-semantic-release * fix: allow to load no revision available (#1801) * fix allow to load no revision available * lint * add require_model_meta to leaderboard * lint * 1.29.3 Automatically generated by python-semantic-release --------- Co-authored-by: Roman Solomatin Co-authored-by: Isaac Chung Co-authored-by: Isaac Chung Co-authored-by: github-actions Co-authored-by: Márton Kardos --- .github/workflows/model_loading.yml | 2 +- Makefile | 2 +- mteb/abstasks/AbsTask.py | 68 +++-- mteb/abstasks/MultilingualTask.py | 4 +- mteb/benchmarks/benchmarks.py | 385 +++++++++++++++++-------- mteb/languages.py | 15 + mteb/leaderboard/app.py | 4 +- mteb/load_results/load_results.py | 1 + mteb/load_results/task_results.py | 19 +- mteb/models/bge_models.py | 24 ++ mteb/models/gritlm_models.py | 11 +- mteb/models/misc_models.py | 79 +++-- mteb/models/openai_models.py | 8 +- mteb/models/ru_sentence_models.py | 43 ++- mteb/overview.py | 27 +- pyproject.toml | 4 +- scripts/compare_leaderboard_results.py | 90 +++--- scripts/extract_model_names.py | 7 +- tests/test_tasks/test_all_abstasks.py | 9 +- 19 files changed, 550 insertions(+), 252 deletions(-) diff --git a/.github/workflows/model_loading.yml b/.github/workflows/model_loading.yml index 8707a9c1d6..c139536321 100644 --- a/.github/workflows/model_loading.yml +++ b/.github/workflows/model_loading.yml @@ -21,4 +21,4 @@ jobs: - name: Install dependencies and run tests run: | - make model-load-test + make model-load-test BASE_BRANCH=${{ github.event.pull_request.base.ref }} diff --git a/Makefile b/Makefile index 6e8647a2ce..02d0ba2478 100644 --- a/Makefile +++ b/Makefile @@ -41,5 +41,5 @@ build-docs: model-load-test: @echo "--- 🚀 Running model load test ---" pip install ".[dev, speedtask, pylate,gritlm,xformers,model2vec]" - python scripts/extract_model_names.py + python scripts/extract_model_names.py $(BASE_BRANCH) python tests/test_models/model_loading.py --model_name_file scripts/model_names.txt \ No newline at end of file diff --git a/mteb/abstasks/AbsTask.py b/mteb/abstasks/AbsTask.py index 443725ec7f..1d2e4fcb05 100644 --- a/mteb/abstasks/AbsTask.py +++ b/mteb/abstasks/AbsTask.py @@ -5,6 +5,7 @@ import random from abc import ABC, abstractmethod from collections.abc import Sequence +from copy import copy from typing import Any import datasets @@ -62,6 +63,7 @@ class AbsTask(ABC): dataset: dict[HFSubset, DatasetDict] | None = None # type: ignore data_loaded: bool = False is_multilingual: bool = False + hf_subsets: list[HFSubset] | None = None def __init__(self, seed: int = 42, **kwargs: Any): self.save_suffix = kwargs.get("save_suffix", "") @@ -110,10 +112,13 @@ def evaluate( self.dataset: dict[HFSubset, DatasetDict] scores = {} - hf_subsets = list(self.dataset.keys()) if self.is_multilingual else ["default"] + if self.hf_subsets is None: + hf_subsets = list(self.dataset.keys()) + else: + hf_subsets = copy(self.hf_subsets) - if subsets_to_run is not None: - hf_subsets = [s for s in hf_subsets if s in subsets_to_run] + if subsets_to_run is not None: # allow overwrites of pre-filtering + hf_subsets = subsets_to_run for hf_subset in hf_subsets: logger.info( @@ -218,16 +223,13 @@ def calculate_metadata_metrics( ) 
descriptive_stats[split][hf_subset_stat] = {}
-                eval_langs = (
-                    list(self.metadata.eval_langs.keys())
-                    if isinstance(self.metadata.eval_langs, dict)
-                    else self.metadata.eval_langs
+                pbar_subsets = tqdm.tqdm(
+                    self.metadata.hf_subsets_to_langscripts,
+                    desc="Processing Languages...",
                 )
-
-                pbar_subsets = tqdm.tqdm(eval_langs, desc="Processing Languages...")
                 for hf_subset in pbar_subsets:
-                    pbar_subsets.set_postfix_str(f"Language: {hf_subset}")
-                    logger.info(f"Processing metadata for language {hf_subset}")
+                    pbar_subsets.set_postfix_str(f"Huggingface subset: {hf_subset}")
+                    logger.info(f"Processing metadata for subset {hf_subset}")
                     split_details = self._calculate_metrics_from_split(split, hf_subset)
                     descriptive_stats[split][hf_subset_stat][hf_subset] = split_details
             else:
@@ -252,12 +254,8 @@ def metadata_dict(self) -> dict[str, Any]:
     @property
     def languages(self) -> list[str]:
         """Returns the languages of the task"""
-        # check if self.hf_subsets is set
-        if self.is_multilingual and hasattr(self, "hf_subsets"):
-            assert isinstance(
-                self.metadata.eval_langs, dict
-            ), "eval_langs must be dict for multilingual tasks"
-            eval_langs = self.metadata.eval_langs
+        if self.hf_subsets:
+            eval_langs = self.metadata.hf_subsets_to_langscripts
 
             languages = []
             for lang in self.hf_subsets:
@@ -275,31 +273,43 @@ def filter_eval_splits(self, eval_splits: list[str] | None) -> AbsTask:
         return self
 
     def filter_languages(
-        self, languages: list[str] | None, script: list[str] | None = None
+        self,
+        languages: list[str] | None,
+        script: list[str] | None = None,
+        hf_subsets: list[HFSubset] | None = None,
+        exclusive_language_filter: bool = False,
     ) -> AbsTask:
         """Filter the languages of the task.
 
         Args:
             languages: list of languages to filter the task by can be either a 3-letter langauge code (e.g. "eng") or also include the script (e.g. "eng-Latn")
-            script: list of scripts to filter the task by. Will be ignored if language code specified the script. If None, all scripts are included.
+            script: A list of scripts to filter the task by. Will be ignored if language code specified the script. If None, all scripts are included.
                 If the language code does not specify the script the intersection of the language and script will be used.
+            hf_subsets: A list of huggingface subsets to filter on. This is useful if a dataset has multiple subsets containing the desired language,
+                but you only want to test on one. An example is STS22, which has both "en" and "de-en", both of which contain English.
+            exclusive_language_filter: Some datasets contain more than one language, e.g. for STS22 the subset "de-en" contains eng and deu. If
+                exclusive_language_filter is set to False both of these will be kept, but if set to True only those that contain all the languages
+                specified will be kept.
""" lang_scripts = LanguageScripts.from_languages_and_scripts(languages, script) subsets_to_keep = [] - if not isinstance(self.metadata.eval_langs, dict): - self.hf_subsets = self.metadata.eval_langs - return self - - for hf_subset, langs in self.metadata.eval_langs.items(): - for langscript in langs: - if lang_scripts.contains_language( - langscript - ) or lang_scripts.contains_script(langscript): + for hf_subset, langs in self.metadata.hf_subsets_to_langscripts.items(): + if (hf_subsets is not None) and (hf_subset not in hf_subsets): + continue + if exclusive_language_filter is False: + for langscript in langs: + if lang_scripts.contains_language( + langscript + ) or lang_scripts.contains_script(langscript): + subsets_to_keep.append(hf_subset) + break + + if exclusive_language_filter is True and languages: + if lang_scripts.contains_languages(langs): subsets_to_keep.append(hf_subset) - break self.hf_subsets = subsets_to_keep return self diff --git a/mteb/abstasks/MultilingualTask.py b/mteb/abstasks/MultilingualTask.py index 3fd007df6d..6516e74bd0 100644 --- a/mteb/abstasks/MultilingualTask.py +++ b/mteb/abstasks/MultilingualTask.py @@ -12,9 +12,7 @@ def __init__(self, hf_subsets: list[str] | None = None, **kwargs): lang for lang in hf_subsets if lang in self.metadata.eval_langs ] if hf_subsets is not None and len(hf_subsets) > 0: - self.hf_subsets = ( - hf_subsets # TODO: case where user provides langs not in the dataset - ) + self.hf_subsets = hf_subsets else: self.hf_subsets = self.metadata.eval_langs self.is_multilingual = True diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index edb4326cae..0537c604f7 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -9,7 +9,7 @@ from mteb.abstasks.AbsTask import AbsTask from mteb.load_results.benchmark_results import BenchmarkResults from mteb.load_results.load_results import load_results -from mteb.overview import MTEBTasks, get_tasks +from mteb.overview import MTEBTasks, get_task, get_tasks http_url_adapter = TypeAdapter(AnyUrl) UrlString = Annotated[ @@ -27,6 +27,7 @@ class Benchmark: description: A description of the benchmark, should include its intended goal and potentially a description of its construction reference: A link reference, to a source containing additional information typically to a paper, leaderboard or github. citation: A bibtex citation + contacts: The people to contact in case of a problem in the benchmark, preferably a GitHub handle. 
Example: >>> Benchmark( @@ -44,6 +45,7 @@ class Benchmark: description: str | None = None reference: UrlString | None = None citation: str | None = None + contacts: list[str] | None = None def __iter__(self): return iter(self.tasks) @@ -70,55 +72,65 @@ def load_results( MTEB_EN = Benchmark( name="MTEB(eng, beta)", - tasks=get_tasks( - tasks=[ - "AmazonCounterfactualClassification", - "ArguAna", - "ArXivHierarchicalClusteringP2P", - "ArXivHierarchicalClusteringS2S", - "AskUbuntuDupQuestions", - "BIOSSES", - "Banking77Classification", - "BiorxivClusteringP2P.v2", - "CQADupstackGamingRetrieval", - "CQADupstackUnixRetrieval", - "ClimateFEVERHardNegatives", - "FEVERHardNegatives", - "FiQA2018", - "HotpotQAHardNegatives", - "ImdbClassification", - "MTOPDomainClassification", - "MassiveIntentClassification", - "MassiveScenarioClassification", - "MedrxivClusteringP2P.v2", - "MedrxivClusteringS2S.v2", - "MindSmallReranking", - "SCIDOCS", - "SICK-R", - "STS12", - "STS13", - "STS14", - "STS15", - "STS17", - "STS22.v2", - "STSBenchmark", - "SprintDuplicateQuestions", - "StackExchangeClustering.v2", - "StackExchangeClusteringP2P.v2", - "TRECCOVID", - "Touche2020Retrieval.v3", - "ToxicConversationsClassification", - "TweetSentimentExtractionClassification", - "TwentyNewsgroupsClustering.v2", - "TwitterSemEval2015", - "TwitterURLCorpus", - "SummEvalSummarization.v2", - ], - languages=["eng"], - eval_splits=["test"], + tasks=MTEBTasks( + get_tasks( + tasks=[ + "ArguAna", + "ArXivHierarchicalClusteringP2P", + "ArXivHierarchicalClusteringS2S", + "AskUbuntuDupQuestions", + "BIOSSES", + "Banking77Classification", + "BiorxivClusteringP2P.v2", + "CQADupstackGamingRetrieval", + "CQADupstackUnixRetrieval", + "ClimateFEVERHardNegatives", + "FEVERHardNegatives", + "FiQA2018", + "HotpotQAHardNegatives", + "ImdbClassification", + "MTOPDomainClassification", + "MassiveIntentClassification", + "MassiveScenarioClassification", + "MedrxivClusteringP2P.v2", + "MedrxivClusteringS2S.v2", + "MindSmallReranking", + "SCIDOCS", + "SICK-R", + "STS12", + "STS13", + "STS14", + "STS15", + "STSBenchmark", + "SprintDuplicateQuestions", + "StackExchangeClustering.v2", + "StackExchangeClusteringP2P.v2", + "TRECCOVID", + "Touche2020Retrieval.v3", + "ToxicConversationsClassification", + "TweetSentimentExtractionClassification", + "TwentyNewsgroupsClustering.v2", + "TwitterSemEval2015", + "TwitterURLCorpus", + "SummEvalSummarization.v2", + ], + languages=["eng"], + eval_splits=["test"], + exclusive_language_filter=True, + ) + + ( + get_task( + "AmazonCounterfactualClassification", + eval_splits=["test"], + hf_subsets=["en"], + ), + get_task("STS17", eval_splits=["test"], hf_subsets=["en-en"]), + get_task("STS22.v2", eval_splits=["test"], hf_subsets=["en"]), + ), ), description="English benchmarks from MTEB", citation="", + contacts=["KennethEnevoldsen", "Muennighoff"], ) MTEB_ENG_CLASSIC = Benchmark( @@ -126,7 +138,6 @@ def load_results( tasks=MTEBTasks( get_tasks( tasks=[ - "AmazonCounterfactualClassification", "AmazonPolarityClassification", "AmazonReviewsClassification", "ArguAna", @@ -175,8 +186,6 @@ def load_results( "STS14", "STS15", "STS16", - "STS17", - "STS22", "STSBenchmark", "SciDocsRR", "SciFact", @@ -197,6 +206,15 @@ def load_results( eval_splits=["test"], ) + get_tasks(tasks=["MSMARCO"], languages=["eng"], eval_splits=["dev"]) + + ( + get_task( + "AmazonCounterfactualClassification", + eval_splits=["test"], + hf_subsets=["en"], + ), + get_task("STS17", eval_splits=["test"], hf_subsets=["en-en"]), + get_task("STS22", 
eval_splits=["test"], hf_subsets=["en"]), + ) ), description="The original English benchmark by Muennighoff et al., (2023).", citation="""@inproceedings{muennighoff-etal-2023-mteb, @@ -217,6 +235,7 @@ def load_results( pages = "2014--2037", } """, + contacts=["Muennighoff"], ) MTEB_MAIN_RU = Benchmark( @@ -407,6 +426,7 @@ def load_results( archivePrefix={arXiv}, primaryClass={cs.CL} }""", + contacts=["KennethEnevoldsen", "x-tabdeveloping", "Samoed"], ) CoIR = Benchmark( @@ -469,46 +489,49 @@ def load_results( journal={arXiv preprint arXiv:2404.06347}, year={2024} }""", + contacts=["gowitheflow-1998"], ) MTEB_FRA = Benchmark( name="MTEB(fra)", - tasks=get_tasks( - languages=["fra"], - tasks=[ - # Classification - "AmazonReviewsClassification", - "MasakhaNEWSClassification", - "MassiveIntentClassification", - "MassiveScenarioClassification", - "MTOPDomainClassification", - "MTOPIntentClassification", - # Clustering - "AlloProfClusteringP2P", - "AlloProfClusteringS2S", - "HALClusteringS2S", - "MasakhaNEWSClusteringP2P", - "MasakhaNEWSClusteringS2S", - "MLSUMClusteringP2P", - "MLSUMClusteringS2S", - # Pair Classification - "OpusparcusPC", - "PawsXPairClassification", - # Reranking - "AlloprofReranking", - "SyntecReranking", - # Retrieval - "AlloprofRetrieval", - "BSARDRetrieval", - "MintakaRetrieval", - "SyntecRetrieval", - "XPQARetrieval", - # STS - "SICKFr", - "STS22", - "STSBenchmarkMultilingualSTS", - "SummEvalFr", - ], + tasks=MTEBTasks( + get_tasks( + languages=["fra"], + tasks=[ + # Classification + "AmazonReviewsClassification", + "MasakhaNEWSClassification", + "MassiveIntentClassification", + "MassiveScenarioClassification", + "MTOPDomainClassification", + "MTOPIntentClassification", + # Clustering + "AlloProfClusteringP2P", + "AlloProfClusteringS2S", + "HALClusteringS2S", + "MasakhaNEWSClusteringP2P", + "MasakhaNEWSClusteringS2S", + "MLSUMClusteringP2P", + "MLSUMClusteringS2S", + # Pair Classification + "OpusparcusPC", + "PawsXPairClassification", + # Reranking + "AlloprofReranking", + "SyntecReranking", + # Retrieval + "AlloprofRetrieval", + "BSARDRetrieval", + "MintakaRetrieval", + "SyntecRetrieval", + "XPQARetrieval", + # STS + "SICKFr", + "STSBenchmarkMultilingualSTS", + "SummEvalFr", + ], + ) + + (get_task("STS22", eval_splits=["test"], hf_subsets=["fr"]),) ), description="Main French benchmarks from MTEB", reference="https://arxiv.org/abs/2405.20468", @@ -521,6 +544,7 @@ def load_results( primaryClass={cs.CL}, url={https://arxiv.org/abs/2405.20468}, }""", + contacts=["imenelydiaker"], ) @@ -528,6 +552,7 @@ def load_results( name="MTEB(deu)", tasks=get_tasks( languages=["deu"], + exclusive_language_filter=True, tasks=[ # Classification "AmazonCounterfactualClassification", @@ -595,32 +620,34 @@ def load_results( MTEB_POL = Benchmark( name="MTEB(pol)", - tasks=get_tasks( - languages=["pol"], - tasks=[ - # Classification - "AllegroReviews", - "CBD", - "MassiveIntentClassification", - "MassiveScenarioClassification", - "PolEmo2.0-IN", - "PolEmo2.0-OUT", - "PAC", - # Clustering - "EightTagsClustering", - "PlscClusteringS2S", - "PlscClusteringP2P", - # Pair Classification - "CDSC-E", - "PpcPC", - "PSC", - "SICK-E-PL", - # STS - "CDSC-R", - "STS22", - "STSBenchmarkMultilingualSTS", - "SICK-R-PL", - ], + tasks=MTEBTasks( + get_tasks( + languages=["pol"], + tasks=[ + # Classification + "AllegroReviews", + "CBD", + "MassiveIntentClassification", + "MassiveScenarioClassification", + "PolEmo2.0-IN", + "PolEmo2.0-OUT", + "PAC", + # Clustering + "EightTagsClustering", + "PlscClusteringS2S", 
+ "PlscClusteringP2P", + # Pair Classification + "CDSC-E", + "PpcPC", + "PSC", + "SICK-E-PL", + # STS + "CDSC-R", + "STSBenchmarkMultilingualSTS", + "SICK-R-PL", + ], + ) + + (get_task("STS22", eval_splits=["test"], hf_subsets=["pl"]),), ), description="Main Polish benchmarks from MTEB", reference="https://arxiv.org/abs/2405.10138", @@ -813,6 +840,7 @@ def load_results( description="The Multilingual benchmarks from MMTEB. Currently under development.", reference=None, citation=None, + contacts=["KennethEnevoldsen", "isaac-chung"], ) MTEB_JPN = Benchmark( @@ -850,6 +878,39 @@ def load_results( ) +indic_languages = [ + "asm", + "awa", + "ben", + "bgc", + "bho", + "doi", + "gbm", + "gom", + "guj", + "hin", + "hne", + "kan", + "kas", + "mai", + "mal", + "mar", + "mni", + "mup", + "mwr", + "nep", + "npi", + "ori", + "ory", + "pan", + "raj", + "san", + "snd", + "tam", + "tel", + "urd", +] + MTEB_INDIC = Benchmark( name="MTEB(Indic, beta)", tasks=get_tasks( @@ -885,13 +946,59 @@ def load_results( # reranking "WikipediaRerankingMultilingual", ], + languages=indic_languages, + exclusive_language_filter=True, ), description="Main Indic benchmark from MMTEB", reference=None, citation=None, + contacts=["KennethEnevoldsen", "isaac-chung"], ) +eu_languages = [ + # official EU languages (56) - we could include the whole economic area e.g. Norway - additioanlly we could include minority languages (probably a good idea?) + # germanic + "dan", + "eng", + "deu", + "nld", + "swe", + # romance + "fra", + "ita", + "por", + "spa", + "ron", + # slavic + "bul", + "hrv", + "ces", + "pol", + "slk", + "slv", + # baltic + "lav", + "lit", + "est", + # finno-ugric + "fin", + "hun", + # other indo european + "ell", + # non-indo european + "mlt", + "gle", + # Schengen Area + "nno", + "nob", + "isl", + "ron", + "eus", # Basque - recognized minority language + "ron", # Romanian - recognized minority language + "rom", # Romani - recognized minority language +] + MTEB_EU = Benchmark( name="MTEB(Europe, beta)", tasks=get_tasks( @@ -970,11 +1077,14 @@ def load_results( "STS17", "SICK-R-PL", "STSES", - ] + ], + languages=eu_languages, + exclusive_language_filter=True, ), description="Main European benchmark from MMTEB", reference=None, citation=None, + contacts=["KennethEnevoldsen", "isaac-chung"], ) LONG_EMBED = Benchmark( @@ -1037,3 +1147,52 @@ def load_results( reference="https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6", citation=None, ) + +C_MTEB = Benchmark( + name="MTEB(Chinese)", + tasks=get_tasks( + tasks=[ + "T2Retrieval", + "MMarcoRetrieval", + "DuRetrieval", + "CovidRetrieval", + "CmedqaRetrieval", + "EcomRetrieval", + "MedicalRetrieval", + "VideoRetrieval", + "T2Reranking", + "MMarcoReranking", + "CMedQAv1-reranking", + "CMedQAv2-reranking", + "Ocnli", + "Cmnli", + "CLSClusteringS2S", + "CLSClusteringP2P", + "ThuNewsClusteringS2S", + "ThuNewsClusteringP2P", + "ATEC", + "BQ", + "LCQMC", + "PAWSX", + "STSB", + "AFQMC", + "QBQTC", + "TNews", + "IFlyTek", + "Waimai", + "OnlineShopping", + "MultilingualSentiment", + "JDReview", + ], + ), + description="The Chinese Massive Text Embedding Benchmark (C-MTEB) is a comprehensive benchmark for Chinese text embeddings covering 6 tasks and 35 datasets.", + reference="https://github.com/FlagOpen/FlagEmbedding/tree/master/research/C_MTEB", + citation="""@misc{c-pack, + title={C-Pack: Packaged Resources To Advance General Chinese Embedding}, + author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff}, + year={2023}, + 
eprint={2309.07597},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}""",
+)
diff --git a/mteb/languages.py b/mteb/languages.py
index 9b170a707f..e83dd308cd 100644
--- a/mteb/languages.py
+++ b/mteb/languages.py
@@ -5,6 +5,7 @@
 from __future__ import annotations
 
 import json
+from collections.abc import Iterable
 from dataclasses import dataclass
 from pathlib import Path
 
@@ -81,5 +82,19 @@ def contains_language(self, language: str) -> bool:
                 return True
         return False
 
+    def contains_languages(self, languages: Iterable[str]) -> bool:
+        """Whether it contains all of the languages"""
+        for l in languages:
+            if not self.contains_language(l):
+                return False
+        return True
+
     def contains_script(self, script: str) -> bool:
         return script in self.scripts
+
+    def contains_scripts(self, scripts: Iterable[str]) -> bool:
+        """Whether it contains all of the scripts"""
+        for s in scripts:
+            if not self.contains_script(s):
+                return False
+        return True
diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py
index d1383cf1a7..e3c7d0aad2 100644
--- a/mteb/leaderboard/app.py
+++ b/mteb/leaderboard/app.py
@@ -24,7 +24,9 @@ def load_results():
     results_cache_path = Path(__file__).parent.joinpath("__cached_results.json")
     if not results_cache_path.exists():
         all_results = (
-            mteb.load_results(only_main_score=True).join_revisions().filter_models()
+            mteb.load_results(only_main_score=True, require_model_meta=False)
+            .join_revisions()
+            .filter_models()
         )
         all_results.to_disk(results_cache_path)
         return all_results
diff --git a/mteb/load_results/load_results.py b/mteb/load_results/load_results.py
index 03ec6fb308..ef851a1dc2 100644
--- a/mteb/load_results/load_results.py
+++ b/mteb/load_results/load_results.py
@@ -139,6 +139,7 @@ def load_results(
             continue
 
         model_name, revision = model_name_and_revision
+        model_name = model_name.replace("__", "/")
         if models_to_keep is not None and model_name not in models_to_keep:
             continue
         elif models_to_keep is not None and models_to_keep[model_name] is not None:
diff --git a/mteb/load_results/task_results.py b/mteb/load_results/task_results.py
index e1b9b9d69d..72cae5a93d 100644
--- a/mteb/load_results/task_results.py
+++ b/mteb/load_results/task_results.py
@@ -387,15 +387,16 @@ def _convert_from_before_v1_11_0(cls, data: dict) -> TaskResult:
         main_score = task.metadata.main_score
         for split, split_score in scores.items():
             for hf_subset, hf_subset_scores in split_score.items():
-                if task.metadata.type == "STS":
-                    for name, prev_name in [
-                        ("cosine", "cos_sim"),
-                        ("manhattan", "manhattan"),
-                        ("euclidean", "euclidean"),
-                    ]:
-                        prev_name_scores = hf_subset_scores.pop(
-                            prev_name, {"spearman": "NaN"}
-                        )
+                for name, prev_name in [
+                    ("cosine", "cos_sim"),
+                    ("manhattan", "manhattan"),
+                    ("euclidean", "euclidean"),
+                    ("dot", "dot"),
+                    ("max", "max"),
+                    ("similarity", "similarity"),
+                ]:
+                    prev_name_scores = hf_subset_scores.pop(prev_name, None)
+                    if prev_name_scores is not None:
                         for k, v in prev_name_scores.items():
                             hf_subset_scores[f"{name}_{k}"] = v
 
diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py
index 5a395f014a..dc3679a8da 100644
--- a/mteb/models/bge_models.py
+++ b/mteb/models/bge_models.py
@@ -6,6 +6,30 @@
 
 model_prompts = {"query": "Represent this sentence for searching relevant passages: "}
 
+bge_m_training_data = {
+    # source: https://arxiv.org/pdf/2402.03216
+    "MIRACLRetrieval": ["train"],
+    "MIRACLRetrievalHardNegatives": ["train"],
+    "MIRACLReranking": ["train"],
+    "LeCaRDv2": ["train"],
+    "CMedQAv1-reranking": ["train"],
+    "CMedQAv2-reranking":
["train"], + "MrTidyRetrieval": ["train"], + "T2Reranking": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + # + synthetic data +} + bge_training_data = { # source: https://data.baai.ac.cn/details/BAAI-MTP "NQ": ["test"], diff --git a/mteb/models/gritlm_models.py b/mteb/models/gritlm_models.py index 91acafa26e..a4f5befd19 100644 --- a/mteb/models/gritlm_models.py +++ b/mteb/models/gritlm_models.py @@ -5,6 +5,7 @@ from mteb.model_meta import ModelMeta +from .e5_models import E5_TRAINING_DATA from .instruct_wrapper import instruct_wrapper logger = logging.getLogger(__name__) @@ -29,7 +30,6 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: open_weights=True, revision="13f00a0e36500c80ce12870ea513846a066004af", release_date="2024-02-15", - training_datasets={"GritLM/tulu2": ["train"]}, n_parameters=7_240_000_000, memory_usage=None, embed_dim=4096, @@ -39,6 +39,10 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: similarity_fn_name="cosine", framework=["GritLM", "PyTorch"], use_instructions=True, + training_datasets=E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 + # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data + public_training_code=True, # https://github.com/ContextualAI/gritlm + public_training_data=False, ) gritlm8x7b = ModelMeta( loader=partial( # type: ignore @@ -50,7 +54,6 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: ), name="GritLM/GritLM-8x7B", languages=["eng_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "spa_Latn"], - training_datasets={"GritLM/tulu2": ["train"]}, open_weights=True, revision="7f089b13e3345510281733ca1e6ff871b5b4bc76", release_date="2024-02-15", @@ -63,4 +66,8 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: similarity_fn_name="cosine", framework=["GritLM", "PyTorch"], use_instructions=True, + training_datasets=E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 + # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data + public_training_code=True, # https://github.com/ContextualAI/gritlm + public_training_data=False, ) diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index d5734b448c..b2a661fe5a 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -5,6 +5,10 @@ import torch from mteb.model_meta import ModelMeta, sentence_transformers_loader +from mteb.models.e5_models import E5_TRAINING_DATA + +from .bge_models import bge_m_training_data, bge_training_data +from .sentence_transformers_models import sent_trf_training_dataset Haon_Chen__speed_embedding_7b_instruct = ModelMeta( name="Haon-Chen/speed-embedding-7b-instruct", @@ -198,7 +202,8 @@ reference="https://huggingface.co/BeastyZ/e5-R-mistral-7b", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"BeastyZ/E5-R": ["train"]}, + training_datasets=E5_TRAINING_DATA, + # not MTEB: {"BeastyZ/E5-R": ["train"]}, adapted_from="/ConRetriever/public_weight_mistral", superseded_by=None, ) @@ -295,13 +300,14 @@ 
embed_dim=384, license="mit", open_weights=True, - public_training_data=True, + public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Bulbasaur", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=None, # source model is GTE-tiny where training data is unknown + # {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/dwsdwass", superseded_by=None, ) @@ -317,13 +323,14 @@ embed_dim=384, license="mit", open_weights=True, - public_training_data=True, + public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Ivysaur", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=None, # source model is GTE-tiny where training data is unknown + # not MTEB: {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/jhjghjgh", superseded_by=None, ) @@ -345,7 +352,8 @@ reference="https://huggingface.co/Mihaiii/Squirtle", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=bge_training_data, # source model is bge-base-en-v1.5 + # not MTEB: {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/test21", superseded_by=None, ) @@ -367,7 +375,8 @@ reference="https://huggingface.co/Mihaiii/Venusaur", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=None, # source model is unkown + # {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/test14", superseded_by=None, ) @@ -389,7 +398,8 @@ reference="https://huggingface.co/Mihaiii/Wartortle", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=bge_training_data, # distill from bge-base-en-v1.5 + # {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/test22", superseded_by=None, ) @@ -477,7 +487,7 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets={}, # not in MTEB: {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="aubmindlab/bert-base-arabertv02", superseded_by=None, ) @@ -499,7 +509,9 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=sent_trf_training_dataset, + # not in MTEB + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", superseded_by=None, ) @@ -521,7 +533,9 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=sent_trf_training_dataset, # derived from + # not in MTEB: + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2", 
superseded_by=None, ) @@ -543,7 +557,9 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-labse-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=None, # derived from labSE + # as well as: + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="sentence-transformers/LaBSE", superseded_by=None, ) @@ -565,7 +581,9 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=sent_trf_training_dataset, + # not in MTEB: + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="tomaarsen/mpnet-base-all-nli-triplet", superseded_by=None, ) @@ -587,7 +605,7 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets={}, # not in MTEB: "Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="UBC-NLP/MARBERTv2", superseded_by=None, ) @@ -719,7 +737,8 @@ reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.4", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"manu/embedding_data_v2_100k": ["train"]}, + training_datasets=None, + # Not in MTEB: {"manu/embedding_data_v2_100k": ["train"]}, adapted_from="croissantllm/CroissantCool-v0.2", superseded_by=None, ) @@ -1365,7 +1384,8 @@ reference="https://huggingface.co/aari1995/German_Semantic_STS_V2", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"stsb_multi_mt": ["train"]}, + training_datasets=None, # couldn't figure out the source model + # {"stsb_multi_mt": ["train"]}, adapted_from="/content/drive/MyDrive/Stanford_NLU/Project/false_friends/gbert_large_sts_only", superseded_by=None, ) @@ -1481,18 +1501,18 @@ reference="https://huggingface.co/deepvk/USER-bge-m3", similarity_fn_name="cosine", use_instructions=None, - training_datasets={ - "deepvk/ru-HNP": ["train"], - "deepvk/ru-WANLI": ["train"], - "Shitao/bge-m3-data": ["train"], - "RussianNLP/russian_super_glue": ["train"], - "reciTAL/mlsum": ["train"], - "Milana/russian_keywords": ["train"], - "IlyaGusev/gazeta": ["train"], - "d0rj/gsm8k-ru": ["train"], - "bragovo/dsum_ru": ["train"], - "CarlBrendt/Summ_Dialog_News": ["train"], - }, + training_datasets=bge_m_training_data, # derived from. 
+ # not in MTEB: + # "deepvk/ru-HNP": ["train"], + # "deepvk/ru-WANLI": ["train"], + # "Shitao/bge-m3-data": ["train"], + # "RussianNLP/russian_super_glue": ["train"], + # "reciTAL/mlsum": ["train"], + # "Milana/russian_keywords": ["train"], + # "IlyaGusev/gazeta": ["train"], + # "d0rj/gsm8k-ru": ["train"], + # "bragovo/dsum_ru": ["train"], + # "CarlBrendt/Summ_Dialog_News": ["train"], adapted_from="USER-bge-m3", superseded_by=None, ) @@ -1622,7 +1642,8 @@ reference="https://huggingface.co/shibing624/text2vec-base-multilingual", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"shibing624/nli-zh-all": ["train"]}, + training_datasets=sent_trf_training_dataset, + # not MTEB: {"shibing624/nli-zh-all": ["train"]}, adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", superseded_by=None, ) diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py index c187bfa317..619a4a747f 100644 --- a/mteb/models/openai_models.py +++ b/mteb/models/openai_models.py @@ -25,7 +25,7 @@ def __init__( **kwargs, ) -> None: """Wrapper for OpenAIs embedding API. - To handle documents larger than 8192 tokens, we truncate the document to the specified sequence length. + To handle documents larger than 8191 tokens, we truncate the document to the specified sequence length. """ requires_package(self, "openai", "Openai text embedding") from openai import OpenAI @@ -124,7 +124,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: OpenAIWrapper, model_name="text-embedding-3-small", tokenizer_name="cl100k_base", - max_tokens=8192, + max_tokens=8191, ), max_tokens=8191, embed_dim=1536, @@ -149,7 +149,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: OpenAIWrapper, model_name="text-embedding-3-large", tokenizer_name="cl100k_base", - max_tokens=8192, + max_tokens=8191, ), max_tokens=8191, embed_dim=3072, @@ -172,7 +172,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray: OpenAIWrapper, model_name="text-embedding-ada-002", tokenizer_name="cl100k_base", - max_tokens=8192, + max_tokens=8191, ), reference="https://openai.com/index/new-and-improved-embedding-model/", max_tokens=8191, diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index a520bdca11..6bca544b11 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -6,6 +6,8 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader +from .bge_models import bge_training_data + rubert_tiny2 = ModelMeta( name="cointegrated/rubert-tiny2", languages=["rus_Cyrl"], @@ -96,20 +98,27 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, training_datasets={ - "deepvk/ru-HNP": ["train"], - "deepvk/ru-WANLI": ["train"], - "Shitao/bge-m3-data": ["train"], - "RussianNLP/russian_super_glue": ["train"], - "reciTAL/mlsum": ["train"], - "Helsinki-NLP/opus-100": ["train"], - "Helsinki-NLP/bible_para": ["train"], - "d0rj/rudetoxifier_data_detox": ["train"], - "s-nlp/ru_paradetox": ["train"], - "Milana/russian_keywords": ["train"], - "IlyaGusev/gazeta": ["train"], - "d0rj/gsm8k-ru": ["train"], - "bragovo/dsum_ru": ["train"], - "CarlBrendt/Summ_Dialog_News": ["train"], + "BibleNLPBitextMining": ["train"], + "MLSUMClusteringP2P": ["train"], + "MLSUMClusteringP2P.v2": ["train"], + "MLSUMClusteringS2S": ["train"], + "MLSUMClusteringS2S.v2": ["train"], + **bge_training_data, + # not MTEB: + # "deepvk/ru-HNP": ["train"], + # "deepvk/ru-WANLI": ["train"], + # "Shitao/bge-m3-data": ["train"], + # "RussianNLP/russian_super_glue": 
["train"], + # "reciTAL/mlsum": ["train"], + # "Helsinki-NLP/opus-100": ["train"], + # "Helsinki-NLP/bible_para": ["train"], + # "d0rj/rudetoxifier_data_detox": ["train"], + # "s-nlp/ru_paradetox": ["train"], + # "Milana/russian_keywords": ["train"], + # "IlyaGusev/gazeta": ["train"], + # "d0rj/gsm8k-ru": ["train"], + # "bragovo/dsum_ru": ["train"], + # "CarlBrendt/Summ_Dialog_News": ["train"], }, ) @@ -213,7 +222,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - training_datasets={"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, + training_datasets=None, # source model in unknown + # Not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, ) labse_ru_turbo = ModelMeta( @@ -231,7 +241,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - training_datasets={"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, + training_datasets=None, # source model in unknown + # not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, ) diff --git a/mteb/overview.py b/mteb/overview.py index 43f8cebc1b..ced0e7729f 100644 --- a/mteb/overview.py +++ b/mteb/overview.py @@ -232,6 +232,7 @@ def get_tasks( tasks: list[str] | None = None, exclude_superseded: bool = True, eval_splits: list[str] | None = None, + exclusive_language_filter: bool = False, ) -> MTEBTasks: """Get a list of tasks based on the specified filters. @@ -247,6 +248,9 @@ def get_tasks( tasks: A list of task names to include. If None, all tasks which pass the filters are included. exclude_superseded: A boolean flag to exclude datasets which are superseded by another. eval_splits: A list of evaluation splits to include. If None, all splits are included. + exclusive_language_filter: Some datasets contains more than one language e.g. for STS22 the subset "de-en" contain eng and deu. If + exclusive_language_filter is set to False both of these will be kept, but if set to True only those that contains all the languages + specified will be kept. Returns: A list of all initialized tasks objects which pass all of the filters (AND operation). @@ -256,10 +260,18 @@ def get_tasks( >>> get_tasks(languages=["eng"], script=["Latn"], task_types=["Classification"]) >>> get_tasks(languages=["eng"], script=["Latn"], task_types=["Clustering"], exclude_superseded=False) >>> get_tasks(languages=["eng"], tasks=["WikipediaRetrievalMultilingual"], eval_splits=["test"]) + >>> get_tasks(tasks=["STS22"], languages=["eng"], exclusive_language_filter=True) # don't include multilingual subsets containing English """ if tasks: _tasks = [ - get_task(task, languages, script, eval_splits=eval_splits) for task in tasks + get_task( + task, + languages, + script, + eval_splits=eval_splits, + exclusive_language_filter=exclusive_language_filter, + ) + for task in tasks ] return MTEBTasks(_tasks) @@ -289,6 +301,8 @@ def get_task( languages: list[str] | None = None, script: list[str] | None = None, eval_splits: list[str] | None = None, + hf_subsets: list[str] | None = None, + exclusive_language_filter: bool = False, ) -> AbsTask: """Get a task by name. @@ -298,6 +312,10 @@ def get_task( "eng-Latn". For multilingual tasks this will also remove languages that are not in the specified list. script: A list of script codes (ISO 15924 codes). If None, all scripts are included. For multilingual tasks this will also remove scripts eval_splits: A list of evaluation splits to include. If None, all splits are included. 
+ hf_subsets: A list of Huggingface subsets to evaluate on. + exclusive_language_filter: Some datasets contains more than one language e.g. for STS22 the subset "de-en" contain eng and deu. If + exclusive_language_filter is set to False both of these will be kept, but if set to True only those that contains all the languages + specified will be kept. Returns: An initialized task object. @@ -319,4 +337,9 @@ def get_task( task = TASKS_REGISTRY[task_name]() if eval_splits: task.filter_eval_splits(eval_splits=eval_splits) - return task.filter_languages(languages, script) + return task.filter_languages( + languages, + script, + hf_subsets=hf_subsets, + exclusive_language_filter=exclusive_language_filter, + ) diff --git a/pyproject.toml b/pyproject.toml index 6dc6189822..441332dd73 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.28.6" +version = "1.29.3" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ @@ -57,7 +57,7 @@ dev = ["ruff==0.6.4", # locked so we don't get PRs which fail only due to a lint codecarbon = ["codecarbon"] speedtask = ["GPUtil>=1.4.0", "psutil>=5.9.8"] peft = ["peft>=0.11.0"] -leaderboard = ["gradio>=5.7.1", "gradio_rangeslider>=0.0.8"] +leaderboard = ["gradio>=5.7.1", "gradio_rangeslider>=0.0.8", "plotly>=5.24.0"] flagembedding = ["FlagEmbedding"] jina = ["einops>=0.8.0"] flash_attention = ["flash-attn>=2.6.3"] diff --git a/scripts/compare_leaderboard_results.py b/scripts/compare_leaderboard_results.py index bbeb912bb4..1fe9c3d766 100644 --- a/scripts/compare_leaderboard_results.py +++ b/scripts/compare_leaderboard_results.py @@ -2,70 +2,84 @@ import json import logging +from collections import defaultdict from pathlib import Path -from mteb import MTEB_ENG_CLASSIC, load_results +from mteb import get_benchmark, load_results logging.basicConfig(level=logging.INFO) models = [ - "dunzhang/stella_en_1.5B_v5", - "dunzhang/stella_en_400M_v5", + "intfloat/multilingual-e5-small", # Add other models here ] +benchmark = get_benchmark("MTEB(Chinese)") + +results = [] # in same folder as mteb repo # git clone https://github.com/embeddings-benchmark/leaderboard -data_tasks_path = Path("../../leaderboard/boards_data/en/data_tasks/") +# get path of current file +base_path = Path(__file__).parent.parent.parent / "leaderboard" / "boards_data" -results = [] for model_name_to_search in models: model_results = load_results( models=[model_name_to_search], - tasks=MTEB_ENG_CLASSIC.tasks, + tasks=benchmark.tasks, only_main_score=True, + require_model_meta=False, ) - cur_model = {} + cur_model = {task.metadata.name: defaultdict(dict) for task in benchmark.tasks} for model_res in model_results: for task_res in model_res.task_results: task_name = task_res.task.metadata.name - split = "test" if task_name != "MSMARCO" else "dev" - scores = [score["main_score"] for score in task_res.scores[split]] - # this tmp solution, because some tasks have multiple results - cur_model[task_name] = {"new": round((sum(scores) / len(scores)) * 100, 2)} - for task_dir in data_tasks_path.iterdir(): - if task_dir.is_dir(): - results_file_path = task_dir / "default.jsonl" - if results_file_path.exists(): - with open(results_file_path) as file: - for line in file: - data = json.loads(line) - model_name = data.get("Model", "") - if model_name_to_search in model_name: - for key, value in data.items(): - if key in [ - "index", - "Rank", - "Model", - "Model Size (Million Parameters)", - "Memory Usage 
(GB, fp32)", - "Embedding Dimensions", - "Max Tokens", - "Average", - ]: - continue - for benchmark_task in MTEB_ENG_CLASSIC.tasks: - if benchmark_task.metadata.name in key: - cur_model[benchmark_task.metadata.name][ - "old" - ] = value + split = ( + "test" + if "test" in task_res.task.metadata.eval_splits + else task_res.task.metadata.eval_splits[0] + ) + if split in task_res.scores: + scores = [score["main_score"] for score in task_res.scores[split]] + cur_model[task_name]["new"] = round( + (sum(scores) / len(scores)) * 100, 2 + ) + + for lang_path in base_path.iterdir(): + data_tasks_path = lang_path / "data_tasks" + + for task_dir in data_tasks_path.iterdir(): + if task_dir.is_dir(): + results_file_path = task_dir / "default.jsonl" + if results_file_path.exists(): + with open(results_file_path) as file: + for line in file: + data = json.loads(line) + model_name = data.get("Model", "") + if model_name_to_search in model_name: + for key, value in data.items(): + if key in [ + "index", + "Rank", + "Model", + "Model Size (Million Parameters)", + "Memory Usage (GB, fp32)", + "Embedding Dimensions", + "Max Tokens", + "Average", + ]: + continue + for benchmark_task in benchmark.tasks: + if benchmark_task.metadata.name in key: + cur_model[benchmark_task.metadata.name][ + "old" + ] = value sorted_cur_model = { task.metadata.name: cur_model[task.metadata.name] - for task in MTEB_ENG_CLASSIC.tasks + for task in benchmark.tasks if task.metadata.name in cur_model } results.append({"model": model_name_to_search, "results": sorted_cur_model}) diff --git a/scripts/extract_model_names.py b/scripts/extract_model_names.py index dbe99a990e..ba1bc1a8b0 100644 --- a/scripts/extract_model_names.py +++ b/scripts/extract_model_names.py @@ -19,7 +19,12 @@ def get_changed_files(base_branch="main"): changed_files = diff.splitlines() return [ - f for f in changed_files if f.startswith("mteb/models/") and f.endswith(".py") + f + for f in changed_files + if f.startswith("mteb/models/") + and f.endswith(".py") + and "overview" not in f + and "init" not in f ] diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py index 20eff8c434..e252293ea5 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ -86,7 +86,14 @@ async def check_datasets_are_available_on_hf(tasks): def test_dataset_availability(): """Checks if the datasets are available on Hugging Face using both their name and revision.""" tasks = MTEB().tasks_cls - tasks = [t for t in tasks if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING] + tasks = [ + t + for t in tasks + if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING + if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING + and t.metadata.name + != "AfriSentiLangClassification" # HOTFIX: Issue#1777. Remove this line when issue is resolved. + ] asyncio.run(check_datasets_are_available_on_hf(tasks))