From a168496735329fbf9d3044b9bd2ae3e01046decc Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 17:06:40 +0100 Subject: [PATCH 01/21] fix: Leaderboard: `K` instead of `M` Fixes #1752 --- mteb/models/stella_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mteb/models/stella_models.py b/mteb/models/stella_models.py index 153ee6aa99..a738f4461e 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -21,7 +21,7 @@ use_instructions=True, revision="1bb50bc7bb726810eac2140e62155b88b0df198f", release_date="2024-07-12", - n_parameters=435_000, + n_parameters=435_000_000, max_tokens=8192, embed_dim=4096, license="mit", @@ -45,7 +45,7 @@ use_instructions=True, revision="d03be74b361d4eb24f42a2fe5bd2e29917df4604", release_date="2024-07-12", - n_parameters=1_540_000, + n_parameters=1_540_000_000, max_tokens=131072, embed_dim=8960, license="mit", From e61d7f2086c889ada85ab202d1672561e266675c Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 17:57:25 +0100 Subject: [PATCH 02/21] format --- mteb/leaderboard/app.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index ba336f8ea5..d1383cf1a7 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -5,7 +5,6 @@ import logging import tempfile import time -from collections import defaultdict from pathlib import Path from urllib.parse import urlencode @@ -17,7 +16,6 @@ from mteb.caching import json_cache from mteb.leaderboard.figures import performance_size_plot, radar_chart from mteb.leaderboard.table import scores_to_tables -from mteb.models.overview import get_model_meta logger = logging.getLogger(__name__) @@ -143,28 +141,28 @@ def update_task_info(task_names: str) -> gr.DataFrame: ) lang_select = gr.Dropdown( all_results.languages, - value=list(sorted(default_results.languages)), + value=sorted(default_results.languages), 
multiselect=True, label="Language", info="Select languages to include.", ) type_select = gr.Dropdown( all_results.task_types, - value=list(sorted(default_results.task_types)), + value=sorted(default_results.task_types), multiselect=True, label="Task Type", info="Select task types to include.", ) domain_select = gr.Dropdown( all_results.domains, - value=list(sorted(default_results.domains)), + value=sorted(default_results.domains), multiselect=True, label="Domain", info="Select domains to include.", ) task_select = gr.Dropdown( all_results.task_names, - value=list(sorted(default_results.task_names)), + value=sorted(default_results.task_names), allow_custom_value=True, multiselect=True, label="Task", @@ -330,16 +328,16 @@ def on_benchmark_select(benchmark_name): benchmark = mteb.get_benchmark(benchmark_name) languages = [task.languages for task in benchmark.tasks if task.languages] languages = set(itertools.chain.from_iterable(languages)) - languages = list(sorted(languages)) + languages = sorted(languages) domains = [ task.metadata.domains for task in benchmark.tasks if task.metadata.domains ] domains = set(itertools.chain.from_iterable(domains)) types = {task.metadata.type for task in benchmark.tasks if task.metadata.type} languages, domains, types = ( - list(sorted(languages)), - list(sorted(domains)), - list(sorted(types)), + sorted(languages), + sorted(domains), + sorted(types), ) elapsed = time.time() - start_time benchmark_results = all_benchmark_results[benchmark_name] From e1b89e30900eed9002687141122d0b9178cfef69 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 17:57:57 +0100 Subject: [PATCH 03/21] fixed existing annotations to refer to task name instead of hf dataset --- mteb/models/misc_models.py | 73 +++++---- mteb/models/sentence_transformers_models.py | 166 ++++++++++++-------- 2 files changed, 140 insertions(+), 99 deletions(-) diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index d05461af17..d5734b448c 
100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -113,38 +113,47 @@ similarity_fn_name="cosine", use_instructions=None, training_datasets={ - "s2orc": ["train"], - "flax-sentence-embeddings/stackexchange_title_body_jsonl": ["train"], - "flax-sentence-embeddings/stackexchange_titlebody_best_voted_answer_jsonl": [ - "train" - ], - "flax-sentence-embeddings/stackexchange_title_best_voted_answer_jsonl": [ - "train" - ], - "flax-sentence-embeddings/stackexchange_titlebody_best_and_down_voted_answer_jsonl": [ - "train" - ], - "sentence-transformers/reddit-title-body": ["train"], - "msmarco": ["train"], - "gooaq": ["train"], - "yahoo_answers_topics": ["train"], - "code_search_net": ["train"], - "search_qa": ["train"], - "eli5": ["train"], - "snli": ["train"], - "multi_nli": ["train"], - "wikihow": ["train"], - "natural_questions": ["train"], - "trivia_qa": ["train"], - "embedding-data/sentence-compression": ["train"], - "embedding-data/flickr30k-captions": ["train"], - "embedding-data/altlex": ["train"], - "embedding-data/simple-wiki": ["train"], - "embedding-data/QQP": ["train"], - "embedding-data/SPECTER": ["train"], - "embedding-data/PAQ_pairs": ["train"], - "embedding-data/WikiAnswers": ["train"], - "sentence-transformers/embedding-training-data": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB + # "s2orc": ["train"], + # "flax-sentence-embeddings/stackexchange_title_body_jsonl": ["train"], + # "flax-sentence-embeddings/stackexchange_titlebody_best_voted_answer_jsonl": [ + # "train" + # ], + # "flax-sentence-embeddings/stackexchange_title_best_voted_answer_jsonl": [ + # "train" + # ], + # "flax-sentence-embeddings/stackexchange_titlebody_best_and_down_voted_answer_jsonl": [ + # 
"train" + # ], + # "sentence-transformers/reddit-title-body": ["train"], + # "msmarco": ["train"], + # "gooaq": ["train"], + # "yahoo_answers_topics": ["train"], + # "code_search_net": ["train"], + # "search_qa": ["train"], + # "eli5": ["train"], + # "snli": ["train"], + # "multi_nli": ["train"], + # "wikihow": ["train"], + # "natural_questions": ["train"], + # "trivia_qa": ["train"], + # "embedding-data/sentence-compression": ["train"], + # "embedding-data/flickr30k-captions": ["train"], + # "embedding-data/altlex": ["train"], + # "embedding-data/simple-wiki": ["train"], + # "embedding-data/QQP": ["train"], + # "embedding-data/SPECTER": ["train"], + # "embedding-data/PAQ_pairs": ["train"], + # "embedding-data/WikiAnswers": ["train"], + # "sentence-transformers/embedding-training-data": ["train"], }, adapted_from="hum-lodestone-v1", superseded_by=None, diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index ea02508c36..557f4f9a89 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -78,27 +78,36 @@ superseded_by=None, adapted_from=None, training_datasets={ - "s2orc": ["train"], - "flax-sentence-embeddings/stackexchange_xml": ["train"], - "ms_marco": ["train"], - "gooaq": ["train"], - "yahoo_answers_topics": ["train"], - "code_search_net": ["train"], - "search_qa": ["train"], - "eli5": ["train"], - "snli": ["train"], - "multi_nli": ["train"], - "wikihow": ["train"], - "natural_questions": ["train"], - "trivia_qa": ["train"], - "embedding-data/sentence-compression": ["train"], - "embedding-data/flickr30k-captions": ["train"], - "embedding-data/altlex": ["train"], - "embedding-data/simple-wiki": ["train"], - "embedding-data/QQP": ["train"], - "embedding-data/SPECTER": ["train"], - "embedding-data/PAQ_pairs": ["train"], - "embedding-data/WikiAnswers": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + 
"MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB + # "s2orc": ["train"], + # "flax-sentence-embeddings/stackexchange_xml": ["train"], + # "ms_marco": ["train"], + # "gooaq": ["train"], + # "yahoo_answers_topics": ["train"], + # "code_search_net": ["train"], + # "search_qa": ["train"], + # "eli5": ["train"], + # "snli": ["train"], + # "multi_nli": ["train"], + # "wikihow": ["train"], + # "natural_questions": ["train"], + # "trivia_qa": ["train"], + # "embedding-data/sentence-compression": ["train"], + # "embedding-data/flickr30k-captions": ["train"], + # "embedding-data/altlex": ["train"], + # "embedding-data/simple-wiki": ["train"], + # "embedding-data/QQP": ["train"], + # "embedding-data/SPECTER": ["train"], + # "embedding-data/PAQ_pairs": ["train"], + # "embedding-data/WikiAnswers": ["train"], }, ) @@ -196,27 +205,36 @@ superseded_by=None, adapted_from=None, training_datasets={ - "s2orc": ["train"], - "flax-sentence-embeddings/stackexchange_xml": ["train"], - "ms_marco": ["train"], - "gooaq": ["train"], - "yahoo_answers_topics": ["train"], - "code_search_net": ["train"], - "search_qa": ["train"], - "eli5": ["train"], - "snli": ["train"], - "multi_nli": ["train"], - "wikihow": ["train"], - "natural_questions": ["train"], - "trivia_qa": ["train"], - "embedding-data/sentence-compression": ["train"], - "embedding-data/flickr30k-captions": ["train"], - "embedding-data/altlex": ["train"], - "embedding-data/simple-wiki": ["train"], - "embedding-data/QQP": ["train"], - "embedding-data/SPECTER": ["train"], - "embedding-data/PAQ_pairs": ["train"], - "embedding-data/WikiAnswers": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": 
["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB + # "s2orc": ["train"], + # "flax-sentence-embeddings/stackexchange_xml": ["train"], + # "ms_marco": ["train"], + # "gooaq": ["train"], + # "yahoo_answers_topics": ["train"], + # "code_search_net": ["train"], + # "search_qa": ["train"], + # "eli5": ["train"], + # "snli": ["train"], + # "multi_nli": ["train"], + # "wikihow": ["train"], + # "natural_questions": ["train"], + # "trivia_qa": ["train"], + # "embedding-data/sentence-compression": ["train"], + # "embedding-data/flickr30k-captions": ["train"], + # "embedding-data/altlex": ["train"], + # "embedding-data/simple-wiki": ["train"], + # "embedding-data/QQP": ["train"], + # "embedding-data/SPECTER": ["train"], + # "embedding-data/PAQ_pairs": ["train"], + # "embedding-data/WikiAnswers": ["train"], }, ) @@ -319,27 +337,36 @@ superseded_by=None, adapted_from=None, training_datasets={ - "s2orc": ["train"], - "flax-sentence-embeddings/stackexchange_xml": ["train"], - "ms_marco": ["train"], - "gooaq": ["train"], - "yahoo_answers_topics": ["train"], - "code_search_net": ["train"], - "search_qa": ["train"], - "eli5": ["train"], - "snli": ["train"], - "multi_nli": ["train"], - "wikihow": ["train"], - "natural_questions": ["train"], - "trivia_qa": ["train"], - "embedding-data/sentence-compression": ["train"], - "embedding-data/flickr30k-captions": ["train"], - "embedding-data/altlex": ["train"], - "embedding-data/simple-wiki": ["train"], - "embedding-data/QQP": ["train"], - "embedding-data/SPECTER": ["train"], - "embedding-data/PAQ_pairs": ["train"], - "embedding-data/WikiAnswers": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB + # "s2orc": ["train"], + # 
"flax-sentence-embeddings/stackexchange_xml": ["train"], + # "ms_marco": ["train"], + # "gooaq": ["train"], + # "yahoo_answers_topics": ["train"], + # "code_search_net": ["train"], + # "search_qa": ["train"], + # "eli5": ["train"], + # "snli": ["train"], + # "multi_nli": ["train"], + # "wikihow": ["train"], + # "natural_questions": ["train"], + # "trivia_qa": ["train"], + # "embedding-data/sentence-compression": ["train"], + # "embedding-data/flickr30k-captions": ["train"], + # "embedding-data/altlex": ["train"], + # "embedding-data/simple-wiki": ["train"], + # "embedding-data/QQP": ["train"], + # "embedding-data/SPECTER": ["train"], + # "embedding-data/PAQ_pairs": ["train"], + # "embedding-data/WikiAnswers": ["train"], }, ) @@ -361,9 +388,14 @@ superseded_by=None, adapted_from=None, training_datasets={ - "sentence-transformers/all-nli": ["train"], - "sentence-transformers/stsb": ["train"], - "sentence-transformers/quora-duplicates": ["train"], - "sentence-transformers/natural-questions": ["train"], + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB + # "sentence-transformers/all-nli": ["train"], + # "sentence-transformers/stsb": ["train"], + # "sentence-transformers/quora-duplicates": ["train"], + # "sentence-transformers/natural-questions": ["train"], }, ) From 9ffeae46885944cde69356672d2eb184afaeb491 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 17:58:05 +0100 Subject: [PATCH 04/21] added annotation to nvidia --- mteb/models/nvidia_models.py | 54 ++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/mteb/models/nvidia_models.py b/mteb/models/nvidia_models.py index 72274b41de..2af1be27ce 100644 --- a/mteb/models/nvidia_models.py +++ b/mteb/models/nvidia_models.py @@ -72,6 +72,54 @@ def encode( return embeddings +training_datasets = { + # source: https://arxiv.org/pdf/2405.17428 + "ArguAna": ["train"], + 
"ArguAna-PL": ["train"], + "NanoArguAnaRetrieval": ["train"], + "HotpotQA": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "NanoFEVERRetrieval": ["train"], + "FiQA2018": ["train"], + "FiQA2018-PL": ["train"], # translation not trained on + "STS12": ["train"], + "STS22": ["train"], + "AmazonReviewsClassification": ["train"], + "AmazonCounterfactualClassification": ["train"], + "Banking77Classification": ["train"], + "EmotionClassification": ["train"], + "ImdbClassification": ["train"], + "MTOPIntentClassification": ["train"], + "ToxicConversationsClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "ArxivClusteringP2P": ["train"], + "ArxivClusteringP2P.v2": ["train"], + "ArxivClusteringS2S": ["train"], + "ArxivClusteringS2S.v2": ["train"], + "BiorxivClusteringP2P": ["train"], + "BiorxivClusteringP2P.v2": ["train"], + "BiorxivClusteringS2S": ["train"], + "BiorxivClusteringS2S.v2": ["train"], + "MedrxivClusteringP2P": ["train"], + "MedrxivClusteringP2P.v2": ["train"], + "MedrxivClusteringS2S": ["train"], + "MedrxivClusteringS2S.v2": ["train"], + "TwentyNewsgroupsClustering": ["train"], + "TwentyNewsgroupsClustering.v2": ["train"], + "STSBenchmark": ["train"], + "STSBenchmarkMultilingualSTS": ["train"], # translated, not trained on +} NV_embed_v2 = ModelMeta( loader=partial( # type: ignore NvEmbedWrapper, @@ -92,6 +140,9 @@ def encode( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + training_datasets=training_datasets, + public_training_code=None, + public_training_data=True, ) 
NV_embed_v1 = ModelMeta( @@ -114,4 +165,7 @@ def encode( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + training_datasets=training_datasets, + public_training_code=None, + public_training_data=True, ) From 0495d323d5eb133174935d766a3fbac81e798799 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 18:00:50 +0100 Subject: [PATCH 05/21] added voyage --- mteb/models/voyage_models.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index 70f61e2c52..12925b235b 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -157,6 +157,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_finance_2 = ModelMeta( @@ -179,6 +182,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_law_2 = ModelMeta( @@ -201,6 +207,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_code_2 = ModelMeta( @@ -223,6 +232,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_large_2 = ModelMeta( @@ -245,6 +257,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_2 = ModelMeta( @@ -267,6 +282,9 @@ def _batched_encode( similarity_fn_name="cosine", 
framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, + public_training_code=False, ) voyage_multilingual_2 = ModelMeta( name="voyageai/voyage-multilingual-2", @@ -288,6 +306,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_3 = ModelMeta( @@ -310,6 +331,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_3_lite = ModelMeta( @@ -332,4 +356,7 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) From 5f7ef656a41d2cac755f52f2c66a93e8a8593525 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 18:05:09 +0100 Subject: [PATCH 06/21] added uae annotations --- mteb/models/uae_models.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mteb/models/uae_models.py b/mteb/models/uae_models.py index 5c47cba67d..ffdaa29f74 100644 --- a/mteb/models/uae_models.py +++ b/mteb/models/uae_models.py @@ -75,4 +75,13 @@ def encode( framework=["Sentence Transformers", "PyTorch"], reference="https://huggingface.co/WhereIsAI/UAE-Large-V1", use_instructions=True, + training_datasets={ + # source: https://arxiv.org/pdf/2309.12871 + # not in MTEB + "MNLI": [], + "NLI": [], + "SNLI": [], + }, + public_training_data=True, + public_training_code=True, ) From ac480127b8fef3464113a351cb63397a7383237c Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 18:07:13 +0100 Subject: [PATCH 07/21] Added stella annotations --- mteb/models/sentence_transformers_models.py | 3 +++ mteb/models/stella_models.py | 6 ++++++ 2 files changed, 9 insertions(+) diff --git 
a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index 557f4f9a89..05ce11d8a7 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -128,6 +128,9 @@ use_instructions=False, superseded_by=None, adapted_from=None, + training_datasets=None, + public_training_data=False, # currently not release + public_training_code=False, ) paraphrase_multilingual_mpnet_base_v2 = ModelMeta( diff --git a/mteb/models/stella_models.py b/mteb/models/stella_models.py index a738f4461e..8709196319 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -28,6 +28,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_400M_v5", + training_datasets=None, + public_training_data=False, # currently not released + public_training_code=False, ) stella_en_1_5b = ModelMeta( @@ -52,4 +55,7 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_1.5B_v5", + training_datasets=None, + public_training_data=False, # currently not released + public_training_code=False, ) From c1c7eb6fe95f25d67654b938b06707e60b829ae8 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 18:22:09 +0100 Subject: [PATCH 08/21] sentence trf models --- mteb/models/sentence_transformers_models.py | 204 ++++++++------------ 1 file changed, 82 insertions(+), 122 deletions(-) diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index 05ce11d8a7..7878d6ac6c 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -60,6 +60,40 @@ "zho_Hant", ] +sent_trf_training_dataset = { + # derived from datasheets + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": 
["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB + # "s2orc": ["train"], + # "flax-sentence-embeddings/stackexchange_xml": ["train"], + # "ms_marco": ["train"], + # "gooaq": ["train"], + # "yahoo_answers_topics": ["train"], + # "code_search_net": ["train"], + # "search_qa": ["train"], + # "eli5": ["train"], + # "snli": ["train"], + # "multi_nli": ["train"], + # "wikihow": ["train"], + # "natural_questions": ["train"], + # "trivia_qa": ["train"], + # "embedding-data/sentence-compression": ["train"], + # "embedding-data/flickr30k-captions": ["train"], + # "embedding-data/altlex": ["train"], + # "embedding-data/simple-wiki": ["train"], + # "embedding-data/QQP": ["train"], + # "embedding-data/SPECTER": ["train"], + # "embedding-data/PAQ_pairs": ["train"], + # "embedding-data/WikiAnswers": ["train"], +} + all_MiniLM_L6_v2 = ModelMeta( name="sentence-transformers/all-MiniLM-L6-v2", languages=["eng-Latn"], @@ -77,38 +111,29 @@ use_instructions=False, superseded_by=None, adapted_from=None, - training_datasets={ - "MSMARCO": ["train"], - "MSMARCOHardNegatives": ["train"], - "NanoMSMARCORetrieval": ["train"], - "MSMARCO-PL": ["train"], # translation not trained on - "NQ": ["train"], - "NQHardNegatives": ["train"], - "NanoNQRetrieval": ["train"], - "NQ-PL": ["train"], # translation not trained on - # not in MTEB - # "s2orc": ["train"], - # "flax-sentence-embeddings/stackexchange_xml": ["train"], - # "ms_marco": ["train"], - # "gooaq": ["train"], - # "yahoo_answers_topics": ["train"], - # "code_search_net": ["train"], - # "search_qa": ["train"], - # "eli5": ["train"], - # "snli": ["train"], - # "multi_nli": ["train"], - # "wikihow": ["train"], - # "natural_questions": ["train"], - # "trivia_qa": ["train"], - # "embedding-data/sentence-compression": ["train"], - # "embedding-data/flickr30k-captions": ["train"], - # "embedding-data/altlex": 
["train"], - # "embedding-data/simple-wiki": ["train"], - # "embedding-data/QQP": ["train"], - # "embedding-data/SPECTER": ["train"], - # "embedding-data/PAQ_pairs": ["train"], - # "embedding-data/WikiAnswers": ["train"], - }, + training_datasets=sent_trf_training_dataset, +) + +all_MiniLM_L12_v2 = ModelMeta( + name="sentence-transformers/all-MiniLM-L12-v2", + languages=["eng-Latn"], + open_weights=True, + revision="364dd28d28dcd3359b537f3cf1f5348ba679da62", + release_date="2021-08-30", + n_parameters=33_400_000, + memory_usage=None, + embed_dim=384, + license="apache-2.0", + max_tokens=256, + reference="https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + training_datasets=sent_trf_training_dataset, + public_training_code=False, + public_training_data=True, ) paraphrase_multilingual_MiniLM_L12_v2 = ModelMeta( @@ -128,9 +153,9 @@ use_instructions=False, superseded_by=None, adapted_from=None, - training_datasets=None, - public_training_data=False, # currently not release + training_datasets=sent_trf_training_dataset, # assumed (probably some parallel as well) public_training_code=False, + public_training_data=True, ) paraphrase_multilingual_mpnet_base_v2 = ModelMeta( @@ -150,6 +175,9 @@ use_instructions=False, superseded_by=None, adapted_from=None, + training_datasets=sent_trf_training_dataset, # assumed (probably some parallel as well) + public_training_code=False, + public_training_data=True, ) labse = ModelMeta( @@ -169,6 +197,9 @@ use_instructions=False, superseded_by=None, adapted_from=None, + training_datasets=None, # scraped and mined webdata including CC, wiki, see section 3.1 https://aclanthology.org/2022.acl-long.62.pdf + public_training_code=True, # https://www.kaggle.com/models/google/labse/tensorFlow2/labse/2?tfhub-redirect=true + public_training_data=False, ) multi_qa_MiniLM_L6_cos_v1 = 
ModelMeta( @@ -187,7 +218,10 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, superseded_by=None, - adapted_from=None, + adapted_from="nreimers/MiniLM-L6-H384-uncased", + training_datasets=sent_trf_training_dataset, # assumed + public_training_code=None, + public_training_data=None, ) all_mpnet_base_v2 = ModelMeta( @@ -207,38 +241,7 @@ use_instructions=False, superseded_by=None, adapted_from=None, - training_datasets={ - "MSMARCO": ["train"], - "MSMARCOHardNegatives": ["train"], - "NanoMSMARCORetrieval": ["train"], - "MSMARCO-PL": ["train"], # translation not trained on - "NQ": ["train"], - "NQHardNegatives": ["train"], - "NanoNQRetrieval": ["train"], - "NQ-PL": ["train"], # translation not trained on - # not in MTEB - # "s2orc": ["train"], - # "flax-sentence-embeddings/stackexchange_xml": ["train"], - # "ms_marco": ["train"], - # "gooaq": ["train"], - # "yahoo_answers_topics": ["train"], - # "code_search_net": ["train"], - # "search_qa": ["train"], - # "eli5": ["train"], - # "snli": ["train"], - # "multi_nli": ["train"], - # "wikihow": ["train"], - # "natural_questions": ["train"], - # "trivia_qa": ["train"], - # "embedding-data/sentence-compression": ["train"], - # "embedding-data/flickr30k-captions": ["train"], - # "embedding-data/altlex": ["train"], - # "embedding-data/simple-wiki": ["train"], - # "embedding-data/QQP": ["train"], - # "embedding-data/SPECTER": ["train"], - # "embedding-data/PAQ_pairs": ["train"], - # "embedding-data/WikiAnswers": ["train"], - }, + training_datasets=sent_trf_training_dataset, ) jina_embeddings_v2_base_en = ModelMeta( @@ -258,7 +261,9 @@ use_instructions=False, superseded_by=None, adapted_from=None, - training_datasets={"allenai/c4": ["train"]}, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. 
CC ) jina_embeddings_v2_small_en = ModelMeta( @@ -278,7 +283,9 @@ use_instructions=False, superseded_by=None, adapted_from=None, - training_datasets={"jinaai/negation-dataset": ["train"]}, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. CC and {"jinaai/negation-dataset": ["train"]} ) jina_embedding_b_en_v1 = ModelMeta( @@ -298,7 +305,9 @@ use_instructions=False, superseded_by="jinaai/jina-embeddings-v2-base-en", adapted_from=None, - training_datasets={"jinaai/negation-dataset": ["train"]}, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. CC and {"jinaai/negation-dataset": ["train"]} ) jina_embedding_s_en_v1 = ModelMeta( @@ -318,61 +327,12 @@ use_instructions=False, superseded_by="jinaai/jina-embeddings-v2-small-en", adapted_from=None, - training_datasets={"jinaai/negation-dataset": ["train"]}, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. 
CC and {"jinaai/negation-dataset": ["train"]} ) -all_MiniLM_L12_v2 = ModelMeta( - name="sentence-transformers/all-MiniLM-L12-v2", - languages=["eng-Latn"], - open_weights=True, - revision="364dd28d28dcd3359b537f3cf1f5348ba679da62", - release_date="2021-08-30", - n_parameters=33_400_000, - memory_usage=None, - embed_dim=384, - license="apache-2.0", - max_tokens=256, - reference="https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from=None, - training_datasets={ - "MSMARCO": ["train"], - "MSMARCOHardNegatives": ["train"], - "NanoMSMARCORetrieval": ["train"], - "MSMARCO-PL": ["train"], # translation not trained on - "NQ": ["train"], - "NQHardNegatives": ["train"], - "NanoNQRetrieval": ["train"], - "NQ-PL": ["train"], # translation not trained on - # not in MTEB - # "s2orc": ["train"], - # "flax-sentence-embeddings/stackexchange_xml": ["train"], - # "ms_marco": ["train"], - # "gooaq": ["train"], - # "yahoo_answers_topics": ["train"], - # "code_search_net": ["train"], - # "search_qa": ["train"], - # "eli5": ["train"], - # "snli": ["train"], - # "multi_nli": ["train"], - # "wikihow": ["train"], - # "natural_questions": ["train"], - # "trivia_qa": ["train"], - # "embedding-data/sentence-compression": ["train"], - # "embedding-data/flickr30k-captions": ["train"], - # "embedding-data/altlex": ["train"], - # "embedding-data/simple-wiki": ["train"], - # "embedding-data/QQP": ["train"], - # "embedding-data/SPECTER": ["train"], - # "embedding-data/PAQ_pairs": ["train"], - # "embedding-data/WikiAnswers": ["train"], - }, -) - microllama_text_embedding = ModelMeta( name="keeeeenw/MicroLlama-text-embedding", languages=["eng-Latn"], From 4ec91216393ac9fafceffdeae15f4dbcf9b0d807 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 18:28:18 +0100 Subject: [PATCH 09/21] added salesforce and e5 --- 
mteb/models/e5_instruct.py | 8 ++++++- mteb/models/e5_models.py | 40 ++++++++++++++++++++++++++++++++ mteb/models/salesforce_models.py | 25 ++++++++++++++++++++ 3 files changed, 72 insertions(+), 1 deletion(-) diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py index f26d78ed6d..182a6ea4b2 100644 --- a/mteb/models/e5_instruct.py +++ b/mteb/models/e5_instruct.py @@ -6,7 +6,7 @@ from mteb.model_meta import ModelMeta -from .e5_models import E5_PAPER_RELEASE_DATE, XLMR_LANGUAGES +from .e5_models import E5_PAPER_RELEASE_DATE, E5_TRAINING_DATA, XLMR_LANGUAGES from .instruct_wrapper import instruct_wrapper MISTRAL_LANGUAGES = ["eng_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "spa_Latn"] @@ -40,6 +40,9 @@ embed_dim=1024, license="mit", max_tokens=514, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_mistral = ModelMeta( @@ -69,4 +72,7 @@ embed_dim=4096, license="mit", max_tokens=32768, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py index 4fee54de79..9537824e59 100644 --- a/mteb/models/e5_models.py +++ b/mteb/models/e5_models.py @@ -113,6 +113,19 @@ PromptType.passage.value: "passage: ", } +E5_TRAINING_DATA = { + # from 4.2 in https://arxiv.org/pdf/2212.03533 + # also pre-training data from a variety of sources (stackexchange, semantic scholar, reddit, CC, ...) 
+ "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on +} + e5_mult_small = ModelMeta( loader=partial( # type: ignore sentence_transformers_loader, @@ -134,6 +147,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_mult_base = ModelMeta( @@ -156,6 +172,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_mult_large = ModelMeta( @@ -179,6 +198,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_eng_small_v2 = ModelMeta( @@ -201,6 +223,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_eng_small = ModelMeta( @@ -224,6 +249,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_eng_base_v2 = ModelMeta( @@ -249,6 +277,9 @@ use_instructions=True, superseded_by=None, adapted_from=None, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_eng_large_v2 = ModelMeta( @@ -274,6 +305,9 @@ use_instructions=True, superseded_by=None, adapted_from=None, + public_training_data=False, + public_training_code=False, + 
training_datasets=E5_TRAINING_DATA, ) e5_large = ModelMeta( @@ -299,6 +333,9 @@ use_instructions=True, superseded_by="intfloat/e5-large-v2", adapted_from=None, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_base = ModelMeta( @@ -324,4 +361,7 @@ use_instructions=True, superseded_by="intfloat/e5-base-v2", adapted_from=None, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) diff --git a/mteb/models/salesforce_models.py b/mteb/models/salesforce_models.py index b1d45b949c..18db09a2b5 100644 --- a/mteb/models/salesforce_models.py +++ b/mteb/models/salesforce_models.py @@ -40,6 +40,19 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + adapted_from="intfloat/e5-mistral-7b-instruct", + public_training_code=False, + public_training_data=False, + training_datasets={ # inherits from e5 + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + }, ) @@ -68,4 +81,16 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_code=False, + public_training_data=False, + training_datasets={ # inherits from e5 + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + }, ) From c54859d72454dbe41a74d920ad09193337cfa258 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 18:29:30 +0100 Subject: [PATCH 10/21] jina --- mteb/models/jina_models.py 
| 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py index 122f190657..f1a05d210d 100644 --- a/mteb/models/jina_models.py +++ b/mteb/models/jina_models.py @@ -214,7 +214,7 @@ def encode( open_weights=True, revision="215a6e121fa0183376388ac6b1ae230326bfeaed", release_date="2024-09-18", # official release date - n_parameters=572 * 1e6, + n_parameters=int(572 * 1e6), max_tokens=8194, embed_dim=4096, license="cc-by-nc-4.0", @@ -222,4 +222,7 @@ def encode( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, reference="https://huggingface.co/jinaai/jina-embeddings-v3", + training_datasets=None, + public_training_code=False, + public_training_data=False, ) From d7f5684ac29c86877942c529ca70cdd5237baa48 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 18:36:42 +0100 Subject: [PATCH 11/21] bge + model2vec --- mteb/models/bge_models.py | 9 +++++++++ mteb/models/model2vec_models.py | 23 ++++++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index cc183374c6..734f71262d 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -27,6 +27,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + training_datasets=None, # https://github.com/staoxiao/RetroMAE, includes wikipedia and bookcorpus and contrastive pairs (unknown) + public_training_code=False, + public_training_data=False, ) bge_base_en_v1_5 = ModelMeta( @@ -50,6 +53,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + training_datasets=None, # https://github.com/staoxiao/RetroMAE, includes wikipedia and bookcorpus and contrastive pairs (unknown) + public_training_code=False, + public_training_data=False, ) bge_large_en_v1_5 = ModelMeta( @@ -73,4 +79,7 @@ similarity_fn_name="cosine", framework=["Sentence 
Transformers", "PyTorch"], use_instructions=True, + training_datasets=None, # https://github.com/staoxiao/RetroMAE, includes wikipedia and bookcorpus and contrastive pairs (unknown) + public_training_code=False, + public_training_data=False, ) diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index 1541d3ca3d..55968f104c 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -63,7 +63,7 @@ def encode( open_weights=True, revision="5f4f5ca159b7321a8b39739bba0794fa0debddf4", release_date="2024-09-21", - n_parameters=103 * 1e6, + n_parameters=int(103 * 1e6), max_tokens=np.inf, # Theoretically infinite embed_dim=256, license="mit", @@ -73,6 +73,9 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, + training_datasets=None, # source is unkown + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) @@ -96,6 +99,9 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, + training_datasets=None, # source is unkown + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) m2v_base_output = ModelMeta( @@ -118,6 +124,9 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, + training_datasets=None, # source is unkown + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) m2v_multilingual_output = ModelMeta( @@ -140,6 +149,9 @@ def encode( use_instructions=False, adapted_from="sentence-transformers/LaBSE", superseded_by=None, + training_datasets=None, # source is unkown + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) potion_base_2m = ModelMeta( @@ -162,6 +174,9 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, + training_datasets=None, # source is unkown + 
public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) potion_base_4m = ModelMeta( @@ -184,6 +199,9 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, + training_datasets=None, # source is unkown + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) potion_base_8m = ModelMeta( @@ -206,4 +224,7 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, + training_datasets=None, # source is unkown + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) From 9ea60ff5c1718592b1e364264ffe59f7ba8f2c6f Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 18:45:52 +0100 Subject: [PATCH 12/21] added llm2vec annotations --- mteb/models/llm2vec_models.py | 49 +++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/mteb/models/llm2vec_models.py b/mteb/models/llm2vec_models.py index e962289aac..cbc42fe5ed 100644 --- a/mteb/models/llm2vec_models.py +++ b/mteb/models/llm2vec_models.py @@ -20,6 +20,31 @@ def llm2vec_instruction(instruction): return instruction +llm2vec_supervised_training_data = { + # source, section g1: https://arxiv.org/pdf/2404.05961 + # splits assumed but unkown + "HotpotQA": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "NanoFEVERRetrieval": ["train"], + "MrTidyRetrieval": ["train"], + 
"T2Reranking": ["train"], +} + + class LLM2VecWrapper(Wrapper): def __init__( self, @@ -100,6 +125,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets=llm2vec_supervised_training_data, ) llm2vec_llama3_8b_unsupervised = ModelMeta( @@ -124,6 +152,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets={}, ) @@ -149,6 +180,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets=llm2vec_supervised_training_data, ) llm2vec_mistral7b_unsupervised = ModelMeta( @@ -173,6 +207,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets={}, ) llm2vec_llama2_7b_supervised = ModelMeta( @@ -197,6 +234,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets=llm2vec_supervised_training_data, ) llm2vec_llama2_7b_unsupervised = ModelMeta( @@ -221,6 +261,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets={}, ) llm2vec_sheared_llama_supervised = ModelMeta( @@ -245,6 +288,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + 
public_training_data=True, + training_datasets=llm2vec_supervised_training_data, ) llm2vec_sheared_llama_unsupervised = ModelMeta( @@ -269,4 +315,7 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets={}, ) From b123d9203047fb043e46b5ac14f2bc43438ccce6 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 18:47:59 +0100 Subject: [PATCH 13/21] add jasper --- mteb/models/jasper_models.py | 8 +++++--- mteb/models/nvidia_models.py | 6 +++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/mteb/models/jasper_models.py b/mteb/models/jasper_models.py index 60fa4f6975..bb9533c155 100644 --- a/mteb/models/jasper_models.py +++ b/mteb/models/jasper_models.py @@ -16,6 +16,7 @@ from .wrapper import Wrapper logger = logging.getLogger(__name__) +from .nvidia_models import nvidia_training_datasets class JasperWrapper(Wrapper): @@ -90,7 +91,8 @@ def encode( use_instructions=True, adapted_from=None, superseded_by=None, - training_datasets={ - "non_mteb": ["BAAI/Infinity-MM", "HuggingFaceFW/fineweb-edu"], - }, + training_datasets=nvidia_training_datasets, # "In jasper model the teacher model is nvidia/NV-Embed-v2", source https://huggingface.co/infgrad/jasper_en_vision_language_v1 + # "non_mteb": ["BAAI/Infinity-MM", "HuggingFaceFW/fineweb-edu"], + public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/nvidia_models.py b/mteb/models/nvidia_models.py index 2af1be27ce..6bf4e041aa 100644 --- a/mteb/models/nvidia_models.py +++ b/mteb/models/nvidia_models.py @@ -72,7 +72,7 @@ def encode( return embeddings -training_datasets = { +nvidia_training_datasets = { # source: https://arxiv.org/pdf/2405.17428 "ArguAna": ["train"], "ArguAna-PL": ["train"], @@ -140,7 +140,7 @@ def encode( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], 
use_instructions=True, - training_datasets=training_datasets, + training_datasets=nvidia_training_datasets, public_training_code=None, public_training_data=True, ) @@ -165,7 +165,7 @@ def encode( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - training_datasets=training_datasets, + training_datasets=nvidia_training_datasets, public_training_code=None, public_training_data=True, ) From aa728d1ab858003b5430cdf2de4a6f3e1ea95b7a Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 11 Jan 2025 18:49:02 +0100 Subject: [PATCH 14/21] format --- mteb/models/jasper_models.py | 2 +- mteb/models/stella_models.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mteb/models/jasper_models.py b/mteb/models/jasper_models.py index bb9533c155..0062df2acc 100644 --- a/mteb/models/jasper_models.py +++ b/mteb/models/jasper_models.py @@ -13,10 +13,10 @@ from mteb.encoder_interface import PromptType from mteb.model_meta import ModelMeta +from .nvidia_models import nvidia_training_datasets from .wrapper import Wrapper logger = logging.getLogger(__name__) -from .nvidia_models import nvidia_training_datasets class JasperWrapper(Wrapper): diff --git a/mteb/models/stella_models.py b/mteb/models/stella_models.py index 8709196319..c7a1a0f347 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -28,7 +28,7 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_400M_v5", - training_datasets=None, + training_datasets=None, public_training_data=False, # currently not released public_training_code=False, ) @@ -55,7 +55,7 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_1.5B_v5", - training_datasets=None, + training_datasets=None, public_training_data=False, # currently not released public_training_code=False, ) 
From 121bf0ea25e0f92b29c17a4a1ec1996698ca16d8 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sun, 12 Jan 2025 21:10:30 +0100 Subject: [PATCH 15/21] format --- mteb/models/model2vec_models.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index 8f1621fcb2..8430b8aa58 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -9,11 +9,11 @@ from mteb.model_meta import ModelMeta +from .bge_models import bge_training_data from .wrapper import Wrapper logger = logging.getLogger(__name__) -from .bge_models import bge_training_data class Model2VecWrapper(Wrapper): def __init__( @@ -74,7 +74,7 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - training_datasets=bge_training_data, # distilled + training_datasets=bge_training_data, # distilled public_training_code=True, # https://github.com/MinishLab/model2vec public_training_data=False, ) @@ -100,7 +100,7 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - training_datasets=bge_training_data, # distilled + training_datasets=bge_training_data, # distilled public_training_code=True, # https://github.com/MinishLab/model2vec public_training_data=False, ) @@ -125,7 +125,7 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - training_datasets=bge_training_data, # distilled + training_datasets=bge_training_data, # distilled public_training_code=True, # https://github.com/MinishLab/model2vec public_training_data=False, ) @@ -150,7 +150,7 @@ def encode( use_instructions=False, adapted_from="sentence-transformers/LaBSE", superseded_by=None, - training_datasets=bge_training_data, # distilled + training_datasets=bge_training_data, # distilled public_training_code=True, # https://github.com/MinishLab/model2vec public_training_data=False, ) @@ -175,7 +175,7 @@ def 
encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - training_datasets=bge_training_data, # distilled + training_datasets=bge_training_data, # distilled public_training_code=True, # https://github.com/MinishLab/model2vec public_training_data=False, ) @@ -200,7 +200,7 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - training_datasets=bge_training_data, # distilled + training_datasets=bge_training_data, # distilled public_training_code=True, # https://github.com/MinishLab/model2vec public_training_data=False, ) @@ -225,7 +225,7 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - training_datasets=bge_training_data, # distilled + training_datasets=bge_training_data, # distilled public_training_code=True, # https://github.com/MinishLab/model2vec public_training_data=False, ) From b2b9ccaa9889593de67c833f019c2bdddf0f5cdf Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Mon, 13 Jan 2025 15:03:53 +0100 Subject: [PATCH 16/21] Updated annotations and moved jina models --- mteb/models/jina_models.py | 89 ++++++++++++++++ mteb/models/model2vec_models.py | 2 +- mteb/models/sentence_transformers_models.py | 111 ++++---------------- 3 files changed, 109 insertions(+), 93 deletions(-) diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py index f1a05d210d..728ffaa98f 100644 --- a/mteb/models/jina_models.py +++ b/mteb/models/jina_models.py @@ -226,3 +226,92 @@ def encode( public_training_code=False, public_training_data=False, ) + + +jina_embeddings_v2_base_en = ModelMeta( + name="jinaai/jina-embeddings-v2-base-en", + languages=["eng-Latn"], + open_weights=True, + revision="6e85f575bc273f1fd840a658067d0157933c83f0", + release_date="2023-09-27", + n_parameters=137_000_000, + memory_usage=None, + embed_dim=768, + license="apache-2.0", + max_tokens=8192, + reference="https://huggingface.co/jinaai/jina-embeddings-v2-base-en", + 
similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. CC +) + +jina_embeddings_v2_small_en = ModelMeta( + name="jinaai/jina-embeddings-v2-small-en", + languages=["eng-Latn"], + open_weights=True, + revision="796cff318cdd4e5fbe8b7303a1ef8cbec36996ef", + release_date="2023-09-27", + n_parameters=32_700_000, + memory_usage=None, + embed_dim=512, + license="apache-2.0", + max_tokens=8192, + reference="https://huggingface.co/jinaai/jina-embeddings-v2-small-en", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. CC and {"jinaai/negation-dataset": ["train"]} +) + +jina_embedding_b_en_v1 = ModelMeta( + name="jinaai/jina-embedding-b-en-v1", + languages=["eng-Latn"], + open_weights=True, + revision="aa0645035294a8c0607ce5bb700aba982cdff32c", + release_date="2023-07-07", + n_parameters=110_000_000, + memory_usage=None, + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/jinaai/jina-embedding-b-en-v1", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by="jinaai/jina-embeddings-v2-base-en", + adapted_from=None, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. 
CC and {"jinaai/negation-dataset": ["train"]} +) + +jina_embedding_s_en_v1 = ModelMeta( + name="jinaai/jina-embedding-s-en-v1", + languages=["eng-Latn"], + open_weights=True, + revision="c1fed70aa4823a640f1a7150a276e4d3b08dce08", + release_date="2023-07-07", + n_parameters=35_000_000, + memory_usage=None, + embed_dim=512, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/jinaai/jina-embedding-s-en-v1", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by="jinaai/jina-embeddings-v2-small-en", + adapted_from=None, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. CC and {"jinaai/negation-dataset": ["train"]} +) diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index 8430b8aa58..1a58bbf8e3 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -150,7 +150,7 @@ def encode( use_instructions=False, adapted_from="sentence-transformers/LaBSE", superseded_by=None, - training_datasets=bge_training_data, # distilled + training_datasets=None, public_training_code=True, # https://github.com/MinishLab/model2vec public_training_data=False, ) diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index b4cd80f9c5..f8b01c6eaf 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -112,6 +112,8 @@ superseded_by=None, adapted_from=None, training_datasets=sent_trf_training_dataset, + public_training_code=True, + public_training_data=True, ) all_MiniLM_L12_v2 = ModelMeta( @@ -132,7 +134,7 @@ superseded_by=None, adapted_from=None, training_datasets=sent_trf_training_dataset, - public_training_code=False, + public_training_code=True, public_training_data=True, ) @@ -154,7 +156,7 @@ superseded_by=None, adapted_from=None, training_datasets=sent_trf_training_dataset, # assumed 
(probably some parallel as well) - public_training_code=False, + public_training_code=True, public_training_data=True, ) @@ -175,8 +177,19 @@ use_instructions=False, superseded_by=None, adapted_from=None, - training_datasets=sent_trf_training_dataset, # assumed (probably some parallel as well) - public_training_code=False, + training_datasets=sent_trf_training_dataset, + # + https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/paraphrases/training.py + # which include (not in MTEB): + # "all-nli": all_nli_train_dataset, + # "sentence-compression": sentence_compression_train_dataset, + # "simple-wiki": simple_wiki_train_dataset, + # "altlex": altlex_train_dataset, + # "quora-duplicates": quora_train_dataset, + # "coco-captions": coco_train_dataset, + # "flickr30k-captions": flickr_train_dataset, + # "yahoo-answers": yahoo_answers_train_dataset, + # "stack-exchange": stack_exchange_train_dataset, + public_training_code=True, public_training_data=True, ) @@ -242,94 +255,8 @@ superseded_by=None, adapted_from=None, training_datasets=sent_trf_training_dataset, -) - -jina_embeddings_v2_base_en = ModelMeta( - name="jinaai/jina-embeddings-v2-base-en", - languages=["eng-Latn"], - open_weights=True, - revision="6e85f575bc273f1fd840a658067d0157933c83f0", - release_date="2023-09-27", - n_parameters=137_000_000, - memory_usage=None, - embed_dim=768, - license="apache-2.0", - max_tokens=8192, - reference="https://huggingface.co/jinaai/jina-embeddings-v2-base-en", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from=None, - training_datasets=None, - public_training_code=False, - public_training_data=False, # uses scrapes e.g. 
CC -) - -jina_embeddings_v2_small_en = ModelMeta( - name="jinaai/jina-embeddings-v2-small-en", - languages=["eng-Latn"], - open_weights=True, - revision="796cff318cdd4e5fbe8b7303a1ef8cbec36996ef", - release_date="2023-09-27", - n_parameters=32_700_000, - memory_usage=None, - embed_dim=512, - license="apache-2.0", - max_tokens=8192, - reference="https://huggingface.co/jinaai/jina-embeddings-v2-small-en", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from=None, - training_datasets=None, - public_training_code=False, - public_training_data=False, # uses scrapes e.g. CC and {"jinaai/negation-dataset": ["train"]} -) - -jina_embedding_b_en_v1 = ModelMeta( - name="jinaai/jina-embedding-b-en-v1", - languages=["eng-Latn"], - open_weights=True, - revision="aa0645035294a8c0607ce5bb700aba982cdff32c", - release_date="2023-07-07", - n_parameters=110_000_000, - memory_usage=None, - embed_dim=768, - license="apache-2.0", - max_tokens=512, - reference="https://huggingface.co/jinaai/jina-embedding-b-en-v1", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by="jinaai/jina-embeddings-v2-base-en", - adapted_from=None, - training_datasets=None, - public_training_code=False, - public_training_data=False, # uses scrapes e.g. 
CC and {"jinaai/negation-dataset": ["train"]} -) - -jina_embedding_s_en_v1 = ModelMeta( - name="jinaai/jina-embedding-s-en-v1", - languages=["eng-Latn"], - open_weights=True, - revision="c1fed70aa4823a640f1a7150a276e4d3b08dce08", - release_date="2023-07-07", - n_parameters=35_000_000, - memory_usage=None, - embed_dim=512, - license="apache-2.0", - max_tokens=512, - reference="https://huggingface.co/jinaai/jina-embedding-s-en-v1", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by="jinaai/jina-embeddings-v2-small-en", - adapted_from=None, - training_datasets=None, - public_training_code=False, - public_training_data=False, # uses scrapes e.g. CC and {"jinaai/negation-dataset": ["train"]} + public_training_code=True, + public_training_data=True, ) From cfdbecbf778fa7312fdae6591be043fdd0d30fa7 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Mon, 13 Jan 2025 18:44:28 +0100 Subject: [PATCH 17/21] fix: Add gritlm --- mteb/models/gritlm_models.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mteb/models/gritlm_models.py b/mteb/models/gritlm_models.py index 91acafa26e..a4f5befd19 100644 --- a/mteb/models/gritlm_models.py +++ b/mteb/models/gritlm_models.py @@ -5,6 +5,7 @@ from mteb.model_meta import ModelMeta +from .e5_models import E5_TRAINING_DATA from .instruct_wrapper import instruct_wrapper logger = logging.getLogger(__name__) @@ -29,7 +30,6 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: open_weights=True, revision="13f00a0e36500c80ce12870ea513846a066004af", release_date="2024-02-15", - training_datasets={"GritLM/tulu2": ["train"]}, n_parameters=7_240_000_000, memory_usage=None, embed_dim=4096, @@ -39,6 +39,10 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: similarity_fn_name="cosine", framework=["GritLM", "PyTorch"], use_instructions=True, + training_datasets=E5_TRAINING_DATA, # source 
https://arxiv.org/pdf/2402.09906 + # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data + public_training_code=True, # https://github.com/ContextualAI/gritlm + public_training_data=False, ) gritlm8x7b = ModelMeta( loader=partial( # type: ignore @@ -50,7 +54,6 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: ), name="GritLM/GritLM-8x7B", languages=["eng_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "spa_Latn"], - training_datasets={"GritLM/tulu2": ["train"]}, open_weights=True, revision="7f089b13e3345510281733ca1e6ff871b5b4bc76", release_date="2024-02-15", @@ -63,4 +66,8 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: similarity_fn_name="cosine", framework=["GritLM", "PyTorch"], use_instructions=True, + training_datasets=E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 + # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data + public_training_code=True, # https://github.com/ContextualAI/gritlm + public_training_data=False, ) From 25150f9387b49862330ac3e6fd154bbfde51c60d Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Mon, 13 Jan 2025 19:13:28 +0100 Subject: [PATCH 18/21] fix: Added more annotations! 
--- mteb/models/bge_models.py | 24 ++++++++++ mteb/models/misc_models.py | 79 +++++++++++++++++++------------ mteb/models/ru_sentence_models.py | 43 ++++++++++------- 3 files changed, 101 insertions(+), 45 deletions(-) diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index 5a395f014a..dc3679a8da 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -6,6 +6,30 @@ model_prompts = {"query": "Represent this sentence for searching relevant passages: "} +bge_m_training_data = { + # source: https://arxiv.org/pdf/2402.03216 + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + "LeCaRDv2": ["train"], + "CMedQAv1-reranking": ["train"], + "CMedQAv2-reranking": ["train"], + "MrTidyRetrieval": ["train"], + "T2Reranking": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + # + synthetic data +} + bge_training_data = { # source: https://data.baai.ac.cn/details/BAAI-MTP "NQ": ["test"], diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index d5734b448c..b2a661fe5a 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -5,6 +5,10 @@ import torch from mteb.model_meta import ModelMeta, sentence_transformers_loader +from mteb.models.e5_models import E5_TRAINING_DATA + +from .bge_models import bge_m_training_data, bge_training_data +from .sentence_transformers_models import sent_trf_training_dataset Haon_Chen__speed_embedding_7b_instruct = ModelMeta( name="Haon-Chen/speed-embedding-7b-instruct", @@ -198,7 +202,8 @@ reference="https://huggingface.co/BeastyZ/e5-R-mistral-7b", 
similarity_fn_name="cosine", use_instructions=None, - training_datasets={"BeastyZ/E5-R": ["train"]}, + training_datasets=E5_TRAINING_DATA, + # not MTEB: {"BeastyZ/E5-R": ["train"]}, adapted_from="/ConRetriever/public_weight_mistral", superseded_by=None, ) @@ -295,13 +300,14 @@ embed_dim=384, license="mit", open_weights=True, - public_training_data=True, + public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Bulbasaur", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=None, # source model is GTE-tiny where training data is unknown + # {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/dwsdwass", superseded_by=None, ) @@ -317,13 +323,14 @@ embed_dim=384, license="mit", open_weights=True, - public_training_data=True, + public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Ivysaur", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=None, # source model is GTE-tiny where training data is unknown + # not MTEB: {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/jhjghjgh", superseded_by=None, ) @@ -345,7 +352,8 @@ reference="https://huggingface.co/Mihaiii/Squirtle", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=bge_training_data, # source model is bge-base-en-v1.5 + # not MTEB: {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/test21", superseded_by=None, ) @@ -367,7 +375,8 @@ reference="https://huggingface.co/Mihaiii/Venusaur", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=None, # source model is unknown + # {"Mihaiii/qa-assistant": ["train"]}, 
adapted_from="Mihaiii/test14", superseded_by=None, ) @@ -389,7 +398,8 @@ reference="https://huggingface.co/Mihaiii/Wartortle", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=bge_training_data, # distill from bge-base-en-v1.5 + # {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/test22", superseded_by=None, ) @@ -477,7 +487,7 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets={}, # not in MTEB: {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="aubmindlab/bert-base-arabertv02", superseded_by=None, ) @@ -499,7 +509,9 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=sent_trf_training_dataset, + # not in MTEB + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", superseded_by=None, ) @@ -521,7 +533,9 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=sent_trf_training_dataset, # derived from + # not in MTEB: + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2", superseded_by=None, ) @@ -543,7 +557,9 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-labse-Matryoshka", similarity_fn_name="cosine", 
use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=None, # derived from labSE + # as well as: + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="sentence-transformers/LaBSE", superseded_by=None, ) @@ -565,7 +581,9 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=sent_trf_training_dataset, + # not in MTEB: + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="tomaarsen/mpnet-base-all-nli-triplet", superseded_by=None, ) @@ -587,7 +605,7 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets={}, # not in MTEB: "Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="UBC-NLP/MARBERTv2", superseded_by=None, ) @@ -719,7 +737,8 @@ reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.4", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"manu/embedding_data_v2_100k": ["train"]}, + training_datasets=None, + # Not in MTEB: {"manu/embedding_data_v2_100k": ["train"]}, adapted_from="croissantllm/CroissantCool-v0.2", superseded_by=None, ) @@ -1365,7 +1384,8 @@ reference="https://huggingface.co/aari1995/German_Semantic_STS_V2", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"stsb_multi_mt": ["train"]}, + training_datasets=None, # couldn't figure out the source model + # {"stsb_multi_mt": ["train"]}, adapted_from="/content/drive/MyDrive/Stanford_NLU/Project/false_friends/gbert_large_sts_only", superseded_by=None, ) @@ -1481,18 +1501,18 @@ 
reference="https://huggingface.co/deepvk/USER-bge-m3", similarity_fn_name="cosine", use_instructions=None, - training_datasets={ - "deepvk/ru-HNP": ["train"], - "deepvk/ru-WANLI": ["train"], - "Shitao/bge-m3-data": ["train"], - "RussianNLP/russian_super_glue": ["train"], - "reciTAL/mlsum": ["train"], - "Milana/russian_keywords": ["train"], - "IlyaGusev/gazeta": ["train"], - "d0rj/gsm8k-ru": ["train"], - "bragovo/dsum_ru": ["train"], - "CarlBrendt/Summ_Dialog_News": ["train"], - }, + training_datasets=bge_m_training_data, # derived from. + # not in MTEB: + # "deepvk/ru-HNP": ["train"], + # "deepvk/ru-WANLI": ["train"], + # "Shitao/bge-m3-data": ["train"], + # "RussianNLP/russian_super_glue": ["train"], + # "reciTAL/mlsum": ["train"], + # "Milana/russian_keywords": ["train"], + # "IlyaGusev/gazeta": ["train"], + # "d0rj/gsm8k-ru": ["train"], + # "bragovo/dsum_ru": ["train"], + # "CarlBrendt/Summ_Dialog_News": ["train"], adapted_from="USER-bge-m3", superseded_by=None, ) @@ -1622,7 +1642,8 @@ reference="https://huggingface.co/shibing624/text2vec-base-multilingual", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"shibing624/nli-zh-all": ["train"]}, + training_datasets=sent_trf_training_dataset, + # not MTEB: {"shibing624/nli-zh-all": ["train"]}, adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", superseded_by=None, ) diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index a520bdca11..6bca544b11 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -6,6 +6,8 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader +from .bge_models import bge_training_data + rubert_tiny2 = ModelMeta( name="cointegrated/rubert-tiny2", languages=["rus_Cyrl"], @@ -96,20 +98,27 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, training_datasets={ - "deepvk/ru-HNP": ["train"], - "deepvk/ru-WANLI": ["train"], - "Shitao/bge-m3-data": 
["train"], - "RussianNLP/russian_super_glue": ["train"], - "reciTAL/mlsum": ["train"], - "Helsinki-NLP/opus-100": ["train"], - "Helsinki-NLP/bible_para": ["train"], - "d0rj/rudetoxifier_data_detox": ["train"], - "s-nlp/ru_paradetox": ["train"], - "Milana/russian_keywords": ["train"], - "IlyaGusev/gazeta": ["train"], - "d0rj/gsm8k-ru": ["train"], - "bragovo/dsum_ru": ["train"], - "CarlBrendt/Summ_Dialog_News": ["train"], + "BibleNLPBitextMining": ["train"], + "MLSUMClusteringP2P": ["train"], + "MLSUMClusteringP2P.v2": ["train"], + "MLSUMClusteringS2S": ["train"], + "MLSUMClusteringS2S.v2": ["train"], + **bge_training_data, + # not MTEB: + # "deepvk/ru-HNP": ["train"], + # "deepvk/ru-WANLI": ["train"], + # "Shitao/bge-m3-data": ["train"], + # "RussianNLP/russian_super_glue": ["train"], + # "reciTAL/mlsum": ["train"], + # "Helsinki-NLP/opus-100": ["train"], + # "Helsinki-NLP/bible_para": ["train"], + # "d0rj/rudetoxifier_data_detox": ["train"], + # "s-nlp/ru_paradetox": ["train"], + # "Milana/russian_keywords": ["train"], + # "IlyaGusev/gazeta": ["train"], + # "d0rj/gsm8k-ru": ["train"], + # "bragovo/dsum_ru": ["train"], + # "CarlBrendt/Summ_Dialog_News": ["train"], }, ) @@ -213,7 +222,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - training_datasets={"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, + training_datasets=None, # source model is unknown + # Not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, ) labse_ru_turbo = ModelMeta( @@ -231,7 +241,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - training_datasets={"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, + training_datasets=None, # source model is unknown + # not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, ) From 7fd97f7a75985bc18612d8934a15ab8d1e2c1203 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Tue, 14 Jan 2025 14:57:45 +0100 Subject: [PATCH 19/21] Added BGE Chinese and multilingual-gemma models --- mteb/models/bge_models.py | 141 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index dc3679a8da..1744fb11f9 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -5,6 +5,7 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader model_prompts = {"query": "Represent this sentence for searching relevant passages: "} +model_prompts_zh = {"query": "为这个句子生成表示以用于检索相关文章:"} bge_m_training_data = { # source: https://arxiv.org/pdf/2402.03216 @@ -60,6 +61,34 @@ # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) } +bge_chinese_training_data = { + # source: https://arxiv.org/pdf/2309.07597 + "T2Retrieval": ["train"], + "DuReader": ["train"], + "MMarcoReranking": ["train"], + "CMedQAv2-reranking": ["train"], + "Cmnli": ["train"], + "Ocnli": ["train"], + # not in mteb + # - multi-cpr + # - NLI-zh + # Dataset Pairs + # wudao (title, passage) + # cmrc2018 (query, context) + # dureader (query, context) + # simclue (sentence_a, sentence_b) + # csl (title, abstract) + # amazon_reviews_multi (title, body) + # wiki_atomic_edits (base_sentence, edited_sentence) + # mlqa (question, context) + # xlsum (title, summary) (title, text) + # "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further + # "wikipedia": [], # title + section title, passage + # "reddit": [], # title, body + # "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer) + # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) +} + bge_small_en_v1_5 = ModelMeta( loader=partial( # type: ignore sentence_transformers_loader, @@ -137,3 +166,115 @@ public_training_code=None, # seemingly 
released (at least for some models, but the link is broken training_datasets=bge_training_data, ) + +bge_small_zh_v1_5 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="BAAI/bge-small-zh-v1.5", + revision="7999e1d3359715c523056ef9478215996d62a620", + model_prompts=model_prompts_zh, + ), + name="BAAI/bge-small-zh-v1.5", + languages=["zho_Hans"], + open_weights=True, + revision="7999e1d3359715c523056ef9478215996d62a620", + release_date="2023-09-12", # initial commit of hf model. + n_parameters=24_000_000, + memory_usage=None, + embed_dim=512, + license="mit", + max_tokens=512, + reference="https://huggingface.co/BAAI/bge-small-zh-v1.5", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP + public_training_code=None, # seemingly released (at least for some models, but the link is broken + training_datasets=bge_chinese_training_data, +) + +bge_base_zh_v1_5 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="BAAI/bge-base-zh-v1.5", + revision="f03589ceff5aac7111bd60cfc7d497ca17ecac65", + model_prompts=model_prompts_zh, + ), + name="BAAI/bge-base-zh-v1.5", + languages=["zho_Hans"], + open_weights=True, + revision="f03589ceff5aac7111bd60cfc7d497ca17ecac65", + release_date="2023-09-11", # initial commit of hf model. 
+ n_parameters=438_000_000, + memory_usage=None, + embed_dim=768, + license="mit", + max_tokens=512, + reference="https://huggingface.co/BAAI/bge-base-zh-v1.5", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP + public_training_code=None, # seemingly released (at least for some models, but the link is broken + training_datasets=bge_chinese_training_data, +) + +bge_large_zh_v1_5 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="BAAI/bge-large-zh-v1.5", + revision="79e7739b6ab944e86d6171e44d24c997fc1e0116", + model_prompts=model_prompts_zh, + ), + name="BAAI/bge-large-zh-v1.5", + languages=["zho_Hans"], + open_weights=True, + revision="79e7739b6ab944e86d6171e44d24c997fc1e0116", + release_date="2023-09-12", # initial commit of hf model. + n_parameters=1_340_000_000, + memory_usage=None, + embed_dim=1024, + license="mit", + max_tokens=512, + reference="https://huggingface.co/BAAI/bge-large-zh-v1.5", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP + public_training_code=None, # seemingly released (at least for some models, but the link is broken + training_datasets=bge_chinese_training_data, +) + +bge_multilingual_gemma2 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="BAAI/bge-multilingual-gemma2", + revision="992e13d8984fde2c31ef8a3cb2c038aeec513b8a", + ), + name="BAAI/bge-multilingual-gemma2", + languages=[ + "eng_Latn", + "zho_Hans", + "kor_Hang", + "kor_Latn", + "fra_Latn", + "jpn_Jpan", + "jpn_Latn", + ], # This list is incomplete. Their description says "and more". + # I'm also unsure about the scripts. 
+ open_weights=True, + revision="992e13d8984fde2c31ef8a3cb2c038aeec513b8a", + release_date="2024-07-25", # initial commit of hf model. + n_parameters=9.24 * 1e9, + memory_usage=None, + embed_dim=3584, # from old C-MTEB leaderboard + license="gemma", + max_tokens=8192, # from old C-MTEB leaderboard + reference="https://huggingface.co/BAAI/bge-multilingual-gemma2", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_data=False, + public_training_code=False, + training_datasets=None, # not disclosed +) From 8586220c176901e7d87e79b630459fd51a8e80e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Tue, 14 Jan 2025 16:02:10 +0100 Subject: [PATCH 20/21] Added GTE multilingual and Chinese models --- mteb/models/gte_models.py | 195 +++++++++++++++++++++++++++++++++++++- 1 file changed, 194 insertions(+), 1 deletion(-) diff --git a/mteb/models/gte_models.py b/mteb/models/gte_models.py index 648fc18850..d735af6a28 100644 --- a/mteb/models/gte_models.py +++ b/mteb/models/gte_models.py @@ -5,7 +5,7 @@ import torch from mteb.encoder_interface import PromptType -from mteb.model_meta import ModelMeta +from mteb.model_meta import ModelMeta, sentence_transformers_loader from mteb.models.instruct_wrapper import instruct_wrapper @@ -105,3 +105,196 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, ) + +gte_small_zh = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="thenlper/gte-small-zh", + revision="af7bd46fbb00b3a6963c8dd7f1786ddfbfbe973a", + ), + name="thenlper/gte-small-zh", + languages=["zho_Hans"], + open_weights=True, + revision="af7bd46fbb00b3a6963c8dd7f1786ddfbfbe973a", + release_date="2023-11-8", # initial commit of hf model. 
+ n_parameters=30.3 * 1e6, + memory_usage=None, + embed_dim=1024, + license="mit", + max_tokens=512, + reference="https://huggingface.co/thenlper/gte-small-zh", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_data=False, + public_training_code=None, + training_datasets=None, # Not disclosed +) + +gte_base_zh = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="thenlper/gte-base-zh", + revision="71ab7947d6fac5b64aa299e6e40e6c2b2e85976c", + ), + name="thenlper/gte-base-zh", + languages=["zho_Hans"], + open_weights=True, + revision="71ab7947d6fac5b64aa299e6e40e6c2b2e85976c", + release_date="2023-11-8", # initial commit of hf model. + n_parameters=102 * 1e6, + memory_usage=None, + embed_dim=1024, + license="mit", + max_tokens=512, + reference="https://huggingface.co/thenlper/gte-base-zh", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_data=False, + public_training_code=None, + training_datasets=None, # Not disclosed +) + +gte_large_zh = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="thenlper/gte-large-zh", + revision="64c364e579de308104a9b2c170ca009502f4f545", + ), + name="thenlper/gte-large-zh", + languages=["zho_Hans"], + open_weights=True, + revision="64c364e579de308104a9b2c170ca009502f4f545", + release_date="2023-11-8", # initial commit of hf model. 
+ n_parameters=326 * 1e6, + memory_usage=None, + embed_dim=1024, + license="mit", + max_tokens=512, + reference="https://huggingface.co/thenlper/gte-large-zh", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_data=False, + public_training_code=None, + training_datasets=None, # Not disclosed +) + +gte_multilingual_langs = [ + "afr_Latn", + "ara_Arab", + "aze_Latn", + "bel_Cyrl", + "bul_Cyrl", + "ben_Beng", + "cat_Latn", + "ceb_Latn", + "ces_Latn", + "cym_Latn", + "dan_Latn", + "deu_Latn", + "ell_Grek", + "eng_Latn", + "spa_Latn", + "est_Latn", + "eus_Latn", + "fas_Arab", + "fin_Latn", + "fra_Latn", + "glg_Latn", + "guj_Gujr", + "heb_Hebr", + "hin_Deva", + "hrv_Latn", + "hat_Latn", + "hun_Latn", + "hye_Armn", + "ind_Latn", + "isl_Latn", + "ita_Latn", + "jpn_Jpan", + "jav_Latn", + "kat_Geor", + "kaz_Cyrl", + "khm_Khmr", + "kan_Knda", + "kor_Hang", + "kir_Cyrl", + "lao_Laoo", + "lit_Latn", + "lav_Latn", + "mkd_Cyrl", + "mal_Mlym", + "mon_Cyrl", + "mar_Deva", + "msa_Latn", + "mya_Mymr", + "nep_Deva", + "nld_Latn", + "nor_Latn", + "pan_Guru", + "pol_Latn", + "por_Latn", + "que_Latn", + "ron_Latn", + "rus_Cyrl", + "sin_Sinh", + "slk_Latn", + "slv_Latn", + "swa_Latn", + "tam_Taml", + "tel_Telu", + "tha_Thai", + "tgl_Latn", + "tur_Latn", + "ukr_Cyrl", + "urd_Arab", + "vie_Latn", + "yor_Latn", + "zho_Hans", +] +# Source: https://arxiv.org/pdf/2407.19669 +gte_multi_training_data = { + "T2Retrieval": ["train"], + "DuReader": ["train"], + "MMarcoReranking": ["train"], + "CMedQAv2-reranking": ["train"], + "NQ": ["train"], + "MSMARCO": ["train"], + "HotpotQA": ["train"], + "FEVER": ["train"], + "MIRACLReranking": ["train"], + "MrTidyRetrieval": ["train"], + "MultiLongDocRetrieval": ["train"], + # not in MTEB: + # - TriviaQA + # - SQuAD + # - AllNLI + # - Multi-CPR +} + +gte_multilingual_base = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + 
model_name="Alibaba-NLP/gte-multilingual-base", + revision="ca1791e0bcc104f6db161f27de1340241b13c5a4", + ), + name="Alibaba-NLP/gte-multilingual-base", + languages=gte_multilingual_langs, + open_weights=True, + revision="ca1791e0bcc104f6db161f27de1340241b13c5a4", + release_date="2024-07-20", # initial commit of hf model. + n_parameters=305 * 1e6, + memory_usage=None, + embed_dim=1024, + license="apache-2", + max_tokens=8192, + reference="https://huggingface.co/Alibaba-NLP/gte-multilingual-base", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_data=True, + public_training_code=None, # couldn't find + training_datasets=gte_multi_training_data, +) From ddd6cda5a16fc93cbd141fbc94b0f67f528ed00d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Tue, 14 Jan 2025 16:09:49 +0100 Subject: [PATCH 21/21] Fixed date format --- mteb/models/gte_models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mteb/models/gte_models.py b/mteb/models/gte_models.py index d735af6a28..f800aaa941 100644 --- a/mteb/models/gte_models.py +++ b/mteb/models/gte_models.py @@ -116,7 +116,7 @@ def instruction_template( languages=["zho_Hans"], open_weights=True, revision="af7bd46fbb00b3a6963c8dd7f1786ddfbfbe973a", - release_date="2023-11-8", # initial commit of hf model. + release_date="2023-11-08", # initial commit of hf model. n_parameters=30.3 * 1e6, memory_usage=None, embed_dim=1024, @@ -141,7 +141,7 @@ def instruction_template( languages=["zho_Hans"], open_weights=True, revision="71ab7947d6fac5b64aa299e6e40e6c2b2e85976c", - release_date="2023-11-8", # initial commit of hf model. + release_date="2023-11-08", # initial commit of hf model. 
n_parameters=102 * 1e6, memory_usage=None, embed_dim=1024, @@ -166,7 +166,7 @@ def instruction_template( languages=["zho_Hans"], open_weights=True, revision="64c364e579de308104a9b2c170ca009502f4f545", - release_date="2023-11-8", # initial commit of hf model. + release_date="2023-11-08", # initial commit of hf model. n_parameters=326 * 1e6, memory_usage=None, embed_dim=1024,