diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index 56efff84d9..05547d6a04 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -298,6 +298,60 @@ "zho_Hans", # zh ] +bge_m_training_data = { + # source: https://arxiv.org/pdf/2402.03216 + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + "LeCaRDv2": ["train"], + "CMedQAv1-reranking": ["train"], + "CMedQAv2-reranking": ["train"], + "MrTidyRetrieval": ["train"], + "T2Reranking": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + # + synthetic data +} + +bge_training_data = { + # source: https://data.baai.ac.cn/details/BAAI-MTP + "NQ": ["test"], + "NQHardNegatives": ["test"], + "AmazonReviewsClassification": [ + "validation", + "test", + ], # assumed from: amazon_reviews_multi + "MLQARetrieval": [ + "validation", + "test", + ], # assumed from mlqa (question, context) + # not in mteb + # Dataset Pairs + # wudao (title, passage) + # cmrc2018 (query, context) + # dureader (query, context) + # simclue (sentence_a, sentence_b) + # csl (title, abstract) + # amazon_reviews_multi (title, body) + # wiki_atomic_edits (base_sentence, edited_sentence) + # mlqa (question, context) + # xlsum (title, summary) (title, text) + # "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further + # "wikipedia": [], # title + section title, passage + # "reddit": [], # title, body + # "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer) + # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) +} + bge_small_en_v1_5 = ModelMeta( loader=partial( # type: ignore sentence_transformers_loader, @@ -321,35 +375,7 @@ use_instructions=True, public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP public_training_code=None, # seemingly released (at least for some models, but the link is broken - training_datasets={ - # source: https://data.baai.ac.cn/details/BAAI-MTP - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], # assumed from: amazon_reviews_multi - "MLQARetrieval": [ - "validation", - "test", - ], # assumed from mlqa (question, context) - # not in mteb - # Dataset Pairs - # wudao (title, passage) - # cmrc2018 (query, context) - # dureader (query, context) - # simclue (sentence_a, sentence_b) - # csl (title, abstract) - # amazon_reviews_multi (title, body) - # wiki_atomic_edits (base_sentence, edited_sentence) - # mlqa (question, context) - # xlsum (title, summary) (title, text) - # "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further - # "wikipedia": [], # title + section title, passage - # "reddit": [], # title, body - # "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer) - # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) - }, + training_datasets=bge_training_data, ) bge_base_en_v1_5 = ModelMeta( @@ -375,35 +401,7 @@ use_instructions=True, 
public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP public_training_code=None, # seemingly released (at least for some models, but the link is broken - training_datasets={ - # source: https://data.baai.ac.cn/details/BAAI-MTP - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], # assumed from: amazon_reviews_multi - "MLQARetrieval": [ - "validation", - "test", - ], # assumed from mlqa (question, context) - # not in mteb - # Dataset Pairs - # wudao (title, passage) - # cmrc2018 (query, context) - # dureader (query, context) - # simclue (sentence_a, sentence_b) - # csl (title, abstract) - # amazon_reviews_multi (title, body) - # wiki_atomic_edits (base_sentence, edited_sentence) - # mlqa (question, context) - # xlsum (title, summary) (title, text) - # "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further - # "wikipedia": [], # title + section title, passage - # "reddit": [], # title, body - # "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer) - # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) - }, + training_datasets=bge_training_data, ) bge_large_en_v1_5 = ModelMeta( @@ -429,35 +427,7 @@ use_instructions=True, public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP public_training_code=None, # seemingly released (at least for some models, but the link is broken - training_datasets={ - # source: https://data.baai.ac.cn/details/BAAI-MTP - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], # assumed from: amazon_reviews_multi - "MLQARetrieval": [ - "validation", - "test", - ], # assumed from mlqa (question, context) - # not in mteb - # Dataset Pairs - # wudao (title, passage) - # cmrc2018 (query, context) - # dureader (query, context) - # simclue (sentence_a, sentence_b) - # csl (title, abstract) - # amazon_reviews_multi (title, body) - # wiki_atomic_edits (base_sentence, edited_sentence) - # mlqa (question, context) - # xlsum (title, summary) (title, text) - # "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further - # "wikipedia": [], # title + section title, passage - # "reddit": [], # title, body - # "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer) - # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) - }, + training_datasets=bge_training_data, ) bge_small_zh_v1_5 = ModelMeta( diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py index f26d78ed6d..182a6ea4b2 100644 --- a/mteb/models/e5_instruct.py +++ b/mteb/models/e5_instruct.py @@ -6,7 +6,7 @@ from mteb.model_meta import ModelMeta -from .e5_models import E5_PAPER_RELEASE_DATE, XLMR_LANGUAGES +from .e5_models import E5_PAPER_RELEASE_DATE, E5_TRAINING_DATA, XLMR_LANGUAGES from .instruct_wrapper import instruct_wrapper MISTRAL_LANGUAGES = ["eng_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "spa_Latn"] @@ -40,6 +40,9 @@ embed_dim=1024, license="mit", max_tokens=514, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_mistral = ModelMeta( @@ -69,4 +72,7 @@ embed_dim=4096, license="mit", max_tokens=32768, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) diff --git 
a/mteb/models/e5_models.py b/mteb/models/e5_models.py index 97b117002b..9537824e59 100644 --- a/mteb/models/e5_models.py +++ b/mteb/models/e5_models.py @@ -113,6 +113,19 @@ PromptType.passage.value: "passage: ", } +E5_TRAINING_DATA = { + # from 4.2 in https://arxiv.org/pdf/2212.03533 + # also pre-training data from a variety of sources (stackexchange, semantic scholar, reddit, CC, ...) + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on +} + e5_mult_small = ModelMeta( loader=partial( # type: ignore sentence_transformers_loader, @@ -134,26 +147,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2212.03533 - # table 1: - # Wikipedia 150M - # mC4 160M - # Multilingual CC News 160M - # NLLB 160M - # Reddit 160M - # S2ORC 50M - # Stackexchange 50M - # xP3 80M - # Misc. SBERT Data 10M - # ---- - # from Misc. SBERT Data 10M: - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? - }, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_mult_base = ModelMeta( @@ -176,26 +172,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2402.05672 - # table 1: - # Wikipedia 150M - # mC4 160M - # Multilingual CC News 160M - # NLLB 160M - # Reddit 160M - # S2ORC 50M - # Stackexchange 50M - # xP3 80M - # Misc. SBERT Data 10M - # ---- - # from Misc. SBERT Data 10M: - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? - }, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_mult_large = ModelMeta( @@ -219,26 +198,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2402.05672 - # table 1: - # Wikipedia 150M - # mC4 160M - # Multilingual CC News 160M - # NLLB 160M - # Reddit 160M - # S2ORC 50M - # Stackexchange 50M - # xP3 80M - # Misc. SBERT Data 10M - # ---- - # from Misc. SBERT Data 10M: - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? - }, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_eng_small_v2 = ModelMeta( @@ -261,14 +223,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2212.03533 - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? 
- }, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_eng_small = ModelMeta( @@ -292,14 +249,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2212.03533 - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? - }, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_eng_base_v2 = ModelMeta( @@ -325,14 +277,9 @@ use_instructions=True, superseded_by=None, adapted_from=None, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2212.03533 - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? - }, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_eng_large_v2 = ModelMeta( @@ -358,14 +305,9 @@ use_instructions=True, superseded_by=None, adapted_from=None, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2212.03533 - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? - }, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_large = ModelMeta( @@ -391,14 +333,9 @@ use_instructions=True, superseded_by="intfloat/e5-large-v2", adapted_from=None, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2212.03533 - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? - }, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_base = ModelMeta( @@ -424,12 +361,7 @@ use_instructions=True, superseded_by="intfloat/e5-base-v2", adapted_from=None, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2212.03533 - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? 
- }, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) diff --git a/mteb/models/gritlm_models.py b/mteb/models/gritlm_models.py index 91acafa26e..a4f5befd19 100644 --- a/mteb/models/gritlm_models.py +++ b/mteb/models/gritlm_models.py @@ -5,6 +5,7 @@ from mteb.model_meta import ModelMeta +from .e5_models import E5_TRAINING_DATA from .instruct_wrapper import instruct_wrapper logger = logging.getLogger(__name__) @@ -29,7 +30,6 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: open_weights=True, revision="13f00a0e36500c80ce12870ea513846a066004af", release_date="2024-02-15", - training_datasets={"GritLM/tulu2": ["train"]}, n_parameters=7_240_000_000, memory_usage=None, embed_dim=4096, @@ -39,6 +39,10 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: similarity_fn_name="cosine", framework=["GritLM", "PyTorch"], use_instructions=True, + training_datasets=E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 + # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data + public_training_code=True, # https://github.com/ContextualAI/gritlm + public_training_data=False, ) gritlm8x7b = ModelMeta( loader=partial( # type: ignore @@ -50,7 +54,6 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: ), name="GritLM/GritLM-8x7B", languages=["eng_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "spa_Latn"], - training_datasets={"GritLM/tulu2": ["train"]}, open_weights=True, revision="7f089b13e3345510281733ca1e6ff871b5b4bc76", release_date="2024-02-15", @@ -63,4 +66,8 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: similarity_fn_name="cosine", framework=["GritLM", "PyTorch"], use_instructions=True, + training_datasets=E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 + # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data + public_training_code=True, # https://github.com/ContextualAI/gritlm + public_training_data=False, ) diff --git a/mteb/models/jasper_models.py b/mteb/models/jasper_models.py index 60fa4f6975..0062df2acc 100644 --- a/mteb/models/jasper_models.py +++ b/mteb/models/jasper_models.py @@ -13,6 +13,7 @@ from mteb.encoder_interface import PromptType from mteb.model_meta import ModelMeta +from .nvidia_models import nvidia_training_datasets from .wrapper import Wrapper logger = logging.getLogger(__name__) @@ -90,7 +91,8 @@ def encode( use_instructions=True, adapted_from=None, superseded_by=None, - training_datasets={ - "non_mteb": ["BAAI/Infinity-MM", "HuggingFaceFW/fineweb-edu"], - }, + training_datasets=nvidia_training_datasets, # "In jasper model the teacher model is nvidia/NV-Embed-v2", source https://huggingface.co/infgrad/jasper_en_vision_language_v1 + # "non_mteb": ["BAAI/Infinity-MM", "HuggingFaceFW/fineweb-edu"], + public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py index 122f190657..728ffaa98f 100644 --- a/mteb/models/jina_models.py +++ b/mteb/models/jina_models.py @@ -214,7 +214,7 @@ def encode( open_weights=True, revision="215a6e121fa0183376388ac6b1ae230326bfeaed", release_date="2024-09-18", # official release date - n_parameters=572 * 1e6, + n_parameters=int(572 * 1e6), max_tokens=8194, embed_dim=4096, license="cc-by-nc-4.0", @@ -222,4 +222,96 @@ def encode( framework=["Sentence Transformers", 
"PyTorch"], use_instructions=True, reference="https://huggingface.co/jinaai/jina-embeddings-v3", + training_datasets=None, + public_training_code=False, + public_training_data=False, +) + + +jina_embeddings_v2_base_en = ModelMeta( + name="jinaai/jina-embeddings-v2-base-en", + languages=["eng-Latn"], + open_weights=True, + revision="6e85f575bc273f1fd840a658067d0157933c83f0", + release_date="2023-09-27", + n_parameters=137_000_000, + memory_usage=None, + embed_dim=768, + license="apache-2.0", + max_tokens=8192, + reference="https://huggingface.co/jinaai/jina-embeddings-v2-base-en", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. CC +) + +jina_embeddings_v2_small_en = ModelMeta( + name="jinaai/jina-embeddings-v2-small-en", + languages=["eng-Latn"], + open_weights=True, + revision="796cff318cdd4e5fbe8b7303a1ef8cbec36996ef", + release_date="2023-09-27", + n_parameters=32_700_000, + memory_usage=None, + embed_dim=512, + license="apache-2.0", + max_tokens=8192, + reference="https://huggingface.co/jinaai/jina-embeddings-v2-small-en", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. CC and {"jinaai/negation-dataset": ["train"]} +) + +jina_embedding_b_en_v1 = ModelMeta( + name="jinaai/jina-embedding-b-en-v1", + languages=["eng-Latn"], + open_weights=True, + revision="aa0645035294a8c0607ce5bb700aba982cdff32c", + release_date="2023-07-07", + n_parameters=110_000_000, + memory_usage=None, + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/jinaai/jina-embedding-b-en-v1", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by="jinaai/jina-embeddings-v2-base-en", + adapted_from=None, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. CC and {"jinaai/negation-dataset": ["train"]} +) + +jina_embedding_s_en_v1 = ModelMeta( + name="jinaai/jina-embedding-s-en-v1", + languages=["eng-Latn"], + open_weights=True, + revision="c1fed70aa4823a640f1a7150a276e4d3b08dce08", + release_date="2023-07-07", + n_parameters=35_000_000, + memory_usage=None, + embed_dim=512, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/jinaai/jina-embedding-s-en-v1", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by="jinaai/jina-embeddings-v2-small-en", + adapted_from=None, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. 
CC and {"jinaai/negation-dataset": ["train"]} ) diff --git a/mteb/models/llm2vec_models.py b/mteb/models/llm2vec_models.py index e962289aac..cbc42fe5ed 100644 --- a/mteb/models/llm2vec_models.py +++ b/mteb/models/llm2vec_models.py @@ -20,6 +20,31 @@ def llm2vec_instruction(instruction): return instruction +llm2vec_supervised_training_data = { + # source, section g1: https://arxiv.org/pdf/2404.05961 + # splits assumed but unkown + "HotpotQA": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "NanoFEVERRetrieval": ["train"], + "MrTidyRetrieval": ["train"], + "T2Reranking": ["train"], +} + + class LLM2VecWrapper(Wrapper): def __init__( self, @@ -100,6 +125,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets=llm2vec_supervised_training_data, ) llm2vec_llama3_8b_unsupervised = ModelMeta( @@ -124,6 +152,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets={}, ) @@ -149,6 +180,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets=llm2vec_supervised_training_data, ) llm2vec_mistral7b_unsupervised = ModelMeta( @@ -173,6 +207,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets={}, ) llm2vec_llama2_7b_supervised = ModelMeta( @@ -197,6 +234,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets=llm2vec_supervised_training_data, ) llm2vec_llama2_7b_unsupervised = ModelMeta( @@ -221,6 +261,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets={}, ) llm2vec_sheared_llama_supervised = ModelMeta( @@ -245,6 +288,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets=llm2vec_supervised_training_data, ) llm2vec_sheared_llama_unsupervised = ModelMeta( @@ -269,4 +315,7 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets={}, ) diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index 5e8fcae0ac..88dad0050a 100644 --- 
a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -5,6 +5,10 @@ import torch from mteb.model_meta import ModelMeta, sentence_transformers_loader +from mteb.models.e5_models import E5_TRAINING_DATA + +from .bge_models import bge_m_training_data, bge_training_data +from .sentence_transformers_models import sent_trf_training_dataset Haon_Chen__speed_embedding_7b_instruct = ModelMeta( name="Haon-Chen/speed-embedding-7b-instruct", @@ -113,38 +117,47 @@ similarity_fn_name="cosine", use_instructions=None, training_datasets={ - "s2orc": ["train"], - "flax-sentence-embeddings/stackexchange_title_body_jsonl": ["train"], - "flax-sentence-embeddings/stackexchange_titlebody_best_voted_answer_jsonl": [ - "train" - ], - "flax-sentence-embeddings/stackexchange_title_best_voted_answer_jsonl": [ - "train" - ], - "flax-sentence-embeddings/stackexchange_titlebody_best_and_down_voted_answer_jsonl": [ - "train" - ], - "sentence-transformers/reddit-title-body": ["train"], - "msmarco": ["train"], - "gooaq": ["train"], - "yahoo_answers_topics": ["train"], - "code_search_net": ["train"], - "search_qa": ["train"], - "eli5": ["train"], - "snli": ["train"], - "multi_nli": ["train"], - "wikihow": ["train"], - "natural_questions": ["train"], - "trivia_qa": ["train"], - "embedding-data/sentence-compression": ["train"], - "embedding-data/flickr30k-captions": ["train"], - "embedding-data/altlex": ["train"], - "embedding-data/simple-wiki": ["train"], - "embedding-data/QQP": ["train"], - "embedding-data/SPECTER": ["train"], - "embedding-data/PAQ_pairs": ["train"], - "embedding-data/WikiAnswers": ["train"], - "sentence-transformers/embedding-training-data": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB + # "s2orc": ["train"], + # "flax-sentence-embeddings/stackexchange_title_body_jsonl": ["train"], + # "flax-sentence-embeddings/stackexchange_titlebody_best_voted_answer_jsonl": [ + # "train" + # ], + # "flax-sentence-embeddings/stackexchange_title_best_voted_answer_jsonl": [ + # "train" + # ], + # "flax-sentence-embeddings/stackexchange_titlebody_best_and_down_voted_answer_jsonl": [ + # "train" + # ], + # "sentence-transformers/reddit-title-body": ["train"], + # "msmarco": ["train"], + # "gooaq": ["train"], + # "yahoo_answers_topics": ["train"], + # "code_search_net": ["train"], + # "search_qa": ["train"], + # "eli5": ["train"], + # "snli": ["train"], + # "multi_nli": ["train"], + # "wikihow": ["train"], + # "natural_questions": ["train"], + # "trivia_qa": ["train"], + # "embedding-data/sentence-compression": ["train"], + # "embedding-data/flickr30k-captions": ["train"], + # "embedding-data/altlex": ["train"], + # "embedding-data/simple-wiki": ["train"], + # "embedding-data/QQP": ["train"], + # "embedding-data/SPECTER": ["train"], + # "embedding-data/PAQ_pairs": ["train"], + # "embedding-data/WikiAnswers": ["train"], + # "sentence-transformers/embedding-training-data": ["train"], }, adapted_from="hum-lodestone-v1", superseded_by=None, @@ -189,7 +202,8 @@ reference="https://huggingface.co/BeastyZ/e5-R-mistral-7b", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"BeastyZ/E5-R": ["train"]}, + training_datasets=E5_TRAINING_DATA, + # not MTEB: {"BeastyZ/E5-R": ["train"]}, adapted_from="/ConRetriever/public_weight_mistral", 
superseded_by=None, ) @@ -286,13 +300,14 @@ embed_dim=384, license="mit", open_weights=True, - public_training_data=True, + public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Bulbasaur", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=None, # source model is GTE-tiny where training data is unknown + # {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/dwsdwass", superseded_by=None, ) @@ -308,13 +323,14 @@ embed_dim=384, license="mit", open_weights=True, - public_training_data=True, + public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Ivysaur", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=None, # source model is GTE-tiny where training data is unknown + # not MTEB: {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/jhjghjgh", superseded_by=None, ) @@ -336,7 +352,8 @@ reference="https://huggingface.co/Mihaiii/Squirtle", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=bge_training_data, # source model is bge-base-en-v1.5 + # not MTEB: {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/test21", superseded_by=None, ) @@ -358,7 +375,8 @@ reference="https://huggingface.co/Mihaiii/Venusaur", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=None, # source model is unknown + # {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/test14", superseded_by=None, ) @@ -380,7 +398,8 @@ reference="https://huggingface.co/Mihaiii/Wartortle", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=bge_training_data, # distilled from bge-base-en-v1.5 + # {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/test22", superseded_by=None, ) @@ -468,7 +487,7 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets={}, # not in MTEB: {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="aubmindlab/bert-base-arabertv02", superseded_by=None, ) @@ -490,7 +509,9 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=sent_trf_training_dataset, + # not in MTEB + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", superseded_by=None, ) @@ -512,7 +533,9 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=sent_trf_training_dataset, # derived from + # not in MTEB: + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]},
adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2", superseded_by=None, ) @@ -534,7 +557,9 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-labse-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=None, # derived from labSE + # as well as: + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="sentence-transformers/LaBSE", superseded_by=None, ) @@ -556,7 +581,9 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=sent_trf_training_dataset, + # not in MTEB: + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="tomaarsen/mpnet-base-all-nli-triplet", superseded_by=None, ) @@ -578,7 +605,7 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets={}, # not in MTEB: "Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="UBC-NLP/MARBERTv2", superseded_by=None, ) @@ -710,7 +737,8 @@ reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.4", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"manu/embedding_data_v2_100k": ["train"]}, + training_datasets=None, + # Not in MTEB: {"manu/embedding_data_v2_100k": ["train"]}, adapted_from="croissantllm/CroissantCool-v0.2", superseded_by=None, ) @@ -1356,7 +1384,8 @@ reference="https://huggingface.co/aari1995/German_Semantic_STS_V2", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"stsb_multi_mt": ["train"]}, + training_datasets=None, # couldn't figure out the source model + # {"stsb_multi_mt": ["train"]}, adapted_from="/content/drive/MyDrive/Stanford_NLU/Project/false_friends/gbert_large_sts_only", superseded_by=None, ) @@ -1472,18 +1501,18 @@ reference="https://huggingface.co/deepvk/USER-bge-m3", similarity_fn_name="cosine", use_instructions=None, - training_datasets={ - "deepvk/ru-HNP": ["train"], - "deepvk/ru-WANLI": ["train"], - "Shitao/bge-m3-data": ["train"], - "RussianNLP/russian_super_glue": ["train"], - "reciTAL/mlsum": ["train"], - "Milana/russian_keywords": ["train"], - "IlyaGusev/gazeta": ["train"], - "d0rj/gsm8k-ru": ["train"], - "bragovo/dsum_ru": ["train"], - "CarlBrendt/Summ_Dialog_News": ["train"], - }, + training_datasets=bge_m_training_data, # derived from. 
+ # not in MTEB: + # "deepvk/ru-HNP": ["train"], + # "deepvk/ru-WANLI": ["train"], + # "Shitao/bge-m3-data": ["train"], + # "RussianNLP/russian_super_glue": ["train"], + # "reciTAL/mlsum": ["train"], + # "Milana/russian_keywords": ["train"], + # "IlyaGusev/gazeta": ["train"], + # "d0rj/gsm8k-ru": ["train"], + # "bragovo/dsum_ru": ["train"], + # "CarlBrendt/Summ_Dialog_News": ["train"], adapted_from="USER-bge-m3", superseded_by=None, ) @@ -1613,7 +1642,8 @@ reference="https://huggingface.co/shibing624/text2vec-base-multilingual", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"shibing624/nli-zh-all": ["train"]}, + training_datasets=sent_trf_training_dataset, + # not MTEB: {"shibing624/nli-zh-all": ["train"]}, adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", superseded_by=None, ) diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index 37da533457..1a58bbf8e3 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -9,6 +9,7 @@ from mteb.model_meta import ModelMeta +from .bge_models import bge_training_data from .wrapper import Wrapper logger = logging.getLogger(__name__) @@ -72,21 +73,10 @@ def encode( reference="https://huggingface.co/minishlab/M2V_base_glove_subword", use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", - public_training_data=True, - public_training_code=None, # distilled model - training_datasets={ # same as adapted from - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], - "MLQARetrieval": [ - "validation", - "test", - ], - }, superseded_by=None, + training_datasets=bge_training_data, # distilled + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) @@ -110,20 +100,9 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - public_training_data=True, - public_training_code=None, # distilled model - training_datasets={ # same as adapted from - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], - "MLQARetrieval": [ - "validation", - "test", - ], - }, + training_datasets=bge_training_data, # distilled + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) m2v_base_output = ModelMeta( @@ -146,20 +125,9 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - public_training_data=True, - public_training_code=None, # distilled model - training_datasets={ # same as adapted from - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], - "MLQARetrieval": [ - "validation", - "test", - ], - }, + training_datasets=bge_training_data, # distilled + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) m2v_multilingual_output = ModelMeta( @@ -182,8 +150,9 @@ def encode( use_instructions=False, adapted_from="sentence-transformers/LaBSE", superseded_by=None, - public_training_data=True, - public_training_code=None, # distilled model + training_datasets=None, + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) potion_base_2m = ModelMeta( @@ -206,20 +175,9 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - public_training_data=True, - public_training_code=None, # distilled 
model - training_datasets={ # same as adapted from - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], - "MLQARetrieval": [ - "validation", - "test", - ], - }, + training_datasets=bge_training_data, # distilled + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) potion_base_4m = ModelMeta( @@ -242,20 +200,9 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - public_training_data=True, - public_training_code=None, # distilled model - training_datasets={ # same as adapted from - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], - "MLQARetrieval": [ - "validation", - "test", - ], - }, + training_datasets=bge_training_data, # distilled + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) potion_base_8m = ModelMeta( @@ -278,18 +225,7 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - public_training_data=True, - public_training_code=None, # distilled model - training_datasets={ # same as adapted from - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], - "MLQARetrieval": [ - "validation", - "test", - ], - }, + training_datasets=bge_training_data, # distilled + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) diff --git a/mteb/models/nvidia_models.py b/mteb/models/nvidia_models.py index 72274b41de..6bf4e041aa 100644 --- a/mteb/models/nvidia_models.py +++ b/mteb/models/nvidia_models.py @@ -72,6 +72,54 @@ def encode( return embeddings +nvidia_training_datasets = { + # source: https://arxiv.org/pdf/2405.17428 + "ArguAna": ["train"], + "ArguAna-PL": ["train"], + "NanoArguAnaRetrieval": ["train"], + "HotpotQA": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "NanoFEVERRetrieval": ["train"], + "FiQA2018": ["train"], + "FiQA2018-PL": ["train"], # translation not trained on + "STS12": ["train"], + "STS22": ["train"], + "AmazonReviewsClassification": ["train"], + "AmazonCounterfactualClassification": ["train"], + "Banking77Classification": ["train"], + "EmotionClassification": ["train"], + "ImdbClassification": ["train"], + "MTOPIntentClassification": ["train"], + "ToxicConversationsClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "ArxivClusteringP2P": ["train"], + "ArxivClusteringP2P.v2": ["train"], + "ArxivClusteringS2S": ["train"], + "ArxivClusteringS2S.v2": ["train"], + "BiorxivClusteringP2P": ["train"], + "BiorxivClusteringP2P.v2": ["train"], + "BiorxivClusteringS2S": ["train"], + "BiorxivClusteringS2S.v2": ["train"], + "MedrxivClusteringP2P": ["train"], + "MedrxivClusteringP2P.v2": ["train"], + "MedrxivClusteringS2S": ["train"], + "MedrxivClusteringS2S.v2": ["train"], + "TwentyNewsgroupsClustering": ["train"], + "TwentyNewsgroupsClustering.v2": ["train"], + "STSBenchmark": ["train"], + "STSBenchmarkMultilingualSTS": ["train"], # translated, 
not trained on +} NV_embed_v2 = ModelMeta( loader=partial( # type: ignore NvEmbedWrapper, @@ -92,6 +140,9 @@ def encode( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + training_datasets=nvidia_training_datasets, + public_training_code=None, + public_training_data=True, ) NV_embed_v1 = ModelMeta( @@ -114,4 +165,7 @@ def encode( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + training_datasets=nvidia_training_datasets, + public_training_code=None, + public_training_data=True, ) diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index a520bdca11..6bca544b11 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -6,6 +6,8 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader +from .bge_models import bge_training_data + rubert_tiny2 = ModelMeta( name="cointegrated/rubert-tiny2", languages=["rus_Cyrl"], @@ -96,20 +98,27 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, training_datasets={ - "deepvk/ru-HNP": ["train"], - "deepvk/ru-WANLI": ["train"], - "Shitao/bge-m3-data": ["train"], - "RussianNLP/russian_super_glue": ["train"], - "reciTAL/mlsum": ["train"], - "Helsinki-NLP/opus-100": ["train"], - "Helsinki-NLP/bible_para": ["train"], - "d0rj/rudetoxifier_data_detox": ["train"], - "s-nlp/ru_paradetox": ["train"], - "Milana/russian_keywords": ["train"], - "IlyaGusev/gazeta": ["train"], - "d0rj/gsm8k-ru": ["train"], - "bragovo/dsum_ru": ["train"], - "CarlBrendt/Summ_Dialog_News": ["train"], + "BibleNLPBitextMining": ["train"], + "MLSUMClusteringP2P": ["train"], + "MLSUMClusteringP2P.v2": ["train"], + "MLSUMClusteringS2S": ["train"], + "MLSUMClusteringS2S.v2": ["train"], + **bge_training_data, + # not MTEB: + # "deepvk/ru-HNP": ["train"], + # "deepvk/ru-WANLI": ["train"], + # "Shitao/bge-m3-data": ["train"], + # "RussianNLP/russian_super_glue": ["train"], + # "reciTAL/mlsum": ["train"], + # "Helsinki-NLP/opus-100": ["train"], + # "Helsinki-NLP/bible_para": ["train"], + # "d0rj/rudetoxifier_data_detox": ["train"], + # "s-nlp/ru_paradetox": ["train"], + # "Milana/russian_keywords": ["train"], + # "IlyaGusev/gazeta": ["train"], + # "d0rj/gsm8k-ru": ["train"], + # "bragovo/dsum_ru": ["train"], + # "CarlBrendt/Summ_Dialog_News": ["train"], }, ) @@ -213,7 +222,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - training_datasets={"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, + training_datasets=None, # source model is unknown + # Not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, ) labse_ru_turbo = ModelMeta( @@ -231,7 +241,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - training_datasets={"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, + training_datasets=None, # source model is unknown + # not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, ) diff --git a/mteb/models/salesforce_models.py b/mteb/models/salesforce_models.py index b1d45b949c..18db09a2b5 100644 --- a/mteb/models/salesforce_models.py +++ b/mteb/models/salesforce_models.py @@ -40,6 +40,19 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + adapted_from="intfloat/e5-mistral-7b-instruct", + public_training_code=False, + public_training_data=False, +
training_datasets={ # inherits from e5 + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + }, ) @@ -68,4 +81,16 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_code=False, + public_training_data=False, + training_datasets={ # inherits from e5 + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + }, ) diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index 18b08f16f3..f8b01c6eaf 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -60,6 +60,40 @@ "zho_Hant", ] +sent_trf_training_dataset = { + # derived from datasheets + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB + # "s2orc": ["train"], + # "flax-sentence-embeddings/stackexchange_xml": ["train"], + # "ms_marco": ["train"], + # "gooaq": ["train"], + # "yahoo_answers_topics": ["train"], + # "code_search_net": ["train"], + # "search_qa": ["train"], + # "eli5": ["train"], + # "snli": ["train"], + # "multi_nli": ["train"], + # "wikihow": ["train"], + # "natural_questions": ["train"], + # "trivia_qa": ["train"], + # "embedding-data/sentence-compression": ["train"], + # "embedding-data/flickr30k-captions": ["train"], + # "embedding-data/altlex": ["train"], + # "embedding-data/simple-wiki": ["train"], + # "embedding-data/QQP": ["train"], + # "embedding-data/SPECTER": ["train"], + # "embedding-data/PAQ_pairs": ["train"], + # "embedding-data/WikiAnswers": ["train"], +} + all_MiniLM_L6_v2 = ModelMeta( name="sentence-transformers/all-MiniLM-L6-v2", languages=["eng-Latn"], @@ -77,40 +111,31 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, # does sentence transformer count? 
+ training_datasets=sent_trf_training_dataset, + public_training_code=True, + public_training_data=True, +) + +all_MiniLM_L12_v2 = ModelMeta( + name="sentence-transformers/all-MiniLM-L12-v2", + languages=["eng-Latn"], + open_weights=True, + revision="364dd28d28dcd3359b537f3cf1f5348ba679da62", + release_date="2021-08-30", + n_parameters=33_400_000, + memory_usage=None, + embed_dim=384, + license="apache-2.0", + max_tokens=256, + reference="https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + training_datasets=sent_trf_training_dataset, + public_training_code=True, public_training_data=True, - training_datasets={ - # source: frontmatter in readme - # trained on stack exchange, unsure if sources match - "StackExchangeClusteringP2P": ["test"], - "StackExchangeClusteringP2P.v2": ["test"], - "StackExchangeClustering": ["test"], - "StackExchangeClustering.v2": ["test"], - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], - # Non MTEB sources - # "s2orc": ["train"], - # "flax-sentence-embeddings/stackexchange_xml": ["train"], - # "ms_marco": ["train"], - # "gooaq": ["train"], - # "yahoo_answers_topics": ["train"], - # "code_search_net": ["train"], - # "search_qa": ["train"], - # "eli5": ["train"], - # "snli": ["train"], - # "multi_nli": ["train"], - # "wikihow": ["train"], - # "trivia_qa": ["train"], - # "embedding-data/sentence-compression": ["train"], - # "embedding-data/flickr30k-captions": ["train"], - # "embedding-data/altlex": ["train"], - # "embedding-data/simple-wiki": ["train"], - # "embedding-data/QQP": ["train"], - # "embedding-data/SPECTER": ["train"], - # "embedding-data/PAQ_pairs": ["train"], - # "embedding-data/WikiAnswers": ["train"], - }, ) paraphrase_multilingual_MiniLM_L12_v2 = ModelMeta( @@ -130,6 +155,9 @@ use_instructions=False, superseded_by=None, adapted_from=None, + training_datasets=sent_trf_training_dataset, # assumed (probably some parallel as well) + public_training_code=True, + public_training_data=True, ) paraphrase_multilingual_mpnet_base_v2 = ModelMeta( @@ -149,6 +177,20 @@ use_instructions=False, superseded_by=None, adapted_from=None, + training_datasets=sent_trf_training_dataset, + # + https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/paraphrases/training.py + # which include (not in MTEB): + # "all-nli": all_nli_train_dataset, + # "sentence-compression": sentence_compression_train_dataset, + # "simple-wiki": simple_wiki_train_dataset, + # "altlex": altlex_train_dataset, + # "quora-duplicates": quora_train_dataset, + # "coco-captions": coco_train_dataset, + # "flickr30k-captions": flickr_train_dataset, + # "yahoo-answers": yahoo_answers_train_dataset, + # "stack-exchange": stack_exchange_train_dataset, + public_training_code=True, + public_training_data=True, ) labse = ModelMeta( @@ -168,6 +210,9 @@ use_instructions=False, superseded_by=None, adapted_from=None, + training_datasets=None, # scraped and mined webdata including CC, wiki, see section 3.1 https://aclanthology.org/2022.acl-long.62.pdf + public_training_code=True, # https://www.kaggle.com/models/google/labse/tensorFlow2/labse/2?tfhub-redirect=true + public_training_data=False, ) multi_qa_MiniLM_L6_cos_v1 = ModelMeta( @@ -186,7 +231,10 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, superseded_by=None, - adapted_from=None, + 
adapted_from="nreimers/MiniLM-L6-H384-uncased", + training_datasets=sent_trf_training_dataset, # assumed + public_training_code=None, + public_training_data=None, ) all_mpnet_base_v2 = ModelMeta( @@ -206,280 +254,11 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, # does sentence transformer count? - public_training_data=True, - training_datasets={ - # source: frontmatter in readme - # trained on stack exchange, unsure if sources match - "StackExchangeClusteringP2P": ["test"], - "StackExchangeClusteringP2P.v2": ["test"], - "StackExchangeClustering": ["test"], - "StackExchangeClustering.v2": ["test"], - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], - # Non MTEB source - # "s2orc": ["train"], - # "flax-sentence-embeddings/stackexchange_xml": ["train"], - # "ms_marco": ["train"], - # "gooaq": ["train"], - # "yahoo_answers_topics": ["train"], - # "code_search_net": ["train"], - # "search_qa": ["train"], - # "eli5": ["train"], - # "snli": ["train"], - # "multi_nli": ["train"], - # "wikihow": ["train"], - # "trivia_qa": ["train"], - # "embedding-data/sentence-compression": ["train"], - # "embedding-data/flickr30k-captions": ["train"], - # "embedding-data/altlex": ["train"], - # "embedding-data/simple-wiki": ["train"], - # "embedding-data/QQP": ["train"], - # "embedding-data/SPECTER": ["train"], - # "embedding-data/PAQ_pairs": ["train"], - # "embedding-data/WikiAnswers": ["train"], - }, -) - -# Source: https://arxiv.org/pdf/1907.04307 -use_multilingual_languages = [ - "ara-Arab", # Arabic - "zho-Hans", # Chinese (Simplified, PRC) - "zho-Hant", # Chinese (Traditional, Taiwan) - "nld-Latn", # Dutch - "eng-Latn", # English - "deu-Latn", # German - "fra-Latn", # French - "ita-Latn", # Italian - "por-Latn", # Portuguese - "spa-Latn", # Spanish - "jpn-Jpan", # Japanese - "kor-Kore", # Korean - "rus-Cyrl", # Russian - "pol-Latn", # Polish - "tha-Thai", # Thai - "tur-Latn", # Turkish -] -use_multilingual_training_data = { - # I'm not certain since they mined this themselves, but I would assume that there is significant overlap - "StackOverflowQARetrieval": ["train", "test"], - # Not in MTEB: - # - SNLI translated to 15 languages (could have intersections with other NLI datasets) - # - Translation pairs: Mined from the internet - # - QA mined from Reddit, StackOverflow, YahooAnswers (could be problematic) -} -distiluse_base_multilingual_cased_v2 = ModelMeta( - name="sentence-transformers/distiluse-base-multilingual-cased-v2", - languages=use_multilingual_languages, - open_weights=True, - revision="dad0fa1ee4fa6e982d3adbce87c73c02e6aee838", - release_date="2021-06-22", # First commit - n_parameters=135 * 1e6, - memory_usage=None, - embed_dim=768, - license="apache-2.0", - max_tokens=512, - reference="https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from=None, + training_datasets=sent_trf_training_dataset, public_training_code=True, public_training_data=True, - training_datasets=use_multilingual_training_data, ) -use_cmlm_multilingual = ModelMeta( - name="sentence-transformers/use-cmlm-multilingual", - languages=paraphrase_langs, - open_weights=True, - revision="6f8ff6583c371cbc4d6d3b93a5e37a888fd54574", - release_date="2022-04-14", # First commit - n_parameters=472 * 1e6, - memory_usage=None, - embed_dim=768, - license="apache-2.0", - max_tokens=256, - 
reference="https://huggingface.co/sentence-transformers/use-cmlm-multilingual", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from="sentence-transformers/LaBSE", - public_training_code=True, - public_training_data=True, - training_datasets={ - # Not in MTEB: - # - SNLI - # - Translation corpus based largely on Uszkoreit et al. (2010) - }, -) - - -jina_embeddings_v2_base_en = ModelMeta( - name="jinaai/jina-embeddings-v2-base-en", - languages=["eng-Latn"], - open_weights=True, - revision="6e85f575bc273f1fd840a658067d0157933c83f0", - release_date="2023-09-27", - n_parameters=137_000_000, - memory_usage=None, - embed_dim=768, - license="apache-2.0", - max_tokens=8192, - reference="https://huggingface.co/jinaai/jina-embeddings-v2-base-en", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from=None, - training_datasets={"allenai/c4": ["train"]}, -) - -jina_embeddings_v2_base_zh = ModelMeta( - name="jinaai/jina-embeddings-v2-base-zh", - languages=["eng-Latn", "zho-Hans"], - open_weights=True, - revision="c1ff9086a89a1123d7b5eff58055a665db4fb4b9", - release_date="2024-01-10", - n_parameters=161_000_000, - memory_usage=None, - embed_dim=768, - license="apache-2.0", - max_tokens=8192, - reference="https://huggingface.co/jinaai/jina-embeddings-v2-base-zh", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from=None, - training_datasets={ - # source: https://arxiv.org/pdf/2402.17016 - "XNLI": ["train"], - "MLSumClusteringS2S": ["train"], - "MLSumClusteringP2P": ["train"], - # Not in MTEB: - # - MQA - # - XLSUM - }, -) - - -jina_embeddings_v2_small_en = ModelMeta( - name="jinaai/jina-embeddings-v2-small-en", - languages=["eng-Latn"], - open_weights=True, - revision="796cff318cdd4e5fbe8b7303a1ef8cbec36996ef", - release_date="2023-09-27", - n_parameters=32_700_000, - memory_usage=None, - embed_dim=512, - license="apache-2.0", - max_tokens=8192, - reference="https://huggingface.co/jinaai/jina-embeddings-v2-small-en", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from=None, - training_datasets={"jinaai/negation-dataset": ["train"]}, -) - -jina_embedding_b_en_v1 = ModelMeta( - name="jinaai/jina-embedding-b-en-v1", - languages=["eng-Latn"], - open_weights=True, - revision="aa0645035294a8c0607ce5bb700aba982cdff32c", - release_date="2023-07-07", - n_parameters=110_000_000, - memory_usage=None, - embed_dim=768, - license="apache-2.0", - max_tokens=512, - reference="https://huggingface.co/jinaai/jina-embedding-b-en-v1", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by="jinaai/jina-embeddings-v2-base-en", - adapted_from=None, - training_datasets={"jinaai/negation-dataset": ["train"]}, -) - -jina_embedding_s_en_v1 = ModelMeta( - name="jinaai/jina-embedding-s-en-v1", - languages=["eng-Latn"], - open_weights=True, - revision="c1fed70aa4823a640f1a7150a276e4d3b08dce08", - release_date="2023-07-07", - n_parameters=35_000_000, - memory_usage=None, - embed_dim=512, - license="apache-2.0", - max_tokens=512, - reference="https://huggingface.co/jinaai/jina-embedding-s-en-v1", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - 
superseded_by="jinaai/jina-embeddings-v2-small-en", - adapted_from=None, - training_datasets={"jinaai/negation-dataset": ["train"]}, -) - - -all_MiniLM_L12_v2 = ModelMeta( - name="sentence-transformers/all-MiniLM-L12-v2", - languages=["eng-Latn"], - open_weights=True, - revision="364dd28d28dcd3359b537f3cf1f5348ba679da62", - release_date="2021-08-30", - n_parameters=33_400_000, - memory_usage=None, - embed_dim=384, - license="apache-2.0", - max_tokens=256, - reference="https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from=None, - public_training_code=False, # does sentence transformer count? - public_training_data=True, - training_datasets={ - # source: frontmatter in readme - # trained on stack exchange, unsure if sources match - "StackExchangeClusteringP2P": ["test"], - "StackExchangeClusteringP2P.v2": ["test"], - "StackExchangeClustering": ["test"], - "StackExchangeClustering.v2": ["test"], - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], - # Non MTEB sources - # "s2orc": ["train"], - # "flax-sentence-embeddings/stackexchange_xml": ["train"], - # "ms_marco": ["train"], - # "gooaq": ["train"], - # "yahoo_answers_topics": ["train"], - # "code_search_net": ["train"], - # "search_qa": ["train"], - # "eli5": ["train"], - # "snli": ["train"], - # "multi_nli": ["train"], - # "wikihow": ["train"], - # "trivia_qa": ["train"], - # "embedding-data/sentence-compression": ["train"], - # "embedding-data/flickr30k-captions": ["train"], - # "embedding-data/altlex": ["train"], - # "embedding-data/simple-wiki": ["train"], - # "embedding-data/QQP": ["train"], - # "embedding-data/SPECTER": ["train"], - # "embedding-data/PAQ_pairs": ["train"], - # "embedding-data/WikiAnswers": ["train"], - }, -) microllama_text_embedding = ModelMeta( name="keeeeenw/MicroLlama-text-embedding", @@ -499,9 +278,11 @@ superseded_by=None, adapted_from=None, training_datasets={ - # shource yaml header: - "NQ": ["test"] - # not in MTEB: + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB # "sentence-transformers/all-nli": ["train"], # "sentence-transformers/stsb": ["train"], # "sentence-transformers/quora-duplicates": ["train"], diff --git a/mteb/models/stella_models.py b/mteb/models/stella_models.py index a738f4461e..c7a1a0f347 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -28,6 +28,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_400M_v5", + training_datasets=None, + public_training_data=False, # currently not released + public_training_code=False, ) stella_en_1_5b = ModelMeta( @@ -52,4 +55,7 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_1.5B_v5", + training_datasets=None, + public_training_data=False, # currently not released + public_training_code=False, ) diff --git a/mteb/models/uae_models.py b/mteb/models/uae_models.py index 5c47cba67d..ffdaa29f74 100644 --- a/mteb/models/uae_models.py +++ b/mteb/models/uae_models.py @@ -75,4 +75,13 @@ def encode( framework=["Sentence Transformers", "PyTorch"], reference="https://huggingface.co/WhereIsAI/UAE-Large-V1", use_instructions=True, + training_datasets={ + # source: 
https://arxiv.org/pdf/2309.12871 + # not in MTEB + "MNLI": [], + "NLI": [], + "SNLI": [], + }, + public_training_data=True, + public_training_code=True, ) diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index 70f61e2c52..12925b235b 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -157,6 +157,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_finance_2 = ModelMeta( @@ -179,6 +182,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_law_2 = ModelMeta( @@ -201,6 +207,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_code_2 = ModelMeta( @@ -223,6 +232,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_large_2 = ModelMeta( @@ -245,6 +257,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_2 = ModelMeta( @@ -267,6 +282,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, + public_training_code=False, ) voyage_multilingual_2 = ModelMeta( name="voyageai/voyage-multilingual-2", @@ -288,6 +306,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_3 = ModelMeta( @@ -310,6 +331,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_3_lite = ModelMeta( @@ -332,4 +356,7 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, )
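Note on the convention this patch introduces: the shared training-data dictionaries (E5_TRAINING_DATA, bge_training_data, nvidia_training_datasets, sent_trf_training_dataset, ...) are plain dicts mapping an MTEB task name to the splits seen during training, with None reserved for models whose training data is unknown. A minimal standalone sketch of how they compose and how a consumer might query them (dict contents are abbreviated from the diff; saw_split is a hypothetical helper, not part of the mteb API):

# Minimal sketch of the shared training-data convention in this diff.
# Dict contents are abbreviated; `saw_split` is a hypothetical helper.

E5_TRAINING_DATA = {  # abbreviated from e5_models.py above
    "MSMARCO": ["train"],
    "NQ": ["train"],
}

# A model fine-tuned on top of E5 extends the base dict instead of
# restating it, mirroring the `**bge_training_data` pattern used in
# the deepvk/USER-base entry in ru_sentence_models.py:
derived_training_data = {
    "MIRACLRetrieval": ["train"],
    **E5_TRAINING_DATA,
}


def saw_split(
    training_datasets: dict[str, list[str]] | None, task: str, split: str
) -> bool | None:
    """True/False when provenance is known; None when it is unknown."""
    if training_datasets is None:  # e.g. the voyage API models above
        return None
    return split in training_datasets.get(task, [])


print(saw_split(derived_training_data, "MSMARCO", "train"))  # True
print(saw_split(derived_training_data, "NQ", "test"))  # False
print(saw_split(None, "NQ", "test"))  # None (unknown provenance)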
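The public_training_code / public_training_data fields are likewise used tri-state throughout the patch: True (released), False (checked and not released), None (couldn't determine). An illustrative helper, not part of the mteb API, that keeps the distinction visible when reporting:

# Illustrative only: renders the tri-state provenance flags used in this diff.

def describe(flag: bool | None, what: str) -> str:
    if flag is None:
        return f"{what}: unknown (couldn't find)"
    return f"{what}: {'public' if flag else 'not public'}"


print(describe(True, "training code"))   # e.g. GritLM, LLM2Vec
print(describe(False, "training data"))  # e.g. the voyage models
print(describe(None, "training code"))   # e.g. NV-Embed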