embeddings-benchmark · KennethEnevoldsen · Jan 17, 2025 · Jan 11, 2025 · Jan 11, 2025 · Jan 11, 2025
diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py
@@ -298,6 +298,60 @@
     "zho_Hans",  # zh
 ]
 
+bge_m_training_data = {
+    # task name -> splits listed as training data; source: https://arxiv.org/pdf/2402.03216
+    "MIRACLRetrieval": ["train"],
+    "MIRACLRetrievalHardNegatives": ["train"],
+    "MIRACLReranking": ["train"],
+    "LeCaRDv2": ["train"],
+    "CMedQAv1-reranking": ["train"],
+    "CMedQAv2-reranking": ["train"],
+    "MrTidyRetrieval": ["train"],
+    "T2Reranking": ["train"],
+    "MSMARCO": ["train"],
+    "MSMARCOHardNegatives": ["train"],
+    "NanoMSMARCORetrieval": ["train"],
+    "MSMARCO-PL": ["train"],  # translated variant; the translation itself was not trained on
+    "NQ": ["train"],
+    "NQHardNegatives": ["train"],
+    "NanoNQRetrieval": ["train"],
+    "NQ-PL": ["train"],  # translated variant; the translation itself was not trained on
+    "HotpotQA": ["train"],
+    "HotpotQA-PL": ["train"],  # translated variant; the translation itself was not trained on
+    "HotpotQAHardNegatives": ["train"],
+    # + synthetic data (not listed here)
+}
+
+bge_training_data = {
+    # task name -> splits listed as training data; source: https://data.baai.ac.cn/details/BAAI-MTP
+    "NQ": ["test"],
+    "NQHardNegatives": ["test"],
+    "AmazonReviewsClassification": [
+        "validation",
+        "test",
+    ],  # assumed from: amazon_reviews_multi
+    "MLQARetrieval": [
+        "validation",
+        "test",
+    ],  # assumed from: mlqa (question, context)
+    # not in mteb
+    # Dataset	Pairs
+    # wudao	(title, passage)
+    # cmrc2018	(query, context)
+    # dureader	(query, context)
+    # simclue	(sentence_a, sentence_b)
+    # csl	(title, abstract)
+    # amazon_reviews_multi	(title, body)
+    # wiki_atomic_edits	(base_sentence, edited_sentence)
+    # mlqa	(question, context)
+    # xlsum	(title, summary) (title, text)
+    # "sentence-transformers data": [],  # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further
+    # "wikipedia": [],  # title + section title, passage
+    # "reddit": [],  # title, body
+    # "stackexchange": [],  # (title, upvoted answer) (title+body, upvoted answer)
+    # "s2orc": [],  # (title, abstract) (title, citation title) (abstract, citation abstract)
+}
+
 bge_small_en_v1_5 = ModelMeta(
     loader=partial(  # type: ignore
         sentence_transformers_loader,
@@ -321,35 +375,7 @@
     use_instructions=True,
     public_training_data=True,  # https://data.baai.ac.cn/details/BAAI-MTP
     public_training_code=None,  # seemingly released (at least for some models, but the link is broken
-    training_datasets={
-        # source: https://data.baai.ac.cn/details/BAAI-MTP
-        "NQ": ["test"],
-        "NQHardNegatives": ["test"],
-        "AmazonReviewsClassification": [
-            "validation",
-            "test",
-        ],  # assumed from: amazon_reviews_multi
-        "MLQARetrieval": [
-            "validation",
-            "test",
-        ],  # assumed from mlqa	(question, context)
-        # not in mteb
-        # Dataset	Pairs
-        # wudao	(title, passage)
-        # cmrc2018	(query, context)
-        # dureader	(query, context)
-        # simclue	(sentence_a, sentence_b)
-        # csl	(title, abstract)
-        # amazon_reviews_multi	(title, body)
-        # wiki_atomic_edits	(base_sentence, edited_sentence)
-        # mlqa	(question, context)
-        # xlsum	(title, summary) (title, text)
-        # "sentence-transformers data": [],  # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further
-        # "wikipedia": [],  # title + section title, passage
-        # "reddit": [],  # title, body
-        # "stackexchange": [],  # (title, upvoted answer) (title+body, upvoted answer)
-        # "s2orc": [],  # (title, abstract) (title, citation title) (abstract, citation abstract)
-    },
+    training_datasets=bge_training_data,
 )
 
 bge_base_en_v1_5 = ModelMeta(
@@ -375,35 +401,7 @@
     use_instructions=True,
     public_training_data=True,  # https://data.baai.ac.cn/details/BAAI-MTP
     public_training_code=None,  # seemingly released (at least for some models, but the link is broken
-    training_datasets={
-        # source: https://data.baai.ac.cn/details/BAAI-MTP
-        "NQ": ["test"],
-        "NQHardNegatives": ["test"],
-        "AmazonReviewsClassification": [
-            "validation",
-            "test",
-        ],  # assumed from: amazon_reviews_multi
-        "MLQARetrieval": [
-            "validation",
-            "test",
-        ],  # assumed from mlqa	(question, context)
-        # not in mteb
-        # Dataset	Pairs
-        # wudao	(title, passage)
-        # cmrc2018	(query, context)
-        # dureader	(query, context)
-        # simclue	(sentence_a, sentence_b)
-        # csl	(title, abstract)
-        # amazon_reviews_multi	(title, body)
-        # wiki_atomic_edits	(base_sentence, edited_sentence)
-        # mlqa	(question, context)
-        # xlsum	(title, summary) (title, text)
-        # "sentence-transformers data": [],  # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further
-        # "wikipedia": [],  # title + section title, passage
-        # "reddit": [],  # title, body
-        # "stackexchange": [],  # (title, upvoted answer) (title+body, upvoted answer)
-        # "s2orc": [],  # (title, abstract) (title, citation title) (abstract, citation abstract)
-    },
+    training_datasets=bge_training_data,
 )
 
 bge_large_en_v1_5 = ModelMeta(
@@ -429,35 +427,7 @@
     use_instructions=True,
     public_training_data=True,  # https://data.baai.ac.cn/details/BAAI-MTP
     public_training_code=None,  # seemingly released (at least for some models, but the link is broken
-    training_datasets={
-        # source: https://data.baai.ac.cn/details/BAAI-MTP
-        "NQ": ["test"],
-        "NQHardNegatives": ["test"],
-        "AmazonReviewsClassification": [
-            "validation",
-            "test",
-        ],  # assumed from: amazon_reviews_multi
-        "MLQARetrieval": [
-            "validation",
-            "test",
-        ],  # assumed from mlqa	(question, context)
-        # not in mteb
-        # Dataset	Pairs
-        # wudao	(title, passage)
-        # cmrc2018	(query, context)
-        # dureader	(query, context)
-        # simclue	(sentence_a, sentence_b)
-        # csl	(title, abstract)
-        # amazon_reviews_multi	(title, body)
-        # wiki_atomic_edits	(base_sentence, edited_sentence)
-        # mlqa	(question, context)
-        # xlsum	(title, summary) (title, text)
-        # "sentence-transformers data": [],  # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further
-        # "wikipedia": [],  # title + section title, passage
-        # "reddit": [],  # title, body
-        # "stackexchange": [],  # (title, upvoted answer) (title+body, upvoted answer)
-        # "s2orc": [],  # (title, abstract) (title, citation title) (abstract, citation abstract)
-    },
+    training_datasets=bge_training_data,
 )
 
 bge_small_zh_v1_5 = ModelMeta(

diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py
@@ -6,7 +6,7 @@
 
 from mteb.model_meta import ModelMeta
 
-from .e5_models import E5_PAPER_RELEASE_DATE, XLMR_LANGUAGES
+from .e5_models import E5_PAPER_RELEASE_DATE, E5_TRAINING_DATA, XLMR_LANGUAGES
 from .instruct_wrapper import instruct_wrapper
 
 MISTRAL_LANGUAGES = ["eng_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "spa_Latn"]
@@ -40,6 +40,9 @@
     embed_dim=1024,
     license="mit",
     max_tokens=514,
+    public_training_data=False,
+    public_training_code=False,
+    training_datasets=E5_TRAINING_DATA,
 )
 
 e5_mistral = ModelMeta(
@@ -69,4 +72,7 @@
     embed_dim=4096,
     license="mit",
     max_tokens=32768,
+    public_training_data=False,
+    public_training_code=False,
+    training_datasets=E5_TRAINING_DATA,
 )