Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 26 additions & 89 deletions mteb/models/bge_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
model_prompts = {"query": "Represent this sentence for searching relevant passages: "}
model_prompts_zh = {"query": "为这个句子生成表示以用于检索相关文章:"}

bge_m_training_data = {
# source: https://arxiv.org/pdf/2402.03216
bge_m3_training_data = {
# source: https://arxiv.org/abs/2402.03216
"MIRACLRetrieval": ["train"],
"MIRACLRetrievalHardNegatives": ["train"],
"MIRACLReranking": ["train"],
Expand All @@ -28,6 +28,28 @@
"HotpotQA": ["train"],
"HotpotQA-PL": ["train"], # translation not trained on
"HotpotQAHardNegatives": ["train"],
"T2Retrieval": ["train"],
"DuReader": ["train"],
"MMarcoReranking": ["train"],
"CodeSearchNet": ["train"],
# not in mteb
# "s2orc"
# Wikipedia
# "xP3"
# "mC4"
# "CC-News"
# "MTP"
# "NLLB"
# "CCMatrix"
# TriviaQA
# COL-IEE
# PubMedQA
# SQuAD
# SimCSE
# mMARCO-ZH
# LawGPT
# NLI-zh2, LeCaRDv2,
# NLI, MultiLongDoc (their synthetic)
# + synthetic data
}

Expand Down Expand Up @@ -89,38 +111,6 @@
# "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract)
}

bgem3_training_data = {
# source https://arxiv.org/abs/2402.03216
"T2Retrieval": ["train"],
"DuReader": ["train"],
"MMarcoReranking": ["train"],
"CMedQAv2-reranking": ["train"],
"HotpotQA": ["train"],
"NQ": ["train"],
"MSMARCO": ["train"],
"MrTidyRetrieval": ["train"],
"MIRACLRetrieval": ["train"],
"CodeSearchNet": ["train"],
# not in mteb
# "s2orc"
# Wikipedia
# "xP3"
# "mC4"
# "CC-News"
# "MTP"
# "NLLB"
# "CCMatrix"
# TriviaQA
# COL-IEE
# PubMedQA
# SQuAD
# SimCSE
# mMARCO-ZH
# LawGPT
# NLI-zh2, LeCaRDv2,
# NLI, MultiLongDoc (their synthetic)
}

# https://huggingface.co/BAAI/bge-m3/discussions/29
bgem3_languages = [
"afr_Latn", # af
Expand Down Expand Up @@ -298,59 +288,6 @@
"zho_Hans", # zh
]

bge_m_training_data = {
# source: https://arxiv.org/pdf/2402.03216
"MIRACLRetrieval": ["train"],
"MIRACLRetrievalHardNegatives": ["train"],
"MIRACLReranking": ["train"],
"LeCaRDv2": ["train"],
"CMedQAv1-reranking": ["train"],
"CMedQAv2-reranking": ["train"],
"MrTidyRetrieval": ["train"],
"T2Reranking": ["train"],
"MSMARCO": ["train"],
"MSMARCOHardNegatives": ["train"],
"NanoMSMARCORetrieval": ["train"],
"MSMARCO-PL": ["train"], # translation not trained on
"NQ": ["train"],
"NQHardNegatives": ["train"],
"NanoNQRetrieval": ["train"],
"NQ-PL": ["train"], # translation not trained on
"HotpotQA": ["train"],
"HotpotQA-PL": ["train"], # translation not trained on
"HotpotQAHardNegatives": ["train"],
# + synthetic data
}

bge_training_data = {
# source: https://data.baai.ac.cn/details/BAAI-MTP
"NQ": ["test"],
"NQHardNegatives": ["test"],
"AmazonReviewsClassification": [
"validation",
"test",
], # assumed from: amazon_reviews_multi
"MLQARetrieval": [
"validation",
"test",
], # assumed from mlqa (question, context)
# not in mteb
# Dataset Pairs
# wudao (title, passage)
# cmrc2018 (query, context)
# dureader (query, context)
# simclue (sentence_a, sentence_b)
# csl (title, abstract)
# amazon_reviews_multi (title, body)
# wiki_atomic_edits (base_sentence, edited_sentence)
# mlqa (question, context)
# xlsum (title, summary) (title, text)
# "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further
# "wikipedia": [], # title + section title, passage
# "reddit": [], # title, body
# "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer)
# "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract)
}
Comment on lines -301 to -353
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was duplicated


bge_small_en_v1_5 = ModelMeta(
loader=partial( # type: ignore
Expand Down Expand Up @@ -522,8 +459,8 @@
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
public_training_code=None,
public_training_data=None,
training_datasets=bgem3_training_data,
public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
training_datasets=bge_m3_training_data,
)


Expand Down
10 changes: 8 additions & 2 deletions mteb/models/colbert_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,9 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
use_instructions=False,
adapted_from=None,
superseded_by=None,
training_datasets=None,
training_datasets={
"MSMARCO": ["train"], # dev?
},
)


Expand Down Expand Up @@ -218,5 +220,9 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
use_instructions=False,
adapted_from=None,
superseded_by=None,
training_datasets=None,
training_datasets={
"MSMARCO": ["train"],
"DuRetrieval": [],
"MIRACL": ["train"],
},
)
67 changes: 63 additions & 4 deletions mteb/models/ibm_granite_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,65 @@
"zho_Hans",
]

granite_training_data = {
# Multilingual MC4
# Multilingual Webhose
# English Wikipedia
# Multilingual Wikimedia
"WikipediaRetrievalMultilingual": [],
"WikipediaRerankingMultilingual": [],
# Miracl Corpus (Title-Body)
# Stack Exchange Duplicate questions (titles)
# Stack Exchange Duplicate questions (titles)
# Stack Exchange Duplicate questions (bodies)
"StackOverflowDupQuestions": [],
"AskUbuntuDupQuestions": [],
# Stack Exchange (Title, Answer) pairs
# Stack Exchange (Title, Body) pairs
# Stack Exchange (Title, Body) pairs
# Machine Translations of Stack Exchange Duplicate questions (titles)
# Machine Translations of Stack Exchange (Title+Body, Answer) pairs
"StackExchangeClusteringP2P": [],
"StackExchangeClusteringP2P.v2": [],
"StackExchangeClustering": [],
"StackExchangeClustering.v2": [],
# SearchQA
# S2ORC (Title, Abstract)
# WikiAnswers Duplicate question pairs
# CCNews
# XSum
# SimpleWiki
# Machine Translated Cross Lingual Parallel Corpora
# SPECTER citation triplets
# Machine Translations of SPECTER citation triplets
# Natural Questions (NQ)
"NQ": ["test"],
"NQHardNegatives": ["test"],
# SQuAD2.0
# HotpotQA
"HotPotQA": ["test"],
"HotPotQAHardNegatives": ["test"],
"HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on)
# Fever
"FEVER": ["test"],
"FEVERHardNegatives": ["test"],
# PubMed
# Multilingual Miracl Triples
"MIRACLRetrieval": ["train"],
"MIRACLRetrievalHardNegatives": ["train"],
"MIRACLReranking": ["train"],
# Multilingual MrTydi Triples
"MrTidyRetrieval": ["train"],
# Sadeeem Question Answering
# DBPedia Title-Body Pairs
"DBPedia": ["train"],
# Synthetic: English Query-Wikipedia Passage
# Synthetic: English Fact Verification
# Synthetic: Multilingual Query-Wikipedia Passage
# Synthetic: Multilingual News Summaries
# IBM Internal Triples
# IBM Internal Title-Body Pairs
}

granite_107m_multilingual = ModelMeta(
loader=partial( # type: ignore
Expand All @@ -44,7 +103,7 @@
public_training_code=None,
public_training_data=None,
use_instructions=False,
training_datasets=None,
training_datasets=granite_training_data,
)

granite_278m_multilingual = ModelMeta(
Expand All @@ -70,7 +129,7 @@
public_training_code=None,
public_training_data=None,
use_instructions=False,
training_datasets=None,
training_datasets=granite_training_data,
)

granite_30m_english = ModelMeta(
Expand All @@ -96,7 +155,7 @@
public_training_code=None,
public_training_data=None,
use_instructions=False,
training_datasets=None,
training_datasets=granite_training_data,
)

granite_125m_english = ModelMeta(
Expand All @@ -122,5 +181,5 @@
public_training_code=None,
public_training_data=None,
use_instructions=False,
training_datasets=None,
training_datasets=granite_training_data,
)
18 changes: 17 additions & 1 deletion mteb/models/jina_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,9 +222,25 @@ def encode(
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
reference="https://huggingface.co/jinaai/jina-embeddings-v3",
training_datasets=None,
public_training_code=None,
public_training_data=None,
training_datasets={
# CulturaX
"STS12": [],
# "SICK": [],
# "WMT19": [],
# "MADLAD-3B": [],
# NLI
"MSMARCO": ["train"],
"MSMARCOHardNegatives": ["train"],
"NanoMSMARCORetrieval": ["train"],
"NQ": ["train"],
"NQHardNegatives": ["train"],
"NanoNQRetrieval": ["train"],
"NQ-PL": ["train"], # translation not trained on
# oasst1, oasst2
},
adapted_from="XLM-RoBERTa",
)


Expand Down
4 changes: 2 additions & 2 deletions mteb/models/misc_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from mteb.model_meta import ModelMeta, sentence_transformers_loader
from mteb.models.e5_models import E5_TRAINING_DATA

from .bge_models import bge_m_training_data, bge_training_data
from .bge_models import bge_m3_training_data, bge_training_data
from .sentence_transformers_models import sent_trf_training_dataset

Haon_Chen__speed_embedding_7b_instruct = ModelMeta(
Expand Down Expand Up @@ -1445,7 +1445,7 @@
reference="https://huggingface.co/deepvk/USER-bge-m3",
similarity_fn_name="cosine",
use_instructions=None,
training_datasets=bge_m_training_data, # derived from.
training_datasets=bge_m3_training_data, # derived from.
# not in MTEB:
# "deepvk/ru-HNP": ["train"],
# "deepvk/ru-WANLI": ["train"],
Expand Down
Loading
Loading