Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 57 additions & 87 deletions mteb/models/bge_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,60 @@
"zho_Hans", # zh
]

bge_m_training_data = {
# source: https://arxiv.org/pdf/2402.03216
"MIRACLRetrieval": ["train"],
"MIRACLRetrievalHardNegatives": ["train"],
"MIRACLReranking": ["train"],
"LeCaRDv2": ["train"],
"CMedQAv1-reranking": ["train"],
"CMedQAv2-reranking": ["train"],
"MrTidyRetrieval": ["train"],
"T2Reranking": ["train"],
"MSMARCO": ["train"],
"MSMARCOHardNegatives": ["train"],
"NanoMSMARCORetrieval": ["train"],
"MSMARCO-PL": ["train"], # translation not trained on
"NQ": ["train"],
"NQHardNegatives": ["train"],
"NanoNQRetrieval": ["train"],
"NQ-PL": ["train"], # translation not trained on
"HotpotQA": ["train"],
"HotpotQA-PL": ["train"], # translation not trained on
"HotpotQAHardNegatives": ["train"],
# + synthetic data
}

bge_training_data = {
# source: https://data.baai.ac.cn/details/BAAI-MTP
"NQ": ["test"],
"NQHardNegatives": ["test"],
"AmazonReviewsClassification": [
"validation",
"test",
], # assumed from: amazon_reviews_multi
"MLQARetrieval": [
"validation",
"test",
], # assumed from mlqa (question, context)
# not in mteb
# Dataset Pairs
# wudao (title, passage)
# cmrc2018 (query, context)
# dureader (query, context)
# simclue (sentence_a, sentence_b)
# csl (title, abstract)
# amazon_reviews_multi (title, body)
# wiki_atomic_edits (base_sentence, edited_sentence)
# mlqa (question, context)
# xlsum (title, summary) (title, text)
# "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further
# "wikipedia": [], # title + section title, passage
# "reddit": [], # title, body
# "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer)
# "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract)
}

bge_small_en_v1_5 = ModelMeta(
loader=partial( # type: ignore
sentence_transformers_loader,
Expand All @@ -321,35 +375,7 @@
use_instructions=True,
public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None, # seemingly released (at least for some models, but the link is broken
training_datasets={
# source: https://data.baai.ac.cn/details/BAAI-MTP
"NQ": ["test"],
"NQHardNegatives": ["test"],
"AmazonReviewsClassification": [
"validation",
"test",
], # assumed from: amazon_reviews_multi
"MLQARetrieval": [
"validation",
"test",
], # assumed from mlqa (question, context)
# not in mteb
# Dataset Pairs
# wudao (title, passage)
# cmrc2018 (query, context)
# dureader (query, context)
# simclue (sentence_a, sentence_b)
# csl (title, abstract)
# amazon_reviews_multi (title, body)
# wiki_atomic_edits (base_sentence, edited_sentence)
# mlqa (question, context)
# xlsum (title, summary) (title, text)
# "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further
# "wikipedia": [], # title + section title, passage
# "reddit": [], # title, body
# "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer)
# "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract)
},
training_datasets=bge_training_data,
)

bge_base_en_v1_5 = ModelMeta(
Expand All @@ -375,35 +401,7 @@
use_instructions=True,
public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None, # seemingly released (at least for some models, but the link is broken
training_datasets={
# source: https://data.baai.ac.cn/details/BAAI-MTP
"NQ": ["test"],
"NQHardNegatives": ["test"],
"AmazonReviewsClassification": [
"validation",
"test",
], # assumed from: amazon_reviews_multi
"MLQARetrieval": [
"validation",
"test",
], # assumed from mlqa (question, context)
# not in mteb
# Dataset Pairs
# wudao (title, passage)
# cmrc2018 (query, context)
# dureader (query, context)
# simclue (sentence_a, sentence_b)
# csl (title, abstract)
# amazon_reviews_multi (title, body)
# wiki_atomic_edits (base_sentence, edited_sentence)
# mlqa (question, context)
# xlsum (title, summary) (title, text)
# "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further
# "wikipedia": [], # title + section title, passage
# "reddit": [], # title, body
# "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer)
# "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract)
},
training_datasets=bge_training_data,
)

bge_large_en_v1_5 = ModelMeta(
Expand All @@ -429,35 +427,7 @@
use_instructions=True,
public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None, # seemingly released (at least for some models, but the link is broken
training_datasets={
# source: https://data.baai.ac.cn/details/BAAI-MTP
"NQ": ["test"],
"NQHardNegatives": ["test"],
"AmazonReviewsClassification": [
"validation",
"test",
], # assumed from: amazon_reviews_multi
"MLQARetrieval": [
"validation",
"test",
], # assumed from mlqa (question, context)
# not in mteb
# Dataset Pairs
# wudao (title, passage)
# cmrc2018 (query, context)
# dureader (query, context)
# simclue (sentence_a, sentence_b)
# csl (title, abstract)
# amazon_reviews_multi (title, body)
# wiki_atomic_edits (base_sentence, edited_sentence)
# mlqa (question, context)
# xlsum (title, summary) (title, text)
# "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further
# "wikipedia": [], # title + section title, passage
# "reddit": [], # title, body
# "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer)
# "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract)
},
training_datasets=bge_training_data,
)

bge_small_zh_v1_5 = ModelMeta(
Expand Down
8 changes: 7 additions & 1 deletion mteb/models/e5_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from mteb.model_meta import ModelMeta

from .e5_models import E5_PAPER_RELEASE_DATE, XLMR_LANGUAGES
from .e5_models import E5_PAPER_RELEASE_DATE, E5_TRAINING_DATA, XLMR_LANGUAGES
from .instruct_wrapper import instruct_wrapper

MISTRAL_LANGUAGES = ["eng_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "spa_Latn"]
Expand Down Expand Up @@ -40,6 +40,9 @@
embed_dim=1024,
license="mit",
max_tokens=514,
public_training_data=False,
public_training_code=False,
training_datasets=E5_TRAINING_DATA,
)

e5_mistral = ModelMeta(
Expand Down Expand Up @@ -69,4 +72,7 @@
embed_dim=4096,
license="mit",
max_tokens=32768,
public_training_data=False,
public_training_code=False,
training_datasets=E5_TRAINING_DATA,
)
Loading
Loading