Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 26 additions & 89 deletions mteb/models/bge_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
model_prompts = {"query": "Represent this sentence for searching relevant passages: "}
model_prompts_zh = {"query": "为这个句子生成表示以用于检索相关文章:"}

bge_m_training_data = {
# source: https://arxiv.org/pdf/2402.03216
bge_m3_training_data = {
# source: https://arxiv.org/abs/2402.03216
"MIRACLRetrieval": ["train"],
"MIRACLRetrievalHardNegatives": ["train"],
"MIRACLReranking": ["train"],
Expand All @@ -28,6 +28,28 @@
"HotpotQA": ["train"],
"HotpotQA-PL": ["train"], # translation not trained on
"HotpotQAHardNegatives": ["train"],
"T2Retrieval": ["train"],
"DuReader": ["train"],
"MMarcoReranking": ["train"],
"CodeSearchNet": ["train"],
# not in mteb
# "s2orc"
# Wikipedia
# "xP3"
# "mC4"
# "CC-News"
# "MTP"
# "NLLB"
# "CCMatrix"
# TriviaQA
# COL-IEE
# PubMedQA
# SQuAD
# SimCSE
# mMARCO-ZH
# LawGPT
# NLI-zh2, LeCaRDv2,
# NLI, MultiLongDoc (their synthetic)
# + synthetic data
}

Expand Down Expand Up @@ -89,38 +111,6 @@
# "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract)
}

bgem3_training_data = {
# source https://arxiv.org/abs/2402.03216
"T2Retrieval": ["train"],
"DuReader": ["train"],
"MMarcoReranking": ["train"],
"CMedQAv2-reranking": ["train"],
"HotpotQA": ["train"],
"NQ": ["train"],
"MSMARCO": ["train"],
"MrTidyRetrieval": ["train"],
"MIRACLRetrieval": ["train"],
"CodeSearchNet": ["train"],
# not in mteb
# "s2orc"
# Wikipedia
# "xP3"
# "mC4"
# "CC-News"
# "MTP"
# "NLLB"
# "CCMatrix"
# TriviaQA
# COL-IEE
# PubMedQA
# SQuAD
# SimCSE
# mMARCO-ZH
# LawGPT
# NLI-zh2, LeCaRDv2,
# NLI, MultiLongDoc (their synthetic)
}

# https://huggingface.co/BAAI/bge-m3/discussions/29
bgem3_languages = [
"afr_Latn", # af
Expand Down Expand Up @@ -298,59 +288,6 @@
"zho_Hans", # zh
]

bge_m_training_data = {
# source: https://arxiv.org/pdf/2402.03216
"MIRACLRetrieval": ["train"],
"MIRACLRetrievalHardNegatives": ["train"],
"MIRACLReranking": ["train"],
"LeCaRDv2": ["train"],
"CMedQAv1-reranking": ["train"],
"CMedQAv2-reranking": ["train"],
"MrTidyRetrieval": ["train"],
"T2Reranking": ["train"],
"MSMARCO": ["train"],
"MSMARCOHardNegatives": ["train"],
"NanoMSMARCORetrieval": ["train"],
"MSMARCO-PL": ["train"], # translation not trained on
"NQ": ["train"],
"NQHardNegatives": ["train"],
"NanoNQRetrieval": ["train"],
"NQ-PL": ["train"], # translation not trained on
"HotpotQA": ["train"],
"HotpotQA-PL": ["train"], # translation not trained on
"HotpotQAHardNegatives": ["train"],
# + synthetic data
}

bge_training_data = {
# source: https://data.baai.ac.cn/details/BAAI-MTP
"NQ": ["test"],
"NQHardNegatives": ["test"],
"AmazonReviewsClassification": [
"validation",
"test",
], # assumed from: amazon_reviews_multi
"MLQARetrieval": [
"validation",
"test",
], # assumed from mlqa (question, context)
# not in mteb
# Dataset Pairs
# wudao (title, passage)
# cmrc2018 (query, context)
# dureader (query, context)
# simclue (sentence_a, sentence_b)
# csl (title, abstract)
# amazon_reviews_multi (title, body)
# wiki_atomic_edits (base_sentence, edited_sentence)
# mlqa (question, context)
# xlsum (title, summary) (title, text)
# "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further
# "wikipedia": [], # title + section title, passage
# "reddit": [], # title, body
# "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer)
# "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract)
}
Comment on lines -301 to -353
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was duplicated


bge_small_en_v1_5 = ModelMeta(
loader=partial( # type: ignore
Expand Down Expand Up @@ -522,8 +459,8 @@
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
public_training_code=None,
public_training_data=None,
training_datasets=bgem3_training_data,
public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
training_datasets=bge_m3_training_data,
)


Expand Down
10 changes: 8 additions & 2 deletions mteb/models/colbert_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,9 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
use_instructions=False,
adapted_from=None,
superseded_by=None,
training_datasets=None,
training_datasets={
"MSMARCO": ["train"], # dev?
},
)


Expand Down Expand Up @@ -218,5 +220,9 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
use_instructions=False,
adapted_from=None,
superseded_by=None,
training_datasets=None,
training_datasets={
"MSMARCO": ["train"],
"DuRetrieval": [],
"MIRACL": ["train"],
},
)
67 changes: 63 additions & 4 deletions mteb/models/ibm_granite_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,65 @@
"zho_Hans",
]

granite_training_data = {
# Multilingual MC4
# Multilingual Webhose
# English Wikipedia
# Multilingual Wikimedia
"WikipediaRetrievalMultilingual": [],
"WikipediaRerankingMultilingual": [],
# Miracl Corpus (Title-Body)
# Stack Exchange Duplicate questions (titles)
# Stack Exchange Duplicate questions (titles)
# Stack Exchange Duplicate questions (bodies)
"StackOverflowDupQuestions": [],
"AskUbuntuDupQuestions": [],
# Stack Exchange (Title, Answer) pairs
# Stack Exchange (Title, Body) pairs
# Stack Exchange (Title, Body) pairs
# Machine Translations of Stack Exchange Duplicate questions (titles)
# Machine Translations of Stack Exchange (Title+Body, Answer) pairs
"StackExchangeClusteringP2P": [],
"StackExchangeClusteringP2P.v2": [],
"StackExchangeClustering": [],
"StackExchangeClustering.v2": [],
# SearchQA
# S2ORC (Title, Abstract)
# WikiAnswers Duplicate question pairs
# CCNews
# XSum
# SimpleWiki
# Machine Translated Cross Lingual Parallel Corpora
# SPECTER citation triplets
# Machine Translations of SPECTER citation triplets
# Natural Questions (NQ)
"NQ": ["test"],
"NQHardNegatives": ["test"],
# SQuAD2.0
# HotpotQA
"HotPotQA": ["test"],
"HotPotQAHardNegatives": ["test"],
"HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on)
# Fever
"FEVER": ["test"],
"FEVERHardNegatives": ["test"],
# PubMed
# Multilingual Miracl Triples
"MIRACLRetrieval": ["train"],
"MIRACLRetrievalHardNegatives": ["train"],
"MIRACLReranking": ["train"],
# Multilingual MrTydi Triples
"MrTidyRetrieval": ["train"],
# Sadeeem Question Answering
# DBPedia Title-Body Pairs
"DBPedia": ["train"],
# Synthetic: English Query-Wikipedia Passage
# Synthetic: English Fact Verification
# Synthetic: Multilingual Query-Wikipedia Passage
# Synthetic: Multilingual News Summaries
# IBM Internal Triples
# IBM Internal Title-Body Pairs
}

granite_107m_multilingual = ModelMeta(
loader=partial( # type: ignore
Expand All @@ -44,7 +103,7 @@
public_training_code=None,
public_training_data=None,
use_instructions=False,
training_datasets=None,
training_datasets=granite_training_data,
)

granite_278m_multilingual = ModelMeta(
Expand All @@ -70,7 +129,7 @@
public_training_code=None,
public_training_data=None,
use_instructions=False,
training_datasets=None,
training_datasets=granite_training_data,
)

granite_30m_english = ModelMeta(
Expand All @@ -96,7 +155,7 @@
public_training_code=None,
public_training_data=None,
use_instructions=False,
training_datasets=None,
training_datasets=granite_training_data,
)

granite_125m_english = ModelMeta(
Expand All @@ -122,5 +181,5 @@
public_training_code=None,
public_training_data=None,
use_instructions=False,
training_datasets=None,
training_datasets=granite_training_data,
)
18 changes: 17 additions & 1 deletion mteb/models/jina_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,9 +222,25 @@ def encode(
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
reference="https://huggingface.co/jinaai/jina-embeddings-v3",
training_datasets=None,
public_training_code=None,
public_training_data=None,
training_datasets={
# CulturaX
"STS12": [],
# "SICK": [],
# "WMT19": [],
# "MADLAD-3B": [],
# NLI
"MSMARCO": ["train"],
"MSMARCOHardNegatives": ["train"],
"NanoMSMARCORetrieval": ["train"],
"NQ": ["train"],
"NQHardNegatives": ["train"],
"NanoNQRetrieval": ["train"],
"NQ-PL": ["train"], # translation not trained on
# oasst1, oasst2
},
adapted_from="XLM-RoBERTa",
)


Expand Down
4 changes: 2 additions & 2 deletions mteb/models/misc_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from mteb.model_meta import ModelMeta, sentence_transformers_loader
from mteb.models.e5_models import E5_TRAINING_DATA

from .bge_models import bge_m_training_data, bge_training_data
from .bge_models import bge_m3_training_data, bge_training_data
from .sentence_transformers_models import sent_trf_training_dataset

Haon_Chen__speed_embedding_7b_instruct = ModelMeta(
Expand Down Expand Up @@ -1445,7 +1445,7 @@
reference="https://huggingface.co/deepvk/USER-bge-m3",
similarity_fn_name="cosine",
use_instructions=None,
training_datasets=bge_m_training_data, # derived from.
training_datasets=bge_m3_training_data, # derived from.
# not in MTEB:
# "deepvk/ru-HNP": ["train"],
# "deepvk/ru-WANLI": ["train"],
Expand Down
Loading
Loading