Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 40 additions & 34 deletions mteb/models/bge_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -656,40 +656,6 @@
training_datasets=bge_m3_training_data,
)

# Registry entry for BAAI/bge-multilingual-gemma2, pinned to a single
# Hugging Face revision. This variant records no training datasets.
bge_multilingual_gemma2 = ModelMeta(
    # --- identity & provenance ---
    name="BAAI/bge-multilingual-gemma2",
    reference="https://huggingface.co/BAAI/bge-multilingual-gemma2",
    revision="992e13d8984fde2c31ef8a3cb2c038aeec513b8a",
    release_date="2024-07-25",  # date of the first commit of the HF model
    license="gemma",
    open_weights=True,
    # --- loading ---
    # The loader is bound to the same model name and revision as above.
    loader=partial(  # type: ignore
        sentence_transformers_loader,
        model_name="BAAI/bge-multilingual-gemma2",
        revision="992e13d8984fde2c31ef8a3cb2c038aeec513b8a",
    ),
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=False,
    similarity_fn_name="cosine",
    # --- language coverage ---
    # NOTE: this list is incomplete; the upstream description ends with
    # "and more". The script suffixes are also a best guess.
    languages=[
        "eng_Latn",
        "zho_Hans",
        "kor_Hang",
        "kor_Latn",
        "fra_Latn",
        "jpn_Jpan",
        "jpn_Latn",
    ],
    # --- size & capacity ---
    n_parameters=9.24 * 1e9,
    memory_usage_mb=35254,
    embed_dim=3584,  # value taken from the old C-MTEB leaderboard
    max_tokens=8192,  # value taken from the old C-MTEB leaderboard
    # --- training provenance ---
    public_training_code=None,
    public_training_data=None,
    training_datasets=None,  # not disclosed
)

# Contents of cfli/bge-full-data
bge_full_data = {
# source: https://arxiv.org/pdf/2409.15700
Expand Down Expand Up @@ -746,6 +712,46 @@
"STSBenchmark": ["train"],
}


# Registry entry for BAAI/bge-multilingual-gemma2, pinned to a single
# Hugging Face revision. It is placed after `bge_full_data` because
# `training_datasets` below unpacks that mapping.
bge_multilingual_gemma2 = ModelMeta(
    # --- identity & provenance ---
    name="BAAI/bge-multilingual-gemma2",
    reference="https://huggingface.co/BAAI/bge-multilingual-gemma2",
    revision="992e13d8984fde2c31ef8a3cb2c038aeec513b8a",
    release_date="2024-07-25",  # date of the first commit of the HF model
    license="gemma",
    open_weights=True,
    # --- loading ---
    # The loader is bound to the same model name and revision as above.
    loader=partial(  # type: ignore
        sentence_transformers_loader,
        model_name="BAAI/bge-multilingual-gemma2",
        revision="992e13d8984fde2c31ef8a3cb2c038aeec513b8a",
    ),
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=False,
    similarity_fn_name="cosine",
    # --- language coverage ---
    # NOTE: this list is incomplete; the upstream description ends with
    # "and more". The script suffixes are also a best guess.
    languages=[
        "eng_Latn",
        "zho_Hans",
        "kor_Hang",
        "kor_Latn",
        "fra_Latn",
        "jpn_Jpan",
        "jpn_Latn",
    ],
    # --- size & capacity ---
    n_parameters=9.24 * 1e9,
    memory_usage_mb=35254,
    embed_dim=3584,  # value taken from the old C-MTEB leaderboard
    max_tokens=8192,  # value taken from the old C-MTEB leaderboard
    # --- training provenance ---
    public_training_code=None,
    public_training_data=None,
    # Merge of the two shared training-data mappings plus two extra train
    # splits. On duplicate keys the later entry wins, so order matters.
    training_datasets={
        **bge_full_data,
        **bge_m3_training_data,
        "MIRACLReranking": ["train"],
        "MrTidyRetrieval": ["train"],
    },
)

bge_en_icl = ModelMeta(
loader=partial(
sentence_transformers_loader,
Expand Down
70 changes: 69 additions & 1 deletion mteb/models/misc_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,75 @@
reference="https://huggingface.co/Gameselo/STS-multilingual-mpnet-base-v2",
similarity_fn_name="cosine",
use_instructions=None,
training_datasets=None,
training_datasets={
# Source: https://huggingface.co/datasets/Gameselo/monolingual-wideNLI
# https://huggingface.co/Gameselo/STS-multilingual-mpnet-base-v2/discussions/2
# SNLI,
# MNLI,
# QNLI,
# WNLI,
# SciTail
# Vitamin C
# Trains on all of MTEB
"AlphaNLI": ["train"],
"RTE3": ["train"],
"AmazonPolarityClassification": ["train"],
"AmazonReviewsClassification": ["train"],
"ArguAna": ["train"],
"ArxivClusteringP2P": ["train"],
"ArxivClusteringS2S": ["train"],
"AskUbuntuDupQuestions": ["train"],
"BIOSSES": ["train"],
"Banking77Classification": ["train"],
"BiorxivClusteringP2P": ["train"],
"BiorxivClusteringS2S": ["train"],
"CQADupstackRetrieval": ["train"],
"ClimateFEVER": ["train"],
"DBPedia": ["train"],
"EmotionClassification": ["train"],
"FEVER": ["train"],
"FiQA2018": ["train"],
"HotpotQA": ["train"],
"ImdbClassification": ["train"],
"MTOPDomainClassification": ["train"],
"MTOPIntentClassification": ["train"],
"MassiveIntentClassification": ["train"],
"MassiveScenarioClassification": ["train"],
"MedrxivClusteringP2P": ["train"],
"MedrxivClusteringS2S": ["train"],
"MindSmallReranking": ["train"],
"NFCorpus": ["train"],
"NQ": ["train"],
"QuoraRetrieval": ["train"],
"RedditClustering": ["train"],
"RedditClusteringP2P": ["train"],
"SCIDOCS": ["train"],
"SICK-R": ["train"],
"STS12": ["train"],
"STS13": ["train"],
"STS14": ["train"],
"STS15": ["train"],
"STS16": ["train"],
"STSBenchmark": ["train"],
"SciDocsRR": ["train"],
"SciFact": ["train"],
"SprintDuplicateQuestions": ["train"],
"StackExchangeClustering": ["train"],
"StackExchangeClusteringP2P": ["train"],
"StackOverflowDupQuestions": ["train"],
"SummEval": ["train"],
"TRECCOVID": ["train"],
"Touche2020": ["train"],
"ToxicConversationsClassification": ["train"],
"TweetSentimentExtractionClassification": ["train"],
"TwentyNewsgroupsClustering": ["train"],
"TwitterSemEval2015": ["train"],
"TwitterURLCorpus": ["train"],
"MSMARCO": ["train"],
"AmazonCounterfactualClassification": ["train"],
"STS17": ["train"],
"STS22": ["train"],
},
adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
superseded_by=None,
)
Expand Down
2 changes: 1 addition & 1 deletion mteb/models/voyage_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ def _batched_encode(
similarity_fn_name="cosine",
framework=["API"],
use_instructions=True,
training_datasets=None, # Not known
training_datasets=VOYAGE_TRAINING_DATA, # src: private communication with Voyage
public_training_code=None,
public_training_data=None,
)
Expand Down
2 changes: 1 addition & 1 deletion mteb/models/voyage_v.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,5 +259,5 @@ def get_fused_embeddings(
public_training_data=None,
reference="https://huggingface.co/voyageai/voyage-multimodal-3",
use_instructions=None,
training_datasets=None,
training_datasets={}, # No overlap with MTEB according to Voyage, could overlap with MIEB, didn't ask
)