diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index f3ea8d3e4b..9b900c98e9 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -656,40 +656,6 @@ training_datasets=bge_m3_training_data, ) -bge_multilingual_gemma2 = ModelMeta( - loader=partial( # type: ignore - sentence_transformers_loader, - model_name="BAAI/bge-multilingual-gemma2", - revision="992e13d8984fde2c31ef8a3cb2c038aeec513b8a", - ), - name="BAAI/bge-multilingual-gemma2", - languages=[ - "eng_Latn", - "zho_Hans", - "kor_Hang", - "kor_Latn", - "fra_Latn", - "jpn_Jpan", - "jpn_Latn", - ], # This list is incomlete. Their description says "and more". - # I'm also unsure about the scripts. - open_weights=True, - revision="992e13d8984fde2c31ef8a3cb2c038aeec513b8a", - release_date="2024-07-25", # initial commit of hf model. - n_parameters=9.24 * 1e9, - memory_usage_mb=35254, - embed_dim=3584, # from old C-MTEB leaderboard - license="gemma", - max_tokens=8192, # from old C-MTEB leaderboard - reference="https://huggingface.co/BAAI/bge-multilingual-gemma2", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - public_training_code=None, - public_training_data=None, - training_datasets=None, # not disclosed -) - # Contents of cfli/bge-full-data bge_full_data = { # source: https://arxiv.org/pdf/2409.15700 @@ -746,6 +712,46 @@ "STSBenchmark": ["train"], } + +bge_multilingual_gemma2 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="BAAI/bge-multilingual-gemma2", + revision="992e13d8984fde2c31ef8a3cb2c038aeec513b8a", + ), + name="BAAI/bge-multilingual-gemma2", + languages=[ + "eng_Latn", + "zho_Hans", + "kor_Hang", + "kor_Latn", + "fra_Latn", + "jpn_Jpan", + "jpn_Latn", + ], # This list is incomplete. Their description says "and more". + # I'm also unsure about the scripts. 
+ open_weights=True, + revision="992e13d8984fde2c31ef8a3cb2c038aeec513b8a", + release_date="2024-07-25", # initial commit of hf model. + n_parameters=9.24 * 1e9, + memory_usage_mb=35254, + embed_dim=3584, # from old C-MTEB leaderboard + license="gemma", + max_tokens=8192, # from old C-MTEB leaderboard + reference="https://huggingface.co/BAAI/bge-multilingual-gemma2", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code=None, + public_training_data=None, + training_datasets={ + **bge_full_data, + **bge_m3_training_data, + "MIRACLReranking": ["train"], + "MrTidyRetrieval": ["train"], + }, +) + bge_en_icl = ModelMeta( loader=partial( sentence_transformers_loader, diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index 61cce4e071..363a79a7df 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -50,7 +50,75 @@ reference="https://huggingface.co/Gameselo/STS-multilingual-mpnet-base-v2", similarity_fn_name="cosine", use_instructions=None, - training_datasets=None, + training_datasets={ + # Source: https://huggingface.co/datasets/Gameselo/monolingual-wideNLI + # https://huggingface.co/Gameselo/STS-multilingual-mpnet-base-v2/discussions/2 + # SNLI, + # MNLI, + # QNLI, + # WNLI, + # SciTail + # Vitamin C + # Trains on all of MTEB + "AlphaNLI": ["train"], + "RTE3": ["train"], + "AmazonPolarityClassification": ["train"], + "AmazonReviewsClassification": ["train"], + "ArguAna": ["train"], + "ArxivClusteringP2P": ["train"], + "ArxivClusteringS2S": ["train"], + "AskUbuntuDupQuestions": ["train"], + "BIOSSES": ["train"], + "Banking77Classification": ["train"], + "BiorxivClusteringP2P": ["train"], + "BiorxivClusteringS2S": ["train"], + "CQADupstackRetrieval": ["train"], + "ClimateFEVER": ["train"], + "DBPedia": ["train"], + "EmotionClassification": ["train"], + "FEVER": ["train"], + "FiQA2018": ["train"], + "HotpotQA": ["train"], + "ImdbClassification": ["train"], + 
"MTOPDomainClassification": ["train"], + "MTOPIntentClassification": ["train"], + "MassiveIntentClassification": ["train"], + "MassiveScenarioClassification": ["train"], + "MedrxivClusteringP2P": ["train"], + "MedrxivClusteringS2S": ["train"], + "MindSmallReranking": ["train"], + "NFCorpus": ["train"], + "NQ": ["train"], + "QuoraRetrieval": ["train"], + "RedditClustering": ["train"], + "RedditClusteringP2P": ["train"], + "SCIDOCS": ["train"], + "SICK-R": ["train"], + "STS12": ["train"], + "STS13": ["train"], + "STS14": ["train"], + "STS15": ["train"], + "STS16": ["train"], + "STSBenchmark": ["train"], + "SciDocsRR": ["train"], + "SciFact": ["train"], + "SprintDuplicateQuestions": ["train"], + "StackExchangeClustering": ["train"], + "StackExchangeClusteringP2P": ["train"], + "StackOverflowDupQuestions": ["train"], + "SummEval": ["train"], + "TRECCOVID": ["train"], + "Touche2020": ["train"], + "ToxicConversationsClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "TwentyNewsgroupsClustering": ["train"], + "TwitterSemEval2015": ["train"], + "TwitterURLCorpus": ["train"], + "MSMARCO": ["train"], + "AmazonCounterfactualClassification": ["train"], + "STS17": ["train"], + "STS22": ["train"], + }, adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2", superseded_by=None, ) diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index aad3cdedfc..3f728ccee0 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -262,7 +262,7 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, # Not known + training_datasets=VOYAGE_TRAINING_DATA, # src: private communication with Voyage public_training_code=None, public_training_data=None, ) diff --git a/mteb/models/voyage_v.py b/mteb/models/voyage_v.py index 1086f88ee5..fc880347c5 100644 --- a/mteb/models/voyage_v.py +++ b/mteb/models/voyage_v.py @@ -259,5 +259,5 @@ def 
get_fused_embeddings( public_training_data=None, reference="https://huggingface.co/voyageai/voyage-multimodal-3", use_instructions=None, - training_datasets=None, + training_datasets={}, # No overlap with MTEB according to Voyage, could overlap with MIEB, didn't ask )