diff --git a/mteb/models/nvidia_models.py b/mteb/models/nvidia_models.py index 358f8931aa..6b6c366835 100644 --- a/mteb/models/nvidia_models.py +++ b/mteb/models/nvidia_models.py @@ -67,8 +67,20 @@ def instruction_template( "MedrxivClusteringS2S.v2": ["train"], "TwentyNewsgroupsClustering": ["train"], "TwentyNewsgroupsClustering.v2": ["train"], + "StackExchangeClustering": ["train"], + "StackExchangeClustering.v2": ["train"], + "StackExchangeClusteringP2P": ["train"], + "StackExchangeClusteringP2P.v2": ["train"], + "RedditClustering": ["train"], + "RedditClustering.v2": ["train"], + "RedditClusteringP2P": ["train"], + "RedditClusteringP2P.v2": ["train"], "STSBenchmark": ["train"], "STSBenchmarkMultilingualSTS": ["train"], # translated, not trained on + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + "MrTidyRetrieval": ["train"], } NV_embed_v2 = ModelMeta( diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index d576288c3a..4ff51ad0f1 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -413,12 +413,62 @@ def _batched_encode( framework=["API"], use_instructions=True, training_datasets={ - # MTEB(eng, classic) training data: + # MTEB(eng, v1) training data: + "AmazonPolarityClassification": ["train"], + "AmazonReviewsClassification": ["train"], "ArguAna": ["train"], + "ArxivClusteringP2P": ["train"], + "ArxivClusteringS2S": ["train"], + "AskUbuntuDupQuestions": ["train"], + "BIOSSES": ["train"], + "Banking77Classification": ["train"], + "BiorxivClusteringP2P": ["train"], + "BiorxivClusteringS2S": ["train"], + "CQADupstackRetrieval": ["train"], + "ClimateFEVER": ["train"], + "DBPedia": ["train"], + "EmotionClassification": ["train"], + "FEVER": ["train"], + "FiQA2018": ["train"], + "HotpotQA": ["train"], + "ImdbClassification": ["train"], + "MTOPDomainClassification": ["train"], + "MTOPIntentClassification": ["train"], + "MassiveIntentClassification": ["train"], + "MassiveScenarioClassification": ["train"], + "MedrxivClusteringP2P": ["train"], + "MedrxivClusteringS2S": ["train"], + "MindSmallReranking": ["train"], + "NFCorpus": ["train"], + "NQ": ["train"], + "QuoraRetrieval": ["train"], + "RedditClustering": ["train"], + "RedditClusteringP2P": ["train"], + "SCIDOCS": ["train"], + "SICK-R": ["train"], + "STS12": ["train"], + "STS13": ["train"], + "STS14": ["train"], + "STS15": ["train"], + "STS16": ["train"], + "STSBenchmark": ["train"], + "SciDocsRR": ["train"], + "SciFact": ["train"], + "SprintDuplicateQuestions": ["train"], + "StackExchangeClustering": ["train"], + "StackExchangeClusteringP2P": ["train"], + "StackOverflowDupQuestions": ["train"], + "SummEval": ["train"], + "TRECCOVID": ["train"], + "Touche2020": ["train"], + "ToxicConversationsClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "TwentyNewsgroupsClustering": ["train"], + "TwitterSemEval2015": ["train"], + "TwitterURLCorpus": ["train"], "ArguAna-PL": ["train"], "ArguAna-NL": ["train"], # translation not trained on "NanoArguAnaRetrieval": ["train"], - "HotpotQA": ["train"], "HotpotQA-PL": ["train"], # translation not trained on "HotpotQA-NL": ["train"], # translation not trained on "HotpotQAHardNegatives": ["train"], @@ -427,43 +477,24 @@ def _batched_encode( "NanoMSMARCORetrieval": ["train"], "MSMARCO-PL": ["train"], # translation not trained on "mMARCO-NL": ["train"], # translation not trained on - "NQ": ["train"], "NQHardNegatives": ["train"], "NanoNQRetrieval": ["train"], "NQ-PL": ["train"], # translation not trained on "NQ-NL": ["train"], # translation not trained on - "FEVER": ["train"], "FEVERHardNegatives": ["train"], "NanoFEVERRetrieval": ["train"], "FEVER-NL": ["train"], # translation not trained on - "FiQA2018": ["train"], "FiQA2018-PL": ["train"], # translation not trained on "FiQA2018-NL": ["train"], # translation not trained on - "STS12": ["train"], "STS22": ["train"], - "AmazonReviewsClassification": ["train"], "AmazonCounterfactualClassification": ["train"], - "Banking77Classification": ["train"], - "EmotionClassification": ["train"], - "ImdbClassification": ["train"], - "MTOPIntentClassification": ["train"], - "ToxicConversationsClassification": ["train"], - "TweetSentimentExtractionClassification": ["train"], - "ArxivClusteringP2P": ["train"], "ArxivClusteringP2P.v2": ["train"], - "ArxivClusteringS2S": ["train"], "ArxivClusteringS2S.v2": ["train"], - "BiorxivClusteringP2P": ["train"], "BiorxivClusteringP2P.v2": ["train"], - "BiorxivClusteringS2S": ["train"], "BiorxivClusteringS2S.v2": ["train"], - "MedrxivClusteringP2P": ["train"], "MedrxivClusteringP2P.v2": ["train"], - "MedrxivClusteringS2S": ["train"], "MedrxivClusteringS2S.v2": ["train"], - "TwentyNewsgroupsClustering": ["train"], "TwentyNewsgroupsClustering.v2": ["train"], - "STSBenchmark": ["train"], "STSBenchmarkMultilingualSTS": ["train"], # translated, not trained on }, public_training_code=None,