diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index 8d5ad3fcaa..41587c8afc 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -53,6 +53,109 @@ adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2", superseded_by=None, ) + +kalm_training_data = { + # from technical report + # not in MTEB: + # ExpertQA + # MEDI2BGE + # OpenOrca + # PAQ + # PubMedQA + # SearchQA + # arxiv_qa + # rag-dataset-12000 + # CC-News + # SQuAD 2.0 + # TriviaQA + # WebGPT Comparisons + # MultiNLI + # NLLB + # WikiAnswers + # SimCSE NLI + # SNLI + # Aya Dataset + # eli5 + # ---- + # in MTEB: + "CodeFeedbackMT": ["train"], + "CodeFeedbackST": ["train"], + "ArxivClusteringP2P": ["train"], + "ArxivClusteringS2S": ["train"], + "ArxivClusteringP2P.v2": ["train"], + "TRECCOVID": ["train"], + "DBPedia": ["train"], + "ESCIReranking": ["train"], + "FEVER": ["train"], + "FiQA2018": ["train"], + "FEVERHardNegatives": ["train"], + "NanoFEVERRetrieval": ["train"], + "FEVER-NL": ["train"], # translation not trained on + "FiQA2018-NL": ["train"], # translation not trained on + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQA-NL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + "MultiLongDocRetrieval": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "mMARCO-NL": ["train"], # translation not trained on + "MSMARCOv2": ["train"], + "NFCorpus": ["train"], + "SciFact": ["train"], + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "NQ-NL": ["train"], # translation not trained on + "YahooAnswersTopicsClassification": ["train"], + "ContractNLIConfidentialityOfAgreementLegalBenchClassification": ["train"], + "ContractNLIExplicitIdentificationLegalBenchClassification": ["train"], + "ContractNLIInclusionOfVerballyConveyedInformationLegalBenchClassification": [ + "train" + ], + "ContractNLILimitedUseLegalBenchClassification": ["train"], + "ContractNLINoLicensingLegalBenchClassification": ["train"], + "ContractNLINoticeOnCompelledDisclosureLegalBenchClassification": ["train"], + "ContractNLIPermissibleAcquirementOfSimilarInformationLegalBenchClassification": [ + "train" + ], + "ContractNLIPermissibleCopyLegalBenchClassification": ["train"], + "ContractNLIPermissibleDevelopmentOfSimilarInformationLegalBenchClassification": [ + "train" + ], + "ContractNLIPermissiblePostAgreementPossessionLegalBenchClassification": ["train"], + "ContractNLIReturnOfConfidentialInformationLegalBenchClassification": ["train"], + "ContractNLISharingWithEmployeesLegalBenchClassification": ["train"], + "ContractNLISharingWithThirdPartiesLegalBenchClassification": ["train"], + "ContractNLISurvivalOfObligationsLegalBenchClassification": ["train"], + "QuoraRetrieval": ["train"], + "NanoQuoraRetrieval": ["train"], + "BiorxivClusteringP2P.v2": ["train"], + "BiorxivClusteringS2S.v2": ["train"], + "MedrxivClusteringP2P.v2": ["train"], + "MedrxivClusteringS2S.v2": ["train"], + "Banking77Classification": ["train"], + "AmazonPolarityClassification": ["train"], + "ImdbClassification": ["train"], + "EmotionClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "ToxicConversationsClassification": ["train"], + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + "MrTidyRetrieval": ["train"], + "PawsXPairClassification": ["train"], + "AmazonReviewsClassification": ["train"], + "AmazonCounterfactualClassification": ["train"], + "MultilingualSentiment": ["train"], + "MassiveIntentClassification": ["train"], + "MassiveScenarioClassification": ["train"], + "MTOPDomainClassification": ["train"], + "MTOPIntentClassification": ["train"], +} + HIT_TMG__KaLM_embedding_multilingual_mini_instruct_v1 = ModelMeta( name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", revision="45e42c89990c40aca042659133fc8b13c28634b5", @@ -61,7 +164,7 @@ loader=None, n_parameters=494032768, memory_usage_mb=1885, - max_tokens=131072.0, + max_tokens=512, embed_dim=896, license="mit", open_weights=True, @@ -71,7 +174,7 @@ reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", similarity_fn_name="cosine", use_instructions=None, - training_datasets=None, + training_datasets=kalm_training_data, adapted_from="/mnt/shgeminicephfs/wx-dc-plt-hpc/xinshuohu/Output/Embedding/Qwen2-0.5B-eos_mean_pretrain_0806_1e-4_uen_sft_1022_filtered_v2_inst_3node_g8_1e-5_sin-0.1_mrl", superseded_by=None, ) @@ -83,7 +186,7 @@ loader=None, n_parameters=494032768, memory_usage_mb=1885, - max_tokens=131072.0, + max_tokens=512, embed_dim=896, license="mit", open_weights=True, @@ -93,7 +196,7 @@ reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-v1", similarity_fn_name="cosine", use_instructions=None, - training_datasets=None, + training_datasets=kalm_training_data, adapted_from="/mnt/shgeminicephfs/wx-dc-plt-hpc/xinshuohu/Output/Embedding/Qwen2-0.5B-eos_mean_pretrain_0806_1e-4_uen_sft_0902_filtered_v2_3node_g8_1e-5_sin-0.1", superseded_by=None, ) @@ -208,6 +311,15 @@ adapted_from="intfloat/e5-mistral-7b-instruct", superseded_by=None, ) + +bilingual_embedding_training_data = { + "STSBenchmark": ["train"], + "STSBenchmarkMultilingualSTS": ["train"], + "XNLI": ["train"], + # not in mteb + # SNLI +} + Lajavaness__bilingual_embedding_base = ModelMeta( name="Lajavaness/bilingual-embedding-base", revision="0bfc54bb2aa2666dd84715289c7ef58a95eb4d8d", @@ -231,7 +343,7 @@ reference="https://huggingface.co/Lajavaness/bilingual-embedding-base", similarity_fn_name="cosine", use_instructions=None, - training_datasets=None, + training_datasets=bilingual_embedding_training_data, adapted_from="dangvantuan/bilingual_impl", superseded_by=None, ) @@ -258,7 +370,7 @@ reference="https://huggingface.co/Lajavaness/bilingual-embedding-large", similarity_fn_name="cosine", use_instructions=None, - training_datasets=None, + training_datasets=bilingual_embedding_training_data, adapted_from="dangvantuan/bilingual_impl", superseded_by=None, ) @@ -285,7 +397,7 @@ reference="https://huggingface.co/Lajavaness/bilingual-embedding-small", similarity_fn_name="cosine", use_instructions=None, - training_datasets=None, + training_datasets=bilingual_embedding_training_data, adapted_from="dangvantuan/bilingual_impl", superseded_by=None, ) @@ -1254,8 +1366,25 @@ reference="https://huggingface.co/avsolatorio/GIST-Embedding-v0", similarity_fn_name="cosine", use_instructions=None, - training_datasets=None, - adapted_from=None, + training_datasets={ + **bge_training_data, + # not in mteb: + # MEDI + # all MTEB CLF datasets that has a train split: + "AmazonPolarityClassification": ["train"], + "AmazonReviewsClassification": ["train"], + "EmotionClassification": ["train"], + "ImdbClassification": ["train"], + "MTOPDomainClassification": ["train"], + "MTOPIntentClassification": ["train"], + "MassiveIntentClassification": ["train"], + "MassiveScenarioClassification": ["train"], + "ToxicConversationsClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "Banking77Classification": ["train"], + "AmazonCounterfactualClassification": ["train"], + }, + adapted_from="BAAI/bge-large-en-v1.5", superseded_by=None, ) avsolatorio__GIST_all_MiniLM_L6_v2 = ModelMeta( diff --git a/mteb/models/nvidia_models.py b/mteb/models/nvidia_models.py index 6b6c366835..ff0f3f5ef2 100644 --- a/mteb/models/nvidia_models.py +++ b/mteb/models/nvidia_models.py @@ -41,7 +41,6 @@ def instruction_template( "FEVERHardNegatives": ["train"], "NanoFEVERRetrieval": ["train"], "FiQA2018": ["train"], - "FiQA2018-PL": ["train"], # translation not trained on "FiQA2018-NL": ["train"], # translation not trained on "STS12": ["train"], "STS22": ["train"], @@ -56,7 +55,6 @@ def instruction_template( "ArxivClusteringP2P": ["train"], "ArxivClusteringP2P.v2": ["train"], "ArxivClusteringS2S": ["train"], - "ArxivClusteringS2S.v2": ["train"], "BiorxivClusteringP2P": ["train"], "BiorxivClusteringP2P.v2": ["train"], "BiorxivClusteringS2S": ["train"], diff --git a/mteb/models/salesforce_models.py b/mteb/models/salesforce_models.py index c9ec0807dd..fdcf30e82d 100644 --- a/mteb/models/salesforce_models.py +++ b/mteb/models/salesforce_models.py @@ -21,7 +21,6 @@ def instruction_template( **E5_MISTRAL_TRAINING_DATA, # From previously released blogpost which now have been taken down: "FiQA2018": ["train"], - "FiQA2018-PL": ["train"], "FiQA2018-NL": ["train"], # translation not trained on "FEVER": ["train"], "FEVERHardNegatives": ["train"], diff --git a/mteb/models/stella_models.py b/mteb/models/stella_models.py index 6b8ec969b1..9163dd81b2 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -4,6 +4,7 @@ from mteb.model_meta import ModelMeta from mteb.models.instruct_wrapper import instruct_wrapper +from mteb.models.nvidia_models import nvidia_training_datasets stella_en_400M = ModelMeta( # https://huggingface.co/dunzhang/stella_en_400M_v5/discussions/21#671a6205ac1e2416090f2bf4 @@ -57,7 +58,7 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_1.5B_v5", - training_datasets=None, + training_datasets=nvidia_training_datasets, # also distilled from gte-qwen (but training data is unknown) #2164 public_training_code="https://github.com/NovaSearch-Team/RAG-Retrieval/blob/c40f4638b705eb77d88305d2056901ed550f9f4b/rag_retrieval/train/embedding/README.md", public_training_data=None, ) @@ -121,7 +122,7 @@ open_weights=True, revision="17bb1c32a93a8fc5f6fc9e91d5ea86da99983cfe", release_date="2024-02-27", - n_parameters=326 * 1e6, + n_parameters=int(326 * 1e6), memory_usage_mb=1242, embed_dim=1792, license="mit", @@ -143,7 +144,7 @@ open_weights=True, revision="b1075144f440ab4409c05622c1179130ebd57d03", release_date="2024-06-04", - n_parameters=326 * 1e6, + n_parameters=int(326 * 1e6), memory_usage_mb=1242, embed_dim=1792, license="mit", diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index 4ff51ad0f1..aad3cdedfc 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -416,85 +416,62 @@ def _batched_encode( # MTEB(eng, v1) training data: "AmazonPolarityClassification": ["train"], "AmazonReviewsClassification": ["train"], - "ArguAna": ["train"], - "ArxivClusteringP2P": ["train"], - "ArxivClusteringS2S": ["train"], - "AskUbuntuDupQuestions": ["train"], - "BIOSSES": ["train"], - "Banking77Classification": ["train"], - "BiorxivClusteringP2P": ["train"], - "BiorxivClusteringS2S": ["train"], - "CQADupstackRetrieval": ["train"], - "ClimateFEVER": ["train"], - "DBPedia": ["train"], "EmotionClassification": ["train"], - "FEVER": ["train"], - "FiQA2018": ["train"], "HotpotQA": ["train"], "ImdbClassification": ["train"], "MTOPDomainClassification": ["train"], "MTOPIntentClassification": ["train"], + "MindSmallReranking": ["train"], "MassiveIntentClassification": ["train"], "MassiveScenarioClassification": ["train"], "MedrxivClusteringP2P": ["train"], "MedrxivClusteringS2S": ["train"], - "MindSmallReranking": ["train"], - "NFCorpus": ["train"], - "NQ": ["train"], - "QuoraRetrieval": ["train"], - "RedditClustering": ["train"], - "RedditClusteringP2P": ["train"], - "SCIDOCS": ["train"], - "SICK-R": ["train"], "STS12": ["train"], - "STS13": ["train"], - "STS14": ["train"], - "STS15": ["train"], - "STS16": ["train"], "STSBenchmark": ["train"], - "SciDocsRR": ["train"], - "SciFact": ["train"], - "SprintDuplicateQuestions": ["train"], - "StackExchangeClustering": ["train"], - "StackExchangeClusteringP2P": ["train"], "StackOverflowDupQuestions": ["train"], - "SummEval": ["train"], - "TRECCOVID": ["train"], - "Touche2020": ["train"], "ToxicConversationsClassification": ["train"], "TweetSentimentExtractionClassification": ["train"], - "TwentyNewsgroupsClustering": ["train"], - "TwitterSemEval2015": ["train"], - "TwitterURLCorpus": ["train"], + "BiorxivClusteringP2P": ["train"], + "BiorxivClusteringS2S": ["train"], + "Banking77Classification": ["train"], + "ArguAna": ["train"], "ArguAna-PL": ["train"], "ArguAna-NL": ["train"], # translation not trained on "NanoArguAnaRetrieval": ["train"], - "HotpotQA-PL": ["train"], # translation not trained on - "HotpotQA-NL": ["train"], # translation not trained on - "HotpotQAHardNegatives": ["train"], - "MSMARCO": ["train"], - "MSMARCOHardNegatives": ["train"], - "NanoMSMARCORetrieval": ["train"], - "MSMARCO-PL": ["train"], # translation not trained on - "mMARCO-NL": ["train"], # translation not trained on + "STS22": ["train"], + "AmazonCounterfactualClassification": ["train"], + "ArxivClusteringP2P": ["train"], + "ArxivClusteringS2S": ["train"], + "NQ": ["train"], + "SciFact": ["train"], + "QuoraRetrieval": ["train"], + "NanoQuoraRetrieval": ["train"], "NQHardNegatives": ["train"], "NanoNQRetrieval": ["train"], "NQ-PL": ["train"], # translation not trained on "NQ-NL": ["train"], # translation not trained on + "NFCorpus": ["train"], "FEVERHardNegatives": ["train"], "NanoFEVERRetrieval": ["train"], "FEVER-NL": ["train"], # translation not trained on - "FiQA2018-PL": ["train"], # translation not trained on "FiQA2018-NL": ["train"], # translation not trained on - "STS22": ["train"], - "AmazonCounterfactualClassification": ["train"], - "ArxivClusteringP2P.v2": ["train"], - "ArxivClusteringS2S.v2": ["train"], "BiorxivClusteringP2P.v2": ["train"], "BiorxivClusteringS2S.v2": ["train"], "MedrxivClusteringP2P.v2": ["train"], "MedrxivClusteringS2S.v2": ["train"], - "TwentyNewsgroupsClustering.v2": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "mMARCO-NL": ["train"], # translation not trained on + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQA-NL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + "FEVER": ["train"], + "FiQA2018": ["train"], + "DBPedia": ["train"], + "TRECCOVID": ["train"], + "ArxivClusteringP2P.v2": ["train"], "STSBenchmarkMultilingualSTS": ["train"], # translated, not trained on }, public_training_code=None,