diff --git a/mteb/models/kalm_models.py b/mteb/models/kalm_models.py
index 19be60b7d0..68f17460c3 100644
--- a/mteb/models/kalm_models.py
+++ b/mteb/models/kalm_models.py
@@ -7,7 +7,6 @@
 import numpy as np
 import torch
-
 from mteb.encoder_interface import PromptType
 from mteb.model_meta import ModelMeta
 from mteb.models.instruct_wrapper import InstructSentenceTransformerWrapper
 
@@ -167,6 +166,118 @@ def encode(
     "MTOPIntentClassification": ["train"],
 }
 
+
+kalm_v2_training_data = {
+    # from technical report
+    # not in MTEB:
+    # ExpertQA
+    # MEDI2BGE
+    # OpenOrca
+    # PAQ
+    # PubMedQA
+    # SearchQA
+    # arxiv_qa
+    # rag-dataset-12000
+    # CC-News
+    # SQuAD 2.0
+    # TriviaQA
+    # WebGPT Comparisons
+    # MultiNLI
+    # NLLB
+    # WikiAnswers
+    # SimCSE NLI
+    # SNLI
+    # Aya Dataset
+    # eli5
+    # ----
+    # in MTEB:
+    "CodeFeedbackMT": ["train"],
+    "CodeFeedbackST": ["train"],
+    "ArxivClusteringP2P": ["train"],
+    "ArxivClusteringS2S": ["train"],
+    "ArxivClusteringP2P.v2": ["train"],
+    "TRECCOVID": ["train"],
+    "DBPedia": ["train"],
+    "ESCIReranking": ["train"],
+    "FEVER": ["train"],
+    "FiQA2018": ["train"],
+    "FEVERHardNegatives": ["train"],
+    "NanoFEVERRetrieval": ["train"],
+    "FEVER-NL": ["train"],  # translation not trained on
+    "FiQA2018-NL": ["train"],  # translation not trained on
+    "HotpotQA-PL": ["train"],  # translation not trained on
+    "HotpotQA-NL": ["train"],  # translation not trained on
+    "HotpotQAHardNegatives": ["train"],
+    "MultiLongDocRetrieval": ["train"],
+    "MSMARCO": ["train"],
+    "MSMARCOHardNegatives": ["train"],
+    "NanoMSMARCORetrieval": ["train"],
+    "MSMARCO-PL": ["train"],  # translation not trained on
+    "mMARCO-NL": ["train"],  # translation not trained on
+    "MSMARCOv2": ["train"],
+    "NFCorpus": ["train"],
+    "SciFact": ["train"],
+    "NQ": ["train"],
+    "NQHardNegatives": ["train"],
+    "NanoNQRetrieval": ["train"],
+    "NQ-PL": ["train"],  # translation not trained on
+    "NQ-NL": ["train"],  # translation not trained on
+    "YahooAnswersTopicsClassification": ["train"],
+    "ContractNLIConfidentialityOfAgreementLegalBenchClassification": ["train"],
+    "ContractNLIExplicitIdentificationLegalBenchClassification": ["train"],
+    "ContractNLIInclusionOfVerballyConveyedInformationLegalBenchClassification": [
+        "train"
+    ],
+    "ContractNLILimitedUseLegalBenchClassification": ["train"],
+    "ContractNLINoLicensingLegalBenchClassification": ["train"],
+    "ContractNLINoticeOnCompelledDisclosureLegalBenchClassification": ["train"],
+    "ContractNLIPermissibleAcquirementOfSimilarInformationLegalBenchClassification": [
+        "train"
+    ],
+    "ContractNLIPermissibleCopyLegalBenchClassification": ["train"],
+    "ContractNLIPermissibleDevelopmentOfSimilarInformationLegalBenchClassification": [
+        "train"
+    ],
+    "ContractNLIPermissiblePostAgreementPossessionLegalBenchClassification": ["train"],
+    "ContractNLIReturnOfConfidentialInformationLegalBenchClassification": ["train"],
+    "ContractNLISharingWithEmployeesLegalBenchClassification": ["train"],
+    "ContractNLISharingWithThirdPartiesLegalBenchClassification": ["train"],
+    "ContractNLISurvivalOfObligationsLegalBenchClassification": ["train"],
+    "QuoraRetrieval": ["train"],
+    "NanoQuoraRetrieval": ["train"],
+    "BiorxivClusteringP2P.v2": ["train"],
+    "BiorxivClusteringS2S.v2": ["train"],
+    "MedrxivClusteringP2P.v2": ["train"],
+    "MedrxivClusteringS2S.v2": ["train"],
+    "Banking77Classification": ["train"],
+    "AmazonPolarityClassification": ["train"],
+    "ImdbClassification": ["train"],
+    "EmotionClassification": ["train"],
+    "TweetSentimentExtractionClassification": ["train"],
"ToxicConversationsClassification": ["train"], + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + "MrTidyRetrieval": ["train"], + "PawsXPairClassification": ["train"], + "AmazonReviewsClassification": ["train"], + "AmazonCounterfactualClassification": ["train"], + "MultilingualSentiment": ["train"], + "MassiveIntentClassification": ["train"], + "MassiveScenarioClassification": ["train"], + "MTOPDomainClassification": ["train"], + "MTOPIntentClassification": ["train"], + "Reddit-Clustering": ["train"], + "Reddit-Clustering-P2P": ["train"], + "Stackexchange-Clustering": ["train"], + "Stackexchange-Clustering-P2P": ["train"], + "TwentyNewsgroups-Clustering": ["train"], + "ATEC": ["train"], + "BQ": ["train"], + "CQADupstack": ["train"], +} + + KaLM_task_prompts = { "AmazonCounterfactualClassification": "Given an Amazon review, judge whether it is counterfactual.", "AmazonPolarityClassification": "Classifying Amazon reviews into positive or negative sentiment", @@ -229,6 +340,146 @@ def encode( "RuSciBenchOECDClusteringP2P": "Identify the topic or theme of the Russian articles.", } +KaLM_v2_task_prompts = { + "AmazonCounterfactualClassification": "Given an Amazon review, judge whether it is counterfactual.", + "AmazonPolarityClassification": "Classifying Amazon reviews into positive or negative sentiment", + "AmazonReviewsClassification": "Classifying the given Amazon review into its appropriate rating category", + "Banking77Classification": "Given an online banking query, find the corresponding intents", + "EmotionClassification": "Classifying the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise", + "ImdbClassification": "Classifying the sentiment expressed in the given movie review text from the IMDB dataset", + "MassiveIntentClassification": "Given a user utterance as query, find the user intents", + "MassiveScenarioClassification": "Given a user utterance as query, find the user scenarios", + "MTOPDomainClassification": "Classifying the intent domain of the given utterance in task-oriented conversation", + "MTOPIntentClassification": "Classifying the intent of the given utterance in task-oriented conversation", + "ToxicConversationsClassification": "Classifying the given comments as either toxic or not toxic", + "TweetSentimentExtractionClassification": "Classifying the sentiment of a given tweet as either positive, negative, or neutral", + "TNews": "Categorizing the given news title", + "IFlyTek": "Given an App description text, find the appropriate fine-grained category", + "MultilingualSentiment": "Classifying sentiment of the customer review into positive, neutral, or negative", + "JDReview": "Classifying sentiment of the customer review for iPhone into positive or negative", + "OnlineShopping": "Classifying sentiment of the customer review into positive or negative", + "Waimai": "Classify the customer review from a food takeaway platform into positive or negative", + "ArxivClusteringP2P": "Identify the main and secondary category of Arxiv papers based on the titles and abstracts", + "ArxivClusteringS2S": "Identify the main and secondary category of Arxiv papers based on the titles", + "BiorxivClusteringP2P": "Identify the main category of Biorxiv papers based on the titles and abstracts", + "BiorxivClusteringS2S": "Identify the main category of Biorxiv papers based on the titles", + "MedrxivClusteringP2P": "Identify the main category of Medrxiv papers 
+    "MedrxivClusteringS2S": "Identify the main category of Medrxiv papers based on the titles",
+    "RedditClustering": "Identify the topic or theme of Reddit posts based on the titles",
+    "RedditClusteringP2P": "Identify the topic or theme of Reddit posts based on the titles and posts",
+    "StackExchangeClustering": "Identify the topic or theme of StackExchange posts based on the titles",
+    "StackExchangeClusteringP2P": "Identify the topic or theme of StackExchange posts based on the given paragraphs",
+    "TwentyNewsgroupsClustering": "Identify the topic or theme of the given news articles",
+    "CLSClusteringS2S": "Identify the main category of scholar papers based on the titles",
+    "CLSClusteringP2P": "Identify the main category of scholar papers based on the titles and abstracts",
+    "ThuNewsClusteringS2S": "Identify the topic or theme of the given news articles based on the titles",
+    "ThuNewsClusteringP2P": "Identify the topic or theme of the given news articles based on the titles and contents",
+    "Cmnli-query": "Retrieve semantically similar text",
+    "Cmnli-passage": "Retrieve semantically similar text",
+    "Ocnli-query": "Retrieve semantically similar text",
+    "Ocnli-passage": "Retrieve semantically similar text",
+    "SprintDuplicateQuestions-query": "Retrieve semantically similar questions",
+    "SprintDuplicateQuestions-passage": "Retrieve semantically similar questions",
+    "TwitterSemEval2015-query": "Retrieve semantically similar text",
+    "TwitterSemEval2015-passage": "Retrieve semantically similar text",
+    "TwitterURLCorpus-query": "Retrieve semantically similar text",
+    "TwitterURLCorpus-passage": "Retrieve semantically similar text",
+    "CMedQAv1-reranking": "Given a query, retrieve documents that answer the query",
+    "CMedQAv2-reranking": "Given a query, retrieve documents that answer the query",
+    "MMarcoReranking": "Given a query, retrieve documents that answer the query",
+    "T2Reranking": "Given a query, retrieve documents that answer the query",
+    "AskUbuntuDupQuestions-query": "Retrieve semantically similar questions",
+    "AskUbuntuDupQuestions-passage": "Retrieve semantically similar questions",
+    "MindSmallReranking": "Given a query, retrieve documents that answer the query",
+    "SciDocsRR-query": "Retrieve relevant paper titles",
+    "SciDocsRR-passage": "Retrieve relevant paper titles",
+    "StackOverflowDupQuestions-query": "Retrieve semantically similar questions",
+    "StackOverflowDupQuestions-passage": "Retrieve semantically similar questions",
+    "CmedqaRetrieval": "Given a query, retrieve documents that answer the query",
+    "CovidRetrieval": "Given a query, retrieve documents that answer the query",
+    "DuRetrieval": "Given a query, retrieve documents that answer the query",
+    "EcomRetrieval": "Given a query, retrieve documents that answer the query",
+    "MedicalRetrieval": "Given a query, retrieve documents that answer the query",
+    "MMarcoRetrieval": "Given a query, retrieve documents that answer the query",
+    "T2Retrieval": "Given a query, retrieve documents that answer the query",
+    "VideoRetrieval": "Given a query, retrieve documents that answer the query",
+    "MSMARCO": "Given a query, retrieve documents that answer the query",
+    "ArguAna": "Given a query, retrieve documents that answer the query",
+    "ClimateFEVER": "Given a query, retrieve documents that answer the query",
+    "CQADupstackAndroidRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question",
"CQADupstackAndroidRetrieval-passage": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackEnglishRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackEnglishRetrieval-passage": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackGamingRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackGamingRetrieval-passage": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackGisRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackGisRetrieval-passage": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackMathematicaRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackMathematicaRetrieval-passage": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackPhysicsRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackPhysicsRetrieval-passage": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackProgrammersRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackProgrammersRetrieval-passage": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackStatsRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackStatsRetrieval-passage": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackTexRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackTexRetrieval-passage": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackUnixRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackUnixRetrieval-passage": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackWebmastersRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackWebmastersRetrieval-passage": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackWordpressRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given 
question", + "CQADupstackWordpressRetrieval-passage": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "DBPedia": "Given a query, retrieve documents that answer the query", + "FEVER": "Given a query, retrieve documents that answer the query", + "FiQA2018": "Given a query, retrieve documents that answer the query", + "HotpotQA": "Given a query, retrieve documents that answer the query", + "NFCorpus": "Given a query, retrieve documents that answer the query", + "NQ": "Given a query, retrieve documents that answer the query", + "QuoraRetrieval-query": "Retrieve semantically similar questions", + "QuoraRetrieval-passage": "Retrieve semantically similar questions", + "SCIDOCS-query": "Given a query, retrieve documents that answer the query", + "SCIDOCS-passage": "Given a query, retrieve documents that answer the query", + "SciFact": "Given a query, retrieve documents that answer the query", + "Touche2020": "Given a query, retrieve documents that answer the query", + "TRECCOVID": "Given a query, retrieve documents that answer the query", + "AFQMC-query": "Retrieve semantically similar text", + "AFQMC-passage": "Retrieve semantically similar text", + "ATEC-query": "Retrieve semantically similar text", + "ATEC-passage": "Retrieve semantically similar text", + "BQ-query": "Retrieve semantically similar text", + "BQ-passage": "Retrieve semantically similar text", + "LCQMC-query": "Retrieve semantically similar text", + "LCQMC-passage": "Retrieve semantically similar text", + "PAWSX-query": "Retrieve semantically similar text", + "PAWSX-passage": "Retrieve semantically similar text", + "QBQTC-query": "Retrieve semantically similar text", + "QBQTC-passage": "Retrieve semantically similar text", + "STSB-query": "Retrieve semantically similar text", + "STSB-passage": "Retrieve semantically similar text", + "BIOSSES-query": "Retrieve semantically similar text", + "BIOSSES-passage": "Retrieve semantically similar text", + "SICK-R-query": "Retrieve semantically similar text", + "SICK-R-passage": "Retrieve semantically similar text", + "STS12-query": "Retrieve semantically similar text", + "STS12-passage": "Retrieve semantically similar text", + "STS13-query": "Retrieve semantically similar text", + "STS13-passage": "Retrieve semantically similar text", + "STS14-query": "Retrieve semantically similar text", + "STS14-passage": "Retrieve semantically similar text", + "STS15-query": "Retrieve semantically similar text", + "STS15-passage": "Retrieve semantically similar text", + "STS16-query": "Retrieve semantically similar text", + "STS16-passage": "Retrieve semantically similar text", + "STS17-query": "Retrieve semantically similar text", + "STS17-passage": "Retrieve semantically similar text", + "STS22-query": "Retrieve semantically similar text", + "STS22-passage": "Retrieve semantically similar text", + "STSBenchmark-query": "Retrieve semantically similar text", + "STSBenchmark-passage": "Retrieve semantically similar text", + "SummEval-query": "Retrieve semantically similar summaries", + "SummEval-passage": "Retrieve semantically similar summaries", +} KaLM_X_task_prompts = { "Classification": "classify the query into different classes.", @@ -517,6 +768,37 @@ def encode( superseded_by=None, ) +HIT_TMG__KaLM_embedding_multilingual_mini_instruct_v2 = ModelMeta( + loader=partial( # type: ignore + InstructSentenceTransformerWrapper, + model_name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v2", + 
revision="d2a21c232dc712ae8230af56d1027cf21b7864bf", + instruction_template=KaLM_INSTRUCTION, + max_seq_length=512, + apply_instruction_to_passages=False, + prompts_dict=KaLM_v2_task_prompts, + ), + name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v2", + revision="d2a21c232dc712ae8230af56d1027cf21b7864bf", + release_date="2025-06-25", + languages=["eng-Latn", "zho-Hans"], + n_parameters=494032768, + memory_usage_mb=942, + max_tokens=512, + embed_dim=896, + license="mit", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v2", + similarity_fn_name="cosine", + use_instructions=True, + training_datasets=kalm_v2_training_data, + adapted_from="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5", + superseded_by=None, +) + # KaLM_Embedding_X_0605 = ModelMeta( # loader=partial(