diff --git a/mteb/models/hit_tmg_models.py b/mteb/models/hit_tmg_models.py deleted file mode 100644 index 15540416af..0000000000 --- a/mteb/models/hit_tmg_models.py +++ /dev/null @@ -1,316 +0,0 @@ -from __future__ import annotations - -import logging -from collections.abc import Sequence -from functools import partial -from typing import Any - -import numpy as np -import torch - -from mteb.encoder_interface import PromptType -from mteb.model_meta import ModelMeta -from mteb.models.instruct_wrapper import InstructSentenceTransformerWrapper - -logger = logging.getLogger(__name__) - - -class KALMWrapper(InstructSentenceTransformerWrapper): - def encode( - self, - sentences: Sequence[str], - *, - task_name: str, - prompt_type: PromptType | None = None, - **kwargs: Any, - ) -> np.ndarray: - if self.add_eos_token: - sentences = [ - example + self.model.tokenizer.eos_token for example in sentences - ] - - instruction = self.get_task_instruction( - task_name, prompt_type, self.prompts_dict - ) - # import there due to circular imports - from mteb import get_task - - task = get_task(task_name) - - # to passage prompts won't be applied to passages - if not self.apply_instruction_to_passages and prompt_type == PromptType.passage: - instruction = None - logger.info( - f"No instruction used, because prompt type = {prompt_type.passage}" - ) - - if task.metadata.type in ["STS", "PairClassification", "Summarization"]: - logger.info( - f"No instruction used, because task type = {task.metadata.type}" - ) - instruction = None - - if instruction: - logger.info(f"Using instruction: '{instruction}' for task: '{task_name}'") - - embeddings = self.model.encode( - sentences, - prompt=instruction, - **kwargs, - ) - - if isinstance(embeddings, torch.Tensor): - # sometimes in kwargs can be return_tensors=True - embeddings = embeddings.cpu().detach().float().numpy() - return embeddings - - -kalm_training_data = { - # from technical report - # not in MTEB: - # ExpertQA - # MEDI2BGE - # OpenOrca 
- # PAQ - # PubMedQA - # SearchQA - # arxiv_qa - # rag-dataset-12000 - # CC-News - # SQuAD 2.0 - # TriviaQA - # WebGPT Comparisons - # MultiNLI - # NLLB - # WikiAnswers - # SimCSE NLI - # SNLI - # Aya Dataset - # eli5 - # ---- - # in MTEB: - "CodeFeedbackMT": ["train"], - "CodeFeedbackST": ["train"], - "ArxivClusteringP2P": ["train"], - "ArxivClusteringS2S": ["train"], - "ArxivClusteringP2P.v2": ["train"], - "TRECCOVID": ["train"], - "DBPedia": ["train"], - "ESCIReranking": ["train"], - "FEVER": ["train"], - "FiQA2018": ["train"], - "FEVERHardNegatives": ["train"], - "NanoFEVERRetrieval": ["train"], - "FEVER-NL": ["train"], # translation not trained on - "FiQA2018-NL": ["train"], # translation not trained on - "HotpotQA-PL": ["train"], # translation not trained on - "HotpotQA-NL": ["train"], # translation not trained on - "HotpotQAHardNegatives": ["train"], - "MultiLongDocRetrieval": ["train"], - "MSMARCO": ["train"], - "MSMARCOHardNegatives": ["train"], - "NanoMSMARCORetrieval": ["train"], - "MSMARCO-PL": ["train"], # translation not trained on - "mMARCO-NL": ["train"], # translation not trained on - "MSMARCOv2": ["train"], - "NFCorpus": ["train"], - "SciFact": ["train"], - "NQ": ["train"], - "NQHardNegatives": ["train"], - "NanoNQRetrieval": ["train"], - "NQ-PL": ["train"], # translation not trained on - "NQ-NL": ["train"], # translation not trained on - "YahooAnswersTopicsClassification": ["train"], - "ContractNLIConfidentialityOfAgreementLegalBenchClassification": ["train"], - "ContractNLIExplicitIdentificationLegalBenchClassification": ["train"], - "ContractNLIInclusionOfVerballyConveyedInformationLegalBenchClassification": [ - "train" - ], - "ContractNLILimitedUseLegalBenchClassification": ["train"], - "ContractNLINoLicensingLegalBenchClassification": ["train"], - "ContractNLINoticeOnCompelledDisclosureLegalBenchClassification": ["train"], - "ContractNLIPermissibleAcquirementOfSimilarInformationLegalBenchClassification": [ - "train" - ], - 
"ContractNLIPermissibleCopyLegalBenchClassification": ["train"], - "ContractNLIPermissibleDevelopmentOfSimilarInformationLegalBenchClassification": [ - "train" - ], - "ContractNLIPermissiblePostAgreementPossessionLegalBenchClassification": ["train"], - "ContractNLIReturnOfConfidentialInformationLegalBenchClassification": ["train"], - "ContractNLISharingWithEmployeesLegalBenchClassification": ["train"], - "ContractNLISharingWithThirdPartiesLegalBenchClassification": ["train"], - "ContractNLISurvivalOfObligationsLegalBenchClassification": ["train"], - "QuoraRetrieval": ["train"], - "NanoQuoraRetrieval": ["train"], - "BiorxivClusteringP2P.v2": ["train"], - "BiorxivClusteringS2S.v2": ["train"], - "MedrxivClusteringP2P.v2": ["train"], - "MedrxivClusteringS2S.v2": ["train"], - "Banking77Classification": ["train"], - "AmazonPolarityClassification": ["train"], - "ImdbClassification": ["train"], - "EmotionClassification": ["train"], - "TweetSentimentExtractionClassification": ["train"], - "ToxicConversationsClassification": ["train"], - "MIRACLRetrieval": ["train"], - "MIRACLRetrievalHardNegatives": ["train"], - "MIRACLReranking": ["train"], - "MrTidyRetrieval": ["train"], - "PawsXPairClassification": ["train"], - "AmazonReviewsClassification": ["train"], - "AmazonCounterfactualClassification": ["train"], - "MultilingualSentiment": ["train"], - "MassiveIntentClassification": ["train"], - "MassiveScenarioClassification": ["train"], - "MTOPDomainClassification": ["train"], - "MTOPIntentClassification": ["train"], -} - -HIT_TMG_task_prompts = { - "AmazonCounterfactualClassification": "Given an Amazon review, judge whether it is counterfactual.", - "AmazonPolarityClassification": "Classifying Amazon reviews into positive or negative sentiment", - "AmazonReviewsClassification": "Classifying the given Amazon review into its appropriate rating category", - "Banking77Classification": "Given an online banking query, find the corresponding intents", - "EmotionClassification": 
"Classifying the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise", - "ImdbClassification": "Classifying the sentiment expressed in the given movie review text from the IMDB dataset", - "MassiveIntentClassification": "Given a user utterance as query, find the user intents", - "MassiveScenarioClassification": "Given a user utterance as query, find the user scenarios", - "MTOPDomainClassification": "Classifying the intent domain of the given utterance in task-oriented conversation", - "MTOPIntentClassification": "Classifying the intent of the given utterance in task-oriented conversation", - "ToxicConversationsClassification": "Classifying the given comments as either toxic or not toxic", - "TweetSentimentExtractionClassification": "Classifying the sentiment of a given tweet as either positive, negative, or neutral", - "TNews": "Categorizing the given news title", - "IFlyTek": "Given an App description text, find the appropriate fine-grained category", - "MultilingualSentiment": "Classifying sentiment of the customer review into positive, neutral, or negative", - "JDReview": "Classifying sentiment of the customer review for iPhone into positive or negative", - "OnlineShopping": "Classifying sentiment of the customer review into positive or negative", - "Waimai": "Classify the customer review from a food takeaway platform into positive or negative", - "MasakhaNEWSClassification": "Classifying the category of french news.", - "CBD": "Classifying the sentiment of polish tweet reviews", - "PolEmo2.0-IN": "Classifying the sentiment of in-domain (medicine and hotels) online reviews", - "PolEmo2.0-OUT": "Classifying the sentiment of out-of-domain (products and school) online reviews", - "AllegroReviews": "Classifying the sentiment of reviews from e-commerce marketplace Allegro", - "PAC": 'Classifying the sentence into one of the two types: "BEZPIECZNE_POSTANOWIENIE_UMOWNE" and "KLAUZULA_ABUZYWNA"', - 
"GeoreviewClassification": "Classifying the sentiment of Russian reviews.", - "HeadlineClassification": "Classifying the topic of Russian headlines.", - "InappropriatenessClassification": "Detecting inappropriate messages on sensitive topics", - "KinopoiskClassification": "Classifying the sentiment of Kinopoisk reviews.", - "RuReviewsClassification": "Classifying the sentiment of Russian product reviews.", - "RuSciBenchGRNTIClassification": "Classifying the topic of Russian scientific papers.", - "RuSciBenchOECDClassification": "Classifying the topic of Russian scientific papers.", - "CEDRClassification": "Classification of sentences by emotions.", - "SensitiveTopicsClassification": "Detecting inappropriate messages on sensitive topics.", - "ArxivClusteringP2P": "Identify the main and secondary category of Arxiv papers based on the titles and abstracts", - "ArxivClusteringS2S": "Identify the main and secondary category of Arxiv papers based on the titles", - "BiorxivClusteringP2P": "Identify the main category of Biorxiv papers based on the titles and abstracts", - "BiorxivClusteringS2S": "Identify the main category of Biorxiv papers based on the titles", - "MedrxivClusteringP2P": "Identify the main category of Medrxiv papers based on the titles and abstracts", - "MedrxivClusteringS2S": "Identify the main category of Medrxiv papers based on the titles", - "RedditClustering": "Identify the topic or theme of Reddit posts based on the titles and posts", - "RedditClusteringP2P": "Identify the topic or theme of Reddit posts based on the titles and posts", - "StackExchangeClustering": "Identify the topic or theme of StackExchange posts based on the given paragraphs", - "StackExchangeClusteringP2P": "Identify the topic or theme of StackExchange posts based on the given paragraphs", - "TwentyNewsgroupsClustering": "Identify the topic or theme of the given news articles", - "CLSClusteringS2S": "Identify the main category of scholar papers based on the titles", - 
"CLSClusteringP2P": "Identify the main category of scholar papers based on the titles and abstracts", - "ThuNewsClusteringS2S": "Identify the topic or theme of the given news articles based on the titles", - "ThuNewsClusteringP2P": "Identify the topic or theme of the given news articles based on the titles and contents", - "AlloProfClusteringP2P": "Identify the main category of Allo Prof document based on the titles and descriptions", - "AlloProfClusteringS2S": "Identify the main category of Allo Prof document based on the titles", - "HALClusteringS2S": "Identify the main category of academic passage based on the titles and contents", - "MasakhaNEWSClusteringP2P": "Identify the topic or theme of the given news articles based on the titles and contents", - "MasakhaNEWSClusteringS2S": "Identify the topic or theme of the given news articles based on the titles", - "MLSUMClusteringP2P": "Identify the topic or theme of the given articles based on the titles and contents", - "MLSUMClusteringS2S": "Identify the topic or theme of the given articles based on the titles", - "EightTagsClustering": "Identify of headlines from social media posts in Polish into 8 categories: film, history, food, medicine, motorization, work, sport and technology", - "GeoreviewClusteringP2P": "Identify the topic or theme of the Russian reviews.", - "RuSciBenchGRNTIClusteringP2P": "Identify the topic or theme of the Russian articles.", - "RuSciBenchOECDClusteringP2P": "Identify the topic or theme of the Russian articles.", -} - -HIT_TMG_INSTRUCTION = "Instruct: {instruction} \n Query: " - -HIT_TMG__KaLM_embedding_multilingual_mini_instruct_v1 = ModelMeta( - loader=partial( # type: ignore - KALMWrapper, - model_name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", - revision="45e42c89990c40aca042659133fc8b13c28634b5", - instruction_template=HIT_TMG_INSTRUCTION, - max_seq_length=512, - apply_instruction_to_passages=False, - prompts_dict=HIT_TMG_task_prompts, - ), - 
name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", - revision="45e42c89990c40aca042659133fc8b13c28634b5", - release_date="2024-10-23", - languages=["eng-Latn", "zho-Hans"], - n_parameters=494032768, - memory_usage_mb=1885, - max_tokens=512, - embed_dim=896, - license="mit", - open_weights=True, - public_training_code=None, - public_training_data=None, - framework=["PyTorch", "Sentence Transformers"], - reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", - similarity_fn_name="cosine", - use_instructions=True, - training_datasets=kalm_training_data, # Replace with actual dataset if available - adapted_from="Qwen/Qwen2-0.5B", - superseded_by=None, -) - -HIT_TMG__KaLM_embedding_multilingual_mini_v1 = ModelMeta( - name="HIT-TMG/KaLM-embedding-multilingual-mini-v1", - revision="8a82a0cd2b322b91723e252486f7cce6fd8ac9d3", - release_date="2024-08-27", - languages=["eng-Latn", "zho-Hans"], - n_parameters=494032768, - memory_usage_mb=1885, - max_tokens=512, - embed_dim=896, - license="mit", - open_weights=True, - public_training_code=None, - public_training_data=None, - framework=["PyTorch"], - reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-v1", - similarity_fn_name="cosine", - use_instructions=None, - training_datasets=kalm_training_data, - adapted_from="Qwen/Qwen2-0.5B", - superseded_by=None, -) - -HIT_TMG__KaLM_embedding_multilingual_mini_instruct_v1_5 = ModelMeta( - loader=partial( # type: ignore - KALMWrapper, - model_name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5", - revision="fcff2f8a54e4cd96b7766fef1ee960a43d42bb3c", - instruction_template=HIT_TMG_INSTRUCTION, - max_seq_length=512, - apply_instruction_to_passages=False, - prompts_dict=HIT_TMG_task_prompts, - ), - name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5", - revision="fcff2f8a54e4cd96b7766fef1ee960a43d42bb3c", - release_date="2024-12-26", - languages=["eng-Latn", "zho-Hans"], - n_parameters=494032768, - 
from __future__ import annotations

import logging
from collections.abc import Sequence
from functools import partial
from typing import Any

import numpy as np
import torch

from mteb.encoder_interface import PromptType
from mteb.model_meta import ModelMeta
from mteb.models.instruct_wrapper import InstructSentenceTransformerWrapper

logger = logging.getLogger(__name__)


class KALMWrapper(InstructSentenceTransformerWrapper):
    """Instruction-aware encoder wrapper for the KaLM embedding models.

    Only ``encode`` is overridden: it decides whether the task instruction is
    passed to the underlying model as a prompt, based on the prompt type and
    on the type of the task being evaluated.
    """

    # Task types that are always encoded without an instruction prompt.
    _NO_INSTRUCTION_TASK_TYPES = ("STS", "PairClassification", "Summarization")

    def encode(
        self,
        sentences: Sequence[str],
        *,
        task_name: str,
        prompt_type: PromptType | None = None,
        **kwargs: Any,
    ) -> np.ndarray:
        """Encode ``sentences`` for ``task_name``.

        Args:
            sentences: Texts to embed.
            task_name: Name of the task; used to look up both the instruction
                and the task metadata.
            prompt_type: Whether the texts are queries or passages, if known.
            **kwargs: Forwarded unchanged to ``self.model.encode``.

        Returns:
            The embeddings produced by ``self.model.encode``. A ``torch.Tensor``
            result is moved to CPU and converted to a float32 numpy array.
        """
        if self.add_eos_token:
            eos_token = self.model.tokenizer.eos_token
            sentences = [example + eos_token for example in sentences]

        instruction = self.get_task_instruction(
            task_name, prompt_type, self.prompts_dict
        )
        # Imported here because of circular imports.
        from mteb import get_task

        task = get_task(task_name)

        # Passages are encoded without an instruction unless explicitly enabled.
        if not self.apply_instruction_to_passages and prompt_type == PromptType.passage:
            instruction = None
            # Log the value actually received. The previous code read
            # `prompt_type.passage`, i.e. one enum member through another,
            # which is fragile across Python versions.
            logger.info(
                "No instruction used, because prompt type = %s", prompt_type.value
            )

        if task.metadata.type in self._NO_INSTRUCTION_TASK_TYPES:
            logger.info(
                "No instruction used, because task type = %s", task.metadata.type
            )
            instruction = None

        if instruction:
            logger.info(
                "Using instruction: '%s' for task: '%s'", instruction, task_name
            )

        embeddings = self.model.encode(
            sentences,
            prompt=instruction,
            **kwargs,
        )

        if isinstance(embeddings, torch.Tensor):
            # kwargs may request tensor output from the model; callers of this
            # wrapper are given a numpy array either way.
            embeddings = embeddings.cpu().detach().float().numpy()
        return embeddings


kalm_training_data = {
    # from technical report
    # not in MTEB:
    # ExpertQA
    # MEDI2BGE
    # OpenOrca
    # PAQ
    # PubMedQA
    # SearchQA
    # arxiv_qa
    # rag-dataset-12000
    # CC-News
    # SQuAD 2.0
    # TriviaQA
    # WebGPT Comparisons
    # MultiNLI
    # NLLB
    # WikiAnswers
    # SimCSE NLI
    # SNLI
    # Aya Dataset
    # eli5
    # ----
    # in MTEB:
    "CodeFeedbackMT": ["train"],
    "CodeFeedbackST": ["train"],
    "ArxivClusteringP2P": ["train"],
    "ArxivClusteringS2S": ["train"],
    "ArxivClusteringP2P.v2": ["train"],
    "TRECCOVID": ["train"],
    "DBPedia": ["train"],
    "ESCIReranking": ["train"],
    "FEVER": ["train"],
    "FiQA2018": ["train"],
    "FEVERHardNegatives": ["train"],
    "NanoFEVERRetrieval": ["train"],
    "FEVER-NL": ["train"],  # translation not trained on
    "FiQA2018-NL": ["train"],  # translation not trained on
    "HotpotQA-PL": ["train"],  # translation not trained on
    "HotpotQA-NL": ["train"],  # translation not trained on
    "HotpotQAHardNegatives": ["train"],
    "MultiLongDocRetrieval": ["train"],
    "MSMARCO": ["train"],
    "MSMARCOHardNegatives": ["train"],
    "NanoMSMARCORetrieval": ["train"],
    "MSMARCO-PL": ["train"],  # translation not trained on
    "mMARCO-NL": ["train"],  # translation not trained on
    "MSMARCOv2": ["train"],
    "NFCorpus": ["train"],
    "SciFact": ["train"],
    "NQ": ["train"],
    "NQHardNegatives": ["train"],
    "NanoNQRetrieval": ["train"],
    "NQ-PL": ["train"],  # translation not trained on
    "NQ-NL": ["train"],  # translation not trained on
    "YahooAnswersTopicsClassification": ["train"],
    "ContractNLIConfidentialityOfAgreementLegalBenchClassification": ["train"],
    "ContractNLIExplicitIdentificationLegalBenchClassification": ["train"],
    "ContractNLIInclusionOfVerballyConveyedInformationLegalBenchClassification": [
        "train"
    ],
    "ContractNLILimitedUseLegalBenchClassification": ["train"],
    "ContractNLINoLicensingLegalBenchClassification": ["train"],
    "ContractNLINoticeOnCompelledDisclosureLegalBenchClassification": ["train"],
    "ContractNLIPermissibleAcquirementOfSimilarInformationLegalBenchClassification": [
        "train"
    ],
    "ContractNLIPermissibleCopyLegalBenchClassification": ["train"],
    "ContractNLIPermissibleDevelopmentOfSimilarInformationLegalBenchClassification": [
        "train"
    ],
    "ContractNLIPermissiblePostAgreementPossessionLegalBenchClassification": ["train"],
    "ContractNLIReturnOfConfidentialInformationLegalBenchClassification": ["train"],
    "ContractNLISharingWithEmployeesLegalBenchClassification": ["train"],
    "ContractNLISharingWithThirdPartiesLegalBenchClassification": ["train"],
    "ContractNLISurvivalOfObligationsLegalBenchClassification": ["train"],
    "QuoraRetrieval": ["train"],
    "NanoQuoraRetrieval": ["train"],
    "BiorxivClusteringP2P.v2": ["train"],
    "BiorxivClusteringS2S.v2": ["train"],
    "MedrxivClusteringP2P.v2": ["train"],
    "MedrxivClusteringS2S.v2": ["train"],
    "Banking77Classification": ["train"],
    "AmazonPolarityClassification": ["train"],
    "ImdbClassification": ["train"],
    "EmotionClassification": ["train"],
    "TweetSentimentExtractionClassification": ["train"],
    "ToxicConversationsClassification": ["train"],
    "MIRACLRetrieval": ["train"],
    "MIRACLRetrievalHardNegatives": ["train"],
    "MIRACLReranking": ["train"],
    "MrTidyRetrieval": ["train"],
    "PawsXPairClassification": ["train"],
    "AmazonReviewsClassification": ["train"],
    "AmazonCounterfactualClassification": ["train"],
    "MultilingualSentiment": ["train"],
    "MassiveIntentClassification": ["train"],
    "MassiveScenarioClassification": ["train"],
    "MTOPDomainClassification": ["train"],
    "MTOPIntentClassification": ["train"],
}

# Per-task instruction texts, keyed by task name.
KaLM_task_prompts = {
    "AmazonCounterfactualClassification": "Given an Amazon review, judge whether it is counterfactual.",
    "AmazonPolarityClassification": "Classifying Amazon reviews into positive or negative sentiment",
    "AmazonReviewsClassification": "Classifying the given Amazon review into its appropriate rating category",
    "Banking77Classification": "Given an online banking query, find the corresponding intents",
    "EmotionClassification": "Classifying the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise",
    "ImdbClassification": "Classifying the sentiment expressed in the given movie review text from the IMDB dataset",
    "MassiveIntentClassification": "Given a user utterance as query, find the user intents",
    "MassiveScenarioClassification": "Given a user utterance as query, find the user scenarios",
    "MTOPDomainClassification": "Classifying the intent domain of the given utterance in task-oriented conversation",
    "MTOPIntentClassification": "Classifying the intent of the given utterance in task-oriented conversation",
    "ToxicConversationsClassification": "Classifying the given comments as either toxic or not toxic",
    "TweetSentimentExtractionClassification": "Classifying the sentiment of a given tweet as either positive, negative, or neutral",
    "TNews": "Categorizing the given news title",
    "IFlyTek": "Given an App description text, find the appropriate fine-grained category",
    "MultilingualSentiment": "Classifying sentiment of the customer review into positive, neutral, or negative",
    "JDReview": "Classifying sentiment of the customer review for iPhone into positive or negative",
    "OnlineShopping": "Classifying sentiment of the customer review into positive or negative",
    "Waimai": "Classify the customer review from a food takeaway platform into positive or negative",
    "MasakhaNEWSClassification": "Classifying the category of french news.",
    "CBD": "Classifying the sentiment of polish tweet reviews",
    "PolEmo2.0-IN": "Classifying the sentiment of in-domain (medicine and hotels) online reviews",
    "PolEmo2.0-OUT": "Classifying the sentiment of out-of-domain (products and school) online reviews",
    "AllegroReviews": "Classifying the sentiment of reviews from e-commerce marketplace Allegro",
    "PAC": 'Classifying the sentence into one of the two types: "BEZPIECZNE_POSTANOWIENIE_UMOWNE" and "KLAUZULA_ABUZYWNA"',
    "GeoreviewClassification": "Classifying the sentiment of Russian reviews.",
    "HeadlineClassification": "Classifying the topic of Russian headlines.",
    "InappropriatenessClassification": "Detecting inappropriate messages on sensitive topics",
    "KinopoiskClassification": "Classifying the sentiment of Kinopoisk reviews.",
    "RuReviewsClassification": "Classifying the sentiment of Russian product reviews.",
    "RuSciBenchGRNTIClassification": "Classifying the topic of Russian scientific papers.",
    "RuSciBenchOECDClassification": "Classifying the topic of Russian scientific papers.",
    "CEDRClassification": "Classification of sentences by emotions.",
    "SensitiveTopicsClassification": "Detecting inappropriate messages on sensitive topics.",
    "ArxivClusteringP2P": "Identify the main and secondary category of Arxiv papers based on the titles and abstracts",
    "ArxivClusteringS2S": "Identify the main and secondary category of Arxiv papers based on the titles",
    "BiorxivClusteringP2P": "Identify the main category of Biorxiv papers based on the titles and abstracts",
    "BiorxivClusteringS2S": "Identify the main category of Biorxiv papers based on the titles",
    "MedrxivClusteringP2P": "Identify the main category of Medrxiv papers based on the titles and abstracts",
    "MedrxivClusteringS2S": "Identify the main category of Medrxiv papers based on the titles",
    "RedditClustering": "Identify the topic or theme of Reddit posts based on the titles and posts",
    "RedditClusteringP2P": "Identify the topic or theme of Reddit posts based on the titles and posts",
    "StackExchangeClustering": "Identify the topic or theme of StackExchange posts based on the given paragraphs",
    "StackExchangeClusteringP2P": "Identify the topic or theme of StackExchange posts based on the given paragraphs",
    "TwentyNewsgroupsClustering": "Identify the topic or theme of the given news articles",
    "CLSClusteringS2S": "Identify the main category of scholar papers based on the titles",
    "CLSClusteringP2P": "Identify the main category of scholar papers based on the titles and abstracts",
    "ThuNewsClusteringS2S": "Identify the topic or theme of the given news articles based on the titles",
    "ThuNewsClusteringP2P": "Identify the topic or theme of the given news articles based on the titles and contents",
    "AlloProfClusteringP2P": "Identify the main category of Allo Prof document based on the titles and descriptions",
    "AlloProfClusteringS2S": "Identify the main category of Allo Prof document based on the titles",
    "HALClusteringS2S": "Identify the main category of academic passage based on the titles and contents",
    "MasakhaNEWSClusteringP2P": "Identify the topic or theme of the given news articles based on the titles and contents",
    "MasakhaNEWSClusteringS2S": "Identify the topic or theme of the given news articles based on the titles",
    "MLSUMClusteringP2P": "Identify the topic or theme of the given articles based on the titles and contents",
    "MLSUMClusteringS2S": "Identify the topic or theme of the given articles based on the titles",
    "EightTagsClustering": "Identify of headlines from social media posts in Polish into 8 categories: film, history, food, medicine, motorization, work, sport and technology",
    "GeoreviewClusteringP2P": "Identify the topic or theme of the Russian reviews.",
    "RuSciBenchGRNTIClusteringP2P": "Identify the topic or theme of the Russian articles.",
    "RuSciBenchOECDClusteringP2P": "Identify the topic or theme of the Russian articles.",
}
+KaLM_X_task_prompts = { + "Classification": "classify the query into different classes.", + "MultilabelClassification": "Instruct: classify the query into different classes.", + "Clustering": "classify the query into different classes.", + "Reranking-query": "Given a query, retrieve documents that answer the query.", + "Retrieval-query": "Given a query, retrieve documents that answer the query.", + "InstructionRetrieval-query": "Given a query, retrieve documents that answer the query.", + "AmazonCounterfactualClassification": "Classify a given Amazon customer review text as either counterfactual or not-counterfactual", + "AmazonPolarityClassification": "Classify Amazon reviews into positive or negative sentiment", + "AmazonReviewsClassification": "Classify the given Amazon review into its appropriate rating category", + "Banking77Classification": "Given a online banking query, find the corresponding intents", + "EmotionClassification": "Classify the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise", + "ImdbClassification": "Classify the sentiment expressed in the given movie review text from the IMDB dataset", + "MassiveIntentClassification": "Given a user utterance as query, find the user intents", + "MassiveScenarioClassification": "Given a user utterance as query, find the user scenarios", + "MTOPDomainClassification": "Classify the intent domain of the given utterance in task-oriented conversation", + "MTOPIntentClassification": "Classify the intent of the given utterance in task-oriented conversation", + "ToxicConversationsClassification": "Classify the given comments as either toxic or not toxic", + "TweetSentimentExtractionClassification": "Classify the sentiment of a given tweet as either positive, negative, or neutral", + "TNews": "Classify the fine-grained category of the given news title", + "IFlyTek": "Given an App description text, find the appropriate fine-grained category", + 
"MultilingualSentiment": "Classify sentiment of the customer review into positive, neutral, or negative", + "JDReview": "Classify the customer review for iPhone on e-commerce platform into positive or negative", + "OnlineShopping": "Classify the customer review for online shopping into positive or negative", + "Waimai": "Classify the customer review from a food takeaway platform into positive or negative", + "ArxivClusteringP2P": "Identify the main and secondary category of Arxiv papers based on the titles and abstracts", + "ArxivClusteringS2S": "Identify the main and secondary category of Arxiv papers based on the titles", + "BiorxivClusteringP2P": "Identify the main category of Biorxiv papers based on the titles and abstracts", + "BiorxivClusteringS2S": "Identify the main category of Biorxiv papers based on the titles", + "MedrxivClusteringP2P": "Identify the main category of Medrxiv papers based on the titles and abstracts", + "MedrxivClusteringS2S": "Identify the main category of Medrxiv papers based on the titles", + "RedditClustering": "Identify the topic or theme of Reddit posts based on the titles", + "RedditClusteringP2P": "Identify the topic or theme of Reddit posts based on the titles and posts", + "StackExchangeClustering": "Identify the topic or theme of StackExchange posts based on the titles", + "StackExchangeClusteringP2P": "Identify the topic or theme of StackExchange posts based on the given paragraphs", + "TwentyNewsgroupsClustering": "Identify the topic or theme of the given news articles", + "CLSClusteringS2S": "Identify the main category of scholar papers based on the titles", + "CLSClusteringP2P": "Identify the main category of scholar papers based on the titles and abstracts", + "ThuNewsClusteringS2S": "Identify the topic or theme of the given news articles based on the titles", + "ThuNewsClusteringP2P": "Identify the topic or theme of the given news articles based on the titles and contents", + "AskUbuntuDupQuestions-query": "Retrieve 
duplicate questions from AskUbuntu forum", + "MindSmallReranking-query": "Retrieve relevant news articles based on user browsing history", + "SciDocsRR-query": "Given a title of a scientific paper, retrieve the titles of other relevant papers", + "StackOverflowDupQuestions-query": "Retrieve duplicate questions from StackOverflow forum", + "T2Reranking-query": "Given a Chinese search query, retrieve web passages that answer the question", + "MMarcoReranking-query": "Given a Chinese search query, retrieve web passages that answer the question", + "CMedQAv1-reranking-query": "Given a Chinese community medical question, retrieve replies that best answer the question", + "CMedQAv2-reranking-query": "Given a Chinese community medical question, retrieve replies that best answer the question", + "ArguAna-query": "Given a claim, find documents that refute the claim", + "ArguAna-passage": "Given a claim, find documents that refute the claim", + "ClimateFEVER-query": "Given a claim about climate change, retrieve documents that support or refute the claim", + "ClimateFEVERHardNegatives-query": "Given a claim about climate change, retrieve documents that support or refute the claim", + "DBPedia-query": "Given a query, retrieve relevant entity descriptions from DBPedia", + "FEVER-query": "Given a claim, retrieve documents that support or refute the claim", + "FEVERHardNegatives-query": "Given a claim, retrieve documents that support or refute the claim", + "FiQA2018-query": "Given a financial question, retrieve user replies that best answer the question", + "HotpotQA-query": "Given a multi-hop question, retrieve documents that can help answer the question", + "HotpotQAHardNegatives-query": "Given a multi-hop question, retrieve documents that can help answer the question", + "MSMARCO-query": "Given a web search query, retrieve relevant passages that answer the query", + "NFCorpus-query": "Given a question, retrieve relevant documents that best answer the question", + "NQ-query": 
"Given a question, retrieve Wikipedia passages that answer the question", + "QuoraRetrieval-query": "Given a question, retrieve questions that are semantically equivalent to the given question", + "SCIDOCS-query": "Given a scientific paper title, retrieve paper abstracts that are cited by the given paper", + "SciFact-query": "Given a scientific claim, retrieve documents that support or refute the claim", + "Touche2020-query": "Given a question, retrieve detailed and persuasive arguments that answer the question", + "Touche2020Retrieval.v3-query": "Given a question, retrieve detailed and persuasive arguments that answer the question", + "TRECCOVID-query": "Given a query on COVID-19, retrieve documents that answer the query", + "T2Retrieval-query": "Given a Chinese search query, retrieve web passages that answer the question", + "MMarcoRetrieval-query": "Given a web search query, retrieve relevant passages that answer the query", + "DuRetrieval-query": "Given a Chinese search query, retrieve web passages that answer the question", + "CovidRetrieval-query": "Given a question on COVID-19, retrieve news articles that answer the question", + "CmedqaRetrieval-query": "Given a Chinese community medical question, retrieve replies that best answer the question", + "EcomRetrieval-query": "Given a user query from an e-commerce website, retrieve description sentences of relevant products", + "MedicalRetrieval-query": "Given a medical question, retrieve user replies that best answer the question", + "VideoRetrieval-query": "Given a video search query, retrieve the titles of relevant videos", + "MasakhaNEWSClassification": "Classify the News in the given texts into one of the seven category: politics,sports,health,business,entertainment,technology,religion ", + "AlloProfClusteringP2P": "Identify the main category of Allo Prof document based on the titles and descriptions", + "AlloProfClusteringS2S": "Identify the topic of document titles from Allo Prof dataset", + 
"HALClusteringS2S": "Identify the main category of academic passage based on the titles and contents", + "MasakhaNEWSClusteringP2P": "Identify the topic or theme of the given news articles based on the titles and contents", + "MasakhaNEWSClusteringS2S": "Identify the topic or theme of the given news articles based on the titles", + "MLSUMClusteringP2P": "Identify the topic or theme of the given articles based on the titles and contents", + "MLSUMClusteringS2S": "Identify the topic or theme of the given articles based on the titles", + "SyntecReranking-query": "Given a question, retrieve passages that answer the question", + "AlloprofReranking-query": "Given a question, retrieve passages that answer the question", + "AlloprofRetrieval-query": "Given a question, retrieve passages that answer the question", + "BSARDRetrieval-query": "Given a question, retrieve passages that answer the question", + "SyntecRetrieval-query": "Given a question, retrieve passages that answer the question", + "XPQARetrieval-query": "Given a question, retrieve passages that answer the question", + "MintakaRetrieval-query": "Given a question, retrieve passages that answer the question", + "CBD": "Classify the sentiment of polish tweet reviews", + "PolEmo2.0-IN": "Classify the sentiment of in-domain (medicine and hotels) online reviews", + "PolEmo2.0-OUT": "Classify the sentiment of out-of-domain (products and school) online reviews", + "AllegroReviews": "Classify the sentiment of reviews from e-commerce marketplace Allegro", + "PAC": "Classify the sentence into one of the two types: \"BEZPIECZNE_POSTANOWIENIE_UMOWNE\" and \"KLAUZULA_ABUZYWNA\"", + "EightTagsClustering": "Identify of headlines from social media posts in Polish into 8 categories: film, history, food, medicine, motorization, work, sport and technology", + "ArguAna-PL-query": "Given a claim, find documents that refute the claim", + "DBPedia-PL-query": "Given a query, retrieve relevant entity descriptions from DBPedia", + 
"FiQA-PL-query": "Given a financial question, retrieve user replies that best answer the question", + "HotpotQA-PL-query": "Given a multi-hop question, retrieve documents that can help answer the question", + "MSMARCO-PL-query": "Given a web search query, retrieve relevant passages that answer the query", + "NFCorpus-PL-query": "Given a question, retrieve relevant documents that best answer the question", + "NQ-PL-query": "Given a question, retrieve Wikipedia passages that answer the question", + "Quora-PL-query": "Given a question, retrieve questions that are semantically equivalent to the given question", + "SCIDOCS-PL-query": "Given a scientific paper title, retrieve paper abstracts that are cited by the given paper", + "SciFact-PL-query": "Given a scientific claim, retrieve documents that support or refute the claim", + "TRECCOVID-PL-query": "Given a query on COVID-19, retrieve documents that answer the query", + "GeoreviewClassification": "Classify the organization rating based on the reviews", + "HeadlineClassification": "Classify the topic or theme of the given news headline", + "InappropriatenessClassification": "Classify the given message as either sensitive topic or not", + "KinopoiskClassification": "Classify the sentiment expressed in the given movie review text", + "RuReviewsClassification": "Classify product reviews into positive, negative or neutral sentiment", + "RuSciBenchGRNTIClassification": "Classify the category of scientific papers based on the titles and abstracts", + "RuSciBenchOECDClassification": "Classify the category of scientific papers based on the titles and abstracts", + "GeoreviewClusteringP2P": "Identify the organization category based on the reviews", + "RuSciBenchGRNTIClusteringP2P": "Identify the category of scientific papers based on the titles and abstracts", + "RuSciBenchOECDClusteringP2P": "Identify the category of scientific papers based on the titles and abstracts", + "RuBQReranking-query": "Given a question, retrieve 
Wikipedia passages that answer the question", + "RiaNewsRetrieval-query": "Given a headline, retrieval relevant articles", + "RuBQRetrieval-query": "Given a question, retrieve Wikipedia passages that answer the question", + "AppsRetrieval-query": "Given a question about code problem, retrieval code that can solve user's problem", + "COIRCodeSearchNetRetrieval-query": "Given a code snippet, retrieve the comment corresponding to that code.", + "CodeEditSearchRetrieval-query": "Given a piece of code, retrieval code that in the ", + "CodeFeedbackMT-query": "Given a question about coding, retrieval code or passage that can solve user's question", + "CodeFeedbackST-query": "Given a question about coding, retrieval code or passage that can solve user's question", + "CodeSearchNetCCRetrieval-query": "Given a code comment, retrieve the code snippet corresponding to that comment.", + "CodeSearchNetRetrieval-query": "Given a code snippet, retrieve the comment corresponding to that code.", + "CodeTransOceanContest-query": "Given a piece for code, retrieval semantically similar code", + "CodeTransOceanDL-query": "Given a piece for code, retrieval semantically similar code", + "CosQA-query": "Given a question about coding, retrieval code or passage that can solve user's question", + "StackOverflowQA-query": "Given a question about coding, retrieval code or passage that can solve user's question", + "SyntheticText2SQL-query": "Given a user's question, retrieve SQL queries that are appropriate responses to the question", + "BulgarianStoreReviewSentimentClassfication": "Classify user reviews into positive or negative sentiment", + "CzechProductReviewSentimentClassification": "Classify product reviews into positive or negative sentiment", + "GreekLegalCodeClassification": "Given a greek legal text, classify its topic", + "DBpediaClassification": "Given a Wikipedia articles, categorized it into classes based on its DBpedia ontology", + "FinancialPhrasebankClassification": "Given 
financial news, categorized by sentiment into positive, negative, or neutral", + "PoemSentimentClassification": "Gvien a poem, categorized by sentiment into positive, no_impact, negative or mixed", + "TweetTopicSingleClassification": "Gvien a twitter, classify its topic", + "EstonianValenceClassification": "Given a news article, categorized by sentiment into negatiivne, positiivne, neutraalne or vastuolulin", + "FilipinoShopeeReviewsClassification": "Given a shop review, classify its rating on a scale from 1 to 5", + "GujaratiNewsClassification": "Given a Gujarati news articles, classify ist topic", + "SentimentAnalysisHindi": "Given a hindi text, categorized by sentiment into positive, negative or neutral", + "IndonesianIdClickbaitClassification": "Given an Indonesian news headlines, classify its into clickbait or non-clickbait", + "ItaCaseholdClassification": "Given a judgments, classify its topic", + "KorSarcasmClassification": "Given a twitter, categorized it into sarcasm or not_sarcasm", + "KurdishSentimentClassification": "Given a text, categorized by sentiment into positive or negative", + "MacedonianTweetSentimentClassification": "Given a Macedonian tweet, categorized by sentiment into positive, negative, or neutral", + "AfriSentiClassification": "Given a text, categorized by sentiment into positive, negative, or neutral", + "CataloniaTweetClassification": "Given a tweet, categorized by sentiment into AGAINST, FAVOR or NEUTRAL", + "CyrillicTurkicLangClassification": "Given a text, classify its language", + "IndicLangClassification": "Given a text, classify its language", + "MultiHateClassification": "Given a text, categorized by sentiment into hate or non-hate", + "NusaParagraphEmotionClassification": "Given a paragraph, classify its emotion", + "NusaX-senti": "Given a text, categorized by sentiment into positive or negative", + "SwissJudgementClassification": "Given a news article, categorized it into approval or dismissal", + "NepaliNewsClassification": 
"Given a news article, categorized it into business, entertainment or sports", + "OdiaNewsClassification": "Given a news article, categorized it into business, entertainment or sports", + "PunjabiNewsClassification": "Given a news article, categorized it into two-classes", + "SinhalaNewsClassification": "Given a news article, categorized it into political, business, technology, sports and Entertainment", + "CSFDSKMovieReviewSentimentClassification": "Given a movie review, classify its rating on a scale from 0 to 5", + "SiswatiNewsClassification": "Given a news article, classify its topic", + "SlovakMovieReviewSentimentClassification": "Given a movie review, categorized it into positive or negative", + "SwahiliNewsClassification": "Given a news article, classify its domain", + "TswanaNewsClassification": "Given a news article, classify its topic", + "IsiZuluNewsClassification": "Given a news article, classify its topic", + "WikiCitiesClustering": "Identify of Wikipedia articles of cities by country", + "RomaniBibleClustering": "Identify verses from the Bible in Kalderash Romani by book.", + "ArXivHierarchicalClusteringP2P": "Identify the main and secondary category of Arxiv papers based on the titles and abstracts", + "ArXivHierarchicalClusteringS2S": "Identify the main and secondary category of Arxiv papers based on the titles", + "BigPatentClustering.v2": "Identify the category of documents from the Big Patent dataset", + "AlloProfClusteringS2S.v2": "Identify the topic of document titles from Allo Prof dataset", + "HALClusteringS2S.v2": "Identify the topic of titles from HAL", + "SIB200ClusteringS2S": "Identify the category of documents", + "WikiClusteringP2P.v2": "Identify the category of wiki passages", + "PlscClusteringP2P.v2": "Identify the category of titles+abstracts from Library of Science", + "KorHateSpeechMLClassification": "Given a Korean online news comments, classify its fine-grained hate speech classes", + "MalteseNewsClassification": "Given a maltese 
new, classify its topic", + "MultiEURLEXMultilabelClassification": "Given a text, classify its topic", + "BrazilianToxicTweetsClassification": "Given a tweet, classify its topic", + "AILAStatutes-query": "Identifying the most relevant statutes for a given situation", + "HagridRetrieval-query": "Retrieval the relevant passage for the given query", + "LegalBenchCorporateLobbying-query": "Retrieval the relevant passage for the given query", + "LEMBPasskeyRetrieval-query": "Retrieval the relevant passage for the given query", + "BelebeleRetrieval-query": "Retrieval the relevant passage for the given query", + "MLQARetrieval-query": "Retrieval the relevant passage for the given query", + "StatcanDialogueDatasetRetrieval-query": "Retrieval the relevant passage for the given query", + "WikipediaRetrievalMultilingual-query": "Retrieval the relevant passage for the given query", + "Core17InstructionRetrieval-query": "Retrieval the relevant passage for the given query", + "News21InstructionRetrieval-query": "Retrieval the relevant passage for the given query", + "Robust04InstructionRetrieval-query": "Retrieval the relevant passage for the given query", + "WebLINXCandidatesReranking-query": "Retrieval the relevant passage for the given query", + "WikipediaRerankingMultilingual-query": "Retrieval the relevant passage for the given query", + "MIRACLRetrievalHardNegatives-query": "Retrieval relevant passage for the given query", + "CQADupstackRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackGamingRetrieval-query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackGamingRetrieval-passage": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackUnixRetrieval-query": "Given a question, retrieve detailed question 
descriptions from Stackexchange that are duplicates to the given question", + "CQADupstackUnixRetrieval-passage": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question" +} + +KaLM_INSTRUCTION = "Instruct: {instruction} \n Query: " + +HIT_TMG__KaLM_embedding_multilingual_mini_instruct_v1 = ModelMeta( + loader=partial( # type: ignore + KALMWrapper, + model_name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", + revision="45e42c89990c40aca042659133fc8b13c28634b5", + instruction_template=KaLM_INSTRUCTION, + max_seq_length=512, + apply_instruction_to_passages=False, + prompts_dict=KaLM_task_prompts, + ), + name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", + revision="45e42c89990c40aca042659133fc8b13c28634b5", + release_date="2024-10-23", + languages=["eng-Latn", "zho-Hans"], + n_parameters=494032768, + memory_usage_mb=1885, + max_tokens=512, + embed_dim=896, + license="mit", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", + similarity_fn_name="cosine", + use_instructions=True, + training_datasets=kalm_training_data, # Replace with actual dataset if available + adapted_from="Qwen/Qwen2-0.5B", + superseded_by=None, +) + +HIT_TMG__KaLM_embedding_multilingual_mini_v1 = ModelMeta( + name="HIT-TMG/KaLM-embedding-multilingual-mini-v1", + revision="8a82a0cd2b322b91723e252486f7cce6fd8ac9d3", + release_date="2024-08-27", + languages=["eng-Latn", "zho-Hans"], + n_parameters=494032768, + memory_usage_mb=1885, + max_tokens=512, + embed_dim=896, + license="mit", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-v1", + similarity_fn_name="cosine", + use_instructions=None, + 
training_datasets=kalm_training_data, + adapted_from="Qwen/Qwen2-0.5B", + superseded_by=None, +) + +HIT_TMG__KaLM_embedding_multilingual_mini_instruct_v1_5 = ModelMeta( + loader=partial( # type: ignore + KALMWrapper, + model_name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5", + revision="fcff2f8a54e4cd96b7766fef1ee960a43d42bb3c", + instruction_template=KaLM_INSTRUCTION, + max_seq_length=512, + apply_instruction_to_passages=False, + prompts_dict=KaLM_task_prompts, + ), + name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5", + revision="fcff2f8a54e4cd96b7766fef1ee960a43d42bb3c", + release_date="2024-12-26", + languages=["eng-Latn", "zho-Hans"], + n_parameters=494032768, + memory_usage_mb=1885, + max_tokens=512, + embed_dim=896, + license="mit", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5", + similarity_fn_name="cosine", + use_instructions=True, + training_datasets=kalm_training_data, + adapted_from="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", + superseded_by=None, +) + + + +KaLM_Team__KaLM_Embedding_X_0605 = ModelMeta( # metadata for KaLM-Team/KaLM-Embedding-X-0605 + loader=partial( + KALMWrapper, + model_name="KaLM-Team/KaLM-Embedding-X-0605", + revision="1", + instruction_template=KaLM_INSTRUCTION, + max_seq_length=512, + apply_instruction_to_passages=True, + prompts_dict=KaLM_X_task_prompts, + ), + name="KaLM-Team/KaLM-Embedding-X-0605", + revision="1", + languages=None, + open_weights=False, + release_date="2025-06-05", + n_parameters=9.24 * 1e9, + memory_usage_mb=35254, + max_tokens=8192, + embed_dim=3584, + license=None, + reference="https://github.com/KaLM-Team/KaLM-Embedding-X", + similarity_fn_name="cosine", + framework=["Sentence Transformers","PyTorch"], + use_instructions=True, + public_training_code="https://github.com/HITsz-TMG/KaLM-Embedding", + public_training_data=None, + 
training_datasets=kalm_training_data, +) \ No newline at end of file diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index 096db2cacd..94308fd48d 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -122,6 +122,152 @@ superseded_by=None, ) +kalm_training_data = { + # from technical report + # not in MTEB: + # ExpertQA + # MEDI2BGE + # OpenOrca + # PAQ + # PubMedQA + # SearchQA + # arxiv_qa + # rag-dataset-12000 + # CC-News + # SQuAD 2.0 + # TriviaQA + # WebGPT Comparisons + # MultiNLI + # NLLB + # WikiAnswers + # SimCSE NLI + # SNLI + # Aya Dataset + # eli5 + # ---- + # in MTEB: + "CodeFeedbackMT": ["train"], + "CodeFeedbackST": ["train"], + "ArxivClusteringP2P": ["train"], + "ArxivClusteringS2S": ["train"], + "ArxivClusteringP2P.v2": ["train"], + "TRECCOVID": ["train"], + "DBPedia": ["train"], + "ESCIReranking": ["train"], + "FEVER": ["train"], + "FiQA2018": ["train"], + "FEVERHardNegatives": ["train"], + "NanoFEVERRetrieval": ["train"], + "FEVER-NL": ["train"], # translation not trained on + "FiQA2018-NL": ["train"], # translation not trained on + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQA-NL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + "MultiLongDocRetrieval": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "mMARCO-NL": ["train"], # translation not trained on + "MSMARCOv2": ["train"], + "NFCorpus": ["train"], + "SciFact": ["train"], + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "NQ-NL": ["train"], # translation not trained on + "YahooAnswersTopicsClassification": ["train"], + "ContractNLIConfidentialityOfAgreementLegalBenchClassification": ["train"], + "ContractNLIExplicitIdentificationLegalBenchClassification": ["train"], + 
"ContractNLIInclusionOfVerballyConveyedInformationLegalBenchClassification": [ + "train" + ], + "ContractNLILimitedUseLegalBenchClassification": ["train"], + "ContractNLINoLicensingLegalBenchClassification": ["train"], + "ContractNLINoticeOnCompelledDisclosureLegalBenchClassification": ["train"], + "ContractNLIPermissibleAcquirementOfSimilarInformationLegalBenchClassification": [ + "train" + ], + "ContractNLIPermissibleCopyLegalBenchClassification": ["train"], + "ContractNLIPermissibleDevelopmentOfSimilarInformationLegalBenchClassification": [ + "train" + ], + "ContractNLIPermissiblePostAgreementPossessionLegalBenchClassification": ["train"], + "ContractNLIReturnOfConfidentialInformationLegalBenchClassification": ["train"], + "ContractNLISharingWithEmployeesLegalBenchClassification": ["train"], + "ContractNLISharingWithThirdPartiesLegalBenchClassification": ["train"], + "ContractNLISurvivalOfObligationsLegalBenchClassification": ["train"], + "QuoraRetrieval": ["train"], + "NanoQuoraRetrieval": ["train"], + "BiorxivClusteringP2P.v2": ["train"], + "BiorxivClusteringS2S.v2": ["train"], + "MedrxivClusteringP2P.v2": ["train"], + "MedrxivClusteringS2S.v2": ["train"], + "Banking77Classification": ["train"], + "AmazonPolarityClassification": ["train"], + "ImdbClassification": ["train"], + "EmotionClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "ToxicConversationsClassification": ["train"], + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + "MrTidyRetrieval": ["train"], + "PawsXPairClassification": ["train"], + "AmazonReviewsClassification": ["train"], + "AmazonCounterfactualClassification": ["train"], + "MultilingualSentiment": ["train"], + "MassiveIntentClassification": ["train"], + "MassiveScenarioClassification": ["train"], + "MTOPDomainClassification": ["train"], + "MTOPIntentClassification": ["train"], +} + +HIT_TMG__KaLM_embedding_multilingual_mini_instruct_v1 = 
ModelMeta( + name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", + revision="45e42c89990c40aca042659133fc8b13c28634b5", + release_date="2024-10-23", + languages=None, + loader=None, + n_parameters=494032768, + memory_usage_mb=1885, + max_tokens=512, + embed_dim=896, + license="mit", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=kalm_training_data, + adapted_from="Qwen/Qwen2-0.5B", + superseded_by=None, +) +HIT_TMG__KaLM_embedding_multilingual_mini_v1 = ModelMeta( + name="HIT-TMG/KaLM-embedding-multilingual-mini-v1", + revision="8a82a0cd2b322b91723e252486f7cce6fd8ac9d3", + release_date="2024-08-27", + languages=None, + loader=None, + n_parameters=494032768, + memory_usage_mb=1885, + max_tokens=512, + embed_dim=896, + license="mit", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-v1", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=kalm_training_data, + adapted_from="Qwen/Qwen2-0.5B", + superseded_by=None, +) Hum_Works__lodestone_base_4096_v1 = ModelMeta( name="Hum-Works/lodestone-base-4096-v1", revision="9bbc2d0b57dd2198aea029404b0f976712a7d966", diff --git a/mteb/models/overview.py b/mteb/models/overview.py index 52a034adb3..d3e00a4d76 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -43,12 +43,12 @@ google_models, gritlm_models, gte_models, - hit_tmg_models, ibm_granite_models, inf_models, jasper_models, jina_clip, jina_models, + kalm_models, lens_models, lgai_embedding_models, linq_models, @@ -123,12 +123,12 @@ google_models, gritlm_models, gte_models, - hit_tmg_models, ibm_granite_models, inf_models, jasper_models, jina_models, jina_clip, + kalm_models, 
lens_models, lgai_embedding_models, linq_models,