diff --git a/mteb/models/kalm_models.py b/mteb/models/kalm_models.py
new file mode 100644
index 0000000000..55cfa41d0f
--- /dev/null
+++ b/mteb/models/kalm_models.py
@@ -0,0 +1,118 @@
+"""Model metadata for KaLM-Team/KaLM-Embedding-X-0605."""
+
+from __future__ import annotations
+
+from functools import partial
+
+from mteb.model_meta import ModelMeta
+from mteb.models.instruct_wrapper import InstructSentenceTransformerWrapper
+
+# NOTE(review): `partial`, `InstructSentenceTransformerWrapper` and
+# `MODEL_PROMPTS` are not referenced elsewhere in this module while the entry
+# below uses `loader=None`; presumably reserved for a future loader -- confirm.
+
+# Instruction prefixes keyed by task type. The "-query" keys presumably apply
+# to the query side only -- confirm against the instruct wrapper.
+MODEL_PROMPTS = {
+    "Classification": "Instruct: classify the query into different classes. \n Query: ",
+    "MultilabelClassification": "Instruct: classify the query into different classes. \n Query: ",
+    "Clustering": "Instruct: classify the query into different classes. \n Query: ",
+    "Reranking-query": "Instruct: Given a query, retrieve documents that answer the query. \n Query: ",
+    "Retrieval-query": "Instruct: Given a query, retrieve documents that answer the query. \n Query: ",
+}
+
+# Task name -> splits listed as training data (passed as `training_datasets`).
+kalm_training_data = {
+    # from technical report
+    # not in MTEB:
+    # ExpertQA
+    # MEDI2BGE
+    # OpenOrca
+    # PAQ
+    # PubMedQA
+    # SearchQA
+    # arxiv_qa
+    # rag-dataset-12000
+    # CC-News
+    # SQuAD 2.0
+    # TriviaQA
+    # WebGPT Comparisons
+    # MultiNLI
+    # NLLB
+    # WikiAnswers
+    # SimCSE NLI
+    # SNLI
+    # Aya Dataset
+    # eli5
+    # ----
+    # in MTEB:
+    "CodeFeedbackMT": ["train"],
+    "CodeFeedbackST": ["train"],
+    "ArxivClusteringP2P": ["train"],
+    "ArxivClusteringS2S": ["train"],
+    "ArxivClusteringP2P.v2": ["train"],
+    "TRECCOVID": ["train"],
+    "DBPedia": ["train"],
+    "ESCIReranking": ["train"],
+    "FEVER": ["train"],
+    "FiQA2018": ["train"],
+    "FEVERHardNegatives": ["train"],
+    "NanoFEVERRetrieval": ["train"],
+    "HotpotQAHardNegatives": ["train"],
+    "MultiLongDocRetrieval": ["train"],
+    "MSMARCO": ["train"],
+    "MSMARCOHardNegatives": ["train"],
+    "NanoMSMARCORetrieval": ["train"],
+    "MSMARCOv2": ["train"],
+    "NFCorpus": ["train"],
+    "SciFact": ["train"],
+    "NQ": ["train"],
+    "NQHardNegatives": ["train"],
+    "NanoNQRetrieval": ["train"],
+    "QuoraRetrieval": ["train"],
+    "NanoQuoraRetrieval": ["train"],
+    "BiorxivClusteringP2P.v2": ["train"],
+    "BiorxivClusteringS2S.v2": ["train"],
+    "MedrxivClusteringP2P.v2": ["train"],
+    "MedrxivClusteringS2S.v2": ["train"],
+    "Banking77Classification": ["train"],
+    "AmazonPolarityClassification": ["train"],
+    "ImdbClassification": ["train"],
+    "EmotionClassification": ["train"],
+    "TweetSentimentExtractionClassification": ["train"],
+    "ToxicConversationsClassification": ["train"],
+    "MIRACLRetrieval": ["train"],
+    "MIRACLRetrievalHardNegatives": ["train"],
+    "MIRACLReranking": ["train"],
+    "MrTidyRetrieval": ["train"],
+    "PawsXPairClassification": ["train"],
+    "AmazonReviewsClassification": ["train"],
+    "AmazonCounterfactualClassification": ["train"],
+    "MultilingualSentiment": ["train"],
+    "MassiveIntentClassification": ["train"],
+    "MassiveScenarioClassification": ["train"],
+    "MTOPDomainClassification": ["train"],
+    "MTOPIntentClassification": ["train"],
+}
+
+# Registered without a loader (`loader=None`) and marked `open_weights=False`.
+KaLM_Embedding_X_0605 = ModelMeta(
+    name="KaLM-Team/KaLM-Embedding-X-0605",
+    loader=None,
+    languages=None,
+    open_weights=False,
+    revision="1",
+    release_date="2025-06-05",
+    n_parameters=9.24 * 1e9,
+    memory_usage_mb=35254,
+    max_tokens=8192,
+    embed_dim=3584,
+    license=None,
+    reference="https://github.com/KaLM-Team/KaLM-Embedding-X",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
+    public_training_code="https://github.com/HITsz-TMG/KaLM-Embedding",
+    public_training_data=None,
+    training_datasets=kalm_training_data,
+)
diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py
index b895122266..94308fd48d 100644
--- a/mteb/models/misc_models.py
+++ b/mteb/models/misc_models.py
@@ -243,7 +243,7 @@
     similarity_fn_name="cosine",
     use_instructions=None,
     training_datasets=kalm_training_data,
-    adapted_from="/mnt/shgeminicephfs/wx-dc-plt-hpc/xinshuohu/Output/Embedding/Qwen2-0.5B-eos_mean_pretrain_0806_1e-4_uen_sft_1022_filtered_v2_inst_3node_g8_1e-5_sin-0.1_mrl",
+    adapted_from="Qwen/Qwen2-0.5B",
     superseded_by=None,
 )
 HIT_TMG__KaLM_embedding_multilingual_mini_v1 = ModelMeta(
@@ -265,7 +265,7 @@
     similarity_fn_name="cosine",
     use_instructions=None,
     training_datasets=kalm_training_data,
-    adapted_from="/mnt/shgeminicephfs/wx-dc-plt-hpc/xinshuohu/Output/Embedding/Qwen2-0.5B-eos_mean_pretrain_0806_1e-4_uen_sft_0902_filtered_v2_3node_g8_1e-5_sin-0.1",
+    adapted_from="Qwen/Qwen2-0.5B",
     superseded_by=None,
 )
 Hum_Works__lodestone_base_4096_v1 = ModelMeta(
diff --git a/mteb/models/overview.py b/mteb/models/overview.py
index e5300c7323..661bf51782 100644
--- a/mteb/models/overview.py
+++ b/mteb/models/overview.py
@@ -47,6 +47,7 @@
     jasper_models,
     jina_clip,
     jina_models,
+    kalm_models,
     lens_models,
     linq_models,
     llm2clip_models,
@@ -122,6 +123,7 @@
     jasper_models,
     jina_models,
     jina_clip,
+    kalm_models,
     lens_models,
     linq_models,
     llm2clip_models,