diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py
index c89b64fc72..3eed189d33 100644
--- a/mteb/models/e5_instruct.py
+++ b/mteb/models/e5_instruct.py
@@ -84,3 +84,78 @@
     public_training_data=None,
     training_datasets=E5_TRAINING_DATA,
 )
+
+zeta_alpha_ai__Zeta_Alpha_E5_Mistral = ModelMeta(
+    loader=partial(  # type: ignore
+        instruct_wrapper,
+        model_name_or_path="zeta-alpha-ai/Zeta-Alpha-E5-Mistral",
+        instruction_template=E5_INSTRUCTION,
+        attn="cccc",
+        pooling_method="lasttoken",
+        mode="embedding",
+        torch_dtype=torch.bfloat16,
+        # The ST script does not normalize while the HF one does so unclear what to do
+        # https://huggingface.co/intfloat/e5-mistral-7b-instruct#transformers
+        normalized=True,
+    ),
+    name="zeta-alpha-ai/Zeta-Alpha-E5-Mistral",
+    revision="c791d37474fa6a5c72eb3a2522be346bc21fbfc3",
+    release_date="2024-08-30",
+    languages=["eng_Latn"],
+    n_parameters=7110660096,
+    max_tokens=32768.0,
+    embed_dim=4096,
+    license="mit",
+    open_weights=True,
+    public_training_data=None,
+    public_training_code=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/zeta-alpha-ai/Zeta-Alpha-E5-Mistral",
+    similarity_fn_name="cosine",
+    use_instructions=True,
+    training_datasets={
+        # copied from e5
+        # source: https://arxiv.org/pdf/2212.03533
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],  # dev?
+        # source: https://www.zeta-alpha.com/post/fine-tuning-an-llm-for-state-of-the-art-retrieval-zeta-alpha-s-top-10-submission-to-the-the-mteb-be
+        # "Arguana",
+        # "FEVER",
+        # "FIQA",
+        # "HotPotQA",
+        # "MsMarco (passage)",
+        # "NFCorpus",
+        # "SciFact",
+        # "NLI",
+        # "SQuad",
+        # "StackExchange",
+        # "TriviaQA",
+        # "SciRep",
+        # "SciRepEval"
+        # mteb
+        # https://huggingface.co/datasets/mteb/raw_arxiv
+        # "ArxivClusteringS2S": ["train"],
+        # "ArxivClusteringP2P": ["train"],
+        # https://huggingface.co/datasets/mteb/raw_biorxiv
+        # "BiorxivClusteringS2S": ["train"],
+        # "BiorxivClusteringP2P": ["train"],
+        # https://huggingface.co/datasets/mteb/raw_medrxiv
+        # "MedrxivClusteringS2S": ["train"],
+        # "MedrxivClusteringP2P": ["train"],
+        # as their train datasets
+        "AmazonCounterfactualClassification": ["train"],
+        "AmazonReviewsClassification": ["train"],
+        "Banking77Classification": ["train"],
+        "EmotionClassification": ["train"],
+        "MTOPIntentClassification": ["train"],
+        "ToxicConversationsClassification": ["train"],
+        "TweetSentimentExtractionClassification": ["train"],
+        "ImdbClassification": ["train"],
+        "STS12": ["train"],
+        "STS22": ["train"],
+        "STSBenchmark": ["train"],
+    },
+    adapted_from="intfloat/e5-mistral-7b-instruct",
+    superseded_by=None,
+)
diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py
index ba6e3e8163..bf41d3cdba 100644
--- a/mteb/models/misc_models.py
+++ b/mteb/models/misc_models.py
@@ -1607,27 +1607,7 @@
     adapted_from="/workspace/v3-matryoshka_aubmindlab-bert-base-arabertv02-2024-10-12_13-55-06/checkpoint-26250",
     superseded_by=None,
 )
-zeta_alpha_ai__Zeta_Alpha_E5_Mistral = ModelMeta(
-    name="zeta-alpha-ai/Zeta-Alpha-E5-Mistral",
-    revision="3e6076bdc2ff592a2f95fbc04570e51db5aa0c0c",
-    release_date="2024-08-30",
-    languages=["eng_Latn"],
-    loader=None,
-    n_parameters=7110660096,
-    max_tokens=32768.0,
-    embed_dim=4096,
-    license="mit",
-    open_weights=True,
-    public_training_code=None,
-    public_training_data=None,
-    framework=["PyTorch"],
-    reference="https://huggingface.co/zeta-alpha-ai/Zeta-Alpha-E5-Mistral",
-    similarity_fn_name="cosine",
-    use_instructions=None,
-    training_datasets=None,
-    adapted_from="intfloat/e5-mistral-7b-instruct",
-    superseded_by=None,
-)
+
 sbert_chinese_general_v1 = ModelMeta(
     name="DMetaSoul/sbert-chinese-general-v1",
     revision="bd27765956bcc2fcf682de0097819947ac10037e",