embeddings-benchmark · Samoed · Jan 22, 2025 · Jan 9, 2025 · Jan 9, 2025 · Jan 11, 2025
diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py
@@ -84,3 +84,78 @@
     public_training_data=None,
     training_datasets=E5_TRAINING_DATA,
 )
+
+zeta_alpha_ai__Zeta_Alpha_E5_Mistral = ModelMeta(
+    loader=partial(  # type: ignore
+        instruct_wrapper,
+        model_name_or_path="zeta-alpha-ai/Zeta-Alpha-E5-Mistral",
+        instruction_template=E5_INSTRUCTION,
+        attn="cccc",
+        pooling_method="lasttoken",
+        mode="embedding",
+        torch_dtype=torch.bfloat16,
+        # NOTE: the ST script does not normalize while the HF one does, so it is unclear which is right; normalized here.
+        # https://huggingface.co/intfloat/e5-mistral-7b-instruct#transformers
+        normalized=True,
+    ),
+    name="zeta-alpha-ai/Zeta-Alpha-E5-Mistral",
+    revision="c791d37474fa6a5c72eb3a2522be346bc21fbfc3",
+    release_date="2024-08-30",
+    languages=["eng_Latn"],
+    n_parameters=7110660096,
+    max_tokens=32768.0,
+    embed_dim=4096,
+    license="mit",
+    open_weights=True,
+    public_training_data=None,
+    public_training_code=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/zeta-alpha-ai/Zeta-Alpha-E5-Mistral",
+    similarity_fn_name="cosine",
+    use_instructions=True,
+    training_datasets={
+        # Entries copied from the E5 training data.
+        # source: https://arxiv.org/pdf/2212.03533
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],  # TODO: unclear whether the dev split belongs here; confirm
+        # source: https://www.zeta-alpha.com/post/fine-tuning-an-llm-for-state-of-the-art-retrieval-zeta-alpha-s-top-10-submission-to-the-the-mteb-be
+        # "Arguana",
+        # "FEVER",
+        # "FIQA",
+        # "HotPotQA",
+        # "MsMarco (passage)",
+        # "NFCorpus",
+        # "SciFact",
+        # "NLI",
+        # "SQuad",
+        # "StackExchange",
+        # "TriviaQA",
+        # "SciRep",
+        # "SciRepEval",
+        # MTEB clustering tasks and their raw sources (left commented out, not registered as training data):
+        # https://huggingface.co/datasets/mteb/raw_arxiv
+        # "ArxivClusteringS2S": ["train"],
+        # "ArxivClusteringP2P": ["train"],
+        # https://huggingface.co/datasets/mteb/raw_biorxiv
+        # "BiorxivClusteringS2S": ["train"],
+        # "BiorxivClusteringP2P": ["train"],
+        # https://huggingface.co/datasets/mteb/raw_medrxiv
+        # "MedrxivClusteringS2S": ["train"],
+        # "MedrxivClusteringP2P": ["train"],
+        # MTEB tasks registered with their train splits as training data:
+        "AmazonCounterfactualClassification": ["train"],
+        "AmazonReviewsClassification": ["train"],
+        "Banking77Classification": ["train"],
+        "EmotionClassification": ["train"],
+        "MTOPIntentClassification": ["train"],
+        "ToxicConversationsClassification": ["train"],
+        "TweetSentimentExtractionClassification": ["train"],
+        "ImdbClassification": ["train"],
+        "STS12": ["train"],
+        "STS22": ["train"],
+        "STSBenchmark": ["train"],
+    },
+    adapted_from="intfloat/e5-mistral-7b-instruct",
+    superseded_by=None,
+)
diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py
@@ -1607,27 +1607,7 @@
     adapted_from="/workspace/v3-matryoshka_aubmindlab-bert-base-arabertv02-2024-10-12_13-55-06/checkpoint-26250",
     superseded_by=None,
 )
-zeta_alpha_ai__Zeta_Alpha_E5_Mistral = ModelMeta(
-    name="zeta-alpha-ai/Zeta-Alpha-E5-Mistral",
-    revision="3e6076bdc2ff592a2f95fbc04570e51db5aa0c0c",
-    release_date="2024-08-30",
-    languages=["eng_Latn"],
-    loader=None,
-    n_parameters=7110660096,
-    max_tokens=32768.0,
-    embed_dim=4096,
-    license="mit",
-    open_weights=True,
-    public_training_code=None,
-    public_training_data=None,
-    framework=["PyTorch"],
-    reference="https://huggingface.co/zeta-alpha-ai/Zeta-Alpha-E5-Mistral",
-    similarity_fn_name="cosine",
-    use_instructions=None,
-    training_datasets=None,
-    adapted_from="intfloat/e5-mistral-7b-instruct",
-    superseded_by=None,
-)
+
 sbert_chinese_general_v1 = ModelMeta(
     name="DMetaSoul/sbert-chinese-general-v1",
     revision="bd27765956bcc2fcf682de0097819947ac10037e",