Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,110 changes: 1,110 additions & 0 deletions mteb/abstasks/AbsTaskRTEB.py

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions mteb/abstasks/TaskMetadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@
"Summarization",
"InstructionRetrieval",
"Speed",
"RTEB",
) + MIEB_TASK_TYPE

TASK_TYPE = Literal[_TASK_TYPE]
Expand Down
1 change: 1 addition & 0 deletions mteb/abstasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .AbsTaskPairClassification import *
from .AbsTaskReranking import *
from .AbsTaskRetrieval import *
from .AbsTaskRTEB import *
from .AbsTaskSpeedTask import *
from .AbsTaskSTS import *
from .AbsTaskSummarization import *
Expand Down
43 changes: 43 additions & 0 deletions mteb/model_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
"ColBERT",
]
DISTANCE_METRICS = Literal["cosine", "max_sim", "dot"]
EMBEDDING_DTYPES = Literal["float32", "int8", "binary"]


def sentence_transformers_loader(
Expand All @@ -61,6 +62,34 @@ def get_loader_name(
return loader.__name__


def model_id(
model_name: str,
embd_dtype: str
| None, # Keep None here as input can still be None before default assignment
embd_dim: int | None,
) -> str:
# Handle potential None values passed to the function, even if the class attribute has a default
if model_name is None:
# Or handle appropriately, maybe raise error if name is critical for ID
model_name_part = "unknown_model"
else:
model_name_part = model_name.replace("/", "__")

dtype_str = embd_dtype if embd_dtype else "unknown_dtype"
dim_str = f"{embd_dim}d" if embd_dim else "unknown_dim"

# Check if default was used implicitly for dtype
if embd_dtype is None:
# If the class attribute defaults to 'float32', reflect that possibility if None is passed
# However, the class instance will have 'float32' if not specified.
# Let's assume the function should reflect the actual value passed or derived.
# If the intention is to always use the default if None is passed, adjust logic here.
# For now, stick to representing the input or lack thereof.
pass # dtype_str is already "unknown_dtype"

return f"{model_name_part}_{dtype_str}_{dim_str}"


class ModelMeta(BaseModel):
"""The model metadata object.

Expand All @@ -73,6 +102,7 @@ class ModelMeta(BaseModel):
max_tokens: The maximum number of tokens the model can handle. Can be None if the maximum number of tokens is not known (e.g. for proprietary
models).
embed_dim: The dimension of the embeddings produced by the model. Currently all models are assumed to produce fixed-size embeddings.
embd_dtype: The data type of the embeddings produced by the model (e.g., "float32", "int8", "binary"). Defaults to "float32".
revision: The revision number of the model. If None, it is assumed that the metadata (including the loader) is valid for all revisions of the model.
release_date: The date the model's revision was released.
license: The license under which the model is released. Required if open_weights is True.
Expand Down Expand Up @@ -119,6 +149,10 @@ class ModelMeta(BaseModel):
superseded_by: str | None = None
is_cross_encoder: bool | None = None
modalities: list[MODALITIES] = ["text"]
# Attribute merged from rteb/ebr/core/meta.py
embd_dtype: EMBEDDING_DTYPES = (
"float32" # Defaulting to float32 as requested, type hint updated
)

def to_dict(self):
dict_repr = self.model_dump()
Expand Down Expand Up @@ -276,6 +310,15 @@ def calculate_memory_usage_mb(self) -> int | None:
model_memory_mb = model_memory_bytes / MB
return round(model_memory_mb)

@property
def _id(self) -> str:
    """Unique identifier for this model, built from name, embd_dtype and embed_dim.

    Returns:
        The ID produced by :func:`model_id`.

    Raises:
        ValueError: If ``self.name`` is ``None`` — a name is mandatory for an ID.
    """
    if self.name is None:
        raise ValueError("Model name is required to generate an ID.")
    # embd_dtype defaults to "float32" on the class when not set explicitly.
    return model_id(self.name, self.embd_dtype, self.embed_dim)

def collect_similar_tasks(dataset: str, visited: set[str]) -> set[str]:
"""Recursively collect all similar tasks for a given dataset."""
Expand Down
25 changes: 25 additions & 0 deletions mteb/models/voyage_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,31 @@ def _batched_encode(
public_training_data=None,
)

# Metadata for Voyage AI's voyage-3-large embedding API model.
voyage_3_large = ModelMeta(
    name="voyageai/voyage-3-large",
    revision="1",  # NOTE(review): API models have no real revision; "1" is a placeholder
    release_date="2024-09-18",  # NOTE(review): copied from voyage-3 — confirm the actual voyage-3-large date
    languages=None,
    loader=partial(  # type: ignore
        VoyageWrapper,
        model_name="voyage-3-large",  # must match the Voyage API model name exactly
        model_prompts=model_prompts,
    ),
    max_tokens=32768,  # per reviewer correction (was assumed 32000 from voyage-3)
    embed_dim=1024,  # NOTE(review): assumed same as voyage-3 — verify against Voyage docs
    open_weights=False,
    n_parameters=None,
    memory_usage_mb=None,
    license=None,
    reference="https://blog.voyageai.com/2024/09/18/voyage-3/",  # NOTE(review): voyage-3 post; update if a dedicated voyage-3-large post exists
    similarity_fn_name="cosine",
    framework=["API"],
    use_instructions=True,
    training_datasets=VOYAGE_TRAINING_DATA,
    public_training_code=None,
    public_training_data=None,
)

voyage_3_lite = ModelMeta(
name="voyageai/voyage-3-lite",
revision="1",
Expand Down
47 changes: 47 additions & 0 deletions mteb/tasks/RTEB/RTEBAILACasedocsTask.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from __future__ import annotations

import logging

from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB

logger = logging.getLogger(__name__)


class RTEBAILACasedocs(AbsTaskRTEB):
    """RTEB retrieval task over the AILACasedocs legal dataset (English, nDCG@10)."""

    # Static task metadata, forwarded verbatim as keyword arguments to
    # AbsTaskRTEB.create_rteb_task_metadata() below.
    _TASK_SPECIFIC_METADATA = {
        "task_name": "RTEBAILACasedocs",
        "description": "RTEB evaluation for AILACasedocs dataset.",
        "reference": "https://zenodo.org/records/4063986",
        "dataset_path": "embedding-benchmark/AILACasedocs",
        "dataset_revision": "main",  # NOTE(review): "main" is a moving target — consider pinning a commit
        "main_score": "ndcg_at_10",
        "revision": "1.0.1",
        "date": None,  # Date not specified in dataset metadata
        "domains": ["Legal"],
        "task_subtypes": ["Article retrieval"],
        "annotations_creators": "derived",
        "license": "cc-by-4.0",  # Standardized license format
        "bibtex_citation": """@dataset{paheli_bhattacharya_2020_4063986,
author = {Paheli Bhattacharya and
Kripabandhu Ghosh and
Saptarshi Ghosh and
Arindam Pal and
Parth Mehta and
Arnab Bhattacharya and
Prasenjit Majumder},
title = {AILA 2019 Precedent & Statute Retrieval Task},
month = oct,
year = 2020,
publisher = {Zenodo},
doi = {10.5281/zenodo.4063986},
url = {https://doi.org/10.5281/zenodo.4063986}
}""",
        "modalities": ["text"],
        "eval_langs": ["eng-Latn"],
    }

    metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA)

    def __init__(self, **kwargs):
        # Bind this task to the AILACasedocs RTEB dataset; remaining kwargs
        # pass through to the AbsTaskRTEB base class.
        super().__init__(rteb_dataset_name="AILACasedocs", **kwargs)
46 changes: 46 additions & 0 deletions mteb/tasks/RTEB/RTEBAILAStatutesTask.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from __future__ import annotations

import logging

from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB

logger = logging.getLogger(__name__)


class RTEBAILAStatutes(AbsTaskRTEB):
    """RTEB retrieval task over the AILAStatutes legal dataset (English, nDCG@10)."""

    # Static task metadata, forwarded verbatim as keyword arguments to
    # AbsTaskRTEB.create_rteb_task_metadata() below.
    _TASK_SPECIFIC_METADATA = {
        "task_name": "RTEBAILAStatutes",
        "description": "RTEB evaluation for AILAStatutes dataset.",
        "reference": "https://zenodo.org/records/4063986",
        "dataset_path": "embedding-benchmark/AILAStatutes",
        "dataset_revision": "main",  # NOTE(review): "main" is a moving target — consider pinning a commit
        "main_score": "ndcg_at_10",
        "revision": "1.0.1",
        "date": None,  # Date not specified in dataset metadata
        "domains": ["Legal"],
        "task_subtypes": ["Article retrieval"],
        "annotations_creators": "derived",
        "license": "cc-by-4.0",  # Standardized license format
        "bibtex_citation": """@dataset{paheli_bhattacharya_2020_4063986,
author = {Paheli Bhattacharya and
Kripabandhu Ghosh and
Saptarshi Ghosh and
Arindam Pal and
Parth Mehta and
Arnab Bhattacharya and
Prasenjit Majumder},
title = {AILA 2019 Precedent & Statute Retrieval Task},
month = oct,
year = 2020,
publisher = {Zenodo},
doi = {10.5281/zenodo.4063986},
url = {https://doi.org/10.5281/zenodo.4063986}
}""",
        "modalities": ["text"],
        "eval_langs": ["eng-Latn"],
    }

    metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA)

    def __init__(self, **kwargs):
        # Bind this task to the AILAStatutes RTEB dataset; remaining kwargs
        # pass through to the AbsTaskRTEB base class.
        super().__init__(rteb_dataset_name="AILAStatutes", **kwargs)
37 changes: 37 additions & 0 deletions mteb/tasks/RTEB/RTEBAPPSTask.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from __future__ import annotations

import logging

from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB

logger = logging.getLogger(__name__)


class RTEBAPPS(AbsTaskRTEB):
    """RTEB code-retrieval task over the APPS programming-problems dataset (nDCG@10)."""

    # Static task metadata, forwarded verbatim as keyword arguments to
    # AbsTaskRTEB.create_rteb_task_metadata() below.
    _TASK_SPECIFIC_METADATA = {
        "task_name": "RTEBAPPS",
        "description": "RTEB evaluation for APPS dataset.",
        "reference": "https://arxiv.org/abs/2105.09938",
        "dataset_path": "embedding-benchmark/APPS",
        "dataset_revision": "main",  # NOTE(review): "main" is a moving target — consider pinning a commit
        "main_score": "ndcg_at_10",
        "revision": "1.0.1",
        "date": ("2021-05-20", "2021-05-20"),  # arXiv submission date of the APPS paper
        "task_subtypes": ["Code retrieval"],
        "license": "mit",
        "annotations_creators": "derived",
        "text_creation": "found",
        "bibtex_citation": """@article{hendrycksapps2021,
title={Measuring Coding Challenge Competence With APPS},
author={Dan Hendrycks and Steven Basart and Saurav Kadavath and Mantas Mazeika and Akul Arora and Ethan Guo and Collin Burns and Samir Puranik and Horace He and Dawn Song and Jacob Steinhardt},
journal={NeurIPS},
year={2021}
}""",
        "modalities": ["text"],
        "dialect": [],
    }

    metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA)

    def __init__(self, **kwargs):
        # Bind this task to the APPS RTEB dataset; remaining kwargs pass
        # through to the AbsTaskRTEB base class.
        super().__init__(rteb_dataset_name="APPS", **kwargs)
49 changes: 49 additions & 0 deletions mteb/tasks/RTEB/RTEBCOVID_QATask.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from __future__ import annotations

import logging

from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB

logger = logging.getLogger(__name__)


class RTEBCOVID_QA(AbsTaskRTEB):
    """RTEB question-answering retrieval task over the COVID-QA medical dataset (nDCG@10)."""

    # Static task metadata, forwarded verbatim as keyword arguments to
    # AbsTaskRTEB.create_rteb_task_metadata() below.
    _TASK_SPECIFIC_METADATA = {
        "task_name": "RTEBCOVID_QA",
        "description": "RTEB evaluation for COVID_QA dataset.",
        "reference": "https://aclanthology.org/2020.nlpcovid19-acl.18/",
        "dataset_path": "embedding-benchmark/COVID_QA",
        "dataset_revision": "main",  # NOTE(review): "main" is a moving target — consider pinning a commit
        "main_score": "ndcg_at_10",
        "revision": "1.0.1",
        "date": ("2020-01-01", "2020-12-31"),
        "domains": ["Medical"],
        "task_subtypes": ["Question answering"],
        "license": "apache-2.0",
        "annotations_creators": "expert-annotated",
        "text_creation": "found",
        # Raw string so LaTeX escapes (\" and \u) reach BibTeX verbatim; in the
        # previous plain string, \" silently collapsed to a bare quote and
        # corrupted the umlauts (only the double-escaped \\u survived intact).
        # NOTE(review): the author list looks inconsistent with the published
        # COVID-QA paper — verify against the ACL Anthology entry.
        "bibtex_citation": r"""@inproceedings{moller-etal-2020-covid,
title = "{COVID}-QA: A Question Answering Dataset for {COVID}-19",
author = "M{\"o}ller, Erik and
Brasch, Malte and
Eger, Steffen and
{\"U}z{\"u}mc{\"u}o{\u{g}}lu, Hakan and
Reimers, Nils and
Gurevych, Iryna",
booktitle = "Proceedings of the 1st Workshop on NLP for COVID-19 (part 2) at ACL 2020",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.nlpcovid19-acl.18",
doi = "10.18653/v1/2020.nlpcovid19-acl.18",
pages = "145--152",
abstract = "We present COVID-QA, a Question Answering dataset consisting of 2,019 question/answer pairs annotated by volunteer biomedical experts on scientific articles about COVID-19. The dataset is designed to be challenging for current QA systems, as it requires reasoning over multiple sentences and paragraphs. We provide baseline results using several state-of-the-art QA models and analyze their performance.",
}""",
        "modalities": ["text"],
    }

    metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA)

    def __init__(self, **kwargs):
        # Bind this task to the COVID_QA RTEB dataset; remaining kwargs pass
        # through to the AbsTaskRTEB base class.
        super().__init__(rteb_dataset_name="COVID_QA", **kwargs)
44 changes: 44 additions & 0 deletions mteb/tasks/RTEB/RTEBChatDoctor_HealthCareMagicTask.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from __future__ import annotations

import logging

from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB

logger = logging.getLogger(__name__)


class RTEBChatDoctor_HealthCareMagic(AbsTaskRTEB):
    """RTEB retrieval task over the ChatDoctor/HealthCareMagic medical-dialogue dataset (nDCG@10)."""

    # Static task metadata, forwarded verbatim as keyword arguments to
    # AbsTaskRTEB.create_rteb_task_metadata() below.
    _TASK_SPECIFIC_METADATA = {
        "task_name": "RTEBChatDoctor_HealthCareMagic",
        "description": "RTEB evaluation for ChatDoctor_HealthCareMagic dataset.",
        "reference": "https://github.com/Kent0n-Li/ChatDoctor",
        "dataset_path": "embedding-benchmark/ChatDoctor_HealthCareMagic",
        "dataset_revision": "main",  # NOTE(review): "main" is a moving target — consider pinning a commit
        "main_score": "ndcg_at_10",
        "revision": "1.0.1",
        "date": ("2023-06-24", "2023-06-24"),
        "task_subtypes": [],
        "license": "cc-by-4.0",
        "annotations_creators": "derived",
        "text_creation": "found",
        "bibtex_citation": """@article{Li2023ChatDoctor,
author = {Li, Yunxiang and Li, Zihan and Zhang, Kai and Dan, Ruilong and Jiang, Steve and Zhang, You},
title = {ChatDoctor: A Medical Chat Model Fine-Tuned on a Large Language Model Meta-AI (LLaMA) Using Medical Domain Knowledge},
journal = {Cureus},
year = {2023},
volume = {15},
number = {6},
pages = {e40895},
doi = {10.7759/cureus.40895}
}""",
        "modalities": ["text"],
        "dialect": [],
    }

    metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA)

    def __init__(self, **kwargs):
        # Bind this task to the ChatDoctor_HealthCareMagic RTEB dataset;
        # remaining kwargs pass through to the AbsTaskRTEB base class.
        super().__init__(
            rteb_dataset_name="ChatDoctor_HealthCareMagic",
            **kwargs,
        )
36 changes: 36 additions & 0 deletions mteb/tasks/RTEB/RTEBConvFinQATask.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from __future__ import annotations

import logging

from mteb.abstasks.AbsTaskRTEB import AbsTaskRTEB

logger = logging.getLogger(__name__)


class RTEBConvFinQA(AbsTaskRTEB):
    """RTEB question-answering retrieval task over the ConvFinQA finance dataset (nDCG@10)."""

    # Static task metadata, forwarded verbatim as keyword arguments to
    # AbsTaskRTEB.create_rteb_task_metadata() below.
    _TASK_SPECIFIC_METADATA = {
        "task_name": "RTEBConvFinQA",
        "description": "RTEB evaluation for ConvFinQA dataset.",
        "reference": "https://github.com/czyssrs/ConvFinQA",
        "dataset_path": "embedding-benchmark/ConvFinQA",
        "dataset_revision": "main",  # NOTE(review): "main" is a moving target — consider pinning a commit
        "main_score": "ndcg_at_10",
        "revision": "1.0.1",
        "date": ("2022-10-07", "2022-10-07"),  # arXiv submission date of the ConvFinQA paper
        "task_subtypes": ["Question answering"],
        "license": "mit",
        "annotations_creators": "derived",
        "text_creation": "found",
        "bibtex_citation": """@article{chen2022convfinqa,
title={ConvFinQA: Exploring the Chain of Numerical Reasoning in Conversational Finance Question Answering},
author={Chen, Zhiyu and Chen, Wenhu and Wang, Chuhan and Zhang, Xinyi and Zhang, Yuchi and Smrz, Pavel and Yu, Xiangyu and Fung, Pascale},
journal={arXiv preprint arXiv:2210.03849},
year={2022}
}""",
        "modalities": ["text"],
    }

    metadata = AbsTaskRTEB.create_rteb_task_metadata(**_TASK_SPECIFIC_METADATA)

    def __init__(self, **kwargs):
        # Bind this task to the ConvFinQA RTEB dataset; remaining kwargs pass
        # through to the AbsTaskRTEB base class.
        super().__init__(rteb_dataset_name="ConvFinQA", **kwargs)
Loading