embeddings-benchmark · KennethEnevoldsen · Oct 26, 2024 · Oct 24, 2024 · Oct 24, 2024 · Oct 24, 2024
diff --git a/mteb/load_results/benchmark_results.py b/mteb/load_results/benchmark_results.py
@@ -48,11 +48,11 @@ def filter_tasks(
                 continue
             if languages is not None:
                 task_languages = task_result.languages
-                if not any([lang in task_languages for lang in languages]):
+                if not any(lang in task_languages for lang in languages):
                     continue
             if domains is not None:
                 task_domains = task_result.domains
-                if not any([domain in task_domains for domain in domains]):
+                if not any(domain in task_domains for domain in domains):
                     continue
             if (task_types is not None) and (task_result.task_type not in task_types):
                 continue
@@ -100,7 +100,7 @@ def get_scores(
         if format == "long":
             entries = []
             for task_res in self.task_results:
-                entry = dict(
+                entry = dict(  # noqa
                     model_name=self.model_name,
                     model_revision=self.model_revision,
                     task_name=task_res.task_name,
@@ -140,7 +140,7 @@ def domains(self) -> list[str]:
 
     @property
     def task_types(self) -> list[str]:
-        return list(set([task_res.task_type for task_res in self.task_results]))
+        return list({task_res.task_type for task_res in self.task_results})
 
     @property
     def task_names(self) -> list[str]:

diff --git a/mteb/load_results/load_results.py b/mteb/load_results/load_results.py
@@ -103,9 +103,6 @@ def load_results(
             extract the model name and revision from the path.
         validate_and_filter: If True it will validate that the results object for the task contains the correct splits and filter out
             splits from the results object that are not default in the task metadata. Defaults to True.
-
-    Returns:
-
     """
     repo_directory = download_of_results(results_repo, download_latest=download_latest)
     model_paths = [p for p in (repo_directory / "results").glob("*") if p.is_dir()]

diff --git a/mteb/model_meta.py b/mteb/model_meta.py
@@ -1,24 +1,32 @@
 from __future__ import annotations
 
-from datetime import date
+import logging
 from functools import partial
-from typing import TYPE_CHECKING, Annotated, Any, Callable, Literal
+from typing import TYPE_CHECKING, Any, Callable, Literal
 
-from pydantic import BaseModel, BeforeValidator, TypeAdapter
+from pydantic import BaseModel
 
+from mteb.abstasks.TaskMetadata import STR_DATE, STR_URL
 from mteb.encoder_interface import Encoder
 
 from .languages import ISO_LANGUAGE_SCRIPT
 
 if TYPE_CHECKING:
     from .models.sentence_transformer_wrapper import SentenceTransformerWrapper
 
-Frameworks = Literal["Sentence Transformers", "PyTorch"]
+logger = logging.getLogger(__name__)
 
-pastdate_adapter = TypeAdapter(date)
-STR_DATE = Annotated[
-    str, BeforeValidator(lambda value: str(pastdate_adapter.validate_python(value)))
-]  # Allows the type to be a string, but ensures that the string is a valid date
+
+FRAMEWORKS = Literal[  # allowed entries of ModelMeta.framework
+    "Sentence Transformers",
+    "PyTorch",
+    "GritLM",
+    "LLM2Vec",
+    "TensorFlow",
+    "API",
+    "Tevatron",
+]
+DISTANCE_METRICS = Literal["cosine"]  # allowed values of ModelMeta.similarity_fn_name
 
 
 def sentence_transformers_loader(
@@ -53,12 +61,19 @@ class ModelMeta(BaseModel):
         embed_dim: The dimension of the embeddings produced by the model. Currently all models are assumed to produce fixed-size embeddings.
         revision: The revision number of the model. If None it is assumed that the metadata (including the loader) is valid for all revisions of the model.
         release_date: The date the model's revision was released.
-        license: The license under which the model is released. Required if open_source is True.
-        open_source: Whether the model is open source or proprietary.
+        license: The license under which the model is released. Required if open_weights is True.
+        open_weights: Whether the model weights are publicly available (open weights) or the model is proprietary.
+        public_training_data: Whether the training data used to train the model is publicly available.
+        public_training_code: Whether the code used to train the model is publicly available.
         similarity_fn_name: The distance metric used by the model.
         framework: The framework the model is implemented in, can be a list of frameworks e.g. `["Sentence Transformers", "PyTorch"]`.
+        reference: A URL to the model's page on huggingface or another source.
         languages: The languages the model is intended for specified as a 3 letter language code followed by a script code e.g. "eng-Latn" for English
             in the Latin script.
+        use_instuctions: Whether the model uses instructions, e.g. for prompt-based models. This also includes models that require a specific format for
+            input such as "query: {document}" or "passage: {document}".
+        zero_shot_benchmarks: A list of benchmarks on which the model has been evaluated in a zero-shot setting. By default we assume that all models
+            are evaluated non-zero-shot unless specified otherwise.
     """
 
     name: str | None
@@ -71,9 +86,14 @@ class ModelMeta(BaseModel):
     max_tokens: int | None = None
     embed_dim: int | None = None
     license: str | None = None
-    open_source: bool | None = None
-    similarity_fn_name: str | None = None
-    framework: list[Frameworks] = []
+    open_weights: bool | None = None
+    public_training_data: bool | None = None
+    public_training_code: bool | None = None
+    framework: list[FRAMEWORKS] = []
+    reference: STR_URL | None = None
+    similarity_fn_name: DISTANCE_METRICS | None = None
+    use_instuctions: bool | None = None
+    zero_shot_benchmarks: list[str] | None = None
 
     def to_dict(self):
         dict_repr = self.model_dump()
@@ -83,6 +103,9 @@ def to_dict(self):
 
     def load_model(self, **kwargs: Any) -> Encoder:
         if self.loader is None:
+            logger.warning(
+                f"Loader not specified for model {self.name}, loading using sentence transformers."
+            )
             loader = partial(
                 sentence_transformers_loader,
                 model_name=self.name,

diff --git a/mteb/models/__init__.py b/mteb/models/__init__.py
@@ -1,3 +1,24 @@
 from __future__ import annotations
 
-from mteb.models.overview import *
+import logging
+
+from mteb.models.overview import (
+    MODEL_REGISTRY,
+    ModelMeta,
+    get_model,
+    get_model_meta,
+    get_model_metas,
+    model_meta_from_sentence_transformers,
+)
+
+logger = logging.getLogger(__name__)
+
+
+__all__ = [
+    "MODEL_REGISTRY",
+    "ModelMeta",
+    "get_model",
+    "get_model_meta",
+    "get_model_metas",
+    "model_meta_from_sentence_transformers",
+]
diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py
@@ -4,8 +4,17 @@
 
 arctic_m_v1_5 = ModelMeta(
     name="Snowflake/snowflake-arctic-embed-m-v1.5",
-    languages=["eng_Latn"],
-    open_source=True,
     revision="97eab2e17fcb7ccb8bb94d6e547898fa1a6a0f47",
     release_date="2024-07-08",  # initial commit of hf model.
+    languages=["eng_Latn"],
+    open_weights=True,
+    framework=["Sentence Transformers", "PyTorch"],
+    n_parameters=109_000_000,
+    memory_usage=None,
+    max_tokens=512,
+    embed_dim=256,
+    license="apache-2.0",
+    reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5",
+    similarity_fn_name="cosine",  # must be a DISTANCE_METRICS literal; "cosine_similarity" fails ModelMeta validation
+    use_instuctions=False,
 )
diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py
@@ -15,9 +15,18 @@
     ),
     name="BAAI/bge-small-en-v1.5",
     languages=["eng_Latn"],
-    open_source=True,
+    open_weights=True,
     revision="5c38ec7c405ec4b44b94cc5a9bb96e735b38267a",
     release_date="2023-09-12",  # initial commit of hf model.
+    n_parameters=24_000_000,
+    memory_usage=None,
+    embed_dim=512,
+    license="mit",
+    max_tokens=512,
+    reference="https://huggingface.co/BAAI/bge-small-en-v1.5",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instuctions=False,
 )
 
 bge_base_en_v1_5 = ModelMeta(
@@ -29,9 +38,18 @@
     ),
     name="BAAI/bge-base-en-v1.5",
     languages=["eng_Latn"],
-    open_source=True,
+    open_weights=True,
     revision="a5beb1e3e68b9ab74eb54cfd186867f64f240e1a",
     release_date="2023-09-11",  # initial commit of hf model.
+    n_parameters=438_000_000,
+    memory_usage=None,
+    embed_dim=768,
+    license="mit",
+    max_tokens=512,
+    reference="https://huggingface.co/BAAI/bge-base-en-v1.5",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instuctions=False,
 )
 
 bge_large_en_v1_5 = ModelMeta(
@@ -43,7 +61,16 @@
     ),
     name="BAAI/bge-large-en-v1.5",
     languages=["eng_Latn"],
-    open_source=True,
+    open_weights=True,
     revision="d4aa6901d3a41ba39fb536a557fa166f842b0e09",
     release_date="2023-09-12",  # initial commit of hf model.
+    n_parameters=1_340_000_000,
+    memory_usage=None,
+    embed_dim=1024,
+    license="mit",
+    max_tokens=512,
+    reference="https://huggingface.co/BAAI/bge-large-en-v1.5",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instuctions=False,
 )
diff --git a/mteb/models/bm25.py b/mteb/models/bm25.py
@@ -117,7 +117,16 @@ def encode(self, texts: list[str], **kwargs):
     loader=partial(bm25_loader, model_name="bm25s"),  # type: ignore
     name="bm25s",
     languages=["eng_Latn"],
-    open_source=True,
+    open_weights=True,
     revision="0_1_10",
     release_date="2024-07-10",  ## release of version 0.1.10
+    n_parameters=None,
+    memory_usage=None,
+    embed_dim=None,
+    license=None,
+    max_tokens=None,
+    reference=None,
+    similarity_fn_name=None,
+    framework=[],
+    use_instuctions=False,
 )
diff --git a/mteb/models/cache_wrapper.py b/mteb/models/cache_wrapper.py
@@ -4,7 +4,7 @@
 import json
 import logging
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
+from typing import Any
 
 import numpy as np
 import torch
@@ -21,17 +21,17 @@
 class TextVectorMap:
     def __init__(
         self,
-        directory: Union[str | Path],
+        directory: str | Path,
         initial_vectors: int = 100000,
     ):
         self.directory = Path(directory)
         self.directory.mkdir(parents=True, exist_ok=True)
         self.vectors_file = self.directory / "vectors.npy"
         self.index_file = self.directory / "index.json"
         self.dimension_file = self.directory / "dimension"
-        self.hash_to_index: Dict[str, int] = {}
-        self.vectors: Optional[np.memmap] = None
-        self.vector_dim: Optional[int] = None
+        self.hash_to_index: dict[str, int] = {}
+        self.vectors: np.memmap | None = None
+        self.vector_dim: int | None = None
         self.initial_vectors = initial_vectors
         logger.info(f"Initialized TextVectorMap in directory: {self.directory}")
         self._initialize_vectors_file()
@@ -141,7 +141,7 @@ def save(self) -> None:
             logger.error(f"Error saving TextVectorMap: {str(e)}")
             raise
 
-    def load(self, name: str = None) -> None:
+    def load(self, name: str | None = None) -> None:
         name_details = name if name else ""
         try:
             self._load_dimension()
@@ -176,7 +176,7 @@ def load(self, name: str = None) -> None:
             logger.error(f"Error loading TextVectorMap ({name_details}): {str(e)}")
             raise
 
-    def get_vector(self, text: str) -> Optional[np.ndarray]:
+    def get_vector(self, text: str) -> np.ndarray | None:
         try:
             text_hash = self._hash_text(text)
             if text_hash not in self.hash_to_index:
@@ -203,7 +203,7 @@ def close(self):
 
 
 class CachedEmbeddingWrapper(Wrapper, Encoder):
-    def __init__(self, model: Encoder, cache_path: Union[str | Path]):
+    def __init__(self, model: Encoder, cache_path: str | Path):
         self._model = model
         self.cache_path = Path(cache_path)
         self.cache_path.mkdir(parents=True, exist_ok=True)
@@ -217,7 +217,7 @@ def __init__(self, model: Encoder, cache_path: Union[str | Path]):
 
         logger.info("Initialized CachedEmbeddingWrapper")
 
-    def encode(self, texts: List[str], batch_size: int = 32, **kwargs) -> np.ndarray:
+    def encode(self, texts: list[str], batch_size: int = 32, **kwargs) -> np.ndarray:
         """Encode texts using the wrapped model, with caching"""
         try:
             results = []
@@ -282,7 +282,7 @@ def __getattr__(self, name: str) -> Any:
                     f"has attribute '{name}'"
                 )
 
-    def __dir__(self) -> List[str]:
+    def __dir__(self) -> list[str]:
         """Return all attributes from both this class and the wrapped model"""
         return list(set(super().__dir__() + dir(self._model)))
 

diff --git a/mteb/models/cohere_models.py b/mteb/models/cohere_models.py
@@ -83,7 +83,7 @@ def encode(
     ),
     name="embed-multilingual-v3.0",
     languages=[],  # Unknown, but support >100 languages
-    open_source=False,
+    open_weights=False,
     revision="1",
     release_date="2023-11-02",
     n_parameters=None,
@@ -92,7 +92,8 @@ def encode(
     embed_dim=1024,
     license=None,
     similarity_fn_name="cosine",
-    framework=[],
+    framework=["API"],
+    use_instuctions=False,
 )
 
 cohere_eng_3 = ModelMeta(
@@ -103,7 +104,7 @@ def encode(
     ),
     name="embed-english-v3.0",
     languages=["eng-Latn"],
-    open_source=False,
+    open_weights=False,
     revision="1",
     release_date="2023-11-02",
     n_parameters=None,
@@ -112,5 +113,6 @@ def encode(
     embed_dim=1024,
     license=None,
     similarity_fn_name="cosine",
-    framework=[],
+    framework=["API"],
+    use_instuctions=False,
 )