Skip to content
8 changes: 4 additions & 4 deletions mteb/load_results/benchmark_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,11 @@ def filter_tasks(
continue
if languages is not None:
task_languages = task_result.languages
if not any([lang in task_languages for lang in languages]):
if not any(lang in task_languages for lang in languages):
continue
if domains is not None:
task_domains = task_result.domains
if not any([domain in task_domains for domain in domains]):
if not any(domain in task_domains for domain in domains):
continue
if (task_types is not None) and (task_result.task_type not in task_types):
continue
Expand Down Expand Up @@ -100,7 +100,7 @@ def get_scores(
if format == "long":
entries = []
for task_res in self.task_results:
entry = dict(
entry = dict( # noqa
model_name=self.model_name,
model_revision=self.model_revision,
task_name=task_res.task_name,
Expand Down Expand Up @@ -140,7 +140,7 @@ def domains(self) -> list[str]:

@property
def task_types(self) -> list[str]:
return list(set([task_res.task_type for task_res in self.task_results]))
return list({task_res.task_type for task_res in self.task_results})

@property
def task_names(self) -> list[str]:
Expand Down
3 changes: 0 additions & 3 deletions mteb/load_results/load_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,6 @@ def load_results(
extract the model name and revision from the path.
validate_and_filter: If True it will validate that the results object for the task contains the correct splits and filter out
splits from the results object that are not default in the task metadata. Defaults to True.

Returns:

"""
repo_directory = download_of_results(results_repo, download_latest=download_latest)
model_paths = [p for p in (repo_directory / "results").glob("*") if p.is_dir()]
Expand Down
49 changes: 36 additions & 13 deletions mteb/model_meta.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,32 @@
from __future__ import annotations

from datetime import date
import logging
from functools import partial
from typing import TYPE_CHECKING, Annotated, Any, Callable, Literal
from typing import TYPE_CHECKING, Any, Callable, Literal

from pydantic import BaseModel, BeforeValidator, TypeAdapter
from pydantic import BaseModel

from mteb.abstasks.TaskMetadata import STR_DATE, STR_URL
from mteb.encoder_interface import Encoder

from .languages import ISO_LANGUAGE_SCRIPT

if TYPE_CHECKING:
from .models.sentence_transformer_wrapper import SentenceTransformerWrapper

Frameworks = Literal["Sentence Transformers", "PyTorch"]
logger = logging.getLogger(__name__)

pastdate_adapter = TypeAdapter(date)
STR_DATE = Annotated[
str, BeforeValidator(lambda value: str(pastdate_adapter.validate_python(value)))
] # Allows the type to be a string, but ensures that the string is a valid date

# Closed set of framework labels accepted in `ModelMeta.framework`.
FRAMEWORKS = Literal[
    "Sentence Transformers", "PyTorch", "GritLM", "LLM2Vec",
    "TensorFlow", "API", "Tevatron",
]

# Closed set of similarity functions accepted in `ModelMeta.similarity_fn_name`.
DISTANCE_METRICS = Literal["cosine"]


def sentence_transformers_loader(
Expand Down Expand Up @@ -53,12 +61,19 @@ class ModelMeta(BaseModel):
embed_dim: The dimension of the embeddings produced by the model. Currently all models are assumed to produce fixed-size embeddings.
revision: The revision number of the model. If None it is assumed that the metadata (including the loader) is valid for all revisions of the model.
release_date: The date the model's revision was released.
license: The license under which the model is released. Required if open_source is True.
open_source: Whether the model is open source or proprietary.
license: The license under which the model is released. Required if open_weights is True.
open_weights: Whether the model's weights are publicly available (True) or proprietary (False).
public_training_data: Whether the training data used to train the model is publicly available.
public_training_code: Whether the code used to train the model is publicly available.
similarity_fn_name: The distance metric used by the model.
framework: The framework the model is implemented in, can be a list of frameworks e.g. `["Sentence Transformers", "PyTorch"]`.
reference: A URL to the model's page on huggingface or another source.
languages: The languages the model is intended for specified as a 3 letter language code followed by a script code e.g. "eng-Latn" for English
in the Latin script.
use_instuctions: Whether the model uses instructions, e.g. for prompt-based models. This also includes models that require a specific format for
input such as "query: {document}" or "passage: {document}".
zero_shot_benchmarks: A list of benchmarks on which the model has been evaluated in a zero-shot setting. By default we assume that all models
are evaluated non-zero-shot unless specified otherwise.
"""

name: str | None
Expand All @@ -71,9 +86,14 @@ class ModelMeta(BaseModel):
max_tokens: int | None = None
embed_dim: int | None = None
license: str | None = None
open_source: bool | None = None
similarity_fn_name: str | None = None
framework: list[Frameworks] = []
open_weights: bool | None = None
public_training_data: bool | None = None
public_training_code: bool | None = None
framework: list[FRAMEWORKS] = []
reference: STR_URL | None = None
similarity_fn_name: DISTANCE_METRICS | None = None
use_instuctions: bool | None = None
zero_shot_benchmarks: list[str] | None = None

def to_dict(self):
dict_repr = self.model_dump()
Expand All @@ -83,6 +103,9 @@ def to_dict(self):

def load_model(self, **kwargs: Any) -> Encoder:
if self.loader is None:
logger.warning(
f"Loader not specified for model {self.name}, loading using sentence transformers."
)
loader = partial(
sentence_transformers_loader,
model_name=self.name,
Expand Down
23 changes: 22 additions & 1 deletion mteb/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,24 @@
from __future__ import annotations

from mteb.models.overview import *
import logging

from mteb.models.overview import (
MODEL_REGISTRY,
ModelMeta,
get_model,
get_model_meta,
get_model_metas,
model_meta_from_sentence_transformers,
)

logger = logging.getLogger(__name__)


# Public API of `mteb.models`: the names imported above from
# `mteb.models.overview`. Listing them explicitly limits what
# `from mteb.models import *` exposes to exactly these names.
__all__ = [
"MODEL_REGISTRY",
"ModelMeta",
"get_model",
"get_model_meta",
"get_model_metas",
"model_meta_from_sentence_transformers",
]
13 changes: 11 additions & 2 deletions mteb/models/arctic_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,17 @@

# Metadata for the Snowflake arctic-embed-m-v1.5 model (English, Latin script).
arctic_m_v1_5 = ModelMeta(
    name="Snowflake/snowflake-arctic-embed-m-v1.5",
    revision="97eab2e17fcb7ccb8bb94d6e547898fa1a6a0f47",
    release_date="2024-07-08",  # initial commit of hf model.
    languages=["eng_Latn"],
    open_weights=True,
    framework=["Sentence Transformers", "PyTorch"],
    n_parameters=109_000_000,
    memory_usage=None,
    max_tokens=512,
    embed_dim=256,
    license="apache-2.0",
    reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5",
    # Must be a member of DISTANCE_METRICS (currently only "cosine"); the
    # longer "cosine_similarity" spelling is not an allowed value and would be
    # rejected when ModelMeta validates its fields.
    similarity_fn_name="cosine",
    use_instuctions=False,
)
33 changes: 30 additions & 3 deletions mteb/models/bge_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,18 @@
),
name="BAAI/bge-small-en-v1.5",
languages=["eng_Latn"],
open_source=True,
open_weights=True,
revision="5c38ec7c405ec4b44b94cc5a9bb96e735b38267a",
release_date="2023-09-12", # initial commit of hf model.
n_parameters=24_000_000,
memory_usage=None,
embed_dim=512,
license="mit",
max_tokens=512,
reference="https://huggingface.co/BAAI/bge-small-en-v1.5",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instuctions=False,
)

bge_base_en_v1_5 = ModelMeta(
Expand All @@ -29,9 +38,18 @@
),
name="BAAI/bge-base-en-v1.5",
languages=["eng_Latn"],
open_source=True,
open_weights=True,
revision="a5beb1e3e68b9ab74eb54cfd186867f64f240e1a",
release_date="2023-09-11", # initial commit of hf model.
n_parameters=438_000_000,
memory_usage=None,
embed_dim=768,
license="mit",
max_tokens=512,
reference="https://huggingface.co/BAAI/bge-base-en-v1.5",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instuctions=False,
)

bge_large_en_v1_5 = ModelMeta(
Expand All @@ -43,7 +61,16 @@
),
name="BAAI/bge-large-en-v1.5",
languages=["eng_Latn"],
open_source=True,
open_weights=True,
revision="d4aa6901d3a41ba39fb536a557fa166f842b0e09",
release_date="2023-09-12", # initial commit of hf model.
n_parameters=1_340_000_000,
memory_usage=None,
embed_dim=1024,
license="mit",
max_tokens=512,
reference="https://huggingface.co/BAAI/bge-large-en-v1.5",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instuctions=False,
)
11 changes: 10 additions & 1 deletion mteb/models/bm25.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,16 @@ def encode(self, texts: list[str], **kwargs):
loader=partial(bm25_loader, model_name="bm25s"), # type: ignore
name="bm25s",
languages=["eng_Latn"],
open_source=True,
open_weights=True,
revision="0_1_10",
release_date="2024-07-10", ## release of version 0.1.10
n_parameters=None,
memory_usage=None,
embed_dim=None,
license=None,
max_tokens=None,
reference=None,
similarity_fn_name=None,
framework=[],
use_instuctions=False,
)
20 changes: 10 additions & 10 deletions mteb/models/cache_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import json
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from typing import Any

import numpy as np
import torch
Expand All @@ -21,17 +21,17 @@
class TextVectorMap:
def __init__(
self,
directory: Union[str | Path],
directory: str | Path,
initial_vectors: int = 100000,
):
self.directory = Path(directory)
self.directory.mkdir(parents=True, exist_ok=True)
self.vectors_file = self.directory / "vectors.npy"
self.index_file = self.directory / "index.json"
self.dimension_file = self.directory / "dimension"
self.hash_to_index: Dict[str, int] = {}
self.vectors: Optional[np.memmap] = None
self.vector_dim: Optional[int] = None
self.hash_to_index: dict[str, int] = {}
self.vectors: np.memmap | None = None
self.vector_dim: int | None = None
self.initial_vectors = initial_vectors
logger.info(f"Initialized TextVectorMap in directory: {self.directory}")
self._initialize_vectors_file()
Expand Down Expand Up @@ -141,7 +141,7 @@ def save(self) -> None:
logger.error(f"Error saving TextVectorMap: {str(e)}")
raise

def load(self, name: str = None) -> None:
def load(self, name: str | None = None) -> None:
name_details = name if name else ""
try:
self._load_dimension()
Expand Down Expand Up @@ -176,7 +176,7 @@ def load(self, name: str = None) -> None:
logger.error(f"Error loading TextVectorMap ({name_details}): {str(e)}")
raise

def get_vector(self, text: str) -> Optional[np.ndarray]:
def get_vector(self, text: str) -> np.ndarray | None:
try:
text_hash = self._hash_text(text)
if text_hash not in self.hash_to_index:
Expand All @@ -203,7 +203,7 @@ def close(self):


class CachedEmbeddingWrapper(Wrapper, Encoder):
def __init__(self, model: Encoder, cache_path: Union[str | Path]):
def __init__(self, model: Encoder, cache_path: str | Path):
self._model = model
self.cache_path = Path(cache_path)
self.cache_path.mkdir(parents=True, exist_ok=True)
Expand All @@ -217,7 +217,7 @@ def __init__(self, model: Encoder, cache_path: Union[str | Path]):

logger.info("Initialized CachedEmbeddingWrapper")

def encode(self, texts: List[str], batch_size: int = 32, **kwargs) -> np.ndarray:
def encode(self, texts: list[str], batch_size: int = 32, **kwargs) -> np.ndarray:
"""Encode texts using the wrapped model, with caching"""
try:
results = []
Expand Down Expand Up @@ -282,7 +282,7 @@ def __getattr__(self, name: str) -> Any:
f"has attribute '{name}'"
)

def __dir__(self) -> List[str]:
def __dir__(self) -> list[str]:
"""Return all attributes from both this class and the wrapped model"""
return list(set(super().__dir__() + dir(self._model)))

Expand Down
10 changes: 6 additions & 4 deletions mteb/models/cohere_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def encode(
),
name="embed-multilingual-v3.0",
languages=[], # Unknown, but support >100 languages
open_source=False,
open_weights=False,
revision="1",
release_date="2023-11-02",
n_parameters=None,
Expand All @@ -92,7 +92,8 @@ def encode(
embed_dim=1024,
license=None,
similarity_fn_name="cosine",
framework=[],
framework=["API"],
use_instuctions=False,
)

cohere_eng_3 = ModelMeta(
Expand All @@ -103,7 +104,7 @@ def encode(
),
name="embed-english-v3.0",
languages=["eng-Latn"],
open_source=False,
open_weights=False,
revision="1",
release_date="2023-11-02",
n_parameters=None,
Expand All @@ -112,5 +113,6 @@ def encode(
embed_dim=1024,
license=None,
similarity_fn_name="cosine",
framework=[],
framework=["API"],
use_instuctions=False,
)
Loading