Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions docs/contributing/adding_a_model.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,36 @@ Typically, it only requires that you fill in metadata about the model and add it

This works for all [Sentence Transformers](https://sbert.net) compatible models. Once filled out, you can submit your model to `mteb` by submitting a PR.

You can generate it automatically by using:

=== "General model from hub"
```python
from mteb.models import ModelMeta

meta = ModelMeta.from_hub("Qwen/Qwen3-Embedding-0.6B")
print(meta.to_python())
```

=== "For Sentence transformers model"
```python
from mteb.models import ModelMeta
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B", device="cpu")
meta = ModelMeta.from_sentence_transformer_model(model)
print(meta.to_python())
```

=== "For CrossEncoder"
```python
from mteb.models import ModelMeta
from sentence_transformers import CrossEncoder

model = CrossEncoder("Qwen/Qwen3-Reranker-0.6B", device="cpu")
meta = ModelMeta.from_cross_encoder(model)
print(meta.to_python())
```


### Calculating the Memory Usage

Expand Down
21 changes: 8 additions & 13 deletions mteb/deprecated_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,11 @@
from time import time
from typing import TYPE_CHECKING, Any

from mteb.abstasks.task_metadata import TaskCategory, TaskType
from mteb.models.get_model_meta import (
_model_meta_from_cross_encoder,
_model_meta_from_sentence_transformers,
)

if sys.version_info >= (3, 13):
from warnings import deprecated
else:
from typing_extensions import deprecated

import datasets

import mteb
from mteb.abstasks import AbsTask
from mteb.abstasks.task_metadata import TaskCategory, TaskType
from mteb.benchmarks import Benchmark
from mteb.models import (
CrossEncoderWrapper,
Expand All @@ -39,6 +29,11 @@
from mteb.results import TaskResult
from mteb.types import ScoresDict

if sys.version_info >= (3, 13):
from warnings import deprecated
else:
from typing_extensions import deprecated

if TYPE_CHECKING:
from sentence_transformers import CrossEncoder, SentenceTransformer

Expand Down Expand Up @@ -669,9 +664,9 @@ def _get_model_meta(model: EncoderProtocol) -> ModelMeta:
from sentence_transformers import CrossEncoder, SentenceTransformer

if isinstance(model, CrossEncoder):
meta = _model_meta_from_cross_encoder(model)
meta = ModelMeta.from_cross_encoder(model)
elif isinstance(model, SentenceTransformer):
meta = _model_meta_from_sentence_transformers(model)
meta = ModelMeta.from_sentence_transformer_model(model)
else:
meta = ModelMeta(
loader=None,
Expand Down
35 changes: 2 additions & 33 deletions mteb/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import logging
from collections.abc import Iterable
from copy import deepcopy
from pathlib import Path
from time import time
from typing import TYPE_CHECKING, Any, cast
Expand Down Expand Up @@ -53,36 +52,6 @@ class OverwriteStrategy(HelpfulStrEnum):
ONLY_CACHE = "only-cache"


_empty_model_meta = ModelMeta(
loader=None,
name=None,
revision=None,
release_date=None,
languages=None,
framework=[],
similarity_fn_name=None,
n_parameters=None,
memory_usage_mb=None,
max_tokens=None,
embed_dim=None,
license=None,
open_weights=None,
public_training_code=None,
public_training_data=None,
use_instructions=None,
training_datasets=None,
modalities=[],
)


def _create_empty_model_meta() -> ModelMeta:
logger.warning("Model metadata is missing. Using empty metadata.")
meta = deepcopy(_empty_model_meta)
meta.revision = "no_revision_available"
meta.name = "no_model_name_available"
return meta


def _sanitize_model(
model: ModelMeta | MTEBModels | SentenceTransformer | CrossEncoder,
) -> tuple[MTEBModels | ModelMeta, ModelMeta, ModelName, Revision]:
Expand All @@ -101,9 +70,9 @@ def _sanitize_model(
elif hasattr(model, "mteb_model_meta"):
meta = model.mteb_model_meta # type: ignore[attr-defined]
if not isinstance(meta, ModelMeta):
meta = _create_empty_model_meta()
meta = ModelMeta.from_hub(None)
else:
meta = _create_empty_model_meta() if not isinstance(model, ModelMeta) else model
meta = ModelMeta.from_hub(None) if not isinstance(model, ModelMeta) else model

model_name = cast(str, meta.name)
model_revision = cast(str, meta.revision)
Expand Down
129 changes: 3 additions & 126 deletions mteb/models/get_model_meta.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,15 @@
from __future__ import annotations

import difflib
import logging
import warnings
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any

from huggingface_hub import ModelCard
from huggingface_hub.errors import RepositoryNotFoundError
from typing import Any

from mteb.abstasks import AbsTask
from mteb.models import (
CrossEncoderWrapper,
ModelMeta,
MTEBModels,
sentence_transformers_loader,
)
from mteb.models.model_implementations import MODEL_REGISTRY

if TYPE_CHECKING:
from sentence_transformers import CrossEncoder, SentenceTransformer

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -101,24 +90,9 @@ def get_model(
Returns:
A model object
"""
from sentence_transformers import CrossEncoder, SentenceTransformer

meta = get_model_meta(model_name, revision)
model = meta.load_model(**kwargs)

# If revision not available in the modelmeta, try to extract it from sentence-transformers
if hasattr(model, "model") and isinstance(model.model, SentenceTransformer): # type: ignore
_meta = _model_meta_from_sentence_transformers(model.model) # type: ignore
if meta.revision is None:
meta.revision = _meta.revision if _meta.revision else meta.revision
if not meta.similarity_fn_name:
meta.similarity_fn_name = _meta.similarity_fn_name

elif isinstance(model, CrossEncoder):
_meta = _model_meta_from_cross_encoder(model.model)
if meta.revision is None:
meta.revision = _meta.revision if _meta.revision else meta.revision

model.mteb_model_meta = meta # type: ignore
return model

Expand Down Expand Up @@ -148,12 +122,8 @@ def get_model_meta(
logger.info(
"Model not found in model registry. Attempting to extract metadata by loading the model ({model_name}) using HuggingFace."
)
try:
meta = _model_meta_from_hf_hub(model_name)
meta.revision = revision
return meta
except RepositoryNotFoundError:
pass
meta = ModelMeta.from_hub(model_name, revision)
return meta

not_found_msg = f"Model '{model_name}' not found in MTEB registry"
not_found_msg += " nor on the Huggingface Hub." if fetch_from_hf else "."
Expand All @@ -171,96 +141,3 @@ def get_model_meta(
suggestion = f" Did you mean: '{close_matches[0]}'?"

raise KeyError(not_found_msg + suggestion)


def _model_meta_from_hf_hub(model_name: str) -> ModelMeta:
    """Build a partial ``ModelMeta`` from a model's Hugging Face Hub card.

    Args:
        model_name: Hub repository id, e.g. ``"Qwen/Qwen3-Embedding-0.6B"``.

    Returns:
        A ``ModelMeta`` populated with the fields the model card exposes
        (license, release date, framework list); everything the card does
        not provide is left as ``None``.
    """
    card = ModelCard.load(model_name)
    card_data = card.data.to_dict()

    frameworks = ["PyTorch"]
    if card_data.get("library_name", None) == "sentence-transformers":
        frameworks.append("Sentence Transformers")
    else:
        # We still fall back to the Sentence Transformers loader below; the
        # warning flags that this guess may be wrong for other libraries.
        msg = (
            "Model library not recognized, defaulting to Sentence Transformers loader."
        )
        logger.warning(msg)
        warnings.warn(msg)
    # Both branches use the same loader, so assign it once here.
    loader = sentence_transformers_loader

    revision = card_data.get("base_model_revision", None)
    # Avoid shadowing the ``license`` builtin with a local name.
    model_license = card_data.get("license", None)
    meta = ModelMeta(
        loader=loader,
        name=model_name,
        revision=revision,
        release_date=ModelMeta.fetch_release_date(model_name),
        languages=None,
        license=model_license,
        framework=frameworks,  # type: ignore
        training_datasets=None,
        similarity_fn_name=None,
        n_parameters=None,
        memory_usage_mb=None,
        max_tokens=None,
        embed_dim=None,
        open_weights=True,
        public_training_code=None,
        public_training_data=None,
        use_instructions=None,
    )
    return meta


def _model_meta_from_cross_encoder(model: CrossEncoder) -> ModelMeta:
    """Derive minimal ``ModelMeta`` from a loaded ``CrossEncoder``.

    Only the name, commit revision, and release date are resolved from the
    model instance; all other metadata fields are left unset.
    """
    hub_name = model.model.name_or_path
    return ModelMeta(
        name=hub_name,
        revision=model.config._commit_hash,
        release_date=ModelMeta.fetch_release_date(hub_name),
        loader=CrossEncoderWrapper,
        framework=["Sentence Transformers"],
        open_weights=True,
        languages=None,
        similarity_fn_name=None,
        n_parameters=None,
        memory_usage_mb=None,
        max_tokens=None,
        embed_dim=None,
        license=None,
        public_training_code=None,
        public_training_data=None,
        use_instructions=None,
        training_datasets=None,
    )


def _model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMeta:
    """Derive minimal ``ModelMeta`` from a loaded ``SentenceTransformer``.

    The model name falls back to the card's base model when no explicit
    model name is set; the embedding dimension is read from the model
    itself. All other metadata fields are left unset.
    """
    card = model.model_card_data
    # Prefer the explicit model name; otherwise fall back to the base model.
    resolved_name: str | None = card.model_name or card.base_model
    release = ModelMeta.fetch_release_date(resolved_name) if resolved_name else None
    return ModelMeta(
        name=resolved_name,
        revision=card.base_model_revision,
        release_date=release,
        loader=sentence_transformers_loader,
        framework=["Sentence Transformers"],
        embed_dim=model.get_sentence_embedding_dimension(),
        open_weights=True,
        languages=None,
        similarity_fn_name=None,
        n_parameters=None,
        memory_usage_mb=None,
        max_tokens=None,
        license=None,
        public_training_code=None,
        public_training_data=None,
        use_instructions=None,
        training_datasets=None,
    )
Loading