Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ install:
install-for-tests:
@echo "--- 🚀 Installing project dependencies for test ---"
@echo "This ensures that the project is not installed in editable mode"
pip install ".[dev,speedtask]"
pip install ".[dev,speedtask,bm25s,pylate]"

lint:
@echo "--- 🧹 Running linters ---"
Expand Down
15 changes: 13 additions & 2 deletions mteb/evaluation/MTEB.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@
from mteb.abstasks.AbsTask import ScoresDict
from mteb.encoder_interface import Encoder
from mteb.model_meta import ModelMeta
from mteb.models import model_meta_from_sentence_transformers
from mteb.models import (
model_meta_from_cross_encoder,
model_meta_from_sentence_transformers,
)

from ..abstasks.AbsTask import AbsTask
from ..load_results.task_results import TaskResult
Expand Down Expand Up @@ -495,7 +498,7 @@ def create_model_meta(model: Encoder) -> ModelMeta:
meta = model.mteb_model_meta # type: ignore
else:
try:
meta = model_meta_from_sentence_transformers(model) # type: ignore
meta = MTEB._get_model_meta(model)
except AttributeError:
logger.warning(
"Could not find model metadata. Please set the model.mteb_model_meta attribute or if you are using "
Expand Down Expand Up @@ -597,3 +600,11 @@ def _get_missing_evaluations(
missing_evaluations[split]["missing_subsets"] = missing_subsets

return missing_evaluations

@staticmethod
def _get_model_meta(model: Encoder) -> ModelMeta:
    """Extract ``ModelMeta`` from *model*, dispatching on its concrete type.

    Cross-encoders and bi-encoders expose their metadata through different
    attributes, so each gets its own extractor.

    NOTE(review): relies on ``CrossEncoder`` being imported in this module —
    the visible import block only adds the two extractor functions; confirm
    ``CrossEncoder`` is in scope.
    """
    extract = (
        model_meta_from_cross_encoder
        if isinstance(model, CrossEncoder)
        else model_meta_from_sentence_transformers
    )
    return extract(model)
4 changes: 2 additions & 2 deletions mteb/model_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
"PyLate",
"ColBERT",
]
DISTANCE_METRICS = Literal["cosine", "max_sim", "dot"]
DISTANCE_METRICS = Literal["cosine", "MaxSim", "dot"]


def sentence_transformers_loader(
Expand Down Expand Up @@ -111,7 +111,7 @@ def get_similarity_function(self) -> Callable[[np.ndarray, np.ndarray], np.ndarr
return cos_sim
elif self.similarity_fn_name == "dot":
return dot_score
elif self.similarity_fn_name == "max_sim":
elif self.similarity_fn_name == "MaxSim":
return max_sim
elif self.similarity_fn_name is None:
raise ValueError("Similarity function not specified.")
Expand Down
2 changes: 2 additions & 0 deletions mteb/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
get_model,
get_model_meta,
get_model_metas,
model_meta_from_cross_encoder,
model_meta_from_sentence_transformers,
)
from mteb.models.sentence_transformer_wrapper import SentenceTransformerWrapper
Expand All @@ -17,5 +18,6 @@
"get_model_meta",
"get_model_metas",
"model_meta_from_sentence_transformers",
"model_meta_from_cross_encoder",
"SentenceTransformerWrapper",
]
4 changes: 2 additions & 2 deletions mteb/models/colbert_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
max_tokens=180, # Reduced for Benchmarking - see ColBERT paper
embed_dim=None, # Bag of Embeddings (128) for each token
license="mit",
similarity_fn_name="max_sim",
similarity_fn_name="MaxSim",
framework=["PyLate", "ColBERT"],
reference="https://huggingface.co/colbert-ir/colbertv2.0",
use_instructions=False,
Expand Down Expand Up @@ -213,7 +213,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
max_tokens=8192,
embed_dim=None, # Bag of Embeddings (128) for each token
license="cc-by-nc-4.0",
similarity_fn_name="max_sim",
similarity_fn_name="MaxSim",
framework=["PyLate", "ColBERT"],
reference="https://huggingface.co/jinaai/jina-colbert-v2",
use_instructions=False,
Expand Down
53 changes: 51 additions & 2 deletions mteb/models/overview.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from typing import Any

from huggingface_hub import ModelCard
from sentence_transformers import SentenceTransformer
from sentence_transformers import CrossEncoder, SentenceTransformer

from mteb.abstasks.AbsTask import AbsTask
from mteb.encoder_interface import Encoder
Expand Down Expand Up @@ -172,6 +172,11 @@ def get_model(model_name: str, revision: str | None = None, **kwargs: Any) -> En
if not meta.similarity_fn_name:
meta.similarity_fn_name = _meta.similarity_fn_name

elif isinstance(model, CrossEncoder):
_meta = model_meta_from_cross_encoder(model.model)
if meta.revision is None:
meta.revision = _meta.revision if _meta.revision else meta.revision

model.mteb_model_meta = meta # type: ignore
return model

Expand Down Expand Up @@ -251,6 +256,49 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta:
)


def model_meta_from_cross_encoder(model: CrossEncoder) -> ModelMeta:
    """Best-effort construction of ``ModelMeta`` for a CrossEncoder.

    Reads the underlying HF model's ``name_or_path`` and the commit hash from
    the model config; every field that cannot be derived from the object is
    left ``None``. If the expected attributes are missing (older
    sentence-transformers releases), a fully-empty placeholder is returned and
    a warning is logged instead of raising.
    """
    try:
        # NOTE(review): assumes the CrossEncoder exposes `.model` (the HF
        # module, for `name_or_path`) and `.config` (with `_commit_hash`) —
        # confirm this holds across sentence-transformers versions.
        return ModelMeta(
            name=model.model.name_or_path,
            revision=model.config._commit_hash,
            release_date=None,
            languages=None,
            framework=["Sentence Transformers"],
            similarity_fn_name=None,
            n_parameters=None,
            max_tokens=None,
            embed_dim=None,
            license=None,
            open_weights=True,
            public_training_code=None,
            use_instructions=None,
            training_datasets=None,
        )
    except AttributeError as e:
        logger.warning(
            f"Failed to extract metadata from model: {e}. Upgrading to sentence-transformers v3.0.0 or above is recommended."
        )
        # Placeholder metadata: nothing could be read off the model object.
        return ModelMeta(
            name=None,
            revision=None,
            languages=None,
            release_date=None,
            n_parameters=None,
            max_tokens=None,
            embed_dim=None,
            license=None,
            open_weights=True,
            public_training_code=None,
            similarity_fn_name=None,
            use_instructions=None,
            training_datasets=None,
            framework=[],
        )


def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMeta:
try:
name = (
Expand All @@ -263,6 +311,7 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe
if isinstance(model.model_card_data.language, str)
else model.model_card_data.language
)
embeddings_dim = model.get_sentence_embedding_dimension()
meta = ModelMeta(
name=name,
revision=model.model_card_data.base_model_revision,
Expand All @@ -272,7 +321,7 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe
similarity_fn_name=model.similarity_fn_name,
n_parameters=None,
max_tokens=None,
embed_dim=None,
embed_dim=embeddings_dim,
license=None,
open_weights=True,
public_training_code=None,
Expand Down
57 changes: 30 additions & 27 deletions tests/test_benchmark/test_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
def test_mulitple_mteb_tasks(tasks: list[AbsTask], model: mteb.Encoder, tmp_path: Path):
"""Test that multiple tasks can be run"""
eval = mteb.MTEB(tasks=tasks)
eval.run(model, output_folder=str(tmp_path), overwrite_results=True)
eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True)

# ensure that we can generate a readme from the output folder
generate_readme(tmp_path)
Expand All @@ -56,33 +56,35 @@ def test_mulitple_mteb_tasks(tasks: list[AbsTask], model: mteb.Encoder, tmp_path
MockTorchbf16Encoder(),
],
)
def test_benchmark_encoders_on_task(task: str | AbsTask, model: mteb.Encoder):
def test_benchmark_encoders_on_task(
task: str | AbsTask, model: mteb.Encoder, tmp_path: Path
):
"""Test that a task can be fetched and run using a variety of encoders"""
if isinstance(task, str):
tasks = mteb.get_tasks(tasks=[task])
else:
tasks = [task]

eval = mteb.MTEB(tasks=tasks)
eval.run(model, output_folder="tests/results", overwrite_results=True)
eval.run(model, output_folder=tmp_path.as_posix())


@pytest.mark.parametrize("task", [MockMultilingualRetrievalTask()])
@pytest.mark.parametrize(
"model",
[MockSentenceTransformer()],
)
def test_run_eval_without_co2_tracking(task: str | AbsTask, model: mteb.Encoder):
def test_run_eval_without_co2_tracking(
task: str | AbsTask, model: mteb.Encoder, tmp_path: Path
):
"""Test that a task can be fetched and run without CO2 tracking"""
if isinstance(task, str):
tasks = mteb.get_tasks(tasks=[task])
else:
tasks = [task]

eval = mteb.MTEB(tasks=tasks)
eval.run(
model, output_folder="tests/results", overwrite_results=True, co2_tracker=False
)
eval.run(model, output_folder=tmp_path.as_posix(), co2_tracker=False)


@pytest.mark.parametrize("task", MOCK_TASK_TEST_GRID[:1])
Expand All @@ -95,20 +97,22 @@ def test_reload_results(task: str | AbsTask, model: mteb.Encoder, tmp_path: Path
tasks = [task]

eval = mteb.MTEB(tasks=tasks)
results = eval.run(model, output_folder=str(tmp_path), overwrite_results=True)
results = eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True)

assert isinstance(results, list)
assert isinstance(results[0], mteb.TaskResult)

# reload the results
results = eval.run(model, output_folder=str(tmp_path), overwrite_results=False)
results = eval.run(
model, output_folder=tmp_path.as_posix(), overwrite_results=False
)

assert isinstance(results, list)
assert isinstance(results[0], mteb.TaskResult)


@pytest.mark.parametrize("task_name", MOCK_TASK_TEST_GRID)
def test_prompt_name_passed_to_all_encodes(task_name: str | AbsTask):
def test_prompt_name_passed_to_all_encodes(task_name: str | AbsTask, tmp_path: Path):
"""Test that all tasks correctly pass down the prompt_name to the encoder which supports it, and that the encoder which does not support it does not
receive it.
"""
Expand Down Expand Up @@ -141,17 +145,17 @@ def encode(self, sentences, **kwargs):

eval.run(
model,
output_folder="tests/results",
output_folder=tmp_path.as_posix(),
overwrite_results=True,
)
# Test that the task_name is not passed down to the encoder
model = EncoderWithoutInstructions("average_word_embeddings_levy_dependency")
assert model.prompts == {}, "The encoder should not have any prompts"
eval.run(model, output_folder="tests/results", overwrite_results=True)
eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True)


@pytest.mark.parametrize("task_name", MOCK_TASK_TEST_GRID)
def test_encode_kwargs_passed_to_all_encodes(task_name: str | AbsTask):
def test_encode_kwargs_passed_to_all_encodes(task_name: str | AbsTask, tmp_path: Path):
"""Test that all tasks correctly pass down the encode_kwargs to the encoder."""
my_encode_kwargs = {"no_one_uses_this_args": "but_its_here"}

Expand All @@ -175,35 +179,35 @@ def encode(self, sentences, task_name: str | None = None, **kwargs):
model = MockEncoderWithKwargs()
eval.run(
model,
output_folder="tests/results",
output_folder=tmp_path.as_posix(),
overwrite_results=True,
encode_kwargs=my_encode_kwargs,
)


@pytest.mark.parametrize("model", [MockNumpyEncoder()])
def test_run_using_benchmark(model: mteb.Encoder):
def test_run_using_benchmark(model: mteb.Encoder, tmp_path: Path):
"""Test that a benchmark object can be run using the MTEB class."""
bench = Benchmark(
name="test_bench", tasks=mteb.get_tasks(tasks=["STS12", "SummEval"])
)

eval = mteb.MTEB(tasks=[bench])
eval.run(
model, output_folder="tests/results", overwrite_results=True
model, output_folder=tmp_path.as_posix(), overwrite_results=True
) # we just want to test that it runs


@pytest.mark.parametrize("model", [MockNumpyEncoder()])
def test_run_using_list_of_benchmark(model: mteb.Encoder):
def test_run_using_list_of_benchmark(model: mteb.Encoder, tmp_path: Path):
"""Test that a list of benchmark objects can be run using the MTEB class."""
bench = [
Benchmark(name="test_bench", tasks=mteb.get_tasks(tasks=["STS12", "SummEval"]))
]

eval = mteb.MTEB(tasks=bench)
eval.run(
model, output_folder="tests/results", overwrite_results=True
model, output_folder=tmp_path.as_posix()
) # we just want to test that it runs


Expand All @@ -229,7 +233,7 @@ def test_get_benchmark(name):
@pytest.mark.parametrize("task", MOCK_TASK_TEST_GRID)
@pytest.mark.parametrize("is_task_name", [True, False])
def test_prompt_name_passed_to_all_encodes_with_prompts(
task: AbsTask | str, is_task_name: bool
task: AbsTask | str, is_task_name: bool, tmp_path: Path
):
"""Test that all tasks and task_types correctly pass down the prompt_name to the encoder with prompts."""
_task_name = task.metadata.name if isinstance(task, AbsTask) else task
Expand Down Expand Up @@ -258,8 +262,7 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs):
)
eval.run(
model,
output_folder="tests/results",
overwrite_results=True,
output_folder=tmp_path.as_posix(),
)

class MockEncoderWithExistingPrompts(mteb.Encoder):
Expand All @@ -275,7 +278,7 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs):
model = MockSentenceTransformerWrapper(MockEncoderWithExistingPrompts())
eval.run(
model,
output_folder="tests/results",
output_folder=tmp_path.as_posix(),
overwrite_results=True,
)

Expand All @@ -292,7 +295,9 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs):
],
)
@pytest.mark.parametrize("is_task_name", [True, False])
def test_model_query_passage_prompts_task_type(task: AbsTask | str, is_task_name: bool):
def test_model_query_passage_prompts_task_type(
task: AbsTask | str, is_task_name: bool, tmp_path: Path
):
"""Test that the model with prompts is correctly called."""
tasks = [task]

Expand Down Expand Up @@ -331,8 +336,7 @@ def encode(self, sentences, prompt_name: str | None = None, *args, **kwargs):
eval.run(
model,
model_prompts=prompt_list,
output_folder="tests/results",
overwrite_results=True,
output_folder=tmp_path.as_posix(),
)
model = MockSentenceTransformerWrapper(
MockSentenceEncoderWithPrompts(), model_prompts=prompt_list
Expand All @@ -341,6 +345,5 @@ def encode(self, sentences, prompt_name: str | None = None, *args, **kwargs):
eval.run(
model,
model_prompts=prompt_list,
output_folder="tests/results",
overwrite_results=True,
output_folder=tmp_path.as_posix(),
)
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import logging
from pathlib import Path

import pytest

Expand All @@ -18,7 +19,7 @@

@pytest.mark.parametrize("task", TASK_TEST_GRID)
@pytest.mark.parametrize("model", [MockNumpyEncoder()])
def test_benchmark_datasets(task: str | AbsTask, model: mteb.Encoder):
def test_benchmark_datasets(task: str | AbsTask, model: mteb.Encoder, tmp_path: Path):
"""Test that a task can be fetched and run"""
eval = MTEB(tasks=[task])
eval.run(model, output_folder="tests/results", overwrite_results=True)
eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True)
Loading
Loading