diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py index be2f5af1f0..9e088aacdf 100644 --- a/mteb/evaluation/evaluators/RetrievalEvaluator.py +++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py @@ -82,6 +82,7 @@ def __call__( self.top_k, task_name=self.task_name, # type: ignore instructions=instructions, + score_function="bm25", **kwargs, ) else: diff --git a/mteb/evaluation/evaluators/model_classes.py b/mteb/evaluation/evaluators/model_classes.py index b05de30d7f..b2d2c54be8 100644 --- a/mteb/evaluation/evaluators/model_classes.py +++ b/mteb/evaluation/evaluators/model_classes.py @@ -332,9 +332,20 @@ def _full_corpus_search( query_embeddings = torch.as_tensor(query_embeddings).to(device) sub_corpus_embeddings = torch.as_tensor(sub_corpus_embeddings).to(device) - score_function = ( - self.model.similarity if hasattr(self.model, "similarity") else cos_sim - ) + inner_model = getattr(self.model, "model", None) + if hasattr(self.model, "similarity") or hasattr( + inner_model, "mteb_model_meta" + ): + score_function = ( + self.model.similarity + if hasattr(self.model, "similarity") + else inner_model.mteb_model_meta.get_similarity_function() + ) + else: + logger.warning( + "The model does not provide `mteb_model_meta`; defaulting to the cosine similarity function." + ) + score_function = cos_sim with torch.inference_mode(): scores = score_function(query_embeddings, sub_corpus_embeddings) diff --git a/mteb/evaluation/evaluators/utils.py b/mteb/evaluation/evaluators/utils.py index e01e0ec463..14ca673ce9 100644 --- a/mteb/evaluation/evaluators/utils.py +++ b/mteb/evaluation/evaluators/utils.py @@ -70,6 +70,34 @@ def _cos_sim_core(a_tensor, b_tensor): return _cos_sim_core(a, b) +def max_sim(a: np.ndarray, b: np.ndarray) -> np.ndarray: + """Computes the max-similarity max_sim(a[i], b[j]) for all i and j. 
+ Works with a Tensor of the shape (batch_size, num_tokens, token_dim) + + Return: + Matrix with res[i][j] = max_sim(a[i], b[j]) + """ # noqa: D402 + if not isinstance(a, torch.Tensor): + a = torch.tensor(a, dtype=torch.float32) + + if not isinstance(b, torch.Tensor): + b = torch.tensor(b, dtype=torch.float32) + + if len(a.shape) == 2: + a = a.unsqueeze(0) + + if len(b.shape) == 2: + b = b.unsqueeze(0) + + scores = torch.einsum( + "ash,bth->abst", + a, + b, + ) + + return scores.max(axis=-1).values.sum(axis=-1) + + def dot_score(a: torch.Tensor, b: torch.Tensor): """Computes the dot-product dot_prod(a[i], b[j]) for all i and j. :return: Matrix with res[i][j] = dot_prod(a[i], b[j]) diff --git a/mteb/model_meta.py b/mteb/model_meta.py index 1754ab4bbb..bb063e7ba3 100644 --- a/mteb/model_meta.py +++ b/mteb/model_meta.py @@ -4,11 +4,13 @@ from functools import partial from typing import TYPE_CHECKING, Any, Callable, Literal +import numpy as np from pydantic import BaseModel, ConfigDict from mteb.abstasks.AbsTask import AbsTask from mteb.abstasks.TaskMetadata import STR_DATE, STR_URL from mteb.encoder_interface import Encoder +from mteb.evaluation.evaluators.utils import cos_sim, dot_score, max_sim from .languages import ISO_LANGUAGE_SCRIPT @@ -30,7 +32,6 @@ "PyLate", "ColBERT", ] -DISTANCE_METRICS = Literal["cosine", "max_sim", "dot"] def sentence_transformers_loader( @@ -51,6 +52,9 @@ def get_loader_name( return loader.__name__ +DISTANCE_METRICS = Literal["cosine", "MaxSim", "dot"] + + class ModelMeta(BaseModel): """The model metadata object. 
@@ -106,6 +110,18 @@ class ModelMeta(BaseModel): superseded_by: str | None = None citation: str | None = None + def get_similarity_function(self) -> Callable[[np.ndarray, np.ndarray], np.ndarray]: + if self.similarity_fn_name == "cosine": + return cos_sim + elif self.similarity_fn_name == "dot": + return dot_score + elif self.similarity_fn_name == "MaxSim": + return max_sim + elif self.similarity_fn_name is None: + raise ValueError("Similarity function not specified.") + else: + raise ValueError(f"Unknown similarity function: {self.similarity_fn_name}.") + def to_dict(self): dict_repr = self.model_dump() loader = dict_repr.pop("loader", None) diff --git a/mteb/models/colbert_models.py b/mteb/models/colbert_models.py index 8753791bff..6ce7ca6fb9 100644 --- a/mteb/models/colbert_models.py +++ b/mteb/models/colbert_models.py @@ -100,10 +100,13 @@ def encode( ) logger.info(f"Encoding {len(sentences)} sentences.") + if "request_qid" in kwargs: + kwargs.pop("request_qid") pred = self.model.encode( sentences, prompt_name=prompt_name, is_query=True if prompt_type == PromptType.query else False, + convert_to_tensor=True, **kwargs, ) @@ -158,7 +161,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: max_tokens=180, # Reduced for Benchmarking - see ColBERT paper embed_dim=None, # Bag of Embeddings (128) for each token license="mit", - similarity_fn_name="max_sim", + similarity_fn_name="MaxSim", framework=["PyLate", "ColBERT"], reference="https://huggingface.co/colbert-ir/colbertv2.0", use_instructions=False, @@ -209,7 +212,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: max_tokens=8192, embed_dim=None, # Bag of Embeddings (128) for each token license="cc-by-nc-4.0", - similarity_fn_name="max_sim", + similarity_fn_name="MaxSim", framework=["PyLate", "ColBERT"], reference="https://huggingface.co/jinaai/jina-colbert-v2", use_instructions=False, diff --git a/mteb/models/overview.py b/mteb/models/overview.py index 4e19bed19c..e9774cacd9 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -157,9 +157,12 @@ 
def get_model(model_name: str, revision: str | None = None, **kwargs: Any) -> En model = meta.load_model(**kwargs) # If revision not available in the modelmeta, try to extract it from sentence-transformers - if meta.revision is None and isinstance(model, SentenceTransformer): - _meta = model_meta_from_sentence_transformers(model) - meta.revision = _meta.revision if _meta.revision else meta.revision + if isinstance(getattr(model, "model", None), SentenceTransformer): + _meta = model_meta_from_sentence_transformers(model.model) + if meta.revision is None: + meta.revision = _meta.revision if _meta.revision else meta.revision + if not meta.similarity_fn_name: + meta.similarity_fn_name = _meta.similarity_fn_name model.mteb_model_meta = meta # type: ignore return model diff --git a/mteb/models/sentence_transformer_wrapper.py b/mteb/models/sentence_transformer_wrapper.py index 9ec25a9896..bb47467838 100644 --- a/mteb/models/sentence_transformer_wrapper.py +++ b/mteb/models/sentence_transformer_wrapper.py @@ -21,7 +21,6 @@ def __init__( model: str | SentenceTransformer | CrossEncoder, revision: str | None = None, model_prompts: dict[str, str] | None = None, - similarity_fn_name: str | None = None, **kwargs, ) -> None: """Wrapper for SentenceTransformer models. @@ -33,7 +32,6 @@ def __init__( First priority is given to the composed prompt of task name + prompt type (query or passage), then to the specific task prompt, then to the composed prompt of task type + prompt type, then to the specific task type prompt, and finally to the specific prompt type. - similarity_fn_name: A similarity function to use. **kwargs: Additional arguments to pass to the SentenceTransformer model. 
""" if isinstance(model, str): @@ -61,9 +59,7 @@ def __init__( if isinstance(self.model, CrossEncoder): self.predict = self.handle_instructions_predict - if similarity_fn_name: - self.similarity = self.get_similarity_function(similarity_fn_name) - elif hasattr(self.model, "similarity") and callable(self.model.similarity): + if hasattr(self.model, "similarity") and callable(self.model.similarity): self.similarity = self.model.similarity def encode( diff --git a/mteb/models/wrapper.py b/mteb/models/wrapper.py index 76b31ba529..956071d3dc 100644 --- a/mteb/models/wrapper.py +++ b/mteb/models/wrapper.py @@ -3,12 +3,9 @@ import logging from typing import Callable, get_args -import numpy as np - import mteb from mteb.abstasks.TaskMetadata import TASK_TYPE from mteb.encoder_interface import PromptType -from mteb.evaluation.evaluators.utils import cos_sim, dot_score logger = logging.getLogger(__name__) @@ -67,18 +64,6 @@ def get_prompt_name( ) return None - @staticmethod - def get_similarity_function( - similarity_fn_name: str, - ) -> Callable[[np.ndarray, np.ndarray], np.ndarray]: - if similarity_fn_name == "cosine": - return cos_sim - if similarity_fn_name == "dot": - return dot_score - raise ValueError( - "Invalid similarity function. 
Should be one of ['cosine', 'dot']" ) - @staticmethod def validate_task_to_prompt_name( task_to_prompt_name: dict[str, str] | None, diff --git a/tests/test_benchmark/test_models.py b/tests/test_benchmark/test_models.py new file mode 100644 index 0000000000..ee5bed091b --- /dev/null +++ b/tests/test_benchmark/test_models.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +import pytest + +import mteb +from mteb import MTEB +from mteb.abstasks import AbsTask + +from .mock_tasks import MockRetrievalTask + + +@pytest.mark.parametrize("model", ["colbert-ir/colbertv2.0"]) +@pytest.mark.parametrize("task", [MockRetrievalTask()]) +def test_colbert_model_e2e(task: AbsTask, model: str): + pytest.importorskip("pylate", reason="pylate not installed") + eval_splits = ["test"] + model = mteb.get_model(model) + evaluation = MTEB(tasks=[task]) + + results = evaluation.run( + model, + eval_splits=eval_splits, + corpus_chunk_size=500, + ) + result = results[0] + + assert result.scores["test"][0]["ndcg_at_1"] == 1.0 + + +def test_bm25s_e2e(): + # fails for dataset smaller than 1000 + pytest.importorskip("bm25s", reason="bm25s not installed") + pytest.importorskip("Stemmer", reason="PyStemmer not installed") + + model = mteb.get_model("bm25s") + tasks = mteb.get_tasks(tasks=["NFCorpus"]) + eval_splits = ["test"] + + evaluation = MTEB(tasks=tasks) + + results = evaluation.run(model, eval_splits=eval_splits) + result = results[0] + + assert result.scores["test"][0]["ndcg_at_1"] == pytest.approx(0.42879, abs=1e-5)