24 changes: 24 additions & 0 deletions docs/usage/loading_results.md
@@ -82,6 +82,30 @@ print(task_names)
# ['SpartQA', 'PlscClusteringP2P.v2', 'StackOverflowQA', 'JSICK', ...
```

### Getting Benchmark Results

If you loaded results for a specific benchmark, you can get the aggregated benchmark scores for each model using the `get_benchmark_result()` method:

```python
import mteb
from mteb.cache import ResultCache

# Load results for a specific benchmark
benchmark = mteb.get_benchmark("MTEB(eng, v2)")
cache = ResultCache()
cache.download_from_remote() # download results from the remote repository
results = cache.load_results(
models=["intfloat/e5-small", "intfloat/multilingual-e5-small"],
tasks=benchmark,
)

benchmark_scores_df = results.get_benchmark_result()
print(benchmark_scores_df)
# Rank (Borda) Model Zero-shot Memory Usage (MB) Number of Parameters (B) Embedding Dimensions Max Tokens ... Classification Clustering Pair Classification Reranking Retrieval STS Summarization
# 0 1 [e5-small](https://huggingface.co/intfloat/e5-... 100 127 0.033 384 512.0 ... 0.599545 0.422085 0.850895 0.444613 0.450684 0.790284 0.310609
# 1 2 [multilingual-e5-small](https://huggingface.co... 95 449 0.118 384 512.0 ... 0.673919 0.413591 0.840878 0.431942 0.464342 0.800185 0.292190
```

### Filtering Results

There is also a utility function that allows you to select specific models or tasks:
44 changes: 31 additions & 13 deletions mteb/benchmarks/benchmark.py
@@ -1,22 +1,16 @@
from __future__ import annotations

from collections.abc import Iterable, Sequence
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Literal

import pandas as pd

from mteb.benchmarks._create_table import (
_create_per_language_table_from_benchmark_results,
_create_per_task_table_from_benchmark_results,
_create_summary_table_from_benchmark_results,
_create_summary_table_mean_public_private,
_create_summary_table_mean_subset,
_create_summary_table_mean_task_type,
)
from mteb.results import BenchmarkResults
from mteb.abstasks.abstask import AbsTask
from mteb.types import StrURL

if TYPE_CHECKING:
from mteb.abstasks import AbsTask
from mteb.results import BenchmarkResults


@dataclass
@@ -43,7 +37,7 @@ class Benchmark:
"""

name: str
tasks: Sequence["AbsTask"]
tasks: Sequence[AbsTask]
description: str | None = None
reference: StrURL | None = None
citation: str | None = None
@@ -53,13 +47,13 @@
display_name: str | None = None
language_view: list[str] | Literal["all"] = field(default_factory=list)

def __iter__(self) -> Iterable["AbsTask"]:
def __iter__(self) -> Iterable[AbsTask]:
return iter(self.tasks)

def __len__(self) -> int:
return len(self.tasks)

def __getitem__(self, index: int) -> "AbsTask":
def __getitem__(self, index: int) -> AbsTask:
return self.tasks[index]

def _create_summary_table(
@@ -70,6 +64,10 @@ def _create_summary_table(
Returns:
A pandas DataFrame representing the summary results.
"""
from mteb.benchmarks._create_table import (
_create_summary_table_from_benchmark_results,
)
Comment on lines +67 to +69
Contributor:
Why not at the top?

Member:
Problems with circular imports

Contributor:
Hmm would be great to avoid it - seems like AbsTask could be moved to type checking

Member (@Samoed, Dec 22, 2025):
The problem is not with AbsTask:

>>> import mteb
Traceback (most recent call last):
  File "<python-input-0>", line 1, in <module>
    import mteb
  File "/mteb/mteb/__init__.py", line 6, in <module>
    from mteb.deprecated_evaluator import MTEB
  File "/mteb/mteb/deprecated_evaluator.py", line 21, in <module>
    from mteb.benchmarks import Benchmark
  File "/mteb/mteb/benchmarks/__init__.py", line 1, in <module>
    from mteb.benchmarks.benchmark import Benchmark
  File "/mteb/mteb/benchmarks/benchmark.py", line 11, in <module>
    from mteb.benchmarks._create_table import (
        _create_summary_table_from_benchmark_results,
    )
  File "/mteb/mteb/benchmarks/_create_table.py", line 9, in <module>
    from mteb.get_tasks import get_task, get_tasks
  File "/mteb/mteb/get_tasks.py", line 15, in <module>
    from mteb.filter_tasks import filter_tasks
  File "/mteb/mteb/filter_tasks.py", line 10, in <module>
    from mteb.abstasks.aggregated_task import AbsTaskAggregate
  File "/mteb/mteb/abstasks/aggregated_task.py", line 10, in <module>
    from mteb.results.task_result import TaskResult
  File "/mteb/mteb/results/__init__.py", line 1, in <module>
    from .benchmark_results import BenchmarkResults, ModelResult
  File "/mteb/mteb/results/benchmark_results.py", line 18, in <module>
    from mteb.benchmarks.benchmark import Benchmark

Moving the table imports inside the functions that use them is the easiest solution.

Contributor:
Let us keep it as is - I might refactor the imports later, but for now I think this is reasonable.
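For readers unfamiliar with the pattern settled on above, here is a minimal two-file sketch of breaking a circular import by deferring one side of the cycle into a function body. The file, class, and function names are hypothetical stand-ins, not the actual mteb modules:

```python
# --- tables.py ---
# Needs the Benchmark type, so it imports benchmark.py at module level.
from benchmark import Benchmark

def build_summary_table(benchmark: Benchmark) -> list[str]:
    # Toy stand-in for the real table-building logic.
    return list(benchmark.tasks)


# --- benchmark.py ---
# Importing tables at module level would close the cycle
# (benchmark -> tables -> benchmark), so the import is deferred into the method.
from dataclasses import dataclass, field

@dataclass
class Benchmark:
    name: str
    tasks: list[str] = field(default_factory=list)

    def summary(self) -> list[str]:
        # Deferred import: tables.py is only loaded when summary() is called,
        # after both modules have finished their top-level imports.
        from tables import build_summary_table
        return build_summary_table(self)
```

As the traceback above shows, the real cycle runs through runtime imports (`_create_table` → `get_tasks` → ... → `benchmark_results` → `benchmark`), so a `TYPE_CHECKING` guard on `AbsTask` alone would not break it.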


return _create_summary_table_from_benchmark_results(benchmark_results)

def _create_per_task_table(
@@ -80,6 +78,10 @@ def _create_per_task_table(
Returns:
A pandas DataFrame representing the per-task results.
"""
from mteb.benchmarks._create_table import (
_create_per_task_table_from_benchmark_results,
)

return _create_per_task_table_from_benchmark_results(benchmark_results)

def _create_per_language_table(
@@ -90,6 +92,10 @@ def _create_per_language_table(
Returns:
A pandas DataFrame representing the per-language results.
"""
from mteb.benchmarks._create_table import (
_create_per_language_table_from_benchmark_results,
)

if self.language_view == "all" or len(self.language_view) > 0:
return _create_per_language_table_from_benchmark_results(
benchmark_results, self.language_view
@@ -111,6 +117,10 @@ class RtebBenchmark(Benchmark):
def _create_summary_table(
self, benchmark_results: BenchmarkResults
) -> pd.DataFrame:
from mteb.benchmarks._create_table import (
_create_summary_table_mean_public_private,
)

joint_table = _create_summary_table_mean_public_private(benchmark_results)
# For RTEB: all tasks are Retrieval type, so Retrieval column = Mean (Task)
joint_table = joint_table.rename(columns={"Retrieval": "Mean (Task)"})
@@ -123,6 +133,8 @@ class HUMEBenchmark(Benchmark):
def _create_summary_table(
self, benchmark_results: BenchmarkResults
) -> pd.DataFrame:
from mteb.benchmarks._create_table import _create_summary_table_mean_subset

return _create_summary_table_mean_subset(benchmark_results)


@@ -132,6 +144,8 @@ class MIEBBenchmark(Benchmark):
def _create_summary_table(
self, benchmark_results: BenchmarkResults
) -> pd.DataFrame:
from mteb.benchmarks._create_table import _create_summary_table_mean_task_type

return _create_summary_table_mean_task_type(benchmark_results)


@@ -141,6 +155,10 @@ class VidoreBenchmark(Benchmark):
def _create_summary_table(
self, benchmark_results: BenchmarkResults
) -> pd.DataFrame:
from mteb.benchmarks._create_table import (
_create_summary_table_mean_public_private,
)

joint_table = _create_summary_table_mean_public_private(benchmark_results)
# For ViDoRe (V1, V2, V3): all tasks are Document Understanding type, so Document Understanding column = Mean (Task)
joint_table = joint_table.rename(
4 changes: 2 additions & 2 deletions mteb/benchmarks/benchmarks/benchmarks.py
@@ -435,7 +435,7 @@
],
),
description="A curated set of MTEB tasks designed to evaluate systems in the context of medical information retrieval.",
reference="",
reference=None,
citation=None,
)

@@ -2589,7 +2589,7 @@
],
),
description="The HUME benchmark is designed to evaluate the performance of text embedding models and humans on a comparable set of tasks. This captures areas where models perform better than human annotators and the reverse. In the paper, we go further into the analysis and what conclusions can be drawn.",
reference="Coming soon (in review)",
reference=None,
citation=None,
contacts=["AdnanElAssadi56", "KennethEnevoldsen", "isaac-chung", "Samoed"],
)
11 changes: 9 additions & 2 deletions mteb/cache.py
@@ -8,7 +8,9 @@
from pathlib import Path
from typing import cast

import mteb
from mteb.abstasks import AbsTask
from mteb.benchmarks.benchmark import Benchmark
from mteb.models import ModelMeta
from mteb.results import BenchmarkResults, ModelResult, TaskResult
from mteb.types import ModelName, Revision
@@ -465,7 +467,7 @@ def _filter_paths_by_task(
def load_results(
self,
models: Sequence[str] | Sequence[ModelMeta] | None = None,
tasks: Sequence[str] | Sequence[AbsTask] | None = None,
tasks: Sequence[str] | Sequence[AbsTask] | Benchmark | str | None = None,
require_model_meta: bool = True,
include_remote: bool = True,
validate_and_filter: bool = False,
@@ -475,7 +477,7 @@ def load_results(

Args:
models: A list of model names to load the results for. If None it will load the results for all models.
tasks: A list of task names to load the results for. If None it will load the results for all tasks.
tasks: A list of task names to load the results for. If a string is passed, it is treated as a benchmark name and the corresponding benchmark is loaded.
If None it will load the results for all tasks.
require_model_meta: If True it will ignore results that do not have a model_meta.json file. If False it will attempt to
extract the model name and revision from the path.
include_remote: If True, it will include results from the remote repository.
@@ -497,6 +500,9 @@
... require_model_meta=True,
... )
"""
if isinstance(tasks, str):
tasks = mteb.get_benchmark(tasks)

paths = self.get_cache_paths(
models=models,
tasks=tasks,
@@ -546,6 +552,7 @@

benchmark_results = BenchmarkResults(
model_results=models_results,
benchmark=tasks if isinstance(tasks, Benchmark) else None,
)

return benchmark_results
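Putting the `cache.py` and `benchmark_results.py` changes together, passing the benchmark name as a plain string should now be equivalent to passing the `Benchmark` object, since `load_results` resolves it via `mteb.get_benchmark()` and attaches it to the returned `BenchmarkResults`. A short usage sketch (the model name is just an example):

```python
import mteb
from mteb.cache import ResultCache

cache = ResultCache()
cache.download_from_remote()  # fetch cached results from the remote repository

# Passing the benchmark name as a string: load_results resolves it via
# mteb.get_benchmark() and stores it on the returned BenchmarkResults.
results = cache.load_results(
    models=["intfloat/e5-small"],  # example model; any cached model works
    tasks="MTEB(eng, v2)",
)

# Because results.benchmark is set, the aggregated summary table is available.
print(results.get_benchmark_result())
```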
26 changes: 22 additions & 4 deletions mteb/results/benchmark_results.py
@@ -15,6 +15,7 @@
TaskDomain,
TaskType,
)
from mteb.benchmarks.benchmark import Benchmark
from mteb.models import ModelMeta
from mteb.models.get_model_meta import get_model_metas
from mteb.types import (
@@ -39,10 +40,10 @@ class BenchmarkResults(BaseModel):
"""

model_results: list[ModelResult]
model_config = (
ConfigDict( # to free up the name model_results which is otherwise protected
protected_namespaces=(),
)
benchmark: Benchmark | None = None
model_config = ConfigDict(
protected_namespaces=(), # to free up the name model_results which is otherwise protected
arbitrary_types_allowed=True, # Benchmark is dataclasses.dataclass
)

def __repr__(self) -> str:
@@ -362,6 +363,23 @@ def to_dataframe(
format=format,
)

def get_benchmark_result(self) -> pd.DataFrame:
"""Get aggregated scores for each model in the benchmark.

Uses the benchmark's summary table creation method to compute scores.

Returns:
A DataFrame with the aggregated benchmark scores for each model.
"""
if self.benchmark is None:
raise ValueError(
"No benchmark associated with these results (self.benchmark is None). "
"To get benchmark results, load results with a Benchmark object. "
"`results = cache.load_results(tasks='MTEB(eng, v2)')`"
)

return self.benchmark._create_summary_table(self)

def __iter__(self) -> Iterator[ModelResult]:
return iter(self.model_results)
