diff --git a/mteb/__init__.py b/mteb/__init__.py index 281faf7d77..bef5f7408d 100644 --- a/mteb/__init__.py +++ b/mteb/__init__.py @@ -10,8 +10,8 @@ CoIR, ) from mteb.evaluation import * -from mteb.load_results import load_results -from mteb.models import get_model, get_model_meta +from mteb.load_results import BenchmarkResults, load_results +from mteb.models import get_model, get_model_meta, get_model_metas from mteb.overview import TASKS_REGISTRY, get_task, get_tasks from .benchmarks.benchmarks import Benchmark @@ -31,8 +31,10 @@ "get_task", "get_model", "get_model_meta", + "get_model_metas", "load_results", "Benchmark", "get_benchmark", "get_benchmarks", + "BenchmarkResults", ] diff --git a/mteb/abstasks/AbsTask.py b/mteb/abstasks/AbsTask.py index 16f436ce03..78073f5666 100644 --- a/mteb/abstasks/AbsTask.py +++ b/mteb/abstasks/AbsTask.py @@ -310,3 +310,6 @@ def __repr__(self) -> str: return ( f"{self.__class__.__name__}(name='{self.metadata.name}', languages={langs})" ) + + def __hash__(self) -> int: + return hash(self.metadata) diff --git a/mteb/abstasks/AbsTaskBitextMining.py b/mteb/abstasks/AbsTaskBitextMining.py index be345b2f48..ea4667d9de 100644 --- a/mteb/abstasks/AbsTaskBitextMining.py +++ b/mteb/abstasks/AbsTaskBitextMining.py @@ -8,7 +8,7 @@ from mteb.encoder_interface import Encoder from ..evaluation.evaluators import BitextMiningEvaluator -from ..load_results.mteb_results import HFSubset, ScoresDict +from ..load_results.task_results import HFSubset, ScoresDict from .AbsTask import AbsTask, DescriptiveStatistics logger = logging.getLogger(__name__) diff --git a/mteb/abstasks/AbsTaskClassification.py b/mteb/abstasks/AbsTaskClassification.py index 36c0a76b96..788d3a5347 100644 --- a/mteb/abstasks/AbsTaskClassification.py +++ b/mteb/abstasks/AbsTaskClassification.py @@ -14,7 +14,7 @@ kNNClassificationEvaluatorPytorch, logRegClassificationEvaluator, ) -from ..load_results.mteb_results import HFSubset, ScoresDict +from ..load_results.task_results import HFSubset, ScoresDict from .AbsTask import AbsTask, DescriptiveStatistics logger = logging.getLogger(__name__) diff --git a/mteb/abstasks/AbsTaskClustering.py b/mteb/abstasks/AbsTaskClustering.py index 87113b2b26..bd0898f5ea 100644 --- a/mteb/abstasks/AbsTaskClustering.py +++ b/mteb/abstasks/AbsTaskClustering.py @@ -9,7 +9,7 @@ from datasets import Dataset from mteb.encoder_interface import Encoder, EncoderWithQueryCorpusEncode -from mteb.load_results.mteb_results import ScoresDict +from mteb.load_results.task_results import ScoresDict from ..evaluation.evaluators import ClusteringEvaluator from .AbsTask import AbsTask, DescriptiveStatistics diff --git a/mteb/abstasks/AbsTaskClusteringFast.py b/mteb/abstasks/AbsTaskClusteringFast.py index ba49b599dc..da665963f6 100644 --- a/mteb/abstasks/AbsTaskClusteringFast.py +++ b/mteb/abstasks/AbsTaskClusteringFast.py @@ -15,7 +15,7 @@ from mteb.encoder_interface import Encoder from ..evaluation.evaluators.model_encode import model_encode -from ..load_results.mteb_results import HFSubset +from ..load_results.task_results import HFSubset from .AbsTask import AbsTask, DescriptiveStatistics logger = logging.getLogger(__name__) diff --git a/mteb/abstasks/AbsTaskMultilabelClassification.py b/mteb/abstasks/AbsTaskMultilabelClassification.py index b9bb79f21c..ac66a4ecef 100644 --- a/mteb/abstasks/AbsTaskMultilabelClassification.py +++ b/mteb/abstasks/AbsTaskMultilabelClassification.py @@ -15,7 +15,7 @@ from mteb.encoder_interface import Encoder from ..evaluation.evaluators.model_encode import 
model_encode -from ..load_results.mteb_results import HFSubset, ScoresDict +from ..load_results.task_results import HFSubset, ScoresDict from .AbsTask import AbsTask, DescriptiveStatistics logger = logging.getLogger(__name__) diff --git a/mteb/abstasks/AbsTaskPairClassification.py b/mteb/abstasks/AbsTaskPairClassification.py index f06fcdcf4c..50c5076d17 100644 --- a/mteb/abstasks/AbsTaskPairClassification.py +++ b/mteb/abstasks/AbsTaskPairClassification.py @@ -7,7 +7,7 @@ from ..encoder_interface import Encoder, EncoderWithQueryCorpusEncode from ..evaluation.evaluators import PairClassificationEvaluator -from ..load_results.mteb_results import ScoresDict +from ..load_results.task_results import ScoresDict from .AbsTask import AbsTask, DescriptiveStatistics logger = logging.getLogger(__name__) diff --git a/mteb/abstasks/AbsTaskReranking.py b/mteb/abstasks/AbsTaskReranking.py index 0fba84b040..a0c8b0a3a5 100644 --- a/mteb/abstasks/AbsTaskReranking.py +++ b/mteb/abstasks/AbsTaskReranking.py @@ -5,7 +5,7 @@ from datasets import Dataset from mteb.encoder_interface import Encoder, EncoderWithQueryCorpusEncode -from mteb.load_results.mteb_results import ScoresDict +from mteb.load_results.task_results import ScoresDict from ..evaluation.evaluators import RerankingEvaluator from .AbsTask import AbsTask, DescriptiveStatistics diff --git a/mteb/abstasks/AbsTaskRetrieval.py b/mteb/abstasks/AbsTaskRetrieval.py index 3445e1576a..b57ea1bee8 100644 --- a/mteb/abstasks/AbsTaskRetrieval.py +++ b/mteb/abstasks/AbsTaskRetrieval.py @@ -13,7 +13,7 @@ from mteb.abstasks.TaskMetadata import HFSubset from ..evaluation.evaluators import RetrievalEvaluator -from ..load_results.mteb_results import ScoresDict +from ..load_results.task_results import ScoresDict from .AbsTask import AbsTask, DescriptiveStatistics logger = logging.getLogger(__name__) diff --git a/mteb/abstasks/AbsTaskSTS.py b/mteb/abstasks/AbsTaskSTS.py index 422162e8c3..157f285951 100644 --- a/mteb/abstasks/AbsTaskSTS.py +++ b/mteb/abstasks/AbsTaskSTS.py @@ -4,7 +4,7 @@ from typing import Any from ..evaluation.evaluators import STSEvaluator -from ..load_results.mteb_results import ScoresDict +from ..load_results.task_results import ScoresDict from .AbsTask import AbsTask, DescriptiveStatistics logger = logging.getLogger(__name__) diff --git a/mteb/abstasks/AbsTaskSpeedTask.py b/mteb/abstasks/AbsTaskSpeedTask.py index e764f607db..e9c144640c 100644 --- a/mteb/abstasks/AbsTaskSpeedTask.py +++ b/mteb/abstasks/AbsTaskSpeedTask.py @@ -8,7 +8,7 @@ import numpy as np from mteb.encoder_interface import Encoder, EncoderWithQueryCorpusEncode -from mteb.load_results.mteb_results import ScoresDict +from mteb.load_results.task_results import ScoresDict from .AbsTask import AbsTask diff --git a/mteb/abstasks/AbsTaskSummarization.py b/mteb/abstasks/AbsTaskSummarization.py index 4717d2a8cb..ff03fbaab3 100644 --- a/mteb/abstasks/AbsTaskSummarization.py +++ b/mteb/abstasks/AbsTaskSummarization.py @@ -6,7 +6,7 @@ import numpy as np from mteb.encoder_interface import Encoder -from mteb.load_results.mteb_results import ScoresDict +from mteb.load_results.task_results import ScoresDict from ..evaluation.evaluators import SummarizationEvaluator from .AbsTask import AbsTask, DescriptiveStatistics diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index b130e30a9c..a1638c9285 100644 --- a/mteb/abstasks/TaskMetadata.py +++ b/mteb/abstasks/TaskMetadata.py @@ -6,7 +6,7 @@ from typing import Annotated, Any, Union from pydantic import AnyUrl, BaseModel, 
BeforeValidator, TypeAdapter, field_validator -from typing_extensions import Literal +from typing_extensions import Annotated, Literal from ..languages import ( ISO_LANGUAGE_SCRIPT, @@ -352,3 +352,6 @@ def intext_citation(self, include_cite: bool = True) -> str: ) return f"\\cite{{{cite}}}" return cite + + def __hash__(self) -> int: + return hash(self.model_dump_json()) diff --git a/mteb/benchmarks/__init__.py b/mteb/benchmarks/__init__.py index fb1d12a293..653b97c6f7 100644 --- a/mteb/benchmarks/__init__.py +++ b/mteb/benchmarks/__init__.py @@ -1,3 +1,4 @@ from __future__ import annotations from mteb.benchmarks.benchmarks import * +from mteb.benchmarks.get_benchmark import * diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 2798a79df3..9667295df2 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -7,6 +7,12 @@ from pydantic import AnyUrl, BeforeValidator, TypeAdapter from mteb.abstasks.AbsTask import AbsTask +from mteb.load_results.benchmark_results import ( + BenchmarkResults, + ModelResult, + TaskResult, +) +from mteb.load_results.load_results import load_results from mteb.overview import get_tasks http_url_adapter = TypeAdapter(AnyUrl) @@ -52,6 +58,15 @@ def __len__(self) -> int: def __getitem__(self, index): return self.tasks[index] + def load_results( + self, base_results: None | BenchmarkResults = None + ) -> BenchmarkResults: + if base_results is None: + base_results = load_results() + return base_results.select_tasks(self.tasks) + + +MTEB_MAIN_MULTILINGUAL = Benchmark(name="MTEB(multilingual)", tasks=get_tasks()) MTEB_MAIN_EN = Benchmark( name="MTEB(eng)", diff --git a/mteb/benchmarks/get_benchmark.py b/mteb/benchmarks/get_benchmark.py index 2f7f3aa6d0..b60b40fc59 100644 --- a/mteb/benchmarks/get_benchmark.py +++ b/mteb/benchmarks/get_benchmark.py @@ -3,7 +3,7 @@ import difflib import mteb.benchmarks.benchmarks as benchmark_module -from mteb.benchmarks import Benchmark +from mteb.benchmarks.benchmarks import Benchmark BENCHMARK_REGISTRY = { inst.name: inst diff --git a/mteb/create_meta.py b/mteb/create_meta.py index 551331acdb..02ed273996 100644 --- a/mteb/create_meta.py +++ b/mteb/create_meta.py @@ -7,8 +7,8 @@ import yaml import mteb -from mteb import MTEBResults -from mteb.load_results.mteb_results import CQADupstackRetrievalDummy +from mteb import TaskResult +from mteb.load_results.task_results import CQADupstackRetrievalDummy def generate_readme(results_folder: Path, from_existing: Path | None = None) -> str: @@ -45,7 +45,7 @@ def load_model_name(results_folder: Path) -> str: return "PLACEHOLDER" -def process_task_result(task_result: MTEBResults) -> list[dict[str, Any]]: +def process_task_result(task_result: TaskResult) -> list[dict[str, Any]]: # CQADupstackRetrieval is a combined dataset (special case atm.) 
task = ( CQADupstackRetrievalDummy() @@ -84,13 +84,13 @@ def process_task_result(task_result: MTEBResults) -> list[dict[str, Any]]: return yaml_results -def get_task_results(results_folder: Path) -> list[MTEBResults]: +def get_task_results(results_folder: Path) -> list[TaskResult]: json_files = [ r for r in results_folder.glob("*.json") if r.is_file() and r.name != "model_meta.json" ] - task_results = [MTEBResults.from_disk(path) for path in json_files] + task_results = [TaskResult.from_disk(path) for path in json_files] task_results = [ results for results in task_results @@ -102,8 +102,8 @@ def get_task_results(results_folder: Path) -> list[MTEBResults]: def potentially_add_cqadupstack_to_results( - results: list[MTEBResults], -) -> list[MTEBResults]: + results: list[TaskResult], +) -> list[TaskResult]: task_list_cqa = { "CQADupstackAndroidRetrieval", "CQADupstackEnglishRetrieval", @@ -128,7 +128,7 @@ def potentially_add_cqadupstack_to_results( main_scores = [r.get_score(splits=["test"]) for r in cqa_results] main_score = float(sum(main_scores) / len(main_scores)) - combined_result = MTEBResults( + combined_result = TaskResult( task_name="CQADupstackRetrieval", dataset_revision="CQADupstackRetrieval_is_a_combined_dataset", mteb_version="NA", diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index a60ac09021..1517307c30 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -21,7 +21,7 @@ from ..abstasks import * from ..abstasks import AbsTask -from ..load_results.mteb_results import MTEBResults +from ..load_results.task_results import TaskResult from ..tasks import * from . import LangMapping @@ -317,7 +317,7 @@ def run( co2_tracker: bool = False, encode_kwargs: dict[str, Any] = {}, **kwargs, - ) -> list[MTEBResults]: + ) -> list[TaskResult]: """Run the evaluation pipeline on the selected tasks. Args: @@ -336,7 +336,7 @@ def run( kwargs: Additional arguments to be passed to `_run_eval` method and task.load_data. Returns: - A list of MTEBResults objects, one for each task evaluated. + A list of TaskResult objects, one for each task evaluated. """ if "batch_size" in kwargs: logger.warning( @@ -376,7 +376,7 @@ def run( logger.info( f"{task.metadata.name} results already exists. Loading results from disk. Set overwrite_results=True to overwrite." 
) - mteb_results = MTEBResults.from_disk(save_path) + mteb_results = TaskResult.from_disk(save_path) evaluation_results.append(mteb_results) del self.tasks[0] # empty memory continue @@ -437,7 +437,7 @@ def run( if verbosity >= 1: logger.info(f"Scores: {results}") - mteb_task_result = MTEBResults.from_task_results( + mteb_task_result = TaskResult.from_task_results( task, task_results, evaluation_time=evaluation_time, diff --git a/mteb/leaderboard/__init__.py b/mteb/leaderboard/__init__.py new file mode 100644 index 0000000000..d0122cfbfb --- /dev/null +++ b/mteb/leaderboard/__init__.py @@ -0,0 +1 @@ +from mteb.leaderboard.app import demo diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py new file mode 100644 index 0000000000..de23e0332d --- /dev/null +++ b/mteb/leaderboard/app.py @@ -0,0 +1,215 @@ +import functools +import json +from collections import defaultdict +from pathlib import Path + +import gradio as gr +import numpy as np +import pandas as pd +from gradio_rangeslider import RangeSlider + +import mteb +from mteb.leaderboard.table import scores_to_table + + +def load_results(): + results_cache_path = Path(__file__).parent.joinpath("__cached_results.json") + if not results_cache_path.exists(): + all_results = mteb.load_results() + all_results.to_disk(results_cache_path) + return all_results + else: + return mteb.BenchmarkResults.from_disk(results_cache_path) + + +all_results = load_results().filter_models() + +# Model sizes in million parameters +min_model_size, max_model_size = 8, 46703 + +benchmarks = mteb.get_benchmarks() + +default_benchmark = mteb.get_benchmark("MTEB(multilingual)") +default_results = default_benchmark.load_results(base_results=all_results) + +benchmark_select = gr.Dropdown( + [bench.name for bench in benchmarks], + value=default_benchmark.name, + label="Prebuilt Benchmarks", + info="Select one of our expert-selected benchmarks from MTEB publications.", +) +lang_select = gr.Dropdown( + default_results.languages, + value=default_results.languages, + multiselect=True, + label="Language", + info="Select languages to include.", +) +type_select = gr.Dropdown( + default_results.task_types, + value=default_results.task_types, + multiselect=True, + label="Task Type", + info="Select task types to include.", +) +domain_select = gr.Dropdown( + default_results.domains, + value=default_results.domains, + multiselect=True, + label="Domain", + info="Select domains to include.", +) +task_select = gr.Dropdown( + default_results.task_names, + value=default_results.task_names, + multiselect=True, + label="Task", + info="Select specific tasks to include", +) + +css = """ +.scrollable { + overflow-y: scroll; + max-height: 400px +} +""" + +with gr.Blocks(fill_width=True, theme=gr.themes.Base(), css=css) as demo: + gr.Markdown( + """ + ### Model Selection + Select models to rank based on an assortment of criteria. 
+ """ + ) + with gr.Group(): + with gr.Row(): + with gr.Column(): + availability = gr.Radio( + [("Only Open", True), ("Only Proprietary", False), ("Both", None)], + value=None, + label="Availability", + interactive=True, + ) + compatibility = gr.CheckboxGroup( + [ + ( + "Should be sentence-transformers compatible", + "sbert_compatible", + ) + ], + value=[], + label="Compatibility", + interactive=True, + ) + with gr.Column(): + instructions = gr.Radio( + [ + ("Only Instruction-tuned", True), + ("Only non-instruction", False), + ("Both", None), + ], + value=None, + label="Instructions", + interactive=True, + ) + model_size = RangeSlider( + minimum=0, + maximum=8000, + value=(0, 8000), + label="Model Size (#M Parameters)", + interactive=True, + ) + + gr.Markdown( + """ + ### Benchmarks + Select one of the hand-curated benchmarks from our publication. + Or create one from scratch based on your use case. + """ + ) + with gr.Group(elem_classes="scrollable"): + with gr.Row(): + with gr.Column(): + benchmark_select.render() + with gr.Row(): + lang_select.render() + type_select.render() + with gr.Row(): + domain_select.render() + with gr.Column(): + # with gr.Accordion("Add and remove tasks:", open=False): + task_select.render() + scores = gr.State(default_results.get_scores(format="long")) + dataframe = gr.DataFrame( + scores_to_table, + inputs=[scores], + ) + + @gr.on( + inputs=[benchmark_select], + outputs=[ + lang_select, + type_select, + domain_select, + ], + ) + def on_select_benchmark(benchmark_name): + benchmark = mteb.get_benchmark(benchmark_name) + benchmark_results = benchmark.load_results(base_results=all_results) + return ( + benchmark_results.languages, + benchmark_results.task_types, + benchmark_results.domains, + ) + + @gr.on( + inputs=[benchmark_select, lang_select, type_select, domain_select], + outputs=[task_select], + ) + def update_task_list(benchmark_name, languages, task_types, domains): + benchmark = mteb.get_benchmark(benchmark_name) + benchmark_results = benchmark.load_results(base_results=all_results) + task_to_lang_set = defaultdict(set) + task_to_type = dict() + task_to_domains = defaultdict(set) + for model_res in benchmark_results: + for task_res in model_res: + task_to_lang_set[task_res.task_name] |= set(task_res.languages) + task_to_domains[task_res.task_name] |= set(task_res.domains) + task_to_type[task_res.task_name] = task_res.task_type + res = [] + for task_name in benchmark_results.task_names: + if not (task_to_domains[task_name] & set(domains)): + continue + if not (task_to_lang_set[task_name] & set(languages)): + continue + if not (task_to_type[task_name] in task_types): + continue + res.append(task_name) + return res + + @gr.on( + inputs=[ + benchmark_select, + task_select, + lang_select, + type_select, + domain_select, + ], + outputs=[scores], + ) + def update_scores(benchmark_name, task_names, languages, task_types, domains): + benchmark = mteb.get_benchmark(benchmark_name) + benchmark_results = benchmark.load_results(base_results=all_results) + benchmark_results = benchmark_results.filter_tasks( + languages=languages, + task_names=task_names, + task_types=task_types, + domains=domains, + ) + scores = benchmark_results.get_scores(languages=languages, format="long") + return scores + + +if __name__ == "__main__": + demo.launch() diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py new file mode 100644 index 0000000000..61090c9034 --- /dev/null +++ b/mteb/leaderboard/table.py @@ -0,0 +1,43 @@ +import numpy as np +import pandas as pd + +from 
mteb.overview import get_task + + +def scores_to_table(scores_long: list[dict]): + data = pd.DataFrame.from_records(scores_long) + data["task_type"] = data["task_name"].map( + lambda task_name: get_task(task_name).metadata.type + ) + mean_per_type = ( + data.groupby(["model_name", "model_revision", "task_type"])[["score"]] + .agg(np.nanmean) + .reset_index() + ) + typed_mean = ( + mean_per_type.groupby(["model_name", "model_revision"])[["score"]] + .agg(np.nanmean) + .rename(columns={"score": "mean_by_task_type"}) + ) + mean_per_type = mean_per_type.pivot( + index=["model_name", "model_revision"], columns="task_type", values="score" + ) + per_task = data.pivot( + index=["model_name", "model_revision"], columns="task_name", values="score" + ) + overall_mean = ( + data.groupby(["model_name", "model_revision"])[["score"]] + .agg(np.nanmean) + .rename(columns={"score": "mean"}) + ) + joint_table = overall_mean.join([typed_mean, mean_per_type, per_task]).reset_index() + joint_table = joint_table.sort_values("mean", ascending=False) + joint_table = joint_table.rename( + columns={ + "model_name": "Model", + "mean_by_task_type": "Mean by Task Type", + "mean": "Mean", + } + ) + joint_table = joint_table.drop(columns=["model_revision"]) + return joint_table diff --git a/mteb/load_results/__init__.py b/mteb/load_results/__init__.py index 3b08f6eb4d..aee4201d39 100644 --- a/mteb/load_results/__init__.py +++ b/mteb/load_results/__init__.py @@ -1,6 +1,7 @@ from __future__ import annotations +from .benchmark_results import BenchmarkResults, ModelResult from .load_results import load_results -from .mteb_results import MTEBResults +from .task_results import TaskResult -__all__ = ["load_results", "MTEBResults"] +__all__ = ["load_results", "TaskResult", "ModelResult", "BenchmarkResults"] diff --git a/mteb/load_results/benchmark_results.py b/mteb/load_results/benchmark_results.py new file mode 100644 index 0000000000..8878f276ba --- /dev/null +++ b/mteb/load_results/benchmark_results.py @@ -0,0 +1,318 @@ +from __future__ import annotations + +import json +from collections import defaultdict +from pathlib import Path +from typing import Any, Callable, Iterable, Literal + +import numpy as np +import pandas as pd +from pydantic import BaseModel, ConfigDict + +from mteb.abstasks.AbsTask import AbsTask, ScoresDict +from mteb.abstasks.TaskMetadata import ( + ISO_LANGUAGE_SCRIPT, + TASK_CATEGORY, + TASK_DOMAIN, + TASK_TYPE, +) +from mteb.languages import ISO_LANGUAGE +from mteb.load_results.task_results import TaskResult +from mteb.models.overview import get_model_metas +from mteb.overview import get_tasks + +Split = str +Score = Any + + +class ModelResult(BaseModel): + model_name: str + model_revision: str | None + task_results: list[TaskResult] + model_config = ConfigDict( + protected_namespaces=(), + ) + + def __repr__(self) -> str: + n_entries = len(self.task_results) + return f"ModelResult(model_name={self.model_name}, model_revision={self.model_revision}, task_results=[...](#{n_entries}))" + + def filter_tasks( + self, + task_names: list[str] | None = None, + languages: list[str] | None = None, + domains: list[TASK_DOMAIN] | None = None, + task_types: list[TASK_TYPE] | None = None, + ) -> "ModelResult": + new_task_results = [] + for task_result in self.task_results: + if (task_names is not None) and (task_result.task_name not in task_names): + continue + if languages is not None: + task_languages = task_result.languages + if not any([lang in task_languages for lang in languages]): + continue + if domains is 
not None: + task_domains = task_result.domains + if not any([domain in task_domains for domain in domains]): + continue + if (task_types is not None) and (task_result.task_type not in task_types): + continue + new_task_results.append(task_result) + return type(self)( + model_name=self.model_name, + model_revision=self.model_revision, + task_results=new_task_results, + ) + + def select_tasks(self, tasks: list[AbsTask]) -> "ModelResult": + task_name_to_task = {task.metadata.name: task for task in tasks} + new_task_results = [ + task_res.validate_and_filter_scores(task_name_to_task[task_res.task_name]) + for task_res in self.task_results + if task_res.task_name in task_name_to_task + ] + return type(self)( + model_name=self.model_name, + model_revision=self.model_revision, + task_results=new_task_results, + ) + + def get_scores( + self, + splits: list[Split] | None = None, + languages: list[ISO_LANGUAGE | ISO_LANGUAGE_SCRIPT] | None = None, + scripts: list[ISO_LANGUAGE_SCRIPT] | None = None, + getter: Callable[[ScoresDict], Score] = lambda scores: scores["main_score"], + aggregation: Callable[[list[Score]], Any] = np.mean, + format: Literal["wide", "long"] = "wide", + ) -> dict | list: + if format == "wide": + scores = { + res.task_name: res.get_score( + splits=splits, + languages=languages, + scripts=scripts, + getter=getter, + aggregation=aggregation, + ) + for res in self.task_results + } + return scores + if format == "long": + entries = [] + for task_res in self.task_results: + entry = dict( + model_name=self.model_name, + model_revision=self.model_revision, + task_name=task_res.task_name, + score=task_res.get_score( + splits=splits, + languages=languages, + getter=getter, + aggregation=aggregation, + ), + mteb_version=task_res.mteb_version, + dataset_revision=task_res.dataset_revision, + evaluation_time=task_res.evaluation_time, + kg_co2_emissions=task_res.kg_co2_emissions, + ) + entries.append(entry) + return entries + + def __iter__(self): + return iter(self.task_results) + + def __getitem__(self, index) -> TaskResult: + return self.task_results[index] + + @property + def languages(self) -> list[str]: + langs = [] + for task_res in self.task_results: + langs.extend(task_res.languages) + return list(set(langs)) + + @property + def domains(self) -> list[str]: + ds = [] + for task_res in self.task_results: + ds.extend(task_res.domains) + return list(set(ds)) + + @property + def task_types(self) -> list[str]: + return list(set([task_res.task_type for task_res in self.task_results])) + + @property + def task_names(self) -> list[str]: + return [task_res.task_name for task_res in self.task_results] + + +class BenchmarkResults(BaseModel): + model_results: list[ModelResult] + model_config = ConfigDict( + protected_namespaces=(), + ) + + def __repr__(self) -> str: + n_models = len(self.model_results) + return f"BenchmarkResults(model_results=[...](#{n_models}))" + + def filter_tasks( + self, + task_names: list[str] | None = None, + languages: list[str] | None = None, + domains: list[TASK_DOMAIN] | None = None, + task_types: list[TASK_TYPE] | None = None, + ) -> "BenchmarkResults": + model_results = [ + res.filter_tasks( + task_names=task_names, + languages=languages, + domains=domains, + task_types=task_types, + ) + for res in self.model_results + ] + return type(self)( + model_results=[res for res in model_results if res.task_results] + ) + + def select_tasks(self, tasks: list[AbsTask]) -> "BenchmarkResults": + new_model_results = [ + model_res.select_tasks(tasks) for model_res in 
self.model_results + ] + return type(self)(model_results=new_model_results) + + def filter_models( + self, + model_names: Iterable[str] | None = None, + languages: Iterable[str] | None = None, + open_source: bool | None = None, + frameworks: Iterable[str] | None = None, + n_parameters_range: tuple[int | None, int | None] = (None, None), + ) -> "BenchmarkResults": + model_metas = get_model_metas( + model_names, languages, open_source, frameworks, n_parameters_range + ) + model_revision_pairs = {(meta.name, meta.revision) for meta in model_metas} + new_model_results = [] + for model_res in self: + if (model_res.model_name, model_res.model_revision) in model_revision_pairs: + new_model_results.append(model_res) + return type(self)(model_results=new_model_results) + + def get_scores( + self, + splits: list[Split] | None = None, + languages: list[ISO_LANGUAGE | ISO_LANGUAGE_SCRIPT] | None = None, + scripts: list[ISO_LANGUAGE_SCRIPT] | None = None, + getter: Callable[[ScoresDict], Score] = lambda scores: scores["main_score"], + aggregation: Callable[[list[Score]], Any] = np.mean, + format: Literal["wide", "long"] = "wide", + ) -> list[dict]: + entries = [] + if format == "wide": + for model_res in self: + model_scores = model_res.get_scores( + splits=splits, + languages=languages, + scripts=scripts, + getter=getter, + aggregation=aggregation, + format="wide", + ) + entries.append( + { + "model": model_res.model_name, + "revision": model_res.model_revision, + **model_scores, + } + ) + if format == "long": + for model_res in self: + entries.extend( + model_res.get_scores( + splits=splits, + languages=languages, + scripts=scripts, + getter=getter, + aggregation=aggregation, + format="long", + ) + ) + return entries + + def __iter__(self): + return iter(self.model_results) + + def __getitem__(self, index) -> ModelResult: + return self.model_results[index] + + def to_legacy_dict(self) -> dict[str, dict[str, list[TaskResult]]]: + res = defaultdict(dict) + for model_res in self: + res[model_res.model_name][model_res.model_revision] = model_res.task_results + return res + + @classmethod + def from_legacy_dict(cls, legacy: dict[str, dict[str, list[TaskResult]]]): + model_results = [] + for model_name, revisions in legacy.items(): + for model_revision, results in revisions.items(): + model_results.append( + ModelResult( + model_name=model_name, + model_revision=model_revision, + task_results=results, + ) + ) + return cls(model_results=model_results) + + def to_dict(self) -> dict: + return self.model_dump() + + @classmethod + def from_dict(cls, data: dict) -> TaskResult: + return cls.model_validate(data) + + def to_disk(self, path: Path | str) -> None: + path = Path(path) + with path.open("w") as out_file: + out_file.write(self.model_dump_json(indent=2)) + + @classmethod + def from_disk(cls, path: Path | str) -> "BenchmarkResults": + path = Path(path) + with path.open() as in_file: + data = json.loads(in_file.read()) + return cls.from_dict(data) + + @property + def languages(self) -> list[str]: + langs = [] + for model_res in self.model_results: + langs.extend(model_res.languages) + return list(set(langs)) + + @property + def domains(self) -> list[str]: + ds = [] + for model_res in self.model_results: + ds.extend(model_res.domains) + return list(set(ds)) + + @property + def task_types(self) -> list[str]: + ts = [] + for model_res in self.model_results: + ts.extend(model_res.task_types) + return list(set(ts)) + + @property + def task_names(self) -> list[str]: + names = [] + for model_res in 
self.model_results: + names.extend(model_res.task_names) + return list(set(names)) diff --git a/mteb/load_results/load_results.py b/mteb/load_results/load_results.py index aca6bd6835..6d42cf1dbd 100644 --- a/mteb/load_results/load_results.py +++ b/mteb/load_results/load_results.py @@ -9,15 +9,14 @@ from pathlib import Path from mteb.abstasks.AbsTask import AbsTask -from mteb.load_results.mteb_results import MTEBResults +from mteb.load_results.benchmark_results import BenchmarkResults, ModelResult +from mteb.load_results.task_results import TaskResult from mteb.model_meta import ModelMeta logger = logging.getLogger(__name__) MODEL_NAME = str REVISION = str -RESULTS = dict[MODEL_NAME, dict[REVISION, list[MTEBResults]]] - def download_of_results( results_repo: str, cache_directory: Path | None = None, download_latest: bool = True @@ -92,7 +91,7 @@ def load_results( tasks: Sequence[AbsTask] | Sequence[str] | None = None, validate_and_filter: bool = True, require_model_meta: bool = True, -) -> RESULTS: +) -> BenchmarkResults: """Loads the results from the latest version of the results repository. The results are cached locally in the MTEB_CACHE directory. This directory can be set using the MTEB_CACHE environment variable or defaults to "~/.cache/mteb". @@ -107,29 +106,7 @@ def load_results( splits from the results object that are not default in the task metadata. Defaults to True. Returns: - A dictionary where the keys are the model names and the values are dictionaries where the keys are the revisions and the values are lists of MTEBResults objects. - - Example: - >>> results = load_results() - >>> results - {'mixedbread-ai/mxbai-embed-large-v1': - {'990580e27d329c7408b3741ecff85876e128e203': [ - MTEBResults(task_name=TwentyNewsgroupsClustering.v2, scores=...), - MTEBResults(task_name=MedrxivClusteringP2P, scores=...), - MTEBResults(task_name=StackExchangeClustering, scores=...), - MTEBResults(task_name=BiorxivClusteringP2P.v2, scores=...), - MTEBResults(task_name=MedrxivClusteringS2S.v2, scores=...), - MTEBResults(task_name=MedrxivClusteringS2S, scores=...), - ... - ]}, - 'intfloat/multilingual-e5-small': - {'e4ce9877abf3edfe10b0d82785e83bdcb973e22e': [ - MTEBResults(task_name=IndicGenBenchFloresBitextMining, scores=...), - MTEBResults(task_name=PpcPC, scores=...), - MTEBResults(task_name=TwentyNewsgroupsClustering.v2, scores=...), - ... - ]}, - ... 
+ """ repo_directory = download_of_results(results_repo, download_latest=download_latest) model_paths = [p for p in (repo_directory / "results").glob("*") if p.is_dir()] @@ -144,16 +121,15 @@ def load_results( else: models_to_keep = None + task_names = {} if tasks is not None: - task_names = {} for task in tasks: if isinstance(task, AbsTask): task_names[task.metadata.name] = task else: task_names[task] = None - results = defaultdict(dict) - + model_results = [] for model_path in model_paths: model_revisions = model_path.glob("*") @@ -174,7 +150,7 @@ def load_results( task_json_files = [ f for f in revision_path.glob("*.json") if "model_meta.json" != f.name ] - _results = [MTEBResults.from_disk(f) for f in task_json_files] + _results = [TaskResult.from_disk(f) for f in task_json_files] # filter out tasks that are not in the tasks list if tasks is not None: @@ -184,14 +160,23 @@ def load_results( filtered_results = [] for r in _results: try: - r.validate_and_filter_scores(task_names[r.task_name]) + if task_names: + task = task_names[r.task_name] + else: + task = None + r = r.validate_and_filter_scores(task=task) filtered_results.append(r) except Exception as e: logger.warning( f"Validation failed for {r.task_name} in {model_name} {revision}: {e}" ) _results = filtered_results + model_results.append( + ModelResult( + model_name=model_name, + model_revision=revision, + task_results=_results, + ) + ) - results[model_name][revision] = _results - - return dict(results) + return BenchmarkResults(model_results=model_results) diff --git a/mteb/load_results/mteb_results.py b/mteb/load_results/task_results.py similarity index 89% rename from mteb/load_results/mteb_results.py rename to mteb/load_results/task_results.py index 49cf3a710a..aa9bf58359 100644 --- a/mteb/load_results/mteb_results.py +++ b/mteb/load_results/task_results.py @@ -4,6 +4,7 @@ import logging from argparse import Namespace from collections import defaultdict +from functools import cached_property from importlib.metadata import version from pathlib import Path from typing import Any, Callable @@ -13,10 +14,7 @@ from pydantic import BaseModel, field_validator from mteb.abstasks.AbsTask import AbsTask, ScoresDict -from mteb.abstasks.TaskMetadata import ( - ISO_LANGUAGE_SCRIPT, - HFSubset, -) +from mteb.abstasks.TaskMetadata import ISO_LANGUAGE_SCRIPT, HFSubset from mteb.languages import ISO_LANGUAGE, LanguageScripts Split = str @@ -116,7 +114,7 @@ class ScalaSvClassificationDummy: } -class MTEBResults(BaseModel): +class TaskResult(BaseModel): """A class to represent the MTEB result. Attributes: @@ -142,7 +140,7 @@ class MTEBResults(BaseModel): ... }, ... } >>> sample_task = ... 
# some MTEB task - >>> mteb_results = MTEBResults.from_task_results(sample_task, scores) + >>> mteb_results = TaskResult.from_task_results(sample_task, scores) >>> mteb_results.get_score() # get the main score for all languages 0.55 >>> mteb_results.get_score(languages=["fra"]) # get the main score for French @@ -170,7 +168,7 @@ def from_task_results( scores: dict[Split, dict[HFSubset, ScoresDict]], evaluation_time: float, kg_co2_emissions: float | None = None, - ) -> MTEBResults: + ) -> TaskResult: task_meta = task.metadata subset2langscripts = task_meta.hf_subsets_to_langscripts flat_scores = defaultdict(list) @@ -184,7 +182,7 @@ def from_task_results( } flat_scores[split].append(_scores) - return MTEBResults( + return TaskResult( dataset_revision=task.metadata.dataset["revision"], task_name=task.metadata.name, mteb_version=version("mteb"), @@ -219,11 +217,36 @@ def _validate_scores_dict(scores: ScoresDict) -> None: except Exception as e: raise ValueError(f"Scores are not json serializable: {e}") + @property + def languages(self) -> list[str]: + langs = [] + for split, split_res in self.scores.items(): + for entry in split_res: + langs.extend([lang.split("-")[0] for lang in entry["languages"]]) + return list(set(langs)) + + @cached_property + def task(self) -> AbsTask: + from mteb.overview import get_task + + return get_task(self.task_name) + + @property + def domains(self) -> list[str]: + doms = self.task.metadata.domains + if doms is None: + doms = [] + return doms + + @property + def task_type(self) -> str: + return self.task.metadata.type + def to_dict(self) -> dict: return self.model_dump() @classmethod - def from_dict(cls, data: dict) -> MTEBResults: + def from_dict(cls, data: dict) -> TaskResult: return cls.model_validate(data) def _round_scores(self, scores: dict[Split, list[ScoresDict]], n: int) -> None: @@ -249,8 +272,8 @@ def to_disk(self, path: Path) -> None: json.dump(json_obj, f, indent=2) @classmethod - def from_disk(cls, path: Path, load_historic_data: bool = True) -> MTEBResults: # type: ignore - """Load MTEBResults from disk. + def from_disk(cls, path: Path, load_historic_data: bool = True) -> TaskResult: # type: ignore + """Load TaskResult from disk. Args: path: The path to the file to load. @@ -264,7 +287,7 @@ def from_disk(cls, path: Path, load_historic_data: bool = True) -> MTEBResults: return cls.model_validate(data) except Exception as e: raise ValueError( - f"Error loading MTEBResults from disk. You can try to load historic data by setting `load_historic_data=True`. Error: {e}" + f"Error loading TaskResult from disk. You can try to load historic data by setting `load_historic_data=True`. Error: {e}" ) pre_1_11_load = ( @@ -280,7 +303,7 @@ def from_disk(cls, path: Path, load_historic_data: bool = True) -> MTEBResults: if not pre_1_11_load: raise e logger.debug( - f"Could not load MTEBResults from disk, got error: {e}. Attempting to load from disk using format from before v1.11.0" + f"Could not load TaskResult from disk, got error: {e}. 
Attempting to load from disk using format from before v1.11.0" ) obj = cls._convert_from_before_v1_11_0(data) @@ -294,7 +317,7 @@ def from_disk(cls, path: Path, load_historic_data: bool = True) -> MTEBResults: return obj @classmethod - def _fix_pair_classification_scores(cls, obj: MTEBResults) -> None: + def _fix_pair_classification_scores(cls, obj: TaskResult) -> None: from mteb import get_task task_name = obj.task_name @@ -314,7 +337,7 @@ def _fix_pair_classification_scores(cls, obj: MTEBResults) -> None: hf_subset_scores.pop(key) @classmethod - def _convert_from_before_v1_11_0(cls, data: dict) -> MTEBResults: + def _convert_from_before_v1_11_0(cls, data: dict) -> TaskResult: from mteb.overview import TASKS_REGISTRY # in case the task name is not found in the registry, try to find a lower case version @@ -394,7 +417,7 @@ def _convert_from_before_v1_11_0(cls, data: dict) -> MTEBResults: if "test" in scores and "fr" in scores["test"]: scores["test"]["fra-fra"] = scores["test"].pop("fr") - result: MTEBResults = MTEBResults.from_task_results( + result: TaskResult = TaskResult.from_task_results( task, # type: ignore scores, evaluation_time, @@ -444,11 +467,12 @@ def get_score( return aggregation(values) def __repr__(self) -> str: - return f"MTEBResults(task_name={self.task_name}, scores=...)" + return f"TaskResult(task_name={self.task_name}, scores=...)" - def validate_and_filter_scores(self, task: AbsTask | None = None) -> None: + def validate_and_filter_scores(self, task: AbsTask | None = None) -> AbsTask: """This ensures that the scores are correct for the given task, by removing any splits besides those specified in the task metadata. Additionally it also ensure that all of the splits required as well as the languages are present in the scores. + Returns new TaskResult object. Args: task: The task to validate the scores against. E.g. 
if the task supplied is limited to certain splits and languages, @@ -459,30 +483,32 @@ def validate_and_filter_scores(self, task: AbsTask | None = None) -> None: if task is None: task = get_task(self.task_name) splits = task.metadata.eval_splits - hf_subsets = set(task.metadata.hf_subsets_to_langscripts) - + if task.is_multilingual: + hf_subsets = getattr( + task, "hf_subsets", task.metadata.hf_subsets_to_langscripts.keys() + ) + hf_subsets = set(hf_subsets) + else: + hf_subsets = {"default"} new_scores = {} seen_splits = set() - for split in self.scores: + for split in task_result.scores: if split not in splits: continue new_scores[split] = [] - seen_subsets = set() - for _scores in self.scores[split]: + for _scores in task_result.scores[split]: if _scores["hf_subset"] not in hf_subsets: continue new_scores[split].append(_scores) seen_subsets.add(_scores["hf_subset"]) - if seen_subsets != hf_subsets: raise ValueError( f"Missing subsets {hf_subsets - seen_subsets} for split {split}" ) - seen_splits.add(split) - if seen_splits != set(splits): raise ValueError(f"Missing splits {set(splits) - seen_splits}") - - self.scores = new_scores + new_res = {**task_result.to_dict(), "scores": new_scores} + new_res = TaskResult.from_dict(new_res) + return new_res diff --git a/mteb/models/__init__.py b/mteb/models/__init__.py index 82fc803df3..50bfc937d9 100644 --- a/mteb/models/__init__.py +++ b/mteb/models/__init__.py @@ -20,142 +20,11 @@ mxbai_models, nomic_models, openai_models, - promptriever_models, - repllama_models, ru_sentence_models, salesforce_models, sentence_transformers_models, voyage_models, ) +from mteb.models.overview import * logger = logging.getLogger(__name__) - - -def get_model( - model_name: str, revision: str | None = None, **kwargs: Any -) -> Encoder | EncoderWithQueryCorpusEncode: - """A function to fetch a model object by name. - - Args: - model_name: Name of the model to fetch - revision: Revision of the model to fetch - **kwargs: Additional keyword arguments to pass to the model loader - - Returns: - A model object - """ - meta = get_model_meta(model_name, revision) - model = meta.load_model(**kwargs) - - # If revision not available in the modelmeta, try to extract it from sentence-transformers - if meta.revision is None and isinstance(model, SentenceTransformer): - _meta = model_meta_from_sentence_transformers(model) - meta.revision = _meta.revision if _meta.revision else meta.revision - - model.mteb_model_meta = meta # type: ignore - return model - - -def get_model_meta(model_name: str, revision: str | None = None) -> ModelMeta: - """A function to fetch a model metadata object by name. - - Args: - model_name: Name of the model to fetch - revision: Revision of the model to fetch - - Returns: - A model metadata object - """ - if model_name in models: - if revision and (not models[model_name].revision == revision): - raise ValueError( - f"Model revision {revision} not found for model {model_name}. Expected {models[model_name].revision}." - ) - return models[model_name] - - # assume it is a sentence-transformers model - logger.info( - "Model not found in model registry, assuming it is a sentence-transformers model." - ) - logger.info( - f"Attempting to extract metadata by loading the model ({model_name}) using sentence-transformers." 
- ) - model = SentenceTransformer(model_name, revision=revision, trust_remote_code=True) - meta = model_meta_from_sentence_transformers(model) - - meta.revision = revision - meta.name = model_name - return meta - - -def model_meta_from_sentence_transformers( - model: CrossEncoder | SentenceTransformer, -) -> ModelMeta: - if isinstance(model, SentenceTransformer): - name = ( - model.model_card_data.model_name - if model.model_card_data.model_name - else model.model_card_data.base_model - ) - languages = ( - [model.model_card_data.language] - if isinstance(model.model_card_data.language, str) - else model.model_card_data.language - ) - meta = ModelMeta( - name=name, - revision=model.model_card_data.base_model_revision, - release_date=None, - languages=languages, - framework=["Sentence Transformers"], - similarity_fn_name=model.similarity_fn_name, - ) - elif isinstance(model, CrossEncoder): - meta = ModelMeta( - name=model.config._name_or_path, - revision=None, - release_date=None, - languages=None, - framework=["Sentence Transformers"], - similarity_fn_name=None, - ) - else: - logger.warning( - "Failed to extract metadata from model. Upgrading to sentence-transformers v3.0.0 or above is recommended." - ) - meta = ModelMeta( - name=None, - revision=None, - languages=None, - release_date=None, - ) - return meta - - -model_modules = [ - bge_models, - bm25, - cohere_models, - e5_instruct, - e5_models, - google_models, - gritlm_models, - gte_models, - llm2vec_models, - mxbai_models, - nomic_models, - openai_models, - promptriever_models, - repllama_models, - ru_sentence_models, - salesforce_models, - sentence_transformers_models, - voyage_models, - google_models, -] -models = {} - -for module in model_modules: - for mdl in vars(module).values(): - if isinstance(mdl, ModelMeta): - models[mdl.name] = mdl diff --git a/mteb/models/overview.py b/mteb/models/overview.py new file mode 100644 index 0000000000..a8065a99ab --- /dev/null +++ b/mteb/models/overview.py @@ -0,0 +1,182 @@ +from __future__ import annotations + +import logging +from typing import Any, Iterable + +from sentence_transformers import SentenceTransformer + +from mteb.encoder_interface import Encoder, EncoderWithQueryCorpusEncode +from mteb.model_meta import ModelMeta +from mteb.models import ( + bge_models, + bm25, + cohere_models, + e5_instruct, + e5_models, + google_models, + gritlm_models, + gte_models, + llm2vec_models, + mxbai_models, + nomic_models, + openai_models, + ru_sentence_models, + salesforce_models, + sentence_transformers_models, + voyage_models, +) + +logger = logging.getLogger(__name__) + +model_modules = [ + bge_models, + bm25, + cohere_models, + e5_instruct, + e5_models, + google_models, + gritlm_models, + gte_models, + llm2vec_models, + mxbai_models, + nomic_models, + openai_models, + ru_sentence_models, + salesforce_models, + sentence_transformers_models, + voyage_models, + google_models, +] +MODEL_REGISTRY = {} + +for module in model_modules: + for mdl in vars(module).values(): + if isinstance(mdl, ModelMeta): + MODEL_REGISTRY[mdl.name] = mdl + + +def get_model_metas( + model_names: Iterable[str] | None = None, + languages: Iterable[str] | None = None, + open_source: bool | None = None, + frameworks: Iterable[str] | None = None, + n_parameters_range: tuple[int | None, int | None] = (None, None), +) -> list[ModelMeta]: + """Load all models' metadata that fit the specified criteria.""" + res = [] + model_names = set(model_names) if model_names is not None else None + languages = set(languages) if languages is not 
None else None + frameworks = set(frameworks) if frameworks is not None else None + for model_meta in MODEL_REGISTRY.values(): + if (model_names is not None) and (model_meta.name not in model_names): + continue + if languages is not None: + if (model_meta.languages is None) or not ( + languages <= set(model_meta.languages) + ): + continue + if (open_source is not None) and (model_meta.open_source != open_source): + continue + if (frameworks is not None) and not (frameworks <= set(model_meta.framework)): + continue + upper, lower = n_parameters_range + n_parameters = model_meta.n_parameters + if upper is not None: + if (n_parameters is None) or (n_parameters > upper): + continue + if lower is not None: + if (n_parameters is None) or (n_parameters < lower): + continue + res.append(model_meta) + return res + + +def get_model( + model_name: str, revision: str | None = None, **kwargs: Any +) -> Encoder | EncoderWithQueryCorpusEncode: + """A function to fetch a model object by name. + + Args: + model_name: Name of the model to fetch + revision: Revision of the model to fetch + **kwargs: Additional keyword arguments to pass to the model loader + + Returns: + A model object + """ + meta = get_model_meta(model_name, revision) + model = meta.load_model(**kwargs) + + # If revision not available in the modelmeta, try to extract it from sentence-transformers + if meta.revision is None and isinstance(model, SentenceTransformer): + _meta = model_meta_from_sentence_transformers(model) + meta.revision = _meta.revision if _meta.revision else meta.revision + + model.mteb_model_meta = meta # type: ignore + return model + + +def get_model_meta(model_name: str, revision: str | None = None) -> ModelMeta: + """A function to fetch a model metadata object by name. + + Args: + model_name: Name of the model to fetch + revision: Revision of the model to fetch + + Returns: + A model metadata object + """ + if model_name in MODEL_REGISTRY: + if revision and (not MODEL_REGISTRY[model_name].revision == revision): + raise ValueError( + f"Model revision {revision} not found for model {model_name}. Expected {MODEL_REGISTRY[model_name].revision}." + ) + return MODEL_REGISTRY[model_name] + else: # assume it is a sentence-transformers model + logger.info( + "Model not found in model registry, assuming it is a sentence-transformers model." + ) + logger.info( + f"Attempting to extract metadata by loading the model ({model_name}) using sentence-transformers." + ) + model = SentenceTransformer( + model_name, revision=revision, trust_remote_code=True + ) + meta = model_meta_from_sentence_transformers(model) + + meta.revision = revision + meta.name = model_name + return meta + + +def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMeta: + try: + name = ( + model.model_card_data.model_name + if model.model_card_data.model_name + else model.model_card_data.base_model + ) + languages = ( + [model.model_card_data.language] + if isinstance(model.model_card_data.language, str) + else model.model_card_data.language + ) + meta = ModelMeta( + name=name, + revision=model.model_card_data.base_model_revision, + release_date=None, + languages=languages, + framework=["Sentence Transformers"], + similarity_fn_name=model.similarity_fn_name, + ) + except AttributeError as e: + logger.warning( + f"Failed to extract metadata from model: {e}. Upgrading to sentence-transformers v3.0.0 or above is recommended." 
+ ) + meta = ModelMeta( + name=None, + revision=None, + languages=None, + release_date=None, + ) + return meta diff --git a/mteb/task_aggregation.py b/mteb/task_aggregation.py index 57fb542bb7..e5ce47a4d0 100644 --- a/mteb/task_aggregation.py +++ b/mteb/task_aggregation.py @@ -5,24 +5,27 @@ import numpy as np -from mteb.load_results.load_results import MODEL_NAME, RESULTS, REVISION -from mteb.load_results.mteb_results import MTEBResults +from mteb.load_results.benchmark_results import BenchmarkResults +from mteb.load_results.task_results import TaskResult from mteb.overview import get_task logger = logging.getLogger(__name__) +REVISION = str +MODEL_NAME = str AGGREGATION = dict[MODEL_NAME, dict[REVISION, dict[str, float]]] -def mean(results: RESULTS) -> AGGREGATION: +def mean(results: BenchmarkResults) -> AGGREGATION: """Calculate the mean of the main score of the given results.""" + results = results.to_legacy_dict() unique_tasks = set() for model, revisions in results.items(): for revision, res in revisions.items(): for result in res: unique_tasks.add(result.task_name) - def _mean(model_name: str, rev: str, results: list[MTEBResults]) -> float: + def _mean(model_name: str, rev: str, results: list[TaskResult]) -> float: """Calculate the mean of the main score of the given results.""" scores: list[float] = [result.get_score() for result in results] @@ -42,9 +45,10 @@ def _mean(model_name: str, rev: str, results: list[MTEBResults]) -> float: def task_category_weighted_mean( - results: RESULTS, + results: BenchmarkResults, ) -> AGGREGATION: """Calculate the mean of the main score of the given results, weighted by the number of tasks of each type.""" + results = results.to_legacy_dict() unique_tasks = set() task_types = defaultdict(set) for model, revisions in results.items(): @@ -56,7 +60,7 @@ def task_category_weighted_mean( task_types[task_type].add(task_name) def _task_category_weighted_mean( - model: str, rev: str, results: list[MTEBResults] + model: str, rev: str, results: list[TaskResult] ) -> dict[str, float]: """Calculate the mean of the main score of the given results, weighted by the number of tasks of each type.""" _task_types = {task_type: [] for task_type in task_types.keys()} @@ -91,7 +95,7 @@ def _task_category_weighted_mean( def borda_count( - results: RESULTS, + results: BenchmarkResults, ) -> AGGREGATION: """Calculate the Borda count of the given results. 
@@ -102,6 +106,7 @@ def borda_count( # consider each model a candidate and each task a voter # each voter ranks the candidates + results = results.to_legacy_dict() n_candidates = sum(len(revs) for revs in results.values()) candidate_scores = { model: {revision: 0.0 for revision in revisions} diff --git a/mteb/task_selection.py b/mteb/task_selection.py index 935e5157ec..20d91a97b7 100644 --- a/mteb/task_selection.py +++ b/mteb/task_selection.py @@ -52,8 +52,8 @@ def results_to_dataframe( for task_result in tasks_results: data.append( { - "model": model_name, - "revision": rev, + "Model": model_name, + "Revision": rev, "task": task_result.task_name, "main_score": task_result.get_score(**kwargs), } @@ -63,7 +63,7 @@ def results_to_dataframe( if drop_na: df = df.dropna(axis=1) return df.pivot_table( - index=["model", "revision"], + index=["Model", "Revision"], columns=["task"], values="main_score", ) diff --git a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py index c5298d34e8..a1dc1c5249 100644 --- a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py +++ b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py @@ -9,7 +9,7 @@ from mteb.abstasks.TaskMetadata import TaskMetadata from mteb.encoder_interface import Encoder, EncoderWithQueryCorpusEncode from mteb.evaluation.evaluators import RerankingEvaluator -from mteb.load_results.mteb_results import ScoresDict +from mteb.load_results.task_results import ScoresDict from ....abstasks.AbsTaskReranking import AbsTaskReranking diff --git a/pyproject.toml b/pyproject.toml index f4b71b3d11..f4abaae3dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,6 +57,7 @@ dev = ["ruff==0.6.4", # locked so we don't get PRs which fail only due to a lint codecarbon = ["codecarbon"] speedtask = ["GPUtil>=1.4.0", "psutil>=5.9.8"] peft = ["peft>=0.11.0"] +leaderboard = ["gradio>=4.44.0", "gradio_rangeslider>=0.0.6"] [tool.coverage.report] diff --git a/scripts/running_model/check_results.py b/scripts/running_model/check_results.py index 2d69b7acdf..c410fb5be7 100644 --- a/scripts/running_model/check_results.py +++ b/scripts/running_model/check_results.py @@ -13,7 +13,7 @@ def results_to_dataframe( - mteb_results: dict[MODEL, dict[REVISION, list[mteb.MTEBResults]]], + mteb_results: dict[MODEL, dict[REVISION, list[mteb.TaskResult]]], ): data = [] for model_name, revisions in mteb_results.items(): diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index e53d7eb5d3..2fd5b0bb0f 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -69,13 +69,13 @@ def test_reload_results(task: str | mteb.AbsTask, model: mteb.Encoder, tmp_path: results = eval.run(model, output_folder=str(tmp_path), overwrite_results=True) assert isinstance(results, list) - assert isinstance(results[0], mteb.MTEBResults) + assert isinstance(results[0], mteb.TaskResult) # reload the results results = eval.run(model, output_folder=str(tmp_path), overwrite_results=False) assert isinstance(results, list) - assert isinstance(results[0], mteb.MTEBResults) + assert isinstance(results[0], mteb.TaskResult) @pytest.mark.parametrize("task_name", MOCK_TASK_TEST_GRID) diff --git a/tests/test_load_results/test_mteb_load_results.py b/tests/test_load_results/test_mteb_load_results.py index d5d2ec87ef..57ba1bae54 100644 --- a/tests/test_load_results/test_mteb_load_results.py +++ b/tests/test_load_results/test_mteb_load_results.py @@ -4,6 +4,7 @@ from pathlib import Path 
import mteb +from mteb.load_results.benchmark_results import BenchmarkResults, ModelResult def test_mteb_load_results(): @@ -13,15 +14,15 @@ def test_mteb_load_results(): results = mteb.load_results(download_latest=False) - assert isinstance(results, dict) - for model in results: - assert isinstance(results[model], dict) - for revision in results[model]: - assert isinstance(results[model][revision], list) - for result in results[model][revision]: - assert isinstance(result, mteb.MTEBResults) + assert isinstance(results, BenchmarkResults) + for model_result in results: + assert isinstance(model_result, ModelResult) + for res in model_result: + assert isinstance(res, mteb.TaskResult) known_model = "sentence-transformers/average_word_embeddings_levy_dependency" known_revision = "6d9c09a789ad5dd126b476323fccfeeafcd90509" - assert known_model in results - assert known_revision in results[known_model] + assert known_model in [res.model_name for res in results] + assert known_revision in [ + res.model_revision for res in results if res.model_name == known_model + ] diff --git a/tests/test_load_results/test_mteb_results.py b/tests/test_load_results/test_mteb_results.py index 4007da270f..6c22b390f3 100644 --- a/tests/test_load_results/test_mteb_results.py +++ b/tests/test_load_results/test_mteb_results.py @@ -7,7 +7,7 @@ import mteb from mteb import AbsTask -from mteb.load_results.mteb_results import MTEBResults +from mteb.load_results.task_results import TaskResult tests_folder = Path(__file__).parent.parent @@ -52,7 +52,7 @@ def _calculate_metrics_from_split( def test_mteb_results(): - """Test MTEBResults class (this is the same as the example in the docstring)""" + """Test TaskResult class (this is the same as the example in the docstring)""" scores = { "train": { "en-de": { @@ -66,7 +66,7 @@ def test_mteb_results(): evaluation_time = 100 - mteb_results = MTEBResults.from_task_results( + mteb_results = TaskResult.from_task_results( task=DummyTask(), scores=scores, evaluation_time=evaluation_time ) @@ -101,5 +101,5 @@ def test_mteb_results(): "path", list((tests_folder / "historic_results").glob("*.json")) ) def test_mteb_results_from_historic(path: Path): - mteb_result = MTEBResults.from_disk(path, load_historic_data=True) - assert isinstance(mteb_result, MTEBResults) + mteb_result = TaskResult.from_disk(path, load_historic_data=True) + assert isinstance(mteb_result, TaskResult) diff --git a/tests/test_task_aggregation.py b/tests/test_task_aggregation.py index 23228872c6..f0754418c3 100644 --- a/tests/test_task_aggregation.py +++ b/tests/test_task_aggregation.py @@ -2,9 +2,10 @@ import mteb import mteb.task_aggregation as task_aggregation +from mteb.load_results.benchmark_results import BenchmarkResults # define some test data -bitext1_1 = mteb.MTEBResults( +bitext1_1 = mteb.TaskResult( dataset_revision="test_rev", task_name="BornholmBitextMining", mteb_version="test_version", @@ -12,7 +13,7 @@ scores={"test": [{"main_score": 1, "hf_subset": "NaN", "languages": ["eng-Latn"]}]}, ) -bitext1_2 = mteb.MTEBResults( +bitext1_2 = mteb.TaskResult( dataset_revision="test_rev", task_name="BornholmBitextMining", mteb_version="test_version", @@ -20,7 +21,7 @@ scores={"test": [{"main_score": 2, "hf_subset": "NaN", "languages": ["eng-Latn"]}]}, ) -classification1_1 = mteb.MTEBResults( +classification1_1 = mteb.TaskResult( dataset_revision="test_rev", task_name="Banking77Classification", mteb_version="test_version", @@ -28,7 +29,7 @@ scores={"test": [{"main_score": 1, "hf_subset": "NaN", "languages": 
["eng-Latn"]}]}, ) -classification1_2 = mteb.MTEBResults( +classification1_2 = mteb.TaskResult( dataset_revision="test_rev", task_name="Banking77Classification", mteb_version="test_version", @@ -36,7 +37,7 @@ scores={"test": [{"main_score": 2, "hf_subset": "NaN", "languages": ["eng-Latn"]}]}, ) -classification2_1 = mteb.MTEBResults( +classification2_1 = mteb.TaskResult( dataset_revision="test_rev", task_name="AfriSentiClassification", mteb_version="test_version", @@ -54,6 +55,7 @@ "rev2": [bitext1_2, classification1_1, classification2_1], }, } +mteb_results = BenchmarkResults.from_legacy_dict(mteb_results) def test_mean(): @@ -103,14 +105,16 @@ def test_task_category_weighted_mean(): def test_borda_count_simple(): - mteb_results_simple = { - "model1": { - "rev1": [bitext1_1], - }, - "model2": { - "rev2": [bitext1_2], - }, - } + mteb_results_simple = BenchmarkResults.from_legacy_dict( + { + "model1": { + "rev1": [bitext1_1], + }, + "model2": { + "rev2": [bitext1_2], + }, + } + ) expected = { "model1": { "rev1": {"borda_count": 0}, @@ -143,6 +147,9 @@ def test_borda_count_simple_with_tie(): "rev2": {"borda_count": 2.5}, }, } + mteb_results_simple_with_tie = BenchmarkResults.from_legacy_dict( + mteb_results_simple_with_tie + ) assert task_aggregation.borda_count(mteb_results_simple_with_tie) == expected diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py index 558aa4bb9b..2940fd9593 100644 --- a/tests/test_tasks/test_mteb_rerank.py +++ b/tests/test_tasks/test_mteb_rerank.py @@ -7,6 +7,7 @@ from sentence_transformers import CrossEncoder, SentenceTransformer from mteb import MTEB +from mteb.model_meta import ModelMeta logging.basicConfig(level=logging.INFO) @@ -365,6 +366,14 @@ def test_reranker_same_ndcg1(): revision = "21eec43590414cb8e3a6f654857abed0483ae36e" de = SentenceTransformer(de_name, revision=revision) ce = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2") + ce_revision = "e9ea2688951463fc2791a2ea2ddfce6762900675" + ce.mteb_model_meta = ModelMeta( + name="cross-encoder/ms-marco-TinyBERT-L-2-v2", + languages=["eng-Latn"], + open_source=True, + revision=ce_revision, + release_date="2021-04-15", + ) eval = MTEB(tasks=["SciFact"]) eval.run( de, @@ -390,7 +399,7 @@ def test_reranker_same_ndcg1(): stage1 = json.load(f) with open( - "tests/results/stage2/cross-encoder__ms-marco-TinyBERT-L-2-v2/no_revision_available/SciFact.json" + f"tests/results/stage2/cross-encoder__ms-marco-TinyBERT-L-2-v2/{ce_revision}/SciFact.json" ) as f: stage2 = json.load(f)