diff --git a/mteb/cli.py b/mteb/cli.py index 3c6c821f52..c552394e49 100644 --- a/mteb/cli.py +++ b/mteb/cli.py @@ -374,26 +374,7 @@ def main(): add_create_meta_parser(subparsers) args = parser.parse_args() - - # If no subcommand is provided, default to run with a deprecation warning - if not hasattr(args, "func"): - logger.warning( - "Using `mteb` without a subcommand is deprecated. Use `mteb run` instead.", - DeprecationWarning, - ) - # Set default arguments for 'run' if no subcommand is provided - default_args = parser.parse_args( - ["run"] - + list(map(str, args._get_args())) - + [ - f"--{k}" if v is None else f"--{k}={v}" - for k, v in vars(args).items() - if k != "func" - ] - ) - default_args.func(default_args) - else: - args.func(args) + args.func(args) if __name__ == "__main__": diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 0c07ff34db..3c94f24785 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -5,28 +5,29 @@ import os import traceback from collections.abc import Iterable -from copy import copy, deepcopy +from copy import deepcopy from datetime import datetime from itertools import chain from pathlib import Path from time import time -from typing import Any +from typing import TYPE_CHECKING, Any import datasets from codecarbon import EmissionsTracker from sentence_transformers import CrossEncoder, SentenceTransformer +import mteb from mteb.abstasks.AbsTask import ScoresDict from mteb.encoder_interface import Encoder from mteb.model_meta import ModelMeta from mteb.models import model_meta_from_sentence_transformers from ..abstasks.AbsTask import AbsTask -from ..abstasks.AbsTaskMultilabelClassification import AbsTaskMultilabelClassification -from ..abstasks.AbsTaskReranking import AbsTaskReranking from ..load_results.task_results import TaskResult from ..models.sentence_transformer_wrapper import SentenceTransformerWrapper -from . import LangMapping + +if TYPE_CHECKING: + from mteb.benchmarks import Benchmark logger = logging.getLogger(__name__) @@ -34,124 +35,41 @@ class MTEB: def __init__( self, - tasks: Iterable[str | AbsTask] | None = None, + tasks: Iterable[AbsTask | Benchmark], *, - task_types: list[str] | None = None, - task_categories: list[str] | None = None, - task_langs: list[str] | None = None, - version=None, err_logs_path: str = "error_logs.txt", - **kwargs, ): """Create an Evaluation pipeline, based on the provided tasks. Args: - tasks: List of tasks to be evaluated. - task_types: Will be deprecated we recommend that you use `mteb.get_tasks()` to filter tasks. List of task types (Clustering, Retrieval..) to be - evaluated. If None, all tasks will be evaluated - task_categories: Will be deprecated we recommend that you use `mteb.get_tasks()` to filter tasks. List of task categories (s2s, p2p..) to be - evaluated. If None, all tasks will be evaluated - task_langs: Will be deprecated we recommend that you use `mteb.get_tasks()` to filter tasks. List of languages to be evaluated. if None, all - languages will be evaluated. ["eng-Latn", "deu_Latn"] will evaluate on all tasks with these languages. - version: Will be deprecated. Version of the benchmark to use. If None, latest is used + tasks: List of tasks or benchmarks to be evaluated, e.g. tasks returned by + `mteb.get_tasks(["task1","task2"]) or `mteb.get_benchmark("MTEB(eng, classic)"). err_logs_path: Path to save error logs. 
- kwargs: Additional arguments to be passed to the tasks """ from mteb.benchmarks import Benchmark - self.deprecation_warning( - task_types, task_categories, task_langs, tasks, version - ) - - if tasks is not None: - self._tasks = tasks - if isinstance(tasks[0], Benchmark): - self.benchmarks = tasks - self._tasks = list(chain.from_iterable(tasks)) - assert ( - task_types is None and task_categories is None - ), "Cannot specify both `tasks` and `task_types`/`task_categories`" - else: - self._task_types = task_types - self._task_categories = task_categories - self._tasks = None - - self._task_langs = task_langs if task_langs is not None else [] - if isinstance(self._task_langs, str): - self._task_langs = [self._task_langs] + self.tasks = list(tasks) + if len(self.tasks) > 0 and isinstance(self.tasks[0], Benchmark): + self.benchmarks = tasks + self.tasks = list(chain.from_iterable(self.tasks)) - self._extend_lang_code() - self._extend_lang_pairs() # add all possible pairs - - self._version = version self.err_logs_path = err_logs_path - self.last_evaluated_splits = {} - self.select_tasks(**kwargs) - - def deprecation_warning( - self, task_types, task_categories, task_langs, tasks, version - ): - if task_types is not None: - logger.warning( - "The `task_types` argument is deprecated and will be removed in the next release. " - + "Please use `tasks = mteb.get_tasks(... task_types = [...])` to filter tasks instead." - ) - if task_categories is not None: - logger.warning( - "The `task_categories` argument is deprecated and will be removed in the next release. " - + "Please use `tasks = mteb.get_tasks(... categories = [...])` to filter tasks instead." - ) - if task_langs is not None: - logger.warning( - "The `task_langs` argument is deprecated and will be removed in the next release. " - + "Please use `tasks = mteb.get_tasks(... languages = [...])` to filter tasks instead. " - + "Note that this uses 3 letter language codes (ISO 639-3)." - ) - if version is not None: - logger.warning( - "The `version` argument is deprecated and will be removed in the next release." - ) - task_contains_strings = any(isinstance(x, str) for x in tasks or []) - if task_contains_strings: - logger.warning( - "Passing task names as strings is deprecated and will be removed in the next release. " - + "Please use `tasks = mteb.get_tasks(tasks=[...])` method to get tasks instead." 
- ) - @property def available_tasks(self): - return [x.metadata.name for x in self.tasks_cls] + return [x.metadata.name for x in self.tasks] @property def available_task_types(self): # sort the task types - return sorted({x.metadata.type for x in self.tasks_cls}) + return sorted({x.metadata.type for x in self.tasks}) @property def available_task_categories(self): - return {x.metadata.category for x in self.tasks_cls} - - def _extend_lang_code(self): - # add all possible language codes - for lang in set(self._task_langs): - if lang in LangMapping.LANG_MAPPING: - self._task_langs += LangMapping.LANG_MAPPING[lang] - - def _extend_lang_pairs(self): - # add all possible language pairs - langs = set(self._task_langs) - for x in langs: - if "-" not in x: - for y in langs: - if "-" not in y: - pair = f"{x}-{y}" - if pair not in langs: - self._task_langs.append(pair) - return - - def _display_tasks(self, task_list, name=None): + return {x.metadata.category for x in self.tasks} + + def _display_tasks(self, task_list: Iterable[AbsTask], name: str | None = None): from rich.console import Console # disable logging for other ranks @@ -215,80 +133,14 @@ def mteb_benchmarks(self): @classmethod def mteb_tasks(cls): """Get all tasks available in the MTEB.""" - instance = cls() - instance._display_tasks(instance.tasks_cls, name="MTEB tasks") + tasks = mteb.get_tasks() + instance = cls(tasks) + instance._display_tasks(tasks, name="MTEB tasks") def print_selected_tasks(self): """Print the selected tasks.""" self._display_tasks(self.tasks, name="Selected tasks") - def select_tasks(self, **kwargs): - """Select the tasks to be evaluated.""" - # Get all existing tasks - # reranking and multiclassClassification subclasses retrieval to share methods, but is an abstract task - tasks_categories_cls = list(AbsTask.__subclasses__()) + [ - AbsTaskReranking, - AbsTaskMultilabelClassification, - ] - all_task_classes = [] - for cat_cls in tasks_categories_cls: - for cls in cat_cls.__subclasses__(): - if cat_cls.__name__.startswith("AbsTask") and cls.__name__ not in ( - "AbsTaskReranking", - "AbsTaskMultilabelClassification", - ): - task = cls(hf_subsets=self._task_langs, **kwargs) - all_task_classes.append(task) - - self.tasks_cls = all_task_classes - - # If `task_list` is specified, select list of tasks - if self._tasks is not None: - self.tasks = list( - filter(lambda x: (x.metadata.name in self._tasks), self.tasks_cls) - ) - if len(self.tasks) != len(self._tasks): - tasks_known = {x.metadata.name for x in self.tasks_cls} - tasks_unknown = { - x for x in self._tasks if isinstance(x, str) - } - tasks_known - if tasks_unknown: - unknown_str, known_str = ( - ",".join(sorted(tasks_unknown)), - ",".join(sorted(tasks_known)), - ) - logger.warning( - f"WARNING: Unknown tasks: {unknown_str}. Known tasks: {known_str}." 
- ) - # add task if subclass of mteb.tasks - self.tasks.extend([x for x in self._tasks if isinstance(x, AbsTask)]) - return - - # Otherwise use filters to select tasks - filtered_tasks = filter( - lambda x: (self._task_types is None) - or (x.metadata.type in self._task_types), - self.tasks_cls, - ) - filtered_tasks = filter( - lambda x: (self._task_categories is None) - or (x.metadata.category in self._task_categories), - filtered_tasks, - ) - filtered_tasks = filter( - lambda x: (self._version is None) or (x.metadata.version >= self._version), - filtered_tasks, - ) - # keep only tasks with at least one language in the filter - filtered_tasks = filter( - lambda x: (not self._task_langs) - or (len(set(x.metadata.eval_langs) & set(self._task_langs)) > 0), - filtered_tasks, - ) - - # Get final list of tasks - self.tasks = list(filtered_tasks) - def load_tasks_data(self): """Load datasets for the selected tasks.""" logger.info(f"\n\n## Loading datasets for {len(self.tasks)} tasks") @@ -416,13 +268,6 @@ def run( Returns: A list of TaskResult objects, one for each task evaluated. """ - if "batch_size" in kwargs: - logger.warning( - "The `batch_size` argument is deprecated and will be removed in the next release. " - + "Please use `encode_kwargs = {'batch_size': ...}` to set the batch size instead." - ) - encode_kwargs["batch_size"] = kwargs["batch_size"] - # update logging to account for different levels of Verbosity (similar to the command line) if verbosity == 0: @@ -455,8 +300,8 @@ def run( self.print_selected_tasks() evaluation_results = [] - original_tasks = ( - self.tasks.copy() + original_tasks = deepcopy( + self.tasks ) # save them in case we re-use the object (e.g. for reranking) # To evaluate missing splits, we keep track of the task name and the corresponding splits. 
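# --- Editor's illustration (not part of the patch) ---------------------------
# A minimal sketch of the migration the removed shims above pointed users to:
# task filtering moves into mteb.get_tasks() and the batch size moves into
# encode_kwargs. The model name, task type, language code and output folder are
# placeholder choices, not something this diff prescribes.
import mteb
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Instead of MTEB(task_types=[...], task_langs=[...]): filter up front.
# Languages use ISO 639-3 codes, as the removed deprecation warning noted.
tasks = mteb.get_tasks(task_types=["Clustering"], languages=["eng"])
evaluation = mteb.MTEB(tasks=tasks)

# Instead of run(model, batch_size=16): pass the batch size via encode_kwargs.
evaluation.run(
    model,
    output_folder="results",
    encode_kwargs={"batch_size": 16},
)
# ------------------------------------------------------------------------------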
@@ -665,7 +510,7 @@ def create_model_meta(model: Encoder) -> ModelMeta: ) # create a copy of the meta to avoid modifying the original object - meta = copy(meta) + meta = deepcopy(meta) meta.revision = meta.revision or "no_revision_available" meta.name = meta.name or "no_model_name_available" diff --git a/mteb/overview.py b/mteb/overview.py index 5846993b02..39d96041bd 100644 --- a/mteb/overview.py +++ b/mteb/overview.py @@ -124,7 +124,7 @@ def __repr__(self) -> str: return "MTEBTasks" + super().__repr__() @staticmethod - def _extract_property_from_task(task, property): + def _extract_property_from_task(task, property: str): if hasattr(task.metadata, property): return getattr(task.metadata, property) elif hasattr(task, property): diff --git a/tests/test_benchmark/task_grid.py b/tests/test_benchmark/task_grid.py index 8ae310555f..3ad484b6ff 100644 --- a/tests/test_benchmark/task_grid.py +++ b/tests/test_benchmark/task_grid.py @@ -2,14 +2,8 @@ from __future__ import annotations +import mteb from mteb.abstasks import AbsTask -from mteb.tasks.BitextMining.dan.BornholmskBitextMining import BornholmBitextMining -from mteb.tasks.Classification.multilingual.IndicSentimentClassification import ( - IndicSentimentClassification, -) -from mteb.tasks.Clustering.eng.TwentyNewsgroupsClustering import ( - TwentyNewsgroupsClusteringFast, -) from .mock_tasks import ( MockBitextMiningTask, @@ -39,31 +33,25 @@ MockSummarizationTask, ) -twenty_news = TwentyNewsgroupsClusteringFast() - -# downsample to speed up tests -twenty_news.max_document_to_embed = 1000 -twenty_news.n_clusters = 2 -twenty_news.max_fraction_of_documents_to_embed = None - -TASK_TEST_GRID = [ - BornholmBitextMining(), # bitext mining + just supplying a task class instead of a string - IndicSentimentClassification( # multi subset loader - hf_subsets=["as"], # we only load one subset here to speed up tests - n_experiments=2, # to speed up the test - ), - "TwentyNewsgroupsClustering", # clustering and string instead of class - twenty_news, # fast clustering - "Banking77Classification", # classification - "SciDocsRR", # reranking - "FarsTail", # pair classification - "TwitterHjerneRetrieval", # retrieval - "BrazilianToxicTweetsClassification", # multilabel classification - "FaroeseSTS", # STS - "SummEval", # summarization - "Core17InstructionRetrieval", # instruction reranking - "InstructIR", # instruction retrieval -] +TASK_TEST_GRID = ( + mteb.get_tasks( + tasks=[ + "BornholmBitextMining", # bitext mining + just supplying a task class instead of a string + "TwentyNewsgroupsClustering", # clustering and string instead of class + "TwentyNewsgroupsClustering.v2", # fast clustering + "Banking77Classification", # classification + "SciDocsRR", # reranking + "FarsTail", # pair classification + "TwitterHjerneRetrieval", # retrieval + "BrazilianToxicTweetsClassification", # multilabel classification + "FaroeseSTS", # STS + "SummEval", # summarization + "Core17InstructionRetrieval", # instruction reranking + "InstructIR", # instruction retrieval + ] + ) + + mteb.get_tasks(tasks=["IndicSentimentClassification"], languages=["asm-Beng"]) +) TASK_TEST_GRID_AS_STRING = [ t.metadata.name if isinstance(t, AbsTask) else t for t in TASK_TEST_GRID diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index 0c8521578d..1393d46f12 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -67,7 +67,7 @@ def test_benchmark_encoders_on_task(task: str | AbsTask, model: mteb.Encoder): 
eval.run(model, output_folder="tests/results", overwrite_results=True) -@pytest.mark.parametrize("task", [MockMultilingualRetrievalTask]) +@pytest.mark.parametrize("task", [MockMultilingualRetrievalTask()]) @pytest.mark.parametrize( "model", [MockSentenceTransformer()], @@ -188,7 +188,7 @@ def test_run_using_benchmark(model: mteb.Encoder): name="test_bench", tasks=mteb.get_tasks(tasks=["STS12", "SummEval"]) ) - eval = mteb.MTEB(tasks=bench) + eval = mteb.MTEB(tasks=[bench]) eval.run( model, output_folder="tests/results", overwrite_results=True ) # we just want to test that it runs diff --git a/tests/test_overview.py b/tests/test_overview.py index 127e54f279..6136af1ea5 100644 --- a/tests/test_overview.py +++ b/tests/test_overview.py @@ -98,8 +98,3 @@ def test_MTEBTasks( # check for header of a table n_langs = len(tasks) assert len(tasks.to_markdown().split("\n")) - 3 == n_langs - - -def test_all_tasks_fetch(): - """Test that all tasks can be fetched""" - mteb.MTEB.mteb_tasks() diff --git a/tests/test_reproducible_workflow.py b/tests/test_reproducible_workflow.py index 566864a112..1c7536076e 100644 --- a/tests/test_reproducible_workflow.py +++ b/tests/test_reproducible_workflow.py @@ -36,7 +36,7 @@ def test_reproducibility_workflow(task_name: str, model_name: str, model_revisio @pytest.mark.parametrize( "task_name", TASK_TEST_GRID - + [ + + ( "BitextMining", "Classification", "MultilabelClassification", @@ -49,7 +49,7 @@ def test_reproducibility_workflow(task_name: str, model_name: str, model_revisio "InstructionRetrieval", "InstructionReranking", "Speed", - ], + ), ) def test_validate_task_to_prompt_name(task_name: str | AbsTask): if isinstance(task_name, AbsTask): diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py index af66133273..91a7b95070 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ -8,20 +8,17 @@ import pytest import mteb -from mteb import MTEB from mteb.abstasks import AbsTask, MultilingualTask from mteb.abstasks.AbsTaskReranking import AbsTaskReranking from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval from mteb.abstasks.AbsTaskSpeedTask import AbsTaskSpeedTask -from mteb.overview import TASKS_REGISTRY +from mteb.overview import TASKS_REGISTRY, get_tasks from ..test_benchmark.task_grid import MOCK_TASK_TEST_GRID_AS_STRING logging.basicConfig(level=logging.INFO) -tasks = [ - t for t in MTEB().tasks_cls if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING -] +tasks = [t for t in get_tasks() if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING] @pytest.mark.parametrize("task", tasks) @@ -84,7 +81,7 @@ async def check_datasets_are_available_on_hf(tasks): def test_dataset_availability(): """Checks if the datasets are available on Hugging Face using both their name and revision.""" - tasks = MTEB().tasks_cls + tasks = get_tasks() tasks = [ t for t in tasks diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py index c540bb41ee..565b00e22f 100644 --- a/tests/test_tasks/test_mteb_rerank.py +++ b/tests/test_tasks/test_mteb_rerank.py @@ -6,6 +6,7 @@ from sentence_transformers import CrossEncoder, SentenceTransformer +import mteb from mteb import MTEB from mteb.model_meta import ModelMeta @@ -318,11 +319,7 @@ def test_mteb_rerank(tmp_path: Path): "1395", ] model = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2") - eval = MTEB( - tasks=[ - "SciFact", - ] - ) + eval = MTEB(tasks=mteb.get_tasks(["SciFact"])) # create fake first stage results tmp_file 
= tmp_path / "tmp.json" with open(tmp_file, "w") as f: @@ -374,7 +371,7 @@ def test_reranker_same_ndcg1(): revision=ce_revision, release_date="2021-04-15", ) - eval = MTEB(tasks=["SciFact"]) + eval = MTEB(tasks=mteb.get_tasks(["SciFact"])) eval.run( de, output_folder="tests/results/stage1",