From fb06d3ff5e00a9908f28732341a3b4a7783eeaa4 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 12 Jan 2025 10:35:43 +0300 Subject: [PATCH 01/12] remove deprecated parameters --- mteb/cli.py | 21 +----------- mteb/evaluation/MTEB.py | 75 +++-------------------------------------- 2 files changed, 5 insertions(+), 91 deletions(-) diff --git a/mteb/cli.py b/mteb/cli.py index 3c6c821f52..c552394e49 100644 --- a/mteb/cli.py +++ b/mteb/cli.py @@ -374,26 +374,7 @@ def main(): add_create_meta_parser(subparsers) args = parser.parse_args() - - # If no subcommand is provided, default to run with a deprecation warning - if not hasattr(args, "func"): - logger.warning( - "Using `mteb` without a subcommand is deprecated. Use `mteb run` instead.", - DeprecationWarning, - ) - # Set default arguments for 'run' if no subcommand is provided - default_args = parser.parse_args( - ["run"] - + list(map(str, args._get_args())) - + [ - f"--{k}" if v is None else f"--{k}={v}" - for k, v in vars(args).items() - if k != "func" - ] - ) - default_args.func(default_args) - else: - args.func(args) + args.func(args) if __name__ == "__main__": diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index ab317cadbd..142fc8396f 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -35,10 +35,6 @@ def __init__( self, tasks: Iterable[str | AbsTask] | None = None, *, - task_types: list[str] | None = None, - task_categories: list[str] | None = None, - task_langs: list[str] | None = None, - version=None, err_logs_path: str = "error_logs.txt", **kwargs, ): @@ -46,79 +42,23 @@ def __init__( Args: tasks: List of tasks to be evaluated. - task_types: Will be deprecated we recommend that you use `mteb.get_tasks()` to filter tasks. List of task types (Clustering, Retrieval..) to be - evaluated. If None, all tasks will be evaluated - task_categories: Will be deprecated we recommend that you use `mteb.get_tasks()` to filter tasks. List of task categories (s2s, p2p..) to be - evaluated. If None, all tasks will be evaluated - task_langs: Will be deprecated we recommend that you use `mteb.get_tasks()` to filter tasks. List of languages to be evaluated. if None, all - languages will be evaluated. ["eng-Latn", "deu_Latn"] will evaluate on all tasks with these languages. - version: Will be deprecated. Version of the benchmark to use. If None, latest is used err_logs_path: Path to save error logs. 
kwargs: Additional arguments to be passed to the tasks """ from mteb.benchmarks import Benchmark - self.deprecation_warning( - task_types, task_categories, task_langs, tasks, version - ) - - if tasks is not None: - self._tasks = tasks - if isinstance(tasks[0], Benchmark): - self.benchmarks = tasks - self._tasks = list(chain.from_iterable(tasks)) - assert ( - task_types is None and task_categories is None - ), "Cannot specify both `tasks` and `task_types`/`task_categories`" - else: - self._task_types = task_types - self._task_categories = task_categories - self._tasks = None - - self._task_langs = task_langs if task_langs is not None else [] - if isinstance(self._task_langs, str): - self._task_langs = [self._task_langs] + self._tasks = tasks + if isinstance(tasks[0], Benchmark): + self.benchmarks = tasks + self._tasks = list(chain.from_iterable(tasks)) self._extend_lang_code() self._extend_lang_pairs() # add all possible pairs - - self._version = version self.err_logs_path = err_logs_path - self.last_evaluated_splits = {} self.select_tasks(**kwargs) - def deprecation_warning( - self, task_types, task_categories, task_langs, tasks, version - ): - if task_types is not None: - logger.warning( - "The `task_types` argument is deprecated and will be removed in the next release. " - + "Please use `tasks = mteb.get_tasks(... task_types = [...])` to filter tasks instead." - ) - if task_categories is not None: - logger.warning( - "The `task_categories` argument is deprecated and will be removed in the next release. " - + "Please use `tasks = mteb.get_tasks(... categories = [...])` to filter tasks instead." - ) - if task_langs is not None: - logger.warning( - "The `task_langs` argument is deprecated and will be removed in the next release. " - + "Please use `tasks = mteb.get_tasks(... languages = [...])` to filter tasks instead. " - + "Note that this uses 3 letter language codes (ISO 639-3)." - ) - if version is not None: - logger.warning( - "The `version` argument is deprecated and will be removed in the next release." - ) - task_contains_strings = any(isinstance(x, str) for x in tasks or []) - if task_contains_strings: - logger.warning( - "Passing task names as strings is deprecated and will be removed in the next release. " - + "Please use `tasks = mteb.get_tasks(tasks=[...])` method to get tasks instead." - ) - @property def available_tasks(self): return [x.metadata.name for x in self.tasks_cls] @@ -412,13 +352,6 @@ def run( Returns: A list of TaskResult objects, one for each task evaluated. """ - if "batch_size" in kwargs: - logger.warning( - "The `batch_size` argument is deprecated and will be removed in the next release. " - + "Please use `encode_kwargs = {'batch_size': ...}` to set the batch size instead." 
- ) - encode_kwargs["batch_size"] = kwargs["batch_size"] - # update logging to account for different levels of Verbosity (similar to the command line) if verbosity == 0: From d189eb50593af93f6372db89faadc3b9011beaf4 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 12 Jan 2025 10:50:42 +0300 Subject: [PATCH 02/12] remove _task_langs --- mteb/evaluation/MTEB.py | 86 ----------------------------------------- 1 file changed, 86 deletions(-) diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 142fc8396f..3889dd5bcc 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -52,13 +52,9 @@ def __init__( self.benchmarks = tasks self._tasks = list(chain.from_iterable(tasks)) - self._extend_lang_code() - self._extend_lang_pairs() # add all possible pairs self.err_logs_path = err_logs_path self.last_evaluated_splits = {} - self.select_tasks(**kwargs) - @property def available_tasks(self): return [x.metadata.name for x in self.tasks_cls] @@ -72,24 +68,6 @@ def available_task_types(self): def available_task_categories(self): return {x.metadata.category for x in self.tasks_cls} - def _extend_lang_code(self): - # add all possible language codes - for lang in set(self._task_langs): - if lang in LangMapping.LANG_MAPPING: - self._task_langs += LangMapping.LANG_MAPPING[lang] - - def _extend_lang_pairs(self): - # add all possible language pairs - langs = set(self._task_langs) - for x in langs: - if "-" not in x: - for y in langs: - if "-" not in y: - pair = f"{x}-{y}" - if pair not in langs: - self._task_langs.append(pair) - return - def _display_tasks(self, task_list, name=None): from rich.console import Console @@ -161,70 +139,6 @@ def print_selected_tasks(self): """Print the selected tasks.""" self._display_tasks(self.tasks, name="Selected tasks") - def select_tasks(self, **kwargs): - """Select the tasks to be evaluated.""" - # Get all existing tasks - # reranking subclasses retrieval to share methods, but is an abstract task - tasks_categories_cls = list(AbsTask.__subclasses__()) + [AbsTaskReranking] - all_task_classes = [] - for cat_cls in tasks_categories_cls: - for cls in cat_cls.__subclasses__(): - if ( - cat_cls.__name__.startswith("AbsTask") - and cls.__name__ != "AbsTaskReranking" - ): - task = cls(hf_subsets=self._task_langs, **kwargs) - all_task_classes.append(task) - - self.tasks_cls = all_task_classes - - # If `task_list` is specified, select list of tasks - if self._tasks is not None: - self.tasks = list( - filter(lambda x: (x.metadata.name in self._tasks), self.tasks_cls) - ) - if len(self.tasks) != len(self._tasks): - tasks_known = {x.metadata.name for x in self.tasks_cls} - tasks_unknown = { - x for x in self._tasks if isinstance(x, str) - } - tasks_known - if tasks_unknown: - unknown_str, known_str = ( - ",".join(sorted(tasks_unknown)), - ",".join(sorted(tasks_known)), - ) - logger.warning( - f"WARNING: Unknown tasks: {unknown_str}. Known tasks: {known_str}." 
- ) - # add task if subclass of mteb.tasks - self.tasks.extend([x for x in self._tasks if isinstance(x, AbsTask)]) - return - - # Otherwise use filters to select tasks - filtered_tasks = filter( - lambda x: (self._task_types is None) - or (x.metadata.type in self._task_types), - self.tasks_cls, - ) - filtered_tasks = filter( - lambda x: (self._task_categories is None) - or (x.metadata.category in self._task_categories), - filtered_tasks, - ) - filtered_tasks = filter( - lambda x: (self._version is None) or (x.metadata.version >= self._version), - filtered_tasks, - ) - # keep only tasks with at least one language in the filter - filtered_tasks = filter( - lambda x: (not self._task_langs) - or (len(set(x.metadata.eval_langs) & set(self._task_langs)) > 0), - filtered_tasks, - ) - - # Get final list of tasks - self.tasks = list(filtered_tasks) - def load_tasks_data(self): """Load datasets for the selected tasks.""" logger.info(f"\n\n## Loading datasets for {len(self.tasks)} tasks") From dbb56c61ff8b50a3a3892f477657e847e4e99fbb Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 12 Jan 2025 10:50:51 +0300 Subject: [PATCH 03/12] lint --- mteb/evaluation/MTEB.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 3889dd5bcc..251846db7f 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -22,10 +22,8 @@ from mteb.models import model_meta_from_sentence_transformers from ..abstasks.AbsTask import AbsTask -from ..abstasks.AbsTaskReranking import AbsTaskReranking from ..load_results.task_results import TaskResult from ..models.sentence_transformer_wrapper import SentenceTransformerWrapper -from . import LangMapping logger = logging.getLogger(__name__) From e75806b1086d87dd47b28f5997c29d132fe71a39 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 12 Jan 2025 11:07:49 +0300 Subject: [PATCH 04/12] fixes --- mteb/evaluation/MTEB.py | 20 +++++++------------- tests/test_overview.py | 5 ----- 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 251846db7f..efbc913fc9 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -31,7 +31,7 @@ class MTEB: def __init__( self, - tasks: Iterable[str | AbsTask] | None = None, + tasks: Iterable[AbsTask], *, err_logs_path: str = "error_logs.txt", **kwargs, @@ -45,28 +45,28 @@ def __init__( """ from mteb.benchmarks import Benchmark - self._tasks = tasks + self.tasks = tasks if isinstance(tasks[0], Benchmark): self.benchmarks = tasks - self._tasks = list(chain.from_iterable(tasks)) + self.tasks = list(chain.from_iterable(tasks)) self.err_logs_path = err_logs_path self.last_evaluated_splits = {} @property def available_tasks(self): - return [x.metadata.name for x in self.tasks_cls] + return [x.metadata.name for x in self.tasks] @property def available_task_types(self): # sort the task types - return sorted({x.metadata.type for x in self.tasks_cls}) + return sorted({x.metadata.type for x in self.tasks}) @property def available_task_categories(self): - return {x.metadata.category for x in self.tasks_cls} + return {x.metadata.category for x in self.tasks} - def _display_tasks(self, task_list, name=None): + def _display_tasks(self, task_list: Iterable[AbsTask], name: str | None = None): from rich.console import Console # disable logging for other ranks @@ -127,12 +127,6 @@ def mteb_benchmarks(self): name = benchmark.name 
self._display_tasks(benchmark.tasks, name=name) - @classmethod - def mteb_tasks(cls): - """Get all tasks available in the MTEB.""" - instance = cls() - instance._display_tasks(instance.tasks_cls, name="MTEB tasks") - def print_selected_tasks(self): """Print the selected tasks.""" self._display_tasks(self.tasks, name="Selected tasks") diff --git a/tests/test_overview.py b/tests/test_overview.py index 127e54f279..6136af1ea5 100644 --- a/tests/test_overview.py +++ b/tests/test_overview.py @@ -98,8 +98,3 @@ def test_MTEBTasks( # check for header of a table n_langs = len(tasks) assert len(tasks.to_markdown().split("\n")) - 3 == n_langs - - -def test_all_tasks_fetch(): - """Test that all tasks can be fetched""" - mteb.MTEB.mteb_tasks() From d2bd00a18c97e1f87e3198f9de2c5bf1bbf15a4d Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 12 Jan 2025 11:21:54 +0300 Subject: [PATCH 05/12] fixes --- mteb/evaluation/MTEB.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index efbc913fc9..87c0e458fc 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -45,10 +45,10 @@ def __init__( """ from mteb.benchmarks import Benchmark - self.tasks = tasks - if isinstance(tasks[0], Benchmark): + self.tasks = deepcopy(tasks) + if isinstance(self.tasks[0], Benchmark): self.benchmarks = tasks - self.tasks = list(chain.from_iterable(tasks)) + self.tasks = list(chain.from_iterable(self.tasks)) self.err_logs_path = err_logs_path self.last_evaluated_splits = {} From 1b75b4619a15a816a43712ba09940b4819c807f8 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 12 Jan 2025 22:23:49 +0300 Subject: [PATCH 06/12] fixes --- mteb/evaluation/MTEB.py | 15 ++++++----- mteb/overview.py | 2 +- tests/test_benchmark/task_grid.py | 36 ++++++++++++++++---------- tests/test_benchmark/test_benchmark.py | 2 +- 4 files changed, 33 insertions(+), 22 deletions(-) diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 87c0e458fc..4757add99c 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -5,12 +5,12 @@ import os import traceback from collections.abc import Iterable -from copy import copy, deepcopy +from copy import deepcopy from datetime import datetime from itertools import chain from pathlib import Path from time import time -from typing import Any +from typing import TYPE_CHECKING, Any import datasets from codecarbon import EmissionsTracker @@ -25,13 +25,16 @@ from ..load_results.task_results import TaskResult from ..models.sentence_transformer_wrapper import SentenceTransformerWrapper +if TYPE_CHECKING: + from mteb.benchmarks import Benchmark + logger = logging.getLogger(__name__) class MTEB: def __init__( self, - tasks: Iterable[AbsTask], + tasks: list[AbsTask | Benchmark], *, err_logs_path: str = "error_logs.txt", **kwargs, @@ -290,8 +293,8 @@ def run( self.print_selected_tasks() evaluation_results = [] - original_tasks = ( - self.tasks.copy() + original_tasks = deepcopy( + self.tasks ) # save them in case we re-use the object (e.g. for reranking) # To evaluate missing splits, we keep track of the task name and the corresponding splits. 
@@ -501,7 +504,7 @@ def create_model_meta(model: Encoder) -> ModelMeta: ) # create a copy of the meta to avoid modifying the original object - meta = copy(meta) + meta = deepcopy(meta) meta.revision = meta.revision or "no_revision_available" meta.name = meta.name or "no_model_name_available" diff --git a/mteb/overview.py b/mteb/overview.py index 64e8802563..d0982974e5 100644 --- a/mteb/overview.py +++ b/mteb/overview.py @@ -119,7 +119,7 @@ def filter_task_by_categories( return [t for t in tasks if t.metadata.category in _categories] -class MTEBTasks(tuple): +class MTEBTasks(list): def __repr__(self) -> str: return "MTEBTasks" + super().__repr__() diff --git a/tests/test_benchmark/task_grid.py b/tests/test_benchmark/task_grid.py index 8ae310555f..1e5f6b967d 100644 --- a/tests/test_benchmark/task_grid.py +++ b/tests/test_benchmark/task_grid.py @@ -3,12 +3,20 @@ from __future__ import annotations from mteb.abstasks import AbsTask -from mteb.tasks.BitextMining.dan.BornholmskBitextMining import BornholmBitextMining -from mteb.tasks.Classification.multilingual.IndicSentimentClassification import ( +from mteb.tasks import ( + Banking77Classification, + BornholmBitextMining, + BrazilianToxicTweetsClassification, + Core17InstructionRetrieval, + FaroeseSTS, + FarsTail, IndicSentimentClassification, -) -from mteb.tasks.Clustering.eng.TwentyNewsgroupsClustering import ( + InstructIR, + SciDocsReranking, + SummEvalSummarization, + TwentyNewsgroupsClustering, TwentyNewsgroupsClusteringFast, + TwitterHjerneRetrieval, ) from .mock_tasks import ( @@ -52,17 +60,17 @@ hf_subsets=["as"], # we only load one subset here to speed up tests n_experiments=2, # to speed up the test ), - "TwentyNewsgroupsClustering", # clustering and string instead of class + TwentyNewsgroupsClustering, # clustering and string instead of class twenty_news, # fast clustering - "Banking77Classification", # classification - "SciDocsRR", # reranking - "FarsTail", # pair classification - "TwitterHjerneRetrieval", # retrieval - "BrazilianToxicTweetsClassification", # multilabel classification - "FaroeseSTS", # STS - "SummEval", # summarization - "Core17InstructionRetrieval", # instruction reranking - "InstructIR", # instruction retrieval + Banking77Classification, # classification + SciDocsReranking, # reranking + FarsTail, # pair classification + TwitterHjerneRetrieval, # retrieval + BrazilianToxicTweetsClassification, # multilabel classification + FaroeseSTS, # STS + SummEvalSummarization, # summarization + Core17InstructionRetrieval, # instruction reranking + InstructIR, # instruction retrieval ] TASK_TEST_GRID_AS_STRING = [ diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index e84f0e63b6..b3d318afb4 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -170,7 +170,7 @@ def test_run_using_benchmark(model: mteb.Encoder): name="test_bench", tasks=mteb.get_tasks(tasks=["STS12", "SummEval"]) ) - eval = mteb.MTEB(tasks=bench) + eval = mteb.MTEB(tasks=[bench]) eval.run( model, output_folder="tests/results", overwrite_results=True ) # we just want to test that it runs From f9ae1c31e9fb7149b29ff3ca99c90e378fc38c63 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 12 Jan 2025 22:26:48 +0300 Subject: [PATCH 07/12] fix all abs tasks --- tests/test_tasks/test_all_abstasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_tasks/test_all_abstasks.py 
b/tests/test_tasks/test_all_abstasks.py index 84d5a521ca..71c9175966 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ -13,14 +13,14 @@ from mteb.abstasks.AbsTaskReranking import AbsTaskReranking from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval from mteb.abstasks.AbsTaskSpeedTask import AbsTaskSpeedTask -from mteb.overview import TASKS_REGISTRY +from mteb.overview import TASKS_REGISTRY, get_tasks from ..test_benchmark.task_grid import MOCK_TASK_TEST_GRID_AS_STRING logging.basicConfig(level=logging.INFO) tasks = [ - t for t in MTEB().tasks_cls if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING + t for t in get_tasks() if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING ] From fb0b5e7a6e90bfe9fb8154d22d2eeff71641e6e5 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 12 Jan 2025 22:51:33 +0300 Subject: [PATCH 08/12] change to get tasks --- tests/test_benchmark/task_grid.py | 60 +++++++++------------------ tests/test_tasks/test_all_abstasks.py | 6 +-- 2 files changed, 22 insertions(+), 44 deletions(-) diff --git a/tests/test_benchmark/task_grid.py b/tests/test_benchmark/task_grid.py index 1e5f6b967d..320ca9c930 100644 --- a/tests/test_benchmark/task_grid.py +++ b/tests/test_benchmark/task_grid.py @@ -2,22 +2,8 @@ from __future__ import annotations +import mteb from mteb.abstasks import AbsTask -from mteb.tasks import ( - Banking77Classification, - BornholmBitextMining, - BrazilianToxicTweetsClassification, - Core17InstructionRetrieval, - FaroeseSTS, - FarsTail, - IndicSentimentClassification, - InstructIR, - SciDocsReranking, - SummEvalSummarization, - TwentyNewsgroupsClustering, - TwentyNewsgroupsClusteringFast, - TwitterHjerneRetrieval, -) from .mock_tasks import ( MockBitextMiningTask, @@ -47,31 +33,25 @@ MockSummarizationTask, ) -twenty_news = TwentyNewsgroupsClusteringFast() - -# downsample to speed up tests -twenty_news.max_document_to_embed = 1000 -twenty_news.n_clusters = 2 -twenty_news.max_fraction_of_documents_to_embed = None - -TASK_TEST_GRID = [ - BornholmBitextMining(), # bitext mining + just supplying a task class instead of a string - IndicSentimentClassification( # multi subset loader - hf_subsets=["as"], # we only load one subset here to speed up tests - n_experiments=2, # to speed up the test - ), - TwentyNewsgroupsClustering, # clustering and string instead of class - twenty_news, # fast clustering - Banking77Classification, # classification - SciDocsReranking, # reranking - FarsTail, # pair classification - TwitterHjerneRetrieval, # retrieval - BrazilianToxicTweetsClassification, # multilabel classification - FaroeseSTS, # STS - SummEvalSummarization, # summarization - Core17InstructionRetrieval, # instruction reranking - InstructIR, # instruction retrieval -] +TASK_TEST_GRID = ( + mteb.get_tasks( + tasks=[ + "BornholmBitextMining", # bitext mining + just supplying a task class instead of a string + "TwentyNewsgroupsClustering", # clustering and string instead of class + "TwentyNewsgroupsClustering.v2", # fast clustering + "Banking77Classification", # classification + "SciDocsRR", # reranking + "FarsTail", # pair classification + "TwitterHjerneRetrieval", # retrieval + "BrazilianToxicTweetsClassification", # multilabel classification + "FaroeseSTS", # STS + "SummEval", # summarization + "Core17InstructionRetrieval", # instruction reranking + "InstructIR", # instruction retrieval + ] + ) + + mteb.get_tasks(tasks=["IndicSentimentClassification"], 
eval_splits=["as"]) +) TASK_TEST_GRID_AS_STRING = [ t.metadata.name if isinstance(t, AbsTask) else t for t in TASK_TEST_GRID diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py index 71c9175966..b7deaf1310 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ -19,9 +19,7 @@ logging.basicConfig(level=logging.INFO) -tasks = [ - t for t in get_tasks() if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING -] +tasks = [t for t in get_tasks() if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING] @pytest.mark.parametrize("task", tasks) @@ -84,7 +82,7 @@ async def check_datasets_are_available_on_hf(tasks): def test_dataset_availability(): """Checks if the datasets are available on Hugging Face using both their name and revision.""" - tasks = MTEB().tasks_cls + tasks = get_tasks() tasks = [t for t in tasks if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING] asyncio.run(check_datasets_are_available_on_hf(tasks)) From f822bdd8b2495b2e8427e5d6d568494077fe0494 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 12 Jan 2025 22:54:58 +0300 Subject: [PATCH 09/12] try to fix --- tests/test_benchmark/test_benchmark.py | 2 +- tests/test_tasks/test_all_abstasks.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index d093d2f030..1393d46f12 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -67,7 +67,7 @@ def test_benchmark_encoders_on_task(task: str | AbsTask, model: mteb.Encoder): eval.run(model, output_folder="tests/results", overwrite_results=True) -@pytest.mark.parametrize("task", [MockMultilingualRetrievalTask]) +@pytest.mark.parametrize("task", [MockMultilingualRetrievalTask()]) @pytest.mark.parametrize( "model", [MockSentenceTransformer()], diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py index b7deaf1310..58c91c41df 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ -8,7 +8,6 @@ import pytest import mteb -from mteb import MTEB from mteb.abstasks import AbsTask, MultilingualTask from mteb.abstasks.AbsTaskReranking import AbsTaskReranking from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval From 33fc8cafe5802d2ef7eb9f054be64151bf7e6e12 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 12 Jan 2025 23:18:49 +0300 Subject: [PATCH 10/12] final fixes --- mteb/evaluation/MTEB.py | 10 +++++++++- tests/test_benchmark/task_grid.py | 2 +- tests/test_tasks/test_mteb_rerank.py | 9 +++------ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 7b0bd6c08b..5b40df033e 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -16,6 +16,7 @@ from codecarbon import EmissionsTracker from sentence_transformers import CrossEncoder, SentenceTransformer +import mteb from mteb.abstasks.AbsTask import ScoresDict from mteb.encoder_interface import Encoder from mteb.model_meta import ModelMeta @@ -49,7 +50,7 @@ def __init__( from mteb.benchmarks import Benchmark self.tasks = deepcopy(tasks) - if isinstance(self.tasks[0], Benchmark): + if len(self.tasks) > 0 and isinstance(self.tasks[0], Benchmark): self.benchmarks = tasks self.tasks = list(chain.from_iterable(self.tasks)) @@ -130,6 +131,13 @@ def mteb_benchmarks(self): name 
= benchmark.name
             self._display_tasks(benchmark.tasks, name=name)
 
+    @classmethod
+    def mteb_tasks(cls):
+        """Get all tasks available in the MTEB."""
+        tasks = mteb.get_tasks()
+        instance = cls(tasks)
+        instance._display_tasks(tasks, name="MTEB tasks")
+
     def print_selected_tasks(self):
         """Print the selected tasks."""
         self._display_tasks(self.tasks, name="Selected tasks")
diff --git a/tests/test_benchmark/task_grid.py b/tests/test_benchmark/task_grid.py
index 320ca9c930..3ad484b6ff 100644
--- a/tests/test_benchmark/task_grid.py
+++ b/tests/test_benchmark/task_grid.py
@@ -50,7 +50,7 @@
             "InstructIR",  # instruction retrieval
         ]
     )
-    + mteb.get_tasks(tasks=["IndicSentimentClassification"], eval_splits=["as"])
+    + mteb.get_tasks(tasks=["IndicSentimentClassification"], languages=["asm-Beng"])
 )
 
 TASK_TEST_GRID_AS_STRING = [
diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py
index c540bb41ee..565b00e22f 100644
--- a/tests/test_tasks/test_mteb_rerank.py
+++ b/tests/test_tasks/test_mteb_rerank.py
@@ -6,6 +6,7 @@
 
 from sentence_transformers import CrossEncoder, SentenceTransformer
 
+import mteb
 from mteb import MTEB
 from mteb.model_meta import ModelMeta
 
@@ -318,11 +319,7 @@ def test_mteb_rerank(tmp_path: Path):
         "1395",
     ]
     model = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2")
-    eval = MTEB(
-        tasks=[
-            "SciFact",
-        ]
-    )
+    eval = MTEB(tasks=mteb.get_tasks(tasks=["SciFact"]))
     # create fake first stage results
     tmp_file = tmp_path / "tmp.json"
     with open(tmp_file, "w") as f:
@@ -374,7 +371,7 @@ def test_reranker_same_ndcg1():
         revision=ce_revision,
         release_date="2021-04-15",
     )
-    eval = MTEB(tasks=mteb.get_tasks(tasks=["SciFact"]))
     eval.run(
         de,
         output_folder="tests/results/stage1",

From cee5c3586c6b08812d469c39d74cb7b3c8c1394c Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Tue, 14 Jan 2025 11:45:17 +0300
Subject: [PATCH 11/12] back to tuple

---
 mteb/evaluation/MTEB.py             | 4 ++--
 mteb/overview.py                    | 4 ++--
 tests/test_reproducible_workflow.py | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py
index 5b40df033e..b9ac0c20a2 100644
--- a/mteb/evaluation/MTEB.py
+++ b/mteb/evaluation/MTEB.py
@@ -35,7 +35,7 @@ class MTEB:
     def __init__(
         self,
-        tasks: list[AbsTask | Benchmark],
+        tasks: Iterable[AbsTask | Benchmark],
         *,
         err_logs_path: str = "error_logs.txt",
         **kwargs,
     ):
@@ -49,7 +49,7 @@ def __init__(
         """
         from mteb.benchmarks import Benchmark
 
-        self.tasks = deepcopy(tasks)
+        self.tasks = list(tasks)
         if len(self.tasks) > 0 and isinstance(self.tasks[0], Benchmark):
             self.benchmarks = tasks
             self.tasks = list(chain.from_iterable(self.tasks))
diff --git a/mteb/overview.py b/mteb/overview.py
index c12c28a4e1..39d96041bd 100644
--- a/mteb/overview.py
+++ b/mteb/overview.py
@@ -119,12 +119,12 @@ def filter_task_by_categories(
     return [t for t in tasks if t.metadata.category in _categories]
 
 
-class MTEBTasks(list):
+class MTEBTasks(tuple):
     def __repr__(self) -> str:
         return "MTEBTasks" + super().__repr__()
 
     @staticmethod
-    def _extract_property_from_task(task, property):
+    def _extract_property_from_task(task, property: str):
         if hasattr(task.metadata, property):
             return getattr(task.metadata, property)
         elif hasattr(task, property):
diff --git a/tests/test_reproducible_workflow.py b/tests/test_reproducible_workflow.py
index 566864a112..1c7536076e 100644
--- a/tests/test_reproducible_workflow.py
+++ b/tests/test_reproducible_workflow.py
@@ -36,7 +36,7 @@ def test_reproducibility_workflow(task_name: str, model_name: str, model_revisio
 @pytest.mark.parametrize(
     "task_name",
     TASK_TEST_GRID
-    + [
+    + (
         "BitextMining",
         "Classification",
         "MultilabelClassification",
         "Clustering",
         "PairClassification",
         "Reranking",
         "Retrieval",
         "STS",
         "Summarization",
         "InstructionRetrieval",
         "InstructionReranking",
         "Speed",
-    ],
+    ),
 )
 def test_validate_task_to_prompt_name(task_name: str | AbsTask):
     if isinstance(task_name, AbsTask):

From a1c5745021e7f2ff679b75cabe2304382aa3f280 Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Wed, 15 Jan 2025 19:18:06 +0300
Subject: [PATCH 12/12] update args description

---
 mteb/evaluation/MTEB.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py
index b9ac0c20a2..3c94f24785 100644
--- a/mteb/evaluation/MTEB.py
+++ b/mteb/evaluation/MTEB.py
@@ -38,14 +38,13 @@ def __init__(
         tasks: Iterable[AbsTask | Benchmark],
         *,
         err_logs_path: str = "error_logs.txt",
-        **kwargs,
     ):
         """Create an Evaluation pipeline, based on the provided tasks.
 
         Args:
-            tasks: List of tasks to be evaluated.
+            tasks: List of tasks or benchmarks to be evaluated, e.g. tasks returned by
+                `mteb.get_tasks(tasks=["task1", "task2"])` or `mteb.get_benchmark("MTEB(eng, classic)")`.
             err_logs_path: Path to save error logs.
-            kwargs: Additional arguments to be passed to the tasks
         """
         from mteb.benchmarks import Benchmark
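
Taken together, the series removes constructor-side filtering (task_types, task_categories, task_langs, version, and string task names) in favor of up-front task selection, and moves batch_size into encode_kwargs. Below is a minimal usage sketch of the resulting API, grounded only in what the patches above show; the model id and output folder are illustrative placeholders, not values taken from this PR:

    import mteb
    from mteb import MTEB
    from sentence_transformers import SentenceTransformer

    # Tasks are now selected before constructing the evaluation object;
    # MTEB() no longer resolves task names given as plain strings.
    tasks = mteb.get_tasks(tasks=["Banking77Classification", "SciFact"])
    evaluation = MTEB(tasks=tasks)

    # Benchmarks are passed the same way, wrapped in a list:
    # evaluation = MTEB(tasks=[mteb.get_benchmark("MTEB(eng, classic)")])

    model = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder model id
    evaluation.run(
        model,
        output_folder="results",  # placeholder path
        encode_kwargs={"batch_size": 32},  # batch_size travels via encode_kwargs now
    )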