From fb06d3ff5e00a9908f28732341a3b4a7783eeaa4 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 12 Jan 2025 10:35:43 +0300 Subject: [PATCH 01/12] remove deprecated parameters --- mteb/cli.py | 21 +----------- mteb/evaluation/MTEB.py | 75 +++-------------------------------------- 2 files changed, 5 insertions(+), 91 deletions(-) diff --git a/mteb/cli.py b/mteb/cli.py index 3c6c821f52..c552394e49 100644 --- a/mteb/cli.py +++ b/mteb/cli.py @@ -374,26 +374,7 @@ def main(): add_create_meta_parser(subparsers) args = parser.parse_args() - - # If no subcommand is provided, default to run with a deprecation warning - if not hasattr(args, "func"): - logger.warning( - "Using `mteb` without a subcommand is deprecated. Use `mteb run` instead.", - DeprecationWarning, - ) - # Set default arguments for 'run' if no subcommand is provided - default_args = parser.parse_args( - ["run"] - + list(map(str, args._get_args())) - + [ - f"--{k}" if v is None else f"--{k}={v}" - for k, v in vars(args).items() - if k != "func" - ] - ) - default_args.func(default_args) - else: - args.func(args) + args.func(args) if __name__ == "__main__": diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index ab317cadbd..142fc8396f 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -35,10 +35,6 @@ def __init__( self, tasks: Iterable[str | AbsTask] | None = None, *, - task_types: list[str] | None = None, - task_categories: list[str] | None = None, - task_langs: list[str] | None = None, - version=None, err_logs_path: str = "error_logs.txt", **kwargs, ): @@ -46,79 +42,23 @@ def __init__( Args: tasks: List of tasks to be evaluated. - task_types: Will be deprecated we recommend that you use `mteb.get_tasks()` to filter tasks. List of task types (Clustering, Retrieval..) to be - evaluated. If None, all tasks will be evaluated - task_categories: Will be deprecated we recommend that you use `mteb.get_tasks()` to filter tasks. List of task categories (s2s, p2p..) to be - evaluated. If None, all tasks will be evaluated - task_langs: Will be deprecated we recommend that you use `mteb.get_tasks()` to filter tasks. List of languages to be evaluated. if None, all - languages will be evaluated. ["eng-Latn", "deu_Latn"] will evaluate on all tasks with these languages. - version: Will be deprecated. Version of the benchmark to use. If None, latest is used err_logs_path: Path to save error logs. 
kwargs: Additional arguments to be passed to the tasks """ from mteb.benchmarks import Benchmark - self.deprecation_warning( - task_types, task_categories, task_langs, tasks, version - ) - - if tasks is not None: - self._tasks = tasks - if isinstance(tasks[0], Benchmark): - self.benchmarks = tasks - self._tasks = list(chain.from_iterable(tasks)) - assert ( - task_types is None and task_categories is None - ), "Cannot specify both `tasks` and `task_types`/`task_categories`" - else: - self._task_types = task_types - self._task_categories = task_categories - self._tasks = None - - self._task_langs = task_langs if task_langs is not None else [] - if isinstance(self._task_langs, str): - self._task_langs = [self._task_langs] + self._tasks = tasks + if isinstance(tasks[0], Benchmark): + self.benchmarks = tasks + self._tasks = list(chain.from_iterable(tasks)) self._extend_lang_code() self._extend_lang_pairs() # add all possible pairs - - self._version = version self.err_logs_path = err_logs_path - self.last_evaluated_splits = {} self.select_tasks(**kwargs) - def deprecation_warning( - self, task_types, task_categories, task_langs, tasks, version - ): - if task_types is not None: - logger.warning( - "The `task_types` argument is deprecated and will be removed in the next release. " - + "Please use `tasks = mteb.get_tasks(... task_types = [...])` to filter tasks instead." - ) - if task_categories is not None: - logger.warning( - "The `task_categories` argument is deprecated and will be removed in the next release. " - + "Please use `tasks = mteb.get_tasks(... categories = [...])` to filter tasks instead." - ) - if task_langs is not None: - logger.warning( - "The `task_langs` argument is deprecated and will be removed in the next release. " - + "Please use `tasks = mteb.get_tasks(... languages = [...])` to filter tasks instead. " - + "Note that this uses 3 letter language codes (ISO 639-3)." - ) - if version is not None: - logger.warning( - "The `version` argument is deprecated and will be removed in the next release." - ) - task_contains_strings = any(isinstance(x, str) for x in tasks or []) - if task_contains_strings: - logger.warning( - "Passing task names as strings is deprecated and will be removed in the next release. " - + "Please use `tasks = mteb.get_tasks(tasks=[...])` method to get tasks instead." - ) - @property def available_tasks(self): return [x.metadata.name for x in self.tasks_cls] @@ -412,13 +352,6 @@ def run( Returns: A list of TaskResult objects, one for each task evaluated. """ - if "batch_size" in kwargs: - logger.warning( - "The `batch_size` argument is deprecated and will be removed in the next release. " - + "Please use `encode_kwargs = {'batch_size': ...}` to set the batch size instead." 
- ) - encode_kwargs["batch_size"] = kwargs["batch_size"] - # update logging to account for different levels of Verbosity (similar to the command line) if verbosity == 0: From d189eb50593af93f6372db89faadc3b9011beaf4 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 12 Jan 2025 10:50:42 +0300 Subject: [PATCH 02/12] remove _task_langs --- mteb/evaluation/MTEB.py | 86 ----------------------------------------- 1 file changed, 86 deletions(-) diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 142fc8396f..3889dd5bcc 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -52,13 +52,9 @@ def __init__( self.benchmarks = tasks self._tasks = list(chain.from_iterable(tasks)) - self._extend_lang_code() - self._extend_lang_pairs() # add all possible pairs self.err_logs_path = err_logs_path self.last_evaluated_splits = {} - self.select_tasks(**kwargs) - @property def available_tasks(self): return [x.metadata.name for x in self.tasks_cls] @@ -72,24 +68,6 @@ def available_task_types(self): def available_task_categories(self): return {x.metadata.category for x in self.tasks_cls} - def _extend_lang_code(self): - # add all possible language codes - for lang in set(self._task_langs): - if lang in LangMapping.LANG_MAPPING: - self._task_langs += LangMapping.LANG_MAPPING[lang] - - def _extend_lang_pairs(self): - # add all possible language pairs - langs = set(self._task_langs) - for x in langs: - if "-" not in x: - for y in langs: - if "-" not in y: - pair = f"{x}-{y}" - if pair not in langs: - self._task_langs.append(pair) - return - def _display_tasks(self, task_list, name=None): from rich.console import Console @@ -161,70 +139,6 @@ def print_selected_tasks(self): """Print the selected tasks.""" self._display_tasks(self.tasks, name="Selected tasks") - def select_tasks(self, **kwargs): - """Select the tasks to be evaluated.""" - # Get all existing tasks - # reranking subclasses retrieval to share methods, but is an abstract task - tasks_categories_cls = list(AbsTask.__subclasses__()) + [AbsTaskReranking] - all_task_classes = [] - for cat_cls in tasks_categories_cls: - for cls in cat_cls.__subclasses__(): - if ( - cat_cls.__name__.startswith("AbsTask") - and cls.__name__ != "AbsTaskReranking" - ): - task = cls(hf_subsets=self._task_langs, **kwargs) - all_task_classes.append(task) - - self.tasks_cls = all_task_classes - - # If `task_list` is specified, select list of tasks - if self._tasks is not None: - self.tasks = list( - filter(lambda x: (x.metadata.name in self._tasks), self.tasks_cls) - ) - if len(self.tasks) != len(self._tasks): - tasks_known = {x.metadata.name for x in self.tasks_cls} - tasks_unknown = { - x for x in self._tasks if isinstance(x, str) - } - tasks_known - if tasks_unknown: - unknown_str, known_str = ( - ",".join(sorted(tasks_unknown)), - ",".join(sorted(tasks_known)), - ) - logger.warning( - f"WARNING: Unknown tasks: {unknown_str}. Known tasks: {known_str}." 
- ) - # add task if subclass of mteb.tasks - self.tasks.extend([x for x in self._tasks if isinstance(x, AbsTask)]) - return - - # Otherwise use filters to select tasks - filtered_tasks = filter( - lambda x: (self._task_types is None) - or (x.metadata.type in self._task_types), - self.tasks_cls, - ) - filtered_tasks = filter( - lambda x: (self._task_categories is None) - or (x.metadata.category in self._task_categories), - filtered_tasks, - ) - filtered_tasks = filter( - lambda x: (self._version is None) or (x.metadata.version >= self._version), - filtered_tasks, - ) - # keep only tasks with at least one language in the filter - filtered_tasks = filter( - lambda x: (not self._task_langs) - or (len(set(x.metadata.eval_langs) & set(self._task_langs)) > 0), - filtered_tasks, - ) - - # Get final list of tasks - self.tasks = list(filtered_tasks) - def load_tasks_data(self): """Load datasets for the selected tasks.""" logger.info(f"\n\n## Loading datasets for {len(self.tasks)} tasks") From dbb56c61ff8b50a3a3892f477657e847e4e99fbb Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 12 Jan 2025 10:50:51 +0300 Subject: [PATCH 03/12] lint --- mteb/evaluation/MTEB.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 3889dd5bcc..251846db7f 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -22,10 +22,8 @@ from mteb.models import model_meta_from_sentence_transformers from ..abstasks.AbsTask import AbsTask -from ..abstasks.AbsTaskReranking import AbsTaskReranking from ..load_results.task_results import TaskResult from ..models.sentence_transformer_wrapper import SentenceTransformerWrapper -from . import LangMapping logger = logging.getLogger(__name__) From e75806b1086d87dd47b28f5997c29d132fe71a39 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 12 Jan 2025 11:07:49 +0300 Subject: [PATCH 04/12] fixes --- mteb/evaluation/MTEB.py | 20 +++++++------------- tests/test_overview.py | 5 ----- 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 251846db7f..efbc913fc9 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -31,7 +31,7 @@ class MTEB: def __init__( self, - tasks: Iterable[str | AbsTask] | None = None, + tasks: Iterable[AbsTask], *, err_logs_path: str = "error_logs.txt", **kwargs, @@ -45,28 +45,28 @@ def __init__( """ from mteb.benchmarks import Benchmark - self._tasks = tasks + self.tasks = tasks if isinstance(tasks[0], Benchmark): self.benchmarks = tasks - self._tasks = list(chain.from_iterable(tasks)) + self.tasks = list(chain.from_iterable(tasks)) self.err_logs_path = err_logs_path self.last_evaluated_splits = {} @property def available_tasks(self): - return [x.metadata.name for x in self.tasks_cls] + return [x.metadata.name for x in self.tasks] @property def available_task_types(self): # sort the task types - return sorted({x.metadata.type for x in self.tasks_cls}) + return sorted({x.metadata.type for x in self.tasks}) @property def available_task_categories(self): - return {x.metadata.category for x in self.tasks_cls} + return {x.metadata.category for x in self.tasks} - def _display_tasks(self, task_list, name=None): + def _display_tasks(self, task_list: Iterable[AbsTask], name: str | None = None): from rich.console import Console # disable logging for other ranks @@ -127,12 +127,6 @@ def mteb_benchmarks(self): name = benchmark.name 
self._display_tasks(benchmark.tasks, name=name) - @classmethod - def mteb_tasks(cls): - """Get all tasks available in the MTEB.""" - instance = cls() - instance._display_tasks(instance.tasks_cls, name="MTEB tasks") - def print_selected_tasks(self): """Print the selected tasks.""" self._display_tasks(self.tasks, name="Selected tasks") diff --git a/tests/test_overview.py b/tests/test_overview.py index 127e54f279..6136af1ea5 100644 --- a/tests/test_overview.py +++ b/tests/test_overview.py @@ -98,8 +98,3 @@ def test_MTEBTasks( # check for header of a table n_langs = len(tasks) assert len(tasks.to_markdown().split("\n")) - 3 == n_langs - - -def test_all_tasks_fetch(): - """Test that all tasks can be fetched""" - mteb.MTEB.mteb_tasks() From d2bd00a18c97e1f87e3198f9de2c5bf1bbf15a4d Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 12 Jan 2025 11:21:54 +0300 Subject: [PATCH 05/12] fixes --- mteb/evaluation/MTEB.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index efbc913fc9..87c0e458fc 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -45,10 +45,10 @@ def __init__( """ from mteb.benchmarks import Benchmark - self.tasks = tasks - if isinstance(tasks[0], Benchmark): + self.tasks = deepcopy(tasks) + if isinstance(self.tasks[0], Benchmark): self.benchmarks = tasks - self.tasks = list(chain.from_iterable(tasks)) + self.tasks = list(chain.from_iterable(self.tasks)) self.err_logs_path = err_logs_path self.last_evaluated_splits = {} From 1b75b4619a15a816a43712ba09940b4819c807f8 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 12 Jan 2025 22:23:49 +0300 Subject: [PATCH 06/12] fixes --- mteb/evaluation/MTEB.py | 15 ++++++----- mteb/overview.py | 2 +- tests/test_benchmark/task_grid.py | 36 ++++++++++++++++---------- tests/test_benchmark/test_benchmark.py | 2 +- 4 files changed, 33 insertions(+), 22 deletions(-) diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 87c0e458fc..4757add99c 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -5,12 +5,12 @@ import os import traceback from collections.abc import Iterable -from copy import copy, deepcopy +from copy import deepcopy from datetime import datetime from itertools import chain from pathlib import Path from time import time -from typing import Any +from typing import TYPE_CHECKING, Any import datasets from codecarbon import EmissionsTracker @@ -25,13 +25,16 @@ from ..load_results.task_results import TaskResult from ..models.sentence_transformer_wrapper import SentenceTransformerWrapper +if TYPE_CHECKING: + from mteb.benchmarks import Benchmark + logger = logging.getLogger(__name__) class MTEB: def __init__( self, - tasks: Iterable[AbsTask], + tasks: list[AbsTask | Benchmark], *, err_logs_path: str = "error_logs.txt", **kwargs, @@ -290,8 +293,8 @@ def run( self.print_selected_tasks() evaluation_results = [] - original_tasks = ( - self.tasks.copy() + original_tasks = deepcopy( + self.tasks ) # save them in case we re-use the object (e.g. for reranking) # To evaluate missing splits, we keep track of the task name and the corresponding splits. 
@@ -501,7 +504,7 @@ def create_model_meta(model: Encoder) -> ModelMeta: ) # create a copy of the meta to avoid modifying the original object - meta = copy(meta) + meta = deepcopy(meta) meta.revision = meta.revision or "no_revision_available" meta.name = meta.name or "no_model_name_available" diff --git a/mteb/overview.py b/mteb/overview.py index 64e8802563..d0982974e5 100644 --- a/mteb/overview.py +++ b/mteb/overview.py @@ -119,7 +119,7 @@ def filter_task_by_categories( return [t for t in tasks if t.metadata.category in _categories] -class MTEBTasks(tuple): +class MTEBTasks(list): def __repr__(self) -> str: return "MTEBTasks" + super().__repr__() diff --git a/tests/test_benchmark/task_grid.py b/tests/test_benchmark/task_grid.py index 8ae310555f..1e5f6b967d 100644 --- a/tests/test_benchmark/task_grid.py +++ b/tests/test_benchmark/task_grid.py @@ -3,12 +3,20 @@ from __future__ import annotations from mteb.abstasks import AbsTask -from mteb.tasks.BitextMining.dan.BornholmskBitextMining import BornholmBitextMining -from mteb.tasks.Classification.multilingual.IndicSentimentClassification import ( +from mteb.tasks import ( + Banking77Classification, + BornholmBitextMining, + BrazilianToxicTweetsClassification, + Core17InstructionRetrieval, + FaroeseSTS, + FarsTail, IndicSentimentClassification, -) -from mteb.tasks.Clustering.eng.TwentyNewsgroupsClustering import ( + InstructIR, + SciDocsReranking, + SummEvalSummarization, + TwentyNewsgroupsClustering, TwentyNewsgroupsClusteringFast, + TwitterHjerneRetrieval, ) from .mock_tasks import ( @@ -52,17 +60,17 @@ hf_subsets=["as"], # we only load one subset here to speed up tests n_experiments=2, # to speed up the test ), - "TwentyNewsgroupsClustering", # clustering and string instead of class + TwentyNewsgroupsClustering, # clustering and string instead of class twenty_news, # fast clustering - "Banking77Classification", # classification - "SciDocsRR", # reranking - "FarsTail", # pair classification - "TwitterHjerneRetrieval", # retrieval - "BrazilianToxicTweetsClassification", # multilabel classification - "FaroeseSTS", # STS - "SummEval", # summarization - "Core17InstructionRetrieval", # instruction reranking - "InstructIR", # instruction retrieval + Banking77Classification, # classification + SciDocsReranking, # reranking + FarsTail, # pair classification + TwitterHjerneRetrieval, # retrieval + BrazilianToxicTweetsClassification, # multilabel classification + FaroeseSTS, # STS + SummEvalSummarization, # summarization + Core17InstructionRetrieval, # instruction reranking + InstructIR, # instruction retrieval ] TASK_TEST_GRID_AS_STRING = [ diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index e84f0e63b6..b3d318afb4 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -170,7 +170,7 @@ def test_run_using_benchmark(model: mteb.Encoder): name="test_bench", tasks=mteb.get_tasks(tasks=["STS12", "SummEval"]) ) - eval = mteb.MTEB(tasks=bench) + eval = mteb.MTEB(tasks=[bench]) eval.run( model, output_folder="tests/results", overwrite_results=True ) # we just want to test that it runs From f9ae1c31e9fb7149b29ff3ca99c90e378fc38c63 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 12 Jan 2025 22:26:48 +0300 Subject: [PATCH 07/12] fix all abs tasks --- tests/test_tasks/test_all_abstasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_tasks/test_all_abstasks.py 
b/tests/test_tasks/test_all_abstasks.py index 84d5a521ca..71c9175966 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ -13,14 +13,14 @@ from mteb.abstasks.AbsTaskReranking import AbsTaskReranking from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval from mteb.abstasks.AbsTaskSpeedTask import AbsTaskSpeedTask -from mteb.overview import TASKS_REGISTRY +from mteb.overview import TASKS_REGISTRY, get_tasks from ..test_benchmark.task_grid import MOCK_TASK_TEST_GRID_AS_STRING logging.basicConfig(level=logging.INFO) tasks = [ - t for t in MTEB().tasks_cls if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING + t for t in get_tasks() if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING ] From fb0b5e7a6e90bfe9fb8154d22d2eeff71641e6e5 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 12 Jan 2025 22:51:33 +0300 Subject: [PATCH 08/12] change to get tasks --- tests/test_benchmark/task_grid.py | 60 +++++++++------------------ tests/test_tasks/test_all_abstasks.py | 6 +-- 2 files changed, 22 insertions(+), 44 deletions(-) diff --git a/tests/test_benchmark/task_grid.py b/tests/test_benchmark/task_grid.py index 1e5f6b967d..320ca9c930 100644 --- a/tests/test_benchmark/task_grid.py +++ b/tests/test_benchmark/task_grid.py @@ -2,22 +2,8 @@ from __future__ import annotations +import mteb from mteb.abstasks import AbsTask -from mteb.tasks import ( - Banking77Classification, - BornholmBitextMining, - BrazilianToxicTweetsClassification, - Core17InstructionRetrieval, - FaroeseSTS, - FarsTail, - IndicSentimentClassification, - InstructIR, - SciDocsReranking, - SummEvalSummarization, - TwentyNewsgroupsClustering, - TwentyNewsgroupsClusteringFast, - TwitterHjerneRetrieval, -) from .mock_tasks import ( MockBitextMiningTask, @@ -47,31 +33,25 @@ MockSummarizationTask, ) -twenty_news = TwentyNewsgroupsClusteringFast() - -# downsample to speed up tests -twenty_news.max_document_to_embed = 1000 -twenty_news.n_clusters = 2 -twenty_news.max_fraction_of_documents_to_embed = None - -TASK_TEST_GRID = [ - BornholmBitextMining(), # bitext mining + just supplying a task class instead of a string - IndicSentimentClassification( # multi subset loader - hf_subsets=["as"], # we only load one subset here to speed up tests - n_experiments=2, # to speed up the test - ), - TwentyNewsgroupsClustering, # clustering and string instead of class - twenty_news, # fast clustering - Banking77Classification, # classification - SciDocsReranking, # reranking - FarsTail, # pair classification - TwitterHjerneRetrieval, # retrieval - BrazilianToxicTweetsClassification, # multilabel classification - FaroeseSTS, # STS - SummEvalSummarization, # summarization - Core17InstructionRetrieval, # instruction reranking - InstructIR, # instruction retrieval -] +TASK_TEST_GRID = ( + mteb.get_tasks( + tasks=[ + "BornholmBitextMining", # bitext mining + just supplying a task class instead of a string + "TwentyNewsgroupsClustering", # clustering and string instead of class + "TwentyNewsgroupsClustering.v2", # fast clustering + "Banking77Classification", # classification + "SciDocsRR", # reranking + "FarsTail", # pair classification + "TwitterHjerneRetrieval", # retrieval + "BrazilianToxicTweetsClassification", # multilabel classification + "FaroeseSTS", # STS + "SummEval", # summarization + "Core17InstructionRetrieval", # instruction reranking + "InstructIR", # instruction retrieval + ] + ) + + mteb.get_tasks(tasks=["IndicSentimentClassification"], 
eval_splits=["as"]) +) TASK_TEST_GRID_AS_STRING = [ t.metadata.name if isinstance(t, AbsTask) else t for t in TASK_TEST_GRID diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py index 71c9175966..b7deaf1310 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ -19,9 +19,7 @@ logging.basicConfig(level=logging.INFO) -tasks = [ - t for t in get_tasks() if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING -] +tasks = [t for t in get_tasks() if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING] @pytest.mark.parametrize("task", tasks) @@ -84,7 +82,7 @@ async def check_datasets_are_available_on_hf(tasks): def test_dataset_availability(): """Checks if the datasets are available on Hugging Face using both their name and revision.""" - tasks = MTEB().tasks_cls + tasks = get_tasks() tasks = [t for t in tasks if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING] asyncio.run(check_datasets_are_available_on_hf(tasks)) From f822bdd8b2495b2e8427e5d6d568494077fe0494 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 12 Jan 2025 22:54:58 +0300 Subject: [PATCH 09/12] try to fix --- tests/test_benchmark/test_benchmark.py | 2 +- tests/test_tasks/test_all_abstasks.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index d093d2f030..1393d46f12 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -67,7 +67,7 @@ def test_benchmark_encoders_on_task(task: str | AbsTask, model: mteb.Encoder): eval.run(model, output_folder="tests/results", overwrite_results=True) -@pytest.mark.parametrize("task", [MockMultilingualRetrievalTask]) +@pytest.mark.parametrize("task", [MockMultilingualRetrievalTask()]) @pytest.mark.parametrize( "model", [MockSentenceTransformer()], diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py index b7deaf1310..58c91c41df 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ -8,7 +8,6 @@ import pytest import mteb -from mteb import MTEB from mteb.abstasks import AbsTask, MultilingualTask from mteb.abstasks.AbsTaskReranking import AbsTaskReranking from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval From 33fc8cafe5802d2ef7eb9f054be64151bf7e6e12 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 12 Jan 2025 23:18:49 +0300 Subject: [PATCH 10/12] final fixes --- mteb/evaluation/MTEB.py | 10 +++++++++- tests/test_benchmark/task_grid.py | 2 +- tests/test_tasks/test_mteb_rerank.py | 9 +++------ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 7b0bd6c08b..5b40df033e 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -16,6 +16,7 @@ from codecarbon import EmissionsTracker from sentence_transformers import CrossEncoder, SentenceTransformer +import mteb from mteb.abstasks.AbsTask import ScoresDict from mteb.encoder_interface import Encoder from mteb.model_meta import ModelMeta @@ -49,7 +50,7 @@ def __init__( from mteb.benchmarks import Benchmark self.tasks = deepcopy(tasks) - if isinstance(self.tasks[0], Benchmark): + if len(self.tasks) > 0 and isinstance(self.tasks[0], Benchmark): self.benchmarks = tasks self.tasks = list(chain.from_iterable(self.tasks)) @@ -130,6 +131,13 @@ def mteb_benchmarks(self): name 
= benchmark.name
             self._display_tasks(benchmark.tasks, name=name)
 
+    @classmethod
+    def mteb_tasks(cls):
+        """Get all tasks available in the MTEB."""
+        tasks = mteb.get_tasks()
+        instance = cls(tasks)
+        instance._display_tasks(tasks, name="MTEB tasks")
+
     def print_selected_tasks(self):
         """Print the selected tasks."""
         self._display_tasks(self.tasks, name="Selected tasks")
diff --git a/tests/test_benchmark/task_grid.py b/tests/test_benchmark/task_grid.py
index 320ca9c930..3ad484b6ff 100644
--- a/tests/test_benchmark/task_grid.py
+++ b/tests/test_benchmark/task_grid.py
@@ -50,7 +50,7 @@
             "InstructIR",  # instruction retrieval
         ]
     )
-    + mteb.get_tasks(tasks=["IndicSentimentClassification"], eval_splits=["as"])
+    + mteb.get_tasks(tasks=["IndicSentimentClassification"], languages=["asm-Beng"])
 )
 
 TASK_TEST_GRID_AS_STRING = [
diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py
index c540bb41ee..565b00e22f 100644
--- a/tests/test_tasks/test_mteb_rerank.py
+++ b/tests/test_tasks/test_mteb_rerank.py
@@ -6,6 +6,7 @@
 
 from sentence_transformers import CrossEncoder, SentenceTransformer
 
+import mteb
 from mteb import MTEB
 from mteb.model_meta import ModelMeta
 
@@ -318,11 +319,7 @@ def test_mteb_rerank(tmp_path: Path):
         "1395",
     ]
     model = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2")
-    eval = MTEB(
-        tasks=[
-            "SciFact",
-        ]
-    )
+    eval = MTEB(tasks=mteb.get_tasks(tasks=["SciFact"]))
     # create fake first stage results
     tmp_file = tmp_path / "tmp.json"
     with open(tmp_file, "w") as f:
@@ -374,7 +371,7 @@ def test_reranker_same_ndcg1():
         revision=ce_revision,
         release_date="2021-04-15",
     )
-    eval = MTEB(tasks=mteb.get_tasks(tasks=["SciFact"]))
     eval.run(
         de,
         output_folder="tests/results/stage1",

From cee5c3586c6b08812d469c39d74cb7b3c8c1394c Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Tue, 14 Jan 2025 11:45:17 +0300
Subject: [PATCH 11/12] back to tuple

---
 mteb/evaluation/MTEB.py             | 4 ++--
 mteb/overview.py                    | 4 ++--
 tests/test_reproducible_workflow.py | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py
index 5b40df033e..b9ac0c20a2 100644
--- a/mteb/evaluation/MTEB.py
+++ b/mteb/evaluation/MTEB.py
@@ -35,7 +35,7 @@ class MTEB:
     def __init__(
         self,
-        tasks: list[AbsTask | Benchmark],
+        tasks: Iterable[AbsTask | Benchmark],
         *,
         err_logs_path: str = "error_logs.txt",
         **kwargs,
     ):
@@ -49,7 +49,7 @@ def __init__(
         """
         from mteb.benchmarks import Benchmark
 
-        self.tasks = deepcopy(tasks)
+        self.tasks = list(tasks)
         if len(self.tasks) > 0 and isinstance(self.tasks[0], Benchmark):
             self.benchmarks = tasks
             self.tasks = list(chain.from_iterable(self.tasks))
diff --git a/mteb/overview.py b/mteb/overview.py
index c12c28a4e1..39d96041bd 100644
--- a/mteb/overview.py
+++ b/mteb/overview.py
@@ -119,12 +119,12 @@ def filter_task_by_categories(
     return [t for t in tasks if t.metadata.category in _categories]
 
 
-class MTEBTasks(list):
+class MTEBTasks(tuple):
     def __repr__(self) -> str:
         return "MTEBTasks" + super().__repr__()
 
     @staticmethod
-    def _extract_property_from_task(task, property):
+    def _extract_property_from_task(task, property: str):
         if hasattr(task.metadata, property):
             return getattr(task.metadata, property)
         elif hasattr(task, property):
diff --git a/tests/test_reproducible_workflow.py b/tests/test_reproducible_workflow.py
index 566864a112..1c7536076e 100644
--- a/tests/test_reproducible_workflow.py
+++ b/tests/test_reproducible_workflow.py
@@ -36,7 +36,7 @@ def test_reproducibility_workflow(task_name: str, model_name: str, model_revisio
 @pytest.mark.parametrize(
     "task_name",
     TASK_TEST_GRID
-    + [
+    + (
         "BitextMining",
         "Classification",
         "MultilabelClassification",
         "Clustering",
         "PairClassification",
         "Reranking",
         "Retrieval",
         "STS",
         "Summarization",
         "InstructionRetrieval",
         "InstructionReranking",
         "Speed",
-    ],
+    ),
 )
 def test_validate_task_to_prompt_name(task_name: str | AbsTask):
     if isinstance(task_name, AbsTask):

From a1c5745021e7f2ff679b75cabe2304382aa3f280 Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Wed, 15 Jan 2025 19:18:06 +0300
Subject: [PATCH 12/12] update args description

---
 mteb/evaluation/MTEB.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py
index b9ac0c20a2..3c94f24785 100644
--- a/mteb/evaluation/MTEB.py
+++ b/mteb/evaluation/MTEB.py
@@ -38,14 +38,13 @@ def __init__(
         tasks: Iterable[AbsTask | Benchmark],
         *,
         err_logs_path: str = "error_logs.txt",
-        **kwargs,
     ):
         """Create an Evaluation pipeline, based on the provided tasks.
 
         Args:
-            tasks: List of tasks to be evaluated.
+            tasks: List of tasks or benchmarks to be evaluated, e.g. tasks returned by
+                `mteb.get_tasks(tasks=["task1", "task2"])` or `mteb.get_benchmark("MTEB(eng, classic)")`.
             err_logs_path: Path to save error logs.
-            kwargs: Additional arguments to be passed to the tasks
         """
         from mteb.benchmarks import Benchmark
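
Taken together, the series removes constructor-side filtering (task_types, task_categories, task_langs, version, and string task names) in favor of up-front task selection, and moves batch_size into encode_kwargs. Below is a minimal usage sketch of the resulting API, grounded only in what the patches above show; the model id and output folder are illustrative placeholders, not values taken from this PR:

    import mteb
    from mteb import MTEB
    from sentence_transformers import SentenceTransformer

    # Tasks are now selected before constructing the evaluation object;
    # MTEB() no longer resolves task names given as plain strings.
    tasks = mteb.get_tasks(tasks=["Banking77Classification", "SciFact"])
    evaluation = MTEB(tasks=tasks)

    # Benchmarks are passed the same way, wrapped in a list:
    # evaluation = MTEB(tasks=[mteb.get_benchmark("MTEB(eng, classic)")])

    model = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder model id
    evaluation.run(
        model,
        output_folder="results",  # placeholder path
        encode_kwargs={"batch_size": 32},  # batch_size travels via encode_kwargs now
    )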