27 changes: 22 additions & 5 deletions README.md
@@ -36,7 +36,7 @@
pip install mteb
```

## Usage
## Example Usage

* Using a python script:

@@ -71,7 +71,7 @@ mteb run -m sentence-transformers/all-MiniLM-L6-v2 \



## Advanced Usage
## Usage Documentation
Click on each section below to see the details.

<br />
@@ -126,13 +126,28 @@ evaluation = mteb.MTEB(tasks=[
<details>
<summary> Running a benchmark </summary>

### Running a Benchmark

`mteb` comes with a set of predefined benchmarks. These can be fetched using `get_benchmark` and run in the same fashion as other sets of tasks.
For instance, to select the 56 English datasets that form the "Overall MTEB English leaderboard":

```python
import mteb
mteb_eng = mteb.get_benchmark("MTEB(eng)")
evaluation = mteb.MTEB(tasks=mteb_eng, eval_splits=["test"])
benchmark = mteb.get_benchmark("MTEB(eng)")
evaluation = mteb.MTEB(tasks=benchmark)
```

The benchmark specifies not only a list of tasks, but also which splits and languages to run on. To get an overview of all available benchmarks, simply run:

```python
import mteb
benchmarks = mteb.get_benchmarks()
```

Generally, we use the naming scheme `MTEB(*)` for benchmarks, where the "*" denotes the target of the benchmark. In the case of a single language, we use the three-letter language code. For large groups of languages, we use the group notation, e.g. `MTEB(Scandinavian)` for Scandinavian languages. External benchmarks implemented in MTEB, like `CoIR`, use their original name. When using a benchmark from MTEB, please cite `mteb` along with the citations of the benchmark, which you can access using:

```python
benchmark.citation
```
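The benchmark object described here can be modeled in a few lines. The sketch below is a simplified, hypothetical stand-in for `mteb`'s actual `Benchmark`/registry machinery, assuming only that a benchmark bundles a name, a task list, and a citation into a registry keyed by name:

```python
from dataclasses import dataclass, field


@dataclass
class Benchmark:
    """Illustrative stand-in for mteb's Benchmark object."""

    name: str
    tasks: list = field(default_factory=list)
    citation: str = ""


# Hypothetical two-entry registry; the real one holds every predefined benchmark.
BENCHMARK_REGISTRY = {
    "MTEB(eng)": Benchmark(
        name="MTEB(eng)",
        tasks=["Banking77Classification", "TwitterURLCorpus"],
        citation="@inproceedings{muennighoff-etal-2023-mteb, ...}",
    ),
    "MTEB(Scandinavian)": Benchmark(name="MTEB(Scandinavian)"),
}


def get_benchmarks(names=None):
    """Return all registered benchmarks, or just the named subset."""
    if names is None:
        names = list(BENCHMARK_REGISTRY)
    return [BENCHMARK_REGISTRY[name] for name in names]
```

With this in place, `get_benchmarks()` returns both entries, and `get_benchmarks(["MTEB(eng)"])[0].citation` yields the BibTeX entry to cite alongside `mteb`.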

</details>
@@ -325,9 +340,11 @@ mteb run -t NFCorpus -m all-MiniLM-L6-v2 --output_folder results --save_predicti
<details>
<summary> Fetching results from the results repository </summary>

### Fetching results from the results repository

Multiple models have already been run on the tasks available within MTEB. These results are available in the results [repository](https://github.com/embeddings-benchmark/results).

To make the results more easily accecible we have designed custom functionality for retrieving from the repository. For instance, you are selecting the best model for your French and English retrieval task on legal documents you could fetch the relevant tasks and create a dataframe of the results using the following code:
To make the results more easily accessible, we have designed custom functionality for retrieving them from the repository. For instance, if you are selecting the best model for your French and English retrieval task on legal documents, you could fetch the relevant tasks and create a dataframe of the results using the following code:

```python
import mteb
9 changes: 3 additions & 6 deletions docs/adding_a_model.md
@@ -29,10 +29,7 @@ mteb run -m {model_name} -t {task_names}

These will save the results in a folder called `results/{model_name}/{model_revision}`.

For reference you can also look at [scripts/data/run_mteb_english.py](https://github.com/embeddings-benchmark/mteb/blob/main/scripts/data/run_mteb_english.py) for all MTEB English datasets used in the main ranking.
Advanced scripts with different models are available in the [mteb/mtebscripts repo](https://github.com/embeddings-benchmark/mtebscripts).

2. **Format the results using the CLI:**
1. **Format the results using the CLI:**

```bash
mteb create_meta --results_folder results/{model_name}/{model_revision} --output_path model_card.md
@@ -44,11 +41,11 @@ If readme of model exists:
mteb create_meta --results_folder results/{model_name}/{model_revision} --output_path model_card.md --from_existing your_existing_readme.md
```

3. **Add the frontmatter to model repository:**
2. **Add the frontmatter to model repository:**

Copy the content of the `model_card.md` file to the top of a `README.md` file of your model on the Hub. See [here](https://huggingface.co/Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit/blob/main/README.md) for an example.

4. **Wait for a refresh the leaderboard:**
3. **Wait for the leaderboard to refresh:**

The leaderboard refreshes automatically once a day, so once your results are submitted, all you have to do is wait.

18 changes: 15 additions & 3 deletions mteb/abstasks/AbsTask.py
@@ -60,12 +60,13 @@ class DescriptiveStatistics(TypedDict):

class AbsTask(ABC):
metadata: TaskMetadata
_eval_splits: list[str] | None = None
superseded_by: None | str = None
dataset: dict[HFSubset, DatasetDict] | None = None # type: ignore
data_loaded: bool = False
is_multilingual: bool = False

def __init__(self, seed: int = 42, **kwargs: Any):
self.dataset = None
self.data_loaded = False
self.is_multilingual = False
self.save_suffix = kwargs.get("save_suffix", "")

self.seed = seed
@@ -255,6 +256,11 @@ def languages(self) -> list[str]:

return self.metadata.languages

def filter_eval_splits(self, eval_splits: list[str] | None) -> AbsTask:
"""Filter the evaluation splits of the task."""
self._eval_splits = eval_splits
return self

def filter_languages(
self, languages: list[str] | None, script: list[str] | None = None
) -> AbsTask:
@@ -285,6 +291,12 @@ def filter_languages(
self.hf_subsets = subsets_to_keep
return self

@property
def eval_splits(self) -> list[str]:
if self._eval_splits:
return self._eval_splits
return self.metadata.eval_splits

def __repr__(self) -> str:
"""Format the representation of the task such that it appears as:

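The split handling added to `AbsTask` reduces to a stored override plus a property that falls back to the task's metadata. A self-contained sketch, with hypothetical `Task`/`TaskMetadata` stand-ins for the real classes:

```python
from dataclasses import dataclass


@dataclass
class TaskMetadata:
    """Stand-in for mteb's TaskMetadata: the splits the task defines."""

    eval_splits: list


class Task:
    def __init__(self, metadata: TaskMetadata):
        self.metadata = metadata
        self._eval_splits = None  # optional user-supplied override

    def filter_eval_splits(self, eval_splits):
        """Store an override; return self so calls can be chained."""
        self._eval_splits = eval_splits
        return self

    @property
    def eval_splits(self) -> list:
        # A None (or empty) override falls through to the metadata default.
        if self._eval_splits:
            return self._eval_splits
        return self.metadata.eval_splits
```

For example, `Task(TaskMetadata(["validation", "test"])).filter_eval_splits(["test"]).eval_splits` narrows the task to its test split, while a task that is never filtered keeps the splits from its metadata.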
2 changes: 1 addition & 1 deletion mteb/abstasks/AbsTaskBitextMining.py
@@ -32,7 +32,7 @@ class AbsTaskBitextMining(AbsTask):
"""Abstract class for BitextMining tasks
The similarity is computed between pairs and the results are ranked.

self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns:
self.load_data() must generate a huggingface dataset with a split matching self.metadata.eval_splits, and assign it to self.dataset. It must contain the following columns:
id: str
sentence1: str
sentence2: str
2 changes: 2 additions & 0 deletions mteb/benchmarks/benchmarks.py
@@ -126,6 +126,7 @@ def __getitem__(self, index):
"TwitterURLCorpus",
],
languages=["eng"],
eval_splits=["test"],
),
description="Main English benchmarks from MTEB",
citation="""@inproceedings{muennighoff-etal-2023-mteb,
@@ -263,6 +264,7 @@ def __getitem__(self, index):
}
""",
)

SEB = Benchmark(
name="MTEB(Scandinavian)",
tasks=get_tasks(
8 changes: 8 additions & 0 deletions mteb/benchmarks/get_benchmark.py
@@ -25,3 +25,11 @@ def get_benchmark(
suggestion = f"KeyError: '{benchmark_name}' not found and no similar keys were found."
raise KeyError(suggestion)
return BENCHMARK_REGISTRY[benchmark_name]


def get_benchmarks(
    names: list[str] | None = None,
) -> list[Benchmark]:
if names is None:
names = list(BENCHMARK_REGISTRY.keys())
return [BENCHMARK_REGISTRY[name] for name in names]
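The hunk above shows only the tail of `get_benchmark`; the branch that builds `suggestion` when a similar key exists lies outside the diff. A self-contained sketch of such a lookup, using `difflib.get_close_matches` as a hypothetical stand-in for whatever similarity check the real code performs:

```python
import difflib

# Toy registry; the values would normally be Benchmark objects.
BENCHMARK_REGISTRY = {
    "MTEB(eng)": "English benchmark",
    "MTEB(rus)": "Russian benchmark",
}


def get_benchmark(benchmark_name: str):
    if benchmark_name not in BENCHMARK_REGISTRY:
        # Suggest the closest registered name, if one is similar enough.
        close = difflib.get_close_matches(benchmark_name, list(BENCHMARK_REGISTRY), n=1)
        if close:
            suggestion = f"KeyError: '{benchmark_name}' not found. Did you mean '{close[0]}'?"
        else:
            suggestion = f"KeyError: '{benchmark_name}' not found and no similar keys were found."
        raise KeyError(suggestion)
    return BENCHMARK_REGISTRY[benchmark_name]
```

A misspelled name such as `get_benchmark("MTEB(em)")` then raises a `KeyError` whose message points at `MTEB(eng)` rather than failing silently.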
4 changes: 1 addition & 3 deletions mteb/evaluation/MTEB.py
@@ -341,9 +341,7 @@ def run(
continue
try:
task_eval_splits = (
eval_splits
if eval_splits is not None
else task.metadata_dict.get("eval_splits", [])
eval_splits if eval_splits is not None else task.eval_splits
)

# load data
19 changes: 16 additions & 3 deletions mteb/overview.py
@@ -231,6 +231,7 @@ def get_tasks(
categories: list[TASK_CATEGORY] | None = None,
tasks: list[str] | None = None,
exclude_superseeded: bool = True,
eval_splits: list[str] | None = None,
) -> MTEBTasks:
"""Get a list of tasks based on the specified filters.

@@ -245,6 +246,7 @@
paragraph).
tasks: A list of task names to include. If None, all tasks which pass the filters are included.
exclude_superseeded: A boolean flag to exclude datasets which are superseeded by another.
eval_splits: A list of evaluation splits to include. If None, all splits are included.

Returns:
A list of all initialized tasks objects which pass all of the filters (AND operation).
Expand All @@ -253,12 +255,18 @@ def get_tasks(
>>> get_tasks(languages=["eng", "deu"], script=["Latn"], domains=["Legal"])
>>> get_tasks(languages=["eng"], script=["Latn"], task_types=["Classification"])
>>> get_tasks(languages=["eng"], script=["Latn"], task_types=["Clustering"], exclude_superseeded=False)
>>> get_tasks(languages=["eng"], tasks=["WikipediaRetrievalMultilingual"], eval_splits=["test"])
"""
if tasks:
_tasks = [get_task(task, languages, script) for task in tasks]
_tasks = [
get_task(task, languages, script, eval_splits=eval_splits) for task in tasks
]
return MTEBTasks(_tasks)

_tasks = [cls().filter_languages(languages, script) for cls in create_task_list()]
_tasks = [
cls().filter_languages(languages, script).filter_eval_splits(eval_splits)
for cls in create_task_list()
]

if languages:
_tasks = filter_tasks_by_languages(_tasks, languages)
@@ -280,6 +288,7 @@ def get_task(
task_name: str,
languages: list[str] | None = None,
script: list[str] | None = None,
eval_splits: list[str] | None = None,
) -> AbsTask:
"""Get a task by name.

@@ -288,6 +297,7 @@
languages: A list of languages either specified as 3 letter languages codes (ISO 639-3, e.g. "eng") or as script languages codes e.g.
"eng-Latn". For multilingual tasks this will also remove languages that are not in the specified list.
script: A list of script codes (ISO 15924 codes). If None, all scripts are included. For multilingual tasks this will also remove scripts
eval_splits: A list of evaluation splits to include. If None, all splits are included.

Returns:
An initialized task object.
@@ -306,4 +316,7 @@
f"KeyError: '{task_name}' not found and no similar keys were found."
)
raise KeyError(suggestion)
return TASKS_REGISTRY[task_name]().filter_languages(languages, script)
task = TASKS_REGISTRY[task_name]()
if eval_splits:
task.filter_eval_splits(eval_splits=eval_splits)
return task.filter_languages(languages, script)
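The filters in `get_tasks` compose with AND semantics, and the new `eval_splits` argument narrows each surviving task rather than discarding it. A simplified, hypothetical sketch with plain dicts standing in for task objects (the real `mteb` filters initialized task instances):

```python
def get_tasks(tasks_db, languages=None, task_types=None, eval_splits=None):
    """Return tasks matching every provided filter (AND semantics)."""
    selected = []
    for task in tasks_db:
        if languages and not set(languages) & set(task["languages"]):
            continue
        if task_types and task["type"] not in task_types:
            continue
        # A split filter narrows the task rather than excluding it.
        if eval_splits:
            task = {
                **task,
                "eval_splits": [s for s in task["eval_splits"] if s in eval_splits],
            }
        selected.append(task)
    return selected


# Illustrative task entries, not real mteb metadata.
TASKS = [
    {"name": "Banking77Classification", "languages": ["eng"],
     "type": "Classification", "eval_splits": ["validation", "test"]},
    {"name": "ScalaDaClassification", "languages": ["dan"],
     "type": "Classification", "eval_splits": ["test"]},
]
```

Here `get_tasks(TASKS, languages=["eng"], eval_splits=["test"])` keeps only the English task and trims its splits down to `["test"]`.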
120 changes: 0 additions & 120 deletions scripts/data/run_mteb_english.py

This file was deleted.

2 changes: 1 addition & 1 deletion scripts/running_model/check_run.sh
@@ -7,7 +7,7 @@
# pip install codecarbon
# ensure latest version of sentnece-transformers is installed:
# pip install sentence-transformers --upgrade
# ensure that the the huggingface token is set and accecible using:
# ensure that the huggingface token is set and accessible using:
# huggingface-cli login

echo "Running model on a sample set of tasks" # this is to check tasks are running correctly
2 changes: 1 addition & 1 deletion tests/test_benchmark/test_benchmark.py
@@ -149,6 +149,6 @@ def test_benchmark_names_must_be_unique():


@pytest.mark.parametrize("name", ["MTEB(eng)", "MTEB(rus)", "MTEB(Scandinavian)"])
def test_get_benchmarks(name):
def test_get_benchmark(name):
benchmark = mteb.get_benchmark(benchmark_name=name)
assert isinstance(benchmark, mteb.Benchmark)