diff --git a/README.md b/README.md
index e2f7a523a5..e35ad3bdbc 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@
pip install mteb
```
-## Usage
+## Example Usage
* Using a python script:
@@ -71,7 +71,7 @@ mteb run -m sentence-transformers/all-MiniLM-L6-v2 \
-## Advanced Usage
+## Usage Documentation
Click on each section below to see the details.
@@ -126,13 +126,28 @@ evaluation = mteb.MTEB(tasks=[
Running a benchmark
+### Running a Benchmark
+
`mteb` comes with a set of predefined benchmarks. These can be fetched using `get_benchmark` and run in a similar fashion to other sets of tasks.
For instance to select the 56 English datasets that form the "Overall MTEB English leaderboard":
```python
import mteb
-mteb_eng = mteb.get_benchmark("MTEB(eng)")
-evaluation = mteb.MTEB(tasks=mteb_eng, eval_splits=["test"])
+benchmark = mteb.get_benchmark("MTEB(eng)")
+evaluation = mteb.MTEB(tasks=benchmark)
+```
+
+The benchmark specifies not only a list of tasks, but also what splits and languages to run on. To get an overview of all available benchmarks simply run:
+
+```python
+import mteb
+benchmarks = mteb.get_benchmarks()
+```
+
+Generally we use the naming scheme for benchmarks `MTEB(*)`, where the "*" denotes the target of the benchmark. In case of a language we use the three letter language code. For large groups of languages we use the group notation, e.g. `MTEB(Scandinavian)` for Scandinavian languages. External benchmarks implemented in MTEB like `CoIR` use their original name. When using a benchmark from MTEB please cite `mteb` along with the citations of the benchmark which you can access using:
+
+```python
+benchmark.citation
```
@@ -325,9 +340,11 @@ mteb run -t NFCorpus -m all-MiniLM-L6-v2 --output_folder results --save_predicti
Fetching result from the results repository
+### Fetching result from the results repository
+
Multiple models have already been run on tasks avaiable within MTEB. These results are available results [repository](https://github.com/embeddings-benchmark/results).
-To make the results more easily accecible we have designed custom functionality for retrieving from the repository. For instance, you are selecting the best model for your French and English retrieval task on legal documents you could fetch the relevant tasks and create a dataframe of the results using the following code:
+To make the results more easily accessible, we have designed custom functionality for retrieving from the repository. For instance, if you are selecting the best model for your French and English retrieval task on legal documents you could fetch the relevant tasks and create a dataframe of the results using the following code:
```python
import mteb
diff --git a/docs/adding_a_model.md b/docs/adding_a_model.md
index bd37bc5fe1..90d3b0989a 100644
--- a/docs/adding_a_model.md
+++ b/docs/adding_a_model.md
@@ -29,10 +29,7 @@ mteb run -m {model_name} -t {task_names}
These will save the results in a folder called `results/{model_name}/{model_revision}`.
-For reference you can also look at [scripts/data/run_mteb_english.py](https://github.com/embeddings-benchmark/mteb/blob/main/scripts/data/run_mteb_english.py) for all MTEB English datasets used in the main ranking.
-Advanced scripts with different models are available in the [mteb/mtebscripts repo](https://github.com/embeddings-benchmark/mtebscripts).
-
-2. **Format the results using the CLI:**
+1. **Format the results using the CLI:**
```bash
mteb create_meta --results_folder results/{model_name}/{model_revision} --output_path model_card.md
@@ -44,11 +41,11 @@ If readme of model exists:
mteb create_meta --results_folder results/{model_name}/{model_revision} --output_path model_card.md --from_existing your_existing_readme.md
```
-3. **Add the frontmatter to model repository:**
+2. **Add the frontmatter to model repository:**
Copy the content of the `model_card.md` file to the top of a `README.md` file of your model on the Hub. See [here](https://huggingface.co/Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit/blob/main/README.md) for an example.
-4. **Wait for a refresh the leaderboard:**
+3. **Wait for a refresh of the leaderboard:**
The leaderboard will then automatically refresh daily so once submitted all you have to do is wait for the automatic refresh.
diff --git a/mteb/abstasks/AbsTask.py b/mteb/abstasks/AbsTask.py
index 928020ff10..f7e606ec42 100644
--- a/mteb/abstasks/AbsTask.py
+++ b/mteb/abstasks/AbsTask.py
@@ -60,12 +60,13 @@ class DescriptiveStatistics(TypedDict):
class AbsTask(ABC):
metadata: TaskMetadata
+ _eval_splits: list[str] | None = None
superseded_by: None | str = None
+ dataset: dict[HFSubset, DatasetDict] | None = None # type: ignore
+ data_loaded: bool = False
+ is_multilingual: bool = False
def __init__(self, seed: int = 42, **kwargs: Any):
- self.dataset = None
- self.data_loaded = False
- self.is_multilingual = False
self.save_suffix = kwargs.get("save_suffix", "")
self.seed = seed
@@ -255,6 +256,11 @@ def languages(self) -> list[str]:
return self.metadata.languages
+ def filter_eval_splits(self, eval_splits: list[str] | None) -> AbsTask:
+ """Filter the evaluation splits of the task."""
+ self._eval_splits = eval_splits
+ return self
+
def filter_languages(
self, languages: list[str] | None, script: list[str] | None = None
) -> AbsTask:
@@ -285,6 +291,12 @@ def filter_languages(
self.hf_subsets = subsets_to_keep
return self
+ @property
+ def eval_splits(self) -> list[str]:
+ if self._eval_splits:
+ return self._eval_splits
+ return self.metadata.eval_splits
+
def __repr__(self) -> str:
"""Format the representation of the task such that it appears as:
diff --git a/mteb/abstasks/AbsTaskBitextMining.py b/mteb/abstasks/AbsTaskBitextMining.py
index 973d69ee7f..be345b2f48 100644
--- a/mteb/abstasks/AbsTaskBitextMining.py
+++ b/mteb/abstasks/AbsTaskBitextMining.py
@@ -32,7 +32,7 @@ class AbsTaskBitextMining(AbsTask):
"""Abstract class for BitextMining tasks
The similarity is computed between pairs and the results are ranked.
- self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns:
+ self.load_data() must generate a huggingface dataset with a split matching self.metadata.eval_splits, and assign it to self.dataset. It must contain the following columns:
id: str
sentence1: str
sentence2: str
diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py
index 048c74d75a..37e4c5309f 100644
--- a/mteb/benchmarks/benchmarks.py
+++ b/mteb/benchmarks/benchmarks.py
@@ -126,6 +126,7 @@ def __getitem__(self, index):
"TwitterURLCorpus",
],
languages=["eng"],
+ eval_splits=["test"],
),
description="Main English benchmarks from MTEB",
citation="""@inproceedings{muennighoff-etal-2023-mteb,
@@ -263,6 +264,7 @@ def __getitem__(self, index):
}
""",
)
+
SEB = Benchmark(
name="MTEB(Scandinavian)",
tasks=get_tasks(
diff --git a/mteb/benchmarks/get_benchmark.py b/mteb/benchmarks/get_benchmark.py
index 169e3bcd50..88079ce860 100644
--- a/mteb/benchmarks/get_benchmark.py
+++ b/mteb/benchmarks/get_benchmark.py
@@ -25,3 +25,12 @@ def get_benchmark(
        suggestion = f"KeyError: '{benchmark_name}' not found and no similar keys were found."
        raise KeyError(suggestion)
    return BENCHMARK_REGISTRY[benchmark_name]
+
+
+def get_benchmarks(
+    names: list[str] | None = None,
+) -> list[Benchmark]:
+    """Get a list of benchmarks by name. If names is None, all registered benchmarks are returned."""
+    if names is None:
+        names = list(BENCHMARK_REGISTRY.keys())
+    return [BENCHMARK_REGISTRY[name] for name in names]
diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py
index 0ac12d4bd2..8768de3828 100644
--- a/mteb/evaluation/MTEB.py
+++ b/mteb/evaluation/MTEB.py
@@ -341,9 +341,7 @@ def run(
continue
try:
task_eval_splits = (
- eval_splits
- if eval_splits is not None
- else task.metadata_dict.get("eval_splits", [])
+ eval_splits if eval_splits is not None else task.eval_splits
)
# load data
diff --git a/mteb/overview.py b/mteb/overview.py
index 993767da48..7b1bfbb426 100644
--- a/mteb/overview.py
+++ b/mteb/overview.py
@@ -231,6 +231,7 @@ def get_tasks(
categories: list[TASK_CATEGORY] | None = None,
tasks: list[str] | None = None,
exclude_superseeded: bool = True,
+ eval_splits: list[str] | None = None,
) -> MTEBTasks:
"""Get a list of tasks based on the specified filters.
@@ -245,6 +246,7 @@ def get_tasks(
paragraph).
tasks: A list of task names to include. If None, all tasks which pass the filters are included.
exclude_superseeded: A boolean flag to exclude datasets which are superseeded by another.
+ eval_splits: A list of evaluation splits to include. If None, all splits are included.
Returns:
A list of all initialized tasks objects which pass all of the filters (AND operation).
@@ -253,12 +255,18 @@ def get_tasks(
>>> get_tasks(languages=["eng", "deu"], script=["Latn"], domains=["Legal"])
>>> get_tasks(languages=["eng"], script=["Latn"], task_types=["Classification"])
>>> get_tasks(languages=["eng"], script=["Latn"], task_types=["Clustering"], exclude_superseeded=False)
+ >>> get_tasks(languages=["eng"], tasks=["WikipediaRetrievalMultilingual"], eval_splits=["test"])
"""
if tasks:
- _tasks = [get_task(task, languages, script) for task in tasks]
+ _tasks = [
+ get_task(task, languages, script, eval_splits=eval_splits) for task in tasks
+ ]
return MTEBTasks(_tasks)
- _tasks = [cls().filter_languages(languages, script) for cls in create_task_list()]
+ _tasks = [
+ cls().filter_languages(languages, script).filter_eval_splits(eval_splits)
+ for cls in create_task_list()
+ ]
if languages:
_tasks = filter_tasks_by_languages(_tasks, languages)
@@ -280,6 +288,7 @@ def get_task(
task_name: str,
languages: list[str] | None = None,
script: list[str] | None = None,
+ eval_splits: list[str] | None = None,
) -> AbsTask:
"""Get a task by name.
@@ -288,6 +297,7 @@ def get_task(
languages: A list of languages either specified as 3 letter languages codes (ISO 639-3, e.g. "eng") or as script languages codes e.g.
"eng-Latn". For multilingual tasks this will also remove languages that are not in the specified list.
script: A list of script codes (ISO 15924 codes). If None, all scripts are included. For multilingual tasks this will also remove scripts
+ eval_splits: A list of evaluation splits to include. If None, all splits are included.
Returns:
An initialized task object.
@@ -306,4 +316,7 @@ def get_task(
f"KeyError: '{task_name}' not found and no similar keys were found."
)
raise KeyError(suggestion)
- return TASKS_REGISTRY[task_name]().filter_languages(languages, script)
+ task = TASKS_REGISTRY[task_name]()
+ if eval_splits:
+ task.filter_eval_splits(eval_splits=eval_splits)
+ return task.filter_languages(languages, script)
diff --git a/scripts/data/run_mteb_english.py b/scripts/data/run_mteb_english.py
deleted file mode 100644
index 8b2d50950d..0000000000
--- a/scripts/data/run_mteb_english.py
+++ /dev/null
@@ -1,120 +0,0 @@
-"""Example script for benchmarking all datasets constituting the MTEB English leaderboard & average scores"""
-
-from __future__ import annotations
-
-import logging
-
-from sentence_transformers import SentenceTransformer
-
-from mteb import MTEB
-
-logging.basicConfig(level=logging.INFO)
-
-logger = logging.getLogger("main")
-
-TASK_LIST_CLASSIFICATION = [
- "AmazonCounterfactualClassification",
- "AmazonPolarityClassification",
- "AmazonReviewsClassification",
- "Banking77Classification",
- "EmotionClassification",
- "ImdbClassification",
- "MassiveIntentClassification",
- "MassiveScenarioClassification",
- "MTOPDomainClassification",
- "MTOPIntentClassification",
- "ToxicConversationsClassification",
- "TweetSentimentExtractionClassification",
-]
-
-TASK_LIST_CLUSTERING = [
- "ArxivClusteringP2P",
- "ArxivClusteringS2S",
- "BiorxivClusteringP2P",
- "BiorxivClusteringS2S",
- "MedrxivClusteringP2P",
- "MedrxivClusteringS2S",
- "RedditClustering",
- "RedditClusteringP2P",
- "StackExchangeClustering",
- "StackExchangeClusteringP2P",
- "TwentyNewsgroupsClustering",
-]
-
-TASK_LIST_PAIR_CLASSIFICATION = [
- "SprintDuplicateQuestions",
- "TwitterSemEval2015",
- "TwitterURLCorpus",
-]
-
-TASK_LIST_RERANKING = [
- "AskUbuntuDupQuestions",
- "MindSmallReranking",
- "SciDocsRR",
- "StackOverflowDupQuestions",
-]
-
-TASK_LIST_RETRIEVAL = [
- "ArguAna",
- "ClimateFEVER",
- "CQADupstackAndroidRetrieval",
- "CQADupstackEnglishRetrieval",
- "CQADupstackGamingRetrieval",
- "CQADupstackGisRetrieval",
- "CQADupstackMathematicaRetrieval",
- "CQADupstackPhysicsRetrieval",
- "CQADupstackProgrammersRetrieval",
- "CQADupstackStatsRetrieval",
- "CQADupstackTexRetrieval",
- "CQADupstackUnixRetrieval",
- "CQADupstackWebmastersRetrieval",
- "CQADupstackWordpressRetrieval",
- "DBPedia",
- "FEVER",
- "FiQA2018",
- "HotpotQA",
- "MSMARCO",
- "NFCorpus",
- "NQ",
- "QuoraRetrieval",
- "SCIDOCS",
- "SciFact",
- "Touche2020",
- "TRECCOVID",
-]
-
-TASK_LIST_STS = [
- "BIOSSES",
- "SICK-R",
- "STS12",
- "STS13",
- "STS14",
- "STS15",
- "STS16",
- "STS17",
- "STS22",
- "STSBenchmark",
- "SummEval",
-]
-
-TASK_LIST = (
- TASK_LIST_CLASSIFICATION
- + TASK_LIST_CLUSTERING
- + TASK_LIST_PAIR_CLASSIFICATION
- + TASK_LIST_RERANKING
- + TASK_LIST_RETRIEVAL
- + TASK_LIST_STS
-)
-
-model_name = "average_word_embeddings_komninos"
-model = SentenceTransformer(model_name)
-
-for task in TASK_LIST:
- logger.info(f"Running task: {task}")
- eval_splits = ["dev"] if task == "MSMARCO" else ["test"]
- evaluation = MTEB(
- tasks=[task], task_langs=["en"]
- ) # Remove "en" for running all languages
- evaluation.run(
- model, output_folder=f"results/{model_name}", eval_splits=eval_splits
- )
diff --git a/scripts/running_model/check_run.sh b/scripts/running_model/check_run.sh
index 6741f5f7c5..d4a9de4d70 100644
--- a/scripts/running_model/check_run.sh
+++ b/scripts/running_model/check_run.sh
@@ -7,7 +7,7 @@
# pip install codecarbon
# ensure latest version of sentnece-transformers is installed:
# pip install sentence-transformers --upgrade
-# ensure that the the huggingface token is set and accecible using:
+# ensure that the huggingface token is set and accessible using:
# huggingface-cli login
echo "Running model on a sample set of tasks" # this is to check tasks are running correctly
diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py
index 742c7930e9..0a9c4bf2a8 100644
--- a/tests/test_benchmark/test_benchmark.py
+++ b/tests/test_benchmark/test_benchmark.py
@@ -149,6 +149,6 @@ def test_benchmark_names_must_be_unique():
@pytest.mark.parametrize("name", ["MTEB(eng)", "MTEB(rus)", "MTEB(Scandinavian)"])
-def test_get_benchmarks(name):
+def test_get_benchmark(name):
benchmark = mteb.get_benchmark(benchmark_name=name)
assert isinstance(benchmark, mteb.Benchmark)
diff --git a/tests/test_overview.py b/tests/test_overview.py
index 7103e2dfa2..73df5dc193 100644
--- a/tests/test_overview.py
+++ b/tests/test_overview.py
@@ -3,7 +3,8 @@
import pytest
import mteb
-from mteb import get_tasks
+from mteb import get_task, get_tasks
+from mteb.abstasks.AbsTask import AbsTask
from mteb.abstasks.TaskMetadata import TASK_DOMAIN, TASK_TYPE
from mteb.overview import MTEBTasks
@@ -19,12 +20,25 @@ def test_get_tasks_size_differences():
)
+@pytest.mark.parametrize("task_name", ["BornholmBitextMining"])
+@pytest.mark.parametrize("eval_splits", [["test"], None])
+def test_get_task(task_name: str, eval_splits: list[str] | None):
+ task = get_task(task_name, eval_splits=eval_splits)
+ assert isinstance(task, AbsTask)
+ assert task.metadata.name == task_name
+ if eval_splits:
+ for split in task.eval_splits:
+ assert split in eval_splits
+ else:
+ assert task.eval_splits == task.metadata.eval_splits
+
+
@pytest.mark.parametrize("languages", [["eng", "deu"], ["eng"], None])
@pytest.mark.parametrize("script", [["Latn"], ["Cyrl"], None])
@pytest.mark.parametrize("domains", [["Legal"], ["Medical", "Non-fiction"], None])
@pytest.mark.parametrize("task_types", [["Classification"], ["Clustering"], None])
@pytest.mark.parametrize("exclude_superseeded_datasets", [True, False])
-def test_get_task(
+def test_get_tasks(
languages: list[str],
script: list[str],
domains: list[TASK_DOMAIN],