diff --git a/README.md b/README.md index e08545aec6..e2f7a523a5 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ pip install mteb ## Usage -* Using a python script (see [scripts/run_mteb_english.py](https://github.com/embeddings-benchmark/mteb/blob/main/scripts/run_mteb_english.py) and [mteb/mtebscripts](https://github.com/embeddings-benchmark/mtebscripts) for more): +* Using a python script: ```python import mteb @@ -77,11 +77,11 @@ Click on each section below to see the details.
- Dataset selection + Task selection -### Dataset selection +### Task selection -Datasets can be selected by providing the list of datasets, but also +Tasks can be selected by providing the list of tasks, but also * by their task (e.g. "Clustering" or "Classification") @@ -121,11 +121,18 @@ evaluation = mteb.MTEB(tasks=[ # for an example of a HF subset see "Subset" in the dataset viewer at: https://huggingface.co/datasets/mteb/bucc-bitext-mining ``` -There are also presets available for certain task collections, e.g. to select the 56 English datasets that form the "Overall MTEB English leaderboard": +
+ +
+ Running a benchmark + +`mteb` comes with a set of predefined benchmarks. These can be fetched using `get_benchmark` and run in a similar fashion to other sets of tasks. +For instance, to select the 56 English datasets that form the "Overall MTEB English leaderboard": ```python -from mteb import MTEB_MAIN_EN -evaluation = mteb.MTEB(tasks=MTEB_MAIN_EN, task_langs=["en"]) +import mteb +mteb_eng = mteb.get_benchmark("MTEB(eng)") +evaluation = mteb.MTEB(tasks=mteb_eng, eval_splits=["test"]) ```
diff --git a/mteb/__init__.py b/mteb/__init__.py index be5edd97ed..2b98827014 100644 --- a/mteb/__init__.py +++ b/mteb/__init__.py @@ -2,7 +2,7 @@ from importlib.metadata import version -from mteb.benchmarks import ( +from mteb.benchmarks.benchmarks import ( MTEB_MAIN_EN, MTEB_MAIN_RU, MTEB_RETRIEVAL_LAW, @@ -14,7 +14,8 @@ from mteb.models import get_model, get_model_meta from mteb.overview import TASKS_REGISTRY, get_task, get_tasks -from .benchmarks import Benchmark +from .benchmarks.benchmarks import Benchmark +from .benchmarks.get_benchmark import get_benchmark __version__ = version("mteb") # fetch version from install metadata @@ -32,4 +33,5 @@ "get_model_meta", "load_results", "Benchmark", + "get_benchmark", ] diff --git a/mteb/benchmarks/__init__.py b/mteb/benchmarks/__init__.py new file mode 100644 index 0000000000..fb1d12a293 --- /dev/null +++ b/mteb/benchmarks/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from mteb.benchmarks.benchmarks import * diff --git a/mteb/benchmarks.py b/mteb/benchmarks/benchmarks.py similarity index 63% rename from mteb/benchmarks.py rename to mteb/benchmarks/benchmarks.py index 9485230a62..048c74d75a 100644 --- a/mteb/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -3,16 +3,44 @@ from dataclasses import dataclass from typing import Sequence +from pydantic import AnyUrl, BeforeValidator, TypeAdapter +from typing_extensions import Annotated + from mteb.abstasks.AbsTask import AbsTask from mteb.overview import get_tasks +http_url_adapter = TypeAdapter(AnyUrl) +UrlString = Annotated[ + str, BeforeValidator(lambda value: str(http_url_adapter.validate_python(value))) +] # Allows the type to be a string, but ensures that the string is a URL + @dataclass class Benchmark: + """A benchmark object intended to run a certain benchmark within MTEB. + + Args: + name: The name of the benchmark + tasks: The tasks within the benchmark. 
+ description: A description of the benchmark, should include its intended goal and potentially a description of its construction + reference: A link reference, to a source containing additional information typically to a paper, leaderboard or github. + citation: A bibtex citation + + Example: + >>> Benchmark( + ... name="MTEB(custom)", + ... tasks=mteb.get_tasks( + ... tasks=["AmazonCounterfactualClassification", "AmazonPolarityClassification"], + ... languages=["eng"], + ... ), + ... description="A custom benchmark" + ... ) + """ + name: str - tasks: Sequence[str] | Sequence[AbsTask] + tasks: Sequence[AbsTask] description: str | None = None - reference: str | None = None + reference: UrlString | None = None citation: str | None = None def __iter__(self): @@ -27,75 +55,78 @@ def __getitem__(self, index): MTEB_MAIN_EN = Benchmark( name="MTEB(eng)", - tasks=[ - "AmazonCounterfactualClassification", - "AmazonPolarityClassification", - "AmazonReviewsClassification", - "ArguAna", - "ArxivClusteringP2P", - "ArxivClusteringS2S", - "AskUbuntuDupQuestions", - "BIOSSES", - "Banking77Classification", - "BiorxivClusteringP2P", - "BiorxivClusteringS2S", - "CQADupstackAndroidRetrieval", - "CQADupstackEnglishRetrieval", - "CQADupstackGamingRetrieval", - "CQADupstackGisRetrieval", - "CQADupstackMathematicaRetrieval", - "CQADupstackPhysicsRetrieval", - "CQADupstackProgrammersRetrieval", - "CQADupstackStatsRetrieval", - "CQADupstackTexRetrieval", - "CQADupstackUnixRetrieval", - "CQADupstackWebmastersRetrieval", - "CQADupstackWordpressRetrieval", - "ClimateFEVER", - "DBPedia", - "EmotionClassification", - "FEVER", - "FiQA2018", - "HotpotQA", - "ImdbClassification", - "MSMARCO", - "MTOPDomainClassification", - "MTOPIntentClassification", - "MassiveIntentClassification", - "MassiveScenarioClassification", - "MedrxivClusteringP2P", - "MedrxivClusteringS2S", - "MindSmallReranking", - "NFCorpus", - "NQ", - "QuoraRetrieval", - "RedditClustering", - "RedditClusteringP2P", - "SCIDOCS", - 
"SICK-R", - "STS12", - "STS13", - "STS14", - "STS15", - "STS16", - "STS17", - "STS22", - "STSBenchmark", - "SciDocsRR", - "SciFact", - "SprintDuplicateQuestions", - "StackExchangeClustering", - "StackExchangeClusteringP2P", - "StackOverflowDupQuestions", - "SummEval", - "TRECCOVID", - "Touche2020", - "ToxicConversationsClassification", - "TweetSentimentExtractionClassification", - "TwentyNewsgroupsClustering", - "TwitterSemEval2015", - "TwitterURLCorpus", - ], + tasks=get_tasks( + tasks=[ + "AmazonCounterfactualClassification", + "AmazonPolarityClassification", + "AmazonReviewsClassification", + "ArguAna", + "ArxivClusteringP2P", + "ArxivClusteringS2S", + "AskUbuntuDupQuestions", + "BIOSSES", + "Banking77Classification", + "BiorxivClusteringP2P", + "BiorxivClusteringS2S", + "CQADupstackAndroidRetrieval", + "CQADupstackEnglishRetrieval", + "CQADupstackGamingRetrieval", + "CQADupstackGisRetrieval", + "CQADupstackMathematicaRetrieval", + "CQADupstackPhysicsRetrieval", + "CQADupstackProgrammersRetrieval", + "CQADupstackStatsRetrieval", + "CQADupstackTexRetrieval", + "CQADupstackUnixRetrieval", + "CQADupstackWebmastersRetrieval", + "CQADupstackWordpressRetrieval", + "ClimateFEVER", + "DBPedia", + "EmotionClassification", + "FEVER", + "FiQA2018", + "HotpotQA", + "ImdbClassification", + "MSMARCO", + "MTOPDomainClassification", + "MTOPIntentClassification", + "MassiveIntentClassification", + "MassiveScenarioClassification", + "MedrxivClusteringP2P", + "MedrxivClusteringS2S", + "MindSmallReranking", + "NFCorpus", + "NQ", + "QuoraRetrieval", + "RedditClustering", + "RedditClusteringP2P", + "SCIDOCS", + "SICK-R", + "STS12", + "STS13", + "STS14", + "STS15", + "STS16", + "STS17", + "STS22", + "STSBenchmark", + "SciDocsRR", + "SciFact", + "SprintDuplicateQuestions", + "StackExchangeClustering", + "StackExchangeClusteringP2P", + "StackOverflowDupQuestions", + "SummEval", + "TRECCOVID", + "Touche2020", + "ToxicConversationsClassification", + 
"TweetSentimentExtractionClassification", + "TwentyNewsgroupsClustering", + "TwitterSemEval2015", + "TwitterURLCorpus", + ], + languages=["eng"], + ), description="Main English benchmarks from MTEB", citation="""@inproceedings{muennighoff-etal-2023-mteb, title = "{MTEB}: Massive Text Embedding Benchmark", @@ -170,11 +201,13 @@ def __getitem__(self, index): MTEB_RETRIEVAL_WITH_INSTRUCTIONS = Benchmark( name="MTEB(Retrieval w/Instructions)", - tasks=[ - "Robust04InstructionRetrieval", - "News21InstructionRetrieval", - "Core17InstructionRetrieval", - ], + tasks=get_tasks( + tasks=[ + "Robust04InstructionRetrieval", + "News21InstructionRetrieval", + "Core17InstructionRetrieval", + ] + ), description="Retrieval w/Instructions is the task of finding relevant documents for a query that has detailed instructions.", reference="https://arxiv.org/abs/2403.15246", citation="""@misc{weller2024followir, @@ -188,33 +221,37 @@ def __getitem__(self, index): ) MTEB_RETRIEVAL_LAW = Benchmark( - name="MTEB(law)", - tasks=[ - "LegalSummarization", - "LegalBenchConsumerContractsQA", - "LegalBenchCorporateLobbying", - "AILACasedocs", - "AILAStatutes", - "LeCaRDv2", - "LegalQuAD", - "GerDaLIRSmall", - ], - description="Legal benchmarks from MTEB", + name="MTEB(law)", # This benchmark is likely in the need of an update + tasks=get_tasks( + tasks=[ + "AILACasedocs", + "AILAStatutes", + "LegalSummarization", + "GerDaLIRSmall", + "LeCaRDv2", + "LegalBenchConsumerContractsQA", + "LegalBenchCorporateLobbying", + "LegalQuAD", + ] + ), + description="Legal benchmarks from MTEB.", reference="https://aclanthology.org/2023.eacl-main.148/", citation=None, ) MTEB_MINERS_BITEXT_MINING = Benchmark( name="MINERSBitextMining", - tasks=[ - "BUCCBitextMining", - "LinceMTBitextMining", - "NollySentiBitextMining", - "NusaXBitextMining", - "NusaTranslationBitextMining", - "PhincBitextMining", - "TatoebaBitextMining", - ], + tasks=get_tasks( + tasks=[ + "BUCC", + "LinceMTBitextMining", + 
"NollySentiBitextMining", + "NusaXBitextMining", + "NusaTranslationBitextMining", + "PhincBitextMining", + "Tatoeba", + ] + ), description="BitextMining benchmark from MINERS", reference="https://arxiv.org/pdf/2406.07424", citation=""" @@ -228,37 +265,43 @@ def __getitem__(self, index): ) SEB = Benchmark( name="MTEB(Scandinavian)", - tasks=[ - "BornholmBitextMining", - "NorwegianCourtsBitextMining", - "AngryTweetsClassification", - "DanishPoliticalCommentsClassification", - "DKHateClassification", - "LccSentimentClassification", - "MassiveIntentClassification", - "MassiveScenarioClassification", - "NordicLangClassification", - "ScalaClassification", - "NoRecClassification", - "NorwegianParliamentClassification", - "DalajClassification", - "SwedishSentimentClassification", - "SweRecClassification", - "DanFEVER", - "TV2Nordretrieval", - "TwitterHjerneRetrieval", - "NorQuadRetrieval", - "SNLRetrieval", - "SwednRetrieval", - "SweFaqRetrieval", - "WikiClusteringP2P.v2", - "SNLHierarchicalClusteringP2P", - "SNLHierarchicalClusteringS2S", - "VGHierarchicalClusteringP2P", - "VGHierarchicalClusteringS2S", - "SwednClusteringP2P", - "SwednClusteringS2S", - ], + tasks=get_tasks( + tasks=[ + # Bitext + "BornholmBitextMining", + "NorwegianCourtsBitextMining", + # Classification + "AngryTweetsClassification", + "DanishPoliticalCommentsClassification", + "DalajClassification", + "DKHateClassification", + "LccSentimentClassification", + "MassiveIntentClassification", + "MassiveScenarioClassification", + "NordicLangClassification", + "NoRecClassification", + "NorwegianParliamentClassification", + "ScalaClassification", + "SwedishSentimentClassification", + "SweRecClassification", + # Retrieval + "DanFEVER", + "NorQuadRetrieval", + "SNLRetrieval", + "SwednRetrieval", + "SweFaqRetrieval", + "TV2Nordretrieval", + "TwitterHjerneRetrieval", + # Clustering + "SNLHierarchicalClusteringS2S", + "SNLHierarchicalClusteringP2P", + "SwednClusteringP2P", + "SwednClusteringS2S", + 
"VGHierarchicalClusteringS2S", + "VGHierarchicalClusteringP2P", + ], + languages=["dan", "swe", "nno", "nob"], + ), description="A curated selection of tasks coverering the Scandinavian languages; Danish, Swedish and Norwegian, including Bokmål and Nynorsk.", reference="https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/", citation="""@misc{enevoldsen2024scandinavian, @@ -273,18 +316,20 @@ def __getitem__(self, index): CoIR = Benchmark( name="CoIR", - tasks=[ - "AppsRetrieval", - "CosQA", - "SyntheticText2SQL", - "COIRCodeSearchNetRetrieval", - "CodeSearchNetCCRetrieval", - "CodeTransOceanDL", - "CodeTransOceanContest", - "StackOverflowQA", - "CodeFeedbackMT", - "CodeFeedbackST", - ], + tasks=get_tasks( + tasks=[ + "AppsRetrieval", + "CodeFeedbackMT", + "CodeFeedbackST", + "CodeSearchNetCCRetrieval", + "CodeTransOceanContest", + "CodeTransOceanDL", + "CosQA", + "COIRCodeSearchNetRetrieval", + "StackOverflowQA", + "SyntheticText2SQL", + ] + ), description="CoIR: A Comprehensive Benchmark for Code Information Retrieval Models", reference="https://github.com/CoIR-team/coir", citation="""@misc{li2024coircomprehensivebenchmarkcode, @@ -322,19 +367,19 @@ def __getitem__(self, index): "OpusparcusPC", "PawsXPairClassification", # Reranking - "SyntecReranking", "AlloprofReranking", + "SyntecReranking", # Retrieval "AlloprofRetrieval", "BSARDRetrieval", + "MintakaRetrieval", "SyntecRetrieval", "XPQARetrieval", - "MintakaRetrieval", # STS - "SummEvalFr", - "STSBenchmarkMultilingualSTS", - "STS22", "SICKFr", + "STS22", + "STSBenchmarkMultilingualSTS", + "SummEvalFr", ], ), description="Main French benchmarks from MTEB", @@ -426,27 +471,27 @@ def __getitem__(self, index): languages=["pol"], tasks=[ # Classification + "AllegroReviews", "CBD", + "MassiveIntentClassification", + "MassiveScenarioClassification", "PolEmo2.0-IN", "PolEmo2.0-OUT", - "AllegroReviews", "PAC", - "MassiveIntentClassification", - "MassiveScenarioClassification", # Clustering 
"EightTagsClustering", "PlscClusteringS2S", "PlscClusteringP2P", # Pair Classification - "SICK-E-PL", - "PpcPC", "CDSC-E", + "PpcPC", "PSC", + "SICK-E-PL", # STS - "SICK-R-PL", "CDSC-R", "STS22", "STSBenchmarkMultilingualSTS", + "SICK-R-PL", ], ), description="Main Polish benchmarks from MTEB", diff --git a/mteb/benchmarks/get_benchmark.py b/mteb/benchmarks/get_benchmark.py new file mode 100644 index 0000000000..169e3bcd50 --- /dev/null +++ b/mteb/benchmarks/get_benchmark.py @@ -0,0 +1,27 @@ +from __future__ import annotations + +import difflib + +import mteb.benchmarks.benchmarks as benchmark_module +from mteb.benchmarks import Benchmark + +BENCHMARK_REGISTRY = { + inst.name: inst + for nam, inst in benchmark_module.__dict__.items() + if isinstance(inst, Benchmark) +} + + +def get_benchmark( + benchmark_name: str, +) -> Benchmark: + if benchmark_name not in BENCHMARK_REGISTRY: + close_matches = difflib.get_close_matches( + benchmark_name, BENCHMARK_REGISTRY.keys() + ) + if close_matches: + suggestion = f"KeyError: '{benchmark_name}' not found. Did you mean: {close_matches[0]}?" + else: + suggestion = f"KeyError: '{benchmark_name}' not found and no similar keys were found." 
+ raise KeyError(suggestion) + return BENCHMARK_REGISTRY[benchmark_name] diff --git a/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py b/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py index a9cf4cea25..a01bda1d80 100644 --- a/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py +++ b/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py @@ -12,7 +12,7 @@ class FilipinoHateSpeechClassification(AbsTaskClassification): description="Filipino Twitter dataset for sentiment classification.", reference="https://pcj.csp.org.ph/index.php/pcj/issue/download/29/PCJ%20V14%20N1%20pp1-14%202019", dataset={ - "path": "hate-speech-filipino/hate_speech_filipino", + "path": "legacy-datasets/hate_speech_filipino", "revision": "1994e9bb7f3ec07518e3f0d9e870cb293e234686", "trust_remote_code": True, }, diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index 3d32d923bc..742c7930e9 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -10,7 +10,7 @@ from sentence_transformers import SentenceTransformer import mteb -from mteb.benchmarks import Benchmark +from mteb.benchmarks.benchmarks import Benchmark from mteb.create_meta import generate_readme from .mock_models import ( @@ -127,9 +127,28 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): @pytest.mark.parametrize("model", [MockNumpyEncoder()]) def test_run_using_benchmark(model: mteb.Encoder): """Test that a benchmark object can be run using the MTEB class.""" - bench = Benchmark(name="test_bench", tasks=["STS12", "SummEval"]) + bench = Benchmark( + name="test_bench", tasks=mteb.get_tasks(tasks=["STS12", "SummEval"]) + ) eval = mteb.MTEB(tasks=bench) eval.run( model, output_folder="tests/results", overwrite_results=True ) # we just want to test that it runs + + +def test_benchmark_names_must_be_unique(): + import mteb.benchmarks.benchmarks as benchmark_module + + names 
= [ + inst.name + for nam, inst in benchmark_module.__dict__.items() + if isinstance(inst, Benchmark) + ] + assert len(names) == len(set(names)) + + +@pytest.mark.parametrize("name", ["MTEB(eng)", "MTEB(rus)", "MTEB(Scandinavian)"]) +def test_get_benchmarks(name): + benchmark = mteb.get_benchmark(benchmark_name=name) + assert isinstance(benchmark, mteb.Benchmark)