Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
d1e1a1b
Human Subsets Tasks
AdnanElAssadi56 Aug 10, 2025
7eaff4d
Fixed Multilingual Classification Subset
AdnanElAssadi56 Aug 10, 2025
9bc5b87
linting
AdnanElAssadi56 Sep 25, 2025
cb3893b
fix citations format
AdnanElAssadi56 Sep 25, 2025
ed1af7e
make lint
isaac-chung Sep 25, 2025
aa08f35
fix tests
isaac-chung Sep 25, 2025
d9fbb44
remove human folder
isaac-chung Sep 27, 2025
52eb9a7
fix relative imports
isaac-chung Sep 27, 2025
94b1f7c
add adapted_from for all human subsets
isaac-chung Sep 27, 2025
6e599fa
fix pydantic errors
isaac-chung Sep 27, 2025
9e61698
add benchmark object
isaac-chung Sep 27, 2025
0cfbe1c
make benchmark discoverable
isaac-chung Sep 27, 2025
926a839
bibtex test
isaac-chung Sep 27, 2025
7a5983e
Apply suggestion
Samoed Sep 29, 2025
c3a3b77
Apply suggestions from code review
Samoed Sep 29, 2025
f6c3c37
rename & reupload
Samoed Sep 29, 2025
72e9522
upd tests
Samoed Sep 29, 2025
7142999
upd tests again
Samoed Sep 29, 2025
070eea8
add model
Samoed Sep 29, 2025
c94fbb6
add benchmark to leaderboard
Samoed Sep 29, 2025
e1e28b6
change branch of leaderboard
Samoed Sep 29, 2025
2bab369
remove branch of load data
Samoed Sep 29, 2025
a394ed8
fix model meta path
Samoed Sep 29, 2025
91d7262
make mteb importable
Samoed Sep 29, 2025
ae0a508
update repo
Samoed Sep 29, 2025
39cec66
Merge branch 'main' into human_tasks
AdnanElAssadi56 Oct 1, 2025
df3e263
Update mteb/benchmarks/benchmarks/benchmarks.py
isaac-chung Oct 2, 2025
5b09048
Update mteb/leaderboard/benchmark_selector.py
isaac-chung Oct 2, 2025
da9124d
Update mteb/load_results/load_results.py
isaac-chung Oct 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions mteb/benchmarks/benchmarks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
ENCODECHKA,
FA_MTEB,
FA_MTEB_2,
HUME,
JINA_VDR,
LONG_EMBED,
MIEB_ENG,
Expand Down Expand Up @@ -112,4 +113,5 @@
"RTEB_ENGLISH",
"RTEB_FRENCH",
"RTEB_GERMAN",
"HUME",
]
32 changes: 32 additions & 0 deletions mteb/benchmarks/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2295,3 +2295,35 @@
year = {2025},
}""",
)


# Benchmark object for the HUME human-evaluation subsets: the same task data is
# scored by both embedding models and human annotators, so model and human
# performance can be compared directly.
HUME = Benchmark(
    name="HUME(v1)",
    display_name="Human Benchmark",
    # icon="https://raw.githubusercontent.com/huggingface/benchmarks/main/benchmarks/assets/hume.png",
    tasks=get_tasks(
        tasks=[
            # Classification subsets
            "HUMEEmotionClassification",
            "HUMEToxicConversationsClassification",
            "HUMETweetSentimentExtractionClassification",
            "HUMEMultilingualSentimentClassification",
            # Clustering subsets
            "HUMEArxivClusteringP2P",
            "HUMERedditClusteringP2P",
            "HUMEWikiCitiesClustering",
            "HUMESIB200ClusteringS2S",
            # Instruction reranking / reranking subsets
            "HUMECore17InstructionReranking",
            "HUMENews21InstructionReranking",
            "HUMERobust04InstructionReranking",
            "HUMEWikipediaRerankingMultilingual",
            # STS subsets
            "HUMESICK-R",
            "HUMESTS12",
            "HUMESTSBenchmark",
            "HUMESTS22",
        ],
        # NOTE(review): HUMEMultilingualSentimentClassification declares
        # "nor-Latn", while this filter lists "dan-Latn"/"nob-Latn" — confirm
        # the Norwegian/Danish codes are consistent across task and benchmark.
        languages=["eng-Latn", "ara-Arab", "rus-Cyrl", "dan-Latn", "nob-Latn"],
    ),
    description="The HUME benchmark is designed to evaluate the performance of text embedding models and humans on a comparable set of tasks. This captures areas where models perform better than human annotators and the reverse. In the paper, we go further into the analysis and what conclusions can be drawn.",
    # Placeholder until the paper is public; replace with the paper URL on release.
    reference="Coming soon (in review)",
    citation=None,
    contacts=["AdnanElAssadi56", "KennethEnevoldsen", "isaac-chung", "Samoed"],
)
4 changes: 3 additions & 1 deletion mteb/leaderboard/benchmark_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ class MenuEntry:
name="Select Benchmark",
description="",
open=False,
benchmarks=mteb.get_benchmarks(["MTEB(Multilingual, v2)", "MTEB(eng, v2)"])
benchmarks=mteb.get_benchmarks(
["MTEB(Multilingual, v2)", "MTEB(eng, v2)", "HUME(v1)"]
)
+ [
MenuEntry(
"Image",
Expand Down
5 changes: 4 additions & 1 deletion mteb/load_results/load_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,10 @@ def download_of_results(
logger.info(
f"No results repository found in {results_directory}, cloning it from {results_repo}"
)
subprocess.run(["git", "clone", results_repo], cwd=cache_directory)
subprocess.run(
["git", "clone", results_repo],
cwd=cache_directory,
)

return results_directory

Expand Down
2 changes: 1 addition & 1 deletion mteb/model_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def languages_are_valid(cls, languages: list[ISO_LANGUAGE_SCRIPT] | None) -> Non
@field_validator("name")
@classmethod
def check_name(cls, v: str | None) -> str | None:
if v is None or v == "bm25s":
if v is None or v in ("bm25s", "Human"):
return v
if "/" not in v:
raise ValueError(
Expand Down
24 changes: 24 additions & 0 deletions mteb/models/human.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from __future__ import annotations

from mteb.model_meta import ModelMeta

# Pseudo "model" entry representing human annotator performance on the HUME
# benchmark. There is nothing to load or run (loader=None); results for this
# entry are produced by human annotation rather than by an embedding model.
human = ModelMeta(
    loader=None,
    # No "org/" prefix: ModelMeta.check_name special-cases "Human" (alongside
    # "bm25s") to allow a name without a namespace.
    name="Human",
    # Languages covered by the human-annotated subsets.
    languages=["eng-Latn", "ara-Arab", "rus-Cyrl", "dan-Latn", "nob-Latn"],
    open_weights=True,
    # Date-stamped revision of the annotation set — presumably the annotation
    # snapshot date; verify with the result upload.
    revision="2025_09_25",
    release_date=None,
    # All model-specific attributes are inapplicable to a human baseline.
    n_parameters=None,
    memory_usage_mb=None,
    embed_dim=None,
    license=None,
    max_tokens=None,
    reference=None,
    similarity_fn_name=None,
    framework=[],
    use_instructions=None,
    training_datasets=None,
    public_training_code=None,
    public_training_data=None,
)
2 changes: 2 additions & 0 deletions mteb/models/overview.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
gritlm_models,
gte_models,
hinvec_models,
human,
ibm_granite_models,
inf_models,
jasper_models,
Expand Down Expand Up @@ -143,6 +144,7 @@
gritlm_models,
gte_models,
hinvec_models,
human,
ibm_granite_models,
inf_models,
jasper_models,
Expand Down
4 changes: 4 additions & 0 deletions mteb/tasks/Classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@
from .eng.EmotionClassification import *
from .eng.FinancialPhrasebankClassification import *
from .eng.FrenkEnClassification import *
from .eng.HUMEEmotionClassification import *
from .eng.HUMEToxicConversationsClassification import *
from .eng.HUMETweetSentimentExtractionClassification import *
from .eng.ImdbClassification import *
from .eng.LegalBenchClassification import *
from .eng.NewsClassification import *
Expand Down Expand Up @@ -95,6 +98,7 @@
from .multilingual.CataloniaTweetClassification import *
from .multilingual.CyrillicTurkicLangClassification import *
from .multilingual.HinDialectClassification import *
from .multilingual.HUMEMultilingualSentimentClassification import *
from .multilingual.IndicLangClassification import *
from .multilingual.IndicNLPNewsClassification import *
from .multilingual.IndicSentimentClassification import *
Expand Down
59 changes: 59 additions & 0 deletions mteb/tasks/Classification/eng/HUMEEmotionClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class HUMEEmotionClassification(AbsTaskClassification):
    """Human-evaluation subset of EmotionClassification.

    English Twitter messages labeled with one of six basic emotions (anger,
    fear, joy, love, sadness, surprise); this subset is the portion also
    evaluated by human annotators for the HUME benchmark.
    """

    metadata = TaskMetadata(
        name="HUMEEmotionClassification",
        description="Human evaluation subset of Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise.",
        reference="https://www.aclweb.org/anthology/D18-1404",
        dataset={
            "path": "mteb/HUMEEmotionClassification",
            "revision": "bc2a4c799c86abc5bc138b0de038f46e24e88eb4",
        },
        type="Classification",
        category="s2s",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2017-01-01",
            "2018-12-31",
        ),  # Estimated range for the collection of Twitter messages
        domains=["Social", "Written"],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="human-annotated",
        dialect=[],
        sample_creation="found",
        bibtex_citation=r"""
@inproceedings{saravia-etal-2018-carer,
  abstract = {Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks. Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks.},
  address = {Brussels, Belgium},
  author = {Saravia, Elvis and
Liu, Hsien-Chi Toby and
Huang, Yen-Hao and
Wu, Junlin and
Chen, Yi-Shin},
  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
  doi = {10.18653/v1/D18-1404},
  editor = {Riloff, Ellen and
Chiang, David and
Hockenmaier, Julia and
Tsujii, Jun{'}ichi},
  month = oct # {-} # nov,
  pages = {3687--3697},
  publisher = {Association for Computational Linguistics},
  title = {{CARER}: Contextualized Affect Representations for Emotion Recognition},
  url = {https://aclanthology.org/D18-1404},
  year = {2018},
}
""",
        prompt="Classify the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise",
        adapted_from=["EmotionClassification"],
    )

    # Samples drawn per label when fitting the classifier — presumably consumed
    # by AbsTaskClassification's training loop; confirm against the base class.
    samples_per_label = 16
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class HUMEToxicConversationsClassification(AbsTaskClassification):
    """Human-evaluation subset of ToxicConversationsClassification.

    Civil Comments platform comments annotated as toxic or not toxic; this
    subset is the portion also evaluated by human annotators for the HUME
    benchmark.
    """

    metadata = TaskMetadata(
        name="HUMEToxicConversationsClassification",
        description="Human evaluation subset of Collection of comments from the Civil Comments platform together with annotations if the comment is toxic or not.",
        reference="https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/overview",
        dataset={
            "path": "mteb/HUMEToxicConversationsClassification",
            "revision": "4c128c30566ffc7b01c7c3a367da20f36fc08ef8",
        },
        type="Classification",
        category="s2s",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2017-01-01",
            "2018-12-31",
        ),  # Estimated range for the collection of comments
        domains=["Social", "Written"],
        task_subtypes=["Sentiment/Hate speech"],
        license="cc-by-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        sample_creation="found",
        bibtex_citation=r"""
@misc{jigsaw-unintended-bias-in-toxicity-classification,
  author = {cjadams and Daniel Borkan and inversion and Jeffrey Sorensen and Lucas Dixon and Lucy Vasserman and nithum},
  publisher = {Kaggle},
  title = {Jigsaw Unintended Bias in Toxicity Classification},
  url = {https://kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification},
  year = {2019},
}
""",
        prompt="Classify the given comments as either toxic or not toxic",
        adapted_from=["ToxicConversationsClassification"],
    )

    # Samples drawn per label when fitting the classifier — presumably consumed
    # by AbsTaskClassification's training loop; confirm against the base class.
    samples_per_label = 16
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class HUMETweetSentimentExtractionClassification(AbsTaskClassification):
    """Human-evaluation subset of TweetSentimentExtractionClassification.

    Tweets labeled positive, negative, or neutral; this subset is the portion
    also evaluated by human annotators for the HUME benchmark.
    """

    metadata = TaskMetadata(
        name="HUMETweetSentimentExtractionClassification",
        description="Human evaluation subset of Tweet Sentiment Extraction dataset.",
        reference="https://www.kaggle.com/competitions/tweet-sentiment-extraction/overview",
        dataset={
            "path": "mteb/HUMETweetSentimentExtractionClassification",
            "revision": "264bce01a98dfaf3581b53dcaa0fd5e2d44aa589",
        },
        type="Classification",
        category="s2s",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2020-01-01",
            "2020-12-31",
        ),  # Estimated range for the collection of tweets
        domains=["Social", "Written"],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="human-annotated",
        dialect=[],
        sample_creation="found",
        bibtex_citation=r"""
@misc{tweet-sentiment-extraction,
  author = {Maggie, Phil Culliton, Wei Chen},
  publisher = {Kaggle},
  title = {Tweet Sentiment Extraction},
  url = {https://kaggle.com/competitions/tweet-sentiment-extraction},
  year = {2020},
}
""",
        prompt="Classify the sentiment of a given tweet as either positive, negative, or neutral",
        adapted_from=["TweetSentimentExtractionClassification"],
    )

    # Samples drawn per label when fitting the classifier — presumably consumed
    # by AbsTaskClassification's training loop; confirm against the base class.
    # NOTE(review): 32 here vs 16 in the other HUME classification subsets —
    # confirm this matches the upstream TweetSentimentExtraction task.
    samples_per_label = 32
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.MultilingualTask import MultilingualTask
from mteb.abstasks.TaskMetadata import TaskMetadata

# Mapping of dataset subset name -> language codes for the evaluated languages.
# NOTE(review): declares "nor-Latn" while the HUME benchmark definition filters
# on "dan-Latn"/"nob-Latn" — confirm the Norwegian code is consistent.
_LANGUAGES = {
    "eng": ["eng-Latn"],
    "ara": ["ara-Arab"],
    "nor": ["nor-Latn"],
    "rus": ["rus-Cyrl"],
}


class HUMEMultilingualSentimentClassification(AbsTaskClassification, MultilingualTask):
    """Human-evaluation subset of MultilingualSentimentClassification.

    Binary sentiment classification (positive vs negative) in four languages;
    this subset is the portion also evaluated by human annotators for the
    HUME benchmark.
    """

    metadata = TaskMetadata(
        name="HUMEMultilingualSentimentClassification",
        dataset={
            "path": "mteb/HUMEMultilingualSentimentClassification",
            "revision": "1b988d30980efdd9c27de1643837bf3ae5bae814",
        },
        description=(
            "Human evaluation subset of Sentiment classification dataset with binary "
            "(positive vs negative sentiment) labels. Includes 4 languages."
        ),
        reference="https://huggingface.co/datasets/mteb/multilingual-sentiment-classification",
        type="Classification",
        category="s2s",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=_LANGUAGES,
        main_score="accuracy",
        date=("2022-08-01", "2022-08-01"),
        domains=["Reviews", "Written"],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="derived",
        # Presumably Algerian Arabic for the "ara" subset — inherited from the
        # parent MultilingualSentimentClassification task; verify.
        dialect=["ar-dz"],
        sample_creation="found",
        bibtex_citation=r"""
@inproceedings{mollanorozy-etal-2023-cross,
  address = {Dubrovnik, Croatia},
  author = {Mollanorozy, Sepideh and
Tanti, Marc and
Nissim, Malvina},
  booktitle = {Proceedings of the 5th Workshop on Research in Computational Linguistic Typology and Multilingual NLP},
  doi = {10.18653/v1/2023.sigtyp-1.9},
  editor = {Beinborn, Lisa and
Goswami, Koustava and
Murado{\\u{g}}lu, Saliha and
Sorokin, Alexey and
Kumar, Ritesh and
Shcherbakov, Andreas and
Ponti, Edoardo M. and
Cotterell, Ryan and
Vylomova, Ekaterina},
  month = may,
  pages = {89--95},
  publisher = {Association for Computational Linguistics},
  title = {Cross-lingual Transfer Learning with \{P\}ersian},
  url = {https://aclanthology.org/2023.sigtyp-1.9},
  year = {2023},
}
""",
        adapted_from=["MultilingualSentimentClassification"],
    )
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,10 @@ class MultilingualSentimentClassification(AbsTaskClassification, MultilingualTas
"path": "mteb/multilingual-sentiment-classification",
"revision": "2b9b4d10fc589af67794141fe8cbd3739de1eb33",
},
description="""Sentiment classification dataset with binary
(positive vs negative sentiment) labels. Includes 30 languages and dialects.
""",
description=(
"Sentiment classification dataset with binary "
"(positive vs negative sentiment) labels. Includes 30 languages and dialects."
),
reference="https://huggingface.co/datasets/mteb/multilingual-sentiment-classification",
type="Classification",
category="s2s",
Expand Down
4 changes: 4 additions & 0 deletions mteb/tasks/Clustering/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
from .eng.BuiltBenchClusteringP2P import *
from .eng.BuiltBenchClusteringS2S import *
from .eng.ClusTrecCovid import *
from .eng.HUMEArxivClusteringP2P import *
from .eng.HUMERedditClusteringP2P import *
from .eng.HUMEWikiCitiesClustering import *
from .eng.MedrxivClusteringP2P import *
from .eng.MedrxivClusteringS2S import *
from .eng.RedditClustering import *
Expand All @@ -31,6 +34,7 @@
from .jpn.MewsC16JaClustering import *
from .kor.KlueMrcDomainClustering import *
from .kor.KlueYnatMrcCategoryClustering import *
from .multilingual.HUMESIB200ClusteringS2S import *
from .multilingual.IndicReviewsClusteringP2P import *
from .multilingual.MasakhaNEWSClusteringP2P import *
from .multilingual.MasakhaNEWSClusteringS2S import *
Expand Down
Loading