Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
d1e1a1b
Human Subsets Tasks
AdnanElAssadi56 Aug 10, 2025
7eaff4d
Fixed Multilingual Classification Subset
AdnanElAssadi56 Aug 10, 2025
9bc5b87
linting
AdnanElAssadi56 Sep 25, 2025
cb3893b
fix citations format
AdnanElAssadi56 Sep 25, 2025
ed1af7e
make lint
isaac-chung Sep 25, 2025
aa08f35
fix tests
isaac-chung Sep 25, 2025
d9fbb44
remove human folder
isaac-chung Sep 27, 2025
52eb9a7
fix relative imports
isaac-chung Sep 27, 2025
94b1f7c
add adapted_from for all human subsets
isaac-chung Sep 27, 2025
6e599fa
fix pydantic errors
isaac-chung Sep 27, 2025
9e61698
add benchmark object
isaac-chung Sep 27, 2025
0cfbe1c
make benchmark discoverable
isaac-chung Sep 27, 2025
926a839
bibtex test
isaac-chung Sep 27, 2025
7a5983e
Apply suggestion
Samoed Sep 29, 2025
c3a3b77
Apply suggestions from code review
Samoed Sep 29, 2025
f6c3c37
rename & reupload
Samoed Sep 29, 2025
72e9522
upd tests
Samoed Sep 29, 2025
7142999
upd tests again
Samoed Sep 29, 2025
070eea8
add model
Samoed Sep 29, 2025
c94fbb6
add benchmark to leaderboard
Samoed Sep 29, 2025
e1e28b6
change branch of leaderboard
Samoed Sep 29, 2025
2bab369
remove branch of load data
Samoed Sep 29, 2025
a394ed8
fix model meta path
Samoed Sep 29, 2025
91d7262
make mteb importable
Samoed Sep 29, 2025
ae0a508
update repo
Samoed Sep 29, 2025
39cec66
Merge branch 'main' into human_tasks
AdnanElAssadi56 Oct 1, 2025
df3e263
Update mteb/benchmarks/benchmarks/benchmarks.py
isaac-chung Oct 2, 2025
5b09048
Update mteb/leaderboard/benchmark_selector.py
isaac-chung Oct 2, 2025
da9124d
Update mteb/load_results/load_results.py
isaac-chung Oct 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions mteb/benchmarks/benchmarks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
ENCODECHKA,
FA_MTEB,
FA_MTEB_2,
HUME,
JINA_VDR,
LONG_EMBED,
MIEB_ENG,
Expand Down Expand Up @@ -112,4 +113,5 @@
"RTEB_ENGLISH",
"RTEB_FRENCH",
"RTEB_GERMAN",
"HUME",
]
32 changes: 32 additions & 0 deletions mteb/benchmarks/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2295,3 +2295,35 @@
year = {2025},
}""",
)


# Benchmark object for the HUME human-evaluation subsets: the same task data is
# scored by both embedding models and human annotators, so model and human
# performance can be compared directly.
HUME = Benchmark(
    name="HUME(v1)",
    display_name="Human Benchmark",
    # icon="https://raw.githubusercontent.com/huggingface/benchmarks/main/benchmarks/assets/hume.png",
    tasks=get_tasks(
        tasks=[
            # Classification subsets
            "HUMEEmotionClassification",
            "HUMEToxicConversationsClassification",
            "HUMETweetSentimentExtractionClassification",
            "HUMEMultilingualSentimentClassification",
            # Clustering subsets
            "HUMEArxivClusteringP2P",
            "HUMERedditClusteringP2P",
            "HUMEWikiCitiesClustering",
            "HUMESIB200ClusteringS2S",
            # Instruction reranking / reranking subsets
            "HUMECore17InstructionReranking",
            "HUMENews21InstructionReranking",
            "HUMERobust04InstructionReranking",
            "HUMEWikipediaRerankingMultilingual",
            # STS subsets
            "HUMESICK-R",
            "HUMESTS12",
            "HUMESTSBenchmark",
            "HUMESTS22",
        ],
        # NOTE(review): HUMEMultilingualSentimentClassification declares
        # "nor-Latn", while this filter lists "dan-Latn"/"nob-Latn" — confirm
        # the Norwegian/Danish codes are consistent across task and benchmark.
        languages=["eng-Latn", "ara-Arab", "rus-Cyrl", "dan-Latn", "nob-Latn"],
    ),
    description="The HUME benchmark is designed to evaluate the performance of text embedding models and humans on a comparable set of tasks. This captures areas where models perform better than human annotators and the reverse. In the paper, we go further into the analysis and what conclusions can be drawn.",
    # Placeholder until the paper is public; replace with the paper URL on release.
    reference="Coming soon (in review)",
    citation=None,
    contacts=["AdnanElAssadi56", "KennethEnevoldsen", "isaac-chung", "Samoed"],
)
4 changes: 3 additions & 1 deletion mteb/leaderboard/benchmark_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ class MenuEntry:
name="Select Benchmark",
description="",
open=False,
benchmarks=mteb.get_benchmarks(["MTEB(Multilingual, v2)", "MTEB(eng, v2)"])
benchmarks=mteb.get_benchmarks(
["MTEB(Multilingual, v2)", "MTEB(eng, v2)", "HUME(v1)"]
)
+ [
MenuEntry(
"Image",
Expand Down
5 changes: 4 additions & 1 deletion mteb/load_results/load_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,10 @@ def download_of_results(
logger.info(
f"No results repository found in {results_directory}, cloning it from {results_repo}"
)
subprocess.run(["git", "clone", results_repo], cwd=cache_directory)
subprocess.run(
["git", "clone", results_repo],
cwd=cache_directory,
)

return results_directory

Expand Down
2 changes: 1 addition & 1 deletion mteb/model_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def languages_are_valid(cls, languages: list[ISO_LANGUAGE_SCRIPT] | None) -> Non
@field_validator("name")
@classmethod
def check_name(cls, v: str | None) -> str | None:
if v is None or v == "bm25s":
if v is None or v in ("bm25s", "Human"):
return v
if "/" not in v:
raise ValueError(
Expand Down
24 changes: 24 additions & 0 deletions mteb/models/human.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from __future__ import annotations

from mteb.model_meta import ModelMeta

# Pseudo "model" entry representing human annotator performance on the HUME
# benchmark. There is nothing to load or run (loader=None); results for this
# entry are produced by human annotation rather than by an embedding model.
human = ModelMeta(
    loader=None,
    # No "org/" prefix: ModelMeta.check_name special-cases "Human" (alongside
    # "bm25s") to allow a name without a namespace.
    name="Human",
    # Languages covered by the human-annotated subsets.
    languages=["eng-Latn", "ara-Arab", "rus-Cyrl", "dan-Latn", "nob-Latn"],
    open_weights=True,
    # Date-stamped revision of the annotation set — presumably the annotation
    # snapshot date; verify with the result upload.
    revision="2025_09_25",
    release_date=None,
    # All model-specific attributes are inapplicable to a human baseline.
    n_parameters=None,
    memory_usage_mb=None,
    embed_dim=None,
    license=None,
    max_tokens=None,
    reference=None,
    similarity_fn_name=None,
    framework=[],
    use_instructions=None,
    training_datasets=None,
    public_training_code=None,
    public_training_data=None,
)
2 changes: 2 additions & 0 deletions mteb/models/overview.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
gritlm_models,
gte_models,
hinvec_models,
human,
ibm_granite_models,
inf_models,
jasper_models,
Expand Down Expand Up @@ -143,6 +144,7 @@
gritlm_models,
gte_models,
hinvec_models,
human,
ibm_granite_models,
inf_models,
jasper_models,
Expand Down
4 changes: 4 additions & 0 deletions mteb/tasks/Classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@
from .eng.EmotionClassification import *
from .eng.FinancialPhrasebankClassification import *
from .eng.FrenkEnClassification import *
from .eng.HUMEEmotionClassification import *
from .eng.HUMEToxicConversationsClassification import *
from .eng.HUMETweetSentimentExtractionClassification import *
from .eng.ImdbClassification import *
from .eng.LegalBenchClassification import *
from .eng.NewsClassification import *
Expand Down Expand Up @@ -95,6 +98,7 @@
from .multilingual.CataloniaTweetClassification import *
from .multilingual.CyrillicTurkicLangClassification import *
from .multilingual.HinDialectClassification import *
from .multilingual.HUMEMultilingualSentimentClassification import *
from .multilingual.IndicLangClassification import *
from .multilingual.IndicNLPNewsClassification import *
from .multilingual.IndicSentimentClassification import *
Expand Down
59 changes: 59 additions & 0 deletions mteb/tasks/Classification/eng/HUMEEmotionClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class HUMEEmotionClassification(AbsTaskClassification):
    """Human-evaluation subset of EmotionClassification.

    English Twitter messages labeled with one of six basic emotions (anger,
    fear, joy, love, sadness, surprise); this subset is the portion also
    evaluated by human annotators for the HUME benchmark.
    """

    metadata = TaskMetadata(
        name="HUMEEmotionClassification",
        description="Human evaluation subset of Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise.",
        reference="https://www.aclweb.org/anthology/D18-1404",
        dataset={
            "path": "mteb/HUMEEmotionClassification",
            "revision": "bc2a4c799c86abc5bc138b0de038f46e24e88eb4",
        },
        type="Classification",
        category="s2s",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2017-01-01",
            "2018-12-31",
        ),  # Estimated range for the collection of Twitter messages
        domains=["Social", "Written"],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="human-annotated",
        dialect=[],
        sample_creation="found",
        bibtex_citation=r"""
@inproceedings{saravia-etal-2018-carer,
  abstract = {Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks. Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks.},
  address = {Brussels, Belgium},
  author = {Saravia, Elvis and
Liu, Hsien-Chi Toby and
Huang, Yen-Hao and
Wu, Junlin and
Chen, Yi-Shin},
  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
  doi = {10.18653/v1/D18-1404},
  editor = {Riloff, Ellen and
Chiang, David and
Hockenmaier, Julia and
Tsujii, Jun{'}ichi},
  month = oct # {-} # nov,
  pages = {3687--3697},
  publisher = {Association for Computational Linguistics},
  title = {{CARER}: Contextualized Affect Representations for Emotion Recognition},
  url = {https://aclanthology.org/D18-1404},
  year = {2018},
}
""",
        prompt="Classify the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise",
        adapted_from=["EmotionClassification"],
    )

    # Samples drawn per label when fitting the classifier — presumably consumed
    # by AbsTaskClassification's training loop; confirm against the base class.
    samples_per_label = 16
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class HUMEToxicConversationsClassification(AbsTaskClassification):
    """Human-evaluation subset of ToxicConversationsClassification.

    Civil Comments platform comments annotated as toxic or not toxic; this
    subset is the portion also evaluated by human annotators for the HUME
    benchmark.
    """

    metadata = TaskMetadata(
        name="HUMEToxicConversationsClassification",
        description="Human evaluation subset of Collection of comments from the Civil Comments platform together with annotations if the comment is toxic or not.",
        reference="https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/overview",
        dataset={
            "path": "mteb/HUMEToxicConversationsClassification",
            "revision": "4c128c30566ffc7b01c7c3a367da20f36fc08ef8",
        },
        type="Classification",
        category="s2s",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2017-01-01",
            "2018-12-31",
        ),  # Estimated range for the collection of comments
        domains=["Social", "Written"],
        task_subtypes=["Sentiment/Hate speech"],
        license="cc-by-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        sample_creation="found",
        bibtex_citation=r"""
@misc{jigsaw-unintended-bias-in-toxicity-classification,
  author = {cjadams and Daniel Borkan and inversion and Jeffrey Sorensen and Lucas Dixon and Lucy Vasserman and nithum},
  publisher = {Kaggle},
  title = {Jigsaw Unintended Bias in Toxicity Classification},
  url = {https://kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification},
  year = {2019},
}
""",
        prompt="Classify the given comments as either toxic or not toxic",
        adapted_from=["ToxicConversationsClassification"],
    )

    # Samples drawn per label when fitting the classifier — presumably consumed
    # by AbsTaskClassification's training loop; confirm against the base class.
    samples_per_label = 16
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class HUMETweetSentimentExtractionClassification(AbsTaskClassification):
    """Human-evaluation subset of TweetSentimentExtractionClassification.

    Tweets labeled positive, negative, or neutral; this subset is the portion
    also evaluated by human annotators for the HUME benchmark.
    """

    metadata = TaskMetadata(
        name="HUMETweetSentimentExtractionClassification",
        description="Human evaluation subset of Tweet Sentiment Extraction dataset.",
        reference="https://www.kaggle.com/competitions/tweet-sentiment-extraction/overview",
        dataset={
            "path": "mteb/HUMETweetSentimentExtractionClassification",
            "revision": "264bce01a98dfaf3581b53dcaa0fd5e2d44aa589",
        },
        type="Classification",
        category="s2s",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=(
            "2020-01-01",
            "2020-12-31",
        ),  # Estimated range for the collection of tweets
        domains=["Social", "Written"],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="human-annotated",
        dialect=[],
        sample_creation="found",
        bibtex_citation=r"""
@misc{tweet-sentiment-extraction,
  author = {Maggie, Phil Culliton, Wei Chen},
  publisher = {Kaggle},
  title = {Tweet Sentiment Extraction},
  url = {https://kaggle.com/competitions/tweet-sentiment-extraction},
  year = {2020},
}
""",
        prompt="Classify the sentiment of a given tweet as either positive, negative, or neutral",
        adapted_from=["TweetSentimentExtractionClassification"],
    )

    # Samples drawn per label when fitting the classifier — presumably consumed
    # by AbsTaskClassification's training loop; confirm against the base class.
    # NOTE(review): 32 here vs 16 in the other HUME classification subsets —
    # confirm this matches the upstream TweetSentimentExtraction task.
    samples_per_label = 32
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.MultilingualTask import MultilingualTask
from mteb.abstasks.TaskMetadata import TaskMetadata

# Mapping of dataset subset name -> language codes for the evaluated languages.
# NOTE(review): declares "nor-Latn" while the HUME benchmark definition filters
# on "dan-Latn"/"nob-Latn" — confirm the Norwegian code is consistent.
_LANGUAGES = {
    "eng": ["eng-Latn"],
    "ara": ["ara-Arab"],
    "nor": ["nor-Latn"],
    "rus": ["rus-Cyrl"],
}


class HUMEMultilingualSentimentClassification(AbsTaskClassification, MultilingualTask):
    """Human-evaluation subset of MultilingualSentimentClassification.

    Binary sentiment classification (positive vs negative) in four languages;
    this subset is the portion also evaluated by human annotators for the
    HUME benchmark.
    """

    metadata = TaskMetadata(
        name="HUMEMultilingualSentimentClassification",
        dataset={
            "path": "mteb/HUMEMultilingualSentimentClassification",
            "revision": "1b988d30980efdd9c27de1643837bf3ae5bae814",
        },
        description=(
            "Human evaluation subset of Sentiment classification dataset with binary "
            "(positive vs negative sentiment) labels. Includes 4 languages."
        ),
        reference="https://huggingface.co/datasets/mteb/multilingual-sentiment-classification",
        type="Classification",
        category="s2s",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=_LANGUAGES,
        main_score="accuracy",
        date=("2022-08-01", "2022-08-01"),
        domains=["Reviews", "Written"],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="derived",
        # Presumably Algerian Arabic for the "ara" subset — inherited from the
        # parent MultilingualSentimentClassification task; verify.
        dialect=["ar-dz"],
        sample_creation="found",
        bibtex_citation=r"""
@inproceedings{mollanorozy-etal-2023-cross,
  address = {Dubrovnik, Croatia},
  author = {Mollanorozy, Sepideh and
Tanti, Marc and
Nissim, Malvina},
  booktitle = {Proceedings of the 5th Workshop on Research in Computational Linguistic Typology and Multilingual NLP},
  doi = {10.18653/v1/2023.sigtyp-1.9},
  editor = {Beinborn, Lisa and
Goswami, Koustava and
Murado{\\u{g}}lu, Saliha and
Sorokin, Alexey and
Kumar, Ritesh and
Shcherbakov, Andreas and
Ponti, Edoardo M. and
Cotterell, Ryan and
Vylomova, Ekaterina},
  month = may,
  pages = {89--95},
  publisher = {Association for Computational Linguistics},
  title = {Cross-lingual Transfer Learning with \{P\}ersian},
  url = {https://aclanthology.org/2023.sigtyp-1.9},
  year = {2023},
}
""",
        adapted_from=["MultilingualSentimentClassification"],
    )
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,10 @@ class MultilingualSentimentClassification(AbsTaskClassification, MultilingualTas
"path": "mteb/multilingual-sentiment-classification",
"revision": "2b9b4d10fc589af67794141fe8cbd3739de1eb33",
},
description="""Sentiment classification dataset with binary
(positive vs negative sentiment) labels. Includes 30 languages and dialects.
""",
description=(
"Sentiment classification dataset with binary "
"(positive vs negative sentiment) labels. Includes 30 languages and dialects."
),
reference="https://huggingface.co/datasets/mteb/multilingual-sentiment-classification",
type="Classification",
category="s2s",
Expand Down
4 changes: 4 additions & 0 deletions mteb/tasks/Clustering/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
from .eng.BuiltBenchClusteringP2P import *
from .eng.BuiltBenchClusteringS2S import *
from .eng.ClusTrecCovid import *
from .eng.HUMEArxivClusteringP2P import *
from .eng.HUMERedditClusteringP2P import *
from .eng.HUMEWikiCitiesClustering import *
from .eng.MedrxivClusteringP2P import *
from .eng.MedrxivClusteringS2S import *
from .eng.RedditClustering import *
Expand All @@ -31,6 +34,7 @@
from .jpn.MewsC16JaClustering import *
from .kor.KlueMrcDomainClustering import *
from .kor.KlueYnatMrcCategoryClustering import *
from .multilingual.HUMESIB200ClusteringS2S import *
from .multilingual.IndicReviewsClusteringP2P import *
from .multilingual.MasakhaNEWSClusteringP2P import *
from .multilingual.MasakhaNEWSClusteringS2S import *
Expand Down
Loading