diff --git a/mteb/benchmarks/benchmarks/__init__.py b/mteb/benchmarks/benchmarks/__init__.py
index 622a8ecc6d..c5a4666c2f 100644
--- a/mteb/benchmarks/benchmarks/__init__.py
+++ b/mteb/benchmarks/benchmarks/__init__.py
@@ -13,6 +13,7 @@
     ENCODECHKA,
     FA_MTEB,
     FA_MTEB_2,
+    HUME,
     JINA_VDR,
     LONG_EMBED,
     MIEB_ENG,
@@ -112,4 +113,5 @@
     "RTEB_ENGLISH",
     "RTEB_FRENCH",
     "RTEB_GERMAN",
+    "HUME",
 ]
diff --git a/mteb/benchmarks/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks/benchmarks.py
index b24941eddf..c454f67afc 100644
--- a/mteb/benchmarks/benchmarks/benchmarks.py
+++ b/mteb/benchmarks/benchmarks/benchmarks.py
@@ -2295,3 +2295,35 @@
 year = {2025},
 }""",
 )
+
+
+HUME = Benchmark(
+    name="HUME(v1)",
+    display_name="Human Benchmark",
+    # icon="https://raw.githubusercontent.com/huggingface/benchmarks/main/benchmarks/assets/hume.png",
+    tasks=get_tasks(
+        tasks=[
+            "HUMEEmotionClassification",
+            "HUMEToxicConversationsClassification",
+            "HUMETweetSentimentExtractionClassification",
+            "HUMEMultilingualSentimentClassification",
+            "HUMEArxivClusteringP2P",
+            "HUMERedditClusteringP2P",
+            "HUMEWikiCitiesClustering",
+            "HUMESIB200ClusteringS2S",
+            "HUMECore17InstructionReranking",
+            "HUMENews21InstructionReranking",
+            "HUMERobust04InstructionReranking",
+            "HUMEWikipediaRerankingMultilingual",
+            "HUMESICK-R",
+            "HUMESTS12",
+            "HUMESTSBenchmark",
+            "HUMESTS22",
+        ],
+        languages=["eng-Latn", "ara-Arab", "rus-Cyrl", "dan-Latn", "nob-Latn"],
+    ),
+    description="The HUME benchmark evaluates text embedding models and human annotators on a comparable set of tasks, capturing areas where models outperform human annotators as well as areas where humans still lead. In the paper, we analyze these differences and the conclusions that can be drawn from them.",
+    reference="Coming soon (in review)",
+    citation=None,
+    contacts=["AdnanElAssadi56", "KennethEnevoldsen", "isaac-chung", "Samoed"],
+)
diff --git a/mteb/leaderboard/benchmark_selector.py b/mteb/leaderboard/benchmark_selector.py
index 3395a9ecf9..dcb66ab8ad 100644
--- a/mteb/leaderboard/benchmark_selector.py
+++ b/mteb/leaderboard/benchmark_selector.py
@@ -34,7 +34,9 @@ class MenuEntry:
         name="Select Benchmark",
         description="",
         open=False,
-        benchmarks=mteb.get_benchmarks(["MTEB(Multilingual, v2)", "MTEB(eng, v2)"])
+        benchmarks=mteb.get_benchmarks(
+            ["MTEB(Multilingual, v2)", "MTEB(eng, v2)", "HUME(v1)"]
+        )
         + [
             MenuEntry(
                 "Image",
diff --git a/mteb/load_results/load_results.py b/mteb/load_results/load_results.py
index ed8a90c060..9bb38eadb1 100644
--- a/mteb/load_results/load_results.py
+++ b/mteb/load_results/load_results.py
@@ -57,7 +57,10 @@ def download_of_results(
         logger.info(
             f"No results repository found in {results_directory}, cloning it from {results_repo}"
         )
-        subprocess.run(["git", "clone", results_repo], cwd=cache_directory)
+        subprocess.run(
+            ["git", "clone", results_repo],
+            cwd=cache_directory,
+        )
     return results_directory
diff --git a/mteb/model_meta.py b/mteb/model_meta.py
index baafe7139d..9d4f312724 100644
--- a/mteb/model_meta.py
+++ b/mteb/model_meta.py
@@ -140,7 +140,7 @@ def languages_are_valid(cls, languages: list[ISO_LANGUAGE_SCRIPT] | None) -> None:
     @field_validator("name")
     @classmethod
     def check_name(cls, v: str | None) -> str | None:
-        if v is None or v == "bm25s":
+        if v is None or v in ("bm25s", "Human"):
             return v
         if "/" not in v:
             raise ValueError(
diff --git a/mteb/models/human.py b/mteb/models/human.py
new file mode 100644
index 0000000000..82b2447862
--- /dev/null
+++ b/mteb/models/human.py
@@ -0,0 +1,28 @@
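+# "Human" is a placeholder entry rather than a real encoder: loader is None,
+# and its results come from human annotations, so that human performance can
+# be listed on the leaderboard alongside embedding models (hence the name
+# exemption added to check_name in mteb/model_meta.py above).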
+from __future__ import annotations
+
+from mteb.model_meta import ModelMeta
+
+human = ModelMeta(
+    loader=None,
+    name="Human",
+    languages=["eng-Latn", "ara-Arab", "rus-Cyrl", "dan-Latn", "nob-Latn"],
+    open_weights=True,
+    revision="2025_09_25",
+    release_date=None,
+    n_parameters=None,
+    memory_usage_mb=None,
+    embed_dim=None,
+    license=None,
+    max_tokens=None,
+    reference=None,
+    similarity_fn_name=None,
+    framework=[],
+    use_instructions=None,
+    training_datasets=None,
+    public_training_code=None,
+    public_training_data=None,
+)
diff --git a/mteb/models/overview.py b/mteb/models/overview.py
index de16c34d7b..0c681611eb 100644
--- a/mteb/models/overview.py
+++ b/mteb/models/overview.py
@@ -48,6 +48,7 @@
     gritlm_models,
     gte_models,
     hinvec_models,
+    human,
     ibm_granite_models,
     inf_models,
     jasper_models,
@@ -143,6 +144,7 @@
     gritlm_models,
     gte_models,
     hinvec_models,
+    human,
     ibm_granite_models,
     inf_models,
     jasper_models,
diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py
index d70c94944d..b7da774730 100644
--- a/mteb/tasks/Classification/__init__.py
+++ b/mteb/tasks/Classification/__init__.py
@@ -29,6 +29,9 @@
 from .eng.EmotionClassification import *
 from .eng.FinancialPhrasebankClassification import *
 from .eng.FrenkEnClassification import *
+from .eng.HUMEEmotionClassification import *
+from .eng.HUMEToxicConversationsClassification import *
+from .eng.HUMETweetSentimentExtractionClassification import *
 from .eng.ImdbClassification import *
 from .eng.LegalBenchClassification import *
 from .eng.NewsClassification import *
@@ -95,6 +98,7 @@
 from .multilingual.CataloniaTweetClassification import *
 from .multilingual.CyrillicTurkicLangClassification import *
 from .multilingual.HinDialectClassification import *
+from .multilingual.HUMEMultilingualSentimentClassification import *
 from .multilingual.IndicLangClassification import *
 from .multilingual.IndicNLPNewsClassification import *
 from .multilingual.IndicSentimentClassification import *
diff --git a/mteb/tasks/Classification/eng/HUMEEmotionClassification.py b/mteb/tasks/Classification/eng/HUMEEmotionClassification.py
new file mode 100644
index 0000000000..c981888ba4
--- /dev/null
+++ b/mteb/tasks/Classification/eng/HUMEEmotionClassification.py
@@ -0,0 +1,59 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class HUMEEmotionClassification(AbsTaskClassification):
+    metadata = TaskMetadata(
+        name="HUMEEmotionClassification",
+        description="Human evaluation subset of the Emotion dataset, consisting of English Twitter messages labeled with six basic emotions: anger, fear, joy, love, sadness, and surprise.",
+        reference="https://www.aclweb.org/anthology/D18-1404",
+        dataset={
+            "path": "mteb/HUMEEmotionClassification",
+            "revision": "bc2a4c799c86abc5bc138b0de038f46e24e88eb4",
+        },
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=(
+            "2017-01-01",
+            "2018-12-31",
+        ),  # Estimated range for the collection of Twitter messages
+        domains=["Social", "Written"],
+        task_subtypes=["Sentiment/Hate speech"],
+        license="not specified",
+        annotations_creators="human-annotated",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@inproceedings{saravia-etal-2018-carer,
+  abstract = {Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks. Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks.},
+  address = {Brussels, Belgium},
+  author = {Saravia, Elvis and
+Liu, Hsien-Chi Toby and
+Huang, Yen-Hao and
+Wu, Junlin and
+Chen, Yi-Shin},
+  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
+  doi = {10.18653/v1/D18-1404},
+  editor = {Riloff, Ellen and
+Chiang, David and
+Hockenmaier, Julia and
+Tsujii, Jun{'}ichi},
+  month = oct # {-} # nov,
+  pages = {3687--3697},
+  publisher = {Association for Computational Linguistics},
+  title = {{CARER}: Contextualized Affect Representations for Emotion Recognition},
+  url = {https://aclanthology.org/D18-1404},
+  year = {2018},
+}
+""",
+        prompt="Classify the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise",
+        adapted_from=["EmotionClassification"],
+    )
+
+    samples_per_label = 16
diff --git a/mteb/tasks/Classification/eng/HUMEToxicConversationsClassification.py b/mteb/tasks/Classification/eng/HUMEToxicConversationsClassification.py
new file mode 100644
index 0000000000..f3b96e7eb3
--- /dev/null
+++ b/mteb/tasks/Classification/eng/HUMEToxicConversationsClassification.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class HUMEToxicConversationsClassification(AbsTaskClassification):
+    metadata = TaskMetadata(
+        name="HUMEToxicConversationsClassification",
+        description="Human evaluation subset of a collection of comments from the Civil Comments platform, annotated for whether each comment is toxic.",
+        reference="https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/overview",
+        dataset={
+            "path": "mteb/HUMEToxicConversationsClassification",
+            "revision": "4c128c30566ffc7b01c7c3a367da20f36fc08ef8",
+        },
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=(
+            "2017-01-01",
+            "2018-12-31",
+        ),  # Estimated range for the collection of comments
+        domains=["Social", "Written"],
+        task_subtypes=["Sentiment/Hate speech"],
+        license="cc-by-4.0",
+        annotations_creators="human-annotated",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@misc{jigsaw-unintended-bias-in-toxicity-classification,
+  author = {cjadams and Daniel Borkan and inversion and Jeffrey Sorensen and Lucas Dixon and Lucy Vasserman and nithum},
+  publisher = {Kaggle},
+  title = {Jigsaw Unintended Bias in Toxicity Classification},
+  url = {https://kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification},
+  year = {2019},
+}
+""",
+        prompt="Classify the given comments as either toxic or not toxic",
+        adapted_from=["ToxicConversationsClassification"],
+    )
+
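+    # Assumption: as in mteb's other classification tasks, this sets how many
+    # embedded examples per label are used to fit the evaluation classifier.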
+    samples_per_label = 16
diff --git a/mteb/tasks/Classification/eng/HUMETweetSentimentExtractionClassification.py b/mteb/tasks/Classification/eng/HUMETweetSentimentExtractionClassification.py
new file mode 100644
index 0000000000..f1991abb1a
--- /dev/null
+++ b/mteb/tasks/Classification/eng/HUMETweetSentimentExtractionClassification.py
@@ -0,0 +1,45 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class HUMETweetSentimentExtractionClassification(AbsTaskClassification):
+    metadata = TaskMetadata(
+        name="HUMETweetSentimentExtractionClassification",
+        description="Human evaluation subset of the Tweet Sentiment Extraction dataset.",
+        reference="https://www.kaggle.com/competitions/tweet-sentiment-extraction/overview",
+        dataset={
+            "path": "mteb/HUMETweetSentimentExtractionClassification",
+            "revision": "264bce01a98dfaf3581b53dcaa0fd5e2d44aa589",
+        },
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=(
+            "2020-01-01",
+            "2020-12-31",
+        ),  # Estimated range for the collection of tweets
+        domains=["Social", "Written"],
+        task_subtypes=["Sentiment/Hate speech"],
+        license="not specified",
+        annotations_creators="human-annotated",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@misc{tweet-sentiment-extraction,
+  author = {Maggie and Phil Culliton and Wei Chen},
+  publisher = {Kaggle},
+  title = {Tweet Sentiment Extraction},
+  url = {https://kaggle.com/competitions/tweet-sentiment-extraction},
+  year = {2020},
+}
+""",
+        prompt="Classify the sentiment of a given tweet as either positive, negative, or neutral",
+        adapted_from=["TweetSentimentExtractionClassification"],
+    )
+
+    samples_per_label = 32
diff --git a/mteb/tasks/Classification/multilingual/HUMEMultilingualSentimentClassification.py b/mteb/tasks/Classification/multilingual/HUMEMultilingualSentimentClassification.py
new file mode 100644
index 0000000000..a021b120a1
--- /dev/null
+++ b/mteb/tasks/Classification/multilingual/HUMEMultilingualSentimentClassification.py
@@ -0,0 +1,66 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.MultilingualTask import MultilingualTask
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+_LANGUAGES = {
+    "eng": ["eng-Latn"],
+    "ara": ["ara-Arab"],
+    "nor": ["nor-Latn"],
+    "rus": ["rus-Cyrl"],
+}
+
+
+class HUMEMultilingualSentimentClassification(AbsTaskClassification, MultilingualTask):
+    metadata = TaskMetadata(
+        name="HUMEMultilingualSentimentClassification",
+        dataset={
+            "path": "mteb/HUMEMultilingualSentimentClassification",
+            "revision": "1b988d30980efdd9c27de1643837bf3ae5bae814",
+        },
+        description=(
+            "Human evaluation subset of a sentiment classification dataset with binary "
+            "(positive vs negative sentiment) labels. Includes 4 languages."
+        ),
+        reference="https://huggingface.co/datasets/mteb/multilingual-sentiment-classification",
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=_LANGUAGES,
+        main_score="accuracy",
+        date=("2022-08-01", "2022-08-01"),
+        domains=["Reviews", "Written"],
+        task_subtypes=["Sentiment/Hate speech"],
+        license="not specified",
+        annotations_creators="derived",
+        dialect=["ar-dz"],
+        sample_creation="found",
+        bibtex_citation=r"""
+@inproceedings{mollanorozy-etal-2023-cross,
+  address = {Dubrovnik, Croatia},
+  author = {Mollanorozy, Sepideh and
+Tanti, Marc and
+Nissim, Malvina},
+  booktitle = {Proceedings of the 5th Workshop on Research in Computational Linguistic Typology and Multilingual NLP},
+  doi = {10.18653/v1/2023.sigtyp-1.9},
+  editor = {Beinborn, Lisa and
+Goswami, Koustava and
+Murado{\u{g}}lu, Saliha and
+Sorokin, Alexey and
+Kumar, Ritesh and
+Shcherbakov, Andreas and
+Ponti, Edoardo M. and
+Cotterell, Ryan and
+Vylomova, Ekaterina},
+  month = may,
+  pages = {89--95},
+  publisher = {Association for Computational Linguistics},
+  title = {Cross-lingual Transfer Learning with {P}ersian},
+  url = {https://aclanthology.org/2023.sigtyp-1.9},
+  year = {2023},
+}
+""",
+        adapted_from=["MultilingualSentimentClassification"],
+    )
diff --git a/mteb/tasks/Classification/multilingual/MultilingualSentimentClassification.py b/mteb/tasks/Classification/multilingual/MultilingualSentimentClassification.py
index b6529c87d5..60eca7ac23 100644
--- a/mteb/tasks/Classification/multilingual/MultilingualSentimentClassification.py
+++ b/mteb/tasks/Classification/multilingual/MultilingualSentimentClassification.py
@@ -47,9 +47,10 @@ class MultilingualSentimentClassification(AbsTaskClassification, MultilingualTask):
             "path": "mteb/multilingual-sentiment-classification",
             "revision": "2b9b4d10fc589af67794141fe8cbd3739de1eb33",
         },
-        description="""Sentiment classification dataset with binary
-    (positive vs negative sentiment) labels. Includes 30 languages and dialects.
-    """,
+        description=(
+            "Sentiment classification dataset with binary "
+            "(positive vs negative sentiment) labels. Includes 30 languages and dialects."
+        ),
         reference="https://huggingface.co/datasets/mteb/multilingual-sentiment-classification",
         type="Classification",
         category="s2s",
diff --git a/mteb/tasks/Clustering/__init__.py b/mteb/tasks/Clustering/__init__.py
index 3476ea491a..1a31fed671 100644
--- a/mteb/tasks/Clustering/__init__.py
+++ b/mteb/tasks/Clustering/__init__.py
@@ -13,6 +13,9 @@
 from .eng.BuiltBenchClusteringP2P import *
 from .eng.BuiltBenchClusteringS2S import *
 from .eng.ClusTrecCovid import *
+from .eng.HUMEArxivClusteringP2P import *
+from .eng.HUMERedditClusteringP2P import *
+from .eng.HUMEWikiCitiesClustering import *
 from .eng.MedrxivClusteringP2P import *
 from .eng.MedrxivClusteringS2S import *
 from .eng.RedditClustering import *
@@ -31,6 +34,7 @@
 from .jpn.MewsC16JaClustering import *
 from .kor.KlueMrcDomainClustering import *
 from .kor.KlueYnatMrcCategoryClustering import *
+from .multilingual.HUMESIB200ClusteringS2S import *
 from .multilingual.IndicReviewsClusteringP2P import *
 from .multilingual.MasakhaNEWSClusteringP2P import *
 from .multilingual.MasakhaNEWSClusteringS2S import *
diff --git a/mteb/tasks/Clustering/eng/HUMEArxivClusteringP2P.py b/mteb/tasks/Clustering/eng/HUMEArxivClusteringP2P.py
new file mode 100644
index 0000000000..5f218de7ac
--- /dev/null
+++ b/mteb/tasks/Clustering/eng/HUMEArxivClusteringP2P.py
@@ -0,0 +1,41 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClustering import AbsTaskClustering
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class HUMEArxivClusteringP2P(AbsTaskClustering):
+    metadata = TaskMetadata(
+        name="HUMEArxivClusteringP2P",
+        description="Human evaluation subset of clustering of titles+abstracts from arXiv: 30 sets, clustered on either the main or the secondary category.",
+        reference="https://www.kaggle.com/Cornell-University/arxiv",
+        dataset={
+            "path": "mteb/mteb-human-arxiv-clustering",
+            "revision": "6d2f0e9d4f4a51cb54332acaef10478928f0fed8",
+        },
+        type="Clustering",
+        category="p2p",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="v_measure",
+        date=("1991-01-01", "2021-01-01"),  # 1991-01-01 is the first arxiv paper
+        domains=["Academic", "Written"],
+        task_subtypes=[],
+        license="cc0-1.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@misc{arxiv_org_submitters_2024,
+  author = {arXiv.org submitters},
+  doi = {10.34740/KAGGLE/DSV/7548853},
+  publisher = {Kaggle},
+  title = {arXiv Dataset},
+  url = {https://www.kaggle.com/dsv/7548853},
+  year = {2024},
+}
+""",
+        prompt="Identify the main and secondary category of Arxiv papers based on the titles and abstracts",
+        adapted_from=["ArxivClusteringP2P"],
+    )
diff --git a/mteb/tasks/Clustering/eng/HUMERedditClusteringP2P.py b/mteb/tasks/Clustering/eng/HUMERedditClusteringP2P.py
new file mode 100644
index 0000000000..646fc7eef4
--- /dev/null
+++ b/mteb/tasks/Clustering/eng/HUMERedditClusteringP2P.py
@@ -0,0 +1,48 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClustering import AbsTaskClustering
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class HUMERedditClusteringP2P(AbsTaskClustering):
+    metadata = TaskMetadata(
+        name="HUMERedditClusteringP2P",
+        description="Human evaluation subset of clustering of titles+posts from Reddit. The source task clusters 10 sets of 50k paragraphs and 40 sets of 10k paragraphs.",
+        reference="https://arxiv.org/abs/2104.07081",
+        dataset={
+            "path": "mteb/mteb-human-reddit-clustering",
+            "revision": "b38bea0ed72e69047a725a96b8022ff2f036bbde",
+        },
+        type="Clustering",
+        category="p2p",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
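+        # v_measure is the harmonic mean of cluster homogeneity and
+        # completeness, so it is invariant to permutations of cluster labels.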
+        main_score="v_measure",
+        date=("2021-01-01", "2021-04-14"),
+        domains=["Web", "Social", "Written"],
+        task_subtypes=["Thematic clustering"],
+        license="not specified",  # derived from pushshift
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@article{geigle:2021:arxiv,
+  archiveprefix = {arXiv},
+  author = {Gregor Geigle and
+Nils Reimers and
+Andreas R{\"u}ckl{\'e} and
+Iryna Gurevych},
+  eprint = {2104.07081},
+  journal = {arXiv preprint},
+  title = {TWEAC: Transformer with Extendable QA Agent Classifiers},
+  url = {http://arxiv.org/abs/2104.07081},
+  volume = {abs/2104.07081},
+  year = {2021},
+}
+""",
+        prompt="Identify the topic or theme of Reddit posts based on the titles and posts",
+        adapted_from=["RedditClusteringP2P"],
+    )
diff --git a/mteb/tasks/Clustering/eng/HUMEWikiCitiesClustering.py b/mteb/tasks/Clustering/eng/HUMEWikiCitiesClustering.py
new file mode 100644
index 0000000000..51d9a1ac50
--- /dev/null
+++ b/mteb/tasks/Clustering/eng/HUMEWikiCitiesClustering.py
@@ -0,0 +1,37 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClustering import AbsTaskClustering
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class HUMEWikiCitiesClustering(AbsTaskClustering):
+    metadata = TaskMetadata(
+        name="HUMEWikiCitiesClustering",
+        description="Human evaluation subset of clustering of Wikipedia articles of cities by country, from https://huggingface.co/datasets/wikipedia. The test set includes 126 countries and a total of 3531 cities.",
+        reference="https://huggingface.co/datasets/wikipedia",
+        dataset={
+            "path": "mteb/mteb-human-wikicities-clustering",
+            "revision": "5c46af681d2dfa6d3ee373b7ccb4f153e1b72792",
+        },
+        type="Clustering",
+        category="p2p",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="v_measure",
+        date=("2000-01-01", "2021-12-31"),  # very rough estimate
+        domains=["Encyclopaedic", "Written"],
+        task_subtypes=[],
+        license="cc-by-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@online{wikidump,
+  author = {Wikimedia Foundation},
+  title = {Wikimedia Downloads},
+  url = {https://dumps.wikimedia.org},
+}
+""",
+        adapted_from=["WikiCitiesClustering"],
+    )
diff --git a/mteb/tasks/Clustering/multilingual/HUMESIB200ClusteringS2S.py b/mteb/tasks/Clustering/multilingual/HUMESIB200ClusteringS2S.py
new file mode 100644
index 0000000000..4eec020977
--- /dev/null
+++ b/mteb/tasks/Clustering/multilingual/HUMESIB200ClusteringS2S.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClustering import AbsTaskClustering
+from mteb.abstasks.MultilingualTask import MultilingualTask
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+_LANGUAGES = {
+    "eng_Latn": ["eng-Latn"],
+    "arb_Arab": ["ara-Arab"],
+    "dan_Latn": ["dan-Latn"],
+    "fra_Latn": ["fra-Latn"],
+    "rus_Cyrl": ["rus-Cyrl"],
+}
+
+
+class HUMESIB200ClusteringS2S(AbsTaskClustering, MultilingualTask):
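+    # Assumption: as elsewhere in mteb, fast_loading loads every language
+    # subset in a single dataset call instead of one load per subset.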
+    fast_loading = True
+    metadata = TaskMetadata(
+        name="HUMESIB200ClusteringS2S",
+        description="Human evaluation subset of clustering of news article headlines from SIB-200: 10 sets, each with 8 categories and 10 texts per category.",
+        reference="https://github.com/dadelani/sib-200",
+        dataset={
+            "path": "mteb/mteb-human-sib200-clustering",
+            "revision": "d41717b1b94c0155f5ae7f84034e01af61be455e",
+        },
+        type="Clustering",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=_LANGUAGES,
+        main_score="v_measure",
+        date=("2020-01-01", "2022-12-31"),
+        domains=["News", "Written"],
+        task_subtypes=[],
+        license="cc-by-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@inproceedings{adelani-etal-2023-sib,
+  address = {Toronto, Canada},
+  author = {Adelani, David Ifeoluwa and
+Hedderich, Michael A. and
+Zhu, Dawei and
+van den Berg, Esther and
+Klakow, Dietrich},
+  booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
+  doi = {10.18653/v1/2023.acl-long.660},
+  month = jul,
+  pages = {11784--11801},
+  publisher = {Association for Computational Linguistics},
+  title = {{SIB}-200: A Large-Scale News Classification Dataset for Over 200 Languages},
+  url = {https://aclanthology.org/2023.acl-long.660},
+  year = {2023},
+}
+""",
+        prompt="Identify the news category that articles belong to based on their content",
+        adapted_from=["SIB200ClusteringS2S"],
+    )
diff --git a/mteb/tasks/Reranking/__init__.py b/mteb/tasks/Reranking/__init__.py
index e4ba141bff..5c6d9025fd 100644
--- a/mteb/tasks/Reranking/__init__.py
+++ b/mteb/tasks/Reranking/__init__.py
@@ -3,6 +3,9 @@
 from .ara.NamaaMrTydiReranking import *
 from .eng.AskUbuntuDupQuestions import *
 from .eng.BuiltBenchReranking import *
+from .eng.HUMECore17InstructionReranking import *
+from .eng.HUMENews21InstructionReranking import *
+from .eng.HUMERobust04InstructionReranking import *
 from .eng.LocBenchReranking import *
 from .eng.MindSmallReranking import *
 from .eng.MultiSWEbenchReranking import *
@@ -19,6 +22,7 @@
 from .jpn.JQaRAReranking import *
 from .jpn.MMarcoReranking import *
 from .multilingual.ESCIReranking import *
+from .multilingual.HUMEWikipediaRerankingMultilingual import *
 from .multilingual.MIRACLReranking import *
 from .multilingual.WikipediaRerankingMultilingual import *
 from .multilingual.XGlueWPRReranking import *
diff --git a/mteb/tasks/Reranking/eng/HUMECore17InstructionReranking.py b/mteb/tasks/Reranking/eng/HUMECore17InstructionReranking.py
new file mode 100644
index 0000000000..e4b4ea01e7
--- /dev/null
+++ b/mteb/tasks/Reranking/eng/HUMECore17InstructionReranking.py
@@ -0,0 +1,41 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskReranking import AbsTaskReranking
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class HUMECore17InstructionReranking(AbsTaskReranking):
+    metadata = TaskMetadata(
+        name="HUMECore17InstructionReranking",
+        description="Human evaluation subset of the Core17 instruction retrieval dataset for reranking evaluation.",
+        reference="https://arxiv.org/abs/2403.15246",
+        dataset={
+            "path": "mteb/mteb-human-core17-reranking",
+            "revision": "e2b1a26cb5277a040d7f96a79fef0cf00afe9ffe",
+        },
+        type="Reranking",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="map",
+        date=("2017-01-01", "2017-12-31"),
+        domains=["News", "Written"],
+        task_subtypes=[],
+        license="not specified",
+        annotations_creators="human-annotated",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@misc{weller2024followir,
+  archiveprefix = {arXiv},
+  author = {Orion Weller and Benjamin Chang and Sean MacAvaney and Kyle Lo and Arman Cohan and Benjamin Van Durme and Dawn Lawrie and Luca Soldaini},
+  eprint = {2403.15246},
+  primaryclass = {cs.IR},
+  title = {FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions},
+  year = {2024},
+}
+""",
+        prompt="Given a query, rerank the documents by their relevance to the query",
+        adapted_from=["Core17InstructionRetrieval"],
+    )
diff --git a/mteb/tasks/Reranking/eng/HUMENews21InstructionReranking.py b/mteb/tasks/Reranking/eng/HUMENews21InstructionReranking.py
new file mode 100644
index 0000000000..b9598eb29a
--- /dev/null
+++ b/mteb/tasks/Reranking/eng/HUMENews21InstructionReranking.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskReranking import AbsTaskReranking
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class HUMENews21InstructionReranking(AbsTaskReranking):
+    metadata = TaskMetadata(
+        name="HUMENews21InstructionReranking",
+        description="Human evaluation subset of the News21 instruction retrieval dataset for reranking evaluation.",
+        reference="https://trec.nist.gov/data/news2021.html",
+        dataset={
+            "path": "mteb/mteb-human-news21-reranking",
+            "revision": "22208ecbb54618adb1592fd2ba7cdd92d643d9de",
+        },
+        type="Reranking",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="map",
+        date=("2021-01-01", "2021-12-31"),
+        domains=["News", "Written"],
+        task_subtypes=[],
+        license="not specified",
+        annotations_creators="human-annotated",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@inproceedings{soboroff2021trec,
+  author = {Soboroff, Ian and Macdonald, Craig and McCreadie, Richard},
+  booktitle = {TREC},
+  title = {TREC 2021 News Track Overview},
+  year = {2021},
+}
+""",
+        prompt="Given a query, rerank the documents by their relevance to the query",
+        adapted_from=["News21InstructionRetrieval"],
+    )
diff --git a/mteb/tasks/Reranking/eng/HUMERobust04InstructionReranking.py b/mteb/tasks/Reranking/eng/HUMERobust04InstructionReranking.py
new file mode 100644
index 0000000000..0db5182eb5
--- /dev/null
+++ b/mteb/tasks/Reranking/eng/HUMERobust04InstructionReranking.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskReranking import AbsTaskReranking
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class HUMERobust04InstructionReranking(AbsTaskReranking):
+    metadata = TaskMetadata(
+        name="HUMERobust04InstructionReranking",
+        description="Human evaluation subset of the Robust04 instruction retrieval dataset for reranking evaluation.",
+        reference="https://trec.nist.gov/data/robust/04.guidelines.html",
+        dataset={
+            "path": "mteb/mteb-human-robust04-reranking",
+            "revision": "77756407fed441d7be778b7464c34ccf4700af2e",
+        },
+        type="Reranking",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="map",
+        date=("2004-01-01", "2004-12-31"),
+        domains=["News", "Written"],
+        task_subtypes=[],
+        license="not specified",
+        annotations_creators="human-annotated",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@inproceedings{voorhees2005trec,
+  author = {Voorhees, Ellen M},
+  booktitle = {TREC},
+  title = {TREC 2004 Robust Retrieval Track Overview},
+  year = {2005},
+}
+""",
+        prompt="Given a query, rerank the documents by their relevance to the query",
+        adapted_from=["Robust04InstructionRetrieval"],
+    )
diff --git a/mteb/tasks/Reranking/multilingual/HUMEWikipediaRerankingMultilingual.py b/mteb/tasks/Reranking/multilingual/HUMEWikipediaRerankingMultilingual.py
new file mode 100644
index 0000000000..1ed08648dc
--- /dev/null
+++ b/mteb/tasks/Reranking/multilingual/HUMEWikipediaRerankingMultilingual.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskReranking import AbsTaskReranking
+from mteb.abstasks.MultilingualTask import MultilingualTask
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+_LANGUAGES = {
+    "en": ["eng-Latn"],
+    "da": ["dan-Latn"],
+    "no": ["nob-Latn"],
+}
+
+
+class HUMEWikipediaRerankingMultilingual(AbsTaskReranking, MultilingualTask):
+    fast_loading = True
+    metadata = TaskMetadata(
+        name="HUMEWikipediaRerankingMultilingual",
+        description="Human evaluation subset of the Wikipedia reranking dataset across multiple languages.",
+        reference="https://github.com/ellamind/wikipedia-2023-11-reranking-multilingual",
+        dataset={
+            "path": "mteb/mteb-human-wiki-reranking",
+            "revision": "bdbce1ba2d0e58e88d1d13c54a555154adc5c165",
+        },
+        type="Reranking",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=_LANGUAGES,
+        main_score="map",
+        date=("2023-01-01", "2023-12-31"),
+        domains=["Encyclopaedic", "Written"],
+        task_subtypes=[],
+        license="cc-by-sa-3.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@misc{wikipedia_reranking_2023,
+  author = {Ellamind},
+  title = {Wikipedia 2023-11 Reranking Multilingual Dataset},
+  url = {https://github.com/ellamind/wikipedia-2023-11-reranking-multilingual},
+  year = {2023},
+}
+""",
+        prompt="Given a query, rerank the Wikipedia passages by their relevance to the query",
+        adapted_from=["WikipediaRerankingMultilingual"],
+    )
diff --git a/mteb/tasks/STS/__init__.py b/mteb/tasks/STS/__init__.py
index 00ccf0d7ff..f7997185f0 100644
--- a/mteb/tasks/STS/__init__.py
+++ b/mteb/tasks/STS/__init__.py
@@ -2,6 +2,9 @@
 from .deu.GermanSTSBenchmarkSTS import *
 from .eng.BiossesSTS import *
+from .eng.HUMESICKR import *
+from .eng.HUMESTS12 import *
+from .eng.HUMESTSBenchmark import *
 from .eng.SickrSTS import *
 from .eng.STS12STS import *
 from .eng.STS13STS import *
@@ -17,6 +20,7 @@
 from .jpn.JSTS import *
 from .kor.KlueSTS import *
 from .kor.KorSTS import *
+from .multilingual.HUMESTS22 import *
 from .multilingual.IndicCrosslingualSTS import *
 from .multilingual.SemRel24STS import *
 from .multilingual.STS17CrosslingualSTS import *
diff --git a/mteb/tasks/STS/eng/HUMESICKR.py b/mteb/tasks/STS/eng/HUMESICKR.py
new file mode 100644
index 0000000000..e573332065
--- /dev/null
+++ b/mteb/tasks/STS/eng/HUMESICKR.py
@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskSTS import AbsTaskSTS
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class HUMESICKR(AbsTaskSTS):
+    metadata = TaskMetadata(
+        name="HUMESICK-R",
+        dataset={
+            "path": "mteb/mteb-human-sickr-sts",
+            "revision": "cf7172d4b730a743570f25291d04abeee086c824",
+        },
+        description="Human evaluation subset of the SICK-R semantic textual similarity dataset",
+        reference="https://aclanthology.org/L14-1314/",
+        type="STS",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="cosine_spearman",
+        date=None,
+        domains=["Web", "Written"],
+        task_subtypes=["Textual Entailment"],
+        license="cc-by-nc-sa-3.0",
+        annotations_creators="human-annotated",
+        dialect=None,
+        sample_creation=None,
+        bibtex_citation=r"""
+@inproceedings{marelli-etal-2014-sick,
+  abstract = {Shared and internationally recognized benchmarks are fundamental for the development of any computational system. We aim to help the research community working on compositional distributional semantic models (CDSMs) by providing SICK (Sentences Involving Compositional Knowledge), a large size English benchmark tailored for them. SICK consists of about 10,000 English sentence pairs that include many examples of the lexical, syntactic and semantic phenomena that CDSMs are expected to account for, but do not require dealing with other aspects of existing sentential data sets (idiomatic multiword expressions, named entities, telegraphic language) that are not within the scope of CDSMs. By means of crowdsourcing techniques, each pair was annotated for two crucial semantic tasks: relatedness in meaning (with a 5-point rating scale as gold score) and entailment relation between the two elements (with three possible gold labels: entailment, contradiction, and neutral). The SICK data set was used in SemEval-2014 Task 1, and it is freely available for research purposes.},
+  address = {Reykjavik, Iceland},
+  author = {Marelli, Marco and
+Menini, Stefano and
+Baroni, Marco and
+Bentivogli, Luisa and
+Bernardi, Raffaella and
+Zamparelli, Roberto},
+  booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)},
+  editor = {Calzolari, Nicoletta and
+Choukri, Khalid and
+Declerck, Thierry and
+Loftsson, Hrafn and
+Maegaard, Bente and
+Mariani, Joseph and
+Moreno, Asuncion and
+Odijk, Jan and
+Piperidis, Stelios},
+  month = may,
+  pages = {216--223},
+  publisher = {European Language Resources Association (ELRA)},
+  title = {A {SICK} cure for the evaluation of compositional distributional semantic models},
+  url = {http://www.lrec-conf.org/proceedings/lrec2014/pdf/363_Paper.pdf},
+  year = {2014},
+}
+""",
+        adapted_from=["SICK-R"],
+    )
+
+    @property
+    def metadata_dict(self) -> dict[str, str]:
+        metadata_dict = super().metadata_dict
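+        # SICK relatedness gold scores range from 0 to 5 (assumption: the STS
+        # evaluator min-max normalizes with these bounds before scoring).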
+        metadata_dict["min_score"] = 0
+        metadata_dict["max_score"] = 5
+        return metadata_dict
diff --git a/mteb/tasks/STS/eng/HUMESTS12.py b/mteb/tasks/STS/eng/HUMESTS12.py
new file mode 100644
index 0000000000..e00e25ec12
--- /dev/null
+++ b/mteb/tasks/STS/eng/HUMESTS12.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskSTS import AbsTaskSTS
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class HUMESTS12(AbsTaskSTS):
+    metadata = TaskMetadata(
+        name="HUMESTS12",
+        dataset={
+            "path": "mteb/mteb-human-sts12-sts",
+            "revision": "76cbf76792ec03cb1f76dc6ada05abcb23c82c0c",
+        },
+        description="Human evaluation subset of SemEval-2012 Task 6.",
+        reference="https://www.aclweb.org/anthology/S12-1051.pdf",
+        type="STS",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="cosine_spearman",
+        date=("2005-01-01", "2012-12-31"),
+        domains=["Encyclopaedic", "News", "Written"],
+        task_subtypes=[],
+        license="not specified",
+        annotations_creators="human-annotated",
+        dialect=[],
+        sample_creation="created",
+        bibtex_citation=r"""
+@inproceedings{10.5555/2387636.2387697,
+  abstract = {Semantic Textual Similarity (STS) measures the degree of semantic equivalence between two texts. This paper presents the results of the STS pilot task in Semeval. The training data contained 2000 sentence pairs from previously existing paraphrase datasets and machine translation evaluation resources. The test data also comprised 2000 sentence pairs for those datasets, plus two surprise datasets with 400 pairs from a different machine translation evaluation corpus and 750 pairs from a lexical resource mapping exercise. The similarity of pairs of sentences was rated on a 0-5 scale (low to high similarity) by human judges using Amazon Mechanical Turk, with high Pearson correlation scores, around 90\%. 35 teams participated in the task, submitting 88 runs. The best results scored a Pearson correlation >80\%, well above a simple lexical baseline that only scored a 31\% correlation. This pilot task opens an exciting way ahead, although there are still open issues, specially the evaluation metric.},
+  address = {USA},
+  author = {Agirre, Eneko and Diab, Mona and Cer, Daniel and Gonzalez-Agirre, Aitor},
+  booktitle = {Proceedings of the First Joint Conference on Lexical and Computational Semantics - Volume 1: Proceedings of the Main Conference and the Shared Task, and Volume 2: Proceedings of the Sixth International Workshop on Semantic Evaluation},
+  location = {Montr\'{e}al, Canada},
+  numpages = {9},
+  pages = {385–393},
+  publisher = {Association for Computational Linguistics},
+  series = {SemEval '12},
+  title = {SemEval-2012 task 6: a pilot on semantic textual similarity},
+  year = {2012},
+}
+""",
+        adapted_from=["STS12"],
+    )
+
+    @property
+    def metadata_dict(self) -> dict[str, str]:
+        metadata_dict = super().metadata_dict
+        metadata_dict["min_score"] = 0
+        metadata_dict["max_score"] = 5
+        return metadata_dict
diff --git a/mteb/tasks/STS/eng/HUMESTSBenchmark.py b/mteb/tasks/STS/eng/HUMESTSBenchmark.py
new file mode 100644
index 0000000000..b29d5c00ea
--- /dev/null
+++ b/mteb/tasks/STS/eng/HUMESTSBenchmark.py
@@ -0,0 +1,45 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskSTS import AbsTaskSTS
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class HUMESTSBenchmark(AbsTaskSTS):
+    metadata = TaskMetadata(
+        name="HUMESTSBenchmark",
+        dataset={
+            "path": "mteb/mteb-human-stsbenchmark-sts",
+            "revision": "cb05d5409f802e68d6ed39615ed67f7dc2235ac5",
+        },
+        description="Human evaluation subset of the Semantic Textual Similarity Benchmark (STSbenchmark) dataset.",
+        reference="https://github.com/PhilipMay/stsb-multi-mt/",
+        type="STS",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="cosine_spearman",
+        date=None,
+        domains=["Blog", "News", "Written"],
+        task_subtypes=[],
+        license="not specified",
+        annotations_creators="human-annotated",
+        dialect=[],
+        sample_creation="machine-translated and verified",
+        bibtex_citation=r"""
+@inproceedings{huggingface:dataset:stsb_multi_mt,
+  author = {Philip May},
+  title = {Machine translated multilingual STS benchmark dataset.},
+  url = {https://github.com/PhilipMay/stsb-multi-mt},
+  year = {2021},
+}
+""",
+        adapted_from=["STSBenchmark"],
+    )
+
+    @property
+    def metadata_dict(self) -> dict[str, str]:
+        metadata_dict = super().metadata_dict
+        metadata_dict["min_score"] = 0
+        metadata_dict["max_score"] = 5
+        return metadata_dict
diff --git a/mteb/tasks/STS/multilingual/HUMESTS22.py b/mteb/tasks/STS/multilingual/HUMESTS22.py
new file mode 100644
index 0000000000..6b7df853c4
--- /dev/null
+++ b/mteb/tasks/STS/multilingual/HUMESTS22.py
@@ -0,0 +1,76 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskSTS import AbsTaskSTS
+from mteb.abstasks.MultilingualTask import MultilingualTask
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+_LANGUAGES = {
["eng-Latn"], + "ar": ["ara-Arab"], + "fr": ["fra-Latn"], + "ru": ["rus-Cyrl"], +} + + +class HUMESTS22(AbsTaskSTS, MultilingualTask): + fast_loading = True + metadata = TaskMetadata( + name="HUMESTS22", + dataset={ + "path": "mteb/mteb-human-sts22-sts", + "revision": "ab40ed76c4283318b7b146aff31f8cdefd6ebae0", + }, + description="Human evaluation subset of SemEval 2022 Task 8: Multilingual News Article Similarity", + reference="https://competitions.codalab.org/competitions/33835", + type="STS", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=_LANGUAGES, + main_score="cosine_spearman", + date=("2020-01-01", "2020-06-11"), + domains=["News", "Written"], + task_subtypes=[], + license="not specified", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{chen-etal-2022-semeval, + address = {Seattle, United States}, + author = {Chen, Xi and +Zeynali, Ali and +Camargo, Chico and +Fl{\"o}ck, Fabian and +Gaffney, Devin and +Grabowicz, Przemyslaw and +Hale, Scott and +Jurgens, David and +Samory, Mattia}, + booktitle = {Proceedings of the 16th International Workshop on Semantic Evaluation (SemEval-2022)}, + doi = {10.18653/v1/2022.semeval-1.155}, + editor = {Emerson, Guy and +Schluter, Natalie and +Stanovsky, Gabriel and +Kumar, Ritesh and +Palmer, Alexis and +Schneider, Nathan and +Singh, Siddharth and +Ratan, Shyam}, + month = jul, + pages = {1094--1106}, + publisher = {Association for Computational Linguistics}, + title = {{S}em{E}val-2022 Task 8: Multilingual news article similarity}, + url = {https://aclanthology.org/2022.semeval-1.155}, + year = {2022}, +} +""", + adapted_from=["STS22"], + ) + + @property + def metadata_dict(self) -> dict[str, str]: + metadata_dict = super().metadata_dict + metadata_dict["min_score"] = 1 + metadata_dict["max_score"] = 4 + return metadata_dict diff --git a/tests/test_TaskMetadata.py b/tests/test_TaskMetadata.py index 22bf88e1df..c2e8757888 100644 --- a/tests/test_TaskMetadata.py +++ b/tests/test_TaskMetadata.py @@ -159,6 +159,8 @@ "STS15", "STS16", "STSBenchmark", + "HUMESTSBenchmark", + "HUMESICK-R", "FinParaSTS", "SICKFr", "KLUE-STS",