Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions mteb/abstasks/TaskMetadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
"Tumor detection",
"Duplicate Detection",
"Rendered semantic textual similarity",
"Intent classification",
]

TASK_DOMAIN = Literal[
Expand Down
93 changes: 68 additions & 25 deletions mteb/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,9 +226,12 @@
"RuBQRetrieval",
# STS
"RUParaPhraserSTS",
"RuSTSBenchmarkSTS",
"STS22",
],
)
+ get_tasks(
tasks=["RuSTSBenchmarkSTS"],
eval_splits=["test"],
),
description="A Russian version of the Massive Text Embedding Benchmark with a number of novel Russian tasks in all task categories of the original MTEB.",
reference="https://aclanthology.org/2023.eacl-main.148/",
Expand Down Expand Up @@ -1599,14 +1602,14 @@
document undestanding, visual STS, and CV-centric tasks.""",
reference="",
contacts=["gowitheflow-1998", "isaac-chung"],
citation="""@article{xiao2025mieb,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Samoed @KennethEnevoldsen this seems unrelated to this PR. I had previously updated this to match the MTEB paper's bibtex style. Would appreciate it if you could revert this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will fix it in a separate PR, then. I don't know why it was changed.

author = {Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff},
title = {MIEB: Massive Image Embedding Benchmark},
publisher = {arXiv},
journal={arXiv preprint arXiv:2504.10471},
year = {2025},
url = {https://arxiv.org/abs/2504.10471},
doi = {10.48550/ARXIV.2504.10471},
citation="""@misc{xiao2025miebmassiveimageembedding,
title={MIEB: Massive Image Embedding Benchmark},
author={Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff},
year={2025},
eprint={2504.10471},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2504.10471},
}""",
)

Expand All @@ -1632,14 +1635,14 @@
datasets + the multilingual parts of VisualSTS-b and VisualSTS-16.""",
reference="",
contacts=["gowitheflow-1998", "isaac-chung"],
citation="""@article{xiao2025mieb,
author = {Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff},
title = {MIEB: Massive Image Embedding Benchmark},
publisher = {arXiv},
journal={arXiv preprint arXiv:2504.10471},
year = {2025},
url = {https://arxiv.org/abs/2504.10471},
doi = {10.48550/ARXIV.2504.10471},
citation="""@misc{xiao2025miebmassiveimageembedding,
title={MIEB: Massive Image Embedding Benchmark},
author={Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff},
year={2025},
eprint={2504.10471},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2504.10471},
}""",
)

Expand Down Expand Up @@ -1714,14 +1717,14 @@
relative rank of models.""",
reference="",
contacts=["gowitheflow-1998", "isaac-chung"],
citation="""@article{xiao2025mieb,
author = {Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff},
title = {MIEB: Massive Image Embedding Benchmark},
publisher = {arXiv},
journal={arXiv preprint arXiv:2504.10471},
year = {2025},
url = {https://arxiv.org/abs/2504.10471},
doi = {10.48550/ARXIV.2504.10471},
citation="""@misc{xiao2025miebmassiveimageembedding,
title={MIEB: Massive Image Embedding Benchmark},
author={Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff},
year={2025},
eprint={2504.10471},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2504.10471},
}""",
)

Expand All @@ -1745,3 +1748,43 @@
}""",
contacts=["mehrzadshm"],
)

ENCODECHKA = Benchmark(
name="Encodechka",
tasks=MTEBTasks(
get_tasks(
tasks=[
# PI
"RUParaPhraserSTS",
# SA
"SentiRuEval2016",
# TI
"RuToxicOKMLCUPClassification",
# IA
"InappropriatenessClassificationv2",
# IC, ICX
"RuNLUIntentClassification",
]
)
+
# NLI
get_tasks(tasks=["XNLI"], eval_splits=["test"], languages=["rus-Cyrl"])
# STS
+ get_tasks(
tasks=["RuSTSBenchmarkSTS"],
eval_splits=["validation"],
languages=["rus-Cyrl"],
),
),
description="A benchmark for evaluating text embedding models on Russian data.",
reference="https://github.com/avidale/encodechka",
citation="""@misc{dale_encodechka,
author = "Dale, David",
title = "Russian rating of sentence encoders",
editor = "habr.com",
url = "https://habr.com/ru/articles/669674/",
month = {June},
year = {2022},
note = {[Online; posted 12-June-2022]},
}""",
)
3 changes: 3 additions & 0 deletions mteb/tasks/Classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,12 @@
from .rus.HeadlineClassification import *
from .rus.InappropriatenessClassification import *
from .rus.KinopoiskClassification import *
from .rus.ru_nlu_intent_classification import *
from .rus.ru_toixic_classification_okmlcup import *
from .rus.RuReviewsClassification import *
from .rus.RuSciBenchGRNTIClassification import *
from .rus.RuSciBenchOECDClassification import *
from .rus.senti_ru_eval import *
from .san.SanskritShlokasClassification import *
from .sin.SinhalaNewsClassification import *
from .sin.SinhalaNewsSourceClassification import *
Expand Down
54 changes: 54 additions & 0 deletions mteb/tasks/Classification/rus/InappropriatenessClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,57 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
)


class InappropriatenessClassificationv2(AbsTaskClassification):
    """Binary classification of inappropriate Russian messages.

    Second revision of ``InappropriatenessClassification`` defined above,
    backed by a different dataset snapshot
    (``mteb/InappropriatenessClassificationv2``). Inappropriateness here is
    topic-related reputational harm rather than plain toxicity (see the
    cited BSNLP 2021 paper).
    """

    metadata = TaskMetadata(
        name="InappropriatenessClassificationv2",
        dataset={
            "path": "mteb/InappropriatenessClassificationv2",
            "revision": "698cb161a90150ec46618f714cdd8606cf21a9eb",
        },
        description="Inappropriateness identification in the form of binary classification",
        reference="https://aclanthology.org/2021.bsnlp-1.4",
        type="Classification",
        category="t2t",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["rus-Cyrl"],
        main_score="accuracy",
        date=("2006-01-01", "2021-04-01"),
        domains=["Web", "Social", "Written"],
        task_subtypes=["Sentiment/Hate speech"],
        license="cc-by-nc-sa-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""@inproceedings{babakov-etal-2021-detecting,
    title = "Detecting Inappropriate Messages on Sensitive Topics that Could Harm a Company{'}s Reputation",
    author = "Babakov, Nikolay and
      Logacheva, Varvara and
      Kozlova, Olga and
      Semenov, Nikita and
      Panchenko, Alexander",
    editor = "Babych, Bogdan and
      Kanishcheva, Olga and
      Nakov, Preslav and
      Piskorski, Jakub and
      Pivovarova, Lidia and
      Starko, Vasyl and
      Steinberger, Josef and
      Yangarber, Roman and
      Marci{\'n}czuk, Micha{\l} and
      Pollak, Senja and
      P{\v{r}}ib{\'a}{\v{n}}, Pavel and
      Robnik-{\v{S}}ikonja, Marko",
    booktitle = "Proceedings of the 8th Workshop on Balto-Slavic Natural Language Processing",
    month = apr,
    year = "2021",
    address = "Kiyv, Ukraine",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.bsnlp-1.4",
    pages = "26--36",
    abstract = "Not all topics are equally {``}flammable{''} in terms of toxicity: a calm discussion of turtles or fishing less often fuels inappropriate toxic dialogues than a discussion of politics or sexual minorities. We define a set of sensitive topics that can yield inappropriate and toxic messages and describe the methodology of collecting and labelling a dataset for appropriateness. While toxicity in user-generated data is well-studied, we aim at defining a more fine-grained notion of inappropriateness. The core of inappropriateness is that it can harm the reputation of a speaker. This is different from toxicity in two respects: (i) inappropriateness is topic-related, and (ii) inappropriate message is not toxic but still unacceptable. We collect and release two datasets for Russian: a topic-labelled dataset and an appropriateness-labelled dataset. We also release pre-trained classification models trained on this data.",
}""",
        prompt="Classify the given message as either sensitive topic or not",
    )
50 changes: 50 additions & 0 deletions mteb/tasks/Classification/rus/ru_nlu_intent_classification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.MultilingualTask import MultilingualTask
from mteb.abstasks.TaskMetadata import TaskMetadata


class RuNLUIntentClassification(AbsTaskClassification, MultilingualTask):
    """Intent classification on the Russian NLU benchmark data.

    Multilingual task with two evaluation subsets declared in
    ``eval_langs``: ``rus-eng`` (Russian in both Cyrillic and Latin
    script — presumably transliterated/code-switched; verify against the
    dataset) and ``rus`` (Cyrillic only). Derived from the NLU benchmark
    of Liu et al. (2019), arXiv:1903.05566.
    """

    metadata = TaskMetadata(
        name="RuNLUIntentClassification",
        dataset={
            "path": "mteb/RuNLUIntentClassification",
            "revision": "424d0f767aaa5c411e3a529eec04658e5726a39e",
        },
        description=(
            "Contains natural language data for human-robot interaction in home domain which we collected and"
            " annotated for evaluating NLU Services/platforms."
        ),
        reference="https://arxiv.org/abs/1903.05566",
        type="Classification",
        category="t2t",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs={
            "rus-eng": [
                "rus-Cyrl",
                "rus-Latn",
            ],
            "rus": [
                "rus-Cyrl",
            ],
        },
        main_score="accuracy",
        date=("2019-03-26", "2019-03-26"),
        domains=[],
        task_subtypes=["Intent classification"],
        license="cc-by-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""@misc{liu2019benchmarkingnaturallanguageunderstanding,
title={Benchmarking Natural Language Understanding Services for building Conversational Agents},
author={Xingkun Liu and Arash Eshghi and Pawel Swietojanski and Verena Rieser},
year={2019},
eprint={1903.05566},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/1903.05566},
}""",
    )
33 changes: 33 additions & 0 deletions mteb/tasks/Classification/rus/ru_toixic_classification_okmlcup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class RuToxicOKMLCUPClassification(AbsTaskClassification):
    """Binary toxicity classification of Russian comments from the
    Odnoklassniki social network (OK ML Cup 2020 competition data).
    """

    metadata = TaskMetadata(
        name="RuToxicOKMLCUPClassification",
        dataset={
            "path": "mteb/RuToxicOKMLCUPClassification",
            "revision": "13722b7320ef4b6a471f9e8b379f3f49167d0517",
        },
        description="On the Odnoklassniki social network, users post a huge number of comments of various directions and nature every day.",
        reference="https://cups.online/ru/contests/okmlcup2020",
        type="Classification",
        category="t2t",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["rus-Cyrl"],
        main_score="accuracy",
        date=("2015-01-01", "2020-01-01"),
        domains=[],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""""",
    )

    def dataset_transform(self):
        # The source dataset stores the binary target in a "toxic" column;
        # the evaluator expects it under "label".
        self.dataset = self.dataset.rename_column("toxic", "label")
39 changes: 39 additions & 0 deletions mteb/tasks/Classification/rus/senti_ru_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class SentiRuEval2016Classification(AbsTaskClassification):
    """Sentiment classification of Russian tweets from SentiRuEval-2016
    (reputation monitoring of banks and telecom companies on Twitter).
    """

    metadata = TaskMetadata(
        name="SentiRuEval2016",
        dataset={
            "path": "mteb/SentiRuEval2016",
            "revision": "8507eab0deef37f040a750afbcb4dba7a7de9c16",
        },
        description="Russian sentiment analysis evaluation SentiRuEval-2016 devoted to reputation monitoring of banks "
        "and telecom companies in Twitter. We describe the task, data, the procedure of data preparation, "
        "and participants’ results.",
        reference="https://github.com/mokoron/sentirueval",
        type="Classification",
        category="t2t",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["rus-Cyrl"],
        main_score="accuracy",
        date=("2015-01-01", "2016-01-01"),
        domains=[],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""@inproceedings{loukachevitch2016sentirueval,
title={SentiRuEval-2016: overcoming time gap and data sparsity in tweet sentiment analysis},
author={Loukachevitch, NV and Rubtsova, Yu V},
booktitle={Computational Linguistics and Intellectual Technologies},
pages={416--426},
year={2016}
}
""",
    )
1 change: 1 addition & 0 deletions mteb/tasks/MultiLabelClassification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@
from .multilingual.MultiEURLEXMultilabelClassification import *
from .por.BrazilianToxicTweetsClassification import *
from .rus.CEDRClassification import *
from .rus.ru_toixic_multilabelclassification_okmlcup import *
from .rus.SensitiveTopicsClassification import *
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskMultilabelClassification import (
AbsTaskMultilabelClassification,
)
from mteb.abstasks.TaskMetadata import TaskMetadata


class RuToxicOKMLCUPMultilabelClassification(AbsTaskMultilabelClassification):
    """Multi-label toxicity classification of Russian comments from the
    Odnoklassniki social network (OK ML Cup 2020 competition data).

    Uses the same dataset and revision as the single-label
    ``RuToxicOKMLCUPClassification`` task, but evaluates the multi-label
    ``labels`` column instead of the binary ``toxic`` flag.
    """

    metadata = TaskMetadata(
        name="RuToxicOKMLCUPMultilabelClassification",
        dataset={
            "path": "mteb/RuToxicOKMLCUPClassification",
            "revision": "13722b7320ef4b6a471f9e8b379f3f49167d0517",
        },
        description="On the Odnoklassniki social network, users post a huge number of comments of various directions and nature every day.",
        reference="https://cups.online/ru/contests/okmlcup2020",
        # Fixed: this task subclasses AbsTaskMultilabelClassification, so its
        # declared type must be "MultilabelClassification" (as in sibling tasks
        # such as CEDRClassification), not plain "Classification".
        type="MultilabelClassification",
        category="t2t",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["rus-Cyrl"],
        main_score="accuracy",
        # NOTE(review): the single-label task over the same dataset revision
        # uses ("2015-01-01", "2020-01-01") — confirm which end date is correct.
        date=("2015-01-01", "2024-01-01"),
        domains=[],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""""",
    )

    def dataset_transform(self):
        # The source dataset stores multi-label targets in a "labels" column;
        # the evaluator expects them under "label".
        self.dataset = self.dataset.rename_column("labels", "label")
2 changes: 1 addition & 1 deletion mteb/tasks/STS/rus/RuSTSBenchmarkSTS.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class RuSTSBenchmarkSTS(AbsTaskSTS):
type="STS",
category="s2s",
modalities=["text"],
eval_splits=["test"],
eval_splits=["test", "validation"],
eval_langs=["rus-Cyrl"],
main_score="cosine_spearman",
date=("2012-01-01", "2018-01-01"),
Expand Down