Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions mteb/abstasks/TaskMetadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
"Tumor detection",
"Duplicate Detection",
"Rendered semantic textual similarity",
"Intent classification",
]

TASK_DOMAIN = Literal[
Expand Down
93 changes: 68 additions & 25 deletions mteb/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,9 +226,12 @@
"RuBQRetrieval",
# STS
"RUParaPhraserSTS",
"RuSTSBenchmarkSTS",
"STS22",
],
)
+ get_tasks(
tasks=["RuSTSBenchmarkSTS"],
eval_splits=["test"],
),
description="A Russian version of the Massive Text Embedding Benchmark with a number of novel Russian tasks in all task categories of the original MTEB.",
reference="https://aclanthology.org/2023.eacl-main.148/",
Expand Down Expand Up @@ -1599,14 +1602,14 @@
document undestanding, visual STS, and CV-centric tasks.""",
reference="",
contacts=["gowitheflow-1998", "isaac-chung"],
citation="""@article{xiao2025mieb,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Samoed @KennethEnevoldsen this seems unrelated to this PR. I had previously updated this to match the MTEB paper's bibtex style. Would appreciate it if you could revert this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will fix it in a separate PR, then. I don't know why it was changed.

author = {Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff},
title = {MIEB: Massive Image Embedding Benchmark},
publisher = {arXiv},
journal={arXiv preprint arXiv:2504.10471},
year = {2025},
url = {https://arxiv.org/abs/2504.10471},
doi = {10.48550/ARXIV.2504.10471},
citation="""@misc{xiao2025miebmassiveimageembedding,
title={MIEB: Massive Image Embedding Benchmark},
author={Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff},
year={2025},
eprint={2504.10471},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2504.10471},
}""",
)

Expand All @@ -1632,14 +1635,14 @@
datasets + the multilingual parts of VisualSTS-b and VisualSTS-16.""",
reference="",
contacts=["gowitheflow-1998", "isaac-chung"],
citation="""@article{xiao2025mieb,
author = {Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff},
title = {MIEB: Massive Image Embedding Benchmark},
publisher = {arXiv},
journal={arXiv preprint arXiv:2504.10471},
year = {2025},
url = {https://arxiv.org/abs/2504.10471},
doi = {10.48550/ARXIV.2504.10471},
citation="""@misc{xiao2025miebmassiveimageembedding,
title={MIEB: Massive Image Embedding Benchmark},
author={Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff},
year={2025},
eprint={2504.10471},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2504.10471},
}""",
)

Expand Down Expand Up @@ -1714,14 +1717,14 @@
relative rank of models.""",
reference="",
contacts=["gowitheflow-1998", "isaac-chung"],
citation="""@article{xiao2025mieb,
author = {Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff},
title = {MIEB: Massive Image Embedding Benchmark},
publisher = {arXiv},
journal={arXiv preprint arXiv:2504.10471},
year = {2025},
url = {https://arxiv.org/abs/2504.10471},
doi = {10.48550/ARXIV.2504.10471},
citation="""@misc{xiao2025miebmassiveimageembedding,
title={MIEB: Massive Image Embedding Benchmark},
author={Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff},
year={2025},
eprint={2504.10471},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2504.10471},
}""",
)

Expand All @@ -1745,3 +1748,43 @@
}""",
contacts=["mehrzadshm"],
)

ENCODECHKA = Benchmark(
name="Encodechka",
tasks=MTEBTasks(
get_tasks(
tasks=[
# PI
"RUParaPhraserSTS",
# SA
"SentiRuEval2016",
# TI
"RuToxicOKMLCUPClassification",
# IA
"InappropriatenessClassificationv2",
# IC, ICX
"RuNLUIntentClassification",
]
)
+
# NLI
get_tasks(tasks=["XNLI"], eval_splits=["test"], languages=["rus-Cyrl"])
# STS
+ get_tasks(
tasks=["RuSTSBenchmarkSTS"],
eval_splits=["validation"],
languages=["rus-Cyrl"],
),
),
description="A benchmark for evaluating text embedding models on Russian data.",
reference="https://github.com/avidale/encodechka",
citation="""@misc{dale_encodechka,
author = "Dale, David",
title = "Russian rating of sentence encoders",
editor = "habr.com",
url = "https://habr.com/ru/articles/669674/",
month = {June},
year = {2022},
note = {[Online; posted 12-June-2022]},
}""",
)
3 changes: 3 additions & 0 deletions mteb/tasks/Classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,12 @@
from .rus.HeadlineClassification import *
from .rus.InappropriatenessClassification import *
from .rus.KinopoiskClassification import *
from .rus.ru_nlu_intent_classification import *
from .rus.ru_toixic_classification_okmlcup import *
from .rus.RuReviewsClassification import *
from .rus.RuSciBenchGRNTIClassification import *
from .rus.RuSciBenchOECDClassification import *
from .rus.senti_ru_eval import *
from .san.SanskritShlokasClassification import *
from .sin.SinhalaNewsClassification import *
from .sin.SinhalaNewsSourceClassification import *
Expand Down
54 changes: 54 additions & 0 deletions mteb/tasks/Classification/rus/InappropriatenessClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,57 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
)


class InappropriatenessClassificationv2(AbsTaskClassification):
    """Binary classification of inappropriate Russian messages.

    Second revision of ``InappropriatenessClassification`` defined above,
    backed by a different dataset snapshot
    (``mteb/InappropriatenessClassificationv2``). Inappropriateness here is
    topic-related reputational harm rather than plain toxicity (see the
    cited BSNLP 2021 paper).
    """

    metadata = TaskMetadata(
        name="InappropriatenessClassificationv2",
        dataset={
            "path": "mteb/InappropriatenessClassificationv2",
            "revision": "698cb161a90150ec46618f714cdd8606cf21a9eb",
        },
        description="Inappropriateness identification in the form of binary classification",
        reference="https://aclanthology.org/2021.bsnlp-1.4",
        type="Classification",
        category="t2t",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["rus-Cyrl"],
        main_score="accuracy",
        date=("2006-01-01", "2021-04-01"),
        domains=["Web", "Social", "Written"],
        task_subtypes=["Sentiment/Hate speech"],
        license="cc-by-nc-sa-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""@inproceedings{babakov-etal-2021-detecting,
    title = "Detecting Inappropriate Messages on Sensitive Topics that Could Harm a Company{'}s Reputation",
    author = "Babakov, Nikolay and
      Logacheva, Varvara and
      Kozlova, Olga and
      Semenov, Nikita and
      Panchenko, Alexander",
    editor = "Babych, Bogdan and
      Kanishcheva, Olga and
      Nakov, Preslav and
      Piskorski, Jakub and
      Pivovarova, Lidia and
      Starko, Vasyl and
      Steinberger, Josef and
      Yangarber, Roman and
      Marci{\'n}czuk, Micha{\l} and
      Pollak, Senja and
      P{\v{r}}ib{\'a}{\v{n}}, Pavel and
      Robnik-{\v{S}}ikonja, Marko",
    booktitle = "Proceedings of the 8th Workshop on Balto-Slavic Natural Language Processing",
    month = apr,
    year = "2021",
    address = "Kiyv, Ukraine",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.bsnlp-1.4",
    pages = "26--36",
    abstract = "Not all topics are equally {``}flammable{''} in terms of toxicity: a calm discussion of turtles or fishing less often fuels inappropriate toxic dialogues than a discussion of politics or sexual minorities. We define a set of sensitive topics that can yield inappropriate and toxic messages and describe the methodology of collecting and labelling a dataset for appropriateness. While toxicity in user-generated data is well-studied, we aim at defining a more fine-grained notion of inappropriateness. The core of inappropriateness is that it can harm the reputation of a speaker. This is different from toxicity in two respects: (i) inappropriateness is topic-related, and (ii) inappropriate message is not toxic but still unacceptable. We collect and release two datasets for Russian: a topic-labelled dataset and an appropriateness-labelled dataset. We also release pre-trained classification models trained on this data.",
}""",
        prompt="Classify the given message as either sensitive topic or not",
    )
50 changes: 50 additions & 0 deletions mteb/tasks/Classification/rus/ru_nlu_intent_classification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.MultilingualTask import MultilingualTask
from mteb.abstasks.TaskMetadata import TaskMetadata


class RuNLUIntentClassification(AbsTaskClassification, MultilingualTask):
    """Intent classification on the Russian NLU benchmark data.

    Multilingual task with two evaluation subsets declared in
    ``eval_langs``: ``rus-eng`` (Russian in both Cyrillic and Latin
    script — presumably transliterated/code-switched; verify against the
    dataset) and ``rus`` (Cyrillic only). Derived from the NLU benchmark
    of Liu et al. (2019), arXiv:1903.05566.
    """

    metadata = TaskMetadata(
        name="RuNLUIntentClassification",
        dataset={
            "path": "mteb/RuNLUIntentClassification",
            "revision": "424d0f767aaa5c411e3a529eec04658e5726a39e",
        },
        description=(
            "Contains natural language data for human-robot interaction in home domain which we collected and"
            " annotated for evaluating NLU Services/platforms."
        ),
        reference="https://arxiv.org/abs/1903.05566",
        type="Classification",
        category="t2t",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs={
            "rus-eng": [
                "rus-Cyrl",
                "rus-Latn",
            ],
            "rus": [
                "rus-Cyrl",
            ],
        },
        main_score="accuracy",
        date=("2019-03-26", "2019-03-26"),
        domains=[],
        task_subtypes=["Intent classification"],
        license="cc-by-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""@misc{liu2019benchmarkingnaturallanguageunderstanding,
title={Benchmarking Natural Language Understanding Services for building Conversational Agents},
author={Xingkun Liu and Arash Eshghi and Pawel Swietojanski and Verena Rieser},
year={2019},
eprint={1903.05566},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/1903.05566},
}""",
    )
33 changes: 33 additions & 0 deletions mteb/tasks/Classification/rus/ru_toixic_classification_okmlcup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class RuToxicOKMLCUPClassification(AbsTaskClassification):
    """Binary toxicity classification of Russian comments from the
    Odnoklassniki social network (OK ML Cup 2020 competition data).
    """

    metadata = TaskMetadata(
        name="RuToxicOKMLCUPClassification",
        dataset={
            "path": "mteb/RuToxicOKMLCUPClassification",
            "revision": "13722b7320ef4b6a471f9e8b379f3f49167d0517",
        },
        description="On the Odnoklassniki social network, users post a huge number of comments of various directions and nature every day.",
        reference="https://cups.online/ru/contests/okmlcup2020",
        type="Classification",
        category="t2t",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["rus-Cyrl"],
        main_score="accuracy",
        date=("2015-01-01", "2020-01-01"),
        domains=[],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""""",
    )

    def dataset_transform(self):
        # The source dataset stores the binary target in a "toxic" column;
        # the evaluator expects it under "label".
        self.dataset = self.dataset.rename_column("toxic", "label")
39 changes: 39 additions & 0 deletions mteb/tasks/Classification/rus/senti_ru_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class SentiRuEval2016Classification(AbsTaskClassification):
    """Sentiment classification of Russian tweets from SentiRuEval-2016
    (reputation monitoring of banks and telecom companies on Twitter).
    """

    metadata = TaskMetadata(
        name="SentiRuEval2016",
        dataset={
            "path": "mteb/SentiRuEval2016",
            "revision": "8507eab0deef37f040a750afbcb4dba7a7de9c16",
        },
        description="Russian sentiment analysis evaluation SentiRuEval-2016 devoted to reputation monitoring of banks "
        "and telecom companies in Twitter. We describe the task, data, the procedure of data preparation, "
        "and participants’ results.",
        reference="https://github.com/mokoron/sentirueval",
        type="Classification",
        category="t2t",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["rus-Cyrl"],
        main_score="accuracy",
        date=("2015-01-01", "2016-01-01"),
        domains=[],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""@inproceedings{loukachevitch2016sentirueval,
title={SentiRuEval-2016: overcoming time gap and data sparsity in tweet sentiment analysis},
author={Loukachevitch, NV and Rubtsova, Yu V},
booktitle={Computational Linguistics and Intellectual Technologies},
pages={416--426},
year={2016}
}
""",
    )
1 change: 1 addition & 0 deletions mteb/tasks/MultiLabelClassification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@
from .multilingual.MultiEURLEXMultilabelClassification import *
from .por.BrazilianToxicTweetsClassification import *
from .rus.CEDRClassification import *
from .rus.ru_toixic_multilabelclassification_okmlcup import *
from .rus.SensitiveTopicsClassification import *
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskMultilabelClassification import (
AbsTaskMultilabelClassification,
)
from mteb.abstasks.TaskMetadata import TaskMetadata


class RuToxicOKMLCUPMultilabelClassification(AbsTaskMultilabelClassification):
    """Multi-label toxicity classification of Russian comments from the
    Odnoklassniki social network (OK ML Cup 2020 competition data).

    Uses the same dataset and revision as the single-label
    ``RuToxicOKMLCUPClassification`` task, but evaluates the multi-label
    ``labels`` column instead of the binary ``toxic`` flag.
    """

    metadata = TaskMetadata(
        name="RuToxicOKMLCUPMultilabelClassification",
        dataset={
            "path": "mteb/RuToxicOKMLCUPClassification",
            "revision": "13722b7320ef4b6a471f9e8b379f3f49167d0517",
        },
        description="On the Odnoklassniki social network, users post a huge number of comments of various directions and nature every day.",
        reference="https://cups.online/ru/contests/okmlcup2020",
        # Fixed: this task subclasses AbsTaskMultilabelClassification, so its
        # declared type must be "MultilabelClassification" (as in sibling tasks
        # such as CEDRClassification), not plain "Classification".
        type="MultilabelClassification",
        category="t2t",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["rus-Cyrl"],
        main_score="accuracy",
        # NOTE(review): the single-label task over the same dataset revision
        # uses ("2015-01-01", "2020-01-01") — confirm which end date is correct.
        date=("2015-01-01", "2024-01-01"),
        domains=[],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""""",
    )

    def dataset_transform(self):
        # The source dataset stores multi-label targets in a "labels" column;
        # the evaluator expects them under "label".
        self.dataset = self.dataset.rename_column("labels", "label")
2 changes: 1 addition & 1 deletion mteb/tasks/STS/rus/RuSTSBenchmarkSTS.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class RuSTSBenchmarkSTS(AbsTaskSTS):
type="STS",
category="s2s",
modalities=["text"],
eval_splits=["test"],
eval_splits=["test", "validation"],
eval_langs=["rus-Cyrl"],
main_score="cosine_spearman",
date=("2012-01-01", "2018-01-01"),
Expand Down