From c3f617604680235668d61ba5905908dca7aee91e Mon Sep 17 00:00:00 2001 From: Oliver Date: Thu, 3 Oct 2024 14:29:02 +0200 Subject: [PATCH 1/4] Add Slovak Hate Speech and Offensive Language Dataset This commit introduces the Slovak Hate Speech and Offensive Language Database to MTEB. The dataset includes posts from a social network, annotated by humans for hate speech and offensive content. Additionally, the corresponding task has been added to the tasks.md table to reflect this update. --- docs/tasks.md | 1 + .../slk/SlovakHateSpeechClassification.py | 33 +++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py diff --git a/docs/tasks.md b/docs/tasks.md index cb28007896..7675a01c5e 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -485,6 +485,7 @@ The following tables give you an overview of the tasks in MTEB. | [SinhalaNewsClassification](https://huggingface.co/datasets/NLPC-UOM/Sinhala-News-Category-classification) (Nisansa de Silva, 2015) | ['sin'] | Classification | s2s | [News, Written] | {'train': 3327} | {'train': 148.04} | | [SinhalaNewsSourceClassification](https://huggingface.co/datasets/NLPC-UOM/Sinhala-News-Source-classification) (Dhananjaya et al., 2022) | ['sin'] | Classification | s2s | [News, Written] | {'train': 24094} | {'train': 56.08} | | [SiswatiNewsClassification](https://huggingface.co/datasets/dsfsi/za-isizulu-siswati-news) (Madodonga et al., 2023) | ['ssw'] | Classification | s2s | [News, Written] | {'train': 80} | {'train': 354.2} | +| [SlovakHateSpeechClassification](https://huggingface.co/datasets/TUKE-KEMT/hate_speech_slovak) | ['slk'] | Classification | s2s | [Social, Written] | {'test': 1319} | {'test': 92.71} | | [SlovakMovieReviewSentimentClassification](https://arxiv.org/pdf/2304.01922) ({ {S, 2023) | ['svk'] | Classification | s2s | [Reviews, Written] | {'test': 2048} | {'test': 366.17} | | 
[SlovakSumRetrieval](https://huggingface.co/datasets/NaiveNeuron/slovaksum) | ['slk'] | Retrieval | s2s | [News, Social, Web, Written] | {'test': 600} | {'test': {'average_document_length': 2156.445, 'average_query_length': 143.59833333333333, 'num_documents': 600, 'num_queries': 600, 'average_relevant_docs_per_query': 1.0}} | | [SouthAfricanLangClassification](https://www.kaggle.com/competitions/south-african-language-identification/) (ExploreAI Academy et al., 2022) | ['afr', 'eng', 'nbl', 'nso', 'sot', 'ssw', 'tsn', 'tso', 'ven', 'xho', 'zul'] | Classification | s2s | [Web, Non-fiction, Written] | {'test': 2048} | {'test': 247.49} | diff --git a/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py b/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py new file mode 100644 index 0000000000..e9784b00cf --- /dev/null +++ b/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SlovakHateSpeechClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="SlovakHateSpeechClassification", + description="The dataset contains posts from a social network with human annotations for hateful or offensive language in Slovak.", + reference="https://huggingface.co/datasets/TUKE-KEMT/hate_speech_slovak", + dataset={ + "path": "TUKE-KEMT/hate_speech_slovak", + "revision": "f9301b9937128c9c0b636fa6da203aeb046479f4", + }, + type="Classification", + category="s2s", + modalities=["text"], + date=None, + eval_splits=["test"], + eval_langs=["slk-Latn"], + main_score="accuracy", + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-sa-4.0", + annotations_creators="human-annotated", + dialect=None, + sample_creation="found", + descriptive_stats={ + "n_samples": {"test": 1319}, + "avg_character_length": {"test": 92.71}, 
+ }, + ) From 61c938ee4c24af2770903b62b5bc373d2e859aaa Mon Sep 17 00:00:00 2001 From: Oliver Date: Tue, 15 Oct 2024 16:30:35 +0200 Subject: [PATCH 2/4] Add Slovak Hate Speech and Offensive Language Dataset - Removed the SlovakHateSpeechClassification row from docs/tasks.md. - Changed `dialect` from None to [] in SlovakHateSpeechClassification.py as per review suggestions. --- docs/tasks.md | 1 - mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/tasks.md b/docs/tasks.md index 7675a01c5e..cb28007896 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -485,7 +485,6 @@ The following tables give you an overview of the tasks in MTEB. | [SinhalaNewsClassification](https://huggingface.co/datasets/NLPC-UOM/Sinhala-News-Category-classification) (Nisansa de Silva, 2015) | ['sin'] | Classification | s2s | [News, Written] | {'train': 3327} | {'train': 148.04} | | [SinhalaNewsSourceClassification](https://huggingface.co/datasets/NLPC-UOM/Sinhala-News-Source-classification) (Dhananjaya et al., 2022) | ['sin'] | Classification | s2s | [News, Written] | {'train': 24094} | {'train': 56.08} | | [SiswatiNewsClassification](https://huggingface.co/datasets/dsfsi/za-isizulu-siswati-news) (Madodonga et al., 2023) | ['ssw'] | Classification | s2s | [News, Written] | {'train': 80} | {'train': 354.2} | -| [SlovakHateSpeechClassification](https://huggingface.co/datasets/TUKE-KEMT/hate_speech_slovak) | ['slk'] | Classification | s2s | [Social, Written] | {'test': 1319} | {'test': 92.71} | | [SlovakMovieReviewSentimentClassification](https://arxiv.org/pdf/2304.01922) ({ {S, 2023) | ['svk'] | Classification | s2s | [Reviews, Written] | {'test': 2048} | {'test': 366.17} | | [SlovakSumRetrieval](https://huggingface.co/datasets/NaiveNeuron/slovaksum) | ['slk'] | Retrieval | s2s | [News, Social, Web, Written] | {'test': 600} | {'test': {'average_document_length': 2156.445, 
'average_query_length': 143.59833333333333, 'num_documents': 600, 'num_queries': 600, 'average_relevant_docs_per_query': 1.0}} | | [SouthAfricanLangClassification](https://www.kaggle.com/competitions/south-african-language-identification/) (ExploreAI Academy et al., 2022) | ['afr', 'eng', 'nbl', 'nso', 'sot', 'ssw', 'tsn', 'tso', 'ven', 'xho', 'zul'] | Classification | s2s | [Web, Non-fiction, Written] | {'test': 2048} | {'test': 247.49} | diff --git a/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py b/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py index e9784b00cf..b35c5634d8 100644 --- a/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py +++ b/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py @@ -24,7 +24,7 @@ class SlovakHateSpeechClassification(AbsTaskClassification): task_subtypes=["Sentiment/Hate speech"], license="cc-by-sa-4.0", annotations_creators="human-annotated", - dialect=None, + dialect=[], sample_creation="found", descriptive_stats={ "n_samples": {"test": 1319}, From edb4e4519dd157f0c73446295ff60e8dd29d6670 Mon Sep 17 00:00:00 2001 From: Oliver Date: Tue, 15 Oct 2024 16:42:08 +0200 Subject: [PATCH 3/4] Apply requested review changes - Updated __init__.py to import the new SlovakHateSpeechClassification task. - Set the `date` range and added an empty `bibtex_citation` in SlovakHateSpeechClassification.py as per review suggestions. 
--- mteb/tasks/Classification/__init__.py | 1 + .../Classification/slk/SlovakHateSpeechClassification.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py index d8f87f8ea9..3e80ae2181 100644 --- a/mteb/tasks/Classification/__init__.py +++ b/mteb/tasks/Classification/__init__.py @@ -118,6 +118,7 @@ from .sin.SinhalaNewsClassification import * from .sin.SinhalaNewsSourceClassification import * from .slk.CSFDSKMovieReviewSentimentClassification import * +from .slk.SlovakHateSpeechClassification import * from .slv.FrenkSlClassification import * from .spa.SpanishNewsClassification import * from .spa.SpanishSentimentClassification import * diff --git a/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py b/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py index b35c5634d8..f81a19122a 100644 --- a/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py +++ b/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py @@ -16,7 +16,7 @@ class SlovakHateSpeechClassification(AbsTaskClassification): type="Classification", category="s2s", modalities=["text"], - date=None, + date=("2024-05-25", "2024-06-06"), eval_splits=["test"], eval_langs=["slk-Latn"], main_score="accuracy", @@ -26,8 +26,9 @@ class SlovakHateSpeechClassification(AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", + bibtex_citation="", descriptive_stats={ "n_samples": {"test": 1319}, "avg_character_length": {"test": 92.71}, }, - ) + ) \ No newline at end of file From 83af71061b6af62e48d0558aa66215f7497191e1 Mon Sep 17 00:00:00 2001 From: Oliver Date: Thu, 24 Oct 2024 09:57:45 +0200 Subject: [PATCH 4/4] resolve linting issues by running `make lint` --- mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py b/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py index f81a19122a..ad1d29dcf7 100644 --- a/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py +++ b/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py @@ -31,4 +31,4 @@ class SlovakHateSpeechClassification(AbsTaskClassification): "n_samples": {"test": 1319}, "avg_character_length": {"test": 92.71}, }, - ) \ No newline at end of file + )