diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py
index 1d1a097bd0..dfbe1762b4 100644
--- a/mteb/abstasks/TaskMetadata.py
+++ b/mteb/abstasks/TaskMetadata.py
@@ -55,6 +55,7 @@
     "Duplicate Detection",
     "Environment Sound Classification",
     "Gunshot Audio Classification",
     "Instrument Source Classification",
+    "Keyword Spotting",
     "Music Genre Classification",
     "Music Instrument Recognition",
diff --git a/mteb/tasks/Audio/AudioZeroshotClassification/__init__.py b/mteb/tasks/Audio/AudioZeroshotClassification/__init__.py
index 4c8083cb5e..6bca670007 100644
--- a/mteb/tasks/Audio/AudioZeroshotClassification/__init__.py
+++ b/mteb/tasks/Audio/AudioZeroshotClassification/__init__.py
@@ -2,4 +2,5 @@
 
 from .eng.ESC50 import *
 from .eng.Ravdess import *
+from .eng.SpeechCommands import *
 from .eng.UrbanSound8k import *
diff --git a/mteb/tasks/Audio/AudioZeroshotClassification/eng/ESC50.py b/mteb/tasks/Audio/AudioZeroshotClassification/eng/ESC50.py
index 2751bd8639..2c6baf8502 100644
--- a/mteb/tasks/Audio/AudioZeroshotClassification/eng/ESC50.py
+++ b/mteb/tasks/Audio/AudioZeroshotClassification/eng/ESC50.py
@@ -16,7 +16,7 @@ class ESC50ZeroshotClassification(AbsTaskAudioZeroshotClassification):
             "revision": "e3e2a63ffff66b9a9735524551e3818e96af03ee",
         },
         type="AudioZeroshotClassification",
-        category="a2a",
+        category="a2t",
         eval_splits=["train"],
         eval_langs=["eng-Latn"],
         main_score="accuracy",
diff --git a/mteb/tasks/Audio/AudioZeroshotClassification/eng/SpeechCommands.py b/mteb/tasks/Audio/AudioZeroshotClassification/eng/SpeechCommands.py
new file mode 100644
index 0000000000..41f1400983
--- /dev/null
+++ b/mteb/tasks/Audio/AudioZeroshotClassification/eng/SpeechCommands.py
@@ -0,0 +1,146 @@
+from __future__ import annotations
+
+from mteb.abstasks.Audio.AbsTaskAudioZeroshotClassification import (
+    AbsTaskAudioZeroshotClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class SpeechCommandsZeroshotClassificationv01(AbsTaskAudioZeroshotClassification):
+    metadata = TaskMetadata(
+        name="SpeechCommandsZeroshotv0.01",
+        description="Sound Classification/Keyword Spotting Dataset. This is a set of one-second audio clips containing a single spoken English word or background noise. These words are from a small set of commands such as 'yes', 'no', and 'stop' spoken by various speakers. With a total of 10 labels/commands for keyword spotting and a total of 30 labels for other auxiliary tasks",
+        reference="https://huggingface.co/datasets/google/speech_commands",
+        dataset={
+            "path": "google/speech_commands",
+            "name": "v0.01",
+            "revision": "57ba463ab37e1e7845e0626539a6f6d0fcfbe64a",
+            "trust_remote_code": True,
+        },
+        type="AudioZeroshotClassification",
+        category="a2t",
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2018-07-07", "2018-07-13"),
+        domains=["Spoken"],
+        task_subtypes=["Keyword Spotting"],
+        license="cc-by-4.0",  # license stated on the dataset card
+        annotations_creators="human-annotated",
+        dialect=[],
+        modalities=["audio"],
+        sample_creation="found",
+        bibtex_citation=r"""
+@article{DBLP:journals/corr/abs-1804-03209,
+  author = {Pete Warden},
+  bibsource = {dblp computer science bibliography, https://dblp.org},
+  biburl = {https://dblp.org/rec/journals/corr/abs-1804-03209.bib},
+  eprint = {1804.03209},
+  eprinttype = {arXiv},
+  journal = {CoRR},
+  timestamp = {Mon, 13 Aug 2018 16:48:32 +0200},
+  title = {Speech Commands: {A} Dataset for Limited-Vocabulary Speech Recognition},
+  url = {http://arxiv.org/abs/1804.03209},
+  volume = {abs/1804.03209},
+  year = {2018},
+}
+""",
+        descriptive_stats={
+            "n_samples": {"test": 3081},
+        },
+    )
+
+    label_column_name: str = "label"
+
+    def get_candidate_labels(self) -> list[str]:
+        """Return the text candidates for zeroshot classification"""
+        return [
+            "Yes",
+            "No",
+            "Up",
+            "Down",
+            "Left",
+            "Right",
+            "On",
+            "Off",
+            "Stop",
+            "Go",
+            # The dataset defines 30 labels, but only these first 10 command
+            # words are used for zeroshot classification; the rest serve as
+            # auxiliary labels for other tasks.
+        ]
+
+    def dataset_transform(self):
+        """Filter the dataset to keep only the 10 command labels (indices 0-9)"""
+        # Filter dataset to keep only examples with labels 0-9
+        self.dataset = self.dataset.filter(
+            lambda x: 0 <= x[self.label_column_name] < len(self.get_candidate_labels())
+        )
+
+
+class SpeechCommandsZeroshotClassificationv02(AbsTaskAudioZeroshotClassification):
+    metadata = TaskMetadata(
+        name="SpeechCommandsZeroshotv0.02",
+        description="Sound Classification/Keyword Spotting Dataset. This is a set of one-second audio clips containing a single spoken English word or background noise. These words are from a small set of commands such as 'yes', 'no', and 'stop' spoken by various speakers. With a total of 10 labels/commands for keyword spotting and a total of 30 labels for other auxiliary tasks",
+        reference="https://huggingface.co/datasets/google/speech_commands",
+        dataset={
+            "path": "google/speech_commands",
+            "name": "v0.02",
+            "revision": "57ba463ab37e1e7845e0626539a6f6d0fcfbe64a",
+            "trust_remote_code": True,
+        },
+        type="AudioZeroshotClassification",
+        category="a2t",
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2018-07-07", "2018-07-13"),
+        domains=["Spoken"],
+        task_subtypes=["Keyword Spotting"],
+        license="cc-by-4.0",  # license stated on the dataset card
+        annotations_creators="human-annotated",
+        dialect=[],
+        modalities=["audio"],
+        sample_creation="found",
+        bibtex_citation=r"""
+@article{DBLP:journals/corr/abs-1804-03209,
+  author = {Pete Warden},
+  bibsource = {dblp computer science bibliography, https://dblp.org},
+  biburl = {https://dblp.org/rec/journals/corr/abs-1804-03209.bib},
+  eprint = {1804.03209},
+  eprinttype = {arXiv},
+  journal = {CoRR},
+  timestamp = {Mon, 13 Aug 2018 16:48:32 +0200},
+  title = {Speech Commands: {A} Dataset for Limited-Vocabulary Speech Recognition},
+  url = {http://arxiv.org/abs/1804.03209},
+  volume = {abs/1804.03209},
+  year = {2018},
+}
+""",
+        descriptive_stats={
+            "n_samples": {"test": 4890},
+        },
+    )
+
+    label_column_name: str = "label"
+
+    def get_candidate_labels(self) -> list[str]:
+        """Return the text candidates for zeroshot classification"""
+        return [
+            "Yes",
+            "No",
+            "Up",
+            "Down",
+            "Left",
+            "Right",
+            "On",
+            "Off",
+            "Stop",
+            "Go",
+            # The dataset defines 30 labels, but only these first 10 command
+            # words are used for zeroshot classification; the rest serve as
+            # auxiliary labels for other tasks.
+        ]
+
+    def dataset_transform(self):
+        """Filter the dataset to keep only the 10 command labels (indices 0-9)"""
+        # Filter dataset to keep only examples with labels 0-9
+        self.dataset = self.dataset.filter(
+            lambda x: 0 <= x[self.label_column_name] < len(self.get_candidate_labels())
+        )