Skip to content
1 change: 1 addition & 0 deletions mteb/abstasks/TaskMetadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
"Duplicate Detection",
"Environment Sound Classification",
"Gunshot Audio Classification",
"Keyword Spotting",
"Instrument Source Classification",
"Music Genre Classification",
"Music Instrument Recognition",
Expand Down
1 change: 1 addition & 0 deletions mteb/tasks/Audio/AudioZeroshotClassification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@

from .eng.ESC50 import *
from .eng.Ravdess import *
from .eng.SpeechCommands import *
from .eng.UrbanSound8k import *
2 changes: 1 addition & 1 deletion mteb/tasks/Audio/AudioZeroshotClassification/eng/ESC50.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class ESC50ZeroshotClassification(AbsTaskAudioZeroshotClassification):
"revision": "e3e2a63ffff66b9a9735524551e3818e96af03ee",
},
type="AudioZeroshotClassification",
category="a2a",
category="a2t",
eval_splits=["train"],
eval_langs=["eng-Latn"],
main_score="accuracy",
Expand Down
146 changes: 146 additions & 0 deletions mteb/tasks/Audio/AudioZeroshotClassification/eng/SpeechCommands.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
from __future__ import annotations

from mteb.abstasks.Audio.AbsTaskAudioZeroshotClassification import (
AbsTaskAudioZeroshotClassification,
)
from mteb.abstasks.TaskMetadata import TaskMetadata


class SpeechCommandsZeroshotClassificationv01(AbsTaskAudioZeroshotClassification):
    """Zero-shot keyword-spotting task on the ``v0.01`` config of ``google/speech_commands``.

    The audio clips are matched against the text candidates returned by
    ``get_candidate_labels``; ``dataset_transform`` drops every example whose
    label id falls outside that candidate range.
    """

    metadata = TaskMetadata(
        name="SpeechCommandsZeroshotv0.01",
        description="Sound Classification/Keyword Spotting Dataset. This is a set of one-second audio clips containing a single spoken English word or background noise. These words are from a small set of commands such as 'yes', 'no', and 'stop' spoken by various speakers. With a total of 10 labels/commands for keyword spotting and a total of 30 labels for other auxiliary tasks",
        reference="https://huggingface.co/datasets/google/speech_commands",
        dataset={
            "path": "google/speech_commands",
            "name": "v0.01",
            "revision": "57ba463ab37e1e7845e0626539a6f6d0fcfbe64a",
            "trust_remote_code": True,
        },
        type="AudioZeroshotClassification",
        category="a2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=("2018-07-07", "2018-07-13"),
        domains=["Spoken"],
        task_subtypes=["Keyword Spotting"],
        license="cc-by-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        modalities=["audio"],
        sample_creation="found",
        bibtex_citation=r"""
@article{DBLP:journals/corr/abs-1804-03209,
author = {Pete Warden},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/journals/corr/abs-1804-03209.bib},
eprint = {1804.03209},
eprinttype = {arXiv},
journal = {CoRR},
timestamp = {Mon, 13 Aug 2018 16:48:32 +0200},
title = {Speech Commands: {A} Dataset for Limited-Vocabulary Speech Recognition},
url = {http://arxiv.org/abs/1804.03209},
volume = {abs/1804.03209},
year = {2018},
}
""",
        # NOTE(review): presumably the size of the unfiltered test split;
        # dataset_transform removes non-command examples — confirm the count.
        descriptive_stats={
            "n_samples": {"test": 3081},
        },
    )

    # Column of the dataset that holds the integer class id of each clip.
    label_column_name: str = "label"

    def get_candidate_labels(self) -> list[str]:
        """Return the text candidates used for zero-shot classification.

        Returns:
            The ten command words. The length of this list also defines
            which label ids survive ``dataset_transform``.
        """
        # The dataset defines more labels than these; only the first ten are
        # treated as commands, the remaining ones are auxiliary and are not
        # evaluated. Presumably the order mirrors the dataset's label ids
        # 0-9 — keep it in sync with the dataset's label definition.
        return [
            "Yes",
            "No",
            "Up",
            "Down",
            "Left",
            "Right",
            "On",
            "Off",
            "Stop",
            "Go",
        ]

    def dataset_transform(self) -> None:
        """Keep only the examples labelled with one of the candidate commands.

        Replaces ``self.dataset`` with a filtered copy that contains only
        rows whose label id is in ``[0, len(get_candidate_labels()))``.
        """
        # Resolve the loop-invariant values once so the per-row predicate
        # does not rebuild the candidate list for every example.
        label_column = self.label_column_name
        num_candidates = len(self.get_candidate_labels())
        self.dataset = self.dataset.filter(
            lambda example: 0 <= example[label_column] < num_candidates
        )


class SpeechCommandsZeroshotClassificationv02(AbsTaskAudioZeroshotClassification):
    """Zero-shot keyword-spotting task on the ``v0.02`` config of ``google/speech_commands``.

    The audio clips are matched against the text candidates returned by
    ``get_candidate_labels``; ``dataset_transform`` drops every example whose
    label id falls outside that candidate range.
    """

    metadata = TaskMetadata(
        name="SpeechCommandsZeroshotv0.02",
        description="Sound Classification/Keyword Spotting Dataset. This is a set of one-second audio clips containing a single spoken English word or background noise. These words are from a small set of commands such as 'yes', 'no', and 'stop' spoken by various speakers. With a total of 10 labels/commands for keyword spotting and a total of 30 labels for other auxiliary tasks",
        reference="https://huggingface.co/datasets/google/speech_commands",
        dataset={
            "path": "google/speech_commands",
            "name": "v0.02",
            "revision": "57ba463ab37e1e7845e0626539a6f6d0fcfbe64a",
            "trust_remote_code": True,
        },
        type="AudioZeroshotClassification",
        category="a2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=("2018-07-07", "2018-07-13"),
        domains=["Spoken"],
        task_subtypes=["Keyword Spotting"],
        license="cc-by-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        modalities=["audio"],
        sample_creation="found",
        bibtex_citation=r"""
@article{DBLP:journals/corr/abs-1804-03209,
author = {Pete Warden},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/journals/corr/abs-1804-03209.bib},
eprint = {1804.03209},
eprinttype = {arXiv},
journal = {CoRR},
timestamp = {Mon, 13 Aug 2018 16:48:32 +0200},
title = {Speech Commands: {A} Dataset for Limited-Vocabulary Speech Recognition},
url = {http://arxiv.org/abs/1804.03209},
volume = {abs/1804.03209},
year = {2018},
}
""",
        # NOTE(review): presumably the size of the unfiltered test split;
        # dataset_transform removes non-command examples — confirm the count.
        descriptive_stats={
            "n_samples": {"test": 4890},
        },
    )

    # Column of the dataset that holds the integer class id of each clip.
    label_column_name: str = "label"

    def get_candidate_labels(self) -> list[str]:
        """Return the text candidates used for zero-shot classification.

        Returns:
            The ten command words. The length of this list also defines
            which label ids survive ``dataset_transform``.
        """
        # The dataset defines more labels than these; only the first ten are
        # treated as commands, the remaining ones are auxiliary and are not
        # evaluated. Presumably the order mirrors the dataset's label ids
        # 0-9 — keep it in sync with the dataset's label definition.
        return [
            "Yes",
            "No",
            "Up",
            "Down",
            "Left",
            "Right",
            "On",
            "Off",
            "Stop",
            "Go",
        ]

    def dataset_transform(self) -> None:
        """Keep only the examples labelled with one of the candidate commands.

        Replaces ``self.dataset`` with a filtered copy that contains only
        rows whose label id is in ``[0, len(get_candidate_labels()))``.
        """
        # Resolve the loop-invariant values once so the per-row predicate
        # does not rebuild the candidate list for every example.
        label_column = self.label_column_name
        num_candidates = len(self.get_candidate_labels())
        self.dataset = self.dataset.filter(
            lambda example: 0 <= example[label_column] < num_candidates
        )