7 changes: 7 additions & 0 deletions mteb/abstasks/TaskMetadata.py
@@ -24,6 +24,7 @@

TASK_SUBTYPE = Literal[
"Article retrieval",
"Accent identification",
"Conversational retrieval",
"Dialect pairing",
"Dialog Systems",
@@ -64,20 +65,26 @@
"Stroke Classification of Musical Instrument",
"Tonic Classification of Musical Instrument",
"Speaker Count Identification",
"Species Classification",
"Spoken Digit Classification",
"Gender Clustering",
"Music Clustering",
"Sentiment Clustering",
"Emotion Clustering",
"Accent Clustering",
"Rendered semantic textual similarity",
"Sentiment Analysis",
"Intent Classification",
"Vehicle Clustering",
"Environment Sound Clustering",
"Rendered semantic textual similarity",
"Gender Classification",
"Age Classification",
]

TASK_DOMAIN = Literal[
"Academic",
"Bioacoustics",
"Blog",
"Constructed",
"Encyclopaedic",
2 changes: 1 addition & 1 deletion mteb/models/wav2vec2_models.py
@@ -181,7 +181,7 @@ def get_audio_embeddings(

outputs = self.model(
inputs.input_values.squeeze(0),
attention_mask=inputs.attention_mask,
attention_mask=inputs.attention_mask.squeeze(0).unsqueeze(-1),
output_hidden_states=True,
)

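For context, a shape-only sketch of what the new call chain does to the mask; the dimensions below are illustrative assumptions, not values taken from the repo:

import torch

# Illustrative mask shaped (1, batch, seq_len), e.g. as returned by a feature
# extractor called with return_tensors="pt" on an already-batched input (assumption).
attention_mask = torch.ones(1, 4, 16000, dtype=torch.long)

reshaped = attention_mask.squeeze(0).unsqueeze(-1)
print(attention_mask.shape)  # torch.Size([1, 4, 16000])
print(reshaped.shape)        # torch.Size([4, 16000, 1])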
10 changes: 10 additions & 0 deletions mteb/tasks/Audio/AudioClassification/__init__.py
@@ -1,6 +1,8 @@
from __future__ import annotations

from .eng.AmbientAcousticContext import *
from .eng.BeijingOpera import *
from .eng.BirdCLEF import *
from .eng.CommonLanguageAgeDetection import *
from .eng.CommonLanguageGenderDetection import *
from .eng.CommonLanguageLanguageClassification import *
@@ -9,11 +11,19 @@
from .eng.FSDD import *
from .eng.GTZANGenre import *
from .eng.GunshotTriangulation import *
from .eng.IEMOCAPEmotion import *
from .eng.IEMOCAPGender import *
from .eng.LibriCount import *
from .eng.MInDS14 import *
from .eng.MridinghamStroke import *
from .eng.MridinghamTonic import *
from .eng.NSynth import *
from .eng.SIBFLEURS import *
from .eng.SpeechCommands import *
from .eng.SpokenQAforIC import *
from .eng.TUTAcousticScenes import *
from .eng.VoxCelebSA import *
from .eng.VoxLingua107Top10 import *
from .eng.VoxPopuliAccentID import *
from .eng.VoxPopuliGenderID import *
from .eng.VoxPopuliLanguageID import *
62 changes: 62 additions & 0 deletions mteb/tasks/Audio/AudioClassification/eng/AmbientAcousticContext.py
@@ -0,0 +1,62 @@
from __future__ import annotations

from mteb.abstasks.Audio.AbsTaskAudioClassification import (
AbsTaskAudioClassification,
)
from mteb.abstasks.TaskMetadata import TaskMetadata


class AmbientAcousticContextClassification(AbsTaskAudioClassification):
metadata = TaskMetadata(
name="AmbientAcousticContext",
description="The Ambient Acoustic Context dataset contains 1-second segments for activities that occur in a workplace setting. This is a downsampled version with ~100 train and ~50 test samples per class.",
reference="https://dl.acm.org/doi/10.1145/3379503.3403535",
dataset={
"path": "AdnanElAssadi/ambient-acoustic-context-small",
"revision": "360c858462b79492c6b09d5855ec4d59c87497c6",
},
type="AudioClassification",
category="a2t",
eval_splits=["test"], # Using the pre-created test split
eval_langs=["eng-Latn"],
main_score="accuracy",
date=("2020-01-01", "2020-12-31"), # Paper publication date
domains=["Spoken", "Speech"],
task_subtypes=["Environment Sound Classification"],
license="not specified", # Not specified in dataset card
annotations_creators="human-annotated",
dialect=[],
modalities=["audio"],
sample_creation="found",
bibtex_citation="""@inproceedings{10.1145/3379503.3403535,
author = {Park, Chunjong and Min, Chulhong and Bhattacharya, Sourav and Kawsar, Fahim},
title = {Augmenting Conversational Agents with Ambient Acoustic Contexts},
year = {2020},
isbn = {9781450375160},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3379503.3403535},
doi = {10.1145/3379503.3403535},
booktitle = {22nd International Conference on Human-Computer Interaction with Mobile Devices and Services},
articleno = {33},
numpages = {9},
keywords = {Acoustic ambient context, Conversational agents},
location = {Oldenburg, Germany},
series = {MobileHCI '20}
}""",
descriptive_stats={
"n_samples": {
"train": 2387, # ~100 samples × 24 classes
"test": 1036, # ~50 samples × 24 classes
},
"n_classes": 24,
"sampling_rate": 16000,
},
)

audio_column_name: str = "audio"
label_column_name: str = "label"
    samples_per_label: int | None = None  # Not needed; the dataset is already balanced
is_cross_validation: bool = False

# No dataset_transform method needed as dataset is already filtered and split
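For reference, a minimal sketch of how this task could be run once registered. It assumes the fork keeps the standard mteb entry points (get_tasks, get_model, MTEB, run) and that one of the wav2vec2 wrappers touched earlier in this PR is registered under the model name shown; the model name is an assumption, not something this diff confirms:

import mteb

# "AmbientAcousticContext" is the task name declared in the metadata above;
# the model name is assumed to be registered in mteb/models/wav2vec2_models.py.
tasks = mteb.get_tasks(tasks=["AmbientAcousticContext"])
model = mteb.get_model("facebook/wav2vec2-base")

evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model, output_folder="results")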
49 changes: 49 additions & 0 deletions mteb/tasks/Audio/AudioClassification/eng/BirdCLEF.py
@@ -0,0 +1,49 @@
from __future__ import annotations

from mteb.abstasks.Audio.AbsTaskAudioClassification import (
AbsTaskAudioClassification,
)
from mteb.abstasks.TaskMetadata import TaskMetadata


class BirdCLEFClassification(AbsTaskAudioClassification):
metadata = TaskMetadata(
name="BirdCLEF",
description="BirdCLEF+ 2025 dataset for species identification from audio, focused on birds, amphibians, mammals and insects from the Middle Magdalena Valley of Colombia. Downsampled to 50 classes with 20 samples each.",
reference="https://huggingface.co/datasets/christopher/birdclef-2025",
dataset={
"path": "AdnanElAssadi/birdclef25_small",
"revision": "55dbd1a0f77dd71980337a6e64620369c1e3585a",
},
type="AudioClassification",
category="a2t",
eval_splits=["train"],
eval_langs=[
"eng-Latn",
],
main_score="accuracy",
date=("2025-01-01", "2025-12-31"), # Competition year
domains=["Spoken", "Speech", "Bioacoustics"],
task_subtypes=["Species Classification"],
license="cc-by-nc-4.0",
annotations_creators="expert-annotated",
dialect=[],
modalities=["audio"],
sample_creation="found",
bibtex_citation="""@dataset{birdclef2025,
author={Christopher},
title={BirdCLEF+ 2025},
year={2025},
publisher={Hugging Face},
url={https://huggingface.co/datasets/christopher/birdclef-2025}
}""",
descriptive_stats={
"n_samples": {"train": 1000}, # 50 classes × 20 samples each
"n_classes": 50,
},
)

audio_column_name: str = "recording"
label_column_name: str = "primary_label"
samples_per_label: int = 20
is_cross_validation: bool = True
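The description above says the task uses a 50-class, 20-samples-per-class subset. A hedged sketch of how such a subset could be built with the datasets library; the actual procedure behind AdnanElAssadi/birdclef25_small is not shown in this PR, and the source column name is assumed to match "primary_label":

from collections import Counter

from datasets import load_dataset

full = load_dataset("christopher/birdclef-2025", split="train")

# Keep the 50 most frequent species (column name assumed).
counts = Counter(full["primary_label"])
keep = {label for label, _ in counts.most_common(50)}
subset = full.filter(lambda ex: ex["primary_label"] in keep)

# Cap each remaining class at 20 examples.
seen = Counter()

def under_cap(example):
    seen[example["primary_label"]] += 1
    return seen[example["primary_label"]] <= 20

subset = subset.filter(under_cap)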
90 changes: 90 additions & 0 deletions mteb/tasks/Audio/AudioClassification/eng/IEMOCAPEmotion.py
@@ -0,0 +1,90 @@
from __future__ import annotations

from mteb.abstasks.Audio.AbsTaskAudioClassification import (
AbsTaskAudioClassification,
)
from mteb.abstasks.TaskMetadata import TaskMetadata


class IEMOCAPEmotionClassification(AbsTaskAudioClassification):
metadata = TaskMetadata(
name="IEMOCAPEmotion",
description="Classification of speech samples into emotions (angry, happy, sad, neutral, frustrated, excited, fearful, surprised, disgusted) from interactive emotional dyadic conversations.",
reference="https://doi.org/10.1007/s10579-008-9076-6",
dataset={
"path": "AbstractTTS/IEMOCAP",
"revision": "9f1696a135a65ce997d898d4121c952269a822ca",
},
type="AudioClassification",
category="a2t",
eval_splits=["train"],
eval_langs=["eng-Latn"],
main_score="accuracy",
date=("2008-01-01", "2008-12-31"),
domains=["Spoken", "Speech"],
task_subtypes=["Emotion classification"],
license="cc-by-nc-sa-3.0",
annotations_creators="expert-annotated",
dialect=[],
modalities=["audio"],
sample_creation="created",
bibtex_citation="""@article{busso2008iemocap,
title={IEMOCAP: Interactive emotional dyadic motion capture database},
author={Busso, Carlos and Bulut, Murtaza and Lee, Chi-Chun and Kazemzadeh, Abe and Mower, Emily and Kim, Samuel and Chang, Jeannette N and Lee, Sungbok and Narayanan, Shrikanth S},
journal={Language resources and evaluation},
volume={42},
number={4},
pages={335--359},
year={2008},
publisher={Springer}
}""",
descriptive_stats={
"n_samples": {"train": 10039}, # Approximate after subsampling
},
)

audio_column_name: str = "audio"
label_column_name: str = "emotion"
samples_per_label: int = 10
is_cross_validation: bool = True

def dataset_transform(self):
Contributor comment: you can remove this on re-upload

# Define emotion labels and their mapping to indices
labels = [
"angry", # 0
"sad", # 1
"happy", # 2
"neutral", # 3
"frustrated", # 4
"excited", # 5
"fear", # 6
"surprise", # 7
"disgust", # 8
"other", # 9
]
label2id = {emotion: idx for idx, emotion in enumerate(labels)}

# Basic filtering to ensure we have valid emotion labels
for split in self.dataset:
# First ensure we have valid emotion labels and normalize case
self.dataset[split] = self.dataset[split].filter(
lambda example: example["major_emotion"] is not None
and example["major_emotion"] != ""
)

# Map to indices with case normalization for reliability
self.dataset[split] = self.dataset[split].map(
lambda example: {
"emotion_id": label2id.get(example["major_emotion"].lower(), -1)
}
)

# Filter out any examples with unknown emotions
self.dataset[split] = self.dataset[split].filter(
lambda example: example["emotion_id"] != -1
)

# Use numeric ID as the label
self.dataset[split] = self.dataset[split].rename_column(
"emotion_id", self.label_column_name
)
63 changes: 63 additions & 0 deletions mteb/tasks/Audio/AudioClassification/eng/IEMOCAPGender.py
@@ -0,0 +1,63 @@
from __future__ import annotations

from mteb.abstasks.Audio.AbsTaskAudioClassification import (
AbsTaskAudioClassification,
)
from mteb.abstasks.TaskMetadata import TaskMetadata


class IEMOCAPGenderClassification(AbsTaskAudioClassification):
metadata = TaskMetadata(
name="IEMOCAPGender",
description="Classification of speech samples by speaker gender (male/female) from the IEMOCAP database of interactive emotional dyadic conversations.",
reference="https://doi.org/10.1007/s10579-008-9076-6",
dataset={
"path": "AbstractTTS/IEMOCAP",
"revision": "9f1696a135a65ce997d898d4121c952269a822ca",
},
type="AudioClassification",
category="a2t",
eval_splits=["train"],
eval_langs=["eng-Latn"],
main_score="accuracy",
date=("2008-01-01", "2008-12-31"),
domains=["Spoken", "Speech"],
task_subtypes=["Gender Classification"],
license="cc-by-nc-sa-3.0",
annotations_creators="expert-annotated",
dialect=[],
modalities=["audio"],
sample_creation="created",
bibtex_citation="""@article{busso2008iemocap,
title={IEMOCAP: Interactive emotional dyadic motion capture database},
author={Busso, Carlos and Bulut, Murtaza and Lee, Chi-Chun and Kazemzadeh, Abe and Mower, Emily and Kim, Samuel and Chang, Jeannette N and Lee, Sungbok and Narayanan, Shrikanth S},
journal={Language resources and evaluation},
volume={42},
number={4},
pages={335--359},
year={2008},
publisher={Springer}
}""",
descriptive_stats={
"n_samples": {"train": 10039},
},
)

audio_column_name: str = "audio"
label_column_name: str = "gender_id"
samples_per_label: int = 100
is_cross_validation: bool = True

def dataset_transform(self):
# Define label mapping
label2id = {"Female": 0, "Male": 1}

# Apply transformation to all dataset splits
for split in self.dataset:
# Define transform function to add numeric labels
def add_gender_id(example):
example["gender_id"] = label2id[example["gender"]]
return example

print(f"Converting gender labels to numeric IDs for split '{split}'...")
self.dataset[split] = self.dataset[split].map(add_gender_id)
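As a possible alternative to the manual mapping above, datasets.Dataset.class_encode_column can turn the string column into a ClassLabel feature directly. This is only a sketch of the design choice: the ids are assigned automatically (and the column keeps its original name), so whether it is a drop-in replacement for the {"Female": 0, "Male": 1} mapping expected here is not verified:

from datasets import load_dataset

ds = load_dataset("AbstractTTS/IEMOCAP", split="train")

# Converts "gender" into a ClassLabel feature with integer ids.
ds = ds.class_encode_column("gender")
print(ds.features["gender"].names)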