Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions mteb/tasks/Audio/AudioClassification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .eng.GTZANGenre import *
from .eng.GunshotTriangulation import *
from .eng.LibriCount import *
from .eng.IEMOCAP import *
from .eng.MridinghamStroke import *
from .eng.MridinghamTonic import *
from .eng.NSynth import *
Expand Down
64 changes: 64 additions & 0 deletions mteb/tasks/Audio/AudioClassification/eng/IEMOCAP.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from __future__ import annotations

from mteb.abstasks.Audio.AbsTaskAudioClassification import (
AbsTaskAudioClassification,
)
from mteb.abstasks.TaskMetadata import TaskMetadata


class IEMOCAP(AbsTaskAudioClassification):
    """Audio classification task defined over the ``AbstractTTS/IEMOCAP`` dataset.

    The task reads audio from the ``audio`` column and labels from the
    ``major_emotion`` column, evaluates on the ``train`` split only, reports
    ``accuracy`` as the main score, and enables ``is_cross_validation``.
    """

    metadata = TaskMetadata(
        name="IEMOCAP",
        description="""IEMOCAP was recorded from ten actors in dyadic sessions with markers on the face, head,
and hands, which provide detailed information about their facial expression and
hand movements during scripted and spontaneous spoken communication scenarios.
actors performed selected emotional scripts and also improvised hypothetical
scenarios designed to elicit specific types of emotions (happiness, anger, sadness, frustration and neutral state).
After autmoated annotations, the final emotional categories selected for annotation were :
anger, sadness, happiness, disgust, fear and surprise, plus frustration, excited and neutral states
""",
        reference="https://huggingface.co/datasets/AbstractTTS/IEMOCAP",
        dataset={
            "path": "AbstractTTS/IEMOCAP",
            # Pinned dataset revision so results are reproducible.
            "revision": "9f1696a135a65ce997d898d4121c952269a822ca",
        },
        type="AudioClassification",
        category="a2t",
        # The dataset is evaluated on its "train" split (see
        # is_cross_validation below).
        eval_splits=["train"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        date=("2024-08-09", "2024-08-11"),
        domains=["Spoken"],
        task_subtypes=["Emotion classification"],
        license="not specified",
        annotations_creators="automatic-and-reviewed",
        dialect=[],
        modalities=["audio"],
        sample_creation="created",
        bibtex_citation="""@article{article,
author = {Busso, Carlos and Bulut, Murtaza and Lee, Chi-Chun and Kazemzadeh, Abe and Mower Provost, Emily and Kim, Samuel and Chang, Jeannette and Lee, Sungbok and Narayanan, Shrikanth},
year = {2008},
month = {12},
pages = {335-359},
title = {IEMOCAP: Interactive emotional dyadic motion capture database},
volume = {42},
journal = {Language Resources and Evaluation},
doi = {10.1007/s10579-008-9076-6}
}""",
        # Paper: https://ecs.utdallas.edu/research/researchlabs/msp-lab/publications/Busso_2008_5.pdf
        descriptive_stats={
            "n_samples": {"train": 10039},
            # TODO(review): reviewer feedback on this change -- "If the dataset
            # is too big (e.g. >2048 examples), consider using
            # self.stratified_subsampling() under dataset_transform()". That
            # checklist item was ticked, but no subsampling is performed by
            # this class; the split is used at its full size of 10039 samples.
        },
    )

    # Dataset column holding the audio input.
    audio_column_name: str = "audio"
    # Dataset column holding the classification target.
    label_column_name: str = "major_emotion"
    # NOTE(review): presumably the number of training examples drawn per
    # class by the base task -- confirm against AbsTaskAudioClassification.
    samples_per_label: int = 10
    # NOTE(review): presumably switches the base task to cross-validation over
    # the single evaluated split -- confirm against AbsTaskAudioClassification.
    is_cross_validation: bool = True

    def dataset_transform(self):
        """Set torch's multiprocessing sharing strategy to ``"file_system"``.

        Nothing in this method reads or modifies the loaded dataset; it only
        changes a process-wide torch setting. ``torch`` is imported locally
        rather than at module level.
        """
        ## required to run the dataloader for cross-validation
        import torch

        torch.multiprocessing.set_sharing_strategy("file_system")
        #########################################################