diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py
index b0df961f21..28cf0ab9be 100644
--- a/mteb/abstasks/TaskMetadata.py
+++ b/mteb/abstasks/TaskMetadata.py
@@ -24,6 +24,7 @@ TASK_SUBTYPE = Literal[
     "Article retrieval",
+    "Accent identification",
     "Conversational retrieval",
     "Dialect pairing",
     "Dialog Systems",
@@ -64,13 +65,18 @@
     "Stroke Classification of Musical Instrument",
     "Tonic Classification of Musical Instrument",
     "Speaker Count Identification",
+    "Species Classification",
     "Spoken Digit Classification",
     "Gender Clustering",
     "Music Clustering",
+    "Sentiment Clustering",
+    "Emotion Clustering",
+    "Accent Clustering",
     "Rendered semantic textual similarity",
     "Sentiment Analysis",
     "Intent Classification",
     "Vehicle Clustering",
+    "Environment Sound Clustering",
     "Rendered semantic textual similarity",
     "Gender Classification",
     "Age Classification",
@@ -78,6 +84,7 @@ TASK_DOMAIN = Literal[
     "Academic",
+    "Bioacoustics",
     "Blog",
     "Constructed",
     "Encyclopaedic",
diff --git a/mteb/models/wav2vec2_models.py b/mteb/models/wav2vec2_models.py
index ebdacc8d03..2eacc5a49f 100644
--- a/mteb/models/wav2vec2_models.py
+++ b/mteb/models/wav2vec2_models.py
@@ -181,7 +181,7 @@ def get_audio_embeddings(
         outputs = self.model(
             inputs.input_values.squeeze(0),
-            attention_mask=inputs.attention_mask,
+            attention_mask=inputs.attention_mask.squeeze(0).unsqueeze(-1),
             output_hidden_states=True,
         )
diff --git a/mteb/tasks/Audio/AudioClassification/__init__.py b/mteb/tasks/Audio/AudioClassification/__init__.py
index fe2257b321..00ecb12aba 100644
--- a/mteb/tasks/Audio/AudioClassification/__init__.py
+++ b/mteb/tasks/Audio/AudioClassification/__init__.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
+from .eng.AmbientAcousticContext import *
 from .eng.BeijingOpera import *
+from .eng.BirdCLEF import *
 from .eng.CommonLanguageAgeDetection import *
 from .eng.CommonLanguageGenderDetection import *
 from .eng.CommonLanguageLanguageClassification import *
@@ -9,11 +11,19 @@
 from .eng.FSDD import *
 from .eng.GTZANGenre import *
 from .eng.GunshotTriangulation import *
+from .eng.IEMOCAPEmotion import *
+from .eng.IEMOCAPGender import *
 from .eng.LibriCount import *
+from .eng.MInDS14 import *
 from .eng.MridinghamStroke import *
 from .eng.MridinghamTonic import *
 from .eng.NSynth import *
 from .eng.SIBFLEURS import *
+from .eng.SpeechCommands import *
 from .eng.SpokenQAforIC import *
+from .eng.TUTAcousticScenes import *
 from .eng.VoxCelebSA import *
 from .eng.VoxLingua107Top10 import *
+from .eng.VoxPopuliAccentID import *
+from .eng.VoxPopuliGenderID import *
+from .eng.VoxPopuliLanguageID import *
diff --git a/mteb/tasks/Audio/AudioClassification/eng/AmbientAcousticContext.py b/mteb/tasks/Audio/AudioClassification/eng/AmbientAcousticContext.py
new file mode 100644
index 0000000000..4743ca3f6a
--- /dev/null
+++ b/mteb/tasks/Audio/AudioClassification/eng/AmbientAcousticContext.py
@@ -0,0 +1,62 @@
+from __future__ import annotations
+
+from mteb.abstasks.Audio.AbsTaskAudioClassification import (
+    AbsTaskAudioClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class AmbientAcousticContextClassification(AbsTaskAudioClassification):
+    metadata = TaskMetadata(
+        name="AmbientAcousticContext",
+        description="The Ambient Acoustic Context dataset contains 1-second segments for activities that occur in a workplace setting. This is a downsampled version with ~100 train and ~50 test samples per class.",
+        reference="https://dl.acm.org/doi/10.1145/3379503.3403535",
+        dataset={
+            "path": "AdnanElAssadi/ambient-acoustic-context-small",
+            "revision": "360c858462b79492c6b09d5855ec4d59c87497c6",
+        },
+        type="AudioClassification",
+        category="a2t",
+        eval_splits=["test"],  # Using the pre-created test split
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2020-01-01", "2020-12-31"),  # Paper publication date
+        domains=["Spoken", "Speech"],
+        task_subtypes=["Environment Sound Classification"],
+        license="not specified",  # Not specified in dataset card
+        annotations_creators="human-annotated",
+        dialect=[],
+        modalities=["audio"],
+        sample_creation="found",
+        bibtex_citation="""@inproceedings{10.1145/3379503.3403535,
+            author = {Park, Chunjong and Min, Chulhong and Bhattacharya, Sourav and Kawsar, Fahim},
+            title = {Augmenting Conversational Agents with Ambient Acoustic Contexts},
+            year = {2020},
+            isbn = {9781450375160},
+            publisher = {Association for Computing Machinery},
+            address = {New York, NY, USA},
+            url = {https://doi.org/10.1145/3379503.3403535},
+            doi = {10.1145/3379503.3403535},
+            booktitle = {22nd International Conference on Human-Computer Interaction with Mobile Devices and Services},
+            articleno = {33},
+            numpages = {9},
+            keywords = {Acoustic ambient context, Conversational agents},
+            location = {Oldenburg, Germany},
+            series = {MobileHCI '20}
+        }""",
+        descriptive_stats={
+            "n_samples": {
+                "train": 2387,  # ~100 samples × 24 classes
+                "test": 1036,  # ~50 samples × 24 classes
+            },
+            "n_classes": 24,
+            "sampling_rate": 16000,
+        },
+    )
+
+    audio_column_name: str = "audio"
+    label_column_name: str = "label"
+    samples_per_label: int | None = None  # Not needed, as the dataset is already balanced
+    is_cross_validation: bool = False
+
+    # No dataset_transform method needed, as the dataset is already filtered and split
diff --git a/mteb/tasks/Audio/AudioClassification/eng/BirdCLEF.py b/mteb/tasks/Audio/AudioClassification/eng/BirdCLEF.py
new file mode 100644
index 0000000000..9af1568e9a
--- /dev/null
+++ b/mteb/tasks/Audio/AudioClassification/eng/BirdCLEF.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from mteb.abstasks.Audio.AbsTaskAudioClassification import (
+    AbsTaskAudioClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class BirdCLEFClassification(AbsTaskAudioClassification):
+    metadata = TaskMetadata(
+        name="BirdCLEF",
+        description="BirdCLEF+ 2025 dataset for species identification from audio, focused on birds, amphibians, mammals and insects from the Middle Magdalena Valley of Colombia. Downsampled to 50 classes with 20 samples each.",
+        reference="https://huggingface.co/datasets/christopher/birdclef-2025",
+        dataset={
+            "path": "AdnanElAssadi/birdclef25_small",
+            "revision": "55dbd1a0f77dd71980337a6e64620369c1e3585a",
+        },
+        type="AudioClassification",
+        category="a2t",
+        eval_splits=["train"],
+        eval_langs=[
+            "eng-Latn",
+        ],
+        main_score="accuracy",
+        date=("2025-01-01", "2025-12-31"),  # Competition year
+        domains=["Spoken", "Speech", "Bioacoustics"],
+        task_subtypes=["Species Classification"],
+        license="cc-by-nc-4.0",
+        annotations_creators="expert-annotated",
+        dialect=[],
+        modalities=["audio"],
+        sample_creation="found",
+        bibtex_citation="""@dataset{birdclef2025,
+            author={Christopher},
+            title={BirdCLEF+ 2025},
+            year={2025},
+            publisher={Hugging Face},
+            url={https://huggingface.co/datasets/christopher/birdclef-2025}
+        }""",
+        descriptive_stats={
+            "n_samples": {"train": 1000},  # 50 classes × 20 samples each
+            "n_classes": 50,
+        },
+    )
+
+    audio_column_name: str = "recording"
+    label_column_name: str = "primary_label"
+    samples_per_label: int = 20
+    is_cross_validation: bool = True
diff --git a/mteb/tasks/Audio/AudioClassification/eng/IEMOCAPEmotion.py b/mteb/tasks/Audio/AudioClassification/eng/IEMOCAPEmotion.py
new file mode 100644
index 0000000000..d3e07931ff
--- /dev/null
+++ b/mteb/tasks/Audio/AudioClassification/eng/IEMOCAPEmotion.py
@@ -0,0 +1,90 @@
+from __future__ import annotations
+
+from mteb.abstasks.Audio.AbsTaskAudioClassification import (
+    AbsTaskAudioClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class IEMOCAPEmotionClassification(AbsTaskAudioClassification):
+    metadata = TaskMetadata(
+        name="IEMOCAPEmotion",
+        description="Classification of speech samples into one of ten emotion labels (angry, sad, happy, neutral, frustrated, excited, fearful, surprised, disgusted, other) from interactive emotional dyadic conversations.",
+        reference="https://doi.org/10.1007/s10579-008-9076-6",
+        dataset={
+            "path": "AbstractTTS/IEMOCAP",
+            "revision": "9f1696a135a65ce997d898d4121c952269a822ca",
+        },
+        type="AudioClassification",
+        category="a2t",
+        eval_splits=["train"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2008-01-01", "2008-12-31"),
+        domains=["Spoken", "Speech"],
+        task_subtypes=["Emotion classification"],
+        license="cc-by-nc-sa-3.0",
+        annotations_creators="expert-annotated",
+        dialect=[],
+        modalities=["audio"],
+        sample_creation="created",
+        bibtex_citation="""@article{busso2008iemocap,
+            title={IEMOCAP: Interactive emotional dyadic motion capture database},
+            author={Busso, Carlos and Bulut, Murtaza and Lee, Chi-Chun and Kazemzadeh, Abe and Mower, Emily and Kim, Samuel and Chang, Jeannette N and Lee, Sungbok and Narayanan, Shrikanth S},
+            journal={Language resources and evaluation},
+            volume={42},
+            number={4},
+            pages={335--359},
+            year={2008},
+            publisher={Springer}
+        }""",
+        descriptive_stats={
+            "n_samples": {"train": 10039},  # Approximate after subsampling
+        },
+    )
+
+    audio_column_name: str = "audio"
+    label_column_name: str = "emotion"
+    samples_per_label: int = 10
+    is_cross_validation: bool = True
+
+    def dataset_transform(self):
+        # Define emotion labels and their mapping to indices
+        labels = [
+            "angry",  # 0
+            "sad",  # 1
+            "happy",  # 2
+            "neutral",  # 3
+            "frustrated",  # 4
+            "excited",  # 5
+            "fear",  # 6
+            "surprise",  # 7
+            "disgust",  # 8
+            "other",  # 9
+        ]
+        label2id = {emotion: idx for idx, emotion in enumerate(labels)}
+
+        # Basic filtering to ensure we have valid emotion labels
+        for split in self.dataset:
+            # First ensure we have valid emotion labels and normalize case
+            self.dataset[split] = self.dataset[split].filter(
+                lambda example: example["major_emotion"] is not None
+                and example["major_emotion"] != ""
+            )
+
+            # Map to indices with case normalization for reliability
+            self.dataset[split] = self.dataset[split].map(
+                lambda example: {
+                    "emotion_id": label2id.get(example["major_emotion"].lower(), -1)
+                }
+            )
+
+            # Filter out any examples with unknown emotions
+            self.dataset[split] = self.dataset[split].filter(
+                lambda example: example["emotion_id"] != -1
+            )
+
+            # Use numeric ID as the label
+            self.dataset[split] = self.dataset[split].rename_column(
+                "emotion_id", self.label_column_name
+            )
diff --git a/mteb/tasks/Audio/AudioClassification/eng/IEMOCAPGender.py b/mteb/tasks/Audio/AudioClassification/eng/IEMOCAPGender.py
new file mode 100644
index 0000000000..f8e3b92fa2
--- /dev/null
+++ b/mteb/tasks/Audio/AudioClassification/eng/IEMOCAPGender.py
@@ -0,0 +1,63 @@
+from __future__ import annotations
+
+from mteb.abstasks.Audio.AbsTaskAudioClassification import (
+    AbsTaskAudioClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class IEMOCAPGenderClassification(AbsTaskAudioClassification):
+    metadata = TaskMetadata(
+        name="IEMOCAPGender",
+        description="Classification of speech samples by speaker gender (male/female) from the IEMOCAP database of interactive emotional dyadic conversations.",
+        reference="https://doi.org/10.1007/s10579-008-9076-6",
+        dataset={
+            "path": "AbstractTTS/IEMOCAP",
+            "revision": "9f1696a135a65ce997d898d4121c952269a822ca",
+        },
+        type="AudioClassification",
+        category="a2t",
+        eval_splits=["train"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2008-01-01", "2008-12-31"),
+        domains=["Spoken", "Speech"],
+        task_subtypes=["Gender Classification"],
+        license="cc-by-nc-sa-3.0",
+        annotations_creators="expert-annotated",
+        dialect=[],
+        modalities=["audio"],
+        sample_creation="created",
+        bibtex_citation="""@article{busso2008iemocap,
+            title={IEMOCAP: Interactive emotional dyadic motion capture database},
+            author={Busso, Carlos and Bulut, Murtaza and Lee, Chi-Chun and Kazemzadeh, Abe and Mower, Emily and Kim, Samuel and Chang, Jeannette N and Lee, Sungbok and Narayanan, Shrikanth S},
+            journal={Language resources and evaluation},
+            volume={42},
+            number={4},
+            pages={335--359},
+            year={2008},
+            publisher={Springer}
+        }""",
+        descriptive_stats={
+            "n_samples": {"train": 10039},
+        },
+    )
+
+    audio_column_name: str = "audio"
+    label_column_name: str = "gender_id"
+    samples_per_label: int = 100
+    is_cross_validation: bool = True
+
+    def dataset_transform(self):
+        # Define label mapping
+        label2id = {"Female": 0, "Male": 1}
+
+        # Apply transformation to all dataset splits
+        for split in self.dataset:
+            # Define transform function to add numeric labels
+            def add_gender_id(example):
+                example["gender_id"] = label2id[example["gender"]]
+                return example
+
+            print(f"Converting gender labels to numeric IDs for split '{split}'...")
+            self.dataset[split] = self.dataset[split].map(add_gender_id)
diff --git a/mteb/tasks/Audio/AudioClassification/eng/MInDS14.py b/mteb/tasks/Audio/AudioClassification/eng/MInDS14.py
new file mode 100644
index 0000000000..cade6b2bad
--- /dev/null
+++ b/mteb/tasks/Audio/AudioClassification/eng/MInDS14.py
@@ -0,0 +1,71 @@
+from __future__ import annotations
+
+from mteb.abstasks.Audio.AbsTaskAudioClassification import AbsTaskAudioClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class MInDS14Classification(AbsTaskAudioClassification):
+    metadata = TaskMetadata(
+        name="MInDS14",
+        description="MInDS-14 is a training and evaluation resource for intent detection with spoken data in 14 diverse language varieties.",
+        reference="https://arxiv.org/abs/2104.08524",
+        dataset={
+            "path": "PolyAI/minds14",
+            "name": "en-US",  # English language configuration
+            "revision": "75900a7c6f93f014f25b50d16596a6da89add3a5",
+        },
+        type="AudioClassification",
+        category="a2t",
+        eval_splits=["train"],
+        eval_langs=["eng-Latn"],  # English (en-US) in BCP-47 format
+        main_score="accuracy",
+        date=("2021-04-01", "2021-04-30"),  # Paper publication date
+        domains=["Speech", "Spoken"],
+        task_subtypes=["Intent Classification"],
+        license="cc-by-4.0",
+        annotations_creators="human-annotated",
+        dialect=[],
+        modalities=["audio"],
+        sample_creation="found",
+        bibtex_citation="""@article{DBLP:journals/corr/abs-2104-08524,
+            author = {Daniela Gerz and Pei{-}Hao Su and Razvan Kusztos and Avishek Mondal and Michal Lis and Eshan Singhal and Nikola Mrkšić and Tsung{-}Hsien Wen and Ivan Vulic},
+            title = {Multilingual and Cross-Lingual Intent Detection from Spoken Data},
+            journal = {CoRR},
+            volume = {abs/2104.08524},
+            year = {2021},
+            url = {https://arxiv.org/abs/2104.08524},
+            eprinttype = {arXiv},
+            eprint = {2104.08524},
+            timestamp = {Mon, 26 Apr 2021 17:25:10 +0200},
+            biburl = {https://dblp.org/rec/journals/corr/abs-2104-08524.bib},
+            bibsource = {dblp computer science bibliography, https://dblp.org}
+        }""",
+        descriptive_stats={
+            "n_samples": {
+                "train": 563,  # Count for en-US configuration
+            },
+            "n_classes": 14,
+            "classes": [
+                "abroad",
+                "address",
+                "app_error",
+                "atm_limit",
+                "balance",
+                "business_loan",
+                "card_issues",
+                "cash_deposit",
+                "direct_debit",
+                "freeze",
+                "latest_transactions",
+                "joint_account",
+                "high_value_payment",
+                "pay_bill",
+            ],
+        },
+    )
+
+    audio_column_name: str = "audio"
+    label_column_name: str = "intent_class"  # Contains numeric labels 0-13
+    samples_per_label: int = 40
+    is_cross_validation: bool = True
+    n_splits: int = 5
diff --git a/mteb/tasks/Audio/AudioClassification/eng/SpeechCommands.py b/mteb/tasks/Audio/AudioClassification/eng/SpeechCommands.py
new file mode 100644
index 0000000000..a71f92593f
--- /dev/null
+++ b/mteb/tasks/Audio/AudioClassification/eng/SpeechCommands.py
@@ -0,0 +1,84 @@
+from __future__ import annotations
+
+from mteb.abstasks.Audio.AbsTaskAudioClassification import (
+    AbsTaskAudioClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class SpeechCommandsClassification(AbsTaskAudioClassification):
+    metadata = TaskMetadata(
+        name="SpeechCommands",
+        description="A set of one-second .wav audio files, each containing a single spoken English word or background noise.",
+        reference="https://arxiv.org/abs/1804.03209",
+        dataset={
+            "path": "AdnanElAssadi/speech_commands_small",
+            "revision": "a59564b91bf0cfcf587e11c2603fe42bae21e5f0",  # Using downsampled version of v0.02
+        },
+        type="AudioClassification",
+        category="a2t",
+        eval_splits=["validation", "test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2018-04-11", "2018-04-11"),  # v0.02 release date
+        domains=["Speech"],
+        task_subtypes=["Spoken Language Identification"],
+        license="cc-by-4.0",
+        annotations_creators="human-annotated",
+        dialect=[],
+        modalities=["audio"],
+        sample_creation="found",
+        bibtex_citation="""@article{speechcommands2018,
+            title={Speech Commands: A Dataset for Limited-Vocabulary Speech Recognition},
+            author={Pete Warden},
+            journal={arXiv preprint arXiv:1804.03209},
+            year={2018}
+        }""",
+        descriptive_stats={
+            "n_samples": {"train": 1755, "validation": 9982, "test": 4890},
+            "n_classes": 36,
+            "classes": [
+                "yes",
+                "no",
+                "up",
+                "down",
+                "left",
+                "right",
+                "on",
+                "off",
+                "stop",
+                "go",
+                "zero",
+                "one",
+                "two",
+                "three",
+                "four",
+                "five",
+                "six",
+                "seven",
+                "eight",
+                "nine",
+                "bed",
+                "bird",
+                "cat",
+                "dog",
+                "happy",
+                "house",
+                "marvin",
+                "sheila",
+                "tree",
+                "wow",
+                "backward",
+                "forward",
+                "follow",
+                "learn",
+                "visual",
+                "_unknown_",  # (likely background noise or silent segments)
+            ],
+        },
+    )
+
+    audio_column_name: str = "audio"
+    label_column_name: str = "label"
+    samples_per_label: int = 50
+    is_cross_validation: bool = False
diff --git a/mteb/tasks/Audio/AudioClassification/eng/TUTAcousticScenes.py b/mteb/tasks/Audio/AudioClassification/eng/TUTAcousticScenes.py
new file mode 100644
index 0000000000..7e8d165279
--- /dev/null
+++ b/mteb/tasks/Audio/AudioClassification/eng/TUTAcousticScenes.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+from mteb.abstasks.Audio.AbsTaskAudioClassification import (
+    AbsTaskAudioClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class TUTAcousticScenesClassification(AbsTaskAudioClassification):
+    metadata = TaskMetadata(
+        name="TUTAcousticScenes",
+        description="TUT Urban Acoustic Scenes 2018 dataset consists of 10-second audio segments from 10 acoustic scenes recorded in six European cities.",
+        reference="https://zenodo.org/record/1228142",
+        dataset={
+            "path": "wetdog/TUT-urban-acoustic-scenes-2018-development",
+            "revision": "583b181ea2666eb28d10909784690009f6c9da9d",
+        },
+        type="AudioClassification",
+        category="a2t",
+        eval_splits=["train"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2018-01-01", "2018-12-31"),
+        domains=[
+            "Spoken"
+        ],  # A more appropriate domain can be assigned once the domain list is extended
+        task_subtypes=["Environment Sound Classification"],
+        license="cc-by-4.0",
+        annotations_creators="expert-annotated",
+        dialect=[],
+        modalities=["audio"],
+        sample_creation="found",
+        bibtex_citation="""@inproceedings{Mesaros2018_DCASE,
+            author = {Annamaria Mesaros and Toni Heittola and Tuomas Virtanen},
+            title = {A Multi-Device Dataset for Urban Acoustic Scene Classification},
+            booktitle = {Proceedings of the Detection and Classification of Acoustic Scenes and Events 2018 Workshop (DCASE2018)},
+            year = {2018},
+            publisher = {Tampere University of Technology},
+            address = {Tampere, Finland},
+            url = {https://arxiv.org/abs/1807.09840}
+        }""",
+        descriptive_stats={
+            "n_samples": {"train": 8640},  # Based on provided stats
+            "n_classes": 10,
+            "classes": [
+                "airport",
+                "bus",
+                "metro",
+                "metro_station",
+                "park",
+                "public_square",
+                "shopping_mall",
+                "street_pedestrian",
+                "street_traffic",
+                "tram",
+            ],
+        },
+    )
+
+    audio_column_name: str = "audio"
+    label_column_name: str = "scene_label"
+    samples_per_label: int = 50
+    is_cross_validation: bool = True
+    n_splits: int = 5
diff --git a/mteb/tasks/Audio/AudioClassification/eng/VoxPopuliAccentID.py b/mteb/tasks/Audio/AudioClassification/eng/VoxPopuliAccentID.py
new file mode 100644
index 0000000000..b333cf1cca
--- /dev/null
+++ b/mteb/tasks/Audio/AudioClassification/eng/VoxPopuliAccentID.py
@@ -0,0 +1,122 @@
+from __future__ import annotations
+
+from mteb.abstasks.Audio.AbsTaskAudioClassification import (
+    AbsTaskAudioClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class VoxPopuliAccentID(AbsTaskAudioClassification):
+    metadata = TaskMetadata(
+        name="VoxPopuliAccentID",
+        description="Classification of English speech samples into one of 15 non-native accents from European Parliament recordings.",
+        reference="https://huggingface.co/datasets/facebook/voxpopuli",
+        dataset={
+            "path": "facebook/voxpopuli",
+            "name": "en_accented",  # This explicitly selects the accented English config
+            "revision": "719aaef8225945c0d80b277de6c79aa42ab053d5",
+        },
+        type="AudioClassification",
+        category="a2t",
+        eval_splits=["train", "test"],
+        eval_langs=["eng-Latn"],  # Using BCP-47 format
+        main_score="accuracy",
+        date=("2009-01-01", "2020-12-31"),
+        domains=["Spoken", "Speech"],
+        task_subtypes=["Accent identification"],
+        license="cc0-1.0",
+        annotations_creators="human-annotated",
+        dialect=[],
+        modalities=["audio"],
+        sample_creation="found",
+        bibtex_citation="""@inproceedings{wang-etal-2021-voxpopuli,
+            title = "{V}ox{P}opuli: A Large-Scale Multilingual Speech Corpus for Representation Learning, Semi-Supervised Learning and Interpretation",
+            author = "Wang, Changhan and
+              Riviere, Morgane and
+              Lee, Ann and
+              Wu, Anne and
+              Talnikar, Chaitanya and
+              Haziza, Daniel and
+              Williamson, Mary and
+              Pino, Juan and
+              Dupoux, Emmanuel",
+            booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)",
+            month = aug,
+            year = "2021",
+            address = "Online",
+            publisher = "Association for Computational Linguistics",
+            url = "https://aclanthology.org/2021.acl-long.80",
+            doi = "10.18653/v1/2021.acl-long.80",
+            pages = "993--1003",
+        }""",
+        descriptive_stats={
+            "n_samples": {"test": 6900},
+        },
+    )
+
+    audio_column_name: str = "audio"
+    label_column_name: str = "accent"
+    samples_per_label: int = 50
+    is_cross_validation: bool = False
+
+    def dataset_transform(self):
+        # Split test into train (80%) and new test (20%)
+        import random
+
+        import numpy as np
+
+        random.seed(42)
+        dataset = self.dataset
+
+        # Function to filter out corrupted or empty audio samples
+        def is_valid_audio(example):
+            # Check if audio array exists and is not empty
+            if "audio" not in example or "array" not in example["audio"]:
+                return False
+
+            # Get the audio array
+            audio_array = example["audio"]["array"]
+
+            # Check if the array is empty or too short (very short clips crash wav2vec2's feature encoder)
+            if (
+                audio_array is None or len(audio_array) < 500
+            ):  # Minimum length to avoid kernel error
+                return False
+
+            # Check for NaN or Inf values
+            if np.isnan(audio_array).any() or np.isinf(audio_array).any():
+                return False
+
+            return True
+
+        # Filter test data to remove corrupted samples
+        print("Filtering out corrupted audio samples...")
+        test_data = dataset["test"]
+        valid_indices = []
+
+        # Find valid indices
+        for i in range(len(test_data)):
+            if is_valid_audio(test_data[i]):
+                valid_indices.append(i)
+
+        # Use only valid samples
+        test_data = test_data.select(valid_indices)
+        print(
+            f"Kept {len(valid_indices)} valid samples out of {len(dataset['test'])} total"
+        )
+
+        # Continue with the original split logic
+        indices = list(range(len(test_data)))
+        random.shuffle(indices)
+
+        split_point = int(len(indices) * 0.8)
+        train_indices = indices[:split_point]
+        test_indices = indices[split_point:]
+
+        self.dataset = {
+            "train": test_data.select(train_indices),
+            "test": test_data.select(test_indices),
+        }
+        print(
+            f"Created train split with {len(train_indices)} samples and test split with {len(test_indices)} samples"
+        )
diff --git a/mteb/tasks/Audio/AudioClassification/eng/VoxPopuliGenderID.py b/mteb/tasks/Audio/AudioClassification/eng/VoxPopuliGenderID.py
new file mode 100644
index 0000000000..afa5d84147
--- /dev/null
+++ b/mteb/tasks/Audio/AudioClassification/eng/VoxPopuliGenderID.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+from mteb.abstasks.Audio.AbsTaskAudioClassification import (
+    AbsTaskAudioClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class VoxPopuliGenderID(AbsTaskAudioClassification):
+    metadata = TaskMetadata(
+        name="VoxPopuliGenderID",
+        description="Classification of speech samples by speaker gender (male/female) from European Parliament recordings.",
+        reference="https://huggingface.co/datasets/facebook/voxpopuli",
+        dataset={
+            "path": "facebook/voxpopuli",
+            "name": "en",  # This selects the English config/subset
+            "revision": "719aaef8225945c0d80b277de6c79aa42ab053d5",
+        },
+        type="AudioClassification",
+        category="a2t",
+        eval_splits=["validation", "test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2009-01-01", "2020-12-31"),
+        domains=["Spoken", "Speech"],
+        task_subtypes=["Gender Classification"],
+        license="cc0-1.0",
+        annotations_creators="human-annotated",
+        dialect=[],
+        modalities=["audio"],
+        sample_creation="found",
+        bibtex_citation="""@inproceedings{wang-etal-2021-voxpopuli,
+            title = "{V}ox{P}opuli: A Large-Scale Multilingual Speech Corpus for Representation Learning, Semi-Supervised Learning and Interpretation",
+            author = "Wang, Changhan and
+              Riviere, Morgane and
+              Lee, Ann and
+              Wu, Anne and
+              Talnikar, Chaitanya and
+              Haziza, Daniel and
+              Williamson, Mary and
+              Pino, Juan and
+              Dupoux, Emmanuel",
+            booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)",
+            month = aug,
+            year = "2021",
+            address = "Online",
+            publisher = "Association for Computational Linguistics",
+            url = "https://aclanthology.org/2021.acl-long.80",
+            doi = "10.18653/v1/2021.acl-long.80",
+            pages = "993--1003",
+        }""",
+        descriptive_stats={
+            "n_samples": {
+                "train": 7600,
+                "validation": 1750,
+                "test": 1840,
+            },
+        },
+    )
+
+    audio_column_name: str = "audio"
+    label_column_name: str = "gender"
+    samples_per_label: int = 100
+    is_cross_validation: bool = False
diff --git a/mteb/tasks/Audio/AudioClassification/eng/VoxPopuliLanguageID.py b/mteb/tasks/Audio/AudioClassification/eng/VoxPopuliLanguageID.py
new file mode 100644
index 0000000000..6f7678305a
--- /dev/null
+++ b/mteb/tasks/Audio/AudioClassification/eng/VoxPopuliLanguageID.py
@@ -0,0 +1,110 @@
+from __future__ import annotations
+
+from mteb.abstasks.Audio.AbsTaskAudioClassification import (
+    AbsTaskAudioClassification,
+)
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class VoxPopuliLanguageID(AbsTaskAudioClassification):
+    metadata = TaskMetadata(
+        name="VoxPopuliLanguageID",
+        description="Classification of speech samples into one of 5 European languages (English, German, French, Spanish, Polish) from European Parliament recordings.",
+        reference="https://huggingface.co/datasets/facebook/voxpopuli",
+        dataset={
+            "path": "facebook/voxpopuli",
+            "name": "multilang",  # This selects the multilingual config/subset
+            "revision": "719aaef8225945c0d80b277de6c79aa42ab053d5",
+        },
+        type="AudioClassification",
+        category="a2t",
eval_splits=["test"], + eval_langs=[ + "eng-Latn", # English + "deu-Latn", # German + "fra-Latn", # French + "spa-Latn", # Spanish + "pol-Latn", # Polish + ], # Using BCP-47 format for the 5 main languages + main_score="accuracy", + date=("2009-01-01", "2020-12-31"), + domains=["Spoken", "Speech"], + task_subtypes=["Spoken Language Identification"], + license="cc0-1.0", + annotations_creators="human-annotated", + dialect=[], + modalities=["audio"], + sample_creation="found", + bibtex_citation="""@inproceedings{wang-etal-2021-voxpopuli, + title = "{V}ox{P}opuli: A Large-Scale Multilingual Speech Corpus for Representation Learning, Semi-Supervised Learning and Interpretation", + author = "Wang, Changhan and + Riviere, Morgane and + Lee, Ann and + Wu, Anne and + Talnikar, Chaitanya and + Haziza, Daniel and + Williamson, Mary and + Pino, Juan and + Dupoux, Emmanuel", + booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)", + month = aug, + year = "2021", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2021.acl-long.80", + doi = "10.18653/v1/2021.acl-long.80", + pages = "993--1003", + }""", + descriptive_stats={ + "n_samples": { + "train": 6200, # ~80% of test examples + "test": 1600, # ~20% of test examples + }, + }, + ) + + audio_column_name: str = "audio" + label_column_name: str = "language" + samples_per_label: int = 50 # For balanced training + is_cross_validation: bool = False + + def dataset_transform(self): + """Create train and test splits from the original test split.""" + import random + + random.seed(42) + + if "test" in self.dataset: + test_data = self.dataset["test"] + print( + f"Creating train/test splits from original test split with {len(test_data)} examples" + ) + + # Get all indices (all audio assumed valid) + all_indices = list(range(len(test_data))) + + # Create stratified split (balanced by language) + lang_indices = {} + for i in all_indices: + lang = test_data[i][self.label_column_name] + if lang not in lang_indices: + lang_indices[lang] = [] + lang_indices[lang].append(i) + + # Take 80% for training, 20% for testing from each language + train_indices = [] + test_indices = [] + + for lang, indices in lang_indices.items(): + # Shuffle indices for this language + shuffled = indices.copy() + random.shuffle(shuffled) + + # Split 80/20 + split_point = int(len(shuffled) * 0.8) + train_indices.extend(shuffled[:split_point]) + test_indices.extend(shuffled[split_point:]) + + # Create the splits + self.dataset["train"] = test_data.select(train_indices) + self.dataset["test"] = test_data.select(test_indices) diff --git a/mteb/tasks/Audio/Clustering/__init__.py b/mteb/tasks/Audio/Clustering/__init__.py index 68508cb87b..1a72165f2f 100644 --- a/mteb/tasks/Audio/Clustering/__init__.py +++ b/mteb/tasks/Audio/Clustering/__init__.py @@ -1,5 +1,15 @@ from __future__ import annotations +from .eng.AmbientAcousticContextClustering import * +from .eng.CREMA_DClustering import * +from .eng.ESC50Clustering import * +from .eng.GTZANGenreClustering import * +from .eng.IEMOCAPEmotionClustering import * +from .eng.IEMOCAPGenderClustering import * from .eng.MusicGenre import * +from .eng.TUTAcousticScenesClustering import * from .eng.VehicleSoundClustering import * from .eng.VoiceGender import * +from .eng.VoxCelebClustering import * +from .eng.VoxPopuliAccentClustering import * +from 
diff --git a/mteb/tasks/Audio/Clustering/__init__.py b/mteb/tasks/Audio/Clustering/__init__.py
index 68508cb87b..1a72165f2f 100644
--- a/mteb/tasks/Audio/Clustering/__init__.py
+++ b/mteb/tasks/Audio/Clustering/__init__.py
@@ -1,5 +1,15 @@
 from __future__ import annotations
 
+from .eng.AmbientAcousticContextClustering import *
+from .eng.CREMA_DClustering import *
+from .eng.ESC50Clustering import *
+from .eng.GTZANGenreClustering import *
+from .eng.IEMOCAPEmotionClustering import *
+from .eng.IEMOCAPGenderClustering import *
 from .eng.MusicGenre import *
+from .eng.TUTAcousticScenesClustering import *
 from .eng.VehicleSoundClustering import *
 from .eng.VoiceGender import *
+from .eng.VoxCelebClustering import *
+from .eng.VoxPopuliAccentClustering import *
+from .eng.VoxPopuliGenderClustering import *
diff --git a/mteb/tasks/Audio/Clustering/eng/AmbientAcousticContextClustering.py b/mteb/tasks/Audio/Clustering/eng/AmbientAcousticContextClustering.py
new file mode 100644
index 0000000000..96de25dec2
--- /dev/null
+++ b/mteb/tasks/Audio/Clustering/eng/AmbientAcousticContextClustering.py
@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+from mteb.abstasks.Audio.AbsTaskAudioClustering import AbsTaskAudioClustering
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class AmbientAcousticContextClustering(AbsTaskAudioClustering):
+    label_column_name: str = "label"
+
+    metadata = TaskMetadata(
+        name="AmbientAcousticContextClustering",
+        description="Clustering task based on a subset of the Ambient Acoustic Context dataset containing 1-second segments for workplace activities.",
+        reference="https://dl.acm.org/doi/10.1145/3379503.3403535",
+        dataset={
+            "path": "AdnanElAssadi/ambient-acoustic-context-small",
+            "revision": "360c858462b79492c6b09d5855ec4d59c87497c6",
+        },
+        type="AudioClustering",
+        category="a2a",
+        eval_splits=["train"],
+        eval_langs=["eng-Latn"],
+        main_score="cluster_accuracy",
+        date=("2020-01-01", "2020-12-31"),
+        domains=["Spoken", "Speech"],
+        task_subtypes=["Environment Sound Clustering"],
+        license="not specified",
+        annotations_creators="human-annotated",
+        dialect=[],
+        modalities=["audio"],
+        sample_creation="found",
+        bibtex_citation="""@inproceedings{10.1145/3379503.3403535,
+            author = {Park, Chunjong and Min, Chulhong and Bhattacharya, Sourav and Kawsar, Fahim},
+            title = {Augmenting Conversational Agents with Ambient Acoustic Contexts},
+            year = {2020},
+            isbn = {9781450375160},
+            publisher = {Association for Computing Machinery},
+            address = {New York, NY, USA},
+            url = {https://doi.org/10.1145/3379503.3403535},
+            doi = {10.1145/3379503.3403535},
+            booktitle = {22nd International Conference on Human-Computer Interaction with Mobile Devices and Services},
+            articleno = {33},
+            numpages = {9},
+            keywords = {Acoustic ambient context, Conversational agents},
+            location = {Oldenburg, Germany},
+            series = {MobileHCI '20}
+        }""",
+        descriptive_stats={
+            "n_samples": {
+                "train": 2387,  # ~100 samples × 24 classes
+                "test": 1036,  # ~50 samples × 24 classes
+            },
+            "n_classes": 24,
+            "sampling_rate": 16000,
+        },
+    )
diff --git a/mteb/tasks/Audio/Clustering/eng/CREMA_DClustering.py b/mteb/tasks/Audio/Clustering/eng/CREMA_DClustering.py
new file mode 100644
index 0000000000..3e083bc03b
--- /dev/null
+++ b/mteb/tasks/Audio/Clustering/eng/CREMA_DClustering.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+from mteb.abstasks.Audio.AbsTaskAudioClustering import AbsTaskAudioClustering
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class CREMA_DClustering(AbsTaskAudioClustering):
+    label_column_name: str = "label"
+    metadata = TaskMetadata(
+        name="CREMA_DClustering",
+        description="Emotion clustering task with audio data for 6 emotions: Anger, Disgust, Fear, Happy, Neutral, Sad.",
+        reference="https://huggingface.co/datasets/silky1708/CREMA-D",
+        dataset={
+            "path": "silky1708/CREMA-D",
+            "revision": "ab26a0ddbeade7c31a3208ecc043f06f9953892c",
+        },
+        type="AudioClustering",
+        category="a2a",
+        eval_splits=["train"],
+        eval_langs=["eng-Latn"],
+        main_score="cluster_accuracy",
+        date=("2014-01-01", "2014-12-31"),
+        domains=["Speech"],
+        task_subtypes=["Emotion Clustering"],
+        license="http://opendatacommons.org/licenses/odbl/1.0/",  # Open Database License
+        annotations_creators="human-annotated",
+        dialect=[],
+ modalities=["audio"], + sample_creation="created", + bibtex_citation="""@article{cao2014crema, + title={Crema-d: Crowd-sourced emotional multimodal actors dataset}, + author={Cao, Houwei and Cooper, David G and Keutmann, Michael K and Gur, Ruben C and Nenkova, Ani and Verma, Ragini}, + journal={IEEE transactions on affective computing}, + volume={5}, + number={4}, + pages={377--390}, + year={2014}, + publisher={IEEE} + }""", + ) diff --git a/mteb/tasks/Audio/Clustering/eng/ESC50Clustering.py b/mteb/tasks/Audio/Clustering/eng/ESC50Clustering.py new file mode 100644 index 0000000000..edd13e0f50 --- /dev/null +++ b/mteb/tasks/Audio/Clustering/eng/ESC50Clustering.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +from mteb.abstasks.Audio.AbsTaskAudioClustering import AbsTaskAudioClustering +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class ESC50Clustering(AbsTaskAudioClustering): + label_column_name: str = "target" + metadata = TaskMetadata( + name="ESC50Clustering", + description="Clustering task based on the Environmental Sound Classification Dataset with 50 classes.", + reference="https://huggingface.co/datasets/ashraq/esc50", + dataset={ + "path": "ashraq/esc50", + "revision": "e3e2a63ffff66b9a9735524551e3818e96af03ee", + }, + type="AudioClustering", + category="a2a", + eval_splits=["train"], + eval_langs=["eng-Latn"], + main_score="cluster_accuracy", + date=("2023-01-07", "2023-01-07"), + domains=["Spoken", "Speech"], + task_subtypes=["Environment Sound Clustering"], + license="cc-by-nc-sa-3.0", + annotations_creators="human-annotated", + dialect=[], + modalities=["audio"], + sample_creation="found", + bibtex_citation="""@inproceedings{piczak2015dataset, + title = {{ESC}: {Dataset} for {Environmental Sound Classification}}, + author = {Piczak, Karol J.}, + booktitle = {Proceedings of the 23rd {Annual ACM Conference} on {Multimedia}}, + date = {2015-10-13}, + url = {http://dl.acm.org/citation.cfm?doid=2733373.2806390}, + doi = {10.1145/2733373.2806390}, + location = {{Brisbane, Australia}}, + isbn = {978-1-4503-3459-4}, + publisher = {{ACM Press}}, + pages = {1015--1018} + }""", + ) diff --git a/mteb/tasks/Audio/Clustering/eng/GTZANGenreClustering.py b/mteb/tasks/Audio/Clustering/eng/GTZANGenreClustering.py new file mode 100644 index 0000000000..b8819cfb2a --- /dev/null +++ b/mteb/tasks/Audio/Clustering/eng/GTZANGenreClustering.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +from mteb.abstasks.Audio.AbsTaskAudioClustering import AbsTaskAudioClustering +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class GTZANGenreClustering(AbsTaskAudioClustering): + label_column_name: str = "label" + metadata = TaskMetadata( + name="GTZANGenreClustering", + description="Music genre clustering task based on GTZAN dataset with 10 music genres.", + reference="https://huggingface.co/datasets/silky1708/GTZAN-Genre", + dataset={ + "path": "silky1708/GTZAN-Genre", + "revision": "5efdda59d0d185bfe17ada9b54d233349d0e0168", + }, + type="AudioClustering", + category="a2a", + eval_splits=["train"], + eval_langs=["eng-Latn"], + main_score="cluster_accuracy", + date=("2000-01-01", "2001-12-31"), + domains=["Music"], + task_subtypes=["Music Clustering"], + license="not specified", + annotations_creators="human-annotated", + dialect=[], + modalities=["audio"], + sample_creation="found", + bibtex_citation="""@ARTICLE{1021072, + author={Tzanetakis, G. 
+            journal={IEEE Transactions on Speech and Audio Processing},
+            title={Musical genre classification of audio signals},
+            year={2002},
+            volume={10},
+            number={5},
+            pages={293-302},
+            keywords={Humans;Music information retrieval;Instruments;Computer science;Multiple signal classification;Signal analysis;Pattern recognition;Feature extraction;Wavelet analysis;Cultural differences},
+            doi={10.1109/TSA.2002.800560}}""",
+    )
diff --git a/mteb/tasks/Audio/Clustering/eng/IEMOCAPEmotionClustering.py b/mteb/tasks/Audio/Clustering/eng/IEMOCAPEmotionClustering.py
new file mode 100644
index 0000000000..cb21fbb39a
--- /dev/null
+++ b/mteb/tasks/Audio/Clustering/eng/IEMOCAPEmotionClustering.py
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+from mteb.abstasks.Audio.AbsTaskAudioClustering import AbsTaskAudioClustering
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class IEMOCAPEmotionClustering(AbsTaskAudioClustering):
+    label_column_name: str = "emotion"
+
+    metadata = TaskMetadata(
+        name="IEMOCAPEmotionClustering",
+        description="Clustering speech samples by emotion from interactive emotional dyadic conversations in the IEMOCAP database.",
+        reference="https://doi.org/10.1007/s10579-008-9076-6",
+        dataset={
+            "path": "AbstractTTS/IEMOCAP",
+            "revision": "9f1696a135a65ce997d898d4121c952269a822ca",
+        },
+        type="AudioClustering",
+        category="a2a",
+        eval_splits=["train"],
+        eval_langs=["eng-Latn"],
+        main_score="cluster_accuracy",
+        date=("2008-01-01", "2008-12-31"),
+        domains=["Spoken", "Speech"],
+        task_subtypes=["Emotion Clustering"],
+        license="cc-by-nc-sa-3.0",
+        annotations_creators="expert-annotated",
+        dialect=[],
+        modalities=["audio"],
+        sample_creation="created",
+        bibtex_citation="""@article{busso2008iemocap,
+            title={IEMOCAP: Interactive emotional dyadic motion capture database},
+            author={Busso, Carlos and Bulut, Murtaza and Lee, Chi-Chun and Kazemzadeh, Abe and Mower, Emily and Kim, Samuel and Chang, Jeannette N and Lee, Sungbok and Narayanan, Shrikanth S},
+            journal={Language resources and evaluation},
+            volume={42},
+            number={4},
+            pages={335--359},
+            year={2008},
+            publisher={Springer}
+        }""",
+        descriptive_stats={
+            "n_samples": {"train": 10039},  # Approximate after subsampling
+        },
+    )
+
+    audio_column_name: str = "audio"
+
+    def dataset_transform(self):
+        # Define emotion labels and their mapping to indices
+        labels = [
+            "angry",  # 0
+            "sad",  # 1
+            "happy",  # 2
+            "neutral",  # 3
+            "frustrated",  # 4
+            "excited",  # 5
+            "fear",  # 6
+            "surprise",  # 7
+            "disgust",  # 8
+            "other",  # 9
+        ]
+        label2id = {emotion: idx for idx, emotion in enumerate(labels)}
+
+        # Basic filtering to ensure we have valid emotion labels
+        for split in self.dataset:
+            # First ensure we have valid emotion labels and normalize case
+            self.dataset[split] = self.dataset[split].filter(
+                lambda example: example["major_emotion"] is not None
+                and example["major_emotion"] != ""
+            )
+
+            # Map to indices with case normalization for reliability
+            self.dataset[split] = self.dataset[split].map(
+                lambda example: {
+                    "emotion_id": label2id.get(example["major_emotion"].lower(), -1)
+                }
+            )
+
+            # Filter out any examples with unknown emotions
+            self.dataset[split] = self.dataset[split].filter(
+                lambda example: example["emotion_id"] != -1
+            )
+
+            # Use numeric ID as the label
+            self.dataset[split] = self.dataset[split].rename_column(
+                "emotion_id", self.label_column_name
+            )
diff --git a/mteb/tasks/Audio/Clustering/eng/IEMOCAPGenderClustering.py b/mteb/tasks/Audio/Clustering/eng/IEMOCAPGenderClustering.py
new file mode 100644
index 0000000000..3124328aa4
--- /dev/null
+++ b/mteb/tasks/Audio/Clustering/eng/IEMOCAPGenderClustering.py
@@ -0,0 +1,60 @@
+from __future__ import annotations
+
+from mteb.abstasks.Audio.AbsTaskAudioClustering import AbsTaskAudioClustering
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class IEMOCAPGenderClustering(AbsTaskAudioClustering):
+    label_column_name: str = "gender_id"
+
+    metadata = TaskMetadata(
+        name="IEMOCAPGenderClustering",
+        description="Clustering speech samples by speaker gender (male/female) from the IEMOCAP database of interactive emotional dyadic conversations.",
+        reference="https://doi.org/10.1007/s10579-008-9076-6",
+        dataset={
+            "path": "AbstractTTS/IEMOCAP",
+            "revision": "9f1696a135a65ce997d898d4121c952269a822ca",  # Latest commit as of writing
+        },
+        type="AudioClustering",
+        category="a2a",
+        eval_splits=["train"],
+        eval_langs=["eng-Latn"],
+        main_score="cluster_accuracy",
+        date=("2008-01-01", "2008-12-31"),
+        domains=["Spoken", "Speech"],
+        task_subtypes=["Gender Clustering"],
+        license="cc-by-nc-sa-3.0",
+        annotations_creators="expert-annotated",
+        dialect=[],
+        modalities=["audio"],
+        sample_creation="created",
+        bibtex_citation="""@article{busso2008iemocap,
+            title={IEMOCAP: Interactive emotional dyadic motion capture database},
+            author={Busso, Carlos and Bulut, Murtaza and Lee, Chi-Chun and Kazemzadeh, Abe and Mower, Emily and Kim, Samuel and Chang, Jeannette N and Lee, Sungbok and Narayanan, Shrikanth S},
+            journal={Language resources and evaluation},
+            volume={42},
+            number={4},
+            pages={335--359},
+            year={2008},
+            publisher={Springer}
+        }""",
+        descriptive_stats={
+            "n_samples": {"train": 10039},  # Approximate
+        },
+    )
+
+    audio_column_name: str = "audio"
+
+    def dataset_transform(self):
+        # Define label mapping
+        label2id = {"Female": 0, "Male": 1}
+
+        # Apply transformation to all dataset splits
+        for split in self.dataset:
+            # Define transform function to add numeric labels
+            def add_gender_id(example):
+                example["gender_id"] = label2id[example["gender"]]
+                return example
+
+            print(f"Converting gender labels to numeric IDs for split '{split}'...")
+            self.dataset[split] = self.dataset[split].map(add_gender_id)
diff --git a/mteb/tasks/Audio/Clustering/eng/TUTAcousticScenesClustering.py b/mteb/tasks/Audio/Clustering/eng/TUTAcousticScenesClustering.py
new file mode 100644
index 0000000000..6d87063b49
--- /dev/null
+++ b/mteb/tasks/Audio/Clustering/eng/TUTAcousticScenesClustering.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+from mteb.abstasks.Audio.AbsTaskAudioClustering import AbsTaskAudioClustering
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class TUTAcousticScenesClustering(AbsTaskAudioClustering):
+    label_column_name: str = "scene_id"
+
+    metadata = TaskMetadata(
+        name="TUTAcousticScenesClustering",
+        description="Clustering task based on the TUT Urban Acoustic Scenes 2018 dataset with 10 different acoustic scenes.",
+        reference="https://zenodo.org/record/1228142",
+        dataset={
+            "path": "wetdog/TUT-urban-acoustic-scenes-2018-development",
+            "revision": "583b181ea2666eb28d10909784690009f6c9da9d",
+        },
+        type="AudioClustering",
+        category="a2a",
+        eval_splits=["train"],
+        eval_langs=["eng-Latn"],
+        main_score="cluster_accuracy",
+        date=("2018-01-01", "2018-12-31"),
+        domains=["Spoken", "Speech"],
+        task_subtypes=["Environment Sound Clustering"],
+        license="cc-by-4.0",
+        annotations_creators="expert-annotated",
+        dialect=[],
modalities=["audio"], + sample_creation="found", + bibtex_citation="""@inproceedings{Mesaros2018_DCASE, + author = {Mesaros, Annamaria and Heittola, Toni and Virtanen, Tuomas}, + title = {A multi-device dataset for urban acoustic scene classification}, + booktitle = {Proceedings of the Detection and Classification of Acoustic Scenes and Events 2018 Workshop (DCASE2018)}, + year = {2018}, + pages = {9--13}, + publisher = {Tampere University of Technology}, + address = {Tampere, Finland} + }""", + descriptive_stats={ + "n_samples": {"train": 8640}, # Based on provided stats + "n_classes": 10, + "classes": [ + "airport", + "bus", + "metro", + "metro_station", + "park", + "public_square", + "shopping_mall", + "street_pedestrian", + "street_traffic", + "tram", + ], + }, + ) + + def dataset_transform(self): + """Apply transformations to the dataset to map scene labels to numeric IDs. + This adds a 'scene_id' column containing the numeric ID for each scene. + """ + # Define mappings between scene labels and IDs + SCENE_TO_ID = { + "airport": 0, + "bus": 1, + "metro": 2, + "metro_station": 3, + "park": 4, + "public_square": 5, + "shopping_mall": 6, + "street_pedestrian": 7, + "street_traffic": 8, + "tram": 9, + } + + # Define a mapping function to add scene_id + def add_scene_id(example): + example[self.label_column_name] = SCENE_TO_ID.get( + example["scene_label"], -1 + ) + return example + + # Apply transformation to all dataset splits + for split in self.dataset: + print(f"Converting scene labels to numeric IDs for split '{split}'...") + self.dataset[split] = self.dataset[split].map(add_scene_id) diff --git a/mteb/tasks/Audio/Clustering/eng/VoxCelebClustering.py b/mteb/tasks/Audio/Clustering/eng/VoxCelebClustering.py new file mode 100644 index 0000000000..a5bf7da69c --- /dev/null +++ b/mteb/tasks/Audio/Clustering/eng/VoxCelebClustering.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +from mteb.abstasks.Audio.AbsTaskAudioClustering import AbsTaskAudioClustering +from mteb.abstasks.TaskMetadata import TaskMetadata + +# ASSUMED VOXCELEB IN CLASSIFICATION TASK WAS ACCURATE. + + +class VoxCelebClustering(AbsTaskAudioClustering): + label_column_name: str = "label_id" + metadata = TaskMetadata( + name="VoxCelebClustering", + description="Clustering task based on the VoxCeleb dataset for sentiment analysis, clustering by positive/negative sentiment.", + reference="https://huggingface.co/datasets/DynamicSuperb/Sentiment_Analysis_SLUE-VoxCeleb", + dataset={ + "path": "DynamicSuperb/Sentiment_Analysis_SLUE-VoxCeleb", + "revision": "554ad4367e98b7c6f4d4d9756dc6bbdf345e042e", + }, + type="AudioClustering", + category="a2a", + eval_splits=["train"], + eval_langs=["eng-Latn"], + main_score="cluster_accuracy", + date=("2024-06-27", "2024-06-28"), + domains=["Spoken", "Speech"], + task_subtypes=["Sentiment Clustering"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + modalities=["audio"], + sample_creation="found", + bibtex_citation="""@misc{shon2022sluenewbenchmarktasks, + title={SLUE: New Benchmark Tasks for Spoken Language Understanding Evaluation on Natural Speech}, + author={Suwon Shon and Ankita Pasad and Felix Wu and Pablo Brusco and Yoav Artzi and Karen Livescu and Kyu J. 
+            year={2022},
+            eprint={2111.10367},
+            archivePrefix={arXiv},
+            primaryClass={cs.CL},
+            url={https://arxiv.org/abs/2111.10367},
+        }""",
+    )
+
+    def dataset_transform(self):
+        # Work on the loaded dataset dict
+        ds = self.dataset
+        # Remove 'Disagreement' samples and '' samples
+        ds = ds.filter(lambda x: x["label"] not in ["Disagreement", ""])
+        # Map string sentiment labels to numeric IDs
+        label2id = {"Negative": 0, "Neutral": 1, "Positive": 2}
+
+        def add_label_id(example):
+            example["label_id"] = label2id[example["label"]]
+            return example
+
+        ds = ds.map(add_label_id)
+        # Use the processed test split as the train split expected by the task
+        self.dataset["train"] = ds.pop("test")
+        self.label_column_name = "label_id"
diff --git a/mteb/tasks/Audio/Clustering/eng/VoxPopuliAccentClustering.py b/mteb/tasks/Audio/Clustering/eng/VoxPopuliAccentClustering.py
new file mode 100644
index 0000000000..07d04ab689
--- /dev/null
+++ b/mteb/tasks/Audio/Clustering/eng/VoxPopuliAccentClustering.py
@@ -0,0 +1,146 @@
+from __future__ import annotations
+
+from mteb.abstasks.Audio.AbsTaskAudioClustering import AbsTaskAudioClustering
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class VoxPopuliAccentClustering(AbsTaskAudioClustering):
+    label_column_name: str = "accent_id"
+
+    metadata = TaskMetadata(
+        name="VoxPopuliAccentClustering",
+        description="Clustering English speech samples by non-native accent from European Parliament recordings.",
+        reference="https://huggingface.co/datasets/facebook/voxpopuli",
+        dataset={
+            "path": "facebook/voxpopuli",
+            "name": "en_accented",  # This explicitly selects the accented English config
+            "revision": "719aaef8225945c0d80b277de6c79aa42ab053d5",
+        },
+        type="AudioClustering",
+        category="a2a",
+        eval_splits=["test"],  # Only test split is available for accented English
+        eval_langs=["eng-Latn"],  # Using BCP-47 format
+        main_score="cluster_accuracy",
+        date=("2009-01-01", "2020-12-31"),
+        domains=["Spoken", "Speech"],
+        task_subtypes=["Accent Clustering"],
+        license="cc0-1.0",
+        annotations_creators="human-annotated",
+        dialect=[],
+        modalities=["audio"],
+        sample_creation="found",
+        bibtex_citation="""@inproceedings{wang-etal-2021-voxpopuli,
+            title = "{V}ox{P}opuli: A Large-Scale Multilingual Speech Corpus for Representation Learning, Semi-Supervised Learning and Interpretation",
+            author = "Wang, Changhan and
+              Riviere, Morgane and
+              Lee, Ann and
+              Wu, Anne and
+              Talnikar, Chaitanya and
+              Haziza, Daniel and
+              Williamson, Mary and
+              Pino, Juan and
+              Dupoux, Emmanuel",
+            booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)",
+            month = aug,
+            year = "2021",
+            address = "Online",
+            publisher = "Association for Computational Linguistics",
+            url = "https://aclanthology.org/2021.acl-long.80",
+            doi = "10.18653/v1/2021.acl-long.80",
+            pages = "993--1003",
+        }""",
+        descriptive_stats={
+            "n_samples": {"test": 6900},
+        },
+    )
+
+    audio_column_name: str = "audio"
+
+    def dataset_transform(self):
+        # Split test into train (80%) and new test (20%)
+        import random
+
+        import numpy as np
+
+        random.seed(42)
+        dataset = self.dataset
+
+        # Function to filter out corrupted or empty audio samples
+        def is_valid_audio(example):
+            # Check if audio array exists and is not empty
+            if "audio" not in example or "array" not in example["audio"]:
+                return False
+
+            # Get the audio array
+            audio_array = example["audio"]["array"]
+
+            # Check if the array is empty or too short (very short clips crash wav2vec2's feature encoder)
+            if (
+                audio_array is None or len(audio_array) < 500
+            ):  # Minimum length to avoid kernel error
+                return False
+
+            # Check for NaN or Inf values
+            if np.isnan(audio_array).any() or np.isinf(audio_array).any():
+                return False
+
+            return True
+
+        # Filter test data to remove corrupted samples
+        print("Filtering out corrupted audio samples...")
+        test_data = dataset["test"]
+        valid_indices = []
+
+        # Find valid indices
+        for i in range(len(test_data)):
+            if is_valid_audio(test_data[i]):
+                valid_indices.append(i)
+
+        # Use only valid samples
+        test_data = test_data.select(valid_indices)
+        print(
+            f"Kept {len(valid_indices)} valid samples out of {len(dataset['test'])} total"
+        )
+
+        # Map accent codes to numeric IDs for clustering
+        accent2id = {
+            "en_nl": 0,  # Dutch
+            "en_de": 1,  # German
+            "en_cs": 2,  # Czech
+            "en_pl": 3,  # Polish
+            "en_fr": 4,  # French
+            "en_hu": 5,  # Hungarian
+            "en_fi": 6,  # Finnish
+            "en_ro": 7,  # Romanian
+            "en_sk": 8,  # Slovak
+            "en_es": 9,  # Spanish
+            "en_it": 10,  # Italian
+            "en_et": 11,  # Estonian
+            "en_lt": 12,  # Lithuanian
+            "en_hr": 13,  # Croatian
+            "en_sl": 14,  # Slovene
+        }
+
+        # Add accent_id based on accent code
+        def add_accent_id(example):
+            example["accent_id"] = accent2id[example["accent"]]
+            return example
+
+        test_data = test_data.map(add_accent_id)
+        print(f"Mapped {len(accent2id)} accent codes to numeric IDs")
+
+        # Continue with the original split logic
+        indices = list(range(len(test_data)))
+        random.shuffle(indices)
+
+        split_point = int(len(indices) * 0.8)
+        train_indices = indices[:split_point]
+        test_indices = indices[split_point:]
+
+        self.dataset = {
+            "train": test_data.select(train_indices),
+            "test": test_data.select(test_indices),
+        }
+        print(
+            f"Created train split with {len(train_indices)} samples and test split with {len(test_indices)} samples"
+        )
diff --git a/mteb/tasks/Audio/Clustering/eng/VoxPopuliGenderClustering.py b/mteb/tasks/Audio/Clustering/eng/VoxPopuliGenderClustering.py
new file mode 100644
index 0000000000..27edd741d5
--- /dev/null
+++ b/mteb/tasks/Audio/Clustering/eng/VoxPopuliGenderClustering.py
@@ -0,0 +1,75 @@
+from __future__ import annotations
+
+from mteb.abstasks.Audio.AbsTaskAudioClustering import AbsTaskAudioClustering
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class VoxPopuliGenderClustering(AbsTaskAudioClustering):
+    label_column_name: str = "gender_id"
+
+    metadata = TaskMetadata(
+        name="VoxPopuliGenderClustering",
+        description="Clustering speech samples by speaker gender (male/female) from European Parliament recordings.",
+        reference="https://huggingface.co/datasets/facebook/voxpopuli",
+        dataset={
+            "path": "facebook/voxpopuli",
+            "name": "en",  # This selects the English config
+            "revision": "719aaef8225945c0d80b277de6c79aa42ab053d5",
+        },
+        type="AudioClustering",
+        category="a2a",
+        eval_splits=["validation", "test"],
+        eval_langs=["eng-Latn"],  # Focus on one language for clustering
+        main_score="cluster_accuracy",
+        date=("2009-01-01", "2020-12-31"),
+        domains=["Spoken", "Speech"],
+        task_subtypes=["Gender Clustering"],
+        license="cc0-1.0",
+        annotations_creators="human-annotated",
+        dialect=[],
+        modalities=["audio"],
+        sample_creation="found",
+        bibtex_citation="""@inproceedings{wang-etal-2021-voxpopuli,
+            title = "{V}ox{P}opuli: A Large-Scale Multilingual Speech Corpus for Representation Learning, Semi-Supervised Learning and Interpretation",
+            author = "Wang, Changhan and
+              Riviere, Morgane and
+              Lee, Ann and
+              Wu, Anne and
+              Talnikar, Chaitanya and
+              Haziza, Daniel and
+              Williamson, Mary and
+              Pino, Juan and
+              Dupoux, Emmanuel",
+            booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)",
+            month = aug,
+            year = "2021",
+            address = "Online",
+            publisher = "Association for Computational Linguistics",
+            url = "https://aclanthology.org/2021.acl-long.80",
+            doi = "10.18653/v1/2021.acl-long.80",
+            pages = "993--1003",
+        }""",
+        descriptive_stats={
+            "n_samples": {
+                "train": 7600,
+                "validation": 1750,
+                "test": 1840,
+            },
+        },
+    )
+
+    audio_column_name: str = "audio"
+
+    def dataset_transform(self):
+        # Define label mapping
+        label2id = {"female": 0, "male": 1}
+
+        # Apply transformation to all dataset splits
+        for split in self.dataset:
+            # Define transform function to add numeric labels
+            def add_gender_id(example):
+                example["gender_id"] = label2id[example["gender"]]
+                return example
+
+            print(f"Converting gender labels to numeric IDs for split '{split}'...")
+            self.dataset[split] = self.dataset[split].map(add_gender_id)
diff --git a/mteb/tasks/Audio/Clustering/eng/__init__.py b/mteb/tasks/Audio/Clustering/eng/__init__.py
index e69de29bb2..ae36395892 100644
--- a/mteb/tasks/Audio/Clustering/eng/__init__.py
+++ b/mteb/tasks/Audio/Clustering/eng/__init__.py
@@ -0,0 +1,23 @@
+from __future__ import annotations
+
+from mteb.tasks.Audio.Clustering.eng.AmbientAcousticContextClustering import (
+    AmbientAcousticContextClustering,
+)
+from mteb.tasks.Audio.Clustering.eng.ESC50Clustering import ESC50Clustering
+from mteb.tasks.Audio.Clustering.eng.MusicGenre import MusicGenreClustering
+from mteb.tasks.Audio.Clustering.eng.TUTAcousticScenesClustering import (
+    TUTAcousticScenesClustering,
+)
+from mteb.tasks.Audio.Clustering.eng.VehicleSoundClustering import (
+    VehicleSoundClustering,
+)
+from mteb.tasks.Audio.Clustering.eng.VoiceGender import VoiceGenderClustering
+
+__all__ = [
+    "ESC50Clustering",
+    "TUTAcousticScenesClustering",
+    "AmbientAcousticContextClustering",
+    "MusicGenreClustering",
+    "VehicleSoundClustering",
+    "VoiceGenderClustering",
+]
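Once merged, the new tasks are discoverable through the registry by their metadata names. A minimal usage sketch, assuming this fork keeps the upstream mteb entry points (get_tasks, get_model, MTEB); the model name is purely illustrative:

    import mteb

    # Look up one of the tasks registered in this diff by its metadata name.
    tasks = mteb.get_tasks(tasks=["SpeechCommands"])

    # Any audio-capable model wrapper registered in this fork would do here.
    model = mteb.get_model("facebook/wav2vec2-base")

    evaluation = mteb.MTEB(tasks=tasks)
    results = evaluation.run(model, output_folder="results")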