From 8b2a4e6333c5f023edb6582cc86b075f3af3b166 Mon Sep 17 00:00:00 2001
From: Animesh Jha
Date: Fri, 14 Mar 2025 22:49:51 -0700
Subject: [PATCH] fix FSD-50K task

---
Note: this copy was recovered from a whitespace-collapsed paste. Line breaks
follow the hunk headers and the diffstat; leading indentation and blank
context lines are inferred and should be checked against the target file
before applying.

 .../eng/FSD50HF.py | 44 +++++++++++--------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/mteb/tasks/Audio/AudioMultilabelClassification/eng/FSD50HF.py b/mteb/tasks/Audio/AudioMultilabelClassification/eng/FSD50HF.py
index 4f5416a1b8..337ce2325a 100644
--- a/mteb/tasks/Audio/AudioMultilabelClassification/eng/FSD50HF.py
+++ b/mteb/tasks/Audio/AudioMultilabelClassification/eng/FSD50HF.py
@@ -8,7 +8,7 @@
 
 class FSD50HFMultilingualClassification(AbsTaskAudioMultilabelClassification):
     metadata = TaskMetadata(
-        name="FSD50HF",
+        name="FSD50K",
         description="Multilabel Audio Classification.",
         reference="https://huggingface.co/datasets/Chand0320/fsd50k_hf",
         dataset={
@@ -31,26 +31,34 @@ class FSD50HFMultilingualClassification(AbsTaskAudioMultilabelClassification):
         dialect=[],
         modalities=["audio"],
         sample_creation="found",
-        bibtex_citation="""@dataset{eduardo_fonseca_2020_3612637,
-  author = {Eduardo Fonseca and
-            Manoj Plakal and
-            Frederic Font and
-            Daniel P. W. Ellis and
-            Xavier Serra},
-  title = {FSDKaggle2019},
-  month = jan,
-  year = 2020,
-  publisher = {Zenodo},
-  version = {1.0},
-  doi = {10.5281/zenodo.3612637},
-  url = {https://doi.org/10.5281/zenodo.3612637},
-  }
+        bibtex_citation="""@ARTICLE{9645159,
+  author={Fonseca, Eduardo and Favory, Xavier and Pons, Jordi and Font, Frederic and Serra, Xavier},
+  journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
+  title={FSD50K: An Open Dataset of Human-Labeled Sound Events},
+  year={2022},
+  volume={30},
+  number={},
+  pages={829-852},
+  keywords={Videos;Task analysis;Labeling;Vocabulary;Speech recognition;Ontologies;Benchmark testing;Audio dataset;sound event;recognition;classification;tagging;data collection;environmental sound},
+  doi={10.1109/TASLP.2021.3133208}}
         """,
-        descriptive_stats={
-            "n_samples": {"test": 8961},
-        },
     )
 
     audio_column_name: str = "audio"
     label_column_name: str = "labels"
     samples_per_label: int = 8
+
+    def dataset_transform(self):
+        # labels column is a string of comma separated labels, this function converts it to a list of labels
+        self.dataset = self.dataset.map(
+            lambda x: {
+                self.label_column_name: x[self.label_column_name].split(","),
+            }
+        )
+        self.dataset = self.stratified_subsampling(
+            self.dataset,
+            seed=self.seed,
+            splits=self.eval_splits,
+            label=self.label_column_name,
+            n_samples=2048,
+        )