Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 26 additions & 18 deletions mteb/tasks/Audio/AudioMultilabelClassification/eng/FSD50HF.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

class FSD50HFMultilingualClassification(AbsTaskAudioMultilabelClassification):
metadata = TaskMetadata(
name="FSD50HF",
name="FSD50K",
description="Multilabel Audio Classification.",
reference="https://huggingface.co/datasets/Chand0320/fsd50k_hf",
dataset={
Expand All @@ -31,26 +31,34 @@ class FSD50HFMultilingualClassification(AbsTaskAudioMultilabelClassification):
dialect=[],
modalities=["audio"],
sample_creation="found",
bibtex_citation="""@dataset{eduardo_fonseca_2020_3612637,
author = {Eduardo Fonseca and
Manoj Plakal and
Frederic Font and
Daniel P. W. Ellis and
Xavier Serra},
title = {FSDKaggle2019},
month = jan,
year = 2020,
publisher = {Zenodo},
version = {1.0},
doi = {10.5281/zenodo.3612637},
url = {https://doi.org/10.5281/zenodo.3612637},
}
bibtex_citation="""@ARTICLE{9645159,
author={Fonseca, Eduardo and Favory, Xavier and Pons, Jordi and Font, Frederic and Serra, Xavier},
journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
title={FSD50K: An Open Dataset of Human-Labeled Sound Events},
year={2022},
volume={30},
number={},
pages={829-852},
keywords={Videos;Task analysis;Labeling;Vocabulary;Speech recognition;Ontologies;Benchmark testing;Audio dataset;sound event;recognition;classification;tagging;data collection;environmental sound},
doi={10.1109/TASLP.2021.3133208}}
""",
descriptive_stats={
"n_samples": {"test": 8961},
},
)

audio_column_name: str = "audio"
label_column_name: str = "labels"
samples_per_label: int = 8

def dataset_transform(self):
# labels column is a string of comma separated labels, this function converts it to a list of labels
self.dataset = self.dataset.map(
lambda x: {
self.label_column_name: x[self.label_column_name].split(","),
}
)
self.dataset = self.stratified_subsampling(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think you should add stratified subsampling

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are 5k test samples, which is more than the 2048 limit mentioned in the PR description, so I added it. I can remove the subsampling if you prefer.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for following the PR template. It's fine for the subsampling to stay.

self.dataset,
seed=self.seed,
splits=self.eval_splits,
label=self.label_column_name,
n_samples=2048,
)