diff --git a/mteb/tasks/Audio/AudioMultilabelClassification/__init__.py b/mteb/tasks/Audio/AudioMultilabelClassification/__init__.py index 5e7d7786ce..91b9c8836c 100644 --- a/mteb/tasks/Audio/AudioMultilabelClassification/__init__.py +++ b/mteb/tasks/Audio/AudioMultilabelClassification/__init__.py @@ -1,5 +1,6 @@ from __future__ import annotations +from .eng.AudioSet import * from .eng.BirdSet import * from .eng.FSD50HF import * from .eng.FSD2019Kaggle import * diff --git a/mteb/tasks/Audio/AudioMultilabelClassification/eng/AudioSet.py b/mteb/tasks/Audio/AudioMultilabelClassification/eng/AudioSet.py new file mode 100644 index 0000000000..1afd67780d --- /dev/null +++ b/mteb/tasks/Audio/AudioMultilabelClassification/eng/AudioSet.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +from mteb.abstasks.Audio.AbsTaskAudioMultilabelClassification import ( + AbsTaskAudioMultilabelClassification, +) +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class AudioSetMultilabelClassification(AbsTaskAudioMultilabelClassification): + superseded_by = "AudioSetMini" + metadata = TaskMetadata( + name="AudioSet", + description="AudioSet consists of an expanding ontology of 632 audio event classes and a collection of 2,084,320 human-labeled 10-second sound clips drawn from YouTube videos.", + reference="https://huggingface.co/datasets/agkphysics/AudioSet", + dataset={ + "path": "agkphysics/AudioSet", + "revision": "5a2fa42a1506470d275a47ff8e1fdac5b364e6ef", + }, + type="AudioMultilabelClassification", + category="a2t", + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="lrap", + date=( + "2016-01-01", + "2017-01-30", + ), + domains=["Web", "Music", "Speech", "Scene"], + task_subtypes=[ + "Environment Sound Classification", + "Music Instrument Recognition", + "Vocal Sound Classification", + "Gunshot Audio Classification", + ], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + modalities=["audio"], + sample_creation="found", + 
bibtex_citation=r""" +@inproceedings{45857, + address = {New Orleans, LA}, + author = {Jort F. Gemmeke and Daniel P. W. Ellis and Dylan Freedman and Aren Jansen and Wade Lawrence and R. Channing Moore and Manoj Plakal and Marvin Ritter}, + booktitle = {Proc. IEEE ICASSP 2017}, + title = {Audio Set: An ontology and human-labeled dataset for audio events}, + year = {2017}, +} +""", + ) + + audio_column_name: str = "audio" + label_column_name: str = "human_labels" + + +# Sampled using scripts/data/audioset/create_data.ipynb +class AudioSetMiniMultilabelClassification(AbsTaskAudioMultilabelClassification): + metadata = TaskMetadata( + name="AudioSetMini", + description="AudioSet consists of an expanding ontology of 632 audio event classes and a collection of 2,084,320 human-labeled 10-second sound clips drawn from YouTube videos. This is a mini version that is sampled from the original dataset.", + reference="https://huggingface.co/datasets/agkphysics/AudioSet", + dataset={ + "path": "mteb/audioset", + "revision": "168a7e681ee40609129535d49855c7e3e77e5efa", + }, + type="AudioMultilabelClassification", + category="a2t", + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="lrap", + date=( + "2016-01-01", + "2017-01-30", + ), + domains=["Web", "Music", "Speech", "Scene"], + task_subtypes=[ + "Environment Sound Classification", + "Music Instrument Recognition", + "Vocal Sound Classification", + "Gunshot Audio Classification", + ], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + modalities=["audio"], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{45857, + address = {New Orleans, LA}, + author = {Jort F. Gemmeke and Daniel P. W. Ellis and Dylan Freedman and Aren Jansen and Wade Lawrence and R. Channing Moore and Manoj Plakal and Marvin Ritter}, + booktitle = {Proc. 
IEEE ICASSP 2017}, + title = {Audio Set: An ontology and human-labeled dataset for audio events}, + year = {2017}, +} +""", + ) + + audio_column_name: str = "audio" + label_column_name: str = "human_labels" diff --git a/scripts/create_dataset_citations_bib.py b/scripts/create_dataset_citations_bib.py index 9086117536..131d29ad33 100644 --- a/scripts/create_dataset_citations_bib.py +++ b/scripts/create_dataset_citations_bib.py @@ -49,8 +49,7 @@ def create_citations_table(tasks: list[mteb.AbsTask]) -> str: \\setlength\\extrarowheight{7pt} \\begin{longtable}{L{3.5cm}|L{3.0cm}L{1.4cm}L{1.4cm}L{1.4cm}L{1.4cm}L{1.0cm}L{1.0cm}} \\toprule -\\textbf{Dataset} & \\textbf{N. Langs} & \\textbf{Type} & \\textbf{Category} & \\textbf{Domains} & \\textbf{N. Docs} & -\\textbf{Avg. Length} \\\\ +\\textbf{Dataset} & \\textbf{N. Langs} & \\textbf{Type} & \\textbf{Category} & \\textbf{Domains} & \\textbf{N. Docs} \\\\ \\midrule \\endhead \\\\""" for task in tasks: @@ -75,14 +74,6 @@ def task_to_tex_row(task: mteb.AbsTask) -> str: else "" ) - avg_character_length = ( - "{:.2f}".format( - sum(task.metadata.avg_character_length.values()) - / len(task.metadata.avg_character_length.keys()) - ) - if task.metadata.avg_character_length - else "" - ) library = bibtexparser.parse_string(task.metadata.bibtex_citation) try: cite_key = library.entries[0].key @@ -97,11 +88,11 @@ def task_to_tex_row(task: mteb.AbsTask) -> str: ) lang = lang.replace("'", "") - return f"{name}{cite_key} & {lang} & {task.metadata.type} & {task.metadata.category} & {domains[1:-1]} & {n_samples} & {avg_character_length} \\\\" + return f"{name}{cite_key} & {lang} & {task.metadata.type} & {task.metadata.category} & {domains[1:-1]} & {n_samples} \\\\" def main(): - tasks = mteb.get_tasks() + tasks = mteb.get_tasks(modalities=["audio"]) tasks = sorted(tasks, key=lambda x: x.metadata.name) extract_bibtex_to_file(tasks) print(create_citations_table(tasks)) diff --git a/scripts/data/audioset/create_data.ipynb 
b/scripts/data/audioset/create_data.ipynb new file mode 100644 index 0000000000..0ebc4fdb99 --- /dev/null +++ b/scripts/data/audioset/create_data.ipynb @@ -0,0 +1,258 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/anaconda3/envs/mteb/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from __future__ import annotations\n", + "\n", + "from datasets import load_dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatasetDict({\n", + " train: Dataset({\n", + " features: ['video_id', 'audio', 'labels', 'human_labels'],\n", + " num_rows: 18685\n", + " })\n", + " test: Dataset({\n", + " features: ['video_id', 'audio', 'labels', 'human_labels'],\n", + " num_rows: 17142\n", + " })\n", + "})" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds = load_dataset(\"agkphysics/AudioSet\", \"balanced\")\n", + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "\n", + "import numpy as np\n", + "\n", + "\n", + "def undersample_data_indices(y, samples_per_label, idxs=None):\n", + " \"\"\"Undersample data to have samples_per_label samples of each label\"\"\"\n", + " sample_indices = []\n", + " if idxs is None:\n", + " idxs = np.arange(len(y))\n", + " np.random.shuffle(idxs)\n", + " label_counter = defaultdict(int)\n", + " for i in idxs:\n", + " if any((label_counter[label] < samples_per_label) for label in y[i]):\n", + " sample_indices.append(i)\n", + " for label in y[i]:\n", + " 
label_counter[label] += 1\n", + " return sample_indices, idxs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3010" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_indices, _ = undersample_data_indices(ds[\"train\"][\"labels\"], 8, None)\n", + "len(sample_indices)\n", + "\n", + "# 16: 6018\n", + "# 8: 3010" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2153" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_sample_indices, _ = undersample_data_indices(ds[\"test\"][\"labels\"], 6, None)\n", + "len(test_sample_indices)\n", + "\n", + "# 4: 1556\n", + "# 6: 2153" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatasetDict({\n", + " train: Dataset({\n", + " features: ['video_id', 'audio', 'labels', 'human_labels'],\n", + " num_rows: 3010\n", + " })\n", + " test: Dataset({\n", + " features: ['video_id', 'audio', 'labels', 'human_labels'],\n", + " num_rows: 2153\n", + " })\n", + "})" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from datasets import DatasetDict\n", + "\n", + "ds1 = DatasetDict(\n", + " {\n", + " \"train\": ds[\"train\"].select(sample_indices),\n", + " \"test\": ds[\"test\"].select(test_sample_indices),\n", + " }\n", + ")\n", + "ds1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 335/335 [00:00<00:00, 1245.97 examples/s]t/s]\n", + "Creating parquet from Arrow format: 
100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 9.18ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 335/335 [00:00<00:00, 1623.89 examples/s], 20.66s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 11.42ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 335/335 [00:00<00:00, 1491.32 examples/s], 17.84s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 11.38ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 335/335 [00:00<00:00, 521.42 examples/s]3, 17.32s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 13.60ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 334/334 [00:00<00:00, 447.69 examples/s]5, 17.03s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 11.02ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 334/334 [00:00<00:00, 466.32 examples/s]8, 17.07s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 11.21ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 334/334 [00:00<00:00, 456.27 examples/s]0, 16.85s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 12.88ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 334/334 [00:00<00:00, 485.48 examples/s]2, 16.40s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 
10.39ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 334/334 [00:00<00:00, 385.79 examples/s]6, 16.58s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 12.72ba/s]\n", + "Uploading the dataset shards: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 9/9 [02:31<00:00, 16.84s/it]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 359/359 [00:00<00:00, 1828.07 examples/s]t/s]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 10.42ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 359/359 [00:00<00:00, 1774.86 examples/s], 17.25s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 11.69ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 359/359 [00:00<00:00, 1196.21 examples/s], 18.00s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 10.40ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 359/359 [00:00<00:00, 742.40 examples/s]6, 18.71s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 11.55ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 359/359 [00:00<00:00, 722.23 examples/s]5, 17.79s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 11.13ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 358/358 [00:00<00:00, 750.80 examples/s]7, 17.36s/it]\n", + "Creating parquet from Arrow format: 
100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 11.36ba/s]\n", + "Uploading the dataset shards: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 6/6 [01:47<00:00, 17.96s/it]\n" + ] + }, + { + "data": { + "text/plain": [ + "CommitInfo(commit_url='https://huggingface.co/datasets/mteb/audioset/commit/168a7e681ee40609129535d49855c7e3e77e5efa', commit_message='Upload dataset', commit_description='', oid='168a7e681ee40609129535d49855c7e3e77e5efa', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mteb/audioset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mteb/audioset'), pr_revision=None, pr_num=None)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from huggingface_hub import create_repo\n", + "\n", + "repo_name = \"mteb/audioset\"\n", + "\n", + "WRITE_TOK = \"\"\n", + "create_repo(repo_name, repo_type=\"dataset\", token=WRITE_TOK)\n", + "\n", + "ds1.push_to_hub(repo_name, token=WRITE_TOK)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mteb", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}