Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions mteb/tasks/Audio/AudioMultilabelClassification/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

from .eng.AudioSet import *
from .eng.BirdSet import *
from .eng.FSD50HF import *
from .eng.FSD2019Kaggle import *
98 changes: 98 additions & 0 deletions mteb/tasks/Audio/AudioMultilabelClassification/eng/AudioSet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from __future__ import annotations

from mteb.abstasks.Audio.AbsTaskAudioMultilabelClassification import (
AbsTaskAudioMultilabelClassification,
)
from mteb.abstasks.TaskMetadata import TaskMetadata


class AudioSetMultilingualClassification(AbsTaskAudioMultilabelClassification):
    """Multilabel audio-event classification over the full AudioSet corpus.

    NOTE(review): the class name says "Multilingual", but the task is
    English-only multilabel classification (``eval_langs=["eng-Latn"]``) —
    the name looks like a template artifact. It is kept unchanged here
    because renaming would break external references to this class.
    """

    # The full-size corpus is impractical for routine evaluation; the
    # subsampled "AudioSetMini" task below supersedes it.
    superseded_by = "AudioSetMini"

    metadata = TaskMetadata(
        name="AudioSet",
        description="AudioSet consists of an expanding ontology of 632 audio event classes and a collection of 2,084,320 human-labeled 10-second sound clips drawn from YouTube videos.",
        reference="https://huggingface.co/datasets/agkphysics/AudioSet",
        dataset={
            "path": "agkphysics/AudioSet",
            "revision": "5a2fa42a1506470d275a47ff8e1fdac5b364e6ef",
        },
        type="AudioMultilabelClassification",
        category="a2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        # Label-ranking average precision — the standard multilabel metric.
        main_score="lrap",
        # Collection window of the underlying YouTube clips.
        date=("2016-01-01", "2017-01-30"),
        domains=["Web", "Music", "Speech", "Scene"],
        task_subtypes=[
            "Environment Sound Classification",
            "Music Instrument Recognition",
            "Vocal Sound Classification",
            "Gunshot Audio Classification",
        ],
        license="cc-by-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        modalities=["audio"],
        sample_creation="found",
        bibtex_citation=r"""
@inproceedings{45857,
  address = {New Orleans, LA},
  author = {Jort F. Gemmeke and Daniel P. W. Ellis and Dylan Freedman and Aren Jansen and Wade Lawrence and R. Channing Moore and Manoj Plakal and Marvin Ritter},
  booktitle = {Proc. IEEE ICASSP 2017},
  title = {Audio Set: An ontology and human-labeled dataset for audio events},
  year = {2017},
}
""",
    )

    # Columns of the HF dataset holding the waveform and the label list.
    audio_column_name: str = "audio"
    label_column_name: str = "human_labels"


# Sampled using scripts/data/audioset/create_data.ipynb
class AudioSetMiniMultilingualClassification(AbsTaskAudioMultilabelClassification):
    """Multilabel audio-event classification on a subsample of AudioSet.

    Same task definition as the full "AudioSet" task, but evaluated on the
    pre-sampled ``mteb/audioset`` dataset, which this task supersedes.

    NOTE(review): as with the full-size task, "Multilingual" in the class
    name is misleading — the task is English-only multilabel classification;
    the name is kept for backward compatibility.
    """

    metadata = TaskMetadata(
        name="AudioSetMini",
        description="AudioSet consists of an expanding ontology of 632 audio event classes and a collection of 2,084,320 human-labeled 10-second sound clips drawn from YouTube videos. This is a mini version that is sampled from the original dataset.",
        reference="https://huggingface.co/datasets/agkphysics/AudioSet",
        dataset={
            "path": "mteb/audioset",
            "revision": "168a7e681ee40609129535d49855c7e3e77e5efa",
        },
        type="AudioMultilabelClassification",
        category="a2t",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        # Label-ranking average precision — the standard multilabel metric.
        main_score="lrap",
        # Collection window of the underlying YouTube clips.
        date=("2016-01-01", "2017-01-30"),
        domains=["Web", "Music", "Speech", "Scene"],
        task_subtypes=[
            "Environment Sound Classification",
            "Music Instrument Recognition",
            "Vocal Sound Classification",
            "Gunshot Audio Classification",
        ],
        license="cc-by-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        modalities=["audio"],
        sample_creation="found",
        bibtex_citation=r"""
@inproceedings{45857,
  address = {New Orleans, LA},
  author = {Jort F. Gemmeke and Daniel P. W. Ellis and Dylan Freedman and Aren Jansen and Wade Lawrence and R. Channing Moore and Manoj Plakal and Marvin Ritter},
  booktitle = {Proc. IEEE ICASSP 2017},
  title = {Audio Set: An ontology and human-labeled dataset for audio events},
  year = {2017},
}
""",
    )

    # Columns of the HF dataset holding the waveform and the label list.
    audio_column_name: str = "audio"
    label_column_name: str = "human_labels"
15 changes: 3 additions & 12 deletions scripts/create_dataset_citations_bib.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,7 @@ def create_citations_table(tasks: list[mteb.AbsTask]) -> str:
\\setlength\\extrarowheight{7pt}
\\begin{longtable}{L{3.5cm}|L{3.0cm}L{1.4cm}L{1.4cm}L{1.4cm}L{1.4cm}L{1.0cm}L{1.0cm}}
\\toprule
\\textbf{Dataset} & \\textbf{N. Langs} & \\textbf{Type} & \\textbf{Category} & \\textbf{Domains} & \\textbf{N. Docs} &
\\textbf{Avg. Length} \\\\
\\textbf{Dataset} & \\textbf{N. Langs} & \\textbf{Type} & \\textbf{Category} & \\textbf{Domains} & \\textbf{N. Docs} \\\\
\\midrule
\\endhead \\\\"""
for task in tasks:
Expand All @@ -75,14 +74,6 @@ def task_to_tex_row(task: mteb.AbsTask) -> str:
else ""
)

avg_character_length = (
"{:.2f}".format(
sum(task.metadata.avg_character_length.values())
/ len(task.metadata.avg_character_length.keys())
)
if task.metadata.avg_character_length
else ""
)
library = bibtexparser.parse_string(task.metadata.bibtex_citation)
try:
cite_key = library.entries[0].key
Expand All @@ -97,11 +88,11 @@ def task_to_tex_row(task: mteb.AbsTask) -> str:
)
lang = lang.replace("'", "")

return f"{name}{cite_key} & {lang} & {task.metadata.type} & {task.metadata.category} & {domains[1:-1]} & {n_samples} & {avg_character_length} \\\\"
return f"{name}{cite_key} & {lang} & {task.metadata.type} & {task.metadata.category} & {domains[1:-1]} & {n_samples} \\\\"


def main():
tasks = mteb.get_tasks()
tasks = mteb.get_tasks(modalities=["audio"])
tasks = sorted(tasks, key=lambda x: x.metadata.name)
extract_bibtex_to_file(tasks)
print(create_citations_table(tasks))
Expand Down
Loading