diff --git a/mteb/tasks/Audio/AudioMultilabelClassification/__init__.py b/mteb/tasks/Audio/AudioMultilabelClassification/__init__.py index 5e7d7786ce..91b9c8836c 100644 --- a/mteb/tasks/Audio/AudioMultilabelClassification/__init__.py +++ b/mteb/tasks/Audio/AudioMultilabelClassification/__init__.py @@ -1,5 +1,6 @@ from __future__ import annotations +from .eng.AudioSet import * from .eng.BirdSet import * from .eng.FSD50HF import * from .eng.FSD2019Kaggle import * diff --git a/mteb/tasks/Audio/AudioMultilabelClassification/eng/AudioSet.py b/mteb/tasks/Audio/AudioMultilabelClassification/eng/AudioSet.py new file mode 100644 index 0000000000..1afd67780d --- /dev/null +++ b/mteb/tasks/Audio/AudioMultilabelClassification/eng/AudioSet.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +from mteb.abstasks.Audio.AbsTaskAudioMultilabelClassification import ( + AbsTaskAudioMultilabelClassification, +) +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class AudioSetMultilabelClassification(AbsTaskAudioMultilabelClassification): + superseded_by = "AudioSetMini" + metadata = TaskMetadata( + name="AudioSet", + description="AudioSet consists of an expanding ontology of 632 audio event classes and a collection of 2,084,320 human-labeled 10-second sound clips drawn from YouTube videos.", + reference="https://huggingface.co/datasets/agkphysics/AudioSet", + dataset={ + "path": "agkphysics/AudioSet", + "revision": "5a2fa42a1506470d275a47ff8e1fdac5b364e6ef", + }, + type="AudioMultilabelClassification", + category="a2t", + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="lrap", + date=( + "2016-01-01", + "2017-01-30", + ), + domains=["Web", "Music", "Speech", "Scene"], + task_subtypes=[ + "Environment Sound Classification", + "Music Instrument Recognition", + "Vocal Sound Classification", + "Gunshot Audio Classification", + ], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + modalities=["audio"], + sample_creation="found", + 
bibtex_citation=r""" +@inproceedings{45857, + address = {New Orleans, LA}, + author = {Jort F. Gemmeke and Daniel P. W. Ellis and Dylan Freedman and Aren Jansen and Wade Lawrence and R. Channing Moore and Manoj Plakal and Marvin Ritter}, + booktitle = {Proc. IEEE ICASSP 2017}, + title = {Audio Set: An ontology and human-labeled dataset for audio events}, + year = {2017}, +} +""", + ) + + audio_column_name: str = "audio" + label_column_name: str = "human_labels" + + +# Sampled using scripts/data/audioset/create_data.ipynb +class AudioSetMiniMultilabelClassification(AbsTaskAudioMultilabelClassification): + metadata = TaskMetadata( + name="AudioSetMini", + description="AudioSet consists of an expanding ontology of 632 audio event classes and a collection of 2,084,320 human-labeled 10-second sound clips drawn from YouTube videos. This is a mini version that is sampled from the original dataset.", + reference="https://huggingface.co/datasets/agkphysics/AudioSet", + dataset={ + "path": "mteb/audioset", + "revision": "168a7e681ee40609129535d49855c7e3e77e5efa", + }, + type="AudioMultilabelClassification", + category="a2t", + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="lrap", + date=( + "2016-01-01", + "2017-01-30", + ), + domains=["Web", "Music", "Speech", "Scene"], + task_subtypes=[ + "Environment Sound Classification", + "Music Instrument Recognition", + "Vocal Sound Classification", + "Gunshot Audio Classification", + ], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + modalities=["audio"], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{45857, + address = {New Orleans, LA}, + author = {Jort F. Gemmeke and Daniel P. W. Ellis and Dylan Freedman and Aren Jansen and Wade Lawrence and R. Channing Moore and Manoj Plakal and Marvin Ritter}, + booktitle = {Proc. 
IEEE ICASSP 2017}, + title = {Audio Set: An ontology and human-labeled dataset for audio events}, + year = {2017}, +} +""", + ) + + audio_column_name: str = "audio" + label_column_name: str = "human_labels" diff --git a/scripts/create_dataset_citations_bib.py b/scripts/create_dataset_citations_bib.py index 9086117536..131d29ad33 100644 --- a/scripts/create_dataset_citations_bib.py +++ b/scripts/create_dataset_citations_bib.py @@ -49,8 +49,7 @@ def create_citations_table(tasks: list[mteb.AbsTask]) -> str: \\setlength\\extrarowheight{7pt} \\begin{longtable}{L{3.5cm}|L{3.0cm}L{1.4cm}L{1.4cm}L{1.4cm}L{1.4cm}L{1.0cm}L{1.0cm}} \\toprule -\\textbf{Dataset} & \\textbf{N. Langs} & \\textbf{Type} & \\textbf{Category} & \\textbf{Domains} & \\textbf{N. Docs} & -\\textbf{Avg. Length} \\\\ +\\textbf{Dataset} & \\textbf{N. Langs} & \\textbf{Type} & \\textbf{Category} & \\textbf{Domains} & \\textbf{N. Docs} \\\\ \\midrule \\endhead \\\\""" for task in tasks: @@ -75,14 +74,6 @@ def task_to_tex_row(task: mteb.AbsTask) -> str: else "" ) - avg_character_length = ( - "{:.2f}".format( - sum(task.metadata.avg_character_length.values()) - / len(task.metadata.avg_character_length.keys()) - ) - if task.metadata.avg_character_length - else "" - ) library = bibtexparser.parse_string(task.metadata.bibtex_citation) try: cite_key = library.entries[0].key @@ -97,11 +88,11 @@ def task_to_tex_row(task: mteb.AbsTask) -> str: ) lang = lang.replace("'", "") - return f"{name}{cite_key} & {lang} & {task.metadata.type} & {task.metadata.category} & {domains[1:-1]} & {n_samples} & {avg_character_length} \\\\" + return f"{name}{cite_key} & {lang} & {task.metadata.type} & {task.metadata.category} & {domains[1:-1]} & {n_samples} \\\\" def main(): - tasks = mteb.get_tasks() + tasks = mteb.get_tasks(modalities=["audio"]) tasks = sorted(tasks, key=lambda x: x.metadata.name) extract_bibtex_to_file(tasks) print(create_citations_table(tasks)) diff --git a/scripts/data/audioset/create_data.ipynb 
b/scripts/data/audioset/create_data.ipynb new file mode 100644 index 0000000000..0ebc4fdb99 --- /dev/null +++ b/scripts/data/audioset/create_data.ipynb @@ -0,0 +1,258 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/anaconda3/envs/mteb/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from __future__ import annotations\n", + "\n", + "from datasets import load_dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatasetDict({\n", + " train: Dataset({\n", + " features: ['video_id', 'audio', 'labels', 'human_labels'],\n", + " num_rows: 18685\n", + " })\n", + " test: Dataset({\n", + " features: ['video_id', 'audio', 'labels', 'human_labels'],\n", + " num_rows: 17142\n", + " })\n", + "})" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds = load_dataset(\"agkphysics/AudioSet\", \"balanced\")\n", + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "\n", + "import numpy as np\n", + "\n", + "\n", + "def undersample_data_indices(y, samples_per_label, idxs=None):\n", + " \"\"\"Undersample data to have samples_per_label samples of each label\"\"\"\n", + " sample_indices = []\n", + " if idxs is None:\n", + " idxs = np.arange(len(y))\n", + " np.random.shuffle(idxs)\n", + " label_counter = defaultdict(int)\n", + " for i in idxs:\n", + " if any((label_counter[label] < samples_per_label) for label in y[i]):\n", + " sample_indices.append(i)\n", + " for label in y[i]:\n", + " 
label_counter[label] += 1\n", + " return sample_indices, idxs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3010" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_indices, _ = undersample_data_indices(ds[\"train\"][\"labels\"], 8, None)\n", + "len(sample_indices)\n", + "\n", + "# 16: 6018\n", + "# 8: 3010" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2153" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_sample_indices, _ = undersample_data_indices(ds[\"test\"][\"labels\"], 6, None)\n", + "len(test_sample_indices)\n", + "\n", + "# 4: 1556\n", + "# 6: 2153" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatasetDict({\n", + " train: Dataset({\n", + " features: ['video_id', 'audio', 'labels', 'human_labels'],\n", + " num_rows: 3010\n", + " })\n", + " test: Dataset({\n", + " features: ['video_id', 'audio', 'labels', 'human_labels'],\n", + " num_rows: 2153\n", + " })\n", + "})" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from datasets import DatasetDict\n", + "\n", + "ds1 = DatasetDict(\n", + " {\n", + " \"train\": ds[\"train\"].select(sample_indices),\n", + " \"test\": ds[\"test\"].select(test_sample_indices),\n", + " }\n", + ")\n", + "ds1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 335/335 [00:00<00:00, 1245.97 examples/s]t/s]\n", + "Creating parquet from Arrow format: 
100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 9.18ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 335/335 [00:00<00:00, 1623.89 examples/s], 20.66s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 11.42ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 335/335 [00:00<00:00, 1491.32 examples/s], 17.84s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 11.38ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 335/335 [00:00<00:00, 521.42 examples/s]3, 17.32s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 13.60ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 334/334 [00:00<00:00, 447.69 examples/s]5, 17.03s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 11.02ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 334/334 [00:00<00:00, 466.32 examples/s]8, 17.07s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 11.21ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 334/334 [00:00<00:00, 456.27 examples/s]0, 16.85s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 12.88ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 334/334 [00:00<00:00, 485.48 examples/s]2, 16.40s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 
10.39ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 334/334 [00:00<00:00, 385.79 examples/s]6, 16.58s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 12.72ba/s]\n", + "Uploading the dataset shards: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 9/9 [02:31<00:00, 16.84s/it]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 359/359 [00:00<00:00, 1828.07 examples/s]t/s]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 10.42ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 359/359 [00:00<00:00, 1774.86 examples/s], 17.25s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 11.69ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 359/359 [00:00<00:00, 1196.21 examples/s], 18.00s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 10.40ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 359/359 [00:00<00:00, 742.40 examples/s]6, 18.71s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 11.55ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 359/359 [00:00<00:00, 722.23 examples/s]5, 17.79s/it]\n", + "Creating parquet from Arrow format: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 11.13ba/s]\n", + "Map: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 358/358 [00:00<00:00, 750.80 examples/s]7, 17.36s/it]\n", + "Creating parquet from Arrow format: 
100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 4/4 [00:00<00:00, 11.36ba/s]\n", + "Uploading the dataset shards: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 6/6 [01:47<00:00, 17.96s/it]\n" + ] + }, + { + "data": { + "text/plain": [ + "CommitInfo(commit_url='https://huggingface.co/datasets/mteb/audioset/commit/168a7e681ee40609129535d49855c7e3e77e5efa', commit_message='Upload dataset', commit_description='', oid='168a7e681ee40609129535d49855c7e3e77e5efa', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mteb/audioset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mteb/audioset'), pr_revision=None, pr_num=None)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from huggingface_hub import create_repo\n", + "\n", + "repo_name = \"mteb/audioset\"\n", + "\n", + "WRITE_TOK = \"\"\n", + "create_repo(repo_name, repo_type=\"dataset\", token=WRITE_TOK)\n", + "\n", + "ds1.push_to_hub(repo_name, token=WRITE_TOK)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mteb", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}