Skip to content

[Recipe] VoxPopuli #1089

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
Jul 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
ce0f5c1
add transform attribute for MixedCut
desh2608 Apr 20, 2023
ab18682
add mix_first option in normalize_loudness
desh2608 Apr 20, 2023
e4bca74
handle the case when mix is called on MixedCut with existing transforms
desh2608 Apr 20, 2023
71a9236
add test for mixing with transformed MixedCut
desh2608 Apr 20, 2023
2e54646
enhancements and bug fixes
desh2608 May 16, 2023
db37a75
small changes in some cutset methods
desh2608 May 16, 2023
7b59ecd
small fix in error message
desh2608 May 16, 2023
a64727a
return word alignments from ami recipe
desh2608 May 17, 2023
850ce2c
add word alignments for ICSI
desh2608 May 18, 2023
4b39c6f
remove unwanted whitespace
desh2608 May 18, 2023
3c16b90
fix IHM preparation
desh2608 May 18, 2023
9921575
remove words with zero or negative duration
desh2608 May 18, 2023
dba413f
ensure word alignments respect segment boundary
desh2608 May 18, 2023
12be424
add save-to-wav option for icsi
desh2608 May 22, 2023
c4b957d
add test for mixing cut with recording
desh2608 May 22, 2023
04ca4aa
Merge branch 'ami_icsi'
desh2608 May 22, 2023
fef3aa3
Merge branch 'cuts'
desh2608 May 22, 2023
0de443e
Merge branch 'mixed_cut_transform'
desh2608 May 22, 2023
80619bb
Merge branch 'master' of https://github.com/lhotse-speech/lhotse
desh2608 Jun 8, 2023
752be69
style fix
desh2608 Jun 8, 2023
5bd483d
Merge branch 'master' of https://github.com/lhotse-speech/lhotse
desh2608 Jun 11, 2023
68f3ffd
Merge branch 'loudness_fix'
desh2608 Jun 11, 2023
2171d7e
add data prep for voxpopuli
desh2608 Jun 12, 2023
2ca22d7
Merge branch 'recipe/voxpopuli' of https://github.com/desh2608/lhotse
desh2608 Jul 13, 2023
a11263b
change extract function
desh2608 Jul 19, 2023
f39c2b1
Merge branch 'master' of https://github.com/lhotse-speech/lhotse into…
desh2608 Jul 19, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/corpus.rst
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,8 @@ a CLI tool that creates the manifests given a corpus directory.
- :func:`lhotse.recipes.prepare_vctk`
* - VoxCeleb
- :func:`lhotse.recipes.prepare_voxceleb`
* - VoxPopuli
- :func:`lhotse.recipes.prepare_voxpopuli`
* - WenetSpeech
- :func:`lhotse.recipes.prepare_wenet_speech`
* - YesNo
Expand Down
1 change: 1 addition & 0 deletions lhotse/bin/modes/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
from .uwb_atcc import *
from .vctk import *
from .voxceleb import *
from .voxpopuli import *
from .wenet_speech import *
from .xbmu_amdo31 import *
from .yesno import *
84 changes: 84 additions & 0 deletions lhotse/bin/modes/recipes/voxpopuli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import click

from lhotse.bin.modes import download, prepare
from lhotse.recipes import download_voxpopuli, prepare_voxpopuli
from lhotse.recipes.voxpopuli import (
LANGUAGES,
LANGUAGES_V2,
S2S_SRC_LANGUAGES,
S2S_TGT_LANGUAGES,
)
from lhotse.utils import Pathlike

__all__ = ["voxpopuli"]


@prepare.command()
@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
@click.argument("output_dir", type=click.Path())
@click.option(
    "--task",
    type=click.Choice(["asr", "s2s", "lm"]),
    default="asr",
    help="The task for which to prepare the VoxPopuli data.",
    show_default=True,
)
@click.option(
    "--lang",
    type=click.Choice(LANGUAGES + LANGUAGES_V2),
    default="en",
    help="The language to prepare (only used if task is asr or lm).",
    show_default=True,
)
@click.option(
    "--src-lang",
    type=click.Choice(S2S_SRC_LANGUAGES),
    default=None,
    help="The source language (only used if task is s2s).",
    show_default=True,
)
@click.option(
    "--tgt-lang",
    type=click.Choice(S2S_TGT_LANGUAGES),
    default=None,
    help="The target language (only used if task is s2s).",
    show_default=True,
)
@click.option(
    "--num-jobs",
    "-j",
    type=int,
    default=1,
    help="Number of parallel jobs (can provide small speed-ups).",
    show_default=True,
)
def voxpopuli(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    task: str,
    lang: str,
    src_lang: str,
    tgt_lang: str,
    num_jobs: int,
):
    """voxpopuli data preparation."""
    # Forward every CLI option to the recipe. Note: ``num_jobs`` was
    # previously accepted by the command but silently dropped, so the
    # ``-j`` flag had no effect on preparation speed.
    prepare_voxpopuli(
        corpus_dir,
        output_dir=output_dir,
        task=task,
        lang=lang,
        source_lang=src_lang,
        target_lang=tgt_lang,
        num_jobs=num_jobs,
    )


@download.command()
@click.argument("target_dir", type=click.Path())
@click.option(
    "--subset",
    type=click.Choice(["asr", "10k", "100k", "400k"] + LANGUAGES + LANGUAGES_V2),
    default="asr",
    help="The subset of VoxPopuli to download.",
    show_default=True,
)
def voxpopuli(target_dir: Pathlike, subset: str):
    """voxpopuli download."""
    # Bug fix: the ``--subset`` option was declared on the command but the
    # function signature did not accept it, so click raised a TypeError at
    # invocation time; it was also never forwarded to ``download_voxpopuli``.
    download_voxpopuli(target_dir, subset=subset)
1 change: 1 addition & 0 deletions lhotse/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
from .uwb_atcc import download_uwb_atcc, prepare_uwb_atcc
from .vctk import download_vctk, prepare_vctk
from .voxceleb import download_voxceleb1, download_voxceleb2, prepare_voxceleb
from .voxpopuli import download_voxpopuli, prepare_voxpopuli
from .wenet_speech import prepare_wenet_speech
from .xbmu_amdo31 import download_xbmu_amdo31, prepare_xbmu_amdo31
from .yesno import download_yesno, prepare_yesno
285 changes: 285 additions & 0 deletions lhotse/recipes/voxpopuli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
"""
VoxPopuli provides

- 400K hours of unlabelled speech data for 23 languages
- 1.8K hours of transcribed speech data for 16 languages
- 17.3K hours of speech-to-speech interpretation data for 15x15 directions
- 29 hours of transcribed speech data of non-native English intended for research in ASR
for accented speech (15 L2 accents)

The raw data is collected from 2009-2020 European Parliament event recordings.
For details about the corpus, please refer to the website:
https://github.com/facebookresearch/voxpopuli

Reference:
Wang, Changhan et al. “VoxPopuli: A Large-Scale Multilingual Speech Corpus for Representation
Learning, Semi-Supervised Learning and Interpretation.” Annual Meeting of the Association
for Computational Linguistics (2021).

This script is based on code from the repository linked above.

NOTE: Our data preparation is slightly different from the original repository. In particular,
we only use the metadata to create manifests, i.e., we do not create segment-level wav files,
unlike the original repository. In this way, we can avoid duplicating the audio files.
"""
import csv
import gzip
import logging
import shutil
import tempfile
from ast import literal_eval
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import torch
import torchaudio
from torch.hub import download_url_to_file
from tqdm import tqdm

from lhotse import (
RecordingSet,
SupervisionSegment,
SupervisionSet,
validate_recordings_and_supervisions,
)
from lhotse.qa import fix_manifests, validate_recordings_and_supervisions
from lhotse.utils import Pathlike, safe_extract

# fmt: off
# All languages for which VoxPopuli provides unlabelled audio (23 languages,
# per the module docstring above).
LANGUAGES = [
    "en", "de", "fr", "es", "pl", "it", "ro", "hu", "cs", "nl", "fi", "hr",
    "sk", "sl", "et", "lt", "pt", "bg", "el", "lv", "mt", "sv", "da"
]
# "v2" variants of the same languages; the v2 release additionally ships
# "{year}_2" tarballs (see download_voxpopuli below).
LANGUAGES_V2 = [f"{x}_v2" for x in LANGUAGES]

# The raw data covers European Parliament recordings from 2009 to 2020.
YEARS = list(range(2009, 2020 + 1))

# Languages that have transcribed (ASR) data.
ASR_LANGUAGES = [
    "en", "de", "fr", "es", "pl", "it", "ro", "hu", "cs", "nl", "fi", "hr",
    "sk", "sl", "et", "lt"
]
# The accented (non-native L2 English) transcribed subset.
ASR_ACCENTED_LANGUAGES = [
    "en_accented"
]

# Speech-to-speech interpretation: source languages coincide with ASR ones.
S2S_SRC_LANGUAGES = ASR_LANGUAGES

# Speech-to-speech interpretation target languages.
S2S_TGT_LANGUAGES = [
    "en", "de", "fr", "es", "pl", "it", "ro", "hu", "cs", "nl", "fi", "hr",
    "sk", "sl", "et", "lt", "pt", "bg", "el", "lv", "mt", "sv", "da"
]

# Target languages with human (rather than automatic) transcriptions.
S2S_TGT_LANGUAGES_WITH_HUMAN_TRANSCRIPTION = ["en", "fr", "es"]

# Base URL for all downloads (audio tarballs and annotation TSVs).
DOWNLOAD_BASE_URL = "https://dl.fbaipublicfiles.com/voxpopuli"
# fmt: on


def download_voxpopuli(
    target_dir: Pathlike = ".",
    subset: Optional[str] = "asr",
) -> Path:
    """
    Download and untar/unzip the VoxPopuli dataset.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param subset: str, the subset of the dataset to download, can be one of "400k", "100k",
        "10k", "asr", or any of the languages in LANGUAGES or LANGUAGES_V2.
    :return: the path to downloaded and extracted directory with data.
    :raises ValueError: if ``subset`` is not a recognized subset name.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    if subset in LANGUAGES_V2:
        # "xx_v2" = single language, plain-year tarballs plus the extra
        # "{year}_2" tarballs of the v2 release.
        languages = [subset.split("_")[0]]
        years = YEARS + [f"{y}_2" for y in YEARS]
    elif subset in LANGUAGES:
        languages = [subset]
        years = YEARS
    else:
        languages = {
            "400k": LANGUAGES,
            "100k": LANGUAGES,
            "10k": LANGUAGES,
            "asr": ["original"],
        }.get(subset, None)
        years = {
            "400k": YEARS + [f"{y}_2" for y in YEARS],
            "100k": YEARS,
            "10k": [2019, 2020],
            "asr": YEARS,
        }.get(subset, None)
        # Previously an unknown subset fell through with languages/years set
        # to None and crashed later with a TypeError; fail fast instead.
        if languages is None or years is None:
            raise ValueError(f"Unsupported VoxPopuli subset: {subset}")

    url_list = [
        f"{DOWNLOAD_BASE_URL}/audios/{lng}_{y}.tar"
        for lng in languages
        for y in years
    ]

    out_root = target_dir / "raw_audios"
    out_root.mkdir(exist_ok=True, parents=True)
    logging.info(f"{len(url_list)} files to download...")
    for url in tqdm(url_list):
        tar_path = out_root / Path(url).name
        # torch.hub.download_url_to_file expects the destination *file* path
        # as its second argument; the old call passed the directory as ``dst``
        # and the filename as ``hash_prefix``, which is incorrect usage.
        download_url_to_file(url, tar_path.as_posix())
        safe_extract(tar_path, out_root)
        # Remove the tarball once extracted to save disk space.
        tar_path.unlink()

    return target_dir


def prepare_voxpopuli(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    task: str = "asr",
    lang: str = "en",
    source_lang: Optional[str] = None,
    target_lang: Optional[str] = None,
    num_jobs: int = 1,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the VoxPopuli manifests which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param task: str, the task to prepare the manifests for, can be one of "asr", "s2s", "lm".
    :param lang: str, the language to prepare the manifests for, can be one of LANGUAGES
        or LANGUAGES_V2. This is used for "asr" and "lm" tasks.
    :param source_lang: str, the source language for the s2s task, can be one of S2S_SRC_LANGUAGES.
    :param target_lang: str, the target language for the s2s task, can be one of S2S_TGT_LANGUAGES.
    :param num_jobs: int, the number of parallel jobs to use for preparing the manifests.
    :return: Dict[str, Union[RecordingSet, SupervisionSet]], the manifests.
    :raises ValueError: if ``task`` is not one of "asr", "s2s", "lm".
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)

    if task == "asr":
        assert lang in ASR_LANGUAGES, f"Unsupported language: {lang}"
        manifests = _prepare_voxpopuli_asr(
            corpus_dir, output_dir, lang, num_jobs=num_jobs
        )
    elif task == "s2s":
        assert (
            source_lang in S2S_SRC_LANGUAGES
        ), f"Unsupported source language: {source_lang}"
        assert (
            target_lang in S2S_TGT_LANGUAGES
        ), f"Unsupported target language: {target_lang}"
        manifests = _prepare_voxpopuli_s2s(corpus_dir, source_lang, target_lang)
    elif task == "lm":
        assert lang in ASR_LANGUAGES, f"Unsupported language: {lang}"
        manifests = _prepare_voxpopuli_lm(corpus_dir, lang)
    else:
        # Previously an unrecognized task fell through and crashed below with
        # an UnboundLocalError on ``manifests``; fail fast with a clear error.
        raise ValueError(f"Unsupported task: {task}")

    # The affix used in output file names; loop-invariant, so compute it once.
    lang_affix = f"{source_lang}-{target_lang}" if task == "s2s" else lang

    for k, v in manifests.items():
        # Repair/de-duplicate the manifests and sanity-check them.
        recordings, supervisions = fix_manifests(**v)
        validate_recordings_and_supervisions(
            recordings=recordings, supervisions=supervisions
        )
        manifests[k]["recordings"] = recordings
        manifests[k]["supervisions"] = supervisions

        if output_dir is not None:
            recordings.to_file(
                output_dir / f"voxpopuli-{task}-{lang_affix}_recordings_{k}.jsonl.gz"
            )
            supervisions.to_file(
                output_dir / f"voxpopuli-{task}-{lang_affix}_supervisions_{k}.jsonl.gz"
            )

    return manifests


def _prepare_voxpopuli_asr(
    corpus_dir: Path, output_dir: Path, lang: str, num_jobs: int = 1
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Download metadata TSV and prepare manifests for the ASR task.

    :param corpus_dir: corpus root; audio is expected under ``raw_audios/original``.
    :param output_dir: directory used to host a temporary download dir (may be None).
    :param lang: language code, one of ``ASR_LANGUAGES``.
    :param num_jobs: parallel jobs for scanning audio files.
    :return: dict mapping "train"/"dev"/"test" to {"recordings", "supervisions"}.
    """
    # First create recordings. We remove the affix "_original" from the recording ID.
    logging.info("Preparing recordings (this may take a few minutes)...")
    in_root = corpus_dir / "raw_audios" / "original"
    recordings = RecordingSet.from_dir(
        in_root,
        "*.ogg",
        num_jobs=num_jobs,
        recording_id=lambda x: x.stem.replace("_original", ""),
    )

    # Now create supervisions. The metadata TSV is downloaded into a temp dir
    # that is removed even if the download or parsing fails.
    temp_dir = Path(tempfile.mkdtemp(prefix="voxpopuli_asr_", dir=output_dir))
    try:
        url = f"{DOWNLOAD_BASE_URL}/annotations/asr/asr_{lang}.tsv.gz"
        tsv_path = temp_dir / Path(url).name
        if not tsv_path.exists():
            download_url_to_file(url, tsv_path)
        with gzip.open(tsv_path, "rt") as f:
            metadata = list(csv.DictReader(f, delimiter="|"))
    finally:
        # Delete temp dir along with its contents.
        shutil.rmtree(temp_dir)

    # Segments grouped by split (train, dev, test).
    segments = defaultdict(list)
    # Running count of segments per recording, used to build unique IDs.
    num_segments = defaultdict(int)

    for r in tqdm(metadata):
        split = r["split"]
        if split not in ["train", "dev", "test"]:
            continue
        reco_id = r["session_id"]
        start_time = float(r["start_time"])
        duration = float(r["end_time"]) - start_time

        num_segments[reco_id] += 1
        segments[split].append(
            SupervisionSegment(
                id=f"{reco_id}-{num_segments[reco_id]}",
                recording_id=reco_id,
                start=round(start_time, ndigits=8),
                duration=round(duration, ndigits=8),
                channel=0,
                language=lang,
                speaker=r["speaker_id"],
                gender=r["gender"],
                text=r["normed_text"],
                custom={
                    "orig_text": r["original_text"],
                },
            )
        )

    # Recording IDs present in each split. Use sets: only membership is
    # tested below, and a sorted list made each filter test O(n).
    reco_ids = {
        split: {s.recording_id for s in segs} for split, segs in segments.items()
    }

    manifests = defaultdict(dict)
    for split in ["train", "dev", "test"]:
        # Bind this split's IDs via a default argument: ``filter`` may be
        # evaluated lazily, and a late-binding closure over ``split`` would
        # then see only the final loop value for every split.
        ids = reco_ids.get(split, set())
        manifests[split]["recordings"] = recordings.filter(
            lambda r, ids=ids: r.id in ids
        )
        manifests[split]["supervisions"] = SupervisionSet.from_segments(segments[split])

    return manifests


def _prepare_voxpopuli_s2s(
    corpus_dir: Path, source_lang: str, target_lang: str
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Create the manifests for the speech-to-speech interpretation (s2s) task.

    Not implemented yet: calling this always raises ``NotImplementedError``.
    """
    raise NotImplementedError


def _prepare_voxpopuli_lm(corpus_dir: Path, lang: str) -> Tuple[RecordingSet, None]:
    """
    Create the manifests for the language modeling (lm) task.

    Not implemented yet: calling this always raises ``NotImplementedError``.
    """
    raise NotImplementedError