Skip to content

[Recipe] VoxPopuli #1089

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
Jul 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
ce0f5c1
add transform attribute for MixedCut
desh2608 Apr 20, 2023
ab18682
add mix_first option in normalize_loudness
desh2608 Apr 20, 2023
e4bca74
handle the case when mix is called on MixedCut with existing transforms
desh2608 Apr 20, 2023
71a9236
add test for mixing with transformed MixedCut
desh2608 Apr 20, 2023
2e54646
enhancements and bug fixes
desh2608 May 16, 2023
db37a75
small changes in some cutset methods
desh2608 May 16, 2023
7b59ecd
small fix in error message
desh2608 May 16, 2023
a64727a
return word alignments from ami recipe
desh2608 May 17, 2023
850ce2c
add word alignments for ICSI
desh2608 May 18, 2023
4b39c6f
remove unwanted whitespace
desh2608 May 18, 2023
3c16b90
fix IHM preparation
desh2608 May 18, 2023
9921575
remove words with zero or negative duration
desh2608 May 18, 2023
dba413f
ensure word alignments respect segment boundary
desh2608 May 18, 2023
12be424
add save-to-wav option for icsi
desh2608 May 22, 2023
c4b957d
add test for mixing cut with recording
desh2608 May 22, 2023
04ca4aa
Merge branch 'ami_icsi'
desh2608 May 22, 2023
fef3aa3
Merge branch 'cuts'
desh2608 May 22, 2023
0de443e
Merge branch 'mixed_cut_transform'
desh2608 May 22, 2023
80619bb
Merge branch 'master' of https://github.com/lhotse-speech/lhotse
desh2608 Jun 8, 2023
752be69
style fix
desh2608 Jun 8, 2023
5bd483d
Merge branch 'master' of https://github.com/lhotse-speech/lhotse
desh2608 Jun 11, 2023
68f3ffd
Merge branch 'loudness_fix'
desh2608 Jun 11, 2023
2171d7e
add data prep for voxpopuli
desh2608 Jun 12, 2023
2ca22d7
Merge branch 'recipe/voxpopuli' of https://github.com/desh2608/lhotse
desh2608 Jul 13, 2023
a11263b
change extract function
desh2608 Jul 19, 2023
f39c2b1
Merge branch 'master' of https://github.com/lhotse-speech/lhotse into…
desh2608 Jul 19, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/corpus.rst
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,8 @@ a CLI tool that creates the manifests given a corpus directory.
- :func:`lhotse.recipes.prepare_vctk`
* - VoxCeleb
- :func:`lhotse.recipes.prepare_voxceleb`
* - VoxPopuli
- :func:`lhotse.recipes.prepare_voxpopuli`
* - WenetSpeech
- :func:`lhotse.recipes.prepare_wenet_speech`
* - YesNo
Expand Down
1 change: 1 addition & 0 deletions lhotse/bin/modes/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
from .uwb_atcc import *
from .vctk import *
from .voxceleb import *
from .voxpopuli import *
from .wenet_speech import *
from .xbmu_amdo31 import *
from .yesno import *
84 changes: 84 additions & 0 deletions lhotse/bin/modes/recipes/voxpopuli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import click

from lhotse.bin.modes import download, prepare
from lhotse.recipes import download_voxpopuli, prepare_voxpopuli
from lhotse.recipes.voxpopuli import (
LANGUAGES,
LANGUAGES_V2,
S2S_SRC_LANGUAGES,
S2S_TGT_LANGUAGES,
)
from lhotse.utils import Pathlike

__all__ = ["voxpopuli"]


@prepare.command()
@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
@click.argument("output_dir", type=click.Path())
@click.option(
    "--task",
    type=click.Choice(["asr", "s2s", "lm"]),
    default="asr",
    help="The task for which to prepare the VoxPopuli data.",
    show_default=True,
)
@click.option(
    "--lang",
    type=click.Choice(LANGUAGES + LANGUAGES_V2),
    default="en",
    help="The language to prepare (only used if task is asr or lm).",
    show_default=True,
)
@click.option(
    "--src-lang",
    type=click.Choice(S2S_SRC_LANGUAGES),
    default=None,
    help="The source language (only used if task is s2s).",
    show_default=True,
)
@click.option(
    "--tgt-lang",
    type=click.Choice(S2S_TGT_LANGUAGES),
    default=None,
    help="The target language (only used if task is s2s).",
    show_default=True,
)
@click.option(
    "--num-jobs",
    "-j",
    type=int,
    default=1,
    help="Number of parallel jobs (can provide small speed-ups).",
    show_default=True,
)
def voxpopuli(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    task: str,
    lang: str,
    src_lang: str,
    tgt_lang: str,
    num_jobs: int,
):
    """voxpopuli data preparation."""
    # Forward every CLI option to the recipe. Note: ``num_jobs`` was
    # previously accepted by the command but silently dropped, so the
    # ``-j`` flag had no effect on preparation speed.
    prepare_voxpopuli(
        corpus_dir,
        output_dir=output_dir,
        task=task,
        lang=lang,
        source_lang=src_lang,
        target_lang=tgt_lang,
        num_jobs=num_jobs,
    )


@download.command()
@click.argument("target_dir", type=click.Path())
@click.option(
    "--subset",
    type=click.Choice(["asr", "10k", "100k", "400k"] + LANGUAGES + LANGUAGES_V2),
    default="asr",
    help="The subset of VoxPopuli to download.",
    show_default=True,
)
def voxpopuli(target_dir: Pathlike, subset: str):
    """voxpopuli download."""
    # Bug fix: the ``--subset`` option was declared on the command but the
    # function signature did not accept it, so click raised a TypeError at
    # invocation time; it was also never forwarded to ``download_voxpopuli``.
    download_voxpopuli(target_dir, subset=subset)
1 change: 1 addition & 0 deletions lhotse/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
from .uwb_atcc import download_uwb_atcc, prepare_uwb_atcc
from .vctk import download_vctk, prepare_vctk
from .voxceleb import download_voxceleb1, download_voxceleb2, prepare_voxceleb
from .voxpopuli import download_voxpopuli, prepare_voxpopuli
from .wenet_speech import prepare_wenet_speech
from .xbmu_amdo31 import download_xbmu_amdo31, prepare_xbmu_amdo31
from .yesno import download_yesno, prepare_yesno
285 changes: 285 additions & 0 deletions lhotse/recipes/voxpopuli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
"""
VoxPopuli provides

- 400K hours of unlabelled speech data for 23 languages
- 1.8K hours of transcribed speech data for 16 languages
- 17.3K hours of speech-to-speech interpretation data for 15x15 directions
- 29 hours of transcribed speech data of non-native English intended for research in ASR
for accented speech (15 L2 accents)

The raw data is collected from 2009-2020 European Parliament event recordings.
For details about the corpus, please refer to the website:
https://github.com/facebookresearch/voxpopuli

Reference:
Wang, Changhan et al. “VoxPopuli: A Large-Scale Multilingual Speech Corpus for Representation
Learning, Semi-Supervised Learning and Interpretation.” Annual Meeting of the Association
for Computational Linguistics (2021).

This script is based on code from the repository linked above.

NOTE: Our data preparation is slightly different from the original repository. In particular,
we only use the metadata to create manifests, i.e., we do not create segment-level wav files,
unlike the original repository. In this way, we can avoid duplicating the audio files.
"""
import csv
import gzip
import logging
import shutil
import tempfile
from ast import literal_eval
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import torch
import torchaudio
from torch.hub import download_url_to_file
from tqdm import tqdm

from lhotse import (
RecordingSet,
SupervisionSegment,
SupervisionSet,
validate_recordings_and_supervisions,
)
from lhotse.qa import fix_manifests, validate_recordings_and_supervisions
from lhotse.utils import Pathlike, safe_extract

# fmt: off
# All languages for which VoxPopuli provides unlabelled audio (23 languages,
# per the module docstring above).
LANGUAGES = [
    "en", "de", "fr", "es", "pl", "it", "ro", "hu", "cs", "nl", "fi", "hr",
    "sk", "sl", "et", "lt", "pt", "bg", "el", "lv", "mt", "sv", "da"
]
# "v2" variants of the same languages; the v2 release additionally ships
# "{year}_2" tarballs (see download_voxpopuli below).
LANGUAGES_V2 = [f"{x}_v2" for x in LANGUAGES]

# The raw data covers European Parliament recordings from 2009 to 2020.
YEARS = list(range(2009, 2020 + 1))

# Languages that have transcribed (ASR) data.
ASR_LANGUAGES = [
    "en", "de", "fr", "es", "pl", "it", "ro", "hu", "cs", "nl", "fi", "hr",
    "sk", "sl", "et", "lt"
]
# The accented (non-native L2 English) transcribed subset.
ASR_ACCENTED_LANGUAGES = [
    "en_accented"
]

# Speech-to-speech interpretation: source languages coincide with ASR ones.
S2S_SRC_LANGUAGES = ASR_LANGUAGES

# Speech-to-speech interpretation target languages.
S2S_TGT_LANGUAGES = [
    "en", "de", "fr", "es", "pl", "it", "ro", "hu", "cs", "nl", "fi", "hr",
    "sk", "sl", "et", "lt", "pt", "bg", "el", "lv", "mt", "sv", "da"
]

# Target languages with human (rather than automatic) transcriptions.
S2S_TGT_LANGUAGES_WITH_HUMAN_TRANSCRIPTION = ["en", "fr", "es"]

# Base URL for all downloads (audio tarballs and annotation TSVs).
DOWNLOAD_BASE_URL = "https://dl.fbaipublicfiles.com/voxpopuli"
# fmt: on


def download_voxpopuli(
    target_dir: Pathlike = ".",
    subset: Optional[str] = "asr",
) -> Path:
    """
    Download and untar/unzip the VoxPopuli dataset.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param subset: str, the subset of the dataset to download, can be one of "400k", "100k",
        "10k", "asr", or any of the languages in LANGUAGES or LANGUAGES_V2.
    :return: the path to downloaded and extracted directory with data.
    :raises ValueError: if ``subset`` is not a recognized subset name.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    if subset in LANGUAGES_V2:
        # "xx_v2" = single language, plain-year tarballs plus the extra
        # "{year}_2" tarballs of the v2 release.
        languages = [subset.split("_")[0]]
        years = YEARS + [f"{y}_2" for y in YEARS]
    elif subset in LANGUAGES:
        languages = [subset]
        years = YEARS
    else:
        languages = {
            "400k": LANGUAGES,
            "100k": LANGUAGES,
            "10k": LANGUAGES,
            "asr": ["original"],
        }.get(subset, None)
        years = {
            "400k": YEARS + [f"{y}_2" for y in YEARS],
            "100k": YEARS,
            "10k": [2019, 2020],
            "asr": YEARS,
        }.get(subset, None)
        # Previously an unknown subset fell through with languages/years set
        # to None and crashed later with a TypeError; fail fast instead.
        if languages is None or years is None:
            raise ValueError(f"Unsupported VoxPopuli subset: {subset}")

    url_list = [
        f"{DOWNLOAD_BASE_URL}/audios/{lng}_{y}.tar"
        for lng in languages
        for y in years
    ]

    out_root = target_dir / "raw_audios"
    out_root.mkdir(exist_ok=True, parents=True)
    logging.info(f"{len(url_list)} files to download...")
    for url in tqdm(url_list):
        tar_path = out_root / Path(url).name
        # torch.hub.download_url_to_file expects the destination *file* path
        # as its second argument; the old call passed the directory as ``dst``
        # and the filename as ``hash_prefix``, which is incorrect usage.
        download_url_to_file(url, tar_path.as_posix())
        safe_extract(tar_path, out_root)
        # Remove the tarball once extracted to save disk space.
        tar_path.unlink()

    return target_dir


def prepare_voxpopuli(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    task: str = "asr",
    lang: str = "en",
    source_lang: Optional[str] = None,
    target_lang: Optional[str] = None,
    num_jobs: int = 1,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the VoxPopuli manifests which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param task: str, the task to prepare the manifests for, can be one of "asr", "s2s", "lm".
    :param lang: str, the language to prepare the manifests for, can be one of LANGUAGES
        or LANGUAGES_V2. This is used for "asr" and "lm" tasks.
    :param source_lang: str, the source language for the s2s task, can be one of S2S_SRC_LANGUAGES.
    :param target_lang: str, the target language for the s2s task, can be one of S2S_TGT_LANGUAGES.
    :param num_jobs: int, the number of parallel jobs to use for preparing the manifests.
    :return: Dict[str, Union[RecordingSet, SupervisionSet]], the manifests.
    :raises ValueError: if ``task`` is not one of "asr", "s2s", "lm".
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)

    if task == "asr":
        assert lang in ASR_LANGUAGES, f"Unsupported language: {lang}"
        manifests = _prepare_voxpopuli_asr(
            corpus_dir, output_dir, lang, num_jobs=num_jobs
        )
    elif task == "s2s":
        assert (
            source_lang in S2S_SRC_LANGUAGES
        ), f"Unsupported source language: {source_lang}"
        assert (
            target_lang in S2S_TGT_LANGUAGES
        ), f"Unsupported target language: {target_lang}"
        manifests = _prepare_voxpopuli_s2s(corpus_dir, source_lang, target_lang)
    elif task == "lm":
        assert lang in ASR_LANGUAGES, f"Unsupported language: {lang}"
        manifests = _prepare_voxpopuli_lm(corpus_dir, lang)
    else:
        # Previously an unrecognized task fell through and crashed below with
        # an UnboundLocalError on ``manifests``; fail fast with a clear error.
        raise ValueError(f"Unsupported task: {task}")

    # The affix used in output file names; loop-invariant, so compute it once.
    lang_affix = f"{source_lang}-{target_lang}" if task == "s2s" else lang

    for k, v in manifests.items():
        # Repair/de-duplicate the manifests and sanity-check them.
        recordings, supervisions = fix_manifests(**v)
        validate_recordings_and_supervisions(
            recordings=recordings, supervisions=supervisions
        )
        manifests[k]["recordings"] = recordings
        manifests[k]["supervisions"] = supervisions

        if output_dir is not None:
            recordings.to_file(
                output_dir / f"voxpopuli-{task}-{lang_affix}_recordings_{k}.jsonl.gz"
            )
            supervisions.to_file(
                output_dir / f"voxpopuli-{task}-{lang_affix}_supervisions_{k}.jsonl.gz"
            )

    return manifests


def _prepare_voxpopuli_asr(
    corpus_dir: Path, output_dir: Path, lang: str, num_jobs: int = 1
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Download metadata TSV and prepare manifests for the ASR task.

    :param corpus_dir: corpus root; audio is expected under ``raw_audios/original``.
    :param output_dir: directory used to host a temporary download dir (may be None).
    :param lang: language code, one of ``ASR_LANGUAGES``.
    :param num_jobs: parallel jobs for scanning audio files.
    :return: dict mapping "train"/"dev"/"test" to {"recordings", "supervisions"}.
    """
    # First create recordings. We remove the affix "_original" from the recording ID.
    logging.info("Preparing recordings (this may take a few minutes)...")
    in_root = corpus_dir / "raw_audios" / "original"
    recordings = RecordingSet.from_dir(
        in_root,
        "*.ogg",
        num_jobs=num_jobs,
        recording_id=lambda x: x.stem.replace("_original", ""),
    )

    # Now create supervisions. The metadata TSV is downloaded into a temp dir
    # that is removed even if the download or parsing fails.
    temp_dir = Path(tempfile.mkdtemp(prefix="voxpopuli_asr_", dir=output_dir))
    try:
        url = f"{DOWNLOAD_BASE_URL}/annotations/asr/asr_{lang}.tsv.gz"
        tsv_path = temp_dir / Path(url).name
        if not tsv_path.exists():
            download_url_to_file(url, tsv_path)
        with gzip.open(tsv_path, "rt") as f:
            metadata = list(csv.DictReader(f, delimiter="|"))
    finally:
        # Delete temp dir along with its contents.
        shutil.rmtree(temp_dir)

    # Segments grouped by split (train, dev, test).
    segments = defaultdict(list)
    # Running count of segments per recording, used to build unique IDs.
    num_segments = defaultdict(int)

    for r in tqdm(metadata):
        split = r["split"]
        if split not in ["train", "dev", "test"]:
            continue
        reco_id = r["session_id"]
        start_time = float(r["start_time"])
        duration = float(r["end_time"]) - start_time

        num_segments[reco_id] += 1
        segments[split].append(
            SupervisionSegment(
                id=f"{reco_id}-{num_segments[reco_id]}",
                recording_id=reco_id,
                start=round(start_time, ndigits=8),
                duration=round(duration, ndigits=8),
                channel=0,
                language=lang,
                speaker=r["speaker_id"],
                gender=r["gender"],
                text=r["normed_text"],
                custom={
                    "orig_text": r["original_text"],
                },
            )
        )

    # Recording IDs present in each split. Use sets: only membership is
    # tested below, and a sorted list made each filter test O(n).
    reco_ids = {
        split: {s.recording_id for s in segs} for split, segs in segments.items()
    }

    manifests = defaultdict(dict)
    for split in ["train", "dev", "test"]:
        # Bind this split's IDs via a default argument: ``filter`` may be
        # evaluated lazily, and a late-binding closure over ``split`` would
        # then see only the final loop value for every split.
        ids = reco_ids.get(split, set())
        manifests[split]["recordings"] = recordings.filter(
            lambda r, ids=ids: r.id in ids
        )
        manifests[split]["supervisions"] = SupervisionSet.from_segments(segments[split])

    return manifests


def _prepare_voxpopuli_s2s(
    corpus_dir: Path, source_lang: str, target_lang: str
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Create the manifests for the speech-to-speech interpretation (s2s) task.

    Not implemented yet: calling this always raises ``NotImplementedError``.
    """
    raise NotImplementedError


def _prepare_voxpopuli_lm(corpus_dir: Path, lang: str) -> Tuple[RecordingSet, None]:
    """
    Create the manifests for the language modeling (lm) task.

    Not implemented yet: calling this always raises ``NotImplementedError``.
    """
    raise NotImplementedError