LibriLight dataset #1014

Merged: 7 commits, Apr 1, 2023
Changes shown from 5 commits
2 changes: 2 additions & 0 deletions docs/corpus.rst
@@ -111,6 +111,8 @@ a CLI tool that create the manifests given a corpus directory.
- :func:`lhotse.recipes.prepare_l2_arctic`
* - LibriCSS
- :func:`lhotse.recipes.prepare_libricss`
* - LibriLight
- :func:`lhotse.recipes.prepare_librilight`
* - LibriSpeech (including "mini")
- :func:`lhotse.recipes.prepare_librispeech`
* - LibriTTS
1 change: 1 addition & 0 deletions lhotse/bin/modes/recipes/__init__.py
@@ -36,6 +36,7 @@
from .icsi import *
from .l2_arctic import *
from .libricss import *
from .librilight import *
from .librimix import *
from .librispeech import *
from .libritts import *
30 changes: 30 additions & 0 deletions lhotse/bin/modes/recipes/librilight.py
@@ -0,0 +1,30 @@
from typing import Dict, List, Optional, Tuple, Union

import click

from lhotse.bin.modes import download, prepare
from lhotse.recipes.librilight import prepare_librilight
from lhotse.utils import Pathlike


@prepare.command(context_settings=dict(show_default=True))
@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
@click.argument("output_dir", type=click.Path())
@click.option(
    "-j",
    "--num-jobs",
    type=int,
    default=1,
    help="How many threads to use (can give good speed-ups with slow disks).",
)
def librilight(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
):
    """LibriLight data preparation."""
    prepare_librilight(
        corpus_dir=corpus_dir,
        output_dir=output_dir,
        num_jobs=num_jobs,
    )
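
Once the module is registered in lhotse/bin/modes/recipes/__init__.py (the next hunk), the command should become available as lhotse prepare librilight CORPUS_DIR OUTPUT_DIR -j NUM_JOBS. As a rough sketch only (the paths below are placeholders, not part of this PR), the click entry point can also be exercised directly from Python:

from click.testing import CliRunner

from lhotse.bin.modes.recipes.librilight import librilight

# Placeholder paths: point these at a real LibriLight checkout before running,
# since the corpus_dir argument is declared with click.Path(exists=True).
runner = CliRunner()
result = runner.invoke(librilight, ["/data/LibriLight", "data/manifests", "-j", "8"])
print(result.exit_code)
print(result.output)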
1 change: 1 addition & 0 deletions lhotse/recipes/__init__.py
@@ -32,6 +32,7 @@
from .icsi import download_icsi, prepare_icsi
from .l2_arctic import prepare_l2_arctic
from .libricss import download_libricss, prepare_libricss
from .librilight import prepare_librilight
from .librimix import download_librimix, prepare_librimix
from .librispeech import download_librispeech, prepare_librispeech
from .libritts import download_libritts, prepare_libritts
158 changes: 158 additions & 0 deletions lhotse/recipes/librilight.py
@@ -0,0 +1,158 @@
"""
About the librilight corpus

Libri-light is a benchmark for the training of automatic speech recognition (ASR)
systems with limited or no supervision.

It contains a large dataset of 60K hours of unlabelled speech from audiobooks in
English and a small labelled dataset (10h, 1h, and 10 min) plus metrics,
trainable baseline models, and pretrained models that use these datasets.

It is covered in more detail at https://arxiv.org/abs/1912.07875.

This corpus is very large; please download it manually from the URLs listed in LIBRILIGHT_URL.
"""
Collaborator:
If we're not adding download functions, could you at least provide a link to where this dataset can be obtained from?

Contributor Author:
Sure.


import logging
import os
from collections import defaultdict
from concurrent.futures.thread import ThreadPoolExecutor
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Tuple, Union

from tqdm.auto import tqdm

from lhotse.audio import Recording, RecordingSet
from lhotse.recipes.utils import manifests_exist
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike

LIBRILIGHT = ("small", "medium", "large")

LIBRILIGHT_URL = (
    "https://dl.fbaipublicfiles.com/librilight/data/small.tar",
    "https://dl.fbaipublicfiles.com/librilight/data/medium.tar",
    "https://dl.fbaipublicfiles.com/librilight/data/large.tar",
)
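
The recipe does not add a download function, so the tarballs listed in LIBRILIGHT_URL have to be fetched and unpacked by hand. A minimal sketch for the small subset, assuming a placeholder corpus directory (the medium and large tarballs are far bigger, so a resumable downloader is usually a better fit):

import tarfile
import urllib.request
from pathlib import Path

# Placeholder corpus root: the recipe only needs the extracted
# small/medium/large folders to live under a single directory.
corpus_dir = Path("/data/LibriLight")
corpus_dir.mkdir(parents=True, exist_ok=True)

url = "https://dl.fbaipublicfiles.com/librilight/data/small.tar"
tar_path = corpus_dir / "small.tar"

# Download the tarball and extract it in place.
urllib.request.urlretrieve(url, tar_path)
with tarfile.open(tar_path) as tar:
    tar.extractall(path=corpus_dir)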


def _parse_utterance(
    corpus_dir: Pathlike,
    audio_path: Pathlike,
) -> Optional[Tuple[Recording, SupervisionSegment]]:
    file_name = audio_path.replace(".flac", "").replace(str(corpus_dir) + "/", "")
    speaker = audio_path.split("/")[-3]
    audio_path = Path(audio_path).resolve()

    if not audio_path.is_file():
        logging.warning(f"No such file: {audio_path}")
        return None

    recording = Recording.from_file(
        path=audio_path,
        recording_id=file_name,
    )
    segment = SupervisionSegment(
        id=file_name,
        recording_id=file_name,
        start=0.0,
        duration=recording.duration,
        channel=0,
        language="English",
        speaker=speaker,
    )

    return recording, segment
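
To make the path handling above concrete, here is a small standalone sketch of what _parse_utterance derives from a typical LibriLight path. The layout follows the corpus convention subset/speaker/book/file.flac; the specific speaker and book names below are illustrative:

from pathlib import Path

corpus_dir = Path("/data/LibriLight")  # placeholder corpus root
# LibriLight stores audio as <subset>/<speaker>/<book>/<file>.flac
audio_path = "/data/LibriLight/small/100/sea_fairies_0812_librivox_64kb_mp3/01_baum_sea_fairies_64kb.flac"

# recording_id: path relative to the corpus root, without the extension
file_name = audio_path.replace(".flac", "").replace(str(corpus_dir) + "/", "")
# speaker: third-from-last path component, i.e. the speaker directory
speaker = audio_path.split("/")[-3]

print(file_name)  # small/100/sea_fairies_0812_librivox_64kb_mp3/01_baum_sea_fairies_64kb
print(speaker)    # 100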


def _prepare_subset(
    subset: str,
    corpus_dir: Pathlike,
    num_jobs: int = 1,
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Returns the RecordingSet and SupervisionSet given a dataset part.
    :param subset: str, the name of the subset.
    :param corpus_dir: Pathlike, the path of the data dir.
    :param num_jobs: int, the number of threads used to parse utterances in parallel.
    :return: the RecordingSet and SupervisionSet of the subset.
    """
    corpus_dir = Path(corpus_dir)
    part_path = corpus_dir / subset
    audio_paths = []
    for root, dirs, files in os.walk(part_path):
Collaborator:
Can this loop be turned into a one-liner with Path(part_path).rglob("*.flac")?

Contributor Author:
Sure.

        if len(dirs) == 0:
            audio_paths += [
                os.path.join(root, file_path)
                for file_path in files
                if file_path.endswith(".flac")
            ]

    with ThreadPoolExecutor(num_jobs) as ex:
        futures = []
        recordings = []
        supervisions = []
        for audio_path in tqdm(audio_paths, desc="Distributing tasks"):
            futures.append(ex.submit(_parse_utterance, corpus_dir, audio_path))

        for future in tqdm(futures, desc="Processing"):
            result = future.result()
            if result is None:
                continue
            recording, segment = result
            recordings.append(recording)
            supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)

    return recording_set, supervision_set
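
As the reviewer suggests above, the os.walk loop can be collapsed into a single rglob call. A sketch of that variant, kept as strings because _parse_utterance manipulates the path with str methods (the corpus path is a placeholder):

from pathlib import Path

part_path = Path("/data/LibriLight") / "small"  # placeholder subset directory

# One-liner replacement for the os.walk loop: recursively collect every
# .flac file under the subset; flac files only appear in leaf directories,
# so this matches the original filter on len(dirs) == 0.
audio_paths = [str(p) for p in part_path.rglob("*.flac")]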


def prepare_librilight(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
Collaborator:
Since this is a large corpus, perhaps it would be useful to add a num_jobs option to speed up manifest creation? Check the LibriSpeech recipe for an example.

Contributor Author (@yfyeung, Mar 31, 2023):
OK, I have implemented this.

    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    :param corpus_dir: Path to the LibriLight dataset.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: int, the number of threads used to parse utterances in parallel.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    output_dir = Path(output_dir) if output_dir is not None else None

    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    logging.info("Preparing LibriLight...")

    subsets = LIBRILIGHT

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)

    for part in tqdm(subsets, desc="Dataset parts"):
        logging.info(f"Processing LibriLight subset: {part}")
        if manifests_exist(
            part=part,
            output_dir=output_dir,
            prefix="librilight",
            suffix="jsonl.gz",
        ):
            logging.info(f"LibriLight subset: {part} already prepared - skipping.")
            continue

        recording_set, supervision_set = _prepare_subset(part, corpus_dir, num_jobs)

        if output_dir is not None:
            supervision_set.to_file(
                output_dir / f"librilight_supervisions_{part}.jsonl.gz"
            )
            recording_set.to_file(output_dir / f"librilight_recordings_{part}.jsonl.gz")

        manifests[part] = {"recordings": recording_set, "supervisions": supervision_set}

    return manifests
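
Finally, a short sketch of how the returned manifests might be consumed from Python; the paths are placeholders, and CutSet.from_manifests is the standard lhotse way to pair a RecordingSet with its SupervisionSet:

from lhotse import CutSet
from lhotse.recipes import prepare_librilight

# Prepare all three subsets and write the manifests to disk (placeholder paths).
manifests = prepare_librilight(
    corpus_dir="/data/LibriLight",
    output_dir="data/manifests",
    num_jobs=8,
)

# Combine the "small" subset into a CutSet for downstream feature extraction.
cuts_small = CutSet.from_manifests(
    recordings=manifests["small"]["recordings"],
    supervisions=manifests["small"]["supervisions"],
)
print(cuts_small)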