LibriLight dataset #1014
Changes from 5 commits
@@ -0,0 +1,30 @@
from typing import Dict, List, Optional, Tuple, Union

import click

from lhotse.bin.modes import download, prepare
from lhotse.recipes.librilight import prepare_librilight
from lhotse.utils import Pathlike


@prepare.command(context_settings=dict(show_default=True))
@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
@click.argument("output_dir", type=click.Path())
@click.option(
    "-j",
    "--num-jobs",
    type=int,
    default=1,
    help="How many threads to use (can give good speed-ups with slow disks).",
)
def librilight(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
):
    """LibriLight data preparation."""
    prepare_librilight(
        corpus_dir=corpus_dir,
        output_dir=output_dir,
        num_jobs=num_jobs,
    )
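As a quick, hedged illustration (not part of the diff) of how the new subcommand could be exercised once merged, the snippet below drives it through click's test runner. The corpus and output paths are placeholders and must exist locally, since `corpus_dir` is declared with `click.Path(exists=True)`; the invocation goes through the `prepare` click group that the decorator above registers into.

# Illustrative only: invoke the new `librilight` subcommand via click's test runner.
# "/data/LibriLight" and "data/manifests" are placeholder paths.
from click.testing import CliRunner

from lhotse.bin.modes import prepare

runner = CliRunner()
result = runner.invoke(
    prepare,
    ["librilight", "-j", "4", "/data/LibriLight", "data/manifests"],
)
print(result.exit_code)
print(result.output)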
@@ -0,0 +1,158 @@
"""
About the librilight corpus

Libri-light is a benchmark for the training of automatic speech recognition (ASR)
systems with limited or no supervision.

It contains a large dataset of 60K hours of unlabelled speech from audiobooks in
English and a small labelled dataset (10h, 1h, and 10 min) plus metrics,
trainable baseline models, and pretrained models that use these datasets.

It is covered in more detail at https://arxiv.org/abs/1912.07875.

This dataset is very large - please download it manually from the URLs in
LIBRILIGHT_URL.
"""

import logging
import os
from collections import defaultdict
from concurrent.futures.thread import ThreadPoolExecutor
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Tuple, Union

from tqdm.auto import tqdm

from lhotse.audio import Recording, RecordingSet
from lhotse.recipes.utils import manifests_exist
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike

LIBRILIGHT = ("small", "medium", "large")

LIBRILIGHT_URL = (
    "https://dl.fbaipublicfiles.com/librilight/data/small.tar",
    "https://dl.fbaipublicfiles.com/librilight/data/medium.tar",
    "https://dl.fbaipublicfiles.com/librilight/data/large.tar",
)


def _parse_utterance(
    corpus_dir: Pathlike,
    audio_path: Pathlike,
) -> Optional[Tuple[Recording, SupervisionSegment]]:
    file_name = audio_path.replace(".flac", "").replace(str(corpus_dir) + "/", "")
    speaker = audio_path.split("/")[-3]
    audio_path = Path(audio_path).resolve()

    if not audio_path.is_file():
        logging.warning(f"No such file: {audio_path}")
        return None

    recording = Recording.from_file(
        path=audio_path,
        recording_id=file_name,
    )
    segment = SupervisionSegment(
        id=file_name,
        recording_id=file_name,
        start=0.0,
        duration=recording.duration,
        channel=0,
        language="English",
        speaker=speaker,
    )

    return recording, segment
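To make the string handling in `_parse_utterance` concrete, here is a small sketch of the IDs it derives. The path is made up for illustration and assumes the usual LibriLight layout of {subset}/{speaker}/{book}/{file}.flac.

# Hypothetical path, following the {subset}/{speaker}/{book}/{file}.flac layout.
corpus_dir = "/data/LibriLight"
audio_path = "/data/LibriLight/small/100/sea_fairies/01_baum_sea_fairies.flac"

# Recording ID: path relative to the corpus root, without the .flac extension.
file_name = audio_path.replace(".flac", "").replace(corpus_dir + "/", "")
# Speaker ID: the directory two levels above the audio file.
speaker = audio_path.split("/")[-3]

assert file_name == "small/100/sea_fairies/01_baum_sea_fairies"
assert speaker == "100"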


def _prepare_subset(
    subset: str,
    corpus_dir: Pathlike,
    num_jobs: int = 1,
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Returns the RecordingSet and SupervisionSet given a dataset part.
    :param subset: str, the name of the subset.
    :param corpus_dir: Pathlike, the path of the data dir.
    :return: the RecordingSet and SupervisionSet for train and valid.
    """
    corpus_dir = Path(corpus_dir)
    part_path = corpus_dir / subset
    audio_paths = []
    for root, dirs, files in os.walk(part_path):

Review comment: Can this loop be turned into a one-liner with …? (see the sketch after the loop body below)

Reply: Sure.
        if len(dirs) == 0:
            audio_paths += [
                os.path.join(root, file_path)
                for file_path in files
                if file_path.endswith(".flac")
            ]
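The reviewer's suggested one-liner is cut off in the comment above. As a hedged illustration only, one way the os.walk collection could be collapsed is with pathlib's rglob; for LibriLight this gathers the same files, since the .flac files sit in the leaf directories.

# Sketch of a possible one-liner replacement for the os.walk loop above
# (illustrative; the reviewer's exact suggestion is not shown in the truncated comment).
from pathlib import Path

part_path = Path("/data/LibriLight/small")  # in the recipe: part_path = corpus_dir / subset
audio_paths = [str(p) for p in part_path.rglob("*.flac")]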

    with ThreadPoolExecutor(num_jobs) as ex:
        futures = []
        recordings = []
        supervisions = []
        for audio_path in tqdm(audio_paths, desc="Distributing tasks"):
            futures.append(ex.submit(_parse_utterance, corpus_dir, audio_path))

        for future in tqdm(futures, desc="Processing"):
            result = future.result()
            if result is None:
                continue
            recording, segment = result
            recordings.append(recording)
            supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)

    return recording_set, supervision_set


def prepare_librilight(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
Review comment: Since this is a large corpus, perhaps it would be useful to add a …

Reply: OK, I have implemented this.
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    :param corpus_dir: Path to the LibriLight dataset.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: int, the number of threads used for parsing audio files.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
        the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    output_dir = Path(output_dir) if output_dir is not None else None

    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    logging.info("Preparing LibriLight...")

    subsets = LIBRILIGHT

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)

    for part in tqdm(subsets, desc="Dataset parts"):
        logging.info(f"Processing LibriLight subset: {part}")
        if manifests_exist(
            part=part,
            output_dir=output_dir,
            prefix="librilight",
            suffix="jsonl.gz",
        ):
            logging.info(f"LibriLight subset: {part} already prepared - skipping.")
            continue

        recording_set, supervision_set = _prepare_subset(part, corpus_dir, num_jobs)

        if output_dir is not None:
            supervision_set.to_file(
                output_dir / f"librilight_supervisions_{part}.jsonl.gz"
            )
            recording_set.to_file(output_dir / f"librilight_recordings_{part}.jsonl.gz")

        manifests[part] = {"recordings": recording_set, "supervisions": supervision_set}

    return manifests
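For completeness, a minimal sketch of how the new recipe function might be called from Python once merged; the paths are placeholders, and it assumes the three subsets have already been downloaded and unpacked under the corpus directory.

# Illustrative usage of prepare_librilight; paths are hypothetical.
from lhotse.recipes.librilight import prepare_librilight

manifests = prepare_librilight(
    corpus_dir="/data/LibriLight",  # directory containing small/, medium/, large/
    output_dir="data/manifests",    # jsonl.gz manifests are written here
    num_jobs=4,                     # parse audio files with 4 threads
)
recordings = manifests["small"]["recordings"]
supervisions = manifests["small"]["supervisions"]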
Review comment: If we're not adding download functions, could you at least provide a link to where this dataset can be obtained from?

Reply: Sure.
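For reference, the tarballs now listed in LIBRILIGHT_URL can be fetched and unpacked manually, roughly along these lines. This is a hedged sketch only: the destination path is a placeholder, I am assuming each tarball contains its subset directory at the top level, and the full corpus is on the order of 60K hours of audio, so check disk space before downloading.

# Sketch of manually fetching and unpacking one LibriLight tarball.
# The URL comes from LIBRILIGHT_URL in the recipe above; the destination is a placeholder.
import tarfile
import urllib.request

url = "https://dl.fbaipublicfiles.com/librilight/data/small.tar"
urllib.request.urlretrieve(url, "small.tar")

with tarfile.open("small.tar") as tar:
    # Expected (not verified here) to unpack a subset directory, e.g. /data/LibriLight/small/...
    tar.extractall(path="/data/LibriLight")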