Aishell3 (#998)
Adds dataset download and preparation code for AISHELL-3.
pzelasko authored Mar 19, 2023
2 parents 7e8d6b0 + 9a81a0a commit 4cbd1bd
Showing 5 changed files with 169 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docs/corpus.rst
@@ -49,6 +49,8 @@ a CLI tool that create the manifests given a corpus directory.
     - :func:`lhotse.recipes.prepare_aidatatang_200zh`
   * - Aishell
     - :func:`lhotse.recipes.prepare_aishell`
   * - Aishell-3
     - :func:`lhotse.recipes.prepare_aishell3`
   * - AISHELL-4
     - :func:`lhotse.recipes.prepare_aishell4`
   * - AliMeeting
1 change: 1 addition & 0 deletions lhotse/bin/modes/recipes/__init__.py
@@ -2,6 +2,7 @@
from .aidatatang_200zh import *
from .aishell import *
from .aishell2 import *
from .aishell3 import *
from .aishell4 import *
from .ali_meeting import *
from .ami import *
22 changes: 22 additions & 0 deletions lhotse/bin/modes/recipes/aishell3.py
@@ -0,0 +1,22 @@
import click

from lhotse.bin.modes import download, prepare
from lhotse.recipes.aishell3 import download_aishell3, prepare_aishell3
from lhotse.utils import Pathlike

__all__ = ["aishell3"]


@prepare.command(context_settings=dict(show_default=True))
@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
@click.argument("output_dir", type=click.Path())
def aishell3(corpus_dir: Pathlike, output_dir: Pathlike):
"""aishell3 data preparation."""
prepare_aishell3(corpus_dir, output_dir=output_dir)


@download.command(context_settings=dict(show_default=True))
@click.argument("target_dir", type=click.Path(), default=".")
def aishell3(target_dir: Pathlike):
"""aishell3 download."""
download_aishell3(target_dir)
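
The file above only registers the two click commands with the existing `download` and `prepare` groups. Below is a minimal sketch (not part of the commit) of exercising them from Python via click's test runner; the /data paths are placeholders, and the corpus directory may need adjusting depending on where the archive extracts.

from click.testing import CliRunner

from lhotse.bin.modes import download, prepare
import lhotse.bin.modes.recipes.aishell3  # noqa: F401  -- ensures the commands are registered

runner = CliRunner()

# Roughly equivalent to the CLI call: lhotse download aishell3 /data
# (note: this fetches and extracts the full data_aishell3.tgz archive)
result = runner.invoke(download, ["aishell3", "/data"])
print(result.exit_code, result.output)

# Roughly equivalent to: lhotse prepare aishell3 /data /data/manifests
# (corpus_dir must already exist because of click.Path(exists=True))
result = runner.invoke(prepare, ["aishell3", "/data", "/data/manifests"])
print(result.exit_code, result.output)
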
1 change: 1 addition & 0 deletions lhotse/recipes/__init__.py
@@ -1,5 +1,6 @@
from .adept import download_adept, prepare_adept
from .aishell import download_aishell, prepare_aishell
from .aishell3 import download_aishell3, prepare_aishell3
from .aishell4 import download_aishell4, prepare_aishell4
from .ali_meeting import download_ali_meeting, prepare_ali_meeting
from .ami import download_ami, prepare_ami
143 changes: 143 additions & 0 deletions lhotse/recipes/aishell3.py
@@ -0,0 +1,143 @@
"""
AISHELL-3 is a large-scale, high-fidelity multi-speaker Mandarin speech corpus
published by Beijing Shell Shell Technology Co., Ltd.
It can be used to train multi-speaker Text-to-Speech (TTS) systems.
The corpus contains roughly 85 hours of emotion-neutral recordings spoken by
218 native Mandarin Chinese speakers, totalling 88,035 utterances.
Auxiliary speaker attributes such as gender, age group, and native accent are
explicitly marked and provided in the corpus, and transcripts at both the Chinese
character level and the pinyin level are provided along with the recordings.
The word and tone transcription accuracy is above 98%, achieved through
professional speech annotation and strict quality inspection of tone and prosody.
"""
import logging
import shutil
import tarfile
from pathlib import Path
from typing import Dict, Optional, Sequence, Union

from tqdm import tqdm

from lhotse import (
    RecordingSet,
    SupervisionSegment,
    SupervisionSet,
    validate_recordings_and_supervisions,
)
from lhotse.audio import Recording, RecordingSet
from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
from lhotse.utils import Pathlike, safe_extract, urlretrieve_progress

aishell3 = (
"test",
"train",
)


def download_aishell3(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    base_url: Optional[str] = "http://www.openslr.org/resources",
) -> Path:
    """
    Download and untar the dataset.
    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the url of the OpenSLR resources.
    :return: the path to downloaded and extracted directory with data.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    url = f"{base_url}/93"
    tar_name = "data_aishell3.tgz"
    tar_path = target_dir / tar_name
    completed_detector = target_dir / ".completed"
    if completed_detector.is_file():
        logging.info(f"Skipping {tar_name} because {completed_detector} exists.")
        return target_dir
    if force_download or not tar_path.is_file():
        urlretrieve_progress(
            f"{url}/{tar_name}", filename=tar_path, desc=f"Downloading {tar_name}"
        )
    with tarfile.open(tar_path) as tar:
        safe_extract(tar, path=target_dir)
    completed_detector.touch()
    return target_dir


def prepare_aishell3(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    dataset_parts = aishell3
    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(
            dataset_parts=dataset_parts, output_dir=output_dir, prefix="aishell3"
        )

    for part in tqdm(dataset_parts, desc="Preparing aishell3 parts"):
        if manifests_exist(part=part, output_dir=output_dir, prefix="aishell3"):
            logging.info(f"aishell3 subset: {part} already prepared - skipping.")
            continue
        part_path = corpus_dir / part
        scripts_path = part_path / "content.txt"
        assert scripts_path.is_file(), f"No such file: {scripts_path}"
        recordings = []
        supervisions = []
        with open(scripts_path) as f:
            for line in tqdm(f):
                # Each line is "<utt>.wav\t<char> <pinyin> <char> <pinyin> ...";
                # the first 7 characters of the file name give the per-speaker subdirectory under wav/.
                id, text = line.strip().split("\t")
                audio_path = part_path / "wav" / id[:7] / id
                id = id.split(".")[0]
                # Split once: even-indexed tokens are Chinese characters,
                # odd-indexed tokens are the corresponding pinyin syllables.
                tokens = text.split()
                text = "".join(tokens[0::2])
                pinyin = " ".join(tokens[1::2])
                if not audio_path.is_file():
                    logging.warning(f"No such file: {audio_path}")
                    continue
                recording = Recording.from_file(audio_path)
                segment = SupervisionSegment(
                    id=id,
                    recording_id=id,
                    start=0.0,
                    duration=recording.duration,
                    channel=0,
                    language="Chinese",
                    gender="female",
                    text=text,
                    custom={"pinyin": pinyin.strip()},
                )
                recordings.append(recording)
                supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)

        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(
                output_dir / f"aishell3_supervisions_{part}.jsonl.gz"
            )
            recording_set.to_file(output_dir / f"aishell3_recordings_{part}.jsonl.gz")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
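
As a usage sketch (not part of the commit), the two functions above can also be called directly from Python; the directory names below are placeholders, and corpus_dir may need adjusting if the archive extracts into a subdirectory.

from lhotse import CutSet
from lhotse.recipes import download_aishell3, prepare_aishell3

# Download and extract data_aishell3.tgz (skipped if a .completed marker exists).
corpus_dir = download_aishell3("downloads")

# Build RecordingSet/SupervisionSet manifests for the "train" and "test" parts.
manifests = prepare_aishell3(corpus_dir, output_dir="data/manifests")

recordings = manifests["train"]["recordings"]      # RecordingSet
supervisions = manifests["train"]["supervisions"]  # SupervisionSet; pinyin is stored in .custom
cuts = CutSet.from_manifests(recordings=recordings, supervisions=supervisions)
print(cuts)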
