Aishell3 (#998)

Dataset download and prepare code supported for aishell3.
lhotse-speech · Mar 19, 2023 · 4cbd1bd · 4cbd1bd
2 parents 7e8d6b0 + 9a81a0a
commit 4cbd1bd
Show file tree

Hide file tree

Showing 5 changed files with 169 additions and 0 deletions.
diff --git a/docs/corpus.rst b/docs/corpus.rst
@@ -49,6 +49,8 @@ a CLI tool that create the manifests given a corpus directory.
     - :func:`lhotse.recipes.prepare_aidatatang_200zh`
   * - Aishell
     - :func:`lhotse.recipes.prepare_aishell`
+  * - Aishell-3
+    - :func:`lhotse.recipes.prepare_aishell3`
   * - AISHELL-4
     - :func:`lhotse.recipes.prepare_aishell4`
   * - AliMeeting

diff --git a/lhotse/bin/modes/recipes/__init__.py b/lhotse/bin/modes/recipes/__init__.py
@@ -2,6 +2,7 @@
 from .aidatatang_200zh import *
 from .aishell import *
 from .aishell2 import *
+from .aishell3 import *
 from .aishell4 import *
 from .ali_meeting import *
 from .ami import *

diff --git a/lhotse/bin/modes/recipes/aishell3.py b/lhotse/bin/modes/recipes/aishell3.py
@@ -0,0 +1,22 @@
+import click
+
+from lhotse.bin.modes import download, prepare
+from lhotse.recipes.aishell3 import download_aishell3, prepare_aishell3
+from lhotse.utils import Pathlike
+
+__all__ = ["aishell3"]
+
+
+@prepare.command(context_settings=dict(show_default=True))
+@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
+@click.argument("output_dir", type=click.Path())
+def aishell3(corpus_dir: Pathlike, output_dir: Pathlike):
+    """aishell3 data preparation."""
+    prepare_aishell3(corpus_dir, output_dir=output_dir)
+
+
+@download.command(context_settings=dict(show_default=True))
+@click.argument("target_dir", type=click.Path(), default=".")
+def aishell3(target_dir: Pathlike):
+    """aishell3 download."""
+    download_aishell3(target_dir)
diff --git a/lhotse/recipes/__init__.py b/lhotse/recipes/__init__.py
@@ -1,5 +1,6 @@
 from .adept import download_adept, prepare_adept
 from .aishell import download_aishell, prepare_aishell
+from .aishell3 import download_aishell3, prepare_aishell3
 from .aishell4 import download_aishell4, prepare_aishell4
 from .ali_meeting import download_ali_meeting, prepare_ali_meeting
 from .ami import download_ami, prepare_ami

diff --git a/lhotse/recipes/aishell3.py b/lhotse/recipes/aishell3.py
@@ -0,0 +1,143 @@
+"""
+AISHELL-3 is a large-scale and high-fidelity multi-speaker Mandarin speech corpus 
+published by Beijing Shell Shell Technology Co.,Ltd. 
+It can be used to train multi-speaker Text-to-Speech (TTS) systems.
+The corpus contains roughly 85 hours of emotion-neutral recordings spoken by 
+218 native Chinese mandarin speakers and total 88035 utterances. 
+Their auxiliary attributes such as gender, age group and native accents are 
+explicitly marked and provided in the corpus. Accordingly, transcripts in Chinese 
+character-level and pinyin-level are provided along with the recordings. 
+The word & tone transcription accuracy rate is above 98%, through professional 
+speech annotation and strict quality inspection for tone and prosody.
+"""
+import logging
+import shutil
+import tarfile
+from pathlib import Path
+from typing import Dict, Optional, Sequence, Union
+
+from tqdm import tqdm
+
+from lhotse import (
+    RecordingSet,
+    SupervisionSegment,
+    SupervisionSet,
+    validate_recordings_and_supervisions,
+)
+from lhotse.audio import Recording, RecordingSet
+from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
+from lhotse.utils import Pathlike, safe_extract, urlretrieve_progress
+
+aishell3 = (
+    "test",
+    "train",
+)
+
+
+def download_aishell3(
+    target_dir: Pathlike = ".",
+    force_download: Optional[bool] = False,
+    base_url: Optional[str] = "http://www.openslr.org/resources",
+) -> Path:
+    """
+    Download and untar the dataset
+
+    :param target_dir: Pathlike, the path of the dir to storage the dataset.
+    :param force_download: Bool, if True, download the tars no matter if the tars exist.
+    :param base_url: str, the url of the OpenSLR resources.
+    :return: the path to downloaded and extracted directory with data.
+    """
+    target_dir = Path(target_dir)
+    target_dir.mkdir(parents=True, exist_ok=True)
+
+    url = f"{base_url}/93"
+    tar_name = "data_aishell3.tgz"
+    tar_path = target_dir / tar_name
+    completed_detector = target_dir / ".completed"
+    if completed_detector.is_file():
+        logging.info(f"Skipping {tar_name} because {completed_detector} exists.")
+        return
+    if force_download or not tar_path.is_file():
+        urlretrieve_progress(
+            f"{url}/{tar_name}", filename=tar_path, desc=f"Downloading {tar_name}"
+        )
+    with tarfile.open(tar_path) as tar:
+        safe_extract(tar, path=target_dir)
+    completed_detector.touch()
+    return target_dir
+
+
+def prepare_aishell3(
+    corpus_dir: Pathlike,
+    output_dir: Optional[Pathlike] = None,
+) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
+    """
+    Returns the manifests which consist of the Recordings and Supervisions.
+    When all the manifests are available in the ``output_dir``, it will simply read and return them.
+
+    :param corpus_dir: Pathlike, the path of the data dir.
+    :param output_dir: Pathlike, the path where to write the manifests.
+    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
+    """
+    corpus_dir = Path(corpus_dir)
+    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
+
+    dataset_parts = aishell3
+    manifests = {}
+
+    if output_dir is not None:
+        output_dir = Path(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
+        manifests = read_manifests_if_cached(
+            dataset_parts=dataset_parts, output_dir=output_dir, prefix="aishell3"
+        )
+
+    for part in tqdm(dataset_parts, desc="Preparing aishell3 parts"):
+        if manifests_exist(part=part, output_dir=output_dir, prefix="aishell3"):
+            logging.info(f"aishell3 subset: {part} already prepared - skipping.")
+            continue
+        part_path = corpus_dir / part
+        scripts_path = part_path / "content.txt"
+        assert scripts_path.is_file(), f"No such file: {scripts_path}"
+        recordings = []
+        supervisions = []
+        with open(scripts_path) as f:
+            for line in tqdm(f):
+                id, text = line.strip().split("\t")
+                audio_path = part_path / "wav" / id[:7] / id
+                id = id.split(".")[0]
+                text = "".join([x for i, x in enumerate(text.split()) if i % 2 == 0])
+                pinyin = " ".join([x for i, x in enumerate(text.split()) if i % 2 == 1])
+                if not audio_path.is_file():
+                    logging.warning(f"No such file: {audio_path}")
+                    continue
+                recording = Recording.from_file(audio_path)
+                segment = SupervisionSegment(
+                    id=id,
+                    recording_id=id,
+                    start=0.0,
+                    duration=recording.duration,
+                    channel=0,
+                    language="Chinese",
+                    gender="female",
+                    text=text,
+                    custom={"pinyin": pinyin.strip()},
+                )
+                recordings.append(recording)
+                supervisions.append(segment)
+
+        recording_set = RecordingSet.from_recordings(recordings)
+        supervision_set = SupervisionSet.from_segments(supervisions)
+
+        validate_recordings_and_supervisions(recording_set, supervision_set)
+
+        if output_dir is not None:
+            supervision_set.to_file(
+                output_dir / f"aishell3_supervisions_{part}.jsonl.gz"
+            )
+            recording_set.to_file(output_dir / f"aishell3_recordings_{part}.jsonl.gz")
+
+        manifests[part] = {"recordings": supervision_set, "supervisions": recording_set}
+
+    return manifests