Skip to content

Commit

Permalink
[Recipe] Wenetspeech4tts (lhotse-speech#1384)
Browse files Browse the repository at this point in the history
* add wenetspeech4tts recipe

* fix wenetspeech4tts recipe

* fix wenetspeech4tts recipe float

* fix wenetspeech4tts recipe typo

* fix wenetspeech4tts recipe typo

* add wenetspeech4tts doc
  • Loading branch information
yuekaizhang authored and Your Name committed Jan 7, 2025
1 parent 4593ccc commit 0148c8d
Show file tree
Hide file tree
Showing 5 changed files with 229 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docs/corpus.rst
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,8 @@ a CLI tool that create the manifests given a corpus directory.
- :func:`lhotse.recipes.prepare_voxpopuli`
* - WenetSpeech
- :func:`lhotse.recipes.prepare_wenet_speech`
* - WenetSpeech4TTS
- :func:`lhotse.recipes.prepare_wenetspeech4tts`
* - YesNo
- :func:`lhotse.recipes.prepare_yesno`
* - Eval2000
Expand Down
1 change: 1 addition & 0 deletions lhotse/bin/modes/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,5 +87,6 @@
from .voxconverse import *
from .voxpopuli import *
from .wenet_speech import *
from .wenetspeech4tts import *
from .xbmu_amdo31 import *
from .yesno import *
43 changes: 43 additions & 0 deletions lhotse/bin/modes/recipes/wenetspeech4tts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from typing import Sequence

import click

from lhotse.bin.modes import prepare
from lhotse.recipes import prepare_wenetspeech4tts
from lhotse.utils import Pathlike

# Public names of this module: only the CLI command below is exported
# when the module is star-imported.
__all__ = ["wenetspeech4tts"]


@prepare.command(context_settings={"show_default": True})
@click.argument(
    "corpus_dir",
    type=click.Path(exists=True, dir_okay=True),
)
@click.argument(
    "output_dir",
    type=click.Path(),
)
@click.option(
    "-j",
    "--num-jobs",
    default=1,
    type=int,
    help="How many jobs to use (can give good speed-ups with slow disks).",
)
@click.option(
    "-p",
    "--dataset-parts",
    multiple=True,
    default=["all"],
    type=str,
    help=(
        "List of dataset parts to prepare. "
        "To prepare multiple parts, pass each with `-p` "
        "Example: `-p Basic -p Premium`"
    ),
)
def wenetspeech4tts(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    dataset_parts: Sequence[str],
    num_jobs: int,
):
    """WenetSpeech4TTS data preparation."""
    # NOTE: the docstring above doubles as the command's help text, so it is
    # kept short. This command is a thin wrapper that forwards the parsed
    # command-line values to the recipe function unchanged.
    prepare_wenetspeech4tts(
        corpus_dir=corpus_dir,
        dataset_parts=dataset_parts,
        output_dir=output_dir,
        num_jobs=num_jobs,
    )
1 change: 1 addition & 0 deletions lhotse/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@
from .voxconverse import download_voxconverse, prepare_voxconverse
from .voxpopuli import download_voxpopuli, prepare_voxpopuli
from .wenet_speech import prepare_wenet_speech
from .wenetspeech4tts import prepare_wenetspeech4tts
from .xbmu_amdo31 import download_xbmu_amdo31, prepare_xbmu_amdo31
from .yesno import download_yesno, prepare_yesno

Expand Down
182 changes: 182 additions & 0 deletions lhotse/recipes/wenetspeech4tts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
"""
This recipe prepares the Chinese TTS corpus WenetSpeech4TTS.
Paper: https://arxiv.org/abs/2406.05763v3
HuggingFace Dataset: https://huggingface.co/datasets/Wenetspeech4TTS/WenetSpeech4TTS
Download using huggingface-cli:
huggingface-cli login
huggingface-cli download --repo-type dataset --local-dir $DATA_DIR Wenetspeech4TTS/WenetSpeech4TTS
Extract the downloaded data:
for folder in Standard Premium Basic; do
for file in "$folder"/*.tar.gz; do
tar -xzvf "$file" -C "$folder"
done
done
"""
import logging
import re
import shutil
import tarfile
from pathlib import Path
from typing import Dict, Optional, Sequence, Union

from tqdm import tqdm

from lhotse import (
SupervisionSegment,
SupervisionSet,
fix_manifests,
validate_recordings_and_supervisions,
)
from lhotse.audio import Recording, RecordingSet
from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
from lhotse.utils import Pathlike, resumable_download, safe_extract

# Names of the dataset parts accepted by ``prepare_wenetspeech4tts``.
# Requesting the special value "all" selects every entry of this tuple.
WENETSPEECH4TTS = (
"Basic",
"Premium",
"Standard",
)


def prepare_wenetspeech4tts(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "Basic",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir. It is expected to contain
        ``filelists/Basic_filelist.lst``, ``DNSMOS_P808Scores/<part>_DNSMOS.lst`` and the
        extracted audio/text folders referenced by the filelist.
    :param dataset_parts: string or sequence of strings representing dataset part names,
        e.g. 'Basic', 'Premium'. Pass 'all' to prepare every part. By default only
        'Basic' is prepared.
    :param output_dir: Pathlike, the path where to write the manifests.
        When None, nothing is written or read from cache.
    :param num_jobs: the number of parallel workers parsing the data.
        Currently unused: the data is parsed sequentially.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys
        'recordings' and 'supervisions'.
    :raises AssertionError: if ``corpus_dir`` is not a directory or a requested part is unknown.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    # Normalize to a list first so that a bare string, an empty sequence and a
    # tuple coming from the CLI are all handled by the same code path.
    if isinstance(dataset_parts, str):
        dataset_parts = [dataset_parts]
    else:
        dataset_parts = list(dataset_parts)
    if "all" in dataset_parts:
        dataset_parts = list(WENETSPEECH4TTS)
    # Drop duplicates (keeping order) so that no part is prepared twice.
    dataset_parts = list(dict.fromkeys(dataset_parts))
    # Validate every requested part: an unknown name must not silently fall
    # back to another subset and be saved under the wrong name.
    for part in dataset_parts:
        assert part in WENETSPEECH4TTS, f"Unsupported dataset part: {part}"

    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(
            dataset_parts=dataset_parts, output_dir=output_dir, prefix="wenetspeech4tts"
        )

    pending_parts = []
    for part in dataset_parts:
        if manifests_exist(part=part, output_dir=output_dir, prefix="wenetspeech4tts"):
            logging.info(f"WenetSpeech4TTS subset: {part} already prepared - skipping.")
        else:
            pending_parts.append(part)
    if not pending_parts:
        # Nothing left to do: avoid parsing the (large) corpus lists at all.
        return manifests

    wav_scp_by_part = _read_wenetspeech4tts_filelist(
        corpus_dir / "filelists" / "Basic_filelist.lst"
    )

    for part in pending_parts:
        wav_scp_dict = wav_scp_by_part[part]
        # Only the score list of the part being prepared is needed.
        dns_mos_dict = _read_wenetspeech4tts_dnsmos(
            corpus_dir / "DNSMOS_P808Scores" / f"{part}_DNSMOS.lst"
        )

        recordings = []
        supervisions = []
        for wav_name, rel_wav_path in tqdm(
            wav_scp_dict.items(), desc=f"Preparing WenetSpeech4TTS {part}"
        ):
            # get the actual wav path, remove the prefix '../'
            # e.g. ../Premium/WenetSpeech4TTS_Premium_9/wavs/X0000015306_83500032_S00110-S00112.wav
            #   -> Premium/WenetSpeech4TTS_Premium_9/wavs/X0000015306_83500032_S00110-S00112.wav
            assert rel_wav_path.startswith(
                "../"
            ), f"Unexpected wav path in filelist: {rel_wav_path}"
            wav_path = corpus_dir / rel_wav_path[3:]
            if not wav_path.is_file():
                logging.warning(f"No such file: {wav_path}")
                continue

            # get the text path
            # e.g. ../Premium/WenetSpeech4TTS_Premium_9/txts/X0000015306_83500032_S00110-S00112.txt
            txt_path = wav_path.parent.parent / "txts" / f"{wav_path.stem}.txt"
            if not txt_path.is_file():
                logging.warning(f"No such file: {txt_path}")
                continue
            transcript = _read_wenetspeech4tts_transcript(txt_path)
            if transcript is None:
                logging.warning(f"Malformed transcript file: {txt_path}")
                continue
            text, timestamp = transcript

            # The audio header is read only once the transcript is known to be
            # usable; an utterance without a supervision is not kept anyway.
            recording = Recording.from_file(wav_path)
            recordings.append(recording)
            supervisions.append(
                SupervisionSegment(
                    id=wav_name,
                    recording_id=wav_name,
                    start=0.0,
                    duration=recording.duration,
                    channel=0,
                    language="Chinese",
                    text=text,
                    custom={
                        "timestamp": timestamp,
                        "dns_mos": dns_mos_dict.get(wav_name, None),
                    },
                )
            )

        recordings = RecordingSet.from_recordings(recordings)
        supervisions = SupervisionSet.from_segments(supervisions)
        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            supervisions.to_file(
                output_dir / f"wenetspeech4tts_supervisions_{part}.jsonl.gz"
            )
            recordings.to_file(
                output_dir / f"wenetspeech4tts_recordings_{part}.jsonl.gz"
            )

        manifests[part] = {"recordings": recordings, "supervisions": supervisions}

    return manifests


def _read_wenetspeech4tts_filelist(path: Path) -> Dict[str, Dict[str, str]]:
    """Read the Basic filelist and split it into one ``{utt_id: wav_path}`` dict per part."""
    wav_scp_by_part = {"Basic": {}, "Premium": {}, "Standard": {}}
    with open(path, encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if len(fields) < 2:
                # Skip blank or incomplete lines instead of failing on them.
                continue
            utt_id, wav_path = fields[0], fields[1]
            # The Basic filelist lists every utterance; the other two parts
            # are selected from it by the folder name found in the wav path.
            wav_scp_by_part["Basic"][utt_id] = wav_path
            if "Basic" not in wav_path:
                wav_scp_by_part["Standard"][utt_id] = wav_path
            if "Premium" in wav_path:
                wav_scp_by_part["Premium"][utt_id] = wav_path
    return wav_scp_by_part


def _read_wenetspeech4tts_dnsmos(path: Path) -> Dict[str, float]:
    """Read a ``<utt_id> <score>`` list into a ``{utt_id: score}`` dict."""
    scores = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if len(fields) < 2:
                # Skip blank or incomplete lines instead of failing on them.
                continue
            scores[fields[0]] = float(fields[1])
    return scores


def _read_wenetspeech4tts_transcript(txt_path: Path) -> Optional[tuple]:
    """Return ``(text, timestamp)`` read from a transcript file, or None if it is malformed."""
    # The transcripts hold Chinese text: decode explicitly as UTF-8 rather
    # than relying on the platform's default encoding.
    with open(txt_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    if len(lines) < 2:
        return None
    # First line: tab-separated fields, with the text in the second field.
    # Second line: the timestamp string, stored as-is.
    fields = lines[0].strip().split("\t")
    if len(fields) < 2:
        return None
    return fields[1], lines[1].strip()

0 comments on commit 0148c8d

Please sign in to comment.