Skip to content

Commit

Permalink
Optimized ReazonSpeech download speed using hf datasets features (lhotse-speech#1434)
Browse files Browse the repository at this point in the history

* Optimize ReazonSpeech download speed using hf datasets features

* fix: format
  • Loading branch information
yuta0306 authored and Your Name committed Jan 7, 2025
1 parent 89f072e commit f83a297
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 24 deletions.
11 changes: 9 additions & 2 deletions lhotse/bin/modes/recipes/reazonspeech.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,16 @@ def reazonspeech(
help="List of dataset parts to prepare (default: small-v1). To prepare multiple parts, pass each with `--subset` "
"Example: `--subset all",
)
def reazonspeech(target_dir: Pathlike, subset: List[str]):
@click.option(
"-j",
"--num-jobs",
type=int,
default=1,
help="How many threads to use (can give good speed-ups with slow disks).",
)
def reazonspeech(target_dir: Pathlike, subset: List[str], num_jobs: int):
"""ReazonSpeech download."""
logging.basicConfig(level=logging.INFO)
if "auto" in subset:
subset = "auto"
download_reazonspeech(target_dir, dataset_parts=subset)
download_reazonspeech(target_dir, dataset_parts=subset, num_jobs=num_jobs)
50 changes: 28 additions & 22 deletions lhotse/recipes/reazonspeech.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,18 +72,21 @@ def write_to_json(data, filename):
def download_reazonspeech(
target_dir: Pathlike = ".",
dataset_parts: Optional[Union[str, Sequence[str]]] = "auto",
num_jobs: int = 1,
) -> Path:
"""
Download the ReazonSpeech dataset.
:param target_dir: Pathlike, the path of the dir to storage the dataset.
:param dataset_parts: the parts of the dataset to download (e.g. small, medium, or large).
:param num_jobs: the number of processes to download and format.
:return: the path to downloaded data and the JSON file.
"""
if is_module_available("datasets"):
from datasets import load_dataset
import soundfile as sf
from datasets import Audio, load_dataset
else:
raise ImportError(
"To process the ReazonSpeech corpus, please install optional dependency: pip install datasets"
"To process the ReazonSpeech corpus, please install optional dependencies: pip install datasets soundfile"
)
target_dir = Path(target_dir)
target_dir.mkdir(parents=True, exist_ok=True)
Expand All @@ -101,31 +104,34 @@ def download_reazonspeech(
part,
trust_remote_code=True,
cache_dir=corpus_dir,
num_proc=num_jobs,
)["train"]

# Prepare data for JSON export
data_for_json = []
idx = 0
for item in ds:
# Calculate the duration of the audio file
audio_array = item["audio"]["array"]
sampling_rate = item["audio"]["sampling_rate"]
duration = len(audio_array) / float(sampling_rate)

# Create a dictionary for the current record
record = {
"id": str(idx),
"audio_filepath": item["audio"]["path"],
"text": normalize(item["transcription"]),
"duration": duration,
}

# Append the record to the list
data_for_json.append(record)
idx += 1
def format_example(example: dict, idx: int) -> dict:
example["id"] = str(idx)
example["audio_filepath"] = example["audio"]["path"]
example["text"] = normalize(example["transcription"])
example["duration"] = sf.info(example["audio"]["path"]).duration
return example

ds = ds.cast_column("audio", Audio(decode=True)) # Hack: don't decode to speedup
ds = ds.map(
format_example,
with_indices=True,
remove_columns=ds.column_names,
num_proc=num_jobs,
)

# Write data to a JSON file
write_to_json(data_for_json, corpus_dir / "dataset.json")
ds.to_json(
corpus_dir / "dataset.json",
num_proc=num_jobs,
force_ascii=False,
indent=4,
lines=False,
batch_size=ds.num_rows,
)

return corpus_dir

Expand Down

0 comments on commit f83a297

Please sign in to comment.