Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimized ReazonSpeech download speed using hf datasets features #1434

Merged
merged 2 commits into from
Dec 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions lhotse/bin/modes/recipes/reazonspeech.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,16 @@ def reazonspeech(
help="List of dataset parts to prepare (default: small-v1). To prepare multiple parts, pass each with `--subset` "
"Example: `--subset all`",
)
def reazonspeech(target_dir: Pathlike, subset: List[str]):
@click.option(
"-j",
"--num-jobs",
type=int,
default=1,
help="How many threads to use (can give good speed-ups with slow disks).",
)
def reazonspeech(target_dir: Pathlike, subset: List[str], num_jobs: int):
"""ReazonSpeech download."""
logging.basicConfig(level=logging.INFO)
if "auto" in subset:
subset = "auto"
download_reazonspeech(target_dir, dataset_parts=subset)
download_reazonspeech(target_dir, dataset_parts=subset, num_jobs=num_jobs)
50 changes: 28 additions & 22 deletions lhotse/recipes/reazonspeech.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,18 +72,21 @@ def write_to_json(data, filename):
def download_reazonspeech(
target_dir: Pathlike = ".",
dataset_parts: Optional[Union[str, Sequence[str]]] = "auto",
num_jobs: int = 1,
) -> Path:
"""
Download the ReazonSpeech dataset.
:param target_dir: Pathlike, the path of the dir to storage the dataset.
:param dataset_parts: the parts of the dataset to download (e.g. small, medium, or large).
:param num_jobs: the number of processes to download and format.
:return: the path to downloaded data and the JSON file.
"""
if is_module_available("datasets"):
from datasets import load_dataset
import soundfile as sf
from datasets import Audio, load_dataset
else:
raise ImportError(
"To process the ReazonSpeech corpus, please install optional dependency: pip install datasets"
"To process the ReazonSpeech corpus, please install optional dependencies: pip install datasets soundfile"
)
target_dir = Path(target_dir)
target_dir.mkdir(parents=True, exist_ok=True)
Expand All @@ -101,31 +104,34 @@ def download_reazonspeech(
part,
trust_remote_code=True,
cache_dir=corpus_dir,
num_proc=num_jobs,
)["train"]

# Prepare data for JSON export
data_for_json = []
idx = 0
for item in ds:
# Calculate the duration of the audio file
audio_array = item["audio"]["array"]
sampling_rate = item["audio"]["sampling_rate"]
duration = len(audio_array) / float(sampling_rate)

# Create a dictionary for the current record
record = {
"id": str(idx),
"audio_filepath": item["audio"]["path"],
"text": normalize(item["transcription"]),
"duration": duration,
}

# Append the record to the list
data_for_json.append(record)
idx += 1
def format_example(example: dict, idx: int) -> dict:
example["id"] = str(idx)
example["audio_filepath"] = example["audio"]["path"]
example["text"] = normalize(example["transcription"])
example["duration"] = sf.info(example["audio"]["path"]).duration
return example

ds = ds.cast_column("audio", Audio(decode=True)) # Hack: don't decode to speedup
ds = ds.map(
format_example,
with_indices=True,
remove_columns=ds.column_names,
num_proc=num_jobs,
)

# Write data to a JSON file
write_to_json(data_for_json, corpus_dir / "dataset.json")
ds.to_json(
corpus_dir / "dataset.json",
num_proc=num_jobs,
force_ascii=False,
indent=4,
lines=False,
batch_size=ds.num_rows,
)

return corpus_dir

Expand Down
Loading