Skip to content

Commit

Permalink
Fix wrong Common Voice parsing (#979)
Browse files Browse the repository at this point in the history
Fixes: #969, #977
  • Loading branch information
pzelasko authored Feb 17, 2023
2 parents 6a923e1 + ec4266a commit 9220b5b
Showing 1 changed file with 20 additions and 5 deletions.
25 changes: 20 additions & 5 deletions lhotse/recipes/commonvoice.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@
How does it work?
We’re crowdsourcing an open-source dataset of voices. Donate your voice, validate the accuracy of other people’s clips, make the dataset better for everyone.
"""
import csv
import logging
import math
import numbers
import shutil
import tarfile
import warnings
Expand Down Expand Up @@ -226,7 +229,7 @@ def prepare_single_commonvoice_tsv(
tsv_path = lang_path / f"{part}.tsv"

# Read the metadata
df = pd.read_csv(tsv_path, sep="\t")
df = pd.read_csv(tsv_path, sep="\t", quoting=csv.QUOTE_NONE)
# Scan all the audio files
with RecordingSet.open_writer(
output_dir / f"cv-{lang}_recordings_{part}.jsonl.gz",
Expand Down Expand Up @@ -262,6 +265,18 @@ def prepare_single_commonvoice_tsv(
def parse_utterance(
row: Any, lang_path: Path, language: str
) -> Tuple[Recording, SupervisionSegment]:
def read_row_optional_field(fieldname: str):
    """Look up an optional metadata field on ``row``, normalising "missing" to None.

    ``row`` is taken from the enclosing scope. None is returned (rather than
    raising) when ``fieldname`` is not present in ``row``, when the cell holds
    the literal string "nan", or when the cell is a numeric NaN. Any other
    cell value is returned unchanged.
    """
    # A column that does not exist is treated the same as an empty cell.
    if fieldname not in row:
        return None
    value = row[fieldname]
    # Missing values may appear either as the text "nan" ...
    if value == "nan":
        return None
    # ... or as an actual floating-point NaN.
    if isinstance(value, numbers.Number) and math.isnan(value):
        return None
    return value

# Create the Recording first
audio_path = lang_path / "clips" / row.path
if not audio_path.is_file():
Expand All @@ -270,9 +285,9 @@ def parse_utterance(
recording = Recording.from_file(audio_path, recording_id=recording_id)
# Handling accent(s) in different versions of CommonVoice
if "accents" in row:
accents = row.accents if row.accents != "nan" else None
accents = read_row_optional_field("accents")
else:
accents = row.accent if row.accent != "nan" else None
accents = read_row_optional_field("accent")
# Then, create the corresponding supervisions
segment = SupervisionSegment(
id=recording_id,
Expand All @@ -285,9 +300,9 @@ def parse_utterance(
language=COMMONVOICE_CODE2LANG.get(language, language),
speaker=row.client_id,
text=row.sentence.strip(),
gender=row.gender if row.gender != "nan" else None,
gender=read_row_optional_field("gender"),
custom={
"age": row.age if row.age != "nan" else None,
"age": read_row_optional_field("age"),
"accents": accents,
},
)
Expand Down

0 comments on commit 9220b5b

Please sign in to comment.