Skip to content

Commit

Permalink
librispeech, redoing text-norm as --normalize-text=['none','lower']
Browse files (browse the repository at this point in the history)
… as desh2608 suggested
  • Loading branch information
KarelVesely84 committed Nov 8, 2023
1 parent e67ea29 commit ae2518b
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 9 deletions.
13 changes: 7 additions & 6 deletions lhotse/bin/modes/recipes/librispeech.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,17 +35,18 @@
help="How many threads to use (can give good speed-ups with slow disks).",
)
@click.option(
"--to-lowercase",
type=bool,
default=False,
help="Conversion of transcripts to lower-vase (originally in uppercase).",
"--normalize-text",
type=click.Choice(["none", "lower"], case_sensitive=False),
default="none",
help="Conversion of transcripts to lower-case (originally in upper-case).",
show_default=True,
)
def librispeech(
corpus_dir: Pathlike,
output_dir: Pathlike,
alignments_dir: Pathlike,
dataset_parts: Sequence[str],
to_lowercase: bool,
normalize_text: str,
num_jobs: int,
):
"""(Mini) Librispeech ASR data preparation."""
Expand All @@ -57,7 +58,7 @@ def librispeech(
alignments_dir=alignments_dir,
num_jobs=num_jobs,
dataset_parts=dataset_parts,
to_lowercase=to_lowercase,
normalize_text=normalize_text,
)


Expand Down
8 changes: 5 additions & 3 deletions lhotse/recipes/librispeech.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def prepare_librispeech(
alignments_dir: Optional[Pathlike] = None,
dataset_parts: Union[str, Sequence[str]] = "auto",
output_dir: Optional[Pathlike] = None,
to_lowercase: bool = False,
normalize_text: str = "none",
num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
"""
Expand All @@ -127,7 +127,8 @@ def prepare_librispeech(
:param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
By default we will infer which parts are available in ``corpus_dir``.
:param output_dir: Pathlike, the path where to write the manifests.
:param to_lowercase: Bool, if True, the transcripts are converted to lower-case.
:param normalize_text: str, "none" or "lower",
for "lower" the transcripts are converted to lower-case.
:param num_jobs: int, number of parallel threads used for 'parse_utterance' calls.
:return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
"""
Expand Down Expand Up @@ -207,7 +208,8 @@ def prepare_librispeech(
recording_set = RecordingSet.from_recordings(recordings)
supervision_set = SupervisionSet.from_segments(supervisions)

if to_lowercase:
# Normalize text to lowercase
if normalize_text == "lower":
to_lower = lambda text: text.lower()
supervision_set = SupervisionSet.from_segments(
[s.transform_text(to_lower) for s in supervision_set]
Expand Down

0 comments on commit ae2518b

Please sign in to comment.