librispeech: redo text normalization as --normalize-text=['none','lower'], as desh2608 suggested
lhotse-speech · Nov 8, 2023 · ae2518b · ae2518b
1 parent e67ea29
commit ae2518b
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 9 deletions.
diff --git a/lhotse/bin/modes/recipes/librispeech.py b/lhotse/bin/modes/recipes/librispeech.py
@@ -35,17 +35,18 @@
     help="How many threads to use (can give good speed-ups with slow disks).",
 )
 @click.option(
-    "--to-lowercase",
-    type=bool,
-    default=False,
-    help="Conversion of transcripts to lower-vase (originally in uppercase).",
+    "--normalize-text",
+    type=click.Choice(["none", "lower"], case_sensitive=False),
+    default="none",
+    help="Conversion of transcripts to lower-case (originally in upper-case).",
+    show_default=True,
 )
 def librispeech(
     corpus_dir: Pathlike,
     output_dir: Pathlike,
     alignments_dir: Pathlike,
     dataset_parts: Sequence[str],
-    to_lowercase: bool,
+    normalize_text: str,
     num_jobs: int,
 ):
     """(Mini) Librispeech ASR data preparation."""
@@ -57,7 +58,7 @@ def librispeech(
         alignments_dir=alignments_dir,
         num_jobs=num_jobs,
         dataset_parts=dataset_parts,
-        to_lowercase=to_lowercase,
+        normalize_text=normalize_text,
     )
 
 

diff --git a/lhotse/recipes/librispeech.py b/lhotse/recipes/librispeech.py
@@ -114,7 +114,7 @@ def prepare_librispeech(
     alignments_dir: Optional[Pathlike] = None,
     dataset_parts: Union[str, Sequence[str]] = "auto",
     output_dir: Optional[Pathlike] = None,
-    to_lowercase: bool = False,
+    normalize_text: str = "none",
     num_jobs: int = 1,
 ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
     """
@@ -127,7 +127,8 @@ def prepare_librispeech(
     :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
         By default we will infer which parts are available in ``corpus_dir``.
     :param output_dir: Pathlike, the path where to write the manifests.
-    :param to_lowercase: Bool, if True, the transcripts are converted to lower-case.
+    :param normalize_text: str, "none" or "lower",
+        for "lower" the transcripts are converted to lower-case.
     :param num_jobs: int, number of parallel threads used for 'parse_utterance' calls.
     :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
     """
@@ -207,7 +208,8 @@ def prepare_librispeech(
             recording_set = RecordingSet.from_recordings(recordings)
             supervision_set = SupervisionSet.from_segments(supervisions)
 
-            if to_lowercase:
+            # Normalize text to lowercase
+            if normalize_text == "lower":
                 to_lower = lambda text: text.lower()
                 supervision_set = SupervisionSet.from_segments(
                     [s.transform_text(to_lower) for s in supervision_set]