feat: replace repeated prepro warnings by a summary of missing symbols

Squashes the following commits: * test: run_test now runs the dev suite by default * fix: quiet duplicate text warning during attn processing The text warnings are displayed while processing the text, and again while processing attn. Text processing is very fast so it's not really an issue not to see the progress bar well, whereas attn is slow, so the solution here is simply to suppress the warnings during attn processing. Fixes #70 * feat: display a summary of missing symbols after processing "text" Disable parallel processing for the "text" step, which is OK since it gains very little, in order for the missing_symbols counters to get updated correctly in the main process, and thus allow us to print it, instead of dumping a bunch of messages every time a symbol is ignored. Fixes #70 even better than the previous commit. :) * test: improve PR patch coverage * fix: pfs also needs to be treated like text for warnings * fix: make sure the pfs and text missing counters are distinct * fix: grammatically correct messages, and doctest for everyvoice.utils
EveryVoiceTTS · Dec 1, 2023 · fd1940e · fd1940e
1 parent 05ada20
commit fd1940e
Show file tree

Hide file tree

Showing 5 changed files with 120 additions and 47 deletions.
diff --git a/everyvoice/preprocessor/__init__.py b/everyvoice/preprocessor/__init__.py
@@ -8,6 +8,7 @@
 import multiprocessing as mp
 import random
 import sys
+from collections import Counter
 from glob import glob
 from multiprocessing import Manager, managers
 from pathlib import Path
@@ -34,7 +35,12 @@
 from everyvoice.model.vocoder.config import VocoderConfig
 from everyvoice.preprocessor.attention_prior import BetaBinomialInterpolator
 from everyvoice.text import TextProcessor
-from everyvoice.utils import generic_dict_loader, tqdm_joblib_context, write_filelist
+from everyvoice.utils import (
+    generic_dict_loader,
+    n_times,
+    tqdm_joblib_context,
+    write_filelist,
+)
 from everyvoice.utils.heavy import (
     dynamic_range_compression_torch,
     get_spectral_transform,
@@ -364,20 +370,24 @@ def extract_energy(self, spectral_feature_tensor: torch.Tensor):
         """
         return torch.linalg.norm(spectral_feature_tensor, dim=0)
 
-    def extract_text_inputs(self, text, use_pfs=False) -> torch.Tensor:
+    def extract_text_inputs(self, text, use_pfs=False, quiet=False) -> torch.Tensor:
         """Given some text, normalize it, g2p it, and save as one-hot or multi-hot phonological feature vectors
 
         Args:
             text (str): text
+            use_pfs:
+            quiet: suppress warnings
         """
         if self.text_processor is None:
             raise ValueError("Text processor not initialized")
         if use_pfs:
             return torch.Tensor(
-                self.text_processor.text_to_phonological_features(text)
+                self.text_processor.text_to_phonological_features(text, quiet)
             ).long()
         else:
-            return torch.Tensor(self.text_processor.text_to_sequence(text)).long()
+            return torch.Tensor(
+                self.text_processor.text_to_sequence(text, quiet)
+            ).long()
 
     def print_duration(self):
         """Convert seconds to a human readable format"""
@@ -575,14 +585,14 @@ def process_one_audio(self, item: dict, data_dir) -> Optional[dict]:
                 save(output_audio, output_audio_save_path)
         return item
 
-    def process_all_audio(self, debug=False):
+    def process_all_audio(self):
         """Process all audio across datasets, create a combined, filtered filelist and return it"""
         self.dataset_sanity_checks()
         filtered_filelist: List[dict] = []
         for dataset in tqdm(self.datasets, total=len(self.datasets), desc="Dataset"):
             data_dir = Path(dataset.data_dir)
             filelist = dataset.filelist_loader(dataset.filelist)
-            if debug:
+            if self.debug:
                 filelist = filelist[:10]
                 logger.info(
                     "Debug flag was set to true, only processing first 10 files"
@@ -665,7 +675,7 @@ def process_attn_prior(self, item):
         if attn_prior_path.exists() and not self.overwrite:
             return
         binomial_interpolator = BetaBinomialInterpolator()
-        text = self.extract_text_inputs(item["text"], use_pfs=False)
+        text = self.extract_text_inputs(item["text"], use_pfs=False, quiet=True)
         input_spec_path = self.create_path(
             item,
             "spec",
@@ -683,7 +693,7 @@ def process_text(self, item, use_pfs=False):
         text_path = self.create_path(item, "text", basename)
         if text_path.exists() and not self.overwrite:
             return
-        text = self.extract_text_inputs(item["text"], use_pfs=use_pfs)
+        text = self.extract_text_inputs(item["text"], use_pfs=use_pfs, quiet=True)
         save(text, text_path)
 
     def process_spec(self, item):
@@ -795,6 +805,22 @@ def check_data(self, filelist, word_seg_token=" ", heavy_clip_detction=False):
             data.append(data_point)
         return data
 
+    def load_filelist(self, path: Path):
+        try:
+            filelist = generic_dict_loader(path)
+            if self.debug:
+                logger.info(
+                    "Debug flag was set to true, only processing first 10 files"
+                )
+                filelist = filelist[:10]
+        except FileNotFoundError:
+            logger.error(
+                f"A filelist was not found at {path}. "
+                "Please try processing your audio again."
+            )
+            sys.exit(1)
+        return filelist
+
     def preprocess(
         self,
         output_path="filelist.psv",
@@ -805,17 +831,19 @@ def preprocess(
     ):
         self.overwrite = overwrite
         self.cpus = cpus
+        self.debug = debug
         if not isinstance(output_path, Path):
             output_path = Path(output_path)
         processing_order = ("audio", "text", "pfs", "spec", "attn", "energy", "pitch")
         random.seed(self.config.preprocessing.dataset_split_seed)
+        processed_filelist = self.save_dir / output_path.name
         for process in processing_order:
             if process not in to_process:
                 continue
             (self.save_dir / process).mkdir(parents=True, exist_ok=True)
             if process == "audio":
-                if filelist := self.process_all_audio(debug=debug):
-                    write_filelist(filelist, self.save_dir / output_path.name)
+                if filelist := self.process_all_audio():
+                    write_filelist(filelist, processed_filelist)
                     # sample the validation set and subtract it from the whole dataset to determine the training set
                     random.shuffle(filelist)
                     train_split = int(
@@ -838,25 +866,30 @@ def preprocess(
                         "Your filtered audio filelist is empty. Nothing to process."
                     )
                     sys.exit(1)
-                logger.info(f"Audio Filelist len={len(filelist or [])}")
+                # logger.info(f"Audio Filelist len={len(filelist or [])}")
+            elif process in ["text", "pfs"]:
+                # We split out the "text" step to issue the missing symbol warnings
+                filelist = self.load_filelist(processed_filelist)
+                process_fn = self.get_process_fn(process)
+                missing_symbols_before = Counter(self.text_processor.missing_symbols)
+                for f in tqdm(filelist, desc=f"Processing {process} on 1 CPU"):
+                    process_fn(f)
+                # if only one of "pfs" or "text" is specified, missing_symbols_before
+                # will always be empty, but if both are specified this makes sure
+                # each process gets only its own missing symbols logged.
+                new_missing_symbols = (
+                    self.text_processor.missing_symbols - missing_symbols_before
+                )
+                for symbol, count in new_missing_symbols.items():
+                    logger.warning(
+                        f"Symbol '{symbol}' occurs {n_times(count)} but was not declared in your configuration so it is being ignored."
+                    )
             else:
                 # If audio has already been processed, then just read the processed_filelist
-                try:
-                    filelist = generic_dict_loader(self.save_dir / output_path.name)
-                    if debug:
-                        logger.info(
-                            "Debug flag was set to true, only processing first 10 files"
-                        )
-                        filelist = filelist[:10]
-                except FileNotFoundError:
-                    logger.error(
-                        f"A filelist was not found at {self.save_dir / output_path.name}. "
-                        "Please try processing your audio again."
-                    )
-                    sys.exit(1)
+                filelist = self.load_filelist(processed_filelist)
                 process_fn = self.get_process_fn(process)
                 logger.info(f"Processing {process} on {self.cpus} CPUs...")
-                logger.info(f"Filelist len={len(filelist or [])}")
+                # logger.info(f"Filelist len={len(filelist or [])}")
                 if self.cpus > 1:
                     batch_size = min(100, 1 + len(filelist) // (self.cpus * 2))
                     with tqdm_joblib_context(

diff --git a/everyvoice/run_tests.py b/everyvoice/run_tests.py
@@ -79,9 +79,10 @@ def run_tests(suite):
 
 if __name__ == "__main__":
     try:
-        result = run_tests(sys.argv[1])
-        if not result:
-            sys.exit(1)
+        suite = sys.argv[1]
     except IndexError:
-        logger.error("Please specify a test suite to run: i.e. 'dev' or 'all'")
+        logger.info('No test suite specified, defaulting to "dev"')
+        suite = "dev"
+    result = run_tests(suite)
+    if not result:
         sys.exit(1)
diff --git a/everyvoice/tests/test_preprocessing.py b/everyvoice/tests/test_preprocessing.py
@@ -320,10 +320,31 @@ def test_incremental_preprocess(self):
                     cpus=1,
                     overwrite=True,
                     to_process=to_process,
+                    debug=True,
                 )
             self.assertRegex(output.getvalue(), r"processed files *5")
             self.assertRegex(output.getvalue(), r"previously processed files *0")
 
+    def test_gotta_do_audio_first(self):
+        with tempfile.TemporaryDirectory(prefix="missing_audio", dir=".") as tmpdir:
+            tmpdir = Path(tmpdir)
+            preprocessed = tmpdir / "preprocessed"
+            filelist = preprocessed / "preprocessed_filelist.psv"
+
+            fp_config = EveryVoiceConfig().feature_prediction
+            fp_config.preprocessing.source_data[0].data_dir = (
+                self.data_dir / "lj" / "wavs"
+            )
+            full_filelist = self.data_dir / "metadata.csv"
+            fp_config.preprocessing.source_data[0].filelist = full_filelist
+            fp_config.preprocessing.save_dir = preprocessed
+
+            to_process_no_audio = ("energy", "pitch", "attn", "text", "spec")
+            with self.assertRaises(SystemExit), capture_stdout():
+                Preprocessor(fp_config).preprocess(
+                    output_path=filelist, cpus=1, to_process=to_process_no_audio
+                )
+
     def test_empty_preprocess(self):
         # Test case where the file list is not empty but after filtering
         # silence, the result is empty. The behaviour of the code base is not

diff --git a/everyvoice/text/__init__.py b/everyvoice/text/__init__.py
@@ -57,7 +57,7 @@ def __init__(self, config: Union[AlignerConfig, FeaturePredictionConfig]):
             discard_empty=True,
         )
 
-    def replace_cleaner(self, text):
+    def replace_cleaner(self, text: str) -> str:
         """Given some text and a list of replacement operations in the form of input/output key value pairs,
            return the transformed text.
         Args:
@@ -69,7 +69,7 @@ def replace_cleaner(self, text):
             text = re.sub(k, v, text)
         return text
 
-    def text_to_sequence(self, text):
+    def text_to_sequence(self, text: str, quiet: bool = False):
         """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
         Args:
         text: string to convert to a sequence
@@ -78,24 +78,24 @@ def text_to_sequence(self, text):
         List of integers corresponding to the symbols in the text
         """
         sequence = []
-        clean_tokens = self.text_to_tokens(text)
+        clean_tokens = self.text_to_tokens(text, quiet)
         for symbol in clean_tokens:
             symbol_id = self._symbol_to_id[symbol]
             sequence += [symbol_id]
         return sequence
 
-    def text_to_phonological_features(self, text):
+    def text_to_phonological_features(self, text: str, quiet: bool = False):
         """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
         Args:
-        text: string to convert to a sequence
-        cleaner_fns: a list of fns to clean text
+            text: string to convert to a sequence
+            quiet: suppress warnings
         Returns:
-        List of phonological feature vectors
+            List of phonological feature vectors
         """
-        clean_text = self.text_to_tokens(text)
+        clean_text = self.text_to_tokens(text, quiet)
         return get_features(clean_text)
 
-    def clean_text(self, text):
+    def clean_text(self, text: str) -> str:
         """Converts some text to cleaned text"""
         text = self.replace_cleaner(text)
         for cleaner_fn in self.config.text.cleaners:
@@ -107,20 +107,21 @@ def clean_text(self, text):
                 ) from e
         return text
 
-    def text_to_tokens(self, text):
+    def text_to_tokens(self, text: str, quiet: bool = False):
         """Converts a string of text to a sequence of tokens.
         Args:
-        text: string to convert to a sequence
-        cleaner_fns: a list of fns to clean text
+            text: string to convert to a sequence
+            quiet: suppress warnings
         Returns:
-        List of symbols in the text
+            List of symbols in the text
         """
         clean_text = self.clean_text(text)
         clean_tokens = self._tokenizer.tokenize(clean_text)
         for symbol in self._missing_symbol_finder.tokenize(clean_text):
-            logger.warning(
-                f"Symbol '{symbol}' occurs in the text '{clean_text}' but was not declared in your configuration so it is being ignored."
-            )
+            if not quiet:
+                logger.warning(
+                    f"Symbol '{symbol}' occurs in the text '{clean_text}' but was not declared in your configuration so it is being ignored."
+                )
             self.missing_symbols[symbol] += 1
         return clean_tokens
 
@@ -132,9 +133,9 @@ def get_missing_symbols(self, text):
     def cleaned_text_to_sequence(self, cleaned_text):
         """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
         Args:
-        text: string to convert to a sequence
+            text: string to convert to a sequence
         Returns:
-        List of integers corresponding to the symbols in the text
+            List of integers corresponding to the symbols in the text
         """
         cleaned_text = self._tokenizer.tokenize(cleaned_text)
         return [self._symbol_to_id[symbol] for symbol in cleaned_text]

diff --git a/everyvoice/utils/__init__.py b/everyvoice/utils/__init__.py
@@ -324,3 +324,20 @@ def __call__(self, out):
     finally:
         tqdm_instance.close()
         joblib.parallel.BatchCompletionCallBack = old_callback
+
+
+def n_times(n: int) -> str:
+    """Return a grammatically correct version of n times for n > 0.
+
+    >>> n_times(1)
+    'once'
+    >>> n_times(2)
+    'twice'
+    >>> n_times(1001)
+    '1001 times'
+    """
+    if n == 1:
+        return "once"
+    if n == 2:
+        return "twice"
+    return f"{n} times"