Skip to content

Commit

Permalink
feat: replace repeated prepro warnings by a summary of missing symbols
Browse files Browse the repository at this point in the history
Squashes the following commits:

* test: run_test now runs the dev suite by default

* fix: quiet duplicate text warning during attn processing

The text warnings are displayed while processing the text, and again while
processing attn. Text processing is very fast so it's not really an issue not
to see the progress bar well, whereas attn is slow, so the solution here is
simply to suppress the warnings during attn processing.

Fixes #70

* feat: display a summary of missing symbols after processing "text"

Disable parallel processing for the "text" step, which is OK since it gains
very little, in order for the missing_symbols counters to get updated correctly
in the main process, and thus allow us to print them, instead of dumping a bunch
of messages every time a symbol is ignored.

Fixes #70 even better than the previous commit. :)

* test: improve PR patch coverage

* fix: pfs also needs to be treated like text for warnings

* fix: make sure the pfs and text missing counters are distinct

* fix: grammatically correct messages, and doctest for everyvoice.utils
  • Loading branch information
joanise authored Dec 1, 2023
1 parent 05ada20 commit fd1940e
Show file tree
Hide file tree
Showing 5 changed files with 120 additions and 47 deletions.
83 changes: 58 additions & 25 deletions everyvoice/preprocessor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import multiprocessing as mp
import random
import sys
from collections import Counter
from glob import glob
from multiprocessing import Manager, managers
from pathlib import Path
Expand All @@ -34,7 +35,12 @@
from everyvoice.model.vocoder.config import VocoderConfig
from everyvoice.preprocessor.attention_prior import BetaBinomialInterpolator
from everyvoice.text import TextProcessor
from everyvoice.utils import generic_dict_loader, tqdm_joblib_context, write_filelist
from everyvoice.utils import (
generic_dict_loader,
n_times,
tqdm_joblib_context,
write_filelist,
)
from everyvoice.utils.heavy import (
dynamic_range_compression_torch,
get_spectral_transform,
Expand Down Expand Up @@ -364,20 +370,24 @@ def extract_energy(self, spectral_feature_tensor: torch.Tensor):
"""
return torch.linalg.norm(spectral_feature_tensor, dim=0)

def extract_text_inputs(self, text, use_pfs=False) -> torch.Tensor:
def extract_text_inputs(self, text, use_pfs=False, quiet=False) -> torch.Tensor:
"""Given some text, normalize it, g2p it, and save as one-hot or multi-hot phonological feature vectors
Args:
text (str): text
use_pfs:
quiet: suppress warnings
"""
if self.text_processor is None:
raise ValueError("Text processor not initialized")
if use_pfs:
return torch.Tensor(
self.text_processor.text_to_phonological_features(text)
self.text_processor.text_to_phonological_features(text, quiet)
).long()
else:
return torch.Tensor(self.text_processor.text_to_sequence(text)).long()
return torch.Tensor(
self.text_processor.text_to_sequence(text, quiet)
).long()

def print_duration(self):
"""Convert seconds to a human readable format"""
Expand Down Expand Up @@ -575,14 +585,14 @@ def process_one_audio(self, item: dict, data_dir) -> Optional[dict]:
save(output_audio, output_audio_save_path)
return item

def process_all_audio(self, debug=False):
def process_all_audio(self):
"""Process all audio across datasets, create a combined, filtered filelist and return it"""
self.dataset_sanity_checks()
filtered_filelist: List[dict] = []
for dataset in tqdm(self.datasets, total=len(self.datasets), desc="Dataset"):
data_dir = Path(dataset.data_dir)
filelist = dataset.filelist_loader(dataset.filelist)
if debug:
if self.debug:
filelist = filelist[:10]
logger.info(
"Debug flag was set to true, only processing first 10 files"
Expand Down Expand Up @@ -665,7 +675,7 @@ def process_attn_prior(self, item):
if attn_prior_path.exists() and not self.overwrite:
return
binomial_interpolator = BetaBinomialInterpolator()
text = self.extract_text_inputs(item["text"], use_pfs=False)
text = self.extract_text_inputs(item["text"], use_pfs=False, quiet=True)
input_spec_path = self.create_path(
item,
"spec",
Expand All @@ -683,7 +693,7 @@ def process_text(self, item, use_pfs=False):
text_path = self.create_path(item, "text", basename)
if text_path.exists() and not self.overwrite:
return
text = self.extract_text_inputs(item["text"], use_pfs=use_pfs)
text = self.extract_text_inputs(item["text"], use_pfs=use_pfs, quiet=True)
save(text, text_path)

def process_spec(self, item):
Expand Down Expand Up @@ -795,6 +805,22 @@ def check_data(self, filelist, word_seg_token=" ", heavy_clip_detction=False):
data.append(data_point)
return data

def load_filelist(self, path: Path):
    """Load the processed filelist stored at `path`.

    Args:
        path: location of the filelist, read with generic_dict_loader.

    Returns:
        The loaded filelist, truncated to its first 10 entries when
        self.debug is set.

    If no file exists at `path`, an error is logged and the program exits
    with status 1.
    """
    try:
        # Only the loader call lives in the try body, so nothing else can
        # be mistaken for a missing filelist.
        filelist = generic_dict_loader(path)
    except FileNotFoundError:
        logger.error(
            f"A filelist was not found at {path}. "
            "Please try processing your audio again."
        )
        sys.exit(1)
    if self.debug:
        logger.info(
            "Debug flag was set to true, only processing first 10 files"
        )
        filelist = filelist[:10]
    return filelist

def preprocess(
self,
output_path="filelist.psv",
Expand All @@ -805,17 +831,19 @@ def preprocess(
):
self.overwrite = overwrite
self.cpus = cpus
self.debug = debug
if not isinstance(output_path, Path):
output_path = Path(output_path)
processing_order = ("audio", "text", "pfs", "spec", "attn", "energy", "pitch")
random.seed(self.config.preprocessing.dataset_split_seed)
processed_filelist = self.save_dir / output_path.name
for process in processing_order:
if process not in to_process:
continue
(self.save_dir / process).mkdir(parents=True, exist_ok=True)
if process == "audio":
if filelist := self.process_all_audio(debug=debug):
write_filelist(filelist, self.save_dir / output_path.name)
if filelist := self.process_all_audio():
write_filelist(filelist, processed_filelist)
# sample the validation set and subtract it from the whole dataset to determine the training set
random.shuffle(filelist)
train_split = int(
Expand All @@ -838,25 +866,30 @@ def preprocess(
"Your filtered audio filelist is empty. Nothing to process."
)
sys.exit(1)
logger.info(f"Audio Filelist len={len(filelist or [])}")
# logger.info(f"Audio Filelist len={len(filelist or [])}")
elif process in ["text", "pfs"]:
# We split out the "text" step to issue the missing symbol warnings
filelist = self.load_filelist(processed_filelist)
process_fn = self.get_process_fn(process)
missing_symbols_before = Counter(self.text_processor.missing_symbols)
for f in tqdm(filelist, desc=f"Processing {process} on 1 CPU"):
process_fn(f)
# if only one of "pfs" or "text" is specified, missing_symbols_before
# will always be empty, but if both are specified this makes sure
# each process gets only its own missing symbols logged.
new_missing_symbols = (
self.text_processor.missing_symbols - missing_symbols_before
)
for symbol, count in new_missing_symbols.items():
logger.warning(
f"Symbol '{symbol}' occurs {n_times(count)} but was not declared in your configuration so it is being ignored."
)
else:
# If audio has already been processed, then just read the processed_filelist
try:
filelist = generic_dict_loader(self.save_dir / output_path.name)
if debug:
logger.info(
"Debug flag was set to true, only processing first 10 files"
)
filelist = filelist[:10]
except FileNotFoundError:
logger.error(
f"A filelist was not found at {self.save_dir / output_path.name}. "
"Please try processing your audio again."
)
sys.exit(1)
filelist = self.load_filelist(processed_filelist)
process_fn = self.get_process_fn(process)
logger.info(f"Processing {process} on {self.cpus} CPUs...")
logger.info(f"Filelist len={len(filelist or [])}")
# logger.info(f"Filelist len={len(filelist or [])}")
if self.cpus > 1:
batch_size = min(100, 1 + len(filelist) // (self.cpus * 2))
with tqdm_joblib_context(
Expand Down
9 changes: 5 additions & 4 deletions everyvoice/run_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,10 @@ def run_tests(suite):

if __name__ == "__main__":
try:
result = run_tests(sys.argv[1])
if not result:
sys.exit(1)
suite = sys.argv[1]
except IndexError:
logger.error("Please specify a test suite to run: i.e. 'dev' or 'all'")
logger.info('No test suite specified, defaulting to "dev"')
suite = "dev"
result = run_tests(suite)
if not result:
sys.exit(1)
21 changes: 21 additions & 0 deletions everyvoice/tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,10 +320,31 @@ def test_incremental_preprocess(self):
cpus=1,
overwrite=True,
to_process=to_process,
debug=True,
)
self.assertRegex(output.getvalue(), r"processed files *5")
self.assertRegex(output.getvalue(), r"previously processed files *0")

def test_gotta_do_audio_first(self):
    """Requesting only non-audio steps on a fresh save_dir must exit.

    The "audio" step is left out of to_process and the save directory is a
    new temporary location, so preprocess() is expected to raise SystemExit.
    """
    with tempfile.TemporaryDirectory(prefix="missing_audio", dir=".") as tmpdir:
        preprocessed = Path(tmpdir) / "preprocessed"
        filelist = preprocessed / "preprocessed_filelist.psv"

        fp_config = EveryVoiceConfig().feature_prediction
        source = fp_config.preprocessing.source_data[0]
        source.data_dir = self.data_dir / "lj" / "wavs"
        source.filelist = self.data_dir / "metadata.csv"
        fp_config.preprocessing.save_dir = preprocessed

        # Every step except "audio".
        to_process_no_audio = ("energy", "pitch", "attn", "text", "spec")
        with self.assertRaises(SystemExit), capture_stdout():
            Preprocessor(fp_config).preprocess(
                output_path=filelist, cpus=1, to_process=to_process_no_audio
            )

def test_empty_preprocess(self):
# Test case where the file list is not empty but after filtering
# silence, the result is empty. The behaviour of the code base is not
Expand Down
37 changes: 19 additions & 18 deletions everyvoice/text/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def __init__(self, config: Union[AlignerConfig, FeaturePredictionConfig]):
discard_empty=True,
)

def replace_cleaner(self, text):
def replace_cleaner(self, text: str) -> str:
"""Given some text and a list of replacement operations in the form of input/output key value pairs,
return the transformed text.
Args:
Expand All @@ -69,7 +69,7 @@ def replace_cleaner(self, text):
text = re.sub(k, v, text)
return text

def text_to_sequence(self, text):
def text_to_sequence(self, text: str, quiet: bool = False):
"""Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
Args:
text: string to convert to a sequence
Expand All @@ -78,24 +78,24 @@ def text_to_sequence(self, text):
List of integers corresponding to the symbols in the text
"""
sequence = []
clean_tokens = self.text_to_tokens(text)
clean_tokens = self.text_to_tokens(text, quiet)
for symbol in clean_tokens:
symbol_id = self._symbol_to_id[symbol]
sequence += [symbol_id]
return sequence

def text_to_phonological_features(self, text):
def text_to_phonological_features(self, text: str, quiet: bool = False):
"""Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
Args:
text: string to convert to a sequence
cleaner_fns: a list of fns to clean text
text: string to convert to a sequence
quiet: suppress warnings
Returns:
List of phonological feature vectors
List of phonological feature vectors
"""
clean_text = self.text_to_tokens(text)
clean_text = self.text_to_tokens(text, quiet)
return get_features(clean_text)

def clean_text(self, text):
def clean_text(self, text: str) -> str:
"""Converts some text to cleaned text"""
text = self.replace_cleaner(text)
for cleaner_fn in self.config.text.cleaners:
Expand All @@ -107,20 +107,21 @@ def clean_text(self, text):
) from e
return text

def text_to_tokens(self, text):
def text_to_tokens(self, text: str, quiet: bool = False):
"""Converts a string of text to a sequence of tokens.
Args:
text: string to convert to a sequence
cleaner_fns: a list of fns to clean text
text: string to convert to a sequence
quiet: suppress warnings
Returns:
List of symbols in the text
List of symbols in the text
"""
clean_text = self.clean_text(text)
clean_tokens = self._tokenizer.tokenize(clean_text)
for symbol in self._missing_symbol_finder.tokenize(clean_text):
logger.warning(
f"Symbol '{symbol}' occurs in the text '{clean_text}' but was not declared in your configuration so it is being ignored."
)
if not quiet:
logger.warning(
f"Symbol '{symbol}' occurs in the text '{clean_text}' but was not declared in your configuration so it is being ignored."
)
self.missing_symbols[symbol] += 1
return clean_tokens

Expand All @@ -132,9 +133,9 @@ def get_missing_symbols(self, text):
def cleaned_text_to_sequence(self, cleaned_text):
"""Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
Args:
text: string to convert to a sequence
text: string to convert to a sequence
Returns:
List of integers corresponding to the symbols in the text
List of integers corresponding to the symbols in the text
"""
cleaned_text = self._tokenizer.tokenize(cleaned_text)
return [self._symbol_to_id[symbol] for symbol in cleaned_text]
Expand Down
17 changes: 17 additions & 0 deletions everyvoice/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,3 +324,20 @@ def __call__(self, out):
finally:
tqdm_instance.close()
joblib.parallel.BatchCompletionCallBack = old_callback


def n_times(n: int) -> str:
    """Return a grammatically correct version of "n times" for n > 0.

    Args:
        n: number of occurrences; expected to be a positive integer.

    Returns:
        "once" for 1, "twice" for 2, and "<n> times" for any other value
        (values outside n > 0 are not special-cased).

    >>> n_times(1)
    'once'
    >>> n_times(2)
    'twice'
    >>> n_times(1001)
    '1001 times'
    """
    if n == 1:
        return "once"
    if n == 2:
        return "twice"
    return f"{n} times"

0 comments on commit fd1940e

Please sign in to comment.