diff --git a/examples/tts/conf/fastpitch_align_44100.yaml b/examples/tts/conf/fastpitch_align_44100.yaml index 95442a0eb44f..f8719325c088 100644 --- a/examples/tts/conf/fastpitch_align_44100.yaml +++ b/examples/tts/conf/fastpitch_align_44100.yaml @@ -28,7 +28,7 @@ highfreq: null window: hann phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08" -heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" +heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921" whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" model: diff --git a/examples/tts/conf/fastpitch_align_v1.05.yaml b/examples/tts/conf/fastpitch_align_v1.05.yaml index 692b2500cb29..caf057e0f89b 100644 --- a/examples/tts/conf/fastpitch_align_v1.05.yaml +++ b/examples/tts/conf/fastpitch_align_v1.05.yaml @@ -28,7 +28,7 @@ highfreq: 8000 window: hann phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08" -heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" +heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921" whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" model: diff --git a/examples/tts/conf/mixer-tts.yaml b/examples/tts/conf/mixer-tts.yaml index c66aac76d446..48e146eb11c3 100644 --- a/examples/tts/conf/mixer-tts.yaml +++ b/examples/tts/conf/mixer-tts.yaml @@ -28,7 +28,7 @@ highfreq: 8000 window: hann phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08" -heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" +heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921" whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" model: diff --git a/examples/tts/conf/tacotron2.yaml b/examples/tts/conf/tacotron2.yaml index e227a82d49af..2470b2d46a88 100644 --- a/examples/tts/conf/tacotron2.yaml +++ b/examples/tts/conf/tacotron2.yaml @@ -10,7 +10,7 @@ sup_data_path: null sup_data_types: null phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08" -heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" +heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921" whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index 9a02822371e4..cdb48b5d4c19 100644 --- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -46,9 +46,9 @@ @dataclass class G2PConfig: - _target_: str = "nemo_text_processing.g2p.modules.EnglishG2p" + _target_: str = "nemo.collections.tts.torch.g2ps.EnglishG2p" phoneme_dict: str = "scripts/tts_dataset_files/cmudict-0.7b_nv22.08" - heteronyms: str = "scripts/tts_dataset_files/heteronyms-052722" + heteronyms: str = "scripts/tts_dataset_files/heteronyms-030921" phoneme_probability: float = 0.5 diff --git a/nemo/collections/tts/torch/g2ps.py b/nemo/collections/tts/torch/g2ps.py index 554f214975ba..ef47705feddf 100644 --- a/nemo/collections/tts/torch/g2ps.py +++ b/nemo/collections/tts/torch/g2ps.py @@ -12,6 +12,449 @@ # See the License for the specific language governing permissions and # limitations under the License. -# TODO (xueyang): deprecate this file since no other places import modules from here anymore. However, -# all checkpoints uploaded in ngc used this path. So it requires to update all ngc checkpoints g2p path as well. -from nemo_text_processing.g2p.modules import IPAG2P, BaseG2p, EnglishG2p +import abc +import pathlib +import random +import re +import time +from collections import defaultdict +from typing import Optional + +import nltk +import torch + +from nemo.collections.tts.torch.en_utils import english_word_tokenize +from nemo.utils import logging +from nemo.utils.decorators import experimental +from nemo.utils.get_rank import is_global_rank_zero + + +class BaseG2p(abc.ABC): + def __init__( + self, phoneme_dict=None, word_tokenize_func=lambda x: x, apply_to_oov_word=None, + ): + """Abstract class for creating an arbitrary module to convert grapheme words to phoneme sequences (or leave unchanged or use apply_to_oov_word). + Args: + phoneme_dict: Arbitrary representation of dictionary (phoneme -> grapheme) for known words. + word_tokenize_func: Function for tokenizing text to words. + apply_to_oov_word: Function that will be applied to out of phoneme_dict word. + """ + self.phoneme_dict = phoneme_dict + self.word_tokenize_func = word_tokenize_func + self.apply_to_oov_word = apply_to_oov_word + + @abc.abstractmethod + def __call__(self, text: str) -> str: + pass + + +class EnglishG2p(BaseG2p): + def __init__( + self, + phoneme_dict=None, + word_tokenize_func=english_word_tokenize, + apply_to_oov_word=None, + ignore_ambiguous_words=True, + heteronyms=None, + encoding='latin-1', + phoneme_probability: Optional[float] = None, + ): + """English G2P module. This module converts words from grapheme to phoneme representation using phoneme_dict in CMU dict format. + Optionally, it can ignore words which are heteronyms, ambiguous or marked as unchangeable by word_tokenize_func (see code for details). + Ignored words are left unchanged or passed through apply_to_oov_word for handling. + Args: + phoneme_dict (str, Path, Dict): Path to file in CMUdict format or dictionary of CMUdict-like entries. + word_tokenize_func: Function for tokenizing text to words. + It has to return List[Tuple[Union[str, List[str]], bool]] where every tuple denotes word representation and flag whether to leave unchanged or not. + It is expected that unchangeable word representation will be represented as List[str], other cases are represented as str. + It is useful to mark word as unchangeable which is already in phoneme representation. + apply_to_oov_word: Function that will be applied to out of phoneme_dict word. + ignore_ambiguous_words: Whether to not handle word via phoneme_dict with ambiguous phoneme sequences. Defaults to True. + heteronyms (str, Path, List): Path to file with heteronyms (every line is new word) or list of words. + encoding: Encoding type. + phoneme_probability (Optional[float]): The probability (0. self.phoneme_probability: + return word, True + + # punctuation + if re.search("[a-zA-Z]", word) is None: + return list(word), True + + # heteronym + if self.heteronyms is not None and word in self.heteronyms: + return word, True + + # `'s` suffix + if ( + len(word) > 2 + and word.endswith("'s") + and (word not in self.phoneme_dict) + and (word[:-2] in self.phoneme_dict) + and (not self.ignore_ambiguous_words or self.is_unique_in_phoneme_dict(word[:-2])) + ): + return self.phoneme_dict[word[:-2]][0] + ["Z"], True + + # `s` suffix + if ( + len(word) > 1 + and word.endswith("s") + and (word not in self.phoneme_dict) + and (word[:-1] in self.phoneme_dict) + and (not self.ignore_ambiguous_words or self.is_unique_in_phoneme_dict(word[:-1])) + ): + return self.phoneme_dict[word[:-1]][0] + ["Z"], True + + # phoneme dict + if word in self.phoneme_dict and (not self.ignore_ambiguous_words or self.is_unique_in_phoneme_dict(word)): + return self.phoneme_dict[word][0], True + + if self.apply_to_oov_word is not None: + return self.apply_to_oov_word(word), True + else: + return word, False + + def __call__(self, text): + words = self.word_tokenize_func(text) + + prons = [] + for word, without_changes in words: + if without_changes: + prons.extend(word) + continue + + word_by_hyphen = word.split("-") + + pron, is_handled = self.parse_one_word(word) + + if not is_handled and len(word_by_hyphen) > 1: + pron = [] + for sub_word in word_by_hyphen: + p, _ = self.parse_one_word(sub_word) + pron.extend(p) + pron.extend(["-"]) + pron.pop() + + prons.extend(pron) + + return prons + + +@experimental +class IPAG2P(BaseG2p): + # fmt: off + STRESS_SYMBOLS = ["ˈ", "ˌ"] + + def __init__( + self, + phoneme_dict, + word_tokenize_func=english_word_tokenize, + apply_to_oov_word=None, + ignore_ambiguous_words=True, + heteronyms=None, + phoneme_probability: Optional[float]=None, + use_stresses: Optional[bool]=True, + set_graphemes_upper: Optional[bool]=True + ): + """Generic IPA G2P module. This module converts words from grapheme to International Phonetic Alphabet representations. + Optionally, it can ignore heteronyms, ambiguous words, or words marked as unchangeable by word_tokenize_func (see code for details). + Ignored words are left unchanged or passed through apply_to_oov_word for handling. + + Args: + phoneme_dict (str, Path, Dict): Path to file in CMUdict format or dictionary of CMUdict-like entries. + Must be given for IPA G2P. (Consider using scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.08.txt.) + word_tokenize_func: Function for tokenizing text to words. + It has to return List[Tuple[Union[str, List[str]], bool]] where every tuple denotes word + representation and flag whether to leave unchanged or not. + It is expected that unchangeable word representation will be represented as List[str], other + cases are represented as str. + It is useful to mark word as unchangeable which is already in phoneme representation. + Defaults to the English word tokenizer. + apply_to_oov_word: Function that will be applied to out of phoneme_dict word. + ignore_ambiguous_words: Whether to not handle word via phoneme_dict with ambiguous phoneme sequences. + Defaults to True. + heteronyms (str, Path, List): Path to file with heteronyms (every line is new word) or list of words. + phoneme_probability (Optional[float]): The probability (0. self.phoneme_probability: + return word, True + + # Punctuation (assumes other chars have been stripped) + if re.search("[a-zA-Z]", word) is None: + return list(word), True + + # Heteronym + if self.heteronyms and word in self.heteronyms: + return word, True + + # `'s` suffix (with apostrophe) - not in phoneme dict + if ( + len(word) > 2 + and word.endswith("'s") + and (word not in self.phoneme_dict) + and (word[:-2] in self.phoneme_dict) + and (not self.ignore_ambiguous_words or self.is_unique_in_phoneme_dict(word[:-2])) + ): + if word[-3] == 'T': + # Case like "airport's" + return self.phoneme_dict[word[:-2]][0] + ["s"], True + elif word[-3] == 'S': + # Case like "jones's" + return self.phoneme_dict[word[:-2]][0] + ["ɪ", "z"], True + else: + return self.phoneme_dict[word[:-2]][0] + ["z"], True + + # `s` suffix (without apostrophe) - not in phoneme dict + if ( + len(word) > 1 + and word.endswith("s") + and (word not in self.phoneme_dict) + and (word[:-1] in self.phoneme_dict) + and (not self.ignore_ambiguous_words or self.is_unique_in_phoneme_dict(word[:-1])) + ): + if word[-2] == 'T': + # Case like "airports" + return self.phoneme_dict[word[:-1]][0] + ["s"], True + else: + return self.phoneme_dict[word[:-1]][0] + ["z"], True + + # Phoneme dict lookup for unique words (or default pron if ignore_ambiguous_words=False) + if word in self.phoneme_dict and (not self.ignore_ambiguous_words or self.is_unique_in_phoneme_dict(word)): + return self.phoneme_dict[word][0], True + + if self.apply_to_oov_word is not None: + return self.apply_to_oov_word(word), True + else: + return word, False + + def __call__(self, text): + words = self.word_tokenize_func(text) + + prons = [] + for word, without_changes in words: + if without_changes: + prons.extend(word) + continue + + pron, is_handled = self.parse_one_word(word) + + word_by_hyphen = word.split("-") + if not is_handled and len(word_by_hyphen) > 1: + pron = [] + for sub_word in word_by_hyphen: + p, _ = self.parse_one_word(sub_word) + pron.extend(p) + pron.extend(["-"]) + pron.pop() + + prons.extend(pron) + + return prons diff --git a/nemo/collections/tts/torch/tts_dataset.yaml b/nemo/collections/tts/torch/tts_dataset.yaml index 510f8f8ce20b..10cad3d249c2 100644 --- a/nemo/collections/tts/torch/tts_dataset.yaml +++ b/nemo/collections/tts/torch/tts_dataset.yaml @@ -41,6 +41,6 @@ tts_dataset: add_blank_at: null pad_with_space: True g2p: - _target_: nemo_text_processing.g2p.modules.EnglishG2p + _target_: nemo.collections.tts.torch.g2ps.EnglishG2p phoneme_dict: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08" - heteronyms: "scripts/tts_dataset_files/heteronyms-052722" + heteronyms: "scripts/tts_dataset_files/heteronyms-030921" diff --git a/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_fastpitch_align.yaml b/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_fastpitch_align.yaml index bed6b2ee49af..80471f589dcc 100644 --- a/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_fastpitch_align.yaml +++ b/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_fastpitch_align.yaml @@ -5,7 +5,7 @@ sup_data_path: "sup_data" sup_data_types: [ "align_prior_matrix", "pitch" ] whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08" -heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" +heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921" dataset: _target_: nemo.collections.tts.torch.data.TTSDataset diff --git a/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_mixer_tts.yaml b/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_mixer_tts.yaml index d4151e888ae0..ae21fc674e4c 100644 --- a/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_mixer_tts.yaml +++ b/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_mixer_tts.yaml @@ -5,7 +5,7 @@ sup_data_path: "sup_data" sup_data_types: [ "align_prior_matrix", "pitch" ] whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.08" -heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" +heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921" dataset: _target_: nemo.collections.tts.torch.data.TTSDataset diff --git a/scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.08.txt b/scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.08.txt index 1bedf98caf1f..e19979d0ad7c 100644 --- a/scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.08.txt +++ b/scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.08.txt @@ -57525,7 +57525,7 @@ HURRAY həˈɹeɪ HURRELL ˈhɔɹəl HURRI ˈhɝi HURRICANE ˈhɝəˌkeɪn -HURRICANE(1) ˈhʌɹəˌkeɪn +HURRICANE ˈhəɹəˌkeɪn HURRICANE'S ˈhɝəˌkeɪnz HURRICANES ˈhɝəˌkeɪnz HURRIED ˈhɝid