add ItalianPhonemesTokenizer (NVIDIA#7587)

* add ItalianPhonemesTokenizer
  Signed-off-by: GiacomoLeoneMaria <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* fix Italian phonemes
  Signed-off-by: GiacomoLeoneMaria <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* add test
  Signed-off-by: GiacomoLeoneMaria <[email protected]>
---------
Signed-off-by: GiacomoLeoneMaria <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Xuesong Yang <[email protected]>
Signed-off-by: Sasha Meister <[email protected]>
ssh-meister · Oct 5, 2023 · 8c892db · 8c892db
1 parent 620c011
commit 8c892db
Show file tree

Hide file tree

Showing 3 changed files with 93 additions and 3 deletions.
diff --git a/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py b/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
@@ -88,7 +88,7 @@
         'ɢ','ʛ','ɦ','ɧ','ħ','ɥ','ʜ','ɨ','ɬ','ɫ','ɮ','ʟ',
         'ɱ','ɯ','ɰ','ɳ','ɵ','ɸ','œ','ɶ','ʘ','ɺ','ɻ','ʀ','ʁ',
         'ɽ','ʂ','ʈ','ʧ','ʉ','ʋ','ⱱ','ɤ','ʍ','χ','ʏ','ʑ','ʐ',
-        'ʔ','ʡ','ʕ','ʢ','ǀ','ǁ','ǂ','ᵻ'
+        'ʔ','ʡ','ʕ','ʢ','ǀ','ǁ','ǂ','ᵻ', 'ʃ','ː',
     ),
 }
 
@@ -181,7 +181,10 @@ def get_ipa_punctuation_list(locale):
                 '↑',
                 '→',
                 '↗',
-                '↘,',
+                '↘',
+                '”',
+                '’',
+                '-',
             ]
         )
     elif locale == "es-ES":

diff --git a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py
@@ -284,7 +284,7 @@ def __init__(
             non_default_punct_list: List of punctuation marks which will be used instead default.
         """
 
-        it_alphabet = "abcdefghijklmnopqrstuvwxyzàèéìòù"
+        it_alphabet = "abcdefghijklmnopqrstuvwxyzàèéìòùó"
         super().__init__(
             chars=it_alphabet,
             punct=punct,
@@ -367,6 +367,77 @@ def encode(self, text):
         return [self._token2id[p] for p in cs]
 
 
+class ItalianPhonemesTokenizer(BaseCharsTokenizer):
+    """Character-level tokenizer for Italian text written with letters and IPA symbols.
+
+    The symbol inventory is the fixed string passed to the base class as ``chars``.
+    ``encode`` keeps characters found in the tokenizer's tokens and, when ``punct``
+    is enabled, the punctuation marks listed in ``PUNCT_LIST``.
+    """
+
+    # fmt: off
+    PUNCT_LIST = (
+        ',', '.', '!', '?', '-',
+        ':', ';', '/', '"', '(',
+        ')', '[', ']', '{', '}',
+        '„', '“', '”', '‘', '’', '‒', '—', '«', '»', '‹', '›', '_',
+    )
+    # fmt: on
+
+    def __init__(
+        self,
+        punct=True,
+        apostrophe=True,
+        add_blank_at=None,
+        pad_with_space=False,
+        non_default_punct_list=None,
+        text_preprocessing_func=italian_text_preprocessing,
+    ):
+        """Italian phoneme-based tokenizer.
+        Args:
+            punct: Whether to reserve grapheme for basic punctuation or not.
+            apostrophe: Whether to use apostrophe or not.
+            add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
+             if None then no blank in labels.
+            pad_with_space: Whether to pad text with spaces at the beginning and at the end or not.
+            non_default_punct_list: List of punctuation marks which will be used instead default.
+            text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer.
+             Currently, it only applies lower() function.
+        """
+
+        # Italian letters followed by IPA symbols; every argument is forwarded unchanged to the base class.
+        # NOTE(review): the string repeats some symbols (e.g. 'ɬ', 'ʔ', 'ᵻ'). It is left byte-identical
+        # because editing it could shift token ids - confirm how the base class builds its vocabulary.
+        it_ipa = "abcdefghijklmnopqrstuvwxyzàèéìòùóæɐɑɔəɚɜɬɹʌʔᵻðŋɛɡɣɪɲɾʃʊʎʒʝβθd͡'t͡'øɒɕɓçɖɘɝɞɟʄɡɠɢʛɦɧħɥʜɨɬɫɮʟɱɯɰɳɵɸœɶʘɺɻʀʁɽʂʈʧʉʋⱱɤʍχʏʑʐʔʡʕʢǀǁǂᵻʃ'ː"
+        super().__init__(
+            chars=it_ipa,
+            punct=punct,
+            apostrophe=apostrophe,
+            add_blank_at=add_blank_at,
+            pad_with_space=pad_with_space,
+            non_default_punct_list=non_default_punct_list,
+            text_preprocessing_func=text_preprocessing_func,
+        )
+
+    def encode(self, text):
+        """Convert ``text`` into a list of token ids.
+
+        The text is first passed through ``self.text_preprocessing_func``. Then, per character:
+        a space is kept only after a non-space token (no leading or repeated spaces); an
+        alphanumeric character, apostrophe or U+0303 is kept when it is one of ``self.tokens``;
+        a mark from ``PUNCT_LIST`` is kept when ``self.punct`` is true; anything else that is
+        not a space is skipped with a warning. Trailing spaces are removed and, when
+        ``self.pad_with_space`` is true, one space is added on each side.
+
+        Args:
+            text: Input string to tokenize.
+
+        Returns:
+            List of ids looked up in ``self._token2id``. Text with no encodable character
+            yields an empty list (or only the two padding spaces) instead of raising.
+        """
+        cs, space, tokens = [], self.tokens[self.space], set(self.tokens)
+
+        text = self.text_preprocessing_func(text)
+        for c in text:
+            # Add space if last one isn't one
+            if c == space and cs and cs[-1] != space:
+                cs.append(c)
+            # Add next char
+            elif (c.isalnum() or c == "'" or c == "\u0303") and c in tokens:
+                cs.append(c)
+            # Add punct
+            # NOTE(review): this tests the class-level PUNCT_LIST, not a custom non_default_punct_list;
+            # a mark accepted here but missing from the vocabulary would fail in the id lookup below - confirm.
+            elif (c in self.PUNCT_LIST) and self.punct:
+                cs.append(c)
+            # Warn about unknown char
+            elif c != space:
+                logging.warning(f"Text: [{text}] contains unknown char: [{c}]. Symbol will be skipped.")
+
+        # Remove trailing spaces; the emptiness guard avoids IndexError when nothing was kept
+        while cs and cs[-1] == space:
+            cs.pop()
+
+        if self.pad_with_space:
+            cs = [space] + cs + [space]
+
+        return [self._token2id[p] for p in cs]
+
+
 class EnglishPhonemesTokenizer(BaseTokenizer):
     # fmt: off
     PUNCT_LIST = (  # Derived from LJSpeech and "/" additionally

diff --git a/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py b/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py
@@ -34,6 +34,10 @@ class TestTTSTokenizers:
         "BUENOS": ["bwˈenos"],
         "DÍAS": ["dˈias"],
     }
+    # Upper-case word -> list of IPA pronunciations, passed to IpaG2p in the it-IT tokenizer test.
+    PHONEME_DICT_IT = {
+        "CIAO": ["tʃˈao"],
+        "MONDO": ["mˈondo"],
+    }
 
     @staticmethod
     def _parse_text(tokenizer, text):
@@ -146,6 +150,18 @@ def test_ipa_tokenizer_de_de(self):
 
         assert chars == expected_output
 
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_ipa_tokenizer_it_it(self):
+        """Run "Ciao mondo" through an it-IT IPATokenizer and compare the text from _parse_text."""
+        locale = "it-IT"
+        italian_g2p = IpaG2p(phoneme_dict=self.PHONEME_DICT_IT, locale=locale)
+        ipa_tokenizer = IPATokenizer(g2p=italian_g2p, locale=locale)
+
+        # _parse_text returns a pair; only its first element is checked here.
+        decoded, _ = self._parse_text(ipa_tokenizer, "Ciao mondo")
+
+        assert decoded == "tʃˈao mˈondo"
+
     @pytest.mark.run_only_on('CPU')
     @pytest.mark.unit
     def test_ipa_tokenizer_en_us(self):