NVIDIA · ekmb · Feb 15, 2023 · Feb 10, 2023 · Feb 10, 2023 · Feb 10, 2023
diff --git a/Dockerfile b/Dockerfile
@@ -54,10 +54,6 @@ WORKDIR /tmp/nemo
 COPY requirements .
 RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --no-cache-dir -r $f; done
 
-# install pynini
-COPY nemo_text_processing/install_pynini.sh /tmp/nemo/
-RUN /bin/bash /tmp/nemo/install_pynini.sh
-
 # install k2, skip if installation fails
 COPY scripts /tmp/nemo/scripts/
 RUN /bin/bash /tmp/nemo/scripts/speech_recognition/k2/setup.sh || exit 0
@@ -81,8 +77,7 @@ RUN --mount=from=nemo-src,target=/tmp/nemo cd /tmp/nemo && pip install ".[all]"
 
 # Check install
 RUN python -c "import nemo.collections.nlp as nemo_nlp" && \
-    python -c "import nemo.collections.tts as nemo_tts" && \
-    python -c "import nemo_text_processing.text_normalization as text_normalization"
+    python -c "import nemo.collections.tts as nemo_tts"
 
 # TODO: Update to newer numba 0.56.0RC1 for 22.03 container if possible
 # install pinned numba version

diff --git a/Jenkinsfile b/Jenkinsfile
@@ -102,92 +102,6 @@ pipeline {
       }
     }
 
-
-    stage('L0: TN/ITN Tests CPU') {
-      when {
-        anyOf {
-          branch 'main'
-          changeRequest target: 'main'
-        }
-      }
-      failFast true
-      parallel {
-        stage('En TN grammars') {
-          steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
-          }
-        }
-        stage('En ITN grammars') {
-          steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
-          }
-        }
-        stage('Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') {
-          steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
-            sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
-          }
-        }
-        stage('Test En Hybrid TN') {
-          steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/hybrid/wfst_lm_rescoring.py --data /home/TestData/nlp/text_norm/hybrid_tn/test.txt --regenerate_pkl --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 | grep "all_correct: True" || exit 1'
-          }
-        }
-      }
-    }
-
-    stage('L2: NeMo text processing') {
-      when {
-        anyOf {
-          branch 'main'
-          changeRequest target: 'main'
-        }
-      }
-      failFast true
-      parallel {
-        stage('L2: Eng TN') {
-          steps {
-            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
-            sh 'cd nemo_text_processing/text_normalization/ &&  python normalize.py --input_file=/home/TestData/nlp/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output_file=/home/TestData/nlp/text_norm/output/test.pynini.txt --verbose'
-            sh 'cat /home/TestData/nlp/text_norm/output/test.pynini.txt'
-            sh 'cmp --silent /home/TestData/nlp/text_norm/output/test.pynini.txt /home/TestData/nlp/text_norm/ci/test_goal_py_05-25.txt || exit 1'
-            sh 'rm -rf /home/TestData/nlp/text_norm/output/*'
-          }
-        }
-
-        stage('L2: Eng ITN export') {
-          steps {
-            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
-            sh 'cd nemo_text_processing/inverse_text_normalization/ &&  python inverse_normalize.py --input_file=/home/TestData/nlp/text_denorm/ci/test.txt --language=en --output_file=/home/TestData/nlp/text_denorm/output/test.pynini.txt --verbose'
-            sh 'cmp --silent /home/TestData/nlp/text_denorm/output/test.pynini.txt /home/TestData/nlp/text_denorm/ci/test_goal_py.txt || exit 1'
-            sh 'rm -rf /home/TestData/nlp/text_denorm/output/*'
-          }
-        }
-        stage('L2: TN with Audio (audio and raw text)') {
-          steps {
-            sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --text "The total amounts to \\$4.76." \
-            --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_raw.txt 2>&1 && \
-            cmp --silent /tmp/out_raw.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
-          }
-        }
-        stage('L2: TN with Audio (audio and text file)') {
-          steps {
-            sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
-            --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_file.txt 2>&1 && \
-            cmp --silent /tmp/out_file.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
-          }
-        }
-        stage('L2: TN with Audio (manifest)') {
-          steps {
-            sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
-          }
-        }
-      }
-    }
-
     stage('L2: ASR dev run') {
       when {
         anyOf {
@@ -1073,7 +987,7 @@ pipeline {
       parallel {
         stage('G2P Conformer training, evaluation and inference') {
           steps {
-            sh 'cd examples/text_processing/g2p && \
+            sh 'cd examples/tts/g2p && \
                 TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_CONFORMER=output_ctc_${TIME} && \
                 python g2p_train_and_evaluate.py \
                     train_manifest=/home/TestData/g2p/g2p.json \
@@ -1097,7 +1011,7 @@ pipeline {
             }
             stage('ByT5G2P training, evaluation and inference') {
               steps {
-                sh 'TRANSFORMERS_OFFLINE=0 && cd examples/text_processing/g2p && \
+                sh 'TRANSFORMERS_OFFLINE=0 && cd examples/tts/g2p && \
                     TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_T5=output_byt5_${TIME} && \
                     python g2p_train_and_evaluate.py \
                         train_manifest=/home/TestData/g2p/g2p.json \
@@ -1119,7 +1033,7 @@ pipeline {
             }
            stage('HeteronymClassificationModel training, evaluation and inference') {
               steps {
-                sh 'cd examples/text_processing/g2p && \
+                sh 'cd examples/tts/g2p && \
                     TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR=output_${TIME} && \
                     python heteronym_classification_train_and_evaluate.py \
                         train_manifest=/home/TestData/g2p/manifest.json \

diff --git a/docs/source/tts/api.rst b/docs/source/tts/api.rst
@@ -96,4 +96,4 @@ Dataset Processing Classes
 
 .. autoclass:: nemo.collections.tts.torch.data.VocoderDataset
     :show-inheritance:
-    :members:
+    :members:
diff --git a/docs/source/tts/checkpoints.rst b/docs/source/tts/checkpoints.rst
@@ -150,4 +150,4 @@ End2End models
 .. csv-table::
    :file: data/ngc_models_e2e.csv
    :align: left
-   :header-rows: 1
+   :header-rows: 1
diff --git a/docs/source/tts/configs.rst b/docs/source/tts/configs.rst
@@ -96,7 +96,6 @@ Text normalization (TN) converts text from written form into its verbalized form
       _target_: nemo_text_processing.text_normalization.normalize.Normalizer
       lang: en
       input_case: cased
-      whitelist: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
 
     text_normalizer_call_kwargs:
       verbose: false
@@ -118,7 +117,7 @@ Tokenization converts input text string to a list of integer tokens. It may pad
       apostrophe: true
       pad_with_space: true
       g2p:
-        _target_: nemo_text_processing.g2p.modules.EnglishG2p
+        _target_: nemo.collections.tts.g2p.modules.EnglishG2p
         phoneme_dict: ${phoneme_dict_path}
         heteronyms: ${heteronyms_path}
       phoneme_probability: 0.5
@@ -260,4 +259,4 @@ Fine-tuning via a Pytorch Lightning checkpoint
         trainer.devices=-1 \
         trainer.accelerator='gpu' \
         trainer.max_epochs=50 \
-        +init_from_ptl_ckpt="<name of pytorch lightning checkpoint>"
+        +init_from_ptl_ckpt="<name of pytorch lightning checkpoint>"
diff --git a/docs/source/tts/datasets.rst b/docs/source/tts/datasets.rst
@@ -45,9 +45,7 @@ LJSpeech
 .. code-block:: shell-session
 
     $ python scripts/dataset_processing/tts/ljspeech/get_data.py \
-        --data-root <your_local_dataset_root> \
-        --whitelist-path <your_local_whitelist_filepath> \
-        or default nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv
+        --data-root <your_local_dataset_root>
 
     $ python scripts/dataset_processing/tts/extract_sup_data.py \
         --config-path ljspeech/ds_conf \

diff --git a/examples/tts/aligner_heteronym_disambiguation.py b/examples/tts/aligner_heteronym_disambiguation.py
@@ -26,7 +26,7 @@
 
 
 """
-G2P disambiguation using an Aligner model's input embedding distances.
+G2P disambiguation using an Aligner model's input embedding distances.
 
 Does not handle OOV and leaves them as graphemes.
 
@@ -46,7 +46,7 @@
 def get_args():
     """Retrieve arguments for disambiguation.
     """
-    parser = argparse.ArgumentParser("G2P disambiguation using Aligner input embedding distances.")
+    parser = argparse.ArgumentParser("G2P disambiguation using Aligner input embedding distances.")
     # TODO(jocelynh): Make this required=False with default download from NGC once ckpt uploaded
     parser.add_argument('--model', required=True, type=str, help="Path to Aligner model checkpoint (.nemo file).")
     parser.add_argument(
@@ -108,7 +108,7 @@ def disambiguate_candidates(aligner, text, spec, spec_len, confidence, device, h
 
     Note: This could be sped up if multiple words' candidates were batched, but this is conceptually easier.
     """
-    # Grab original G2P result
+    # Grab original G2P result
     aligner_g2p = aligner.tokenizer.g2p
     base_g2p = aligner_g2p(text)
 
@@ -122,7 +122,7 @@ def disambiguate_candidates(aligner, text, spec, spec_len, confidence, device, h
     has_heteronym = False
 
     for word in words:
-        # Retrieve the length of the word in the default G2P conversion
+        # Retrieve the length of the word in the default G2P conversion
         g2p_default_len = len(aligner_g2p(word))
 
         # Check if word needs to be disambiguated
@@ -134,7 +134,7 @@ def disambiguate_candidates(aligner, text, spec, spec_len, confidence, device, h
             candidate_prons_and_lengths = []
 
             for pron in aligner_g2p.phoneme_dict[word]:
-                # Replace graphemes in the base G2P result with the current variant
+                # Replace graphemes in the base G2P result with the current variant
                 candidate = base_g2p[:word_start_idx] + pron + base_g2p[word_start_idx + g2p_default_len :]
                 candidate_tokens = aligner.tokenizer.encode_from_g2p(candidate)
 
@@ -247,7 +247,7 @@ def disambiguate_dataset(
             count = 0
 
             for line in f_in:
-                # Retrieve entry and base G2P conversion for full text
+                # Retrieve entry and base G2P conversion for full text
                 entry = json.loads(line)
                 # Set punct_post_process=True in order to preserve words with apostrophes
                 text = aligner.normalizer.normalize(entry['text'], punct_post_process=True)

diff --git a/examples/tts/conf/aligner.yaml b/examples/tts/conf/aligner.yaml
@@ -21,7 +21,6 @@ window: hann
 
 phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
-whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
 
 model:
   symbols_embedding_dim: 384
@@ -41,7 +40,6 @@ model:
     _target_: nemo_text_processing.text_normalization.normalize.Normalizer
     lang: en
     input_case: cased
-    whitelist: ${whitelist_path}
 
   text_normalizer_call_kwargs:
     verbose: false
@@ -56,7 +54,7 @@ model:
     apostrophe: true
     pad_with_space: true
     g2p:
-      _target_: nemo_text_processing.g2p.modules.EnglishG2p
+      _target_: nemo.collections.tts.g2p.modules.EnglishG2p
       phoneme_dict: ${phoneme_dict_path}
       heteronyms: ${heteronyms_path}
 

diff --git a/examples/tts/conf/de/fastpitch_align_22050_grapheme.yaml b/examples/tts/conf/de/fastpitch_align_22050_grapheme.yaml
@@ -28,8 +28,6 @@ lowfreq: 0
 highfreq: null
 window: hann
 
-whitelist_path: "nemo_text_processing/text_normalization/de/data/whitelist.tsv"
-
 model:
   learn_alignment: true
   bin_loss_warmup_epochs: 100
@@ -58,7 +56,6 @@ model:
     _target_: nemo_text_processing.text_normalization.normalize.Normalizer
     lang: de
     input_case: cased
-    whitelist: ${whitelist_path}
 
   text_normalizer_call_kwargs:
     verbose: false

diff --git a/examples/tts/conf/de/fastpitch_align_22050_mix.yaml b/examples/tts/conf/de/fastpitch_align_22050_mix.yaml
@@ -30,7 +30,6 @@ window: hann
 
 phoneme_dict_path: "scripts/tts_dataset_files/de/de_nv230119.dict"
 heteronyms_path: "scripts/tts_dataset_files/de/de_nv230119.heteronyms"
-whitelist_path: "nemo_text_processing/text_normalization/de/data/whitelist.tsv"
 
 model:
   learn_alignment: true
@@ -60,7 +59,6 @@ model:
     _target_: nemo_text_processing.text_normalization.normalize.Normalizer
     lang: de
     input_case: cased
-    whitelist: ${whitelist_path}
 
   text_normalizer_call_kwargs:
     verbose: false
@@ -74,7 +72,7 @@ model:
     apostrophe: true
     pad_with_space: true
     g2p:
-      _target_: nemo_text_processing.g2p.modules.IPAG2P
+      _target_: nemo.collections.tts.g2p.modules.IPAG2P
       locale: 'de-DE'
       phoneme_dict: ${phoneme_dict_path}
       heteronyms: ${heteronyms_path}

diff --git a/examples/tts/conf/de/fastpitch_align_44100_grapheme.yaml b/examples/tts/conf/de/fastpitch_align_44100_grapheme.yaml
@@ -28,8 +28,6 @@ lowfreq: 0
 highfreq: null
 window: hann
 
-whitelist_path: "nemo_text_processing/text_normalization/de/data/whitelist.tsv"
-
 model:
   learn_alignment: true
   bin_loss_warmup_epochs: 100
@@ -58,7 +56,6 @@ model:
     _target_: nemo_text_processing.text_normalization.normalize.Normalizer
     lang: de
     input_case: cased
-    whitelist: ${whitelist_path}
 
   text_normalizer_call_kwargs:
     verbose: false

diff --git a/examples/tts/conf/de/fastpitch_align_44100_phoneme.yaml b/examples/tts/conf/de/fastpitch_align_44100_phoneme.yaml
@@ -28,8 +28,6 @@ lowfreq: 0
 highfreq: null
 window: hann
 
-whitelist_path: "nemo_text_processing/text_normalization/de/data/whitelist.tsv"
-
 model:
   learn_alignment: true
   bin_loss_warmup_epochs: 100
@@ -58,7 +56,6 @@ model:
     _target_: nemo_text_processing.text_normalization.normalize.Normalizer
     lang: de
     input_case: cased
-    whitelist: ${whitelist_path}
 
   text_normalizer_call_kwargs:
     verbose: false

diff --git a/examples/tts/conf/es/fastpitch_align_44100_ipa.yaml b/examples/tts/conf/es/fastpitch_align_44100_ipa.yaml
@@ -60,7 +60,7 @@ model:
     apostrophe: true
     pad_with_space: true
     g2p:
-      _target_: nemo_text_processing.g2p.modules.IPAG2P
+      _target_: nemo.collections.tts.g2p.modules.IPAG2P
       locale: es-ES
       phoneme_dict: ${phoneme_dict_path}
       phoneme_probability: 0.5

diff --git a/examples/tts/conf/es/fastpitch_align_44100_ipa_multi.yaml b/examples/tts/conf/es/fastpitch_align_44100_ipa_multi.yaml
@@ -57,7 +57,7 @@ model:
     apostrophe: true
     pad_with_space: true
     g2p:
-      _target_: nemo_text_processing.g2p.modules.IPAG2P
+      _target_: nemo.collections.tts.g2p.modules.IPAG2P
       locale: es-ES
       phoneme_dict: ${phoneme_dict_path}
       phoneme_probability: 0.5

diff --git a/examples/tts/conf/fastpitch_align_44100.yaml b/examples/tts/conf/fastpitch_align_44100.yaml
@@ -29,7 +29,6 @@ window: hann
 
 phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
-whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
 
 model:
   learn_alignment: true
@@ -59,7 +58,6 @@ model:
     _target_: nemo_text_processing.text_normalization.normalize.Normalizer
     lang: en
     input_case: cased
-    whitelist: ${whitelist_path}
 
   text_normalizer_call_kwargs:
     verbose: false
@@ -74,7 +72,7 @@ model:
     apostrophe: true
     pad_with_space: true
     g2p:
-      _target_: nemo_text_processing.g2p.modules.EnglishG2p
+      _target_: nemo.collections.tts.g2p.modules.EnglishG2p
       phoneme_dict: ${phoneme_dict_path}
       heteronyms: ${heteronyms_path}
       phoneme_probability: 0.5