[TTS/TN/G2P] Remove Text Processing from NeMo, move G2P to TTS (NVIDIA#5982)

* remove TN
  Signed-off-by: ekmb <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* fix imports
  Signed-off-by: ekmb <[email protected]>
* fix import
  Signed-off-by: ekmb <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* add missing init
  Signed-off-by: ekmb <[email protected]>
* fix import
  Signed-off-by: ekmb <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* rename unit test
  Signed-off-by: ekmb <[email protected]>
* fix import
  Signed-off-by: ekmb <[email protected]>
* fix modules test
  Signed-off-by: ekmb <[email protected]>
* fix imports
  Signed-off-by: ekmb <[email protected]>
* remove whitelist from config
  Signed-off-by: ekmb <[email protected]>
* delete wordid file
  Signed-off-by: ekmb <[email protected]>
* remove pynini_install from tutorials
  Signed-off-by: ekmb <[email protected]>
* update requirements
  Signed-off-by: ekmb <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* add support warning
  Signed-off-by: ekmb <[email protected]>
* review
  Signed-off-by: ekmb <[email protected]>

---------

Signed-off-by: ekmb <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
titu1994 · Mar 24, 2023 · dd98487 · dd98487
1 parent 7667324
commit dd98487
Show file tree

Hide file tree

Showing 1,137 changed files with 194 additions and 95,825 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -54,10 +54,6 @@ WORKDIR /tmp/nemo
 COPY requirements .
 RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --no-cache-dir -r $f; done
 
-# install pynini
-COPY nemo_text_processing/install_pynini.sh /tmp/nemo/
-RUN /bin/bash /tmp/nemo/install_pynini.sh
-
 # install k2, skip if installation fails
 COPY scripts /tmp/nemo/scripts/
 RUN /bin/bash /tmp/nemo/scripts/speech_recognition/k2/setup.sh || exit 0
@@ -81,8 +77,7 @@ RUN --mount=from=nemo-src,target=/tmp/nemo cd /tmp/nemo && pip install ".[all]"
 
 # Check install
 RUN python -c "import nemo.collections.nlp as nemo_nlp" && \
-    python -c "import nemo.collections.tts as nemo_tts" && \
-    python -c "import nemo_text_processing.text_normalization as text_normalization"
+    python -c "import nemo.collections.tts as nemo_tts"
 
 # TODO: Update to newer numba 0.56.0RC1 for 22.03 container if possible
 # install pinned numba version

diff --git a/Jenkinsfile b/Jenkinsfile
@@ -102,92 +102,6 @@ pipeline {
       }
     }
 
-
-    stage('L0: TN/ITN Tests CPU') {
-      when {
-        anyOf {
-          branch 'main'
-          changeRequest target: 'main'
-        }
-      }
-      failFast true
-      parallel {
-        stage('En TN grammars') {
-          steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
-          }
-        }
-        stage('En ITN grammars') {
-          steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
-          }
-        }
-        stage('Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') {
-          steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
-            sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
-          }
-        }
-        stage('Test En Hybrid TN') {
-          steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/hybrid/wfst_lm_rescoring.py --data /home/TestData/nlp/text_norm/hybrid_tn/test.txt --regenerate_pkl --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 | grep "all_correct: True" || exit 1'
-          }
-        }
-      }
-    }
-
-    stage('L2: NeMo text processing') {
-      when {
-        anyOf {
-          branch 'main'
-          changeRequest target: 'main'
-        }
-      }
-      failFast true
-      parallel {
-        stage('L2: Eng TN') {
-          steps {
-            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
-            sh 'cd nemo_text_processing/text_normalization/ &&  python normalize.py --input_file=/home/TestData/nlp/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output_file=/home/TestData/nlp/text_norm/output/test.pynini.txt --verbose'
-            sh 'cat /home/TestData/nlp/text_norm/output/test.pynini.txt'
-            sh 'cmp --silent /home/TestData/nlp/text_norm/output/test.pynini.txt /home/TestData/nlp/text_norm/ci/test_goal_py_05-25.txt || exit 1'
-            sh 'rm -rf /home/TestData/nlp/text_norm/output/*'
-          }
-        }
-
-        stage('L2: Eng ITN export') {
-          steps {
-            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
-            sh 'cd nemo_text_processing/inverse_text_normalization/ &&  python inverse_normalize.py --input_file=/home/TestData/nlp/text_denorm/ci/test.txt --language=en --output_file=/home/TestData/nlp/text_denorm/output/test.pynini.txt --verbose'
-            sh 'cmp --silent /home/TestData/nlp/text_denorm/output/test.pynini.txt /home/TestData/nlp/text_denorm/ci/test_goal_py.txt || exit 1'
-            sh 'rm -rf /home/TestData/nlp/text_denorm/output/*'
-          }
-        }
-        stage('L2: TN with Audio (audio and raw text)') {
-          steps {
-            sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --text "The total amounts to \\$4.76." \
-            --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_raw.txt 2>&1 && \
-            cmp --silent /tmp/out_raw.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
-          }
-        }
-        stage('L2: TN with Audio (audio and text file)') {
-          steps {
-            sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
-            --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_file.txt 2>&1 && \
-            cmp --silent /tmp/out_file.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
-          }
-        }
-        stage('L2: TN with Audio (manifest)') {
-          steps {
-            sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
-          }
-        }
-      }
-    }
-
     stage('L2: ASR dev run') {
       when {
         anyOf {
@@ -1073,7 +987,7 @@ pipeline {
       parallel {
         stage('G2P Conformer training, evaluation and inference') {
           steps {
-            sh 'cd examples/text_processing/g2p && \
+            sh 'cd examples/tts/g2p && \
                 TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_CONFORMER=output_ctc_${TIME} && \
                 python g2p_train_and_evaluate.py \
                     train_manifest=/home/TestData/g2p/g2p.json \
@@ -1097,7 +1011,7 @@ pipeline {
             }
             stage('ByT5G2P training, evaluation and inference') {
               steps {
-                sh 'TRANSFORMERS_OFFLINE=0 && cd examples/text_processing/g2p && \
+                sh 'TRANSFORMERS_OFFLINE=0 && cd examples/tts/g2p && \
                     TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_T5=output_byt5_${TIME} && \
                     python g2p_train_and_evaluate.py \
                         train_manifest=/home/TestData/g2p/g2p.json \
@@ -1119,7 +1033,7 @@ pipeline {
             }
            stage('HeteronymClassificationModel training, evaluation and inference') {
               steps {
-                sh 'cd examples/text_processing/g2p && \
+                sh 'cd examples/tts/g2p && \
                     TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR=output_${TIME} && \
                     python heteronym_classification_train_and_evaluate.py \
                         train_manifest=/home/TestData/g2p/manifest.json \

diff --git a/README.rst b/README.rst
@@ -243,11 +243,7 @@ Transformer Engine enables FP8 training on NVIDIA Hopper GPUs.
 
 NeMo Text Processing
 ~~~~~~~~~~~~~~~~~~~~
-NeMo Text Processing, specifically (Inverse) Text Normalization, requires `Pynini <https://pypi.org/project/pynini/>`_ to be installed.
-
-.. code-block:: bash
-
-    bash NeMo/nemo_text_processing/install_pynini.sh
+NeMo Text Processing, specifically (Inverse) Text Normalization, is now a separate repository `https://github.com/NVIDIA/NeMo-text-processing <https://github.com/NVIDIA/NeMo-text-processing>`_.
 
 Docker containers:
 ~~~~~~~~~~~~~~~~~~

diff --git a/docs/source/tts/api.rst b/docs/source/tts/api.rst
@@ -96,4 +96,4 @@ Dataset Processing Classes
 
 .. autoclass:: nemo.collections.tts.torch.data.VocoderDataset
     :show-inheritance:
-    :members:
+    :members:
diff --git a/docs/source/tts/checkpoints.rst b/docs/source/tts/checkpoints.rst
@@ -150,4 +150,4 @@ End2End models
 .. csv-table::
    :file: data/ngc_models_e2e.csv
    :align: left
-   :header-rows: 1
+   :header-rows: 1
diff --git a/docs/source/tts/configs.rst b/docs/source/tts/configs.rst
@@ -96,7 +96,6 @@ Text normalization (TN) converts text from written form into its verbalized form
       _target_: nemo_text_processing.text_normalization.normalize.Normalizer
       lang: en
       input_case: cased
-      whitelist: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
 
     text_normalizer_call_kwargs:
       verbose: false
@@ -118,7 +117,7 @@ Tokenization converts input text string to a list of integer tokens. It may pad
       apostrophe: true
       pad_with_space: true
       g2p:
-        _target_: nemo_text_processing.g2p.modules.EnglishG2p
+        _target_: nemo.collections.tts.g2p.modules.EnglishG2p
         phoneme_dict: ${phoneme_dict_path}
         heteronyms: ${heteronyms_path}
       phoneme_probability: 0.5
@@ -260,4 +259,4 @@ Fine-tuning via a Pytorch Lightning checkpoint
         trainer.devices=-1 \
         trainer.accelerator='gpu' \
         trainer.max_epochs=50 \
-        +init_from_ptl_ckpt="<name of pytorch lightning checkpoint>"
+        +init_from_ptl_ckpt="<name of pytorch lightning checkpoint>"
diff --git a/docs/source/tts/datasets.rst b/docs/source/tts/datasets.rst
@@ -45,9 +45,7 @@ LJSpeech
 .. code-block:: shell-session
 
     $ python scripts/dataset_processing/tts/ljspeech/get_data.py \
-        --data-root <your_local_dataset_root> \
-        --whitelist-path <your_local_whitelist_filepath> \
-        or default nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv
+        --data-root <your_local_dataset_root>
 
     $ python scripts/dataset_processing/tts/extract_sup_data.py \
         --config-path ljspeech/ds_conf \

diff --git a/examples/tts/conf/aligner.yaml b/examples/tts/conf/aligner.yaml
@@ -21,7 +21,6 @@ window: hann
 
 phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
-whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
 
 model:
   symbols_embedding_dim: 384
@@ -41,7 +40,6 @@ model:
     _target_: nemo_text_processing.text_normalization.normalize.Normalizer
     lang: en
     input_case: cased
-    whitelist: ${whitelist_path}
 
   text_normalizer_call_kwargs:
     verbose: false
@@ -56,7 +54,7 @@ model:
     apostrophe: true
     pad_with_space: true
     g2p:
-      _target_: nemo_text_processing.g2p.modules.EnglishG2p
+      _target_: nemo.collections.tts.g2p.modules.EnglishG2p
       phoneme_dict: ${phoneme_dict_path}
       heteronyms: ${heteronyms_path}
 

diff --git a/examples/tts/conf/de/fastpitch_align_22050_grapheme.yaml b/examples/tts/conf/de/fastpitch_align_22050_grapheme.yaml
@@ -28,8 +28,6 @@ lowfreq: 0
 highfreq: null
 window: hann
 
-whitelist_path: "nemo_text_processing/text_normalization/de/data/whitelist.tsv"
-
 model:
   learn_alignment: true
   bin_loss_warmup_epochs: 100
@@ -58,7 +56,6 @@ model:
     _target_: nemo_text_processing.text_normalization.normalize.Normalizer
     lang: de
     input_case: cased
-    whitelist: ${whitelist_path}
 
   text_normalizer_call_kwargs:
     verbose: false

diff --git a/examples/tts/conf/de/fastpitch_align_22050_mix.yaml b/examples/tts/conf/de/fastpitch_align_22050_mix.yaml
@@ -30,7 +30,6 @@ window: hann
 
 phoneme_dict_path: "scripts/tts_dataset_files/de/de_nv230119.dict"
 heteronyms_path: "scripts/tts_dataset_files/de/de_nv230119.heteronyms"
-whitelist_path: "nemo_text_processing/text_normalization/de/data/whitelist.tsv"
 
 model:
   learn_alignment: true
@@ -60,7 +59,6 @@ model:
     _target_: nemo_text_processing.text_normalization.normalize.Normalizer
     lang: de
     input_case: cased
-    whitelist: ${whitelist_path}
 
   text_normalizer_call_kwargs:
     verbose: false
@@ -74,7 +72,7 @@ model:
     apostrophe: true
     pad_with_space: true
     g2p:
-      _target_: nemo_text_processing.g2p.modules.IPAG2P
+      _target_: nemo.collections.tts.g2p.modules.IPAG2P
       locale: 'de-DE'
       phoneme_dict: ${phoneme_dict_path}
       heteronyms: ${heteronyms_path}

diff --git a/examples/tts/conf/de/fastpitch_align_44100_grapheme.yaml b/examples/tts/conf/de/fastpitch_align_44100_grapheme.yaml
@@ -28,8 +28,6 @@ lowfreq: 0
 highfreq: null
 window: hann
 
-whitelist_path: "nemo_text_processing/text_normalization/de/data/whitelist.tsv"
-
 model:
   learn_alignment: true
   bin_loss_warmup_epochs: 100
@@ -58,7 +56,6 @@ model:
     _target_: nemo_text_processing.text_normalization.normalize.Normalizer
     lang: de
     input_case: cased
-    whitelist: ${whitelist_path}
 
   text_normalizer_call_kwargs:
     verbose: false

diff --git a/examples/tts/conf/de/fastpitch_align_44100_phoneme.yaml b/examples/tts/conf/de/fastpitch_align_44100_phoneme.yaml
@@ -28,8 +28,6 @@ lowfreq: 0
 highfreq: null
 window: hann
 
-whitelist_path: "nemo_text_processing/text_normalization/de/data/whitelist.tsv"
-
 model:
   learn_alignment: true
   bin_loss_warmup_epochs: 100
@@ -58,7 +56,6 @@ model:
     _target_: nemo_text_processing.text_normalization.normalize.Normalizer
     lang: de
     input_case: cased
-    whitelist: ${whitelist_path}
 
   text_normalizer_call_kwargs:
     verbose: false

diff --git a/examples/tts/conf/es/fastpitch_align_44100_ipa.yaml b/examples/tts/conf/es/fastpitch_align_44100_ipa.yaml
@@ -60,7 +60,7 @@ model:
     apostrophe: true
     pad_with_space: true
     g2p:
-      _target_: nemo_text_processing.g2p.modules.IPAG2P
+      _target_: nemo.collections.tts.g2p.modules.IPAG2P
       locale: es-ES
       phoneme_dict: ${phoneme_dict_path}
       phoneme_probability: 0.5

diff --git a/examples/tts/conf/es/fastpitch_align_44100_ipa_multi.yaml b/examples/tts/conf/es/fastpitch_align_44100_ipa_multi.yaml
@@ -57,7 +57,7 @@ model:
     apostrophe: true
     pad_with_space: true
     g2p:
-      _target_: nemo_text_processing.g2p.modules.IPAG2P
+      _target_: nemo.collections.tts.g2p.modules.IPAG2P
       locale: es-ES
       phoneme_dict: ${phoneme_dict_path}
       phoneme_probability: 0.5

diff --git a/examples/tts/conf/fastpitch_align_44100.yaml b/examples/tts/conf/fastpitch_align_44100.yaml
@@ -29,7 +29,6 @@ window: hann
 
 phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
-whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
 
 model:
   learn_alignment: true
@@ -59,7 +58,6 @@ model:
     _target_: nemo_text_processing.text_normalization.normalize.Normalizer
     lang: en
     input_case: cased
-    whitelist: ${whitelist_path}
 
   text_normalizer_call_kwargs:
     verbose: false
@@ -74,7 +72,7 @@ model:
     apostrophe: true
     pad_with_space: true
     g2p:
-      _target_: nemo_text_processing.g2p.modules.EnglishG2p
+      _target_: nemo.collections.tts.g2p.modules.EnglishG2p
       phoneme_dict: ${phoneme_dict_path}
       heteronyms: ${heteronyms_path}
       phoneme_probability: 0.5

diff --git a/examples/tts/conf/fastpitch_align_ipa.yaml b/examples/tts/conf/fastpitch_align_ipa.yaml
@@ -30,7 +30,6 @@ window: hann
 
 phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.10.txt"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
-whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
 
 model:
   learn_alignment: true
@@ -60,7 +59,6 @@ model:
     _target_: nemo_text_processing.text_normalization.normalize.Normalizer
     lang: en
     input_case: cased
-    whitelist: ${whitelist_path}
 
   text_normalizer_call_kwargs:
     verbose: false
@@ -73,7 +71,7 @@ model:
     apostrophe: true
     pad_with_space: true
     g2p:
-      _target_: nemo_text_processing.g2p.modules.IPAG2P
+      _target_: nemo.collections.tts.g2p.modules.IPAG2P
       phoneme_dict: ${phoneme_dict_path}
       heteronyms: ${heteronyms_path}
       phoneme_probability: 0.8

diff --git a/examples/tts/conf/fastpitch_align_v1.05.yaml b/examples/tts/conf/fastpitch_align_v1.05.yaml
@@ -30,7 +30,6 @@ window: hann
 
 phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
-whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
 
 model:
   learn_alignment: true
@@ -60,7 +59,6 @@ model:
     _target_: nemo_text_processing.text_normalization.normalize.Normalizer
     lang: en
     input_case: cased
-    whitelist: ${whitelist_path}
 
   text_normalizer_call_kwargs:
     verbose: false
@@ -75,7 +73,7 @@ model:
     apostrophe: true
     pad_with_space: true
     g2p:
-      _target_: nemo_text_processing.g2p.modules.EnglishG2p
+      _target_: nemo.collections.tts.g2p.modules.EnglishG2p
       phoneme_dict: ${phoneme_dict_path}
       heteronyms: ${heteronyms_path}
       phoneme_probability: 0.5