Perf/export fixes #1

Closed · wants to merge 43 commits
3b87f88
per-micro-batch input loader (#5635)
erhoo82 Feb 9, 2023
2cc0942
update container in readme (#5981)
fayejf Feb 10, 2023
d8f8188
Support Alignment Extraction for all RNNT Beam decoding methods (#5925)
titu1994 Feb 10, 2023
798ed0f
Add AWS SageMaker ASR Examples (#5638)
SeanNaren Feb 10, 2023
c7467be
Update PUBLICATIONS.md (#5963)
titu1994 Feb 10, 2023
fe36f2b
[G2P] fixed typos and broken import library. (#5978) (#5979)
github-actions[bot] Feb 10, 2023
90ca7b1
[G2P] added backward compatibility for english tokenizer and fixed un…
github-actions[bot] Feb 10, 2023
73fd284
removed WHATEVER(1) ˌhwʌˈtɛvɚ from scripts/tts_dataset_files/ipa_cmu…
MikyasDesta Feb 10, 2023
4827060
ONNX export for RadTTS (#5880)
borisfom Feb 10, 2023
bfd371d
replace symbols (#5974) (#5990)
github-actions[bot] Feb 10, 2023
349b095
Add some info about FastPitch SSL model (#5994)
redoctopus Feb 11, 2023
8267971
fast conformer configs and doc (#5970) (#5997)
github-actions[bot] Feb 12, 2023
63f6d44
Fix Silence&Overlap Sampling Algorithm for ASR Multi-speaker Data Sim…
stevehuang52 Feb 12, 2023
d0ac3e0
import missing dependency - gc (#6001)
EduardMaghakyan Feb 13, 2023
6c22356
Fix hybrid transcribe (#6003)
ArtyomZemlyak Feb 13, 2023
2fe1c6b
Refactor the retrieval services for microservice architecture (#5910)
yidong72 Feb 13, 2023
99652c7
make validation accuracy reporting optional for adapters/ptuning (#5843)
arendu Feb 13, 2023
c640325
Use module-based k2 import guard (#6006)
artbataev Feb 13, 2023
a7c2a04
Vits doc (#5989)
treacker Feb 13, 2023
c93c5a5
Fix Prompt text space issue (#5983) (#5993)
github-actions[bot] Feb 14, 2023
cbdff07
Ragged batching changes for RadTTS, some refactoring (#6020)
borisfom Feb 14, 2023
3e74995
Default RNNT loss to int64 targets (#6011)
titu1994 Feb 14, 2023
57bd9d5
limit files for cut_audio.py (#6009) (#6018)
github-actions[bot] Feb 15, 2023
aac473a
Fix reinstall.sh dependencies (#6027)
titu1994 Feb 15, 2023
49d16f9
Added documentation section for ASR datasets from AIStore (#6008)
anteju Feb 15, 2023
b15fb20
Squashed commit of the following: (#6030)
borisfom Feb 15, 2023
3a86da6
Quick Fix for RadTTS test (#6034)
blisc Feb 15, 2023
189d39b
correct bash style according to SC2236. (#6025)
XuesongYang Feb 15, 2023
850f5ce
Fix bug where GPT always enabled distopt overlapped param sync (#5995)
timmoon10 Feb 15, 2023
d4bea87
Add BERT support for overlapping forward compute with distopt communi…
timmoon10 Feb 15, 2023
5b71d4d
[TTS/TN/G2P] Remove Text Processing from NeMo, move G2P to TTS (#5982)
ekmb Feb 15, 2023
ccaf908
adding early stop callback to ptuning (#6028)
arendu Feb 16, 2023
ccfca84
Disabling radtts tests untin we have real model (#6036)
borisfom Feb 16, 2023
e23d62b
[TTS] Add Spanish IPA dictionaries and heteronyms (#6037)
rlangman Feb 16, 2023
4f06f34
Pr doc tn (#6041)
yzhang123 Feb 17, 2023
cdbb924
Update align.py (#6043) (#6045)
github-actions[bot] Feb 17, 2023
d0119f5
Change perturb rng for reproducing results easily (#6042)
fayejf Feb 17, 2023
cede377
fix typo in asr evaluator readme (#6053)
fayejf Feb 17, 2023
050971b
Add Customization Dataset Preparation Tool (#6029)
Zhilin123 Feb 17, 2023
83859ec
InterCTC loss and stochastic depth implementation (#6013)
Kipok Feb 18, 2023
8e6f36a
Add pyctcdecode to high level beam search API (#6026)
titu1994 Feb 18, 2023
4a56631
Adds several configurable flags for Megatron GPT models (#5991)
MaximumEntropy Feb 18, 2023
0e70135
P-tuning refactor Part 1/N (#6054)
arendu Feb 18, 2023
4 changes: 4 additions & 0 deletions .gitignore
@@ -94,6 +94,10 @@ target/
# Jupyter Notebook
.ipynb_checkpoints

# Override Jupyter in Github Language states for more accurate estimate of repo code.
# Reference: https://github.com/github/linguist/blob/master/docs/overrides.md#generated-code
*.ipynb linguist-generated

# IPython
profile_default/
ipython_config.py
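A side note on the `.gitignore` hunk above: GitHub Linguist reads `linguist-generated` overrides from a `.gitattributes` file rather than `.gitignore`, so the intended language-statistics effect presumably requires an entry like the following (a sketch, assuming the standard Linguist override syntax from the referenced docs):

```text
# .gitattributes — mark exported notebooks as generated so Linguist
# excludes them from the repository language statistics
*.ipynb linguist-generated
```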
7 changes: 1 addition & 6 deletions Dockerfile
@@ -54,10 +54,6 @@ WORKDIR /tmp/nemo
COPY requirements .
RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --no-cache-dir -r $f; done

# install pynini
COPY nemo_text_processing/install_pynini.sh /tmp/nemo/
RUN /bin/bash /tmp/nemo/install_pynini.sh

# install k2, skip if installation fails
COPY scripts /tmp/nemo/scripts/
RUN /bin/bash /tmp/nemo/scripts/speech_recognition/k2/setup.sh || exit 0
@@ -81,8 +77,7 @@ RUN --mount=from=nemo-src,target=/tmp/nemo cd /tmp/nemo && pip install ".[all]"

# Check install
RUN python -c "import nemo.collections.nlp as nemo_nlp" && \
python -c "import nemo.collections.tts as nemo_tts" && \
python -c "import nemo_text_processing.text_normalization as text_normalization"
python -c "import nemo.collections.tts as nemo_tts"

# TODO: Update to newer numba 0.56.0RC1 for 22.03 container if possible
# install pinned numba version
120 changes: 29 additions & 91 deletions Jenkinsfile
@@ -1,7 +1,7 @@
pipeline {
agent {
docker {
image 'nvcr.io/nvidia/pytorch:23.01-py3'
image 'nemo_containers:23.01_apex_c3d575f2478cd379b3c2d81f41edde39791b5d92'
args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g'
}
}
@@ -102,92 +102,6 @@ pipeline {
}
}


stage('L0: TN/ITN Tests CPU') {
when {
anyOf {
branch 'main'
changeRequest target: 'main'
}
}
failFast true
parallel {
stage('En TN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
}
}
stage('En ITN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
}
}
stage('Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
}
}
stage('Test En Hybrid TN') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/hybrid/wfst_lm_rescoring.py --data /home/TestData/nlp/text_norm/hybrid_tn/test.txt --regenerate_pkl --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 | grep "all_correct: True" || exit 1'
}
}
}
}

stage('L2: NeMo text processing') {
when {
anyOf {
branch 'main'
changeRequest target: 'main'
}
}
failFast true
parallel {
stage('L2: Eng TN') {
steps {
sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
sh 'cd nemo_text_processing/text_normalization/ && python normalize.py --input_file=/home/TestData/nlp/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output_file=/home/TestData/nlp/text_norm/output/test.pynini.txt --verbose'
sh 'cat /home/TestData/nlp/text_norm/output/test.pynini.txt'
sh 'cmp --silent /home/TestData/nlp/text_norm/output/test.pynini.txt /home/TestData/nlp/text_norm/ci/test_goal_py_05-25.txt || exit 1'
sh 'rm -rf /home/TestData/nlp/text_norm/output/*'
}
}

stage('L2: Eng ITN export') {
steps {
sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
sh 'cd nemo_text_processing/inverse_text_normalization/ && python inverse_normalize.py --input_file=/home/TestData/nlp/text_denorm/ci/test.txt --language=en --output_file=/home/TestData/nlp/text_denorm/output/test.pynini.txt --verbose'
sh 'cmp --silent /home/TestData/nlp/text_denorm/output/test.pynini.txt /home/TestData/nlp/text_denorm/ci/test_goal_py.txt || exit 1'
sh 'rm -rf /home/TestData/nlp/text_denorm/output/*'
}
}
stage('L2: TN with Audio (audio and raw text)') {
steps {
sh 'cd nemo_text_processing/text_normalization && \
python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --text "The total amounts to \\$4.76." \
--audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_raw.txt 2>&1 && \
cmp --silent /tmp/out_raw.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
}
}
stage('L2: TN with Audio (audio and text file)') {
steps {
sh 'cd nemo_text_processing/text_normalization && \
python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
--audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_file.txt 2>&1 && \
cmp --silent /tmp/out_file.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
}
}
stage('L2: TN with Audio (manifest)') {
steps {
sh 'cd nemo_text_processing/text_normalization && \
python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
}
}
}
}

stage('L2: ASR dev run') {
when {
anyOf {
@@ -1073,7 +987,7 @@ pipeline {
parallel {
stage('G2P Conformer training, evaluation and inference') {
steps {
sh 'cd examples/text_processing/g2p && \
sh 'cd examples/tts/g2p && \
TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_CONFORMER=output_ctc_${TIME} && \
python g2p_train_and_evaluate.py \
train_manifest=/home/TestData/g2p/g2p.json \
@@ -1097,7 +1011,7 @@
}
stage('ByT5G2P training, evaluation and inference') {
steps {
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/text_processing/g2p && \
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/tts/g2p && \
TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_T5=output_byt5_${TIME} && \
python g2p_train_and_evaluate.py \
train_manifest=/home/TestData/g2p/g2p.json \
@@ -1119,7 +1033,7 @@
}
stage('HeteronymClassificationModel training, evaluation and inference') {
steps {
sh 'cd examples/text_processing/g2p && \
sh 'cd examples/tts/g2p && \
TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR=output_${TIME} && \
python heteronym_classification_train_and_evaluate.py \
train_manifest=/home/TestData/g2p/manifest.json \
@@ -3252,6 +3166,12 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.position_embedding_type=rope \
model.rotary_percentage=0.5 \
model.normalization=rmsnorm \
model.bias=False \
model.bias_activation_fusion=False \
model.bias_dropout_add_fusion=False \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
@@ -3282,6 +3202,12 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.position_embedding_type=rope \
model.rotary_percentage=0.5 \
model.normalization=rmsnorm \
model.bias=False \
model.bias_activation_fusion=False \
model.bias_dropout_add_fusion=False \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
@@ -3323,6 +3249,12 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.activation=swiglu \
model.bias_activation_fusion=False \
model.hidden_dropout=0.0 \
model.attention_dropout=0.0 \
model.transformer_block_type=normformer \
model.headscale=True \
model.data.seq_length=128 \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
@@ -3353,6 +3285,12 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.activation=swiglu \
model.bias_activation_fusion=False \
model.hidden_dropout=0.0 \
model.attention_dropout=0.0 \
model.transformer_block_type=normformer \
model.headscale=True \
model.data.seq_length=128 \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
@@ -4509,4 +4447,4 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
cleanWs()
}
}
}
}
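The Megatron GPT CI configs added in this Jenkinsfile toggle several architecture flags (`model.position_embedding_type=rope`, `model.normalization=rmsnorm`, `model.activation=swiglu`, plus bias-free fusion settings). As a rough illustration of what the normalization and activation flags select, here is a minimal NumPy sketch of the textbook RMSNorm and SwiGLU definitions; it is an assumption-level sketch of the standard formulations, not NeMo's actual implementation:

```python
import numpy as np

def rms_norm(x, weight, eps=1e-5):
    # RMSNorm (model.normalization=rmsnorm): scale by the root-mean-square
    # of the features; no mean subtraction and no bias term.
    rms = np.sqrt(np.mean(x * x, axis=-1, keepdims=True) + eps)
    return x / rms * weight

def swiglu(x, w_gate, w_up):
    # SwiGLU (model.activation=swiglu): two parallel projections, one passed
    # through SiLU and used to gate the other elementwise.
    gate = x @ w_gate
    silu = gate / (1.0 + np.exp(-gate))  # silu(z) = z * sigmoid(z)
    return silu * (x @ w_up)

rng = np.random.default_rng(0)
h = rng.standard_normal((2, 8))
y = rms_norm(h, np.ones(8))
print(y.shape)  # (2, 8): per-row RMS of y is ~1 after normalization
```

The `model.rotary_percentage=0.5` flag similarly means rotary position embeddings are applied to only a fraction (here half) of each attention head's dimensions, with the remainder left unrotated.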
63 changes: 61 additions & 2 deletions PUBLICATIONS.md
@@ -6,6 +6,21 @@ Here, we list a collection of research articles that utilize the NeMo Toolkit. I

# Automatic Speech Recognition (ASR)

<details>
<summary>2023</summary>

* [Fast Entropy-Based Methods of Word-Level Confidence Estimation for End-to-End Automatic Speech Recognition](https://ieeexplore.ieee.org/abstract/document/10022960)
* [Damage Control During Domain Adaptation for Transducer Based Automatic Speech Recognition](https://ieeexplore.ieee.org/abstract/document/10023219)

</details>

<details>
<summary>2022</summary>

* [Multi-blank Transducers for Speech Recognition](https://arxiv.org/abs/2211.03541)

</details>

<details>
<summary>2021</summary>

@@ -44,9 +59,9 @@ Here, we list a collection of research articles that utilize the NeMo Toolkit. I
## Speaker Recognition (SpkR)

<details>
<summary>2021</summary>
<summary>2022</summary>

* [TitaNet: Neural Model for speaker representation with 1D Depth-wise separable convolutions and global context]( https://arxiv.org/pdf/2110.04410.pdf)
* [TitaNet: Neural Model for Speaker Representation with 1D Depth-Wise Separable Convolutions and Global Context](https://ieeexplore.ieee.org/abstract/document/9746806)

</details>

@@ -62,6 +77,15 @@ Here, we list a collection of research articles that utilize the NeMo Toolkit. I

## Speech Classification

<details>
<summary>2022</summary>

* [AmberNet: A Compact End-to-End Model for Spoken Language Identification](https://arxiv.org/abs/2210.15781)
* [Accidental Learners: Spoken Language Identification in Multilingual Self-Supervised Models](https://arxiv.org/abs/2211.05103)


</details>

<details>
<summary>2021</summary>

@@ -78,12 +102,32 @@ Here, we list a collection of research articles that utilize the NeMo Toolkit. I
</details>


--------

## Speech Translation

<details>
<summary>2022</summary>

* [NVIDIA NeMo Offline Speech Translation Systems for IWSLT 2022](https://aclanthology.org/2022.iwslt-1.18/)

</details>


--------

# Natural Language Processing (NLP)

## Language Modeling

<details>
<summary>2022</summary>

* [Evaluating Parameter Efficient Learning for Generation](https://arxiv.org/abs/2210.13673)
* [Text Mining Drug/Chemical-Protein Interactions using an Ensemble of BERT and T5 Based Models](https://arxiv.org/abs/2111.15617)

</details>

<details>
<summary>2021</summary>

Expand All @@ -93,6 +137,13 @@ Here, we list a collection of research articles that utilize the NeMo Toolkit. I

## Neural Machine Translation

<details>
<summary>2022</summary>

* [Finding the Right Recipe for Low Resource Domain Adaptation in Neural Machine Translation](https://arxiv.org/abs/2206.01137)

</details>

<details>
<summary>2021</summary>

@@ -122,6 +173,13 @@ Here, we list a collection of research articles that utilize the NeMo Toolkit. I

# Text To Speech (TTS)

<details>
<summary>2022</summary>

* [Adapter-Based Extension of Multi-Speaker Text-to-Speech Model for New Speakers](https://arxiv.org/abs/2211.00585)

</details>

<details>
<summary>2021</summary>

@@ -140,6 +198,7 @@ Here, we list a collection of research articles that utilize the NeMo Toolkit. I
<summary>2022</summary>

* [Shallow Fusion of Weighted Finite-State Transducer and Language Model for Text Normalization](https://arxiv.org/abs/2203.15917)
* [Thutmose Tagger: Single-pass neural model for Inverse Text Normalization](https://arxiv.org/abs/2208.00064)

</details>

12 changes: 4 additions & 8 deletions README.rst
@@ -63,7 +63,7 @@ Key Features
* Speech processing
* `HuggingFace Space for Audio Transcription (File, Microphone and YouTube) <https://huggingface.co/spaces/smajumdar/nemo_multilingual_language_id>`_
* `Automatic Speech Recognition (ASR) <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/intro.html>`_
* Supported models: Jasper, QuartzNet, CitriNet, Conformer-CTC, Conformer-Transducer, Squeezeformer-CTC, Squeezeformer-Transducer, ContextNet, LSTM-Transducer (RNNT), LSTM-CTC, ...
* Supported models: Jasper, QuartzNet, CitriNet, Conformer-CTC, Conformer-Transducer, Squeezeformer-CTC, Squeezeformer-Transducer, ContextNet, LSTM-Transducer (RNNT), LSTM-CTC, FastConformer-CTC, FastConformer-Transducer...
* Supports CTC and Transducer/RNNT losses/decoders
* NeMo Original `Multi-blank Transducers <https://arxiv.org/abs/2211.03541>`_
* Beam Search decoding
@@ -243,21 +243,17 @@ Transformer Engine enables FP8 training on NVIDIA Hopper GPUs.

NeMo Text Processing
~~~~~~~~~~~~~~~~~~~~
NeMo Text Processing, specifically (Inverse) Text Normalization, requires `Pynini <https://pypi.org/project/pynini/>`_ to be installed.

.. code-block:: bash

bash NeMo/nemo_text_processing/install_pynini.sh
NeMo Text Processing, specifically (Inverse) Text Normalization, is now a separate repository `https://github.com/NVIDIA/NeMo-text-processing <https://github.com/NVIDIA/NeMo-text-processing>`_.

Docker containers:
~~~~~~~~~~~~~~~~~~
We release NeMo containers alongside NeMo releases. For example, NeMo ``r1.14.0`` comes with container ``nemo:22.11``, you may find more details about released containers in `releases page <https://github.com/NVIDIA/NeMo/releases>`_.
We release NeMo containers alongside NeMo releases. For example, NeMo ``r1.15.0`` comes with container ``nemo:22.12``, you may find more details about released containers in `releases page <https://github.com/NVIDIA/NeMo/releases>`_.

To use built container, please run

.. code-block:: bash

docker pull nvcr.io/nvidia/nemo:22.11
docker pull nvcr.io/nvidia/nemo:22.12

To build a nemo container with Dockerfile from a branch, please run

4 changes: 4 additions & 0 deletions docs/source/asr/api.rst
@@ -113,6 +113,10 @@ Mixins
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.mixins.interctc_mixin.InterCTCMixin
:show-inheritance:
:members:

Datasets
--------
