Merge branch 'main' into dialogue_state_tracking_refactor

NVIDIA · Jan 31, 2022 · deeeaec · deeeaec
2 parents 5f6dbd9 + ddcc2a6
commit deeeaec
Show file tree

Hide file tree

Showing 86 changed files with 1,047 additions and 273 deletions.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,20 @@
+# What does this PR do?
+
+Please have one of the following:
+
+* Fixes # (issue)
+* New feature: (short feature description)
+
+## Before your PR is "Ready for review"
+- [ ] Make sure you read and followed [Contributor guidelines](https://github.com/NVIDIA/NeMo/blob/main/CONTRIBUTING.md)
+- [ ] Did you write any new necessary tests?
+- [ ] Did you add or update any necessary documentation?
+
+
+If you haven't finished some of the above items, you can still open a "Draft" PR.
+
+
+## Who can review?
+
+Anyone in the community is free to review the PR once the checks have passed. 
+[Contributor guidelines](https://github.com/NVIDIA/NeMo/blob/main/CONTRIBUTING.md) lists specific people who can review PRs in various areas.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -29,7 +29,7 @@ pytest --with_downloads
 1. For changes to NeMo's core: @ericharper, @titu1994, @blisc, or @okuchaiev  
 1. For changes to NeMo's ASR collection: @titu1994, @redoctopus, @jbalam-nv, or @okuchaiev
 1. For changes to NeMo's NLP collection: @MaximumEntropy, @ericharper, @ekmb, @yzhang123, @VahidooX, @vladgets, or @okuchaiev 
-1. For changes to NeMo's TTS collection: @blisc or @stasbel, or @okuchaiev
+1. For changes to NeMo's TTS collection: @blisc, @Oktai15, or @okuchaiev
 
 Note that some people may self-assign to review your PR - in which case, please wait for them to add a review.
 

diff --git a/Jenkinsfile b/Jenkinsfile
@@ -111,12 +111,12 @@ pipeline {
       parallel {
         stage('En TN grammars') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py "1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/12-15'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py "1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/01-27.1'
           }
         }
         stage('En ITN grammars') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en "twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/12-15'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en "twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/01-27.1'
           }
         }
         stage('German ITN and non-deterministic TN') {
@@ -131,8 +131,8 @@ pipeline {
         }
         stage('Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/12-15'
-            sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/12-15'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/01-27.1'
+            sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/01-27.1'
           }
         }
         stage('Run Ru ITN and non-deterministic TN & Run all Ru ITN tests') {
@@ -1322,8 +1322,33 @@ pipeline {
       parallel {
         stage('L2: NMT Training Post-LN') {
             steps {
-              sh 'cd examples/nlp/machine_translation && \
-              python enc_dec_nmt.py \
+              sh 'python examples/nlp/machine_translation/enc_dec_nmt.py \
+              --config-path=conf \
+              --config-name=aayn_base \
+              do_testing=false \
+              model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
+              model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
+              model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
+              model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
+              model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
+              model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
+              model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
+              model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
+              model.encoder.num_layers=1 \
+              model.encoder.hidden_size=64 \
+              model.encoder.inner_size=256 \
+              model.decoder.num_layers=1 \
+              model.decoder.hidden_size=64 \
+              model.decoder.inner_size=256 \
+              trainer.gpus=[0] \
+              +trainer.val_check_interval=2 \
+              +trainer.limit_val_batches=1 \
+              +trainer.max_steps=2 \
+              trainer.precision=16 \
+              +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \
+              +exp_manager.create_checkpoint_callback=true \
+              '
+              sh 'python examples/nlp/machine_translation/enc_dec_nmt.py \
               --config-path=conf \
               --config-name=aayn_base \
               do_testing=true \
@@ -1335,10 +1360,20 @@ pipeline {
               model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
               model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
               model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
+              model.encoder.num_layers=1 \
+              model.encoder.hidden_size=64 \
+              model.encoder.inner_size=256 \
+              model.decoder.num_layers=1 \
+              model.decoder.hidden_size=64 \
+              model.decoder.inner_size=256 \
               trainer.gpus=[0] \
-              +trainer.fast_dev_run=true \
-              +trainer.limit_test_batches=2 \
-              exp_manager=null \
+              +trainer.val_check_interval=2 \
+              +trainer.limit_val_batches=1 \
+              +trainer.limit_test_batches=1 \
+              +trainer.max_steps=4 \
+              +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \
+              +exp_manager.create_checkpoint_callback=true \
+              +exp_manager.resume_if_exists=True \
               '
             }
         }
@@ -1501,40 +1536,6 @@ pipeline {
       }
     }
 
-    // TODO: add when megatron bert is supported again in NeMo
-    // stage('L2: NMT Megatron BERT Model Parallel Size 2 Encoder') {
-    //   when {
-    //     anyOf{
-    //       branch 'main'
-    //       changeRequest target: 'main'
-    //     }
-    //   }
-    //   failFast true
-    //   steps{
-    //     sh 'cd examples/nlp/machine_translation && \
-    //     python enc_dec_nmt.py \
-    //     --config-path=conf \
-    //     --config-name=megatron \
-    //     model.encoder.model_name=megatron-bert-uncased \
-    //     model.encoder.checkpoint_file=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \
-    //     model.encoder.hidden_size=1024 \
-    //     model.encoder.num_attention_heads=16 \
-    //     model.encoder.num_layers=24 \
-    //     model.encoder.max_position_embeddings=512 \
-    //     model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
-    //     model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
-    //     model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
-    //     model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
-    //     model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
-    //     model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
-    //     model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
-    //     model.decoder.hidden_size=1024 \
-    //     trainer.gpus=[0,1] \
-    //     +trainer.fast_dev_run=true \
-    //     exp_manager=null \
-    //     '
-    //   }
-    // }
 
     stage('L2: NMT Tarred Dataset Creation') {
       when {

diff --git a/README.rst b/README.rst
@@ -51,8 +51,8 @@ Key Features
         * `Language Modelling for ASR <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/asr_language_modeling.html>`_: N-gram LM in fusion with Beam Search decoding, Neural Rescoring with Transformer
     * `Speech Classification and Speech Command Recognition <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speech_classification/intro.html>`_: MatchboxNet (Command Recognition)
     * `Voice activity Detection (VAD) <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/speech_classification/models.html#marblenet-vad>`_: MarbleNet
-    * `Speaker Recognition <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speaker_recognition/intro.html>`_: SpeakerNet, ECAPA_TDNN
-    * `Speaker Diarization <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speaker_diarization/intro.html>`_: SpeakerNet, ECAPA_TDNN
+    * `Speaker Recognition <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speaker_recognition/intro.html>`_: TitaNet, ECAPA_TDNN, SpeakerNet
+    * `Speaker Diarization <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speaker_diarization/intro.html>`_: TitaNet, ECAPA_TDNN, SpeakerNet
     * `Pretrained models on different languages. <https://ngc.nvidia.com/catalog/collections/nvidia:nemo_asr>`_: English, Spanish, German, Russian, Chinese, French, Italian, Polish, ...
     * `NGC collection of pre-trained speech processing models. <https://ngc.nvidia.com/catalog/collections/nvidia:nemo_asr>`_
 * Natural Language Processing
@@ -69,6 +69,7 @@ Key Features
     * `Entity Linking <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/entity_linking.html>`_
     * `Dialogue State Tracking <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/sgd_qa.html>`_
     * `Neural Duplex Text Normalization <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/text_normalization.html>`_
+    * `Prompt Tuning <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/megatron_finetuning.html#prompt-tuning>`_
     * `NGC collection of pre-trained NLP models. <https://ngc.nvidia.com/catalog/collections/nvidia:nemo_nlp>`_
 * `Speech synthesis (TTS) <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/tts/intro.html#>`_
     * Spectrogram generation: Tacotron2, GlowTTS, TalkNet, FastPitch, FastSpeech2, Mixer-TTS, Mixer-TTS-X

diff --git a/docs/source/asr/configs.rst b/docs/source/asr/configs.rst
@@ -725,12 +725,14 @@ All ASR scripts support easy fine-tuning by partially/fully loading the pretrain
 2) Providing a name of a pretrained NeMo model (which will be downloaded via the cloud) (via ``init_from_pretrained_model``)
 3) Providing a path to a Pytorch Lightning checkpoint file (via ``init_from_ptl_ckpt``)
 
+There are multiple ASR subtasks inside the ``examples/asr/`` directory; substitute the ``<subtask>`` tag below with the subtask you need.
+
 Fine-tuning via a NeMo model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. code-block:: sh
 
-    python examples/asr/script_to_<script_name>.py \
+    python examples/asr/<subtask>/script_to_<script_name>.py \
         --config-path=<path to dir of configs> \
         --config-name=<name of config without .yaml> \
         model.train_ds.manifest_filepath="<path to manifest file>" \
@@ -745,7 +747,7 @@ Fine-tuning via a NeMo pretrained model name
 
 .. code-block:: sh
 
-    python examples/asr/script_to_<script_name>.py \
+    python examples/asr/<subtask>/script_to_<script_name>.py \
         --config-path=<path to dir of configs> \
         --config-name=<name of config without .yaml> \
         model.train_ds.manifest_filepath="<path to manifest file>" \
@@ -759,7 +761,7 @@ Fine-tuning via a Pytorch Lightning checkpoint
 
 .. code-block:: sh
 
-    python examples/asr/script_to_<script_name>.py \
+    python examples/asr/<subtask>/script_to_<script_name>.py \
         --config-path=<path to dir of configs> \
         --config-name=<name of config without .yaml> \
         model.train_ds.manifest_filepath="<path to manifest file>" \

diff --git a/docs/source/asr/results.rst b/docs/source/asr/results.rst
@@ -30,7 +30,7 @@ NGC Pretrained Checkpoints
 --------------------------
 
 The ASR collection has checkpoints of several models trained on various datasets for a variety of tasks. These checkpoints are 
-obtainable via NGC `NeMo Automatic Speech Recognition collection <https://ngc.nvidia.com/catalog/models/nvidia:nemospeechmodels>`_.
+obtainable via NGC `NeMo Automatic Speech Recognition collection <https://catalog.ngc.nvidia.com/orgs/nvidia/collections/nemo_asr>`_.
 The model cards on NGC contain more information about each of the checkpoints available.
 
 The tables below list the ASR models available from NGC. The models can be accessed via the :code:`from_pretrained()` method inside

diff --git a/docs/source/asr/speaker_recognition/api.rst b/docs/source/asr/speaker_recognition/api.rst
@@ -0,0 +1,11 @@
+NeMo Speaker Recognition API
+=============================
+
+
+Model Classes
+-------------
+.. autoclass:: nemo.collections.asr.models.label_models.EncDecSpeakerLabelModel
+    :show-inheritance:
+    :members: setup_finetune_model, get_embedding, verify_speakers
+
+
diff --git a/docs/source/asr/speaker_recognition/images/titanet_network.png b/docs/source/asr/speaker_recognition/images/titanet_network.png
diff --git a/docs/source/asr/speaker_recognition/intro.rst b/docs/source/asr/speaker_recognition/intro.rst
@@ -17,6 +17,7 @@ The full documentation tree:
    configs
    datasets
    results
+   api
 
 Resource and Documentation Guide
 --------------------------------

diff --git a/docs/source/asr/speaker_recognition/results.rst b/docs/source/asr/speaker_recognition/results.rst
@@ -76,6 +76,20 @@ This python call will download best pretrained model from NGC and writes embeddi
 .. code-block:: bash
   
     python examples/speaker_tasks/recognition/extract_speaker_embeddings.py --manifest=manifest.json
+  
+Speaker Verification Inference
+------------------------------
+
+Speaker Verification is a task of verifying if two utterances are from the same speaker or not.
+
+We provide a helper function that returns True if the two provided audio files are from the same speaker, and False otherwise.
+
+The audio files should be 16 kHz mono-channel WAV files.
+
+.. code-block:: python
+
+  speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_name="titanet_large")
+  decision = speaker_model.verify_speakers('path/to/one/audio_file','path/to/other/audio_file')
 
 
 NGC Pretrained Checkpoints