Merge final bugfix r1.3.0 (#2749)

* update jenkins branch
  Signed-off-by: ericharper <[email protected]>
* update notebooks branch
  Signed-off-by: ericharper <[email protected]>
* Replaced unfold() with split_view() (#2671)
* Replaced unfold() with split_view()
  Signed-off-by: Boris Fomitchev <[email protected]>
* fixed typo
  Signed-off-by: Boris Fomitchev <[email protected]>
  Co-authored-by: Somshubra Majumdar <[email protected]>
* Fix issues with ASR notebooks (#2698)
  Signed-off-by: smajumdar <[email protected]>
* Allow non divisible split_size (#2699)
* bugfix
  Signed-off-by: Jason <[email protected]>
* bugfix
  Signed-off-by: Jason <[email protected]>
* Fix the feat_out param. (#2714)
* broken link fix (#2720)
  Signed-off-by: nithinraok <[email protected]>
* rename (#2721)
  Signed-off-by: fayejf <[email protected]>
* apply fix (#2726)
  Signed-off-by: Jason <[email protected]>
* [DOCS] Updating adobe and copyright for docs (#2740)
* update
  Signed-off-by: ericharper <[email protected]>
* update
  Signed-off-by: ericharper <[email protected]>
* update
  Signed-off-by: ericharper <[email protected]>
* update
  Signed-off-by: ericharper <[email protected]>
* update
  Signed-off-by: ericharper <[email protected]>
* update
  Signed-off-by: ericharper <[email protected]>
* update notebook branch
  Signed-off-by: ericharper <[email protected]>
* update jenkins branch
  Signed-off-by: ericharper <[email protected]>
* update jenkins test to use less memory
  Signed-off-by: ericharper <[email protected]>
* update jenkins test to use less memory
  Signed-off-by: ericharper <[email protected]>

Co-authored-by: Boris Fomitchev <[email protected]>
Co-authored-by: Somshubra Majumdar <[email protected]>
Co-authored-by: Jason <[email protected]>
Co-authored-by: Vahid Noroozi <[email protected]>
Co-authored-by: Nithin Rao <[email protected]>
Co-authored-by: fayejf <[email protected]>
NVIDIA · Aug 31, 2021 · 234e496 · 234e496
1 parent 122ee9a
commit 234e496
Show file tree

Hide file tree

Showing 11 changed files with 136 additions and 68 deletions.
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -1246,15 +1246,20 @@ pipeline {
               model.shared_tokenizer=False \
               model.encoder_tokenizer.library=huggingface \
               model.encoder.library=huggingface \
-              model.encoder.model_name=bert-base-cased \
+              model.encoder.model_name=distilbert-base-cased \
               model.encoder.pretrained=true \
               model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
               model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
               model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
               model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
               model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
               model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
+              model.train_ds.tokens_in_batch=128 \
+              model.validation_ds.tokens_in_batch=128 \
+              model.test_ds.tokens_in_batch=128 \
               model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
+              model.decoder.hidden_size=128 \
+              model.decoder.inner_size=256 \
               trainer.gpus=[0] \
               +trainer.fast_dev_run=true \
               exp_manager=null \
@@ -1275,14 +1280,19 @@ pipeline {
               model.encoder.model_name=null \
               model.encoder.pretrained=false \
               +model.encoder._target_=transformers.BertConfig \
-              +model.encoder.hidden_size=1536 \
+              +model.encoder.hidden_size=48 \
               model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
               model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
               model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
               model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
               model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
               model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
+              model.train_ds.tokens_in_batch=128 \
+              model.validation_ds.tokens_in_batch=128 \
+              model.test_ds.tokens_in_batch=128 \
               model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
+              model.decoder.hidden_size=128 \
+              model.decoder.inner_size=256 \
               trainer.gpus=[1] \
               +trainer.fast_dev_run=true \
               exp_manager=null \

diff --git a/docs/source/_templates/layouts.html b/docs/source/_templates/layouts.html
@@ -0,0 +1,13 @@
+{% extends "!layout.html" %}
+
+{% block extrahead %}
+
+<script src="//assets.adobedtm.com/5d4962a43b79/c1061d2c5e7b/launch-191c2462b890.min.js"></script>
+
+{% endblock %}
+
+{% block footer %}
+
+<script type="text/javascript">_satellite.pageBottom();</script>
+
+{% endblock %}
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -138,7 +138,7 @@
 
 # General information about the project.
 project = "NVIDIA NeMo"
-copyright = "2021-, NVIDIA CORPORATION"
+copyright = "© 2020-2021 NVIDIA Corporation & Affiliates. All rights reserved."
 author = "NVIDIA CORPORATION"
 
 # The version info for the project you're documenting, acts as replacement for

diff --git a/nemo/collections/asr/modules/conformer_encoder.py b/nemo/collections/asr/modules/conformer_encoder.py
@@ -154,8 +154,8 @@ def __init__(
             )
             self._feat_out = d_model
         else:
-            self._feat_out = d_model
             self.pre_encode = nn.Linear(feat_in, d_model)
+            self._feat_out = d_model
 
         if not untie_biases and self_attention_model == "rel_pos":
             d_head = d_model // n_heads
@@ -199,8 +199,8 @@ def __init__(
             )
             self.layers.append(layer)
 
-        if feat_out > 0 and feat_out != self.output_dim:
-            self.out_proj = nn.Linear(self.feat_out, feat_out)
+        if feat_out > 0 and feat_out != self._feat_out:
+            self.out_proj = nn.Linear(self._feat_out, feat_out)
             self._feat_out = feat_out
         else:
             self.out_proj = None

diff --git a/nemo/collections/tts/models/talknet.py b/nemo/collections/tts/models/talknet.py
@@ -370,19 +370,23 @@ def list_available_models(cls) -> 'List[PretrainedModelInfo]':
         Returns:
             List of available pre-trained models.
         """
-        list_of_models = []
-        model = PretrainedModelInfo(
-            pretrained_model_name="tts_en_talknet",
-            location=(
-                "https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_talknet/versions/1.0.0rc1/files"
-                "/talknet_spect.nemo"
-            ),
-            description=(
-                "This model is trained on LJSpeech sampled at 22050Hz, and can be used to generate female "
-                "English voices with an American accent."
-            ),
-            class_=cls,  # noqa
-            aliases=["TalkNet-22050Hz"],
-        )
-        list_of_models.append(model)
-        return list_of_models
+        # list_of_models = []
+        # model = PretrainedModelInfo(
+        #     pretrained_model_name="tts_en_talknet",
+        #     location=(
+        #         "https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_talknet/versions/1.0.0rc1/files"
+        #         "/talknet_spect.nemo"
+        #     ),
+        #     description=(
+        #         "This model is trained on LJSpeech sampled at 22050Hz, and can be used to generate female "
+        #         "English voices with an American accent."
+        #     ),
+        #     class_=cls,  # noqa
+        #     aliases=["TalkNet-22050Hz"],
+        # )
+        # list_of_models.append(model)
+        # return list_of_models
+
+        # NOTE: TalkNet loading is currently broken in main and newer. Please revert to r1.2.0 if interested in
+        # TalkNet.
+        pass
diff --git a/tutorials/asr/Intro_to_Transducers.ipynb b/tutorials/asr/Intro_to_Transducers.ipynb
@@ -553,7 +553,7 @@
         "\n",
         "The Joint model config has several essential components which we discuss below :\n",
         "\n",
-        "1) `log_softmax`: Due to the cost of computing softmax on such large tensors, the Numba CUDA implementation of RNNT loss will implicitly compute the log softmax when called (so its inputs should be logits). The CPU version of the loss doesn't face such memory issues so it requires log-probabilities instead. Since the behaviour is different for CPU-GPU, the `null` value will automatically switch behaviour dependent on whether the input tensor is on a CPU or GPU device.\n",
+        "1) `log_softmax`: Due to the cost of computing softmax on such large tensors, the Numba CUDA implementation of RNNT loss will implicitly compute the log softmax when called (so its inputs should be logits). The CPU version of the loss doesn't face such memory issues so it requires log-probabilities instead. Since the behaviour is different for CPU-GPU, the `None` value will automatically switch behaviour dependent on whether the input tensor is on a CPU or GPU device.\n",
         "\n",
         "2) `preserve_memory`: This flag will call `torch.cuda.empty_cache()` at certain critical sections when computing the Joint tensor. While this operation might allow us to preserve some memory, the empty_cache() operation is tremendously slow and will slow down training by an order of magnitude or more. It is available to use but not recommended.\n",
         "\n",

diff --git a/tutorials/asr/Online_ASR_Microphone_Demo.ipynb b/tutorials/asr/Online_ASR_Microphone_Demo.ipynb
@@ -43,7 +43,7 @@
     "This notebook demonstrates automatic speech recognition (ASR) from a microphone's stream in NeMo.\n",
     "\n",
     "It is **not a recommended** way to do inference in production workflows. If you are interested in \n",
-    "production-level inference using NeMo ASR models, please sign-up to Jarvis early access program: https://developer.nvidia.com/nvidia-jarvis"
+    "production-level inference using NeMo ASR models, please refer to NVIDIA RIVA: https://developer.nvidia.com/riva"
    ]
   },
   {
@@ -537,4 +537,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
diff --git a/tutorials/nlp/Non_English_Downstream_Tasks_(NER).ipynb b/tutorials/nlp/Non_English_Downstream_Tasks_(NER).ipynb
@@ -877,4 +877,4 @@
       ]
     }
   ]
-}
+}
diff --git a/tutorials/speaker_recognition/Speaker_Diarization_Inference.ipynb b/tutorials/speaker_recognition/Speaker_Diarization_Inference.ipynb
@@ -475,7 +475,7 @@
    "source": [
     "To generate VAD predicted time step. We perform VAD inference to have frame level prediction &#8594; (optional: use decision smoothing) &#8594; given `threshold`,  write speech segment to RTTM-like time stamps manifest.\n",
     "\n",
-    "we use vad decision smoothing (87.5% overlap median) as described [here](https://github.com/NVIDIA/NeMo/blob/stable/nemo/collections/asr/parts/vad_utils.py)\n",
+    "we use vad decision smoothing (87.5% overlap median) as described [here](https://github.com/NVIDIA/NeMo/blob/stable/nemo/collections/asr/parts/utils/vad_utils.py)\n",
     "\n",
     "you can also tune the threshold on your dev set. Use this provided [script](https://github.com/NVIDIA/NeMo/blob/stable/scripts/voice_activity_detection/vad_tune_threshold.py)"
    ]

diff --git a/tutorials/tts/1_Inference_ModelSelect.ipynb b/tutorials/tts/1_Inference_ModelSelect.ipynb
@@ -69,6 +69,7 @@
     "- [Tacotron 2](https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_tacotron2)\n",
     "- [Glow-TTS](https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_glowtts)\n",
     "- [TalkNet](https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_talknet)\n",
+    "  - <span style=\"color:red\"> NOTE: TalkNet loading is not working in main. Please use r1.2.0 for TalkNet inference </span>\n",
     "- [FastPitch](https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_fastpitch)\n",
     "- [FastSpeech2](https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_fastspeech_2)\n",
     "\n",
@@ -93,7 +94,8 @@
     "from IPython.display import display\n",
     "\n",
     "supported_e2e = [\"fastpitch_hifigan\", \"fastspeech2_hifigan\", None]\n",
-    "supported_spec_gen = [\"tacotron2\", \"glow_tts\", \"talknet\", \"fastpitch\", \"fastspeech2\", None]\n",
+    "# supported_spec_gen = [\"tacotron2\", \"glow_tts\", \"talknet\", \"fastpitch\", \"fastspeech2\", None]\n",
+    "supported_spec_gen = [\"tacotron2\", \"glow_tts\", \"fastpitch\", \"fastspeech2\", None]\n",
     "supported_audio_gen = [\"waveglow\", \"squeezewave\", \"uniglow\", \"melgan\", \"hifigan\", \"griffin-lim\", None]\n",
     "\n",
     "print(\"Select the model(s) that you want to use. Please choose either 1 end-to-end model or 1 spectrogram generator and 1 vocoder.\")\n",
@@ -388,7 +390,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.8"
+   "version": "3.8.10"
   }
  },
  "nbformat": 4,