Merge branch 'main' into streaming-conformer

NVIDIA · Mar 7, 2023 · 7630c8e · 7630c8e
2 parents fb2c070 + e6c51d3
commit 7630c8e
Show file tree

Hide file tree

Showing 28 changed files with 20,780 additions and 9,409 deletions.
diff --git a/.readthedocs.yml b/.readthedocs.yml
@@ -26,6 +26,6 @@ sphinx:
 
 # Set the version of Python and requirements required to build your docs
 python:
-  version: 3.7
+  version: 3.8
   install:
     - requirements: requirements/requirements_docs.txt
diff --git a/Dockerfile b/Dockerfile
@@ -21,15 +21,23 @@ ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.01-py3
 # image (by specifying build target as `nemo-deps`)
 FROM ${BASE_IMAGE} as nemo-deps
 
+# dependency flags; should be declared after FROM
+# torchaudio: not required by default
+ARG REQUIRE_TORCHAUDIO=false
+# k2: not required by default
+ARG REQUIRE_K2=false
+
 # Ensure apt-get won't prompt for selecting options
 ENV DEBIAN_FRONTEND=noninteractive
+# libavdevice-dev required for latest torchaudio
 RUN apt-get update && \
     apt-get upgrade -y && \
     apt-get install -y \
     libsndfile1 sox \
     libfreetype6 \
     swig \
-    ffmpeg && \
+    ffmpeg \
+    libavdevice-dev && \
     rm -rf /var/lib/apt/lists/*
 
 WORKDIR /tmp/
@@ -47,7 +55,14 @@ RUN pip3 uninstall -y sacrebleu torchtext
 # build torchaudio
 WORKDIR /tmp/torchaudio_build
 COPY scripts/installers /tmp/torchaudio_build/scripts/installers/
-RUN /bin/bash /tmp/torchaudio_build/scripts/installers/install_torchaudio_latest.sh
+RUN INSTALL_MSG=$(/bin/bash /tmp/torchaudio_build/scripts/installers/install_torchaudio_latest.sh); INSTALL_CODE=$?; \
+    echo "${INSTALL_MSG}"; \
+    if [ "${INSTALL_CODE}" -ne 0 ]; then \
+      echo "torchaudio installation failed"; \
+      if [ "${REQUIRE_TORCHAUDIO}" = true ]; then \
+        exit "${INSTALL_CODE}"; \
+      else echo "Skipping failed torchaudio installation"; fi \
+    else echo "torchaudio installed successfully"; fi
 
 # install nemo dependencies
 WORKDIR /tmp/nemo
@@ -56,7 +71,14 @@ RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-chec
 
 # install k2, skip if installation fails
 COPY scripts /tmp/nemo/scripts/
-RUN /bin/bash /tmp/nemo/scripts/speech_recognition/k2/setup.sh || exit 0
+RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/speech_recognition/k2/setup.sh); INSTALL_CODE=$?; \
+    echo "${INSTALL_MSG}"; \
+    if [ "${INSTALL_CODE}" -ne 0 ]; then \
+      echo "k2 installation failed"; \
+      if [ "${REQUIRE_K2}" = true ]; then \
+        exit "${INSTALL_CODE}"; \
+      else echo "Skipping failed k2 installation"; fi \
+    else echo "k2 installed successfully"; fi
 
 # copy nemo source into a scratch image
 FROM scratch as nemo-src

diff --git a/README.rst b/README.rst
@@ -1,5 +1,5 @@
 
-|status| |documentation| |license| |lgtm_grade| |lgtm_alerts| |black|
+|status| |documentation| |codeql| |license| |pypi| |pyversion| |downloads| |black|
 
 .. |status| image:: http://www.repostatus.org/badges/latest/active.svg
   :target: http://www.repostatus.org/#active
@@ -13,13 +13,21 @@
   :target: https://github.com/NVIDIA/NeMo/blob/master/LICENSE
   :alt: NeMo core license and license for collections in this repo
 
-.. |lgtm_grade| image:: https://img.shields.io/lgtm/grade/python/g/NVIDIA/NeMo.svg?logo=lgtm&logoWidth=18
-  :target: https://lgtm.com/projects/g/NVIDIA/NeMo/context:python
-  :alt: Language grade: Python
+.. |pypi| image:: https://badge.fury.io/py/nemo-toolkit.svg
+  :target: https://badge.fury.io/py/nemo-toolkit
+  :alt: Release version
 
-.. |lgtm_alerts| image:: https://img.shields.io/lgtm/alerts/g/NVIDIA/NeMo.svg?logo=lgtm&logoWidth=18
-  :target: https://lgtm.com/projects/g/NVIDIA/NeMo/alerts/
-  :alt: Total alerts
+.. |pyversion| image:: https://img.shields.io/pypi/pyversions/nemo-toolkit.svg
+  :target: https://badge.fury.io/py/nemo-toolkit
+  :alt: Python version
+
+.. |downloads| image:: https://static.pepy.tech/personalized-badge/nemo-toolkit?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=downloads
+  :target: https://pepy.tech/project/nemo-toolkit
+  :alt: PyPI total downloads
+
+.. |codeql| image:: https://github.com/nvidia/nemo/actions/workflows/codeql.yml/badge.svg?branch=main&event=push
+  :target: https://github.com/nvidia/nemo/actions/workflows/codeql.yml
+  :alt: CodeQL
 
 .. |black| image:: https://img.shields.io/badge/code%20style-black-000000.svg
   :target: https://github.com/psf/black
@@ -47,9 +55,9 @@ NeMo models can be optimized for inference and deployed for production use-cases
 Getting started with NeMo is simple.
 State of the Art pretrained NeMo models are freely available on `HuggingFace Hub <https://huggingface.co/models?library=nemo&sort=downloads&search=nvidia>`_ and
 `NVIDIA NGC <https://catalog.ngc.nvidia.com/models?query=nemo&orderBy=weightPopularDESC>`_.
-These models can be used to transcribe audio, synthesize speech, or translate text in a just a few lines of code.
+These models can be used to transcribe audio, synthesize speech, or translate text in just a few lines of code.
 
-We have have extensive `tutorials <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/starthere/tutorials.html>`_ that 
+We have extensive `tutorials <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/starthere/tutorials.html>`_ that 
 can all be run on `Google Colab <https://colab.research.google.com>`_.
 
 For advanced users that want to train NeMo models from scratch or finetune existing NeMo models 
@@ -184,7 +192,7 @@ Use this installation mode if you want the latest released version.
 
 Pip from source
 ~~~~~~~~~~~~~~~
-Use this installation mode if you want the a version from particular GitHub branch (e.g main).
+Use this installation mode if you want the version from a particular GitHub branch (e.g. main).
 
 .. code-block:: bash
 
@@ -273,7 +281,7 @@ If you chose to work with main branch, we recommend using NVIDIA's PyTorch conta
 Examples
 --------
 
-Many examples can be found under `"Examples" <https://github.com/NVIDIA/NeMo/tree/stable/examples>`_ folder.
+Many examples can be found under the `"Examples" <https://github.com/NVIDIA/NeMo/tree/stable/examples>`_ folder.
 
 
 Contributing

diff --git a/docs/source/tts/data/datasets.csv b/docs/source/tts/data/datasets.csv
@@ -2,7 +2,8 @@ Language,Locale,Dataset Name,#spk-total,#spk-F,#spk-M,#hours-total,#hour-F,#hour
 English,en-US,LJSpeech,1,1,0,23.92,23.92,0.00,"22,050Hz",https://keithito.com/LJ-Speech-Dataset/
 English,en-US,LibriTTS (clean),1230,592,638,262.62,133.97,128.65,"24,000Hz",https://www.openslr.org/60/
 English,en-US,HiFiTTS,10,6,4,291.60,158.30,133.30,"44,100Hz",http://www.openslr.org/109/
-German,de-DE,Thorsten Müller (German Neutral-TTS dataset),1,0,1,22.96,0.00,22.96,"22,050Hz",https://www.openslr.org/95/
+German,de-DE,Thorsten Müller Neutral 21.02 dataset,1,0,1,20.91,0.00,20.91,"22,050Hz",https://zenodo.org/record/5525342/files/thorsten-neutral_v03.tgz?download=1
+German,de-DE,Thorsten Müller Neutral 22.10 dataset,1,0,1,11.21,0.00,11.21,"22,050Hz",https://zenodo.org/record/7265581/files/ThorstenVoice-Dataset_2022.10.zip?download=1
 German,de-DE,HUI-Audio-Corpus-German (clean),118,n/a,n/a,253.85,0.00,0.00,"44,100Hz",https://opendata.iisys.de/datasets.html
 Spanish,es-AR,Crowdsourced high-quality Argentinian Spanish,44,31,13,8.03,5.61,2.42,"48,000Hz",https://www.openslr.org/61/
 Spanish,es-CL,Crowdsourced high-quality Chilean Spanish,31,13,18,7.15,2.84,4.31,"48,000Hz",https://www.openslr.org/71/

diff --git a/docs/source/tts/data/ngc_models_am.csv b/docs/source/tts/data/ngc_models_am.csv
@@ -1,4 +1,4 @@
-Locale,Model Name,Dataset,Sampling Rate,#Spk,Phoneme Unit,Model Class,Overview,Checkpoint
+Locale,Model Name,Dataset,Sampling Rate,#Spk,Symbols,Model Class,Overview,Checkpoint
 en-US,tts_en_fastpitch,LJSpeech,22050Hz,1,ARPABET,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_en_fastpitch <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_fastpitch>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_fastpitch/versions/1.8.1/files/tts_en_fastpitch_align.nemo``
 en-US,tts_en_fastpitch_ipa,LJSpeech,22050Hz,1,IPA,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_en_fastpitch <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_fastpitch>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_fastpitch/versions/IPA_1.13.0/files/tts_en_fastpitch_align_ipa.nemo``
 en-US,tts_en_fastpitch_multispeaker,HiFiTTS,44100Hz,10,ARPABET,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_en_multispeaker_fastpitchhifigan <https://ngc.nvidia.com/models/nvidia:nemo:tts_en_multispeaker_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_multispeaker_fastpitchhifigan/versions/1.10.0/files/tts_en_fastpitch_multispeaker.nemo``
@@ -7,6 +7,7 @@ en-US,tts_en_lj_mixerttsx,LJSpeech,22050Hz,1,ARPABET,nemo.collections.tts.models
 en-US,RAD-TTS,TBD,TBD,TBD,ARPABET,nemo.collections.tts.models.radtts.RadTTSModel,TBD,
 en-US,tts_en_tacotron2,LJSpeech,22050Hz,1,ARPABET,nemo.collections.tts.models.tacotron2.Tacotron2Model,`tts_en_tacotron2 <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_tacotron2>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_tacotron2/versions/1.10.0/files/tts_en_tacotron2.nemo``
 de-DE,tts_de_fastpitch_multispeaker_5,HUI Audio Corpus German,44100Hz,5,ARPABET,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_de_fastpitch_multispeaker_5 <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitch_multispeaker_5>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitch_multispeaker_5/versions/1.11.0/files/tts_de_fastpitch_multispeaker_5.nemo``
-de-DE,tts_de_fastpitch_singlespeaker,Thorsten Müller (German Neutral-TTS dataset),22050Hz,1,ARPABET,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_de_fastpitchhifigan <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.10.0/files/tts_de_fastpitch_align.nemo``
+de-DE,tts_de_fastpitch_singleSpeaker_thorstenNeutral_2102,Thorsten Müller Neutral 21.02 dataset,22050Hz,1,Graphemes,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_de_fastpitchhifigan <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.15.0/files/tts_de_fastpitch_thorstens2102.nemo``
+de-DE,tts_de_fastpitch_singleSpeaker_thorstenNeutral_2210,Thorsten Müller Neutral 22.10 dataset,22050Hz,1,Graphemes,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_de_fastpitchhifigan <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.15.0/files/tts_de_fastpitch_thorstens2210.nemo``
 es,tts_es_fastpitch_multispeaker,OpenSLR crowdsourced Latin American Spanish,44100Hz,174,IPA,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_es_multispeaker_fastpitchhifigan <https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/tts_es_multispeaker_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_es_multispeaker_fastpitchhifigan/versions/1.15.0/files/tts_es_fastpitch_multispeaker.nemo``
 zh-CN,tts_zh_fastpitch_sfspeech,SFSpeech Chinese/English Bilingual Speech,22050Hz,1,pinyin,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_zh_fastpitch_hifigan_sfspeech <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_zh_fastpitch_hifigan_sfspeech>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_zh_fastpitch_hifigan_sfspeech/versions/1.15.0/files/tts_zh_fastpitch_sfspeech.nemo``
diff --git a/docs/source/tts/data/ngc_models_vocoder.csv b/docs/source/tts/data/ngc_models_vocoder.csv
@@ -6,8 +6,8 @@ en-US,tts_en_hifitts_hifigan_ft_fastpitch,FastPitch,HiFiTTS,44100Hz,10,nemo.coll
 en-US,tts_en_lj_univnet,librosa.filters.mel,LJSpeech,22050Hz,1,nemo.collections.tts.models.univnet.UnivNetModel,`tts_en_lj_univnet <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_lj_univnet>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_lj_univnet/versions/1.7.0/files/tts_en_lj_univnet.nemo``
 en-US,tts_en_libritts_univnet,librosa.filters.mel,LibriTTS,24000Hz,1,nemo.collections.tts.models.univnet.UnivNetModel,`tts_en_libritts_univnet <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_libritts_univnet>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_libritts_univnet/versions/1.7.0/files/tts_en_libritts_multispeaker_univnet.nemo``
 en-US,tts_waveglow_88m,librosa.filters.mel,LJSpeech,22050Hz,1,nemo.collections.tts.models.waveglow.WaveGlowModel,`tts_waveglow_88m <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_waveglow_88m>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_waveglow_88m/versions/1.0.0/files/tts_waveglow.nemo``
-en-US,tts_waveglow_268m,librosa.filters.mel,LJSpeech,22050Hz,1,nemo.collections.tts.models.waveglow.WaveGlowModel,`tts_waveglow_268m <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_waveglow_268m>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_waveglow_268m/versions/1.0.0rc1/files/tts_waveglow_268m.nemo``
 de-DE,tts_de_hui_hifigan_ft_fastpitch_multispeaker_5,FastPitch,HUI Audio Corpus German,44100Hz,5,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_de_fastpitch_multispeaker_5 <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitch_multispeaker_5>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitch_multispeaker_5/versions/1.11.0/files/tts_de_hui_hifigan_ft_fastpitch_multispeaker_5.nemo``
-de-DE,tts_de_slr_hifigan_ft_fastpitch_singlespeaker,FastPitch,Thorsten Müller (German Neutral-TTS dataset),22050Hz,1,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_de_fastpitchhifigan <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.10.0/files/tts_de_hifigan.nemo``
+de-DE,tts_de_hifigan_singleSpeaker_thorstenNeutral_2102,FastPitch,Thorsten Müller Neutral 21.02 dataset,22050Hz,1,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_de_fastpitchhifigan <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.15.0/files/tts_de_hifigan_thorstens2102.nemo``
+de-DE,tts_de_hifigan_singleSpeaker_thorstenNeutral_2210,FastPitch,Thorsten Müller Neutral 22.10 dataset,22050Hz,1,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_de_fastpitchhifigan <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.15.0/files/tts_de_hifigan_thorstens2210.nemo``
 es,tts_es_hifigan_ft_fastpitch_multispeaker,FastPitch,OpenSLR crowdsourced Latin American Spanish,44100Hz,174,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_es_multispeaker_fastpitchhifigan <https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/tts_es_multispeaker_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_es_multispeaker_fastpitchhifigan/versions/1.15.0/files/tts_es_hifigan_ft_fastpitch_multispeaker.nemo``
 zh-CN,tts_zh_hifigan_sfspeech,FastPitch,SFSpeech Chinese/English Bilingual Speech,22050Hz,1,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_zh_fastpitch_hifigan_sfspeech <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_zh_fastpitch_hifigan_sfspeech>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_zh_fastpitch_hifigan_sfspeech/versions/1.15.0/files/tts_zh_hifigan_sfspeech.nemo``
diff --git a/examples/asr/speech_to_text_eval.py b/examples/asr/speech_to_text_eval.py
@@ -66,6 +66,7 @@
 from omegaconf import MISSING, OmegaConf, open_dict
 
 from nemo.collections.asr.metrics.wer import word_error_rate
+from nemo.collections.asr.parts.utils.transcribe_utils import PunctuationCapitalization
 from nemo.core.config import hydra_runner
 from nemo.utils import logging
 
@@ -80,6 +81,10 @@ class EvaluationConfig(transcribe_speech.TranscriptionConfig):
 
     only_score_manifest: bool = False
 
+    separate_punctuation: bool = True
+    do_lowercase: bool = False
+    rm_punctuation: bool = False
+
 
 @hydra_runner(config_name="EvaluationConfig", schema=EvaluationConfig)
 def main(cfg: EvaluationConfig):
@@ -123,8 +128,19 @@ def main(cfg: EvaluationConfig):
                 break
 
             ground_truth_text.append(data['text'])
+
             predicted_text.append(data['pred_text'])
 
+    pc = PunctuationCapitalization('.,?')
+    if cfg.separate_punctuation:
+        ground_truth_text = pc.separate_punctuation(ground_truth_text)
+    if cfg.do_lowercase:
+        ground_truth_text = pc.do_lowercase(ground_truth_text)
+        predicted_text = pc.do_lowercase(predicted_text)
+    if cfg.rm_punctuation:
+        ground_truth_text = pc.rm_punctuation(ground_truth_text)
+        predicted_text = pc.rm_punctuation(predicted_text)
+
     # Test for invalid manifest supplied
     if invalid_manifest:
         raise ValueError(
@@ -133,8 +149,15 @@ def main(cfg: EvaluationConfig):
         )
 
     # Compute the WER
-    metric_name = 'CER' if cfg.use_cer else 'WER'
-    metric_value = word_error_rate(hypotheses=predicted_text, references=ground_truth_text, use_cer=cfg.use_cer)
+    cer = word_error_rate(hypotheses=predicted_text, references=ground_truth_text, use_cer=True)
+    wer = word_error_rate(hypotheses=predicted_text, references=ground_truth_text, use_cer=False)
+
+    if cfg.use_cer:
+        metric_name = 'CER'
+        metric_value = cer
+    else:
+        metric_name = 'WER'
+        metric_value = wer
 
     if cfg.tolerance is not None:
         if metric_value > cfg.tolerance:
@@ -144,6 +167,8 @@ def main(cfg: EvaluationConfig):
     else:
         logging.info(f'Got {metric_name} of {metric_value}')
 
+    logging.info(f'Dataset WER/CER ' + str(round(100 * wer, 2)) + "%/" + str(round(100 * cer, 2)) + "%")
+
     # Inject the metric name and score into the config, and return the entire config
     with open_dict(cfg):
         cfg.metric_name = metric_name

diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py
@@ -36,7 +36,6 @@
 from nemo.core.config import hydra_runner
 from nemo.utils import logging
 
-
 """
 Transcribe audio file on a single CPU/GPU. Useful for transcription of moderate amounts of audio data.
 

diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -107,6 +107,7 @@ model:
   bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
   bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
   masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
+  get_attention_mask_from_fusion: False # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
 
 
   # Miscellaneous

diff --git a/examples/tts/conf/spectrogram-enhancer.yaml b/examples/tts/conf/spectrogram-enhancer.yaml
@@ -7,6 +7,7 @@ model:
   network_capacity: 16
   mixed_prob: 0.9
   fmap_max: 192
+  start_from_zero: true  # might give better results at downstream tasks 
 
   generator:
     _target_: "nemo.collections.tts.modules.spectrogram_enhancer.Generator"  

diff --git a/nemo/collections/asr/data/audio_to_text.py b/nemo/collections/asr/data/audio_to_text.py
@@ -1195,7 +1195,7 @@ def __init__(self, tokenizer):
                 self._tokenizer = tokenizer
 
             def __call__(self, *args):
-                if isinstance(args[0], Iterable) and self.is_aggregate:
+                if isinstance(args[0], List) and self.is_aggregate:
                     t = []
                     for span in args[0]:
                         t.extend(self._tokenizer.text_to_ids(span['str'], span['lang']))

diff --git a/nemo/collections/asr/models/label_models.py b/nemo/collections/asr/models/label_models.py
@@ -424,7 +424,7 @@ def infer_file(self, path2audio_file):
         audio, sr = librosa.load(path2audio_file, sr=None)
         target_sr = self._cfg.train_ds.get('sample_rate', 16000)
         if sr != target_sr:
-            audio = librosa.core.resample(audio, sr, target_sr)
+            audio = librosa.core.resample(audio, orig_sr=sr, target_sr=target_sr)
         audio_length = audio.shape[0]
         device = self.device
         audio = np.array(audio)
@@ -453,8 +453,10 @@ def get_label(self, path2audio_file):
             label: label corresponding to the trained model
         """
         _, logits = self.infer_file(path2audio_file=path2audio_file)
-        trained_labels = list(self._cfg['train_ds']['labels'])
+
+        trained_labels = self._cfg['train_ds'].get('labels', None)
         if trained_labels is not None:
+            trained_labels = list(trained_labels)
             label_id = logits.argmax(axis=1)
             label = trained_labels[int(label_id[0])]
         else: