
Commit

Merge branch 'main' into micro_bert
shanmugamr1992 authored Feb 27, 2023
2 parents 95ffbe6 + b6380f9 commit 0d06ef7
Showing 157 changed files with 2,274 additions and 2,295 deletions.
15 changes: 8 additions & 7 deletions Jenkinsfile
@@ -1035,7 +1035,7 @@ pipeline {
steps {
sh 'cd examples/tts/g2p && \
TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR=output_${TIME} && \
- python heteronym_classification_train_and_evaluate.py \
+ python g2p_heteronym_classification_train_and_evaluate.py \
train_manifest=/home/TestData/g2p/manifest.json \
validation_manifest=/home/TestData/g2p/manifest.json \
test_manifest=/home/TestData/g2p/manifest.json \
@@ -1047,7 +1047,7 @@ pipeline {
exp_manager.exp_dir=${OUTPUT_DIR} \
+exp_manager.use_datetime_version=False\
+exp_manager.version=test && \
- python heteronym_classification_inference.py \
+ python g2p_heteronym_classification_inference.py \
manifest=/home/TestData/g2p/manifest.json \
pretrained_model=${OUTPUT_DIR}/HeteronymClassification/test/checkpoints/HeteronymClassification.nemo \
output_manifest=preds.json'
@@ -3249,7 +3249,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
- model.activation=swiglu \
+ model.activation=fast-swiglu \
model.bias_activation_fusion=False \
model.hidden_dropout=0.0 \
model.attention_dropout=0.0 \
@@ -3285,7 +3285,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
- model.activation=swiglu \
+ model.activation=fast-swiglu \
model.bias_activation_fusion=False \
model.hidden_dropout=0.0 \
model.attention_dropout=0.0 \
@@ -3361,7 +3361,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
trainer.max_epochs=null \
model.data.num_workers=1 \
model.tensor_model_parallel_size=1 \
- model.virtual_prompt_style='prompt-tuning' \
+ model.virtual_prompt_style='p-tuning' \
+ model.p_tuning.encoder_type='embedding' \
model.language_model_path='/home/TestData/nlp/megatron_gpt/tiny/megatron_14m_gpt_tp1_pp1.nemo' \
model.existing_tasks=[] \
model.new_tasks=['rte'] \
@@ -3562,7 +3563,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
model.decoder.num_layers=2 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
- model.decoder.activation='swiglu' \
+ model.decoder.activation='fast-swiglu' \
model.decoder.masked_softmax_fusion=False \
model.decoder.bias_activation_fusion=False \
model.decoder.activations_checkpoint_method='block' \
@@ -3604,7 +3605,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
model.decoder.num_layers=2 \
model.decoder.hidden_size=64 \
model.decoder.num_attention_heads=8 \
- model.decoder.activation='swiglu' \
+ model.decoder.activation='fast-swiglu' \
model.decoder.masked_softmax_fusion=False \
model.decoder.bias_activation_fusion=False \
model.decoder.activations_checkpoint_method='block' \
6 changes: 3 additions & 3 deletions docs/source/tts/api.rst
@@ -86,14 +86,14 @@ To read more about them, see the `Base Classes <./intro.html#Base Classes>`__ se

Dataset Processing Classes
--------------------------
- .. autoclass:: nemo.collections.tts.torch.data.MixerTTSXDataset
+ .. autoclass:: nemo.collections.tts.data.tts_dataset.MixerTTSXDataset
:show-inheritance:
:members:

- .. autoclass:: nemo.collections.tts.torch.data.TTSDataset
+ .. autoclass:: nemo.collections.tts.data.tts_dataset.TTSDataset
:show-inheritance:
:members:

- .. autoclass:: nemo.collections.tts.torch.data.VocoderDataset
+ .. autoclass:: nemo.collections.tts.data.tts_dataset.VocoderDataset
:show-inheritance:
:members:
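
The following is a minimal sketch of the relocated import path documented above; it assumes only that the three class names are unchanged and omits constructor arguments, which are dataset-specific.

.. code-block:: python

    # Import the TTS dataset classes from their new module location
    # (formerly nemo.collections.tts.torch.data).
    from nemo.collections.tts.data.tts_dataset import (
        MixerTTSXDataset,
        TTSDataset,
        VocoderDataset,
    )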
6 changes: 4 additions & 2 deletions docs/source/tts/configs.rst
@@ -16,14 +16,16 @@ Dataset Configuration

Training, validation, and test parameters are specified using the ``model.train_ds``, ``model.validation_ds``, and ``model.test_ds`` sections in the configuration file, respectively. Depending on the task, there may be arguments specifying the sample rate of the audio files, supplementary data such as speech/text alignment priors and speaker IDs, etc., the threshold to trim leading and trailing silence from an audio signal, pitch normalization parameters, and so on. You may also decide to leave fields such as the ``manifest_filepath`` blank, to be specified via the command-line at runtime.

- Any initialization parameter that is accepted for the class `nemo.collections.tts.torch.data.TTSDataset <https://github.com/NVIDIA/NeMo/tree/stable/nemo/collections/tts/torch/data.py#L78>`_ can be set in the config file. Refer to the `Dataset Processing Classes <./api.html#Datasets>`__ section of the API for a list of datasets classes and their respective parameters. An example TTS train and validation configuration should look similar to the following:
+ Any initialization parameter that is accepted for the class `nemo.collections.tts.data.tts_dataset.TTSDataset
+ <https://github.com/NVIDIA/NeMo/tree/stable/nemo/collections/tts/data/tts_dataset.py#L80>`_ can be set in the config
+ file. Refer to the `Dataset Processing Classes <./api.html#Datasets>`__ section of the API for a list of datasets classes and their respective parameters. An example TTS train and validation configuration should look similar to the following:

.. code-block:: yaml
    model:
      train_ds:
        dataset:
-         _target_: nemo.collections.tts.torch.data.TTSDataset
+         _target_: nemo.collections.tts.data.tts_dataset.TTSDataset
          manifest_filepath: ???
          sample_rate: 44100
          sup_data_path: ???
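
As a hedged illustration of how the ``_target_`` entry above is consumed (the file name and field values below are assumptions, not part of this commit), Hydra's standard ``instantiate`` helper resolves it to the dataset class:

.. code-block:: python

    from hydra.utils import instantiate
    from omegaconf import OmegaConf

    # "tts_config.yaml" is an assumed file name for a config like the one above;
    # the ??? (mandatory) fields must be filled in before instantiation.
    cfg = OmegaConf.load("tts_config.yaml")
    cfg.model.train_ds.dataset.manifest_filepath = "train_manifest.json"
    cfg.model.train_ds.dataset.sup_data_path = "sup_data/"

    # Hydra resolves _target_ to the configured TTSDataset class and forwards the
    # remaining keys as constructor arguments. In practice the NeMo model performs
    # this step itself and may inject extra arguments (e.g. a text tokenizer), so
    # this is only a sketch of the mechanism.
    dataset = instantiate(cfg.model.train_ds.dataset)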
2 changes: 1 addition & 1 deletion docs/source/tts/datasets.rst
@@ -1,7 +1,7 @@
Data Preprocessing
==================

- NeMo TTS recipes support most of public TTS datasets that consist of multiple languages, multiple emotions, and multiple speakers. Current recipes covered English (en-US), German (de-DE), Spanish (es-ES), and Mandarin Chinese (zh-CN), while the support for many other languages is under planning. NeMo provides corpus-specific data preprocessing scripts, as shown in the directory of `scripts/data_processing/tts/ <https://github.com/NVIDIA/NeMo/tree/stable/scripts/dataset_processing/tts/>`_, to convert common public TTS datasets into the format expected by the dataloaders as defined in `nemo/collections/tts/torch/data.py <https://github.com/NVIDIA/NeMo/tree/stable/nemo/collections/tts/torch/data.py>`_. The ``nemo_tts`` collection expects each dataset to consist of a set of utterances in individual audio files plus a ``JSON`` manifest that describes the dataset, with information about one utterance per line. The audio files can be of any format supported by `Pydub <https://github.com/jiaaro/pydub>`_, though we recommend ``WAV`` files as they are the default and have been most thoroughly tested. NeMo supports any original sampling rates of audios, although our scripts of extracting supplementary data and model training all specify the common target sampling rates as either 44100 Hz or 22050 Hz. If the original sampling rate mismatches the target sampling rate, the `feature preprocess <https://github.com/NVIDIA/NeMo/blob/stable/nemo/collections/asr/parts/preprocessing/features.py#L124>`_ can automatically resample the original sampling rate into the target one.
+ NeMo TTS recipes support most of public TTS datasets that consist of multiple languages, multiple emotions, and multiple speakers. Current recipes covered English (en-US), German (de-DE), Spanish (es-ES), and Mandarin Chinese (zh-CN), while the support for many other languages is under planning. NeMo provides corpus-specific data preprocessing scripts, as shown in the directory of `scripts/data_processing/tts/ <https://github.com/NVIDIA/NeMo/tree/stable/scripts/dataset_processing/tts/>`_, to convert common public TTS datasets into the format expected by the dataloaders as defined in `nemo/collections/tts/data/tts_dataset.py <https://github.com/NVIDIA/NeMo/tree/stable/nemo/collections/tts/data/tts_dataset.py>`_. The ``nemo_tts`` collection expects each dataset to consist of a set of utterances in individual audio files plus a ``JSON`` manifest that describes the dataset, with information about one utterance per line. The audio files can be of any format supported by `Pydub <https://github.com/jiaaro/pydub>`_, though we recommend ``WAV`` files as they are the default and have been most thoroughly tested. NeMo supports any original sampling rates of audios, although our scripts of extracting supplementary data and model training all specify the common target sampling rates as either 44100 Hz or 22050 Hz. If the original sampling rate mismatches the target sampling rate, the `feature preprocess <https://github.com/NVIDIA/NeMo/blob/stable/nemo/collections/asr/parts/preprocessing/features.py#L124>`_ can automatically resample the original sampling rate into the target one.

There should be one ``JSON`` manifest file per dataset that will be passed in, therefore, if the user wants separate training and validation datasets, they should also have separate manifests. Otherwise, they will be loading validation data with their training data and vice versa. Each line of the manifest should be in the following format:
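
The exact per-line schema is described in the unchanged portion of the file. Purely as a hedged illustration (the field names below are assumed from typical NeMo-style manifests and are not taken from this commit), such a manifest can be written from Python like so:

.. code-block:: python

    import json

    # Field names here are assumptions for illustration only.
    utterances = [
        {"audio_filepath": "wavs/utt_0001.wav", "text": "hello world", "duration": 1.42},
        {"audio_filepath": "wavs/utt_0002.wav", "text": "good morning", "duration": 1.10},
    ]

    # One JSON object per line, i.e. information about one utterance per line.
    with open("train_manifest.json", "w", encoding="utf-8") as f:
        for utt in utterances:
            f.write(json.dumps(utt) + "\n")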

10 changes: 5 additions & 5 deletions docs/source/tts/g2p.rst
@@ -59,7 +59,7 @@ To train ByT5 G2P model and evaluate it after at the end of the training, run:
do_training=True \
do_testing=True
- Example of the config file: ``NeMo/examples/text_processing/g2p/conf/t5_g2p.yaml``.
+ Example of the config file: ``NeMo/examples/tts/g2p/conf/g2p_t5.yaml``.


To train G2P-Conformer model and evaluate it after at the end of the training, run:
@@ -168,7 +168,7 @@ To train the model, run:

.. code-block::
- python heteronym_classification_train_and_evaluate.py \
+ python g2p_heteronym_classification_train_and_evaluate.py \
train_manifest=<Path to train manifest file>" \
validation_manifest=<Path to validation manifest file>" \
model.wordids=<Path to wordids.tsv file, similar to https://github.com/google-research-datasets/WikipediaHomographData/blob/master/data/wordids.tsv> \
@@ -179,7 +179,7 @@ To train the model and evaluate it when the training is complete, run:

.. code-block::
- python heteronym_classification_train_and_evaluate.py \
+ python g2p_heteronym_classification_train_and_evaluate.py \
train_manifest=<Path to train manifest file>" \
validation_manifest=<Path to validation manifest file>" \
model.test_ds.dataset.manifest=<Path to test manifest file>" \
@@ -191,7 +191,7 @@ To evaluate pretrained model, run:

.. code-block::
- python heteronym_classification_train_and_evaluate.py \
+ python g2p_heteronym_classification_train_and_evaluate.py \
do_training=False \
do_testing=True \
model.test_ds.dataset.manifest=<Path to test manifest file>" \
@@ -201,7 +201,7 @@ To run inference with a pretrained HeteronymClassificationModel, run:

.. code-block::
- python heteronym_classification_inference.py \
+ python g2p_heteronym_classification_inference.py \
manifest="<Path to .json manifest>" \
pretrained_model="<Path to .nemo file or pretrained model name from list_available_models()>" \
output_file="<Path to .json manifest to save prediction>"
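
For programmatic use, the following is a hedged sketch of loading the same model in Python; the import path is an assumption (it is not shown in this commit), while ``list_available_models``, ``from_pretrained``, and ``restore_from`` are NeMo's standard model-loading calls.

.. code-block:: python

    # The module path below is an assumption for illustration; only the class name
    # HeteronymClassificationModel comes from the documentation above.
    from nemo.collections.tts.models import HeteronymClassificationModel

    # Inspect available pretrained checkpoints, then load one by name,
    # or load a local .nemo file via restore_from().
    print(HeteronymClassificationModel.list_available_models())
    model = HeteronymClassificationModel.from_pretrained(model_name="...")
    # model = HeteronymClassificationModel.restore_from("heteronym_classification.nemo")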
7 changes: 6 additions & 1 deletion examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py
@@ -21,6 +21,7 @@
# (Optional: --config-path=<path to dir of configs> --config-name=<name of config without .yaml>) \
++asr_model_type=<rnnt_bpe or ctc_bpe> \
++tts_model_path=<path to compatible tts model> \
+ ++enhancer_model_path=<optional path to enhancer model> \
model.tokenizer.dir=<path to tokenizer> \
model.tokenizer.type="bpe" \
model.train_ds.manifest_filepath=<path(s) to manifest with audio-text pairs or null> \
@@ -70,7 +71,11 @@ def main(cfg):
exp_manager(trainer, cfg.get("exp_manager", None))

asr_model = ASRWithTTSModel.from_asr_config(
- asr_cfg=cfg.model, asr_model_type=cfg.asr_model_type, tts_model_path=cfg.tts_model_path, trainer=trainer
+ asr_cfg=cfg.model,
+ asr_model_type=cfg.asr_model_type,
+ tts_model_path=cfg.tts_model_path,
+ enhancer_model_path=cfg.get("enhancer_model_path", None),
+ trainer=trainer,
)

# Initialize the weights of the model from another model, if provided via config
@@ -19,6 +19,7 @@
# (Optional: --config-path=<path to dir of configs> --config-name=<name of config without .yaml>) \
model.asr_model_path=<path to ASR model> \
model.tts_model_path=<path to compatible TTS model> \
+ model.enhancer_model_path=<optional path to enhancer model> \
model.asr_model_fuse_bn=<true recommended if ConformerEncoder with BatchNorm, false otherwise> \
model.train_ds.manifest_filepath=<path to manifest with audio-text pairs or null> \
model.train_ds.text_data.manifest_filepath=<path(s) to manifest with train text> \