Merge branch 'main' into fix_extra_argument

NVIDIA · Sep 27, 2023 · 18582ee
2 parents 1f6aa96 + 0fb7b02
commit 18582ee
Show file tree

Hide file tree

Showing 8 changed files with 23 additions and 12 deletions.
diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py b/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py
@@ -66,6 +66,9 @@ def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False):
         gpt_cfg.attention_dropout = cfg.model.get('attention_dropout', 0.0)
         gpt_cfg.ffn_dropout = cfg.model.ffn_dropout
         gpt_cfg.use_flash_attention = cfg.model.get('use_flash_attention', False)
+        gpt_cfg.tensor_model_parallel_size = cfg.model.get('tensor_model_parallel_size', 1)
+        gpt_cfg.pipeline_model_parallel_size = cfg.model.get('pipeline_model_parallel_size', 1)
+        gpt_cfg.pipeline_model_parallel_split_rank = cfg.model.get('pipeline_model_parallel_split_rank', 0)
 
         sft_cls = MegatronGPTSFTModel
         gpt_cfg.target = f"{sft_cls.__module__}.{sft_cls.__name__}"

diff --git a/nemo/collections/tts/modules/aligner.py b/nemo/collections/tts/modules/aligner.py
@@ -98,7 +98,7 @@ def get_dist(self, keys, queries, mask=None):
 
         self._apply_mask(dist, mask, float("inf"))
 
-        return dist
+        return dist.squeeze(1)
 
     @staticmethod
     def get_euclidean_dist(queries_enc, keys_enc):

diff --git a/nemo/utils/loggers/dllogger.py b/nemo/utils/loggers/dllogger.py
@@ -20,6 +20,7 @@
 from lightning_utilities.core.apply_func import apply_to_collection
 from omegaconf import DictConfig, ListConfig, OmegaConf
 from pytorch_lightning.loggers import Logger
+from pytorch_lightning.utilities import rank_zero_only
 from pytorch_lightning.utilities.parsing import AttributeDict
 
 from nemo.utils import logging
@@ -81,6 +82,7 @@ def __init__(self, stdout: bool, verbose: bool, json_file: str):
             )
         dllogger.init(backends=backends)
 
+    @rank_zero_only
     def log_hyperparams(self, params, *args, **kwargs):
         if isinstance(params, Namespace):
             params = vars(params)
@@ -91,6 +93,7 @@ def log_hyperparams(self, params, *args, **kwargs):
         params = _sanitize_callable_params(_flatten_dict(_convert_params(params)))
         dllogger.log(step="PARAMETER", data=params)
 
+    @rank_zero_only
     def log_metrics(self, metrics, step=None):
         if step is None:
             step = tuple()

diff --git a/scripts/dataset_processing/tts/preprocess_text.py b/scripts/dataset_processing/tts/preprocess_text.py
@@ -22,9 +22,9 @@
     --input_manifest="<data_root_path>/manifest.json" \
     --output_manifest="<data_root_path>/manifest_processed.json" \
     --normalizer_config_path="<nemo_root_path>/examples/tts/conf/text/normalizer_en.yaml" \
-    --lower_case=True \
+    --lower_case \
     --num_workers=4 \
-    --batch_size=16
+    --joblib_batch_size=16
 """
 
 import argparse

diff --git a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb
@@ -100,7 +100,7 @@
     "# .nemo files for your pre-trained FastPitch and HiFiGAN\n",
     "pretrained_fastpitch_checkpoint = \"<Multispeaker pretrained checkpoint path.>\"\n",
     "finetuned_hifigan_on_multispeaker_checkpoint = \"<Pretrained hifiGan checkpoint path.>\"\n",
-    "use_ipa = True #Set to False while using Arpabet."
+    "use_ipa = False #Set to False while using Arpabet."
    ]
   },
   {
@@ -505,7 +505,7 @@
     "+exp_manager.wandb_logger_kwargs.name=\"tutorial-FastPitch-finetune-adaptation\" \\\n",
     "+exp_manager.wandb_logger_kwargs.project=\"NeMo\" \\\n",
     "+exp_manager.checkpoint_callback_params.save_top_k=-1 \\\n",
-    "trainer.max_epochs=200 \\\n",
+    "trainer.max_epochs=20 \\\n",
     "trainer.check_val_every_n_epoch=10 \\\n",
     "trainer.log_every_n_steps=1 \\\n",
     "trainer.devices=1 \\\n",
@@ -612,7 +612,7 @@
     "model.optim.lr=0.0001 \\\n",
     "model/train_ds=train_ds_finetune \\\n",
     "model/validation_ds=val_ds_finetune \\\n",
-    "+trainer.max_epochs=500 \\\n",
+    "+trainer.max_epochs=50 \\\n",
     "trainer.check_val_every_n_epoch=5 \\\n",
     "trainer.devices=-1 \\\n",
     "trainer.strategy='ddp' \\\n",

diff --git a/tutorials/tts/FastPitch_Data_Preparation.ipynb b/tutorials/tts/FastPitch_Data_Preparation.ipynb
@@ -332,6 +332,8 @@
         "lower_case = True\n",
         "# Whether to overwrite output manifest, if it exists\n",
         "overwrite_manifest = True\n",
+        "# Batch size for joblib parallelization. Increasing this value might speed up the script, depending on your CPU.\n",
+        "joblib_batch_size = 16\n",
         "\n",
         "# Python wrapper to invoke the given bash script with the given input args\n",
         "def run_script(script, args):\n",
@@ -351,8 +353,10 @@
         "        f\"--output_manifest={output_filepath}\",\n",
         "        f\"--num_workers={num_workers}\",\n",
         "        f\"--normalizer_config_path={normalizer_config_filepath}\",\n",
-        "        f\"--lower_case={lower_case}\"\n",
+        "        f\"--joblib_batch_size={joblib_batch_size}\"\n",
         "    ]\n",
+        "    if lower_case:\n",
+        "      args.append(\"--lower_case\")\n",
         "    if overwrite_manifest:\n",
         "        args.append(\"--overwrite\")\n",
         "\n",
@@ -787,7 +791,7 @@
         "\n",
         "We will train HiFi-GAN first so that we can use it to help evaluate the performance of FastPitch as it is being trained.\n",
         "\n",
-        "HiFi-GAN training only requires a manifest with with the `audio_filepath` field. All other fields in the manifest are for FastPitch training.\n",
+        "HiFi-GAN training only requires a manifest with the `audio_filepath` field. All other fields in the manifest are for FastPitch training.\n",
         "\n",
         "Here we show how to train these models from scratch. You can also fine-tune them from pretrained checkpoints as mentioned in our [FastPitch fine-tuning tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tts/FastPitch_Finetuning.ipynb), but pretrained checkpoints compatible with these experimental recipes are not yet available on NGC.\n"
       ],
@@ -914,7 +918,7 @@
     {
       "cell_type": "code",
       "source": [
-        "hifigan_log_epoch_dir = hifigan_log_dir / \"epoch_10\"\n",
+        "hifigan_log_epoch_dir = hifigan_log_dir / \"epoch_10\" / dataset_name\n",
         "!ls $hifigan_log_epoch_dir"
       ],
       "metadata": {
@@ -966,7 +970,7 @@
         "1. Training manifest(s) with `audio_filepath` and `text` or `normalized_text` fields.\n",
         "2. Precomputed features such as *pitch* and *energy* specified in the feature [config file](https://github.com/NVIDIA/NeMo/blob/main/examples/tts/conf/feature/feature_44100.yaml).\n",
         "3. (Optional) Statistics file for normalizing features.\n",
-        "4. (Optional) For a multi-speaker model, the manifest needs a `speaker` field amd JSON file mapping speaker IDs to speaker indices.\n",
+        "4. (Optional) For a multi-speaker model, the manifest needs a `speaker` field and JSON file mapping speaker IDs to speaker indices.\n",
         "5. (Optional) To train with IPA phonemes, a [phoneme dictionary](https://github.com/NVIDIA/NeMo/blob/main/scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt) and optional [heteronyms file](https://github.com/NVIDIA/NeMo/blob/main/scripts/tts_dataset_files/heteronyms-052722)\n",
         "6. (Optional) HiFi-GAN checkpoint or [NGC model name](https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/models/hifigan.py#L413) for generating audio predictions during training.\n",
         "\n"
@@ -1093,7 +1097,7 @@
     {
       "cell_type": "code",
       "source": [
-        "faspitch_log_epoch_dir = fastpitch_log_dir / \"epoch_10\"\n",
+        "faspitch_log_epoch_dir = fastpitch_log_dir / \"epoch_10\" / dataset_name\n",
         "!ls $faspitch_log_epoch_dir"
       ],
       "metadata": {

diff --git a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb
@@ -511,7 +511,7 @@
     "+trainer.max_epochs=5 \\\n",
     "trainer.check_val_every_n_epoch=5 \\\n",
     "trainer.devices=1 \\\n",
-    "trainer.strategy='ddp' \\\n",
+    "trainer.strategy='auto' \\\n",
     "trainer.precision=16 \\\n",
     "exp_manager.exp_dir={logs_dir} \\\n",
     "exp_manager.create_wandb_logger=True \\\n",

diff --git a/tutorials/tts/Vits_Training.ipynb b/tutorials/tts/Vits_Training.ipynb
@@ -309,6 +309,7 @@
     "  heteronyms_path=tts_dataset_files/heteronyms-052722 \\\n",
     "  trainer.max_epochs=3 \\\n",
     "  trainer.accelerator=auto \\\n",
+    "  trainer.strategy=auto \\\n",
     "  trainer.check_val_every_n_epoch=1 \\\n",
     "  trainer.devices=1)"
    ]