# Modular SpeechLLM

This directory contains example scripts to train and evaluate modular SpeechLLM models [1].

## Requirements
You will need to install this specific branch of NeMo, or use the Dockerfile provided in the root directory of this repository to build a Docker image with all the necessary dependencies. This branch is based on the NeMo main branch as of 2/14/2024 and diverges from it in the following ways:
- It migrates to pytorch_lightning==2.2 to fix bugs with multiple validation `dataloader_iter`s and with saving `-last.ckpt` files.
- It pins megatron-core==0.4.0 to avoid potentially unstable behavior from newer versions that are not yet well supported by NeMo components.
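
If you build your own environment instead of using the Dockerfile, a quick version check can confirm the pinned dependencies are in place. This is a minimal sketch; it assumes the packages are installed under the distribution names `pytorch-lightning` and `megatron-core`:

```python
# Sanity-check that the environment matches the pins used by this branch.
from importlib.metadata import PackageNotFoundError, version

pins = {"pytorch-lightning": "2.2", "megatron-core": "0.4.0"}
for name, expected in pins.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        print(f"{name}: NOT INSTALLED (expected {expected}*)")
        continue
    status = "OK" if installed.startswith(expected) else "MISMATCH"
    print(f"{name}: {installed} (expected {expected}*) -> {status}")
```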


## Architecture

In general, a modular SpeechLLM model consists of three main components:
- An audio encoder that processes the input audio and produces a sequence of audio embeddings.
- A modality adapter that processes the audio embeddings and produces a sequence of embeddings in the same latent space as the token embeddings of a pretrained large language model (LLM).
- A pretrained LLM that processes the embeddings from the modality adapter together with the token embeddings of the input prompt, and produces the text output. The audio embeddings and text token embeddings are concatenated along the time dimension before going into the LLM, as sketched below.
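
The following PyTorch-style pseudocode sketches how the three components fit together. The module names (`audio_encoder`, `modality_adapter`, `llm`, `embed_tokens`) are illustrative placeholders rather than the actual class names used in the configs, and the `inputs_embeds` call is HF-style shorthand; the real NeMo GPT forward signature differs.

```python
import torch
import torch.nn as nn


class ModularSpeechLLMSketch(nn.Module):
    """Minimal sketch of the modular SpeechLLM forward pass (illustrative only)."""

    def __init__(self, audio_encoder, modality_adapter, llm, embed_tokens):
        super().__init__()
        self.audio_encoder = audio_encoder        # frozen or trainable ASR encoder
        self.modality_adapter = modality_adapter  # maps audio features -> LLM embedding space
        self.llm = llm                            # pretrained (and usually frozen) LLM
        self.embed_tokens = embed_tokens          # LLM token embedding table

    def forward(self, audio_signal, audio_lengths, prompt_token_ids):
        # 1) Audio -> sequence of audio embeddings.
        audio_feats = self.audio_encoder(audio_signal, audio_lengths)  # [B, T_audio, D_audio]
        # 2) Project audio embeddings into the LLM embedding space.
        audio_embs = self.modality_adapter(audio_feats)                # [B, T_audio, D_llm]
        # 3) Embed the text prompt tokens.
        text_embs = self.embed_tokens(prompt_token_ids)                # [B, T_text, D_llm]
        # 4) Concatenate along the time dimension and feed the LLM.
        inputs_embeds = torch.cat([audio_embs, text_embs], dim=1)      # [B, T_audio + T_text, D_llm]
        return self.llm(inputs_embeds=inputs_embeds)
```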


## Usage

### Input Format

You'll need to prepare data in the NeMo manifest format, where each line is a Python dictionary with keys like the following:
```
{
    "audio_filepath": "path/to/audio.wav",
    "offset": 0.0,  # offset of the audio in seconds, this is an optional field
    "duration": 10.0,  # duration of the audio in seconds, can be set to `None` to load the whole audio
    "question": "what is the transcription of the audio?",  # this is an optional field, see below for more details
    "answer": "the transcription of the audio",  # optional for inference
}
```

The `question` field in the manifest is optional. Instead of setting it, you can put a list of questions in a file and set `++model.data.train_ds.question_file=<path to question file>` so that the dataloader randomly picks a question from the file for each audio sample. This is useful for training with multiple prompts for the same task. If neither the `question` field nor `question_file` is provided, the dataloader will use the default question `what does the audio mean?` for all audios.
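
As a convenience, here is a small sketch of how such a manifest could be generated in Python. The file name and entries below are placeholders, not files shipped with this repository:

```python
import json

# Hypothetical example entries; replace with your own audio paths and text.
entries = [
    {
        "audio_filepath": "path/to/audio_0.wav",
        "duration": 10.0,
        "question": "what is the transcription of the audio?",
        "answer": "the transcription of the audio",
    },
    {
        "audio_filepath": "path/to/audio_1.wav",
        "duration": 7.5,
        "question": "what is the transcription of the audio?",
        "answer": "another transcription",
    },
]

# NeMo manifests are JSON Lines: one JSON object per line.
with open("train_manifest.json", "w", encoding="utf-8") as f:
    for entry in entries:
        f.write(json.dumps(entry) + "\n")
```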


### Training

There are several configs for training a SpeechLLM:
- `conf/modular_audio_gpt_config_peft.yaml`: a config for training a SpeechLLM model with PEFT (e.g., LoRA), where you don't want to tune the whole LLM but still want to adapt it to your needs.
- `conf/modular_audio_gpt_config_sft.yaml`: a config for training a SpeechLLM model without PEFT, where you might want to tune the whole LLM or simply freeze it and use it as is.
- `conf/modular_audio_gpt_multi_enc_config_peft.yaml`: a config for training a SpeechLLM model with multiple audio encoders and PEFT, where you can add speaker embeddings to the audio embeddings. Currently only TitaNet is supported as the speaker encoder.

With any config, you can set the following flags to control which components to train or freeze:
- `model.freeze_llm`: generally set to `True`, unless you want to fine-tune the whole LLM.
- `model.freeze_audio_encoder`: generally set to `False`, unless you want to keep the audio encoder frozen.
- `model.freeze_modality_adapter`: generally set to `False`, since the modality adapter needs to be trained.

In addition to the config file, you will also need to prepare the audio encoder and the LLM as `*.nemo` files.
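
For the audio encoder, one way to obtain a `*.nemo` file is to download a pretrained NeMo ASR model and save it locally. This is a minimal sketch; the model name is just an example from NGC:

```python
import nemo.collections.asr as nemo_asr

# Download a pretrained ASR model and save it as a .nemo file to use as the audio encoder.
asr_model = nemo_asr.models.ASRModel.from_pretrained("stt_en_fastconformer_transducer_large")
asr_model.save_to("audio-encoder.nemo")
```

The LLM side expects a Megatron-style `.nemo` checkpoint, which is passed to `model.restore_from_path` in the command below.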

To train a SpeechLLM model, you can run the following script:
```bash
MEGATRON_MODEL=/path/to/megatron-model.nemo
ASR_MODEL=/path/to/audio-encoder.nemo

TRAIN_MANIFESTS="[/data/train_1.json,/data/train_2.json]"
VAL_MANIFESTS="[/data/dev_1.json,/data/dev_2.json]"
VAL_NAMES="[dev-1,dev-2]"

# global_batch_size = micro_batch_size * num_gpus_per_node * num_nodes * gradient_accumulation_steps
# micro_batch_size  = batch size per GPU
NVTE_FLASH_ATTN=0 \
NVTE_FUSED_ATTN=0 \
NVTE_MASKED_SOFTMAX_FUSION=0 \
CUDA_VISIBLE_DEVICES="0,1" python modular_audio_gpt_train.py --config-path="./conf" --config-name "modular_audio_gpt_config_peft" \
  trainer.devices=-1 \
  model.freeze_audio_encoder=True \
  model.freeze_llm=True \
  model.global_batch_size=4 \
  model.micro_batch_size=2 \
  model.pretrained_audio_model=$ASR_MODEL \
  model.restore_from_path=$MEGATRON_MODEL \
  model.data.train_ds.manifest_filepath=$TRAIN_MANIFESTS \
  model.data.validation_ds.manifest_filepath=$VAL_MANIFESTS \
  ++model.data.validation_ds.names=$VAL_NAMES
```
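
For reference, with the settings above (`global_batch_size=4`, `micro_batch_size=2`) on a single node with 2 GPUs, the implied gradient accumulation is `global_batch_size / (micro_batch_size * num_gpus_per_node * num_nodes) = 4 / (2 * 2 * 1) = 1`, i.e., no gradient accumulation.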

You can also use tarred datasets for faster training: convert regular NeMo datasets to tarred datasets using this [script](https://github.com/NVIDIA/NeMo/blob/main/scripts/speech_recognition/convert_to_tarred_audio_dataset.py), then follow the tarred dataset settings shown in the example below.


#### Multi-task training
To use a question file, set `++model.data.train_ds.question_file=<path to question file>` on the command line, or pass multiple question files with `++model.data.train_ds.question_file=[<path to question file1>,<path to question file2>,...]`. If the number of question files equals the number of provided datasets, the dataloader assigns each question file to the corresponding dataset; otherwise, it randomly picks a question file from all provided question files for each audio sample. Using multiple question files is useful for training on multiple tasks, where each task has its own set of prompts. Meanwhile, you can control the weights of different tasks/datasets by using concatenated tarred datasets, where you assign weights to datasets as follows:
```
++model.data.train_ds.is_tarred=True \
++model.data.train_ds.is_concat=True \
++model.data.train_ds.manifest_filepath=[/path/to/data1/tarred_audio_manifest.json,/path/to/data2/tarred_audio_manifest.json] \
++model.data.train_ds.tarred_audio_filepaths=[/path/to/data1/audio__OP_0..1023_CL_.tar,/path/to/data2/audio__OP_0..1023_CL_.tar] \
++model.data.train_ds.concat_sampling_technique='random' \
++model.data.train_ds.concat_sampling_probabilities=[0.4,0.6]
```
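
As an illustration of per-task prompts, here is a small sketch that writes one question file per task. The file format is an assumption here (plain text, one question per line); double-check it against the dataloader implementation before use, and note that the file names and questions are hypothetical:

```python
# Hypothetical per-task question files (format assumed: one question per line).
asr_questions = [
    "what is the transcription of the audio?",
    "please transcribe the speech into written text.",
]
ast_questions = [
    "translate the speech into German.",
    "what is the German translation of the audio?",
]

for path, questions in [("asr_questions.txt", asr_questions),
                        ("ast_questions.txt", ast_questions)]:
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(questions) + "\n")

# These files could then be paired with two datasets via:
#   ++model.data.train_ds.question_file=[asr_questions.txt,ast_questions.txt]
```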

#### Available Audio Encoders
Currently, all NeMo ASR models are supported. Other models may also work if they have an `encoder` attribute that returns a sequence of audio embeddings, a `preprocessor` that takes raw audio and returns a sequence of features for the encoder, and a `cfg` attribute that returns an `omegaconf.DictConfig` object with the model configuration. In addition to a local model, you can also set `pretrained_audio_model` to a model from NGC (e.g., `stt_en_fastconformer_transducer_large`) or Hugging Face (e.g., `nvidia/parakeet-rnnt-1.1b`), and the script will download the model and use it for training.
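
If you want to plug in a custom model, a quick duck-typing check along these lines can confirm it exposes the interface described above. This is a minimal sketch; the local `.nemo` path is just an example:

```python
import nemo.collections.asr as nemo_asr
from omegaconf import DictConfig

# Load a candidate audio encoder model from a local .nemo file.
model = nemo_asr.models.ASRModel.restore_from("audio-encoder.nemo")

# The SpeechLLM scripts expect these attributes on the audio encoder model.
assert hasattr(model, "encoder"), "missing `encoder` (should produce audio embeddings)"
assert hasattr(model, "preprocessor"), "missing `preprocessor` (raw audio -> features)"
assert isinstance(model.cfg, DictConfig), "`cfg` should be an omegaconf.DictConfig"
print("Model exposes the expected encoder/preprocessor/cfg interface.")
```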


### Inference

The config file for inference is `conf/modular_audio_gpt_config_eval.yaml`, where you mainly need to set the `model.data.test_ds` fields. An example of running inference is shown below:

```bash
ASR_MODEL=/path/to/asr-model.nemo  # required only if you froze the audio encoder during training
MEGATRON_CKPT=/path/to/megatron-llm.nemo
ALM_DIR=/path/to/nemo_experiments/job_name
ALM_YAML=$ALM_DIR/version_0/hparams.yaml
ALM_CKPT="$ALM_DIR/checkpoints/AudioGPT--validation_wer\=0.2-step\=100000-epoch\=0-last.ckpt"  # this checkpoint file only contains the trainable params

VAL_MANIFESTS="[/data/libri-test-other.json,/data/MCV_7.1_test.json,/data/wsj-test.json]"
VAL_NAMES="[ls-test-other,mcv7.1-test,wsj-test]"

NVTE_MASKED_SOFTMAX_FUSION=0 \
NVTE_FLASH_ATTN=0 \
NVTE_FUSED_ATTN=0 \
CUDA_VISIBLE_DEVICES=0 python modular_audio_gpt_eval.py \
  model.restore_from_path=$MEGATRON_CKPT \
  model.pretrained_audio_model=$ASR_MODEL \
  model.peft.restore_from_path=$ALM_CKPT \
  model.peft.restore_from_hparams_path=$ALM_YAML \
  model.data.test_ds.manifest_filepath=$VAL_MANIFESTS \
  model.data.test_ds.names=$VAL_NAMES \
  model.data.test_ds.global_batch_size=8 \
  model.data.test_ds.micro_batch_size=8 \
  model.data.test_ds.tokens_to_generate=256 \
  ++inference.greedy=False \
  ++inference.top_k=50 \
  ++inference.top_p=0.95 \
  ++inference.temperature=0.4 \
  ++inference.repetition_penalty=1.2 \
  ++model.data.test_ds.output_dir=${ALM_DIR}
```


## Reference
[1] Chen, Z.\*, Huang, H.\*, Andrusenko, A., Hrinchuk, O., Puvvada, K.C., Li, J., Ghosh, S., Balam, J. and Ginsburg, B., 2023. SALM: Speech-augmented Language Model with In-context Learning for Speech Recognition and Translation. ICASSP'24. https://arxiv.org/abs/2310.09424