# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: megatron_audio_gpt_bestow_lhotse

trainer:
  devices: 1
  accelerator: gpu
  num_nodes: 1
  precision: 16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  use_distributed_sampler: False
  max_epochs: 9999
  max_steps: 1000000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
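  # Worked example of the formula above using this config's own values
  # (micro_batch_size: 4, devices: 1, num_nodes: 1, TP=1 => data_parallel_size = 1, accumulate_grad_batches: 1):
  # after 1000 global steps, consumed_samples = 1000 * 4 * 1 * 1 = 4000.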
  limit_train_batches: 1000
  log_every_n_steps: 10 # frequency with which training steps are logged
  val_check_interval: 1000 # if an int n > 1, runs validation every n training steps; if a float in 0.0-1.0, runs validation every that fraction of an epoch, e.g. 0.25 runs validation every quarter epoch
  gradient_clip_val: 1.0
  accumulate_grad_batches: 1

model_target: nemo.collections.multimodal.speech_llm.models.modular_models.CrossAttendModularAudioGPTModel

exp_manager:
  # explicit_log_dir: null
  exp_dir: null
  name: ${name}
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: validation_${model.data.validation_ds.metric.name}
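    # With model.data.validation_ds.metric.name set to "wer" below, this monitor
    # resolves to "validation_wer" after interpolation.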
    save_top_k: 1
    mode: min
    save_nemo_on_train_end: True
    filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{epoch}'
    model_parallel_size: ${model.tensor_model_parallel_size}
    always_save_nemo: False
    save_best_model: True
  create_early_stopping_callback: False
  early_stopping_callback_params:
    monitor: "val_loss"
    mode: "min"
    min_delta: 0.001
    patience: 10
    verbose: True
    strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training.


model:
  seed: 1234
  tensor_model_parallel_size: 1 # intra-layer model parallelism
  pipeline_model_parallel_size: 1 # inter-layer model parallelism

  pretrained_audio_model: stt_en_fastconformer_transducer_large
  freeze_llm: True
  freeze_audio_encoder: False
  freeze_modality_adapter: False
  load_audio_encoder: True

  ## Legacy batch_size configuration
  # When used with lhotse, the batch composition is decided by the dataloader configs,
  # and the batch size here is only used for deciding gradient accumulation:
  # gradient accumulation = global_batch_size / micro_batch_size / data_parallel_size
  # where data_parallel_size = num_nodes * num_gpus / TP_size
  global_batch_size: 128
  micro_batch_size: 4
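  # Worked example of the two formulas above with this config (1 node x 1 GPU, TP_size = 1):
  # data_parallel_size = 1 * 1 / 1 = 1, so gradient accumulation = 128 / 4 / 1 = 32.
  # With a hypothetical 1 node x 8 GPUs, data_parallel_size = 8 and accumulation drops to 128 / 4 / 8 = 4.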
  restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with
  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
  save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training.
  sync_batch_comm: False
  megatron_amp_O2: False

  ## Sequence Parallelism
  # Makes tensor parallelism more memory efficient for LLMs (20B+) by also parallelizing layer norms and dropout along the sequence dimension.
  # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
  sequence_parallel: False

  ## Activation Checkpoint
  activations_checkpoint_granularity: null # 'selective' or 'full'
  activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
  # of each chunk at the specified granularity
  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
  activations_checkpoint_num_layers: null # not used with 'selective'
  activations_checkpoint_layers_per_pipeline: null
  answer_only_loss: True
  gradient_as_bucket_view: False

  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0

  # use_am_tokenizer: True
  # override_vocab_size: 1024

  peft:
    peft_scheme: "lora" # can be either lora, adapter, ia3 or ptuning
    restore_from_path: null

    # Used for adapter peft training
    adapter_tuning:
      type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter'
      adapter_dim: 32
      adapter_dropout: 0.0
      norm_position: 'pre' # This can be set to 'pre', 'post' or null; 'pre' is normally what is used.
      column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
      row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
      norm_type: 'mixedfusedlayernorm' # IGNORED if linear_adapter is used, options are ['layernorm', 'mixedfusedlayernorm']
      layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers
      weight_tying: False
      position_embedding_strategy: null # used only when weight_tying is True

    lora_tuning:
      target_modules: ['attention_qkv','attention_dense','mlp_fc1','mlp_fc2'] # options: 'attention_qkv', 'attention_dense', 'mlp_fc1', 'mlp_fc2', 'attention' (qkv & dense), 'mlp' (fc1 & fc2)
      adapter_dim: 32
      alpha: ${model.peft.lora_tuning.adapter_dim}
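      # Note (assumption, not stated in this file): under the common LoRA convention the low-rank
      # update is scaled by alpha / adapter_dim; tying alpha to adapter_dim (32) gives 32 / 32 = 1.0,
      # i.e. no extra rescaling of the LoRA update.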
      adapter_dropout: 0.0
      column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
      row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
      layer_selection: null # selects in which layers to add lora adapters, e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers
      weight_tying: False
      position_embedding_strategy: null # used only when weight_tying is True

    # Used for p-tuning peft training
    p_tuning:
      virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence
      bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck
      embedding_dim: 1024 # the size of the prompt encoder embeddings
      init_std: 0.023

  perception:
    target: nemo.collections.multimodal.speech_llm.modules.perception_modules.AudioPerceptionModule
    use_multi_layer_feat: false
    xattn:
      target: nemo.collections.multimodal.speech_llm.modules.perception_modules.TransformerCrossAttention
      num_attention_heads: 8
      attn_score_dropout: 0.1
      attn_layer_dropout: 0.1
      ffn_dropout: 0.1
      hidden_act: "relu"
      pre_ln: true
      pre_ln_final_layer_norm: true

    multi_layer_feat:
      layer_idx_list: [0,16] # layer indices to extract features from
      aggregator:
        mode: "cat" # how to combine features from different layers, choices=['cat','sum','mean','max','min']; defaults to concatenation ('cat')
        pooling: "avg" # how to pool features if they have different temporal lengths and align_mode=min, choices=['mean', 'max', 'min']
        align_mode: "min" # if features have different temporal lengths, set `min` to pool to the shortest length or `max` to repeat to the longest.

    modality_adapter:
      _target_: nemo.collections.asr.modules.ConformerEncoder
      feat_in: 1024
      feat_out: -1 # set this if you need an output size different from the default d_model
      n_layers: 2
      d_model: 512

      # Sub-sampling parameters
      subsampling: dw_striding # vggnet, striding, stacking, stacking_norm or dw_striding
      subsampling_factor: 8 # must be power of 2 for striding and vggnet
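      # Rough frame-rate sketch (assumption: the pretrained FastConformer encoder already emits
      # one feature per ~80 ms): a further 8x subsampling here leaves roughly one modality-adapter
      # frame per ~640 ms of audio, so far fewer audio embeddings are handed to the LLM.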
      subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model
      causal_downsampling: false

      # Reduction parameters: can be used to add another subsampling layer at a given position.
      # A 2x reduction speeds up training and inference while keeping a similar WER.
      # Adding it at the end gives the best WER, while adding it at the beginning gives the best speedup.
      reduction: null # pooling, striding, or null
      reduction_position: null # Encoder block index or -1 for subsampling at the end of encoder
      reduction_factor: 1

      # Feed forward module's params
      ff_expansion_factor: 4

      # Multi-headed Attention Module's params
      self_attention_model: rel_pos # rel_pos or abs_pos
      n_heads: 8 # may need to be lower for smaller d_models
      # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
      att_context_size: [-1, -1] # -1 means unlimited context
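      # Illustrative (hypothetical) alternative: att_context_size: [128, 128] would limit each frame
      # to 128 frames of left and 128 frames of right context instead of the full sequence.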
      att_context_style: regular # regular or chunked_limited
      xscaling: true # scales up the input embeddings by sqrt(d_model)
      untie_biases: true # unties the biases of the TransformerXL layers
      pos_emb_max_len: 5000

      # Convolution module's params
      conv_kernel_size: 9
      conv_norm_type: 'batch_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups)
      # conv_context_size can be "causal" or a list of two integers such that conv_context_size[0]+conv_context_size[1]+1 == conv_kernel_size
      # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0]
      conv_context_size: null
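      # Worked example with conv_kernel_size: 9 above: null resolves to [(9-1)//2, (9-1)//2] = [4, 4],
      # and 'causal' resolves to [9-1, 0] = [8, 0].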

      ### regularization
      dropout: 0.1 # The dropout used in most of the Conformer Modules
      dropout_pre_encoder: 0.1 # The dropout used before the encoder
      dropout_emb: 0.0 # The dropout used for embeddings
      dropout_att: 0.1 # The dropout for multi-headed attention modules

      # set to non-zero to enable stochastic depth
      stochastic_depth_drop_prob: 0.0
      stochastic_depth_mode: linear # linear or uniform
      stochastic_depth_start_layer: 1

    spec_augment:
      _target_: nemo.collections.asr.modules.SpectrogramAugmentation
      freq_masks: 2 # set to zero to disable it
      time_masks: 10 # set to zero to disable it
      freq_width: 27
      time_width: 0.05

    # the following are read from the pretrained AM:
    # output_dim: null
    # encoder: null
    # preprocessor: null

  data:
    end_string: "[EOG]"
    train_ds:
      # Example of how to specify paths to multiple datasets
      # manifest_filepath:
      #   - /path/to/squad.jsonl
      #   - /path/to/mnli.jsonl
      #   - /path/to/boolq.jsonl
      # Example of how each dataset is formatted
      # {'audio_filepath': 'audio1.wav', 'offset': 0.0, 'duration': 12.3, 'question': 'transcribe this audio', 'answer': 'I have a dream...'}
      # the 'answer' field can also be 'text', and a default 'question' field is added if missing in manifests, so that plain ASR manifests also work
      global_batch_size: ${model.global_batch_size}
      micro_batch_size: ${model.micro_batch_size}
      shuffle: True
      num_workers: 0
      pin_memory: True
      max_seq_length: 2048
      min_seq_length: 1
      drop_last: True
      # Notably, the data weights are controlled by either bucketing_weights
      # or concat_sampling_probabilities depending on the dataset type (tarred vs. non-tarred).
      # See audio_text_qa_dataset.py for details.
      concat_sampling_probabilities: null # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random'
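      # Illustrative (hypothetical) example: with the three manifests from the comment above,
      # concat_sampling_probabilities: [0.5, 0.25, 0.25] would draw half of the samples from squad
      # and a quarter each from mnli and boolq.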
      context_key: 'context'
      answer_key: 'answer'
      add_eos: True
      # add_eos: False
      end_string: ${model.data.end_string}
      add_sep: False
      add_bos: False
      separate_prompt_and_response_with_newline: False
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: "[INST]\n<<SYS>>\nPlease answer the following based on the previous speech feature.\n<</SYS>>\n\n{context}[/INST] {answer}"
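      # Example: with context "transcribe this audio" and answer "I have a dream..." (the sample
      # manifest entry above), the template renders as:
      #   [INST]
      #   <<SYS>>
      #   Please answer the following based on the previous speech feature.
      #   <</SYS>>
      #
      #   transcribe this audio[/INST] I have a dream...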
      # ASR configs
      sample_rate: 16000 #${model.audio_encoder.preprocessor.sample_rate}
      max_duration: 24 # it is set for LibriSpeech, you may need to update it for your dataset
      min_duration: 0.1
      # tarred datasets
      is_tarred: false
      tarred_audio_filepaths: null
      shuffle_n: 2048
      # bucketing params
      bucketing_strategy: "fully_randomized"
      bucketing_batch_size: null
      use_lhotse: True
      text_field: "text"
      batch_duration: 80 # 0
      quadratic_duration: 30
      num_buckets: 30
      buffer_size: 10000
      shuffle_buffer_size: 10000
      duration_bins: null

    validation_ds:
      global_batch_size: ${model.global_batch_size}
      micro_batch_size: ${model.micro_batch_size}
      shuffle: False
      num_workers: 0
      pin_memory: True
      max_seq_length: 2048
      min_seq_length: 1
      drop_last: False
      context_key: ${model.data.train_ds.context_key}
      answer_key: ${model.data.train_ds.answer_key}
      add_eos: ${model.data.train_ds.add_eos}
      end_string: ${model.data.end_string}
      add_sep: ${model.data.train_ds.add_sep}
      add_bos: ${model.data.train_ds.add_bos}
      separate_prompt_and_response_with_newline: ${model.data.train_ds.separate_prompt_and_response_with_newline}
      write_predictions_to_file: False
      output_file_path_prefix: null # Prefix of the file to write predictions to.
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: ${model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
      tokens_to_generate: 128
      # ASR configs
      sample_rate: 16000 #${model.audio_encoder.preprocessor.sample_rate}

      log_every_n_steps: 10
      metric:
        name: "wer" # Name of the evaluation metric to use. Options include 'wer', 'exact_string_match', 'loss'
        average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
        num_classes: null

  optim:
    name: fused_adam
    lr: 1e-4
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 50
      min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1
      constant_steps: 0 # Constant steps should also be 0 when min_lr=0
      monitor: val_loss
      reduce_on_plateau: false