diff --git a/examples/asr/conf/fastconformer/fast-conformer-long_ctc_bpe.yaml b/examples/asr/conf/fastconformer/fast-conformer-long_ctc_bpe.yaml deleted file mode 100644 index e83ef931cf5c..000000000000 --- a/examples/asr/conf/fastconformer/fast-conformer-long_ctc_bpe.yaml +++ /dev/null @@ -1,208 +0,0 @@ -# It contains the default values for training a Fast Conformer-CTC ASR model, large size (~120M) with CTC loss and sub-word encoding. -# This version uses Longformer-style attention in order to handle longer audio - -# Architecture and training config: -# Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective -# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. - -# You may find more info about Fast Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer - -# Differences from baseline config are in -# model.encoder.global_tokens -# model.encoder.global_tokens_spacing -# model.encoder.global_attn_separate - -name: "FastConformer-Long-CTC-BPE" - -model: - sample_rate: 16000 - log_prediction: true # enables logging sample predictions in the output during training - ctc_reduction: 'mean_volume' - skip_nan_grad: false - - train_ds: - manifest_filepath: ??? - sample_rate: ${model.sample_rate} - batch_size: 16 # you may increase batch_size if your memory allows - shuffle: true - num_workers: 8 - pin_memory: true - use_start_end_token: false - trim_silence: false - max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset - min_duration: 0.1 - # tarred datasets - is_tarred: false - tarred_audio_filepaths: null - shuffle_n: 2048 - # bucketing params - bucketing_strategy: "fully_randomized" - bucketing_batch_size: null - - validation_ds: - manifest_filepath: ??? - sample_rate: ${model.sample_rate} - batch_size: 16 # you may increase batch_size if your memory allows - shuffle: false - num_workers: 8 - pin_memory: true - use_start_end_token: false - - test_ds: - manifest_filepath: null - sample_rate: ${model.sample_rate} - batch_size: 16 # you may increase batch_size if your memory allows - shuffle: false - num_workers: 8 - pin_memory: true - use_start_end_token: false - - # recommend vocab size of 128 or 256 when training on ~1k hr datasets and 1k vocab size on 10+k hr datasets - # you may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py - tokenizer: - dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (wpe) - type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - sample_rate: ${model.sample_rate} - normalize: "per_feature" - window_size: 0.025 - window_stride: 0.01 - window: "hann" - features: 80 - n_fft: 512 - log: true - frame_splicing: 1 - dither: 0.00001 - pad_to: 0 - pad_value: 0.0 - - spec_augment: - _target_: nemo.collections.asr.modules.SpectrogramAugmentation - freq_masks: 2 # set to zero to disable it - # you may use lower time_masks for smaller models to have a faster convergence - time_masks: 10 # set to zero to disable it - freq_width: 27 - time_width: 0.05 - - encoder: - _target_: nemo.collections.asr.modules.ConformerEncoder - feat_in: ${model.preprocessor.features} - feat_out: -1 # you may set it if you need different output size other than the default d_model - n_layers: 18 - d_model: 512 - - # Sub-sampling params - subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding - subsampling_factor: 8 # must be power of 2 for striding and vggnet - subsampling_conv_channels: 256 # -1 sets it to d_model - causal_downsampling: false - - # Feed forward module's params - ff_expansion_factor: 4 - - self_attention_model: rel_pos_local_attn # longformer-style attention (sliding window + global tokens) - global_tokens: 1 # number of tokens that attend and are attended to by all tokens (put 0 to disable) - global_tokens_spacing: 1 # how far apart the global tokens are - global_attn_separate: false # whether global tokens should use separate q,k,v layers - n_heads: 8 # may need to be lower for smaller d_models - # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention - att_context_size: [128,128] # -1 means unlimited context - att_context_style: regular # regular or chunked_limited - xscaling: true # scales up the input embeddings by sqrt(d_model) - untie_biases: true # unties the biases of the TransformerXL layers - - # Convolution module's params - conv_kernel_size: 9 - conv_norm_type: 'batch_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) - # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size - # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] - conv_context_size: null - - ### regularization - dropout: 0.1 # The dropout used in most of the Conformer Modules - dropout_pre_encoder: 0.1 # The dropout used before the encoder - dropout_emb: 0.0 # The dropout used for embeddings - dropout_att: 0.1 # The dropout for multi-headed attention modules - - # set to non-zero to enable stochastic depth - stochastic_depth_drop_prob: 0.0 - stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 1 - - decoder: - _target_: nemo.collections.asr.modules.ConvASRDecoder - feat_in: null - num_classes: -1 - vocabulary: [] - - # config for InterCTC loss: https://arxiv.org/abs/2102.03216 - # specify loss weights and which layers to use for InterCTC - # e.g., to reproduce the paper results, set loss_weights: [0.3] - # and apply_at_layers: [8] (assuming 18 layers). Note that final - # layer loss coefficient is automatically adjusted (to 0.7 in above example) - interctc: - loss_weights: [] - apply_at_layers: [] - - optim: - name: adamw - lr: 1e-3 - # optimizer arguments - betas: [0.9, 0.98] - # less necessity for weight_decay as we already have large augmentations with SpecAug - # you may need weight_decay for large models, stable AMP training, small datasets, or when lower augmentations are used - # weight decay of 0.0 with lr of 2.0 also works fine - weight_decay: 1e-3 - - # scheduler setup - sched: - name: CosineAnnealing - # scheduler config override - warmup_steps: 15000 - warmup_ratio: null - min_lr: 1e-6 - -trainer: - devices: -1 # number of GPUs, -1 would use all available GPUs - num_nodes: 1 - max_epochs: 1000 - max_steps: -1 # computed at runtime if not set - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - accelerator: auto - strategy: ddp - accumulate_grad_batches: 1 - gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. - log_every_n_steps: 10 # Interval of logging. - enable_progress_bar: True - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it - check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs - sync_batchnorm: true - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - benchmark: false # needs to be false for models with variable-length speech input as it slows down training - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - # in case of multiple validation sets, first one is used - monitor: "val_wer" - mode: "min" - save_top_k: 5 - always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints - - # you need to set these two to True to continue the training - resume_if_exists: false - resume_ignore_no_checkpoint: false - - # You may use this section to create a W&B logger - create_wandb_logger: false - wandb_logger_kwargs: - name: null - project: null diff --git a/examples/asr/conf/fastconformer/fast-conformer-long_transducer_bpe.yaml b/examples/asr/conf/fastconformer/fast-conformer-long_transducer_bpe.yaml deleted file mode 100644 index 50ad302eef45..000000000000 --- a/examples/asr/conf/fastconformer/fast-conformer-long_transducer_bpe.yaml +++ /dev/null @@ -1,265 +0,0 @@ -# It contains the default values for training a Fast Conformer-Transducer ASR model, large size (~120M) with Transducer loss and sub-word encoding. -# This version uses Longformer-style attention in order to handle longer audio - -# Architecture and training config: -# Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective -# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. - -# You may find more info about Fast Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer - -# Differences from baseline config are in -# model.encoder.global_tokens -# model.encoder.global_tokens_spacing -# model.encoder.global_attn_separate - -name: "FastConformer-Long-Transducer-BPE" - -model: - sample_rate: 16000 - compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. - log_prediction: true # enables logging sample predictions in the output during training - rnnt_reduction: 'mean_volume' - skip_nan_grad: false - - model_defaults: - enc_hidden: ${model.encoder.d_model} - pred_hidden: 640 - joint_hidden: 640 - - train_ds: - manifest_filepath: ??? - sample_rate: ${model.sample_rate} - batch_size: 16 # you may increase batch_size if your memory allows - shuffle: true - num_workers: 8 - pin_memory: true - use_start_end_token: false - trim_silence: false - max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset - min_duration: 0.1 - # tarred datasets - is_tarred: false - tarred_audio_filepaths: null - shuffle_n: 2048 - # bucketing params - bucketing_strategy: "fully_randomized" - bucketing_batch_size: null - - validation_ds: - manifest_filepath: ??? - sample_rate: ${model.sample_rate} - batch_size: 16 - shuffle: false - num_workers: 8 - pin_memory: true - use_start_end_token: false - - test_ds: - manifest_filepath: null - sample_rate: ${model.sample_rate} - batch_size: 16 - shuffle: false - num_workers: 8 - pin_memory: true - use_start_end_token: false - - # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py - tokenizer: - dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) - type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - sample_rate: ${model.sample_rate} - normalize: "per_feature" - window_size: 0.025 - window_stride: 0.01 - window: "hann" - features: 80 - n_fft: 512 - frame_splicing: 1 - dither: 0.00001 - pad_to: 0 - - spec_augment: - _target_: nemo.collections.asr.modules.SpectrogramAugmentation - freq_masks: 2 # set to zero to disable it - time_masks: 10 # set to zero to disable it - freq_width: 27 - time_width: 0.05 - - encoder: - _target_: nemo.collections.asr.modules.ConformerEncoder - feat_in: ${model.preprocessor.features} - feat_out: -1 # you may set it if you need different output size other than the default d_model - n_layers: 17 - d_model: 512 - - # Sub-sampling parameters - subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding - subsampling_factor: 8 # must be power of 2 for striding and vggnet - subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model - causal_downsampling: false - - # Reduction parameters: Can be used to add another subsampling layer at a given position. - # Having a 2x reduction will speedup the training and inference speech while keeping similar WER. - # Adding it at the end will give the best WER while adding it at the beginning will give the best speedup. - reduction: null # pooling, striding, or null - reduction_position: null # Encoder block index or -1 for subsampling at the end of encoder - reduction_factor: 1 - - # Feed forward module's params - ff_expansion_factor: 4 - - # Multi-headed Attention Module's params - self_attention_model: rel_pos_local_attn # longformer-style attention (sliding window + global tokens) - global_tokens: 1 # number of tokens that attend and are attended to by all tokens (put 0 to disable) - global_tokens_spacing: 1 # how far apart the global tokens are - global_attn_separate: false # whether global tokens should use separate q,k,v layers - n_heads: 8 # may need to be lower for smaller d_models - # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention - att_context_size: [128,128] # -1 means unlimited context - att_context_style: regular # regular or chunked_limited - xscaling: true # scales up the input embeddings by sqrt(d_model) - untie_biases: true # unties the biases of the TransformerXL layers - - # Convolution module's params - conv_kernel_size: 9 - conv_norm_type: 'batch_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) - # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size - # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] - conv_context_size: null - - ### regularization - dropout: 0.1 # The dropout used in most of the Conformer Modules - dropout_pre_encoder: 0.1 # The dropout used before the encoder - dropout_emb: 0.0 # The dropout used for embeddings - dropout_att: 0.1 # The dropout for multi-headed attention modules - - # set to non-zero to enable stochastic depth - stochastic_depth_drop_prob: 0.0 - stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 1 - - decoder: - _target_: nemo.collections.asr.modules.RNNTDecoder - normalization_mode: null # Currently only null is supported for export. - random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf - blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. - - prednet: - pred_hidden: ${model.model_defaults.pred_hidden} - pred_rnn_layers: 1 - t_max: null - dropout: 0.2 - - # if a large vocabulary size is desired, you may wish to use SampleRNNTJoint module - # _target_: nemo.collections.asr.modules.SampledRNNTJoint - # n_samples: 500 # Specifies the minimum number of tokens to sample from the vocabulary space, excluding - # the RNNT blank token. If a given value is larger than the entire vocabulary size, then the full - # vocabulary will be used - joint: - _target_: nemo.collections.asr.modules.RNNTJoint - log_softmax: null # 'null' would set it automatically according to CPU/GPU device - preserve_memory: false # dramatically slows down training, but might preserve some memory - - # Fuses the computation of prediction net + joint net + loss + WER calculation - # to be run on sub-batches of size `fused_batch_size`. - # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. - # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. - # Using small values here will preserve a lot of memory during training, but will make training slower as well. - # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. - # However, to preserve memory, this ratio can be 1:8 or even 1:16. - # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. - fuse_loss_wer: true - fused_batch_size: 16 - - jointnet: - joint_hidden: ${model.model_defaults.joint_hidden} - activation: "relu" - dropout: 0.2 - - decoding: - strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. - - # greedy strategy config - greedy: - max_symbols: 10 - - # beam strategy config - beam: - beam_size: 2 - return_best_hypothesis: False - score_norm: true - tsd_max_sym_exp: 50 # for Time Synchronous Decoding - alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding - - loss: - loss_name: "default" - - warprnnt_numba_kwargs: - # FastEmit regularization: https://arxiv.org/abs/2010.11148 - # You may enable FastEmit to reduce the latency of the model for streaming - fastemit_lambda: 0.0 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. - clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. - - # Adds Gaussian noise to the gradients of the decoder to avoid overfitting - variational_noise: - start_step: 0 - std: 0.0 - - optim: - name: adamw - lr: 2.5e-3 - # optimizer arguments - betas: [0.9, 0.98] - weight_decay: 1e-3 - - # scheduler setup - sched: - name: CosineAnnealing - # scheduler config override - warmup_steps: 15000 - warmup_ratio: null - min_lr: 1e-6 - -trainer: - devices: -1 # number of GPUs, -1 would use all available GPUs - num_nodes: 1 - max_epochs: 500 - max_steps: -1 # computed at runtime if not set - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - accelerator: auto - strategy: ddp - accumulate_grad_batches: 1 - gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. - log_every_n_steps: 10 # Interval of logging. - enable_progress_bar: True - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it - check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs - sync_batchnorm: true - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - benchmark: false # needs to be false for models with variable-length speech input as it slows down training - - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - # in case of multiple validation sets, first one is used - monitor: "val_wer" - mode: "min" - save_top_k: 5 - always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints - resume_if_exists: false - resume_ignore_no_checkpoint: false - - create_wandb_logger: false - wandb_logger_kwargs: - name: null - project: null