From d9c4963dd3ed039668f26b43d155c1a115ae1d1a Mon Sep 17 00:00:00 2001 From: Vahid Date: Mon, 17 Apr 2023 12:39:04 -0700 Subject: [PATCH] moved longconformer confs. Signed-off-by: Vahid --- .../fast-conformer-long_ctc_bpe.yaml | 208 -------------- .../fast-conformer-long_transducer_bpe.yaml | 265 ------------------ 2 files changed, 473 deletions(-) delete mode 100644 examples/asr/conf/fastconformer/fast-conformer-long_ctc_bpe.yaml delete mode 100644 examples/asr/conf/fastconformer/fast-conformer-long_transducer_bpe.yaml diff --git a/examples/asr/conf/fastconformer/fast-conformer-long_ctc_bpe.yaml b/examples/asr/conf/fastconformer/fast-conformer-long_ctc_bpe.yaml deleted file mode 100644 index e83ef931cf5c..000000000000 --- a/examples/asr/conf/fastconformer/fast-conformer-long_ctc_bpe.yaml +++ /dev/null @@ -1,208 +0,0 @@ -# It contains the default values for training a Fast Conformer-CTC ASR model, large size (~120M) with CTC loss and sub-word encoding. -# This version uses Longformer-style attention in order to handle longer audio - -# Architecture and training config: -# Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective -# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. - -# You may find more info about Fast Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer - -# Differences from baseline config are in -# model.encoder.global_tokens -# model.encoder.global_tokens_spacing -# model.encoder.global_attn_separate - -name: "FastConformer-Long-CTC-BPE" - -model: - sample_rate: 16000 - log_prediction: true # enables logging sample predictions in the output during training - ctc_reduction: 'mean_volume' - skip_nan_grad: false - - train_ds: - manifest_filepath: ??? - sample_rate: ${model.sample_rate} - batch_size: 16 # you may increase batch_size if your memory allows - shuffle: true - num_workers: 8 - pin_memory: true - use_start_end_token: false - trim_silence: false - max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset - min_duration: 0.1 - # tarred datasets - is_tarred: false - tarred_audio_filepaths: null - shuffle_n: 2048 - # bucketing params - bucketing_strategy: "fully_randomized" - bucketing_batch_size: null - - validation_ds: - manifest_filepath: ??? - sample_rate: ${model.sample_rate} - batch_size: 16 # you may increase batch_size if your memory allows - shuffle: false - num_workers: 8 - pin_memory: true - use_start_end_token: false - - test_ds: - manifest_filepath: null - sample_rate: ${model.sample_rate} - batch_size: 16 # you may increase batch_size if your memory allows - shuffle: false - num_workers: 8 - pin_memory: true - use_start_end_token: false - - # recommend vocab size of 128 or 256 when training on ~1k hr datasets and 1k vocab size on 10+k hr datasets - # you may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py - tokenizer: - dir: ??? 
# path to directory which contains either tokenizer.model (bpe) or vocab.txt (wpe) - type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - sample_rate: ${model.sample_rate} - normalize: "per_feature" - window_size: 0.025 - window_stride: 0.01 - window: "hann" - features: 80 - n_fft: 512 - log: true - frame_splicing: 1 - dither: 0.00001 - pad_to: 0 - pad_value: 0.0 - - spec_augment: - _target_: nemo.collections.asr.modules.SpectrogramAugmentation - freq_masks: 2 # set to zero to disable it - # you may use lower time_masks for smaller models to have a faster convergence - time_masks: 10 # set to zero to disable it - freq_width: 27 - time_width: 0.05 - - encoder: - _target_: nemo.collections.asr.modules.ConformerEncoder - feat_in: ${model.preprocessor.features} - feat_out: -1 # you may set it if you need different output size other than the default d_model - n_layers: 18 - d_model: 512 - - # Sub-sampling params - subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding - subsampling_factor: 8 # must be power of 2 for striding and vggnet - subsampling_conv_channels: 256 # -1 sets it to d_model - causal_downsampling: false - - # Feed forward module's params - ff_expansion_factor: 4 - - self_attention_model: rel_pos_local_attn # longformer-style attention (sliding window + global tokens) - global_tokens: 1 # number of tokens that attend and are attended to by all tokens (put 0 to disable) - global_tokens_spacing: 1 # how far apart the global tokens are - global_attn_separate: false # whether global tokens should use separate q,k,v layers - n_heads: 8 # may need to be lower for smaller d_models - # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention - att_context_size: [128,128] # -1 means unlimited context - att_context_style: regular # regular or chunked_limited - xscaling: true # scales up the input embeddings by sqrt(d_model) - untie_biases: true # unties the biases of the TransformerXL layers - - # Convolution module's params - conv_kernel_size: 9 - conv_norm_type: 'batch_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) - # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size - # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] - conv_context_size: null - - ### regularization - dropout: 0.1 # The dropout used in most of the Conformer Modules - dropout_pre_encoder: 0.1 # The dropout used before the encoder - dropout_emb: 0.0 # The dropout used for embeddings - dropout_att: 0.1 # The dropout for multi-headed attention modules - - # set to non-zero to enable stochastic depth - stochastic_depth_drop_prob: 0.0 - stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 1 - - decoder: - _target_: nemo.collections.asr.modules.ConvASRDecoder - feat_in: null - num_classes: -1 - vocabulary: [] - - # config for InterCTC loss: https://arxiv.org/abs/2102.03216 - # specify loss weights and which layers to use for InterCTC - # e.g., to reproduce the paper results, set loss_weights: [0.3] - # and apply_at_layers: [8] (assuming 18 layers). 
Note that final - # layer loss coefficient is automatically adjusted (to 0.7 in above example) - interctc: - loss_weights: [] - apply_at_layers: [] - - optim: - name: adamw - lr: 1e-3 - # optimizer arguments - betas: [0.9, 0.98] - # less necessity for weight_decay as we already have large augmentations with SpecAug - # you may need weight_decay for large models, stable AMP training, small datasets, or when lower augmentations are used - # weight decay of 0.0 with lr of 2.0 also works fine - weight_decay: 1e-3 - - # scheduler setup - sched: - name: CosineAnnealing - # scheduler config override - warmup_steps: 15000 - warmup_ratio: null - min_lr: 1e-6 - -trainer: - devices: -1 # number of GPUs, -1 would use all available GPUs - num_nodes: 1 - max_epochs: 1000 - max_steps: -1 # computed at runtime if not set - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - accelerator: auto - strategy: ddp - accumulate_grad_batches: 1 - gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. - log_every_n_steps: 10 # Interval of logging. - enable_progress_bar: True - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it - check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs - sync_batchnorm: true - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - benchmark: false # needs to be false for models with variable-length speech input as it slows down training - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - # in case of multiple validation sets, first one is used - monitor: "val_wer" - mode: "min" - save_top_k: 5 - always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints - - # you need to set these two to True to continue the training - resume_if_exists: false - resume_ignore_no_checkpoint: false - - # You may use this section to create a W&B logger - create_wandb_logger: false - wandb_logger_kwargs: - name: null - project: null diff --git a/examples/asr/conf/fastconformer/fast-conformer-long_transducer_bpe.yaml b/examples/asr/conf/fastconformer/fast-conformer-long_transducer_bpe.yaml deleted file mode 100644 index 50ad302eef45..000000000000 --- a/examples/asr/conf/fastconformer/fast-conformer-long_transducer_bpe.yaml +++ /dev/null @@ -1,265 +0,0 @@ -# It contains the default values for training a Fast Conformer-Transducer ASR model, large size (~120M) with Transducer loss and sub-word encoding. -# This version uses Longformer-style attention in order to handle longer audio - -# Architecture and training config: -# Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective -# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. 
- -# You may find more info about Fast Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer - -# Differences from baseline config are in -# model.encoder.global_tokens -# model.encoder.global_tokens_spacing -# model.encoder.global_attn_separate - -name: "FastConformer-Long-Transducer-BPE" - -model: - sample_rate: 16000 - compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. - log_prediction: true # enables logging sample predictions in the output during training - rnnt_reduction: 'mean_volume' - skip_nan_grad: false - - model_defaults: - enc_hidden: ${model.encoder.d_model} - pred_hidden: 640 - joint_hidden: 640 - - train_ds: - manifest_filepath: ??? - sample_rate: ${model.sample_rate} - batch_size: 16 # you may increase batch_size if your memory allows - shuffle: true - num_workers: 8 - pin_memory: true - use_start_end_token: false - trim_silence: false - max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset - min_duration: 0.1 - # tarred datasets - is_tarred: false - tarred_audio_filepaths: null - shuffle_n: 2048 - # bucketing params - bucketing_strategy: "fully_randomized" - bucketing_batch_size: null - - validation_ds: - manifest_filepath: ??? - sample_rate: ${model.sample_rate} - batch_size: 16 - shuffle: false - num_workers: 8 - pin_memory: true - use_start_end_token: false - - test_ds: - manifest_filepath: null - sample_rate: ${model.sample_rate} - batch_size: 16 - shuffle: false - num_workers: 8 - pin_memory: true - use_start_end_token: false - - # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py - tokenizer: - dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) - type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - sample_rate: ${model.sample_rate} - normalize: "per_feature" - window_size: 0.025 - window_stride: 0.01 - window: "hann" - features: 80 - n_fft: 512 - frame_splicing: 1 - dither: 0.00001 - pad_to: 0 - - spec_augment: - _target_: nemo.collections.asr.modules.SpectrogramAugmentation - freq_masks: 2 # set to zero to disable it - time_masks: 10 # set to zero to disable it - freq_width: 27 - time_width: 0.05 - - encoder: - _target_: nemo.collections.asr.modules.ConformerEncoder - feat_in: ${model.preprocessor.features} - feat_out: -1 # you may set it if you need different output size other than the default d_model - n_layers: 17 - d_model: 512 - - # Sub-sampling parameters - subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding - subsampling_factor: 8 # must be power of 2 for striding and vggnet - subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model - causal_downsampling: false - - # Reduction parameters: Can be used to add another subsampling layer at a given position. - # Having a 2x reduction will speedup the training and inference speech while keeping similar WER. - # Adding it at the end will give the best WER while adding it at the beginning will give the best speedup. 
- reduction: null # pooling, striding, or null - reduction_position: null # Encoder block index or -1 for subsampling at the end of encoder - reduction_factor: 1 - - # Feed forward module's params - ff_expansion_factor: 4 - - # Multi-headed Attention Module's params - self_attention_model: rel_pos_local_attn # longformer-style attention (sliding window + global tokens) - global_tokens: 1 # number of tokens that attend and are attended to by all tokens (put 0 to disable) - global_tokens_spacing: 1 # how far apart the global tokens are - global_attn_separate: false # whether global tokens should use separate q,k,v layers - n_heads: 8 # may need to be lower for smaller d_models - # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention - att_context_size: [128,128] # -1 means unlimited context - att_context_style: regular # regular or chunked_limited - xscaling: true # scales up the input embeddings by sqrt(d_model) - untie_biases: true # unties the biases of the TransformerXL layers - - # Convolution module's params - conv_kernel_size: 9 - conv_norm_type: 'batch_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) - # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size - # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] - conv_context_size: null - - ### regularization - dropout: 0.1 # The dropout used in most of the Conformer Modules - dropout_pre_encoder: 0.1 # The dropout used before the encoder - dropout_emb: 0.0 # The dropout used for embeddings - dropout_att: 0.1 # The dropout for multi-headed attention modules - - # set to non-zero to enable stochastic depth - stochastic_depth_drop_prob: 0.0 - stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 1 - - decoder: - _target_: nemo.collections.asr.modules.RNNTDecoder - normalization_mode: null # Currently only null is supported for export. - random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf - blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. - - prednet: - pred_hidden: ${model.model_defaults.pred_hidden} - pred_rnn_layers: 1 - t_max: null - dropout: 0.2 - - # if a large vocabulary size is desired, you may wish to use SampleRNNTJoint module - # _target_: nemo.collections.asr.modules.SampledRNNTJoint - # n_samples: 500 # Specifies the minimum number of tokens to sample from the vocabulary space, excluding - # the RNNT blank token. If a given value is larger than the entire vocabulary size, then the full - # vocabulary will be used - joint: - _target_: nemo.collections.asr.modules.RNNTJoint - log_softmax: null # 'null' would set it automatically according to CPU/GPU device - preserve_memory: false # dramatically slows down training, but might preserve some memory - - # Fuses the computation of prediction net + joint net + loss + WER calculation - # to be run on sub-batches of size `fused_batch_size`. - # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. - # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. - # Using small values here will preserve a lot of memory during training, but will make training slower as well. - # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. 
- # However, to preserve memory, this ratio can be 1:8 or even 1:16. - # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. - fuse_loss_wer: true - fused_batch_size: 16 - - jointnet: - joint_hidden: ${model.model_defaults.joint_hidden} - activation: "relu" - dropout: 0.2 - - decoding: - strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. - - # greedy strategy config - greedy: - max_symbols: 10 - - # beam strategy config - beam: - beam_size: 2 - return_best_hypothesis: False - score_norm: true - tsd_max_sym_exp: 50 # for Time Synchronous Decoding - alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding - - loss: - loss_name: "default" - - warprnnt_numba_kwargs: - # FastEmit regularization: https://arxiv.org/abs/2010.11148 - # You may enable FastEmit to reduce the latency of the model for streaming - fastemit_lambda: 0.0 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. - clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. - - # Adds Gaussian noise to the gradients of the decoder to avoid overfitting - variational_noise: - start_step: 0 - std: 0.0 - - optim: - name: adamw - lr: 2.5e-3 - # optimizer arguments - betas: [0.9, 0.98] - weight_decay: 1e-3 - - # scheduler setup - sched: - name: CosineAnnealing - # scheduler config override - warmup_steps: 15000 - warmup_ratio: null - min_lr: 1e-6 - -trainer: - devices: -1 # number of GPUs, -1 would use all available GPUs - num_nodes: 1 - max_epochs: 500 - max_steps: -1 # computed at runtime if not set - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - accelerator: auto - strategy: ddp - accumulate_grad_batches: 1 - gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. - log_every_n_steps: 10 # Interval of logging. - enable_progress_bar: True - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it - check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs - sync_batchnorm: true - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - benchmark: false # needs to be false for models with variable-length speech input as it slows down training - - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - # in case of multiple validation sets, first one is used - monitor: "val_wer" - mode: "min" - save_top_k: 5 - always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints - resume_if_exists: false - resume_ignore_no_checkpoint: false - - create_wandb_logger: false - wandb_logger_kwargs: - name: null - project: null
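
The two deleted configs differ from the baseline FastConformer configs only in the Longformer-style attention settings under model.encoder (self_attention_model, global_tokens, global_tokens_spacing, global_attn_separate, att_context_size). A minimal sketch of how those settings could be inspected or overridden once the files are re-added at their new location; the destination path is not part of this patch, so the path below is an assumption, as is the override shown:

    from omegaconf import OmegaConf

    # Hypothetical post-move location; this patch only shows the deletion, so
    # adjust the path to wherever the config actually lands in the repository.
    cfg = OmegaConf.load(
        "examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml"
    )

    enc = cfg.model.encoder

    # The settings that distinguish the long-audio variant from the baseline:
    print(enc.self_attention_model)   # rel_pos_local_attn: sliding-window (Longformer-style) attention
    print(enc.global_tokens)          # 1: one token attends to, and is attended by, all tokens
    print(enc.global_tokens_spacing)  # 1: spacing between global tokens
    print(enc.global_attn_separate)   # False: global tokens reuse the local q/k/v projections
    print(enc.att_context_size)       # [128, 128]: local attention window (left, right) per frame

    # Example override: widen the local window and disable the global token.
    enc.att_context_size = [256, 256]
    enc.global_tokens = 0

With window_stride: 0.01 and subsampling_factor: 8, one encoder frame covers 80 ms of audio, so att_context_size: [128, 128] corresponds to roughly 10 s of context on each side of a frame; widening the window trades memory for more context on long recordings.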