Skip to content

Commit

Permalink
added confs.
Browse files Browse the repository at this point in the history
Signed-off-by: Vahid <[email protected]>
  • Loading branch information
VahidooX committed Apr 17, 2023
1 parent 3352d1f commit 932c618
Show file tree
Hide file tree
Showing 8 changed files with 2,154 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,267 @@
# This file contains the default values for training a cache-aware streaming FastConformer-Transducer ASR model, large size (~115M parameters), with sub-word encoding.

# You may find more info about FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer
# You may find more info about Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer

# We suggest using trainer.precision=bf16 on GPUs that support it; otherwise trainer.precision=16 is recommended.
# Using bf16 or 16 makes it possible to double the batch size and speed up training/inference. If fp16 is not stable and the model diverges after some epochs, you may use fp32.
# Here are the suggested batch sizes per GPU for each precision and GPU memory size:

# +-----------+------------+------------+
# | Precision | GPU Memory | Batch Size |
# +===========+============+============+
# | 32        |    16GB    |     8      |
# |           |    32GB    |     16     |
# |           |    80GB    |     32     |
# +-----------+------------+------------+
# | fp16 or   |    16GB    |     16     |
# | bf16      |    32GB    |     32     |
# |           |    80GB    |     64     |
# +-----------+------------+------------+
# Note: These batch sizes assume a max_duration of 20. If your max_duration is longer or shorter, the batch sizes may need to be adjusted accordingly.

# The default learning parameters in this config are set for a global batch size of 2K, although you may use lower values.
# To increase the global batch size with a limited number of GPUs, you may use a higher accumulate_grad_batches.
# However, it is better to avoid accumulate_grad_batches as long as the global batch size is large enough and training is stable.

# Experiment name; exp_manager.name picks it up through the ${name} interpolation.
name: 'FastConformer-Transducer-BPE-Streaming'

# Model section: datasets, tokenizer, feature extraction, encoder, RNNT decoder/joint,
# decoding, auxiliary InterCTC loss, loss and optimizer.
# Nesting matters: the ${model.*} interpolations used below only resolve when these
# sections are children of `model`.
model:
  sample_rate: 16000
  # Eval samples can be very long and exhaust memory.
  # Disable computation of transducer loss during validation/testing with this flag.
  compute_eval_loss: false
  log_prediction: true  # enables logging sample predictions in the output during training
  skip_nan_grad: false

  # Hidden sizes shared by the decoder (prednet) and joint network via interpolation.
  model_defaults:
    enc_hidden: ${model.encoder.d_model}
    pred_hidden: 640
    joint_hidden: 640

  train_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 16  # you may increase batch_size if your memory allows
    shuffle: true
    num_workers: 8
    pin_memory: true
    max_duration: 20  # you may need to update it for your dataset
    min_duration: 0.1
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  validation_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 16
    shuffle: false
    use_start_end_token: false
    num_workers: 8
    pin_memory: true

  test_ds:
    manifest_filepath: null
    sample_rate: ${model.sample_rate}
    batch_size: 16
    shuffle: false
    use_start_end_token: false
    num_workers: 8
    pin_memory: true

  # You may find more detail on how to train a tokenizer at:
  # scripts/tokenizers/process_asr_text_tokenizer.py (path relative to the NeMo repository root)
  # We recommend using a vocab size of 1024 with SPE Unigram for most languages
  tokenizer:
    dir: ???  # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe)
    type: bpe  # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer)

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: ${model.sample_rate}
    normalize: "NA"  # No normalization for mel-spectrogram makes streaming easier
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: 80
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001
    pad_to: 0

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    freq_masks: 2  # set to zero to disable it
    time_masks: 10  # set to zero to disable it
    freq_width: 27
    time_width: 0.05

  encoder:
    _target_: nemo.collections.asr.modules.ConformerEncoder
    feat_in: ${model.preprocessor.features}
    feat_out: -1  # you may set it if you need different output size other than the default d_model
    n_layers: 17
    d_model: 512

    # Sub-sampling parameters
    subsampling: dw_striding  # vggnet, striding, stacking or stacking_norm, dw_striding
    subsampling_factor: 8  # must be power of 2 for striding and vggnet
    subsampling_conv_channels: 256  # set to -1 to make it equal to the d_model
    causal_downsampling: true

    # Feed forward module's params
    ff_expansion_factor: 4

    # Multi-headed Attention Module's params
    self_attention_model: rel_pos  # rel_pos or abs_pos
    n_heads: 8  # may need to be lower for smaller d_models

    # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
    # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3,
    # as multiple layers may make the effective right context too large
    # for att_context_style=chunked_limited, the left context needs to be divisible by the right context plus one
    # for chunked_limited you may calculate the look-ahead or right context by the following formula:
    # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s
    att_context_size: [70, 13]  # -1 means unlimited context
    att_context_style: chunked_limited  # regular or chunked_limited

    xscaling: true  # scales up the input embeddings by sqrt(d_model)
    pos_emb_max_len: 5000

    # Convolution module's params
    conv_kernel_size: 9
    conv_norm_type: 'layer_norm'  # batch_norm or layer_norm or groupnormN (N specifies the number of groups)

    # conv_context_size can be "causal" or a list of two integers
    # where conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size
    # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0]
    # Causal convolutions are recommended, as non-causal ones would increase the effective right context
    # and therefore the look-ahead significantly
    conv_context_size: causal

    ### regularization
    dropout: 0.1  # The dropout used in most of the Conformer Modules
    dropout_pre_encoder: 0.1  # The dropout used before the encoder
    dropout_emb: 0.0  # The dropout used for embeddings
    dropout_att: 0.1  # The dropout for multi-headed attention modules

    # set to non-zero to enable stochastic depth
    stochastic_depth_drop_prob: 0.0
    stochastic_depth_mode: linear  # linear or uniform
    stochastic_depth_start_layer: 1

  decoder:
    _target_: nemo.collections.asr.modules.RNNTDecoder
    normalization_mode: null  # Currently only null is supported for export.
    random_state_sampling: false  # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf
    blank_as_pad: true  # This flag must be set in order to support exporting of RNNT models + efficient inference.

    prednet:
      pred_hidden: ${model.model_defaults.pred_hidden}
      pred_rnn_layers: 1
      t_max: null
      dropout: 0.2

  joint:
    _target_: nemo.collections.asr.modules.RNNTJoint
    log_softmax: null  # 'null' would set it automatically according to CPU/GPU device
    preserve_memory: false  # dramatically slows down training, but might preserve some memory

    # Fuses the computation of prediction net + joint net + loss + WER calculation
    # to be run on sub-batches of size `fused_batch_size`.
    # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size.
    # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss.
    # Using small values here will preserve a lot of memory during training, but will make training slower as well.
    # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1.
    # However, to preserve memory, this ratio can be 1:8 or even 1:16.
    # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow.
    fuse_loss_wer: true
    fused_batch_size: 4

    jointnet:
      joint_hidden: ${model.model_defaults.joint_hidden}
      activation: "relu"
      dropout: 0.2

  decoding:
    strategy: "greedy_batch"  # can be greedy, greedy_batch, beam, tsd, alsd.

    # greedy strategy config
    greedy:
      max_symbols: 10

    # beam strategy config
    beam:
      beam_size: 2
      return_best_hypothesis: false
      score_norm: true
      tsd_max_sym_exp: 50  # for Time Synchronous Decoding
      alsd_max_target_len: 2.0  # for Alignment-Length Synchronous Decoding

  # config for InterCTC loss: https://arxiv.org/abs/2102.03216
  # specify loss weights and which layers to use for InterCTC
  # e.g., to reproduce the paper results, set loss_weights: [0.3]
  # and apply_at_layers: [8] (assuming 18 layers). Note that final
  # layer loss coefficient is automatically adjusted (to 0.7 in above example)
  interctc:
    loss_weights: []
    apply_at_layers: []

  loss:
    loss_name: "default"

  optim:
    name: adamw
    lr: 5.0
    # optimizer arguments
    betas: [0.9, 0.98]
    # written with an explicit mantissa dot so YAML 1.1 loaders also read it as a float
    weight_decay: 1.0e-3

    # scheduler setup
    sched:
      name: NoamAnnealing
      d_model: ${model.encoder.d_model}
      # scheduler config override
      warmup_steps: 10000
      warmup_ratio: null
      min_lr: 1.0e-6

# Trainer settings: device/node counts, run length, precision, validation cadence.
# Checkpointing and logging are turned off here because exp_manager provides them.
trainer:
  devices: -1  # number of GPUs, -1 would use all available GPUs
  num_nodes: 1
  max_epochs: 1000
  max_steps: -1  # computed at runtime if not set
  val_check_interval: 1.0  # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  accelerator: auto
  strategy: ddp
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  precision: 32  # 16, 32, or bf16
  log_every_n_steps: 10  # Interval of logging.
  enable_progress_bar: true
  # The path to a checkpoint file to continue the training;
  # restores the whole state including the epoch, step, LR schedulers, apex, etc.
  resume_from_checkpoint: null
  # number of validation steps to run as a sanity check of the validation process
  # before starting the training; setting to 0 disables it
  num_sanity_val_steps: 0
  check_val_every_n_epoch: 1  # number of evaluations on validation every n epochs
  sync_batchnorm: true
  enable_checkpointing: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  benchmark: false  # needs to be false for models with variable-length speech input as it slows down training


# Experiment manager settings: output directory, loggers, checkpoint callback and resume behavior.
exp_manager:
  exp_dir: null
  name: ${name}  # reuses the top-level experiment name
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    # in case of multiple validation sets, first one is used
    monitor: "val_wer"
    mode: "min"
    save_top_k: 5
    always_save_nemo: true  # saves the checkpoints as nemo files instead of PTL checkpoints
  resume_if_exists: false
  resume_ignore_no_checkpoint: false

  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null
Loading

0 comments on commit 932c618

Please sign in to comment.