Commit
Signed-off-by: Vahid <[email protected]>
Showing 8 changed files with 2,154 additions and 0 deletions.
267 additions & 0 deletions: .../asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml

@@ -0,0 +1,267 @@
# This config contains the default values for training a cache-aware streaming FastConformer-Transducer ASR model, large size (~115M), with sub-word encoding.

# You may find more info about FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer
# You may find more info about cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer
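
# A minimal launch sketch (the script path below is an assumption and may differ across NeMo
# versions; any standard NeMo transducer training script accepting Hydra overrides works the same way):
#   python examples/asr/asr_transducer/speech_to_text_rnnt_bpe.py \
#     --config-path=<dir containing this config> --config-name=fastconformer_transducer_bpe_streaming \
#     model.train_ds.manifest_filepath=<train manifest> \
#     model.validation_ds.manifest_filepath=<val manifest> \
#     model.tokenizer.dir=<tokenizer dir>
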
# We suggest using trainer.precision=bf16 for GPUs that support it; otherwise trainer.precision=16 is recommended.
# Using bf16 or 16 makes it possible to double the batch size and speed up training/inference. If fp16 is not stable and the model diverges after some epochs, you may use fp32.
# Here are the suggested batch sizes per GPU for each precision and GPU memory size:

# +-----------+------------+------------+
# | Precision | GPU Memory | Batch Size |
# +===========+============+============+
# | 32        | 16GB       |  8         |
# |           | 32GB       | 16         |
# |           | 80GB       | 32         |
# +-----------+------------+------------+
# | fp16 or   | 16GB       | 16         |
# | bf16      | 32GB       | 32         |
# |           | 80GB       | 64         |
# +-----------+------------+------------+
# Note: these are based on the assumption of max_duration=20. If your max_duration is longer or shorter, the batch sizes may need to be updated accordingly.

# Default learning parameters in this config are set for a global batch size of 2K, though you may use lower values.
# To increase the global batch size with a limited number of GPUs, you may use a higher accumulate_grad_batches.
# However, it is better to avoid accumulate_grad_batches as long as the global batch size is large enough and training is stable.
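# For reference, global batch size = num_gpus * num_nodes * batch_size * accumulate_grad_batches.
# A hypothetical setup reaching ~2K: 8 GPUs x 1 node x batch_size 32 (80GB, bf16) x accumulate_grad_batches 8 = 2048.
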
name: "FastConformer-Transducer-BPE-Streaming" | ||
|
||
model: | ||
sample_rate: 16000 | ||
compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. | ||
log_prediction: true # enables logging sample predictions in the output during training | ||
skip_nan_grad: false | ||
|
||
model_defaults: | ||
enc_hidden: ${model.encoder.d_model} | ||
pred_hidden: 640 | ||
joint_hidden: 640 | ||
|
||
train_ds: | ||
manifest_filepath: ??? | ||
sample_rate: ${model.sample_rate} | ||
batch_size: 16 # you may increase batch_size if your memory allows | ||
shuffle: true | ||
num_workers: 8 | ||
pin_memory: true | ||
max_duration: 20 # you may need to update it for your dataset | ||
min_duration: 0.1 | ||
# tarred datasets | ||
is_tarred: false | ||
tarred_audio_filepaths: null | ||
shuffle_n: 2048 | ||
# bucketing params | ||
bucketing_strategy: "synced_randomized" | ||
bucketing_batch_size: null | ||
|
||
validation_ds: | ||
manifest_filepath: ??? | ||
sample_rate: ${model.sample_rate} | ||
batch_size: 16 | ||
shuffle: false | ||
use_start_end_token: false | ||
num_workers: 8 | ||
pin_memory: true | ||
|
||
test_ds: | ||
manifest_filepath: null | ||
sample_rate: ${model.sample_rate} | ||
batch_size: 16 | ||
shuffle: false | ||
use_start_end_token: false | ||
num_workers: 8 | ||
pin_memory: true | ||
|
||
  # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py
  # We recommend using a vocab size of 1024 with SPE Unigram for most languages
  tokenizer:
    dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (wpe)
    type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer)
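
  # A sketch of building such a tokenizer with the script above. The flag names are assumptions
  # and may differ across NeMo versions; check the script's --help before running:
  #   python scripts/tokenizers/process_asr_text_tokenizer.py \
  #     --manifest=<train manifest(s)> --data_root=<output dir> \
  #     --vocab_size=1024 --tokenizer=spe --spe_type=unigram
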
  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: ${model.sample_rate}
    normalize: "NA" # No normalization of the mel-spectrogram makes streaming easier
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: 80
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001
    pad_to: 0

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    freq_masks: 2 # set to zero to disable it
    time_masks: 10 # set to zero to disable it
    freq_width: 27
    time_width: 0.05

  encoder:
    _target_: nemo.collections.asr.modules.ConformerEncoder
    feat_in: ${model.preprocessor.features}
    feat_out: -1 # set it if you need an output size different from the default d_model
    n_layers: 17
    d_model: 512

    # Sub-sampling parameters
    subsampling: dw_striding # vggnet, striding, stacking, stacking_norm, or dw_striding
    subsampling_factor: 8 # must be a power of 2 for striding and vggnet
    subsampling_conv_channels: 256 # set to -1 to make it equal to d_model
    causal_downsampling: true
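    # With subsampling_factor=8 and window_stride=0.01 above, each encoder frame covers 8*0.01 = 80ms of audio.
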
    # Feed forward module's params
    ff_expansion_factor: 4

    # Multi-headed Attention Module's params
    self_attention_model: rel_pos # rel_pos or abs_pos
    n_heads: 8 # may need to be lower for smaller d_models

    # [left, right] specifies the number of steps to be seen from the left and right of each step in self-attention
    # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3, as multiple layers may make the effective right context too large
    # for att_context_style=chunked_limited, the left context needs to be divisible by the right context plus one
    # for chunked_limited you may calculate the look-ahead or right context with the following formula:
    # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s
    att_context_size: [70, 13] # -1 means unlimited context
    att_context_style: chunked_limited # regular or chunked_limited
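    # Sanity check on the values above: the left context 70 is divisible by 13+1=14 (70 = 5*14),
    # and the look-ahead is 13*8*0.01 = 1.04s.
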
    xscaling: true # scales up the input embeddings by sqrt(d_model)
    pos_emb_max_len: 5000

    # Convolution module's params
    conv_kernel_size: 9
    conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups)

    # conv_context_size can be "causal" or a list of two integers such that conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size
    # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0]
    # Causal convolutions are recommended, as non-causal ones would increase the effective right context and therefore the look-ahead significantly
    conv_context_size: causal
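    # With conv_kernel_size=9 above, 'causal' gives [8, 0] (all context from the left),
    # while null would give [4, 4], i.e. 4 frames of right context per conv layer.
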
    ### regularization
    dropout: 0.1 # The dropout used in most of the Conformer Modules
    dropout_pre_encoder: 0.1 # The dropout used before the encoder
    dropout_emb: 0.0 # The dropout used for embeddings
    dropout_att: 0.1 # The dropout for multi-headed attention modules

    # set to non-zero to enable stochastic depth
    stochastic_depth_drop_prob: 0.0
    stochastic_depth_mode: linear # linear or uniform
    stochastic_depth_start_layer: 1

  decoder:
    _target_: nemo.collections.asr.modules.RNNTDecoder
    normalization_mode: null # Currently only null is supported for export.
    random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf
    blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference.

    prednet:
      pred_hidden: ${model.model_defaults.pred_hidden}
      pred_rnn_layers: 1
      t_max: null
      dropout: 0.2

  joint:
    _target_: nemo.collections.asr.modules.RNNTJoint
    log_softmax: null # 'null' would set it automatically according to CPU/GPU device
    preserve_memory: false # dramatically slows down training, but might preserve some memory

    # Fuses the computation of prediction net + joint net + loss + WER calculation
    # to be run on sub-batches of size `fused_batch_size`.
    # When this flag is set to true, consider the `batch_size` of *_ds to be just the `encoder` batch size.
    # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss.
    # Using small values here will preserve a lot of memory during training, but will also make training slower.
    # The optimal ratio of fused_batch_size : *_ds.batch_size is 1:1.
    # However, to preserve memory, this ratio can be 1:8 or even 1:16.
    # The extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training would become very slow.
    fuse_loss_wer: true
    fused_batch_size: 4
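    # With the values here (train_ds.batch_size=16, fused_batch_size=4, a 1:4 ratio), each step runs
    # the encoder on 16 utterances and the prediction net, joint net and loss on 4 sub-batches of 4.
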
    jointnet:
      joint_hidden: ${model.model_defaults.joint_hidden}
      activation: "relu"
      dropout: 0.2

  decoding:
    strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd

    # greedy strategy config
    greedy:
      max_symbols: 10

    # beam strategy config
    beam:
      beam_size: 2
      return_best_hypothesis: false
      score_norm: true
      tsd_max_sym_exp: 50 # for Time Synchronous Decoding
      alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding

  # config for InterCTC loss: https://arxiv.org/abs/2102.03216
  # specify loss weights and which layers to use for InterCTC
  # e.g., to reproduce the paper results, set loss_weights: [0.3]
  # and apply_at_layers: [8] (assuming 18 layers). Note that the final
  # layer loss coefficient is automatically adjusted (to 0.7 in the above example)
  interctc:
    loss_weights: []
    apply_at_layers: []

  loss:
    loss_name: "default"

  optim:
    name: adamw
    lr: 5.0
    # optimizer arguments
    betas: [0.9, 0.98]
    weight_decay: 1e-3

    # scheduler setup
    sched:
      name: NoamAnnealing
      d_model: ${model.encoder.d_model}
      # scheduler config override
      warmup_steps: 10000
      warmup_ratio: null
      min_lr: 1e-6
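
# Note: with NoamAnnealing, lr above acts as a multiplier rather than the actual learning rate.
# Assuming the usual Noam schedule, lr * d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5),
# the peak LR here would be roughly 5.0 * 512^-0.5 * 10000^-0.5 ≈ 2.2e-3, reached at step 10000.
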
trainer:
  devices: -1 # number of GPUs, -1 would use all available GPUs
  num_nodes: 1
  max_epochs: 1000
  max_steps: -1 # computed at runtime if not set
  val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for a number of iterations
  accelerator: auto
  strategy: ddp
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  precision: 32 # 16, 32, or bf16
  log_every_n_steps: 10 # Interval of logging.
  enable_progress_bar: True
  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
  num_sanity_val_steps: 0 # number of validation steps to run as a sanity check before training starts; set to 0 to disable it
  check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs
  sync_batchnorm: true
  enable_checkpointing: False # Provided by exp_manager
  logger: false # Provided by exp_manager
  benchmark: false # needs to be false for models with variable-length speech input as it slows down training

exp_manager:
  exp_dir: null
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    # in case of multiple validation sets, the first one is used
    monitor: "val_wer"
    mode: "min"
    save_top_k: 5
    always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints
  resume_if_exists: false
  resume_ignore_no_checkpoint: false

  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null