Commit
updated batch sizes and added fastconformer ctc streaming configs.
Signed-off-by: Vahid <[email protected]>
Showing 13 changed files with 456 additions and 198 deletions.
examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml (198 additions, 0 deletions)
@@ -0,0 +1,198 @@
# This config contains the default values for training a cache-aware streaming FastConformer-CTC ASR model, large size (~115M), with sub-word encoding.

# You may find more details on:
# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer
# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer
# FastConformer-CTC's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml

name: "FastConformer-CTC-BPE-Streaming"

model:
  sample_rate: 16000
  log_prediction: true # enables logging sample predictions in the output during training
  ctc_reduction: 'mean_batch'
  skip_nan_grad: false

  train_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 16 # you may increase batch_size if your memory allows
    shuffle: true
    num_workers: 8
    pin_memory: true
    max_duration: 20 # you may need to update it for your dataset
    min_duration: 0.1
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null
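    # For reference, each line of the manifest is expected to be a JSON object; an illustrative entry (paths and values below are placeholders):
    # {"audio_filepath": "/data/audio/utt0001.wav", "duration": 7.42, "text": "the transcript of the utterance"}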

  validation_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 16
    shuffle: false
    use_start_end_token: false
    num_workers: 8
    pin_memory: true

  test_ds:
    manifest_filepath: null
    sample_rate: ${model.sample_rate}
    batch_size: 16
    shuffle: false
    use_start_end_token: false
    num_workers: 8
    pin_memory: true

  # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py
  # We recommend using a vocab size of 1024 with SPE unigram for most languages
  tokenizer:
    dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe)
    type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer)
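  # Purely as an illustration (the exact flags should be verified against the script's --help), a SentencePiece
  # unigram tokenizer could be built with something like:
  #   python scripts/tokenizers/process_asr_text_tokenizer.py --manifest=<train manifest> --data_root=<output dir> \
  #     --vocab_size=1024 --tokenizer=spe --spe_type=unigram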

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: ${model.sample_rate}
    normalize: "NA" # disabling normalization of the mel-spectrogram makes streaming easier
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: 80
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001
    pad_to: 0

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    freq_masks: 2 # set to zero to disable it
    time_masks: 10 # set to zero to disable it
    freq_width: 27
    time_width: 0.05

  encoder:
    _target_: nemo.collections.asr.modules.ConformerEncoder
    feat_in: ${model.preprocessor.features}
    feat_out: -1 # you may set it if you need an output size different from the default d_model
    n_layers: 17
    d_model: 512

    # Sub-sampling parameters
    subsampling: dw_striding # vggnet, striding, stacking, stacking_norm, or dw_striding
    subsampling_factor: 8 # must be a power of 2 for striding and vggnet
    subsampling_conv_channels: 256 # set to -1 to make it equal to d_model
    causal_downsampling: true
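    # With a window_stride of 0.01s and 8x subsampling, each encoder step corresponds to 8*0.01 = 0.08s of audio;
    # the attention and convolution context sizes below are measured in these 80ms steps.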

    # Feed forward module's params
    ff_expansion_factor: 4

    # Multi-headed Attention Module's params
    self_attention_model: rel_pos # rel_pos or abs_pos
    n_heads: 8 # may need to be lower for smaller d_models

    # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
    # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3, as stacking multiple layers may make the effective right context too large
    # for att_context_style=chunked_limited, the left context needs to be divisible by the right context plus one
    # for chunked_limited you may calculate the look-ahead or right context with the following formula:
    # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s
    att_context_size: [70, 13] # -1 means unlimited context
    att_context_style: chunked_limited # regular or chunked_limited
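    # Worked out for the values above: 70 % (13+1) == 0, so the divisibility constraint is satisfied;
    # the attention look-ahead is 13*8*0.01 = 1.04s, and the left context spans about 70*8*0.01 = 5.6s of audio.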

    xscaling: true # scales up the input embeddings by sqrt(d_model)
    pos_emb_max_len: 5000

    # Convolution module's params
    conv_kernel_size: 9
    conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups)

    # conv_context_size can be "causal" or a list of two integers where conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size
    # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0]
    # Causal convolutions are recommended, as non-causal ones would increase the effective right context and therefore the look-ahead significantly
    conv_context_size: causal
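    # With conv_kernel_size: 9, 'causal' corresponds to [8, 0], i.e. no look-ahead from the convolution modules;
    # the default null would instead give [4, 4], adding 4 steps (~0.32s) of right context per conv layer.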

    ### regularization
    dropout: 0.1 # The dropout used in most of the Conformer Modules
    dropout_pre_encoder: 0.1 # The dropout used before the encoder
    dropout_emb: 0.0 # The dropout used for embeddings
    dropout_att: 0.1 # The dropout for multi-headed attention modules

    # set to non-zero to enable stochastic depth
    stochastic_depth_drop_prob: 0.0
    stochastic_depth_mode: linear # linear or uniform
    stochastic_depth_start_layer: 1

  decoder:
    _target_: nemo.collections.asr.modules.ConvASRDecoder
    feat_in: null
    num_classes: -1
    vocabulary: []

  # config for InterCTC loss: https://arxiv.org/abs/2102.03216
  # specify loss weights and which layers to use for InterCTC
  # e.g., to reproduce the paper results, set loss_weights: [0.3]
  # and apply_at_layers: [8] (assuming 18 layers). Note that the final
  # layer loss coefficient is automatically adjusted (to 0.7 in the above example)
  interctc:
    loss_weights: []
    apply_at_layers: []

  optim:
    name: adamw
    lr: 2.0
    # optimizer arguments
    betas: [0.9, 0.98]
    weight_decay: 1e-3

    # scheduler setup
    sched:
      name: NoamAnnealing
      d_model: ${model.encoder.d_model}
      # scheduler config override
      warmup_steps: 10000
      warmup_ratio: null
      min_lr: 1e-6
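      # For reference, assuming the standard Noam schedule (lr * d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)),
      # the peak learning rate here would be about 2.0 / (sqrt(512) * sqrt(10000)) ≈ 8.8e-4, reached around step 10000.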

trainer:
  devices: -1 # number of GPUs, -1 would use all available GPUs
  num_nodes: 1
  max_epochs: 1000
  max_steps: -1 # computed at runtime if not set
  val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  accelerator: auto
  strategy: ddp
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  precision: 32 # 16, 32, or bf16
  log_every_n_steps: 10 # Interval of logging.
  enable_progress_bar: True
  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
  num_sanity_val_steps: 0 # number of validation steps to run as a sanity check before training starts; setting it to 0 disables it
  check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs
  sync_batchnorm: true
  enable_checkpointing: False # Provided by exp_manager
  logger: false # Provided by exp_manager
  benchmark: false # needs to be false for models with variable-length speech input as it slows down training
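  # For reference: the effective global batch size is model.train_ds.batch_size * devices * num_nodes * accumulate_grad_batches;
  # for example, 16 * 8 * 1 * 1 = 128 when training on a single node with 8 GPUs and the defaults above.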

exp_manager:
  exp_dir: null
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    # in case of multiple validation sets, first one is used
    monitor: "val_wer"
    mode: "min"
    save_top_k: 5
    always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints
  resume_if_exists: false
  resume_ignore_no_checkpoint: false

  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null
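As a quick sanity check of the config above, the resolved values can be inspected with OmegaConf before launching training. The snippet below is only a minimal sketch: it assumes the YAML is saved locally as fastconformer_ctc_bpe_streaming.yaml, that the omegaconf package is installed, and that the manifest and tokenizer paths (placeholders here) are overridden; the computed look-ahead should match the 1.04s noted in the encoder comments.

from omegaconf import OmegaConf

# Load the YAML config shown in this diff (the local path is illustrative).
cfg = OmegaConf.load("fastconformer_ctc_bpe_streaming.yaml")

# Fill in the mandatory ??? fields before use (these paths are placeholders).
cfg.model.train_ds.manifest_filepath = "/data/train_manifest.json"
cfg.model.validation_ds.manifest_filepath = "/data/dev_manifest.json"
cfg.model.tokenizer.dir = "/data/tokenizer_spe_unigram_v1024"

enc = cfg.model.encoder
# look-ahead(secs) = att_context_size[1] * subsampling_factor * window_stride
lookahead_s = enc.att_context_size[1] * enc.subsampling_factor * cfg.model.preprocessor.window_stride
print(f"chunk look-ahead: {lookahead_s:.2f}s")  # expected: 1.04s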