diff --git a/examples/aishell/whisper/README.md b/examples/aishell/whisper/README.md
index 8baf84625..be12f7cba 100644
--- a/examples/aishell/whisper/README.md
+++ b/examples/aishell/whisper/README.md
@@ -33,28 +33,52 @@ python local/filter_ckpt.py \
 ## Whisper-largev3 (conv1d2, full-parameter tuning) Result
+* Feature info: using log_mel_spectrogram feature, no cmvn
+* Training info: bf16, deepspeed stage1, activation checkpointing, batch dynamic12000, acc_grad 1, 8 * 3090 gpu, 30 epochs (about 16 hours), conf/finetune_whisper_largev3_onlyattn.yaml
+* Decoding info: ctc_weight 0.0, average_num 2
+* Git hash: TBD
+
+| decoding mode             | CER   |
+|---------------------------|-------|
+| attention decoder         | 2.57 % N=104765 C=102142 S=2529 D=94 I=74 |
+| ctc greedy search         | N/A   |
+| ctc prefix beam search    | N/A   |
+| attention rescoring       | N/A   |
+
 * Feature info: using log_mel_spectrogram feature, no cmvn, no speed perturb
-* Training info: bf16, deepspeed stage1, activation checkpointing, batch dynamic12000, acc_grad 4, 8 * 3090 gpu, 40 epochs (about 14 hours)
+* Training info: bf16, deepspeed stage1, activation checkpointing, batch dynamic12000, acc_grad 4, 8 * 3090 gpu, 40 epochs (about 14 hours), conf/finetune_whisper_largev3.yaml
 * Decoding info: ctc_weight 0.3, average_num 5
 * Git hash: TBD
 
 | decoding mode             | CER   |
 |---------------------------|-------|
-| attention decoder         | 4.06  |
-| ctc greedy search         | 8.33  |
-| ctc prefix beam search    | 8.34  |
-| attention rescoring       | 6.49  |
+| attention decoder         | 4.06 % N=104765 C=100643 S=4006 D=116 I=128 |
+| ctc greedy search         | 8.33 % N=104765 C=96781 S=7776 D=208 I=747 |
+| ctc prefix beam search    | 8.34 % N=104765 C=96787 S=7775 D=203 I=760 |
+| attention rescoring       | 6.49 % N=104765 C=98199 S=6427 D=139 I=237 |
 
 ## Whisper-largev3 (conv2d4, full-parameter tuning) Result
+* Feature info: using log_mel_spectrogram feature, no cmvn
+* Training info: bf16, deepspeed stage1, activation checkpointing, batch dynamic12000, acc_grad 1, 8 * 3090 gpu, 30 epochs (about 14 hours), conf/finetune_whisper_largev3_conv2d4_onlyattn.yaml
+* Decoding info: ctc_weight 0.0, average_num 2
+* Git hash: TBD
+
+| decoding mode             | CER   |
+|---------------------------|-------|
+| attention decoder         | 2.63 % N=104765 C=102088 S=2579 D=98 I=79 |
+| ctc greedy search         | N/A   |
+| ctc prefix beam search    | N/A   |
+| attention rescoring       | N/A   |
+
 * Feature info: using log_mel_spectrogram feature, no cmvn, no speed perturb
-* Training info: bf16, deepspeed stage1, activation checkpointing, batch dynamic12000, acc_grad 4, 8 * 3090 gpu, 40 epochs (about 10 hours)
+* Training info: bf16, deepspeed stage1, activation checkpointing, batch dynamic12000, acc_grad 4, 8 * 3090 gpu, 40 epochs (about 10 hours), conf/finetune_whisper_largev3_conv2d4.yaml
 * Decoding info: ctc_weight 0.3, average_num 5
 * Git hash: TBD
 
 | decoding mode             | CER   |
 |---------------------------|-------|
-| attention decoder         | 3.83  |
-| ctc greedy search         | 6.87  |
-| ctc prefix beam search    | 6.87  |
-| attention rescoring       | 5.33  |
+| attention decoder         | 3.83 % N=104765 C=100866 S=3784 D=115 I=109 |
+| ctc greedy search         | 6.87 % N=104765 C=98183 S=6408 D=174 I=620 |
+| ctc prefix beam search    | 6.87 % N=104765 C=98189 S=6402 D=174 I=619 |
+| attention rescoring       | 5.33 % N=104765 C=99354 S=5304 D=107 I=171 |
diff --git a/examples/aishell/whisper/conf/finetune_whisper_largev3_conv2d4_onlyattn.yaml b/examples/aishell/whisper/conf/finetune_whisper_largev3_conv2d4_onlyattn.yaml
new file mode 100644
index 000000000..61c8b61ac
--- /dev/null
+++ b/examples/aishell/whisper/conf/finetune_whisper_largev3_conv2d4_onlyattn.yaml
@@ -0,0 +1,119 @@
+encoder: transformer
+encoder_conf:
+  activation_type: gelu
+  attention_dropout_rate: 0.0
+  attention_heads: 20
+  dropout_rate: 0.0
+  gradient_checkpointing: true
+  input_layer: conv2d  # NOTE(xcsong): conv1d2, conv2d, conv2d8
+  key_bias: false
+  linear_units: 5120
+  normalize_before: true
+  num_blocks: 32
+  output_size: 1280
+  pos_enc_layer_type: abs_pos_whisper
+  positional_dropout_rate: 0.0
+  static_chunk_size: -1
+  use_dynamic_chunk: false
+  use_dynamic_left_chunk: false
+
+decoder: transformer
+decoder_conf:
+  activation_type: gelu
+  attention_heads: 20
+  dropout_rate: 0.0
+  gradient_checkpointing: true
+  input_layer: embed_learnable_pe
+  key_bias: false
+  linear_units: 5120
+  normalize_before: true
+  num_blocks: 32
+  positional_dropout_rate: 0.0
+  self_attention_dropout_rate: 0.0
+  src_attention: true
+  src_attention_dropout_rate: 0.0
+  tie_word_embedding: true
+  use_output_layer: true
+
+tokenizer: whisper
+tokenizer_conf:
+  bpe_path: null
+  is_multilingual: true
+  non_lang_syms_path: null
+  num_languages: 100
+  special_tokens:
+    eot: 50258
+    no_speech: 50363
+    no_timestamps: 50364
+    sot: 50258
+    sot_prev: 50362
+    timestamp_begin: 50365
+    transcribe: 50360
+    translate: 50359
+  split_with_space: false
+  symbol_table_path: null
+
+ctc: ctc
+ctc_conf:
+  ctc_blank_id: 50256
+
+cmvn: null
+cmvn_conf:
+  cmvn_file: null
+  is_json_cmvn: null
+
+model: whisper
+model_conf:
+  ctc_weight: 0.0
+  length_normalized_loss: false
+  lsm_weight: 0.1
+
+dataset: asr
+dataset_conf:
+  batch_conf:
+    batch_size: 26
+    batch_type: dynamic
+    max_frames_in_batch: 12000
+  feats_type: log_mel_spectrogram
+  filter_conf:
+    max_length: 3000
+    min_length: 0
+    token_max_length: 448
+    token_min_length: 1
+  log_mel_spectrogram_conf:
+    hop_length: 160
+    n_fft: 400
+    num_mel_bins: 128
+    padding: 0
+  resample_conf:
+    resample_rate: 16000
+  shuffle: true
+  shuffle_conf:
+    shuffle_size: 1500
+  sort: true
+  sort_conf:
+    sort_size: 500
+  spec_aug: true
+  spec_aug_conf:
+    max_f: 10
+    max_t: 50
+    num_f_mask: 2
+    num_t_mask: 2
+  spec_sub: true
+  spec_sub_conf:
+    max_t: 30
+    num_t_sub: 3
+  spec_trim: false
+  speed_perturb: true
+
+grad_clip: 5
+accum_grad: 1
+max_epoch: 30
+log_interval: 100
+
+optim: adam
+optim_conf:
+  lr: 0.00001
+scheduler: warmuplr
+scheduler_conf:
+  warmup_steps: 12000
diff --git a/examples/aishell/whisper/conf/finetune_whisper_largev3_onlyattn.yaml b/examples/aishell/whisper/conf/finetune_whisper_largev3_onlyattn.yaml
new file mode 100644
index 000000000..fd23e15d1
--- /dev/null
+++ b/examples/aishell/whisper/conf/finetune_whisper_largev3_onlyattn.yaml
@@ -0,0 +1,119 @@
+encoder: transformer
+encoder_conf:
+  activation_type: gelu
+  attention_dropout_rate: 0.0
+  attention_heads: 20
+  dropout_rate: 0.0
+  gradient_checkpointing: true
+  input_layer: conv1d2  # NOTE(xcsong): conv1d2, conv2d, conv2d8
+  key_bias: false
+  linear_units: 5120
+  normalize_before: true
+  num_blocks: 32
+  output_size: 1280
+  pos_enc_layer_type: abs_pos_whisper
+  positional_dropout_rate: 0.0
+  static_chunk_size: -1
+  use_dynamic_chunk: false
+  use_dynamic_left_chunk: false
+
+decoder: transformer
+decoder_conf:
+  activation_type: gelu
+  attention_heads: 20
+  dropout_rate: 0.0
+  gradient_checkpointing: true
+  input_layer: embed_learnable_pe
+  key_bias: false
+  linear_units: 5120
+  normalize_before: true
+  num_blocks: 32
+  positional_dropout_rate: 0.0
+  self_attention_dropout_rate: 0.0
+  src_attention: true
+  src_attention_dropout_rate: 0.0
+  tie_word_embedding: true
+  use_output_layer: true
+
+tokenizer: whisper
+tokenizer_conf:
+  bpe_path: null
+  is_multilingual: true
+  non_lang_syms_path: null
+  num_languages: 100
+  special_tokens:
+    eot: 50258
+    no_speech: 50363
+    no_timestamps: 50364
+    sot: 50258
+    sot_prev: 50362
+    timestamp_begin: 50365
+    transcribe: 50360
+    translate: 50359
+  split_with_space: false
+  symbol_table_path: null
+
+ctc: ctc
+ctc_conf:
+  ctc_blank_id: 50256
+
+cmvn: null
+cmvn_conf:
+  cmvn_file: null
+  is_json_cmvn: null
+
+model: whisper
+model_conf:
+  ctc_weight: 0.0
+  length_normalized_loss: false
+  lsm_weight: 0.1
+
+dataset: asr
+dataset_conf:
+  batch_conf:
+    batch_size: 26
+    batch_type: dynamic
+    max_frames_in_batch: 12000
+  feats_type: log_mel_spectrogram
+  filter_conf:
+    max_length: 3000
+    min_length: 0
+    token_max_length: 448
+    token_min_length: 1
+  log_mel_spectrogram_conf:
+    hop_length: 160
+    n_fft: 400
+    num_mel_bins: 128
+    padding: 0
+  resample_conf:
+    resample_rate: 16000
+  shuffle: true
+  shuffle_conf:
+    shuffle_size: 1500
+  sort: true
+  sort_conf:
+    sort_size: 500
+  spec_aug: true
+  spec_aug_conf:
+    max_f: 10
+    max_t: 50
+    num_f_mask: 2
+    num_t_mask: 2
+  spec_sub: true
+  spec_sub_conf:
+    max_t: 30
+    num_t_sub: 3
+  spec_trim: false
+  speed_perturb: true
+
+grad_clip: 5
+accum_grad: 1
+max_epoch: 30
+log_interval: 100
+
+optim: adam
+optim_conf:
+  lr: 0.00001
+scheduler: warmuplr
+scheduler_conf:
+  warmup_steps: 12000
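
Note on the patch above: the two new configs are identical apart from the encoder front-end (`input_layer: conv1d2`, 2x subsampling, vs `input_layer: conv2d`, 4x subsampling), and both train attention-only (`ctc_weight: 0.0`) with `accum_grad: 1`. The updated README tables now carry the full scoring line (N/C/S/D/I counts) next to each CER percentage. Assuming these counts follow the usual compute-wer convention (N = reference length, C = correct, S = substitutions, D = deletions, I = insertions, CER = (S + D + I) / N), the percentages can be recomputed with the minimal Python sketch below; the row labels are illustrative, not part of the patch.

```python
# Sanity-check the reported CERs from the N/C/S/D/I counts in the tables above.
# Assumption: CER = (S + D + I) / N, the usual compute-wer definition.
rows = {
    "conv1d2, attention-only (ctc_weight 0.0)": (104765, 2529, 94, 74),    # reported 2.57 %
    "conv1d2, joint CTC (ctc_weight 0.3)":      (104765, 4006, 116, 128),  # reported 4.06 %
    "conv2d4, attention-only (ctc_weight 0.0)": (104765, 2579, 98, 79),    # reported 2.63 %
    "conv2d4, joint CTC (ctc_weight 0.3)":      (104765, 3784, 115, 109),  # reported 3.83 %
}
for name, (n, s, d, i) in rows.items():
    print(f"{name}: CER = {100.0 * (s + d + i) / n:.2f} %")
```

All four attention-decoder rows round back to the reported 2.57 / 4.06 / 2.63 / 3.83 %, so the counts and percentages in the tables are internally consistent.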