
[examples] update whisper results on aishell-1 #2313

Merged: 1 commit, Jan 22, 2024
44 changes: 34 additions & 10 deletions examples/aishell/whisper/README.md
@@ -33,28 +33,52 @@ python local/filter_ckpt.py \

## Whisper-largev3 (conv1d2, full-parameter tuning) Result

* Feature info: using log_mel_spectrogram feature, no cmvn
* Training info: bf16, deepspeed stage1, activation checkpointing, batch dynamic12000, acc_grad 1, 8 * 3090 gpu, 30 epochs (about 16 hours), conf/finetune_whisper_largev3_onlyattn.yaml
* Decoding info: ctc_weight 0.0, average_num 2
* Git hash: TBD

| decoding mode | CER |
|---------------------------|-------|
| attention decoder | 2.57 % N=104765 C=102142 S=2529 D=94 I=74 |
| ctc greedy search | N/A |
| ctc prefix beam search | N/A |
| attention rescoring | N/A |
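
Each result row follows the usual WeNet scoring summary: N reference characters, of which C were recognized correctly, with S substitutions, D deletions, and I insertions; CER is (S + D + I) / N. A quick sanity check of the attention-decoder row above:

```python
def cer(n: int, s: int, d: int, i: int) -> float:
    """Character error rate as a percentage: substitutions, deletions
    and insertions over the number of reference characters."""
    return 100.0 * (s + d + i) / n

# Attention-decoder row above: N=104765, C=102142, S=2529, D=94, I=74.
assert 102142 + 2529 + 94 == 104765  # C + S + D accounts for every reference char
print(round(cer(104765, 2529, 94, 74), 2))  # -> 2.57
```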

* Feature info: using log_mel_spectrogram feature, no cmvn, no speed perturb
* Training info: bf16, deepspeed stage1, activation checkpointing, batch dynamic12000, acc_grad 4, 8 * 3090 gpu, 40 epochs (about 14 hours), conf/finetune_whisper_largev3.yaml
* Decoding info: ctc_weight 0.3, average_num 5
* Git hash: TBD

| decoding mode | CER |
|---------------------------|-------|
| attention decoder | 4.06 % N=104765 C=100643 S=4006 D=116 I=128 |
| ctc greedy search | 8.33 % N=104765 C=96781 S=7776 D=208 I=747 |
| ctc prefix beam search | 8.34 % N=104765 C=96787 S=7775 D=203 I=760 |
| attention rescoring | 6.49 % N=104765 C=98199 S=6427 D=139 I=237 |
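
`average_num 5` means decoding uses an element-wise average of the parameters of 5 saved checkpoints (in WeNet this is handled by `wenet/bin/average_model.py`). A minimal sketch of the averaging step itself, with plain floats standing in for torch tensors:

```python
def average_checkpoints(state_dicts):
    """Element-wise mean over a list of model state dicts.
    Floats stand in for torch tensors in this sketch."""
    avg = {}
    for key in state_dicts[0]:
        avg[key] = sum(sd[key] for sd in state_dicts) / len(state_dicts)
    return avg

ckpts = [{"w": 1.0}, {"w": 2.0}, {"w": 3.0}]
print(average_checkpoints(ckpts))  # -> {'w': 2.0}
```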

## Whisper-largev3 (conv2d4, full-parameter tuning) Result

* Feature info: using log_mel_spectrogram feature, no cmvn
* Training info: bf16, deepspeed stage1, activation checkpointing, batch dynamic12000, acc_grad 1, 8 * 3090 gpu, 30 epochs (about 14 hours), conf/finetune_whisper_largev3_conv2d4_onlyattn.yaml
* Decoding info: ctc_weight 0.0, average_num 2
* Git hash: TBD

| decoding mode | CER |
|---------------------------|-------|
| attention decoder | 2.63 % N=104765 C=102088 S=2579 D=98 I=79 |
| ctc greedy search | N/A |
| ctc prefix beam search | N/A |
| attention rescoring | N/A |

* Feature info: using log_mel_spectrogram feature, no cmvn, no speed perturb
* Training info: bf16, deepspeed stage1, activation checkpointing, batch dynamic12000, acc_grad 4, 8 * 3090 gpu, 40 epochs (about 10 hours), conf/finetune_whisper_largev3_conv2d4.yaml
* Decoding info: ctc_weight 0.3, average_num 5
* Git hash: TBD

| decoding mode | CER |
|---------------------------|-------|
| attention decoder | 3.83 % N=104765 C=100866 S=3784 D=115 I=109 |
| ctc greedy search | 6.87 % N=104765 C=98183 S=6408 D=174 I=620 |
| ctc prefix beam search | 6.87 % N=104765 C=98189 S=6402 D=174 I=619 |
| attention rescoring | 5.33 % N=104765 C=99354 S=5304 D=107 I=171 |
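
The `batch dynamic12000` in the training info refers to dynamic batching (`batch_type: dynamic` with `max_frames_in_batch: 12000` in the configs below): utterances are packed into a batch until the frame budget would be exceeded. A greedy sketch of the idea; note that WeNet's actual batcher also accounts for padding, which this simplification ignores:

```python
def dynamic_batches(utterance_frames, max_frames_in_batch=12000):
    """Greedily pack utterances into batches so the total number of
    feature frames per batch stays within max_frames_in_batch."""
    batches, current, total = [], [], 0
    for frames in utterance_frames:
        if current and total + frames > max_frames_in_batch:
            batches.append(current)
            current, total = [], 0
        current.append(frames)
        total += frames
    if current:
        batches.append(current)
    return batches

# Three 5000-frame utterances: the third would exceed the 12000-frame budget.
print(dynamic_batches([5000, 5000, 5000]))  # -> [[5000, 5000], [5000]]
```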
@@ -0,0 +1,119 @@
encoder: transformer
encoder_conf:
    activation_type: gelu
    attention_dropout_rate: 0.0
    attention_heads: 20
    dropout_rate: 0.0
    gradient_checkpointing: true
    input_layer: conv2d  # NOTE(xcsong): conv1d2, conv2d, conv2d8
    key_bias: false
    linear_units: 5120
    normalize_before: true
    num_blocks: 32
    output_size: 1280
    pos_enc_layer_type: abs_pos_whisper
    positional_dropout_rate: 0.0
    static_chunk_size: -1
    use_dynamic_chunk: false
    use_dynamic_left_chunk: false

decoder: transformer
decoder_conf:
    activation_type: gelu
    attention_heads: 20
    dropout_rate: 0.0
    gradient_checkpointing: true
    input_layer: embed_learnable_pe
    key_bias: false
    linear_units: 5120
    normalize_before: true
    num_blocks: 32
    positional_dropout_rate: 0.0
    self_attention_dropout_rate: 0.0
    src_attention: true
    src_attention_dropout_rate: 0.0
    tie_word_embedding: true
    use_output_layer: true

tokenizer: whisper
tokenizer_conf:
    bpe_path: null
    is_multilingual: true
    non_lang_syms_path: null
    num_languages: 100
    special_tokens:
        eot: 50258
        no_speech: 50363
        no_timestamps: 50364
        sot: 50258
        sot_prev: 50362
        timestamp_begin: 50365
        transcribe: 50360
        translate: 50359
    split_with_space: false
    symbol_table_path: null

ctc: ctc
ctc_conf:
    ctc_blank_id: 50256

cmvn: null
cmvn_conf:
    cmvn_file: null
    is_json_cmvn: null

model: whisper
model_conf:
    ctc_weight: 0.0
    length_normalized_loss: false
    lsm_weight: 0.1

dataset: asr
dataset_conf:
    batch_conf:
        batch_size: 26
        batch_type: dynamic
        max_frames_in_batch: 12000
    feats_type: log_mel_spectrogram
    filter_conf:
        max_length: 3000
        min_length: 0
        token_max_length: 448
        token_min_length: 1
    log_mel_spectrogram_conf:
        hop_length: 160
        n_fft: 400
        num_mel_bins: 128
        padding: 0
    resample_conf:
        resample_rate: 16000
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500
    spec_aug: true
    spec_aug_conf:
        max_f: 10
        max_t: 50
        num_f_mask: 2
        num_t_mask: 2
    spec_sub: true
    spec_sub_conf:
        max_t: 30
        num_t_sub: 3
    spec_trim: false
    speed_perturb: true

grad_clip: 5
accum_grad: 1
max_epoch: 30
log_interval: 100

optim: adam
optim_conf:
    lr: 0.00001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 12000
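
The `scheduler: warmuplr` entry above ramps the learning rate linearly up to the configured `lr` over `warmup_steps`, then decays it with the inverse square root of the step, Noam-style. A sketch of that schedule, assuming the standard WarmupLR formula (see `wenet/utils/scheduler.py` for the authoritative version):

```python
def warmup_lr(step: int, base_lr: float = 1e-5, warmup_steps: int = 12000) -> float:
    """Noam-style warmup: linear ramp to base_lr over warmup_steps,
    then 1/sqrt(step) decay."""
    step = max(step, 1)
    return base_lr * warmup_steps ** 0.5 * min(step ** -0.5,
                                               step * warmup_steps ** -1.5)

print(warmup_lr(6000))   # halfway through warmup: half the base lr
print(warmup_lr(12000))  # peak, equal to the configured lr
print(warmup_lr(48000) < warmup_lr(12000))  # decays after warmup -> True
```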
119 changes: 119 additions & 0 deletions examples/aishell/whisper/conf/finetune_whisper_largev3_onlyattn.yaml
@@ -0,0 +1,119 @@
encoder: transformer
encoder_conf:
    activation_type: gelu
    attention_dropout_rate: 0.0
    attention_heads: 20
    dropout_rate: 0.0
    gradient_checkpointing: true
    input_layer: conv1d2  # NOTE(xcsong): conv1d2, conv2d, conv2d8
    key_bias: false
    linear_units: 5120
    normalize_before: true
    num_blocks: 32
    output_size: 1280
    pos_enc_layer_type: abs_pos_whisper
    positional_dropout_rate: 0.0
    static_chunk_size: -1
    use_dynamic_chunk: false
    use_dynamic_left_chunk: false

decoder: transformer
decoder_conf:
    activation_type: gelu
    attention_heads: 20
    dropout_rate: 0.0
    gradient_checkpointing: true
    input_layer: embed_learnable_pe
    key_bias: false
    linear_units: 5120
    normalize_before: true
    num_blocks: 32
    positional_dropout_rate: 0.0
    self_attention_dropout_rate: 0.0
    src_attention: true
    src_attention_dropout_rate: 0.0
    tie_word_embedding: true
    use_output_layer: true

tokenizer: whisper
tokenizer_conf:
    bpe_path: null
    is_multilingual: true
    non_lang_syms_path: null
    num_languages: 100
    special_tokens:
        eot: 50258
        no_speech: 50363
        no_timestamps: 50364
        sot: 50258
        sot_prev: 50362
        timestamp_begin: 50365
        transcribe: 50360
        translate: 50359
    split_with_space: false
    symbol_table_path: null

ctc: ctc
ctc_conf:
    ctc_blank_id: 50256

cmvn: null
cmvn_conf:
    cmvn_file: null
    is_json_cmvn: null

model: whisper
model_conf:
    ctc_weight: 0.0
    length_normalized_loss: false
    lsm_weight: 0.1

dataset: asr
dataset_conf:
    batch_conf:
        batch_size: 26
        batch_type: dynamic
        max_frames_in_batch: 12000
    feats_type: log_mel_spectrogram
    filter_conf:
        max_length: 3000
        min_length: 0
        token_max_length: 448
        token_min_length: 1
    log_mel_spectrogram_conf:
        hop_length: 160
        n_fft: 400
        num_mel_bins: 128
        padding: 0
    resample_conf:
        resample_rate: 16000
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500
    spec_aug: true
    spec_aug_conf:
        max_f: 10
        max_t: 50
        num_f_mask: 2
        num_t_mask: 2
    spec_sub: true
    spec_sub_conf:
        max_t: 30
        num_t_sub: 3
    spec_trim: false
    speed_perturb: true

grad_clip: 5
accum_grad: 1
max_epoch: 30
log_interval: 100

optim: adam
optim_conf:
    lr: 0.00001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 12000
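
With 16 kHz input and a 160-sample hop, the log-mel frontend configured above produces 100 frames per second, so `filter_conf: max_length: 3000` caps utterances at 30 seconds, matching Whisper's fixed 30-second window. The arithmetic:

```python
SAMPLE_RATE = 16000  # resample_conf: resample_rate
HOP_LENGTH = 160     # log_mel_spectrogram_conf: hop_length
MAX_FRAMES = 3000    # filter_conf: max_length

frames_per_second = SAMPLE_RATE / HOP_LENGTH
print(frames_per_second)               # -> 100.0
print(MAX_FRAMES / frames_per_second)  # -> 30.0 (seconds)
```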