wenet-e2e · Mddct · Jan 25, 2024 · Jan 25, 2024 · Jan 25, 2024
diff --git a/examples/aishell/paraformer/README.md b/examples/aishell/paraformer/README.md
@@ -5,36 +5,41 @@ output_dir=exp/paraformer/large
 mkdir -p ${output_dir}
 . ./path.sh && python wenet/paraformer/convert_paraformer_to_wenet_config_and_ckpt.py \
   --output_dir ${output_dir}
+# Initialize the CTC layer and the embedding (used in the sampler)
+python local/modify_ckpt.py \
+  --add_list "{\"ctc.ctc_lo.weight\": \"decoder.embed.0.weight\", \"embed.0.weight\": \"decoder.embed.0.weight\"}" \
+  --input_ckpt exp/paraformer/large/wenet_paraformer.pt \
+  --output_ckpt exp/paraformer/large/wenet_paraformer.init-ctc.init-embed.pt
 ```
 
 # Performance Record
 
 ## Paraformer (original) Result
 
-| decoding mode             |  CER  |
-|---------------------------|-------|
-| paraformer greedy search  | 1.95  |
+| decoding mode             |  full | 16  |
+|---------------------------|-------|-----|
+| paraformer greedy search  | 1.95  | N/A |
 
 ## Paraformer (full-parameter tuning) Result
 
-* Training info: batch size 28, ctc_weight: 0.3, acc_grad 4, 8 * v100 gpu, 40 epochs
+* Training info: torch_ddp fp32, batch size 28, ctc_weight: 0.3, accum_grad 1, 8 * 3090 GPUs, 60 epochs (about 8h)
 * Decoding info: ctc_weight 0.3, average_num 5
 * Git hash: TBD
 
-| decoding mode             | CER   |
-|---------------------------|-------|
-| ctc greedy search         | 4.00  |
-| ctc prefix beam search    | 4.00  |
-| paraformer greedy search  | 2.16  |
+| decoding mode             | full  | 16  |
+|---------------------------|-------|-----|
+| ctc greedy search         | 3.45 % N=104765 C=101244 S=3406 D=115 I=91  | N/A |
+| ctc prefix beam search    | 3.44 % N=104765 C=101247 S=3407 D=111 I=83  | N/A |
+| paraformer greedy search  | 2.19 % N=104765 C=102643 S=1959 D=163 I=172 | N/A |
 
 ## Paraformer-dynamic training (full-parameter tuning) Result
 
-* Training info: batch size 28, ctc_weight: 0.3, acc_grad 4, 8 * v100 gpu, 43 epochs
+* Training info: torch_ddp fp32, batch size 28, ctc_weight: 0.3, accum_grad 1, 8 * 3090 GPUs, 60 epochs (about 8h)
 * Decoding info: ctc_weight 0.3, average_num 5
 * Git hash: TBD
 
 | decoding mode             | full   | 16   |
 |---------------------------|--------|------|
-| ctc greedy search         | 3.93   | 4.94 |
-| ctc prefix beam search    | 3.93   | 4.94 |
-| paraformer greedy search  | 2.08   | 2.41 |
+| ctc greedy search         | 3.46 % N=104765 C=101235 S=3409 D=121 I=98   | 4.18 % N=104765 C=100495 S=4149 D=121 I=107 |
+| ctc prefix beam search    | 3.45 % N=104765 C=101239 S=3413 D=113 I=91   | 4.17 % N=104765 C=100500 S=4150 D=115 I=103 |
+| paraformer greedy search  | 2.15 % N=104765 C=102640 S=1977 D=148 I=132  | 2.40 % N=104765 C=102409 S=2220 D=136 I=161 |
diff --git a/examples/aishell/paraformer/conf/train_paraformer.yaml b/examples/aishell/paraformer/conf/train_paraformer.yaml
@@ -2,32 +2,32 @@ input_dim: 560
 
 encoder: sanm_encoder
 encoder_conf:
-  attention_dropout_rate: 0.1
+  attention_dropout_rate: 0.0
   attention_heads: 4
-  dropout_rate: 0.1
+  dropout_rate: 0.0
   input_layer: paraformer_dummy
   kernel_size: 11
   linear_units: 2048
   normalize_before: true
   num_blocks: 50
   output_size: 512
   pos_enc_layer_type: abs_pos_paraformer
-  positional_dropout_rate: 0.1
+  positional_dropout_rate: 0.0
   sanm_shfit: 0
   gradient_checkpointing: true
 
 decoder: sanm_decoder
 decoder_conf:
   att_layer_num: 16
   attention_heads: 4
-  dropout_rate: 0.1
+  dropout_rate: 0.0
   kernel_size: 11
   linear_units: 2048
   num_blocks: 16
-  positional_dropout_rate: 0.1
+  positional_dropout_rate: 0.0
   sanm_shfit: 0
-  self_attention_dropout_rate: 0.1
-  src_attention_dropout_rate: 0.1
+  self_attention_dropout_rate: 0.0
+  src_attention_dropout_rate: 0.0
   gradient_checkpointing: true
 
 tokenizer: paraformer
@@ -102,12 +102,12 @@ dataset_conf:
 
 grad_clip: 5
 accum_grad: 1
-max_epoch: 45
+max_epoch: 60
 log_interval: 100
 
 optim: adam
 optim_conf:
   lr: 0.0005
 scheduler: warmuplr
 scheduler_conf:
-  warmup_steps: 25000
+  warmup_steps: 12000
diff --git a/examples/aishell/paraformer/conf/train_paraformer_dynamic.yaml b/examples/aishell/paraformer/conf/train_paraformer_dynamic.yaml
@@ -2,17 +2,17 @@ input_dim: 560
 
 encoder: sanm_encoder
 encoder_conf:
-  attention_dropout_rate: 0.1
+  attention_dropout_rate: 0.0
   attention_heads: 4
-  dropout_rate: 0.1
+  dropout_rate: 0.0
   input_layer: paraformer_dummy
   kernel_size: 11
   linear_units: 2048
   normalize_before: true
   num_blocks: 50
   output_size: 512
   pos_enc_layer_type: abs_pos_paraformer
-  positional_dropout_rate: 0.1
+  positional_dropout_rate: 0.0
   sanm_shfit: 0
   use_dynamic_chunk: true
   gradient_checkpointing: true
@@ -21,14 +21,14 @@ decoder: sanm_decoder
 decoder_conf:
   att_layer_num: 16
   attention_heads: 4
-  dropout_rate: 0.1
+  dropout_rate: 0.0
   kernel_size: 11
   linear_units: 2048
   num_blocks: 16
-  positional_dropout_rate: 0.1
+  positional_dropout_rate: 0.0
   sanm_shfit: 0
-  self_attention_dropout_rate: 0.1
-  src_attention_dropout_rate: 0.1
+  self_attention_dropout_rate: 0.0
+  src_attention_dropout_rate: 0.0
   gradient_checkpointing: true
 
 tokenizer: paraformer
@@ -103,12 +103,12 @@ dataset_conf:
 
 grad_clip: 5
 accum_grad: 1
-max_epoch: 45
+max_epoch: 60
 log_interval: 100
 
 optim: adam
 optim_conf:
   lr: 0.0005
 scheduler: warmuplr
 scheduler_conf:
-  warmup_steps: 25000
+  warmup_steps: 12000
diff --git a/examples/aishell/paraformer/run.sh b/examples/aishell/paraformer/run.sh
@@ -28,13 +28,13 @@ job_id=2024
 # data_type can be `raw` or `shard`. Typically, raw is used for small dataset,
 # `shard` is used for large dataset which is over 1k hours, and `shard` is
 # faster on reading data and training.
-data_type=raw
+data_type=shard
 
 train_set=train
 
-train_config=conf/train_paraformer.yaml
-checkpoint=exp/paraformer/large/wenet_paraformer.pt
-dir=exp/finetune_paraformer
+train_config=conf/train_paraformer_dynamic.yaml
+checkpoint=exp/paraformer/large/wenet_paraformer.init-ctc.init-embed.pt
+dir=exp/finetune_paraformer_dynamic
 tensorboard_dir=tensorboard
 num_workers=8
 prefetch=500
@@ -44,6 +44,12 @@ average_checkpoint=true
 decode_checkpoint=$dir/final.pt
 average_num=5
 decode_modes="ctc_greedy_search ctc_prefix_beam_search paraformer_greedy_search"
+decode_device=0
+decoding_chunk_size=-1
+decode_batch=16
+ctc_weight=0.3
+reverse_weight=0.5
+max_epoch=100
 
 train_engine=torch_ddp
 
@@ -124,36 +130,37 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
   # Test model, please specify the model you want to test by --checkpoint
   if [ ${average_checkpoint} == true ]; then
-    decode_checkpoint=$dir/avg_${average_num}.pt
+    decode_checkpoint=$dir/avg_${average_num}_maxepoch_${max_epoch}.pt
     echo "do model average and final checkpoint is $decode_checkpoint"
     python wenet/bin/average_model.py \
       --dst_model $decode_checkpoint \
       --src_path $dir  \
       --num ${average_num} \
+      --max_epoch ${max_epoch} \
       --val_best
   fi
   # Please specify decoding_chunk_size for unified streaming and
   # non-streaming model. The default value is -1, which is full chunk
   # for non-streaming inference.
-  decoding_chunk_size=
-  ctc_weight=0.3
-  reverse_weight=0.5
-  python wenet/bin/recognize.py --gpu 0 \
+  base=$(basename $decode_checkpoint)
+  result_dir=$dir/${base}_chunk${decoding_chunk_size}_ctc${ctc_weight}_reverse${reverse_weight}
+  mkdir -p ${result_dir}
+  python wenet/bin/recognize.py --gpu ${decode_device} \
     --modes $decode_modes \
     --config $dir/train.yaml \
     --data_type $data_type \
     --test_data data/test/data.list \
     --checkpoint $decode_checkpoint \
     --beam_size 10 \
-    --batch_size 16 \
+    --batch_size ${decode_batch} \
     --penalty 0.0 \
     --ctc_weight $ctc_weight \
     --reverse_weight $reverse_weight \
-    --result_dir $dir \
+    --result_dir $result_dir \
     ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
   for mode in ${decode_modes}; do
     python tools/compute-wer.py --char=1 --v=1 \
-      data/test/text $dir/$mode/text > $dir/$mode/wer
+      data/test/text $result_dir/$mode/text > $result_dir/$mode/wer
   done
 fi