From 18f21b24511c110236ca1afc0d27e2173d625d8a Mon Sep 17 00:00:00 2001
From: xingchensong
Date: Thu, 25 Jan 2024 10:43:12 +0800
Subject: [PATCH 1/2] [examples] update paraformer results on aishell

---
 examples/aishell/paraformer/README.md          | 27 +++++++++-------
 .../paraformer/conf/train_paraformer.yaml      | 18 +++++------
 .../conf/train_paraformer_dynamic.yaml         | 18 +++++------
 examples/aishell/paraformer/run.sh             | 31 ++++++++++++-------
 4 files changed, 53 insertions(+), 41 deletions(-)

diff --git a/examples/aishell/paraformer/README.md b/examples/aishell/paraformer/README.md
index d59211575..2c315e8a4 100644
--- a/examples/aishell/paraformer/README.md
+++ b/examples/aishell/paraformer/README.md
@@ -5,15 +5,20 @@ output_dir=exp/paraformer/large
 mkdir -p ${output_dir}
 . ./path.sh && python wenet/paraformer/convert_paraformer_to_wenet_config_and_ckpt.py \
   --output_dir ${output_dir}
+# init ctc and embed(used in sampler)
+python local/modify_ckpt.py \
+  --add_list "{\"ctc.ctc_lo.weight\": \"decoder.embed.0.weight\", \"embed.0.weight\": \"decoder.embed.0.weight\"}" \
+  --input_ckpt exp/paraformer/large/wenet_paraformer.pt \
+  --output_ckpt exp/paraformer/large/wenet_paraformer.init-ctc.init-embed.pt
 ```
 
 # Performance Record
 
 ## Paraformer (original) Result
 
-| decoding mode | CER |
-|---------------------------|-------|
-| paraformer greedy search | 1.95 |
+| decoding mode | full | 16 |
+|---------------------------|-------|-----|
+| paraformer greedy search | 1.95 | N/A |
 
 ## Paraformer (full-parameter tuning) Result
@@ -21,11 +26,11 @@ mkdir -p ${output_dir}
 * Decoding info: ctc_weight 0.3, average_num 5
 * Git hash: TBD
 
-| decoding mode | CER |
-|---------------------------|-------|
-| ctc greedy search | 4.00 |
-| ctc prefix beam search | 4.00 |
-| paraformer greedy search | 2.16 |
+| decoding mode | full | 16 |
+|---------------------------|-------|-----|
+| ctc greedy search | 3.45 % N=104765 C=101244 S=3406 D=115 I=91 | N/A |
+| ctc prefix beam search | 3.44 % N=104765 C=101247 S=3407 D=111 I=83 | N/A |
+| paraformer greedy search | 2.19 % N=104765 C=102643 S=1959 D=163 I=172 | N/A |
 
 ## Paraformer-dynamic training (full-parameter tuning) Result
 
@@ -35,6 +40,6 @@ mkdir -p ${output_dir}
 | decoding mode | full | 16 |
 |---------------------------|--------|------|
-| ctc greedy search | 3.93 | 4.94 |
-| ctc prefix beam search | 3.93 | 4.94 |
-| paraformer greedy search | 2.08 | 2.41 |
+| ctc greedy search | 3.46 % N=104765 C=101235 S=3409 D=121 I=98 | 4.18 % N=104765 C=100495 S=4149 D=121 I=107 |
+| ctc prefix beam search | 3.45 % N=104765 C=101239 S=3413 D=113 I=91 | 4.17 % N=104765 C=100500 S=4150 D=115 I=103 |
+| paraformer greedy search | 2.15 % N=104765 C=102640 S=1977 D=148 I=132 | 2.40 % N=104765 C=102409 S=2220 D=136 I=161 |
diff --git a/examples/aishell/paraformer/conf/train_paraformer.yaml b/examples/aishell/paraformer/conf/train_paraformer.yaml
index d07d64bec..885a98766 100644
--- a/examples/aishell/paraformer/conf/train_paraformer.yaml
+++ b/examples/aishell/paraformer/conf/train_paraformer.yaml
@@ -2,9 +2,9 @@ input_dim: 560
 
 encoder: sanm_encoder
 encoder_conf:
-    attention_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
     attention_heads: 4
-    dropout_rate: 0.1
+    dropout_rate: 0.0
     input_layer: paraformer_dummy
     kernel_size: 11
     linear_units: 2048
@@ -12,7 +12,7 @@ encoder_conf:
     num_blocks: 50
     output_size: 512
     pos_enc_layer_type: abs_pos_paraformer
-    positional_dropout_rate: 0.1
+    positional_dropout_rate: 0.0
     sanm_shfit: 0
     gradient_checkpointing: true
 
@@ -20,14 +20,14 @@ decoder: sanm_decoder
 decoder_conf:
     att_layer_num: 16
     attention_heads: 4
-    dropout_rate: 0.1
+    dropout_rate: 0.0
     kernel_size: 11
     linear_units: 2048
     num_blocks: 16
-    positional_dropout_rate: 0.1
+    positional_dropout_rate: 0.0
     sanm_shfit: 0
-    self_attention_dropout_rate: 0.1
-    src_attention_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
     gradient_checkpointing: true
 
 tokenizer: paraformer
@@ -102,7 +102,7 @@ dataset_conf:
 
 grad_clip: 5
 accum_grad: 1
-max_epoch: 45
+max_epoch: 60
 log_interval: 100
 
 optim: adam
@@ -110,4 +110,4 @@ optim_conf:
     lr: 0.0005
 scheduler: warmuplr
 scheduler_conf:
-    warmup_steps: 25000
+    warmup_steps: 12000
diff --git a/examples/aishell/paraformer/conf/train_paraformer_dynamic.yaml b/examples/aishell/paraformer/conf/train_paraformer_dynamic.yaml
index 353a1fe5f..c3e3ec6e2 100644
--- a/examples/aishell/paraformer/conf/train_paraformer_dynamic.yaml
+++ b/examples/aishell/paraformer/conf/train_paraformer_dynamic.yaml
@@ -2,9 +2,9 @@ input_dim: 560
 
 encoder: sanm_encoder
 encoder_conf:
-    attention_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
     attention_heads: 4
-    dropout_rate: 0.1
+    dropout_rate: 0.0
     input_layer: paraformer_dummy
     kernel_size: 11
     linear_units: 2048
@@ -12,7 +12,7 @@ encoder_conf:
     num_blocks: 50
     output_size: 512
     pos_enc_layer_type: abs_pos_paraformer
-    positional_dropout_rate: 0.1
+    positional_dropout_rate: 0.0
     sanm_shfit: 0
     use_dynamic_chunk: true
     gradient_checkpointing: true
@@ -21,14 +21,14 @@ decoder: sanm_decoder
 decoder_conf:
     att_layer_num: 16
     attention_heads: 4
-    dropout_rate: 0.1
+    dropout_rate: 0.0
     kernel_size: 11
     linear_units: 2048
     num_blocks: 16
-    positional_dropout_rate: 0.1
+    positional_dropout_rate: 0.0
     sanm_shfit: 0
-    self_attention_dropout_rate: 0.1
-    src_attention_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
     gradient_checkpointing: true
 
 tokenizer: paraformer
@@ -103,7 +103,7 @@ dataset_conf:
 
 grad_clip: 5
 accum_grad: 1
-max_epoch: 45
+max_epoch: 60
 log_interval: 100
 
 optim: adam
@@ -111,4 +111,4 @@ optim_conf:
     lr: 0.0005
 scheduler: warmuplr
 scheduler_conf:
-    warmup_steps: 25000
+    warmup_steps: 12000
diff --git a/examples/aishell/paraformer/run.sh b/examples/aishell/paraformer/run.sh
index 71e6966e7..6f60d2445 100755
--- a/examples/aishell/paraformer/run.sh
+++ b/examples/aishell/paraformer/run.sh
@@ -28,13 +28,13 @@ job_id=2024
 # data_type can be `raw` or `shard`. Typically, raw is used for small dataset,
 # `shard` is used for large dataset which is over 1k hours, and `shard` is
 # faster on reading data and training.
-data_type=raw
+data_type=shard
 
 train_set=train
 
-train_config=conf/train_paraformer.yaml
-checkpoint=exp/paraformer/large/wenet_paraformer.pt
-dir=exp/finetune_paraformer
+train_config=conf/train_paraformer_dynamic.yaml
+checkpoint=exp/paraformer/large/wenet_paraformer.init-ctc.init-embed.pt
+dir=exp/finetune_paraformer_dynamic
 tensorboard_dir=tensorboard
 num_workers=8
 prefetch=500
@@ -44,6 +44,12 @@ average_checkpoint=true
 decode_checkpoint=$dir/final.pt
 average_num=5
 decode_modes="ctc_greedy_search ctc_prefix_beam_search paraformer_greedy_search"
+decode_device=0
+decoding_chunk_size=-1
+decode_batch=16
+ctc_weight=0.3
+reverse_weight=0.5
+max_epoch=100
 
 train_engine=torch_ddp
 
@@ -124,36 +130,37 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
   # Test model, please specify the model you want to test by --checkpoint
   if [ ${average_checkpoint} == true ]; then
-    decode_checkpoint=$dir/avg_${average_num}.pt
+    decode_checkpoint=$dir/avg_${average_num}_maxepoch_${max_epoch}.pt
     echo "do model average and final checkpoint is $decode_checkpoint"
     python wenet/bin/average_model.py \
       --dst_model $decode_checkpoint \
       --src_path $dir \
       --num ${average_num} \
+      --max_epoch ${max_epoch} \
       --val_best
   fi
   # Please specify decoding_chunk_size for unified streaming and
   # non-streaming model. The default value is -1, which is full chunk
   # for non-streaming inference.
-  decoding_chunk_size=
-  ctc_weight=0.3
-  reverse_weight=0.5
-  python wenet/bin/recognize.py --gpu 0 \
+  base=$(basename $decode_checkpoint)
+  result_dir=$dir/${base}_chunk${decoding_chunk_size}_ctc${ctc_weight}_reverse${reverse_weight}
+  mkdir -p ${result_dir}
+  python wenet/bin/recognize.py --gpu ${decode_device} \
     --modes $decode_modes \
     --config $dir/train.yaml \
    --data_type $data_type \
     --test_data data/test/data.list \
     --checkpoint $decode_checkpoint \
     --beam_size 10 \
-    --batch_size 16 \
+    --batch_size ${decode_batch} \
     --penalty 0.0 \
     --ctc_weight $ctc_weight \
     --reverse_weight $reverse_weight \
-    --result_dir $dir \
+    --result_dir $result_dir \
     ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
 
   for mode in ${decode_modes}; do
     python tools/compute-wer.py --char=1 --v=1 \
-      data/test/text $dir/$mode/text > $dir/$mode/wer
+      data/test/text $result_dir/$mode/text > $result_dir/$mode/wer
   done
 fi

From aaee132828efde2fbbaad225532967811bca5cfd Mon Sep 17 00:00:00 2001
From: xingchensong
Date: Thu, 25 Jan 2024 10:47:50 +0800
Subject: [PATCH 2/2] [examples] update paraformer results on aishell

---
 examples/aishell/paraformer/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/aishell/paraformer/README.md b/examples/aishell/paraformer/README.md
index 2c315e8a4..64482c44d 100644
--- a/examples/aishell/paraformer/README.md
+++ b/examples/aishell/paraformer/README.md
@@ -22,7 +22,7 @@ python local/modify_ckpt.py \
 
 ## Paraformer (full-parameter tuning) Result
 
-* Training info: batch size 28, ctc_weight: 0.3, acc_grad 4, 8 * v100 gpu, 40 epochs
+* Training info: torch_ddp fp32, batch size 28, ctc_weight: 0.3, acc_grad 1, 8 * 3090 gpu, 60 epochs (about 8h)
 * Decoding info: ctc_weight 0.3, average_num 5
 * Git hash: TBD
 
@@ -34,7 +34,7 @@ python local/modify_ckpt.py \
 
 ## Paraformer-dynamic training (full-parameter tuning) Result
 
-* Training info: batch size 28, ctc_weight: 0.3, acc_grad 4, 8 * v100 gpu, 43 epochs
+* Training info: torch_ddp fp32, batch size 28, ctc_weight: 0.3, acc_grad 1, 8 * 3090 gpu, 60 epochs (about 8h)
 * Decoding info: ctc_weight 0.3, average_num 5
 * Git hash: TBD
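
Note on the `local/modify_ckpt.py` step added to the README: the script's source is not part of this diff, so the sketch below is only a minimal stand-in for the checkpoint surgery that the `--add_list` mapping describes — copying the pretrained `decoder.embed.0.weight` tensor onto the `ctc.ctc_lo.weight` and `embed.0.weight` keys so that the CTC output layer and the sampler embedding start from the decoder embedding before fine-tuning. The function name and argument handling here are assumptions, not the real script.

```python
# Illustrative sketch only; not the modify_ckpt.py shipped in the repo.
import json
import torch


def add_keys(input_ckpt: str, output_ckpt: str, add_list: str) -> None:
    """Copy existing tensors to new keys: {new_key: source_key} pairs."""
    state = torch.load(input_ckpt, map_location="cpu")
    mapping = json.loads(add_list)
    for new_key, src_key in mapping.items():
        # e.g. ctc.ctc_lo.weight <- decoder.embed.0.weight (both vocab x 512)
        state[new_key] = state[src_key].clone()
    torch.save(state, output_ckpt)


if __name__ == "__main__":
    add_keys(
        "exp/paraformer/large/wenet_paraformer.pt",
        "exp/paraformer/large/wenet_paraformer.init-ctc.init-embed.pt",
        '{"ctc.ctc_lo.weight": "decoder.embed.0.weight", '
        '"embed.0.weight": "decoder.embed.0.weight"}',
    )
```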