From 18f21b24511c110236ca1afc0d27e2173d625d8a Mon Sep 17 00:00:00 2001
From: xingchensong
Date: Thu, 25 Jan 2024 10:43:12 +0800
Subject: [PATCH 1/2] [examples] update paraformer results on aishell

---
 examples/aishell/paraformer/README.md          | 27 +++++++++-------
 .../paraformer/conf/train_paraformer.yaml      | 18 +++++------
 .../conf/train_paraformer_dynamic.yaml         | 18 +++++------
 examples/aishell/paraformer/run.sh             | 31 ++++++++++++-------
 4 files changed, 53 insertions(+), 41 deletions(-)

diff --git a/examples/aishell/paraformer/README.md b/examples/aishell/paraformer/README.md
index d59211575..2c315e8a4 100644
--- a/examples/aishell/paraformer/README.md
+++ b/examples/aishell/paraformer/README.md
@@ -5,15 +5,20 @@ output_dir=exp/paraformer/large
 mkdir -p ${output_dir}
 . ./path.sh && python wenet/paraformer/convert_paraformer_to_wenet_config_and_ckpt.py \
   --output_dir ${output_dir}
+# init ctc and embed(used in sampler)
+python local/modify_ckpt.py \
+  --add_list "{\"ctc.ctc_lo.weight\": \"decoder.embed.0.weight\", \"embed.0.weight\": \"decoder.embed.0.weight\"}" \
+  --input_ckpt exp/paraformer/large/wenet_paraformer.pt \
+  --output_ckpt exp/paraformer/large/wenet_paraformer.init-ctc.init-embed.pt
 ```
 
 # Performance Record
 
 ## Paraformer (original) Result
 
-| decoding mode | CER |
-|---------------------------|-------|
-| paraformer greedy search | 1.95 |
+| decoding mode | full | 16 |
+|---------------------------|-------|-----|
+| paraformer greedy search | 1.95 | N/A |
 
 ## Paraformer (full-parameter tuning) Result
@@ -21,11 +26,11 @@ mkdir -p ${output_dir}
 * Decoding info: ctc_weight 0.3, average_num 5
 * Git hash: TBD
 
-| decoding mode | CER |
-|---------------------------|-------|
-| ctc greedy search | 4.00 |
-| ctc prefix beam search | 4.00 |
-| paraformer greedy search | 2.16 |
+| decoding mode | full | 16 |
+|---------------------------|-------|-----|
+| ctc greedy search | 3.45 % N=104765 C=101244 S=3406 D=115 I=91 | N/A |
+| ctc prefix beam search | 3.44 % N=104765 C=101247 S=3407 D=111 I=83 | N/A |
+| paraformer greedy search | 2.19 % N=104765 C=102643 S=1959 D=163 I=172 | N/A |
 
 ## Paraformer-dynamic training (full-parameter tuning) Result
 
@@ -35,6 +40,6 @@ mkdir -p ${output_dir}
 | decoding mode | full | 16 |
 |---------------------------|--------|------|
-| ctc greedy search | 3.93 | 4.94 |
-| ctc prefix beam search | 3.93 | 4.94 |
-| paraformer greedy search | 2.08 | 2.41 |
+| ctc greedy search | 3.46 % N=104765 C=101235 S=3409 D=121 I=98 | 4.18 % N=104765 C=100495 S=4149 D=121 I=107 |
+| ctc prefix beam search | 3.45 % N=104765 C=101239 S=3413 D=113 I=91 | 4.17 % N=104765 C=100500 S=4150 D=115 I=103 |
+| paraformer greedy search | 2.15 % N=104765 C=102640 S=1977 D=148 I=132 | 2.40 % N=104765 C=102409 S=2220 D=136 I=161 |
diff --git a/examples/aishell/paraformer/conf/train_paraformer.yaml b/examples/aishell/paraformer/conf/train_paraformer.yaml
index d07d64bec..885a98766 100644
--- a/examples/aishell/paraformer/conf/train_paraformer.yaml
+++ b/examples/aishell/paraformer/conf/train_paraformer.yaml
@@ -2,9 +2,9 @@ input_dim: 560
 
 encoder: sanm_encoder
 encoder_conf:
-    attention_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
     attention_heads: 4
-    dropout_rate: 0.1
+    dropout_rate: 0.0
     input_layer: paraformer_dummy
     kernel_size: 11
     linear_units: 2048
@@ -12,7 +12,7 @@ encoder_conf:
     num_blocks: 50
     output_size: 512
     pos_enc_layer_type: abs_pos_paraformer
-    positional_dropout_rate: 0.1
+    positional_dropout_rate: 0.0
     sanm_shfit: 0
     gradient_checkpointing: true
 
@@ -20,14 +20,14 @@ decoder: sanm_decoder
 decoder_conf:
     att_layer_num: 16
     attention_heads: 4
-    dropout_rate: 0.1
+    dropout_rate: 0.0
     kernel_size: 11
     linear_units: 2048
     num_blocks: 16
-    positional_dropout_rate: 0.1
+    positional_dropout_rate: 0.0
     sanm_shfit: 0
-    self_attention_dropout_rate: 0.1
-    src_attention_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
     gradient_checkpointing: true
 
 tokenizer: paraformer
@@ -102,7 +102,7 @@ dataset_conf:
 
 grad_clip: 5
 accum_grad: 1
-max_epoch: 45
+max_epoch: 60
 log_interval: 100
 
 optim: adam
@@ -110,4 +110,4 @@ optim_conf:
     lr: 0.0005
 scheduler: warmuplr
 scheduler_conf:
-    warmup_steps: 25000
+    warmup_steps: 12000
diff --git a/examples/aishell/paraformer/conf/train_paraformer_dynamic.yaml b/examples/aishell/paraformer/conf/train_paraformer_dynamic.yaml
index 353a1fe5f..c3e3ec6e2 100644
--- a/examples/aishell/paraformer/conf/train_paraformer_dynamic.yaml
+++ b/examples/aishell/paraformer/conf/train_paraformer_dynamic.yaml
@@ -2,9 +2,9 @@ input_dim: 560
 
 encoder: sanm_encoder
 encoder_conf:
-    attention_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
     attention_heads: 4
-    dropout_rate: 0.1
+    dropout_rate: 0.0
     input_layer: paraformer_dummy
     kernel_size: 11
     linear_units: 2048
@@ -12,7 +12,7 @@ encoder_conf:
     num_blocks: 50
     output_size: 512
     pos_enc_layer_type: abs_pos_paraformer
-    positional_dropout_rate: 0.1
+    positional_dropout_rate: 0.0
     sanm_shfit: 0
     use_dynamic_chunk: true
     gradient_checkpointing: true
@@ -21,14 +21,14 @@ decoder: sanm_decoder
 decoder_conf:
     att_layer_num: 16
     attention_heads: 4
-    dropout_rate: 0.1
+    dropout_rate: 0.0
     kernel_size: 11
     linear_units: 2048
     num_blocks: 16
-    positional_dropout_rate: 0.1
+    positional_dropout_rate: 0.0
     sanm_shfit: 0
-    self_attention_dropout_rate: 0.1
-    src_attention_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
     gradient_checkpointing: true
 
 tokenizer: paraformer
@@ -103,7 +103,7 @@ dataset_conf:
 
 grad_clip: 5
 accum_grad: 1
-max_epoch: 45
+max_epoch: 60
 log_interval: 100
 
 optim: adam
@@ -111,4 +111,4 @@ optim_conf:
     lr: 0.0005
 scheduler: warmuplr
 scheduler_conf:
-    warmup_steps: 25000
+    warmup_steps: 12000
diff --git a/examples/aishell/paraformer/run.sh b/examples/aishell/paraformer/run.sh
index 71e6966e7..6f60d2445 100755
--- a/examples/aishell/paraformer/run.sh
+++ b/examples/aishell/paraformer/run.sh
@@ -28,13 +28,13 @@ job_id=2024
 # data_type can be `raw` or `shard`. Typically, raw is used for small dataset,
 # `shard` is used for large dataset which is over 1k hours, and `shard` is
 # faster on reading data and training.
-data_type=raw
+data_type=shard
 
 train_set=train
 
-train_config=conf/train_paraformer.yaml
-checkpoint=exp/paraformer/large/wenet_paraformer.pt
-dir=exp/finetune_paraformer
+train_config=conf/train_paraformer_dynamic.yaml
+checkpoint=exp/paraformer/large/wenet_paraformer.init-ctc.init-embed.pt
+dir=exp/finetune_paraformer_dynamic
 tensorboard_dir=tensorboard
 num_workers=8
 prefetch=500
@@ -44,6 +44,12 @@ average_checkpoint=true
 decode_checkpoint=$dir/final.pt
 average_num=5
 decode_modes="ctc_greedy_search ctc_prefix_beam_search paraformer_greedy_search"
+decode_device=0
+decoding_chunk_size=-1
+decode_batch=16
+ctc_weight=0.3
+reverse_weight=0.5
+max_epoch=100
 
 train_engine=torch_ddp
 
@@ -124,36 +130,37 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
   # Test model, please specify the model you want to test by --checkpoint
   if [ ${average_checkpoint} == true ]; then
-    decode_checkpoint=$dir/avg_${average_num}.pt
+    decode_checkpoint=$dir/avg_${average_num}_maxepoch_${max_epoch}.pt
     echo "do model average and final checkpoint is $decode_checkpoint"
     python wenet/bin/average_model.py \
       --dst_model $decode_checkpoint \
       --src_path $dir \
       --num ${average_num} \
+      --max_epoch ${max_epoch} \
       --val_best
   fi
   # Please specify decoding_chunk_size for unified streaming and
   # non-streaming model. The default value is -1, which is full chunk
   # for non-streaming inference.
-  decoding_chunk_size=
-  ctc_weight=0.3
-  reverse_weight=0.5
-  python wenet/bin/recognize.py --gpu 0 \
+  base=$(basename $decode_checkpoint)
+  result_dir=$dir/${base}_chunk${decoding_chunk_size}_ctc${ctc_weight}_reverse${reverse_weight}
+  mkdir -p ${result_dir}
+  python wenet/bin/recognize.py --gpu ${decode_device} \
     --modes $decode_modes \
     --config $dir/train.yaml \
    --data_type $data_type \
     --test_data data/test/data.list \
     --checkpoint $decode_checkpoint \
     --beam_size 10 \
-    --batch_size 16 \
+    --batch_size ${decode_batch} \
     --penalty 0.0 \
     --ctc_weight $ctc_weight \
     --reverse_weight $reverse_weight \
-    --result_dir $dir \
+    --result_dir $result_dir \
     ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
 
   for mode in ${decode_modes}; do
     python tools/compute-wer.py --char=1 --v=1 \
-      data/test/text $dir/$mode/text > $dir/$mode/wer
+      data/test/text $result_dir/$mode/text > $result_dir/$mode/wer
   done
 fi

From aaee132828efde2fbbaad225532967811bca5cfd Mon Sep 17 00:00:00 2001
From: xingchensong
Date: Thu, 25 Jan 2024 10:47:50 +0800
Subject: [PATCH 2/2] [examples] update paraformer results on aishell

---
 examples/aishell/paraformer/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/aishell/paraformer/README.md b/examples/aishell/paraformer/README.md
index 2c315e8a4..64482c44d 100644
--- a/examples/aishell/paraformer/README.md
+++ b/examples/aishell/paraformer/README.md
@@ -22,7 +22,7 @@ python local/modify_ckpt.py \
 
 ## Paraformer (full-parameter tuning) Result
 
-* Training info: batch size 28, ctc_weight: 0.3, acc_grad 4, 8 * v100 gpu, 40 epochs
+* Training info: torch_ddp fp32, batch size 28, ctc_weight: 0.3, acc_grad 1, 8 * 3090 gpu, 60 epochs (about 8h)
 * Decoding info: ctc_weight 0.3, average_num 5
 * Git hash: TBD
 
@@ -34,7 +34,7 @@ python local/modify_ckpt.py \
 
 ## Paraformer-dynamic training (full-parameter tuning) Result
 
-* Training info: batch size 28, ctc_weight: 0.3, acc_grad 4, 8 * v100 gpu, 43 epochs
+* Training info: torch_ddp fp32, batch size 28, ctc_weight: 0.3, acc_grad 1, 8 * 3090 gpu, 60 epochs (about 8h)
 * Decoding info: ctc_weight 0.3, average_num 5
 * Git hash: TBD
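
Note on the `local/modify_ckpt.py` step added to the README: the script's source is not part of this diff, so the sketch below is only a minimal stand-in for the checkpoint surgery that the `--add_list` mapping describes — copying the pretrained `decoder.embed.0.weight` tensor onto the `ctc.ctc_lo.weight` and `embed.0.weight` keys so that the CTC output layer and the sampler embedding start from the decoder embedding before fine-tuning. The function name and argument handling here are assumptions, not the real script.

```python
# Illustrative sketch only; not the modify_ckpt.py shipped in the repo.
import json
import torch


def add_keys(input_ckpt: str, output_ckpt: str, add_list: str) -> None:
    """Copy existing tensors to new keys: {new_key: source_key} pairs."""
    state = torch.load(input_ckpt, map_location="cpu")
    mapping = json.loads(add_list)
    for new_key, src_key in mapping.items():
        # e.g. ctc.ctc_lo.weight <- decoder.embed.0.weight (both vocab x 512)
        state[new_key] = state[src_key].clone()
    torch.save(state, output_ckpt)


if __name__ == "__main__":
    add_keys(
        "exp/paraformer/large/wenet_paraformer.pt",
        "exp/paraformer/large/wenet_paraformer.init-ctc.init-embed.pt",
        '{"ctc.ctc_lo.weight": "decoder.embed.0.weight", '
        '"embed.0.weight": "decoder.embed.0.weight"}',
    )
```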