add ernie sat model file and config

heyudage · Jul 8, 2022 · 9468826 · 9468826
1 parent 0ea9def
commit 9468826
Show file tree

Hide file tree

Showing 46 changed files with 3,604 additions and 81 deletions.
diff --git a/examples/aishell3/ernie_sat/conf/default.yaml b/examples/aishell3/ernie_sat/conf/default.yaml
@@ -0,0 +1,282 @@
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+
+fs: 24000          # Sampling rate (Hz).
+n_fft: 2048        # FFT size (samples).
+n_shift: 300       # Hop size (samples). 12.5 ms at fs=24000.
+win_length: 1200   # Window length (samples). 50 ms at fs=24000.
+                   # If set to null, it will be the same as n_fft.
+window: "hann"     # Window function.
+
+# Only used for feats_type != raw
+
+fmin: 80           # Minimum frequency of Mel basis.
+fmax: 7600         # Maximum frequency of Mel basis.
+n_mels: 80         # The number of mel basis.
+
+mean_phn_span: 8   # NOTE(review): presumably the mean length (in phones) of a masked span -- confirm against the consumer.
+mlm_prob: 0.8      # NOTE(review): presumably the masking probability used for MLM training -- confirm against the consumer.
+
+###########################################################
+#                       DATA SETTING                      #
+###########################################################
+batch_size: 64     # NOTE(review): presumably utterances per batch -- confirm against the data loader.
+num_workers: 2     # NOTE(review): presumably data-loading worker processes -- confirm against the data loader.
+
+###########################################################
+#                       MODEL SETTING                     #
+###########################################################
+model:  # NOTE(review): enc_* / dec_* keys presumably configure the encoder / decoder selected by encoder_type / decoder_type -- confirm against the model constructor.
+    text_masking: false
+    postnet_layers: 5
+    postnet_filts: 5
+    postnet_chans: 256
+    encoder_type: conformer
+    decoder_type: conformer
+    enc_input_layer: sega_mlm
+    enc_pre_speech_layer: 0
+    enc_cnn_module_kernel: 7  # NOTE(review): differs from dec_cnn_module_kernel (31) -- confirm intentional.
+    enc_attention_dim: 384
+    enc_attention_heads: 2
+    enc_linear_units: 1536
+    enc_num_blocks: 4
+    enc_dropout_rate: 0.2
+    enc_positional_dropout_rate: 0.2
+    enc_attention_dropout_rate: 0.2
+    enc_normalize_before: true  # NOTE(review): no dec_normalize_before counterpart is set in this file -- confirm the decoder default is the intended one.
+    enc_macaron_style: true
+    enc_use_cnn_module: true
+    enc_selfattention_layer_type: legacy_rel_selfattn
+    enc_activation_type: swish
+    enc_pos_enc_layer_type: legacy_rel_pos
+    enc_positionwise_layer_type: conv1d
+    enc_positionwise_conv_kernel_size: 3
+    dec_cnn_module_kernel: 31  # NOTE(review): differs from enc_cnn_module_kernel (7) -- confirm intentional.
+    dec_attention_dim: 384
+    dec_attention_heads: 2
+    dec_linear_units: 1536
+    dec_num_blocks: 4
+    dec_dropout_rate: 0.2
+    dec_positional_dropout_rate: 0.2
+    dec_attention_dropout_rate: 0.2
+    dec_macaron_style: true
+    dec_use_cnn_module: true
+    dec_selfattention_layer_type: legacy_rel_selfattn
+    dec_activation_type: swish
+    dec_pos_enc_layer_type: legacy_rel_pos
+    dec_positionwise_layer_type: conv1d
+    dec_positionwise_conv_kernel_size: 3
+
+###########################################################
+#                     OPTIMIZER SETTING                   #
+###########################################################
+optimizer:
+    optim: adam               # Optimizer type.
+    learning_rate: 0.001      # Learning rate.
+
+###########################################################
+#                     TRAINING SETTING                    #
+###########################################################
+max_epoch: 200                # NOTE(review): presumably the number of training epochs -- confirm against train.py.
+num_snapshots: 5              # NOTE(review): presumably how many checkpoints are retained -- confirm against train.py.
+
+###########################################################
+#                       OTHER SETTING                     #
+###########################################################
+seed: 10086                   # NOTE(review): presumably the random seed -- confirm against train.py.
+
+token_list:  # NOTE(review): list position presumably serves as the token id -- do not reorder; confirm against the consumer.
+- <blank>
+- <unk>
+- d
+- sp
+- sh
+- ii
+- j
+- zh
+- l
+- x
+- b
+- g
+- uu
+- e5
+- h
+- q
+- m
+- i1
+- t
+- z
+- ch
+- f
+- s
+- u4
+- ix4
+- i4
+- 'n'  # Quoted: the YAML 1.1 bool type includes bare n, so some loaders would read it as false.
+- i3
+- iu3
+- vv
+- ian4
+- ix2
+- r
+- e4
+- ai4
+- k
+- ing2
+- a1
+- en2
+- ui4
+- ong1
+- uo3
+- u2
+- u3
+- ao4
+- ee
+- p
+- an1
+- eng2
+- i2
+- in1
+- c
+- ai2
+- ian2
+- e2
+- an4
+- ing4
+- v4
+- ai3
+- a5
+- ian3
+- eng1
+- ong4
+- ang4
+- ian1
+- ing1
+- iy4
+- ao3
+- ang1
+- uo4
+- u1
+- iao4
+- iu4
+- a4
+- van2
+- ie4
+- ang2
+- ou4
+- iang4
+- ix1
+- er4
+- iy1
+- e1
+- en1
+- ui2
+- an3
+- ei4
+- ong2
+- uo1
+- ou3
+- uo2
+- iao1
+- ou1
+- an2
+- uan4
+- ia4
+- ia1
+- ang3
+- v3
+- iu2
+- iao3
+- in4
+- a3
+- ei3
+- iang3
+- v2
+- eng4
+- en3
+- aa
+- uan1
+- v1
+- ao1
+- ve4
+- ie3
+- ai1
+- ing3
+- iang1
+- a2
+- ui1
+- en4
+- en5
+- in3
+- uan3
+- e3
+- ie1
+- ve2
+- ei2
+- in2
+- ix3
+- uan2
+- iang2
+- ie2
+- ua4
+- ou2
+- uai4
+- er2
+- eng3
+- uang3
+- un1
+- ong3
+- uang4
+- vn4
+- un2
+- iy3
+- iz4
+- ui3
+- iao2
+- iong4
+- un4
+- van4
+- ao2
+- uang1
+- iy5
+- o2
+- ei1
+- ua1
+- iu1
+- uang2
+- er5
+- o1
+- un3
+- vn1
+- vn2
+- o4
+- ve1
+- van3
+- ua2
+- er3
+- iong3
+- van1
+- ia2
+- iy2
+- ia3
+- iong1
+- uo5
+- oo
+- ve3
+- ou5
+- uai3
+- ian5
+- iong2
+- uai2
+- uai1
+- ua3
+- vn3
+- ia5
+- ie5
+- ueng1
+- o5
+- o3
+- iang5
+- ei5
+- <sos/eos>
diff --git a/examples/aishell3/ernie_sat/local/preprocess.sh b/examples/aishell3/ernie_sat/local/preprocess.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+stage=0
+stop_stage=100
+
+config_path=$1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # get durations from MFA's result
+    echo "Generate durations.txt from MFA results ..."
+    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
+        --inputdir=./aishell3_alignment_tone \
+        --output durations.txt \
+        --config=${config_path}
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # extract features
+    echo "Extract features ..."
+    python3 ${BIN_DIR}/preprocess.py \
+        --dataset=aishell3 \
+        --rootdir=~/datasets/data_aishell3/ \
+        --dumpdir=dump \
+        --dur-file=durations.txt \
+        --config=${config_path} \
+        --num-cpu=20 \
+        --cut-sil=True
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # get features' stats(mean and std)
+    echo "Get features' stats ..."
+    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
+        --metadata=dump/train/raw/metadata.jsonl \
+        --field-name="speech"
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # normalize and covert phone/speaker to id, dev and test should use train's stats
+    echo "Normalize ..."
+    python3 ${BIN_DIR}/normalize.py \
+        --metadata=dump/train/raw/metadata.jsonl \
+        --dumpdir=dump/train/norm \
+        --speech-stats=dump/train/speech_stats.npy \
+        --phones-dict=dump/phone_id_map.txt \
+        --speaker-dict=dump/speaker_id_map.txt
+
+    python3 ${BIN_DIR}/normalize.py \
+        --metadata=dump/dev/raw/metadata.jsonl \
+        --dumpdir=dump/dev/norm \
+        --speech-stats=dump/train/speech_stats.npy \
+        --phones-dict=dump/phone_id_map.txt \
+        --speaker-dict=dump/speaker_id_map.txt
+
+    python3 ${BIN_DIR}/normalize.py \
+        --metadata=dump/test/raw/metadata.jsonl \
+        --dumpdir=dump/test/norm \
+        --speech-stats=dump/train/speech_stats.npy \
+        --phones-dict=dump/phone_id_map.txt \
+        --speaker-dict=dump/speaker_id_map.txt
+fi
diff --git a/examples/aishell3/ernie_sat/local/synthesize.sh b/examples/aishell3/ernie_sat/local/synthesize.sh
@@ -0,0 +1 @@
+#!/bin/bash
diff --git a/examples/aishell3/ernie_sat/local/train.sh b/examples/aishell3/ernie_sat/local/train.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+
+python3 ${BIN_DIR}/train.py \
+    --train-metadata=dump/train/norm/metadata.jsonl \
+    --dev-metadata=dump/dev/norm/metadata.jsonl \
+    --config=${config_path} \
+    --output-dir=${train_output_path} \
+    --ngpu=1 \
+    --phones-dict=dump/phone_id_map.txt
diff --git a/examples/aishell3/ernie_sat/path.sh b/examples/aishell3/ernie_sat/path.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT="$(realpath "${PWD}/../../../")"
+# MAIN_ROOT is derived from PWD, so this file must be sourced from the example directory.
+export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH}"
+# BIN_DIR points at the experiment scripts for MODEL inside the repository.
+MODEL=ernie_sat
+export BIN_DIR="${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}"