forked from PaddlePaddle/PaddleSpeech
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
0ea9def
commit 9468826
Showing
46 changed files
with
3,604 additions
and
81 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,282 @@ | ||
###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################

fs: 24000          # Sampling rate (Hz).
n_fft: 2048        # FFT size (samples).
n_shift: 300       # Hop size (samples). 12.5ms
win_length: 1200   # Window length (samples). 50ms
                   # If set to null, it will be the same as fft_size.
window: "hann"     # Window function.

# Only used for feats_type != raw

fmin: 80           # Minimum frequency of Mel basis.
fmax: 7600         # Maximum frequency of Mel basis.
n_mels: 80         # The number of mel basis.

mean_phn_span: 8
mlm_prob: 0.8

###########################################################
#                       DATA SETTING                      #
###########################################################
batch_size: 64
num_workers: 2

###########################################################
#                       MODEL SETTING                     #
###########################################################
# Every key up to the next section banner is nested under `model`;
# the indentation is what makes them children of this mapping.
model:
    text_masking: false
    postnet_layers: 5
    postnet_filts: 5
    postnet_chans: 256
    encoder_type: conformer
    decoder_type: conformer
    enc_input_layer: sega_mlm
    enc_pre_speech_layer: 0
    enc_cnn_module_kernel: 7
    enc_attention_dim: 384
    enc_attention_heads: 2
    enc_linear_units: 1536
    enc_num_blocks: 4
    enc_dropout_rate: 0.2
    enc_positional_dropout_rate: 0.2
    enc_attention_dropout_rate: 0.2
    enc_normalize_before: true
    enc_macaron_style: true
    enc_use_cnn_module: true
    enc_selfattention_layer_type: legacy_rel_selfattn
    enc_activation_type: swish
    enc_pos_enc_layer_type: legacy_rel_pos
    enc_positionwise_layer_type: conv1d
    enc_positionwise_conv_kernel_size: 3
    dec_cnn_module_kernel: 31
    dec_attention_dim: 384
    dec_attention_heads: 2
    dec_linear_units: 1536
    dec_num_blocks: 4
    dec_dropout_rate: 0.2
    dec_positional_dropout_rate: 0.2
    dec_attention_dropout_rate: 0.2
    dec_macaron_style: true
    dec_use_cnn_module: true
    dec_selfattention_layer_type: legacy_rel_selfattn
    dec_activation_type: swish
    dec_pos_enc_layer_type: legacy_rel_pos
    dec_positionwise_layer_type: conv1d
    dec_positionwise_conv_kernel_size: 3

###########################################################
#                     OPTIMIZER SETTING                   #
###########################################################
optimizer:
    optim: adam              # optimizer type
    learning_rate: 0.001     # learning rate

###########################################################
#                     TRAINING SETTING                    #
###########################################################
max_epoch: 200
num_snapshots: 5

###########################################################
#                       OTHER SETTING                     #
###########################################################
seed: 10086

# Token vocabulary. The order of the entries is preserved exactly;
# presumably an entry's position is its integer id -- confirm against the
# code that loads this list before reordering anything.
token_list:
- <blank>
- <unk>
- d
- sp
- sh
- ii
- j
- zh
- l
- x
- b
- g
- uu
- e5
- h
- q
- m
- i1
- t
- z
- ch
- f
- s
- u4
- ix4
- i4
- n
- i3
- iu3
- vv
- ian4
- ix2
- r
- e4
- ai4
- k
- ing2
- a1
- en2
- ui4
- ong1
- uo3
- u2
- u3
- ao4
- ee
- p
- an1
- eng2
- i2
- in1
- c
- ai2
- ian2
- e2
- an4
- ing4
- v4
- ai3
- a5
- ian3
- eng1
- ong4
- ang4
- ian1
- ing1
- iy4
- ao3
- ang1
- uo4
- u1
- iao4
- iu4
- a4
- van2
- ie4
- ang2
- ou4
- iang4
- ix1
- er4
- iy1
- e1
- en1
- ui2
- an3
- ei4
- ong2
- uo1
- ou3
- uo2
- iao1
- ou1
- an2
- uan4
- ia4
- ia1
- ang3
- v3
- iu2
- iao3
- in4
- a3
- ei3
- iang3
- v2
- eng4
- en3
- aa
- uan1
- v1
- ao1
- ve4
- ie3
- ai1
- ing3
- iang1
- a2
- ui1
- en4
- en5
- in3
- uan3
- e3
- ie1
- ve2
- ei2
- in2
- ix3
- uan2
- iang2
- ie2
- ua4
- ou2
- uai4
- er2
- eng3
- uang3
- un1
- ong3
- uang4
- vn4
- un2
- iy3
- iz4
- ui3
- iao2
- iong4
- un4
- van4
- ao2
- uang1
- iy5
- o2
- ei1
- ua1
- iu1
- uang2
- er5
- o1
- un3
- vn1
- vn2
- o4
- ve1
- van3
- ua2
- er3
- iong3
- van1
- ia2
- iy2
- ia3
- iong1
- uo5
- oo
- ve3
- ou5
- uai3
- ian5
- iong2
- uai2
- uai1
- ua3
- vn3
- ia5
- ie5
- ueng1
- o5
- o3
- iang5
- ei5
- <sos/eos>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
#!/bin/bash
# Data preparation for the AISHELL-3 recipe.
#
# Usage: <this script> <config_path>
#
# Environment (must be exported by the caller):
#   MAIN_ROOT  directory that contains utils/
#   BIN_DIR    directory that contains preprocess.py and normalize.py
#
# Stages (edit stage / stop_stage below to run a sub-range):
#   0  generate durations.txt from the MFA alignment results
#   1  extract features into dump/
#   2  compute statistics of the "speech" field of the training metadata
#   3  normalize train/dev/test using the training-set statistics

# Stop at the first failing command: each stage consumes the files written
# by the previous one, so continuing after an error only hides the cause.
set -euo pipefail

stage=0
stop_stage=100

if [ "$#" -lt 1 ]; then
    echo "Usage: $0 <config_path>" >&2
    exit 1
fi
config_path=$1

# Fail with a clear message instead of invoking "python3 /utils/...".
: "${MAIN_ROOT:?MAIN_ROOT is not set}"
: "${BIN_DIR:?BIN_DIR is not set}"

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # get durations from MFA's result
    echo "Generate durations.txt from MFA results ..."
    python3 "${MAIN_ROOT}/utils/gen_duration_from_textgrid.py" \
        --inputdir=./aishell3_alignment_tone \
        --output durations.txt \
        --config="${config_path}"
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # extract features
    # NOTE(review): bash does not expand '~' after '=' inside an option word,
    # so the literal string "~/datasets/data_aishell3/" is passed through;
    # presumably preprocess.py expands it -- confirm.
    echo "Extract features ..."
    python3 "${BIN_DIR}/preprocess.py" \
        --dataset=aishell3 \
        --rootdir=~/datasets/data_aishell3/ \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config="${config_path}" \
        --num-cpu=20 \
        --cut-sil=True
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    # get features' stats(mean and std)
    echo "Get features' stats ..."
    python3 "${MAIN_ROOT}/utils/compute_statistics.py" \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="speech"
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    # normalize and convert phone/speaker to id, dev and test should use train's stats
    echo "Normalize ..."
    # Same call for every subset; only the metadata/dump paths differ.
    for subset in train dev test; do
        python3 "${BIN_DIR}/normalize.py" \
            --metadata="dump/${subset}/raw/metadata.jsonl" \
            --dumpdir="dump/${subset}/norm" \
            --speech-stats=dump/train/speech_stats.npy \
            --phones-dict=dump/phone_id_map.txt \
            --speaker-dict=dump/speaker_id_map.txt
    done
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
#!/bin/bash
# Placeholder: this script contains no commands yet.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
#!/bin/bash
# Launch training on a single GPU.
#
# Usage: <this script> <config_path> <train_output_path>
#
# Environment (must be exported by the caller):
#   BIN_DIR  directory that contains train.py
#
# Reads dump/train/norm and dump/dev/norm metadata plus dump/phone_id_map.txt
# relative to the current working directory.

# Propagate a train.py failure to the caller and reject unset variables.
set -euo pipefail

if [ "$#" -lt 2 ]; then
    echo "Usage: $0 <config_path> <train_output_path>" >&2
    exit 1
fi
: "${BIN_DIR:?BIN_DIR is not set}"

config_path=$1
train_output_path=$2

python3 "${BIN_DIR}/train.py" \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config="${config_path}" \
    --output-dir="${train_output_path}" \
    --ngpu=1 \
    --phones-dict=dump/phone_id_map.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#!/bin/bash
# Environment setup: exports the repository root, the executable and Python
# search paths, and the directory holding the model's experiment scripts.
# No shell options are changed here, so sourcing this file does not alter
# the calling shell's error-handling mode.

# Repository root: three levels above the current working directory.
# Assigned separately from `export` so that a realpath failure is not masked.
MAIN_ROOT=$(realpath "${PWD}/../../../") || {
    echo "cannot resolve repository root from ${PWD}" >&2
    # `return` works when sourced; fall back to `exit` when executed.
    return 1 2>/dev/null || exit 1
}
export MAIN_ROOT

export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# ${PYTHONPATH:-} keeps this working when PYTHONPATH is unset and the calling
# shell runs with `set -u`.
export PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH:-}"

MODEL=ernie_sat
export BIN_DIR="${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}"
Oops, something went wrong.