forked from PaddlePaddle/PaddleSpeech
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add XPU support for SpeedySpeech (PaddlePaddle#3502)
* Add XPU support for SpeedySpeech * fix typos * update description of nxpu
- Loading branch information
Showing
8 changed files
with
363 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
#!/bin/bash | ||
# Runs inference.py on an XPU device with the speedyspeech_csmsc acoustic model | ||
# and one of several vocoders, reading models from <train_output_path>/inference. | ||
# | ||
# Arguments: | ||
#   $1 - train_output_path: directory that contains `inference/` and receives `pd_infer_out/` | ||
# Environment: | ||
#   BIN_DIR - not set in this script; must already be defined by the caller | ||
#             (presumably exported by path.sh - confirm). | ||
|
||
train_output_path=$1 | ||
|
||
# A stage N runs only when stage <= N <= stop_stage. With both set to 0, | ||
# only the pwgan stage below runs unless these values are edited. | ||
stage=0 | ||
stop_stage=0 | ||
|
||
# Stage 0: Parallel WaveGAN (pwgan) vocoder | ||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then | ||
    python3 ${BIN_DIR}/../inference.py \ | ||
        --inference_dir=${train_output_path}/inference \ | ||
        --am=speedyspeech_csmsc \ | ||
        --voc=pwgan_csmsc \ | ||
        --text=${BIN_DIR}/../../assets/sentences.txt \ | ||
        --output_dir=${train_output_path}/pd_infer_out \ | ||
        --phones_dict=dump/phone_id_map.txt \ | ||
        --tones_dict=dump/tone_id_map.txt \ | ||
        --device xpu | ||
fi | ||
|
||
# for more GAN Vocoders | ||
# Stage 1: multi band melgan vocoder | ||
# (writes to the same pd_infer_out directory as the other stages) | ||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then | ||
    python3 ${BIN_DIR}/../inference.py \ | ||
        --inference_dir=${train_output_path}/inference \ | ||
        --am=speedyspeech_csmsc \ | ||
        --voc=mb_melgan_csmsc \ | ||
        --text=${BIN_DIR}/../../assets/sentences.txt \ | ||
        --output_dir=${train_output_path}/pd_infer_out \ | ||
        --phones_dict=dump/phone_id_map.txt \ | ||
        --tones_dict=dump/tone_id_map.txt \ | ||
        --device xpu | ||
fi | ||
|
||
# Stage 2: hifigan vocoder | ||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then | ||
    python3 ${BIN_DIR}/../inference.py \ | ||
        --inference_dir=${train_output_path}/inference \ | ||
        --am=speedyspeech_csmsc \ | ||
        --voc=hifigan_csmsc \ | ||
        --text=${BIN_DIR}/../../assets/sentences.txt \ | ||
        --output_dir=${train_output_path}/pd_infer_out \ | ||
        --phones_dict=dump/phone_id_map.txt \ | ||
        --tones_dict=dump/tone_id_map.txt \ | ||
        --device xpu | ||
fi |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
#!/bin/bash | ||
# Runs synthesize_e2e.py (text in assets/sentences.txt -> audio) on one XPU, | ||
# using a speedyspeech_csmsc checkpoint and one of several vocoders. | ||
# | ||
# Arguments: | ||
#   $1 - config_path: acoustic model config passed as --am_config | ||
#   $2 - train_output_path: directory holding `checkpoints/`; receives `test_e2e/` and `inference/` | ||
#   $3 - ckpt_name: checkpoint file name under <train_output_path>/checkpoints/ | ||
# Environment: | ||
#   BIN_DIR - not set in this script; must already be defined by the caller | ||
#             (presumably exported by path.sh - confirm). | ||
# Vocoder config/checkpoint/stat paths are relative to the current working directory. | ||
|
||
config_path=$1 | ||
train_output_path=$2 | ||
ckpt_name=$3 | ||
|
||
# A stage N runs only when stage <= N <= stop_stage. With both set to 0, | ||
# only the pwgan stage below runs unless these values are edited. | ||
stage=0 | ||
stop_stage=0 | ||
|
||
# Stage 0: Parallel WaveGAN (pwgan) vocoder | ||
# FLAGS_allocator_strategy is set in the environment of the python3 process only. | ||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then | ||
    FLAGS_allocator_strategy=naive_best_fit \ | ||
    python3 ${BIN_DIR}/../synthesize_e2e.py \ | ||
        --am=speedyspeech_csmsc \ | ||
        --am_config=${config_path} \ | ||
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ | ||
        --am_stat=dump/train/feats_stats.npy \ | ||
        --voc=pwgan_csmsc \ | ||
        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ | ||
        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ | ||
        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ | ||
        --lang=zh \ | ||
        --text=${BIN_DIR}/../../assets/sentences.txt \ | ||
        --output_dir=${train_output_path}/test_e2e \ | ||
        --phones_dict=dump/phone_id_map.txt \ | ||
        --tones_dict=dump/tone_id_map.txt \ | ||
        --inference_dir=${train_output_path}/inference \ | ||
        --ngpu=0 \ | ||
        --nxpu=1 | ||
fi | ||
|
||
# for more GAN Vocoders | ||
# Stage 1: multi band melgan vocoder | ||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then | ||
    FLAGS_allocator_strategy=naive_best_fit \ | ||
    python3 ${BIN_DIR}/../synthesize_e2e.py \ | ||
        --am=speedyspeech_csmsc \ | ||
        --am_config=${config_path} \ | ||
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ | ||
        --am_stat=dump/train/feats_stats.npy \ | ||
        --voc=mb_melgan_csmsc \ | ||
        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ | ||
        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ | ||
        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ | ||
        --lang=zh \ | ||
        --text=${BIN_DIR}/../../assets/sentences.txt \ | ||
        --output_dir=${train_output_path}/test_e2e \ | ||
        --phones_dict=dump/phone_id_map.txt \ | ||
        --tones_dict=dump/tone_id_map.txt \ | ||
        --inference_dir=${train_output_path}/inference \ | ||
        --ngpu=0 \ | ||
        --nxpu=1 | ||
fi | ||
|
||
# Stage 2: style melgan vocoder | ||
# NOTE: the pretrained models have not been released yet. | ||
# NOTE: dygraph-to-static conversion for style melgan is not ready yet, which is | ||
# presumably why --inference_dir is left commented out at the end of this stage. | ||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then | ||
    FLAGS_allocator_strategy=naive_best_fit \ | ||
    python3 ${BIN_DIR}/../synthesize_e2e.py \ | ||
        --am=speedyspeech_csmsc \ | ||
        --am_config=${config_path} \ | ||
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ | ||
        --am_stat=dump/train/feats_stats.npy \ | ||
        --voc=style_melgan_csmsc \ | ||
        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ | ||
        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ | ||
        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ | ||
        --lang=zh \ | ||
        --text=${BIN_DIR}/../../assets/sentences.txt \ | ||
        --output_dir=${train_output_path}/test_e2e \ | ||
        --phones_dict=dump/phone_id_map.txt \ | ||
        --tones_dict=dump/tone_id_map.txt \ | ||
        --ngpu=0 \ | ||
        --nxpu=1 | ||
        # --inference_dir=${train_output_path}/inference | ||
fi | ||
|
||
# Stage 3: hifigan vocoder | ||
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then | ||
    FLAGS_allocator_strategy=naive_best_fit \ | ||
    python3 ${BIN_DIR}/../synthesize_e2e.py \ | ||
        --am=speedyspeech_csmsc \ | ||
        --am_config=${config_path} \ | ||
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ | ||
        --am_stat=dump/train/feats_stats.npy \ | ||
        --voc=hifigan_csmsc \ | ||
        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ | ||
        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ | ||
        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ | ||
        --lang=zh \ | ||
        --text=${BIN_DIR}/../../assets/sentences.txt \ | ||
        --output_dir=${train_output_path}/test_e2e \ | ||
        --phones_dict=dump/phone_id_map.txt \ | ||
        --tones_dict=dump/tone_id_map.txt \ | ||
        --inference_dir=${train_output_path}/inference \ | ||
        --ngpu=0 \ | ||
        --nxpu=1 | ||
fi | ||
|
||
# Stage 4: wavernn vocoder (prints a marker line before launching) | ||
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then | ||
    echo "in wavernn syn_e2e" | ||
    FLAGS_allocator_strategy=naive_best_fit \ | ||
    python3 ${BIN_DIR}/../synthesize_e2e.py \ | ||
        --am=speedyspeech_csmsc \ | ||
        --am_config=${config_path} \ | ||
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ | ||
        --am_stat=dump/train/feats_stats.npy \ | ||
        --voc=wavernn_csmsc \ | ||
        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ | ||
        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ | ||
        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ | ||
        --lang=zh \ | ||
        --text=${BIN_DIR}/../../assets/sentences.txt \ | ||
        --output_dir=${train_output_path}/test_e2e \ | ||
        --phones_dict=dump/phone_id_map.txt \ | ||
        --tones_dict=dump/tone_id_map.txt \ | ||
        --inference_dir=${train_output_path}/inference \ | ||
        --ngpu=0 \ | ||
        --nxpu=1 | ||
fi |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
#!/bin/bash | ||
# Runs synthesize.py on one XPU over the test metadata in | ||
# dump/test/norm/metadata.jsonl, using a speedyspeech_csmsc checkpoint and | ||
# one of several vocoders. Output goes to <train_output_path>/test. | ||
# | ||
# Arguments: | ||
#   $1 - config_path: acoustic model config passed as --am_config | ||
#   $2 - train_output_path: directory holding `checkpoints/`; receives `test/` | ||
#   $3 - ckpt_name: checkpoint file name under <train_output_path>/checkpoints/ | ||
# Environment: | ||
#   BIN_DIR - not set in this script; must already be defined by the caller | ||
#             (presumably exported by path.sh - confirm). | ||
# Vocoder config/checkpoint/stat paths are relative to the current working directory. | ||
|
||
config_path=$1 | ||
train_output_path=$2 | ||
ckpt_name=$3 | ||
# A stage N runs only when stage <= N <= stop_stage. With both set to 0, | ||
# only the pwgan stage below runs unless these values are edited. | ||
stage=0 | ||
stop_stage=0 | ||
|
||
# Stage 0: Parallel WaveGAN (pwgan) vocoder | ||
# FLAGS_allocator_strategy is set in the environment of the python3 process only. | ||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then | ||
    FLAGS_allocator_strategy=naive_best_fit \ | ||
    python3 ${BIN_DIR}/../synthesize.py \ | ||
        --am=speedyspeech_csmsc \ | ||
        --am_config=${config_path} \ | ||
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ | ||
        --am_stat=dump/train/feats_stats.npy \ | ||
        --voc=pwgan_csmsc \ | ||
        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ | ||
        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ | ||
        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ | ||
        --test_metadata=dump/test/norm/metadata.jsonl \ | ||
        --output_dir=${train_output_path}/test \ | ||
        --phones_dict=dump/phone_id_map.txt \ | ||
        --tones_dict=dump/tone_id_map.txt \ | ||
        --ngpu=0 \ | ||
        --nxpu=1 | ||
fi | ||
|
||
# for more GAN Vocoders | ||
# Stage 1: multi band melgan vocoder | ||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then | ||
    FLAGS_allocator_strategy=naive_best_fit \ | ||
    python3 ${BIN_DIR}/../synthesize.py \ | ||
        --am=speedyspeech_csmsc \ | ||
        --am_config=${config_path} \ | ||
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ | ||
        --am_stat=dump/train/feats_stats.npy \ | ||
        --voc=mb_melgan_csmsc \ | ||
        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ | ||
        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ | ||
        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ | ||
        --test_metadata=dump/test/norm/metadata.jsonl \ | ||
        --output_dir=${train_output_path}/test \ | ||
        --phones_dict=dump/phone_id_map.txt \ | ||
        --tones_dict=dump/tone_id_map.txt \ | ||
        --ngpu=0 \ | ||
        --nxpu=1 | ||
fi | ||
|
||
# Stage 2: style melgan vocoder | ||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then | ||
    FLAGS_allocator_strategy=naive_best_fit \ | ||
    python3 ${BIN_DIR}/../synthesize.py \ | ||
        --am=speedyspeech_csmsc \ | ||
        --am_config=${config_path} \ | ||
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ | ||
        --am_stat=dump/train/feats_stats.npy \ | ||
        --voc=style_melgan_csmsc \ | ||
        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ | ||
        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ | ||
        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ | ||
        --test_metadata=dump/test/norm/metadata.jsonl \ | ||
        --output_dir=${train_output_path}/test \ | ||
        --phones_dict=dump/phone_id_map.txt \ | ||
        --tones_dict=dump/tone_id_map.txt \ | ||
        --ngpu=0 \ | ||
        --nxpu=1 | ||
fi | ||
|
||
# Stage 3: hifigan vocoder (prints a marker line before launching) | ||
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then | ||
    echo "in hifigan syn" | ||
    FLAGS_allocator_strategy=naive_best_fit \ | ||
    python3 ${BIN_DIR}/../synthesize.py \ | ||
        --am=speedyspeech_csmsc \ | ||
        --am_config=${config_path} \ | ||
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ | ||
        --am_stat=dump/train/feats_stats.npy \ | ||
        --voc=hifigan_csmsc \ | ||
        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ | ||
        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ | ||
        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ | ||
        --test_metadata=dump/test/norm/metadata.jsonl \ | ||
        --output_dir=${train_output_path}/test \ | ||
        --phones_dict=dump/phone_id_map.txt \ | ||
        --tones_dict=dump/tone_id_map.txt \ | ||
        --ngpu=0 \ | ||
        --nxpu=1 | ||
fi | ||
|
||
# Stage 4: wavernn vocoder (prints a marker line before launching) | ||
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then | ||
    echo "in wavernn syn" | ||
    FLAGS_allocator_strategy=naive_best_fit \ | ||
    python3 ${BIN_DIR}/../synthesize.py \ | ||
        --am=speedyspeech_csmsc \ | ||
        --am_config=${config_path} \ | ||
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ | ||
        --am_stat=dump/train/feats_stats.npy \ | ||
        --voc=wavernn_csmsc \ | ||
        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ | ||
        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ | ||
        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ | ||
        --test_metadata=dump/test/norm/metadata.jsonl \ | ||
        --output_dir=${train_output_path}/test \ | ||
        --tones_dict=dump/tone_id_map.txt \ | ||
        --phones_dict=dump/phone_id_map.txt \ | ||
        --ngpu=0 \ | ||
        --nxpu=1 | ||
fi |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#!/bin/bash | ||
# Trains the model on one XPU by launching ${BIN_DIR}/train.py with the | ||
# normalized train/dev metadata under dump/. | ||
# | ||
# The shebang must be the very first line of the script: with an empty line | ||
# in front of it the kernel does not recognise it, so the interpreter would | ||
# depend on whatever program happens to launch the script. | ||
# | ||
# Arguments: | ||
#   $1 - config_path: training config passed as --config | ||
#   $2 - train_output_path: directory passed as --output-dir | ||
# Environment: | ||
#   BIN_DIR - not set in this script; must already be defined by the caller | ||
#             (presumably exported by path.sh - confirm). | ||
|
||
config_path=$1 | ||
train_output_path=$2 | ||
|
||
# --ngpu=0 together with --nxpu=1 selects XPU-only training. | ||
python ${BIN_DIR}/train.py \ | ||
    --train-metadata=dump/train/norm/metadata.jsonl \ | ||
    --dev-metadata=dump/dev/norm/metadata.jsonl \ | ||
    --config=${config_path} \ | ||
    --output-dir=${train_output_path} \ | ||
    --ngpu=0 \ | ||
    --nxpu=1 \ | ||
    --phones-dict=dump/phone_id_map.txt \ | ||
    --tones-dict=dump/tone_id_map.txt \ | ||
    --use-relative-path=True |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
#!/bin/bash | ||
# Driver for the XPU recipe: preprocess -> train -> synthesize -> | ||
# synthesize_e2e -> static-model inference. Each step is a numbered stage that | ||
# runs when stage <= N <= stop_stage; with the defaults below (0..100) every | ||
# stage runs. | ||
|
||
# Abort on the first failing command; path.sh is sourced from the current | ||
# directory (presumably it defines MAIN_ROOT and BIN_DIR - confirm). | ||
set -e | ||
source path.sh | ||
|
||
# Defaults; xpus is passed to the child scripts as FLAGS_selected_xpus. | ||
xpus=0,1 | ||
stage=0 | ||
stop_stage=100 | ||
|
||
conf_path=conf/default.yaml | ||
train_output_path=exp/default | ||
ckpt_name=snapshot_iter_76.pdz | ||
|
||
# with the following command, you can choose the stage range you want to run | ||
# such as `./run_xpu.sh --stage 0 --stop-stage 0` | ||
# this cannot be mixed with positional arguments `$1`, `$2` ... | ||
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 | ||
|
||
# NOTE(review): the stages below use `exit -1`; exit statuses are limited to | ||
# 0-255, and bash reports this value as 255. | ||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then | ||
    # prepare data | ||
    ./local/preprocess.sh ${conf_path} || exit -1 | ||
fi | ||
|
||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then | ||
    # train model, all `ckpt` under `train_output_path/checkpoints/` dir | ||
    FLAGS_selected_xpus=${xpus} ./local/train_xpu.sh ${conf_path} ${train_output_path} || exit -1 | ||
fi | ||
|
||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then | ||
    # synthesize, vocoder is pwgan by default | ||
    FLAGS_selected_xpus=${xpus} ./local/synthesize_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 | ||
fi | ||
|
||
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then | ||
    # synthesize_e2e, vocoder is pwgan by default | ||
    FLAGS_selected_xpus=${xpus} ./local/synthesize_e2e_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 | ||
fi | ||
|
||
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then | ||
    # inference with static model | ||
    FLAGS_selected_xpus=${xpus} ./local/inference_xpu.sh ${train_output_path} || exit -1 | ||
fi |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.