From 55ff380c04bab0505a31c8be20b033e2053bb8f5 Mon Sep 17 00:00:00 2001 From: begeekmyfriend Date: Sun, 31 May 2020 12:31:01 +0800 Subject: [PATCH] Better mel feature extraction for better alignment Signed-off-by: begeekmyfriend --- common/preprocessor.py | 1 + scripts/train_tacotron2.sh | 2 +- tacotron2/loader.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/common/preprocessor.py b/common/preprocessor.py index 79603ea..481af5d 100644 --- a/common/preprocessor.py +++ b/common/preprocessor.py @@ -93,6 +93,7 @@ def _process_utterance(wav_dir, mel_dir, basename, wav_file, text, hparams): # Write the spectrogram and audio to disk filename = f'{basename}.npy' np.save(os.path.join(wav_dir, filename), wav, allow_pickle=False) + np.save(os.path.join(mel_dir, filename), mel_spectrogram, allow_pickle=False) # Return a tuple describing this training example return (filename, time_steps, mel_frames, text) diff --git a/scripts/train_tacotron2.sh b/scripts/train_tacotron2.sh index 5fd94aa..a3e0389 100644 --- a/scripts/train_tacotron2.sh +++ b/scripts/train_tacotron2.sh @@ -1 +1 @@ -CUDA_VISIBLE_DEVICES=0 python train.py --amp-run -o logs --init-lr 1e-3 --final-lr 1e-5 --epochs 200 -bs 32 --weight-decay 1e-6 --log-file nvlog.json --dataset-path training_data --training-anchor-dirs tts_fanfanli_22050 tts_xiaoya_22050 tts_yangluzhuo_22050 tts_yuanzhonglu_22050 +CUDA_VISIBLE_DEVICES=0 python train.py --amp-run -o logs --init-lr 1e-3 --final-lr 1e-5 --epochs 200 -bs 32 --weight-decay 1e-6 --log-file nvlog.json --dataset-path training_data --load-mel-from-disk --training-anchor-dirs tts_fanfanli_22050 tts_xiaoya_22050 tts_yangluzhuo_22050 tts_yuanzhonglu_22050 diff --git a/tacotron2/loader.py b/tacotron2/loader.py index 468f05c..6ac9805 100644 --- a/tacotron2/loader.py +++ b/tacotron2/loader.py @@ -40,7 +40,7 @@ def parse_tacotron2_args(parent, add_help=False): # misc parameters parser.add_argument('--mask-padding', default=False, type=bool, help='Use mask 
padding') parser.add_argument('--n-mel-channels', default=80, type=int, help='Number of bins in mel-spectrograms') - parser.add_argument('--mel_pad_val', default=-11.5129, type=float, help='Corresponding to silence') + parser.add_argument('--mel_pad_val', default=-5, type=float, help='Corresponding to silence') # symbols parameters global symbols