From c088b9a304811775d26dd252f3fa987662917f0c Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 12 Jan 2022 04:02:56 +0000 Subject: [PATCH 1/5] add csmsc tacotron2 --- examples/aishell3/tts3/conf/default.yaml | 5 +- examples/aishell3/vc1/conf/default.yaml | 4 +- examples/aishell3/voc1/conf/default.yaml | 6 +- examples/csmsc/tts0/README.md | 264 +++++++++ examples/csmsc/tts0/conf/default.yaml | 95 ++++ examples/csmsc/tts0/local/preprocess.sh | 62 +++ examples/csmsc/tts0/local/synthesize.sh | 20 + examples/csmsc/tts0/local/synthesize_e2e.sh | 91 +++ examples/csmsc/tts0/local/train.sh | 12 + examples/csmsc/tts0/path.sh | 13 + examples/csmsc/tts0/run.sh | 37 ++ examples/csmsc/tts3/conf/conformer.yaml | 8 +- examples/csmsc/tts3/conf/default.yaml | 5 +- examples/csmsc/tts3/run.sh | 2 +- examples/csmsc/voc1/conf/default.yaml | 10 +- examples/csmsc/voc3/conf/default.yaml | 6 +- examples/csmsc/voc3/conf/finetune.yaml | 6 +- examples/csmsc/voc4/conf/default.yaml | 6 +- examples/csmsc/voc5/conf/default.yaml | 32 +- examples/csmsc/voc5/conf/finetune.yaml | 32 +- examples/ljspeech/tts1/conf/default.yaml | 4 +- examples/ljspeech/tts3/conf/default.yaml | 4 +- examples/ljspeech/voc1/conf/default.yaml | 6 +- examples/vctk/tts3/conf/default.yaml | 4 +- examples/vctk/voc1/conf/default.yaml | 6 +- paddlespeech/t2s/datasets/am_batch_fn.py | 31 +- .../t2s/exps/fastspeech2/gen_gta_mel.py | 36 +- .../t2s/exps/new_tacotron2/normalize.py | 1 + .../t2s/exps/new_tacotron2/preprocess.py | 353 ++++++++++++ paddlespeech/t2s/exps/new_tacotron2/train.py | 190 +++++++ paddlespeech/t2s/exps/synthesize.py | 13 +- paddlespeech/t2s/exps/synthesize_e2e.py | 10 +- .../t2s/models/fastspeech2/fastspeech2.py | 3 +- .../models/fastspeech2/fastspeech2_updater.py | 10 +- .../t2s/models/new_tacotron2/__init__.py | 15 + .../t2s/models/new_tacotron2/tacotron2.py | 496 +++++++++++++++++ .../models/new_tacotron2/tacotron2_updater.py | 217 ++++++++ .../transformer_tts_updater.py | 32 +- 
paddlespeech/t2s/modules/losses.py | 244 ++++++++ .../t2s/modules/tacotron2/attentions.py | 519 +++++++++++++++++ paddlespeech/t2s/modules/tacotron2/decoder.py | 527 ++++++++++++++++++ paddlespeech/t2s/modules/tacotron2/encoder.py | 8 +- paddlespeech/t2s/training/optimizer.py | 24 +- 43 files changed, 3335 insertions(+), 134 deletions(-) create mode 100644 examples/csmsc/tts0/README.md create mode 100644 examples/csmsc/tts0/conf/default.yaml create mode 100755 examples/csmsc/tts0/local/preprocess.sh create mode 100755 examples/csmsc/tts0/local/synthesize.sh create mode 100755 examples/csmsc/tts0/local/synthesize_e2e.sh create mode 100755 examples/csmsc/tts0/local/train.sh create mode 100755 examples/csmsc/tts0/path.sh create mode 100755 examples/csmsc/tts0/run.sh create mode 120000 paddlespeech/t2s/exps/new_tacotron2/normalize.py create mode 100644 paddlespeech/t2s/exps/new_tacotron2/preprocess.py create mode 100644 paddlespeech/t2s/exps/new_tacotron2/train.py create mode 100644 paddlespeech/t2s/models/new_tacotron2/__init__.py create mode 100644 paddlespeech/t2s/models/new_tacotron2/tacotron2.py create mode 100644 paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py create mode 100644 paddlespeech/t2s/modules/tacotron2/attentions.py diff --git a/examples/aishell3/tts3/conf/default.yaml b/examples/aishell3/tts3/conf/default.yaml index 3a57e902607..69307049af3 100644 --- a/examples/aishell3/tts3/conf/default.yaml +++ b/examples/aishell3/tts3/conf/default.yaml @@ -64,14 +64,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in 
energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder spk_embed_dim: 256 # speaker embedding dimension spk_embed_integration_type: concat # speaker embedding integration type @@ -84,7 +84,6 @@ updater: use_masking: True # whether to apply masking for padded part in loss calculation - ########################################################### # OPTIMIZER SETTING # ########################################################### diff --git a/examples/aishell3/vc1/conf/default.yaml b/examples/aishell3/vc1/conf/default.yaml index 557a5a0a1cd..69307049af3 100644 --- a/examples/aishell3/vc1/conf/default.yaml +++ b/examples/aishell3/vc1/conf/default.yaml @@ -64,14 +64,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 
1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder spk_embed_dim: 256 # speaker embedding dimension spk_embed_integration_type: concat # speaker embedding integration type diff --git a/examples/aishell3/voc1/conf/default.yaml b/examples/aishell3/voc1/conf/default.yaml index 7fbffbdde01..e2102d6e7e6 100644 --- a/examples/aishell3/voc1/conf/default.yaml +++ b/examples/aishell3/voc1/conf/default.yaml @@ -33,7 +33,7 @@ generator_params: aux_context_window: 2 # Context window size for auxiliary feature. # If set to 2, previous 2 and future 2 frames will be considered. dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. - use_weight_norm: true # Whether to use weight norm. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. upsample_scales: [4, 5, 3, 5] # Upsampling scales. prod(upsample_scales) == n_shift @@ -46,8 +46,8 @@ discriminator_params: kernel_size: 3 # Number of output channels. layers: 10 # Number of conv layers. conv_channels: 64 # Number of chnn layers. - bias: true # Whether to use bias parameter in conv. - use_weight_norm: true # Whether to use weight norm. + bias: True # Whether to use bias parameter in conv. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. nonlinear_activation: "leakyrelu" # Nonlinear function after each conv. 
nonlinear_activation_params: # Nonlinear function parameters diff --git a/examples/csmsc/tts0/README.md b/examples/csmsc/tts0/README.md new file mode 100644 index 00000000000..13d291b5c39 --- /dev/null +++ b/examples/csmsc/tts0/README.md @@ -0,0 +1,264 @@ +# Tacotron2 with CSMSC +This example contains code used to train a [Tacotron2](https://arxiv.org/abs/1712.05884) model with [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html). + +## Dataset +### Download and Extract +Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/source). + +### Get MFA Result and Extract +We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. + +## Get Started +Assume the path to the dataset is `~/datasets/BZNSYP`. +Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`. +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. + - synthesize waveform from a text file. +5. inference using the static model. +```bash +./run.sh +``` +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset. +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Data Preprocessing +```bash +./local/preprocess.sh ${conf_path} +``` +When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
+ +```text +dump +├── dev +│ ├── norm +│ └── raw +├── phone_id_map.txt +├── speaker_id_map.txt +├── test +│ ├── norm +│ └── raw +└── train + ├── energy_stats.npy + ├── norm + ├── pitch_stats.npy + ├── raw + └── speech_stats.npy +``` +The dataset is split into 3 parts, namely `train`, `dev`, and` test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains speech、pitch and energy features of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/*_stats.npy`. + +Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, speech_lengths, durations, the path of speech features, the path of pitch features, the path of energy features, speaker, and the id of each utterance. + +### Model Training +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. +Here's the complete help message. +```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] + +Train a FastSpeech2 model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG fastspeech2 config file. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu=0, use cpu. + --phones-dict PHONES_DICT + phone vocabulary file. + --speaker-dict SPEAKER_DICT + speaker id map file for multiple speaker model. + --voice-cloning VOICE_CLONING + whether training voice cloning model. +``` +1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. +2. 
`--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. +3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. + +### Synthesizing +We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. +Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it. +```bash +unzip pwg_baker_ckpt_0.4.zip +``` +Parallel WaveGAN checkpoint contains files listed below. +```text +pwg_baker_ckpt_0.4 +├── pwg_default.yaml # default config used to train parallel wavegan +├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan +└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +``` +`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. 
+```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] + [--am_stat AM_STAT] [--phones_dict PHONES_DICT] + [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] + [--voice-cloning VOICE_CLONING] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] + [--voc_stat VOC_STAT] [--ngpu NGPU] + [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] + +Synthesize with acoustic model & vocoder + +optional arguments: + -h, --help show this help message and exit + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + Choose acoustic model type of tts task. + --am_config AM_CONFIG + Config of acoustic model. Use deault config when it is + None. + --am_ckpt AM_CKPT Checkpoint file of acoustic model. + --am_stat AM_STAT mean and standard deviation used to normalize + spectrogram when training acoustic model. + --phones_dict PHONES_DICT + phone vocabulary file. + --tones_dict TONES_DICT + tone vocabulary file. + --speaker_dict SPEAKER_DICT + speaker id map file. + --voice-cloning VOICE_CLONING + whether training voice cloning model. + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + Choose vocoder type of tts task. + --voc_config VOC_CONFIG + Config of voc. Use deault config when it is None. + --voc_ckpt VOC_CKPT Checkpoint file of voc. + --voc_stat VOC_STAT mean and standard deviation used to normalize + spectrogram when training voc. + --ngpu NGPU if ngpu == 0, use cpu. + --test_metadata TEST_METADATA + test metadata. + --output_dir OUTPUT_DIR + output dir. +``` +`./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from text file. 
+```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize_e2e.py [-h] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] + [--am_stat AM_STAT] [--phones_dict PHONES_DICT] + [--tones_dict TONES_DICT] + [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] + [--voc_stat VOC_STAT] [--lang LANG] + [--inference_dir INFERENCE_DIR] [--ngpu NGPU] + [--text TEXT] [--output_dir OUTPUT_DIR] + +Synthesize with acoustic model & vocoder + +optional arguments: + -h, --help show this help message and exit + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + Choose acoustic model type of tts task. + --am_config AM_CONFIG + Config of acoustic model. Use deault config when it is + None. + --am_ckpt AM_CKPT Checkpoint file of acoustic model. + --am_stat AM_STAT mean and standard deviation used to normalize + spectrogram when training acoustic model. + --phones_dict PHONES_DICT + phone vocabulary file. + --tones_dict TONES_DICT + tone vocabulary file. + --speaker_dict SPEAKER_DICT + speaker id map file. + --spk_id SPK_ID spk id for multi speaker acoustic model + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + Choose vocoder type of tts task. + --voc_config VOC_CONFIG + Config of voc. Use deault config when it is None. + --voc_ckpt VOC_CKPT Checkpoint file of voc. + --voc_stat VOC_STAT mean and standard deviation used to normalize + spectrogram when training voc. + --lang LANG Choose model language. zh or en + --inference_dir INFERENCE_DIR + dir to save inference models + --ngpu NGPU if ngpu == 0, use cpu. + --text TEXT text to synthesize, a 'utt_id sentence' pair per line. 
+ --output_dir OUTPUT_DIR + output dir. +``` +1. `--am` is acoustic model type with the format {model_name}_{dataset} +2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. +3. `--voc` is vocoder type with the format {model_name}_{dataset} +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +5. `--lang` is the model language, which can be `zh` or `en`. +6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. +7. `--text` is the text file, which contains sentences to synthesize. +8. `--output_dir` is the directory to save synthesized audio files. +9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +### Inferencing +After synthesizing, we will get static models of fastspeech2 and pwgan in `${train_output_path}/inference`. +`./local/inference.sh` calls `${BIN_DIR}/inference.py`, which provides a paddle static model inference example for fastspeech2 + pwgan synthesize. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} +``` + +## Pretrained Model +Pretrained FastSpeech2 model with no silence in the edge of audios: +- [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip) +- [fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip) + +The static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip).
+ +Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss +:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: +default| 2(gpu) x 76000|1.0991|0.59132|0.035815|0.31915|0.15287| +conformer| 2(gpu) x 76000|1.0675|0.56103|0.035869|0.31553|0.15509| + +FastSpeech2 checkpoint contains files listed below. +```text +fastspeech2_nosil_baker_ckpt_0.4 +├── default.yaml # default config used to train fastspeech2 +├── phone_id_map.txt # phone vocabulary file when training fastspeech2 +├── snapshot_iter_76000.pdz # model parameters and optimizer states +└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2 +``` +You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models. +```bash +source path.sh + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ + --am_ckpt=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ + --am_stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=exp/default/test_e2e \ + --inference_dir=exp/default/inference \ + --phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt +``` diff --git a/examples/csmsc/tts0/conf/default.yaml b/examples/csmsc/tts0/conf/default.yaml new file mode 100644 index 00000000000..171aee8802c --- /dev/null +++ b/examples/csmsc/tts0/conf/default.yaml @@ -0,0 +1,95 @@ +# This configuration is for Paddle to train Tacotron 2. 
Compared to the +# original paper, this configuration additionally uses the guided attention +# loss to accelerate the learning of the diagonal attention. It requires +# only a single GPU with 12 GB memory and it takes ~1 day to finish the +# training on Titan V. + +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### + +fs: 24000 # sr +n_fft: 2048 # FFT size (samples). +n_shift: 300 # Hop size (samples). 12.5ms +win_length: 1200 # Window length (samples). 50ms + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. + +# Only used for feats_type != raw + +fmin: 80 # Minimum frequency of Mel basis. +fmax: 7600 # Maximum frequency of Mel basis. +n_mels: 80 # The number of mel basis. + +# Only used for the model using pitch features (e.g. FastSpeech2) +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. + +########################################################### +# DATA SETTING # +########################################################### +batch_size: 64 +num_workers: 2 + +########################################################### +# MODEL SETTING # +########################################################### +model: # keyword arguments for the selected model + embed_dim: 512 # char or phn embedding dimension + elayers: 1 # number of blstm layers in encoder + eunits: 512 # number of blstm units + econv_layers: 3 # number of convolutional layers in encoder + econv_chans: 512 # number of channels in convolutional layer + econv_filts: 5 # filter size of convolutional layer + atype: location # attention function type + adim: 512 # attention dimension + aconv_chans: 32 # number of channels in convolutional layer of attention + aconv_filts: 15 # filter size of convolutional layer of attention + cumulate_att_w: True # whether to cumulate attention weight + dlayers: 2 # number of lstm layers in decoder
+ dunits: 1024 # number of lstm units in decoder + prenet_layers: 2 # number of layers in prenet + prenet_units: 256 # number of units in prenet + postnet_layers: 5 # number of layers in postnet + postnet_chans: 512 # number of channels in postnet + postnet_filts: 5 # filter size of postnet layer + output_activation: null # activation function for the final output + use_batch_norm: True # whether to use batch normalization in encoder + use_concate: True # whether to concatenate encoder embedding with decoder outputs + use_residual: False # whether to use residual connection in encoder + dropout_rate: 0.5 # dropout rate + zoneout_rate: 0.1 # zoneout rate + reduction_factor: 1 # reduction factor + spk_embed_dim: null # speaker embedding dimension + + +########################################################### +# UPDATER SETTING # +########################################################### +updater: + use_masking: True # whether to apply masking for padded part in loss calculation + bce_pos_weight: 5.0 # weight of positive sample in binary cross entropy calculation + use_guided_attn_loss: True # whether to use guided attention loss + guided_attn_loss_sigma: 0.4 # sigma of guided attention loss + guided_attn_loss_lambda: 1.0 # strength of guided attention loss + + +########################################################## +# OPTIMIZER SETTING # +########################################################## +optimizer: + optim: adam # optimizer type + learning_rate: 1.0e-03 # learning rate + epsilon: 1.0e-06 # epsilon + weight_decay: 0.0 # weight decay coefficient + +########################################################### +# TRAINING SETTING # +########################################################### +max_epoch: 200 +num_snapshots: 5 + +########################################################### +# OTHER SETTING # +########################################################### +seed: 42 \ No newline at end of file diff --git a/examples/csmsc/tts0/local/preprocess.sh 
b/examples/csmsc/tts0/local/preprocess.sh new file mode 100755 index 00000000000..8a4b8dd9442 --- /dev/null +++ b/examples/csmsc/tts0/local/preprocess.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +stage=0 +stop_stage=100 + +config_path=$1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # get durations from MFA's result + echo "Generate durations.txt from MFA results ..." + python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ + --inputdir=./baker_alignment_tone \ + --output=durations.txt \ + --config=${config_path} +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # extract features + echo "Extract features ..." + python3 ${BIN_DIR}/preprocess.py \ + --dataset=baker \ + --rootdir=~/datasets/BZNSYP/ \ + --dumpdir=dump \ + --dur-file=durations.txt \ + --config=${config_path} \ + --num-cpu=20 \ + --cut-sil=True +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="speech" + +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize and covert phone to id, dev and test should use train's stats + echo "Normalize ..." 
+ python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --dumpdir=dump/train/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt + + python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/dev/raw/metadata.jsonl \ + --dumpdir=dump/dev/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt + + python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/test/raw/metadata.jsonl \ + --dumpdir=dump/test/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt +fi diff --git a/examples/csmsc/tts0/local/synthesize.sh b/examples/csmsc/tts0/local/synthesize.sh new file mode 100755 index 00000000000..4be06dd8055 --- /dev/null +++ b/examples/csmsc/tts0/local/synthesize.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../synthesize.py \ + --am=tacotron2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt diff --git a/examples/csmsc/tts0/local/synthesize_e2e.sh b/examples/csmsc/tts0/local/synthesize_e2e.sh new file mode 100755 index 00000000000..fe5d11d4400 --- /dev/null +++ b/examples/csmsc/tts0/local/synthesize_e2e.sh @@ -0,0 +1,91 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +stage=0 +stop_stage=0 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then 
+ FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=tacotron2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --inference_dir=${train_output_path}/inference \ + --phones_dict=dump/phone_id_map.txt +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \ + --voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\ + --voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --inference_dir=${train_output_path}/inference \ + --phones_dict=dump/phone_id_map.txt +fi + +# the pretrained models haven't release now +# style melgan +# style melgan's Dygraph to Static Graph is not ready now +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=style_melgan_csmsc \ + 
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt + # --inference_dir=${train_output_path}/inference +fi + +# hifigan +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in hifigan syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --inference_dir=${train_output_path}/inference \ + --phones_dict=dump/phone_id_map.txt +fi diff --git a/examples/csmsc/tts0/local/train.sh b/examples/csmsc/tts0/local/train.sh new file mode 100755 index 00000000000..f90db91505d --- /dev/null +++ b/examples/csmsc/tts0/local/train.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +python3 ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=1 \ + --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/csmsc/tts0/path.sh b/examples/csmsc/tts0/path.sh new file mode 100755 index 00000000000..9cdbe256e11 --- /dev/null +++ b/examples/csmsc/tts0/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export 
PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=new_tacotron2 +export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/csmsc/tts0/run.sh b/examples/csmsc/tts0/run.sh new file mode 100755 index 00000000000..86800920d68 --- /dev/null +++ b/examples/csmsc/tts0/run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_153.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/csmsc/tts3/conf/conformer.yaml b/examples/csmsc/tts3/conf/conformer.yaml index 252f634d8de..03e4f2e33cb 100644 --- a/examples/csmsc/tts3/conf/conformer.yaml +++ b/examples/csmsc/tts3/conf/conformer.yaml @@ -53,8 +53,8 @@ model: conformer_pos_enc_layer_type: rel_pos # conformer positional 
encoding type conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type conformer_activation_type: swish # conformer activation type - use_macaron_style_in_conformer: true # whether to use macaron style in conformer - use_cnn_in_conformer: true # whether to use CNN in conformer + use_macaron_style_in_conformer: True # whether to use macaron style in conformer + use_cnn_in_conformer: True # whether to use CNN in conformer conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder init_type: xavier_uniform # initialization type @@ -70,14 +70,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder diff --git a/examples/csmsc/tts3/conf/default.yaml b/examples/csmsc/tts3/conf/default.yaml index 1f723d67cd6..ce2b24d9227 100644 --- a/examples/csmsc/tts3/conf/default.yaml +++ 
b/examples/csmsc/tts3/conf/default.yaml @@ -64,14 +64,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder @@ -82,7 +82,6 @@ updater: use_masking: True # whether to apply masking for padded part in loss calculation - ########################################################### # OPTIMIZER SETTING # ########################################################### diff --git a/examples/csmsc/tts3/run.sh b/examples/csmsc/tts3/run.sh index c1ddd3b9862..8f06e933ccc 100755 --- a/examples/csmsc/tts3/run.sh +++ b/examples/csmsc/tts3/run.sh @@ -18,7 +18,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data - bash ./local/preprocess.sh ${conf_path} || exit -1 + ./local/preprocess.sh ${conf_path} || exit -1 fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then diff --git a/examples/csmsc/voc1/conf/default.yaml b/examples/csmsc/voc1/conf/default.yaml index 
28d218ff365..703be21b35d 100644 --- a/examples/csmsc/voc1/conf/default.yaml +++ b/examples/csmsc/voc1/conf/default.yaml @@ -34,10 +34,10 @@ generator_params: aux_context_window: 2 # Context window size for auxiliary feature. # If set to 2, previous 2 and future 2 frames will be considered. dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. - bias: true # use bias in residual blocks - use_weight_norm: true # Whether to use weight norm. + bias: True # use bias in residual blocks + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. - use_causal_conv: false # use causal conv in residual blocks and upsample layers + use_causal_conv: False # use causal conv in residual blocks and upsample layers upsample_scales: [4, 5, 3, 5] # Upsampling scales. Prodcut of these must be the same as hop size. interpolate_mode: "nearest" # upsample net interpolate mode freq_axis_kernel_size: 1 # upsamling net: convolution kernel size in frequencey axis @@ -53,8 +53,8 @@ discriminator_params: kernel_size: 3 # Number of output channels. layers: 10 # Number of conv layers. conv_channels: 64 # Number of chnn layers. - bias: true # Whether to use bias parameter in conv. - use_weight_norm: true # Whether to use weight norm. + bias: True # Whether to use bias parameter in conv. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. nonlinear_activation: "leakyrelu" # Nonlinear function after each conv. 
nonlinear_activation_params: # Nonlinear function parameters diff --git a/examples/csmsc/voc3/conf/default.yaml b/examples/csmsc/voc3/conf/default.yaml index 27e97664aa2..fbff54f193f 100644 --- a/examples/csmsc/voc3/conf/default.yaml +++ b/examples/csmsc/voc3/conf/default.yaml @@ -63,13 +63,13 @@ discriminator_params: ########################################################### # STFT LOSS SETTING # ########################################################### -use_stft_loss: true +use_stft_loss: True stft_loss_params: fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss. win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. window: "hann" # Window function for STFT-based loss -use_subband_stft_loss: true +use_subband_stft_loss: True subband_stft_loss_params: fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss @@ -79,7 +79,7 @@ subband_stft_loss_params: ########################################################### # ADVERSARIAL LOSS SETTING # ########################################################### -use_feat_match_loss: false # Whether to use feature matching loss. +use_feat_match_loss: False # Whether to use feature matching loss. lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss. ########################################################### diff --git a/examples/csmsc/voc3/conf/finetune.yaml b/examples/csmsc/voc3/conf/finetune.yaml index a3b1d8b113f..0a38c28200e 100644 --- a/examples/csmsc/voc3/conf/finetune.yaml +++ b/examples/csmsc/voc3/conf/finetune.yaml @@ -63,13 +63,13 @@ discriminator_params: ########################################################### # STFT LOSS SETTING # ########################################################### -use_stft_loss: true +use_stft_loss: True stft_loss_params: fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. 
hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. window: "hann" # Window function for STFT-based loss -use_subband_stft_loss: true +use_subband_stft_loss: True subband_stft_loss_params: fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss. @@ -79,7 +79,7 @@ subband_stft_loss_params: ########################################################### # ADVERSARIAL LOSS SETTING # ########################################################### -use_feat_match_loss: false # Whether to use feature matching loss. +use_feat_match_loss: False # Whether to use feature matching loss. lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss. ########################################################### diff --git a/examples/csmsc/voc4/conf/default.yaml b/examples/csmsc/voc4/conf/default.yaml index c9abf78dc22..cd8f8e2865d 100644 --- a/examples/csmsc/voc4/conf/default.yaml +++ b/examples/csmsc/voc4/conf/default.yaml @@ -65,7 +65,7 @@ discriminator_params: ########################################################### # STFT LOSS SETTING # ########################################################### -use_stft_loss: true +use_stft_loss: True stft_loss_params: fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss @@ -78,9 +78,9 @@ lambda_aux: 1.0 # Loss balancing coefficient for aux loss. ########################################################### lambda_adv: 1.0 # Loss balancing coefficient for adv loss. generator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. + average_by_discriminators: False # Whether to average loss by #discriminators. discriminator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. 
+ average_by_discriminators: False # Whether to average loss by #discriminators. ########################################################### # DATA LOADER SETTING # diff --git a/examples/csmsc/voc5/conf/default.yaml b/examples/csmsc/voc5/conf/default.yaml index f42fc385acf..38b94cf5c24 100644 --- a/examples/csmsc/voc5/conf/default.yaml +++ b/examples/csmsc/voc5/conf/default.yaml @@ -35,12 +35,12 @@ generator_params: - [1, 3, 5] - [1, 3, 5] - [1, 3, 5] - use_additional_convs: true # Whether to use additional conv layer in residual blocks. - bias: true # Whether to use bias parameter in conv. + use_additional_convs: True # Whether to use additional conv layer in residual blocks. + bias: True # Whether to use bias parameter in conv. nonlinear_activation: "leakyrelu" # Nonlinear activation type. nonlinear_activation_params: # Nonlinear activation paramters. negative_slope: 0.1 - use_weight_norm: true # Whether to apply weight normalization. + use_weight_norm: True # Whether to apply weight normalization. ########################################################### @@ -60,12 +60,12 @@ discriminator_params: channels: 128 # Initial number of channels. max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. max_groups: 16 # Maximum number of groups in downsampling conv layers. - bias: true + bias: True downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales. nonlinear_activation: "leakyrelu" # Nonlinear activation. nonlinear_activation_params: negative_slope: 0.1 - follow_official_norm: true # Whether to follow the official norm setting. + follow_official_norm: True # Whether to follow the official norm setting. periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator. period_discriminator_params: in_channels: 1 # Number of input channels. @@ -74,19 +74,19 @@ discriminator_params: channels: 32 # Initial number of channels. downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. 
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. - bias: true # Whether to use bias parameter in conv layer." + bias: True # Whether to use bias parameter in conv layer." nonlinear_activation: "leakyrelu" # Nonlinear activation. nonlinear_activation_params: # Nonlinear activation paramters. negative_slope: 0.1 - use_weight_norm: true # Whether to apply weight normalization. - use_spectral_norm: false # Whether to apply spectral normalization. + use_weight_norm: True # Whether to apply weight normalization. + use_spectral_norm: False # Whether to apply spectral normalization. ########################################################### # STFT LOSS SETTING # ########################################################### -use_stft_loss: false # Whether to use multi-resolution STFT loss. -use_mel_loss: true # Whether to use Mel-spectrogram loss. +use_stft_loss: False # Whether to use multi-resolution STFT loss. +use_mel_loss: True # Whether to use Mel-spectrogram loss. mel_loss_params: fs: 24000 fft_size: 2048 @@ -98,14 +98,14 @@ mel_loss_params: fmax: 12000 log_base: null generator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. + average_by_discriminators: False # Whether to average loss by #discriminators. discriminator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. -use_feat_match_loss: true + average_by_discriminators: False # Whether to average loss by #discriminators. +use_feat_match_loss: True feat_match_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. - average_by_layers: false # Whether to average loss by #layers in each discriminator. - include_final_outputs: false # Whether to include final outputs in feat match loss calculation. + average_by_discriminators: False # Whether to average loss by #discriminators. 
+ average_by_layers: False # Whether to average loss by #layers in each discriminator. + include_final_outputs: False # Whether to include final outputs in feat match loss calculation. ########################################################### # ADVERSARIAL LOSS SETTING # diff --git a/examples/csmsc/voc5/conf/finetune.yaml b/examples/csmsc/voc5/conf/finetune.yaml index 73420625111..110ae052bdd 100644 --- a/examples/csmsc/voc5/conf/finetune.yaml +++ b/examples/csmsc/voc5/conf/finetune.yaml @@ -35,12 +35,12 @@ generator_params: - [1, 3, 5] - [1, 3, 5] - [1, 3, 5] - use_additional_convs: true # Whether to use additional conv layer in residual blocks. - bias: true # Whether to use bias parameter in conv. + use_additional_convs: True # Whether to use additional conv layer in residual blocks. + bias: True # Whether to use bias parameter in conv. nonlinear_activation: "leakyrelu" # Nonlinear activation type. nonlinear_activation_params: # Nonlinear activation paramters. negative_slope: 0.1 - use_weight_norm: true # Whether to apply weight normalization. + use_weight_norm: True # Whether to apply weight normalization. ########################################################### @@ -60,12 +60,12 @@ discriminator_params: channels: 128 # Initial number of channels. max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. max_groups: 16 # Maximum number of groups in downsampling conv layers. - bias: true + bias: True downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales. nonlinear_activation: "leakyrelu" # Nonlinear activation. nonlinear_activation_params: negative_slope: 0.1 - follow_official_norm: true # Whether to follow the official norm setting. + follow_official_norm: True # Whether to follow the official norm setting. periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator. period_discriminator_params: in_channels: 1 # Number of input channels. 
@@ -74,19 +74,19 @@ discriminator_params: channels: 32 # Initial number of channels. downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. - bias: true # Whether to use bias parameter in conv layer." + bias: True # Whether to use bias parameter in conv layer." nonlinear_activation: "leakyrelu" # Nonlinear activation. nonlinear_activation_params: # Nonlinear activation paramters. negative_slope: 0.1 - use_weight_norm: true # Whether to apply weight normalization. - use_spectral_norm: false # Whether to apply spectral normalization. + use_weight_norm: True # Whether to apply weight normalization. + use_spectral_norm: False # Whether to apply spectral normalization. ########################################################### # STFT LOSS SETTING # ########################################################### -use_stft_loss: false # Whether to use multi-resolution STFT loss. -use_mel_loss: true # Whether to use Mel-spectrogram loss. +use_stft_loss: False # Whether to use multi-resolution STFT loss. +use_mel_loss: True # Whether to use Mel-spectrogram loss. mel_loss_params: fs: 24000 fft_size: 2048 @@ -98,14 +98,14 @@ mel_loss_params: fmax: 12000 log_base: null generator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. + average_by_discriminators: False # Whether to average loss by #discriminators. discriminator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. -use_feat_match_loss: true + average_by_discriminators: False # Whether to average loss by #discriminators. +use_feat_match_loss: True feat_match_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. - average_by_layers: false # Whether to average loss by #layers in each discriminator. - include_final_outputs: false # Whether to include final outputs in feat match loss calculation. 
+ average_by_discriminators: False # Whether to average loss by #discriminators. + average_by_layers: False # Whether to average loss by #layers in each discriminator. + include_final_outputs: False # Whether to include final outputs in feat match loss calculation. ########################################################### # ADVERSARIAL LOSS SETTING # diff --git a/examples/ljspeech/tts1/conf/default.yaml b/examples/ljspeech/tts1/conf/default.yaml index 6b495effc8d..456b6a1e353 100644 --- a/examples/ljspeech/tts1/conf/default.yaml +++ b/examples/ljspeech/tts1/conf/default.yaml @@ -63,9 +63,9 @@ model: # keyword arguments for the selected model # UPDATER SETTING # ########################################################### updater: - use_masking: true # whether to apply masking for padded part in loss calculation + use_masking: True # whether to apply masking for padded part in loss calculation loss_type: L1 - use_guided_attn_loss: true # whether to use guided attention loss + use_guided_attn_loss: True # whether to use guided attention loss guided_attn_loss_sigma: 0.4 # sigma in guided attention loss guided_attn_loss_lambda: 10.0 # lambda in guided attention loss modules_applied_guided_attn: ["encoder-decoder"] # modules to apply guided attention loss diff --git a/examples/ljspeech/tts3/conf/default.yaml b/examples/ljspeech/tts3/conf/default.yaml index 872dafcbe35..15cfda2c651 100644 --- a/examples/ljspeech/tts3/conf/default.yaml +++ b/examples/ljspeech/tts3/conf/default.yaml @@ -64,14 +64,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv 
layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder diff --git a/examples/ljspeech/voc1/conf/default.yaml b/examples/ljspeech/voc1/conf/default.yaml index 2d39beb795d..d30960d657d 100644 --- a/examples/ljspeech/voc1/conf/default.yaml +++ b/examples/ljspeech/voc1/conf/default.yaml @@ -33,7 +33,7 @@ generator_params: aux_context_window: 2 # Context window size for auxiliary feature. # If set to 2, previous 2 and future 2 frames will be considered. dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. - use_weight_norm: true # Whether to use weight norm. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. upsample_scales: [4, 4, 4, 4] # Upsampling scales. prod(upsample_scales) == n_shift @@ -46,8 +46,8 @@ discriminator_params: kernel_size: 3 # Number of output channels. layers: 10 # Number of conv layers. conv_channels: 64 # Number of chnn layers. - bias: true # Whether to use bias parameter in conv. - use_weight_norm: true # Whether to use weight norm. + bias: True # Whether to use bias parameter in conv. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. nonlinear_activation: "leakyrelu" # Nonlinear function after each conv. 
nonlinear_activation_params: # Nonlinear function parameters diff --git a/examples/vctk/tts3/conf/default.yaml b/examples/vctk/tts3/conf/default.yaml index 2738e7c2245..86d4a0d5a88 100644 --- a/examples/vctk/tts3/conf/default.yaml +++ b/examples/vctk/tts3/conf/default.yaml @@ -64,14 +64,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder spk_embed_dim: 256 # speaker embedding dimension spk_embed_integration_type: concat # speaker embedding integration type diff --git a/examples/vctk/voc1/conf/default.yaml b/examples/vctk/voc1/conf/default.yaml index 59ce3825dcc..af859d4cccc 100644 --- a/examples/vctk/voc1/conf/default.yaml +++ b/examples/vctk/voc1/conf/default.yaml @@ -33,7 +33,7 @@ generator_params: aux_context_window: 2 # Context window size for auxiliary feature. # If set to 2, previous 2 and future 2 frames will be considered. dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. 
- use_weight_norm: true # Whether to use weight norm. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. upsample_scales: [4, 5, 3, 5] # Upsampling scales. prod(upsample_scales) == n_shift @@ -46,8 +46,8 @@ discriminator_params: kernel_size: 3 # Number of output channels. layers: 10 # Number of conv layers. conv_channels: 64 # Number of chnn layers. - bias: true # Whether to use bias parameter in conv. - use_weight_norm: true # Whether to use weight norm. + bias: True # Whether to use bias parameter in conv. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. nonlinear_activation: "leakyrelu" # Nonlinear function after each conv. nonlinear_activation_params: # Nonlinear function parameters diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py index 526871a232d..2fcb46d9e28 100644 --- a/paddlespeech/t2s/datasets/am_batch_fn.py +++ b/paddlespeech/t2s/datasets/am_batch_fn.py @@ -17,6 +17,35 @@ from paddlespeech.t2s.data.batch import batch_sequences +def tacotron2_single_spk_batch_fn(examples): + # fields = ["text", "text_lengths", "speech", "speech_lengths"] + text = [np.array(item["text"], dtype=np.int64) for item in examples] + speech = [np.array(item["speech"], dtype=np.float32) for item in examples] + text_lengths = [ + np.array(item["text_lengths"], dtype=np.int64) for item in examples + ] + speech_lengths = [ + np.array(item["speech_lengths"], dtype=np.int64) for item in examples + ] + + text = batch_sequences(text) + speech = batch_sequences(speech) + + # convert each batch to paddle.Tensor + text = paddle.to_tensor(text) + speech = paddle.to_tensor(speech) + text_lengths = paddle.to_tensor(text_lengths) + speech_lengths = paddle.to_tensor(speech_lengths) + + batch = { + "text": text, + "text_lengths": text_lengths, + "speech": speech, + "speech_lengths": speech_lengths, + } + return 
batch + + def speedyspeech_single_spk_batch_fn(examples): # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"] phones = [np.array(item["phones"], dtype=np.int64) for item in examples] @@ -56,7 +85,7 @@ def speedyspeech_single_spk_batch_fn(examples): def speedyspeech_multi_spk_batch_fn(examples): - # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"] + # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations", "spk_id"] phones = [np.array(item["phones"], dtype=np.int64) for item in examples] tones = [np.array(item["tones"], dtype=np.int64) for item in examples] feats = [np.array(item["feats"], dtype=np.float32) for item in examples] diff --git a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py index 4ddd19f72b4..13569b9995f 100644 --- a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py +++ b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py @@ -15,14 +15,14 @@ # for mb melgan finetune # 长度和原本的 mel 不一致怎么办? 
import argparse +import os from pathlib import Path import numpy as np import paddle import yaml -from yacs.config import CfgNode from tqdm import tqdm -import os +from yacs.config import CfgNode from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur from paddlespeech.t2s.datasets.preprocess_utils import merge_silence @@ -50,11 +50,14 @@ def evaluate(args, fastspeech2_config): spk_id_list = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id_list) else: - spk_num=None + spk_num = None odim = fastspeech2_config.n_mels model = FastSpeech2( - idim=vocab_size, odim=odim, **fastspeech2_config["model"], spk_num=spk_num) + idim=vocab_size, + odim=odim, + **fastspeech2_config["model"], + spk_num=spk_num) model.set_state_dict( paddle.load(args.fastspeech2_checkpoint)["main_params"]) @@ -99,9 +102,15 @@ def evaluate(args, fastspeech2_config): else: train_wav_files += wav_files - train_wav_files = [os.path.basename(str(str_path)) for str_path in train_wav_files] - dev_wav_files = [os.path.basename(str(str_path)) for str_path in dev_wav_files] - test_wav_files = [os.path.basename(str(str_path)) for str_path in test_wav_files] + train_wav_files = [ + os.path.basename(str(str_path)) for str_path in train_wav_files + ] + dev_wav_files = [ + os.path.basename(str(str_path)) for str_path in dev_wav_files + ] + test_wav_files = [ + os.path.basename(str(str_path)) for str_path in test_wav_files + ] for i, utt_id in enumerate(tqdm(sentences)): phones = sentences[utt_id][0] @@ -122,7 +131,8 @@ def evaluate(args, fastspeech2_config): phone_ids = paddle.to_tensor(np.array(phone_ids)) if args.speaker_dict: - speaker_id = int([item[1] for item in spk_id_list if speaker == item[0]][0]) + speaker_id = int( + [item[1] for item in spk_id_list if speaker == item[0]][0]) speaker_id = paddle.to_tensor(speaker_id) else: speaker_id = None @@ -143,7 +153,8 @@ def evaluate(args, fastspeech2_config): sub_output_dir.mkdir(parents=True, exist_ok=True) with paddle.no_grad(): 
- mel = fastspeech2_inference(phone_ids, durations=durations, spk_id=speaker_id) + mel = fastspeech2_inference( + phone_ids, durations=durations, spk_id=speaker_id) np.save(sub_output_dir / (utt_id + "_feats.npy"), mel) @@ -175,12 +186,9 @@ def main(): type=str, default="phone_id_map.txt", help="phone vocabulary file.") - + parser.add_argument( - "--speaker-dict", - type=str, - default=None, - help="speaker id map file.") + "--speaker-dict", type=str, default=None, help="speaker id map file.") parser.add_argument( "--dur-file", default=None, type=str, help="path to durations.txt.") diff --git a/paddlespeech/t2s/exps/new_tacotron2/normalize.py b/paddlespeech/t2s/exps/new_tacotron2/normalize.py new file mode 120000 index 00000000000..64848f899b1 --- /dev/null +++ b/paddlespeech/t2s/exps/new_tacotron2/normalize.py @@ -0,0 +1 @@ +../transformer_tts/normalize.py \ No newline at end of file diff --git a/paddlespeech/t2s/exps/new_tacotron2/preprocess.py b/paddlespeech/t2s/exps/new_tacotron2/preprocess.py new file mode 100644 index 00000000000..0b61912cf9e --- /dev/null +++ b/paddlespeech/t2s/exps/new_tacotron2/preprocess.py @@ -0,0 +1,353 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse
+import os
+from concurrent.futures import ThreadPoolExecutor
+from operator import itemgetter
+from pathlib import Path
+from typing import Any
+from typing import Dict
+from typing import List
+
+import jsonlines
+import librosa
+import numpy as np
+import tqdm
+import yaml
+from yacs.config import CfgNode
+
+from paddlespeech.t2s.data.get_feats import Energy
+from paddlespeech.t2s.data.get_feats import LogMelFBank
+from paddlespeech.t2s.data.get_feats import Pitch
+from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length
+from paddlespeech.t2s.datasets.preprocess_utils import get_input_token
+from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
+from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map
+from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
+
+
+def process_sentence(config: Dict[str, Any],
+                     fp: Path,
+                     sentences: Dict,
+                     output_dir: Path,
+                     mel_extractor=None,
+                     pitch_extractor=None,
+                     energy_extractor=None,
+                     cut_sil: bool=True,
+                     spk_emb_dir: Path=None):
+    """Extract the log-mel feature of one utterance and build its metadata record.
+
+    Parameters
+    ----------
+    config : Dict[str, Any]
+        Feature config, `config.fs` and `config.n_shift` are read here.
+    fp : Path
+        Audio file. Its stem (minus a trailing "_mic2") is the utterance id.
+    sentences : Dict
+        Maps utterance id to [phones, durations, speaker].
+        The phones / durations of the processed utterance are updated in place.
+    output_dir : Path
+        The feature is saved as `output_dir/data_speech/<utt_id>_speech.npy`.
+    mel_extractor
+        Object providing `get_log_mel_fbank(wav)`.
+    pitch_extractor
+        Not used by this function, kept for signature compatibility.
+    energy_extractor
+        Not used by this function, kept for signature compatibility.
+    cut_sil : bool
+        Whether to cut a leading / trailing "sil" phone and the matching audio.
+    spk_emb_dir : Path
+        Optional directory holding `<speaker>/<utt_id>.npy` speaker embeddings.
+
+    Returns
+    ----------
+    Optional[Dict]
+        The metadata record. None if the utterance has no alignment, the audio
+        is not mono / exceeds [-1, 1], or the speaker has an embedding
+        directory but this utterance has no embedding file in it.
+    """
+    utt_id = fp.stem
+    # for vctk
+    if utt_id.endswith("_mic2"):
+        utt_id = utt_id[:-5]
+    if utt_id not in sentences:
+        return None
+
+    # reading, resampling may occur
+    wav, _ = librosa.load(str(fp), sr=config.fs)
+    # skip audio that is not mono-channel or not normalized to [-1, 1]
+    if len(wav.shape) != 1 or np.abs(wav).max() > 1.0:
+        return None
+
+    phones = sentences[utt_id][0]
+    durations = sentences[utt_id][1]
+    speaker = sentences[utt_id][2]
+    d_cumsum = np.pad(np.array(durations).cumsum(0), (1, 0), 'constant')
+    # little imprecise than use *.TextGrid directly
+    times = librosa.frames_to_time(
+        d_cumsum, sr=config.fs, hop_length=config.n_shift)
+    if cut_sil:
+        start = 0
+        # NOTE(review): d_cumsum counts frames while `start` / `end` are later
+        # converted with time_to_samples (seconds). When no trailing "sil" is
+        # cut, `end` is therefore far past the audio and the slice below just
+        # keeps everything up to the end of the file. Kept as is on purpose,
+        # changing it would change the extracted features.
+        end = d_cumsum[-1]
+        if phones[0] == "sil" and len(durations) > 1:
+            start = times[1]
+            durations = durations[1:]
+            phones = phones[1:]
+        if phones[-1] == 'sil' and len(durations) > 1:
+            end = times[-2]
+            durations = durations[:-1]
+            phones = phones[:-1]
+        sentences[utt_id][0] = phones
+        sentences[utt_id][1] = durations
+        start, end = librosa.time_to_samples([start, end], sr=config.fs)
+        wav = wav[start:end]
+    # extract mel feats
+    logmel = mel_extractor.get_log_mel_fbank(wav)
+    # change duration according to mel_length
+    compare_duration_and_mel_length(sentences, utt_id, logmel)
+    phones = sentences[utt_id][0]
+    durations = sentences[utt_id][1]
+    num_frames = logmel.shape[0]
+    assert sum(durations) == num_frames
+    mel_dir = output_dir / "data_speech"
+    mel_dir.mkdir(parents=True, exist_ok=True)
+    mel_path = mel_dir / (utt_id + "_speech.npy")
+    np.save(mel_path, logmel)
+    record = {
+        "utt_id": utt_id,
+        "phones": phones,
+        "text_lengths": len(phones),
+        "speech_lengths": num_frames,
+        "speech": str(mel_path),
+        "speaker": speaker
+    }
+    # a direct existence check replaces listing the whole directory per utterance
+    if spk_emb_dir and (spk_emb_dir / speaker).exists():
+        embed_path = spk_emb_dir / speaker / (utt_id + ".npy")
+        if not embed_path.is_file():
+            return None
+        record["spk_emb"] = str(embed_path)
+    return record
+
+
+def process_sentences(config,
+                      fps: List[Path],
+                      sentences: Dict,
+                      output_dir: Path,
+                      mel_extractor=None,
+                      pitch_extractor=None,
+                      energy_extractor=None,
+                      nprocs: int=1,
+                      cut_sil: bool=True,
+                      spk_emb_dir: Path=None):
+    """Run `process_sentence` on every file and write `metadata.jsonl`.
+
+    Parameters
+    ----------
+    config
+        Feature config forwarded to `process_sentence`.
+    fps : List[Path]
+        Audio files of one data split.
+    sentences : Dict
+        Alignment info forwarded to `process_sentence`.
+    output_dir : Path
+        Directory receiving the features and `metadata.jsonl`.
+    mel_extractor, pitch_extractor, energy_extractor
+        Extractors forwarded to `process_sentence`.
+    nprocs : int
+        1 runs sequentially, any other value is the size of the thread pool.
+    cut_sil : bool
+        Forwarded to `process_sentence`.
+    spk_emb_dir : Path
+        Forwarded to `process_sentence`.
+    """
+    worker_args = (sentences, output_dir, mel_extractor, pitch_extractor,
+                   energy_extractor, cut_sil, spk_emb_dir)
+    if nprocs == 1:
+        records = [process_sentence(config, fp, *worker_args) for fp in fps]
+    else:
+        with ThreadPoolExecutor(nprocs) as pool:
+            futures = []
+            with tqdm.tqdm(total=len(fps)) as progress:
+                for fp in fps:
+                    future = pool.submit(process_sentence, config, fp,
+                                         *worker_args)
+                    future.add_done_callback(lambda p: progress.update())
+                    futures.append(future)
+
+                # collect inside the progress bar so it stays open until
+                # every worker has finished
+                records = [ft.result() for ft in futures]
+
+    # utterances that were skipped yield None (falsy) and are dropped
+    results = [record for record in records if record]
+    results.sort(key=itemgetter("utt_id"))
+    with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer:
+        for item in results:
+            writer.write(item)
+    print("Done")
+
+
+def main():
+    """Parse the command line, split the dataset and dump features per split."""
+    # parse config and args
+    parser = argparse.ArgumentParser(
+        description="Preprocess audio and then extract features.")
+
+    parser.add_argument(
+        "--dataset",
+        default="baker",
+        type=str,
+        help="name of dataset, should in {baker, aishell3, ljspeech, vctk} now")
+
+    parser.add_argument(
+        "--rootdir", default=None, type=str, help="directory to dataset.")
+
+    parser.add_argument(
+        "--dumpdir",
+        type=str,
+        required=True,
+        help="directory to dump feature files.")
+    parser.add_argument(
+        "--dur-file", default=None, type=str, help="path to durations.txt.")
+
+    parser.add_argument("--config", type=str, help="tacotron2 config file.")
+
+    parser.add_argument(
+        "--verbose",
+        type=int,
+        default=1,
+        help="logging level. higher is more logging. (default=1)")
+    parser.add_argument(
+        "--num-cpu", type=int, default=1, help="number of process.")
+
+    def str2bool(value):
+        return value.lower() == 'true'
+
+    parser.add_argument(
+        "--cut-sil",
+        type=str2bool,
+        default=True,
+        help="whether cut sil in the edge of audio")
+
+    parser.add_argument(
+        "--spk_emb_dir",
+        default=None,
+        type=str,
+        help="directory to speaker embedding files.")
+    args = parser.parse_args()
+
+    # fail early with a usage message instead of a TypeError / NameError later
+    if args.dataset not in {"baker", "aishell3", "ljspeech", "vctk"}:
+        parser.error("dataset should in {baker, aishell3, ljspeech, vctk} now!")
+    if args.rootdir is None:
+        parser.error("--rootdir is required.")
+    if args.dur_file is None:
+        parser.error("--dur-file is required.")
+    if args.config is None:
+        parser.error("--config is required.")
+
+    rootdir = Path(args.rootdir).expanduser()
+    dumpdir = Path(args.dumpdir).expanduser()
+    # use absolute path
+    dumpdir = dumpdir.resolve()
+    dumpdir.mkdir(parents=True, exist_ok=True)
+    dur_file = Path(args.dur_file).expanduser()
+
+    if args.spk_emb_dir:
+        spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve()
+    else:
+        spk_emb_dir = None
+
+    # explicit checks, `assert` would be stripped under `python -O`
+    if not rootdir.is_dir():
+        parser.error(f"--rootdir {rootdir} is not a directory.")
+    if not dur_file.is_file():
+        parser.error(f"--dur-file {dur_file} is not a file.")
+
+    with open(args.config, 'rt') as f:
+        config = CfgNode(yaml.safe_load(f))
+
+    if args.verbose > 1:
+        print(vars(args))
+        print(config)
+
+    sentences, speaker_set = get_phn_dur(dur_file)
+
+    merge_silence(sentences)
+    phone_id_map_path = dumpdir / "phone_id_map.txt"
+    speaker_id_map_path = dumpdir / "speaker_id_map.txt"
+    get_input_token(sentences, phone_id_map_path, args.dataset)
+    get_spk_id_map(speaker_set, speaker_id_map_path)
+
+    if args.dataset == "baker":
+        wav_files = sorted(list((rootdir / "Wave").rglob("*.wav")))
+        # split data into 3 sections
+        num_train = 9800
+        num_dev = 100
+        train_wav_files = wav_files[:num_train]
+        dev_wav_files = wav_files[num_train:num_train + num_dev]
+        test_wav_files = wav_files[num_train + num_dev:]
+    elif args.dataset == "aishell3":
+        sub_num_dev = 5
+        wav_dir = rootdir / "train" / "wav"
+        train_wav_files = []
+        dev_wav_files = []
+        test_wav_files = []
+        for speaker in os.listdir(wav_dir):
+            wav_files = sorted(list((wav_dir / speaker).rglob("*.wav")))
+            # speakers with few utterances are used for training only
+            if len(wav_files) > 100:
+                train_wav_files += wav_files[:-sub_num_dev * 2]
+                dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev]
+                test_wav_files += wav_files[-sub_num_dev:]
+            else:
+                train_wav_files += wav_files
+
+    elif args.dataset == "ljspeech":
+        wav_files = sorted(list((rootdir / "wavs").rglob("*.wav")))
+        # split data into 3 sections
+        num_train = 12900
+        num_dev = 100
+        train_wav_files = wav_files[:num_train]
+        dev_wav_files = wav_files[num_train:num_train + num_dev]
+        test_wav_files = wav_files[num_train + num_dev:]
+    elif args.dataset == "vctk":
+        sub_num_dev = 5
+        wav_dir = rootdir / "wav48_silence_trimmed"
+        train_wav_files = []
+        dev_wav_files = []
+        test_wav_files = []
+        for speaker in os.listdir(wav_dir):
+            wav_files = sorted(list((wav_dir / speaker).rglob("*_mic2.flac")))
+            # speakers with few utterances are used for training only
+            if len(wav_files) > 100:
+                train_wav_files += wav_files[:-sub_num_dev * 2]
+                dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev]
+                test_wav_files += wav_files[-sub_num_dev:]
+            else:
+                train_wav_files += wav_files
+
+    train_dump_dir = dumpdir / "train" / "raw"
+    train_dump_dir.mkdir(parents=True, exist_ok=True)
+    dev_dump_dir = dumpdir / "dev" / "raw"
+    dev_dump_dir.mkdir(parents=True, exist_ok=True)
+    test_dump_dir = dumpdir / "test" / "raw"
+    test_dump_dir.mkdir(parents=True, exist_ok=True)
+
+    # Extractor
+    mel_extractor = LogMelFBank(
+        sr=config.fs,
+        n_fft=config.n_fft,
+        hop_length=config.n_shift,
+        win_length=config.win_length,
+        window=config.window,
+        n_mels=config.n_mels,
+        fmin=config.fmin,
+        fmax=config.fmax)
+    # NOTE: process_sentence does not use the pitch / energy extractors,
+    # they are only passed through.
+    pitch_extractor = Pitch(
+        sr=config.fs,
+        hop_length=config.n_shift,
+        f0min=config.f0min,
+        f0max=config.f0max)
+    energy_extractor = Energy(
+        sr=config.fs,
+        n_fft=config.n_fft,
+        hop_length=config.n_shift,
+        win_length=config.win_length,
+        window=config.window)
+
+    # process for the 3 sections, every split honours --num-cpu
+    splits = (
+        (train_wav_files, train_dump_dir),
+        (dev_wav_files, dev_dump_dir),
+        (test_wav_files, test_dump_dir), )
+    for wav_files, dump_dir in splits:
+        if wav_files:
+            process_sentences(
+                config,
+                wav_files,
+                sentences,
+                dump_dir,
+                mel_extractor,
+                pitch_extractor,
+                energy_extractor,
+                nprocs=args.num_cpu,
+                cut_sil=args.cut_sil,
+                spk_emb_dir=spk_emb_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/paddlespeech/t2s/exps/new_tacotron2/train.py b/paddlespeech/t2s/exps/new_tacotron2/train.py
new file mode 100644
index 00000000000..20f73f0cedf
--- /dev/null
+++ b/paddlespeech/t2s/exps/new_tacotron2/train.py
@@ -0,0 +1,190 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import logging
+import os
+import shutil
+from pathlib import Path
+
+import jsonlines
+import numpy as np
+import paddle
+import yaml
+from paddle import DataParallel
+from paddle import distributed as dist
+from paddle.io import DataLoader
+from paddle.io import DistributedBatchSampler
+from yacs.config import CfgNode
+
+from paddlespeech.t2s.datasets.am_batch_fn import tacotron2_single_spk_batch_fn
+from paddlespeech.t2s.datasets.data_table import DataTable
+from paddlespeech.t2s.models.new_tacotron2 import Tacotron2
+from paddlespeech.t2s.models.new_tacotron2 import Tacotron2Evaluator
+from paddlespeech.t2s.models.new_tacotron2 import Tacotron2Updater
+from paddlespeech.t2s.training.extensions.snapshot import Snapshot
+from paddlespeech.t2s.training.extensions.visualizer import VisualDL
+from paddlespeech.t2s.training.optimizer import build_optimizers
+from paddlespeech.t2s.training.seeding import seed_everything
+from paddlespeech.t2s.training.trainer import Trainer
+
+
+def train_sp(args, config):
+    """Train Tacotron2 in the current process.
+
+    Parameters
+    ----------
+    args
+        Parsed command line, the attributes `ngpu`, `train_metadata`,
+        `dev_metadata`, `phones_dict`, `output_dir` and `config` are read.
+    config : CfgNode
+        Training config, read here: `seed`, `batch_size`, `num_workers`,
+        `n_mels`, `max_epoch`, `num_snapshots` and the sections `model`,
+        `optimizer`, `updater`.
+    """
+    # decides device type and whether to run in parallel
+    # setup running environment correctly
+    if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
+        paddle.set_device("cpu")
+    else:
+        paddle.set_device("gpu")
+        world_size = paddle.distributed.get_world_size()
+        if world_size > 1:
+            paddle.distributed.init_parallel_env()
+
+    # set the random seed, it is a must for multiprocess training
+    seed_everything(config.seed)
+
+    print(
+        f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
+    )
+
+    # dataloader has been too verbose
+    logging.getLogger("DataLoader").disabled = True
+
+    # construct dataset for training and validation
+    # both splits share the same fields and converters
+    fields = [
+        "text",
+        "text_lengths",
+        "speech",
+        "speech_lengths",
+    ]
+    converters = {
+        "speech": np.load,
+    }
+    with jsonlines.open(args.train_metadata, 'r') as reader:
+        train_metadata = list(reader)
+    train_dataset = DataTable(
+        data=train_metadata, fields=fields, converters=converters)
+    with jsonlines.open(args.dev_metadata, 'r') as reader:
+        dev_metadata = list(reader)
+    dev_dataset = DataTable(
+        data=dev_metadata, fields=fields, converters=converters)
+
+    # collate function and dataloader
+    train_sampler = DistributedBatchSampler(
+        train_dataset,
+        batch_size=config.batch_size,
+        shuffle=True,
+        drop_last=True)
+
+    print("samplers done!")
+
+    train_dataloader = DataLoader(
+        train_dataset,
+        batch_sampler=train_sampler,
+        collate_fn=tacotron2_single_spk_batch_fn,
+        num_workers=config.num_workers)
+
+    dev_dataloader = DataLoader(
+        dev_dataset,
+        shuffle=False,
+        drop_last=False,
+        batch_size=config.batch_size,
+        collate_fn=tacotron2_single_spk_batch_fn,
+        num_workers=config.num_workers)
+    print("dataloaders done!")
+
+    # the vocabulary size is the number of lines of the phone dict
+    with open(args.phones_dict, "r") as f:
+        phn_id = [line.strip().split() for line in f]
+    vocab_size = len(phn_id)
+    print("vocab_size:", vocab_size)
+
+    odim = config.n_mels
+    model = Tacotron2(idim=vocab_size, odim=odim, **config["model"])
+    if world_size > 1:
+        model = DataParallel(model)
+    print("model done!")
+
+    optimizer = build_optimizers(model, **config["optimizer"])
+    print("optimizer done!")
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    if dist.get_rank() == 0:
+        # copy conf to output_dir
+        shutil.copyfile(args.config, output_dir / Path(args.config).name)
+
+    updater = Tacotron2Updater(
+        model=model,
+        optimizer=optimizer,
+        dataloader=train_dataloader,
+        output_dir=output_dir,
+        **config["updater"])
+
+    trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)
+
+    evaluator = Tacotron2Evaluator(
+        model, dev_dataloader, output_dir=output_dir, **config["updater"])
+
+    # evaluation, visualization and snapshots only run on rank 0
+    if dist.get_rank() == 0:
+        trainer.extend(evaluator, trigger=(1, "epoch"))
+        trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
+    trainer.extend(
+        Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
+    # print(trainer.extensions)
+    trainer.run()
+
+
+def main():
+    """Parse args and config, then run `train_sp` in one or `ngpu` processes."""
+    # parse args and config and redirect to train_sp
+    parser = argparse.ArgumentParser(description="Train a Tacotron2 model.")
+    parser.add_argument("--config", type=str, help="tacotron2 config file.")
+    parser.add_argument("--train-metadata", type=str, help="training data.")
+    parser.add_argument("--dev-metadata", type=str, help="dev data.")
+    parser.add_argument("--output-dir", type=str, help="output dir.")
+    parser.add_argument(
+        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
+    parser.add_argument(
+        "--phones-dict", type=str, default=None, help="phone vocabulary file.")
+
+    args = parser.parse_args()
+
+    # every path below is opened unconditionally, report a missing one as a
+    # usage error instead of a TypeError deep inside the training setup
+    for dest in ("config", "train_metadata", "dev_metadata", "output_dir",
+                 "phones_dict"):
+        if getattr(args, dest) is None:
+            parser.error(f"--{dest.replace('_', '-')} is required.")
+
+    with open(args.config) as f:
+        config = CfgNode(yaml.safe_load(f))
+
+    print("========Args========")
+    print(yaml.safe_dump(vars(args)))
+    print("========Config========")
+    print(config)
+    print(
+        f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}"
+    )
+
+    # dispatch
+    if args.ngpu > 1:
+        dist.spawn(train_sp, (args, config), nprocs=args.ngpu)
+    else:
+        train_sp(args, config)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py index f54774704a8..02bfcb15d6f 100644 --- a/paddlespeech/t2s/exps/synthesize.py +++ b/paddlespeech/t2s/exps/synthesize.py @@ -36,6 +36,10 @@ "paddlespeech.t2s.models.fastspeech2:FastSpeech2", "fastspeech2_inference": "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", + "tacotron2": + "paddlespeech.t2s.models.new_tacotron2:Tacotron2", + "tacotron2_inference": + "paddlespeech.t2s.models.new_tacotron2:Tacotron2Inference", # voc "pwgan": "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", @@ -91,6 +95,8 @@ def evaluate(args): print("spk_num:", spk_num) elif am_name == 'speedyspeech': fields = ["utt_id", "phones", "tones"] + elif am_name == 'tacotron2': + fields = ["utt_id", "text"]
test_dataset = DataTable(data=test_metadata, fields=fields) @@ -117,6 +123,8 @@ def evaluate(args): elif am_name == 'speedyspeech': am = am_class( vocab_size=vocab_size, tone_size=tone_size, **am_config["model"]) + elif am_name == 'tacotron2': + am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) am.eval() @@ -168,6 +176,9 @@ def evaluate(args): phone_ids = paddle.to_tensor(datum["phones"]) tone_ids = paddle.to_tensor(datum["tones"]) mel = am_inference(phone_ids, tone_ids) + elif am_name == 'tacotron2': + phone_ids = paddle.to_tensor(datum["text"]) + mel = am_inference(phone_ids) # vocoder wav = voc_inference(mel) sf.write( @@ -188,7 +199,7 @@ def main(): default='fastspeech2_csmsc', choices=[ 'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech', - 'fastspeech2_aishell3', 'fastspeech2_vctk' + 'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc' ], help='Choose acoustic model type of tts task.') parser.add_argument( diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index 15ed1e4d44d..9aeff63894d 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -38,6 +38,10 @@ "paddlespeech.t2s.models.fastspeech2:FastSpeech2", "fastspeech2_inference": "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", + "tacotron2": + "paddlespeech.t2s.models.new_tacotron2:Tacotron2", + "tacotron2_inference": + "paddlespeech.t2s.models.new_tacotron2:Tacotron2Inference", # voc "pwgan": "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", @@ -126,6 +130,8 @@ def evaluate(args): elif am_name == 'speedyspeech': am = am_class( vocab_size=vocab_size, tone_size=tone_size, **am_config["model"]) + elif am_name == 'tacotron2': + am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) am.eval() @@ -230,6 +236,8 @@ def evaluate(args): elif 
am_name == 'speedyspeech': part_tone_ids = tone_ids[i] mel = am_inference(part_phone_ids, part_tone_ids) + elif am_name == 'tacotron2': + mel = am_inference(part_phone_ids) # vocoder wav = voc_inference(mel) if flags == 0: @@ -255,7 +263,7 @@ def main(): default='fastspeech2_csmsc', choices=[ 'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech', - 'fastspeech2_aishell3', 'fastspeech2_vctk' + 'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc' ], help='Choose acoustic model type of tts task.') parser.add_argument( diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index 405ad957d47..fe25351c783 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -556,8 +556,7 @@ def forward( tone_id=tone_id) # modify mod part of groundtruth if self.reduction_factor > 1: - olens = paddle.to_tensor( - [olen - olen % self.reduction_factor for olen in olens.numpy()]) + olens = olens - olens % self.reduction_factor max_olen = max(olens) ys = ys[:, :max_olen] diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py index 0dabf934ceb..3f5e1b565d2 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py @@ -36,12 +36,9 @@ def __init__(self, use_weighted_masking=False, output_dir=None): super().__init__(model, optimizer, dataloader, init_state=None) - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking self.criterion = FastSpeech2Loss( - use_masking=self.use_masking, - use_weighted_masking=self.use_weighted_masking) + use_masking=use_masking, use_weighted_masking=use_weighted_masking) log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) self.filehandler = logging.FileHandler(str(log_file)) @@ -113,8 +110,6 @@ def __init__(self, 
use_weighted_masking=False, output_dir=None): super().__init__(model, dataloader) - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) self.filehandler = logging.FileHandler(str(log_file)) @@ -123,8 +118,7 @@ def __init__(self, self.msg = "" self.criterion = FastSpeech2Loss( - use_masking=self.use_masking, - use_weighted_masking=self.use_weighted_masking) + use_masking=use_masking, use_weighted_masking=use_weighted_masking) def evaluate_core(self, batch): self.msg = "Evaluate: " diff --git a/paddlespeech/t2s/models/new_tacotron2/__init__.py b/paddlespeech/t2s/models/new_tacotron2/__init__.py new file mode 100644 index 00000000000..ea63257c80d --- /dev/null +++ b/paddlespeech/t2s/models/new_tacotron2/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .tacotron2 import * +from .tacotron2_updater import * diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2.py b/paddlespeech/t2s/models/new_tacotron2/tacotron2.py new file mode 100644 index 00000000000..747c74f9aad --- /dev/null +++ b/paddlespeech/t2s/models/new_tacotron2/tacotron2.py @@ -0,0 +1,496 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tacotron 2 related modules for paddle"""
+import logging
+from typing import Dict
+from typing import Optional
+from typing import Tuple
+
+import paddle
+import paddle.nn.functional as F
+from paddle import nn
+from typeguard import check_argument_types
+
+from paddlespeech.t2s.modules.nets_utils import initialize
+from paddlespeech.t2s.modules.nets_utils import make_pad_mask
+from paddlespeech.t2s.modules.tacotron2.attentions import AttForward
+from paddlespeech.t2s.modules.tacotron2.attentions import AttForwardTA
+from paddlespeech.t2s.modules.tacotron2.attentions import AttLoc
+from paddlespeech.t2s.modules.tacotron2.decoder import Decoder
+from paddlespeech.t2s.modules.tacotron2.encoder import Encoder
+
+
+class Tacotron2(nn.Layer):
+    """Tacotron2 module for end-to-end text-to-speech.
+
+    This is a module of Spectrogram prediction network in Tacotron2 described
+    in `Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_,
+    which converts the sequence of characters into the sequence of Mel-filterbanks.
+
+    .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
+       https://arxiv.org/abs/1712.05884
+
+    """
+
+    def __init__(
+            self,
+            # network structure related
+            idim: int,
+            odim: int,
+            embed_dim: int=512,
+            elayers: int=1,
+            eunits: int=512,
+            econv_layers: int=3,
+            econv_chans: int=512,
+            econv_filts: int=5,
+            atype: str="location",
+            adim: int=512,
+            aconv_chans: int=32,
+            aconv_filts: int=15,
+            cumulate_att_w: bool=True,
+            dlayers: int=2,
+            dunits: int=1024,
+            prenet_layers: int=2,
+            prenet_units: int=256,
+            postnet_layers: int=5,
+            postnet_chans: int=512,
+            postnet_filts: int=5,
+            output_activation: str=None,
+            use_batch_norm: bool=True,
+            use_concate: bool=True,
+            use_residual: bool=False,
+            reduction_factor: int=1,
+            # extra embedding related
+            spk_num: Optional[int]=None,
+            lang_num: Optional[int]=None,
+            spk_embed_dim: Optional[int]=None,
+            spk_embed_integration_type: str="concat",
+            dropout_rate: float=0.5,
+            zoneout_rate: float=0.1,
+            # training related
+            init_type: str="xavier_uniform", ):
+        """Initialize Tacotron2 module.
+        Parameters
+        ----------
+        idim : int
+            Dimension of the inputs.
+        odim : int
+            Dimension of the outputs.
+        embed_dim : int
+            Dimension of the token embedding.
+        elayers : int
+            Number of encoder blstm layers.
+        eunits : int
+            Number of encoder blstm units.
+        econv_layers : int
+            Number of encoder conv layers.
+        econv_filts : int
+            Number of encoder conv filter size.
+        econv_chans : int
+            Number of encoder conv filter channels.
+        atype : str
+            Attention type, one of "location", "forward" or "forward_ta".
+        dlayers : int
+            Number of decoder lstm layers.
+        dunits : int
+            Number of decoder lstm units.
+        prenet_layers : int
+            Number of prenet layers.
+        prenet_units : int
+            Number of prenet units.
+        postnet_layers : int
+            Number of postnet layers.
+        postnet_filts : int
+            Number of postnet filter size.
+        postnet_chans : int
+            Number of postnet filter channels.
+        output_activation : str
+            Name of activation function for outputs.
+        adim : int
+            Number of dimension of mlp in attention.
+        aconv_chans : int
+            Number of attention conv filter channels.
+        aconv_filts : int
+            Number of attention conv filter size.
+        cumulate_att_w : bool
+            Whether to cumulate previous attention weight.
+            Forced to False for the forward attention types.
+        use_batch_norm : bool
+            Whether to use batch normalization.
+        use_concate : bool
+            Whether to concat enc outputs w/ dec lstm outputs.
+        use_residual : bool
+            Passed to the encoder as `use_residual`.
+        reduction_factor : int
+            Reduction factor.
+        spk_num : Optional[int]
+            Number of speakers. If set to > 1, assume that the
+            sids will be provided as the input and use sid embedding layer.
+        lang_num : Optional[int]
+            Number of languages. If set to > 1, assume that the
+            lids will be provided as the input and use lid embedding layer.
+        spk_embed_dim : Optional[int]
+            Speaker embedding dimension. If set to > 0,
+            assume that spk_emb will be provided as the input.
+        spk_embed_integration_type : str
+            How to integrate speaker embedding, "concat" or "add".
+        dropout_rate : float
+            Dropout rate.
+        zoneout_rate : float
+            Zoneout rate.
+        init_type : str
+            Name of the parameter initialization, passed to `initialize`.
+        """
+        assert check_argument_types()
+        super().__init__()
+
+        # store hyperparameters
+        self.idim = idim
+        self.odim = odim
+        # the last token id is used as <eos>
+        self.eos = idim - 1
+        self.cumulate_att_w = cumulate_att_w
+        self.reduction_factor = reduction_factor
+
+        # define activation function for the final output
+        if output_activation is None:
+            self.output_activation_fn = None
+        elif hasattr(F, output_activation):
+            self.output_activation_fn = getattr(F, output_activation)
+        else:
+            raise ValueError(f"there is no such an activation function. "
+                             f"({output_activation})")
+
+        # set padding idx
+        padding_idx = 0
+        self.padding_idx = padding_idx
+
+        # initialize parameters
+        # NOTE: called before the sub-layers are created on purpose, it is
+        # reset by set_global_initializer(None) at the end of __init__
+        initialize(self, init_type)
+
+        # define network modules
+        self.enc = Encoder(
+            idim=idim,
+            embed_dim=embed_dim,
+            elayers=elayers,
+            eunits=eunits,
+            econv_layers=econv_layers,
+            econv_chans=econv_chans,
+            econv_filts=econv_filts,
+            use_batch_norm=use_batch_norm,
+            use_residual=use_residual,
+            dropout_rate=dropout_rate,
+            padding_idx=padding_idx, )
+
+        self.spk_num = None
+        if spk_num is not None and spk_num > 1:
+            self.spk_num = spk_num
+            self.sid_emb = nn.Embedding(spk_num, eunits)
+        self.lang_num = None
+        if lang_num is not None and lang_num > 1:
+            self.lang_num = lang_num
+            self.lid_emb = nn.Embedding(lang_num, eunits)
+
+        self.spk_embed_dim = None
+        if spk_embed_dim is not None and spk_embed_dim > 0:
+            self.spk_embed_dim = spk_embed_dim
+            self.spk_embed_integration_type = spk_embed_integration_type
+        if self.spk_embed_dim is None:
+            dec_idim = eunits
+        elif self.spk_embed_integration_type == "concat":
+            dec_idim = eunits + spk_embed_dim
+        elif self.spk_embed_integration_type == "add":
+            dec_idim = eunits
+            self.projection = nn.Linear(self.spk_embed_dim, eunits)
+        else:
+            raise ValueError(f"{spk_embed_integration_type} is not supported.")
+
+        if atype == "location":
+            att = AttLoc(dec_idim, dunits, adim, aconv_chans, aconv_filts)
+        elif atype == "forward":
+            att = AttForward(dec_idim, dunits, adim, aconv_chans, aconv_filts)
+            if self.cumulate_att_w:
+                logging.warning("cumulation of attention weights is disabled "
+                                "in forward attention.")
+                self.cumulate_att_w = False
+        elif atype == "forward_ta":
+            att = AttForwardTA(dec_idim, dunits, adim, aconv_chans, aconv_filts,
+                               odim)
+            if self.cumulate_att_w:
+                logging.warning("cumulation of attention weights is disabled "
+                                "in forward attention.")
+                self.cumulate_att_w = False
+        else:
+            raise NotImplementedError("Support only location or forward")
+        self.dec = Decoder(
+            idim=dec_idim,
+            odim=odim,
+            att=att,
+            dlayers=dlayers,
+            dunits=dunits,
+            prenet_layers=prenet_layers,
+            prenet_units=prenet_units,
+            postnet_layers=postnet_layers,
+            postnet_chans=postnet_chans,
+            postnet_filts=postnet_filts,
+            output_activation_fn=self.output_activation_fn,
+            cumulate_att_w=self.cumulate_att_w,
+            use_batch_norm=use_batch_norm,
+            use_concate=use_concate,
+            dropout_rate=dropout_rate,
+            zoneout_rate=zoneout_rate,
+            reduction_factor=reduction_factor, )
+
+        nn.initializer.set_global_initializer(None)
+
+    def forward(
+            self,
+            text: paddle.Tensor,
+            text_lengths: paddle.Tensor,
+            speech: paddle.Tensor,
+            speech_lengths: paddle.Tensor,
+            spk_emb: Optional[paddle.Tensor]=None,
+            spk_id: Optional[paddle.Tensor]=None,
+            lang_id: Optional[paddle.Tensor]=None
+    ) -> Tuple[paddle.Tensor, ...]:
+        """Calculate forward propagation.
+
+        Parameters
+        ----------
+        text : Tensor(int64)
+            Batch of padded character ids (B, T_text).
+        text_lengths : Tensor(int64)
+            Batch of lengths of each input batch (B,).
+        speech : Tensor
+            Batch of padded target features (B, T_feats, odim).
+        speech_lengths : Tensor(int64)
+            Batch of the lengths of each target (B,).
+        spk_emb : Optional[Tensor]
+            Batch of speaker embeddings (B, spk_embed_dim).
+        spk_id : Optional[Tensor]
+            Batch of speaker IDs (B, 1).
+        lang_id : Optional[Tensor]
+            Batch of language IDs (B, 1).
+
+        Returns
+        ----------
+        Tuple[Tensor, ...]
+            (after_outs, before_outs, logits, ys, labels, olens, att_ws, ilens):
+            the four decoder outputs, the (possibly truncated) target features,
+            the stop-token labels, the target lengths, and the input lengths
+            including the appended <eos>.
+
+        """
+        text = text[:, :text_lengths.max()]
+        speech = speech[:, :speech_lengths.max()]
+
+        # Add eos at the last of sequence
+        xs = F.pad(text, [0, 0, 0, 1], "constant", self.padding_idx)
+        for i, length in enumerate(text_lengths):
+            xs[i, length] = self.eos
+        ilens = text_lengths + 1
+
+        ys = speech
+        olens = speech_lengths
+
+        # make labels for stop prediction
+        labels = make_pad_mask(olens - 1)
+        # a bool tensor cannot be sliced, so cast it to float32 first
+        labels = paddle.cast(labels, dtype='float32')
+        labels = F.pad(labels, [0, 0, 0, 1], "constant", 1.0)
+
+        # calculate tacotron2 outputs
+        after_outs, before_outs, logits, att_ws = self._forward(
+            xs=xs,
+            ilens=ilens,
+            ys=ys,
+            olens=olens,
+            spk_emb=spk_emb,
+            spk_id=spk_id,
+            lang_id=lang_id, )
+
+        # modify mod part of groundtruth
+        if self.reduction_factor > 1:
+            assert olens.ge(self.reduction_factor).all(
+            ), "Output length must be greater than or equal to reduction factor."
+            olens = olens - olens % self.reduction_factor
+            max_out = max(olens)
+            ys = ys[:, :max_out]
+            labels = labels[:, :max_out]
+            # mark the last kept frame of every utterance as "stop".
+            # paddle.scatter(x, index, updates) does not take the torch-style
+            # (dim, index, value) arguments, so assign per row instead.
+            for i, olen in enumerate(olens):
+                labels[i, olen - 1] = 1.0
+        return after_outs, before_outs, logits, ys, labels, olens, att_ws, ilens
+
+    def _forward(
+            self,
+            xs: paddle.Tensor,
+            ilens: paddle.Tensor,
+            ys: paddle.Tensor,
+            olens: paddle.Tensor,
+            spk_emb: paddle.Tensor,
+            spk_id: paddle.Tensor,
+            lang_id: paddle.Tensor, ) -> Tuple[paddle.Tensor, ...]:
+        """Encode `xs`, add the optional embeddings and run the decoder.
+
+        `olens` is accepted for interface symmetry but not used here.
+        Returns whatever `self.dec(hs, hlens, ys)` returns.
+        """
+        hs, hlens = self.enc(xs, ilens)
+        if self.spk_num is not None:
+            sid_embs = self.sid_emb(spk_id.reshape([-1]))
+            hs = hs + sid_embs.unsqueeze(1)
+        if self.lang_num is not None:
+            lid_embs = self.lid_emb(lang_id.reshape([-1]))
+            hs = hs + lid_embs.unsqueeze(1)
+        if self.spk_embed_dim is not None:
+            hs = self._integrate_with_spk_embed(hs, spk_emb)
+
+        return self.dec(hs, hlens, ys)
+
+    def inference(
+            self,
+            text: paddle.Tensor,
+            speech: Optional[paddle.Tensor]=None,
+            spk_emb: Optional[paddle.Tensor]=None,
+            spk_id: Optional[paddle.Tensor]=None,
+            lang_id: Optional[paddle.Tensor]=None,
+            threshold: float=0.5,
+            minlenratio: float=0.0,
+            maxlenratio: float=10.0,
+            use_att_constraint: bool=False,
+            backward_window: int=1,
+            forward_window: int=3,
+            use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]:
+        """Generate the sequence of features given the sequences of characters.
+
+        Parameters
+        ----------
+        text : Tensor(int64)
+            Input sequence of characters (T_text,).
+        speech : Optional[Tensor]
+            Target feature sequence, only used (and required) with teacher forcing.
+        spk_emb : Optional[Tensor]
+            Speaker embedding (spk_embed_dim,).
+        spk_id : Optional[Tensor]
+            Speaker ID (1,).
+        lang_id : Optional[Tensor]
+            Language ID (1,).
+        threshold : float
+            Threshold in inference.
+        minlenratio : float
+            Minimum length ratio in inference.
+        maxlenratio : float
+            Maximum length ratio in inference.
+        use_att_constraint : bool
+            Whether to apply attention constraint.
+        backward_window : int
+            Backward window in attention constraint.
+        forward_window : int
+            Forward window in attention constraint.
+        use_teacher_forcing : bool
+            Whether to use teacher forcing.
+
+        Return
+        ----------
+        Dict[str, Tensor]
+            Output dict including the following items:
+            * feat_gen (Tensor): Output sequence of features (T_feats, odim).
+            * prob (Tensor): Output sequence of stop probabilities (T_feats,).
+              Not present with teacher forcing.
+            * att_w (Tensor): Attention weights (T_feats, T).
+
+        """
+        x = text
+        y = speech
+
+        # add eos at the last of sequence
+        x = F.pad(x, [0, 1], "constant", self.eos)
+
+        # inference with teacher forcing
+        if use_teacher_forcing:
+            assert speech is not None, "speech must be provided with teacher forcing."
+
+            xs, ys = x.unsqueeze(0), y.unsqueeze(0)
+            spk_emb = None if spk_emb is None else spk_emb.unsqueeze(0)
+            ilens = paddle.shape(xs)[1]
+            olens = paddle.shape(ys)[1]
+            outs, _, _, att_ws = self._forward(
+                xs=xs,
+                ilens=ilens,
+                ys=ys,
+                olens=olens,
+                spk_emb=spk_emb,
+                spk_id=spk_id,
+                lang_id=lang_id, )
+
+            return dict(feat_gen=outs[0], att_w=att_ws[0])
+
+        # inference
+        h = self.enc.inference(x)
+        if self.spk_num is not None:
+            sid_emb = self.sid_emb(spk_id.reshape([-1]))
+            h = h + sid_emb
+        if self.lang_num is not None:
+            lid_emb = self.lid_emb(lang_id.reshape([-1]))
+            h = h + lid_emb
+        if self.spk_embed_dim is not None:
+            hs, spk_emb = h.unsqueeze(0), spk_emb.unsqueeze(0)
+            h = self._integrate_with_spk_embed(hs, spk_emb)[0]
+        out, prob, att_w = self.dec.inference(
+            h,
+            threshold=threshold,
+            minlenratio=minlenratio,
+            maxlenratio=maxlenratio,
+            use_att_constraint=use_att_constraint,
+            backward_window=backward_window,
+            forward_window=forward_window, )
+
+        return dict(feat_gen=out, prob=prob, att_w=att_w)
+
+    def _integrate_with_spk_embed(self,
+                                  hs: paddle.Tensor,
+                                  spk_emb: paddle.Tensor) -> paddle.Tensor:
+        """Integrate speaker embedding with hidden states.
+
+        Parameters
+        ----------
+        hs : Tensor
+            Batch of hidden state sequences (B, Tmax, eunits).
+        spk_emb : Tensor
+            Batch of speaker embeddings (B, spk_embed_dim).
+
+        Returns
+        ----------
+        Tensor
+            Batch of integrated hidden state sequences (B, Tmax, eunits) if
+            integration_type is "add" else (B, Tmax, eunits + spk_embed_dim).
+
+        """
+        if self.spk_embed_integration_type == "add":
+            # apply projection and then add to hidden states
+            spk_emb = self.projection(F.normalize(spk_emb))
+            hs = hs + spk_emb.unsqueeze(1)
+        elif self.spk_embed_integration_type == "concat":
+            # concat hidden states with spk embeds
+            spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
+                -1, paddle.shape(hs)[1], -1)
+            hs = paddle.concat([hs, spk_emb], axis=-1)
+        else:
+            raise NotImplementedError("support only add or concat.")
+
+        return hs
+
+
+class Tacotron2Inference(nn.Layer):
+    """Wrap a Tacotron2 model and a normalizer for synthesis.
+
+    `forward` runs `model.inference` and passes the generated feature
+    through `normalizer.inverse`.
+    """
+
+    def __init__(self, normalizer, model):
+        super().__init__()
+        self.normalizer = normalizer
+        self.acoustic_model = model
+
+    def forward(self, text, spk_id=None, spk_emb=None):
+        out = self.acoustic_model.inference(
+            text, spk_id=spk_id, spk_emb=spk_emb)
+        normalized_mel = out["feat_gen"]
+        logmel = self.normalizer.inverse(normalized_mel)
+        return logmel
diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py b/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py
new file mode 100644
index 00000000000..f1a2a50efa0
--- /dev/null
+++ b/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py
@@ -0,0 +1,217 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging +from pathlib import Path +from typing import Dict + +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer + +from paddlespeech.t2s.modules.losses import GuidedAttentionLoss +from paddlespeech.t2s.modules.losses import Tacotron2Loss +from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator +from paddlespeech.t2s.training.reporter import report +from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater +logging.basicConfig( + format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', + datefmt='[%Y-%m-%d %H:%M:%S]') +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class Tacotron2Updater(StandardUpdater): + def __init__(self, + model: Dict[str, Layer], + optimizer: Dict[str, Optimizer], + dataloader: DataLoader, + init_state=None, + use_masking: bool=True, + use_weighted_masking: bool=False, + bce_pos_weight: float=5.0, + loss_type: str="L1+L2", + use_guided_attn_loss: bool=True, + guided_attn_loss_sigma: float=0.4, + guided_attn_loss_lambda: float=1.0, + output_dir: Path=None): + super().__init__(model, optimizer, dataloader, init_state=None) + + self.loss_type = loss_type + self.use_guided_attn_loss = use_guided_attn_loss + + self.taco2_loss = Tacotron2Loss( + use_masking=use_masking, + use_weighted_masking=use_weighted_masking, + bce_pos_weight=bce_pos_weight, ) + if self.use_guided_attn_loss: + self.attn_loss = GuidedAttentionLoss( + sigma=guided_attn_loss_sigma, + alpha=guided_attn_loss_lambda, ) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def update_core(self, batch): + self.msg = "Rank: {}, ".format(dist.get_rank()) + losses_dict = {} + # spk_id!=None in multiple spk fastspeech2 + spk_id = batch["spk_id"] if "spk_id" 
in batch else None + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None + # No explicit speaker identifier labels are used during voice cloning training. + if spk_emb is not None: + spk_id = None + + after_outs, before_outs, logits, ys, labels, olens, att_ws, ilens = self.model( + text=batch["text"], + text_lengths=batch["text_lengths"], + speech=batch["speech"], + speech_lengths=batch["speech_lengths"], + spk_id=spk_id, + spk_emb=spk_emb) + + # calculate taco2 loss + l1_loss, mse_loss, bce_loss = self.taco2_loss(after_outs, before_outs, + logits, ys, labels, olens) + + if self.loss_type == "L1+L2": + loss = l1_loss + mse_loss + bce_loss + elif self.loss_type == "L1": + loss = l1_loss + bce_loss + elif self.loss_type == "L2": + loss = mse_loss + bce_loss + else: + raise ValueError(f"unknown --loss-type {self.loss_type}") + + # calculate attention loss + if self.use_guided_attn_loss: + # NOTE: length of output for auto-regressive + # input will be changed when r > 1 + if self.model.reduction_factor > 1: + olens_in = olens // self.model.reduction_factor + else: + olens_in = olens + attn_loss = self.attn_loss(att_ws, ilens, olens_in) + loss = loss + attn_loss + + optimizer = self.optimizer + optimizer.clear_grad() + loss.backward() + optimizer.step() + + report("train/l1_loss", float(l1_loss)) + report("train/mse_loss", float(mse_loss)) + report("train/bce_loss", float(bce_loss)) + report("train/attn_loss", float(attn_loss)) + report("train/loss", float(loss)) + + losses_dict["l1_loss"] = float(l1_loss) + losses_dict["mse_loss"] = float(mse_loss) + losses_dict["bce_loss"] = float(bce_loss) + losses_dict["attn_loss"] = float(attn_loss) + losses_dict["loss"] = float(loss) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + + +class Tacotron2Evaluator(StandardEvaluator): + def __init__(self, + model, + dataloader, + use_masking: bool=True, + use_weighted_masking: bool=False, + bce_pos_weight: float=5.0, + loss_type: 
str="L1+L2", + use_guided_attn_loss: bool=True, + guided_attn_loss_sigma: float=0.4, + guided_attn_loss_lambda: float=1.0, + output_dir=None): + super().__init__(model, dataloader) + + self.loss_type = loss_type + self.use_guided_attn_loss = use_guided_attn_loss + + self.taco2_loss = Tacotron2Loss( + use_masking=use_masking, + use_weighted_masking=use_weighted_masking, + bce_pos_weight=bce_pos_weight, ) + if self.use_guided_attn_loss: + self.attn_loss = GuidedAttentionLoss( + sigma=guided_attn_loss_sigma, + alpha=guided_attn_loss_lambda, ) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def evaluate_core(self, batch): + self.msg = "Evaluate: " + losses_dict = {} + # spk_id!=None in multiple spk fastspeech2 + spk_id = batch["spk_id"] if "spk_id" in batch else None + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None + if spk_emb is not None: + spk_id = None + + after_outs, before_outs, logits, ys, labels, olens, att_ws, ilens = self.model( + text=batch["text"], + text_lengths=batch["text_lengths"], + speech=batch["speech"], + speech_lengths=batch["speech_lengths"], + spk_id=spk_id, + spk_emb=spk_emb) + + # calculate taco2 loss + l1_loss, mse_loss, bce_loss = self.taco2_loss(after_outs, before_outs, + logits, ys, labels, olens) + + if self.loss_type == "L1+L2": + loss = l1_loss + mse_loss + bce_loss + elif self.loss_type == "L1": + loss = l1_loss + bce_loss + elif self.loss_type == "L2": + loss = mse_loss + bce_loss + else: + raise ValueError(f"unknown --loss-type {self.loss_type}") + + # calculate attention loss + if self.use_guided_attn_loss: + # NOTE: length of output for auto-regressive + # input will be changed when r > 1 + if self.model.reduction_factor > 1: + olens_in = olens // self.model.reduction_factor + else: + olens_in = olens + attn_loss = self.attn_loss(att_ws, ilens, olens_in) + loss = 
loss + attn_loss + + report("eval/l1_loss", float(l1_loss)) + report("eval/mse_loss", float(mse_loss)) + report("eval/bce_loss", float(bce_loss)) + report("eval/attn_loss", float(attn_loss)) + report("eval/loss", float(loss)) + + losses_dict["l1_loss"] = float(l1_loss) + losses_dict["mse_loss"] = float(mse_loss) + losses_dict["bce_loss"] = float(bce_loss) + losses_dict["attn_loss"] = float(attn_loss) + losses_dict["loss"] = float(loss) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + self.logger.info(self.msg) diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py index f16cf4dd9e7..6022567ece7 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py @@ -46,24 +46,20 @@ def __init__( guided_attn_loss_sigma: float=0.4, guided_attn_loss_lambda: float=1.0, ): super().__init__(model, optimizer, dataloader, init_state=None) - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking - self.bce_pos_weight = bce_pos_weight + self.loss_type = loss_type self.use_guided_attn_loss = use_guided_attn_loss - self.guided_attn_loss_sigma = guided_attn_loss_sigma - self.guided_attn_loss_lambda = guided_attn_loss_lambda self.modules_applied_guided_attn = modules_applied_guided_attn self.criterion = TransformerTTSLoss( - use_masking=self.use_masking, - use_weighted_masking=self.use_weighted_masking, - bce_pos_weight=self.bce_pos_weight) + use_masking=use_masking, + use_weighted_masking=use_weighted_masking, + bce_pos_weight=bce_pos_weight) if self.use_guided_attn_loss: self.attn_criterion = GuidedMultiHeadAttentionLoss( - sigma=self.guided_attn_loss_sigma, - alpha=self.guided_attn_loss_lambda, ) + sigma=guided_attn_loss_sigma, + alpha=guided_attn_loss_lambda, ) log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) 
self.filehandler = logging.FileHandler(str(log_file)) @@ -195,24 +191,20 @@ def __init__( guided_attn_loss_sigma: float=0.4, guided_attn_loss_lambda: float=1.0, ): super().__init__(model, dataloader) - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking - self.bce_pos_weight = bce_pos_weight + self.loss_type = loss_type self.use_guided_attn_loss = use_guided_attn_loss - self.guided_attn_loss_sigma = guided_attn_loss_sigma - self.guided_attn_loss_lambda = guided_attn_loss_lambda self.modules_applied_guided_attn = modules_applied_guided_attn self.criterion = TransformerTTSLoss( - use_masking=self.use_masking, - use_weighted_masking=self.use_weighted_masking, - bce_pos_weight=self.bce_pos_weight) + use_masking=use_masking, + use_weighted_masking=use_weighted_masking, + bce_pos_weight=bce_pos_weight) if self.use_guided_attn_loss: self.attn_criterion = GuidedMultiHeadAttentionLoss( - sigma=self.guided_attn_loss_sigma, - alpha=self.guided_attn_loss_lambda, ) + sigma=guided_attn_loss_sigma, + alpha=guided_attn_loss_lambda, ) log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) self.filehandler = logging.FileHandler(str(log_file)) diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py index 569e96ada6f..0cb0c6fd1a4 100644 --- a/paddlespeech/t2s/modules/losses.py +++ b/paddlespeech/t2s/modules/losses.py @@ -20,6 +20,250 @@ from paddle.nn import functional as F from scipy import signal +from paddlespeech.s2t.modules.mask import make_non_pad_mask + + +# Loss for new Tacotron2 +class GuidedAttentionLoss(nn.Layer): + """Guided attention loss function module. + This module calculates the guided attention loss described + in `Efficiently Trainable Text-to-Speech System Based + on Deep Convolutional Networks with Guided Attention`_, + which forces the attention to be diagonal. + .. 
_`Efficiently Trainable Text-to-Speech System + Based on Deep Convolutional Networks with Guided Attention`: + https://arxiv.org/abs/1710.08969 + """ + + def __init__(self, sigma=0.4, alpha=1.0, reset_always=True): + """Initialize guided attention loss module. + Parameters + ---------- + sigma : float, optional + Standard deviation to control + how close attention to a diagonal. + alpha : float, optional + Scaling coefficient (lambda). + reset_always : bool, optional + Whether to always reset masks. + """ + super().__init__() + self.sigma = sigma + self.alpha = alpha + self.reset_always = reset_always + self.guided_attn_masks = None + self.masks = None + + def _reset_masks(self): + self.guided_attn_masks = None + self.masks = None + + def forward(self, att_ws, ilens, olens): + """Calculate forward propagation. + Parameters + ---------- + att_ws : Tensor + Batch of attention weights (B, T_max_out, T_max_in). + ilens : Tensor(int64) + Batch of input lengths (B,). + olens : Tensor(int64) + Batch of output lengths (B,). + Returns + ---------- + Tensor + Guided attention loss value. + """ + if self.guided_attn_masks is None: + self.guided_attn_masks = self._make_guided_attention_masks(ilens, + olens) + if self.masks is None: + self.masks = self._make_masks(ilens, olens) + losses = self.guided_attn_masks * att_ws + loss = paddle.mean(losses.masked_select(self.masks)) + if self.reset_always: + self._reset_masks() + return self.alpha * loss + + def _make_guided_attention_masks(self, ilens, olens): + n_batches = len(ilens) + max_ilen = max(ilens) + max_olen = max(olens) + guided_attn_masks = paddle.zeros((n_batches, max_olen, max_ilen)) + for idx, (ilen, olen) in enumerate(zip(ilens, olens)): + guided_attn_masks[idx, :olen, : + ilen] = self._make_guided_attention_mask( + ilen, olen, self.sigma) + return guided_attn_masks + + @staticmethod + def _make_guided_attention_mask(ilen, olen, sigma): + """Make guided attention mask. 
+ Examples + ---------- + >>> guided_attn_mask = _make_guided_attention_mask(5, 5, 0.4) + >>> guided_attn_mask.shape + Size([5, 5]) + >>> guided_attn_mask + tensor([[0.0000, 0.1175, 0.3935, 0.6753, 0.8647], + [0.1175, 0.0000, 0.1175, 0.3935, 0.6753], + [0.3935, 0.1175, 0.0000, 0.1175, 0.3935], + [0.6753, 0.3935, 0.1175, 0.0000, 0.1175], + [0.8647, 0.6753, 0.3935, 0.1175, 0.0000]]) + >>> guided_attn_mask = _make_guided_attention_mask(3, 6, 0.4) + >>> guided_attn_mask.shape + Size([6, 3]) + >>> guided_attn_mask + tensor([[0.0000, 0.2934, 0.7506], + [0.0831, 0.0831, 0.5422], + [0.2934, 0.0000, 0.2934], + [0.5422, 0.0831, 0.0831], + [0.7506, 0.2934, 0.0000], + [0.8858, 0.5422, 0.0831]]) + """ + grid_x, grid_y = paddle.meshgrid( + paddle.arange(olen), paddle.arange(ilen)) + grid_x = paddle.cast(grid_x, dtype='float32') + grid_y = paddle.cast(grid_y, dtype='float32') + + return 1.0 - paddle.exp(-( + (grid_y / ilen - grid_x / olen)**2) / (2 * (sigma**2))) + + @staticmethod + def _make_masks(ilens, olens): + """Make masks indicating non-padded part. + Parameters + ---------- + ilens : Tensor(int64) or List + Batch of lengths (B,). + olens : Tensor(int64) or List + Batch of lengths (B,). + Returns + ---------- + Tensor + Mask tensor indicating non-padded part.
+ Examples + ---------- + >>> ilens, olens = [5, 2], [8, 5] + >>> _make_masks(ilens, olens) + tensor([[[1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1]], + [[1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0]]],) + """ + # (B, T_in) + in_masks = make_non_pad_mask(ilens) + # (B, T_out) + out_masks = make_non_pad_mask(olens) + # (B, T_out, T_in) + return out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2) + + +class Tacotron2Loss(nn.Layer): + """Loss function module for Tacotron2.""" + + def __init__(self, + use_masking=True, + use_weighted_masking=False, + bce_pos_weight=20.0): + """Initialize Tacotron2 loss module. + Parameters + ---------- + use_masking : bool + Whether to apply masking for padded part in loss calculation. + use_weighted_masking : bool + Whether to apply weighted masking in loss calculation. + bce_pos_weight : float + Weight of positive sample of stop token. + """ + super().__init__() + assert (use_masking != use_weighted_masking) or not use_masking + self.use_masking = use_masking + self.use_weighted_masking = use_weighted_masking + + # define criterions + reduction = "none" if self.use_weighted_masking else "mean" + self.l1_criterion = nn.L1Loss(reduction=reduction) + self.mse_criterion = nn.MSELoss(reduction=reduction) + self.bce_criterion = nn.BCEWithLogitsLoss( + reduction=reduction, pos_weight=paddle.to_tensor(bce_pos_weight)) + + def forward(self, after_outs, before_outs, logits, ys, labels, olens): + """Calculate forward propagation. + Parameters + ---------- + after_outs : Tensor + Batch of outputs after postnets (B, Lmax, odim). + before_outs : Tensor + Batch of outputs before postnets (B, Lmax, odim). + logits : Tensor + Batch of stop logits (B, Lmax). + ys : Tensor + Batch of padded target features (B, Lmax, odim).
+ labels : Tensor(int64) + Batch of the sequences of stop token labels (B, Lmax). + olens : Tensor(int64) + Batch of the lengths of each target (B,). + Returns + ---------- + Tensor + L1 loss value. + Tensor + Mean square error loss value. + Tensor + Binary cross entropy loss value. + """ + # make mask and apply it + if self.use_masking: + masks = make_non_pad_mask(olens).unsqueeze(-1) + ys = ys.masked_select(masks.broadcast_to(ys.shape)) + after_outs = after_outs.masked_select( + masks.broadcast_to(after_outs.shape)) + before_outs = before_outs.masked_select( + masks.broadcast_to(before_outs.shape)) + labels = labels.masked_select( + masks[:, :, 0].broadcast_to(labels.shape)) + logits = logits.masked_select( + masks[:, :, 0].broadcast_to(logits.shape)) + + # calculate loss + l1_loss = self.l1_criterion(after_outs, ys) + self.l1_criterion( + before_outs, ys) + mse_loss = self.mse_criterion(after_outs, ys) + self.mse_criterion( + before_outs, ys) + bce_loss = self.bce_criterion(logits, labels) + + # make weighted mask and apply it + if self.use_weighted_masking: + masks = make_non_pad_mask(olens).unsqueeze(-1) + weights = masks.float() / masks.sum(axis=1, keepdim=True).float() + out_weights = weights.divide( + paddle.shape(ys)[0] * paddle.shape(ys)[2]) + logit_weights = weights.divide(paddle.shape(ys)[0]) + + # apply weight + l1_loss = l1_loss.multiply(out_weights) + l1_loss = l1_loss.masked_select(masks.broadcast_to(l1_loss)).sum() + mse_loss = mse_loss.multiply(out_weights) + mse_loss = mse_loss.masked_select( + masks.broadcast_to(mse_loss)).sum() + bce_loss = bce_loss.multiply(logit_weights.squeeze(-1)) + bce_loss = bce_loss.masked_select( + masks.squeeze(-1).broadcast_to(bce_loss)).sum() + + return l1_loss, mse_loss, bce_loss + # Loss for Tacotron2 def attention_guide(dec_lens, enc_lens, N, T, g, dtype=None): diff --git a/paddlespeech/t2s/modules/tacotron2/attentions.py b/paddlespeech/t2s/modules/tacotron2/attentions.py new file mode 100644 index 
00000000000..2b912db3dea --- /dev/null +++ b/paddlespeech/t2s/modules/tacotron2/attentions.py @@ -0,0 +1,519 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Attention modules for RNN.""" +import paddle +import paddle.nn.functional as F +from paddle import nn + +from paddlespeech.t2s.modules.masked_fill import masked_fill +from paddlespeech.t2s.modules.nets_utils import make_pad_mask + + +def _apply_attention_constraint(e, + last_attended_idx, + backward_window=1, + forward_window=3): + """Apply monotonic attention constraint. + + This function apply the monotonic attention constraint + introduced in `Deep Voice 3: Scaling + Text-to-Speech with Convolutional Sequence Learning`_. + + Parameters + ---------- + e : Tensor + Attention energy before applying softmax (1, T). + last_attended_idx : int + The index of the inputs of the last attended [0, T]. + backward_window : int, optional + Backward window size in attention constraint. + forward_window : int, optional + Forward window size in attetion constraint. + + Returns + ---------- + Tensor + Monotonic constrained attention energy (1, T). + + .. 
_`Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning`: + https://arxiv.org/abs/1710.07654 + + """ + if paddle.shape(e)[0] != 1: + raise NotImplementedError( + "Batch attention constraining is not yet supported.") + backward_idx = last_attended_idx - backward_window + forward_idx = last_attended_idx + forward_window + if backward_idx > 0: + e[:, :backward_idx] = -float("inf") + if forward_idx < paddle.shape(e)[1]: + e[:, forward_idx:] = -float("inf") + return e + + +class AttLoc(nn.Layer): + """location-aware attention module. + + Reference: Attention-Based Models for Speech Recognition + (https://arxiv.org/pdf/1506.07503.pdf) + Parameters + ---------- + eprojs : int + projection-units of encoder + dunits : int + units of decoder + att_dim : int + attention dimension + aconv_chans : int + channels of attention convolution + aconv_filts : int + filter size of attention convolution + han_mode : bool + flag to switch on mode of hierarchical attention and not store pre_compute_enc_h + """ + + def __init__(self, + eprojs, + dunits, + att_dim, + aconv_chans, + aconv_filts, + han_mode=False): + super().__init__() + self.mlp_enc = nn.Linear(eprojs, att_dim) + self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False) + self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False) + self.loc_conv = nn.Conv2D( + 1, + aconv_chans, + (1, 2 * aconv_filts + 1), + padding=(0, aconv_filts), + bias_attr=False, ) + self.gvec = nn.Linear(att_dim, 1) + + self.dunits = dunits + self.eprojs = eprojs + self.att_dim = att_dim + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + self.han_mode = han_mode + + def reset(self): + """reset states""" + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + + def forward( + self, + enc_hs_pad, + enc_hs_len, + dec_z, + att_prev, + scaling=2.0, + last_attended_idx=None, + backward_window=1, + forward_window=3, ): + """Calculate AttLoc
forward propagation. + Parameters + ---------- + enc_hs_pad : paddle.Tensor + padded encoder hidden state (B, T_max, D_enc) + enc_hs_len : paddle.Tensor + padded encoder hidden state length (B) + dec_z : paddle.Tensor dec_z + decoder hidden state (B, D_dec) + att_prev : paddle.Tensor + previous attention weight (B, T_max) + scaling : float + scaling parameter before applying softmax + forward_window : paddle.Tensor + forward window size when constraining attention + last_attended_idx : int + index of the inputs of the last attended + backward_window : int + backward window size in attention constraint + forward_window : int + forward window size in attetion constraint + + Returns + ---------- + paddle.Tensor + attention weighted encoder state (B, D_enc) + paddle.Tensor + previous attention weights (B, T_max) + """ + batch = len(enc_hs_pad) + # pre-compute all h outside the decoder loop + if self.pre_compute_enc_h is None or self.han_mode: + # (utt, frame, hdim) + self.enc_h = enc_hs_pad + self.h_length = paddle.shape(self.enc_h)[1] + # (utt, frame, att_dim) + self.pre_compute_enc_h = self.mlp_enc(self.enc_h) + + if dec_z is None: + dec_z = paddle.zeros([batch, self.dunits]) + else: + dec_z = dec_z.reshape([batch, self.dunits]) + + # initialize attention weight with uniform dist. 
+ if att_prev is None: + # if no bias, 0 0-pad goes 0 + + att_prev = 1.0 - make_pad_mask(enc_hs_len) + att_prev = att_prev / enc_hs_len.unsqueeze(-1) + + # att_prev: (utt, frame) -> (utt, 1, 1, frame) + # -> (utt, att_conv_chans, 1, frame) + + att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length])) + # att_conv: (utt, att_conv_chans, 1, frame) -> (utt, frame, att_conv_chans) + att_conv = att_conv.squeeze(2).transpose([0, 2, 1]) + # att_conv: (utt, frame, att_conv_chans) -> (utt, frame, att_dim) + att_conv = self.mlp_att(att_conv) + + # dec_z_tiled: (utt, frame, att_dim) + dec_z_tiled = self.mlp_dec(dec_z).reshape([batch, 1, self.att_dim]) + + # dot with gvec + # (utt, frame, att_dim) -> (utt, frame) + e = self.gvec( + paddle.tanh(att_conv + self.pre_compute_enc_h + + dec_z_tiled)).squeeze(2) + + # NOTE: consider zero padding when compute w. + if self.mask is None: + self.mask = make_pad_mask(enc_hs_len) + e = masked_fill(e, self.mask, -float("inf")) + # apply monotonic attention constraint (mainly for TTS) + if last_attended_idx is not None: + e = _apply_attention_constraint(e, last_attended_idx, + backward_window, forward_window) + + w = F.softmax(scaling * e, axis=1) + + # weighted sum over flames + # utt x hdim + c = paddle.sum( + self.enc_h * w.reshape([batch, self.h_length, 1]), axis=1) + + return c, w + + +class AttForward(nn.Layer): + """Forward attention module. 
+ Reference + ---------- + Forward attention in sequence-to-sequence acoustic modeling for speech synthesis + (https://arxiv.org/pdf/1807.06736.pdf) + + Parameters + ---------- + eprojs : int + projection-units of encoder + dunits : int + units of decoder + att_dim : int + attention dimension + aconv_chans : int + channels of attention convolution + aconv_filts : int + filter size of attention convolution + """ + + def __init__(self, eprojs, dunits, att_dim, aconv_chans, aconv_filts): + super().__init__() + self.mlp_enc = nn.Linear(eprojs, att_dim) + self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False) + self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False) + self.loc_conv = nn.Conv2D( + 1, + aconv_chans, + (1, 2 * aconv_filts + 1), + padding=(0, aconv_filts), + bias_attr=False, ) + self.gvec = nn.Linear(att_dim, 1) + self.dunits = dunits + self.eprojs = eprojs + self.att_dim = att_dim + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + + def reset(self): + """reset states""" + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + + def forward( + self, + enc_hs_pad, + enc_hs_len, + dec_z, + att_prev, + scaling=1.0, + last_attended_idx=None, + backward_window=1, + forward_window=3, ): + """Calculate AttForward forward propagation. 
+ Parameters + ---------- + enc_hs_pad : paddle.Tensor + padded encoder hidden state (B, T_max, D_enc) + enc_hs_len : list + padded encoder hidden state length (B,) + dec_z : paddle.Tensor + decoder hidden state (B, D_dec) + att_prev : paddle.Tensor + attention weights of previous step (B, T_max) + scaling : float + scaling parameter before applying softmax + last_attended_idx : int + index of the inputs of the last attended + backward_window : int + backward window size in attention constraint + forward_window : int + forward window size in attetion constraint + Returns + ---------- + paddle.Tensor + attention weighted encoder state (B, D_enc) + paddle.Tensor + previous attention weights (B, T_max) + """ + batch = len(enc_hs_pad) + # pre-compute all h outside the decoder loop + if self.pre_compute_enc_h is None: + self.enc_h = enc_hs_pad # utt x frame x hdim + self.h_length = paddle.shape(self.enc_h)[1] + # utt x frame x att_dim + self.pre_compute_enc_h = self.mlp_enc(self.enc_h) + + if dec_z is None: + dec_z = paddle.zeros([batch, self.dunits]) + else: + dec_z = dec_z.reshape([batch, self.dunits]) + + if att_prev is None: + # initial attention will be [1, 0, 0, ...] + att_prev = paddle.zeros([*paddle.shape(enc_hs_pad)[:2]]) + att_prev[:, 0] = 1.0 + + # att_prev: utt x frame -> utt x 1 x 1 x frame + # -> utt x att_conv_chans x 1 x frame + att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length])) + # att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans + att_conv = att_conv.squeeze(2).transpose([0, 2, 1]) + # att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim + att_conv = self.mlp_att(att_conv) + + # dec_z_tiled: utt x frame x att_dim + dec_z_tiled = self.mlp_dec(dec_z).unsqueeze(1) + + # dot with gvec + # utt x frame x att_dim -> utt x frame + e = self.gvec( + paddle.tanh(self.pre_compute_enc_h + dec_z_tiled + + att_conv)).squeeze(2) + + # NOTE: consider zero padding when compute w. 
+ if self.mask is None: + self.mask = make_pad_mask(enc_hs_len) + e = masked_fill(e, self.mask, -float("inf")) + + # apply monotonic attention constraint (mainly for TTS) + if last_attended_idx is not None: + e = _apply_attention_constraint(e, last_attended_idx, + backward_window, forward_window) + + w = F.softmax(scaling * e, axis=1) + + # forward attention + att_prev_shift = F.pad(att_prev, (0, 0, 1, 0))[:, :-1] + + w = (att_prev + att_prev_shift) * w + # NOTE: clip is needed to avoid nan gradient + w = F.normalize(paddle.clip(w, 1e-6), p=1, axis=1) + + # weighted sum over flames + # utt x hdim + # NOTE use bmm instead of sum(*) + c = paddle.sum(self.enc_h * w.unsqueeze(-1), axis=1) + + return c, w + + +class AttForwardTA(nn.Layer): + """Forward attention with transition agent module. + Reference + ---------- + Forward attention in sequence-to-sequence acoustic modeling for speech synthesis + (https://arxiv.org/pdf/1807.06736.pdf) + Parameters + ---------- + eunits : int + units of encoder + dunits : int + units of decoder + att_dim : int + attention dimension + aconv_chans : int + channels of attention convolution + aconv_filts : int + filter size of attention convolution + odim : int + output dimension + """ + + def __init__(self, eunits, dunits, att_dim, aconv_chans, aconv_filts, odim): + super().__init__() + self.mlp_enc = nn.Linear(eunits, att_dim) + self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False) + self.mlp_ta = nn.Linear(eunits + dunits + odim, 1) + self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False) + self.loc_conv = nn.Conv2D( + 1, + aconv_chans, + (1, 2 * aconv_filts + 1), + padding=(0, aconv_filts), + bias_attr=False, ) + self.gvec = nn.Linear(att_dim, 1) + self.dunits = dunits + self.eunits = eunits + self.att_dim = att_dim + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + self.trans_agent_prob = 0.5 + + def reset(self): + self.h_length = None + self.enc_h = None + 
self.pre_compute_enc_h = None + self.mask = None + self.trans_agent_prob = 0.5 + + def forward( + self, + enc_hs_pad, + enc_hs_len, + dec_z, + att_prev, + out_prev, + scaling=1.0, + last_attended_idx=None, + backward_window=1, + forward_window=3, ): + """Calculate AttForwardTA forward propagation. + Parameters + ---------- + enc_hs_pad : paddle.Tensor + padded encoder hidden state (B, Tmax, eunits) + enc_hs_len : list paddle.Tensor + padded encoder hidden state length (B,) + dec_z : paddle.Tensor + decoder hidden state (B, dunits) + att_prev : paddle.Tensor + attention weights of previous step (B, T_max) + out_prev : paddle.Tensor + decoder outputs of previous step (B, odim) + scaling : float + scaling parameter before applying softmax + last_attended_idx : int + index of the inputs of the last attended + backward_window : int + backward window size in attention constraint + forward_window : int + forward window size in attetion constraint + Returns + ---------- + paddle.Tensor + attention weighted encoder state (B, dunits) + paddle.Tensor + previous attention weights (B, Tmax) + """ + batch = len(enc_hs_pad) + # pre-compute all h outside the decoder loop + if self.pre_compute_enc_h is None: + self.enc_h = enc_hs_pad # utt x frame x hdim + self.h_length = paddle.shape(self.enc_h)[1] + # utt x frame x att_dim + self.pre_compute_enc_h = self.mlp_enc(self.enc_h) + + if dec_z is None: + dec_z = paddle.zeros([batch, self.dunits]) + else: + dec_z = dec_z.reshape([batch, self.dunits]) + + if att_prev is None: + # initial attention will be [1, 0, 0, ...] 
+ att_prev = paddle.zeros([*paddle.shape(enc_hs_pad)[:2]]) + att_prev[:, 0] = 1.0 + + # att_prev: utt x frame -> utt x 1 x 1 x frame + # -> utt x att_conv_chans x 1 x frame + att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length])) + # att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans + att_conv = att_conv.squeeze(2).transpose([0, 2, 1]) + # att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim + att_conv = self.mlp_att(att_conv) + + # dec_z_tiled: utt x 1 x att_dim (broadcast over frames) + dec_z_tiled = self.mlp_dec(dec_z).reshape([batch, 1, self.att_dim]) + + # dot with gvec + # utt x frame x att_dim -> utt x frame + e = self.gvec( + paddle.tanh(att_conv + self.pre_compute_enc_h + + dec_z_tiled)).squeeze(2) + + # NOTE consider zero padding when compute w. + if self.mask is None: + self.mask = make_pad_mask(enc_hs_len) + e = masked_fill(e, self.mask, -float("inf")) + + # apply monotonic attention constraint (mainly for TTS) + if last_attended_idx is not None: + e = _apply_attention_constraint(e, last_attended_idx, + backward_window, forward_window) + + w = F.softmax(scaling * e, axis=1) + + # forward attention + # att_prev_shift = F.pad(att_prev.unsqueeze(0), (1, 0), data_format='NCL').squeeze(0)[:, :-1] + att_prev_shift = F.pad(att_prev, (0, 0, 1, 0))[:, :-1] + w = (self.trans_agent_prob * att_prev + + (1 - self.trans_agent_prob) * att_prev_shift) * w + # NOTE: clip is needed to avoid nan gradient + w = F.normalize(paddle.clip(w, 1e-6), p=1, axis=1) + + # weighted sum over frames + # utt x hdim + # NOTE use bmm instead of sum(*) + c = paddle.sum( + self.enc_h * w.reshape([batch, self.h_length, 1]), axis=1) + + # update transition agent prob + self.trans_agent_prob = F.sigmoid( + self.mlp_ta(paddle.concat([c, out_prev, dec_z], axis=1))) + + return c, w diff --git a/paddlespeech/t2s/modules/tacotron2/decoder.py b/paddlespeech/t2s/modules/tacotron2/decoder.py index 691bb3ee29c..fc15adfda30 100644 --- 
a/paddlespeech/t2s/modules/tacotron2/decoder.py +++ b/paddlespeech/t2s/modules/tacotron2/decoder.py @@ -13,10 +13,13 @@ # limitations under the License. # Modified from espnet(https://github.com/espnet/espnet) """Tacotron2 decoder related modules.""" +import paddle import paddle.nn.functional as F import six from paddle import nn +from paddlespeech.t2s.modules.tacotron2.attentions import AttForwardTA + class Prenet(nn.Layer): """Prenet module for decoder of Spectrogram prediction network. @@ -196,3 +199,527 @@ def forward(self, xs): for i in six.moves.range(len(self.postnet)): xs = self.postnet[i](xs) return xs + + +class ZoneOutCell(nn.Layer): + """ZoneOut Cell module. + This is a module of zoneout described in + `Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations`_. + This code is modified from `eladhoffer/seq2seq.pytorch`_. + Examples + ---------- + >>> lstm = paddle.nn.LSTMCell(16, 32) + >>> lstm = ZoneOutCell(lstm, 0.5) + .. _`Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations`: + https://arxiv.org/abs/1606.01305 + .. _`eladhoffer/seq2seq.pytorch`: + https://github.com/eladhoffer/seq2seq.pytorch + """ + + def __init__(self, cell, zoneout_rate=0.1): + """Initialize zone out cell module. + Parameters + ---------- + cell : nn.Layer: + Paddle recurrent cell module + e.g. `paddle.nn.LSTMCell`. + zoneout_rate : float, optional + Probability of zoneout from 0.0 to 1.0. + """ + super().__init__() + self.cell = cell + self.hidden_size = cell.hidden_size + self.zoneout_rate = zoneout_rate + if zoneout_rate > 1.0 or zoneout_rate < 0.0: + raise ValueError( + "zoneout probability must be in the range from 0.0 to 1.0.") + + def forward(self, inputs, hidden): + """Calculate forward propagation. + Parameters + ---------- + inputs : Tensor + Batch of input tensor (B, input_size). + hidden : tuple + - Tensor: Batch of initial hidden states (B, hidden_size). + - Tensor: Batch of initial cell states (B, hidden_size). 
+ Returns + ---------- + Tensor + Batch of next hidden states (B, hidden_size). + tuple: + - Tensor: Batch of next hidden states (B, hidden_size). + - Tensor: Batch of next cell states (B, hidden_size). + """ + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.cell(inputs, hidden) + next_hidden = self._zoneout(hidden, next_hidden, self.zoneout_rate) + # to have the same output format with LSTMCell in paddle + return next_hidden[0], next_hidden + + def _zoneout(self, h, next_h, prob): + # apply recursively + if isinstance(h, tuple): + num_h = len(h) + if not isinstance(prob, tuple): + prob = tuple([prob] * num_h) + return tuple( + [self._zoneout(h[i], next_h[i], prob[i]) for i in range(num_h)]) + if self.training: + mask = paddle.bernoulli(paddle.ones([*paddle.shape(h)]) * prob) + return mask * h + (1 - mask) * next_h + else: + return prob * h + (1 - prob) * next_h + + +class Decoder(nn.Layer): + """Decoder module of Spectrogram prediction network. + This is a module of decoder of Spectrogram prediction network in Tacotron2, + which described in `Natural TTS + Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_. + The decoder generates the sequence of + features from the sequence of the hidden states. + .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`: + https://arxiv.org/abs/1712.05884 + """ + + def __init__( + self, + idim, + odim, + att, + dlayers=2, + dunits=1024, + prenet_layers=2, + prenet_units=256, + postnet_layers=5, + postnet_chans=512, + postnet_filts=5, + output_activation_fn=None, + cumulate_att_w=True, + use_batch_norm=True, + use_concate=True, + dropout_rate=0.5, + zoneout_rate=0.1, + reduction_factor=1, ): + """Initialize Tacotron2 decoder module. + Parameters + ---------- + idim : int + Dimension of the inputs. + odim : int + Dimension of the outputs. + att nn.Layer + Instance of attention class. + dlayers int, optional + The number of decoder lstm layers. 
+ dunits : int, optional + The number of decoder lstm units. + prenet_layers : int, optional + The number of prenet layers. + prenet_units : int, optional + The number of prenet units. + postnet_layers : int, optional + The number of postnet layers. + postnet_filts : int, optional + The number of postnet filter size. + postnet_chans : int, optional + The number of postnet filter channels. + output_activation_fn : nn.Layer, optional + Activation function for outputs. + cumulate_att_w : bool, optional + Whether to cumulate previous attention weight. + use_batch_norm : bool, optional + Whether to use batch normalization. + use_concate : bool, optional + Whether to concatenate encoder embedding with decoder lstm outputs. + dropout_rate : float, optional + Dropout rate. + zoneout_rate : float, optional + Zoneout rate. + reduction_factor : int, optional + Reduction factor. + """ + super().__init__() + + # store the hyperparameters + self.idim = idim + self.odim = odim + self.att = att + self.output_activation_fn = output_activation_fn + self.cumulate_att_w = cumulate_att_w + self.use_concate = use_concate + self.reduction_factor = reduction_factor + + # check attention type + if isinstance(self.att, AttForwardTA): + self.use_att_extra_inputs = True + else: + self.use_att_extra_inputs = False + + # define lstm network + prenet_units = prenet_units if prenet_layers != 0 else odim + self.lstm = nn.LayerList() + for layer in six.moves.range(dlayers): + iunits = idim + prenet_units if layer == 0 else dunits + lstm = nn.LSTMCell(iunits, dunits) + if zoneout_rate > 0.0: + lstm = ZoneOutCell(lstm, zoneout_rate) + self.lstm.append(lstm) + + # define prenet + if prenet_layers > 0: + self.prenet = Prenet( + idim=odim, + n_layers=prenet_layers, + n_units=prenet_units, + dropout_rate=dropout_rate, ) + else: + self.prenet = None + + # define postnet + if postnet_layers > 0: + self.postnet = Postnet( + idim=idim, + odim=odim, + n_layers=postnet_layers, + n_chans=postnet_chans, + 
n_filts=postnet_filts, + use_batch_norm=use_batch_norm, + dropout_rate=dropout_rate, ) + else: + self.postnet = None + + # define projection layers + iunits = idim + dunits if use_concate else dunits + self.feat_out = nn.Linear( + iunits, odim * reduction_factor, bias_attr=False) + self.prob_out = nn.Linear(iunits, reduction_factor) + + # initialize + # self.apply(decoder_init) + + def _zero_state(self, hs): + init_hs = paddle.zeros([paddle.shape(hs)[0], self.lstm[0].hidden_size]) + return init_hs + + def forward(self, hs, hlens, ys): + """Calculate forward propagation. + Parameters + ---------- + hs : Tensor + Batch of the sequences of padded hidden states (B, Tmax, idim). + hlens : Tensor(int64) padded + Batch of lengths of each input batch (B,). + ys : Tensor + Batch of the sequences of padded target features (B, Lmax, odim). + Returns + ---------- + Tensor + Batch of output tensors after postnet (B, Lmax, odim). + Tensor + Batch of output tensors before postnet (B, Lmax, odim). + Tensor + Batch of logits of stop prediction (B, Lmax). + Tensor + Batch of attention weights (B, Lmax, Tmax). + Note + ---------- + This computation is performed in teacher-forcing manner. 
+ """ + # thin out frames (B, Lmax, odim) -> (B, Lmax/r, odim) + if self.reduction_factor > 1: + ys = ys[:, self.reduction_factor - 1::self.reduction_factor] + + # length list should be list of int + # hlens = list(map(int, hlens)) + + # initialize hidden states of decoder + c_list = [self._zero_state(hs)] + z_list = [self._zero_state(hs)] + for _ in six.moves.range(1, len(self.lstm)): + c_list += [self._zero_state(hs)] + z_list += [self._zero_state(hs)] + prev_out = paddle.zeros([paddle.shape(hs)[0], self.odim]) + + # initialize attention + prev_att_w = None + self.att.reset() + + # loop for an output sequence + outs, logits, att_ws = [], [], [] + for y in ys.transpose([1, 0, 2]): + if self.use_att_extra_inputs: + att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w, + prev_out) + else: + att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w) + prenet_out = self.prenet( + prev_out) if self.prenet is not None else prev_out + xs = paddle.concat([att_c, prenet_out], axis=1) + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0])) + z_list[0], c_list[0] = next_hidden + for i in six.moves.range(1, len(self.lstm)): + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[i](z_list[i - 1], + (z_list[i], c_list[i])) + z_list[i], c_list[i] = next_hidden + zcs = (paddle.concat([z_list[-1], att_c], axis=1) + if self.use_concate else z_list[-1]) + outs += [ + self.feat_out(zcs).reshape([paddle.shape(hs)[0], self.odim, -1]) + ] + logits += [self.prob_out(zcs)] + att_ws += [att_w] + # teacher forcing + prev_out = y + if self.cumulate_att_w and prev_att_w is not None: + prev_att_w = prev_att_w + att_w # Note: error when use += + else: + prev_att_w = att_w + # (B, Lmax) + logits = paddle.concat(logits, axis=1) + # (B, odim, Lmax) + before_outs = paddle.concat(outs, axis=2) + # (B, Lmax, Tmax) + att_ws = paddle.stack(att_ws, axis=1) + + if self.reduction_factor > 1: + # (B, odim, Lmax) + 
before_outs = before_outs.reshape( + [paddle.shape(before_outs)[0], self.odim, -1]) + + if self.postnet is not None: + # (B, odim, Lmax) + after_outs = before_outs + self.postnet(before_outs) + else: + after_outs = before_outs + # (B, Lmax, odim) + before_outs = before_outs.transpose([0, 2, 1]) + # (B, Lmax, odim) + after_outs = after_outs.transpose([0, 2, 1]) + logits = logits + + # apply activation function for scaling + if self.output_activation_fn is not None: + before_outs = self.output_activation_fn(before_outs) + after_outs = self.output_activation_fn(after_outs) + + return after_outs, before_outs, logits, att_ws + + def inference( + self, + h, + threshold=0.5, + minlenratio=0.0, + maxlenratio=10.0, + use_att_constraint=False, + backward_window=None, + forward_window=None, ): + """Generate the sequence of features given the sequences of characters. + Parameters + ---------- + h : Tensor + Input sequence of encoder hidden states (T, C). + threshold : float, optional + Threshold to stop generation. + minlenratio : float, optional + Minimum length ratio. + If set to 1.0 and the length of input is 10, + the minimum length of outputs will be 10 * 1 = 10. + maxlenratio : float, optional + Maximum length ratio. + If set to 10 and the length of input is 10, + the maximum length of outputs will be 10 * 10 = 100. + use_att_constraint : bool + Whether to apply attention constraint introduced in `Deep Voice 3`_. + backward_window : int + Backward window size in attention constraint. + forward_window : int + Forward window size in attention constraint. + Returns + ---------- + Tensor + Output sequence of features (L, odim). + Tensor + Output sequence of stop probabilities (L,). + Tensor + Attention weights (L, T). + Note + ---------- + This computation is performed in auto-regressive manner. + .. 
_`Deep Voice 3`: https://arxiv.org/abs/1710.07654 + """ + # setup + assert len(paddle.shape(h)) == 2 + hs = h.unsqueeze(0) + ilens = paddle.shape(h)[0] + maxlen = int(paddle.shape(h)[0] * maxlenratio) + minlen = int(paddle.shape(h)[0] * minlenratio) + + # initialize hidden states of decoder + c_list = [self._zero_state(hs)] + z_list = [self._zero_state(hs)] + for _ in six.moves.range(1, len(self.lstm)): + c_list += [self._zero_state(hs)] + z_list += [self._zero_state(hs)] + prev_out = paddle.zeros([1, self.odim]) + + # initialize attention + prev_att_w = None + self.att.reset() + + # setup for attention constraint + if use_att_constraint: + last_attended_idx = 0 + else: + last_attended_idx = None + + # loop for an output sequence + idx = 0 + outs, att_ws, probs = [], [], [] + while True: + # updated index + idx += self.reduction_factor + + # decoder calculation + if self.use_att_extra_inputs: + att_c, att_w = self.att( + hs, + ilens, + z_list[0], + prev_att_w, + prev_out, + last_attended_idx=last_attended_idx, + backward_window=backward_window, + forward_window=forward_window, ) + else: + att_c, att_w = self.att( + hs, + ilens, + z_list[0], + prev_att_w, + last_attended_idx=last_attended_idx, + backward_window=backward_window, + forward_window=forward_window, ) + + att_ws += [att_w] + prenet_out = self.prenet( + prev_out) if self.prenet is not None else prev_out + xs = paddle.concat([att_c, prenet_out], axis=1) + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0])) + z_list[0], c_list[0] = next_hidden + for i in six.moves.range(1, len(self.lstm)): + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[i](z_list[i - 1], + (z_list[i], c_list[i])) + z_list[i], c_list[i] = next_hidden + zcs = (paddle.concat([z_list[-1], att_c], axis=1) + if self.use_concate else z_list[-1]) + # [(1, odim, r), ...] + outs += [self.feat_out(zcs).reshape([1, self.odim, -1])] + + # [(r), ...] 
+ probs += [F.sigmoid(self.prob_out(zcs))[0]] + if self.output_activation_fn is not None: + prev_out = self.output_activation_fn( + outs[-1][:, :, -1]) # (1, odim) + else: + prev_out = outs[-1][:, :, -1] # (1, odim) + if self.cumulate_att_w and prev_att_w is not None: + prev_att_w = prev_att_w + att_w # Note: error when use += + else: + prev_att_w = att_w + if use_att_constraint: + last_attended_idx = int(att_w.argmax()) + + # check whether to finish generation + if sum(paddle.cast(probs[-1] >= threshold, + 'int64')) > 0 or idx >= maxlen: + # check minimum length + if idx < minlen: + continue + # (1, odim, L) + outs = paddle.concat(outs, axis=2) + if self.postnet is not None: + # (1, odim, L) + outs = outs + self.postnet(outs) + # (L, odim) + outs = outs.transpose([0, 2, 1]).squeeze(0) + probs = paddle.concat(probs, axis=0) + att_ws = paddle.concat(att_ws, axis=0) + break + + if self.output_activation_fn is not None: + outs = self.output_activation_fn(outs) + + return outs, probs, att_ws + + def calculate_all_attentions(self, hs, hlens, ys): + """Calculate all of the attention weights. + Parameters + ---------- + hs : Tensor + Batch of the sequences of padded hidden states (B, Tmax, idim). + hlens : Tensor(int64) + Batch of lengths of each input batch (B,). + ys : Tensor + Batch of the sequences of padded target features (B, Lmax, odim). + Returns + ---------- + Tensor + Batch of attention weights (B, Lmax, Tmax). + Note + ---------- + This computation is performed in teacher-forcing manner. 
+ """ + # thin out frames (B, Lmax, odim) -> (B, Lmax/r, odim) + if self.reduction_factor > 1: + ys = ys[:, self.reduction_factor - 1::self.reduction_factor] + + # length list should be list of int + hlens = list(map(int, hlens)) + + # initialize hidden states of decoder + c_list = [self._zero_state(hs)] + z_list = [self._zero_state(hs)] + for _ in six.moves.range(1, len(self.lstm)): + c_list += [self._zero_state(hs)] + z_list += [self._zero_state(hs)] + prev_out = paddle.zeros([paddle.shape(hs)[0], self.odim]) + + # initialize attention + prev_att_w = None + self.att.reset() + + # loop for an output sequence + att_ws = [] + for y in ys.transpose([1, 0, 2]): + if self.use_att_extra_inputs: + att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w, + prev_out) + else: + att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w) + att_ws += [att_w] + prenet_out = self.prenet( + prev_out) if self.prenet is not None else prev_out + xs = paddle.concat([att_c, prenet_out], axis=1) + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0])) + z_list[0], c_list[0] = next_hidden + for i in six.moves.range(1, len(self.lstm)): + z_list[i], c_list[i] = self.lstm[i](z_list[i - 1], + (z_list[i], c_list[i]))[1] # cell returns (out, (h, c)); keep only (h, c) as in forward() + # teacher forcing + prev_out = y + if self.cumulate_att_w and prev_att_w is not None: + # Note: error when use += + prev_att_w = prev_att_w + att_w + else: + prev_att_w = att_w + # (B, Lmax, Tmax) + att_ws = paddle.stack(att_ws, axis=1) + + return att_ws diff --git a/paddlespeech/t2s/modules/tacotron2/encoder.py b/paddlespeech/t2s/modules/tacotron2/encoder.py index f1889061396..2f88d307efb 100644 --- a/paddlespeech/t2s/modules/tacotron2/encoder.py +++ b/paddlespeech/t2s/modules/tacotron2/encoder.py @@ -145,16 +145,15 @@ def forward(self, xs, ilens=None): Batch of the padded sequence. Either character ids (B, Tmax) or acoustic feature (B, Tmax, idim * encoder_reduction_factor). Padded value should be 0. 
- ilens : LongTensor + ilens : Tensor(int64) Batch of lengths of each input batch (B,). Returns ---------- Tensor Batch of the sequences of encoder states(B, Tmax, eunits). - LongTensor + Tensor(int64) Batch of lengths of each sequence (B,) - """ xs = self.embed(xs).transpose([0, 2, 1]) if self.convs is not None: @@ -170,7 +169,8 @@ def forward(self, xs, ilens=None): xs = xs.transpose([0, 2, 1]) self.blstm.flatten_parameters() # (B, Tmax, C) - xs, _ = self.blstm(xs) + # see https://www.paddlepaddle.org.cn/documentation/docs/zh/faq/train_cn.html#paddletorch-nn-utils-rnn-pack-padded-sequencetorch-nn-utils-rnn-pad-packed-sequenceapi + xs, _ = self.blstm(xs, sequence_length=ilens) # hlens 是什么 hlens = ilens diff --git a/paddlespeech/t2s/training/optimizer.py b/paddlespeech/t2s/training/optimizer.py index 907e3dafafb..64274d5380b 100644 --- a/paddlespeech/t2s/training/optimizer.py +++ b/paddlespeech/t2s/training/optimizer.py @@ -26,10 +26,13 @@ sgd=paddle.optimizer.SGD, ) -def build_optimizers(model: nn.Layer, - optim='adadelta', - max_grad_norm=None, - learning_rate=0.01) -> paddle.optimizer: +def build_optimizers( + model: nn.Layer, + optim='adadelta', + max_grad_norm=None, + learning_rate=0.01, + weight_decay=None, + epsilon=1.0e-6, ) -> paddle.optimizer: optim_class = optim_classes.get(optim) if optim_class is None: raise ValueError(f"must be one of {list(optim_classes)}: {optim}") @@ -37,10 +40,13 @@ def build_optimizers(model: nn.Layer, grad_clip = None if max_grad_norm: grad_clip = paddle.nn.ClipGradByGlobalNorm(max_grad_norm) - optim = optim_class( - parameters=model.parameters(), - learning_rate=learning_rate, - grad_clip=grad_clip) + optim_dict = {} + optim_dict['parameters'] = model.parameters() + optim_dict['learning_rate'] = learning_rate + optim_dict['grad_clip'] = grad_clip + optim_dict['weight_decay'] = weight_decay + if optim not in {'momentum', 'sgd'}: # compare the optimizer name; optim_class is a class and never equals a string + optim_dict['epsilon'] = epsilon + optimizers = optim_class(**optim_dict) - optimizers = optim 
return optimizers From 89e988a69e748306c1eb471682f0226ae0d8e97f Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 12 Jan 2022 05:01:01 +0000 Subject: [PATCH 2/5] add csmsc tacotron2, test=tts --- examples/csmsc/tts0/README.md | 264 ------------------ .../t2s/exps/new_tacotron2/__init__.py | 13 + paddlespeech/t2s/models/__init__.py | 1 + .../t2s/models/new_tacotron2/tacotron2.py | 6 +- paddlespeech/t2s/modules/losses.py | 7 +- paddlespeech/t2s/modules/tacotron2/encoder.py | 1 - 6 files changed, 21 insertions(+), 271 deletions(-) delete mode 100644 examples/csmsc/tts0/README.md create mode 100644 paddlespeech/t2s/exps/new_tacotron2/__init__.py diff --git a/examples/csmsc/tts0/README.md b/examples/csmsc/tts0/README.md deleted file mode 100644 index 13d291b5c39..00000000000 --- a/examples/csmsc/tts0/README.md +++ /dev/null @@ -1,264 +0,0 @@ -# FastSpeech2 with CSMSC -This example contains code used to train a [Fastspeech2](https://arxiv.org/abs/2006.04558) model with [Chinese Standard Mandarin Speech Copus](https://www.data-baker.com/open_source.html). - -## Dataset -### Download and Extract -Download CSMSC from it's [Official Website](https://test.data-baker.com/data/index/source). - -### Get MFA Result and Extract -We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. - -## Get Started -Assume the path to the dataset is `~/datasets/BZNSYP`. -Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`. -Run the command below to -1. **source path**. -2. preprocess the dataset. -3. train the model. -4. synthesize wavs. - - synthesize waveform from `metadata.jsonl`. - - synthesize waveform from a text file. -5. 
inference using the static model. -```bash -./run.sh -``` -You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset. -```bash -./run.sh --stage 0 --stop-stage 0 -``` -### Data Preprocessing -```bash -./local/preprocess.sh ${conf_path} -``` -When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. - -```text -dump -├── dev -│ ├── norm -│ └── raw -├── phone_id_map.txt -├── speaker_id_map.txt -├── test -│ ├── norm -│ └── raw -└── train - ├── energy_stats.npy - ├── norm - ├── pitch_stats.npy - ├── raw - └── speech_stats.npy -``` -The dataset is split into 3 parts, namely `train`, `dev`, and` test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains speech、pitch and energy features of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/*_stats.npy`. - -Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, speech_lengths, durations, the path of speech features, the path of pitch features, the path of energy features, speaker, and the id of each utterance. - -### Model Training -```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} -``` -`./local/train.sh` calls `${BIN_DIR}/train.py`. -Here's the complete help message. -```text -usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] - [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--phones-dict PHONES_DICT] - [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - -Train a FastSpeech2 model. - -optional arguments: - -h, --help show this help message and exit - --config CONFIG fastspeech2 config file. 
- --train-metadata TRAIN_METADATA - training data. - --dev-metadata DEV_METADATA - dev data. - --output-dir OUTPUT_DIR - output dir. - --ngpu NGPU if ngpu=0, use cpu. - --phones-dict PHONES_DICT - phone vocabulary file. - --speaker-dict SPEAKER_DICT - speaker id map file for multiple speaker model. - --voice-cloning VOICE_CLONING - whether training voice cloning model. -``` -1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. -2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. -3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory. -4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. -5. `--phones-dict` is the path of the phone vocabulary file. - -### Synthesizing -We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. -Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it. -```bash -unzip pwg_baker_ckpt_0.4.zip -``` -Parallel WaveGAN checkpoint contains files listed below. -```text -pwg_baker_ckpt_0.4 -├── pwg_default.yaml # default config used to train parallel wavegan -├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan -└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan -``` -`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. 
-```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} -``` -```text -usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] - [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] - [--am_stat AM_STAT] [--phones_dict PHONES_DICT] - [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] - [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] - [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] - [--voc_stat VOC_STAT] [--ngpu NGPU] - [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] - -Synthesize with acoustic model & vocoder - -optional arguments: - -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} - Choose acoustic model type of tts task. - --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. - --am_ckpt AM_CKPT Checkpoint file of acoustic model. - --am_stat AM_STAT mean and standard deviation used to normalize - spectrogram when training acoustic model. - --phones_dict PHONES_DICT - phone vocabulary file. - --tones_dict TONES_DICT - tone vocabulary file. - --speaker_dict SPEAKER_DICT - speaker id map file. - --voice-cloning VOICE_CLONING - whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} - Choose vocoder type of tts task. - --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. - --voc_ckpt VOC_CKPT Checkpoint file of voc. - --voc_stat VOC_STAT mean and standard deviation used to normalize - spectrogram when training voc. - --ngpu NGPU if ngpu == 0, use cpu. - --test_metadata TEST_METADATA - test metadata. - --output_dir OUTPUT_DIR - output dir. -``` -`./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from text file. 
-```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} -``` -```text -usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] - [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] - [--am_stat AM_STAT] [--phones_dict PHONES_DICT] - [--tones_dict TONES_DICT] - [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] - [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] - [--voc_stat VOC_STAT] [--lang LANG] - [--inference_dir INFERENCE_DIR] [--ngpu NGPU] - [--text TEXT] [--output_dir OUTPUT_DIR] - -Synthesize with acoustic model & vocoder - -optional arguments: - -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} - Choose acoustic model type of tts task. - --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. - --am_ckpt AM_CKPT Checkpoint file of acoustic model. - --am_stat AM_STAT mean and standard deviation used to normalize - spectrogram when training acoustic model. - --phones_dict PHONES_DICT - phone vocabulary file. - --tones_dict TONES_DICT - tone vocabulary file. - --speaker_dict SPEAKER_DICT - speaker id map file. - --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} - Choose vocoder type of tts task. - --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. - --voc_ckpt VOC_CKPT Checkpoint file of voc. - --voc_stat VOC_STAT mean and standard deviation used to normalize - spectrogram when training voc. - --lang LANG Choose model language. zh or en - --inference_dir INFERENCE_DIR - dir to save inference models - --ngpu NGPU if ngpu == 0, use cpu. - --text TEXT text to synthesize, a 'utt_id sentence' pair per line. 
- --output_dir OUTPUT_DIR - output dir. -``` -1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. -3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. -5. `--lang` is the model language, which can be `zh` or `en`. -6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. -7. `--text` is the text file, which contains sentences to synthesize. -8. `--output_dir` is the directory to save synthesized audio files. -9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. - -### Inferencing -After synthesizing, we will get static models of fastspeech2 and pwgan in `${train_output_path}/inference`. -`./local/inference.sh` calls `${BIN_DIR}/inference.py`, which provides a paddle static model inference example for fastspeech2 + pwgan synthesize. -```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} -``` - -## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios: -- [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip) -- [fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip) - -The static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip). 
- -Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss -:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: -default| 2(gpu) x 76000|1.0991|0.59132|0.035815|0.31915|0.15287| -conformer| 2(gpu) x 76000|1.0675|0.56103|0.035869|0.31553|0.15509| - -FastSpeech2 checkpoint contains files listed below. -```text -fastspeech2_nosil_baker_ckpt_0.4 -├── default.yaml # default config used to train fastspeech2 -├── phone_id_map.txt # phone vocabulary file when training fastspeech2 -├── snapshot_iter_76000.pdz # model parameters and optimizer states -└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2 -``` -You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models. -```bash -source path.sh - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize_e2e.py \ - --am=fastspeech2_csmsc \ - --am_config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ - --am_ckpt=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ - --am_stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ - --voc=pwgan_csmsc \ - --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ - --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --lang=zh \ - --text=${BIN_DIR}/../sentences.txt \ - --output_dir=exp/default/test_e2e \ - --inference_dir=exp/default/inference \ - --phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt -``` diff --git a/paddlespeech/t2s/exps/new_tacotron2/__init__.py b/paddlespeech/t2s/exps/new_tacotron2/__init__.py new file mode 100644 index 00000000000..abf198b97e6 --- /dev/null +++ b/paddlespeech/t2s/exps/new_tacotron2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/t2s/models/__init__.py b/paddlespeech/t2s/models/__init__.py index f268a4e3359..65227374ed7 100644 --- a/paddlespeech/t2s/models/__init__.py +++ b/paddlespeech/t2s/models/__init__.py @@ -14,6 +14,7 @@ from .fastspeech2 import * from .hifigan import * from .melgan import * +from .new_tacotron2 import * from .parallel_wavegan import * from .speedyspeech import * from .tacotron2 import * diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2.py b/paddlespeech/t2s/models/new_tacotron2/tacotron2.py index 747c74f9aad..c8ef956cef6 100644 --- a/paddlespeech/t2s/models/new_tacotron2/tacotron2.py +++ b/paddlespeech/t2s/models/new_tacotron2/tacotron2.py @@ -77,9 +77,9 @@ def __init__( spk_embed_dim: Optional[int]=None, spk_embed_integration_type: str="concat", dropout_rate: float=0.5, - zoneout_rate: float=0.1, + zoneout_rate: float=0.1, # training related - init_type: str="xavier_uniform",): + init_type: str="xavier_uniform", ): """Initialize Tacotron2 module. 
Parameters ---------- @@ -243,7 +243,7 @@ def __init__( dropout_rate=dropout_rate, zoneout_rate=zoneout_rate, reduction_factor=reduction_factor, ) - + nn.initializer.set_global_initializer(None) def forward( diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py index 0cb0c6fd1a4..781ac7924fd 100644 --- a/paddlespeech/t2s/modules/losses.py +++ b/paddlespeech/t2s/modules/losses.py @@ -20,7 +20,7 @@ from paddle.nn import functional as F from scipy import signal -from paddlespeech.s2t.modules.mask import make_non_pad_mask +from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask # Loss for new Tacotron2 @@ -324,7 +324,7 @@ def stft(x, details. Defaults to "hann". center : bool, optional center (bool, optional): Whether to pad `x` to make that the - :math:`t \times hop\_length` at the center of :math:`t`-th frame. Default: `True`. + :math:`t \times hop\\_length` at the center of :math:`t`-th frame. Default: `True`. pad_mode : str, optional Choose padding pattern when `center` is `True`. Returns @@ -677,7 +677,8 @@ def weighted_mean(input, weight): Weighted mean tensor with the same dtype as input. 
""" weight = paddle.cast(weight, input.dtype) - broadcast_ratio = input.size / weight.size + # paddle.Tensor.size is different with torch.size() and has been overrided in s2t.__init__ + broadcast_ratio = input.numel() / weight.numel() return paddle.sum(input * weight) / (paddle.sum(weight) * broadcast_ratio) diff --git a/paddlespeech/t2s/modules/tacotron2/encoder.py b/paddlespeech/t2s/modules/tacotron2/encoder.py index 2f88d307efb..b2ed30d1f1c 100644 --- a/paddlespeech/t2s/modules/tacotron2/encoder.py +++ b/paddlespeech/t2s/modules/tacotron2/encoder.py @@ -171,7 +171,6 @@ def forward(self, xs, ilens=None): # (B, Tmax, C) # see https://www.paddlepaddle.org.cn/documentation/docs/zh/faq/train_cn.html#paddletorch-nn-utils-rnn-pack-padded-sequencetorch-nn-utils-rnn-pad-packed-sequenceapi xs, _ = self.blstm(xs, sequence_length=ilens) - # hlens 是什么 hlens = ilens return xs, hlens From 9c7f0762b0528af7192341fdd37581ebe3e8876f Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 17 Jan 2022 04:54:00 +0000 Subject: [PATCH 3/5] update tacotron2 and transformer tts, test=tts --- .../t2s/models/new_tacotron2/tacotron2.py | 5 +- .../models/new_tacotron2/tacotron2_updater.py | 19 +- .../models/transformer_tts/transformer_tts.py | 327 +----------------- .../transformer_tts_updater.py | 34 +- paddlespeech/t2s/modules/losses.py | 118 +++++-- 5 files changed, 132 insertions(+), 371 deletions(-) diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2.py b/paddlespeech/t2s/models/new_tacotron2/tacotron2.py index c8ef956cef6..4804ffb44fa 100644 --- a/paddlespeech/t2s/models/new_tacotron2/tacotron2.py +++ b/paddlespeech/t2s/models/new_tacotron2/tacotron2.py @@ -324,7 +324,10 @@ def forward( ys = ys[:, :max_out] labels = labels[:, :max_out] labels = paddle.scatter(labels, 1, (olens - 1).unsqueeze(1), 1.0) - return after_outs, before_outs, logits, ys, labels, olens, att_ws, ilens + olens_in = olens // self.reduction_factor + else: + olens_in = olens + return after_outs, before_outs,
logits, ys, labels, olens, att_ws, olens_in def _forward( self, diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py b/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py index f1a2a50efa0..6d41702cd08 100644 --- a/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py +++ b/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py @@ -72,11 +72,10 @@ def update_core(self, batch): # spk_id!=None in multiple spk fastspeech2 spk_id = batch["spk_id"] if "spk_id" in batch else None spk_emb = batch["spk_emb"] if "spk_emb" in batch else None - # No explicit speaker identifier labels are used during voice cloning training. if spk_emb is not None: spk_id = None - after_outs, before_outs, logits, ys, labels, olens, att_ws, ilens = self.model( + after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in = self.model( text=batch["text"], text_lengths=batch["text_lengths"], speech=batch["speech"], @@ -101,11 +100,8 @@ def update_core(self, batch): if self.use_guided_attn_loss: # NOTE: length of output for auto-regressive # input will be changed when r > 1 - if self.model.reduction_factor > 1: - olens_in = olens // self.model.reduction_factor - else: - olens_in = olens - attn_loss = self.attn_loss(att_ws, ilens, olens_in) + attn_loss = self.attn_loss( + att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in) loss = loss + attn_loss optimizer = self.optimizer @@ -169,7 +165,7 @@ def evaluate_core(self, batch): if spk_emb is not None: spk_id = None - after_outs, before_outs, logits, ys, labels, olens, att_ws, ilens = self.model( + after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in = self.model( text=batch["text"], text_lengths=batch["text_lengths"], speech=batch["speech"], @@ -194,11 +190,8 @@ def evaluate_core(self, batch): if self.use_guided_attn_loss: # NOTE: length of output for auto-regressive # input will be changed when r > 1 - if self.model.reduction_factor > 1: - olens_in = olens // 
self.model.reduction_factor - else: - olens_in = olens - attn_loss = self.attn_loss(att_ws, ilens, olens_in) + attn_loss = self.attn_loss( + att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in) loss = loss + attn_loss report("eval/l1_loss", float(l1_loss)) diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py index ae6d7365593..ba1f33ea851 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py @@ -447,12 +447,15 @@ def forward( # modifiy mod part of groundtruth if self.reduction_factor > 1: - olens = paddle.to_tensor( - [olen - olen % self.reduction_factor for olen in olens.numpy()]) + olens = olens - olens % self.reduction_factor max_olen = max(olens) ys = ys[:, :max_olen] labels = labels[:, :max_olen] labels[:, -1] = 1.0 # make sure at least one frame has 1 + olens_in = olens // self.reduction_factor + else: + olens_in = olens + need_dict = {} need_dict['encoder'] = self.encoder need_dict['decoder'] = self.decoder @@ -462,7 +465,7 @@ def forward( 'num_layers_applied_guided_attn'] = self.num_layers_applied_guided_attn need_dict['use_scaled_pos_enc'] = self.use_scaled_pos_enc - return after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict + return after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict def _forward( self, @@ -488,8 +491,7 @@ def _forward( # thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim) if self.reduction_factor > 1: ys_in = ys[:, self.reduction_factor - 1::self.reduction_factor] - olens_in = olens.new( - [olen // self.reduction_factor for olen in olens]) + olens_in = olens // self.reduction_factor else: ys_in, olens_in = ys, olens @@ -769,318 +771,3 @@ def forward(self, text, spk_id=None): normalized_mel = self.acoustic_model.inference(text)[0] logmel = self.normalizer.inverse(normalized_mel) return logmel - - -class 
TransformerTTSLoss(nn.Layer): - """Loss function module for Tacotron2.""" - - def __init__(self, - use_masking=True, - use_weighted_masking=False, - bce_pos_weight=5.0): - """Initialize Tactoron2 loss module. - - Parameters - ---------- - use_masking : bool - Whether to apply masking for padded part in loss calculation. - use_weighted_masking : bool - Whether to apply weighted masking in loss calculation. - bce_pos_weight : float - Weight of positive sample of stop token. - - """ - super().__init__() - assert (use_masking != use_weighted_masking) or not use_masking - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking - - # define criterions - reduction = "none" if self.use_weighted_masking else "mean" - self.l1_criterion = nn.L1Loss(reduction=reduction) - self.mse_criterion = nn.MSELoss(reduction=reduction) - self.bce_criterion = nn.BCEWithLogitsLoss( - reduction=reduction, pos_weight=paddle.to_tensor(bce_pos_weight)) - - def forward(self, after_outs, before_outs, logits, ys, labels, olens): - """Calculate forward propagation. - - Parameters - ---------- - after_outs : Tensor - Batch of outputs after postnets (B, Lmax, odim). - before_outs : Tensor - Batch of outputs before postnets (B, Lmax, odim). - logits : Tensor - Batch of stop logits (B, Lmax). - ys : Tensor - Batch of padded target features (B, Lmax, odim). - labels : LongTensor - Batch of the sequences of stop token labels (B, Lmax). - olens : LongTensor - Batch of the lengths of each target (B,). - - Returns - ---------- - Tensor - L1 loss value. - Tensor - Mean square error loss value. - Tensor - Binary cross entropy loss value. 
- - """ - # make mask and apply it - if self.use_masking: - masks = make_non_pad_mask(olens).unsqueeze(-1) - ys = ys.masked_select(masks.broadcast_to(ys.shape)) - after_outs = after_outs.masked_select( - masks.broadcast_to(after_outs.shape)) - before_outs = before_outs.masked_select( - masks.broadcast_to(before_outs.shape)) - # Operator slice does not have kernel for data_type[bool] - tmp_masks = paddle.cast(masks, dtype='int64') - tmp_masks = tmp_masks[:, :, 0] - tmp_masks = paddle.cast(tmp_masks, dtype='bool') - labels = labels.masked_select(tmp_masks.broadcast_to(labels.shape)) - logits = logits.masked_select(tmp_masks.broadcast_to(logits.shape)) - - # calculate loss - l1_loss = self.l1_criterion(after_outs, ys) + self.l1_criterion( - before_outs, ys) - mse_loss = self.mse_criterion(after_outs, ys) + self.mse_criterion( - before_outs, ys) - bce_loss = self.bce_criterion(logits, labels) - - # make weighted mask and apply it - if self.use_weighted_masking: - masks = make_non_pad_mask(olens).unsqueeze(-1) - weights = masks.float() / masks.sum(dim=1, keepdim=True).float() - out_weights = weights.div(ys.shape[0] * ys.shape[2]) - logit_weights = weights.div(ys.shape[0]) - - # apply weight - l1_loss = l1_loss.multiply(out_weights) - l1_loss = l1_loss.masked_select( - masks.broadcast_to(l1_loss.shape)).sum() - - mse_loss = mse_loss.multiply(out_weights) - mse_loss = mse_loss.masked_select( - masks.broadcast_to(mse_loss.shape)).sum() - - bce_loss = bce_loss.multiply(logit_weights.squeeze(-1)) - bce_loss = bce_loss.masked_select( - masks.squeeze(-1).broadcast_to(bce_loss.shape)).sum() - - return l1_loss, mse_loss, bce_loss - - -class GuidedAttentionLoss(nn.Layer): - """Guided attention loss function module. - - This module calculates the guided attention loss described - in `Efficiently Trainable Text-to-Speech System Based - on Deep Convolutional Networks with Guided Attention`_, - which forces the attention to be diagonal. - - .. 
_`Efficiently Trainable Text-to-Speech System - Based on Deep Convolutional Networks with Guided Attention`: - https://arxiv.org/abs/1710.08969 - - """ - - def __init__(self, sigma=0.4, alpha=1.0, reset_always=True): - """Initialize guided attention loss module. - - Parameters - ---------- - sigma : float, optional - Standard deviation to control how close attention to a diagonal. - alpha : float, optional - Scaling coefficient (lambda). - reset_always : bool, optional - Whether to always reset masks. - - """ - super(GuidedAttentionLoss, self).__init__() - self.sigma = sigma - self.alpha = alpha - self.reset_always = reset_always - self.guided_attn_masks = None - self.masks = None - - def _reset_masks(self): - self.guided_attn_masks = None - self.masks = None - - def forward(self, att_ws, ilens, olens): - """Calculate forward propagation. - - Parameters - ---------- - att_ws : Tensor - Batch of attention weights (B, T_max_out, T_max_in). - ilens : LongTensor - Batch of input lenghts (B,). - olens : LongTensor - Batch of output lenghts (B,). - - Returns - ---------- - Tensor - Guided attention loss value. 
- - """ - if self.guided_attn_masks is None: - self.guided_attn_masks = self._make_guided_attention_masks(ilens, - olens) - if self.masks is None: - self.masks = self._make_masks(ilens, olens) - losses = self.guided_attn_masks * att_ws - loss = paddle.mean( - losses.masked_select(self.masks.broadcast_to(losses.shape))) - if self.reset_always: - self._reset_masks() - return self.alpha * loss - - def _make_guided_attention_masks(self, ilens, olens): - n_batches = len(ilens) - max_ilen = max(ilens) - max_olen = max(olens) - guided_attn_masks = paddle.zeros((n_batches, max_olen, max_ilen)) - - for idx, (ilen, olen) in enumerate(zip(ilens, olens)): - - ilen = int(ilen) - olen = int(olen) - guided_attn_masks[idx, :olen, : - ilen] = self._make_guided_attention_mask( - ilen, olen, self.sigma) - return guided_attn_masks - - @staticmethod - def _make_guided_attention_mask(ilen, olen, sigma): - """Make guided attention mask. - - Examples - ---------- - >>> guided_attn_mask =_make_guided_attention(5, 5, 0.4) - >>> guided_attn_mask.shape - [5, 5] - >>> guided_attn_mask - tensor([[0.0000, 0.1175, 0.3935, 0.6753, 0.8647], - [0.1175, 0.0000, 0.1175, 0.3935, 0.6753], - [0.3935, 0.1175, 0.0000, 0.1175, 0.3935], - [0.6753, 0.3935, 0.1175, 0.0000, 0.1175], - [0.8647, 0.6753, 0.3935, 0.1175, 0.0000]]) - >>> guided_attn_mask =_make_guided_attention(3, 6, 0.4) - >>> guided_attn_mask.shape - [6, 3] - >>> guided_attn_mask - tensor([[0.0000, 0.2934, 0.7506], - [0.0831, 0.0831, 0.5422], - [0.2934, 0.0000, 0.2934], - [0.5422, 0.0831, 0.0831], - [0.7506, 0.2934, 0.0000], - [0.8858, 0.5422, 0.0831]]) - - """ - grid_x, grid_y = paddle.meshgrid( - paddle.arange(olen), paddle.arange(ilen)) - grid_x = grid_x.cast(dtype=paddle.float32) - grid_y = grid_y.cast(dtype=paddle.float32) - return 1.0 - paddle.exp(-( - (grid_y / ilen - grid_x / olen)**2) / (2 * (sigma**2))) - - @staticmethod - def _make_masks(ilens, olens): - """Make masks indicating non-padded part. 
- - Parameters - ---------- - ilens (LongTensor or List): Batch of lengths (B,). - olens (LongTensor or List): Batch of lengths (B,). - - Returns - ---------- - Tensor - Mask tensor indicating non-padded part. - - Examples - ---------- - >>> ilens, olens = [5, 2], [8, 5] - >>> _make_mask(ilens, olens) - tensor([[[1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1]], - - [[1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0]]], dtype=paddle.uint8) - - """ - # (B, T_in) - in_masks = make_non_pad_mask(ilens) - # (B, T_out) - out_masks = make_non_pad_mask(olens) - # (B, T_out, T_in) - - return paddle.logical_and( - out_masks.unsqueeze(-1), in_masks.unsqueeze(-2)) - - -class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss): - """Guided attention loss function module for multi head attention. - - Parameters - ---------- - sigma : float, optional - Standard deviation to controlGuidedAttentionLoss - how close attention to a diagonal. - alpha : float, optional - Scaling coefficient (lambda). - reset_always : bool, optional - Whether to always reset masks. - - """ - - def forward(self, att_ws, ilens, olens): - """Calculate forward propagation. - - Parameters - ---------- - att_ws : Tensor - Batch of multi head attention weights (B, H, T_max_out, T_max_in). - ilens : Tensor - Batch of input lenghts (B,). - olens : Tensor - Batch of output lenghts (B,). - - Returns - ---------- - Tensor - Guided attention loss value. 
- - """ - if self.guided_attn_masks is None: - self.guided_attn_masks = ( - self._make_guided_attention_masks(ilens, olens).unsqueeze(1)) - if self.masks is None: - self.masks = self._make_masks(ilens, olens).unsqueeze(1) - losses = self.guided_attn_masks * att_ws - loss = paddle.mean( - losses.masked_select(self.masks.broadcast_to(losses.shape))) - if self.reset_always: - self._reset_masks() - - return self.alpha * loss diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py index 6022567ece7..bcc454c0db3 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py @@ -17,8 +17,8 @@ import paddle from paddle import distributed as dist -from paddlespeech.t2s.models.transformer_tts import GuidedMultiHeadAttentionLoss -from paddlespeech.t2s.models.transformer_tts import TransformerTTSLoss +from paddlespeech.t2s.modules.losses import GuidedMultiHeadAttentionLoss +from paddlespeech.t2s.modules.losses import Tacotron2Loss as TransformerTTSLoss from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator from paddlespeech.t2s.training.reporter import report from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater @@ -71,7 +71,7 @@ def update_core(self, batch): self.msg = "Rank: {}, ".format(dist.get_rank()) losses_dict = {} - after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict = self.model( + after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict = self.model( text=batch["text"], text_lengths=batch["text_lengths"], speech=batch["speech"], @@ -116,7 +116,10 @@ def update_core(self, batch): break # (B, H*L, T_in, T_in) att_ws = paddle.concat(att_ws, axis=1) - enc_attn_loss = self.attn_criterion(att_ws, ilens, ilens) + enc_attn_loss = self.attn_criterion( + att_ws=att_ws, + ilens=batch["text_lengths"] + 1, + 
olens=batch["text_lengths"] + 1) loss = loss + enc_attn_loss report("train/enc_attn_loss", float(enc_attn_loss)) losses_dict["enc_attn_loss"] = float(enc_attn_loss) @@ -133,7 +136,8 @@ def update_core(self, batch): break # (B, H*L, T_out, T_out) att_ws = paddle.concat(att_ws, axis=1) - dec_attn_loss = self.attn_criterion(att_ws, olens, olens) + dec_attn_loss = self.attn_criterion( + att_ws=att_ws, ilens=olens_in, olens=olens_in) report("train/dec_attn_loss", float(dec_attn_loss)) losses_dict["dec_attn_loss"] = float(dec_attn_loss) loss = loss + dec_attn_loss @@ -150,7 +154,10 @@ def update_core(self, batch): break # (B, H*L, T_out, T_in) att_ws = paddle.concat(att_ws, axis=1) - enc_dec_attn_loss = self.attn_criterion(att_ws, ilens, olens) + enc_dec_attn_loss = self.attn_criterion( + att_ws=att_ws, + ilens=batch["text_lengths"] + 1, + olens=olens_in) report("train/enc_dec_attn_loss", float(enc_dec_attn_loss)) losses_dict["enc_dec_attn_loss"] = float(enc_dec_attn_loss) loss = loss + enc_dec_attn_loss @@ -215,7 +222,7 @@ def __init__( def evaluate_core(self, batch): self.msg = "Evaluate: " losses_dict = {} - after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict = self.model( + after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict = self.model( text=batch["text"], text_lengths=batch["text_lengths"], speech=batch["speech"], @@ -260,7 +267,10 @@ def evaluate_core(self, batch): break # (B, H*L, T_in, T_in) att_ws = paddle.concat(att_ws, axis=1) - enc_attn_loss = self.attn_criterion(att_ws, ilens, ilens) + enc_attn_loss = self.attn_criterion( + att_ws=att_ws, + ilens=batch["text_lengths"] + 1, + olens=batch["text_lengths"] + 1) loss = loss + enc_attn_loss report("train/enc_attn_loss", float(enc_attn_loss)) losses_dict["enc_attn_loss"] = float(enc_attn_loss) @@ -277,7 +287,8 @@ def evaluate_core(self, batch): break # (B, H*L, T_out, T_out) att_ws = paddle.concat(att_ws, axis=1) - dec_attn_loss = self.attn_criterion(att_ws, olens, olens) + 
dec_attn_loss = self.attn_criterion( + att_ws=att_ws, ilens=olens_in, olens=olens_in) report("eval/dec_attn_loss", float(dec_attn_loss)) losses_dict["dec_attn_loss"] = float(dec_attn_loss) loss = loss + dec_attn_loss @@ -295,7 +306,10 @@ def evaluate_core(self, batch): break # (B, H*L, T_out, T_in) att_ws = paddle.concat(att_ws, axis=1) - enc_dec_attn_loss = self.attn_criterion(att_ws, ilens, olens) + enc_dec_attn_loss = self.attn_criterion( + att_ws=att_ws, + ilens=batch["text_lengths"] + 1, + olens=olens_in) report("eval/enc_dec_attn_loss", float(enc_dec_attn_loss)) losses_dict["enc_dec_attn_loss"] = float(enc_dec_attn_loss) loss = loss + enc_dec_attn_loss diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py index 781ac7924fd..044a52e5f6a 100644 --- a/paddlespeech/t2s/modules/losses.py +++ b/paddlespeech/t2s/modules/losses.py @@ -26,26 +26,30 @@ # Loss for new Tacotron2 class GuidedAttentionLoss(nn.Layer): """Guided attention loss function module. + This module calculates the guided attention loss described in `Efficiently Trainable Text-to-Speech System Based on Deep Convolutional Networks with Guided Attention`_, which forces the attention to be diagonal. + .. _`Efficiently Trainable Text-to-Speech System Based on Deep Convolutional Networks with Guided Attention`: https://arxiv.org/abs/1710.08969 + """ def __init__(self, sigma=0.4, alpha=1.0, reset_always=True): """Initialize guided attention loss module. + Parameters ---------- sigma : float, optional - Standard deviation to control - how close attention to a diagonal. + Standard deviation to control how close attention to a diagonal. alpha : float, optional Scaling coefficient (lambda). reset_always : bool, optional Whether to always reset masks. + """ super().__init__() self.sigma = sigma @@ -60,18 +64,21 @@ def _reset_masks(self): def forward(self, att_ws, ilens, olens): """Calculate forward propagation. 
+ Parameters ---------- att_ws : Tensor Batch of attention weights (B, T_max_out, T_max_in). ilens : Tensor(int64) - Batch of input lengths (B,). + Batch of input lenghts (B,). olens : Tensor(int64) - Batch of output lengths (B,). + Batch of output lenghts (B,). + Returns ---------- Tensor Guided attention loss value. + """ if self.guided_attn_masks is None: self.guided_attn_masks = self._make_guided_attention_masks(ilens, @@ -79,7 +86,8 @@ def forward(self, att_ws, ilens, olens): if self.masks is None: self.masks = self._make_masks(ilens, olens) losses = self.guided_attn_masks * att_ws - loss = paddle.mean(losses.masked_select(self.masks)) + loss = paddle.mean( + losses.masked_select(self.masks.broadcast_to(losses.shape))) if self.reset_always: self._reset_masks() return self.alpha * loss @@ -89,6 +97,7 @@ def _make_guided_attention_masks(self, ilens, olens): max_ilen = max(ilens) max_olen = max(olens) guided_attn_masks = paddle.zeros((n_batches, max_olen, max_ilen)) + for idx, (ilen, olen) in enumerate(zip(ilens, olens)): guided_attn_masks[idx, :olen, : ilen] = self._make_guided_attention_mask( @@ -98,11 +107,12 @@ def _make_guided_attention_masks(self, ilens, olens): @staticmethod def _make_guided_attention_mask(ilen, olen, sigma): """Make guided attention mask. 
- Parameters + + Examples ---------- >>> guided_attn_mask =_make_guided_attention(5, 5, 0.4) >>> guided_attn_mask.shape - Size([5, 5]) + [5, 5] >>> guided_attn_mask tensor([[0.0000, 0.1175, 0.3935, 0.6753, 0.8647], [0.1175, 0.0000, 0.1175, 0.3935, 0.6753], @@ -111,7 +121,7 @@ def _make_guided_attention_mask(ilen, olen, sigma): [0.8647, 0.6753, 0.3935, 0.1175, 0.0000]]) >>> guided_attn_mask =_make_guided_attention(3, 6, 0.4) >>> guided_attn_mask.shape - Size([6, 3]) + [6, 3] >>> guided_attn_mask tensor([[0.0000, 0.2934, 0.7506], [0.0831, 0.0831, 0.5422], @@ -119,55 +129,109 @@ def _make_guided_attention_mask(ilen, olen, sigma): [0.5422, 0.0831, 0.0831], [0.7506, 0.2934, 0.0000], [0.8858, 0.5422, 0.0831]]) + """ grid_x, grid_y = paddle.meshgrid( paddle.arange(olen), paddle.arange(ilen)) - grid_x = paddle.cast(grid_x, dtype='float32') - grid_y = paddle.cast(grid_y, dtype='float32') - + grid_x = grid_x.cast(dtype=paddle.float32) + grid_y = grid_y.cast(dtype=paddle.float32) return 1.0 - paddle.exp(-( (grid_y / ilen - grid_x / olen)**2) / (2 * (sigma**2))) @staticmethod def _make_masks(ilens, olens): """Make masks indicating non-padded part. - Examples + + Parameters ---------- ilens : Tensor(int64) or List Batch of lengths (B,). olens : Tensor(int64) or List Batch of lengths (B,). + Returns ---------- Tensor Mask tensor indicating non-padded part. 
+ Examples ---------- >>> ilens, olens = [5, 2], [8, 5] >>> _make_mask(ilens, olens) tensor([[[1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1]], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1]], + [[1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0]]],) + [1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0]]], dtype=paddle.uint8) + """ # (B, T_in) in_masks = make_non_pad_mask(ilens) # (B, T_out) out_masks = make_non_pad_mask(olens) # (B, T_out, T_in) - return out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2) + + return paddle.logical_and( + out_masks.unsqueeze(-1), in_masks.unsqueeze(-2)) + + +class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss): + """Guided attention loss function module for multi head attention. + + Parameters + ---------- + sigma : float, optional + Standard deviation to controlGuidedAttentionLoss + how close attention to a diagonal. + alpha : float, optional + Scaling coefficient (lambda). + reset_always : bool, optional + Whether to always reset masks. + + """ + + def forward(self, att_ws, ilens, olens): + """Calculate forward propagation. + + Parameters + ---------- + att_ws : Tensor + Batch of multi head attention weights (B, H, T_max_out, T_max_in). + ilens : Tensor + Batch of input lenghts (B,). + olens : Tensor + Batch of output lenghts (B,). + + Returns + ---------- + Tensor + Guided attention loss value. 
+ + """ + if self.guided_attn_masks is None: + self.guided_attn_masks = ( + self._make_guided_attention_masks(ilens, olens).unsqueeze(1)) + if self.masks is None: + self.masks = self._make_masks(ilens, olens).unsqueeze(1) + losses = self.guided_attn_masks * att_ws + loss = paddle.mean( + losses.masked_select(self.masks.broadcast_to(losses.shape))) + if self.reset_always: + self._reset_masks() + + return self.alpha * loss class Tacotron2Loss(nn.Layer): From 3fd7a7790baf2ef0df1ee5a752c3af9753264ffb Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 18 Jan 2022 07:15:40 +0000 Subject: [PATCH 4/5] add type hints for updater and evaluator, test=tts --- .../models/fastspeech2/fastspeech2_updater.py | 26 +++++++++------- .../models/new_tacotron2/tacotron2_updater.py | 9 +++--- .../speedyspeech/speedyspeech_updater.py | 17 +++++++---- .../transformer_tts_updater.py | 30 +++++++++++-------- 4 files changed, 48 insertions(+), 34 deletions(-) diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py index 3f5e1b565d2..92aa9dfc773 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py @@ -12,8 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License.
import logging +from pathlib import Path from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Loss from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator @@ -28,13 +32,13 @@ class FastSpeech2Updater(StandardUpdater): def __init__(self, - model, - optimizer, - dataloader, + model: Layer, + optimizer: Optimizer, + dataloader: DataLoader, init_state=None, - use_masking=False, - use_weighted_masking=False, - output_dir=None): + use_masking: bool=False, + use_weighted_masking: bool=False, + output_dir: Path=None): super().__init__(model, optimizer, dataloader, init_state=None) self.criterion = FastSpeech2Loss( @@ -104,11 +108,11 @@ def update_core(self, batch): class FastSpeech2Evaluator(StandardEvaluator): def __init__(self, - model, - dataloader, - use_masking=False, - use_weighted_masking=False, - output_dir=None): + model: Layer, + dataloader: DataLoader, + use_masking: bool=False, + use_weighted_masking: bool=False, + output_dir: Path=None): super().__init__(model, dataloader) log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py b/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py index 6d41702cd08..7572171b468 100644 --- a/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py +++ b/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py @@ -13,7 +13,6 @@ # limitations under the License. 
import logging from pathlib import Path -from typing import Dict from paddle import distributed as dist from paddle.io import DataLoader @@ -34,8 +33,8 @@ class Tacotron2Updater(StandardUpdater): def __init__(self, - model: Dict[str, Layer], - optimizer: Dict[str, Optimizer], + model: Layer, + optimizer: Optimizer, dataloader: DataLoader, init_state=None, use_masking: bool=True, @@ -126,8 +125,8 @@ def update_core(self, batch): class Tacotron2Evaluator(StandardEvaluator): def __init__(self, - model, - dataloader, + model: Layer, + dataloader: DataLoader, use_masking: bool=True, use_weighted_masking: bool=False, bce_pos_weight: float=5.0, diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py index ee45cdc85dc..e30a3fe1a59 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py @@ -12,11 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import logging +from pathlib import Path import paddle from paddle import distributed as dist from paddle.fluid.layers import huber_loss +from paddle.io import DataLoader from paddle.nn import functional as F +from paddle.nn import Layer +from paddle.optimizer import Optimizer from paddlespeech.t2s.modules.losses import masked_l1_loss from paddlespeech.t2s.modules.losses import ssim @@ -33,11 +37,11 @@ class SpeedySpeechUpdater(StandardUpdater): def __init__(self, - model, - optimizer, - dataloader, + model: Layer, + optimizer: Optimizer, + dataloader: DataLoader, init_state=None, - output_dir=None): + output_dir: Path=None): super().__init__(model, optimizer, dataloader, init_state=None) log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) @@ -103,7 +107,10 @@ def update_core(self, batch): class SpeedySpeechEvaluator(StandardEvaluator): - def __init__(self, model, dataloader, output_dir=None): + def __init__(self, + model: Layer, + dataloader: DataLoader, + output_dir: Path=None): super().__init__(model, dataloader) log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py index bcc454c0db3..1f25b019ce1 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py @@ -12,10 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import logging +from pathlib import Path from typing import Sequence import paddle from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer from paddlespeech.t2s.modules.losses import GuidedMultiHeadAttentionLoss from paddlespeech.t2s.modules.losses import Tacotron2Loss as TransformerTTSLoss @@ -32,14 +36,14 @@ class TransformerTTSUpdater(StandardUpdater): def __init__( self, - model, - optimizer, - dataloader, + model: Layer, + optimizer: Optimizer, + dataloader: DataLoader, init_state=None, - use_masking=False, - use_weighted_masking=False, - output_dir=None, - bce_pos_weight=5.0, + use_masking: bool=False, + use_weighted_masking: bool=False, + output_dir: Path=None, + bce_pos_weight: float=5.0, loss_type: str="L1", use_guided_attn_loss: bool=True, modules_applied_guided_attn: Sequence[str]=("encoder-decoder"), @@ -185,13 +189,13 @@ def update_core(self, batch): class TransformerTTSEvaluator(StandardEvaluator): def __init__( self, - model, - dataloader, + model: Layer, + dataloader: DataLoader, init_state=None, - use_masking=False, - use_weighted_masking=False, - output_dir=None, - bce_pos_weight=5.0, + use_masking: bool=False, + use_weighted_masking: bool=False, + output_dir: Path=None, + bce_pos_weight: float=5.0, loss_type: str="L1", use_guided_attn_loss: bool=True, modules_applied_guided_attn: Sequence[str]=("encoder-decoder"), From 96323816e9da0aae7fb26c7ab4882ec008870ec1 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 18 Jan 2022 10:22:42 +0000 Subject: [PATCH 5/5] fix yamls, change labels to stop_labels, test=tts --- examples/aishell3/tts3/conf/default.yaml | 4 +-- examples/aishell3/vc1/conf/default.yaml | 4 +-- examples/csmsc/tts0/conf/default.yaml | 4 --- examples/csmsc/tts3/conf/conformer.yaml | 4 +-- examples/csmsc/tts3/conf/default.yaml | 4 +-- examples/ljspeech/tts3/conf/default.yaml | 4 +-- examples/vctk/tts3/conf/default.yaml | 4 +-- 
.../t2s/exps/new_tacotron2/preprocess.py | 27 +------------------ .../t2s/models/new_tacotron2/tacotron2.py | 13 ++++----- .../models/new_tacotron2/tacotron2_updater.py | 22 ++++++++++----- .../models/transformer_tts/transformer_tts.py | 16 +++++------ .../transformer_tts_updater.py | 8 +++--- paddlespeech/t2s/modules/losses.py | 10 +++---- .../t2s/modules/tacotron2/attentions.py | 2 +- 14 files changed, 53 insertions(+), 73 deletions(-) diff --git a/examples/aishell3/tts3/conf/default.yaml b/examples/aishell3/tts3/conf/default.yaml index 69307049af3..ac4956742eb 100644 --- a/examples/aishell3/tts3/conf/default.yaml +++ b/examples/aishell3/tts3/conf/default.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. ########################################################### diff --git a/examples/aishell3/vc1/conf/default.yaml b/examples/aishell3/vc1/conf/default.yaml index 69307049af3..ac4956742eb 100644 --- a/examples/aishell3/vc1/conf/default.yaml +++ b/examples/aishell3/vc1/conf/default.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. 
########################################################### diff --git a/examples/csmsc/tts0/conf/default.yaml b/examples/csmsc/tts0/conf/default.yaml index 171aee8802c..42635c506ee 100644 --- a/examples/csmsc/tts0/conf/default.yaml +++ b/examples/csmsc/tts0/conf/default.yaml @@ -21,10 +21,6 @@ fmin: 80 # Minimum frequency of Mel basis. fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. -# Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. - ########################################################### # DATA SETTING # ########################################################### diff --git a/examples/csmsc/tts3/conf/conformer.yaml b/examples/csmsc/tts3/conf/conformer.yaml index 03e4f2e33cb..fcad86150a8 100644 --- a/examples/csmsc/tts3/conf/conformer.yaml +++ b/examples/csmsc/tts3/conf/conformer.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. ########################################################### diff --git a/examples/csmsc/tts3/conf/default.yaml b/examples/csmsc/tts3/conf/default.yaml index ce2b24d9227..2c2a1ea1009 100644 --- a/examples/csmsc/tts3/conf/default.yaml +++ b/examples/csmsc/tts3/conf/default.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. 
########################################################### diff --git a/examples/ljspeech/tts3/conf/default.yaml b/examples/ljspeech/tts3/conf/default.yaml index 15cfda2c651..5305c912f91 100644 --- a/examples/ljspeech/tts3/conf/default.yaml +++ b/examples/ljspeech/tts3/conf/default.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. ########################################################### diff --git a/examples/vctk/tts3/conf/default.yaml b/examples/vctk/tts3/conf/default.yaml index 86d4a0d5a88..1bca9107b5e 100644 --- a/examples/vctk/tts3/conf/default.yaml +++ b/examples/vctk/tts3/conf/default.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. 
########################################################### diff --git a/paddlespeech/t2s/exps/new_tacotron2/preprocess.py b/paddlespeech/t2s/exps/new_tacotron2/preprocess.py index 0b61912cf9e..5fc6b590d3e 100644 --- a/paddlespeech/t2s/exps/new_tacotron2/preprocess.py +++ b/paddlespeech/t2s/exps/new_tacotron2/preprocess.py @@ -27,9 +27,7 @@ import yaml from yacs.config import CfgNode -from paddlespeech.t2s.data.get_feats import Energy from paddlespeech.t2s.data.get_feats import LogMelFBank -from paddlespeech.t2s.data.get_feats import Pitch from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length from paddlespeech.t2s.datasets.preprocess_utils import get_input_token from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur @@ -42,8 +40,6 @@ def process_sentence(config: Dict[str, Any], sentences: Dict, output_dir: Path, mel_extractor=None, - pitch_extractor=None, - energy_extractor=None, cut_sil: bool=True, spk_emb_dir: Path=None): utt_id = fp.stem @@ -117,8 +113,6 @@ def process_sentences(config, sentences: Dict, output_dir: Path, mel_extractor=None, - pitch_extractor=None, - energy_extractor=None, nprocs: int=1, cut_sil: bool=True, spk_emb_dir: Path=None): @@ -126,8 +120,7 @@ def process_sentences(config, results = [] for fp in fps: record = process_sentence(config, fp, sentences, output_dir, - mel_extractor, pitch_extractor, - energy_extractor, cut_sil, spk_emb_dir) + mel_extractor, cut_sil, spk_emb_dir) if record: results.append(record) else: @@ -137,7 +130,6 @@ def process_sentences(config, for fp in fps: future = pool.submit(process_sentence, config, fp, sentences, output_dir, mel_extractor, - pitch_extractor, energy_extractor, cut_sil, spk_emb_dir) future.add_done_callback(lambda p: progress.update()) futures.append(future) @@ -299,17 +291,6 @@ def str2bool(str): n_mels=config.n_mels, fmin=config.fmin, fmax=config.fmax) - pitch_extractor = Pitch( - sr=config.fs, - hop_length=config.n_shift, - f0min=config.f0min, - 
f0max=config.f0max) - energy_extractor = Energy( - sr=config.fs, - n_fft=config.n_fft, - hop_length=config.n_shift, - win_length=config.win_length, - window=config.window) # process for the 3 sections if train_wav_files: @@ -319,8 +300,6 @@ def str2bool(str): sentences, train_dump_dir, mel_extractor, - pitch_extractor, - energy_extractor, nprocs=args.num_cpu, cut_sil=args.cut_sil, spk_emb_dir=spk_emb_dir) @@ -331,8 +310,6 @@ def str2bool(str): sentences, dev_dump_dir, mel_extractor, - pitch_extractor, - energy_extractor, cut_sil=args.cut_sil, spk_emb_dir=spk_emb_dir) if test_wav_files: @@ -342,8 +319,6 @@ def str2bool(str): sentences, test_dump_dir, mel_extractor, - pitch_extractor, - energy_extractor, nprocs=args.num_cpu, cut_sil=args.cut_sil, spk_emb_dir=spk_emb_dir) diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2.py b/paddlespeech/t2s/models/new_tacotron2/tacotron2.py index 4804ffb44fa..6a6d107356c 100644 --- a/paddlespeech/t2s/models/new_tacotron2/tacotron2.py +++ b/paddlespeech/t2s/models/new_tacotron2/tacotron2.py @@ -300,10 +300,10 @@ def forward( olens = speech_lengths # make labels for stop prediction - labels = make_pad_mask(olens - 1) + stop_labels = make_pad_mask(olens - 1) # bool 类型无法切片 - labels = paddle.cast(labels, dtype='float32') - labels = F.pad(labels, [0, 0, 0, 1], "constant", 1.0) + stop_labels = paddle.cast(stop_labels, dtype='float32') + stop_labels = F.pad(stop_labels, [0, 0, 0, 1], "constant", 1.0) # calculate tacotron2 outputs after_outs, before_outs, logits, att_ws = self._forward( @@ -322,12 +322,13 @@ def forward( olens = olens - olens % self.reduction_factor max_out = max(olens) ys = ys[:, :max_out] - labels = labels[:, :max_out] - labels = paddle.scatter(labels, 1, (olens - 1).unsqueeze(1), 1.0) + stop_labels = stop_labels[:, :max_out] + stop_labels = paddle.scatter(stop_labels, 1, + (olens - 1).unsqueeze(1), 1.0) olens_in = olens // self.reduction_factor else: olens_in = olens - return after_outs, before_outs, logits, 
ys, labels, olens, att_ws, olens_in + return after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in def _forward( self, diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py b/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py index 7572171b468..09e6827d04e 100644 --- a/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py +++ b/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py @@ -74,7 +74,7 @@ def update_core(self, batch): if spk_emb is not None: spk_id = None - after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in = self.model( + after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in = self.model( text=batch["text"], text_lengths=batch["text_lengths"], speech=batch["speech"], @@ -83,8 +83,13 @@ def update_core(self, batch): spk_emb=spk_emb) # calculate taco2 loss - l1_loss, mse_loss, bce_loss = self.taco2_loss(after_outs, before_outs, - logits, ys, labels, olens) + l1_loss, mse_loss, bce_loss = self.taco2_loss( + after_outs=after_outs, + before_outs=before_outs, + logits=logits, + ys=ys, + stop_labels=stop_labels, + olens=olens) if self.loss_type == "L1+L2": loss = l1_loss + mse_loss + bce_loss @@ -164,7 +169,7 @@ def evaluate_core(self, batch): if spk_emb is not None: spk_id = None - after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in = self.model( + after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in = self.model( text=batch["text"], text_lengths=batch["text_lengths"], speech=batch["speech"], @@ -173,8 +178,13 @@ def evaluate_core(self, batch): spk_emb=spk_emb) # calculate taco2 loss - l1_loss, mse_loss, bce_loss = self.taco2_loss(after_outs, before_outs, - logits, ys, labels, olens) + l1_loss, mse_loss, bce_loss = self.taco2_loss( + after_outs=after_outs, + before_outs=before_outs, + logits=logits, + ys=ys, + stop_labels=stop_labels, + olens=olens) if self.loss_type == "L1+L2": loss = l1_loss + mse_loss + bce_loss diff --git 
a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py index ba1f33ea851..4babe283623 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py @@ -433,12 +433,10 @@ def forward( olens = paddle.cast(speech_lengths, 'int64') # make labels for stop prediction - labels = make_pad_mask(olens - 1) - labels = numpy.pad( - labels.numpy(), ((0, 0), (0, 1)), 'constant', constant_values=1.0) - labels = paddle.to_tensor(labels) - labels = paddle.cast(labels, dtype="float32") - # labels = F.pad(labels, [0, 1], "constant", 1.0) + stop_labels = make_pad_mask(olens - 1) + # bool 类型无法切片 + stop_labels = paddle.cast(stop_labels, dtype='float32') + stop_labels = F.pad(stop_labels, [0, 0, 0, 1], "constant", 1.0) # calculate transformer outputs after_outs, before_outs, logits = self._forward(xs, ilens, ys, olens, @@ -450,8 +448,8 @@ def forward( olens = olens - olens % self.reduction_factor max_olen = max(olens) ys = ys[:, :max_olen] - labels = labels[:, :max_olen] - labels[:, -1] = 1.0 # make sure at least one frame has 1 + stop_labels = stop_labels[:, :max_olen] + stop_labels[:, -1] = 1.0 # make sure at least one frame has 1 olens_in = olens // self.reduction_factor else: olens_in = olens @@ -465,7 +463,7 @@ def forward( 'num_layers_applied_guided_attn'] = self.num_layers_applied_guided_attn need_dict['use_scaled_pos_enc'] = self.use_scaled_pos_enc - return after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict + return after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict def _forward( self, diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py index 1f25b019ce1..dff908e05bf 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py +++ 
b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py @@ -75,7 +75,7 @@ def update_core(self, batch): self.msg = "Rank: {}, ".format(dist.get_rank()) losses_dict = {} - after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict = self.model( + after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict = self.model( text=batch["text"], text_lengths=batch["text_lengths"], speech=batch["speech"], @@ -86,7 +86,7 @@ def update_core(self, batch): before_outs=before_outs, logits=logits, ys=ys, - labels=labels, + stop_labels=stop_labels, olens=olens) report("train/bce_loss", float(bce_loss)) @@ -226,7 +226,7 @@ def __init__( def evaluate_core(self, batch): self.msg = "Evaluate: " losses_dict = {} - after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict = self.model( + after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict = self.model( text=batch["text"], text_lengths=batch["text_lengths"], speech=batch["speech"], @@ -237,7 +237,7 @@ def evaluate_core(self, batch): before_outs=before_outs, logits=logits, ys=ys, - labels=labels, + stop_labels=stop_labels, olens=olens) report("eval/bce_loss", float(bce_loss)) diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py index 044a52e5f6a..3cc7a93cb5f 100644 --- a/paddlespeech/t2s/modules/losses.py +++ b/paddlespeech/t2s/modules/losses.py @@ -263,7 +263,7 @@ def __init__(self, self.bce_criterion = nn.BCEWithLogitsLoss( reduction=reduction, pos_weight=paddle.to_tensor(bce_pos_weight)) - def forward(self, after_outs, before_outs, logits, ys, labels, olens): + def forward(self, after_outs, before_outs, logits, ys, stop_labels, olens): """Calculate forward propagation. Parameters ---------- @@ -275,7 +275,7 @@ def forward(self, after_outs, before_outs, logits, ys, labels, olens): Batch of stop logits (B, Lmax). ys : Tensor Batch of padded target features (B, Lmax, odim). 
- labels : Tensor(int64) + stop_labels : Tensor(int64) Batch of the sequences of stop token labels (B, Lmax). olens : Tensor(int64) Batch of the lengths of each target (B,). @@ -296,8 +296,8 @@ def forward(self, after_outs, before_outs, logits, ys, labels, olens): masks.broadcast_to(after_outs.shape)) before_outs = before_outs.masked_select( masks.broadcast_to(before_outs.shape)) - labels = labels.masked_select( - masks[:, :, 0].broadcast_to(labels.shape)) + stop_labels = stop_labels.masked_select( + masks[:, :, 0].broadcast_to(stop_labels.shape)) logits = logits.masked_select( masks[:, :, 0].broadcast_to(logits.shape)) @@ -306,7 +306,7 @@ def forward(self, after_outs, before_outs, logits, ys, labels, olens): before_outs, ys) mse_loss = self.mse_criterion(after_outs, ys) + self.mse_criterion( before_outs, ys) - bce_loss = self.bce_criterion(logits, labels) + bce_loss = self.bce_criterion(logits, stop_labels) # make weighted mask and apply it if self.use_weighted_masking: diff --git a/paddlespeech/t2s/modules/tacotron2/attentions.py b/paddlespeech/t2s/modules/tacotron2/attentions.py index 2b912db3dea..710e326d608 100644 --- a/paddlespeech/t2s/modules/tacotron2/attentions.py +++ b/paddlespeech/t2s/modules/tacotron2/attentions.py @@ -207,7 +207,7 @@ def forward( w = F.softmax(scaling * e, axis=1) - # weighted sum over flames + # weighted sum over frames # utt x hdim c = paddle.sum( self.enc_h * w.reshape([batch, self.h_length, 1]), axis=1)