forked from NVIDIA/NeMo
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add dataset 'AISHELL-3' from OpenSLR for training Mandarin TTS (NVIDIA#7409)
* Add dataset 'AISHELL-3' from OpenSLR for training Mandarin TTS * Train 'AISHELL-3' dataset with multi-speakers Signed-off-by: Robin Dong <[email protected]> * Update get_data.py update copyright header Signed-off-by: Xuesong Yang <[email protected]> * Update get_data.py added a disclaimer Signed-off-by: Xuesong Yang <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add new configuration file for AISHELL3 with multispeaker of fastpitch Signed-off-by: Robin Dong <[email protected]> --------- Signed-off-by: Robin Dong <[email protected]> Signed-off-by: Xuesong Yang <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Xuesong Yang <[email protected]>
- Loading branch information
1 parent
6a4b337
commit f3a49ee
Showing
3 changed files
with
466 additions
and
0 deletions.
There are no files selected for viewing
261 changes: 261 additions & 0 deletions
261
examples/tts/conf/zh/fastpitch_align_multispeaker_22050.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,261 @@ | ||
# This config contains the default values for training FastPitch model with aligner using 22KHz sampling | ||
# rate. If you want to train a model on another dataset, you can change config values according to your dataset. | ||
# Most dataset-specific arguments are in the head of the config file, see below. | ||
|
||
name: FastPitch | ||
|
||
train_dataset: ??? | ||
validation_datasets: ??? | ||
sup_data_path: ??? | ||
sup_data_types: [ "align_prior_matrix", "pitch", "speaker_id"] | ||
|
||
# Default values from librosa.pyin | ||
pitch_fmin: 65.40639132514966 | ||
pitch_fmax: 1986.977294921875 | ||
|
||
# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values | ||
# by running `scripts/dataset_processing/tts/extract_sup_data.py` | ||
pitch_mean: ??? # e.g. 221.4948272705078 for SFbilingual dataset. | ||
pitch_std: ??? # e.g. 64.6528930664063 for SFbilingual dataset. | ||
|
||
# Default values for dataset with sample_rate=22050 | ||
sample_rate: 22050 | ||
n_mel_channels: 80 | ||
n_window_size: 1024 | ||
n_window_stride: 256 | ||
n_fft: 1024 | ||
lowfreq: 0 | ||
highfreq: null | ||
window: hann | ||
|
||
# There are four candidates of `phoneme_dict_path` provided for Chinese as shown below, | ||
# 1) 24-final Pinyin: "scripts/tts_dataset_files/zh/24finals/pinyin_dict_nv_22.10.txt", | ||
# 2) IPA converted from 24-final Pinyin: "scripts/tts_dataset_files/zh/24finals/ipa_dict_nv23.05.txt", | ||
# 3) 36-final Pinyin: "scripts/tts_dataset_files/zh/36finals/pinyin_dict_nv23.05.txt", | ||
# 4) (default) IPA converted from 36-final Pinyin: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt" | ||
# We suggest choosing the IPA symbol set converted from 36-final Pinyin because better audio quality was observed. | ||
phoneme_dict_path: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt" | ||
|
||
model: | ||
learn_alignment: true | ||
bin_loss_warmup_epochs: 100 | ||
|
||
n_speakers: 1958 | ||
max_token_duration: 75 | ||
symbols_embedding_dim: 384 | ||
pitch_embedding_kernel_size: 3 | ||
speaker_emb_condition_prosody: true | ||
speaker_emb_condition_aligner: true | ||
|
||
pitch_fmin: ${pitch_fmin} | ||
pitch_fmax: ${pitch_fmax} | ||
|
||
pitch_mean: ${pitch_mean} | ||
pitch_std: ${pitch_std} | ||
|
||
sample_rate: ${sample_rate} | ||
n_mel_channels: ${n_mel_channels} | ||
n_window_size: ${n_window_size} | ||
n_window_stride: ${n_window_stride} | ||
n_fft: ${n_fft} | ||
lowfreq: ${lowfreq} | ||
highfreq: ${highfreq} | ||
window: ${window} | ||
|
||
text_normalizer: | ||
_target_: nemo_text_processing.text_normalization.normalize.Normalizer | ||
lang: zh | ||
input_case: cased | ||
|
||
text_normalizer_call_kwargs: | ||
verbose: false | ||
punct_pre_process: true | ||
punct_post_process: true | ||
|
||
text_tokenizer: | ||
_target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.ChinesePhonemesTokenizer | ||
punct: true | ||
apostrophe: true | ||
pad_with_space: true | ||
g2p: | ||
_target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p | ||
phoneme_dict: ${phoneme_dict_path} | ||
word_segmenter: jieba # Only jieba is supported now. | ||
phoneme_prefix: "" | ||
phoneme_case: lower | ||
tone_prefix: "#" | ||
ascii_letter_prefix: "" | ||
ascii_letter_case: upper | ||
|
||
train_ds: | ||
dataset: | ||
_target_: nemo.collections.tts.data.dataset.TTSDataset | ||
manifest_filepath: ${train_dataset} | ||
sample_rate: ${model.sample_rate} | ||
sup_data_path: ${sup_data_path} | ||
sup_data_types: ${sup_data_types} | ||
n_fft: ${model.n_fft} | ||
win_length: ${model.n_window_size} | ||
hop_length: ${model.n_window_stride} | ||
window: ${model.window} | ||
n_mels: ${model.n_mel_channels} | ||
lowfreq: ${model.lowfreq} | ||
highfreq: ${model.highfreq} | ||
max_duration: null # null imposes no upper limit; set a value in seconds to exclude longer audios. | ||
min_duration: 0.1 | ||
ignore_file: null | ||
trim: true | ||
trim_top_db: 50 | ||
trim_frame_length: ${model.n_window_size} | ||
trim_hop_length: ${model.n_window_stride} | ||
pitch_fmin: ${model.pitch_fmin} | ||
pitch_fmax: ${model.pitch_fmax} | ||
pitch_norm: true | ||
pitch_mean: ${model.pitch_mean} | ||
pitch_std: ${model.pitch_std} | ||
|
||
dataloader_params: | ||
drop_last: false | ||
shuffle: true | ||
batch_size: 32 | ||
num_workers: 12 | ||
pin_memory: true | ||
|
||
validation_ds: | ||
dataset: | ||
_target_: nemo.collections.tts.data.dataset.TTSDataset | ||
manifest_filepath: ${validation_datasets} | ||
sample_rate: ${model.sample_rate} | ||
sup_data_path: ${sup_data_path} | ||
sup_data_types: ${sup_data_types} | ||
n_fft: ${model.n_fft} | ||
win_length: ${model.n_window_size} | ||
hop_length: ${model.n_window_stride} | ||
window: ${model.window} | ||
n_mels: ${model.n_mel_channels} | ||
lowfreq: ${model.lowfreq} | ||
highfreq: ${model.highfreq} | ||
max_duration: null # null imposes no upper limit; set a value in seconds to exclude longer audios. | ||
min_duration: 0.1 | ||
ignore_file: null | ||
trim: true | ||
trim_top_db: 50 | ||
trim_frame_length: ${model.n_window_size} | ||
trim_hop_length: ${model.n_window_stride} | ||
pitch_fmin: ${model.pitch_fmin} | ||
pitch_fmax: ${model.pitch_fmax} | ||
pitch_norm: true | ||
pitch_mean: ${model.pitch_mean} | ||
pitch_std: ${model.pitch_std} | ||
|
||
dataloader_params: | ||
drop_last: false | ||
shuffle: false | ||
batch_size: 32 | ||
num_workers: 2 | ||
pin_memory: true | ||
|
||
preprocessor: | ||
_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor | ||
features: ${model.n_mel_channels} | ||
lowfreq: ${model.lowfreq} | ||
highfreq: ${model.highfreq} | ||
n_fft: ${model.n_fft} | ||
n_window_size: ${model.n_window_size} | ||
window_size: false | ||
n_window_stride: ${model.n_window_stride} | ||
window_stride: false | ||
pad_to: 1 | ||
pad_value: 0 | ||
sample_rate: ${model.sample_rate} | ||
window: ${model.window} | ||
normalize: null | ||
preemph: null | ||
dither: 0.0 | ||
frame_splicing: 1 | ||
log: true | ||
log_zero_guard_type: add | ||
log_zero_guard_value: 1e-05 | ||
mag_power: 1.0 | ||
|
||
input_fft: #n_embed and padding_idx are added by the model | ||
_target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder | ||
n_layer: 6 | ||
n_head: 1 | ||
d_model: ${model.symbols_embedding_dim} | ||
d_head: 64 | ||
d_inner: 1536 | ||
kernel_size: 3 | ||
dropout: 0.1 | ||
dropatt: 0.1 | ||
dropemb: 0.0 | ||
d_embed: ${model.symbols_embedding_dim} | ||
|
||
output_fft: | ||
_target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder | ||
n_layer: 6 | ||
n_head: 1 | ||
d_model: ${model.symbols_embedding_dim} | ||
d_head: 64 | ||
d_inner: 1536 | ||
kernel_size: 3 | ||
dropout: 0.1 | ||
dropatt: 0.1 | ||
dropemb: 0.0 | ||
|
||
alignment_module: | ||
_target_: nemo.collections.tts.modules.aligner.AlignmentEncoder | ||
n_text_channels: ${model.symbols_embedding_dim} | ||
|
||
duration_predictor: | ||
_target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor | ||
input_size: ${model.symbols_embedding_dim} | ||
kernel_size: 3 | ||
filter_size: 256 | ||
dropout: 0.1 | ||
n_layers: 2 | ||
|
||
pitch_predictor: | ||
_target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor | ||
input_size: ${model.symbols_embedding_dim} | ||
kernel_size: 3 | ||
filter_size: 256 | ||
dropout: 0.1 | ||
n_layers: 2 | ||
|
||
optim: | ||
name: adamw | ||
lr: 1e-3 | ||
betas: [0.9, 0.999] | ||
weight_decay: 1e-6 | ||
|
||
sched: | ||
name: NoamAnnealing | ||
warmup_steps: 1000 | ||
last_epoch: -1 | ||
d_model: 1 # Disable scaling based on model dim | ||
|
||
trainer: | ||
num_nodes: 1 | ||
devices: -1 # number of gpus | ||
accelerator: gpu | ||
strategy: ddp | ||
precision: 16 | ||
max_epochs: 5000 | ||
accumulate_grad_batches: 1 | ||
gradient_clip_val: 1000.0 | ||
enable_checkpointing: false # Provided by exp_manager | ||
logger: false # Provided by exp_manager | ||
log_every_n_steps: 100 | ||
check_val_every_n_epoch: 5 | ||
benchmark: false | ||
|
||
exp_manager: | ||
exp_dir: null | ||
name: ${name} | ||
create_tensorboard_logger: true | ||
create_checkpoint_callback: true | ||
checkpoint_callback_params: | ||
monitor: val_loss | ||
resume_if_exists: false | ||
resume_ignore_no_checkpoint: false |
49 changes: 49 additions & 0 deletions
49
scripts/dataset_processing/tts/aishell3/ds_conf/ds_for_fastpitch_align.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
name: "ds_for_fastpitch_align" | ||
|
||
manifest_filepath: "train_manifest.json" | ||
sup_data_path: "sup_data" | ||
sup_data_types: [ "align_prior_matrix", "pitch", "speaker_id"] | ||
phoneme_dict_path: "scripts/tts_dataset_files/zh/24finals/pinyin_dict_nv_22.10.txt" | ||
|
||
dataset: | ||
_target_: nemo.collections.tts.data.dataset.TTSDataset | ||
manifest_filepath: ${manifest_filepath} | ||
sample_rate: 22050 | ||
sup_data_path: ${sup_data_path} | ||
sup_data_types: ${sup_data_types} | ||
n_fft: 1024 | ||
win_length: 1024 | ||
hop_length: 256 | ||
window: "hann" | ||
n_mels: 80 | ||
lowfreq: 0 | ||
highfreq: null | ||
max_duration: null | ||
min_duration: 0.1 | ||
ignore_file: null | ||
trim: true | ||
trim_top_db: 50 | ||
trim_frame_length: 1024 | ||
trim_hop_length: 256 | ||
pitch_fmin: 65.40639132514966 | ||
pitch_fmax: 2093.004522404789 | ||
|
||
text_normalizer: | ||
_target_: nemo_text_processing.text_normalization.normalize.Normalizer | ||
lang: zh | ||
input_case: cased | ||
|
||
text_normalizer_call_kwargs: | ||
verbose: false | ||
punct_pre_process: true | ||
punct_post_process: true | ||
|
||
text_tokenizer: | ||
_target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.ChinesePhonemesTokenizer | ||
punct: true | ||
apostrophe: true | ||
pad_with_space: true | ||
g2p: | ||
_target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p | ||
phoneme_dict: ${phoneme_dict_path} | ||
word_segmenter: jieba # Only jieba is supported now. |
Oops, something went wrong.