Train on the 'AISHELL-3' dataset with multiple speakers

Signed-off-by: Robin Dong <[email protected]>
NVIDIA · Sep 21, 2023 · 8da3699
1 parent 345312d
commit 8da3699
Show file tree

Hide file tree

Showing 3 changed files with 6 additions and 3 deletions.
diff --git a/examples/tts/conf/zh/fastpitch_align_22050.yaml b/examples/tts/conf/zh/fastpitch_align_22050.yaml
@@ -7,7 +7,7 @@ name: FastPitch
 train_dataset: ???
 validation_datasets: ???
 sup_data_path: ???
-sup_data_types: [ "align_prior_matrix", "pitch" ]
+sup_data_types: [ "align_prior_matrix", "pitch", "speaker_id"]
 
 # Default values from librosa.pyin
 pitch_fmin: 65.40639132514966
@@ -40,10 +40,12 @@ model:
   learn_alignment: true
   bin_loss_warmup_epochs: 100
 
-  n_speakers: 1
+  n_speakers: 1958
   max_token_duration: 75
   symbols_embedding_dim: 384
   pitch_embedding_kernel_size: 3
+  speaker_emb_condition_prosody: true
+  speaker_emb_condition_aligner: true
 
   pitch_fmin: ${pitch_fmin}
   pitch_fmax: ${pitch_fmax}

diff --git a/scripts/dataset_processing/tts/aishell3/ds_conf/ds_for_fastpitch_align.yaml b/scripts/dataset_processing/tts/aishell3/ds_conf/ds_for_fastpitch_align.yaml
@@ -2,7 +2,7 @@ name: "ds_for_fastpitch_align"
 
 manifest_filepath: "train_manifest.json"
 sup_data_path: "sup_data"
-sup_data_types: [ "align_prior_matrix", "pitch" ]
+sup_data_types: [ "align_prior_matrix", "pitch", "speaker_id"]
 phoneme_dict_path: "scripts/tts_dataset_files/zh/24finals/pinyin_dict_nv_22.10.txt"
 
 dataset:

diff --git a/scripts/dataset_processing/tts/aishell3/get_data.py b/scripts/dataset_processing/tts/aishell3/get_data.py
@@ -106,6 +106,7 @@ def __process_transcript(file_path: str):
                 'duration': float(duration),
                 'text': text,
                 'normalized_text': normalized_text,
+                'speaker': int(speaker[3:]),
             }
 
             i += 1