# Hyper_Parameters.yaml
Sound:
    Frame_Shift: 320
    Sample_Rate: 22050
    Mel_Dim: 80
    F0_Min: 65
    F0_Max: 2094

Tokens: 55
Use_Between_Padding: true
Audio_Codec_Size: 512
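# Note: with Frame_Shift: 320 at Sample_Rate: 22050, one frame covers about 14.5 ms,
# i.e. roughly 69 frames per second. Tokens is presumably the phoneme vocabulary size,
# and Audio_Codec_Size the size of the neural-codec latent space the model operates in.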
Encoder:
    Size: 384
    Transformer:
        Stack: 6
        Head: 2
        FFN:
            Kernel_Size: 17
        Dropout_Rate: 0.1
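# Encoder: presumably the phoneme/text encoder; a 6-layer, 2-head transformer with a
# 384-dim hidden size and convolutional FFNs of kernel size 17.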
Speech_Prompter:
    Size: 384
    Transformer:
        Stack: 6
        Head: 2
        FFN:
            Kernel_Size: 9
        Dropout_Rate: 0.2
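# Speech_Prompter: presumably the speech-prompt encoder that conditions the model on a
# reference utterance for zero-shot synthesis; same layout as the encoder, but with a
# smaller FFN kernel (9) and a higher dropout rate (0.2).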
Duration_Predictor:
    Stack: 10
    Attention:
        Head: 2
    Conv:
        Kernel_Size: 3
        Stack: 3
    Dropout_Rate: 0.2
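# Duration_Predictor: 10 blocks, each presumably pairing a 3-layer convolution
# (kernel size 3) with 2-head attention over the speech prompt.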
F0_Predictor:
    Stack: 10
    Attention:
        Head: 2
    Conv:
        Kernel_Size: 3
        Stack: 3
    Dropout_Rate: 0.5
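# F0_Predictor: same layout as the duration predictor, but with a higher dropout rate (0.5).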
Frame_Prior_Network:
    Transformer:
        Stack: 6
        Head: 2
        FFN:
            Kernel_Size: 17
        Dropout_Rate: 0.1
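# Frame_Prior_Network: presumably refines the frame-level features after length
# regulation, before they condition the diffusion model; same transformer layout as the encoder.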
Diffusion:
    Max_Step: 100
    Size: 256
    Pre_Attention:
        Query_Token: 32
        Query_Size: 512
        Head: 2
    WaveNet:
        Kernel_Size: 3
        Dilation: 2
        Stack: 40
        Dropout_Rate: 0.2
        Attention:
            Apply_in_Stack: 3
            Head: 2
    Network_Prediction: 'Start'    # 'Start', 'Epsilon'
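# Diffusion: a 100-step latent diffusion model with a 256-dim WaveNet backbone
# (40 layers, kernel size 3, dilation 2); Apply_in_Stack: 3 presumably inserts a
# 2-head attention layer every 3rd WaveNet block, and Pre_Attention presumably
# condenses the speech prompt into 32 learned query tokens of size 512.
# Network_Prediction chooses whether the network predicts the clean latent
# ('Start') or the added noise ('Epsilon').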
Token_Path: 'F:/Datasets/22K.NaturalSpeech2.VCTK/Token.yaml'
Latent_Info_Path: 'F:/Datasets/22K.NaturalSpeech2.VCTK/Latent_Info.yaml'
Mel_Info_Path: 'F:/Datasets/22K.NaturalSpeech2.VCTK/Mel_Info.yaml'
F0_Info_Path: 'F:/Datasets/22K.NaturalSpeech2.VCTK/F0_Info.yaml'
Speaker_Info_Path: 'F:/Datasets/22K.NaturalSpeech2.VCTK/Speaker_Info.yaml'
Emotion_Info_Path: 'F:/Datasets/22K.NaturalSpeech2.VCTK/Emotion_Info.yaml'
Language_Info_Path: 'F:/Datasets/22K.NaturalSpeech2.VCTK/Language_Info.yaml'
Gender_Info_Path: 'F:/Datasets/22K.NaturalSpeech2.VCTK/Gender_Info.yaml'
Language_and_Gender_Info_by_Speaker_Path: 'F:/Datasets/22K.NaturalSpeech2.VCTK/Language_and_Gender_Info_by_Speaker.yaml'
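# The *_Info_Path entries above point to metadata YAMLs that are presumably generated
# by this repository's preprocessing step; the 'F:/...' locations are machine-specific
# and should be changed to wherever your own dataset patterns were written.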
Train:
    Pattern_Cache: true
    # Pattern_Cache: false
    Train_Pattern:
        Path: 'F:/Datasets/22K.NaturalSpeech2.VCTK/Train'
        Metadata_File: 'METADATA.PICKLE'
        Feature_Length:
            Min: 50
            Max: 800
        Text_Length:
            Min: 1
            Max: 200
        Accumulated_Dataset_Epoch: 1    # This prevents torch.utils.data.DataLoader from slowing down when the number of patterns is small.
        Augmentation_Ratio: 0.10
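    # Train_Pattern presumably restricts training samples to 50-800 feature frames and
    # 1-200 text tokens; Augmentation_Ratio: 0.10 presumably applies augmentation to about
    # 10% of the patterns (the augmentation itself is defined in the training code, not here).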
    Eval_Pattern:
        Path: 'F:/Datasets/22K.NaturalSpeech2.VCTK/Eval'
        Metadata_File: 'METADATA.PICKLE'
        Feature_Length:
            Min: 50
            Max: 800
        Text_Length:
            Min: 10
            Max: 200
    Num_Workers: 0
    Batch_Size: 16
    Segment_Size: 64
    Learning_Rate:
        Initial: 1.0e-4
        Warmup_Step: 4000
    Diffusion_Lambda: 10.0
    ADAM:
        Beta1: 0.9
        Beta2: 0.999
        Epsilon: 1.0e-7
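    # The learning rate starts at 1.0e-4 with a 4000-step warmup; Diffusion_Lambda
    # presumably weights the diffusion loss against the other loss terms. Note that
    # ADAM's Epsilon is 1.0e-7 rather than the PyTorch default of 1.0e-8.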
    Accumulated_Gradient_Step: 1    # 25
    Gradient_Norm: 0.0
    Max_Step: 5000000
    Checkpoint_Save_Interval: 5000
    Logging_Interval: 1
    Evaluation_Interval: 1000
    Inference_Interval: 5000
    Initial_Inference: true
    # Initial_Inference: false
    Inference_in_Train:
        Text: [
            'Do not kill the goose that lays the golden eggs.',
            'A good medicine tastes bitter.',
            'Do not count your chickens before they hatch.',
            'If you laugh, blessings will come your way.'
            ]
        Reference: [
            'Inference_Wav/p279_003_mic2.flac',
            'Inference_Wav/p280_006_mic1.flac',
            'Inference_Wav/p362_002_mic1.flac',
            'Inference_Wav/s5_004_mic2.flac'
            ]
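    # The Text and Reference lists above are presumably synthesized every
    # Inference_Interval steps as a training-time check; the Reference entries appear
    # to be VCTK recordings used as speech prompts.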
Speech_Prompt_Length: 250    # 50 frames == 1 second
Inference_Batch_Size: 16
Inference_Path: './results/VCTK_240107/Inference'
Checkpoint_Path: './results/VCTK_240107/Checkpoint'
Log_Path: './results/VCTK_240107/Log'
Weights_and_Biases:
    # Use: true
    Use: false
    Project: 'NaturalSpeech2'
    Entity: 'codejin'
    Name: 'VCTK_240107'
    Save_Checkpoint:
        Use: false
        Interval: 50000    # Unlike local storage, WandB storage capacity is limited.
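# Weights_and_Biases is disabled here. If enabled, Project, Entity, and Name should be
# changed to your own WandB project and account; Save_Checkpoint.Interval presumably
# controls how often checkpoints are uploaded, kept sparse because of the limited WandB storage.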
Use_Mixed_Precision: true # Don't use mixed precision in this model.
Use_Multi_GPU: false
Device: '0'
# Use_Multi_GPU: true
# Device: '0,1,2,3,4,5,6,7'