-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathHyper_Parameters.yaml
137 lines (128 loc) · 3.8 KB
/
Hyper_Parameters.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
Sound:
Spectrogram_Dim: 1025
Mel_Dim: 80
Frame_Length: 1024
Frame_Shift: 256
Sample_Rate: 24000
Mel_F_Min: 125
Mel_F_Max: 7600
Max_Abs_Mel: 4
Confidence_Threshold: 0.6
Gaussian_Smoothing_Sigma: 0.0
Pitch_Min: 100.0
Pitch_Max: 500.0
Use_Cython_Alignment: true # If true, model uses 'https://github.com/jaywalnut310/glow-tts/tree/master/monotonic_align'.
# Mode: 'PE' #Vanilla, SE, PE, GR
Mode: 'SE' #Vanilla, SE, PE, GR
Encoder:
Channels: 192 # Basic is 192, GR was 256
Embedding_Tokens: 35
Prenet: # Conv -> Norm -> ReLU -> Dropout
Kernel_Size: 5
Dropout_Rate: 0.5
Stacks: 3
Transformer: # Attention -> Dropout -> Norm -> Conv -> Relu -> Dropout -> Conv -> Dropout -> Norm
Attention:
Heads: 2
Window_Size: 4
Conv:
Kernel_Size: 3
Calc_Channels: 768 #Ch -> Calc_Ch -> Ch
Dropout_Rate: 0.1
Stacks: 6
Duration_Predictor:
Kernel_Size: 3
Channels: 256
Stacks: 2
Dropout_Rate: 0.1
Decoder:
Stack: 12
Num_Squeeze: 2
Num_Split: 4
Affine_Coupling:
Calc_Channels: 192
WaveNet:
Num_Layers: 4
Kernel_Size: 5
Dropout_Rate: 0.05
# SE or GR modes
Speaker_Embedding:
Type: 'LUT' #LUT, GE2E
Num_Speakers: 109 # If using SE mode, only lut uses. If using GR mode, always used.
Embedding_Size: 256
GE2E:
LSTM:
Sizes: 256
Stacks: 3
Inference:
Samples: 5
Slice_Length: 64
Overlap_Length: 32
Checkpoint_Path: './Speaker_Embedding/Example_Results/Checkpoint/S_100000.pkl'
# PE or GR modes
Prosody_Encoder:
Size: 256
Reference_Encoder:
Conv:
Kernel_Size: [3, 3, 3, 3, 3, 3]
Channels: [32, 32, 64, 64, 128, 128]
Strides: [2, 2, 2, 2, 2, 2]
GRU:
Size: 128
Stacks: 1
Style_Token:
Num_Tokens: 128 #10
Size: 256
Attention_Head: 4
Speaker_Classifier_GR:
Channels: [256]
# Unbalencing of
Token_Path: 'E:/24K.Pattern.LJVCTKLibri/Token.yaml'
Train:
Use_Pattern_Cache: true
Train_Pattern:
Path: 'C:/Pattern/24K.Pattern.LJVCTK/Train'
Metadata_File: 'METADATA.PICKLE'
Mel_Length:
Min: 50
Max: 1000
Text_Length:
Min: 10
Max: 200
Accumulated_Dataset_Epoch: 1 # This is to prevent slow down from torch.utils.data.DataLoader when the number of patterns is small.
Eval_Pattern:
Path: 'C:/Pattern/24K.Pattern.LJVCTK/Eval'
Metadata_File: 'METADATA.PICKLE'
Mel_Length:
Min: 50
Max: 1000
Text_Length:
Min: 10
Max: 200
Num_Workers: 4
Adversarial_Speaker_Weight: 0.0005
Batch_Size: 32 #16 did not work, but 32 did work. I recommend > 32.
Learning_Rate:
Initial: 1.0e-3
Base: 4000 # This is similar warmup step, but no warmup because of radam.
ADAM:
Beta1: 0.9
Beta2: 0.999
Epsilon: 1.0e-6
Weight_Decay: 1.0e-6
Gradient_Norm: 5.0
Max_Step: 400000
Checkpoint_Save_Interval: 1000
Logging_Interval: 100
Evaluation_Interval: 1000
Prosody_Check_Interval: 5000 #Only in PE and GR mode
Inference_Interval: 1000
Initial_Inference: false
# Inference_Pattern_File_in_Train: 'Inference_Text_for_GR_LUT_LJVCTK.txt'
Inference_Pattern_File_in_Train: 'Inference_Text_for_PE_LJVCTK.txt'
Inference_Batch_Size: null
Inference_Path: 'D:/GlowTTS.Results/SR24K.PE.LJVCTK/Inference'
Checkpoint_Path: 'D:/GlowTTS.Results/SR24K.PE.LJVCTK/Checkpoint'
Log_Path: 'D:/GlowTTS.Results/SR24K.PE.LJVCTK/Log'
Use_Mixed_Precision: false # apex is required.
Device: '0'