Hyper_Parameters.yaml

Sound:
    Spectrogram_Dim: 1025
    Mel_Dim: 80
    Frame_Length: 1024
    Frame_Shift: 256
    Sample_Rate: 24000
    Mel_F_Min: 125
    Mel_F_Max: 7600
    Max_Abs_Mel: 4
    Confidence_Threshold: 0.6
    Gaussian_Smoothing_Sigma: 0.0
    Pitch_Min: 100.0
    Pitch_Max: 500.0

Use_Cython_Alignment: true  # If true, model uses 'https://github.com/jaywalnut310/glow-tts/tree/master/monotonic_align'.

# Mode: 'PE'    #Vanilla, SE, PE, GR
Mode: 'SE'    #Vanilla, SE, PE, GR

Encoder:
    Channels: 192   # Basic is 192, GR was 256
    Embedding_Tokens: 35
    Prenet: # Conv -> Norm -> ReLU -> Dropout
        Kernel_Size: 5
        Dropout_Rate: 0.5
        Stacks: 3
    Transformer:    # Attention -> Dropout -> Norm -> Conv -> Relu ->  Dropout -> Conv -> Dropout -> Norm
        Attention:
            Heads: 2
            Window_Size: 4
        Conv:
            Kernel_Size: 3
            Calc_Channels: 768   #Ch -> Calc_Ch -> Ch
        Dropout_Rate: 0.1
        Stacks: 6
    Duration_Predictor:
        Kernel_Size: 3
        Channels: 256
        Stacks: 2
        Dropout_Rate: 0.1

Decoder:
    Stack: 12
    Num_Squeeze: 2
    Num_Split: 4
    Affine_Coupling:
        Calc_Channels: 192
        WaveNet:
            Num_Layers: 4
            Kernel_Size: 5
            Dropout_Rate: 0.05

# SE or GR modes
Speaker_Embedding:
    Type: 'LUT'    #LUT, GE2E
    Num_Speakers: 109  # If using SE mode, only lut uses. If using GR mode, always used.
    Embedding_Size: 256
    GE2E:
        LSTM:
            Sizes: 256
            Stacks: 3
        Inference:
            Samples: 5
            Slice_Length: 64
            Overlap_Length: 32
        Checkpoint_Path: './Speaker_Embedding/Example_Results/Checkpoint/S_100000.pkl'

# PE or GR modes
Prosody_Encoder:
    Size: 256
    Reference_Encoder:
        Conv:
            Kernel_Size: [3, 3, 3, 3, 3, 3]
            Channels: [32, 32, 64, 64, 128, 128]
            Strides: [2, 2, 2, 2, 2, 2]
        GRU:
            Size: 128
            Stacks: 1
    Style_Token:
        Num_Tokens: 128 #10
        Size: 256
        Attention_Head: 4

Speaker_Classifier_GR:
    Channels: [256]

# Unbalencing of
Token_Path: 'E:/24K.Pattern.LJVCTKLibri/Token.yaml'
Train:
    Use_Pattern_Cache: true
    Train_Pattern:
        Path: 'C:/Pattern/24K.Pattern.LJVCTK/Train'
        Metadata_File: 'METADATA.PICKLE'
        Mel_Length:
            Min: 50
            Max: 1000
        Text_Length:
            Min: 10
            Max: 200
        Accumulated_Dataset_Epoch: 1   # This is to prevent slow down from torch.utils.data.DataLoader when the number of patterns is small.
    Eval_Pattern:
        Path: 'C:/Pattern/24K.Pattern.LJVCTK/Eval'
        Metadata_File: 'METADATA.PICKLE'
        Mel_Length:
            Min: 50
            Max: 1000
        Text_Length:
            Min: 10
            Max: 200
    Num_Workers: 4
    Adversarial_Speaker_Weight: 0.0005
    Batch_Size: 32  #16 did not work, but 32 did work. I recommend > 32.
    Learning_Rate:
        Initial: 1.0e-3
        Base: 4000     # This is similar warmup step, but no warmup because of radam.
    ADAM:
        Beta1: 0.9
        Beta2: 0.999
        Epsilon: 1.0e-6
    Weight_Decay: 1.0e-6
    Gradient_Norm: 5.0
    Max_Step: 400000
    Checkpoint_Save_Interval: 1000
    Logging_Interval: 100
    Evaluation_Interval: 1000
    Prosody_Check_Interval: 5000   #Only in PE and GR mode
    Inference_Interval: 1000
    Initial_Inference: false
    # Inference_Pattern_File_in_Train: 'Inference_Text_for_GR_LUT_LJVCTK.txt'
    Inference_Pattern_File_in_Train: 'Inference_Text_for_PE_LJVCTK.txt'

Inference_Batch_Size: null
Inference_Path: 'D:/GlowTTS.Results/SR24K.PE.LJVCTK/Inference'
Checkpoint_Path: 'D:/GlowTTS.Results/SR24K.PE.LJVCTK/Checkpoint'
Log_Path: 'D:/GlowTTS.Results/SR24K.PE.LJVCTK/Log'
Use_Mixed_Precision: false  # apex is required.
Device: '0'