[SETTINGS]
# Set to True if you don't want to translate the subtitles. If so, ignore the language variables
skip_translation = False
# Set to True if you don't want to synthesize the audio. For example, if you've already done so and are testing
skip_synthesize = False
# Set to True if you want to stop the program after translating the subtitles.
# For example, if you want to manually review the resulting subtitles before synthesizing the audio.
# Note that to resume the process, you must set this to False again and set skip_translation to True
stop_after_translation = False
# The BCP-47 language code for the original text language
original_language = en-US
# Applies to DeepL translations only - Whether to have it use more or less formal language
# Possible Values: default | more | less
formality_preference = default
# The format/codec of the final audio file
# Possible Values: mp3 | aac | wav
output_format = aac
# Must be a codec from the 'Supported Audio Encodings' section here: https://cloud.google.com/speech-to-text/docs/encoding#audio-encodings
# This determines the codec returned by the API, not the one produced by the program! You probably shouldn't change this; the program might not work otherwise
synth_audio_encoding = MP3
# Enter the native sample rate of the voice audio provided by the TTS service
# This is usually 24 kHz (24000), but some services like Azure offer higher quality audio at 48 kHz (48000)
# Enter digits only - no commas or units
synth_sample_rate = 24000
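# Illustrative alternative (an assumption - only valid if your chosen voice actually supports 48 kHz output, as some Azure voices do):
# synth_sample_rate = 48000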
# This will drastically improve the quality of the final result, BUT see the note below
# Note! When set to True, instead of just stretching the audio clips, the program will have the API generate new audio clips with adjusted speaking rates on a second pass
# This can't be done on the first pass because the length of each audio clip isn't known until it has been generated
two_pass_voice_synth = True
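# Illustrative walkthrough (hypothetical numbers): if a subtitle slot is 2.0 seconds but the first-pass clip
# comes out at 2.5 seconds, the second pass re-synthesizes the same text at roughly a 2.5 / 2.0 = 1.25x
# speaking rate, so the new clip lands near 2.0 seconds naturally instead of being time-stretched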
# Allows you to choose which local audio time-stretching tool to use. FFMPEG actually seems to be better than Rubberband in my experience
# Possible Values: ffmpeg | rubberband
local_audio_stretch_method = ffmpeg
# On the second pass, each audio clip will be extremely close to the desired length, but a bit off
# Set this to True if you want to stretch the second-pass clips anyway so they are exact, down to the millisecond
# However, this will degrade the voice and make it sound similar to a single-pass result
force_stretch_with_twopass = False
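# Illustrative example (hypothetical numbers): a second-pass clip of 2012 ms aimed at a 2000 ms slot would be
# stretched locally by a factor of 2000 / 2012 (about 0.994) to hit the exact duration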
# Forces the audio clips to be stretched locally even if the TTS service allows specifying an exact duration
# Useful when a TTS service like Azure is creating clips of incorrect length, or when certain voices don't support exact lengths
# Possible Values: True | False (Default)
force_always_stretch = False
# Azure Only: Sets the exact pause, in milliseconds, that the TTS voice will insert after a period between sentences
# Set it to "default" to keep the service's default, which is quite slow. I find 80ms is pretty good
# Note: Changing this from default adds about 60 characters per line to the total Azure character usage count
# Possible values: default | Any integer
azure_sentence_pause = 80
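# Rough cost example (assuming the ~60 extra characters per line noted above): a 500-line subtitle file
# would add about 500 * 60 = 30,000 characters to the Azure usage count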
# Azure Only: Sets the exact pause, in milliseconds, that the TTS voice will insert after a comma.
# Set it to "default" to keep the service's default, which is quite slow.
# It doesn't seem to follow this number exactly, and seems to have a minimum around 50ms
# Note: Changing this from default adds about 60 characters per line to the total Azure character usage count
# Possible values: default | Any integer
azure_comma_pause = 50
# Adds a silence buffer around each spoken clip, but keeps the speech "centered" at the right spot so it stays synced
# > To be clear, the total length of the audio file will remain the same; each spoken clip gets shrunk within its slot
# Useful if your subtitle file butts all the beginning and end timings right up against each other
# Note: this applies both before and after each clip, so the total extra silence between clips will be 2x this value
# Warning: setting this too high could result in the TTS speaking extremely fast to fit into the remaining clip duration
# > Around 25 - 50 milliseconds is a good starting point
add_line_buffer_milliseconds = 0
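# Worked example (hypothetical timings): with a 50 ms buffer, a clip slotted from 10.000s to 12.000s keeps its
# full 2000 ms slot, but the speech within it is shrunk to about 1900 ms and centered, leaving ~50 ms of
# silence at each end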
# If the combined character count of two adjacent subtitle lines is below this amount, and one starts at the same time the other ends, the lines will be combined
# This should improve the speech synthesis by reducing unnatural splits in spoken sentences.
# Setting this to zero or a low number will effectively disable it
combine_subtitles_max_chars = 200
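# Example (hypothetical lines): a 90-character line ending at 5.000s and a 100-character line starting at
# 5.000s total 190 characters, which is under the 200 limit, so they would be combined into one line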
# If this setting is True and the TTS voice would otherwise speak extremely fast for a specific audio segment,
# the script will allow a larger maximum number of characters per subtitle line than the limit set in combine_subtitles_max_chars
# Possible Values: True | False (Default)
increase_max_chars_for_extreme_speeds = False
# This determines the largest gap between subtitle line timestamps where the script will consider combining them
# Setting this too low will result in more fragmented TTS speech, and setting it too high could affect how well the TTS stays in sync with the video in some parts.
# Note: This won't matter if your subtitles have all the timestamps butting right up against each other
# Default: 200
subtitle_gap_threshold_milliseconds = 200
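# Example (hypothetical timings): with the default 200, a line ending at 5.000s and the next starting at
# 5.150s (a 150 ms gap) are candidates for combining, while a next line starting at 5.400s is not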
# If this setting is true, when combining subtitle lines, it will prioritize starting and ending text lines at natural sentence breaks.
# This will result in fewer awkward fragmented speech patterns by the TTS voice, at the cost of some text possibly being read faster or slower than desired.
# Possible Values: True | False
prioritize_avoiding_fragmented_speech = True
# If this is set to Auto, the script will automatically calculate the TTS speech rate goal, which determines how subtitle lines are combined
# You can probably keep this set to Auto unless you have issues with slow TTS speech, such as in videos with long sections of no dialogue
# You can also set it manually to any integer, but 20 is probably around where you want it.
# The number is the amount of characters per second the combining algorithm will TRY to achieve; it does not set the actual speaking speed.
# Possible Values: Auto (Default) | Any integer (20 is a good starting point)
speech_rate_goal = Auto
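# Worked example (hypothetical manual setting): at a goal of 20 characters per second, a combined line
# spanning a 3-second slot would be targeted at roughly 20 * 3 = 60 characters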
# Primarily prevents the program from deleting files in the working directory, and also generates files for each audio step
debug_mode = False
# ---------------- Probably should not touch these below --------------------
# List of languages for which YouTube is known to allow auto-syncing when uploading a transcript
youtube_autosync_languages = es, it, ko, pt, ru, ja, de