Merge pull request #62 from uezo/develop

Make it possible to select the TTS engine when adding a voice
uezo · Jun 27, 2020 · 6588938 · 6588938
2 parents 20be5fa + 38fc5de
commit 6588938
Show file tree

Hide file tree

Showing 7 changed files with 98 additions and 35 deletions.
diff --git a/ChatdollKit/Scripts/Model/AnimatedVoice.cs b/ChatdollKit/Scripts/Model/AnimatedVoice.cs
@@ -17,19 +17,19 @@ public AnimatedVoice(List<Voice> voices = null, Dictionary<string, List<Animatio
             Faces = faces ?? new List<FaceExpression>();
         }
 
-        public void AddVoice(string name, float preGap = 0.0f, float postGap = 0.0f, string text = null, string url = null, Dictionary<string, string> ttsOptions = null, VoiceSource source = VoiceSource.Local)
+        public void AddVoice(string name, float preGap = 0.0f, float postGap = 0.0f, string text = null, string url = null, TTSConfiguration ttsConfig = null, VoiceSource source = VoiceSource.Local)
         {
-            Voices.Add(new Voice(name, preGap, postGap, text, url, ttsOptions, source));
+            Voices.Add(new Voice(name, preGap, postGap, text, url, ttsConfig, source));
         }
 
         public void AddVoiceWeb(string url, float preGap = 0.0f, float postGap = 0.0f, string name = null, string text = null)
         {
             Voices.Add(new Voice(name ?? string.Empty, preGap, postGap, text, url, null, VoiceSource.Web));
         }
 
-        public void AddVoiceTTS(string text, float preGap = 0.0f, float postGap = 0.0f, string name = null, Dictionary<string, string> ttsOptions = null)
+        public void AddVoiceTTS(string text, float preGap = 0.0f, float postGap = 0.0f, string name = null, TTSConfiguration ttsConfig = null)
         {
-            Voices.Add(new Voice(name ?? string.Empty, preGap, postGap, text, string.Empty, ttsOptions, VoiceSource.TTS));
+            Voices.Add(new Voice(name ?? string.Empty, preGap, postGap, text, string.Empty, ttsConfig, VoiceSource.TTS));
         }
 
         public void AddAnimation(string name, string layerName = null, float duration = 0.0f, float fadeLength = -1.0f, float weight = 1.0f, float preGap = 0.0f, string description = null)

diff --git a/ChatdollKit/Scripts/Model/AnimatedVoiceRequest.cs b/ChatdollKit/Scripts/Model/AnimatedVoiceRequest.cs
@@ -60,13 +60,13 @@ public void AddVoiceWeb(string url, float preGap = 0.0f, float postGap = 0.0f, s
             AnimatedVoices.Last().AddVoiceWeb(url, preGap, postGap, name, text);
         }
 
-        public void AddVoiceTTS(string text, float preGap = 0.0f, float postGap = 0.0f, string name = null, Dictionary<string, string> ttsOptions = null, bool asNewFrame = false)
+        public void AddVoiceTTS(string text, float preGap = 0.0f, float postGap = 0.0f, string name = null, TTSConfiguration ttsConfig = null, bool asNewFrame = false)
         {
             if (asNewFrame || AnimatedVoices.Count == 0)
             {
                 CreateNewFrame();
             }
-            AnimatedVoices.Last().AddVoiceTTS(text, preGap, postGap, name, ttsOptions);
+            AnimatedVoices.Last().AddVoiceTTS(text, preGap, postGap, name, ttsConfig);
         }
 
         public void AddAnimation(string name, float duration = 0.0f, float fadeLength = -1.0f, float weight = 1.0f, float preGap = 0.0f, string description = null, bool asNewFrame = false)

diff --git a/ChatdollKit/Scripts/Model/ModelController.cs b/ChatdollKit/Scripts/Model/ModelController.cs
@@ -19,6 +19,7 @@ public class ModelController : MonoBehaviour
         private Dictionary<string, AudioClip> voices = new Dictionary<string, AudioClip>();
         public Func<Voice, Task<AudioClip>> VoiceDownloadFunc;
         public Func<Voice, Task<AudioClip>> TextToSpeechFunc;
+        public Dictionary<string, Func<Voice, Task<AudioClip>>> TextToSpeechFunctions = new Dictionary<string, Func<Voice, Task<AudioClip>>>();
         public bool UsePrefetch = true;
 
         // Animation
@@ -285,7 +286,8 @@ public async Task Say(VoiceRequest request, CancellationToken token)
                     }
                     else if (v.Source == VoiceSource.TTS)
                     {
-                        clip = await TextToSpeechFunc?.Invoke(v);
+                        var ttsFunc = GetTTSFunction(v.GetTTSFunctionName());
+                        clip = await ttsFunc?.Invoke(v);
                     }
 
                     if (clip != null)
@@ -351,6 +353,26 @@ public void AddVoice(string name, AudioClip audioClip)
             voices[ReplaceDakuten(name)] = audioClip;
         }
 
+        /// <summary>
+        /// Looks up a TTS function registered in <see cref="TextToSpeechFunctions"/> by name.
+        /// </summary>
+        /// <param name="name">Registered name of the TTS function; may be null or empty.</param>
+        /// <returns>
+        /// The function registered under <paramref name="name"/>, or <see cref="TextToSpeechFunc"/>
+        /// (which may itself be null) when the name is null, empty, or not registered.
+        /// </returns>
+        public Func<Voice, Task<AudioClip>> GetTTSFunction(string name)
+        {
+            // Single dictionary lookup instead of ContainsKey followed by the indexer
+            Func<Voice, Task<AudioClip>> registered;
+            if (!string.IsNullOrEmpty(name) && TextToSpeechFunctions.TryGetValue(name, out registered))
+            {
+                return registered;
+            }
+            return TextToSpeechFunc;
+        }
+
+        /// <summary>
+        /// Stores a TTS function in <see cref="TextToSpeechFunctions"/> under the given name,
+        /// replacing any function already stored under that name.
+        /// </summary>
+        /// <param name="name">Key the function is stored under.</param>
+        /// <param name="func">Function that produces an AudioClip for a Voice.</param>
+        /// <param name="asDefault">When true, the function is also assigned to <see cref="TextToSpeechFunc"/>.</param>
+        public void RegisterTTSFunction(string name, Func<Voice, Task<AudioClip>> func, bool asDefault = false)
+        {
+            TextToSpeechFunctions[name] = func;
+
+            if (!asDefault)
+            {
+                return;
+            }
+            TextToSpeechFunc = func;
+        }
+
         // Replace Japanese Dakuten from resource files
         public string ReplaceDakuten(string value)
         {

diff --git a/ChatdollKit/Scripts/Model/Voice.cs b/ChatdollKit/Scripts/Model/Voice.cs
@@ -15,27 +15,68 @@ public class Voice
         public float PostGap { get; set; }
         public string Text { get; set; }
         public string Url { get; set; }
-        public Dictionary<string, string> TTSOptions { get; set; }
+        public TTSConfiguration TTSConfig { get; set; }
         public VoiceSource Source { get; set; }
 
-        public Voice(string name, float preGap, float postGap, string text, string url, Dictionary<string, string> ttsOptions, VoiceSource source)
+        public Voice(string name, float preGap, float postGap, string text, string url, TTSConfiguration ttsConfig, VoiceSource source)
         {
             Name = name;
             PreGap = preGap;
             PostGap = postGap;
             Text = text;
             Url = url;
-            TTSOptions = ttsOptions;
+            TTSConfig = ttsConfig;
             Source = source;
         }
 
-        public string GetTTSOption(string key)
+        public object GetTTSParam(string key)
         {
-            if (TTSOptions != null && TTSOptions.ContainsKey(key))
+            if (TTSConfig != null)
             {
-                return TTSOptions[key];
+                return TTSConfig.GetParam(key);
             }
             return null;
         }
+
+        /// <summary>
+        /// Returns the TTS function name carried by <see cref="TTSConfig"/>.
+        /// </summary>
+        /// <returns>
+        /// <c>TTSConfig.TTSFunctionName</c> when a configuration is attached; otherwise an empty string.
+        /// </returns>
+        public string GetTTSFunctionName()
+        {
+            return TTSConfig == null ? string.Empty : TTSConfig.TTSFunctionName;
+        }
+    }
+
+    /// <summary>
+    /// Per-voice TTS settings: the name of the TTS function to use and free-form parameters.
+    /// </summary>
+    public class TTSConfiguration
+    {
+        /// <summary>Name of the TTS function to use.</summary>
+        public string TTSFunctionName { get; set; }
+
+        /// <summary>Parameters keyed by name; the dictionary is created by the constructors.</summary>
+        public Dictionary<string, object> Params { get; }
+
+        /// <summary>
+        /// Creates a configuration with an empty function name and no parameters.
+        /// </summary>
+        // Chains to the other constructor so that `new TTSConfiguration()` also ends up with
+        // string.Empty (not null) as TTSFunctionName, matching `new TTSConfiguration(null)`.
+        public TTSConfiguration() : this(null)
+        {
+        }
+
+        /// <summary>
+        /// Creates a configuration for the named TTS function with no parameters.
+        /// </summary>
+        /// <param name="ttsFunctionName">TTS function name; null is stored as an empty string.</param>
+        public TTSConfiguration(string ttsFunctionName = null)
+        {
+            TTSFunctionName = ttsFunctionName ?? string.Empty;
+            Params = new Dictionary<string, object>();
+        }
+
+        /// <summary>
+        /// Returns the parameter stored under <paramref name="key"/>.
+        /// </summary>
+        /// <param name="key">Parameter name.</param>
+        /// <returns>The stored value, or null when no value is stored under the key.</returns>
+        public object GetParam(string key)
+        {
+            // Single dictionary lookup instead of ContainsKey followed by the indexer
+            object value;
+            return Params.TryGetValue(key, out value) ? value : null;
+        }
+    }
 }
diff --git a/ChatdollKit/Scripts/Model/VoiceRequest.cs b/ChatdollKit/Scripts/Model/VoiceRequest.cs
@@ -23,19 +23,19 @@ public VoiceRequest(params string[] voiceNames) : this()
             }
         }
 
-        public void AddVoice(string name, float preGap = 0.0f, float postGap = 0.0f, string text = null, string url = null, Dictionary<string, string> ttsOptions = null, VoiceSource source = VoiceSource.Local)
+        public void AddVoice(string name, float preGap = 0.0f, float postGap = 0.0f, string text = null, string url = null, TTSConfiguration ttsConfig = null, VoiceSource source = VoiceSource.Local)
         {
-            Voices.Add(new Voice(name, preGap, postGap, text, url, ttsOptions, source));
+            Voices.Add(new Voice(name, preGap, postGap, text, url, ttsConfig, source));
         }
 
         public void AddVoiceWeb(string url, float preGap = 0.0f, float postGap = 0.0f, string name = null, string text = null)
         {
             Voices.Add(new Voice(name ?? string.Empty, preGap, postGap, text, url, null, VoiceSource.Web));
         }
 
-        public void AddVoiceTTS(string text, float preGap = 0.0f, float postGap = 0.0f, string name = null, Dictionary<string, string> ttsOptions = null)
+        public void AddVoiceTTS(string text, float preGap = 0.0f, float postGap = 0.0f, string name = null, TTSConfiguration ttsConfig = null)
         {
-            Voices.Add(new Voice(name ?? string.Empty, preGap, postGap, text, string.Empty, ttsOptions, VoiceSource.TTS));
+            Voices.Add(new Voice(name ?? string.Empty, preGap, postGap, text, string.Empty, ttsConfig, VoiceSource.TTS));
         }
     }
 }
diff --git a/Extension/AzureTTSLoader.cs b/Extension/AzureTTSLoader.cs
@@ -40,9 +40,9 @@ protected override async Task<AudioClip> DownloadAudioClipAsync(Voice voice)
                 www.SetRequestHeader("Ocp-Apim-Subscription-Key", ApiKey);
 
                 // Body
-                var ttsLanguage = voice.GetTTSOption("language") ?? Language;
-                var ttsGender = voice.GetTTSOption("gender") ?? Gender;
-                var ttsSpeakerName = voice.GetTTSOption("speakerName") ?? SpeakerName;
+                var ttsLanguage = voice.GetTTSParam("language") as string ?? Language;
+                var ttsGender = voice.GetTTSParam("gender") as string ?? Gender;
+                var ttsSpeakerName = voice.GetTTSParam("speakerName") as string ?? SpeakerName;
                 var text = $"<speak version='1.0' xml:lang='{ttsLanguage}'><voice xml:lang='{ttsLanguage}' xml:gender='{ttsGender}' name='{ttsSpeakerName}'>{voice.Text}</voice></speak>";
                 www.uploadHandler = new UploadHandlerRaw(System.Text.Encoding.UTF8.GetBytes(text));
 

diff --git a/Extension/VoiceroidTTSLoader.cs b/Extension/VoiceroidTTSLoader.cs
@@ -67,35 +67,35 @@ class VoiceroidRequest
             public VoiceroidRequest(Voice voice)
             {
+                // TTS parameters are stored as object, so a numeric value may arrive boxed as
+                // int or double (or as a string). Convert instead of unboxing with (float),
+                // which throws InvalidCastException for anything that is not a boxed float.
                 Text = voice.Text;
-                Kana = voice.GetTTSOption("Kana");
+                Kana = (string)voice.GetTTSParam("Kana");
                 Speaker = new Dictionary<string, float>();
-                if (!string.IsNullOrEmpty(voice.GetTTSOption("Volume")))
+                if (voice.GetTTSParam("Volume") != null)
                 {
-                    Speaker["Volume"] = float.Parse(voice.GetTTSOption("Volume"));
+                    Speaker["Volume"] = System.Convert.ToSingle(voice.GetTTSParam("Volume"), System.Globalization.CultureInfo.InvariantCulture);
                 }
-                if (!string.IsNullOrEmpty(voice.GetTTSOption("Speed")))
+                if (voice.GetTTSParam("Speed") != null)
                 {
-                    Speaker["Speed"] = float.Parse(voice.GetTTSOption("Speed"));
+                    Speaker["Speed"] = System.Convert.ToSingle(voice.GetTTSParam("Speed"), System.Globalization.CultureInfo.InvariantCulture);
                 }
-                if (!string.IsNullOrEmpty(voice.GetTTSOption("Pitch")))
+                if (voice.GetTTSParam("Pitch") != null)
                 {
-                    Speaker["Pitch"] = float.Parse(voice.GetTTSOption("Pitch"));
+                    Speaker["Pitch"] = System.Convert.ToSingle(voice.GetTTSParam("Pitch"), System.Globalization.CultureInfo.InvariantCulture);
                 }
-                if (!string.IsNullOrEmpty(voice.GetTTSOption("Emphasis")))
+                if (voice.GetTTSParam("Emphasis") != null)
                 {
-                    Speaker["Emphasis"] = float.Parse(voice.GetTTSOption("Emphasis"));
+                    Speaker["Emphasis"] = System.Convert.ToSingle(voice.GetTTSParam("Emphasis"), System.Globalization.CultureInfo.InvariantCulture);
                 }
-                if (!string.IsNullOrEmpty(voice.GetTTSOption("PauseMiddle")))
+                if (voice.GetTTSParam("PauseMiddle") != null)
                 {
-                    Speaker["PauseMiddle"] = float.Parse(voice.GetTTSOption("PauseMiddle"));
+                    Speaker["PauseMiddle"] = System.Convert.ToSingle(voice.GetTTSParam("PauseMiddle"), System.Globalization.CultureInfo.InvariantCulture);
                 }
-                if (!string.IsNullOrEmpty(voice.GetTTSOption("PauseLong")))
+                if (voice.GetTTSParam("PauseLong") != null)
                 {
-                    Speaker["PauseLong"] = float.Parse(voice.GetTTSOption("PauseLong"));
+                    Speaker["PauseLong"] = System.Convert.ToSingle(voice.GetTTSParam("PauseLong"), System.Globalization.CultureInfo.InvariantCulture);
                 }
-                if (!string.IsNullOrEmpty(voice.GetTTSOption("PauseSentence")))
+                if (voice.GetTTSParam("PauseSentence") != null)
                 {
-                    Speaker["PauseSentence"] = float.Parse(voice.GetTTSOption("PauseSentence"));
+                    Speaker["PauseSentence"] = System.Convert.ToSingle(voice.GetTTSParam("PauseSentence"), System.Globalization.CultureInfo.InvariantCulture);
                 }
             }
         }