VOICEVOX · Hiroshiba · Oct 22, 2023 · Sep 6, 2023 · Sep 6, 2023 · Sep 6, 2023
@@ -82,6 +82,22 @@
     get_save_dir,
 )
 
+import warnings
+
+
+def id_checker(style_id: Optional[int], speaker_id: Optional[int]) -> int:
+    """
+    Return the effective style ID; exactly one of the two IDs must be given.
+    Raises ValueError if both are None or both are set; warns on speaker_id.
+    """
+    # Equal "is None" results mean both IDs are missing or both are present.
+    if (style_id is None) == (speaker_id is None):
+        raise ValueError("speaker_idとstyle_idが両方とも存在しないか、両方とも存在しています")
+    if speaker_id is not None:
+        warnings.warn("style_idに変更になりましたのこちらの利用を推奨しています")
+        return speaker_id
+    return style_id
+
 
 def b64encode_str(s):
     return base64.b64encode(s).decode("utf-8")
@@ -228,7 +244,7 @@ def audio_query(text: str, speaker: int, core_version: Optional[str] = None):
         クエリの初期値を得ます。ここで得られたクエリはそのまま音声合成に利用できます。各値の意味は`Schemas`を参照してください。
         """
         engine = get_engine(core_version)
-        accent_phrases = engine.create_accent_phrases(text, speaker_id=speaker)
+        accent_phrases = engine.create_accent_phrases(text, style_id=speaker)
         return AudioQuery(
             accent_phrases=accent_phrases,
             speedScale=1,
@@ -267,7 +283,7 @@ def audio_query_from_preset(
             raise HTTPException(status_code=422, detail="該当するプリセットIDが見つかりません")
 
         accent_phrases = engine.create_accent_phrases(
-            text, speaker_id=selected_preset.style_id
+            text, style_id=selected_preset.style_id
         )
         return AudioQuery(
             accent_phrases=accent_phrases,
@@ -319,12 +335,12 @@ def accent_phrases(
                     detail=ParseKanaBadRequest(err).dict(),
                 )
             accent_phrases = engine.replace_mora_data(
-                accent_phrases=accent_phrases, speaker_id=speaker
+                accent_phrases=accent_phrases, style_id=speaker
             )
 
             return accent_phrases
         else:
-            return engine.create_accent_phrases(text, speaker_id=speaker)
+            return engine.create_accent_phrases(text, style_id=speaker)
 
     @app.post(
         "/mora_data",
@@ -338,7 +354,7 @@ def mora_data(
         core_version: Optional[str] = None,
     ):
         engine = get_engine(core_version)
-        return engine.replace_mora_data(accent_phrases, speaker_id=speaker)
+        return engine.replace_mora_data(accent_phrases, style_id=speaker)
 
     @app.post(
         "/mora_length",
@@ -353,7 +369,7 @@ def mora_length(
     ):
         engine = get_engine(core_version)
         return engine.replace_phoneme_length(
-            accent_phrases=accent_phrases, speaker_id=speaker
+            accent_phrases=accent_phrases, style_id=speaker
         )
 
     @app.post(
@@ -369,7 +385,7 @@ def mora_pitch(
     ):
         engine = get_engine(core_version)
         return engine.replace_mora_pitch(
-            accent_phrases=accent_phrases, speaker_id=speaker
+            accent_phrases=accent_phrases, style_id=speaker
         )
 
     @app.post(
@@ -397,7 +413,7 @@ def synthesis(
         engine = get_engine(core_version)
         wave = engine.synthesis(
             query=query,
-            speaker_id=speaker,
+            style_id=speaker,
             enable_interrogative_upspeak=enable_interrogative_upspeak,
         )
 
@@ -438,7 +454,7 @@ def cancellable_synthesis(
             )
         f_name = cancellable_engine._synthesis_impl(
             query=query,
-            speaker_id=speaker,
+            style_id=speaker,
             request=request,
             core_version=core_version,
         )
@@ -475,19 +491,15 @@ def multi_synthesis(
         sampling_rate = queries[0].outputSamplingRate
 
         with NamedTemporaryFile(delete=False) as f:
-
             with zipfile.ZipFile(f, mode="a") as zip_file:
-
                 for i in range(len(queries)):
-
                     if queries[i].outputSamplingRate != sampling_rate:
                         raise HTTPException(
                             status_code=422, detail="サンプリングレートが異なるクエリがあります"
                         )
 
                     with TemporaryFile() as wav_file:
-
-                        wave = engine.synthesis(query=queries[i], speaker_id=speaker)
+                        wave = engine.synthesis(query=queries[i], style_id=speaker)
                         soundfile.write(
                             file=wav_file,
                             data=wave,
@@ -893,17 +905,17 @@ def initialize_speaker(
         core_version: Optional[str] = None,
     ):
         """
-        指定されたspeaker_idの話者を初期化します。
+        指定されたstyle_idの話者を初期化します。
         実行しなくても他のAPIは使用できますが、初回実行時に時間がかかることがあります。
         """
         engine = get_engine(core_version)
-        engine.initialize_speaker_synthesis(speaker_id=speaker, skip_reinit=skip_reinit)
+        engine.initialize_speaker_synthesis(style_id=speaker, skip_reinit=skip_reinit)
         return Response(status_code=204)
 
     @app.get("/is_initialized_speaker", response_model=bool, tags=["その他"])
     def is_initialized_speaker(speaker: int, core_version: Optional[str] = None):
         """
-        指定されたspeaker_idの話者が初期化されているかどうかを返します。
+        指定されたstyle_idの話者が初期化されているかどうかを返します。
         """
         engine = get_engine(core_version)
         return engine.is_initialized_speaker_synthesis(speaker)

@@ -108,7 +108,7 @@ def test_replace_phoneme_length(self):
         self.assertEqual(
             self.engine.replace_phoneme_length(
                 accent_phrases=self.accent_phrases_hello_hiho,
-                speaker_id=0,
+                style_id=0,
             ),
             self.accent_phrases_hello_hiho,
         )
@@ -117,7 +117,7 @@ def test_replace_mora_pitch(self):
         self.assertEqual(
             self.engine.replace_mora_pitch(
                 accent_phrases=self.accent_phrases_hello_hiho,
-                speaker_id=0,
+                style_id=0,
             ),
             self.accent_phrases_hello_hiho,
         )
@@ -136,5 +136,5 @@ def test_synthesis(self):
                 outputStereo=False,
                 kana=create_kana(self.accent_phrases_hello_hiho),
             ),
-            speaker_id=0,
+            style_id=0,
         )
@@ -22,11 +22,11 @@
 )
 
 
-def yukarin_s_mock(length: int, phoneme_list: numpy.ndarray, speaker_id: numpy.ndarray):
+def yukarin_s_mock(length: int, phoneme_list: numpy.ndarray, style_id: numpy.ndarray):
     result = []
     # mockとしての適当な処理、特に意味はない
     for i in range(length):
-        result.append(float(phoneme_list[i] * 0.5 + speaker_id))
+        result.append(float(phoneme_list[i] * 0.5 + style_id))
     return numpy.array(result)
 
 
@@ -38,7 +38,7 @@ def yukarin_sa_mock(
     end_accent_list: numpy.ndarray,
     start_accent_phrase_list: numpy.ndarray,
     end_accent_phrase_list: numpy.ndarray,
-    speaker_id: numpy.ndarray,
+    style_id: numpy.ndarray,
 ):
     result = []
     # mockとしての適当な処理、特に意味はない
@@ -54,7 +54,7 @@ def yukarin_sa_mock(
                     + end_accent_phrase_list[0][i]
                 )
                 * 0.5
-                + speaker_id
+                + style_id
             )
         )
     return numpy.array(result)[numpy.newaxis]
@@ -65,7 +65,7 @@ def decode_mock(
     phoneme_size: int,
     f0: numpy.ndarray,
     phoneme: numpy.ndarray,
-    speaker_id: Union[numpy.ndarray, int],
+    style_id: Union[numpy.ndarray, int],
 ):
     result = []
     # mockとしての適当な処理、特に意味はない
@@ -75,7 +75,7 @@ def decode_mock(
             result.append(
                 float(
                     f0[i][0] * (numpy.where(phoneme[i] == 1)[0] / phoneme_size)
-                    + speaker_id
+                    + style_id
                 )
             )
     return numpy.array(result)
@@ -92,7 +92,7 @@ def metas(self):
     def supported_devices(self):
         return ""
 
-    def is_model_loaded(self, speaker_id):
+    def is_model_loaded(self, style_id):
         return True
 
 
@@ -303,7 +303,7 @@ def test_pre_process(self):
 
     def test_replace_phoneme_length(self):
         result = self.synthesis_engine.replace_phoneme_length(
-            accent_phrases=deepcopy(self.accent_phrases_hello_hiho), speaker_id=1
+            accent_phrases=deepcopy(self.accent_phrases_hello_hiho), style_id=1
         )
 
         # yukarin_sに渡される値の検証
@@ -340,7 +340,7 @@ def test_replace_phoneme_length(self):
                 dtype=numpy.int64,
             ),
         )
-        self.assertEqual(yukarin_s_args["speaker_id"], 1)
+        self.assertEqual(yukarin_s_args["style_id"], 1)
 
         # flatten_morasを使わずに愚直にaccent_phrasesにデータを反映させてみる
         true_result = deepcopy(self.accent_phrases_hello_hiho)
@@ -368,13 +368,13 @@ def test_replace_mora_pitch(self):
         empty_accent_phrases = []
         self.assertEqual(
             self.synthesis_engine.replace_mora_pitch(
-                accent_phrases=empty_accent_phrases, speaker_id=1
+                accent_phrases=empty_accent_phrases, style_id=1
             ),
             [],
         )
 
         result = self.synthesis_engine.replace_mora_pitch(
-            accent_phrases=deepcopy(self.accent_phrases_hello_hiho), speaker_id=1
+            accent_phrases=deepcopy(self.accent_phrases_hello_hiho), style_id=1
         )
 
         # yukarin_saに渡される値の検証
@@ -393,7 +393,7 @@ def test_replace_mora_pitch(self):
         self.assertEqual(list_length, len(end_accent_list))
         self.assertEqual(list_length, len(start_accent_phrase_list))
         self.assertEqual(list_length, len(end_accent_phrase_list))
-        self.assertEqual(yukarin_sa_args["speaker_id"], 1)
+        self.assertEqual(yukarin_sa_args["style_id"], 1)
 
         numpy.testing.assert_array_equal(
             vowel_phoneme_list,
@@ -512,7 +512,7 @@ def synthesis_test_base(self, audio_query: AudioQuery):
         for i in range(len(phoneme_length_list)):
             phoneme_length_list[i] /= audio_query.speedScale
 
-        result = self.synthesis_engine.synthesis(query=audio_query, speaker_id=1)
+        result = self.synthesis_engine.synthesis(query=audio_query, style_id=1)
 
         # decodeに渡される値の検証
         decode_args = self.decode_mock.call_args[1]
@@ -577,7 +577,7 @@ def synthesis_test_base(self, audio_query: AudioQuery):
                 assert_true_count += bool(phoneme[i][j] == decode_phoneme[i][j])
             assert_phoneme_count += assert_true_count == num_phoneme
         self.assertTrue(assert_phoneme_count >= int(len(decode_phoneme) / 5) * 4)
-        self.assertEqual(decode_args["speaker_id"], 1)
+        self.assertEqual(decode_args["style_id"], 1)
 
         # decode forwarderのmockを使う
         true_result = decode_mock(list_length, num_phoneme, f0, phoneme, 1)

@@ -24,7 +24,7 @@ def yukarin_sa_mock(
     end_accent_list: numpy.ndarray,
     start_accent_phrase_list: numpy.ndarray,
     end_accent_phrase_list: numpy.ndarray,
-    speaker_id: numpy.ndarray,
+    style_id: numpy.ndarray,
 ):
     result = []
     # mockとしての適当な処理、特に意味はない
@@ -41,7 +41,7 @@ def yukarin_sa_mock(
                         + end_accent_phrase_list[0][i]
                     )
                     * 0.0625
-                    + speaker_id
+                    + style_id
                 ),
                 2,
             )
@@ -54,7 +54,7 @@ def decode_mock(
     phoneme_size: int,
     f0: numpy.ndarray,
     phoneme: numpy.ndarray,
-    speaker_id: Union[numpy.ndarray, int],
+    style_id: Union[numpy.ndarray, int],
 ):
     result = []
     # mockとしての適当な処理、特に意味はない
@@ -64,7 +64,7 @@ def decode_mock(
             result.append(
                 float(
                     f0[i][0] * (numpy.where(phoneme[i] == 1)[0] / phoneme_size)
-                    + speaker_id
+                    + style_id
                 )
             )
     return numpy.array(result)
@@ -179,7 +179,7 @@ def metas(self):
     def supported_devices(self):
         return ""
 
-    def is_model_loaded(self, speaker_id):
+    def is_model_loaded(self, style_id):
         return True
 
 

@@ -14,6 +14,7 @@
 from .model import AudioQuery
 from .synthesis_engine import make_synthesis_engines
 from .utility import get_latest_core_version
+from run import id_checker
 
 
 class CancellableEngine:
@@ -115,9 +116,10 @@ def finalize_con(
     def _synthesis_impl(
         self,
         query: AudioQuery,
-        speaker_id: int,
+        style_id: Optional[int],
         request: Request,
         core_version: Optional[str],
+        speaker_id: Optional[int] = None,
     ) -> str:
         """
         音声合成を行う関数
@@ -127,7 +129,7 @@ def _synthesis_impl(
         Parameters
         ----------
         query: AudioQuery
-        speaker_id: int
+        style_id: int
         request: fastapi.Request
             接続確立時に受け取ったものをそのまま渡せばよい
             https://fastapi.tiangolo.com/advanced/using-request-directly/
@@ -138,10 +140,11 @@ def _synthesis_impl(
         f_name: str
             生成された音声ファイルの名前
         """
+        style_id = id_checker(style_id=style_id, speaker_id=speaker_id)
         proc, sub_proc_con1 = self.procs_and_cons.get()
         self.watch_con_list.append((request, proc))
         try:
-            sub_proc_con1.send((query, speaker_id, core_version))
+            sub_proc_con1.send((query, style_id, core_version))
             f_name = sub_proc_con1.recv()
         except EOFError:
             raise HTTPException(status_code=422, detail="既にサブプロセスは終了されています")
@@ -200,7 +203,7 @@ def start_synthesis_subprocess(
     latest_core_version = get_latest_core_version(versions=synthesis_engines.keys())
     while True:
         try:
-            query, speaker_id, core_version = sub_proc_con.recv()
+            query, style_id, core_version = sub_proc_con.recv()
             if core_version is None:
                 _engine = synthesis_engines[latest_core_version]
             elif core_version in synthesis_engines:
@@ -209,7 +212,7 @@ def start_synthesis_subprocess(
                 # バージョンが見つからないエラー
                 sub_proc_con.send("")
                 continue
-            wave = _engine._synthesis_impl(query, speaker_id)
+            wave = _engine._synthesis_impl(query, style_id)
             with NamedTemporaryFile(delete=False) as f:
                 soundfile.write(
                     file=f, data=wave, samplerate=query.outputSamplingRate, format="WAV"