diff --git a/run.py b/run.py index ae3829275..71f653760 100644 --- a/run.py +++ b/run.py @@ -16,26 +16,34 @@ from fastapi.params import Query from starlette.responses import FileResponse +from voicevox_engine import model from voicevox_engine.cancellable_engine import CancellableEngine from voicevox_engine.kana_parser import create_kana, parse_kana -from voicevox_engine.model import ( - AccentPhrase, - AudioQuery, - ParseKanaBadRequest, - ParseKanaError, - Speaker, - SpeakerInfo, -) +from voicevox_engine.model import ParseKanaError from voicevox_engine.morphing import synthesis_morphing from voicevox_engine.morphing import ( synthesis_morphing_parameter as _synthesis_morphing_parameter, ) -from voicevox_engine.preset import Preset, PresetLoader +from voicevox_engine.preset import PresetLoader from voicevox_engine.synthesis_engine import SynthesisEngineBase, make_synthesis_engine from voicevox_engine.synthesis_engine.synthesis_engine_base import ( adjust_interrogative_accent_phrases, ) from voicevox_engine.utility import ConnectBase64WavesException, connect_base64_waves +from voicevox_engine.webapi.fastapi_model import ( + AccentPhrase, + AudioQuery, + ParseKanaBadRequest, + Preset, + Speaker, + SpeakerInfo, +) + +""" +voicevox_engine/model.pyで定義されている型は内部で使用する型なので、リクエスト及びレスポンスを行う際に使用してはならない。 +リクエスト・レスポンスで使用する型はvoicevox_engine/webapi/fastapi_model.pyで定義されている型を使用し、 +内部で使用している型から(or に)変換すること +""" def b64encode_str(s): @@ -92,7 +100,7 @@ def audio_query( text: str, speaker: int, enable_interrogative: bool = enable_interrogative_query_param(), # noqa B008, - ): + ) -> AudioQuery: """ クエリの初期値を得ます。ここで得られたクエリはそのまま音声合成に利用できます。各値の意味は`Schemas`を参照してください。 """ @@ -101,17 +109,19 @@ def audio_query( speaker_id=speaker, enable_interrogative=enable_interrogative, ) - return AudioQuery( - accent_phrases=accent_phrases, - speedScale=1, - pitchScale=0, - intonationScale=1, - volumeScale=1, - prePhonemeLength=0.1, - postPhonemeLength=0.1, - outputSamplingRate=default_sampling_rate, - 
outputStereo=False, - kana=create_kana(accent_phrases), + return AudioQuery.from_engine( + model.AudioQuery( + accent_phrases=accent_phrases, + speedScale=1, + pitchScale=0, + intonationScale=1, + volumeScale=1, + prePhonemeLength=0.1, + postPhonemeLength=0.1, + outputSamplingRate=default_sampling_rate, + outputStereo=False, + kana=create_kana(accent_phrases), + ) ) @app.post( @@ -124,7 +134,7 @@ def audio_query_from_preset( text: str, preset_id: int, enable_interrogative: bool = enable_interrogative_query_param(), # noqa B008, - ): + ) -> AudioQuery: """ クエリの初期値を得ます。ここで得られたクエリはそのまま音声合成に利用できます。各値の意味は`Schemas`を参照してください。 """ @@ -143,17 +153,19 @@ def audio_query_from_preset( speaker_id=selected_preset.style_id, enable_interrogative=enable_interrogative, ) - return AudioQuery( - accent_phrases=accent_phrases, - speedScale=selected_preset.speedScale, - pitchScale=selected_preset.pitchScale, - intonationScale=selected_preset.intonationScale, - volumeScale=selected_preset.volumeScale, - prePhonemeLength=selected_preset.prePhonemeLength, - postPhonemeLength=selected_preset.postPhonemeLength, - outputSamplingRate=default_sampling_rate, - outputStereo=False, - kana=create_kana(accent_phrases), + return AudioQuery.from_engine( + model.AudioQuery( + accent_phrases=accent_phrases, + speedScale=selected_preset.speedScale, + pitchScale=selected_preset.pitchScale, + intonationScale=selected_preset.intonationScale, + volumeScale=selected_preset.volumeScale, + prePhonemeLength=selected_preset.prePhonemeLength, + postPhonemeLength=selected_preset.postPhonemeLength, + outputSamplingRate=default_sampling_rate, + outputStereo=False, + kana=create_kana(accent_phrases), + ) ) @app.post( @@ -173,7 +185,7 @@ def accent_phrases( speaker: int, is_kana: bool = False, enable_interrogative: bool = enable_interrogative_query_param(), # noqa B008, - ): + ) -> List[AccentPhrase]: """ テキストからアクセント句を得ます。 is_kanaが`true`のとき、テキストは次のようなAquesTalkライクな記法に従う読み仮名として処理されます。デフォルトは`false`です。 @@ -196,15 +208,27 
@@ def accent_phrases( accent_phrases=accent_phrases, speaker_id=speaker ) - return adjust_interrogative_accent_phrases( - accent_phrases, interrogative_accent_phrase_marks, enable_interrogative - ) + return [ + AccentPhrase.from_engine(accent_phrase) + for accent_phrase in ( + adjust_interrogative_accent_phrases( + accent_phrases, + interrogative_accent_phrase_marks, + enable_interrogative, + ) + ) + ] else: - return engine.create_accent_phrases( - text, - speaker_id=speaker, - enable_interrogative=enable_interrogative, - ) + return [ + AccentPhrase.from_engine(accent_phrase) + for accent_phrase in ( + engine.create_accent_phrases( + text, + speaker_id=speaker, + enable_interrogative=enable_interrogative, + ) + ) + ] @app.post( "/mora_data", @@ -212,8 +236,20 @@ def accent_phrases( tags=["クエリ編集"], summary="アクセント句から音高・音素長を得る", ) - def mora_data(accent_phrases: List[AccentPhrase], speaker: int): - return engine.replace_mora_data(accent_phrases, speaker_id=speaker) + def mora_data( + accent_phrases: List[AccentPhrase], speaker: int + ) -> List[AccentPhrase]: + return [ + AccentPhrase.from_engine(accent_phrase) + for accent_phrase in ( + engine.replace_mora_data( + accent_phrases=[ + accent_phrase.to_engine() for accent_phrase in accent_phrases + ], + speaker_id=speaker, + ) + ) + ] @app.post( "/mora_length", @@ -221,10 +257,20 @@ def mora_data(accent_phrases: List[AccentPhrase], speaker: int): tags=["クエリ編集"], summary="アクセント句から音素長を得る", ) - def mora_length(accent_phrases: List[AccentPhrase], speaker: int): - return engine.replace_phoneme_length( - accent_phrases=accent_phrases, speaker_id=speaker - ) + def mora_length( + accent_phrases: List[AccentPhrase], speaker: int + ) -> List[AccentPhrase]: + return [ + AccentPhrase.from_engine(accent_phrase) + for accent_phrase in ( + engine.replace_phoneme_length( + accent_phrases=[ + accent_phrase.to_engine() for accent_phrase in accent_phrases + ], + speaker_id=speaker, + ) + ) + ] @app.post( "/mora_pitch", @@ -232,10 +278,20 
@@ def mora_length(accent_phrases: List[AccentPhrase], speaker: int): tags=["クエリ編集"], summary="アクセント句から音高を得る", ) - def mora_pitch(accent_phrases: List[AccentPhrase], speaker: int): - return engine.replace_mora_pitch( - accent_phrases=accent_phrases, speaker_id=speaker - ) + def mora_pitch( + accent_phrases: List[AccentPhrase], speaker: int + ) -> List[AccentPhrase]: + return [ + AccentPhrase.from_engine(accent_phrase) + for accent_phrase in ( + engine.replace_mora_pitch( + accent_phrases=[ + accent_phrase.to_engine() for accent_phrase in accent_phrases + ], + speaker_id=speaker, + ) + ) + ] @app.post( "/synthesis", @@ -250,8 +306,11 @@ def mora_pitch(accent_phrases: List[AccentPhrase], speaker: int): tags=["音声合成"], summary="音声合成する", ) - def synthesis(query: AudioQuery, speaker: int): - wave = engine.synthesis(query=query, speaker_id=speaker) + def synthesis(query: AudioQuery, speaker: int) -> FileResponse: + wave = engine.synthesis( + query=query.to_engine(), + speaker_id=speaker, + ) with NamedTemporaryFile(delete=False) as f: soundfile.write( @@ -273,14 +332,18 @@ def synthesis(query: AudioQuery, speaker: int): tags=["音声合成"], summary="音声合成する(キャンセル可能)", ) - def cancellable_synthesis(query: AudioQuery, speaker: int, request: Request): + def cancellable_synthesis( + query: AudioQuery, speaker: int, request: Request + ) -> FileResponse: if not args.enable_cancellable_synthesis: raise HTTPException( status_code=404, detail="実験的機能はデフォルトで無効になっています。使用するには引数を指定してください。", ) f_name = cancellable_engine.synthesis( - query=query, speaker_id=speaker, request=request + query=query.to_engine(), + speaker_id=speaker, + request=request, ) return FileResponse(f_name, media_type="audio/wav") @@ -300,7 +363,7 @@ def cancellable_synthesis(query: AudioQuery, speaker: int, request: Request): tags=["音声合成"], summary="複数まとめて音声合成する", ) - def multi_synthesis(queries: List[AudioQuery], speaker: int): + def multi_synthesis(queries: List[AudioQuery], speaker: int) -> FileResponse: sampling_rate 
= queries[0].outputSamplingRate with NamedTemporaryFile(delete=False) as f: @@ -316,7 +379,10 @@ def multi_synthesis(queries: List[AudioQuery], speaker: int): with TemporaryFile() as wav_file: - wave = engine.synthesis(query=queries[i], speaker_id=speaker) + wave = engine.synthesis( + query=queries[i].to_engine(), + speaker_id=speaker, + ) soundfile.write( file=wav_file, data=wave, @@ -346,7 +412,7 @@ def _synthesis_morphing( base_speaker: int, target_speaker: int, morph_rate: float = Query(..., ge=0.0, le=1.0), # noqa: B008 - ): + ) -> FileResponse: """ 指定された2人の話者で音声を合成、指定した割合でモーフィングした音声を得ます。 モーフィングの割合は`morph_rate`で指定でき、0.0でベースの話者、1.0でターゲットの話者に近づきます。 """ @@ -355,7 +421,7 @@ def _synthesis_morphing( # 生成したパラメータはキャッシュされる morph_param = synthesis_morphing_parameter( engine=engine, - query=query, + query=query.to_engine(), base_speaker=base_speaker, target_speaker=target_speaker, ) @@ -389,14 +455,14 @@ def _synthesis_morphing( tags=["その他"], summary="base64エンコードされた複数のwavデータを一つに結合する", ) - def connect_waves(waves: List[str]): + def connect_waves(waves: List[str]) -> FileResponse: """ base64エンコードされたwavデータを一纏めにし、wavファイルで返します。 """ try: waves_nparray, sampling_rate = connect_base64_waves(waves) except ConnectBase64WavesException as err: - return HTTPException(status_code=422, detail=str(err)) + raise HTTPException(status_code=422, detail=str(err)) with NamedTemporaryFile(delete=False) as f: soundfile.write( @@ -409,7 +475,7 @@ def connect_waves(waves: List[str]): return FileResponse(f.name, media_type="audio/wav") @app.get("/presets", response_model=List[Preset], tags=["その他"]) - def get_presets(): + def get_presets() -> List[Preset]: """ エンジンが保持しているプリセットの設定を返します @@ -421,21 +487,21 @@ def get_presets(): presets, err_detail = preset_loader.load_presets() if err_detail: raise HTTPException(status_code=422, detail=err_detail) - return presets + return [Preset.from_engine(preset) for preset in presets] @app.get("/version", tags=["その他"]) def version() -> str: return (root_dir / 
"VERSION.txt").read_text() @app.get("/speakers", response_model=List[Speaker], tags=["その他"]) - def speakers(): + def speakers() -> Response: return Response( content=engine.speakers, media_type="application/json", ) @app.get("/speaker_info", response_model=SpeakerInfo, tags=["その他"]) - def speaker_info(speaker_uuid: str): + def speaker_info(speaker_uuid: str) -> SpeakerInfo: """ 指定されたspeaker_uuidに関する情報をjson形式で返します。 画像や音声はbase64エンコードされたものが返されます。 @@ -474,7 +540,7 @@ def speaker_info(speaker_uuid: str): for j in range(3) ] style_infos.append( - {"id": id, "icon": icon, "voice_samples": voice_samples} + model.StyleInfo(id=id, icon=icon, voice_samples=voice_samples) ) except FileNotFoundError: import traceback @@ -482,8 +548,9 @@ def speaker_info(speaker_uuid: str): traceback.print_exc() raise HTTPException(status_code=500, detail="追加情報が見つかりませんでした") - ret_data = {"policy": policy, "portrait": portrait, "style_infos": style_infos} - return ret_data + return SpeakerInfo.from_engine( + model.SpeakerInfo(policy=policy, portrait=portrait, style_infos=style_infos) + ) return app diff --git a/test/test_fastapi_model.py b/test/test_fastapi_model.py new file mode 100644 index 000000000..bbd05a3d5 --- /dev/null +++ b/test/test_fastapi_model.py @@ -0,0 +1,372 @@ +from unittest import TestCase + +from voicevox_engine import model, preset +from voicevox_engine.webapi import fastapi_model + + +class TestFastAPIModelConverter(TestCase): + def _assert_equal_types(self, expected, actual): + for i, e in enumerate(expected): + self.assertEqual(type(e), type(actual[i])) + + def _assert_not_equal_types(self, expected, actual): + for i, e in enumerate(expected): + self.assertNotEqual(type(e), type(actual[i])) + + def _asserts_convert_data_and_type(self, expected, before_convert_value, actual): + self.assertEqual(expected, actual) + self.assertEqual(type(expected), type(actual)) + self.assertNotEqual(type(before_convert_value), type(actual)) + + def _asserts_list_convert_data_and_type( + self, 
expected, before_convert_value, actual + ): + self.assertEqual(expected, actual) + self._assert_equal_types(expected, actual) + self._assert_not_equal_types(before_convert_value, actual) + + def _mora(self): + return model.Mora( + text="ハ", + consonant="h", + consonant_length=0.6, + vowel="a", + vowel_length=0.5, + pitch=3.5, + ) + + def _fastapi_mora(self): + return fastapi_model.Mora( + text="ハ", + consonant="h", + consonant_length=0.6, + vowel="a", + vowel_length=0.5, + pitch=3.5, + ) + + def test_mora_from_engine(self): + actual: fastapi_model.Mora = fastapi_model.Mora.from_engine(self._mora()) + self._asserts_convert_data_and_type( + expected=self._fastapi_mora(), + before_convert_value=self._mora(), + actual=actual, + ) + + def test_mora_to_engine(self): + actual: model.Mora = self._fastapi_mora().to_engine() + self._asserts_convert_data_and_type( + expected=self._mora(), + before_convert_value=self._fastapi_mora(), + actual=actual, + ) + + def _moras(self): + return [self._mora(), self._mora()] + + def _fastapi_moras(self): + return [self._fastapi_mora(), self._fastapi_mora()] + + def _accent_phrase(self): + return model.AccentPhrase( + moras=self._moras(), + accent=3, + pause_mora=None, + ) + + def _fastapi_accent_phrase(self): + return fastapi_model.AccentPhrase( + moras=self._fastapi_moras(), + accent=3, + pause_mora=None, + ) + + def _pause_mora(self): + return model.Mora( + text="、", + consonant=None, + consonant_length=None, + vowel="pau", + vowel_length=0, + pitch=0, + ) + + def _fastapi_pause_mora(self): + return fastapi_model.Mora( + text="、", + consonant=None, + consonant_length=None, + vowel="pau", + vowel_length=0, + pitch=0, + ) + + def test_from_model_accent_phrase(self): + actual: fastapi_model.AccentPhrase = fastapi_model.AccentPhrase.from_engine( + self._accent_phrase() + ) + self._asserts_convert_data_and_type( + expected=self._fastapi_accent_phrase(), + before_convert_value=self._accent_phrase(), + actual=actual, + ) + + given = 
self._accent_phrase() + given.pause_mora = self._pause_mora() + expected = self._fastapi_accent_phrase() + expected.pause_mora = self._fastapi_pause_mora() + actual: fastapi_model.AccentPhrase = fastapi_model.AccentPhrase.from_engine( + given + ) + + self._asserts_convert_data_and_type( + expected=expected, before_convert_value=given, actual=actual + ) + + def test_to_model_accent_phrase(self): + actual: model.AccentPhrase = self._fastapi_accent_phrase().to_engine() + + self._asserts_convert_data_and_type( + expected=self._accent_phrase(), + before_convert_value=self._fastapi_accent_phrase(), + actual=actual, + ) + + given = self._fastapi_accent_phrase() + given.pause_mora = self._fastapi_pause_mora() + expected = self._accent_phrase() + expected.pause_mora = self._pause_mora() + actual: model.AccentPhrase = given.to_engine() + self._asserts_convert_data_and_type( + expected=expected, before_convert_value=given, actual=actual + ) + + def _accent_phrases(self): + return [ + self._accent_phrase(), + self._accent_phrase(), + ] + + def _fastapi_accent_phrases(self): + return [ + self._fastapi_accent_phrase(), + self._fastapi_accent_phrase(), + ] + + def _audio_query(self): + return model.AudioQuery( + accent_phrases=self._accent_phrases(), + speedScale=3.2, + pitchScale=4, + intonationScale=2.3, + volumeScale=4.3, + prePhonemeLength=1.3, + postPhonemeLength=5.3, + outputSamplingRate=3, + outputStereo=True, + kana="ア", + ) + + def _fastapi_audio_query(self): + return fastapi_model.AudioQuery( + accent_phrases=self._fastapi_accent_phrases(), + speedScale=3.2, + pitchScale=4, + intonationScale=2.3, + volumeScale=4.3, + prePhonemeLength=1.3, + postPhonemeLength=5.3, + outputSamplingRate=3, + outputStereo=True, + kana="ア", + ) + + def test_from_model_audio_query(self): + actual: fastapi_model.AudioQuery = fastapi_model.AudioQuery.from_engine( + self._audio_query() + ) + self._asserts_convert_data_and_type( + expected=self._fastapi_audio_query(), + 
before_convert_value=self._audio_query(), + actual=actual, + ) + + def test_to_model_audio_query(self): + actual: model.AudioQuery = self._fastapi_audio_query().to_engine() + self._asserts_convert_data_and_type( + expected=self._audio_query(), + before_convert_value=self._fastapi_audio_query(), + actual=actual, + ) + + def _speaker_style(self): + return model.SpeakerStyle( + name="speaker_style_name", + id=3, + ) + + def _fastapi_speaker_style(self): + return fastapi_model.SpeakerStyle( + name="speaker_style_name", + id=3, + ) + + def test_from_model_speaker_style(self): + actual: fastapi_model.SpeakerStyle = fastapi_model.SpeakerStyle.from_engine( + self._speaker_style() + ) + self._asserts_convert_data_and_type( + expected=self._fastapi_speaker_style(), + before_convert_value=self._speaker_style(), + actual=actual, + ) + + def test_to_model_speaker_style(self): + actual: model.SpeakerStyle = self._fastapi_speaker_style().to_engine() + self._asserts_convert_data_and_type( + expected=self._speaker_style(), + before_convert_value=self._fastapi_speaker_style(), + actual=actual, + ) + + def _speaker(self): + return model.Speaker( + name="speakername", + speaker_uuid="speakeruuid", + styles=[self._speaker_style(), self._speaker_style()], + version="1.3", + ) + + def _fastapi_speaker(self): + return fastapi_model.Speaker( + name="speakername", + speaker_uuid="speakeruuid", + styles=[self._fastapi_speaker_style(), self._fastapi_speaker_style()], + version="1.3", + ) + + def test_from_model_speaker(self): + actual: fastapi_model.Speaker = fastapi_model.Speaker.from_engine( + self._speaker() + ) + self._asserts_convert_data_and_type( + expected=self._fastapi_speaker(), + before_convert_value=self._speaker(), + actual=actual, + ) + + def test_to_model_speaker(self): + actual: model.Speaker = self._fastapi_speaker().to_engine() + self._asserts_convert_data_and_type( + expected=self._speaker(), + before_convert_value=self._fastapi_speaker(), + actual=actual, + ) + + def 
_style_info(self): + return model.StyleInfo( + id=3, + icon="style_info_icon", + voice_samples=["sample1", "sample2"], + ) + + def _fastapi_style_info(self): + return fastapi_model.StyleInfo( + id=3, + icon="style_info_icon", + voice_samples=["sample1", "sample2"], + ) + + def test_from_model_style_info(self): + actual: fastapi_model.StyleInfo = fastapi_model.StyleInfo.from_engine( + self._style_info() + ) + self._asserts_convert_data_and_type( + expected=self._fastapi_style_info(), + before_convert_value=self._style_info(), + actual=actual, + ) + + def test_to_model_style_info(self): + actual: model.StyleInfo = self._fastapi_style_info().to_engine() + self._asserts_convert_data_and_type( + expected=self._style_info(), + before_convert_value=self._fastapi_style_info(), + actual=actual, + ) + + def _speaker_info(self): + return model.SpeakerInfo( + policy="speaker_info_policy", + portrait="speaker_info_portrait", + style_infos=[self._style_info(), self._style_info()], + ) + + def _fastapi_speaker_info(self): + return fastapi_model.SpeakerInfo( + policy="speaker_info_policy", + portrait="speaker_info_portrait", + style_infos=[self._fastapi_style_info(), self._fastapi_style_info()], + ) + + def test_from_model_speaker_info(self): + actual: fastapi_model.SpeakerInfo = fastapi_model.SpeakerInfo.from_engine( + self._speaker_info() + ) + self._asserts_convert_data_and_type( + expected=self._fastapi_speaker_info(), + before_convert_value=self._speaker_info(), + actual=actual, + ) + + def test_to_model_speaker_info(self): + actual: model.SpeakerInfo = self._fastapi_speaker_info().to_engine() + self._asserts_convert_data_and_type( + expected=self._speaker_info(), + before_convert_value=self._fastapi_speaker_info(), + actual=actual, + ) + + def _preset(self): + return preset.Preset( + id=3, + name="preset_name", + speaker_uuid="speaker_uuid", + style_id=4, + speedScale=3.2, + pitchScale=2.3, + intonationScale=4.2, + volumeScale=2.2, + prePhonemeLength=1.1, + 
postPhonemeLength=1.2, + ) + + def _fastapi_preset(self): + return fastapi_model.Preset( + id=3, + name="preset_name", + speaker_uuid="speaker_uuid", + style_id=4, + speedScale=3.2, + pitchScale=2.3, + intonationScale=4.2, + volumeScale=2.2, + prePhonemeLength=1.1, + postPhonemeLength=1.2, + ) + + def test_from_model_preset(self): + actual: fastapi_model.Preset = fastapi_model.Preset.from_engine(self._preset()) + self._asserts_convert_data_and_type( + expected=self._fastapi_preset(), + before_convert_value=self._preset(), + actual=actual, + ) + + def test_to_model_preset(self): + actual: preset.Preset = self._fastapi_preset().to_engine() + self._asserts_convert_data_and_type( + expected=self._preset(), + before_convert_value=self._fastapi_preset(), + actual=actual, + ) diff --git a/voicevox_engine/model.py b/voicevox_engine/model.py index e67b7d902..76e68e804 100644 --- a/voicevox_engine/model.py +++ b/voicevox_engine/model.py @@ -3,6 +3,14 @@ from pydantic import BaseModel, Field +""" +ここで定義されている型は内部で使用するための型であり、API定義を変更したければvoicevox_engine/webapi/fastapi_model.pyの定義を変更すること。 +また、対応する型との適切な変換処理を実装すること +""" + +# FIXME: このファイルの各型からpydantic由来の機能を削除し、dataclassにする。もともとmodel.pyはAPI定義に使用されていたが、 +# 使用するデータ型が分離したため + class Mora(BaseModel): """ @@ -83,24 +91,6 @@ def __init__(self, errcode: ParseKanaErrorCode, **kwargs): self.text = err_fmt.format(**kwargs) -class ParseKanaBadRequest(BaseModel): - text: str = Field(title="エラーメッセージ") - error_name: str = Field( - title="エラー名", - description="|name|description|\n|---|---|\n" - + "\n".join( - [ - "| {} | {} |".format(err.name, err.value) - for err in list(ParseKanaErrorCode) - ] - ), - ) - error_args: Dict[str, str] = Field(title="エラーを起こした箇所") - - def __init__(self, err: ParseKanaError): - super().__init__(text=err.text, error_name=err.errname, error_args=err.kwargs) - - class SpeakerStyle(BaseModel): """ スピーカーのスタイル情報 diff --git a/voicevox_engine/webapi/__init__.py b/voicevox_engine/webapi/__init__.py new file mode 100644 index 
000000000..e69de29bb diff --git a/voicevox_engine/webapi/fastapi_model.py b/voicevox_engine/webapi/fastapi_model.py new file mode 100644 index 000000000..b934dfa59 --- /dev/null +++ b/voicevox_engine/webapi/fastapi_model.py @@ -0,0 +1,314 @@ +from __future__ import annotations + +import copy +from typing import Dict, List, Optional + +from pydantic import BaseModel, Field + +from voicevox_engine import model, preset + +""" +このファイルの型はvoicevox_engine/model.pyと重複しているように見えるが、 +これは内部で使用しているvoicevox_engine/model.pyの変更をAPI定義に影響を与えないようにするためである。 +fastapiのリクエスト、レスポンスに使用する型はmodel.pyにあるものではなく必ずここに定義してある型を使用すること。 +model.pyあるいはこのファイルの型に変更がある場合各モデルで必要な変換処理を実装する +""" + + +class Mora(BaseModel): + """ + モーラ(子音+母音)ごとの情報 + """ + + text: str = Field(title="文字") + consonant: Optional[str] = Field(title="子音の音素") + consonant_length: Optional[float] = Field(title="子音の音長") + vowel: str = Field(title="母音の音素") + vowel_length: float = Field(title="母音の音長") + pitch: float = Field(title="音高") # デフォルト値をつけるとts側のOpenAPIで生成されたコードの型がOptionalになる + + def __hash__(self): + items = [ + (k, tuple(v)) if isinstance(v, List) else (k, v) + for k, v in self.__dict__.items() + ] + return hash(tuple(sorted(items))) + + @classmethod + def from_engine(cls, mora: model.Mora) -> Mora: + return cls( + text=mora.text, + consonant=mora.consonant, + consonant_length=mora.consonant_length, + vowel=mora.vowel, + vowel_length=mora.vowel_length, + pitch=mora.pitch, + ) + + def to_engine(self) -> model.Mora: + return model.Mora( + text=self.text, + consonant=self.consonant, + consonant_length=self.consonant_length, + vowel=self.vowel, + vowel_length=self.vowel_length, + pitch=self.pitch, + ) + + +class AccentPhrase(BaseModel): + """ + アクセント句ごとの情報 + """ + + moras: List[Mora] = Field(title="モーラのリスト") + accent: int = Field(title="アクセント箇所") + pause_mora: Optional[Mora] = Field(title="後ろに無音を付けるかどうか") + + def __hash__(self): + items = [ + (k, tuple(v)) if isinstance(v, List) else (k, v) + for k, v in self.__dict__.items() + ] + 
return hash(tuple(sorted(items))) + + @classmethod + def from_engine(cls, accent_phrase: model.AccentPhrase) -> AccentPhrase: + return cls( + moras=[Mora.from_engine(mora) for mora in accent_phrase.moras], + accent=accent_phrase.accent, + pause_mora=Mora.from_engine(accent_phrase.pause_mora) + if accent_phrase.pause_mora is not None + else None, + ) + + def to_engine(self) -> model.AccentPhrase: + return model.AccentPhrase( + moras=[mora.to_engine() for mora in self.moras], + accent=self.accent, + pause_mora=self.pause_mora.to_engine() + if self.pause_mora is not None + else None, + ) + + +class AudioQuery(BaseModel): + """ + 音声合成用のクエリ + """ + + accent_phrases: List[AccentPhrase] = Field(title="アクセント句のリスト") + speedScale: float = Field(title="全体の話速") + pitchScale: float = Field(title="全体の音高") + intonationScale: float = Field(title="全体の抑揚") + volumeScale: float = Field(title="全体の音量") + prePhonemeLength: float = Field(title="音声の前の無音時間") + postPhonemeLength: float = Field(title="音声の後の無音時間") + outputSamplingRate: int = Field(title="音声データの出力サンプリングレート") + outputStereo: bool = Field(title="音声データをステレオ出力するか否か") + kana: Optional[str] = Field(title="[読み取り専用]AquesTalkライクな読み仮名。音声合成クエリとしては無視される") + + def __hash__(self): + items = [ + (k, tuple(v)) if isinstance(v, List) else (k, v) + for k, v in self.__dict__.items() + ] + return hash(tuple(sorted(items))) + + @classmethod + def from_engine(cls, audio_query: model.AudioQuery) -> AudioQuery: + return cls( + accent_phrases=[ + AccentPhrase.from_engine(accent_phrase) + for accent_phrase in audio_query.accent_phrases + ], + speedScale=audio_query.speedScale, + pitchScale=audio_query.pitchScale, + intonationScale=audio_query.intonationScale, + volumeScale=audio_query.volumeScale, + prePhonemeLength=audio_query.prePhonemeLength, + postPhonemeLength=audio_query.postPhonemeLength, + outputSamplingRate=audio_query.outputSamplingRate, + outputStereo=audio_query.outputStereo, + kana=audio_query.kana, + ) + + def to_engine(self) -> 
model.AudioQuery: + return model.AudioQuery( + accent_phrases=[ + accent_phrase.to_engine() for accent_phrase in self.accent_phrases + ], + speedScale=self.speedScale, + pitchScale=self.pitchScale, + intonationScale=self.intonationScale, + volumeScale=self.volumeScale, + prePhonemeLength=self.prePhonemeLength, + postPhonemeLength=self.postPhonemeLength, + outputSamplingRate=self.outputSamplingRate, + outputStereo=self.outputStereo, + kana=self.kana, + ) + + +class ParseKanaBadRequest(BaseModel): + text: str = Field(title="エラーメッセージ") + error_name: str = Field( + title="エラー名", + description="|name|description|\n|---|---|\n" + + "\n".join( + [ + "| {} | {} |".format(err.name, err.value) + for err in list(model.ParseKanaErrorCode) + ] + ), + ) + error_args: Dict[str, str] = Field(title="エラーを起こした箇所") + + def __init__(self, err: model.ParseKanaError): + super().__init__(text=err.text, error_name=err.errname, error_args=err.kwargs) + + +class SpeakerStyle(BaseModel): + """ + スピーカーのスタイル情報 + """ + + name: str = Field(title="スタイル名") + id: int = Field(title="スタイルID") + + @classmethod + def from_engine(cls, speaker_style: model.SpeakerStyle) -> SpeakerStyle: + return cls( + name=speaker_style.name, + id=speaker_style.id, + ) + + def to_engine(self) -> model.SpeakerStyle: + return model.SpeakerStyle( + name=self.name, + id=self.id, + ) + + +class Speaker(BaseModel): + """ + スピーカー情報 + """ + + name: str = Field(title="名前") + speaker_uuid: str = Field(title="スピーカーのUUID") + styles: List[SpeakerStyle] = Field(title="スピーカースタイルの一覧") + version: str = Field(title="スピーカーのバージョン") + + @classmethod + def from_engine(cls, speaker: model.Speaker) -> Speaker: + return cls( + name=speaker.name, + speaker_uuid=speaker.speaker_uuid, + styles=[SpeakerStyle.from_engine(style) for style in speaker.styles], + version=speaker.version, + ) + + def to_engine(self) -> model.Speaker: + return model.Speaker( + name=self.name, + speaker_uuid=self.speaker_uuid, + styles=[style.to_engine() for style in self.styles], + version=self.version, + ) + + +class StyleInfo(BaseModel): + """ + スタイルの追加情報 
+ """ + + id: int = Field(title="スタイルID") + icon: str = Field(title="当該スタイルのアイコンをbase64エンコードしたもの") + voice_samples: List[str] = Field(title="voice_sampleのwavファイルをbase64エンコードしたもの") + + @classmethod + def from_engine(cls, style_info: model.StyleInfo) -> StyleInfo: + return cls( + id=style_info.id, + icon=style_info.icon, + voice_samples=copy.deepcopy(style_info.voice_samples), + ) + + def to_engine(self) -> model.StyleInfo: + return model.StyleInfo( + id=self.id, + icon=self.icon, + voice_samples=copy.deepcopy(self.voice_samples), + ) + + +class SpeakerInfo(BaseModel): + """ + 話者の追加情報 + """ + + policy: str = Field(title="policy.md") + portrait: str = Field(title="portrait.pngをbase64エンコードしたもの") + style_infos: List[StyleInfo] = Field(title="スタイルの追加情報") + + @classmethod + def from_engine(cls, speaker_info: model.SpeakerInfo) -> SpeakerInfo: + return cls( + policy=speaker_info.policy, + portrait=speaker_info.portrait, + style_infos=speaker_info.style_infos, + ) + + def to_engine(self) -> model.SpeakerInfo: + return model.SpeakerInfo( + policy=self.policy, + portrait=self.portrait, + style_infos=self.style_infos, + ) + + +class Preset(BaseModel): + """ + プリセット情報 + """ + + id: int = Field(title="プリセットID") + name: str = Field(title="プリセット名") + speaker_uuid: str = Field(title="スピーカーのUUID") + style_id: int = Field(title="スタイルID") + speedScale: float = Field(title="全体の話速") + pitchScale: float = Field(title="全体の音高") + intonationScale: float = Field(title="全体の抑揚") + volumeScale: float = Field(title="全体の音量") + prePhonemeLength: float = Field(title="音声の前の無音時間") + postPhonemeLength: float = Field(title="音声の後の無音時間") + + @classmethod + def from_engine(cls, preset: preset.Preset) -> Preset: + return cls( + id=preset.id, + name=preset.name, + speaker_uuid=preset.speaker_uuid, + style_id=preset.style_id, + speedScale=preset.speedScale, + pitchScale=preset.pitchScale, + intonationScale=preset.intonationScale, + volumeScale=preset.volumeScale, + prePhonemeLength=preset.prePhonemeLength, + 
postPhonemeLength=preset.postPhonemeLength, + ) + + def to_engine(self) -> preset.Preset: + return preset.Preset( + id=self.id, + name=self.name, + speaker_uuid=self.speaker_uuid, + style_id=self.style_id, + speedScale=self.speedScale, + pitchScale=self.pitchScale, + intonationScale=self.intonationScale, + volumeScale=self.volumeScale, + prePhonemeLength=self.prePhonemeLength, + postPhonemeLength=self.postPhonemeLength, + )