Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

スタイルID(style_id)のことを話者ID(speaker_id)としているコードを全部置き換える #741

Merged
merged 45 commits into from
Oct 22, 2023
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
1bf4f29
speaker_idをstyle_idに(synthesis_engine_base.py)
weweweok Sep 6, 2023
3f58da0
style_idに変更(cancellable_engine.py)
weweweok Sep 6, 2023
aac0ed0
idチェック関数と細部の修正
weweweok Sep 6, 2023
ad51b7c
synthesis_engine_base.pyの一部を書き換え
weweweok Sep 6, 2023
6fdf5b4
不要なspeaker_id引数の削除(synthesis_engine_base.py)
weweweok Sep 7, 2023
1916412
id_checkerの使用とoptionalの追加(cancellable_engine.py)
weweweok Sep 8, 2023
dc66cb5
speaker_idをstyle_idに変更(morphing.py)
weweweok Sep 9, 2023
b914e9c
speaker_idをstyle_idに(mock.py)
weweweok Sep 19, 2023
4cab209
speaker_idをstyle_idに変更(synthesis_engine.py)
weweweok Sep 19, 2023
7432d91
speaker_idをstyle_idに変更(corewrapper.py)
weweweok Sep 19, 2023
9f6305a
speaker_idをstyle_idに変更(run.py)
weweweok Sep 19, 2023
f7cd30f
speaker_idをstyle_idに
weweweok Sep 19, 2023
d6852fa
speaker_idをstyle_idに変更(test_shynthesis_engine_base
weweweok Sep 23, 2023
8314073
speaker_idをstyle_idに(test_synthesis_engine.py)
weweweok Sep 23, 2023
7012a1b
optionalの削除
weweweok Oct 11, 2023
88929bc
optionalの変更
weweweok Oct 11, 2023
2ddc62c
id_checker関数の修正
weweweok Oct 11, 2023
48ebb63
id_checker関数の修正とその関数によるaudio_query関数の一部を追加・変更
weweweok Oct 11, 2023
b1a3420
id_checker関数を修正
weweweok Oct 11, 2023
09160ef
audio_query関数のspeakerをstyle_idに変更
weweweok Oct 11, 2023
ce23723
id_checker関数の名前をget_style_id_from_deprecatedに変更
weweweok Oct 14, 2023
e8b01df
get_style_id_from_deprecated関数の戻り値のインデントを修正
weweweok Oct 14, 2023
6ecbff7
クエリ編集周りのAPI関数におけるspeakerパラメータをstyle_idに変更
weweweok Oct 14, 2023
0e737f0
音声合成周りのAPIにおいて、style_idとspeakerに互換性をもたせた
weweweok Oct 14, 2023
8686888
initialize_speaker関連のAPIでstyle_idとspeakerに互換性をもたせた
weweweok Oct 15, 2023
5cfa87f
speakerNotFoundError関数で使用されるspeakerをstyle_idに変更した
weweweok Oct 15, 2023
4d07a68
ビルドテストの非推奨パワメータを推奨パラメータに変更
weweweok Oct 16, 2023
3ea9a04
speakerNotFoudError関数の細かい記述を修正
weweweok Oct 16, 2023
3b5acec
model.pyの修正漏れを適用
weweweok Oct 16, 2023
89f78b9
非推奨(既存の)エンドポイントと推奨エンドポイント両方を実装
weweweok Oct 16, 2023
5137e5d
(is_)initialize_speakerAPIドキュメントの説明を更新
weweweok Oct 16, 2023
6523548
run.pyにおけるSpeakerNotFoundError周りを修正
weweweok Oct 17, 2023
b5d0e49
SpeakerNotFoundErrorをStyleId...に変更
weweweok Oct 17, 2023
53a979c
SpeakerNotFoundErrorをStyleIdに変更 (変更漏れ)
weweweok Oct 17, 2023
91cead9
descriptionの話者をスタイルに変更
weweweok Oct 17, 2023
46efa5b
(is_)initialize(d)_speaker...のspeakerをstyle_idに変更
weweweok Oct 17, 2023
2b8ad4c
initialize(d)_speakerまわりにおけるdocstringの誤りを修正
weweweok Oct 17, 2023
55218cf
不要なstyle_idの変更をspeakerにリセット
weweweok Oct 17, 2023
7eda81b
READMEで記述されているspeakerパラメータをstyle_idに変更
weweweok Oct 17, 2023
790007f
修正漏れにspeakerをstyle_idに変更(morphing.py)
weweweok Oct 21, 2023
cc2b8e0
speaker_idをstyle_idに変更(test_synthesis...base.py)
weweweok Oct 21, 2023
310f6c2
Merge branch 'master' into dev
weweweok Oct 22, 2023
9d11bc6
ドキュメント調整、型表記をあわせる、linter適用
Hiroshiba Oct 22, 2023
0cf1e04
マージミス解消、エラー解消
Hiroshiba Oct 22, 2023
c949cb5
stacklevel=1,
Hiroshiba Oct 22, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 29 additions & 17 deletions run.py
weweweok marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,22 @@
get_save_dir,
)

import warnings


def id_checker(style_id: Optional[int], speaker_id: Optional[int]) -> int:
    """
    Resolve the style ID from the new `style_id` and the legacy `speaker_id`.

    Exactly one of the two arguments must be given (i.e. be non-None).

    Parameters
    ----------
    style_id: Optional[int]
        The preferred parameter. Returned as-is when it is the one provided.
    speaker_id: Optional[int]
        The legacy parameter. When it is the one provided, a warning is
        emitted and its value is returned as the style ID.

    Returns
    -------
    style_id: int
        The value of whichever argument was provided.

    Raises
    ------
    ValueError
        If both arguments are None, or both are non-None.
    """
    # Compare against None by identity so that a valid ID of 0 (falsy) and
    # objects with custom __eq__ are handled correctly.
    both_missing = style_id is None and speaker_id is None
    both_given = style_id is not None and speaker_id is not None
    if both_missing or both_given:
        raise ValueError("speaker_idとstyle_idが両方とも存在しないか、両方とも存在しています")

    if speaker_id is not None:
        # stacklevel=2 attributes the warning to the caller that passed the
        # legacy argument instead of to this helper.
        warnings.warn("style_idに変更になりましたのこちらの利用を推奨しています", stacklevel=2)
        return speaker_id
    return style_id
weweweok marked this conversation as resolved.
Show resolved Hide resolved


def b64encode_str(s):
    """Base64-encode the bytes-like object `s` and return it as a UTF-8 str."""
    encoded_bytes = base64.b64encode(s)
    return str(encoded_bytes, "utf-8")
Expand Down Expand Up @@ -228,7 +244,7 @@ def audio_query(text: str, speaker: int, core_version: Optional[str] = None):
クエリの初期値を得ます。ここで得られたクエリはそのまま音声合成に利用できます。各値の意味は`Schemas`を参照してください。
"""
engine = get_engine(core_version)
accent_phrases = engine.create_accent_phrases(text, speaker_id=speaker)
accent_phrases = engine.create_accent_phrases(text, style_id=speaker)
return AudioQuery(
accent_phrases=accent_phrases,
speedScale=1,
Expand Down Expand Up @@ -267,7 +283,7 @@ def audio_query_from_preset(
raise HTTPException(status_code=422, detail="該当するプリセットIDが見つかりません")

accent_phrases = engine.create_accent_phrases(
text, speaker_id=selected_preset.style_id
text, style_id=selected_preset.style_id
)
return AudioQuery(
accent_phrases=accent_phrases,
Expand Down Expand Up @@ -319,12 +335,12 @@ def accent_phrases(
detail=ParseKanaBadRequest(err).dict(),
)
accent_phrases = engine.replace_mora_data(
accent_phrases=accent_phrases, speaker_id=speaker
accent_phrases=accent_phrases, style_id=speaker
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

このプルリクエストを見て気づいたのですが、speaker_idではなくspeakerになっている箇所がかなりありますね!!
見逃していました、申し訳ない。。


return accent_phrases
else:
return engine.create_accent_phrases(text, speaker_id=speaker)
return engine.create_accent_phrases(text, style_id=speaker)

@app.post(
"/mora_data",
Expand All @@ -338,7 +354,7 @@ def mora_data(
core_version: Optional[str] = None,
):
engine = get_engine(core_version)
return engine.replace_mora_data(accent_phrases, speaker_id=speaker)
return engine.replace_mora_data(accent_phrases, style_id=speaker)

@app.post(
"/mora_length",
Expand All @@ -353,7 +369,7 @@ def mora_length(
):
engine = get_engine(core_version)
return engine.replace_phoneme_length(
accent_phrases=accent_phrases, speaker_id=speaker
accent_phrases=accent_phrases, style_id=speaker
)

@app.post(
Expand All @@ -369,7 +385,7 @@ def mora_pitch(
):
engine = get_engine(core_version)
return engine.replace_mora_pitch(
accent_phrases=accent_phrases, speaker_id=speaker
accent_phrases=accent_phrases, style_id=speaker
)

@app.post(
Expand Down Expand Up @@ -397,7 +413,7 @@ def synthesis(
engine = get_engine(core_version)
wave = engine.synthesis(
query=query,
speaker_id=speaker,
style_id=speaker,
enable_interrogative_upspeak=enable_interrogative_upspeak,
)

Expand Down Expand Up @@ -438,7 +454,7 @@ def cancellable_synthesis(
)
f_name = cancellable_engine._synthesis_impl(
query=query,
speaker_id=speaker,
style_id=speaker,
request=request,
core_version=core_version,
)
Expand Down Expand Up @@ -475,19 +491,15 @@ def multi_synthesis(
sampling_rate = queries[0].outputSamplingRate

with NamedTemporaryFile(delete=False) as f:

with zipfile.ZipFile(f, mode="a") as zip_file:

for i in range(len(queries)):

if queries[i].outputSamplingRate != sampling_rate:
raise HTTPException(
status_code=422, detail="サンプリングレートが異なるクエリがあります"
)

with TemporaryFile() as wav_file:

wave = engine.synthesis(query=queries[i], speaker_id=speaker)
wave = engine.synthesis(query=queries[i], style_id=speaker)
soundfile.write(
file=wav_file,
data=wave,
Expand Down Expand Up @@ -893,17 +905,17 @@ def initialize_speaker(
core_version: Optional[str] = None,
):
"""
指定されたspeaker_idの話者を初期化します
指定されたstyle_idの話者を初期化します
実行しなくても他のAPIは使用できますが、初回実行時に時間がかかることがあります。
"""
engine = get_engine(core_version)
engine.initialize_speaker_synthesis(speaker_id=speaker, skip_reinit=skip_reinit)
engine.initialize_speaker_synthesis(style_id=speaker, skip_reinit=skip_reinit)
return Response(status_code=204)

@app.get("/is_initialized_speaker", response_model=bool, tags=["その他"])
def is_initialized_speaker(speaker: int, core_version: Optional[str] = None):
"""
指定されたspeaker_idの話者が初期化されているかどうかを返します
指定されたstyle_idの話者が初期化されているかどうかを返します
"""
engine = get_engine(core_version)
return engine.is_initialized_speaker_synthesis(speaker)
Expand Down
6 changes: 3 additions & 3 deletions test/test_mock_synthesis_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def test_replace_phoneme_length(self):
self.assertEqual(
self.engine.replace_phoneme_length(
accent_phrases=self.accent_phrases_hello_hiho,
speaker_id=0,
style_id=0,
),
self.accent_phrases_hello_hiho,
)
Expand All @@ -117,7 +117,7 @@ def test_replace_mora_pitch(self):
self.assertEqual(
self.engine.replace_mora_pitch(
accent_phrases=self.accent_phrases_hello_hiho,
speaker_id=0,
style_id=0,
),
self.accent_phrases_hello_hiho,
)
Expand All @@ -136,5 +136,5 @@ def test_synthesis(self):
outputStereo=False,
kana=create_kana(self.accent_phrases_hello_hiho),
),
speaker_id=0,
style_id=0,
)
28 changes: 14 additions & 14 deletions test/test_synthesis_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@
)


def yukarin_s_mock(length: int, phoneme_list: numpy.ndarray, speaker_id: numpy.ndarray):
def yukarin_s_mock(length: int, phoneme_list: numpy.ndarray, style_id: numpy.ndarray):
result = []
# mockとしての適当な処理、特に意味はない
for i in range(length):
result.append(float(phoneme_list[i] * 0.5 + speaker_id))
result.append(float(phoneme_list[i] * 0.5 + style_id))
return numpy.array(result)


Expand All @@ -38,7 +38,7 @@ def yukarin_sa_mock(
end_accent_list: numpy.ndarray,
start_accent_phrase_list: numpy.ndarray,
end_accent_phrase_list: numpy.ndarray,
speaker_id: numpy.ndarray,
style_id: numpy.ndarray,
):
result = []
# mockとしての適当な処理、特に意味はない
Expand All @@ -54,7 +54,7 @@ def yukarin_sa_mock(
+ end_accent_phrase_list[0][i]
)
* 0.5
+ speaker_id
+ style_id
)
)
return numpy.array(result)[numpy.newaxis]
Expand All @@ -65,7 +65,7 @@ def decode_mock(
phoneme_size: int,
f0: numpy.ndarray,
phoneme: numpy.ndarray,
speaker_id: Union[numpy.ndarray, int],
style_id: Union[numpy.ndarray, int],
):
result = []
# mockとしての適当な処理、特に意味はない
Expand All @@ -75,7 +75,7 @@ def decode_mock(
result.append(
float(
f0[i][0] * (numpy.where(phoneme[i] == 1)[0] / phoneme_size)
+ speaker_id
+ style_id
)
)
return numpy.array(result)
Expand All @@ -92,7 +92,7 @@ def metas(self):
def supported_devices(self):
return ""

def is_model_loaded(self, speaker_id):
def is_model_loaded(self, style_id):
return True


Expand Down Expand Up @@ -303,7 +303,7 @@ def test_pre_process(self):

def test_replace_phoneme_length(self):
result = self.synthesis_engine.replace_phoneme_length(
accent_phrases=deepcopy(self.accent_phrases_hello_hiho), speaker_id=1
accent_phrases=deepcopy(self.accent_phrases_hello_hiho), style_id=1
)

# yukarin_sに渡される値の検証
Expand Down Expand Up @@ -340,7 +340,7 @@ def test_replace_phoneme_length(self):
dtype=numpy.int64,
),
)
self.assertEqual(yukarin_s_args["speaker_id"], 1)
self.assertEqual(yukarin_s_args["style_id"], 1)

# flatten_morasを使わずに愚直にaccent_phrasesにデータを反映させてみる
true_result = deepcopy(self.accent_phrases_hello_hiho)
Expand Down Expand Up @@ -368,13 +368,13 @@ def test_replace_mora_pitch(self):
empty_accent_phrases = []
self.assertEqual(
self.synthesis_engine.replace_mora_pitch(
accent_phrases=empty_accent_phrases, speaker_id=1
accent_phrases=empty_accent_phrases, style_id=1
),
[],
)

result = self.synthesis_engine.replace_mora_pitch(
accent_phrases=deepcopy(self.accent_phrases_hello_hiho), speaker_id=1
accent_phrases=deepcopy(self.accent_phrases_hello_hiho), style_id=1
)

# yukarin_saに渡される値の検証
Expand All @@ -393,7 +393,7 @@ def test_replace_mora_pitch(self):
self.assertEqual(list_length, len(end_accent_list))
self.assertEqual(list_length, len(start_accent_phrase_list))
self.assertEqual(list_length, len(end_accent_phrase_list))
self.assertEqual(yukarin_sa_args["speaker_id"], 1)
self.assertEqual(yukarin_sa_args["style_id"], 1)

numpy.testing.assert_array_equal(
vowel_phoneme_list,
Expand Down Expand Up @@ -512,7 +512,7 @@ def synthesis_test_base(self, audio_query: AudioQuery):
for i in range(len(phoneme_length_list)):
phoneme_length_list[i] /= audio_query.speedScale

result = self.synthesis_engine.synthesis(query=audio_query, speaker_id=1)
result = self.synthesis_engine.synthesis(query=audio_query, style_id=1)

# decodeに渡される値の検証
decode_args = self.decode_mock.call_args[1]
Expand Down Expand Up @@ -577,7 +577,7 @@ def synthesis_test_base(self, audio_query: AudioQuery):
assert_true_count += bool(phoneme[i][j] == decode_phoneme[i][j])
assert_phoneme_count += assert_true_count == num_phoneme
self.assertTrue(assert_phoneme_count >= int(len(decode_phoneme) / 5) * 4)
self.assertEqual(decode_args["speaker_id"], 1)
self.assertEqual(decode_args["style_id"], 1)

# decode forwarderのmockを使う
true_result = decode_mock(list_length, num_phoneme, f0, phoneme, 1)
Expand Down
10 changes: 5 additions & 5 deletions test/test_synthesis_engine_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def yukarin_sa_mock(
end_accent_list: numpy.ndarray,
start_accent_phrase_list: numpy.ndarray,
end_accent_phrase_list: numpy.ndarray,
speaker_id: numpy.ndarray,
style_id: numpy.ndarray,
):
result = []
# mockとしての適当な処理、特に意味はない
Expand All @@ -41,7 +41,7 @@ def yukarin_sa_mock(
+ end_accent_phrase_list[0][i]
)
* 0.0625
+ speaker_id
+ style_id
),
2,
)
Expand All @@ -54,7 +54,7 @@ def decode_mock(
phoneme_size: int,
f0: numpy.ndarray,
phoneme: numpy.ndarray,
speaker_id: Union[numpy.ndarray, int],
style_id: Union[numpy.ndarray, int],
):
result = []
# mockとしての適当な処理、特に意味はない
Expand All @@ -64,7 +64,7 @@ def decode_mock(
result.append(
float(
f0[i][0] * (numpy.where(phoneme[i] == 1)[0] / phoneme_size)
+ speaker_id
+ style_id
)
)
return numpy.array(result)
Expand Down Expand Up @@ -179,7 +179,7 @@ def metas(self):
def supported_devices(self):
return ""

def is_model_loaded(self, speaker_id):
def is_model_loaded(self, style_id):
return True


Expand Down
13 changes: 8 additions & 5 deletions voicevox_engine/cancellable_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from .model import AudioQuery
from .synthesis_engine import make_synthesis_engines
from .utility import get_latest_core_version
from run import id_checker


class CancellableEngine:
Expand Down Expand Up @@ -115,9 +116,10 @@ def finalize_con(
def _synthesis_impl(
self,
query: AudioQuery,
speaker_id: int,
style_id: Optional[int],
request: Request,
core_version: Optional[str],
speaker_id: Optional[int] = None,
) -> str:
"""
音声合成を行う関数
Expand All @@ -127,7 +129,7 @@ def _synthesis_impl(
Parameters
----------
query: AudioQuery
speaker_id: int
style_id: int
request: fastapi.Request
接続確立時に受け取ったものをそのまま渡せばよい
https://fastapi.tiangolo.com/advanced/using-request-directly/
Expand All @@ -138,10 +140,11 @@ def _synthesis_impl(
f_name: str
生成された音声ファイルの名前
"""
style_id = id_checker(style_id=style_id, speaker_id=speaker_id)
proc, sub_proc_con1 = self.procs_and_cons.get()
self.watch_con_list.append((request, proc))
try:
sub_proc_con1.send((query, speaker_id, core_version))
sub_proc_con1.send((query, style_id, core_version))
f_name = sub_proc_con1.recv()
except EOFError:
raise HTTPException(status_code=422, detail="既にサブプロセスは終了されています")
Expand Down Expand Up @@ -200,7 +203,7 @@ def start_synthesis_subprocess(
latest_core_version = get_latest_core_version(versions=synthesis_engines.keys())
while True:
try:
query, speaker_id, core_version = sub_proc_con.recv()
query, style_id, core_version = sub_proc_con.recv()
if core_version is None:
_engine = synthesis_engines[latest_core_version]
elif core_version in synthesis_engines:
Expand All @@ -209,7 +212,7 @@ def start_synthesis_subprocess(
# バージョンが見つからないエラー
sub_proc_con.send("")
continue
wave = _engine._synthesis_impl(query, speaker_id)
wave = _engine._synthesis_impl(query, style_id)
with NamedTemporaryFile(delete=False) as f:
soundfile.write(
file=f, data=wave, samplerate=query.outputSamplingRate, format="WAV"
Expand Down
Loading