From a8603d414b27306f8f84ea8fbf407fb2fbd10fb2 Mon Sep 17 00:00:00 2001 From: tarepan Date: Wed, 6 Dec 2023 10:07:30 +0000 Subject: [PATCH] =?UTF-8?q?Refactor:=20`apply=5Fspeed`=20=E3=81=AE?= =?UTF-8?q?=E5=88=87=E3=82=8A=E5=87=BA=E3=81=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_synthesis_engine.py | 28 +++++++++++++++++++ .../synthesis_engine/synthesis_engine.py | 28 ++++++++++++++++--- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py index 1d29cad09..68094f15b 100644 --- a/test/test_synthesis_engine.py +++ b/test/test_synthesis_engine.py @@ -15,6 +15,7 @@ from voicevox_engine.synthesis_engine.synthesis_engine import ( apply_intonation, apply_pitch, + apply_speed, apply_volume, calc_frame_per_phoneme, calc_frame_phoneme, @@ -194,6 +195,33 @@ def test_pad_with_silence(): assert moras_with_silence == true_moras_with_silence +def test_apply_speed(): + """Test `apply_speed`.""" + # Inputs + query = _gen_query(speedScale=2.0) + input_moras = [ + _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0), + _gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0), + _gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), + _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0), + _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), + ] + + # Expects - x2 fast + true_moras = [ + _gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 50.0), + _gen_mora("ン", None, None, "N", 2 * 0.01067, 50.0), + _gen_mora("、", None, None, "pau", 1 * 0.01067, 0.0), + _gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 125.0), + _gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0), + ] + + # Outputs + moras = apply_speed(input_moras, query) + + assert moras == true_moras + + def test_apply_pitch(): """Test `apply_pitch`.""" # Inputs diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py index 6dcabe62c..0c2bbe25c 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine.py +++ b/voicevox_engine/synthesis_engine/synthesis_engine.py @@ -135,6 +135,27 @@ def pad_with_silence(moras: list[Mora], query: AudioQuery) -> list[Mora]: return moras +def apply_speed(moras: list[Mora], query: AudioQuery) -> list[Mora]: + """ + 話速スケール(`speedScale`)の適用 + Parameters + ---------- + moras : list[Mora] + モーラ系列 + query : AudioQuery + 音声合成クエリ + Returns + ------- + moras : list[Mora] + 話速スケールが適用されたモーラ系列 + """ + for mora in moras: + mora.vowel_length /= query.speedScale + if mora.consonant_length: + mora.consonant_length /= query.speedScale + return moras + + def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]): """ 音素あたりのフレーム長を算出 @@ -149,6 +170,9 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]): frame_per_phoneme : NDArray[] 音素あたりのフレーム長。端数丸め。 """ + # Apply: グローバル特徴量による補正(話速) + moras = apply_speed(moras, query) + # 音素あたりの継続長 sec_per_phoneme = numpy.array( [ @@ -161,10 +185,6 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]): ], dtype=numpy.float32, ) - - # 話速による継続長の補正 - sec_per_phoneme /= query.speedScale - # 音素あたりのフレーム長。端数丸め。 framerate = 24000 / 256 # framerate 93.75 [frame/sec] frame_per_phoneme = numpy.round(sec_per_phoneme * framerate).astype(numpy.int32)