diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py index 3c38830d0..a5536bb15 100644 --- a/test/test_synthesis_engine.py +++ b/test/test_synthesis_engine.py @@ -13,8 +13,8 @@ # TODO: import from voicevox_engine.synthesis_engine.mora from voicevox_engine.synthesis_engine.synthesis_engine import ( - apply_pitch, apply_intonation, + apply_pitch, apply_volume, calc_frame_per_phoneme, calc_frame_phoneme, @@ -230,10 +230,10 @@ def test_apply_volume(): """Test `apply_volume`.""" # Inputs query = _gen_query(volumeScale=3.0) - input_wave = numpy.array([0.0, 1.0, 2.0, 0.0,]) + input_wave = numpy.array([0.0, 1.0, 2.0]) # Expects - x3 scale - true_wave = numpy.array([0.0, 3.0, 6.0, 0.0,]) + true_wave = numpy.array([0.0, 3.0, 6.0]) # Outputs wave = apply_volume(input_wave, query) diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py index 07def1048..9bb139ae0 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine.py +++ b/voicevox_engine/synthesis_engine/synthesis_engine.py @@ -219,17 +219,17 @@ def calc_frame_pitch( moras = apply_pitch(moras, query) moras = apply_intonation(moras, query) + # Convert: Core入力形式への変換(スカラ系列) # TODO: Better function name (c.f. VOICEVOX/voicevox_engine#790) # モーラ(前後の無音含む)ごとの基本周波数 f0 = numpy.array([0] + [mora.pitch for mora in moras] + [0], dtype=numpy.float32) - # フレームごとのピッチ化 + # Rescale: 時間スケールの変更(モーラ -> フレーム) # 母音インデックスに基づき "音素あたりのフレーム長" を "モーラあたりのフレーム長" に集約 vowel_indexes = numpy.array(split_mora(phonemes)[2]) frame_per_mora = [ a.sum() for a in numpy.split(frame_per_phoneme, vowel_indexes[:-1] + 1) ] - # モーラの基本周波数を子音・母音に割当てフレーム化 frame_f0 = numpy.repeat(f0, frame_per_mora) return frame_f0 @@ -267,7 +267,10 @@ def calc_frame_phoneme(phonemes: List[OjtPhoneme], frame_per_phoneme: numpy.ndar フレームごとの音素系列 """ # TODO: Better function name (c.f. VOICEVOX/voicevox_engine#790) + # Convert: Core入力形式への変換(onehotベクトル系列) onehot_phoneme = numpy.stack([p.onehot for p in phonemes]) + + # Rescale: 時間スケールの変更(音素 -> フレーム) frame_phoneme = numpy.repeat(onehot_phoneme, frame_per_phoneme, axis=0) return frame_phoneme @@ -545,7 +548,7 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int): phoneme=phoneme, style_id=numpy.array(style_id, dtype=numpy.int64).reshape(-1), ) - + # Apply: グローバル特徴量による補正(音量) wave = apply_volume(wave, query)