Refactor: frame_per_mora による置き換え (#841)

tarepan · web-flow · commit 5d7562c51364 · 2023-12-10T06:44:35.000+09:00
diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py
@@ -20,6 +20,7 @@
     apply_prepost_silence,
     apply_speed_scale,
     apply_volume_scale,
+    calc_frame_per_mora,
     calc_frame_per_phoneme,
     calc_frame_phoneme,
     calc_frame_pitch,
@@ -353,24 +354,43 @@ def test_calc_frame_per_phoneme():
     assert numpy.array_equal(frame_per_phoneme, true_frame_per_phoneme)
 
 
+def test_calc_frame_per_mora():
+    """Check the per-mora frame counts returned by `calc_frame_per_mora`."""
+    # Inputs (every length is an integer multiple of `sec`)
+    sec = 0.01067  # 0.01067 [sec/frame]
+    moras = [
+        _gen_mora("　", None, None, "　", 2 * sec, 0.0),
+        _gen_mora("コ", "k", 2 * sec, "o", 4 * sec, 0.0),
+        _gen_mora("ン", None, None, "N", 4 * sec, 0.0),
+        _gen_mora("、", None, None, "pau", 2 * sec, 0.0),
+        _gen_mora("ヒ", "h", 2 * sec, "i", 4 * sec, 0.0),
+        _gen_mora("ホ", "h", 4 * sec, "O", 2 * sec, 0.0),
+        _gen_mora("　", None, None, "　", 6 * sec, 0.0),
+    ]
+
+    # Expects: consonant frames + vowel frames for each mora
+    #                     Pre ko  N pau hi hO Pst
+    expected = numpy.array([2, 6, 4, 2, 6, 6, 6], dtype=numpy.int32)
+
+    # Outputs
+    actual = numpy.array([calc_frame_per_mora(mora) for mora in moras])
+
+    assert numpy.array_equal(actual, expected)
+
+
 def test_calc_frame_pitch():
     """Test `test_calc_frame_pitch`."""
     # Inputs
     query = _gen_query(pitchScale=2.0, intonationScale=0.5)
     moras = [
-        _gen_mora("　", None, None, "　", 0.0, 0.0),
-        _gen_mora("コ", "k", 0.0, "o", 0.0, 50.0),
-        _gen_mora("ン", None, None, "N", 0.0, 50.0),
-        _gen_mora("、", None, None, "pau", 0.0, 0.0),
-        _gen_mora("ヒ", "h", 0.0, "i", 0.0, 125.0),
-        _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
-        _gen_mora("　", None, None, "　", 0.0, 0.0),
+        _gen_mora("　", None, None, "　", 1 * 0.01067, 0.0),
+        _gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 50.0),
+        _gen_mora("ン", None, None, "N", 2 * 0.01067, 50.0),
+        _gen_mora("、", None, None, "pau", 1 * 0.01067, 0.0),
+        _gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 125.0),
+        _gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0),
+        _gen_mora("　", None, None, "　", 3 * 0.01067, 0.0),
     ]
-    phoneme_str = "pau k o N pau h i h O pau"
-    phonemes = [OjtPhoneme(p) for p in phoneme_str.split()]
-    #                   Pre k  o  N pau h  i  h  O Pst
-    frame_per_phoneme = [1, 1, 2, 2, 1, 1, 2, 2, 1, 3]
-    frame_per_phoneme = numpy.array(frame_per_phoneme, dtype=numpy.int32)
 
     # Expects - x4 value scaled -> mean=300 var x0.5 intonation scaling
     #           pau   ko     ko     ko      N      N
@@ -382,7 +402,7 @@ def test_calc_frame_pitch():
     true_f0 = numpy.array(true1_f0 + true2_f0 + true3_f0, dtype=numpy.float32)
 
     # Outputs
-    f0 = calc_frame_pitch(query, moras, phonemes, frame_per_phoneme)
+    f0 = calc_frame_pitch(query, moras)
 
     assert numpy.array_equal(f0, true_f0)
 
@@ -461,7 +481,7 @@ def test_feat_to_framescale():
     # Outputs
     flatten_moras = apply_prepost_silence(flatten_moras, query)
     frame_per_phoneme = calc_frame_per_phoneme(query, flatten_moras)
-    f0 = calc_frame_pitch(query, flatten_moras, phoneme_data_list, frame_per_phoneme)
+    f0 = calc_frame_pitch(query, flatten_moras)
     frame_phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme)
 
     assert numpy.array_equal(frame_phoneme, true_frame_phoneme)
diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py
@@ -191,6 +191,29 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]):
     return frame_per_phoneme
 
 
+def _to_frame(sec: float) -> ndarray:
+    frames = numpy.round(sec * (24000 / 256))  # 93.75 [frame/sec]
+    return frames.astype(numpy.int32)
+
+
+def calc_frame_per_mora(mora: Mora) -> ndarray:
+    """
+    Compute the frame length of a single mora.
+    Parameters
+    ----------
+    mora : Mora
+        The mora to measure.
+    Returns
+    -------
+    frame_per_mora : NDArray[]
+        Frame length of the mora. Each phoneme length is rounded before summing.
+    """
+    # Convert each phoneme to frames on its own; the mora length is their sum
+    n_vowel = _to_frame(mora.vowel_length)
+    n_consonant = 0 if not mora.consonant else _to_frame(mora.consonant_length)
+    return n_vowel + n_consonant
+
+
 def apply_pitch_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]:
     """
     音高スケール（`pitchScale`）の適用
@@ -233,12 +256,7 @@ def apply_intonation_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]:
     return moras
 
 
-def calc_frame_pitch(
-    query: AudioQuery,
-    moras: List[Mora],
-    phonemes: List[OjtPhoneme],
-    frame_per_phoneme: numpy.ndarray,
-):
+def calc_frame_pitch(query: AudioQuery, moras: list[Mora]) -> ndarray:
     """
     フレームごとのピッチの生成
     Parameters
@@ -247,10 +265,6 @@ def calc_frame_pitch(
         音声合成クエリ
     moras : List[Mora]
         モーラ列
-    phonemes : List[OjtPhoneme]
-        音素列
-    frame_per_phoneme: NDArray
-        音素あたりのフレーム長。端数丸め。
     Returns
     -------
     frame_f0 : NDArray[]
@@ -265,10 +279,7 @@ def calc_frame_pitch(
 
     # Rescale: 時間スケールの変更（モーラ -> フレーム）
     # 母音インデックスに基づき "音素あたりのフレーム長" を "モーラあたりのフレーム長" に集約
-    vowel_indexes = numpy.array(split_mora(phonemes)[2])
-    frame_per_mora = [
-        a.sum() for a in numpy.split(frame_per_phoneme, vowel_indexes[:-1] + 1)
-    ]
+    frame_per_mora = numpy.array(list(map(calc_frame_per_mora, moras)))
     frame_f0 = numpy.repeat(f0, frame_per_mora)
     return frame_f0
 
@@ -619,9 +630,7 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int):
 
         flatten_moras = apply_prepost_silence(flatten_moras, query)
         frame_per_phoneme = calc_frame_per_phoneme(query, flatten_moras)
-        f0 = calc_frame_pitch(
-            query, flatten_moras, phoneme_data_list, frame_per_phoneme
-        )
+        f0 = calc_frame_pitch(query, flatten_moras)
         phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme)
 
         # 今まで生成された情報をdecode_forwardにかけ、推論器によって音声波形を生成する