Refactor: apply_speed の切り出し

VOICEVOX · Dec 6, 2023 · a8603d4 · a8603d4
1 parent 1c108f3
commit a8603d4
Show file tree

Hide file tree

Showing 2 changed files with 52 additions and 4 deletions.
diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py
@@ -15,6 +15,7 @@
 from voicevox_engine.synthesis_engine.synthesis_engine import (
     apply_intonation,
     apply_pitch,
+    apply_speed,
     apply_volume,
     calc_frame_per_phoneme,
     calc_frame_phoneme,
@@ -194,6 +195,33 @@ def test_pad_with_silence():
     assert moras_with_silence == true_moras_with_silence
 
 
+def test_apply_speed():
+    """Check that `apply_speed` halves every mora length when speedScale is 2.0."""
+    # Inputs: speedScale=2.0 and moras whose length values are multiples of 0.01067
+    query = _gen_query(speedScale=2.0)
+    input_moras = [
+        _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0),
+        _gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0),
+        _gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
+        _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0),
+        _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
+    ]
+
+    # Expected: every length value halved (x2 faster); all other arguments unchanged
+    true_moras = [
+        _gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 50.0),
+        _gen_mora("ン", None, None, "N", 2 * 0.01067, 50.0),
+        _gen_mora("、", None, None, "pau", 1 * 0.01067, 0.0),
+        _gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 125.0),
+        _gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0),
+    ]
+
+    # Outputs: compared with ==, which holds because dividing a float by 2.0 is exact
+    moras = apply_speed(input_moras, query)
+
+    assert moras == true_moras
+
+
 def test_apply_pitch():
     """Test `apply_pitch`."""
     # Inputs

diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py
@@ -135,6 +135,27 @@ def pad_with_silence(moras: list[Mora], query: AudioQuery) -> list[Mora]:
     return moras
 
 
+def apply_speed(moras: list[Mora], query: AudioQuery) -> list[Mora]:
+    """
+    Apply the speech-rate scale (`speedScale`); the moras are modified in place.
+    Parameters
+    ----------
+    moras : list[Mora]
+        Mora sequence
+    query : AudioQuery
+        Audio synthesis query
+    Returns
+    -------
+    moras : list[Mora]
+        The same mora sequence, with the speech-rate scale applied
+    """
+    for mora in moras:
+        mora.vowel_length /= query.speedScale
+        if mora.consonant_length:  # truthiness test: skipped when None or 0
+            mora.consonant_length /= query.speedScale
+    return moras
+
+
 def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]):
     """
     音素あたりのフレーム長を算出
@@ -149,6 +170,9 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]):
     frame_per_phoneme : NDArray[]
         音素あたりのフレーム長。端数丸め。
     """
+    # Apply: グローバル特徴量による補正（話速）
+    moras = apply_speed(moras, query)
+
     # 音素あたりの継続長
     sec_per_phoneme = numpy.array(
         [
@@ -161,10 +185,6 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]):
         ],
         dtype=numpy.float32,
     )
-
-    # 話速による継続長の補正
-    sec_per_phoneme /= query.speedScale
-
     # 音素あたりのフレーム長。端数丸め。
     framerate = 24000 / 256  # framerate 93.75 [frame/sec]
     frame_per_phoneme = numpy.round(sec_per_phoneme * framerate).astype(numpy.int32)