From a8603d414b27306f8f84ea8fbf407fb2fbd10fb2 Mon Sep 17 00:00:00 2001
From: tarepan <tarepan5884@gmail.com>
Date: Wed, 6 Dec 2023 10:07:30 +0000
Subject: [PATCH] =?UTF-8?q?Refactor:=20`apply=5Fspeed`=20=E3=81=AE?=
 =?UTF-8?q?=E5=88=87=E3=82=8A=E5=87=BA=E3=81=97?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 test/test_synthesis_engine.py                 | 28 +++++++++++++++++++
 .../synthesis_engine/synthesis_engine.py      | 28 ++++++++++++++++---
 2 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py
index 1d29cad09..68094f15b 100644
--- a/test/test_synthesis_engine.py
+++ b/test/test_synthesis_engine.py
@@ -15,6 +15,7 @@
 from voicevox_engine.synthesis_engine.synthesis_engine import (
     apply_intonation,
     apply_pitch,
+    apply_speed,
     apply_volume,
     calc_frame_per_phoneme,
     calc_frame_phoneme,
@@ -194,6 +195,33 @@ def test_pad_with_silence():
     assert moras_with_silence == true_moras_with_silence
 
 
+def test_apply_speed():
+    """Check that `apply_speed` halves every mora length when `speedScale` is 2."""
+    unit = 0.01067  # every length in the fixtures below is a multiple of this
+    double_speed_query = _gen_query(speedScale=2.0)
+    # Fixture: some entries pass None for the consonant and its length
+    source_moras = [
+        _gen_mora("コ", "k", 2 * unit, "o", 4 * unit, 50.0),
+        _gen_mora("ン", None, None, "N", 4 * unit, 50.0),
+        _gen_mora("、", None, None, "pau", 2 * unit, 0.0),
+        _gen_mora("ヒ", "h", 2 * unit, "i", 4 * unit, 125.0),
+        _gen_mora("ホ", "h", 4 * unit, "O", 2 * unit, 0.0),
+    ]
+
+    # Expectation: the same moras with every length divided by two
+    expected_moras = [
+        _gen_mora("コ", "k", 1 * unit, "o", 2 * unit, 50.0),
+        _gen_mora("ン", None, None, "N", 2 * unit, 50.0),
+        _gen_mora("、", None, None, "pau", 1 * unit, 0.0),
+        _gen_mora("ヒ", "h", 1 * unit, "i", 2 * unit, 125.0),
+        _gen_mora("ホ", "h", 2 * unit, "O", 1 * unit, 0.0),
+    ]
+
+    # Run the function under test and compare against the expectation
+    scaled_moras = apply_speed(source_moras, double_speed_query)
+    assert scaled_moras == expected_moras
+
+
 def test_apply_pitch():
     """Test `apply_pitch`."""
     # Inputs
diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py
index 6dcabe62c..0c2bbe25c 100644
--- a/voicevox_engine/synthesis_engine/synthesis_engine.py
+++ b/voicevox_engine/synthesis_engine/synthesis_engine.py
@@ -135,6 +135,27 @@ def pad_with_silence(moras: list[Mora], query: AudioQuery) -> list[Mora]:
     return moras
 
 
+def apply_speed(moras: list[Mora], query: AudioQuery) -> list[Mora]:
+    """
+    Apply the speech-rate scale (`speedScale`) to the moras, in place.
+    Parameters
+    ----------
+    moras : list[Mora]
+        Mora sequence whose vowel and consonant lengths are rescaled
+    query : AudioQuery
+        Audio synthesis query supplying `speedScale`
+    Returns
+    -------
+    moras : list[Mora]
+        The same list object, whose moras now carry the scaled lengths
+    """
+    for mora in moras:
+        mora.vowel_length = mora.vowel_length / query.speedScale
+        if mora.consonant_length:
+            mora.consonant_length = mora.consonant_length / query.speedScale
+    return moras
+
+
 def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]):
     """
     音素あたりのフレーム長を算出
@@ -149,6 +170,9 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]):
     frame_per_phoneme : NDArray[]
         音素あたりのフレーム長。端数丸め。
     """
+    # Apply: グローバル特徴量による補正（話速）
+    moras = apply_speed(moras, query)
+
     # 音素あたりの継続長
     sec_per_phoneme = numpy.array(
         [
@@ -161,10 +185,6 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]):
         ],
         dtype=numpy.float32,
     )
-
-    # 話速による継続長の補正
-    sec_per_phoneme /= query.speedScale
-
     # 音素あたりのフレーム長。端数丸め。
     framerate = 24000 / 256  # framerate 93.75 [frame/sec]
     frame_per_phoneme = numpy.round(sec_per_phoneme * framerate).astype(numpy.int32)