Refactor: グローバル特徴量適用の関数化

VOICEVOX · Dec 6, 2023 · 74e4b77 · 74e4b77
1 parent 2cb552a
commit 74e4b77
Show file tree

Hide file tree

Showing 2 changed files with 140 additions and 12 deletions.
diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py
@@ -13,6 +13,9 @@
 
 # TODO: import from voicevox_engine.synthesis_engine.mora
 from voicevox_engine.synthesis_engine.synthesis_engine import (
+    apply_pitch,
+    apply_intonation,
+    apply_volume,
     calc_frame_per_phoneme,
     calc_frame_phoneme,
     calc_frame_pitch,
@@ -169,6 +172,75 @@ def _gen_mora(
     )
 
 
+def test_apply_pitch():
+    """Check that `apply_pitch` scales every mora pitch by 2**pitchScale."""
+    # Each row: shared mora arguments, input pitch, expected pitch.
+    # pitchScale=2.0 means each pitch is multiplied by 2**2.0 = 4.
+    cases = [
+        (("コ", "k", 0.0, "o", 0.0), 50.0, 200.0),
+        (("ン", None, None, "N", 0.0), 50.0, 200.0),
+        (("、", None, None, "pau", 0.0), 0.0, 0.0),
+        (("ヒ", "h", 0.0, "i", 0.0), 125.0, 500.0),
+        (("ホ", "h", 0.0, "O", 0.0), 0.0, 0.0),
+    ]
+    query = _gen_query(pitchScale=2.0)
+    source_moras = [_gen_mora(*head, before) for head, before, _ in cases]
+    expected_moras = [_gen_mora(*head, after) for head, _, after in cases]
+
+    scaled_moras = apply_pitch(source_moras, query)
+
+    assert scaled_moras == expected_moras
+
+
+def test_apply_intonation():
+    """Check that `apply_intonation` halves each voiced pitch's offset from the mean."""
+    # Each row: shared mora arguments, input pitch, expected pitch.
+    # The voiced pitches (200, 200, 500) average to 300; with
+    # intonationScale=0.5 their distance from 300 is halved, while
+    # unvoiced moras (pitch 0) stay at 0.
+    cases = [
+        (("コ", "k", 0.0, "o", 0.0), 200.0, 250.0),
+        (("ン", None, None, "N", 0.0), 200.0, 250.0),
+        (("、", None, None, "pau", 0.0), 0.0, 0.0),
+        (("ヒ", "h", 0.0, "i", 0.0), 500.0, 400.0),
+        (("ホ", "h", 0.0, "O", 0.0), 0.0, 0.0),
+    ]
+    query = _gen_query(intonationScale=0.5)
+    source_moras = [_gen_mora(*head, before) for head, before, _ in cases]
+    expected_moras = [_gen_mora(*head, after) for head, _, after in cases]
+
+    scaled_moras = apply_intonation(source_moras, query)
+
+    assert scaled_moras == expected_moras
+
+
+def test_apply_volume():
+    """Check that `apply_volume` multiplies the waveform by `volumeScale`."""
+    gain = 3.0
+    source_wave = numpy.array([0.0, 1.0, 2.0, 0.0])
+    # Every sample is expected to be multiplied by the gain.
+    expected_wave = numpy.array([0.0, 3.0, 6.0, 0.0])
+
+    scaled_wave = apply_volume(source_wave, _gen_query(volumeScale=gain))
+
+    assert numpy.allclose(scaled_wave, expected_wave)
+
+
 def test_calc_frame_per_phoneme():
     """Test `calc_frame_per_phoneme`."""
     # Inputs

diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py
@@ -1,3 +1,4 @@
+import math
 import threading
 from itertools import chain
 from typing import List, Optional, Tuple
@@ -149,6 +150,48 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]):
     return frame_per_phoneme
 
 
+def apply_pitch(moras: list[Mora], query: AudioQuery) -> list[Mora]:
+    """
+    Apply the pitch scale (`pitchScale`).
+    The input moras are left untouched; scaled copies are returned.
+    Parameters
+    ----------
+    moras : list[Mora]
+        Mora sequence
+    query : AudioQuery
+        Audio synthesis query
+    Returns
+    -------
+    moras : list[Mora]
+        Copies of the moras with the pitch scale applied
+    """
+    from copy import deepcopy
+
+    # Work on copies: scaling the caller's objects in place would compound
+    # the scale every time the same moras are processed again.
+    moras = deepcopy(moras)
+    factor = 2**query.pitchScale
+    for mora in moras:
+        mora.pitch *= factor
+    return moras
+
+
+def apply_intonation(moras: list[Mora], query: AudioQuery) -> list[Mora]:
+    """
+    Apply the intonation scale (`intonationScale`).
+    The input moras are left untouched; scaled copies are returned.
+    Parameters
+    ----------
+    moras : list[Mora]
+        Mora sequence
+    query : AudioQuery
+        Audio synthesis query
+    Returns
+    -------
+    moras : list[Mora]
+        Copies of the moras with the intonation scale applied
+    """
+    from copy import deepcopy
+
+    # Work on copies: scaling the caller's objects in place would compound
+    # the scale every time the same moras are processed again.
+    moras = deepcopy(moras)
+
+    # Scale each voiced mora's (pitch > 0) deviation from the voiced mean.
+    voiced = [mora for mora in moras if mora.pitch > 0]
+    mean_f0 = numpy.mean([mora.pitch for mora in voiced]).item()
+    # An empty list yields NaN. NaN never compares equal to anything, itself
+    # included, so `!= math.nan` would always be true; test with isnan.
+    if not math.isnan(mean_f0):
+        for mora in voiced:
+            mora.pitch = (mora.pitch - mean_f0) * query.intonationScale + mean_f0
+    return moras
+
+
 def calc_frame_pitch(
     query: AudioQuery,
     moras: List[Mora],
@@ -172,19 +215,14 @@ def calc_frame_pitch(
     frame_f0 : NDArray[]
         フレームごとの基本周波数系列
     """
+    # Apply: グローバル特徴量による補正（音高、抑揚）
+    moras = apply_pitch(moras, query)
+    moras = apply_intonation(moras, query)
+
     # TODO: Better function name (c.f. VOICEVOX/voicevox_engine#790)
     # モーラ（前後の無音含む）ごとの基本周波数
     f0 = numpy.array([0] + [mora.pitch for mora in moras] + [0], dtype=numpy.float32)
 
-    # 音高スケールによる補正
-    f0 *= 2**query.pitchScale
-
-    # 抑揚スケールによる補正。有声音素 (f0>0) の平均値に対する乖離度をスケール
-    voiced = f0 > 0
-    mean_f0 = f0[voiced].mean()
-    if not numpy.isnan(mean_f0):
-        f0[voiced] = (f0[voiced] - mean_f0) * query.intonationScale + mean_f0
-
     # フレームごとのピッチ化
     # 母音インデックスに基づき "音素あたりのフレーム長" を "モーラあたりのフレーム長" に集約
     vowel_indexes = numpy.array(split_mora(phonemes)[2])
@@ -196,6 +234,24 @@ def calc_frame_pitch(
     return frame_f0
 
 
+def apply_volume(wave: numpy.ndarray, query: AudioQuery) -> numpy.ndarray:
+    """
+    Apply the volume scale (`volumeScale`).
+    Parameters
+    ----------
+    wave : numpy.ndarray
+        Audio waveform; scaled in place
+    query : AudioQuery
+        Audio synthesis query
+    Returns
+    -------
+    wave : numpy.ndarray
+        The same array, with the volume scale applied
+    """
+    # Write the product back into `wave` so the gain is applied in place.
+    return numpy.multiply(wave, query.volumeScale, out=wave)
+
+
 def calc_frame_phoneme(phonemes: List[OjtPhoneme], frame_per_phoneme: numpy.ndarray):
     """
     フレームごとの音素列の生成（onehot化 + フレーム化）
@@ -489,9 +545,9 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int):
                 phoneme=phoneme,
                 style_id=numpy.array(style_id, dtype=numpy.int64).reshape(-1),
             )
-
-        # volume: ゲイン適用
-        wave *= query.volumeScale
+ 
+        # Apply: グローバル特徴量による補正（音量）
+        wave = apply_volume(wave, query)
 
         # 出力サンプリングレートがデフォルト(decode forwarderによるもの、24kHz)でなければ、それを適用する
         if query.outputSamplingRate != self.default_sampling_rate: