Skip to content

Commit

Permalink
Refactor: グローバル特徴量適用の関数化
Browse files Browse the repository at this point in the history
  • Loading branch information
tarepan committed Dec 6, 2023
1 parent 2cb552a commit 74e4b77
Show file tree
Hide file tree
Showing 2 changed files with 140 additions and 12 deletions.
72 changes: 72 additions & 0 deletions test/test_synthesis_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@

# TODO: import from voicevox_engine.synthesis_engine.mora
from voicevox_engine.synthesis_engine.synthesis_engine import (
apply_pitch,
apply_intonation,
apply_volume,
calc_frame_per_phoneme,
calc_frame_phoneme,
calc_frame_pitch,
Expand Down Expand Up @@ -169,6 +172,75 @@ def _gen_mora(
)


def test_apply_pitch():
    """Check that `apply_pitch` scales every mora pitch by 2 ** pitchScale."""
    # Leading `_gen_mora` arguments shared by the input and expected moras;
    # only the final argument (the pitch value under test) differs.
    shared_args = [
        ("コ", "k", 0.0, "o", 0.0),
        ("ン", None, None, "N", 0.0),
        ("、", None, None, "pau", 0.0),
        ("ヒ", "h", 0.0, "i", 0.0),
        ("ホ", "h", 0.0, "O", 0.0),
    ]
    pitches_before = [50.0, 50.0, 0.0, 125.0, 0.0]
    # pitchScale=2.0 -> every pitch is multiplied by 2 ** 2.0 == 4
    pitches_after = [200.0, 200.0, 0.0, 500.0, 0.0]

    # Inputs
    scaling_query = _gen_query(pitchScale=2.0)
    source_moras = [
        _gen_mora(*args, pitch) for args, pitch in zip(shared_args, pitches_before)
    ]

    # Expected result
    expected_moras = [
        _gen_mora(*args, pitch) for args, pitch in zip(shared_args, pitches_after)
    ]

    # Actual result
    result = apply_pitch(source_moras, scaling_query)

    assert result == expected_moras


def test_apply_intonation():
    """Check that `apply_intonation` scales voiced pitches around their mean."""
    # Leading `_gen_mora` arguments shared by the input and expected moras;
    # only the final argument (the pitch value under test) differs.
    shared_args = [
        ("コ", "k", 0.0, "o", 0.0),
        ("ン", None, None, "N", 0.0),
        ("、", None, None, "pau", 0.0),
        ("ヒ", "h", 0.0, "i", 0.0),
        ("ホ", "h", 0.0, "O", 0.0),
    ]
    pitches_before = [200.0, 200.0, 0.0, 500.0, 0.0]
    # Voiced pitches (200, 200, 500) have mean 300; with intonationScale=0.5
    # each deviation from that mean is halved. Zero pitches stay at zero.
    pitches_after = [250.0, 250.0, 0.0, 400.0, 0.0]

    # Inputs
    scaling_query = _gen_query(intonationScale=0.5)
    source_moras = [
        _gen_mora(*args, pitch) for args, pitch in zip(shared_args, pitches_before)
    ]

    # Expected result
    expected_moras = [
        _gen_mora(*args, pitch) for args, pitch in zip(shared_args, pitches_after)
    ]

    # Actual result
    result = apply_intonation(source_moras, scaling_query)

    assert result == expected_moras


def test_apply_volume():
    """Check that `apply_volume` multiplies the waveform by volumeScale."""
    # Inputs
    scaling_query = _gen_query(volumeScale=3.0)
    source_wave = numpy.array([0.0, 1.0, 2.0, 0.0])

    # Expected result: every sample multiplied by 3
    expected_wave = numpy.array([0.0, 3.0, 6.0, 0.0])

    # Actual result
    result = apply_volume(source_wave, scaling_query)

    is_close = numpy.allclose(result, expected_wave)
    assert is_close


def test_calc_frame_per_phoneme():
"""Test `calc_frame_per_phoneme`."""
# Inputs
Expand Down
80 changes: 68 additions & 12 deletions voicevox_engine/synthesis_engine/synthesis_engine.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import math
import threading
from itertools import chain
from typing import List, Optional, Tuple
Expand Down Expand Up @@ -149,6 +150,48 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]):
return frame_per_phoneme


def apply_pitch(moras: list[Mora], query: AudioQuery) -> list[Mora]:
    """
    Apply the pitch scale (`pitchScale`).

    Each mora's `pitch` is multiplied by ``2 ** query.pitchScale``. The mora
    objects are updated in place and the same list is returned.

    Parameters
    ----------
    moras : list[Mora]
        Mora sequence
    query : AudioQuery
        Audio synthesis query
    Returns
    -------
    moras : list[Mora]
        Mora sequence with the pitch scale applied
    """
    for target in moras:
        # `**` binds tighter than `*`: pitch * (2 ** pitchScale)
        target.pitch = target.pitch * 2**query.pitchScale
    return moras


def apply_intonation(moras: list[Mora], query: AudioQuery) -> list[Mora]:
    """
    Apply the intonation scale (`intonationScale`).

    For every voiced mora (pitch > 0), the deviation of its pitch from the
    mean voiced pitch is multiplied by `query.intonationScale`. Moras whose
    pitch is not greater than 0 are left untouched. The mora objects are
    updated in place and the same list is returned.

    Parameters
    ----------
    moras : list[Mora]
        Mora sequence
    query : AudioQuery
        Audio synthesis query
    Returns
    -------
    moras : list[Mora]
        Mora sequence with the intonation scale applied
    """
    # Only voiced moras (f0 > 0) take part in the mean and in the scaling.
    voiced = [mora for mora in moras if mora.pitch > 0]

    # With no voiced mora there is nothing to scale. Returning here also
    # avoids taking the mean of an empty list, which numpy reports as NaN
    # together with a RuntimeWarning. Note that a `mean != math.nan` check
    # cannot serve as this guard: NaN compares unequal to every value,
    # itself included, so such a comparison is always true.
    if not voiced:
        return moras

    mean_f0 = numpy.mean([mora.pitch for mora in voiced]).item()
    for mora in voiced:
        # Scale the deviation from the mean, then shift back onto the mean.
        mora.pitch = (mora.pitch - mean_f0) * query.intonationScale + mean_f0
    return moras


def calc_frame_pitch(
query: AudioQuery,
moras: List[Mora],
Expand All @@ -172,19 +215,14 @@ def calc_frame_pitch(
frame_f0 : NDArray[]
フレームごとの基本周波数系列
"""
# Apply: グローバル特徴量による補正(音高、抑揚)
moras = apply_pitch(moras, query)
moras = apply_intonation(moras, query)

# TODO: Better function name (c.f. VOICEVOX/voicevox_engine#790)
# モーラ(前後の無音含む)ごとの基本周波数
f0 = numpy.array([0] + [mora.pitch for mora in moras] + [0], dtype=numpy.float32)

# 音高スケールによる補正
f0 *= 2**query.pitchScale

# 抑揚スケールによる補正。有声音素 (f0>0) の平均値に対する乖離度をスケール
voiced = f0 > 0
mean_f0 = f0[voiced].mean()
if not numpy.isnan(mean_f0):
f0[voiced] = (f0[voiced] - mean_f0) * query.intonationScale + mean_f0

# フレームごとのピッチ化
# 母音インデックスに基づき "音素あたりのフレーム長" を "モーラあたりのフレーム長" に集約
vowel_indexes = numpy.array(split_mora(phonemes)[2])
Expand All @@ -196,6 +234,24 @@ def calc_frame_pitch(
return frame_f0


def apply_volume(wave: numpy.ndarray, query: AudioQuery) -> numpy.ndarray:
    """
    Apply the volume scale (`volumeScale`).

    The waveform is multiplied by `query.volumeScale` in place, and the same
    array object is returned.

    Parameters
    ----------
    wave : numpy.ndarray
        Audio waveform
    query : AudioQuery
        Audio synthesis query
    Returns
    -------
    wave : numpy.ndarray
        Audio waveform with the volume scale applied
    """
    gain = query.volumeScale
    # Augmented assignment keeps the multiplication in place on `wave`.
    wave *= gain
    return wave


def calc_frame_phoneme(phonemes: List[OjtPhoneme], frame_per_phoneme: numpy.ndarray):
"""
フレームごとの音素列の生成(onehot化 + フレーム化)
Expand Down Expand Up @@ -489,9 +545,9 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int):
phoneme=phoneme,
style_id=numpy.array(style_id, dtype=numpy.int64).reshape(-1),
)

# volume: ゲイン適用
wave *= query.volumeScale
# Apply: グローバル特徴量による補正(音量)
wave = apply_volume(wave, query)

# 出力サンプリングレートがデフォルト(decode forwarderによるもの、24kHz)でなければ、それを適用する
if query.outputSamplingRate != self.default_sampling_rate:
Expand Down

0 comments on commit 74e4b77

Please sign in to comment.