Skip to content

Commit 5d7562c

Browse files
authored
Refactor: frame_per_mora による置き換え (#841)
1 parent d0a596d commit 5d7562c

File tree

2 files changed

+60
-31
lines changed

2 files changed

+60
-31
lines changed

test/test_synthesis_engine.py

+34-14
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
apply_prepost_silence,
2121
apply_speed_scale,
2222
apply_volume_scale,
23+
calc_frame_per_mora,
2324
calc_frame_per_phoneme,
2425
calc_frame_phoneme,
2526
calc_frame_pitch,
@@ -353,24 +354,43 @@ def test_calc_frame_per_phoneme():
353354
assert numpy.array_equal(frame_per_phoneme, true_frame_per_phoneme)
354355

355356

357+
def test_calc_frame_per_mora():
    """Test `calc_frame_per_mora`."""
    # Inputs: phoneme lengths are whole multiples of 0.01067 [sec/frame]
    moras = [
        _gen_mora(" ", None, None, " ", 2 * 0.01067, 0.0),
        _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 0.0),
        _gen_mora("ン", None, None, "N", 4 * 0.01067, 0.0),
        _gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
        _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 0.0),
        _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
        _gen_mora(" ", None, None, " ", 6 * 0.01067, 0.0),
    ]

    # Expects: consonant frames + vowel frames for each mora
    #                      Pre ko  N pau hi hO Pst
    true_frame_per_mora = [2, 6, 4, 2, 6, 6, 6]
    true_frame_per_mora = numpy.array(true_frame_per_mora, dtype=numpy.int32)

    # Outputs
    frame_per_mora = numpy.array([calc_frame_per_mora(mora) for mora in moras])

    assert numpy.array_equal(frame_per_mora, true_frame_per_mora)
379+
380+
356381
def test_calc_frame_pitch():
357382
"""Test `calc_frame_pitch`."""
358383
# Inputs
359384
query = _gen_query(pitchScale=2.0, intonationScale=0.5)
360385
moras = [
361-
_gen_mora(" ", None, None, " ", 0.0, 0.0),
362-
_gen_mora("コ", "k", 0.0, "o", 0.0, 50.0),
363-
_gen_mora("ン", None, None, "N", 0.0, 50.0),
364-
_gen_mora("、", None, None, "pau", 0.0, 0.0),
365-
_gen_mora("ヒ", "h", 0.0, "i", 0.0, 125.0),
366-
_gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
367-
_gen_mora(" ", None, None, " ", 0.0, 0.0),
386+
_gen_mora(" ", None, None, " ", 1 * 0.01067, 0.0),
387+
_gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 50.0),
388+
_gen_mora("ン", None, None, "N", 2 * 0.01067, 50.0),
389+
_gen_mora("、", None, None, "pau", 1 * 0.01067, 0.0),
390+
_gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 125.0),
391+
_gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0),
392+
_gen_mora(" ", None, None, " ", 3 * 0.01067, 0.0),
368393
]
369-
phoneme_str = "pau k o N pau h i h O pau"
370-
phonemes = [OjtPhoneme(p) for p in phoneme_str.split()]
371-
# Pre k o N pau h i h O Pst
372-
frame_per_phoneme = [1, 1, 2, 2, 1, 1, 2, 2, 1, 3]
373-
frame_per_phoneme = numpy.array(frame_per_phoneme, dtype=numpy.int32)
374394

375395
# Expects - x4 value scaled -> mean=300 var x0.5 intonation scaling
376396
# pau ko ko ko N N
@@ -382,7 +402,7 @@ def test_calc_frame_pitch():
382402
true_f0 = numpy.array(true1_f0 + true2_f0 + true3_f0, dtype=numpy.float32)
383403

384404
# Outputs
385-
f0 = calc_frame_pitch(query, moras, phonemes, frame_per_phoneme)
405+
f0 = calc_frame_pitch(query, moras)
386406

387407
assert numpy.array_equal(f0, true_f0)
388408

@@ -461,7 +481,7 @@ def test_feat_to_framescale():
461481
# Outputs
462482
flatten_moras = apply_prepost_silence(flatten_moras, query)
463483
frame_per_phoneme = calc_frame_per_phoneme(query, flatten_moras)
464-
f0 = calc_frame_pitch(query, flatten_moras, phoneme_data_list, frame_per_phoneme)
484+
f0 = calc_frame_pitch(query, flatten_moras)
465485
frame_phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme)
466486

467487
assert numpy.array_equal(frame_phoneme, true_frame_phoneme)

voicevox_engine/synthesis_engine/synthesis_engine.py

+26-17
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,29 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]):
191191
return frame_per_phoneme
192192

193193

194+
def _to_frame(sec: float) -> ndarray:
    """
    Convert a duration in seconds to a whole number of frames.

    Parameters
    ----------
    sec : float
        Duration in seconds.

    Returns
    -------
    frames : numpy.int32
        `sec` multiplied by the frame rate, rounded with `numpy.round`
        and cast to `numpy.int32`. For a scalar `sec` this is a NumPy
        integer scalar.
    """
    FRAMERATE = 93.75  # 24000 / 256 [frame/sec]
    return numpy.round(sec * FRAMERATE).astype(numpy.int32)
197+
198+
199+
def calc_frame_per_mora(mora: Mora) -> ndarray:
    """
    Calculate the frame length of a single mora.

    Parameters
    ----------
    mora : Mora
        Mora to measure. Its `vowel_length` is always read; its
        `consonant_length` is read only when `consonant` is truthy.

    Returns
    -------
    frame_per_mora : numpy.int32
        Frame length of the mora, as an integer frame count produced by
        `_to_frame`. Fractions are rounded per phoneme before summing.
    """
    # Convert each phoneme to frames on its own, then take the sum as the
    # mora's frame length (rounding happens per phoneme, not on the total).
    vowel_frames = _to_frame(mora.vowel_length)
    consonant_frames = _to_frame(mora.consonant_length) if mora.consonant else 0
    return vowel_frames + consonant_frames
215+
216+
194217
def apply_pitch_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]:
195218
"""
196219
音高スケール(`pitchScale`)の適用
@@ -233,12 +256,7 @@ def apply_intonation_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]:
233256
return moras
234257

235258

236-
def calc_frame_pitch(
237-
query: AudioQuery,
238-
moras: List[Mora],
239-
phonemes: List[OjtPhoneme],
240-
frame_per_phoneme: numpy.ndarray,
241-
):
259+
def calc_frame_pitch(query: AudioQuery, moras: list[Mora]) -> ndarray:
242260
"""
243261
フレームごとのピッチの生成
244262
Parameters
@@ -247,10 +265,6 @@ def calc_frame_pitch(
247265
音声合成クエリ
248266
moras : List[Mora]
249267
モーラ列
250-
phonemes : List[OjtPhoneme]
251-
音素列
252-
frame_per_phoneme: NDArray
253-
音素あたりのフレーム長。端数丸め。
254268
Returns
255269
-------
256270
frame_f0 : NDArray[]
@@ -265,10 +279,7 @@ def calc_frame_pitch(
265279

266280
# Rescale: 時間スケールの変更(モーラ -> フレーム)
267281
# Compute the frame length of each mora directly from the mora (sum of its consonant and vowel frames)
268-
vowel_indexes = numpy.array(split_mora(phonemes)[2])
269-
frame_per_mora = [
270-
a.sum() for a in numpy.split(frame_per_phoneme, vowel_indexes[:-1] + 1)
271-
]
282+
frame_per_mora = numpy.array(list(map(calc_frame_per_mora, moras)))
272283
frame_f0 = numpy.repeat(f0, frame_per_mora)
273284
return frame_f0
274285

@@ -619,9 +630,7 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int):
619630

620631
flatten_moras = apply_prepost_silence(flatten_moras, query)
621632
frame_per_phoneme = calc_frame_per_phoneme(query, flatten_moras)
622-
f0 = calc_frame_pitch(
623-
query, flatten_moras, phoneme_data_list, frame_per_phoneme
624-
)
633+
f0 = calc_frame_pitch(query, flatten_moras)
625634
phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme)
626635

627636
# 今まで生成された情報をdecode_forwardにかけ、推論器によって音声波形を生成する

0 commit comments

Comments
 (0)