追加: OjtPhoneme.onehot() float 出力 (#810)

VOICEVOX · Dec 5, 2023 · 349bf58
1 parent 16845ab
commit 349bf58
Show file tree

Hide file tree

Showing 3 changed files with 10 additions and 21 deletions.
diff --git a/test/test_acoustic_feature_extractor.py b/test/test_acoustic_feature_extractor.py
@@ -84,6 +84,6 @@ def test_onehot(self):
         for i, phoneme in enumerate(self.ojt_hello_hiho):
             for j in range(OjtPhoneme.num_phoneme):
                 if phoneme_id_list[i] == j:
-                    self.assertEqual(phoneme.onehot[j], True)
+                    self.assertEqual(phoneme.onehot[j], 1.0)
                 else:
-                    self.assertEqual(phoneme.onehot[j], False)
+                    self.assertEqual(phoneme.onehot[j], 0.0)
diff --git a/voicevox_engine/acoustic_feature_extractor.py b/voicevox_engine/acoustic_feature_extractor.py
@@ -100,12 +100,12 @@ def phoneme_id(self):
     @property
     def onehot(self):
         """
-        phoneme listの長さ分の0埋め配列のうち、phoneme id番目がTrue(1)の配列を返す
+        音素onehotベクトル
         Returns
         -------
-        onehot : numpu.ndarray
-            関数内で変更された配列を返す
+        onehot : numpy.ndarray
+            音素onehotベクトル（listの長さ分の0埋め配列のうち、phoneme id番目が1.0の配列）
         """
-        array = numpy.zeros(self.num_phoneme, dtype=bool)
-        array[self.phoneme_id] = True
+        array = numpy.zeros(self.num_phoneme, dtype=numpy.float32)
+        array[self.phoneme_id] = 1.0
         return array
diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py
@@ -198,7 +198,7 @@ def calc_frame_pitch(
 
 def calc_frame_phoneme(phonemes: List[OjtPhoneme], frame_per_phoneme: numpy.ndarray):
     """
-    フレームごとの音素列の生成
+    フレームごとの音素列の生成（onehot化 + フレーム化）
     Parameters
     ----------
     phonemes : List[OjtPhoneme]
@@ -211,19 +211,8 @@ def calc_frame_phoneme(phonemes: List[OjtPhoneme], frame_per_phoneme: numpy.ndar
         フレームごとの音素系列
     """
     # TODO: Better function name (c.f. VOICEVOX/voicevox_engine#790)
-    # Index化
-    phoneme_ids = numpy.array([p.phoneme_id for p in phonemes], dtype=numpy.int64)
-
-    # フレームごとの音素化
-    frame_phoneme = numpy.repeat(phoneme_ids, frame_per_phoneme)
-
-    # Onehot化
-    array = numpy.zeros(
-        (len(frame_phoneme), OjtPhoneme.num_phoneme), dtype=numpy.float32
-    )
-    array[numpy.arange(len(frame_phoneme)), frame_phoneme] = 1
-    frame_phoneme = array
-
+    onehot_phoneme = numpy.stack([p.onehot for p in phonemes])
+    frame_phoneme = numpy.repeat(onehot_phoneme, frame_per_phoneme, axis=0)
     return frame_phoneme