fix: remove kaldi pitch detection

Kaldi pitch detection did not seem to work correctly, and `compute_kaldi_pitch` was removed in torchaudio 2.1.0 anyway.
EveryVoiceTTS · Nov 11, 2023 · e402b83 · e402b83
1 parent a035dbb
commit e402b83
Show file tree

Hide file tree

Showing 8 changed files with 7 additions and 110 deletions.
diff --git a/everyvoice/.schema/everyvoice-aligner-schema-0.1.json b/everyvoice/.schema/everyvoice-aligner-schema-0.1.json
@@ -446,7 +446,6 @@
     "PitchCalculationMethod": {
       "enum": [
         "pyworld",
-        "kaldi",
         "cwt"
       ],
       "title": "PitchCalculationMethod",

diff --git a/everyvoice/.schema/everyvoice-shared-data-schema-0.1.json b/everyvoice/.schema/everyvoice-shared-data-schema-0.1.json
@@ -176,7 +176,6 @@
     "PitchCalculationMethod": {
       "enum": [
         "pyworld",
-        "kaldi",
         "cwt"
       ],
       "title": "PitchCalculationMethod",

diff --git a/everyvoice/.schema/everyvoice-spec-to-wav-schema-0.1.json b/everyvoice/.schema/everyvoice-spec-to-wav-schema-0.1.json
@@ -608,7 +608,6 @@
     "PitchCalculationMethod": {
       "enum": [
         "pyworld",
-        "kaldi",
         "cwt"
       ],
       "title": "PitchCalculationMethod",

diff --git a/everyvoice/.schema/everyvoice-text-to-spec-schema-0.1.json b/everyvoice/.schema/everyvoice-text-to-spec-schema-0.1.json
@@ -203,15 +203,6 @@
       "title": "EarlyStoppingMetricEnum",
       "type": "string"
     },
-    "EmbeddingTypeEnum": {
-      "enum": [
-        "id",
-        "dvector",
-        "none"
-      ],
-      "title": "EmbeddingTypeEnum",
-      "type": "string"
-    },
     "FastSpeech2FreezeLayersConfig": {
       "$schema": "http://json-schema.org/draft-07/schema#",
       "additionalProperties": false,
@@ -300,7 +291,9 @@
           "type": "boolean"
         },
         "multispeaker": {
-          "$ref": "#/$defs/MultiSpeakerConfig"
+          "default": false,
+          "title": "Multispeaker",
+          "type": "boolean"
         }
       },
       "title": "FastSpeech2ModelConfig",
@@ -482,32 +475,6 @@
       "title": "LoggerConfig",
       "type": "object"
     },
-    "MultiSpeakerConfig": {
-      "$schema": "http://json-schema.org/draft-07/schema#",
-      "additionalProperties": false,
-      "properties": {
-        "embedding_type": {
-          "allOf": [
-            {
-              "$ref": "#/$defs/EmbeddingTypeEnum"
-            }
-          ],
-          "default": "none"
-        },
-        "every_layer": {
-          "default": false,
-          "title": "Every Layer",
-          "type": "boolean"
-        },
-        "dvector_gmm": {
-          "default": false,
-          "title": "Dvector Gmm",
-          "type": "boolean"
-        }
-      },
-      "title": "MultiSpeakerConfig",
-      "type": "object"
-    },
     "NoamOptimizer": {
       "$schema": "http://json-schema.org/draft-07/schema#",
       "additionalProperties": false,
@@ -562,7 +529,6 @@
     "PitchCalculationMethod": {
       "enum": [
         "pyworld",
-        "kaldi",
         "cwt"
       ],
       "title": "PitchCalculationMethod",

diff --git a/everyvoice/.schema/everyvoice-text-to-wav-schema-0.1.json b/everyvoice/.schema/everyvoice-text-to-wav-schema-0.1.json
@@ -636,15 +636,6 @@
       "title": "EarlyStoppingMetricEnum",
       "type": "string"
     },
-    "EmbeddingTypeEnum": {
-      "enum": [
-        "id",
-        "dvector",
-        "none"
-      ],
-      "title": "EmbeddingTypeEnum",
-      "type": "string"
-    },
     "FastSpeech2Config": {
       "$schema": "http://json-schema.org/draft-07/schema#",
       "additionalProperties": false,
@@ -805,7 +796,9 @@
           "type": "boolean"
         },
         "multispeaker": {
-          "$ref": "#/$defs/MultiSpeakerConfig"
+          "default": false,
+          "title": "Multispeaker",
+          "type": "boolean"
         }
       },
       "title": "FastSpeech2ModelConfig",
@@ -1354,32 +1347,6 @@
       "title": "LoggerConfig",
       "type": "object"
     },
-    "MultiSpeakerConfig": {
-      "$schema": "http://json-schema.org/draft-07/schema#",
-      "additionalProperties": false,
-      "properties": {
-        "embedding_type": {
-          "allOf": [
-            {
-              "$ref": "#/$defs/EmbeddingTypeEnum"
-            }
-          ],
-          "default": "none"
-        },
-        "every_layer": {
-          "default": false,
-          "title": "Every Layer",
-          "type": "boolean"
-        },
-        "dvector_gmm": {
-          "default": false,
-          "title": "Dvector Gmm",
-          "type": "boolean"
-        }
-      },
-      "title": "MultiSpeakerConfig",
-      "type": "object"
-    },
     "NoamOptimizer": {
       "$schema": "http://json-schema.org/draft-07/schema#",
       "additionalProperties": false,
@@ -1434,7 +1401,6 @@
     "PitchCalculationMethod": {
       "enum": [
         "pyworld",
-        "kaldi",
         "cwt"
       ],
       "title": "PitchCalculationMethod",

diff --git a/everyvoice/config/preprocessing_config.py b/everyvoice/config/preprocessing_config.py
@@ -46,7 +46,6 @@ class AudioConfig(ConfigModel):
 
 class PitchCalculationMethod(Enum):
     pyworld = "pyworld"
-    kaldi = "kaldi"
     cwt = "cwt"
 
 

diff --git a/everyvoice/preprocessor/__init__.py b/everyvoice/preprocessor/__init__.py
@@ -24,7 +24,7 @@
 from torchaudio import load as load_audio
 from torchaudio import save as save_audio
 from torchaudio import transforms
-from torchaudio.functional import compute_kaldi_pitch, resample
+from torchaudio.functional import resample
 from torchaudio.sox_effects import apply_effects_tensor
 from tqdm import tqdm
 
@@ -332,21 +332,6 @@ def extract_pitch(self, audio_tensor: torch.Tensor):
             pitch[pitch == 0] = np.nan
             pitch = self._interpolate(pitch)
             pitch = torch.tensor(pitch).float()
-        elif self.config.preprocessing.pitch_type == PitchCalculationMethod.kaldi.value:
-            pitch = compute_kaldi_pitch(
-                waveform=audio_tensor,
-                sample_rate=self.input_sampling_rate,
-                frame_length=self.audio_config.fft_window_frames
-                / self.input_sampling_rate
-                * 1000,
-                frame_shift=self.audio_config.fft_hop_frames
-                / self.input_sampling_rate
-                * 1000,
-                min_f0=50,
-                max_f0=400,
-            )[0][
-                ..., 1
-            ]  # TODO: the docs and C Minxhoffer implementation take [..., 0] but this doesn't appear to be the pitch, at least for this version of torchaudio.
         else:
             raise ConfigError(
                 f"Sorry, the pitch estimation type '{self.config.preprocessing.pitch_type}' is not supported. Please edit your config file."

diff --git a/everyvoice/tests/test_preprocessing.py b/everyvoice/tests/test_preprocessing.py
@@ -158,17 +158,11 @@ def test_spectral_feats(self):
             self.assertEqual(complex_feats.size(1), linear_feats.size(1))
 
     def test_pitch(self):
-        kaldi_config = VocoderConfig(
-            preprocessing=PreprocessingConfig(
-                pitch_phone_averaging=False, pitch_type=PitchCalculationMethod.kaldi
-            )
-        )
         pyworld_config = VocoderConfig(
             preprocessing=PreprocessingConfig(
                 pitch_phone_averaging=False, pitch_type=PitchCalculationMethod.pyworld
             )
         )
-        preprocessor_kaldi = Preprocessor(kaldi_config)
         preprocessor_pyworld = Preprocessor(pyworld_config)
 
         for entry in self.filelist[1:]:
@@ -196,17 +190,7 @@ def test_pitch(self):
             #     / "ming024"
             #     / ("eng-LJSpeech-pitch-" + entry["filename"] + ".npy")
             # )
-            frame_pitch_kaldi = preprocessor_kaldi.extract_pitch(audio.unsqueeze(0))
-            kaldi_phone_avg_energy = preprocessor_kaldi.average_data_by_durations(
-                frame_pitch_kaldi, durs
-            )
-            # Ensure same number of frames
-            # TODO: Kaldi DOESN'T actually produce the right length tensors here
-            # self.assertEqual(
-            #     frame_pitch_kaldi.size(0) - 2, feats.size(1)
-            # )
             # Ensure avg pitch for each phone
-            self.assertEqual(len(durs), kaldi_phone_avg_energy.size(0))
             frame_pitch_pyworld = preprocessor_pyworld.extract_pitch(audio)
             pyworld_phone_avg_energy = preprocessor_pyworld.average_data_by_durations(
                 frame_pitch_pyworld, durs