Skip to content

Commit

Permalink
fix: remove kaldi pitch detection
Browse files Browse the repository at this point in the history
Kaldi pitch detection did not seem to work reliably, and it was dropped in torchaudio 2.1.0 anyway.
  • Loading branch information
roedoejet committed Nov 11, 2023
1 parent a035dbb commit e402b83
Show file tree
Hide file tree
Showing 8 changed files with 7 additions and 110 deletions.
1 change: 0 additions & 1 deletion everyvoice/.schema/everyvoice-aligner-schema-0.1.json
Original file line number Diff line number Diff line change
Expand Up @@ -446,7 +446,6 @@
"PitchCalculationMethod": {
"enum": [
"pyworld",
"kaldi",
"cwt"
],
"title": "PitchCalculationMethod",
Expand Down
1 change: 0 additions & 1 deletion everyvoice/.schema/everyvoice-shared-data-schema-0.1.json
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,6 @@
"PitchCalculationMethod": {
"enum": [
"pyworld",
"kaldi",
"cwt"
],
"title": "PitchCalculationMethod",
Expand Down
1 change: 0 additions & 1 deletion everyvoice/.schema/everyvoice-spec-to-wav-schema-0.1.json
Original file line number Diff line number Diff line change
Expand Up @@ -608,7 +608,6 @@
"PitchCalculationMethod": {
"enum": [
"pyworld",
"kaldi",
"cwt"
],
"title": "PitchCalculationMethod",
Expand Down
40 changes: 3 additions & 37 deletions everyvoice/.schema/everyvoice-text-to-spec-schema-0.1.json
Original file line number Diff line number Diff line change
Expand Up @@ -203,15 +203,6 @@
"title": "EarlyStoppingMetricEnum",
"type": "string"
},
"EmbeddingTypeEnum": {
"enum": [
"id",
"dvector",
"none"
],
"title": "EmbeddingTypeEnum",
"type": "string"
},
"FastSpeech2FreezeLayersConfig": {
"$schema": "http://json-schema.org/draft-07/schema#",
"additionalProperties": false,
Expand Down Expand Up @@ -300,7 +291,9 @@
"type": "boolean"
},
"multispeaker": {
"$ref": "#/$defs/MultiSpeakerConfig"
"default": false,
"title": "Multispeaker",
"type": "boolean"
}
},
"title": "FastSpeech2ModelConfig",
Expand Down Expand Up @@ -482,32 +475,6 @@
"title": "LoggerConfig",
"type": "object"
},
"MultiSpeakerConfig": {
"$schema": "http://json-schema.org/draft-07/schema#",
"additionalProperties": false,
"properties": {
"embedding_type": {
"allOf": [
{
"$ref": "#/$defs/EmbeddingTypeEnum"
}
],
"default": "none"
},
"every_layer": {
"default": false,
"title": "Every Layer",
"type": "boolean"
},
"dvector_gmm": {
"default": false,
"title": "Dvector Gmm",
"type": "boolean"
}
},
"title": "MultiSpeakerConfig",
"type": "object"
},
"NoamOptimizer": {
"$schema": "http://json-schema.org/draft-07/schema#",
"additionalProperties": false,
Expand Down Expand Up @@ -562,7 +529,6 @@
"PitchCalculationMethod": {
"enum": [
"pyworld",
"kaldi",
"cwt"
],
"title": "PitchCalculationMethod",
Expand Down
40 changes: 3 additions & 37 deletions everyvoice/.schema/everyvoice-text-to-wav-schema-0.1.json
Original file line number Diff line number Diff line change
Expand Up @@ -636,15 +636,6 @@
"title": "EarlyStoppingMetricEnum",
"type": "string"
},
"EmbeddingTypeEnum": {
"enum": [
"id",
"dvector",
"none"
],
"title": "EmbeddingTypeEnum",
"type": "string"
},
"FastSpeech2Config": {
"$schema": "http://json-schema.org/draft-07/schema#",
"additionalProperties": false,
Expand Down Expand Up @@ -805,7 +796,9 @@
"type": "boolean"
},
"multispeaker": {
"$ref": "#/$defs/MultiSpeakerConfig"
"default": false,
"title": "Multispeaker",
"type": "boolean"
}
},
"title": "FastSpeech2ModelConfig",
Expand Down Expand Up @@ -1354,32 +1347,6 @@
"title": "LoggerConfig",
"type": "object"
},
"MultiSpeakerConfig": {
"$schema": "http://json-schema.org/draft-07/schema#",
"additionalProperties": false,
"properties": {
"embedding_type": {
"allOf": [
{
"$ref": "#/$defs/EmbeddingTypeEnum"
}
],
"default": "none"
},
"every_layer": {
"default": false,
"title": "Every Layer",
"type": "boolean"
},
"dvector_gmm": {
"default": false,
"title": "Dvector Gmm",
"type": "boolean"
}
},
"title": "MultiSpeakerConfig",
"type": "object"
},
"NoamOptimizer": {
"$schema": "http://json-schema.org/draft-07/schema#",
"additionalProperties": false,
Expand Down Expand Up @@ -1434,7 +1401,6 @@
"PitchCalculationMethod": {
"enum": [
"pyworld",
"kaldi",
"cwt"
],
"title": "PitchCalculationMethod",
Expand Down
1 change: 0 additions & 1 deletion everyvoice/config/preprocessing_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ class AudioConfig(ConfigModel):

class PitchCalculationMethod(Enum):
pyworld = "pyworld"
kaldi = "kaldi"
cwt = "cwt"


Expand Down
17 changes: 1 addition & 16 deletions everyvoice/preprocessor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from torchaudio import load as load_audio
from torchaudio import save as save_audio
from torchaudio import transforms
from torchaudio.functional import compute_kaldi_pitch, resample
from torchaudio.functional import resample
from torchaudio.sox_effects import apply_effects_tensor
from tqdm import tqdm

Expand Down Expand Up @@ -332,21 +332,6 @@ def extract_pitch(self, audio_tensor: torch.Tensor):
pitch[pitch == 0] = np.nan
pitch = self._interpolate(pitch)
pitch = torch.tensor(pitch).float()
elif self.config.preprocessing.pitch_type == PitchCalculationMethod.kaldi.value:
pitch = compute_kaldi_pitch(
waveform=audio_tensor,
sample_rate=self.input_sampling_rate,
frame_length=self.audio_config.fft_window_frames
/ self.input_sampling_rate
* 1000,
frame_shift=self.audio_config.fft_hop_frames
/ self.input_sampling_rate
* 1000,
min_f0=50,
max_f0=400,
)[0][
..., 1
] # TODO: the docs and C Minxhoffer implementation take [..., 0] but this doesn't appear to be the pitch, at least for this version of torchaudio.
else:
raise ConfigError(
f"Sorry, the pitch estimation type '{self.config.preprocessing.pitch_type}' is not supported. Please edit your config file."
Expand Down
16 changes: 0 additions & 16 deletions everyvoice/tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,17 +158,11 @@ def test_spectral_feats(self):
self.assertEqual(complex_feats.size(1), linear_feats.size(1))

def test_pitch(self):
kaldi_config = VocoderConfig(
preprocessing=PreprocessingConfig(
pitch_phone_averaging=False, pitch_type=PitchCalculationMethod.kaldi
)
)
pyworld_config = VocoderConfig(
preprocessing=PreprocessingConfig(
pitch_phone_averaging=False, pitch_type=PitchCalculationMethod.pyworld
)
)
preprocessor_kaldi = Preprocessor(kaldi_config)
preprocessor_pyworld = Preprocessor(pyworld_config)

for entry in self.filelist[1:]:
Expand Down Expand Up @@ -196,17 +190,7 @@ def test_pitch(self):
# / "ming024"
# / ("eng-LJSpeech-pitch-" + entry["filename"] + ".npy")
# )
frame_pitch_kaldi = preprocessor_kaldi.extract_pitch(audio.unsqueeze(0))
kaldi_phone_avg_energy = preprocessor_kaldi.average_data_by_durations(
frame_pitch_kaldi, durs
)
# Ensure same number of frames
# TODO: Kaldi DOESN'T actually produce the right length tensors here
# self.assertEqual(
# frame_pitch_kaldi.size(0) - 2, feats.size(1)
# )
# Ensure avg pitch for each phone
self.assertEqual(len(durs), kaldi_phone_avg_energy.size(0))
frame_pitch_pyworld = preprocessor_pyworld.extract_pitch(audio)
pyworld_phone_avg_energy = preprocessor_pyworld.average_data_by_durations(
frame_pitch_pyworld, durs
Expand Down

0 comments on commit e402b83

Please sign in to comment.