Skip to content

Commit

Permalink
fix: Relative paths for wavs issue #127
Browse files Browse the repository at this point in the history
  • Loading branch information
SamuelLarkin authored and roedoejet committed Nov 10, 2023
1 parent c2decdd commit a009014
Show file tree
Hide file tree
Showing 2 changed files with 257 additions and 34 deletions.
222 changes: 222 additions & 0 deletions everyvoice/tests/test_wizard.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from typing import Callable, Iterable, NamedTuple, Optional, Sequence, Union
from unittest import TestCase, main

import yaml
from anytree import RenderTree

from everyvoice.config.text_config import Symbols
Expand All @@ -23,6 +24,7 @@
from everyvoice.wizard import Step
from everyvoice.wizard import StepNames as SN
from everyvoice.wizard import Tour, basic, dataset, prompts
from everyvoice.wizard.basic import ConfigFormatStep


class RecursiveAnswers(NamedTuple):
Expand Down Expand Up @@ -649,5 +651,225 @@ def test_keyboard_interrupt(self):
step.run()


class WavFileDirectoryRelativePathTest(TestCase):
    """
    Make sure the wav files directory path is correctly handled when it is
    transformed to a relative path.

    Each test runs ConfigFormatStep.effect() from inside a scratch directory
    and checks the `data_dir` recorded in the generated
    `config/everyvoice-shared-data.yaml`.
    """

    def setUp(self):
        """
        Create a mock state instead of doing all prior steps to
        ConfigFormatStep, then move into a fresh temporary directory.
        """
        state = {
            SN.output_step.value: "John/Smith",
            SN.name_step.value: "Unittest",
            "dataset_0": {
                SN.dataset_name_step.value: "unit",
                SN.wavs_dir_step.value: "Common-Voice",
                SN.symbol_set_step.value: Symbols(
                    silence=["<SIL>"],
                    pad="_",
                    punctuation=[],
                    symbol_set=[
                        " ",
                        ",",
                        ".",
                        "A",
                        "D",
                        "E",
                        "H",
                        "I",
                        "J",
                        "K",
                    ],
                ),
                "filelist_data": [
                    {
                        "text": "SU ḰÁLS TŦE NOȾE KÁṈI.",
                        "basename": "5061f5c3-3bf9-42c6-a268-435c146efaf6/dd50ed81b889047cb4399e34b650a91fcbd3b2a5e36cf0068251d64274bffb61",
                        "language": "und",
                    },
                    {
                        "text": "I, ȻȽ ȻIL TŦE ĆÁSE SWOU,LES.",
                        "basename": "5061f5c3-3bf9-42c6-a268-435c146efaf6/6c45ab8c6e2454142c95319ca37f7e4ff6526dddbcc7fc540572e4e53264ec47",
                        "language": "und",
                    },
                    {
                        "text": "ENÁN U, ṈEN XAXE SĆÁ,Is.",
                        "basename": "5061f5c3-3bf9-42c6-a268-435c146efaf6/3947ae033faeb793e00f836648e240bc91c821798bccc76656ad3e7030b38878",
                        "language": "und",
                    },
                    {
                        "text": "SU ḰÁLs “U ĆESE OL TŦE NE SXENE I, SQȺ ȻNES U MEQ EXIN ĆȺ.”",
                        "basename": "5061f5c3-3bf9-42c6-a268-435c146efaf6/65b61440f9621084a1a1d8c461d177c765fad3aff91e0077296081931929629b",
                        "language": "und",
                    },
                    {
                        "text": "DON,EYE EṮ SXÍEQES.",
                        "basename": "5061f5c3-3bf9-42c6-a268-435c146efaf6/8a124117481eaf8f91d23aa3acda301e7fae7de85e98c016383381d54a3d5049",
                        "language": "und",
                    },
                ],
                "sox_effects": [["channel", "1"]],
            },
        }
        self.config = ConfigFormatStep()
        self.config.response = "yaml"
        self.config.state = state

        # Relative output/wavs paths are interpreted against the current
        # working directory, so every test runs from its own scratch
        # directory.  Cleanups run in LIFO order: the working directory is
        # restored *before* the scratch directory is deleted, so the process
        # is never left sitting in a removed directory (and the directory can
        # actually be removed on platforms that lock the cwd).
        original_cwd = os.getcwd()
        scratch = tempfile.TemporaryDirectory()
        self.addCleanup(scratch.cleanup)
        self.tmpdir = Path(scratch.name)
        os.chdir(self.tmpdir)
        self.addCleanup(os.chdir, original_cwd)

    def _generate_and_read_data_dir(self, output_dir, wavs_dir=None):
        """
        Run the config step and return the `data_dir` it wrote.

        Args:
            output_dir: value stored in the state for the output step; the
                generated files are looked up under
                `<tmpdir>/<output_dir>/Unittest/config/`.
            wavs_dir: optional replacement for the wavs directory of
                `dataset_0`; when None the value from setUp
                (`Common-Voice`) is kept.

        Returns:
            The `data_dir` of the first `source_data` entry found in
            `everyvoice-shared-data.yaml`, as a Path.
        """
        experiment_name = "Unittest"
        self.config.state[SN.output_step.value] = output_dir
        self.config.state[SN.name_step.value] = experiment_name
        if wavs_dir is not None:
            self.config.state["dataset_0"][SN.wavs_dir_step.value] = wavs_dir
        # effect() prints a summary panel; keep it out of the test output.
        with capture_stdout():
            self.config.effect()
        data_file = (
            self.tmpdir
            / output_dir
            / experiment_name
            / "config/everyvoice-shared-data.yaml"
        )
        with data_file.open() as fin:
            config = yaml.load(fin, Loader=yaml.FullLoader)
        return Path(config["source_data"][0]["data_dir"])

    def test_wav_file_directory_local(self):
        """
        output directory is `.`
        wav files directory located in `.`
        """
        data_dir = self._generate_and_read_data_dir(".")
        self.assertEqual(data_dir, Path("../../Common-Voice"))

    def test_wav_file_directory_under_wavs_directory(self):
        """
        output directory is `.`
        wav files directory located in `wavs/`
        """
        wavs_dir = "wavs/Common-Voice"
        data_dir = self._generate_and_read_data_dir(".", wavs_dir)
        self.assertEqual(data_dir, Path("../..") / wavs_dir)

    def test_output_not_local_and_wav_file_directory_local(self):
        """
        output directory is NOT `.`
        wav files directory located in `.`
        """
        data_dir = self._generate_and_read_data_dir("John/Smith")
        self.assertEqual(data_dir, Path("../../../../Common-Voice"))

    def test_output_not_local_and_wav_file_directory_under_hierarchy(self):
        """
        output directory is NOT `.`
        wav files directory located in `wavs/`
        """
        wavs_dir = "wavs/Common-Voice"
        data_dir = self._generate_and_read_data_dir("John/Smith", wavs_dir)
        self.assertEqual(data_dir, Path("../../../..") / wavs_dir)

    def test_absolute_wav_file_directory_and_local_experiment(self):
        """
        output directory is `.`
        wav files directory located in `/ABSOLUTE/wavs/`
        """
        wavs_dir = self.tmpdir / "wavs/Common-Voice"
        data_dir = self._generate_and_read_data_dir(".", wavs_dir)
        # An absolute wavs directory must be written out unchanged.
        self.assertEqual(data_dir, wavs_dir)

    def test_absolute_wav_file_directory_and_nested_experiment(self):
        """
        output directory is NOT `.`
        wav files directory located in `/ABSOLUTE/wavs/`
        """
        wavs_dir = self.tmpdir / "wavs/Common-Voice"
        data_dir = self._generate_and_read_data_dir("John/Smith", wavs_dir)
        # An absolute wavs directory must be written out unchanged.
        self.assertEqual(data_dir, wavs_dir)


if __name__ == "__main__":
main()
69 changes: 35 additions & 34 deletions everyvoice/wizard/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,9 @@ def validate(self, response):

def effect(self):
output_path = (
(
Path(self.state[StepNames.output_step.value])
/ self.state[StepNames.name_step.value]
)
.expanduser()
)
Path(self.state[StepNames.output_step.value])
/ self.state[StepNames.name_step.value]
).expanduser()
# create_config_files
config_dir = output_path / "config"
config_dir.absolute().mkdir(exist_ok=True, parents=True)
Expand All @@ -121,19 +118,19 @@ def effect(self):
StepNames.symbol_set_step.value
].symbol_set
# Dataset Configs
wavs_dir = (
Path(self.state[dataset][StepNames.wavs_dir_step.value])
.expanduser()
)
wavs_dir = Path(
self.state[dataset][StepNames.wavs_dir_step.value]
).expanduser()
if not wavs_dir.is_absolute():
wavs_dir = Path("../..") / wavs_dir
if not output_path.is_absolute():
for _ in config_dir.parts:
wavs_dir = Path("..") / wavs_dir
else:
wavs_dir = Path.cwd() / wavs_dir
new_filelist_path = (
(
Path("..")
/ f"{self.state[dataset][StepNames.dataset_name_step.value]}-filelist.psv"
)
.expanduser()
)
Path("..")
/ f"{self.state[dataset][StepNames.dataset_name_step.value]}-filelist.psv"
).expanduser()
for entry_i in range(len(self.state[dataset]["filelist_data"])):
# Remove .wav if it was added to the basename
if self.state[dataset]["filelist_data"][entry_i]["basename"].endswith(".wav"):
Expand All @@ -144,9 +141,9 @@ def effect(self):
if k is not None and not k.startswith("unknown")
}
write_filelist(
self.state[dataset]["filelist_data"],
(config_dir / new_filelist_path).absolute(),
)
self.state[dataset]["filelist_data"],
(config_dir / new_filelist_path).absolute(),
)
sox_effects = self.state[dataset]["sox_effects"]
filelist_loader = generic_psv_dict_reader

Expand Down Expand Up @@ -179,7 +176,9 @@ def effect(self):
save_dir=Path("..") / "preprocessed",
source_data=datasets,
)
preprocessing_config_path = Path(f"{PREPROCESSING_CONFIG_FILENAME_PREFIX}.{self.response}")
preprocessing_config_path = Path(
f"{PREPROCESSING_CONFIG_FILENAME_PREFIX}.{self.response}"
)
write_dict_to_config(
json.loads(preprocessing_config.model_dump_json(exclude_none=True)),
(config_dir / preprocessing_config_path).absolute(),
Expand Down Expand Up @@ -208,9 +207,9 @@ def effect(self):
)
aligner_config_json["path_to_text_config_file"] = str(text_config_path)
write_dict_to_config(
aligner_config_json,
(config_dir / aligner_config_path).absolute(),
)
aligner_config_json,
(config_dir / aligner_config_path).absolute(),
)
# Create Feature Prediction Config
fp_logger = LoggerConfig(name="FeaturePredictionExperiment", save_dir=log_dir)
fp_config = FeaturePredictionConfig(
Expand All @@ -231,9 +230,9 @@ def effect(self):
)
fp_config_json["path_to_text_config_file"] = str(text_config_path)
write_dict_to_config(
fp_config_json,
(config_dir / fp_config_path).absolute(),
)
fp_config_json,
(config_dir / fp_config_path).absolute(),
)
# Create Vocoder Config
vocoder_logger = LoggerConfig(name="VocoderExperiment", save_dir=log_dir)
vocoder_config = VocoderConfig(
Expand All @@ -243,7 +242,9 @@ def effect(self):
logger=vocoder_logger,
).model_dump()
)
vocoder_config_path = Path(f"{SPEC_TO_WAV_CONFIG_FILENAME_PREFIX}.{self.response}")
vocoder_config_path = Path(
f"{SPEC_TO_WAV_CONFIG_FILENAME_PREFIX}.{self.response}"
)
vocoder_config_json = json.loads(
vocoder_config.model_dump_json(
exclude_none=True, exclude={"preprocessing": True}
Expand All @@ -253,9 +254,9 @@ def effect(self):
preprocessing_config_path
)
write_dict_to_config(
vocoder_config_json,
(config_dir / vocoder_config_path).absolute(),
)
vocoder_config_json,
(config_dir / vocoder_config_path).absolute(),
)
# E2E Config
e2e_logger = LoggerConfig(name="E2E-Experiment", save_dir=log_dir)
e2e_config = EveryVoiceConfig(
Expand All @@ -276,9 +277,9 @@ def effect(self):
e2e_config_json["path_to_vocoder_config_file"] = str(vocoder_config_path)
e2e_config_path = Path(f"{TEXT_TO_WAV_CONFIG_FILENAME_PREFIX}.{self.response}")
write_dict_to_config(
e2e_config_json,
(config_dir / e2e_config_path).absolute(),
)
e2e_config_json,
(config_dir / e2e_config_path).absolute(),
)
print(
Panel(
f"You've finished configuring your dataset. Your files are located at {config_dir.absolute()}",
Expand Down

0 comments on commit a009014

Please sign in to comment.