Skip to content

Commit

Permalink
feat: Added lookuptables_from_config()
Browse files Browse the repository at this point in the history
  • Loading branch information
SamuelLarkin committed Jan 22, 2024
1 parent 5cdabda commit 85a9a8a
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 10 deletions.
14 changes: 14 additions & 0 deletions everyvoice/tests/data/lookuptable/training_filelist.psv
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
basename|text|language|speaker
LJ002-0149|train1|crk|2
LJ002-0130|train2|str|0
LJ001-0094|train3|str|1
LJ001-0110|train4|git|3
LJ002-0012|train5|crk|1
LJ002-0228|train6|str|2
LJ002-0140|train7|crk|3
LJ001-0016|train8|str|3
LJ001-0071|train9|git|2
LJ002-0187|train10|git|0
LJ002-0129|train11|str|1
LJ002-0232|train12|str|1
LJ001-0007|train13|git|1
9 changes: 9 additions & 0 deletions everyvoice/tests/data/lookuptable/validation_filelist.psv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
basename|text|language|speaker
LJ002-0112|validation1|str|0
LJ002-0079|validation2|git|0
LJ001-0047|validation3|str|3
LJ002-0162|validation4|str|2
LJ002-0124|validation5|str|3
LJ001-0037|validation6|git|0
LJ002-0054|validation7|str|3
LJ002-0093|validation8|crk|1
85 changes: 84 additions & 1 deletion everyvoice/tests/test_text.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
#!/usr/bin/env python
import string
from pathlib import Path
from typing import Dict, List
from unicodedata import normalize
from unittest import TestCase, main

from everyvoice.config.text_config import Symbols, TextConfig
from everyvoice.model.feature_prediction.config import FeaturePredictionConfig
from everyvoice.text import TextProcessor
from everyvoice.text.lookups import build_lookup
from everyvoice.text.lookups import build_lookup, lookuptables_from_data
from everyvoice.utils import generic_dict_loader


class TextTest(TestCase):
Expand Down Expand Up @@ -190,5 +193,85 @@ def test_build_lookup(self):
)


class LookupTables(TestCase):
    """Checks on the language and speaker lookup tables built from filelists."""

    def test_lookuptables_from_data(self):
        """
        Lookup tables for a multilingual and multispeaker dataset.
        """
        data_dir = Path(__file__).parent / "data/lookuptable/"
        training = generic_dict_loader(data_dir / "training_filelist.psv")
        validation = generic_dict_loader(data_dir / "validation_filelist.psv")

        lang2id, speaker2id = lookuptables_from_data((training, validation))

        expected_languages = {"crk": 0, "git": 1, "str": 2}
        expected_speakers = {"0": 0, "1": 1, "2": 2, "3": 3}
        self.assertDictEqual(
            lang2id, expected_languages, "Language lookup tables differ"
        )
        self.assertDictEqual(
            speaker2id, expected_speakers, "Speaker lookup tables differ."
        )

    def test_no_language(self):
        """
        Test a dataset that has no language.
        """

        def remove_language(data: List[Dict[str, str]]) -> List[Dict[str, str]]:
            # Drop the column in place and hand the same list back.
            for row in data:
                del row["language"]
            return data

        data_dir = Path(__file__).parent / "data/lookuptable/"
        training = remove_language(
            generic_dict_loader(data_dir / "training_filelist.psv")
        )
        validation = remove_language(
            generic_dict_loader(data_dir / "validation_filelist.psv")
        )

        lang2id, speaker2id = lookuptables_from_data((training, validation))

        # With no language column there is nothing to index.
        self.assertDictEqual(lang2id, {}, "Language lookup tables differ")
        expected_speakers = {"0": 0, "1": 1, "2": 2, "3": 3}
        self.assertDictEqual(
            speaker2id, expected_speakers, "Speaker lookup tables differ."
        )

    def test_no_speaker(self):
        """
        Test a dataset that has no speaker.
        """

        def remove_speaker(data: List[Dict[str, str]]) -> List[Dict[str, str]]:
            # Drop the column in place and hand the same list back.
            for row in data:
                del row["speaker"]
            return data

        data_dir = Path(__file__).parent / "data/lookuptable/"
        training = remove_speaker(
            generic_dict_loader(data_dir / "training_filelist.psv")
        )
        validation = remove_speaker(
            generic_dict_loader(data_dir / "validation_filelist.psv")
        )

        lang2id, speaker2id = lookuptables_from_data((training, validation))

        expected_languages = {"crk": 0, "git": 1, "str": 2}
        self.assertDictEqual(
            lang2id, expected_languages, "Language lookup tables differ"
        )
        # With no speaker column there is nothing to index.
        self.assertDictEqual(speaker2id, {}, "Speaker lookup tables differ.")


# Run this module's tests through unittest's `main` when executed as a script.
if __name__ == "__main__":
    main()
37 changes: 28 additions & 9 deletions everyvoice/text/lookups.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,34 @@
from typing import Dict, Sequence, Union
from itertools import chain
from typing import Dict, Iterable, Sequence, Tuple, Union

from everyvoice.model.e2e.config import EveryVoiceConfig
from everyvoice.model.feature_prediction.config import FeaturePredictionConfig

LookupTable = Dict[str, int]


def lookuptables_from_config(
    config: Union[EveryVoiceConfig, FeaturePredictionConfig]
) -> Tuple[LookupTable, LookupTable]:
    """Build the language and speaker lookup tables for a configuration.

    The training and validation filelists named in ``config.training`` are
    read with ``config.training.filelist_loader`` and passed together to
    ``lookuptables_from_data``; its result is returned unchanged.
    """
    training_config = config.training
    # Training filelist first, then validation.
    filelists = (
        training_config.filelist_loader(training_config.training_filelist),
        training_config.filelist_loader(training_config.validation_filelist),
    )
    return lookuptables_from_data(filelists)


def lookuptables_from_data(
    data: Iterable[Sequence[Dict[str, str]]]
) -> Tuple["LookupTable", "LookupTable"]:
    """Build language and speaker lookup tables from one or more datasets.

    Args:
        data: an iterable of datasets, each a sequence of entries (dicts).
            Entries may carry a "language" key, a "speaker" key, both, or
            neither; entries lacking a key simply do not contribute to the
            corresponding table.

    Returns:
        A ``(lang2id, speaker2id)`` pair. Each table maps every distinct
        value seen across all datasets to its index in sorted order. A table
        is empty when no entry has the corresponding key.
    """
    languages = set()
    speakers = set()
    # Collect both columns in a single pass: `data` is only promised to be an
    # Iterable, so it may be a one-shot iterator that a second traversal
    # would find already exhausted.
    for entry in chain.from_iterable(data):
        if "language" in entry:
            languages.add(entry["language"])
        if "speaker" in entry:
            speakers.add(entry["speaker"])

    # Sorting makes the assigned ids independent of the order of the entries.
    lang2id = {language: i for i, language in enumerate(sorted(languages))}
    speaker2id = {speaker: i for i, speaker in enumerate(sorted(speakers))}

    return lang2id, speaker2id


def build_lookup(items: Sequence[Dict[str, str]], key: str) -> Dict[str, int]:
"""
Expand All @@ -16,11 +42,4 @@ def build_lookup(items: Sequence[Dict[str, str]], key: str) -> Dict[str, int]:
class LookupTables:
    """Holds the language and speaker lookup tables derived from a configuration."""

    def __init__(self, config: Union[EveryVoiceConfig, FeaturePredictionConfig]):
        # Keep the configuration the tables are built from.
        self.config = config
        # Read the validation and training filelists with the loader the
        # configuration provides.
        self.val_dataset = self.config.training.filelist_loader(
            self.config.training.validation_filelist
        )
        self.train_dataset = self.config.training.filelist_loader(
            self.config.training.training_filelist
        )
        # Build each table over the concatenation of both splits.
        self.speaker2id = build_lookup(self.train_dataset + self.val_dataset, "speaker")
        self.lang2id = build_lookup(self.train_dataset + self.val_dataset, "language")
        # NOTE(review): this reassigns lang2id and speaker2id, replacing the
        # values built just above. This view looks like a diff rendering, so
        # the statements above may be the pre-change implementation -- confirm
        # against the actual file before relying on them.
        self.lang2id, self.speaker2id = lookuptables_from_config(self.config)

0 comments on commit 85a9a8a

Please sign in to comment.