-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathnormalizer.py
67 lines (59 loc) · 1.94 KB
/
normalizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from typing import List
class TextNormalizer:
    """Normalize text into a fixed grapheme set.

    Every character is lowercased, a small set of punctuation marks is
    dropped, and a few accented letters are folded to their ASCII base
    letter. Anything that still falls outside `GRAPHEMES` is rejected.

    WARNING: It does not accept digits, please use normalized text in
    LJ Speech.
    """
    # Output alphabet; label `k` (1-based) maps to `GRAPHEMES[k - 1]`.
    GRAPHEMES = 'abcdefghijklmnopqrstuvwxyz !?,.'
    # Each key is a *group* of characters sharing the same replacement.
    REPLACER = {
        '"\'()-:;[]’“”': '',  # punct
        'àâ': 'a', 'èéê': 'e', 'ü': 'u'  # special character in ljspeech.
    }

    def __init__(self):
        """Initializer.

        Expands the grouped `REPLACER` table into a per-character lookup
        stored on `self.replacer`.
        """
        self.replacer = {
            char: out
            for group, out in TextNormalizer.REPLACER.items()
            for char in group}

    def grapheme_fn(self, grapheme: str) -> str:
        """Map grapheme into fixed set `TextNormalizer.GRAPHEMES`.

        Args:
            grapheme: single grapheme.
        Returns:
            normalized form; the empty string if the grapheme is removed.
        Raises:
            AssertionError: if the grapheme cannot be mapped into
                `TextNormalizer.GRAPHEMES`.
        """
        grapheme = self.replacer.get(grapheme, grapheme)
        # `in` on a str is a substring test, so the length guard is needed to
        # reject multi-character input such as 'ab'. The empty string stays
        # valid because it marks a removed character.
        valid = len(grapheme) <= 1 and grapheme in TextNormalizer.GRAPHEMES
        if not valid:
            # Raised explicitly (instead of an `assert` statement) so that the
            # validation is not stripped when Python runs with `-O`, while
            # keeping the exception type callers already see.
            raise AssertionError(f'invalid grapheme: {grapheme}')
        return grapheme

    def normalize(self, text: str) -> str:
        """Normalize text.

        Args:
            text: input text.
        Returns:
            normalized text, consisting only of `TextNormalizer.GRAPHEMES`.
        Raises:
            AssertionError: if any character cannot be normalized.
        """
        return ''.join(self.grapheme_fn(t.lower()) for t in text)

    def labeling(self, text: str) -> List[int]:
        """Normalize text and convert it to integer labels.

        Label 0 is reserved for the padding token, labels from 1 follow
        the order of `TextNormalizer.GRAPHEMES`.

        Args:
            text: input text.
        Returns:
            integer labels.
        Raises:
            AssertionError: if any character cannot be normalized.
        """
        return [
            TextNormalizer.GRAPHEMES.index(t) + 1
            for t in self.normalize(text)]

    def recover(self, labels: List[int]) -> str:
        """Convert labels to normalized text.

        Args:
            labels: integer labels; padding labels (0) are skipped.
        Returns:
            converted text.
        Raises:
            IndexError: if a label is outside `0..len(GRAPHEMES)`.
        """
        graphemes = TextNormalizer.GRAPHEMES
        chars = []
        for label in labels:
            if label == 0:
                # padding token
                continue
            # Without the lower bound a negative label would silently wrap
            # around to the end of the grapheme table.
            if not 1 <= label <= len(graphemes):
                raise IndexError(f'label out of range: {label}')
            chars.append(graphemes[label - 1])
        return ''.join(chars)