-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.py
50 lines (39 loc) · 1.22 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import unicodedata
def remove_diacritic(*diacritics):
    """
    Build a function that strips the given Unicode diacritics from a string.

    Each positional argument is compared against individual code points of
    the decomposed input, so an argument longer than one character can never
    match. The returned function decomposes its argument (NFD), drops every
    code point equal to one of `diacritics`, and recomposes what is left
    (NFC). The result is therefore NFC-normalized even if nothing was
    removed.
    """
    def _(text):
        decomposed = unicodedata.normalize("NFD", text)
        kept = [ch for ch in decomposed if ch not in diacritics]
        return unicodedata.normalize("NFC", "".join(kept))

    return _
def has_diacritic(*diacritics):
    """
    Build a predicate that detects the given Unicode diacritics in a string.

    The returned function decomposes its argument (NFD) and returns True as
    soon as one code point equals one of `diacritics`, otherwise False. With
    no diacritics given, or an empty string, the answer is always False.
    """
    def _(text):
        decomposed = unicodedata.normalize("NFD", text)
        return any(ch in diacritics for ch in decomposed)

    return _
# Combining accent marks. Each is exposed under two names that are bound to
# the same string.
ACUTE = "\u0301"  # U+0301 COMBINING ACUTE ACCENT
OXIA = ACUTE
GRAVE = "\u0300"  # U+0300 COMBINING GRAVE ACCENT
VARIA = GRAVE
CIRCUMFLEX = "\u0342"  # U+0342 COMBINING GREEK PERISPOMENI
PERISPOMENI = CIRCUMFLEX

# Ready-made helpers covering all three accents at once.
remove = remove_diacritic(ACUTE, GRAVE, CIRCUMFLEX)
has_accent = has_diacritic(ACUTE, GRAVE, CIRCUMFLEX)

# Combining vowel-length marks.
SHORT = "\u0306"  # U+0306 COMBINING BREVE
LONG = "\u0304"  # U+0304 COMBINING MACRON
def remove_length(x):
    """
    Return `x` with the SHORT and LONG length marks removed.

    After the marks are stripped (which also leaves the text NFC-normalized),
    two hard-coded character sequences are replaced, in order, by the
    replacement strings listed below.
    """
    without_length = remove_diacritic(SHORT, LONG)(x)

    # NOTE(review): the original tags both substitutions with "@@@",
    # presumably marking them as a stop-gap. They look like a fix-up for an
    # accented alpha / iota followed by a breathing mark that NFC left
    # uncombined -- confirm the exact code points before extending the list.
    fixups = (
        ("ά̓", "ἄ"),
        ("ί̓", "ἴ"),
    )
    for stray, replacement in fixups:
        without_length = without_length.replace(stray, replacement)
    return without_length
# U+0313 COMBINING COMMA ABOVE, used here as the smooth-breathing mark.
SMOOTH = "\u0313"
# Helper that strips SMOOTH from a string (result is NFC-normalized).
remove_smooth = remove_diacritic(SMOOTH)