parse.py

from collections import namedtuple
import re

import yaml

from characters import strip_accents, remove_redundant_macron, strip_length

# stem_type is the numbered stem key (e.g. "1-", "3+"); used_override is the
# parse regex of the stem_override that produced the stem, if any
StemInfo = namedtuple("StemInfo", ["stem", "stem_type", "used_override"])


class Lexicon:

    def __init__(self, lexicon_file):
        with open(lexicon_file) as f:
            # safe_load: yaml.load without an explicit Loader is rejected by
            # newer PyYAML releases and is unsafe on untrusted input
            self.lexicon = yaml.safe_load(f)

    def stem_info(self, lemma, parse, context=None):
        lexical_entry = self.lexicon[lemma]
        result = None

        for stem_key, stem in lexical_entry["stems"].items():
            # a key like "4-/regex" only applies when the optional `context`
            # argument matches the regex after the slash
            if "/" in stem_key:
                stem_key, context_to_match = stem_key.split("/")
                if not (context and re.match(context_to_match, context)):
                    continue
            # map the numbered stem key to the parse codes it can realise
            regex = {
                "1-": "P",
                "1+": "I",
                "2-": "F[AM]",
                "3-": "A[AM][NP]",
                "3+": "A[AM][I]",
                "4-": "XA",
                "4-S": "XAI..S",
                "4-P": "XAI..P",
                "4-NP": "XA[NP]",
                "4+": "YA",
                "5-": "X[MP]",
                "5+": "Y[MP]",
                "6-": "AP[NP]",
                "6+": "AP[I]",
                "7-": "FP",
            }[stem_key]
            if re.match(regex, parse):
                result = StemInfo(stem, stem_key, None)

        # explicit per-parse overrides win over the numbered stems above
        for parse_regex, stem in lexical_entry.get("stem_overrides", []):
            if re.match(parse_regex, parse):
                result = StemInfo(stem, None, parse_regex)

        if not result:
            return

        # a stem value may list alternatives separated by "/"
        result_set = set()
        for stem in result.stem.split("/"):
            result_set.add(StemInfo(stem, result.stem_type, result.used_override))
        return result_set
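
# A hypothetical sketch of the lexicon YAML shape this class expects, inferred
# from the access pattern above; the lemma, stems, and regexes are made up for
# illustration only.
#
#   λύω:
#     stems:
#       1-: λυ
#       3-: λυσ
#       "4-/ctx-regex": λελυκ   # a "/regex" suffix on a key restricts it to
#                               # calls whose `context` matches that regex
#     stem_overrides:
#       - [AAI, ἐλυσ]           # [parse regex, stem] pairs; these win over
#                               # the numbered stems above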


def debreath(word):
    """Rewrite rough breathing on ε / the ει diphthong as an explicit "h"."""
    word = word.replace("εἷ", "hεῖ")
    word = word.replace("εἵ", "hεί")
    word = word.replace("εἱ", "hει")
    word = word.replace("ἕ", "hέ")
    word = word.replace("ἑ", "hε")
    return word


def rebreath(word):
    """Reverse debreath, turning the "h" marker back into rough breathing."""
    word = word.replace("hεῖ", "εἷ")
    word = word.replace("hεί", "εἵ")
    word = word.replace("hει", "εἱ")
    word = word.replace("hέ", "ἕ")
    word = word.replace("hε", "ἑ")
    # word = add_necessary_breathing(word)
    word = remove_redundant_macron(word)
    return word
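
# Illustrative round trip (made-up input), based on the replacement tables
# above: rough breathing is spelled out as "h" so matching can treat it like
# any other letter, then restored afterwards.
#
#   debreath("εἷς")   -> "hεῖς"
#   rebreath("hεῖς")  -> "εἷς"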


# components is the (s1, s2, s3, s4, s5) tuple parsed from the ending entry
EndingInfo = namedtuple("EndingInfo", ["stem", "components"])


class Endings:

    def __init__(self, endings_file):
        with open(endings_file) as f:
            # see the Loader note in Lexicon.__init__
            self.endings = yaml.safe_load(f)

    def ending_info(self, form, parse, test_length=False):
        stem_set = set()
        form = debreath(form)
        if parse in self.endings:
            pairs = self.endings[parse]
            # an entry may be a {"ref": other_parse} indirection; follow it
            # until an actual list of ending entries is reached
            while isinstance(pairs, dict) and "ref" in pairs:
                if pairs["ref"] in self.endings:
                    pairs = self.endings[pairs["ref"]]
                else:
                    raise Exception(
                        "ref to {} which doesn't exist".format(pairs["ref"]))
            for entry in pairs:
                if not test_length:
                    # ignore vowel-length marks unless explicitly testing them
                    entry = strip_length(entry)
                # each entry has the shape "s1|s2>s3<s4|s5"
                s1, s234, s5 = entry.split("|")
                s2, s34 = s234.split(">")
                s3, s4 = s34.split("<")
                # escape parentheses so they are matched literally
                s3 = s3.replace("(", "\\(")
                s3 = s3.replace(")", "\\)")
                s5 = s5.replace("(", "\\(")
                s5 = s5.replace(")", "\\)")
                # the form must end in s1 + s3 + s5; the stem is everything up
                # to and including s1, with s2 appended
                regex_pair = ("(.*{}){}{}$".format(s1, s3, s5), s2)
                if re.match(regex_pair[0], form):
                    stem = rebreath(
                        strip_accents(
                            re.sub(regex_pair[0], r"\1" + regex_pair[1], form)))
                    stem_set.add(EndingInfo(stem, (s1, s2, s3, s4, s5)))
        else:
            return None
        return stem_set
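

# Usage sketch (not part of the original module): a minimal, hypothetical way
# to drive Lexicon and Endings together. The file names, lemma, surface form,
# and parse code below are assumptions for illustration only.
if __name__ == "__main__":
    lexicon = Lexicon("lexicon.yaml")    # hypothetical path
    endings = Endings("endings.yaml")    # hypothetical path

    lemma, form, parse = "λύω", "λύομεν", "PAI.1P"   # assumed parse-code shape

    # stems the lexicon predicts for this lemma under this parse ...
    stems = lexicon.stem_info(lemma, parse)
    # ... and stems recoverable from the surface form via the ending tables
    ending_infos = endings.ending_info(form, parse)

    if stems and ending_infos:
        # one plausible check: do the two sources agree on at least one stem?
        agreed = {s.stem for s in stems} & {e.stem for e in ending_infos}
        print(agreed)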