-
Notifications
You must be signed in to change notification settings - Fork 3
/
parse_morphgnt.py
executable file
·68 lines (51 loc) · 2.01 KB
/
parse_morphgnt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env python3
from characters import strip_length
from pysblgnt import morphgnt_rows
from parse import Lexicon, Endings
# Lemmas the checking loop skips without any further processing.
IGNORE_LIST = ["σαβαχθάνι", "ἔνι", "χρή"]

# Relative paths of the two YAML data files handed to the loaders below.
LEXICON_FILE = "lexicons/morphgnt.yaml"
ENDINGS_FILE = "stemming.yaml"

# Both helpers are constructed at module level, i.e. as soon as this file
# is imported or executed, not only when it runs as a script.
lexicon = Lexicon(LEXICON_FILE)
endings = Endings(ENDINGS_FILE)
def _parse_code(ccat_parse):
    """Return the parse code built from a ``ccat-parse`` string, or None.

    The character at index 3 selects the shape of the result:

    * ``"N"``: characters 1-3 only;
    * ``"P"``: characters 1-3, a dot, then characters 4-6;
    * ``"I"``: characters 1-3, a dot, then characters 0 and 5.

    Any other character at index 3 yields None so the caller can skip the
    row.

    NOTE(review): index 3 is presumably the mood column of the MorphGNT
    parsing code (infinitive / participle / indicative) -- confirm against
    the MorphGNT column documentation.
    """
    marker = ccat_parse[3]
    if marker == "N":
        return ccat_parse[1:4]
    if marker == "P":
        return ccat_parse[1:4] + "." + ccat_parse[4:7]
    if marker == "I":
        return ccat_parse[1:4] + "." + ccat_parse[0] + ccat_parse[5]
    return None


def _normalize_stem(stem):
    """Return *stem* with every "|" removed and then run through strip_length."""
    return strip_length(stem.replace("|", ""))


if __name__ == "__main__":
    # morphgnt_rows is called once for each book number from 1 to 27.
    for book_num in range(1, 28):
        for row in morphgnt_rows(book_num):
            ccat_pos = row["ccat-pos"]
            ccat_parse = row["ccat-parse"]
            form = row["norm"]
            lemma = row["lemma"]

            # Only rows tagged "V-" are checked, and never the ignored lemmas.
            if ccat_pos != "V-":
                continue
            if lemma in IGNORE_LIST:
                continue

            parse = _parse_code(ccat_parse)
            if parse is None:
                continue

            stem_info = lexicon.stem_info(lemma, parse, context=row["bcv"])
            if stem_info is None:
                print("couldn't get stem info for {} {}".format(lemma, parse))
                continue

            ending_info = endings.ending_info(form, parse)

            # Normalize each lexicon stem exactly once and keep it paired
            # with its entry, so the intersection and the report below use
            # the same comparison key.
            lexicon_stems = [
                (_normalize_stem(info.stem), info) for info in stem_info
            ]
            valid_stems = (
                {stem for stem, _ in lexicon_stems}
                & {info.stem for info in ending_info}
            )

            # Exactly one shared stem is the expected outcome; stay silent.
            if len(valid_stems) == 1:
                continue

            print(form, parse, lemma, len(valid_stems))
            print(" {}".format(stem_info))
            print(" {}".format(ending_info))
            for valid_stem in valid_stems:
                # Match on the normalized stem: comparing the raw
                # ``info.stem`` would never match an entry whose stem
                # contains "|", hiding it from the report.
                for stem, info in lexicon_stems:
                    if stem == valid_stem:
                        print(" {}".format(info))
                for info in ending_info:
                    if info.stem == valid_stem:
                        print(" {}".format(info))