-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathtransform_lemmata.py
122 lines (96 loc) · 4.34 KB
/
transform_lemmata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
from cltk.corpus.greek.beta_to_unicode import Replacer
from collections import Counter
from collections import defaultdict
import operator
# Shared beta-code converter instance; the lemmata parser in this file calls
# its beta_code() method on headwords and lemmata when run with greek=True.
replacer = Replacer()
# Hand-picked form -> headword overrides. The script section of this file
# applies them on top of the automatically chosen headwords.
MANUAL_REPLACEMENTS = {
    # forms mapped to 'εἰμί'
    'ἐστὶν': 'εἰμί',
    'ἐστὶ': 'εἰμί',
    'ἐστί': 'εἰμί',
    'ἐστίν': 'εἰμί',
    # forms mapped to 'λαμβάνω'
    'λαμβάνει': 'λαμβάνω',
    'λάβοι': 'λαμβάνω',
    'λαβὼν': 'λαμβάνω',
    'λαμβάνων': 'λαμβάνω',
}
def file_line_generator(file):
    """Yield the lines of a text file one at a time, without line endings.

    Args:
        file: Path of the file to open for reading.

    Yields:
        Each line of the file as a string with its trailing newline removed.
        A final line that has no trailing newline is yielded unchanged.
    """
    with open(file) as file_opened:
        for file_line in file_opened:
            # Strip only a real newline. Slicing off the last character
            # unconditionally would truncate a final line that does not end
            # with a newline.
            yield file_line.rstrip('\n')
def iter_headwords(def_dict):
    """Yield every headword found in the values of ``def_dict``.

    Each value of ``def_dict`` is an iterable of headwords; the keys are
    ignored. A headword that appears under several keys is yielded once per
    appearance.
    """
    for headword_group in def_dict.values():
        yield from headword_group
def make_headword_count(def_dict):
    """Return a ``Counter`` of the headwords yielded by ``iter_headwords``.

    The count for a headword is the number of times ``iter_headwords``
    yields it for ``def_dict``.
    """
    return Counter(iter_headwords(def_dict))
def _beta_code_to_unicode(text):
    """Convert a beta-code string to lowercase text via the shared ``replacer``."""
    # A space is appended so that a word-final sigma is rendered as 'ς';
    # the space is sliced off again afterwards.
    # NOTE: the original author observed that some results still came out
    # capitalised before lowercasing; the explicit .lower() is kept for that.
    return replacer.beta_code(text.upper() + ' ')[:-1].lower()


def parse_perseus_lemmata_file(file_generator, greek):
    """Parse lemmata lines and yield ``(lemma, headword)`` pairs.

    Each line is split on tabs. Field 0 is the headword, field 1 is a
    headword id (not used here), and every further field is one lemma entry:
    the lemma itself, optionally followed by a space and parenthesised
    analysis data, which is ignored.

    Args:
        file_generator: Iterable of line strings without line endings.
        greek: When true, headwords and lemmata are converted from beta code
            with the module-level ``replacer``.

    Yields:
        One ``(lemma, headword)`` tuple per non-empty lemma entry. Lines with
        no lemma entries (including blank lines) yield nothing.

    NOTE(review): the source this was recovered from had lost its
    indentation, so whether the original yielded once per lemma entry or once
    per parenthesised analysis could not be determined. The set of distinct
    pairs is the same either way, and the script in this file collects the
    pairs into sets.
    """
    for count, line in enumerate(file_generator, start=1):
        if count % 10000 == 0:
            print('Parsing line {0}'.format(count))

        fields = line.split('\t')
        headword = fields[0]
        if greek:
            headword = _beta_code_to_unicode(headword)

        # fields[1] is the headword id; lemma entries start at index 2.
        for lemma_str in fields[2:]:
            if not lemma_str:
                # e.g. a trailing tab at the end of the line
                continue
            # Everything after the first space is analysis data that does not
            # influence the yielded pair, so it is not parsed.
            lemma = lemma_str.split(' ', 1)[0]
            if greek:
                lemma = _beta_code_to_unicode(lemma)
            yield lemma, headword
if __name__ == '__main__':
    # Script entry point: read 'greek-lemmata.txt', choose one headword per
    # lemma, apply the manual overrides, and write the result as a Python
    # dict literal named LEMMATA to 'greek_lemmata_cltk.py'.
    file_generator = file_line_generator('greek-lemmata.txt')
    lemma_headword = parse_perseus_lemmata_file(file_generator, greek=True)

    print('Starting to build map …')
    # lemma -> set of every headword it was listed under
    lemmata_dd = defaultdict(set)
    for lemma, headword in lemma_headword:
        lemmata_dd[lemma].add(headword)

    print('Building headword frequencies …')
    headword_frequencies = make_headword_count(lemmata_dd)

    print('Building final lemma-headword dict …')
    # For a lemma with more than one possible headword, keep the headword
    # that occurs most often overall. Break ties by taking the
    # lexicographically greatest. A lemma with a single candidate simply gets
    # that candidate, since max() over a one-element set returns it.
    final_lemmata = {
        lemma: max(headwords, key=lambda hw: (headword_frequencies[hw], hw))
        for lemma, headwords in lemmata_dd.items()
    }

    # Manual overrides only replace lemmata that were already found.
    # could be improved to add pairs not in final_lemmata
    for form, headword in MANUAL_REPLACEMENTS.items():
        if form in final_lemmata:
            final_lemmata[form] = headword

    print('Starting to write file …')
    # The output contains non-ASCII text and is itself a Python source file,
    # so write it as UTF-8 explicitly instead of relying on the locale's
    # default encoding.
    with open('greek_lemmata_cltk.py', 'w', encoding='utf-8') as file_opened:
        print('LEMMATA = {', file=file_opened)
        for word, lemma in sorted(final_lemmata.items()):
            print('{!r}: {!r},'.format(word, lemma), file=file_opened)
        print('}', file=file_opened)