-
Notifications
You must be signed in to change notification settings - Fork 0
/
guess_deprels.py
146 lines (130 loc) · 4.53 KB
/
guess_deprels.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import re
import sys
from collections import namedtuple
# One record per token line: ten string columns, in the order they appear
# when a line is split on tabs.
Token = namedtuple(
    "token",
    [
        "id", "form", "lemma", "upos", "xpos",
        "feats", "head", "deprel", "edeprel", "other",
    ],
)
def assign_deprel(token, in_pp, nouny, last_token):
    """Guess a dependency relation label for ``token``.

    Args:
        token: Current token; its ``id``, ``form``, ``lemma``, ``upos`` and
            ``xpos`` fields are consulted.
        in_pp: Flag handed back with the label. It is only changed (to
            ``False``) when a nouny token is labelled ``obl``.
        nouny: Whether the caller treats the token as noun-like.
        last_token: Previous token; its ``upos``, ``xpos`` and ``deprel``
            fields are consulted.

    Returns:
        A ``(deprel, in_pp)`` pair.
    """
    exact_xpos = {
        "Cc": "cc", "Cs": "mark", "Dd": "det", "Dq": "det", "I": "discourse",
        "Mc": "nummod", "Mo": "nummod", "Nv": "xcomp:pred", "Px": "nmod",
        "Nn-mv": "vocative", "Nn-fv": "vocative",
        "Q-r": "mark:prt",
        "Sa": "case", "Sp": "case", "Spa-s": "case", "Uv": "case:voc",
        "Xx": "dep"
    }
    exact_upos = {
        "ADJ": "amod", "ADV": "advmod", "AUX": "cop", "PUNCT": "punct", "PART": "mark:prt"
    }

    # An exact tag match decides the label outright; XPOS is tried before UPOS.
    for tag, table in ((token.xpos, exact_xpos), (token.upos, exact_upos)):
        if tag in table:
            return table[tag], in_pp

    # Ordered heuristics: all are evaluated, and the last one that holds wins.
    heuristics = (
        (token.form == "ais", "obl"),
        (token.id == "1", "root"),
        (token.lemma == "airson", "case"),
        ((token.xpos in ["Nn", "Nt"]) and last_token.upos == "NOUN", "nmod"),
        (token.xpos.startswith("Td"), "det"),
        (token.xpos.startswith("Dp"), "nmod:poss"),
        (re.match('^V.*d$', token.xpos), "ccomp"),
        (last_token.deprel == "cc", "conj"),
        (last_token.deprel == "cop" and (token.form in ["e", "an"]), "fixed"),
    )
    guess = "dep"
    for holds, label in heuristics:
        if holds:
            guess = label

    # Noun-like tokens ignore the guess above and are labelled from context.
    if nouny:
        if last_token.upos == "VERB":
            return "nsubj", in_pp
        # Labelling a token "obl" also clears the in_pp flag.
        return ("obl", False) if in_pp else ("obj", in_pp)

    # The previous token's XPOS takes precedence over the guess.
    if last_token.xpos == "Qq":
        return "ccomp", in_pp
    if last_token.xpos == "Q-r":
        return "acl:relcl", in_pp
    return guess, in_pp
def assign_head(root, deprel, noun_id, verb_id, token):
    """Guess the head index for ``token`` given its dependency relation.

    The rules run top to bottom and a later matching rule overrides an
    earlier one, so the more specific rules sit below the generic ones.

    Args:
        root: Index of the sentence root, or ``-1`` when none is known.
        deprel: Dependency relation label of ``token``.
        noun_id: Index of the most recent noun-like token seen by the caller.
        verb_id: Index of the most recent verb-like token seen by the caller.
        token: Current token; ``id``, ``form``, ``upos`` and ``xpos`` are read.

    Returns:
        The guessed head index as an ``int``; ``0`` when no rule applies or
        when ``deprel`` is ``"root"``.

    Raises:
        ValueError: If a rule needs ``int(token.id)`` and ``token.id`` is not
            an integer string.
    """
    head = 0
    if deprel == "root":
        return head
    if deprel in ["acl:relcl", "amod"]:
        head = noun_id
    if deprel in ["advmod", "ccomp", "nsubj", "obl", "xcomp:pred", "punct"]:
        head = verb_id
    # The "ro" rule must follow the generic advmod rule above. It used to
    # precede it, so its result was always overwritten with verb_id.
    if deprel == "advmod" and token.form == "ro":
        head = int(token.id) + 1
    if deprel == "case" and token.upos == "ADP":
        head = int(token.id) + 2
    if deprel == "case" and token.upos == "PART":
        head = int(token.id) + 1
    # This rule does not look at deprel at all.
    if token.form == "math" and token.upos == "ADV":
        head = int(token.id) + 1
    if deprel in ["case:voc", "cc", "cop", "mark", "mark:prt", "nmod:poss"]:
        head = int(token.id) + 1
    if deprel == "conj" and root != -1:
        head = root
    if deprel in ["dep", "discourse"]:
        head = verb_id
    if deprel in ["compound", "fixed", "nmod"]:
        head = int(token.id) - 1
    if deprel == "det":
        # XPOS "Dd" attaches to the previous token, any other det to the next.
        if token.xpos == "Dd":
            head = int(token.id) - 1
        else:
            head = int(token.id) + 1
    if deprel == "nummod":
        head = int(token.id) + 1
    if deprel == "obj" and token.upos == "PART":
        head = int(token.id) + 1
    return head
def process_line(line, last_token, in_pp, verb_id, noun_id, root):
    """Fill in the head and deprel of one token line, print it, and update state.

    The line is split on tabs into a ``Token``. A deprel or head column
    holding ``"_"`` is replaced by a guess from ``assign_deprel`` or
    ``assign_head``; any other value is kept. The completed line is printed.

    Args:
        line: One tab-separated token line.
        last_token: The token returned for the previous line.
        in_pp: Flag set after a ``case`` relation and cleared after ``obl``
            or ``nmod``.
        verb_id: Index of the latest token with UPOS ``VERB`` or XPOS ``Nv``
            (or of an explicit root).
        noun_id: Index of the latest token with UPOS NOUN, PRON or PROPN.
        root: Index of the token whose deprel column was ``root``.

    Returns:
        ``(token, in_pp, verb_id, noun_id, root)`` with the updated values.
    """
    current = Token(*line.split("\t"))

    # A root already given in the input also becomes the current verb anchor.
    if current.deprel == "root":
        root = verb_id = int(current.id)

    nouny = current.upos in ("NOUN", "PRON", "PROPN")
    if nouny:
        noun_id = int(current.id)

    # Keep the columns from the input unless they hold the "_" placeholder.
    deprel = current.deprel
    if deprel == "_":
        deprel, in_pp = assign_deprel(current, in_pp, nouny, last_token)
    head = current.head
    if head == "_":
        head = assign_head(root, deprel, noun_id, verb_id, current)

    if deprel == "case":
        in_pp = True
    elif deprel in ("obl", "nmod"):
        in_pp = False

    # Updated only after the head was chosen, so a verb never heads itself here.
    if current.upos == "VERB" or current.xpos == "Nv":
        verb_id = int(current.id)

    completed = current._replace(head=str(head), deprel=deprel)
    print("\t".join(completed).strip())
    return completed, in_pp, verb_id, noun_id, root
def guess_deprels(file):
    """Print every line of ``file``, filling in token lines via ``process_line``.

    Lines starting with ``#``, blank lines, and lines whose first tab-separated
    column contains ``-`` are printed stripped and otherwise unchanged. Every
    other line goes through ``process_line``.

    Args:
        file: An iterable of text lines, such as an open file.
    """
    empty = Token(*([""] * 10))
    last_token = empty
    in_pp = False
    verb_id = noun_id = root = -1

    for line in file:
        stripped = line.strip()
        if line.startswith('#'):
            print(stripped)
            # A comment line resets the running state.
            # NOTE(review): noun_id is not reset here, unlike the others - confirm intended.
            root, verb_id = 0, 1
            in_pp = False
            last_token = empty
            continue
        if stripped == "" or "-" in line.split("\t")[0]:
            print(stripped)
            continue
        last_token, in_pp, verb_id, noun_id, root = process_line(
            line, last_token, in_pp, verb_id, noun_id, root
        )
# Process the file named by the first command-line argument. The context
# manager closes the handle once processing finishes or fails.
with open(sys.argv[1]) as input_file:
    guess_deprels(input_file)