-
Notifications
You must be signed in to change notification settings - Fork 3
/
feature_extractor.py
127 lines (96 loc) · 3.18 KB
/
feature_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
from nltk.corpus import wordnet
#gets the word
def word(sent, i, history):
    """Build a feature dict holding only the token at position ``i``.

    ``sent[i]`` must unpack into a ``(word, pos)`` pair. ``history`` is
    accepted for signature compatibility but is not read here.
    """
    token, _tag = sent[i]
    return dict(word=token)
#gets the part of speech of the word
def pos(sent, i, history):
    """Build a feature dict holding only the part of speech at position ``i``.

    ``sent[i]`` must unpack into a ``(word, pos)`` pair. ``history`` is
    accepted for signature compatibility but is not read here.
    """
    _token, tag = sent[i]
    return dict(pos=tag)
#gets the part of speech and the word
def pos_word(sent, i, history):
    """Build a feature dict with the part of speech and the token at ``i``.

    ``sent[i]`` must unpack into a ``(word, pos)`` pair. ``history`` is
    accepted for signature compatibility but is not read here.
    """
    token, tag = sent[i]
    return dict(pos=tag, word=token)
#gets the part of speech of the previous word and of the current word
def prev_pos(sent, i, history):
    """Build a feature dict with the current and previous parts of speech.

    When ``i`` is 0 there is no previous element, so ``'<START>'`` is used
    as the previous part of speech. ``history`` is not read here.
    """
    _token, tag = sent[i]
    # Default to the sentence-start marker; overwrite when a predecessor exists.
    prev_tag = '<START>'
    if i != 0:
        _prev_token, prev_tag = sent[i - 1]
    return {'pos': tag, 'prevpos': prev_tag}
#gets the part of speech of the previous word and of the current word, plus the current word itself
def prev_pos_word(sent, i, history):
    """Build a feature dict with current/previous parts of speech and the token.

    When ``i`` is 0 there is no previous element, so ``'<START>'`` is used
    as the previous part of speech. ``history`` is not read here.
    """
    token, tag = sent[i]
    # Default to the sentence-start marker; overwrite when a predecessor exists.
    prev_tag = '<START>'
    if i != 0:
        _prev_token, prev_tag = sent[i - 1]
    return {'pos': tag, 'prevpos': prev_tag, 'word': token}
#gets the part of speech of the next word, and the current word
def next_pos(sent, i, history):
    """Build a feature dict with the current and next parts of speech.

    When ``i`` is the last index of ``sent`` there is no next element, so
    ``'<END>'`` is used as the next part of speech. ``history`` is not read.
    """
    _token, tag = sent[i]
    # Default to the sentence-end marker; overwrite when a successor exists.
    next_tag = '<END>'
    if i != len(sent) - 1:
        _next_token, next_tag = sent[i + 1]
    return {'pos': tag, 'nextpos': next_tag}
#get the part of speech of the current and next words, and the current word itself
def next_pos_word(sent, i, history):
    """Build a feature dict with current/next parts of speech and the token.

    The ``'word'`` entry is the token at ``i`` (the current one). When ``i``
    is the last index of ``sent``, ``'<END>'`` stands in for the next part of
    speech. ``history`` is not read here.
    """
    token, tag = sent[i]
    # Default to the sentence-end marker; overwrite when a successor exists.
    next_tag = '<END>'
    if i != len(sent) - 1:
        _next_token, next_tag = sent[i + 1]
    return {'pos': tag, 'nextpos': next_tag, 'word': token}
#get the part of speech of the current, previous and next words
def prev_next_pos(sent, i, history):
    """Build a feature dict with the previous, current and next parts of speech.

    ``'<START>'`` replaces the previous part of speech when ``i`` is 0 and
    ``'<END>'`` replaces the next one when ``i`` is the last index of
    ``sent``. ``history`` is not read here.
    """
    _token, tag = sent[i]

    # Look left first, then right, falling back to the boundary markers.
    prev_tag = '<START>'
    if i != 0:
        _prev_token, prev_tag = sent[i - 1]

    next_tag = '<END>'
    if i != len(sent) - 1:
        _next_token, next_tag = sent[i + 1]

    return {'pos': tag, 'nextpos': next_tag, 'prevpos': prev_tag}
#get the part of speech of the current, previous and next words, and the current word
def prev_next_pos_word(sent, i, history):
    """Build a feature dict with previous/current/next parts of speech and the token.

    ``'<START>'`` replaces the previous part of speech when ``i`` is 0 and
    ``'<END>'`` replaces the next one when ``i`` is the last index of
    ``sent``. ``history`` is not read here.
    """
    token, tag = sent[i]

    # Look left first, then right, falling back to the boundary markers.
    prev_tag = '<START>'
    if i != 0:
        _prev_token, prev_tag = sent[i - 1]

    next_tag = '<END>'
    if i != len(sent) - 1:
        _next_token, next_tag = sent[i + 1]

    return {'pos': tag, 'nextpos': next_tag, 'word': token, 'prevpos': prev_tag}
#return whether the word is in the English dictionary (looked up via WordNet)
def in_dictionary(sent, i, history):
    """Build a feature dict flagging whether WordNet knows the token at ``i``.

    ``sent[i]`` must unpack into a ``(word, pos)`` pair. ``history`` is not
    read here.

    Returns ``{'dictionary': True}`` when ``wordnet.synsets`` yields at least
    one result for the token, otherwise ``{'dictionary': False}``.
    """
    token, _tag = sent[i]
    # The lookup result is a list, which is unhashable and therefore a poor
    # feature value; collapse it to a plain bool ("found anything?").
    return {'dictionary': bool(wordnet.synsets(token))}
#return if the word is capitalized
def is_capitalized(sent, i, history):
    """Build a feature dict flagging whether the token at ``i`` starts uppercase.

    ``sent[i]`` must unpack into a ``(word, pos)`` pair. ``history`` is not
    read here.

    Returns ``{'caps': True}`` when the first character of the token is an
    uppercase letter, otherwise ``{'caps': False}``. An empty token gives
    ``False``.
    """
    token, _tag = sent[i]
    # Slice instead of index so an empty token yields '' (and ''.isupper() is
    # False) rather than raising IndexError. The WordNet lookup that used to
    # sit here had its result thrown away, so it has been dropped.
    return {'caps': token[:1].isupper()}
#return if the word is in all caps
def is_all_capitalized(sent, i, history):
    """Build a feature dict flagging whether every character of the token is uppercase.

    ``sent[i]`` must unpack into a ``(word, pos)`` pair. ``history`` is not
    read here.

    Each character is tested individually with ``str.isupper``, so a token
    containing digits or punctuation is not considered all-caps. An empty
    token yields ``True`` (there is no character that fails the test).
    """
    token, _tag = sent[i]
    # Every character must pass; previously only the last character's result
    # survived the loop, so e.g. "aB" was reported as all caps.
    return {'allcaps': all(char.isupper() for char in token)}
#a helper fn that checks if a word is a number
def number(s):
    """Helper: report whether ``float(s)`` succeeds.

    Returns ``True`` when the conversion works and ``False`` when it raises
    ``ValueError``; any other exception propagates to the caller.
    """
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True
#return if the word is a number
def is_numeric(sent, i, history):
    """Build a feature dict flagging whether the token at ``i`` parses as a number.

    ``sent[i]`` must unpack into a ``(word, pos)`` pair; the decision is
    delegated to the ``number`` helper. ``history`` is not read here.
    """
    token, _tag = sent[i]
    parses_as_number = number(token)
    return {'num': parses_as_number}