#!/usr/bin/python
"""
Jon Tatum
Classifiers.py
Classifiers for the sentiment problem.
"""
import numpy

class Vectorizer(object):
    """Abstract base: maps a tweet to a numpy feature vector."""

    def __init__(self):
        pass

    def to_vector(self, tweet):
        raise NotImplementedError("Abstract Class")

# initial model
class UnigramVectorizer(Vectorizer):
    """Bag-of-words counts; unknown tokens fall into the "<OMIT>" slot."""

    def __init__(self, token_map):
        super(UnigramVectorizer, self).__init__()
        self.token_map = token_map

    def to_vector(self, tweet):
        n = self.feature_size
        v = numpy.zeros(n)
        for token in tweet['tokens']:
            if token in self.token_map:
                v[self.token_map[token]] += 1.0
            else:
                v[self.token_map["<OMIT>"]] += 1.0
        return v

    @property
    def feature_size(self):
        return len(self.token_map)
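
# A minimal usage sketch (illustrative only; this token_map is hypothetical):
# token_map maps each vocabulary token to a column index and reserves one
# slot for the out-of-vocabulary marker "<OMIT>".
#
#   token_map = {"good": 0, "movie": 1, "<OMIT>": 2}
#   vec = UnigramVectorizer(token_map)
#   vec.to_vector({'tokens': ["good", "good", "unseen"]})
#   # -> array([ 2.,  0.,  1.])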

class KGramVectorizer(Vectorizer):
    """Counts token k-grams; unseen k-grams fall into the "<OMIT>" slot."""

    def __init__(self, token_map, kgram_map, k):
        super(KGramVectorizer, self).__init__()
        self.kgram_map = kgram_map
        self.token_map = token_map
        self.k = k

    def to_vector(self, tweet):
        toks = tweet['tokens']
        k = self.k
        padlen = k - 1
        # pad both ends with "<NULL>" so boundary k-grams are counted
        padded = ["<NULL>" for i in xrange(padlen)] + toks + ["<NULL>" for i in xrange(padlen)]
        v = numpy.zeros(self.feature_size)
        # slide a window of length k across the padded token list
        for i in xrange(len(toks) + padlen):
            kgid = tuple(padded[i:i + k])
            if kgid in self.kgram_map:
                idx = self.kgram_map[kgid]
            else:
                idx = self.kgram_map["<OMIT>"]
            v[idx] += 1
        return v

    @property
    def feature_size(self):
        return len(self.kgram_map)
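
# A minimal usage sketch (illustrative only; this kgram_map is hypothetical):
# kgram_map keys are k-tuples of tokens plus the "<OMIT>" fallback, e.g. for
# bigrams (k = 2):
#
#   kgram_map = {("<NULL>", "good"): 0, ("good", "movie"): 1,
#                ("movie", "<NULL>"): 2, "<OMIT>": 3}
#   vec = KGramVectorizer(token_map, kgram_map, 2)
#   vec.to_vector({'tokens': ["good", "movie"]})
#   # -> array([ 1.,  1.,  1.,  0.])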

class KGramUniGramVectorizer(Vectorizer):
    """Concatenates unigram counts and k-gram counts into one vector."""

    def __init__(self, token_map, kgram_map, k):
        super(KGramUniGramVectorizer, self).__init__()
        self.kgv = KGramVectorizer(token_map, kgram_map, k)
        self.ugv = UnigramVectorizer(token_map)

    def to_vector(self, tweet):
        v = numpy.zeros(self.feature_size)
        a = self.ugv.to_vector(tweet)
        b = self.kgv.to_vector(tweet)
        # unigram block first (dropping its last slot), then the k-gram block
        al = self.ugv.feature_size - 1
        bl = self.kgv.feature_size
        v[0:al] = a[0:al]
        v[al:al + bl] = b[0:bl]
        return v

    @property
    def feature_size(self):
        return self.kgv.feature_size + self.ugv.feature_size - 1
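
# A minimal usage sketch (illustrative only): the combined vector is the
# unigram block (minus its last slot) followed by the full k-gram block.
#
#   vec = KGramUniGramVectorizer(token_map, kgram_map, 2)
#   vec.feature_size  # == len(token_map) - 1 + len(kgram_map)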

class Classifier(object):
    class ClassifierException(ValueError):
        pass

    def __init__(self):
        pass

    def train(self, training_set, labels):
        raise NotImplementedError("Abstract Class")

    def classify(self, example):
        raise NotImplementedError("Abstract Class")

    def classify_many(self, examples):
        result = numpy.zeros(len(examples))
        for i, example in enumerate(examples):
            result[i] = self.classify(example)
        return result

# MULTINOMIAL NAIVE BAYES
class NBClassifier(Classifier):
    """
    Multinomial naive Bayes classifier (easy baseline for checking methods).
    Parameters: n -- the size of the feature vectors
                labels -- the class labels (the number of classes k is len(labels))
                use_priors -- whether to add log class priors at classification time
    """

    def __init__(self, n, labels, use_priors=True):
        super(NBClassifier, self).__init__()
        self.theta = None
        self.class_priors = None
        self.n = n
        self.k = len(labels)
        self.label_to_idx = {l: i for i, l in enumerate(labels)}
        self.idx_to_label = {i: l for i, l in enumerate(labels)}
        self.use_priors = use_priors

    def train(self, training_set, labels):
        # maximum-likelihood estimates with add-2 smoothing
        class_priors = numpy.zeros([self.k, 1])
        theta = numpy.zeros([self.k, self.n])
        num_examples = labels.shape[0]
        feature_priors = numpy.zeros([1, self.n])
        for label in self.label_to_idx:
            lbl_idx = self.label_to_idx[label]
            indices = numpy.where(labels == label)[0]
            class_examples = training_set[indices, :]
            theta[lbl_idx, :] = class_examples.sum(0) + 2  # sum counts over rows, smooth
            feature_priors += theta[lbl_idx, :]
            theta[lbl_idx, :] *= 1.0 / theta[lbl_idx, :].sum()
            class_priors[lbl_idx] = len(indices) / float(num_examples)
        # convert parameters to log space
        self.class_priors = numpy.log(class_priors)
        feature_priors *= 1.0 / feature_priors.sum()
        self.feature_priors = numpy.log(feature_priors).T
        self.theta = numpy.log(theta).T

    def classify(self, example):
        if self.class_priors is None:
            raise Classifier.ClassifierException("ERROR: model not trained")
        ex = example.reshape([1, self.n])
        lls = ex.dot(self.theta) - ex.dot(self.feature_priors)
        if self.use_priors:
            lls += self.class_priors.T
        return self.idx_to_label[numpy.argmax(lls)]
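
# A minimal usage sketch (illustrative only; X and y are hypothetical): rows
# of the training set are count vectors (e.g. from UnigramVectorizer) and
# labels is a parallel numpy array.
#
#   X = numpy.array([[2., 0., 1.],
#                    [0., 3., 0.]])
#   y = numpy.array(["pos", "neg"])
#   nb = NBClassifier(3, ["pos", "neg"])
#   nb.train(X, y)
#   nb.classify(numpy.array([1., 0., 0.]))  # -> "pos"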

class PCA(object):
    """
    Project features into a k-dimensional space.
    """

    def __init__(self, k=200):
        self.basis = None
        self.k = k

    def train(self, training_set):
        mus = numpy.mean(training_set, axis=0)
        d = training_set - mus  # center the data
        cov = numpy.cov(d.T)
        evals, evecs = numpy.linalg.eigh(cov)
        self.mus = mus
        self.cov = cov
        self.evecs = evecs
        self.evals = evals
        # keep the eigenvectors belonging to the k largest eigenvalues
        self.basis = evecs[:, (-evals).argsort()[0:self.k]]
        projection = d.dot(self.basis)
        return projection

    def project(self, data):
        if self.basis is None:
            raise Classifier.ClassifierException("model not trained")
        return (data - self.mus).dot(self.basis)
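
# A minimal usage sketch (illustrative only; train_vectors and test_vectors
# are hypothetical (m, n) arrays): fit the basis on training data, then map
# held-out data into the same k-dimensional space.
#
#   pca = PCA(k=2)
#   low_dim_train = pca.train(train_vectors)  # returns the (m, 2) projection
#   low_dim_test = pca.project(test_vectors)  # reuses the learned mean and basis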
# SVM HERE!!!