-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataClassifier.py
207 lines (173 loc) · 7.63 KB
/
dataClassifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# dataClassifier.py
# -----------------
import mostFrequent
import perceptron
import svm
import mlp
import samples
import sys
import util
# Default number of training examples to load (overridable with -t).
TRAINING_SET_SIZE = 5000
# Default number of validation/test examples to load (overridable with -s).
TEST_SET_SIZE = 1000
# Digit images are 28x28 pixels.
DIGIT_DATUM_WIDTH=28
DIGIT_DATUM_HEIGHT=28
def basicFeatureExtractorDigit(datum):
    """
    Maps a digit image to binary pixel features.

    Each (x, y) coordinate becomes a feature whose value is 1 when the
    pixel is gray/black (intensity > 0) and 0 when it is white.
    """
    datum.getPixels()
    features = util.Counter()
    for col in range(DIGIT_DATUM_WIDTH):
        for row in range(DIGIT_DATUM_HEIGHT):
            features[(col, row)] = 1 if datum.getPixel(col, row) > 0 else 0
    return features
def analysis(classifier, guesses, testLabels, testData, rawTestData, printImage):
    """
    Hook called after learning; add any code here to analyze your results.

    Use the printImage(<list of pixels>) function to visualize features.
    An example of use has been given to you.

    - classifier: the trained classifier
    - guesses: the list of labels predicted by the classifier on the test set
    - testLabels: the list of true test labels
    - testData: the list of test datapoints (as util.Counter of features)
    - rawTestData: the list of test datapoints (as samples.Datum)
    - printImage: a method to visualize the features
      (see its use in the odds ratio part in runClassifier method)

    This code won't be evaluated. It is for your own optional use
    (and you can modify the signature if you want).
    """
    # Example: report the first misclassified test example.
    # Single-argument print(...) behaves identically on Python 2 and 3.
    for i in range(len(guesses)):
        prediction = guesses[i]
        truth = testLabels[i]
        if prediction != truth:
            print("===================================")
            print("Mistake on example %d" % i)
            print("Predicted %d; truth is %d" % (prediction, truth))
            print("Image: ")
            print(rawTestData[i])
            break
class ImagePrinter:
    """Renders a list of (x, y) pixel coordinates as a digit-sized image."""

    def __init__(self, width, height):
        # Dimensions (in pixels) of the images this printer renders.
        self.width = width
        self.height = height

    def printImage(self, pixels):
        """
        Prints a Datum object that contains all pixels in the
        provided list of pixels. This will serve as a helper function
        to the analysis function you write.

        Pixels should take the form
        [(2,2), (2, 3), ...]
        where each tuple represents a pixel.
        """
        image = samples.Datum(None, self.width, self.height)
        for pix in pixels:
            try:
                # New features that are not of the form (x, y) should not
                # break this image printer -- they are reported and skipped.
                x, y = pix
                image.pixels[x][y] = 2
            except (TypeError, ValueError, IndexError):
                # Narrowed from a bare `except:`, which would also swallow
                # KeyboardInterrupt/SystemExit. TypeError/ValueError cover
                # unpacking failures; IndexError covers out-of-range coords.
                print("new features: " + str(pix))
                continue
        print(image)
def default(text):
    """Append optparse's default-value marker to an option help string."""
    # Parameter renamed from `str`, which shadowed the builtin of the same
    # name; all call sites in this file pass it positionally.
    return text + ' [Default: %default]'
def readCommand( argv ):
    """Processes the command used to run from the command line.

    Parses argv with optparse and builds the classifier, feature extractor,
    and image printer. Returns (args, options) where args maps 'classifier',
    'featureFunction' and 'printImage' to the constructed objects.

    Raises Exception on unrecognized positional arguments; exits with
    status 2 on an invalid training size or unknown classifier name.
    """
    from optparse import OptionParser
    parser = OptionParser(USAGE_STRING)
    parser.add_option('-c', '--classifier', help=default('The type of classifier'), choices=['mostFrequent', 'perceptron', 'mlp', 'svm'], default='mostFrequent')
    parser.add_option('-t', '--training', help=default('The size of the training set'), default=TRAINING_SET_SIZE, type="int")
    parser.add_option('-w', '--weights', help=default('Whether to print weights'), default=False, action="store_true")
    parser.add_option('-i', '--iterations', help=default("Maximum iterations to run training"), default=3, type="int")
    parser.add_option('-s', '--test', help=default("Amount of test data to use"), default=TEST_SET_SIZE, type="int")
    options, otherjunk = parser.parse_args(argv)
    if len(otherjunk) != 0:
        raise Exception('Command line input not understood: ' + str(otherjunk))
    args = {}

    # Echo the chosen configuration. Single-argument print(...) behaves
    # identically on Python 2 and Python 3.
    print("Doing classification")
    print("--------------------")
    print("classifier:\t\t" + options.classifier)
    print("training set size:\t" + str(options.training))
    printImage = ImagePrinter(DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT).printImage
    featureFunction = basicFeatureExtractorDigit
    # list(...) so the labels are a reusable sequence on Python 3 as well
    # (a bare range object there is lazy and not a list).
    legalLabels = list(range(10))

    if options.training <= 0:
        print("Training set size should be a positive integer (you provided: %d)" % options.training)
        print(USAGE_STRING)
        sys.exit(2)

    if options.classifier == "mostFrequent":
        classifier = mostFrequent.MostFrequentClassifier(legalLabels)
    elif options.classifier == "mlp":
        classifier = mlp.MLPClassifier(legalLabels, options.iterations)
    elif options.classifier == "perceptron":
        classifier = perceptron.PerceptronClassifier(legalLabels, options.iterations)
    elif options.classifier == "svm":
        classifier = svm.SVMClassifier(legalLabels)
    else:
        print("Unknown classifier: " + str(options.classifier))
        print(USAGE_STRING)
        sys.exit(2)

    args['classifier'] = classifier
    args['featureFunction'] = featureFunction
    args['printImage'] = printImage
    return args, options
# Help text shown by optparse; default() fills in the '%default' markers.
# Fixed: the default training-set size is TRAINING_SET_SIZE = 5000, not 100.
USAGE_STRING = """
  USAGE:      python dataClassifier.py <options>
  EXAMPLES:   (1) python dataClassifier.py
                  - trains the default mostFrequent classifier on the digit dataset
                  using the default 5000 training examples and
                  then test the classifier on test data
              (2) python dataClassifier.py -c perceptron -t 1000 -s 500
                  - would run the perceptron classifier on 1000 training examples, would
                  test the classifier on 500 test data points
                 """
# Main harness code
def runClassifier(args, options):
featureFunction = args['featureFunction']
classifier = args['classifier']
printImage = args['printImage']
# Load data
numTraining = options.training
numTest = options.test
rawTrainingData = samples.loadDataFile("data/digitdata/trainingimages", numTraining,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
trainingLabels = samples.loadLabelsFile("data/digitdata/traininglabels", numTraining)
rawValidationData = samples.loadDataFile("data/digitdata/validationimages", numTest,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
validationLabels = samples.loadLabelsFile("data/digitdata/validationlabels", numTest)
rawTestData = samples.loadDataFile("data/digitdata/testimages", numTest,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
testLabels = samples.loadLabelsFile("data/digitdata/testlabels", numTest)
# Extract features
print "Extracting features..."
trainingData = map(featureFunction, rawTrainingData)
validationData = map(featureFunction, rawValidationData)
testData = map(featureFunction, rawTestData)
# Conduct training and testing
print "Training..."
classifier.train(trainingData, trainingLabels, validationData, validationLabels)
print "Validating..."
guesses = classifier.classify(validationData)
correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels))
print "Testing..."
guesses = classifier.classify(testData)
correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
if((options.weights) & (options.classifier == "perceptron")):
for l in classifier.legalLabels:
features_weights = classifier.findHighWeightFeatures(l)
print ("=== Features with high weight for label %d ==="%l)
printImage(features_weights)
if __name__ == '__main__':
    # Parse the command line into a classifier/feature configuration.
    args, options = readCommand( sys.argv[1:] )
    # Train, validate, and test with the chosen settings.
    runClassifier(args, options)