data.py
import idx2numpy
import numpy
from lshash import LSHash
# Converts data from the MNIST data set http://yann.lecun.com/exdb/mnist/ to numpy arrays
# Uses idx2numpy https://github.com/ivanyu/idx2numpy
# To generalize, may want to turn each image into a single vector

folder = "data/"

# Returns training data as a list of (image(28*28), label) tuples
def getTraining(size):
    trainImages = idx2numpy.convert_from_file(folder + 'train-images-idx3-ubyte')
    trainLabels = idx2numpy.convert_from_file(folder + 'train-labels-idx1-ubyte')
    trainImages = trainImages.astype(float)
    size = min(trainImages.shape[0], size)
    return zip(trainImages[:size], trainLabels[:size])

# Returns testing data as a list of (image(28*28), label) tuples
def getTesting(size):
    testImages = idx2numpy.convert_from_file(folder + 't10k-images-idx3-ubyte')
    testLabels = idx2numpy.convert_from_file(folder + 't10k-labels-idx1-ubyte')
    testImages = testImages.astype(float)
    size = min(testImages.shape[0], size)
    return zip(testImages[:size], testLabels[:size])

# Returns training data as a list of (image(1*784), label) tuples
def getTrainingVectors(size):
    trainingData = getTraining(size)
    # Turn each image into a single vector
    for i in xrange(len(trainingData)):
        trainingData[i] = (trainingData[i][0].flatten(), trainingData[i][1])
    return trainingData

# Returns testing data as a list of (image(1*784), label) tuples
def getTestingVectors(size):
    testingData = getTesting(size)
    # Turn each image into a single vector
    for i in xrange(len(testingData)):
        testingData[i] = (testingData[i][0].flatten(), testingData[i][1])
    return testingData

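# Sketch (not in the original file): the flattened vectors stack naturally into a
# single design matrix plus a label vector, which is the form most generic
# classifiers expect. getTrainingMatrix is a hypothetical helper for illustration.
def getTrainingMatrix(size):
    pairs = getTrainingVectors(size)
    images = numpy.array([image for image, label in pairs])  # shape (n, 784)
    labels = numpy.array([label for image, label in pairs])  # shape (n,)
    return images, labels
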
def getTrainingLSH(size):
    trainingData = getTrainingVectors(size)
    print "Building Locality Sensitive Hash Tables..."
    # This is very slow, and the result should be persisted
    # Hash all training examples
    # Choosing the size of the hash and the number of queries allows for a
    # tradeoff between speed and accuracy
    lsh = LSHash(24, trainingData[0][0].size, num_hashtables=18)
    for i in xrange(len(trainingData)):
        image = trainingData[i][0]
        label = trainingData[i][1]
        lsh.index(image, extra_data=label)
    return lsh
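
# Usage sketch (not part of the original file): classify a test image by querying
# the hash tables for its approximate nearest neighbours and taking a majority vote
# over their labels. This assumes lshash's query() returns ((vector, extra_data),
# distance) entries, with extra_data holding the label stored by lsh.index() above;
# check the installed lshash version. classifyLSH and the sizes below are
# illustrative, not part of the original module.
def classifyLSH(lsh, imageVector, num_results=5):
    votes = {}
    for (vector, label), distance in lsh.query(imageVector, num_results=num_results):
        votes[label] = votes.get(label, 0) + 1
    # Return the most common label among the neighbours, or None if nothing matched
    return max(votes, key=votes.get) if votes else None

if __name__ == "__main__":
    lsh = getTrainingLSH(1000)
    for image, label in getTestingVectors(20):
        print "predicted:", classifyLSH(lsh, image), "actual:", label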