import os

import numpy as np
from sklearn import cluster
from sklearn import mixture

CLUSTERING_METHODS = ('kmeans', 'birch', 'gaussian_mixture', 'agglomerative_clustering')


def normalize_predictions(predictions):
"""
We take the assumption that the data set contains less than 50 % of outlier.
Given that the classifier, gives the label 0 and 1 for the same data
randomly. We make sure that an inlier is described as a 0.
:param predictions: A 1 D numpy array with the predictions of our detector
:return: predictions: A 1 D numpy array with the predictions of our detector, cleaned
"""
if np.sum(predictions) > (len(predictions) / 2 - 1):
predictions = 1 - predictions
return predictions
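

# Example with hypothetical data: for predictions = np.array([1, 1, 1, 0]),
# more than half of the points carry the label 1, so normalize_predictions
# flips the array to [0, 0, 0, 1] and the majority class becomes the inliers.

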
def detection_with_kmeans(image_set):
"""
Fast, but might not be able to map great for nonlinear separation of classes.
:param image_set: The bottleneck values of the relevant images.
:return: Predictions vector
"""
clf = cluster.KMeans(n_clusters=2)
clf.fit(image_set)
predictions = clf.labels_
predictions = normalize_predictions(predictions)
return predictions
def detection_with_birch(image_set):
"""
:param image_set: The bottleneck values of the relevant images.
:return: Predictions vector
"""
# The branching_factor, might be fine tune for better results
clf = cluster.Birch(n_clusters=2)
clf.fit(image_set)
predictions = clf.labels_
predictions = normalize_predictions(predictions)
return predictions
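

# A hypothetical tuning sketch: Birch defaults to branching_factor=50 and
# threshold=0.5; lowering them builds a finer CF-tree, e.g.
#   clf = cluster.Birch(n_clusters=2, branching_factor=20, threshold=0.3)
# Whether this helps depends on the spread of the bottleneck values.

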
def detection_with_gaussian_mixture(image_set):
"""
:param image_set: The bottleneck values of the relevant images.
:return: Predictions vector
"""
# Might achieve, better results by initializing weights, or means, given we know when we introduce noisy labels
clf = mixture.GaussianMixture(n_components=2)
clf.fit(image_set)
predictions = clf.predict(image_set)
predictions = normalize_predictions(predictions)
return predictions
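

# A hypothetical initialization sketch: GaussianMixture accepts weights_init
# and means_init, so a known pollution fraction could seed the mixture, e.g.
#   clf = mixture.GaussianMixture(n_components=2, weights_init=[0.8, 0.2])
# means_init expects an array of shape (n_components, n_features).

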
def detection_with_agglomerative_clustering(image_set):
    """
    Really good if the classes you are analyzing are close to what the network learned.

    :param image_set: The bottleneck values of the relevant images.
    :return: Predictions vector

    N.B.: The detector breaks with a full black image.
    """
    # http://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_clustering.html#sphx-glr-auto-examples-cluster-plot-agglomerative-clustering-py
    # Note: scikit-learn renamed the ``affinity`` parameter to ``metric`` in
    # version 1.2 and removed ``affinity`` in 1.4.
    clf = cluster.AgglomerativeClustering(n_clusters=2, affinity="l2", linkage="complete")
    clf.fit(image_set)
    predictions = clf.labels_
    predictions = normalize_predictions(predictions)
    return predictions


def grabbing_pollution(architecture, pollution_dir, pollution_points):
"""
This function that will see if the pollution directory exist, and try to look a .npy file following the right naming
scheme.
:param architecture: Which architecture to use to generate your bottlenecks.
:param pollution_dir: Location of the directory containing precomputed values for random images.
:param pollution_points: Number of points desired by the user to be added to the data values.
:return: An int that contains how many pollution bottleneck we have and a numpy array containing, bottlencks of
random images.
"""
saved_values = os.listdir(pollution_dir)
entry = 'Noise_' + architecture + '.npy'
path = os.path.join(pollution_dir, entry)
if entry not in saved_values:
print('Pollution label not found')
pollution_bottlenecks = np.array()
nb_bottlenecks_to_return = 0
else:
pollution_bottlenecks = np.load(path)
nb_bottlenecks = pollution_bottlenecks.shape[0]
if nb_bottlenecks > pollution_points:
nb_bottlenecks_to_return = pollution_points
else:
print('Problem, not enough polluted bottlenecks have been pre computed')
nb_bottlenecks_to_return = nb_bottlenecks
pollution_bottlenecks = pollution_bottlenecks[:nb_bottlenecks_to_return, :]
return nb_bottlenecks_to_return, pollution_bottlenecks
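

# Example with a hypothetical architecture name: for architecture='inception_v3',
# grabbing_pollution looks for pollution_dir/Noise_inception_v3.npy and returns
# at most pollution_points rows of that array.

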
def semi_supervised_detection(image_set, clustering_method, architecture, pollution_dir,
                              pollution_percent=0.20):
    """
    Assembles the values of the image directory with values from random images,
    performs a clustering on the combined set, and returns the predictions for
    the image bottlenecks only.

    :param image_set: The bottleneck values of the relevant images.
    :param clustering_method: Which algorithm is used to get a prediction on the data.
    :param architecture: Which architecture was used to generate your bottlenecks.
    :param pollution_dir: Location of the directory containing precomputed values for random images.
    :param pollution_percent: Fraction of pollution added to our image values.
    :return: A prediction vector, altered by a given amount of random data, to hopefully
        get a better performance.
    """
    pollution_points = int(image_set.shape[0] * pollution_percent)
    pollution_points, pollution_set = grabbing_pollution(architecture, pollution_dir, pollution_points)
    percent_of_pollution = pollution_points / image_set.shape[0]
    print('We use a pollution of:', percent_of_pollution * 100, '%')
    # Only concatenate when we actually got pollution bottlenecks; otherwise
    # the empty 1-D placeholder array would not match image_set's dimensions.
    if pollution_points > 0:
        synthetic_set = np.concatenate((image_set, pollution_set))
    else:
        synthetic_set = image_set
    if clustering_method == CLUSTERING_METHODS[0]:
        predictions = detection_with_kmeans(synthetic_set)
    elif clustering_method == CLUSTERING_METHODS[1]:
        predictions = detection_with_birch(synthetic_set)
    elif clustering_method == CLUSTERING_METHODS[2]:
        predictions = detection_with_gaussian_mixture(synthetic_set)
    elif clustering_method == CLUSTERING_METHODS[3]:
        predictions = detection_with_agglomerative_clustering(synthetic_set)
    else:
        raise ValueError('Unknown clustering method: ' + clustering_method)
    if pollution_points > 0:
        predictions = predictions[:-pollution_points]
    return predictions
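

if __name__ == '__main__':
    # A minimal self-contained sketch, not part of the original pipeline:
    # synthetic 2-D "bottlenecks" with a small outlier blob, run through the
    # k-means detector directly, skipping the pollution machinery.
    rng = np.random.RandomState(0)
    inliers = rng.normal(loc=0.0, scale=1.0, size=(90, 2))
    outliers = rng.normal(loc=6.0, scale=1.0, size=(10, 2))
    bottlenecks = np.concatenate((inliers, outliers))
    preds = detection_with_kmeans(bottlenecks)
    print('flagged as outliers:', int(np.sum(preds)), 'of', len(preds))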