-
Notifications
You must be signed in to change notification settings - Fork 0
/
semisupervised_clustering.m
373 lines (333 loc) · 18.4 KB
/
semisupervised_clustering.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
classdef semisupervised_clustering < handle
%CLUSTERING_PARAMETERS Input for the semisupervised_clustering
%functions
properties(GetAccess = 'public', SetAccess = 'protected')
% number of features
nfeatures = 0;
% feature values
features = [];
% labels - has to be a vector of integers of same length as the
% features
labels = [];
% non-empty labels only
nlabels = [];
% number of external labels (these are used only to guide the
% clustering)
nexternal_labels = 0;
% non-empty labels index
non_empty_labels_idx = [];
% optional -> just to map the labels to classes
classes = [];
nclasses = 0;
segments = [];
config = [];
end
properties(GetAccess = 'public', SetAccess = 'public')
% use two-stage clustering
two_stage = 1;
% force must-link constraints (not recommended)
must_link = 0;
% maximum distance between two elements to define constraints
constraints_max_distance = 0.25;
% percentage of constraints to use for the clustering (for testing
% purposes)
pconstraints = 1.;
end
properties(GetAccess = 'protected', SetAccess = 'protected')
% create an array of integers in which each bit represents one
% label; this will be needed bellow to test if any two sets of labels
% have a common member (using 'intersect' is just absurdely slow)
labels_binary = [];
end
methods
function inst = semisupervised_clustering(conf, seg, feat, lbls, classes_in, next)
% Constructor
inst.config = conf;
inst.segments = seg;
if nargin > 2
inst.classes = classes_in;
inst.nclasses = length(classes_in);
end
if nargin > 3
inst.nexternal_labels = next;
end
inst.nfeatures = size(feat, 2);
% normalize features
inst.features = feat ./ repmat( max(feat) - min(feat), size(feat, 1), 1);
inst.labels = lbls;
% look for non-empty labels
for i = 1:length(lbls)
tmp = lbls{i};
if tmp ~= -1
inst.non_empty_labels_idx = [inst.non_empty_labels_idx, i];
inst.labels_binary = [inst.labels_binary, int32(0)];
for j = 1:length(tmp)
inst.labels_binary(end) = bitset(inst.labels_binary(end), tmp(j) + 1);
end
end
end
inst.nlabels = length(inst.non_empty_labels_idx);
end
function [res, res1st] = cluster(inst, nclusters, test_p)
% run the clustering
mask = ones(1, length(inst.non_empty_labels_idx));
if nargin > 2 && test_p > 0.
n = length(inst.non_empty_labels_idx);
mask(randsample(1:n, floor(n*test_p))) = 0;
end
% run clusterer using a subset of the labels for training and
% all of them for validation
if nargout > 1
% get also the results for the 1st stage
% clustering
[res, res1st] = inst.internal_cluster(nclusters, mask, ones(1, inst.nlabels));
else
res = inst.internal_cluster(nclusters, mask, ones(1, inst.nlabels));
end
end
function [res, res1st] = cluster_cross_validation(inst, nclusters, varargin)
res = [];
if nargout > 1
res1st = [];
end
[folds, ptraining, nruns, test_set] = process_options(varargin, ...
'Folds', 10, 'TrainingPercentage', 0, 'Runs', 1, 'TestSet', [] ...
);
idx = 1:inst.nlabels;
if ~isempty(test_set)
% remove test set from available labels for training
idx = idx(test_set == 0);
else
test_set = ones(1, inst.nlabels);
end
for t = 1:nruns
if ptraining == 0
% use
cv = cvpartition(idx, 'k', folds);
for j = 1:cv.NumTestSets % perforn a N-fold stratified cross-validation
% perform classifcation using only a subset of the labels
training = zeros(1, inst.nlabels);
training(idx(cv.training(j))) = 1;
test = zeros(1, inst.nlabels);
test(idx(cv.test(j))) = 1;
if nargout > 1
% get also the results for the 1st stage
% clustering
[tmp, tmp2] = inst.internal_cluster(nclusters, training, test);
res = [res, tmp];
res1st = [res1st, tmp2];
else
res = [res, inst.internal_cluster(nclusters, training, test)];
end
end
else
% take ptraining
training_set = zeros(1, inst.nlabels);
training_set(idx(randsample(length(idx), floor(length(idx)*ptraining)))) = 1;
if nargout > 1
% get also the results for the 1st stage
% clustering
[tmp, tmp2] = inst.internal_cluster(nclusters, training_set, test_set);
res = [res, tmp];
res1st = [res1st, tmp2];
else
res = [res, inst.internal_cluster(nclusters, training_set, test_set)];
end
end
end
res = clustering_cv_results(res);
if nargout > 1
res1st = clustering_cv_results(res1st);
end
end
end
methods(Access = private)
function [res, res1st] = internal_cluster(inst, nclusters, training_set, test_set)
% divide the data into labelled/unlabelled elements
% move labelled items to the front
labels_idx = inst.non_empty_labels_idx(training_set == 1);
bin_labels = inst.labels_binary(training_set == 1);
lbls = inst.labels(labels_idx);
unlabelled = setdiff(1:length(inst.labels), labels_idx);
% mapping of the reordered features back to the original sequence
results_map = zeros(1, length(inst.labels));
results_map(labels_idx) = 1:length(labels_idx);
results_map(unlabelled) = (length(labels_idx) + 1):length(inst.labels);
% reorded features -> first labelled then unlabelled items
reordered_feat = [inst.features(labels_idx, :); inst.features(unlabelled, :)];
%% build list of constraints
constr = [];
for j = 1:length(labels_idx)
for k = (j + 1):length(labels_idx)
% determine if both sets of labels match (if at least one element
% is present in both sets)
% don't use 'intersect' for this -> its implementation is
% really really slow
if bitand(bin_labels(j), bin_labels(k))
if ~inst.two_stage && inst.must_link && sqrt(sum((inst.features(labels_idx(j), :) - inst.features(labels_idx(k), :)).^2)) < inst.constraints_max_distance
% MUST-LINK constraint - lower index followed by higher one
constr = [constr; min(j, k), max(j, k)];
end
elseif sqrt(sum((inst.features(labels_idx(j), :) - inst.features(labels_idx(k), :)).^2)) < inst.constraints_max_distance
% CANNOT-LINK constraint - other way around
constr = [constr; max(j, k), min(j, k)];
end
end
end
if inst.pconstraints < 1
constr = constr(randsample(size(constr, 1), floor(size(constr, 1)*inst.pconstraints)), :);
end
nconstr = length(constr);
%% 1st (main stage): cluster the data
fprintf('\nClustering... (target clusters: %d, total number of constraints: %d)', nclusters, size(constr, 1));
[cluster_idx, centroids] = inst.clustering_algo(reordered_feat, constr, nclusters);
% reorder indexes agaion to match the input data
cluster_idx = cluster_idx(results_map);
% map clusters to classes
cluster_map = cluster_to_class( ...
arrayfun( @(ci) sum(cluster_idx == ci) ...
, 1:nclusters) ...
, lbls ...
, cluster_idx(labels_idx) ...
, 'MinSamplesPercentage', inst.config.property('CLUSTER_CLASS_MINIMUM_SAMPLES_P', 0.01) ...
, 'MinSamplesExponent', inst.config.property('CLUSTER_CLASS_MINIMUM_SAMPLES_EXP', 0.75) );
%% 2nd stage: subcluster ambigouous clusters (containing elements of more than one
% this is the new cluster index for sub-clusters: start at the previous
% cluster count
icluster = 0;
new_cluster_idx = zeros(1, length(cluster_idx));
new_cluster_map = zeros(1, length(cluster_map));
new_centroids = zeros(inst.nfeatures, length(cluster_map));
nconstr2 = 0;
for i = 1:nclusters
% elements of this cluster
sub_idx = find(cluster_idx == i);
% check if cluster is empty
if ~isempty(sub_idx)
% non-empty cluster -> rebase elements
icluster = icluster + 1;
new_cluster_idx(sub_idx) = icluster;
new_cluster_map(icluster) = cluster_map(i);
new_centroids(:, icluster) = centroids(:, i);
if inst.two_stage && cluster_map(i) == 0
% take labels in this cluster which are also not
% masked out
sub_labels = inst.labels(sub_idx);
sub_labels_idx = [];
sub_bin_labels = [];
for j = 1:numel(sub_labels)
tmp = sub_labels{j};
if tmp ~= -1
sub_labels_idx = [sub_labels_idx, j];
sub_bin_labels = [sub_bin_labels, int32(0)];
for k = 1:length(tmp)
sub_bin_labels(end) = bitset(sub_bin_labels(end), tmp(k) + 1);
end
end
end
sub_labels = sub_labels(sub_labels_idx);
unique_labels = [];
constr = [];
for j = 1:(length(sub_labels_idx) - 1)
for k = (j + 1):length(sub_labels_idx)
if (numel(sub_labels{j}) > 1 || sub_labels{j} ~= -1) && (numel(sub_labels{k}) > 1 || sub_labels{k} ~= -1)
if bitand(sub_bin_labels(j), sub_bin_labels(k))
constr = [constr; min(j, k), max(j, k)];
unique_labels = unique([unique_labels, intersect(sub_labels{j}, sub_labels{k})]);
else
constr = [constr; max(j, k), min(j, k)];
unique_labels = unique([unique_labels, sub_labels{j}, sub_labels{k}]);
end
end
end
end
if ~isempty(unique_labels)
unlabelled = setdiff(1:length(sub_idx), sub_labels_idx);
% mapping of the reordered features back to the original sequence
results_map = zeros(1, length(sub_idx));
results_map(sub_labels_idx) = 1:length(sub_labels_idx);
results_map(unlabelled) = (length(sub_labels_idx) + 1):length(sub_idx);
% reorded features -> first labelled then unlabelled items
reordered_feat = [inst.features(sub_idx(sub_labels_idx), :); inst.features(sub_idx(unlabelled), :)];
feat_std = reordered_feat ./ repmat( max(reordered_feat) - min(reordered_feat), size(reordered_feat, 1), 1);
reordered_feat = feat_std;
% sub-clustering
nsub = max(2, length(unique_labels));
for s = nsub:2*nsub
fprintf('**** Partitioning cluster %d into %d sub-clusters... ****\n', i, s);
[sub_cluster_idx, sub_centroids] = inst.clustering_algo(reordered_feat, constr, s);
% reorder indexes agaion to match the input data
sub_cluster_idx = sub_cluster_idx(results_map);
sub_cluster_map = cluster_to_class(arrayfun( @(ci) sum(sub_cluster_idx == ci), 1:s), sub_labels, sub_cluster_idx(sub_labels_idx));
if sum(sub_cluster_map) > 0
nconstr2 = nconstr2 + length(constr);
first = 1;
for j = 1:s
if sum(sub_cluster_idx == j) > 0
% for the first sub-cluster reuse the previous cluster number
if ~first
icluster = icluster + 1;
else
first = 0;
end
new_cluster_idx(sub_idx(sub_cluster_idx == j)) = icluster;
new_cluster_map(icluster) = sub_cluster_map(j);
new_centroids(:, icluster) = sub_centroids(:, j);
end
end
break;
end
end
end
end
end
end
% see if we want the intermediate 1st stage results
if nargout > 1
class_idx = zeros(1, length(cluster_idx));
for i = 1:icluster
sel = find(cluster_idx == i);
if ~isempty(sel)
class_idx(sel) = cluster_map(i);
end
end
res1st = clustering_results( inst.segments ...
, length(inst.classes) ...
, inst.labels(inst.nexternal_labels + 1:end) ...
, training_set, test_set ...
, inst.nexternal_labels ...
, nconstr, class_idx ...
, cluster_idx, cluster_map, centroids, inst.classes);
end
cluster_idx = new_cluster_idx;
cluster_map = new_cluster_map;
centroids = new_centroids;
class_idx = zeros(1, length(cluster_idx));
for i = 1:icluster
sel = find(cluster_idx == i);
if ~isempty(sel)
class_idx(sel) = cluster_map(i);
end
end
% create return object
res = clustering_results(inst.segments, length(inst.classes), inst.labels, training_set, test_set, inst.nexternal_labels, [nconstr, nconstr2], class_idx, cluster_idx, cluster_map, centroids, inst.classes);
end
function [cluster_idx, centroids] = clustering_algo(inst, feat, constr, nclusters)
% use MPCK-means algorithm by default
algo = inst.config.property('CLUSTERING_ALGORITHM', 'MPCKm');
% constraints supported only by MPCkmeans
if ~strcmp(algo, 'MPCKm') && ~isempty(constr)
error('Constraints not supported with the standard clustering algorithm');
end
if strcmp(algo, 'MPCKm')
[cluster_idx, centroids] = mpckmeans(feat, constr, nclusters);
cluster_idx = cluster_idx + 1; % mpck-means uses zero based indexes (it's Java after all) -> we want them to start at 1
elseif strcmp(algo, 'kmeans')
[cluster_idx, centroids] = kmeans(feat, nclusters, 'MaxIter', 10000);
centroids = centroids';
else
error('Unknown clustering algorithm');
end
end
end
end