semisupervised_clustering.m

classdef semisupervised_clustering < handle
    %CLUSTERING_PARAMETERS Input for the semisupervised_clustering
    %functions
    
    properties(GetAccess = 'public', SetAccess = 'protected')
        % number of features
        nfeatures = 0;
        % feature values
        features = [];
        % labels - has to be a vector of integers of same length as the
        % features
        labels = []; 
        % non-empty labels only
        nlabels = [];
        % number of external labels (these are used only to guide the
        % clustering)
        nexternal_labels = 0;
        % non-empty labels index 
        non_empty_labels_idx = [];
        % optional -> just to map the labels to classes
        classes = [];
        nclasses = 0;
        segments = [];
        config = [];
    end
        
    properties(GetAccess = 'public', SetAccess = 'public')
        % use two-stage clustering
        two_stage = 1;
        % force must-link constraints (not recommended)
        must_link = 0;
        % maximum distance between two elements to define constraints
        constraints_max_distance = 0.25;
        % percentage of constraints to use for the clustering (for testing
        % purposes)
        pconstraints = 1.;
    end
    
    properties(GetAccess = 'protected', SetAccess = 'protected')
        % create an array of integers in which each bit represents one
        % label; this will be needed bellow to test if any two sets of labels
        % have a common member (using 'intersect' is just absurdely slow)    
        labels_binary = [];        
    end
    
    methods        
        function inst = semisupervised_clustering(conf, seg, feat, lbls, classes_in, next)            
            % Constructor         
            inst.config = conf;
            inst.segments = seg;
            if nargin > 2
                inst.classes = classes_in;
                inst.nclasses = length(classes_in);
            end               
            if nargin > 3
                inst.nexternal_labels = next;
            end
            inst.nfeatures = size(feat, 2);
            % normalize features            
            inst.features = feat ./ repmat( max(feat) - min(feat), size(feat, 1), 1);            
            inst.labels = lbls;
            
            % look for non-empty labels
            for i = 1:length(lbls)
                tmp = lbls{i};
                if tmp ~= -1
                    inst.non_empty_labels_idx = [inst.non_empty_labels_idx, i];
                    inst.labels_binary = [inst.labels_binary, int32(0)];
                    for j = 1:length(tmp)
                        inst.labels_binary(end) = bitset(inst.labels_binary(end), tmp(j) + 1);
                    end
                end
            end
            inst.nlabels = length(inst.non_empty_labels_idx); 
        end
        
        function [res, res1st] = cluster(inst, nclusters, test_p)
            % run the clustering
            mask = ones(1, length(inst.non_empty_labels_idx));
            
            if nargin > 2 && test_p > 0.
                n = length(inst.non_empty_labels_idx);
                mask(randsample(1:n, floor(n*test_p))) = 0;
            end 
            
            % run clusterer using a subset of the labels for training and
            % all of them for validation
            if nargout > 1
                % get also the results for the 1st stage
                % clustering
                [res, res1st] = inst.internal_cluster(nclusters, mask, ones(1, inst.nlabels));                
            else
                res = inst.internal_cluster(nclusters, mask, ones(1, inst.nlabels));
            end
        end
        
        function [res, res1st] = cluster_cross_validation(inst, nclusters, varargin)
            res = [];
            if nargout > 1
                res1st = [];
            end
            
            [folds, ptraining, nruns, test_set] = process_options(varargin, ...
                'Folds', 10, 'TrainingPercentage', 0, 'Runs', 1, 'TestSet', [] ...
            ); 
                        
            idx = 1:inst.nlabels;
            if ~isempty(test_set)
                % remove test set from available labels for training
                idx = idx(test_set == 0);
            else
                test_set = ones(1, inst.nlabels);
            end
                       
            for t = 1:nruns
                if ptraining == 0
                    % use 
                    cv = cvpartition(idx, 'k', folds);

                    for j = 1:cv.NumTestSets % perforn a N-fold stratified cross-validation                                        
                        % perform classifcation using only a subset of the labels
                        training = zeros(1, inst.nlabels);
                        training(idx(cv.training(j))) = 1;
                        test = zeros(1, inst.nlabels);
                        test(idx(cv.test(j))) = 1;

                        if nargout > 1
                            % get also the results for the 1st stage
                            % clustering
                            [tmp, tmp2] = inst.internal_cluster(nclusters, training, test);
                            res = [res, tmp];
                            res1st = [res1st, tmp2];
                        else
                            res = [res, inst.internal_cluster(nclusters, training, test)];
                        end                        
                    end
                else
                    % take ptraining 
                    training_set = zeros(1, inst.nlabels);
                    training_set(idx(randsample(length(idx), floor(length(idx)*ptraining)))) = 1;
                    
                    if nargout > 1
                        % get also the results for the 1st stage
                        % clustering
                        [tmp, tmp2] = inst.internal_cluster(nclusters, training_set, test_set);
                        res = [res, tmp];
                        res1st = [res1st, tmp2];
                    else
                        res = [res, inst.internal_cluster(nclusters, training_set, test_set)];
                    end                                        
                end
            end
            res = clustering_cv_results(res);
            if nargout > 1
                res1st = clustering_cv_results(res1st);
            end
        end
    end            
        
    methods(Access = private)                                        
        function [res, res1st] = internal_cluster(inst, nclusters, training_set, test_set)            
            % divide the data into labelled/unlabelled elements
            % move labelled items to the front   
            labels_idx = inst.non_empty_labels_idx(training_set == 1);       
            bin_labels = inst.labels_binary(training_set == 1);
            lbls = inst.labels(labels_idx);            
            unlabelled = setdiff(1:length(inst.labels), labels_idx);    
            % mapping of the reordered features back to the original sequence
            results_map = zeros(1, length(inst.labels));    
            results_map(labels_idx) = 1:length(labels_idx);
            results_map(unlabelled) = (length(labels_idx) + 1):length(inst.labels);
            % reorded features -> first labelled then unlabelled items
            reordered_feat = [inst.features(labels_idx, :); inst.features(unlabelled, :)];         
                                    
            %% build list of constraints    
            constr = [];
            for j = 1:length(labels_idx)
                for k = (j + 1):length(labels_idx)          
                    % determine if both sets of labels match (if at least one element
                    % is present in both sets)
                    % don't use 'intersect' for this -> its implementation is
                    % really really slow
                    if bitand(bin_labels(j), bin_labels(k)) 
                        if ~inst.two_stage && inst.must_link && sqrt(sum((inst.features(labels_idx(j), :) - inst.features(labels_idx(k), :)).^2)) < inst.constraints_max_distance                       
                           % MUST-LINK constraint - lower index followed by higher one
                           constr = [constr; min(j, k), max(j, k)];                                      
                        end              
                    elseif sqrt(sum((inst.features(labels_idx(j), :) - inst.features(labels_idx(k), :)).^2)) < inst.constraints_max_distance
                        % CANNOT-LINK constraint - other way around
                        constr = [constr; max(j, k), min(j, k)];
                    end
                end
            end
            
         
            if inst.pconstraints < 1 
                constr = constr(randsample(size(constr, 1), floor(size(constr, 1)*inst.pconstraints)), :);
            end
            
            nconstr = length(constr);
                                   
            %% 1st (main stage): cluster the data            
            fprintf('\nClustering... (target clusters: %d, total number of constraints: %d)', nclusters, size(constr, 1));
            [cluster_idx, centroids] = inst.clustering_algo(reordered_feat, constr, nclusters);
            
            % reorder indexes agaion to match the input data
            cluster_idx = cluster_idx(results_map);   

            % map clusters to classes
            cluster_map = cluster_to_class( ...
                arrayfun( @(ci) sum(cluster_idx == ci) ...
              , 1:nclusters) ...
              , lbls ...
              , cluster_idx(labels_idx) ...
              , 'MinSamplesPercentage', inst.config.property('CLUSTER_CLASS_MINIMUM_SAMPLES_P', 0.01) ...
              , 'MinSamplesExponent', inst.config.property('CLUSTER_CLASS_MINIMUM_SAMPLES_EXP', 0.75) ); 
                        
            %% 2nd stage: subcluster ambigouous clusters (containing elements of more than one 
            % this is the new cluster index for sub-clusters: start at the previous
            % cluster count
            icluster = 0;    
            new_cluster_idx = zeros(1, length(cluster_idx));
            new_cluster_map = zeros(1, length(cluster_map));
            new_centroids = zeros(inst.nfeatures, length(cluster_map));
            nconstr2 = 0;
            for i = 1:nclusters
                % elements of this cluster
                sub_idx = find(cluster_idx == i);            
                % check if cluster is empty
                if ~isempty(sub_idx)
                    % non-empty cluster -> rebase elements
                    icluster = icluster + 1;
                    new_cluster_idx(sub_idx) = icluster;
                    new_cluster_map(icluster) = cluster_map(i); 
                    new_centroids(:, icluster) = centroids(:, i);
                    if inst.two_stage && cluster_map(i) == 0
                        % take labels in this cluster which are also not
                        % masked out                        
                        sub_labels = inst.labels(sub_idx);
                        sub_labels_idx = [];                        
                        sub_bin_labels = [];
                        for j = 1:numel(sub_labels)
                            tmp = sub_labels{j};
                            if tmp ~= -1
                                sub_labels_idx = [sub_labels_idx, j];
                                sub_bin_labels = [sub_bin_labels, int32(0)];
                                for k = 1:length(tmp)
                                    sub_bin_labels(end) = bitset(sub_bin_labels(end), tmp(k) + 1);
                                end                             
                            end
                        end

                        sub_labels = sub_labels(sub_labels_idx);
                           
                        unique_labels = [];
                        constr = [];
                        for j = 1:(length(sub_labels_idx) - 1)
                            for k = (j + 1):length(sub_labels_idx)            
                                if (numel(sub_labels{j}) > 1 || sub_labels{j} ~= -1) && (numel(sub_labels{k}) > 1 || sub_labels{k} ~= -1)
                                    if bitand(sub_bin_labels(j), sub_bin_labels(k))
                                        constr = [constr; min(j, k), max(j, k)];
                                        unique_labels = unique([unique_labels, intersect(sub_labels{j}, sub_labels{k})]); 
                                    else                            
                                        constr = [constr; max(j, k), min(j, k)];
                                        unique_labels = unique([unique_labels, sub_labels{j}, sub_labels{k}]);
                                    end
                                end
                            end
                        end

                        if ~isempty(unique_labels)                                                        
                            unlabelled = setdiff(1:length(sub_idx), sub_labels_idx);    
                            % mapping of the reordered features back to the original sequence
                            results_map = zeros(1, length(sub_idx));    
                            results_map(sub_labels_idx) = 1:length(sub_labels_idx);
                            results_map(unlabelled) = (length(sub_labels_idx) + 1):length(sub_idx);
                            % reorded features -> first labelled then unlabelled items
                            reordered_feat = [inst.features(sub_idx(sub_labels_idx), :); inst.features(sub_idx(unlabelled), :)];         
                            feat_std = reordered_feat ./ repmat( max(reordered_feat) - min(reordered_feat), size(reordered_feat, 1), 1);
                            reordered_feat = feat_std;        

                            % sub-clustering      
                            nsub = max(2, length(unique_labels));

                            for s = nsub:2*nsub
                                fprintf('**** Partitioning cluster %d into %d sub-clusters... ****\n', i, s);

                                [sub_cluster_idx, sub_centroids] = inst.clustering_algo(reordered_feat, constr, s);                                
                                % reorder indexes agaion to match the input data
                                sub_cluster_idx = sub_cluster_idx(results_map);   

                                sub_cluster_map = cluster_to_class(arrayfun( @(ci) sum(sub_cluster_idx == ci), 1:s), sub_labels, sub_cluster_idx(sub_labels_idx));

                                if sum(sub_cluster_map) > 0
                                    nconstr2 = nconstr2 + length(constr);
                                    first = 1;
                                    for j = 1:s
                                        if sum(sub_cluster_idx == j) > 0
                                            % for the first sub-cluster reuse the previous cluster number
                                            if ~first
                                                icluster = icluster + 1;
                                            else
                                                first = 0;                                        
                                            end
                                            new_cluster_idx(sub_idx(sub_cluster_idx == j)) = icluster;                                    
                                            new_cluster_map(icluster) = sub_cluster_map(j);           
                                            new_centroids(:, icluster) = sub_centroids(:, j);
                                        end
                                    end
                                    break;
                                end
                            end
                        end                       
                    end
                end
            end

            % see if we want the intermediate 1st stage results            
            if nargout > 1
                class_idx = zeros(1, length(cluster_idx));
                for i = 1:icluster
                    sel = find(cluster_idx == i);        
                    if ~isempty(sel)
                        class_idx(sel) = cluster_map(i);
                    end
                end

                res1st = clustering_results( inst.segments ...
                                           , length(inst.classes) ...
                                           , inst.labels(inst.nexternal_labels + 1:end) ...
                                           , training_set, test_set ...
                                           , inst.nexternal_labels ...
                                           , nconstr, class_idx ...
                                           , cluster_idx, cluster_map, centroids, inst.classes);             
            end
                        
            cluster_idx = new_cluster_idx;
            cluster_map = new_cluster_map;
            centroids = new_centroids;

            class_idx = zeros(1, length(cluster_idx));
            for i = 1:icluster
                sel = find(cluster_idx == i);        
                if ~isempty(sel)
                    class_idx(sel) = cluster_map(i);
                end
            end
                                    
            % create return object
            res = clustering_results(inst.segments, length(inst.classes), inst.labels, training_set, test_set, inst.nexternal_labels, [nconstr, nconstr2], class_idx, cluster_idx, cluster_map, centroids, inst.classes);             
        end    
        
        function [cluster_idx, centroids] = clustering_algo(inst, feat, constr, nclusters)
            % use MPCK-means algorithm by default
            algo = inst.config.property('CLUSTERING_ALGORITHM', 'MPCKm');
       
            % constraints supported only by MPCkmeans
            if ~strcmp(algo, 'MPCKm') && ~isempty(constr)
                error('Constraints not supported with the standard clustering algorithm');
            end
            
            if strcmp(algo, 'MPCKm')                
                [cluster_idx, centroids] = mpckmeans(feat, constr, nclusters);
                cluster_idx = cluster_idx + 1; % mpck-means uses zero based indexes (it's Java after all) -> we want them to start at 1            
            elseif strcmp(algo, 'kmeans')
                [cluster_idx, centroids] = kmeans(feat, nclusters, 'MaxIter', 10000);
                centroids = centroids';
            else
                error('Unknown clustering algorithm');
            end
        end
    end
end