#540: 'predict' method for X-Means algorithm.

U-SIOUX\Novikova · U-SIOUX\Novikova · commit 3791e5c5250f · 2019-09-03T15:43:57.000+02:00
diff --git a/pyclustering/cluster/kmeans.py b/pyclustering/cluster/kmeans.py
@@ -441,30 +441,6 @@ def predict(self, points):
         @return (list) List of closest clusters for each point. Each cluster is denoted by index. Return empty
                  collection if 'process()' method was not called.
 
-        An example how to calculate (or predict) the closest cluster to specified points.
-        @code
-            from pyclustering.cluster.kmeans import kmeans
-            from pyclustering.samples.definitions import SIMPLE_SAMPLES
-            from pyclustering.utils import read_sample
-
-            # Load list of points for cluster analysis.
-            sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
-
-            # Initial centers for sample 'Simple3'.
-            initial_centers = [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]]
-
-            # Create instance of K-Means algorithm with prepared centers.
-            kmeans_instance = kmeans(sample, initial_centers)
-
-            # Run cluster analysis.
-            kmeans_instance.process()
-
-            # Calculate the closest cluster to following two points.
-            points = [[0.25, 0.2], [2.5, 4.0]]
-            closest_clusters = kmeans_instance.predict(points)
-            print(closest_clusters)
-        @endcode
-
         """
 
         nppoints = numpy.array(points)
diff --git a/pyclustering/cluster/tests/integration/it_xmeans.py b/pyclustering/cluster/tests/integration/it_xmeans.py
@@ -24,18 +24,18 @@
 """
 
 
-import unittest;
+import unittest
 
 # Generate images without having a window appear.
-import matplotlib;
-matplotlib.use('Agg');
+import matplotlib
+matplotlib.use('Agg')
 
-from pyclustering.cluster.tests.xmeans_templates import XmeansTestTemplates;
-from pyclustering.cluster.xmeans import xmeans, splitting_type;
+from pyclustering.cluster.tests.xmeans_templates import XmeansTestTemplates
+from pyclustering.cluster.xmeans import xmeans, splitting_type
 
-from pyclustering.samples.definitions import SIMPLE_SAMPLES, FCPS_SAMPLES;
+from pyclustering.samples.definitions import SIMPLE_SAMPLES, FCPS_SAMPLES
 
-from pyclustering.core.tests import remove_library;
+from pyclustering.core.tests import remove_library
 
 
 class XmeansIntegrationTest(unittest.TestCase):
@@ -184,11 +184,22 @@ def testKmax05Amount20Offset02Initial05(self):
     def testKmax05Amount01Offset01Initial04(self):
         XmeansTestTemplates.templateMaxAllocatedClusters(True, 1, 1000, 1, 4, 5);
 
+    def testPredictOnePoint(self):
+        centers = [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]]
+        # Probe i is placed close to initial center i, so cluster index i is the expected prediction.
+        probes = [[0.3, 0.2], [4.1, 1.1], [2.1, 1.9], [2.1, 4.1]]
+        for expected_index, probe in enumerate(probes):
+            XmeansTestTemplates.templatePredict(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, centers, [probe], 4, [expected_index], True)
+
+    def testPredictTwoPoints(self):
+        centers = [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]]
+        for points, expected in (([[0.3, 0.2], [2.1, 1.9]], [0, 2]), ([[2.1, 4.1], [2.1, 1.9]], [3, 2])):
+            XmeansTestTemplates.templatePredict(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, centers, points, 4, expected, True)
 
     @remove_library
     def testProcessingWhenLibraryCoreCorrupted(self):
         XmeansTestTemplates.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], splitting_type.BAYESIAN_INFORMATION_CRITERION, 20, True);
 
 
 if __name__ == "__main__":
-    unittest.main();
+    unittest.main()
diff --git a/pyclustering/cluster/tests/unit/ut_xmeans.py b/pyclustering/cluster/tests/unit/ut_xmeans.py
@@ -184,6 +184,18 @@ def testKmax05Amount20Offset02Initial05(self):
     def testKmax05Amount01Offset01Initial04(self):
         XmeansTestTemplates.templateMaxAllocatedClusters(False, 1, 1000, 1, 4, 5)
 
+    def testPredictOnePoint(self):
+        centers = [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]]
+        # Probe i is placed close to initial center i, so cluster index i is the expected prediction.
+        probes = [[0.3, 0.2], [4.1, 1.1], [2.1, 1.9], [2.1, 4.1]]
+        for expected_index, probe in enumerate(probes):
+            XmeansTestTemplates.templatePredict(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, centers, [probe], 4, [expected_index], False)
+
+    def testPredictTwoPoints(self):
+        centers = [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]]
+        for points, expected in (([[0.3, 0.2], [2.1, 1.9]], [0, 2]), ([[2.1, 4.1], [2.1, 1.9]], [3, 2])):
+            XmeansTestTemplates.templatePredict(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, centers, points, 4, expected, False)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/pyclustering/cluster/tests/xmeans_templates.py b/pyclustering/cluster/tests/xmeans_templates.py
@@ -23,12 +23,13 @@
 
 """
 
+import numpy
 import random
 
 from pyclustering.cluster.xmeans import xmeans, splitting_type
 from pyclustering.cluster.center_initializer import random_center_initializer
 
-from pyclustering.utils import read_sample
+from pyclustering.utils import read_sample, distance_metric, type_metric
 
 from pyclustering.tests.assertion import assertion
 
@@ -63,6 +64,21 @@ def templateLengthProcessData(input_sample, start_centers, expected_cluster_leng
             assert obtained_cluster_sizes == expected_cluster_length;
 
 
+    @staticmethod
+    def templatePredict(path_to_file, initial_centers, points, expected_amount, expected_closest_clusters, ccore, **kwargs):
+        # Cluster the sample with X-Means, then verify the cluster count and the predicted cluster of each point.
+        data = read_sample(path_to_file)
+        max_clusters = kwargs.get('kmax', 20)
+
+        algorithm = xmeans(data, initial_centers, max_clusters, 0.025, splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore)
+        algorithm.process()
+        predicted = algorithm.predict(points)
+
+        assertion.eq(expected_amount, len(algorithm.get_clusters()))
+        assertion.eq(len(expected_closest_clusters), len(predicted))
+        assertion.true(numpy.array_equal(numpy.array(expected_closest_clusters), predicted))
+
+
     @staticmethod
     def templateClusterAllocationOneDimensionData(ccore_flag):
         input_data = [ [0.0] for _ in range(10) ] + [ [5.0] for _ in range(10) ] + [ [10.0] for _ in range(10) ] + [ [15.0] for _ in range(10) ]
diff --git a/pyclustering/cluster/xmeans.py b/pyclustering/cluster/xmeans.py
@@ -40,7 +40,7 @@
 
 import pyclustering.core.xmeans_wrapper as wrapper
 
-from pyclustering.utils import euclidean_distance_square, euclidean_distance
+from pyclustering.utils import euclidean_distance_square, euclidean_distance, distance_metric, type_metric
 
 
 class splitting_type(IntEnum):
@@ -123,7 +123,7 @@ class xmeans:
     
     """
     
-    def __init__(self, data, initial_centers = None, kmax = 20, tolerance = 0.025, criterion = splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore = True):
+    def __init__(self, data, initial_centers=None, kmax=20, tolerance=0.025, criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore=True):
         """!
         @brief Constructor of clustering algorithm X-Means.
         
@@ -143,7 +143,7 @@ def __init__(self, data, initial_centers = None, kmax = 20, tolerance = 0.025, c
         if initial_centers is not None:
             self.__centers = initial_centers[:]
         else:
-            self.__centers = [ [random.random() for _ in range(len(data[0])) ] ]
+            self.__centers = [[random.random() for _ in range(len(data[0]))]]
         
         self.__kmax = kmax
         self.__tolerance = tolerance
@@ -165,7 +165,7 @@ def process(self):
         
         """
         
-        if (self.__ccore is True):
+        if self.__ccore is True:
             self.__clusters, self.__centers = wrapper.xmeans(self.__pointer_data, self.__centers, self.__kmax, self.__tolerance, self.__criterion)
 
         else:
@@ -185,6 +185,53 @@ def process(self):
             self.__clusters, self.__centers = self.__improve_parameters(self.__centers)
 
 
+    def predict(self, points):
+        """!
+        @brief Calculates the closest cluster to each point.
+
+        @param[in] points (array_like): Points for which closest clusters are calculated.
+
+        @return (array_like) Index of the closest cluster for each point, as a numpy array. An empty list is
+                 returned if no clusters are allocated (e.g. 'process()' method was not called).
+
+        An example how to calculate (or predict) the closest cluster to specified points.
+        @code
+            from pyclustering.cluster.xmeans import xmeans
+            from pyclustering.samples.definitions import SIMPLE_SAMPLES
+            from pyclustering.utils import read_sample
+
+            # Load list of points for cluster analysis.
+            sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)
+
+            # Initial centers for sample 'Simple3'.
+            initial_centers = [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]]
+
+            # Create instance of X-Means algorithm with prepared centers.
+            xmeans_instance = xmeans(sample, initial_centers)
+
+            # Run cluster analysis.
+            xmeans_instance.process()
+
+            # Calculate the closest cluster to following two points.
+            points = [[0.25, 0.2], [2.5, 4.0]]
+            closest_clusters = xmeans_instance.predict(points)
+            print(closest_clusters)
+        @endcode
+
+        """
+        if len(self.__clusters) == 0:
+            return []
+
+        # Convert points and centers once up front instead of relying on implicit conversion in every metric call.
+        nppoints, npcenters = numpy.array(points), numpy.array(self.__centers)
+        metric = distance_metric(type_metric.EUCLIDEAN_SQUARE, numpy_usage=True)
+        differences = numpy.zeros((len(nppoints), len(npcenters)))
+        for index_point, point in enumerate(nppoints):
+            differences[index_point] = metric(point, npcenters)
+
+        return numpy.argmin(differences, axis=1)
+
+
     def get_clusters(self):
         """!
         @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.