Skip to content

Commit

Permalink
#563: Unit-tests are added for BIRCH algorithm.
Browse files Browse the repository at this point in the history
  • Loading branch information
annoviko committed Nov 27, 2019
1 parent c7ba5e5 commit 2856d37
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 43 deletions.
8 changes: 4 additions & 4 deletions pyclustering/cluster/birch.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ class birch:
"""

def __init__(self, data, number_clusters, branching_factor=50, max_node_entries=200, initial_diameter=0.5,
def __init__(self, data, number_clusters, branching_factor=50, max_node_entries=200, diameter=0.5,
type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE,
entry_size_limit=500,
diameter_multiplier=1.5,
Expand All @@ -75,7 +75,7 @@ def __init__(self, data, number_clusters, branching_factor=50, max_node_entries=
@param[in] number_clusters (uint): Number of clusters that should be allocated.
@param[in] branching_factor (uint): Maximum number of successor that might be contained by each non-leaf node in CF-Tree.
@param[in] max_node_entries (uint): Maximum number of entries that might be contained by each leaf node in CF-Tree.
@param[in] initial_diameter (double): Initial diameter that used for CF-Tree construction, it can be increase if entry_size_limit is exceeded.
@param[in] diameter (double): Initial diameter that is used for CF-Tree construction; it can be increased if entry_size_limit is exceeded.
@param[in] type_measurement (measurement_type): Type measurement used for calculation distance metrics.
@param[in] entry_size_limit (uint): Maximum number of entries that can be stored in CF-Tree, if it is exceeded during creation then diameter is increased and CF-Tree is rebuilt.
@param[in] diameter_multiplier (double): Multiplier that is used for increasing diameter when entry_size_limit is exceeded.
Expand All @@ -96,7 +96,7 @@ def __init__(self, data, number_clusters, branching_factor=50, max_node_entries=
self.__verify_arguments()

self.__features = None
self.__tree = cftree(branching_factor, max_node_entries, initial_diameter, type_measurement)
self.__tree = cftree(branching_factor, max_node_entries, diameter, type_measurement)

self.__clusters = []
self.__noise = []
Expand Down Expand Up @@ -223,7 +223,7 @@ def __rebuild_tree(self, index_point):
@return (cftree) Rebuilt tree with encoded points up to the specified point from the input data space.
"""

rebuild_result = False
increased_diameter = self.__tree.threshold * self.__diameter_multiplier

Expand Down
30 changes: 14 additions & 16 deletions pyclustering/cluster/examples/birch_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,19 +78,18 @@ def cluster_elongate():

def cluster_lsun():
template_clustering(3, FCPS_SAMPLES.SAMPLE_LSUN)
template_clustering(3, FCPS_SAMPLES.SAMPLE_LSUN, 5, 5, 0.2, measurement_type.CENTROID_MANHATTAN_DISTANCE, 200) # not correct, but almost good result

def cluster_lsun_rebuilt():
template_clustering(3, FCPS_SAMPLES.SAMPLE_LSUN, entry_size_limit=20, diameter_multiplier=1.5)

def cluster_target():
template_clustering(6, FCPS_SAMPLES.SAMPLE_TARGET)
template_clustering(6, FCPS_SAMPLES.SAMPLE_TARGET, 5, 10, 0.5, measurement_type.CENTROID_MANHATTAN_DISTANCE, 200) # interesting - sliced cake.
template_clustering(6, FCPS_SAMPLES.SAMPLE_TARGET, 50, 100, 0.5, measurement_type.VARIANCE_INCREASE_DISTANCE, 200) # interesting - sliced cake.

def cluster_two_diamonds():
template_clustering(2, FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS)

def cluster_wing_nut():
template_clustering(2, FCPS_SAMPLES.SAMPLE_WING_NUT)
template_clustering(2, FCPS_SAMPLES.SAMPLE_WING_NUT, 5, 5, 0.1, measurement_type.CENTROID_EUCLIDEAN_DISTANCE, 800) # not correct, but almost good result

def cluster_chainlink():
template_clustering(2, FCPS_SAMPLES.SAMPLE_CHAINLINK)
Expand All @@ -102,8 +101,7 @@ def cluster_tetra():
template_clustering(4, FCPS_SAMPLES.SAMPLE_TETRA)

def cluster_engy_time():
template_clustering(2, FCPS_SAMPLES.SAMPLE_ENGY_TIME) # one cluster is allocated
template_clustering(2, FCPS_SAMPLES.SAMPLE_ENGY_TIME, 10, 10, 0.05, measurement_type.VARIANCE_INCREASE_DISTANCE, 500) # good result
template_clustering(2, FCPS_SAMPLES.SAMPLE_ENGY_TIME)


def experiment_execution_time(ccore=False):
Expand All @@ -124,14 +122,14 @@ def experiment_execution_time(ccore=False):


def display_fcps_clustering_results():
(lsun, lsun_clusters, _) = template_clustering(3, FCPS_SAMPLES.SAMPLE_LSUN, show_result=False)
(target, target_clusters, _) = template_clustering(6, FCPS_SAMPLES.SAMPLE_TARGET, show_result=False)
(two_diamonds, two_diamonds_clusters, _) = template_clustering(2, FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS, show_result=False)
(wing_nut, wing_nut_clusters, _) = template_clustering(2, FCPS_SAMPLES.SAMPLE_WING_NUT, show_result=False)
(chainlink, chainlink_clusters, _) = template_clustering(2, FCPS_SAMPLES.SAMPLE_CHAINLINK, show_result=False)
(hepta, hepta_clusters, _) = template_clustering(7, FCPS_SAMPLES.SAMPLE_HEPTA, show_result=False)
(tetra, tetra_clusters, _) = template_clustering(4, FCPS_SAMPLES.SAMPLE_TETRA, show_result=False)
(atom, atom_clusters, _) = template_clustering(2, FCPS_SAMPLES.SAMPLE_ATOM, show_result=False)
(lsun, lsun_clusters) = template_clustering(3, FCPS_SAMPLES.SAMPLE_LSUN, show_result=False)
(target, target_clusters) = template_clustering(6, FCPS_SAMPLES.SAMPLE_TARGET, show_result=False)
(two_diamonds, two_diamonds_clusters) = template_clustering(2, FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS, show_result=False)
(wing_nut, wing_nut_clusters) = template_clustering(2, FCPS_SAMPLES.SAMPLE_WING_NUT, show_result=False)
(chainlink, chainlink_clusters) = template_clustering(2, FCPS_SAMPLES.SAMPLE_CHAINLINK, show_result=False)
(hepta, hepta_clusters) = template_clustering(7, FCPS_SAMPLES.SAMPLE_HEPTA, show_result=False)
(tetra, tetra_clusters) = template_clustering(4, FCPS_SAMPLES.SAMPLE_TETRA, show_result=False)
(atom, atom_clusters) = template_clustering(2, FCPS_SAMPLES.SAMPLE_ATOM, show_result=False)

visualizer = cluster_visualizer(8, 4)
visualizer.append_clusters(lsun_clusters, lsun, 0)
Expand All @@ -154,15 +152,15 @@ def display_fcps_clustering_results():
cluster_sample8()
cluster_elongate()
cluster_lsun()
cluster_lsun_rebuilt()
cluster_target()
cluster_two_diamonds()
cluster_wing_nut()
cluster_chainlink()
cluster_hepta()
cluster_tetra()
cluster_engy_time()



experiment_execution_time(True) # C++ code + Python env.

display_fcps_clustering_results()
34 changes: 17 additions & 17 deletions pyclustering/cluster/tests/agglomerative_templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,47 +33,47 @@
class AgglomerativeTestTemplates:
    """Reusable assertion templates for the agglomerative clustering unit-tests."""

    @staticmethod
    def templateClusteringResults(path, number_clusters, link, expected_length_clusters, ccore_flag):
        """Cluster the sample stored at `path` and verify the allocated cluster sizes.

        @param[in] path (string): Path to the sample file that is read via `read_sample`.
        @param[in] number_clusters (uint): Amount of clusters that should be allocated.
        @param[in] link: Link type that is used by the agglomerative algorithm.
        @param[in] expected_length_clusters (list): Expected cluster sizes in ascending order.
        @param[in] ccore_flag (bool): True when the C++ implementation should be used.
        """
        sample = read_sample(path)

        agglomerative_instance = agglomerative(sample, number_clusters, link, ccore_flag)
        agglomerative_instance.process()

        clusters = agglomerative_instance.get_clusters()

        # Every point must be allocated exactly once and sizes must match expectation.
        assert sum(len(cluster) for cluster in clusters) == len(sample)
        assert sum(len(cluster) for cluster in clusters) == sum(expected_length_clusters)
        assert sorted(len(cluster) for cluster in clusters) == expected_length_clusters

    @staticmethod
    def templateClusterAllocationOneDimensionData(link, ccore_flag):
        """Verify that four well-separated 1-D groups of 10 random points each
        (offsets 0, 3, 5 and 8) are allocated as four clusters of 10 points."""
        input_data = [[random()] for _ in range(10)] + [[random() + 3] for _ in range(10)] \
                     + [[random() + 5] for _ in range(10)] + [[random() + 8] for _ in range(10)]

        agglomerative_instance = agglomerative(input_data, 4, link, ccore_flag)
        agglomerative_instance.process()
        clusters = agglomerative_instance.get_clusters()

        assert len(clusters) == 4
        for cluster in clusters:
            assert len(cluster) == 10

    @staticmethod
    def templateClusterAllocationTheSameObjects(number_objects, number_clusters, link, ccore_flag):
        """Verify clustering of `number_objects` identical points into `number_clusters`
        clusters, and that every object is allocated to exactly one cluster."""
        input_data = [[random()]] * number_objects

        agglomerative_instance = agglomerative(input_data, number_clusters, link, ccore_flag)
        agglomerative_instance.process()
        clusters = agglomerative_instance.get_clusters()

        assert len(clusters) == number_clusters

        object_mark = [False] * number_objects
        allocated_number_objects = 0

        for cluster in clusters:
            for index_object in cluster:
                assert object_mark[index_object] is False  # one object can be in only one cluster.

                object_mark[index_object] = True
                allocated_number_objects += 1

        assert number_objects == allocated_number_objects  # number of allocated objects should be the same.
25 changes: 20 additions & 5 deletions pyclustering/cluster/tests/unit/ut_birch.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,27 +28,29 @@
import matplotlib
matplotlib.use('Agg')

from pyclustering.samples.definitions import SIMPLE_SAMPLES
from pyclustering.samples.definitions import SIMPLE_SAMPLES, FCPS_SAMPLES

from pyclustering.utils import read_sample

from pyclustering.container.cftree import measurement_type

from pyclustering.cluster.encoder import type_encoding
from pyclustering.cluster.birch import birch

from random import random


class BirchUnitTest(unittest.TestCase):
def templateClusterAllocation(self, path, cluster_sizes, number_clusters, branching_factor=5, max_node_entries=5,
initial_diameter=0.1, type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE,
def templateClusterAllocation(self, path, cluster_sizes, number_clusters, branching_factor=50, max_node_entries=100,
initial_diameter=0.5, type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE,
entry_size_limit=200, diameter_multiplier=1.5):
sample = read_sample(path)

birch_instance = birch(sample, number_clusters, branching_factor, max_node_entries, initial_diameter, type_measurement, entry_size_limit, diameter_multiplier)
birch_instance.process()

clusters = birch_instance.get_clusters()
self.assertEqual(birch_instance.get_cluster_encoding(), type_encoding.CLUSTER_INDEX_LIST_SEPARATION)

obtained_cluster_sizes = [len(cluster) for cluster in clusters]

Expand Down Expand Up @@ -133,13 +135,26 @@ def testClusterAllocationTheSameData2(self):
def testClusterAllocationZeroColumn(self):
    # Expect two clusters of 5 points each; SAMPLE_SIMPLE13 presumably contains a
    # zero-valued column (per the test name) — TODO confirm against the sample file.
    self.templateClusterAllocation(SIMPLE_SAMPLES.SAMPLE_SIMPLE13, [5, 5], 2)

def testClusterAllocationLsun(self):
    # FCPS Lsun sample: expect three clusters of 100, 101 and 202 points.
    self.templateClusterAllocation(FCPS_SAMPLES.SAMPLE_LSUN, [100, 101, 202], 3)

def testClusterAllocationTarget(self):
    # FCPS Target sample: expect six clusters — four outliers groups of 3 points
    # plus two large clusters of 363 and 395 points.
    self.templateClusterAllocation(FCPS_SAMPLES.SAMPLE_TARGET, [3, 3, 3, 3, 363, 395], 6)

def testClusterAllocationLsunTreeRebuilt(self):
    # entry_size_limit=20 is deliberately small: exceeding it makes BIRCH increase
    # the diameter and rebuild the CF-tree, so this exercises the rebuild path.
    self.templateClusterAllocation(FCPS_SAMPLES.SAMPLE_LSUN, [100, 101, 202], 3,
                                   branching_factor=200, entry_size_limit=20)

def testClusterAllocationHepta(self):
    # FCPS Hepta sample: expect seven clusters, six of 30 points and one of 32.
    self.templateClusterAllocation(FCPS_SAMPLES.SAMPLE_HEPTA, [30, 30, 30, 30, 30, 30, 32], 7)

def templateClusterAllocationOneDimensionData(self, branching_factor=5, max_node_entries=10, initial_diameter=1.0, type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE, entry_size_limit=20):
    # Four well-separated 1-D groups of 10 random points each (offsets 0, 4, 8, 12).
    input_data = [[random()] for _ in range(10)] + [[random() + 4] for _ in range(10)] + [[random() + 8] for _ in range(10)] + [[random() + 12] for _ in range(10)]

    birch_instance = birch(input_data, 4, branching_factor, max_node_entries, initial_diameter, type_measurement, entry_size_limit)
    birch_instance.process()
    clusters = birch_instance.get_clusters()

    # Each group should be allocated as its own cluster of 10 points.
    assert len(clusters) == 4
    for cluster in clusters:
        assert len(cluster) == 10
Expand Down
7 changes: 6 additions & 1 deletion pyclustering/container/tests/unit/ut_cftree.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,12 @@ def testCfClusterRepresentationTwoDimension(self):

def testGetNearestEntry(self):
sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1)
tree = cftree(10, 100, 0.2)
tree = cftree(10, 100, 0.2, measurement_type.CENTROID_EUCLIDEAN_DISTANCE)

self.assertEqual(10, tree.branch_factor)
self.assertEqual(100, tree.max_entries)
self.assertEqual(0.2, tree.threshold)
self.assertEqual(measurement_type.CENTROID_EUCLIDEAN_DISTANCE, tree.type_measurement)

for index_point in range(len(sample)):
tree.insert_point(sample[index_point])
Expand Down

0 comments on commit 2856d37

Please sign in to comment.