diff --git a/pyclustering/cluster/birch.py b/pyclustering/cluster/birch.py index 5b240506..b3a02ea8 100755 --- a/pyclustering/cluster/birch.py +++ b/pyclustering/cluster/birch.py @@ -63,7 +63,7 @@ class birch: """ - def __init__(self, data, number_clusters, branching_factor=50, max_node_entries=200, initial_diameter=0.5, + def __init__(self, data, number_clusters, branching_factor=50, max_node_entries=200, diameter=0.5, type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE, entry_size_limit=500, diameter_multiplier=1.5, @@ -75,7 +75,7 @@ def __init__(self, data, number_clusters, branching_factor=50, max_node_entries= @param[in] number_clusters (uint): Number of clusters that should be allocated. @param[in] branching_factor (uint): Maximum number of successor that might be contained by each non-leaf node in CF-Tree. @param[in] max_node_entries (uint): Maximum number of entries that might be contained by each leaf node in CF-Tree. - @param[in] initial_diameter (double): Initial diameter that used for CF-Tree construction, it can be increase if entry_size_limit is exceeded. + @param[in] diameter (double): Initial diameter that is used for CF-Tree construction; it can be increased if entry_size_limit is exceeded. @param[in] type_measurement (measurement_type): Type measurement used for calculation distance metrics. @param[in] entry_size_limit (uint): Maximum number of entries that can be stored in CF-Tree, if it is exceeded during creation then diameter is increased and CF-Tree is rebuilt. @param[in] diameter_multiplier (double): Multiplier that is used for increasing diameter when entry_size_limit is exceeded. 
@@ -96,7 +96,7 @@ def __init__(self, data, number_clusters, branching_factor=50, max_node_entries= self.__verify_arguments() self.__features = None - self.__tree = cftree(branching_factor, max_node_entries, initial_diameter, type_measurement) + self.__tree = cftree(branching_factor, max_node_entries, diameter, type_measurement) self.__clusters = [] self.__noise = [] @@ -223,7 +223,7 @@ def __rebuild_tree(self, index_point): @return (cftree) Rebuilt tree with encoded points till specified point from input data space. """ - + rebuild_result = False increased_diameter = self.__tree.threshold * self.__diameter_multiplier diff --git a/pyclustering/cluster/examples/birch_examples.py b/pyclustering/cluster/examples/birch_examples.py index 1c373e25..54e4ae41 100755 --- a/pyclustering/cluster/examples/birch_examples.py +++ b/pyclustering/cluster/examples/birch_examples.py @@ -78,19 +78,18 @@ def cluster_elongate(): def cluster_lsun(): template_clustering(3, FCPS_SAMPLES.SAMPLE_LSUN) - template_clustering(3, FCPS_SAMPLES.SAMPLE_LSUN, 5, 5, 0.2, measurement_type.CENTROID_MANHATTAN_DISTANCE, 200) # not correct, but almost good result + +def cluster_lsun_rebuilt(): + template_clustering(3, FCPS_SAMPLES.SAMPLE_LSUN, entry_size_limit=20, diameter_multiplier=1.5) def cluster_target(): template_clustering(6, FCPS_SAMPLES.SAMPLE_TARGET) - template_clustering(6, FCPS_SAMPLES.SAMPLE_TARGET, 5, 10, 0.5, measurement_type.CENTROID_MANHATTAN_DISTANCE, 200) # interesting - sliced cake. - template_clustering(6, FCPS_SAMPLES.SAMPLE_TARGET, 50, 100, 0.5, measurement_type.VARIANCE_INCREASE_DISTANCE, 200) # interesting - sliced cake. 
def cluster_two_diamonds(): template_clustering(2, FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS) def cluster_wing_nut(): template_clustering(2, FCPS_SAMPLES.SAMPLE_WING_NUT) - template_clustering(2, FCPS_SAMPLES.SAMPLE_WING_NUT, 5, 5, 0.1, measurement_type.CENTROID_EUCLIDEAN_DISTANCE, 800) # not correct, but almost good result def cluster_chainlink(): template_clustering(2, FCPS_SAMPLES.SAMPLE_CHAINLINK) @@ -102,8 +101,7 @@ def cluster_tetra(): template_clustering(4, FCPS_SAMPLES.SAMPLE_TETRA) def cluster_engy_time(): - template_clustering(2, FCPS_SAMPLES.SAMPLE_ENGY_TIME) # one cluster is allocated - template_clustering(2, FCPS_SAMPLES.SAMPLE_ENGY_TIME, 10, 10, 0.05, measurement_type.VARIANCE_INCREASE_DISTANCE, 500) # good result + template_clustering(2, FCPS_SAMPLES.SAMPLE_ENGY_TIME) def experiment_execution_time(ccore=False): @@ -124,14 +122,14 @@ def experiment_execution_time(ccore=False): def display_fcps_clustering_results(): - (lsun, lsun_clusters, _) = template_clustering(3, FCPS_SAMPLES.SAMPLE_LSUN, show_result=False) - (target, target_clusters, _) = template_clustering(6, FCPS_SAMPLES.SAMPLE_TARGET, show_result=False) - (two_diamonds, two_diamonds_clusters, _) = template_clustering(2, FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS, show_result=False) - (wing_nut, wing_nut_clusters, _) = template_clustering(2, FCPS_SAMPLES.SAMPLE_WING_NUT, show_result=False) - (chainlink, chainlink_clusters, _) = template_clustering(2, FCPS_SAMPLES.SAMPLE_CHAINLINK, show_result=False) - (hepta, hepta_clusters, _) = template_clustering(7, FCPS_SAMPLES.SAMPLE_HEPTA, show_result=False) - (tetra, tetra_clusters, _) = template_clustering(4, FCPS_SAMPLES.SAMPLE_TETRA, show_result=False) - (atom, atom_clusters, _) = template_clustering(2, FCPS_SAMPLES.SAMPLE_ATOM, show_result=False) + (lsun, lsun_clusters) = template_clustering(3, FCPS_SAMPLES.SAMPLE_LSUN, show_result=False) + (target, target_clusters) = template_clustering(6, FCPS_SAMPLES.SAMPLE_TARGET, show_result=False) + (two_diamonds, 
two_diamonds_clusters) = template_clustering(2, FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS, show_result=False) + (wing_nut, wing_nut_clusters) = template_clustering(2, FCPS_SAMPLES.SAMPLE_WING_NUT, show_result=False) + (chainlink, chainlink_clusters) = template_clustering(2, FCPS_SAMPLES.SAMPLE_CHAINLINK, show_result=False) + (hepta, hepta_clusters) = template_clustering(7, FCPS_SAMPLES.SAMPLE_HEPTA, show_result=False) + (tetra, tetra_clusters) = template_clustering(4, FCPS_SAMPLES.SAMPLE_TETRA, show_result=False) + (atom, atom_clusters) = template_clustering(2, FCPS_SAMPLES.SAMPLE_ATOM, show_result=False) visualizer = cluster_visualizer(8, 4) visualizer.append_clusters(lsun_clusters, lsun, 0) @@ -154,6 +152,7 @@ def display_fcps_clustering_results(): cluster_sample8() cluster_elongate() cluster_lsun() +cluster_lsun_rebuilt() cluster_target() cluster_two_diamonds() cluster_wing_nut() @@ -161,8 +160,7 @@ def display_fcps_clustering_results(): cluster_hepta() cluster_tetra() cluster_engy_time() - - + experiment_execution_time(True) # C++ code + Python env. 
display_fcps_clustering_results() diff --git a/pyclustering/cluster/tests/agglomerative_templates.py b/pyclustering/cluster/tests/agglomerative_templates.py index e909b787..61175fe3 100755 --- a/pyclustering/cluster/tests/agglomerative_templates.py +++ b/pyclustering/cluster/tests/agglomerative_templates.py @@ -33,12 +33,12 @@ class AgglomerativeTestTemplates: @staticmethod def templateClusteringResults(path, number_clusters, link, expected_length_clusters, ccore_flag): - sample = read_sample(path); + sample = read_sample(path) - agglomerative_instance = agglomerative(sample, number_clusters, link, ccore_flag); - agglomerative_instance.process(); + agglomerative_instance = agglomerative(sample, number_clusters, link, ccore_flag) + agglomerative_instance.process() - clusters = agglomerative_instance.get_clusters(); + clusters = agglomerative_instance.get_clusters() assert sum([len(cluster) for cluster in clusters]) == len(sample); assert sum([len(cluster) for cluster in clusters]) == sum(expected_length_clusters); @@ -46,11 +46,11 @@ def templateClusteringResults(path, number_clusters, link, expected_length_clust @staticmethod def templateClusterAllocationOneDimensionData(link, ccore_flag): - input_data = [ [random()] for i in range(10) ] + [ [random() + 3] for i in range(10) ] + [ [random() + 5] for i in range(10) ] + [ [random() + 8] for i in range(10) ]; + input_data = [ [random()] for i in range(10) ] + [ [random() + 3] for i in range(10) ] + [ [random() + 5] for i in range(10) ] + [ [random() + 8] for i in range(10) ] - agglomerative_instance = agglomerative(input_data, 4, link, ccore_flag); - agglomerative_instance.process(); - clusters = agglomerative_instance.get_clusters(); + agglomerative_instance = agglomerative(input_data, 4, link, ccore_flag) + agglomerative_instance.process() + clusters = agglomerative_instance.get_clusters() assert len(clusters) == 4; for cluster in clusters: @@ -58,22 +58,22 @@ def templateClusterAllocationOneDimensionData(link, 
ccore_flag): @staticmethod def templateClusterAllocationTheSameObjects(number_objects, number_clusters, link, ccore_flag): - input_data = [ [random()] ] * number_objects; + input_data = [ [random()] ] * number_objects - agglomerative_instance = agglomerative(input_data, number_clusters, link, ccore_flag); - agglomerative_instance.process(); - clusters = agglomerative_instance.get_clusters(); + agglomerative_instance = agglomerative(input_data, number_clusters, link, ccore_flag) + agglomerative_instance.process() + clusters = agglomerative_instance.get_clusters() assert len(clusters) == number_clusters; - object_mark = [False] * number_objects; - allocated_number_objects = 0; + object_mark = [False] * number_objects + allocated_number_objects = 0 for cluster in clusters: for index_object in cluster: assert (object_mark[index_object] == False); # one object can be in only one cluster. - object_mark[index_object] = True; - allocated_number_objects += 1; + object_mark[index_object] = True + allocated_number_objects += 1 - assert (number_objects == allocated_number_objects); # number of allocated objects should be the same. \ No newline at end of file + assert (number_objects == allocated_number_objects); # number of allocated objects should be the same. 
diff --git a/pyclustering/cluster/tests/unit/ut_birch.py b/pyclustering/cluster/tests/unit/ut_birch.py index 79f07c57..664756bd 100755 --- a/pyclustering/cluster/tests/unit/ut_birch.py +++ b/pyclustering/cluster/tests/unit/ut_birch.py @@ -28,20 +28,21 @@ import matplotlib matplotlib.use('Agg') -from pyclustering.samples.definitions import SIMPLE_SAMPLES +from pyclustering.samples.definitions import SIMPLE_SAMPLES, FCPS_SAMPLES from pyclustering.utils import read_sample from pyclustering.container.cftree import measurement_type +from pyclustering.cluster.encoder import type_encoding from pyclustering.cluster.birch import birch from random import random class BirchUnitTest(unittest.TestCase): - def templateClusterAllocation(self, path, cluster_sizes, number_clusters, branching_factor=5, max_node_entries=5, - initial_diameter=0.1, type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE, + def templateClusterAllocation(self, path, cluster_sizes, number_clusters, branching_factor=50, max_node_entries=100, + initial_diameter=0.5, type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE, entry_size_limit=200, diameter_multiplier=1.5): sample = read_sample(path) @@ -49,6 +50,7 @@ def templateClusterAllocation(self, path, cluster_sizes, number_clusters, branch birch_instance.process() clusters = birch_instance.get_clusters() + self.assertEqual(birch_instance.get_cluster_encoding(), type_encoding.CLUSTER_INDEX_LIST_SEPARATION) obtained_cluster_sizes = [len(cluster) for cluster in clusters] @@ -133,13 +135,26 @@ def testClusterAllocationTheSameData2(self): def testClusterAllocationZeroColumn(self): self.templateClusterAllocation(SIMPLE_SAMPLES.SAMPLE_SIMPLE13, [5, 5], 2) + def testClusterAllocationLsun(self): + self.templateClusterAllocation(FCPS_SAMPLES.SAMPLE_LSUN, [100, 101, 202], 3) + + def testClusterAllocationTarget(self): + self.templateClusterAllocation(FCPS_SAMPLES.SAMPLE_TARGET, [3, 3, 3, 3, 363, 395], 6) + + def 
testClusterAllocationLsunTreeRebuilt(self): + self.templateClusterAllocation(FCPS_SAMPLES.SAMPLE_LSUN, [100, 101, 202], 3, + branching_factor=200, entry_size_limit=20) + + def testClusterAllocationHepta(self): + self.templateClusterAllocation(FCPS_SAMPLES.SAMPLE_HEPTA, [30, 30, 30, 30, 30, 30, 32], 7) + def templateClusterAllocationOneDimensionData(self, branching_factor=5, max_node_entries=10, initial_diameter=1.0, type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE, entry_size_limit=20): input_data = [[random()] for _ in range(10)] + [[random() + 4] for _ in range(10)] + [[random() + 8] for _ in range(10)] + [[random() + 12] for _ in range(10)] - + birch_instance = birch(input_data, 4, branching_factor, max_node_entries, initial_diameter, type_measurement, entry_size_limit) birch_instance.process() clusters = birch_instance.get_clusters() - + assert len(clusters) == 4 for cluster in clusters: assert len(cluster) == 10 diff --git a/pyclustering/container/tests/unit/ut_cftree.py b/pyclustering/container/tests/unit/ut_cftree.py index 13d19151..98adad59 100755 --- a/pyclustering/container/tests/unit/ut_cftree.py +++ b/pyclustering/container/tests/unit/ut_cftree.py @@ -64,7 +64,12 @@ def testCfClusterRepresentationTwoDimension(self): def testGetNearestEntry(self): sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1) - tree = cftree(10, 100, 0.2) + tree = cftree(10, 100, 0.2, measurement_type.CENTROID_EUCLIDEAN_DISTANCE) + + self.assertEqual(10, tree.branch_factor) + self.assertEqual(100, tree.max_entries) + self.assertEqual(0.2, tree.threshold) + self.assertEqual(measurement_type.CENTROID_EUCLIDEAN_DISTANCE, tree.type_measurement) for index_point in range(len(sample)): tree.insert_point(sample[index_point])