Skip to content

Commit

Permalink
#563: Unit-tests are added for BIRCH algorithm.
Browse files Browse the repository at this point in the history
  • Loading branch information
annoviko committed Nov 27, 2019
1 parent c7ba5e5 commit 2856d37
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 43 deletions.
8 changes: 4 additions & 4 deletions pyclustering/cluster/birch.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ class birch:
"""

def __init__(self, data, number_clusters, branching_factor=50, max_node_entries=200, initial_diameter=0.5,
def __init__(self, data, number_clusters, branching_factor=50, max_node_entries=200, diameter=0.5,
type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE,
entry_size_limit=500,
diameter_multiplier=1.5,
Expand All @@ -75,7 +75,7 @@ def __init__(self, data, number_clusters, branching_factor=50, max_node_entries=
@param[in] number_clusters (uint): Number of clusters that should be allocated.
@param[in] branching_factor (uint): Maximum number of successor that might be contained by each non-leaf node in CF-Tree.
@param[in] max_node_entries (uint): Maximum number of entries that might be contained by each leaf node in CF-Tree.
@param[in] initial_diameter (double): Initial diameter that used for CF-Tree construction, it can be increase if entry_size_limit is exceeded.
@param[in] diameter (double): Initial diameter that is used for CF-Tree construction; it can be increased if entry_size_limit is exceeded.
@param[in] type_measurement (measurement_type): Type measurement used for calculation distance metrics.
@param[in] entry_size_limit (uint): Maximum number of entries that can be stored in CF-Tree, if it is exceeded during creation then diameter is increased and CF-Tree is rebuilt.
@param[in] diameter_multiplier (double): Multiplier that is used for increasing diameter when entry_size_limit is exceeded.
Expand All @@ -96,7 +96,7 @@ def __init__(self, data, number_clusters, branching_factor=50, max_node_entries=
self.__verify_arguments()

self.__features = None
self.__tree = cftree(branching_factor, max_node_entries, initial_diameter, type_measurement)
self.__tree = cftree(branching_factor, max_node_entries, diameter, type_measurement)

self.__clusters = []
self.__noise = []
Expand Down Expand Up @@ -223,7 +223,7 @@ def __rebuild_tree(self, index_point):
@return (cftree) Rebuilt tree with encoded points up to the specified point from the input data space.
"""

rebuild_result = False
increased_diameter = self.__tree.threshold * self.__diameter_multiplier

Expand Down
30 changes: 14 additions & 16 deletions pyclustering/cluster/examples/birch_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,19 +78,18 @@ def cluster_elongate():

def cluster_lsun():
template_clustering(3, FCPS_SAMPLES.SAMPLE_LSUN)
template_clustering(3, FCPS_SAMPLES.SAMPLE_LSUN, 5, 5, 0.2, measurement_type.CENTROID_MANHATTAN_DISTANCE, 200) # not correct, but almost good result

def cluster_lsun_rebuilt():
template_clustering(3, FCPS_SAMPLES.SAMPLE_LSUN, entry_size_limit=20, diameter_multiplier=1.5)

def cluster_target():
template_clustering(6, FCPS_SAMPLES.SAMPLE_TARGET)
template_clustering(6, FCPS_SAMPLES.SAMPLE_TARGET, 5, 10, 0.5, measurement_type.CENTROID_MANHATTAN_DISTANCE, 200) # interesting - sliced cake.
template_clustering(6, FCPS_SAMPLES.SAMPLE_TARGET, 50, 100, 0.5, measurement_type.VARIANCE_INCREASE_DISTANCE, 200) # interesting - sliced cake.

def cluster_two_diamonds():
template_clustering(2, FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS)

def cluster_wing_nut():
template_clustering(2, FCPS_SAMPLES.SAMPLE_WING_NUT)
template_clustering(2, FCPS_SAMPLES.SAMPLE_WING_NUT, 5, 5, 0.1, measurement_type.CENTROID_EUCLIDEAN_DISTANCE, 800) # not correct, but almost good result

def cluster_chainlink():
template_clustering(2, FCPS_SAMPLES.SAMPLE_CHAINLINK)
Expand All @@ -102,8 +101,7 @@ def cluster_tetra():
template_clustering(4, FCPS_SAMPLES.SAMPLE_TETRA)

def cluster_engy_time():
template_clustering(2, FCPS_SAMPLES.SAMPLE_ENGY_TIME) # one cluster is allocated
template_clustering(2, FCPS_SAMPLES.SAMPLE_ENGY_TIME, 10, 10, 0.05, measurement_type.VARIANCE_INCREASE_DISTANCE, 500) # good result
template_clustering(2, FCPS_SAMPLES.SAMPLE_ENGY_TIME)


def experiment_execution_time(ccore=False):
Expand All @@ -124,14 +122,14 @@ def experiment_execution_time(ccore=False):


def display_fcps_clustering_results():
(lsun, lsun_clusters, _) = template_clustering(3, FCPS_SAMPLES.SAMPLE_LSUN, show_result=False)
(target, target_clusters, _) = template_clustering(6, FCPS_SAMPLES.SAMPLE_TARGET, show_result=False)
(two_diamonds, two_diamonds_clusters, _) = template_clustering(2, FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS, show_result=False)
(wing_nut, wing_nut_clusters, _) = template_clustering(2, FCPS_SAMPLES.SAMPLE_WING_NUT, show_result=False)
(chainlink, chainlink_clusters, _) = template_clustering(2, FCPS_SAMPLES.SAMPLE_CHAINLINK, show_result=False)
(hepta, hepta_clusters, _) = template_clustering(7, FCPS_SAMPLES.SAMPLE_HEPTA, show_result=False)
(tetra, tetra_clusters, _) = template_clustering(4, FCPS_SAMPLES.SAMPLE_TETRA, show_result=False)
(atom, atom_clusters, _) = template_clustering(2, FCPS_SAMPLES.SAMPLE_ATOM, show_result=False)
(lsun, lsun_clusters) = template_clustering(3, FCPS_SAMPLES.SAMPLE_LSUN, show_result=False)
(target, target_clusters) = template_clustering(6, FCPS_SAMPLES.SAMPLE_TARGET, show_result=False)
(two_diamonds, two_diamonds_clusters) = template_clustering(2, FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS, show_result=False)
(wing_nut, wing_nut_clusters) = template_clustering(2, FCPS_SAMPLES.SAMPLE_WING_NUT, show_result=False)
(chainlink, chainlink_clusters) = template_clustering(2, FCPS_SAMPLES.SAMPLE_CHAINLINK, show_result=False)
(hepta, hepta_clusters) = template_clustering(7, FCPS_SAMPLES.SAMPLE_HEPTA, show_result=False)
(tetra, tetra_clusters) = template_clustering(4, FCPS_SAMPLES.SAMPLE_TETRA, show_result=False)
(atom, atom_clusters) = template_clustering(2, FCPS_SAMPLES.SAMPLE_ATOM, show_result=False)

visualizer = cluster_visualizer(8, 4)
visualizer.append_clusters(lsun_clusters, lsun, 0)
Expand All @@ -154,15 +152,15 @@ def display_fcps_clustering_results():
cluster_sample8()
cluster_elongate()
cluster_lsun()
cluster_lsun_rebuilt()
cluster_target()
cluster_two_diamonds()
cluster_wing_nut()
cluster_chainlink()
cluster_hepta()
cluster_tetra()
cluster_engy_time()



experiment_execution_time(True) # C++ code + Python env.

display_fcps_clustering_results()
34 changes: 17 additions & 17 deletions pyclustering/cluster/tests/agglomerative_templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,47 +33,47 @@
class AgglomerativeTestTemplates:
    """Reusable assertion templates for the agglomerative clustering unit-tests."""

    @staticmethod
    def templateClusteringResults(path, number_clusters, link, expected_length_clusters, ccore_flag):
        """Cluster the sample stored at `path` and verify the allocated cluster sizes.

        @param[in] path (string): Path to the sample file that is read via `read_sample`.
        @param[in] number_clusters (uint): Amount of clusters that should be allocated.
        @param[in] link: Link type that is used by the agglomerative algorithm.
        @param[in] expected_length_clusters (list): Expected cluster sizes in ascending order.
        @param[in] ccore_flag (bool): True when the C++ implementation should be used.
        """
        sample = read_sample(path)

        agglomerative_instance = agglomerative(sample, number_clusters, link, ccore_flag)
        agglomerative_instance.process()

        clusters = agglomerative_instance.get_clusters()

        # Every point must be allocated exactly once and sizes must match expectation.
        assert sum(len(cluster) for cluster in clusters) == len(sample)
        assert sum(len(cluster) for cluster in clusters) == sum(expected_length_clusters)
        assert sorted(len(cluster) for cluster in clusters) == expected_length_clusters

    @staticmethod
    def templateClusterAllocationOneDimensionData(link, ccore_flag):
        """Verify that four well-separated 1-D groups of 10 random points each
        (offsets 0, 3, 5 and 8) are allocated as four clusters of 10 points."""
        input_data = [[random()] for _ in range(10)] + [[random() + 3] for _ in range(10)] \
                     + [[random() + 5] for _ in range(10)] + [[random() + 8] for _ in range(10)]

        agglomerative_instance = agglomerative(input_data, 4, link, ccore_flag)
        agglomerative_instance.process()
        clusters = agglomerative_instance.get_clusters()

        assert len(clusters) == 4
        for cluster in clusters:
            assert len(cluster) == 10

    @staticmethod
    def templateClusterAllocationTheSameObjects(number_objects, number_clusters, link, ccore_flag):
        """Verify clustering of `number_objects` identical points into `number_clusters`
        clusters, and that every object is allocated to exactly one cluster."""
        input_data = [[random()]] * number_objects

        agglomerative_instance = agglomerative(input_data, number_clusters, link, ccore_flag)
        agglomerative_instance.process()
        clusters = agglomerative_instance.get_clusters()

        assert len(clusters) == number_clusters

        object_mark = [False] * number_objects
        allocated_number_objects = 0

        for cluster in clusters:
            for index_object in cluster:
                assert object_mark[index_object] is False  # one object can be in only one cluster.

                object_mark[index_object] = True
                allocated_number_objects += 1

        assert number_objects == allocated_number_objects  # number of allocated objects should be the same.
25 changes: 20 additions & 5 deletions pyclustering/cluster/tests/unit/ut_birch.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,27 +28,29 @@
import matplotlib
matplotlib.use('Agg')

from pyclustering.samples.definitions import SIMPLE_SAMPLES
from pyclustering.samples.definitions import SIMPLE_SAMPLES, FCPS_SAMPLES

from pyclustering.utils import read_sample

from pyclustering.container.cftree import measurement_type

from pyclustering.cluster.encoder import type_encoding
from pyclustering.cluster.birch import birch

from random import random


class BirchUnitTest(unittest.TestCase):
def templateClusterAllocation(self, path, cluster_sizes, number_clusters, branching_factor=5, max_node_entries=5,
initial_diameter=0.1, type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE,
def templateClusterAllocation(self, path, cluster_sizes, number_clusters, branching_factor=50, max_node_entries=100,
initial_diameter=0.5, type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE,
entry_size_limit=200, diameter_multiplier=1.5):
sample = read_sample(path)

birch_instance = birch(sample, number_clusters, branching_factor, max_node_entries, initial_diameter, type_measurement, entry_size_limit, diameter_multiplier)
birch_instance.process()

clusters = birch_instance.get_clusters()
self.assertEqual(birch_instance.get_cluster_encoding(), type_encoding.CLUSTER_INDEX_LIST_SEPARATION)

obtained_cluster_sizes = [len(cluster) for cluster in clusters]

Expand Down Expand Up @@ -133,13 +135,26 @@ def testClusterAllocationTheSameData2(self):
def testClusterAllocationZeroColumn(self):
    # Expect two clusters of 5 points each; SAMPLE_SIMPLE13 presumably contains a
    # zero-valued column (per the test name) — TODO confirm against the sample file.
    self.templateClusterAllocation(SIMPLE_SAMPLES.SAMPLE_SIMPLE13, [5, 5], 2)

def testClusterAllocationLsun(self):
    # FCPS Lsun sample: expect three clusters of 100, 101 and 202 points.
    self.templateClusterAllocation(FCPS_SAMPLES.SAMPLE_LSUN, [100, 101, 202], 3)

def testClusterAllocationTarget(self):
    # FCPS Target sample: expect six clusters — four outliers groups of 3 points
    # plus two large clusters of 363 and 395 points.
    self.templateClusterAllocation(FCPS_SAMPLES.SAMPLE_TARGET, [3, 3, 3, 3, 363, 395], 6)

def testClusterAllocationLsunTreeRebuilt(self):
    # entry_size_limit=20 is deliberately small: exceeding it makes BIRCH increase
    # the diameter and rebuild the CF-tree, so this exercises the rebuild path.
    self.templateClusterAllocation(FCPS_SAMPLES.SAMPLE_LSUN, [100, 101, 202], 3,
                                   branching_factor=200, entry_size_limit=20)

def testClusterAllocationHepta(self):
    # FCPS Hepta sample: expect seven clusters, six of 30 points and one of 32.
    self.templateClusterAllocation(FCPS_SAMPLES.SAMPLE_HEPTA, [30, 30, 30, 30, 30, 30, 32], 7)

def templateClusterAllocationOneDimensionData(self, branching_factor=5, max_node_entries=10, initial_diameter=1.0, type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE, entry_size_limit=20):
    # Four well-separated 1-D groups of 10 random points each (offsets 0, 4, 8, 12).
    input_data = [[random()] for _ in range(10)] + [[random() + 4] for _ in range(10)] + [[random() + 8] for _ in range(10)] + [[random() + 12] for _ in range(10)]

    birch_instance = birch(input_data, 4, branching_factor, max_node_entries, initial_diameter, type_measurement, entry_size_limit)
    birch_instance.process()
    clusters = birch_instance.get_clusters()

    # Each group should be allocated as its own cluster of 10 points.
    assert len(clusters) == 4
    for cluster in clusters:
        assert len(cluster) == 10
Expand Down
7 changes: 6 additions & 1 deletion pyclustering/container/tests/unit/ut_cftree.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,12 @@ def testCfClusterRepresentationTwoDimension(self):

def testGetNearestEntry(self):
sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1)
tree = cftree(10, 100, 0.2)
tree = cftree(10, 100, 0.2, measurement_type.CENTROID_EUCLIDEAN_DISTANCE)

self.assertEqual(10, tree.branch_factor)
self.assertEqual(100, tree.max_entries)
self.assertEqual(0.2, tree.threshold)
self.assertEqual(measurement_type.CENTROID_EUCLIDEAN_DISTANCE, tree.type_measurement)

for index_point in range(len(sample)):
tree.insert_point(sample[index_point])
Expand Down

0 comments on commit 2856d37

Please sign in to comment.