Skip to content

Commit

Permalink
#543: [pyclustering.cluster.silhouette] Distance matrix support.
Browse files Browse the repository at this point in the history
  • Loading branch information
annoviko committed Sep 9, 2019
1 parent 0342e30 commit 7f0d5be
Show file tree
Hide file tree
Showing 19 changed files with 255 additions and 55 deletions.
3 changes: 3 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ CHANGE NOTES FOR 0.9.2 (STARTED Sep 9, 2019), (RELEASED: -)
------------------------------------------------------------------------

GENERAL CHANGES:
- Introduced parameter `data_type` to Silhouette method to use distance matrix (pyclustering.cluster.silhouette, ccore.clst.silhouette).
See: https://github.com/annoviko/pyclustering/issues/543

- Optimization of HHN (Hodgkin-Huxley Neural Network) by parallel processing (ccore.nnet.hhn).
See: https://github.com/annoviko/pyclustering/issues/541

Expand Down
11 changes: 11 additions & 0 deletions ccore/src/cluster/silhouette.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,15 @@ silhouette::silhouette(const distance_metric<point> & p_metric) :


void silhouette::process(const dataset & p_data, const cluster_sequence & p_clusters, silhouette_data & p_result) {
process(p_data, p_clusters, silhouette_data_t::POINTS, p_result);
}


void silhouette::process(const dataset & p_data, const cluster_sequence & p_clusters, const silhouette_data_t & p_type, silhouette_data & p_result) {
m_data = &p_data;
m_clusters = &p_clusters;
m_result = &p_result;
m_type = p_type;

m_result->get_score().reserve(m_data->size());

Expand All @@ -61,6 +67,11 @@ double silhouette::calculate_score(const std::size_t p_index_point, const std::s


void silhouette::calculate_dataset_difference(const std::size_t p_index_point, std::vector<double> & p_dataset_difference) const {
if (m_type == silhouette_data_t::DISTANCE_MATRIX) {
p_dataset_difference = m_data->at(p_index_point);
return;
}

p_dataset_difference.reserve(m_data->size());

const auto & current_point = m_data->at(p_index_point);
Expand Down
12 changes: 11 additions & 1 deletion ccore/src/cluster/silhouette.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,21 @@ namespace ccore {
namespace clst {


/**
 *
 * @brief   Defines how input data of the Silhouette method is interpreted.
 * @details Values are assigned explicitly because the C interface `silhouette_algorithm` obtains
 *           an enumerator from a raw integer using `static_cast` ('0' - points, '1' - distance matrix),
 *           therefore the numeric values must not change when enumerators are reordered or added.
 *
 */
enum class silhouette_data_t {
    POINTS = 0,             /* input data is a list of points */
    DISTANCE_MATRIX = 1     /* input data is a matrix of distances between points */
};


class silhouette {
private:
const dataset * m_data = nullptr; /* temporary object, exists during processing */
const cluster_sequence * m_clusters = nullptr; /* temporary object, exists during processing */
silhouette_data * m_result = nullptr; /* temporary object, exists during processing */

distance_metric<point> m_metric = distance_metric_factory<point>::euclidean_square();
silhouette_data_t m_type = silhouette_data_t::POINTS;

distance_metric<point> m_metric = distance_metric_factory<point>::euclidean_square();

public:
silhouette(void) = default;
Expand All @@ -61,6 +69,8 @@ class silhouette {
public:
void process(const dataset & p_data, const cluster_sequence & p_clusters, silhouette_data & p_result);

void process(const dataset & p_data, const cluster_sequence & p_clusters, const silhouette_data_t & p_type, silhouette_data & p_result);

private:
double calculate_score(const std::size_t p_index_point, const std::size_t p_index_cluster) const;

Expand Down
2 changes: 1 addition & 1 deletion ccore/src/interface/interface_property.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@


const char * INTERFACE_DESCRIPTION = "ccore library is a C/C++ part of pyclustering library";
const char * INTERFACE_VERSION = "0.9.1";
const char * INTERFACE_VERSION = "0.9.2";


void * get_interface_description(void) {
Expand Down
5 changes: 3 additions & 2 deletions ccore/src/interface/silhouette_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ ccore::clst::silhouette_ksearch_allocator::ptr get_silhouette_ksearch_allocator(
pyclustering_package * silhouette_algorithm(
const pyclustering_package * const p_sample,
const pyclustering_package * const p_clusters,
const void * const p_metric)
const void * const p_metric,
const std::size_t p_data_type)
{
dataset data;
p_sample->extract(data);
Expand All @@ -58,7 +59,7 @@ pyclustering_package * silhouette_algorithm(
metric = &default_metric;

ccore::clst::silhouette_data result;
ccore::clst::silhouette(*metric).process(data, clusters, result);
ccore::clst::silhouette(*metric).process(data, clusters, static_cast<ccore::clst::silhouette_data_t>(p_data_type), result);

return create_package(&result.get_score());
}
Expand Down
4 changes: 3 additions & 1 deletion ccore/src/interface/silhouette_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,14 +77,16 @@ ccore::clst::silhouette_ksearch_allocator::ptr get_silhouette_ksearch_allocator(
* @param[in] p_sample: input data for clustering.
* @param[in] p_clusters: clusters that have been allocated for that data.
* @param[in] p_metric: pointer to distance metric 'distance_metric' that is used for distance calculation between two points.
* @param[in] p_data_type: defines data type that is used for clustering process ('0' - points, '1' - distance matrix).
*
* @return Returns Silhouette's analysis results as a pyclustering package [ scores ].
*
*/
extern "C" DECLARATION pyclustering_package * silhouette_algorithm(
const pyclustering_package * const p_sample,
const pyclustering_package * const p_clusters,
const void * const p_metric);
const void * const p_metric,
const std::size_t p_data_type);


/**
Expand Down
63 changes: 39 additions & 24 deletions ccore/src/utils/metric.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -230,30 +230,6 @@ double chi_square_distance(const TypeContainer & point1, const TypeContainer & p
}


/**
 *
 * @brief   Builds matrix of pairwise distances between points using `euclidean_distance`.
 *
 * @param[in]  p_points: container with points for which distances are calculated.
 * @param[out] p_distance_matrix: square matrix (N x N for N points) that is filled by distances,
 *              diagonal elements are equal to zero.
 *
 */
template <typename TypeContainer>
void distance_matrix(const TypeContainer & p_points, TypeContainer & p_distance_matrix) {
    using TypeRow = typename TypeContainer::value_type;

    p_distance_matrix = TypeContainer(p_points.size(), TypeRow(p_points.size(), 0.0));

    /* The matrix is symmetric: only the upper triangle is calculated and then mirrored. */
    for (std::size_t row = 0; row < p_points.size(); ++row) {
        for (std::size_t col = row + 1; col < p_points.size(); ++col) {
            p_distance_matrix[row][col] = euclidean_distance(p_points.at(row), p_points.at(col));
            p_distance_matrix[col][row] = p_distance_matrix[row][col];
        }
    }
}


/**
*
* @brief Basic distance metric provides interface for calculation distance between objects in line with
Expand Down Expand Up @@ -553,6 +529,45 @@ double farthest_distance(const TypeContainer & p_container, const distance_metri
}


/**
 *
 * @brief   Calculates distance matrix for a container of points using the specified metric.
 *
 * @param[in]  p_points: input data that is represented by points.
 * @param[in]  p_metric: metric for distance calculation between points.
 * @param[out] p_distance_matrix: output square distance matrix (N x N for N points); the metric is
 *              evaluated once per pair of points and the result is mirrored, diagonal elements are
 *              equal to zero.
 *
 */
template <typename TypeContainer>
void distance_matrix(const TypeContainer & p_points, const distance_metric<point> & p_metric, TypeContainer & p_distance_matrix) {
    using TypeElement = typename TypeContainer::value_type;

    const std::size_t amount_points = p_points.size();
    p_distance_matrix = TypeContainer(amount_points, TypeElement(amount_points, 0.0));

    for (std::size_t i = 0; i < amount_points; i++) {
        const auto & point_i = p_points.at(i);     /* loop-invariant for the inner cycle */

        for (std::size_t j = i + 1; j < amount_points; j++) {
            const double distance = p_metric(point_i, p_points.at(j));
            p_distance_matrix[i][j] = distance;
            p_distance_matrix[j][i] = distance;
        }
    }
}


/**
*
* @brief Calculates distance matrix using points container using Euclidean distance.
*
* @param[in] p_points: input data that is represented by points.
* @param[out] p_distance_matrix: output distance matrix of points.
*
*/
template <typename TypeContainer>
void distance_matrix(const TypeContainer & p_points, TypeContainer & p_distance_matrix) {
distance_matrix(p_points, distance_metric_factory<point>::euclidean(), p_distance_matrix);
}


}

}
Expand Down
20 changes: 20 additions & 0 deletions ccore/tst/utest-interface-silhouette.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,31 @@

#include "samples.hpp"

#include "answer.hpp"
#include "answer_reader.hpp"
#include "utenv_utils.hpp"

#include <memory>


/* Checks that the C interface of Silhouette returns packages of the same size and type for both supported data types. */
TEST(utest_interface_silhouette, silhouette) {
    dataset_ptr data = simple_sample_factory::create_sample(SAMPLE_SIMPLE::SAMPLE_SIMPLE_01);
    answer ans = answer_reader::read(SAMPLE_SIMPLE::SAMPLE_SIMPLE_01);

    std::shared_ptr<pyclustering_package> data_package = pack(*data);
    std::shared_ptr<pyclustering_package> cluster_package = pack(ans.clusters());

    /* Returned packages are presumably owned by the caller, so they are kept in smart pointers: */
    /* they are released at the end of the test and also when a fatal assertion leaves the test early. */
    /* NOTE(review): the package with points is also passed as a distance matrix (data type '1') - */
    /* confirm that it is intended, a real distance matrix of the sample looks more appropriate here. */
    const std::shared_ptr<pyclustering_package> result_points(
        silhouette_algorithm(data_package.get(), cluster_package.get(), nullptr, 0));
    const std::shared_ptr<pyclustering_package> result_matrix(
        silhouette_algorithm(data_package.get(), cluster_package.get(), nullptr, 1));

    ASSERT_NE(nullptr, result_points.get());
    ASSERT_NE(nullptr, result_matrix.get());

    ASSERT_EQ(result_points->size, result_matrix->size);
    ASSERT_EQ(result_points->type, result_matrix->type);
}


TEST(utest_interface_silhouette, silhouette_ksearch) {
dataset_ptr data = simple_sample_factory::create_sample(SAMPLE_SIMPLE::SAMPLE_SIMPLE_01);
std::shared_ptr<pyclustering_package> sample = pack(*data);
Expand Down
54 changes: 53 additions & 1 deletion ccore/tst/utest-silhouette.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
using namespace ccore::clst;


static void template_correct_scores(const dataset_ptr & p_data, const answer & p_answer) {
void template_correct_scores(const dataset_ptr & p_data, const answer & p_answer) {
silhouette_data result;
silhouette().process(*p_data, p_answer.clusters(), result);

Expand All @@ -45,6 +45,25 @@ static void template_correct_scores(const dataset_ptr & p_data, const answer & p
}


void template_correct_score_data_types(const dataset_ptr & p_data, const answer & p_answer) {
silhouette_data result_points, result_matrix;

dataset matrix;
distance_matrix(*p_data, distance_metric_factory<point>::euclidean_square(), matrix);

silhouette().process(*p_data, p_answer.clusters(), silhouette_data_t::POINTS, result_points);
silhouette().process(matrix, p_answer.clusters(), silhouette_data_t::DISTANCE_MATRIX, result_matrix);

ASSERT_EQ(p_data->size(), result_points.get_score().size());
ASSERT_EQ(p_data->size(), result_matrix.get_score().size());

const auto & scores_points = result_points.get_score();
const auto & scores_matrix = result_matrix.get_score();

ASSERT_EQ(scores_points, scores_matrix);
}


/* Silhouette scores for sample 'Simple 01' that is represented by points. */
TEST(utest_silhouette, correct_score_simple01) {
    const auto sample = SAMPLE_SIMPLE::SAMPLE_SIMPLE_01;
    template_correct_scores(simple_sample_factory::create_sample(sample), answer_reader::read(sample));
}
Expand Down Expand Up @@ -76,3 +95,36 @@ TEST(utest_silhouette, correct_score_simple07) {
/* Silhouette scores for sample 'Simple 08' that is represented by points. */
TEST(utest_silhouette, correct_score_simple08) {
    const auto sample = SAMPLE_SIMPLE::SAMPLE_SIMPLE_08;
    template_correct_scores(simple_sample_factory::create_sample(sample), answer_reader::read(sample));
}


/* Tests below compare scores obtained from points with scores obtained from distance matrix for each simple sample. */
TEST(utest_silhouette, correct_score_distance_matrix_simple01) {
    const auto sample = SAMPLE_SIMPLE::SAMPLE_SIMPLE_01;
    template_correct_score_data_types(simple_sample_factory::create_sample(sample), answer_reader::read(sample));
}

TEST(utest_silhouette, correct_score_distance_matrix_simple02) {
    const auto sample = SAMPLE_SIMPLE::SAMPLE_SIMPLE_02;
    template_correct_score_data_types(simple_sample_factory::create_sample(sample), answer_reader::read(sample));
}

TEST(utest_silhouette, correct_score_distance_matrix_simple03) {
    const auto sample = SAMPLE_SIMPLE::SAMPLE_SIMPLE_03;
    template_correct_score_data_types(simple_sample_factory::create_sample(sample), answer_reader::read(sample));
}

TEST(utest_silhouette, correct_score_distance_matrix_simple04) {
    const auto sample = SAMPLE_SIMPLE::SAMPLE_SIMPLE_04;
    template_correct_score_data_types(simple_sample_factory::create_sample(sample), answer_reader::read(sample));
}

TEST(utest_silhouette, correct_score_distance_matrix_simple05) {
    const auto sample = SAMPLE_SIMPLE::SAMPLE_SIMPLE_05;
    template_correct_score_data_types(simple_sample_factory::create_sample(sample), answer_reader::read(sample));
}

TEST(utest_silhouette, correct_score_distance_matrix_simple06) {
    const auto sample = SAMPLE_SIMPLE::SAMPLE_SIMPLE_06;
    template_correct_score_data_types(simple_sample_factory::create_sample(sample), answer_reader::read(sample));
}

TEST(utest_silhouette, correct_score_distance_matrix_simple07) {
    const auto sample = SAMPLE_SIMPLE::SAMPLE_SIMPLE_07;
    template_correct_score_data_types(simple_sample_factory::create_sample(sample), answer_reader::read(sample));
}

TEST(utest_silhouette, correct_score_distance_matrix_simple08) {
    const auto sample = SAMPLE_SIMPLE::SAMPLE_SIMPLE_08;
    template_correct_score_data_types(simple_sample_factory::create_sample(sample), answer_reader::read(sample));
}
3 changes: 2 additions & 1 deletion pyclustering/cluster/dbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ def __init__(self, data, eps, neighbors, ccore = True, **kwargs):
"""!
@brief Constructor of clustering algorithm DBSCAN.
@param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
@param[in] data (list): Input data that is presented as list of points or distance matrix (defined by parameter
'data_type', by default data is considered as a list of points).
@param[in] eps (double): Connectivity radius between points, points may be connected if distance between them is less than the radius.
@param[in] neighbors (uint): Minimum number of shared neighbors that is required to establish links between points.
@param[in] ccore (bool): If True then DLL CCORE (C++ solution) will be used for solving the problem.
Expand Down
12 changes: 6 additions & 6 deletions pyclustering/cluster/kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ def __init__(self):
@brief Initializer of observer of K-Means algorithm.
"""
self.__evolution_clusters = []
self.__evolution_centers = []
self.__initial_centers = []
self.__evolution_clusters = []
self.__evolution_centers = []
self.__initial_centers = []


def __len__(self):
Expand Down Expand Up @@ -168,9 +168,9 @@ def show_clusters(sample, clusters, centers, initial_centers = None, **kwargs):
display = kwargs.get('display', True)

if figure is None:
figure = visualizer.show(display = False)
figure = visualizer.show(display=False)
else:
visualizer.show(figure = figure, display = False)
visualizer.show(figure=figure, display=False)

kmeans_visualizer.__draw_centers(figure, offset, visualizer, centers, initial_centers)
kmeans_visualizer.__draw_rays(figure, offset, visualizer, sample, clusters, centers)
Expand Down Expand Up @@ -229,7 +229,7 @@ def __draw_centers(figure, offset, visualizer, centers, initial_centers):


@staticmethod
def animate_cluster_allocation(data, observer, animation_velocity = 500, movie_fps = 1, save_movie = None):
def animate_cluster_allocation(data, observer, animation_velocity=500, movie_fps=1, save_movie=None):
"""!
@brief Animates clustering process that is performed by K-Means algorithm.
Expand Down
13 changes: 10 additions & 3 deletions pyclustering/cluster/silhouette.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,19 +85,23 @@ def __init__(self, data, clusters, **kwargs):
"""!
@brief Initializes Silhouette method for analysis.
@param[in] data (array_like): Input data that was used for cluster analysis.
@param[in] data (array_like): Input data that was used for cluster analysis and that is presented as list of
points or distance matrix (defined by parameter 'data_type', by default data is considered as a list
of points).
@param[in] clusters (list): Clusters that have been obtained after cluster analysis.
@param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'data_type', 'ccore').
<b>Keyword Args:</b><br>
- metric (distance_metric): Metric that was used for cluster analysis and should be used for Silhouette
score calculation (by default Square Euclidean distance).
- data_type (string): Data type of input sample 'data' that is processed by the algorithm ('points', 'distance_matrix').
- ccore (bool): If True then CCORE (C++ implementation of pyclustering library) is used (by default True).
"""
self.__data = data
self.__clusters = clusters
self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
self.__data_type = kwargs.get('data_type', 'points')

if self.__metric.get_type() != type_metric.USER_DEFINED:
self.__metric.enable_numpy_usage()
Expand Down Expand Up @@ -135,7 +139,7 @@ def __process_by_ccore(self):
"""
ccore_metric = metric_wrapper.create_instance(self.__metric)
self.__score = wrapper.silhoeutte(self.__data, self.__clusters, ccore_metric.get_pointer())
self.__score = wrapper.silhoeutte(self.__data, self.__clusters, ccore_metric.get_pointer(), self.__data_type)


def __process_by_python(self):
Expand Down Expand Up @@ -168,7 +172,10 @@ def __calculate_score(self, index_point, index_cluster):
@return (float) Silhouette score for the object.
"""
difference = self.__calculate_dataset_difference(index_point)
if self.__data_type == 'points':
difference = self.__calculate_dataset_difference(index_point)
else:
difference = self.__data[index_point]

a_score = self.__calculate_within_cluster_score(index_cluster, difference)
b_score = self.__caclulate_optimal_neighbor_cluster_score(index_cluster, difference)
Expand Down
Loading

0 comments on commit 7f0d5be

Please sign in to comment.