Skip to content

Commit

Permalink
#578: 'random_state' parameter for cluster module. #606: type correct…
Browse files Browse the repository at this point in the history
…ion - 'dict' instead of 'list'.
  • Loading branch information
annoviko committed Jun 15, 2020
1 parent 41c37cf commit b19c298
Show file tree
Hide file tree
Showing 25 changed files with 893 additions and 441 deletions.
6 changes: 6 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ CHANGE NOTES FOR 0.10.0 (STARTED Jan 24, 2020), (RELEASED: Dev -)

GENERAL CHANGES:

- Introduced parameter `random_state` for algorithms/models that use random functionality: `kmeans++`, `random_center_initializer`, `ga`, `gmeans`, `xmeans`, `som`, `somsc`, `elbow`, `silhouette_ksearch` (Python: `pyclustering.cluster`; C++: `pyclustering::clst`).
See: https://github.com/annoviko/pyclustering/issues/578

- Introduced parameter `k_max` to G-Means algorithm to use it as an optional stop condition for the algorithm (Python: `pyclustering.cluster.gmeans`; C++: `pyclustering::clst::gmeans`).
See: https://github.com/annoviko/pyclustering/issues/602

Expand Down Expand Up @@ -33,6 +36,9 @@ GENERAL CHANGES:

CORRECTED MAJOR BUGS:

- Bug with wrong data type for `scores` in Silhouette K-search algorithm in case of using C++ (Python: `pyclustering.cluster.silhouette`).
See: https://github.com/annoviko/pyclustering/issues/606

- Bug with a random distribution in the random center initializer (Python: `pyclustering.cluster.center_initializer`).
See: https://github.com/annoviko/pyclustering/issues/573

Expand Down
52 changes: 51 additions & 1 deletion ccore/include/pyclustering/cluster/silhouette_ksearch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,18 @@ class silhouette_ksearch_allocator {
*/
virtual void allocate(const std::size_t p_amount, const dataset & p_data, cluster_sequence & p_clusters) = 0;

/*!
@brief Performs cluster analysis in order to allocate the specified amount of clusters from the input data.
@param[in] p_amount: amount of clusters that should be allocated.
@param[in] p_data: input data for cluster analysis.
@param[in] p_random_state: seed for random state (value `RANDOM_STATE_CURRENT_TIME` means the current system time is going to be used as a seed).
@param[out] p_clusters: container where result (allocated clusters) is placed.
*/
virtual void allocate(const std::size_t p_amount, const dataset & p_data, const long long p_random_state, cluster_sequence & p_clusters) = 0;
};


Expand All @@ -96,6 +108,18 @@ class kmeans_allocator : public silhouette_ksearch_allocator {
*/
virtual void allocate(const std::size_t p_amount, const dataset & p_data, cluster_sequence & p_clusters) override;

/*!
@brief Performs cluster analysis using K-Means algorithm.
@param[in] p_amount: amount of clusters that should be allocated.
@param[in] p_data: input data for cluster analysis.
@param[in] p_random_state: seed for random state (value `RANDOM_STATE_CURRENT_TIME` means the current system time is going to be used as a seed).
@param[out] p_clusters: container where result (allocated clusters) is placed.
*/
virtual void allocate(const std::size_t p_amount, const dataset & p_data, const long long p_random_state, cluster_sequence & p_clusters) override;
};


Expand All @@ -118,6 +142,18 @@ class kmedians_allocator : public silhouette_ksearch_allocator {
*/
virtual void allocate(const std::size_t p_amount, const dataset & p_data, cluster_sequence & p_clusters) override;

/*!
@brief Performs cluster analysis using K-Medians algorithm.
@param[in] p_amount: amount of clusters that should be allocated.
@param[in] p_data: input data for cluster analysis.
@param[in] p_random_state: seed for random state (value `RANDOM_STATE_CURRENT_TIME` means the current system time is going to be used as a seed).
@param[out] p_clusters: container where result (allocated clusters) is placed.
*/
virtual void allocate(const std::size_t p_amount, const dataset & p_data, const long long p_random_state, cluster_sequence & p_clusters) override;
};


Expand All @@ -140,6 +176,18 @@ class kmedoids_allocator : public silhouette_ksearch_allocator {
*/
virtual void allocate(const std::size_t p_amount, const dataset & p_data, cluster_sequence & p_clusters) override;

/*!
@brief Performs cluster analysis using K-Medoids algorithm.
@param[in] p_amount: amount of clusters that should be allocated.
@param[in] p_data: input data for cluster analysis.
@param[in] p_random_state: seed for random state (value `RANDOM_STATE_CURRENT_TIME` means the current system time is going to be used as a seed).
@param[out] p_clusters: container where result (allocated clusters) is placed.
*/
virtual void allocate(const std::size_t p_amount, const dataset & p_data, const long long p_random_state, cluster_sequence & p_clusters) override;
};


Expand All @@ -156,6 +204,7 @@ class silhouette_ksearch {
std::size_t m_kmin;
std::size_t m_kmax;
silhouette_ksearch_allocator::ptr m_allocator = std::make_shared<kmeans_allocator>();
long long m_random_state;

public:
/*!
Expand All @@ -172,9 +221,10 @@ class silhouette_ksearch {
@param[in] p_kmin: minimum amount of clusters that might be allocated.
@param[in] p_kmax: maximum amount of clusters that might be allocated.
@param[in] p_allocator: strategy that is used to allocate clusters or in other words, to perform cluster analysis.
@param[in] p_random_state: seed for random state (by default is `RANDOM_STATE_CURRENT_TIME`, current system time is used).
*/
silhouette_ksearch(const std::size_t p_kmin, const std::size_t p_kmax, const silhouette_ksearch_allocator::ptr & p_allocator = std::make_shared<kmeans_allocator>());
silhouette_ksearch(const std::size_t p_kmin, const std::size_t p_kmax, const silhouette_ksearch_allocator::ptr & p_allocator = std::make_shared<kmeans_allocator>(), const long long p_random_state = RANDOM_STATE_CURRENT_TIME);

/*!
Expand Down
24 changes: 18 additions & 6 deletions ccore/include/pyclustering/cluster/silhouette_ksearch_data.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ class silhouette_ksearch_data {
@return Optimal amount of clusters.
*/
const std::size_t get_amount() const { return m_amount; }
const std::size_t get_amount() const;

/*!
Expand All @@ -65,7 +65,7 @@ class silhouette_ksearch_data {
@param[in] p_amount: optimal amount of clusters.
*/
void set_amount(const std::size_t p_amount) { m_amount = p_amount; }
void set_amount(const std::size_t p_amount);

/*!
Expand All @@ -74,7 +74,7 @@ class silhouette_ksearch_data {
@return Optimal silhouette score that has been found during the analysis.
*/
const double get_score() const { return m_score; }
const double get_score() const;

/*!
Expand All @@ -83,7 +83,7 @@ class silhouette_ksearch_data {
@param[in] p_score: optimal silhouette score that has been found during the analysis.
*/
void set_score(const double p_score) { m_score = p_score; }
void set_score(const double p_score);

/*!
Expand All @@ -92,7 +92,7 @@ class silhouette_ksearch_data {
@return Constant reference to silhouette score for each K value (amount of clusters).
*/
const silhouette_score_sequence & scores() const { return m_scores; }
const silhouette_score_sequence & scores() const;

/*!
Expand All @@ -101,7 +101,19 @@ class silhouette_ksearch_data {
@return Reference to silhouette score for each K value (amount of clusters).
*/
silhouette_score_sequence & scores() { return m_scores; }
silhouette_score_sequence & scores();

public:
/*!
@brief Compares Silhouette K-search results.
@param[in] p_other: another Silhouette K-search result that is used for comparison.
@return Returns true if both objects are the same.
*/
bool operator==(const silhouette_ksearch_data & p_other) const;
};


Expand Down
46 changes: 24 additions & 22 deletions ccore/include/pyclustering/interface/silhouette_interface.h
Original file line number Diff line number Diff line change
@@ -1,25 +1,25 @@
/**
*
* @authors Andrei Novikov ([email protected])
* @date 2014-2020
* @copyright GNU Public License
*
* GNU_PUBLIC_LICENSE
* pyclustering is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* pyclustering is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
/*!
@authors Andrei Novikov ([email protected])
@date 2014-2020
@copyright GNU Public License
@cond GNU_PUBLIC_LICENSE
pyclustering is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
pyclustering is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
@endcond
*/
#pragma once


Expand Down Expand Up @@ -98,6 +98,7 @@ extern "C" DECLARATION pyclustering_package * silhouette_algorithm(
* @param[in] p_kmin: minimum amount of clusters that should be considered.
* @param[in] p_kmax: maximum amount of clusters that should be considered.
* @param[in] p_algorithm: cluster allocator that is used by Silhouette K-Search method.
* @param[in] p_random_state: seed for random state (by default is `RANDOM_STATE_CURRENT_TIME`, current system time is used).
*
* @return Returns Silhouette K-Search results as a pyclustering package [ [ amount of clusters], [ optimal score ], [ score for each K ] ].
*
Expand All @@ -106,4 +107,5 @@ extern "C" DECLARATION pyclustering_package * silhouette_ksearch_algorithm(
const pyclustering_package * const p_sample,
const std::size_t p_kmin,
const std::size_t p_kmax,
const std::size_t p_algorithm);
const std::size_t p_algorithm,
const long long p_random_state);
1 change: 1 addition & 0 deletions ccore/src/ccore.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
<ClCompile Include="cluster\rock.cpp" />
<ClCompile Include="cluster\silhouette.cpp" />
<ClCompile Include="cluster\silhouette_ksearch.cpp" />
<ClCompile Include="cluster\silhouette_ksearch_data.cpp" />
<ClCompile Include="cluster\somsc.cpp" />
<ClCompile Include="cluster\syncnet.cpp" />
<ClCompile Include="cluster\ttsas.cpp" />
Expand Down
3 changes: 3 additions & 0 deletions ccore/src/ccore.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,9 @@
<ClCompile Include="container\kdtree_balanced.cpp">
<Filter>Source Files\container</Filter>
</ClCompile>
<ClCompile Include="cluster\silhouette_ksearch_data.cpp">
<Filter>Source Files\cluster</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\include\pyclustering\cluster\agglomerative.hpp">
Expand Down
70 changes: 44 additions & 26 deletions ccore/src/cluster/silhouette_ksearch.cpp
Original file line number Diff line number Diff line change
@@ -1,25 +1,27 @@
/**
*
* @authors Andrei Novikov ([email protected])
* @date 2014-2020
* @copyright GNU Public License
*
* GNU_PUBLIC_LICENSE
* pyclustering is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* pyclustering is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
/*!
@authors Andrei Novikov ([email protected])
@date 2014-2020
@copyright GNU Public License
@cond GNU_PUBLIC_LICENSE
pyclustering is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
pyclustering is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
@endcond
*/


#include <pyclustering/cluster/silhouette_ksearch.hpp>

#include <pyclustering/cluster/kmeans_plus_plus.hpp>
Expand All @@ -37,8 +39,13 @@ namespace clst {


// Unseeded overload: forwards to the overload that takes a random state, passing
// `RANDOM_STATE_CURRENT_TIME` (documented in the header as "use the current system time as a seed").
void kmeans_allocator::allocate(const std::size_t p_amount, const dataset & p_data, cluster_sequence & p_clusters) {
allocate(p_amount, p_data, RANDOM_STATE_CURRENT_TIME, p_clusters);
}


void kmeans_allocator::allocate(const std::size_t p_amount, const dataset & p_data, const long long p_random_state, cluster_sequence & p_clusters) {
dataset initial_centers;
kmeans_plus_plus(p_amount).initialize(p_data, initial_centers);
kmeans_plus_plus(p_amount, 1, p_random_state).initialize(p_data, initial_centers);

kmeans_data result;
kmeans(initial_centers).process(p_data, result);
Expand All @@ -48,8 +55,13 @@ void kmeans_allocator::allocate(const std::size_t p_amount, const dataset & p_da


// Unseeded overload: forwards to the overload that takes a random state, passing
// `RANDOM_STATE_CURRENT_TIME` (documented in the header as "use the current system time as a seed").
void kmedians_allocator::allocate(const std::size_t p_amount, const dataset & p_data, cluster_sequence & p_clusters) {
allocate(p_amount, p_data, RANDOM_STATE_CURRENT_TIME, p_clusters);
}


void kmedians_allocator::allocate(const std::size_t p_amount, const dataset & p_data, const long long p_random_state, cluster_sequence & p_clusters) {
dataset initial_medians;
kmeans_plus_plus(p_amount).initialize(p_data, initial_medians);
kmeans_plus_plus(p_amount, 1, p_random_state).initialize(p_data, initial_medians);

kmedians_data result;
kmedians(initial_medians).process(p_data, result);
Expand All @@ -59,8 +71,13 @@ void kmedians_allocator::allocate(const std::size_t p_amount, const dataset & p_


// Unseeded overload: forwards to the overload that takes a random state, passing
// `RANDOM_STATE_CURRENT_TIME` (documented in the header as "use the current system time as a seed").
void kmedoids_allocator::allocate(const std::size_t p_amount, const dataset & p_data, cluster_sequence & p_clusters) {
allocate(p_amount, p_data, RANDOM_STATE_CURRENT_TIME, p_clusters);
}


void kmedoids_allocator::allocate(const std::size_t p_amount, const dataset & p_data, const long long p_random_state, cluster_sequence & p_clusters) {
medoid_sequence initial_medoids;
kmeans_plus_plus(p_amount).initialize(p_data, initial_medoids);
kmeans_plus_plus(p_amount, 1, p_random_state).initialize(p_data, initial_medoids);

kmedoids_data result;
kmedoids(initial_medoids).process(p_data, result);
Expand All @@ -70,10 +87,11 @@ void kmedoids_allocator::allocate(const std::size_t p_amount, const dataset & p_



silhouette_ksearch::silhouette_ksearch(const std::size_t p_kmin, const std::size_t p_kmax, const silhouette_ksearch_allocator::ptr & p_allocator) :
silhouette_ksearch::silhouette_ksearch(const std::size_t p_kmin, const std::size_t p_kmax, const silhouette_ksearch_allocator::ptr & p_allocator, const long long p_random_state) :
m_kmin(p_kmin),
m_kmax(p_kmax),
m_allocator(p_allocator)
m_allocator(p_allocator),
m_random_state(p_random_state)
{
if (m_kmin <= 1) {
throw std::invalid_argument("K min value '" + std::to_string(m_kmin) +
Expand All @@ -92,7 +110,7 @@ void silhouette_ksearch::process(const dataset & p_data, silhouette_ksearch_data

for (std::size_t k = m_kmin; k < m_kmax; k++) {
cluster_sequence clusters;
m_allocator->allocate(k, p_data, clusters);
m_allocator->allocate(k, p_data, m_random_state, clusters);

if (clusters.size() != k) {
p_result.scores().push_back(std::nan("1"));
Expand Down
Loading

0 comments on commit b19c298

Please sign in to comment.