1 change: 1 addition & 0 deletions FlucomaClients.cmake
@@ -144,6 +144,7 @@ add_client(DataSetQuery clients/nrt/DataSetQueryClient.hpp CLASS NRTThreadedData
add_client(LabelSet clients/nrt/LabelSetClient.hpp CLASS NRTThreadedLabelSetClient GROUP MANIPULATION)
add_client(KDTree clients/nrt/KDTreeClient.hpp CLASS NRTThreadedKDTreeClient GROUP MANIPULATION)
add_client(KMeans clients/nrt/KMeansClient.hpp CLASS NRTThreadedKMeansClient GROUP MANIPULATION)
+ add_client(SKMeans clients/nrt/SKMeansClient.hpp CLASS NRTThreadedSKMeansClient GROUP MANIPULATION)
add_client(KNNClassifier clients/nrt/KNNClassifierClient.hpp CLASS NRTThreadedKNNClassifierClient GROUP MANIPULATION)
add_client(KNNRegressor clients/nrt/KNNRegressorClient.hpp CLASS NRTThreadedKNNRegressorClient GROUP MANIPULATION)
add_client(Normalize clients/nrt/NormalizeClient.hpp CLASS NRTThreadedNormalizeClient GROUP MANIPULATION)
6 changes: 3 additions & 3 deletions include/algorithms/public/KMeans.hpp
@@ -109,7 +109,7 @@ class KMeans
out <<= _impl::asFluid(mAssignments);
}

- void getDistances(RealMatrixView data, RealMatrixView out) const
+ void transform(RealMatrixView data, RealMatrixView out) const
{
Eigen::ArrayXXd points = _impl::asEigen<Eigen::Array>(data);
Eigen::ArrayXXd D = fluid::algorithm::DistanceMatrix(points, 2);
@@ -118,8 +118,8 @@
out <<= _impl::asFluid(D);
}

- private:
- double distance(Eigen::ArrayXd v1, Eigen::ArrayXd v2) const
+ protected:
+ double distance(const Eigen::ArrayXd& v1, const Eigen::ArrayXd& v2) const
{
return (v1 - v2).matrix().norm();
}
121 changes: 121 additions & 0 deletions include/algorithms/public/SKMeans.hpp
@@ -0,0 +1,121 @@
/*
Part of the Fluid Corpus Manipulation Project (http://www.flucoma.org/)
Copyright 2017-2019 University of Huddersfield.
Licensed under the BSD-3 License.
See license.md file in the project root for full license information.
This project has received funding from the European Research Council (ERC)
under the European Union’s Horizon 2020 research and innovation programme
(grant agreement No 725899).
*/

#pragma once

#include "../public/KMeans.hpp"
#include "../util/FluidEigenMappings.hpp"
#include "../../data/FluidDataSet.hpp"
#include "../../data/FluidIndex.hpp"
#include "../../data/FluidTensor.hpp"
#include "../../data/TensorTypes.hpp"
#include <Eigen/Core>
#include <queue>
#include <string>

namespace fluid {
namespace algorithm {

class SKMeans : public KMeans
{

public:
void train(const FluidDataSet<std::string, double, 1>& dataset, index k,
index maxIter)
{
using namespace Eigen;
using namespace _impl;
assert(!mTrained || (dataset.pointSize() == mDims && mK == k));
MatrixXd dataPoints = asEigen<Matrix>(dataset.getData());
MatrixXd dataPointsT = dataPoints.transpose();
if (mTrained) { mAssignments = assignClusters(dataPointsT);}
else
{
mK = k;
mDims = dataset.pointSize();
initMeans(dataPoints);
}

while (maxIter-- > 0)
{
mEmbedding = mMeans.matrix() * dataPointsT;
auto assignments = assignClusters(mEmbedding);
if (!changed(assignments)) { break; }
else
mAssignments = assignments;
updateEmbedding();
computeMeans(dataPoints);
}
mTrained = true;
}


void encode(RealMatrixView data, RealMatrixView out,
double alpha = 0.25) const
{
using namespace Eigen;
MatrixXd points = _impl::asEigen<Matrix>(data).transpose();
MatrixXd embedding = (mMeans.matrix() * points).array() - alpha;
Member (inline review comment):
Are we sure about baking in the encoding scheme from Coates and Ng here?

I guess the argument in favour is that it enables recreating their feature learning scheme with the fewest objects. The arguments against would be that it's not strictly part of skmeans, but was a separate step used by C&N specifically in the feature learning setting, and they do discuss alternatives.

Obviously having it here doesn't preclude using an alternative scheme, because alpha can be set to 0. However, the other question is whether this encoder could be useful for other things if it were factored out, e.g. using NMF for feature learning?

Member Author:
The soft thresholding function is similar to neural network activation functions, so NNFuncs is where it would fit best, but I don't think these functions would deserve their own client, so in practice I would still see this as part of the SKMeans client; factoring it out would therefore not help with code duplication. An interesting idea (maybe for the future?) could be to have a feature learning client that could use different learning techniques and encodings. For the moment, maybe it can be introduced as an option for SKMeans. We can also use an MLP as an autoencoder for feature learning.

Member:
Ok, let's roll with what we have and see how we get on. I like the idea of some future feature learning object that could make it easy to explore options and manage some of the complexity / fiddliness.
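
For reference, the encoding under discussion is a dot product against the unit-norm centroids followed by a shifted rectifier, f_k(x) = max(0, c_k · x - alpha), as in Coates & Ng. A minimal standalone sketch in Eigen (illustrative only; the function name and signature below are hypothetical, not part of the FluCoMa API):

#include <Eigen/Core>

// Illustrative sketch of the soft-threshold encoding, not FluCoMa code.
// `centroids` is k x d with unit-norm rows, `points` is n x d (one point per
// row). Returns the n x k feature matrix f_ik = max(0, c_k . x_i - alpha).
Eigen::MatrixXd softThresholdEncode(const Eigen::MatrixXd& centroids,
                                    const Eigen::MatrixXd& points,
                                    double alpha = 0.25)
{
  Eigen::ArrayXXd shifted = (points * centroids.transpose()).array() - alpha;
  return shifted.max(0.0).matrix(); // clamp negative responses to zero
}

With alpha set to 0 this reduces to a rectified projection onto the centroids, which is the workaround mentioned above for avoiding the baked-in thresholding scheme.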

embedding = (embedding.array() > 0).select(embedding, 0).transpose();
out <<= _impl::asFluid(embedding);
}

private:

void initMeans(Eigen::MatrixXd& dataPoints)
{
using namespace Eigen;
mMeans = ArrayXXd::Zero(mK, mDims);
mAssignments =
((0.5 + (0.5 * ArrayXd::Random(dataPoints.rows()))) * (mK - 1))
.round()
.cast<int>();
mEmbedding = MatrixXd::Zero(mK, dataPoints.rows());
for (index i = 0; i < dataPoints.rows(); i++)
mEmbedding(mAssignments(i), i) = 1;
computeMeans(dataPoints);
}

void updateEmbedding()
{
for (index i = 0; i < mAssignments.cols(); i++)
{
double val = mEmbedding(mAssignments(i), i);
mEmbedding.col(i).setZero();
mEmbedding(mAssignments(i), i) = val;
}
}


Eigen::VectorXi assignClusters(Eigen::MatrixXd& embedding) const
{
Eigen::VectorXi assignments = Eigen::VectorXi::Zero(embedding.cols());
for (index i = 0; i < embedding.cols(); i++)
{
Eigen::VectorXd::Index maxIndex;
embedding.col(i).maxCoeff(&maxIndex);
assignments(i) = static_cast<int>(maxIndex);
}
return assignments;
}


void computeMeans(Eigen::MatrixXd& dataPoints)
{
mMeans = mEmbedding * dataPoints;
mMeans.matrix().rowwise().normalize();
}


private:
Eigen::MatrixXd mEmbedding;
};
} // namespace algorithm
} // namespace fluid
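
As a reading aid for train() and computeMeans() above: unlike plain k-means, each point is assigned to the mean with the largest dot product (the means are kept at unit norm, so this is cosine similarity), and the means are renormalised after every update. Below is a self-contained Eigen sketch of one textbook spherical k-means iteration; it is an illustration under those assumptions with hypothetical names, not the FluCoMa implementation.

#include <Eigen/Core>

// Illustrative sketch of a single spherical k-means iteration, not FluCoMa code.
// `data` is n x d (one point per row); `centroids` is k x d with unit-norm rows.
// Returns the updated unit-norm centroids and writes each point's cluster index
// into `assignments`.
Eigen::MatrixXd sphericalKMeansStep(const Eigen::MatrixXd& data,
                                    const Eigen::MatrixXd& centroids,
                                    Eigen::VectorXi&       assignments)
{
  Eigen::Index n = data.rows();
  Eigen::Index k = centroids.rows();
  // Cosine similarities (plain dot products, since rows are unit norm): k x n
  Eigen::MatrixXd similarity = centroids * data.transpose();
  assignments.resize(n);
  Eigen::MatrixXd updated = Eigen::MatrixXd::Zero(k, data.cols());
  for (Eigen::Index i = 0; i < n; ++i)
  {
    Eigen::Index best;
    similarity.col(i).maxCoeff(&best);    // most similar centroid wins
    assignments(i) = static_cast<int>(best);
    updated.row(best) += data.row(i);     // accumulate the assigned points
  }
  for (Eigen::Index j = 0; j < k; ++j)    // project each centroid back onto
  {                                       // the unit sphere
    double norm = updated.row(j).norm();
    if (norm > 0) updated.row(j) /= norm; // leave empty clusters at zero
  }
  return updated;
}

In the class itself the accumulation happens in one shot as mEmbedding * dataPoints, where updateEmbedding() has zeroed every column of mEmbedding except the winning entry, so each mean is a similarity-weighted sum of its assigned points before being renormalised by computeMeans().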
4 changes: 2 additions & 2 deletions include/clients/nrt/KMeansClient.hpp
@@ -146,7 +146,7 @@ class KMeansClient : public FluidBaseClient,

StringVectorView ids = srcDataSet.getIds();
RealMatrix output(srcDataSet.size(), mAlgorithm.size());
- mAlgorithm.getDistances(srcDataSet.getData(), output);
+ mAlgorithm.transform(srcDataSet.getData(), output);
FluidDataSet<string, double, 1> result(ids, output);
destPtr->setDataSet(result);
return OK();
@@ -224,7 +224,7 @@
RealMatrix dest(1, mAlgorithm.size());
src.row(0) <<=
BufferAdaptor::ReadAccess(in.get()).samps(0, mAlgorithm.dims(), 0);
- mAlgorithm.getDistances(src, dest);
+ mAlgorithm.transform(src, dest);
outBuf.allFrames()(Slice(0, 1), Slice(0, mAlgorithm.size())) <<= dest;
return OK();
}