Skip to content

Commit

Permalink
[voctree] Run kmeans trials in parallel
Browse files Browse the repository at this point in the history
Currently the kmeans trials are run sequentially, and only the summing of
distances inside each trial is parallelized across all cores. This has higher
overhead compared to performing all trials in parallel and then
parallelizing the summing within each trial, because fewer threads are started
up. numTrials is currently 5, so the number of thread startups is up to 5 times
lower than before.
  • Loading branch information
p12tic committed Oct 10, 2022
1 parent 7101e19 commit 9b0aa70
Showing 1 changed file with 32 additions and 8 deletions.
40 changes: 32 additions & 8 deletions src/aliceVision/voctree/SimpleKmeans.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "distance.hpp"
#include "DefaultAllocator.hpp"

#include <aliceVision/alicevision_omp.hpp>
#include <aliceVision/system/Logger.hpp>

#include <boost/function.hpp>
Expand Down Expand Up @@ -80,9 +81,17 @@ struct InitKmeanspp
centers.clear();
centers.resize(k);

auto threadCount = std::min(numTrials, omp_get_max_threads());

std::vector<squared_distance_type> dists(features.size(), std::numeric_limits<squared_distance_type>::max());
std::vector<squared_distance_type> distsTemp(features.size(), std::numeric_limits<squared_distance_type>::max());
std::vector<squared_distance_type> distsTempBest(features.size(), std::numeric_limits<squared_distance_type>::max());
std::vector<std::vector<squared_distance_type>> threadDistsTemp(threadCount);
for (int i = 0; i < threadCount; ++i)
{
// Data will be overwritten, can be initialized to anything.
threadDistsTemp[i].resize(features.size());
}

typename std::vector<squared_distance_type>::iterator dstiter;
typename std::vector<Feature*>::const_iterator featiter;

Expand All @@ -101,6 +110,10 @@ struct InitKmeanspp
currSum += *dstiter;
}

std::mutex bestSumMutex;

std::vector<float> threadPercs(threadCount);

// iterate k-1 times
for(int i = 1; i < k; ++i)
{
Expand All @@ -109,8 +122,14 @@ struct InitKmeanspp
squared_distance_type bestSum = std::numeric_limits<squared_distance_type>::max();
std::size_t bestCenter = -1;

for (auto& perc : threadPercs)
{
perc = (float)std::rand() / RAND_MAX;
}

//make it a little bit more robust and try several guesses
// choose the one with the global minimal distance
#pragma omp parallel for num_threads(threadCount)
for(int j = 0; j < numTrials; ++j)
{
// draw an element from 0 to currSum
Expand All @@ -119,7 +138,7 @@ struct InitKmeanspp
// 0 and this sum, then start computing the sum from the first element again
// until the partial sum is greater than the number drawn:
// the previous element is what we are looking for
const float perc = (float)rand() / RAND_MAX;
const float perc = threadPercs[omp_get_thread_num()];
squared_distance_type partial = (squared_distance_type)(currSum * perc);
// look for the element that caps the partial sum that has been
// drawn
Expand Down Expand Up @@ -149,21 +168,26 @@ struct InitKmeanspp
// 2. compute the distance of each feature from the current center
squared_distance_type distSum = 0;

auto& distsTemp = threadDistsTemp[omp_get_thread_num()];

Feature newCenter = *features[ featidx ];
#pragma omp parallel for reduction(+:distSum)
#pragma omp parallel for reduction(+:distSum) num_threads(omp_get_max_threads() / threadCount)
for(ptrdiff_t it = 0; it < static_cast<ptrdiff_t>(features.size()); ++it)
{
distsTemp[it] = std::min(distance(*(features[it]), newCenter), dists[it]);
distSum += distsTemp[it];
}
if(verbose > 2) ALICEVISION_LOG_DEBUG("trial " << j << " found feat " << featidx << ": " << *features[ featidx ] << " with sum: " << distSum);

if(distSum < bestSum)
{
// save the best so far
bestSum = distSum;
bestCenter = featidx;
std::swap(distsTemp, distsTempBest);
std::lock_guard<std::mutex> lock(bestSumMutex);
if (distSum < bestSum)
{
// save the best so far
bestSum = distSum;
bestCenter = featidx;
std::swap(distsTemp, distsTempBest);
}
}

}
Expand Down

0 comments on commit 9b0aa70

Please sign in to comment.