[voctree] Run kmeans trials in parallel

Currently the kmeans trials are run sequentially, and each trial is parallelized across all cores only when summing distances. This has higher overhead than running all trials in parallel and then parallelizing the summation within each trial, because the latter approach starts up fewer threads. numTrials is currently 5, so the number of thread startups is up to 5 times lower than before.
alicevision · Oct 10, 2022 · 9b0aa70 · 9b0aa70
1 parent 7101e19
commit 9b0aa70
Showing 1 changed file with 32 additions and 8 deletions.
diff --git a/src/aliceVision/voctree/SimpleKmeans.hpp b/src/aliceVision/voctree/SimpleKmeans.hpp
@@ -9,6 +9,7 @@
 #include "distance.hpp"
 #include "DefaultAllocator.hpp"
 
+#include <aliceVision/alicevision_omp.hpp>
 #include <aliceVision/system/Logger.hpp>
 
 #include <boost/function.hpp>
@@ -80,9 +81,17 @@ struct InitKmeanspp
     centers.clear();
     centers.resize(k);
 
+    auto threadCount = std::min(numTrials, omp_get_max_threads());
+
     std::vector<squared_distance_type> dists(features.size(), std::numeric_limits<squared_distance_type>::max());
-    std::vector<squared_distance_type> distsTemp(features.size(), std::numeric_limits<squared_distance_type>::max());
     std::vector<squared_distance_type> distsTempBest(features.size(), std::numeric_limits<squared_distance_type>::max());
+    std::vector<std::vector<squared_distance_type>> threadDistsTemp(threadCount);
+    for (int i = 0; i < threadCount; ++i)
+    {
+        // Data will be overwritten, can be initialized to anything.
+        threadDistsTemp[i].resize(features.size());
+    }
+
     typename std::vector<squared_distance_type>::iterator dstiter;
     typename std::vector<Feature*>::const_iterator featiter;
 
@@ -101,6 +110,10 @@ struct InitKmeanspp
       currSum += *dstiter;
     }
 
+    std::mutex bestSumMutex;
+
+    std::vector<float> threadPercs(threadCount);
+
     // iterate k-1 times
     for(int i = 1; i < k; ++i)
     {
@@ -109,8 +122,14 @@ struct InitKmeanspp
       squared_distance_type bestSum = std::numeric_limits<squared_distance_type>::max();
       std::size_t bestCenter = -1;
 
+      for (auto& perc : threadPercs)
+      {
+          perc = (float)std::rand() / RAND_MAX;
+      }
+
       //make it a little bit more robust and try several guesses
       // choose the one with the global minimal distance
+#pragma omp parallel for num_threads(threadCount)
       for(int j = 0; j < numTrials; ++j)
       {
         // draw an element from 0 to currSum
@@ -119,7 +138,7 @@ struct InitKmeanspp
         // 0 and this sum, then start compute the sum from the first element again
         // until the partial sum is greater than the number drawn: the
         // the previous element is what we are looking for
-        const float perc = (float)rand() / RAND_MAX;
+        const float perc = threadPercs[omp_get_thread_num()];
         squared_distance_type partial = (squared_distance_type)(currSum * perc);
         // look for the element that cap the partial sum that has been
         // drawn
@@ -149,21 +168,26 @@ struct InitKmeanspp
         // 2. compute the distance of each feature from the current center
         squared_distance_type distSum = 0;
 
+        auto& distsTemp = threadDistsTemp[omp_get_thread_num()];
+
         Feature newCenter = *features[ featidx ];
-        #pragma omp parallel for reduction(+:distSum)
+        #pragma omp parallel for reduction(+:distSum) num_threads(omp_get_max_threads() / threadCount)
         for(ptrdiff_t it = 0; it < static_cast<ptrdiff_t>(features.size()); ++it)
         {
           distsTemp[it] = std::min(distance(*(features[it]), newCenter), dists[it]);
           distSum += distsTemp[it];
         }
         if(verbose > 2) ALICEVISION_LOG_DEBUG("trial " << j << " found feat " << featidx << ": " << *features[ featidx ] << " with sum: " << distSum);
 
-        if(distSum < bestSum)
         {
-          // save the best so far
-          bestSum = distSum;
-          bestCenter = featidx;
-          std::swap(distsTemp, distsTempBest);
+            std::lock_guard<std::mutex> lock(bestSumMutex);
+            if (distSum < bestSum)
+            {
+                // save the best so far
+                bestSum = distSum;
+                bestCenter = featidx;
+                std::swap(distsTemp, distsTempBest);
+            }
         }
 
       }