Prevent new host overloading #1822
Changes from 2 commits
File: SingularityMesosOfferScheduler.java
@@ -29,6 +29,7 @@
 import com.hubspot.singularity.RequestUtilization;
 import com.hubspot.singularity.SingularityDeployStatistics;
 import com.hubspot.singularity.SingularityPendingTaskId;
+import com.hubspot.singularity.SingularitySlave;
 import com.hubspot.singularity.SingularitySlaveUsage;
 import com.hubspot.singularity.SingularitySlaveUsageWithId;
 import com.hubspot.singularity.SingularityTask;

@@ -41,13 +42,15 @@
 import com.hubspot.singularity.config.MesosConfiguration;
 import com.hubspot.singularity.config.SingularityConfiguration;
 import com.hubspot.singularity.data.DeployManager;
+import com.hubspot.singularity.data.SlaveManager;
 import com.hubspot.singularity.data.TaskManager;
 import com.hubspot.singularity.data.UsageManager;
 import com.hubspot.singularity.helpers.MesosUtils;
 import com.hubspot.singularity.helpers.SingularityMesosTaskHolder;
 import com.hubspot.singularity.mesos.SingularitySlaveUsageWithCalculatedScores.MaxProbableUsage;
 import com.hubspot.singularity.scheduler.SingularityLeaderCache;
 import com.hubspot.singularity.scheduler.SingularityScheduler;
+import com.hubspot.singularity.scheduler.SingularityUsagePoller;

 @Singleton
 public class SingularityMesosOfferScheduler {

@@ -65,6 +68,8 @@ public class SingularityMesosOfferScheduler {
   private final SingularitySlaveAndRackManager slaveAndRackManager;
   private final SingularitySlaveAndRackHelper slaveAndRackHelper;
   private final SingularityTaskSizeOptimizer taskSizeOptimizer;
+  private final SingularityUsagePoller usagePoller;
+  private final SlaveManager slaveManager;
   private final UsageManager usageManager;
   private final DeployManager deployManager;
   private final SingularitySchedulerLock lock;

@@ -89,6 +94,8 @@ public SingularityMesosOfferScheduler(MesosConfiguration mesosConfiguration,
                                         SingularityTaskSizeOptimizer taskSizeOptimizer,
                                         SingularitySlaveAndRackHelper slaveAndRackHelper,
                                         SingularityLeaderCache leaderCache,
+                                        SingularityUsagePoller usagePoller,
+                                        SlaveManager slaveManager,
                                         UsageManager usageManager,
                                         DeployManager deployManager,
                                         SingularitySchedulerLock lock) {

@@ -102,6 +109,8 @@ public SingularityMesosOfferScheduler(MesosConfiguration mesosConfiguration,
     this.slaveAndRackManager = slaveAndRackManager;
     this.taskSizeOptimizer = taskSizeOptimizer;
     this.leaderCache = leaderCache;
+    this.usagePoller = usagePoller;
+    this.slaveManager = slaveManager;
     this.slaveAndRackHelper = slaveAndRackHelper;
     this.taskPrioritizer = taskPrioritizer;
     this.usageManager = usageManager;

@@ -180,7 +189,8 @@ public Collection<SingularityOfferHolder> checkOffers(final Collection<Offer> of
             mesosConfiguration.getScoreUsingSystemLoad(),
             getMaxProbableUsageForSlave(activeTaskIds, requestUtilizations, offerHolders.get(usageWithId.getSlaveId()).getSanitizedHost()),
             mesosConfiguration.getLoad5OverloadedThreshold(),
-            mesosConfiguration.getLoad1OverloadedThreshold()
+            mesosConfiguration.getLoad1OverloadedThreshold(),
+            usageWithId.getTimestamp()
         )
     ));

@@ -196,23 +206,11 @@ public Collection<SingularityOfferHolder> checkOffers(final Collection<Offer> of
       List<CompletableFuture<Void>> scoringFutures = new ArrayList<>();
       AtomicReference<Throwable> scoringException = new AtomicReference<>(null);
       for (SingularityOfferHolder offerHolder : offerHolders.values()) {
-        if (!isOfferFull(offerHolder)) {
-          scoringFutures.add(
-              offerScoringSemaphore.call(
-                  () -> CompletableFuture.runAsync(() -> {
-                    try {
-                      double score = calculateScore(offerHolder, currentSlaveUsagesBySlaveId, tasksPerOfferHost, taskRequestHolder, activeTaskIdsForRequest, requestUtilizations.get(taskRequestHolder.getTaskRequest().getRequest().getId()));
-                      if (score != 0) {
-                        scorePerOffer.put(offerHolder.getSlaveId(), score);
-                      }
-                    } catch (Throwable t) {
-                      LOG.error("Uncaught exception while scoring offers", t);
-                      scoringException.set(t);
-                    }
-                  },
-                  offerScoringExecutor
-              )));
-        }
+        scoringFutures.add(offerScoringSemaphore.call(() ->
+            CompletableFuture.supplyAsync(() -> {
+              return buildScoringFuture(offerHolders, requestUtilizations, activeTaskIds, currentSlaveUsagesBySlaveId, tasksPerOfferHost, taskRequestHolder, scorePerOffer, activeTaskIdsForRequest, scoringException, offerHolder);
+            },
+            offerScoringExecutor)));
       }

       CompletableFutures.allOf(scoringFutures).join();

@@ -240,6 +238,59 @@ public Collection<SingularityOfferHolder> checkOffers(final Collection<Offer> of
     return offerHolders.values();
   }

+  private Void buildScoringFuture(
+      Map<String, SingularityOfferHolder> offerHolders,
+      Map<String, RequestUtilization> requestUtilizations,
+      List<SingularityTaskId> activeTaskIds,
+      Map<String, SingularitySlaveUsageWithCalculatedScores> currentSlaveUsagesBySlaveId,
+      Map<String, Integer> tasksPerOfferHost,
+      SingularityTaskRequestHolder taskRequestHolder,
+      Map<String, Double> scorePerOffer,
+      List<SingularityTaskId> activeTaskIdsForRequest,
+      AtomicReference<Throwable> scoringException,
+      SingularityOfferHolder offerHolder) {

Review comment: Nit on naming: this method isn't actually building the future; it's a synchronous method that does the actual scoring. I'd either rename it to calculateScore (or something like that), or move the supplyAsync inside this method and have it return the actual future.

+    if (isOfferFull(offerHolder)) {
+      return null;
+    }
+    String slaveId = offerHolder.getSlaveId();
+    Optional<SingularitySlaveUsageWithCalculatedScores> maybeSlaveUsage = Optional.fromNullable(currentSlaveUsagesBySlaveId.get(slaveId));
+
+    if (taskManager.getActiveTasks().stream()
+        .anyMatch(t -> t.getTaskRequest().getDeploy().getTimestamp().or(System.currentTimeMillis()) > maybeSlaveUsage.get().getTimestamp()
+            && t.getMesosTask().getSlaveId().getValue().equals(slaveId))) {
+      Optional<SingularitySlave> maybeSlave = slaveManager.getSlave(slaveId);
+      if (maybeSlave.isPresent()) {
+        usagePoller.getSlaveUsage(maybeSlave.get())
+            .whenComplete((usage, throwable) -> {
+              if (throwable == null) {
+                currentSlaveUsagesBySlaveId.put(slaveId, new SingularitySlaveUsageWithCalculatedScores(
+                    usage,
+                    mesosConfiguration.getScoreUsingSystemLoad(),
+                    getMaxProbableUsageForSlave(activeTaskIds, requestUtilizations, offerHolders.get(slaveId).getSanitizedHost()),
+                    mesosConfiguration.getLoad5OverloadedThreshold(),
+                    mesosConfiguration.getLoad1OverloadedThreshold(),
+                    usage.getTimestamp()
+                ));
+              } else {
+                throw new RuntimeException(throwable);

Review comment: Where is the handling for this runtime exception? We currently aren't calling a get or join for the future created here, which causes two issues for us:

+              }
+            });
+      }
+      return null;

Review comment: So, I want to mention the choice here; it could go either way. Currently it looks like the newly updated slave metrics will not be taken into account, because we are returning here. Wouldn't we want to continue on to the scoring, since we've gathered new metrics and put them in the map that is fed to calculateScore?

Reply: Yes, I'll go with the latter option. For some reason I was thinking only the slave usage is necessary, but it's probably safer to update the score too.

+    }
+
+    try {
+      double score = calculateScore(offerHolder, currentSlaveUsagesBySlaveId, tasksPerOfferHost, taskRequestHolder, activeTaskIdsForRequest, requestUtilizations.get(taskRequestHolder.getTaskRequest().getRequest().getId()));
+      if (score != 0) {
+        scorePerOffer.put(slaveId, score);
+      }
+    } catch (Throwable t) {
+      LOG.error("Uncaught exception while scoring offers", t);
+      scoringException.set(t);
+    }
+    return null;
+  }

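Pulling the three review comments above together, here is a minimal sketch (not the PR's final code) of what this helper could look like if it is renamed to reflect that it runs synchronously, joins the refreshed usage so a collection failure lands in the existing catch block rather than inside an unobserved future, and then falls through to scoring so the refreshed metrics actually affect the score. The name scoreOfferForSlave and the usageIsStale/refreshed locals are illustrative assumptions; the fields and helpers it uses (usagePoller, slaveManager, mesosConfiguration, taskManager, calculateScore, getMaxProbableUsageForSlave) come from the diff above.

```java
// Illustrative only: a synchronous scoring helper for SingularityMesosOfferScheduler.
// The caller still wraps it in offerScoringSemaphore.call(() -> CompletableFuture.supplyAsync(...)).
private Void scoreOfferForSlave(
    Map<String, SingularityOfferHolder> offerHolders,
    Map<String, RequestUtilization> requestUtilizations,
    List<SingularityTaskId> activeTaskIds,
    Map<String, SingularitySlaveUsageWithCalculatedScores> currentSlaveUsagesBySlaveId,
    Map<String, Integer> tasksPerOfferHost,
    SingularityTaskRequestHolder taskRequestHolder,
    Map<String, Double> scorePerOffer,
    List<SingularityTaskId> activeTaskIdsForRequest,
    AtomicReference<Throwable> scoringException,
    SingularityOfferHolder offerHolder) {
  if (isOfferFull(offerHolder)) {
    return null;
  }
  String slaveId = offerHolder.getSlaveId();
  try {
    Optional<SingularitySlaveUsageWithCalculatedScores> maybeSlaveUsage = Optional.fromNullable(currentSlaveUsagesBySlaveId.get(slaveId));

    // Guarding on isPresent() avoids the bare maybeSlaveUsage.get() in the current diff
    boolean usageIsStale = maybeSlaveUsage.isPresent() && taskManager.getActiveTasks().stream()
        .anyMatch((t) -> t.getTaskRequest().getDeploy().getTimestamp().or(System.currentTimeMillis()) > maybeSlaveUsage.get().getTimestamp()
            && t.getMesosTask().getSlaveId().getValue().equals(slaveId));

    if (usageIsStale) {
      Optional<SingularitySlave> maybeSlave = slaveManager.getSlave(slaveId);
      if (maybeSlave.isPresent()) {
        // join() so an exceptionally completed usage future surfaces in the catch block below
        SingularitySlaveUsage refreshed = usagePoller.getSlaveUsage(maybeSlave.get()).join();
        if (refreshed != null) { // collectSlaveUsage currently returns null on failure
          currentSlaveUsagesBySlaveId.put(slaveId, new SingularitySlaveUsageWithCalculatedScores(
              refreshed,
              mesosConfiguration.getScoreUsingSystemLoad(),
              getMaxProbableUsageForSlave(activeTaskIds, requestUtilizations, offerHolders.get(slaveId).getSanitizedHost()),
              mesosConfiguration.getLoad5OverloadedThreshold(),
              mesosConfiguration.getLoad1OverloadedThreshold(),
              refreshed.getTimestamp()
          ));
        }
      }
    }

    // Fall through to scoring so the refreshed usage is actually used
    double score = calculateScore(offerHolder, currentSlaveUsagesBySlaveId, tasksPerOfferHost, taskRequestHolder, activeTaskIdsForRequest,
        requestUtilizations.get(taskRequestHolder.getTaskRequest().getRequest().getId()));
    if (score != 0) {
      scorePerOffer.put(slaveId, score);
    }
  } catch (Throwable t) {
    LOG.error("Uncaught exception while scoring offers", t);
    scoringException.set(t);
  }
  return null;
}
```

The alternative mentioned in the first comment, returning the CompletableFuture from this method, would also work; per the thread, the important part is that whichever shape is chosen, the refreshed usage is both observed for errors and fed into calculateScore.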
   private MaxProbableUsage getMaxProbableUsageForSlave(List<SingularityTaskId> activeTaskIds, Map<String, RequestUtilization> requestUtilizations, String sanitizedHostname) {
     double cpu = 0;
     double memBytes = 0;

File: SingularitySlaveUsageWithCalculatedScores.java
@@ -26,7 +26,14 @@ class SingularitySlaveUsageWithCalculatedScores {
   private final double load5Threshold;
   private final double load1Threshold;

-  SingularitySlaveUsageWithCalculatedScores(SingularitySlaveUsage slaveUsage, MachineLoadMetric systemLoadMetric, MaxProbableUsage maxProbableTaskUsage, double load5Threshold, double load1Threshold) {
+  private final long timestamp;
+
+  SingularitySlaveUsageWithCalculatedScores(SingularitySlaveUsage slaveUsage,
+                                            MachineLoadMetric systemLoadMetric,
+                                            MaxProbableUsage maxProbableTaskUsage,
+                                            double load5Threshold,
+                                            double load1Threshold,
+                                            long timestamp) {
     this.slaveUsage = slaveUsage;
     this.systemLoadMetric = systemLoadMetric;
     this.maxProbableTaskUsage = maxProbableTaskUsage;

@@ -39,6 +46,7 @@ class SingularitySlaveUsageWithCalculatedScores {
     }
     this.load5Threshold = load5Threshold;
     this.load1Threshold = load1Threshold;
+    this.timestamp = timestamp;
   }

   boolean isCpuOverloaded(double estimatedNumCpusToAdd) {

@@ -121,6 +129,10 @@ SingularitySlaveUsage getSlaveUsage() {
     return diskInUseScore;
   }

+  long getTimestamp() {
+    return timestamp;
+  }
+

Review comment: You should be able to just do getSlaveUsage().getTimestamp() instead of having to store it in two places.

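A sketch of that suggestion, assuming SingularitySlaveUsage itself carries the timestamp (the scheduler change above passes usageWithId.getTimestamp() and usage.getTimestamp() into this constructor, which suggests it does): the getter delegates, and the extra field plus the new constructor parameter can go away.

```java
// Sketch: read the timestamp off the wrapped usage instead of duplicating it in a second field.
long getTimestamp() {
  return slaveUsage.getTimestamp();
}
```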
   void addEstimatedCpuUsage(double estimatedAddedCpus) {
     this.estimatedAddedCpusUsage += estimatedAddedCpus;
   }

File: SingularityUsagePoller.java
@@ -68,7 +68,7 @@ public class SingularityUsagePoller extends SingularityLeaderOnlyPoller {
   private final DeployManager deployManager;
   private final TaskManager taskManager;

-  private final AsyncSemaphore<Void> usageCollectionSemaphore;
+  private final AsyncSemaphore<SingularitySlaveUsage> usageCollectionSemaphore;
   private final ExecutorService usageExecutor;
   private final ConcurrentHashMap<String, ReentrantLock> requestLocks;

@@ -112,12 +112,12 @@ public void runActionOnPoll() {

     Map<SingularitySlaveUsage, List<TaskIdWithUsage>> overLoadedHosts = new ConcurrentHashMap<>();

-    List<CompletableFuture<Void>> usageFutures = new ArrayList<>();
+    List<CompletableFuture<SingularitySlaveUsage>> usageFutures = new ArrayList<>();

     usageHelper.getSlavesToTrackUsageFor().forEach((slave) -> {
       usageFutures.add(usageCollectionSemaphore.call(() ->
-          CompletableFuture.runAsync(() -> {
-            collectSlaveUage(slave, now, utilizationPerRequestId, previousUtilizations, overLoadedHosts, totalMemBytesUsed, totalMemBytesAvailable,
+          CompletableFuture.supplyAsync(() -> {
+            return collectSlaveUsage(slave, now, utilizationPerRequestId, previousUtilizations, overLoadedHosts, totalMemBytesUsed, totalMemBytesAvailable,
                 totalCpuUsed, totalCpuAvailable, totalDiskBytesUsed, totalDiskBytesAvailable);
           }, usageExecutor)
       ));

@@ -126,15 +126,35 @@ public void runActionOnPoll() {
     CompletableFutures.allOf(usageFutures).join();

     usageManager.saveClusterUtilization(
-        getClusterUtilization(utilizationPerRequestId, totalMemBytesUsed.get(), totalMemBytesAvailable.get(), totalCpuUsed.get(), totalCpuAvailable.get(), totalDiskBytesUsed.get(), totalDiskBytesAvailable
-            .get(), now));
+        getClusterUtilization(
+            utilizationPerRequestId, totalMemBytesUsed.get(), totalMemBytesAvailable.get(),
+            totalCpuUsed.get(), totalCpuAvailable.get(), totalDiskBytesUsed.get(), totalDiskBytesAvailable.get(), now));
     utilizationPerRequestId.values().forEach(usageManager::saveRequestUtilization);

     if (configuration.isShuffleTasksForOverloadedSlaves()) {
       shuffleTasksOnOverloadedHosts(overLoadedHosts);
     }
   }

+  public CompletableFuture<SingularitySlaveUsage> getSlaveUsage(SingularitySlave slave) {

Review comment: The only thing I find as a code smell here is that now the offer scoring flow will rely on the usage semaphore and executor having enough permits/threads. Since within the offer scoring we are already in a block that is executed async, it may be worth calling …

+    return usageCollectionSemaphore.call(() ->

Review comment: For the individual method, I'm not sure we want to make it completely async like the larger ones. This method will likely not be called from the same context as the poller itself, so it should probably fall under a different semaphore (e.g. the offer scoring one) if we want it to be async.

+        CompletableFuture.supplyAsync(() -> {
+          return collectSlaveUsage(
+              slave,
+              System.currentTimeMillis(),
+              new ConcurrentHashMap<>(),
+              usageManager.getRequestUtilizations(),
+              new ConcurrentHashMap<>(),
+              new AtomicLong(),
+              new AtomicLong(),
+              new AtomicDouble(),
+              new AtomicDouble(),
+              new AtomicLong(),
+              new AtomicLong());
+        }, usageExecutor)
+    );
+  }
+
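One way to address both comments above, sketched under the assumption that the caller should choose the semaphore and executor: expose the single-slave collection synchronously, so the offer-scoring path (which is already inside an async block guarded by the offer-scoring semaphore) never competes for usageCollectionSemaphore permits or usageExecutor threads. The method name getSlaveUsageSync is hypothetical, not from the PR.

```java
// Hypothetical synchronous variant on SingularityUsagePoller: no semaphore, no executor;
// callers decide how (and under which semaphore) to schedule it.
public SingularitySlaveUsage getSlaveUsageSync(SingularitySlave slave) {
  return collectSlaveUsage(
      slave,
      System.currentTimeMillis(),
      new ConcurrentHashMap<>(),
      usageManager.getRequestUtilizations(),
      new ConcurrentHashMap<>(),
      new AtomicLong(),
      new AtomicLong(),
      new AtomicDouble(),
      new AtomicDouble(),
      new AtomicLong(),
      new AtomicLong());
}
```

The offer scheduler could then call usagePoller.getSlaveUsageSync(maybeSlave.get()) directly from inside its scoring lambda, which already runs on offerScoringExecutor.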
   public void runWithRequestLock(Runnable function, String requestId) {
     ReentrantLock lock = requestLocks.computeIfAbsent(requestId, (r) -> new ReentrantLock());
     lock.lock();

@@ -145,17 +165,17 @@ public void runWithRequestLock(Runnable function, String requestId) {
     }
   }

-  private void collectSlaveUage(SingularitySlave slave,
-                                long now,
-                                Map<String, RequestUtilization> utilizationPerRequestId,
-                                Map<String, RequestUtilization> previousUtilizations,
-                                Map<SingularitySlaveUsage, List<TaskIdWithUsage>> overLoadedHosts,
-                                AtomicLong totalMemBytesUsed,
-                                AtomicLong totalMemBytesAvailable,
-                                AtomicDouble totalCpuUsed,
-                                AtomicDouble totalCpuAvailable,
-                                AtomicLong totalDiskBytesUsed,
-                                AtomicLong totalDiskBytesAvailable) {
+  private SingularitySlaveUsage collectSlaveUsage(SingularitySlave slave,
+                                                  long now,
+                                                  Map<String, RequestUtilization> utilizationPerRequestId,
+                                                  Map<String, RequestUtilization> previousUtilizations,
+                                                  Map<SingularitySlaveUsage, List<TaskIdWithUsage>> overLoadedHosts,
+                                                  AtomicLong totalMemBytesUsed,
+                                                  AtomicLong totalMemBytesAvailable,
+                                                  AtomicDouble totalCpuUsed,
+                                                  AtomicDouble totalCpuAvailable,
+                                                  AtomicLong totalDiskBytesUsed,
+                                                  AtomicLong totalDiskBytesAvailable) {
     Optional<Long> memoryMbTotal = Optional.absent();
     Optional<Double> cpusTotal = Optional.absent();
     Optional<Long> diskMbTotal = Optional.absent();

@@ -314,11 +334,13 @@ private void collectSlaveUage(SingularitySlave slave,

       LOG.debug("Saving slave {} usage {}", slave.getHost(), slaveUsage);
       usageManager.saveSpecificSlaveUsageAndSetCurrent(slave.getId(), slaveUsage);
+      return slaveUsage;
     } catch (Throwable t) {
       String message = String.format("Could not get slave usage for host %s", slave.getHost());
       LOG.error(message, t);
       exceptionNotifier.notify(message, t);
     }
+    return null; // TODO: is this really okay?

Review comment: I don't think this method is called anywhere else that expects a return value. You could always wrap the result in an Optional to make it more explicit that it might not be there.

   }

   private boolean isEligibleForShuffle(SingularityTaskId task) {

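Following that comment, one way to make the missing-result case explicit is sketched below. Only the signature change and the two exit points are shown; the collection logic and parameter list are unchanged, and callers such as getSlaveUsage (and the AsyncSemaphore type parameter) would need the same Optional treatment.

```java
// Sketch: collectSlaveUsage returns Guava Optional instead of null on failure.
private Optional<SingularitySlaveUsage> collectSlaveUsage(SingularitySlave slave /* , ... unchanged params */) {
  try {
    // ... existing collection logic that builds slaveUsage ...
    LOG.debug("Saving slave {} usage {}", slave.getHost(), slaveUsage);
    usageManager.saveSpecificSlaveUsageAndSetCurrent(slave.getId(), slaveUsage);
    return Optional.of(slaveUsage);
  } catch (Throwable t) {
    String message = String.format("Could not get slave usage for host %s", slave.getHost());
    LOG.error(message, t);
    exceptionNotifier.notify(message, t);
    return Optional.absent();
  }
}
```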
Review comment: Second thoughts about the setup: would it make sense to collect the additional usages in this block instead? I'm realizing that the loop below would be called for each pending task. If we hit a case where collecting a particular slave's usage is throwing exceptions or timing out, we will continue to re-check it for each pending task. Whereas, if we check in this block instead, we can just omit it up front and leave the block below as it was previously.

If we move the usage collection here, we'll likely want to convert this from a parallelStream to a list of CompletableFutures (like below) to have better control over the concurrency.
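The parallelStream block this comment refers to is not part of the excerpt above, so the following is only a guess at the shape being proposed: refresh any stale slave usages once per offer cycle, up front, with a bounded list of futures (mirroring the usageFutures pattern in the poller), and leave the per-pending-task scoring loop untouched. All local names here are illustrative.

```java
// Hypothetical up-front refresh in checkOffers, before the per-pending-task scoring loop.
// A slave whose collection fails is skipped for this cycle instead of being retried once per pending task.
List<CompletableFuture<Void>> usageRefreshFutures = new ArrayList<>();
for (SingularityOfferHolder offerHolder : offerHolders.values()) {
  String slaveId = offerHolder.getSlaveId();
  SingularitySlaveUsageWithCalculatedScores current = currentSlaveUsagesBySlaveId.get(slaveId);
  // Same staleness test as in the diff: a deploy on this slave is newer than the cached usage
  boolean stale = current != null && taskManager.getActiveTasks().stream()
      .anyMatch((t) -> t.getTaskRequest().getDeploy().getTimestamp().or(System.currentTimeMillis()) > current.getTimestamp()
          && t.getMesosTask().getSlaveId().getValue().equals(slaveId));
  Optional<SingularitySlave> maybeSlave = stale ? slaveManager.getSlave(slaveId) : Optional.<SingularitySlave>absent();
  if (!maybeSlave.isPresent()) {
    continue;
  }
  usageRefreshFutures.add(offerScoringSemaphore.call(() ->
      CompletableFuture.runAsync(() -> {
        SingularitySlaveUsage refreshed = usagePoller.getSlaveUsage(maybeSlave.get()).join();
        if (refreshed != null) {
          currentSlaveUsagesBySlaveId.put(slaveId, new SingularitySlaveUsageWithCalculatedScores(
              refreshed,
              mesosConfiguration.getScoreUsingSystemLoad(),
              getMaxProbableUsageForSlave(activeTaskIds, requestUtilizations, offerHolder.getSanitizedHost()),
              mesosConfiguration.getLoad5OverloadedThreshold(),
              mesosConfiguration.getLoad1OverloadedThreshold(),
              refreshed.getTimestamp()
          ));
        }
      }, offerScoringExecutor)));
}
CompletableFutures.allOf(usageRefreshFutures).join();
```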