envoyproxy · mattklein123 · Oct 11, 2018 · Sep 17, 2018 · Sep 21, 2018 · Oct 2, 2018
diff --git a/docs/root/intro/arch_overview/load_balancing.rst b/docs/root/intro/arch_overview/load_balancing.rst
@@ -143,26 +143,6 @@ percentage of healthy hosts multiplied by the overprovisioning factor drops
 below 100. The default value is 1.4, so a priority level or locality will not be
 considered unhealthy until the percentage of healthy endpoints goes below 72%.
 
-.. _arch_overview_load_balancing_panic_threshold:
-
-Panic threshold
----------------
-
-During load balancing, Envoy will generally only consider healthy hosts in an upstream cluster.
-However, if the percentage of healthy hosts in the cluster becomes too low, Envoy will disregard
-health status and balance amongst all hosts. This is known as the *panic threshold*. The default
-panic threshold is 50%. This is :ref:`configurable <config_cluster_manager_cluster_runtime>` via
-runtime as well as in the :ref:`cluster configuration
-<envoy_api_field_Cluster.CommonLbConfig.healthy_panic_threshold>`. The panic threshold
-is used to avoid a situation in which host failures cascade throughout the cluster as load
-increases.
-
-Note that panic thresholds are *per-priority*. This means that if the percentage of healthy nodes
-in a single priority goes below the threshold, that priority will enter panic mode. In general
-it is discouraged to use panic thresholds in conjunction with priorities, as by the time enough
-nodes are unhealthy to trigger the panic threshold most of the traffic should already have spilled
-over to the next priority level.
-
 .. _arch_overview_load_balancing_priority_levels:
 
 Priority levels
@@ -247,6 +227,72 @@ To sum this up in pseudo algorithms:
   total_health = min(100, Σ(health(P_0)...health(P_X))
   load to P_X = 100 - Σ(percent_load(P_0)..percent_load(P_X-1))
 
+.. _arch_overview_load_balancing_panic_threshold:
+
+Panic threshold
+---------------
+
+During load balancing, Envoy will generally only consider healthy hosts in an upstream cluster.
+However, if the percentage of healthy hosts in the cluster becomes too low, Envoy will disregard
+health status and balance amongst all hosts. This is known as the *panic threshold*. The default
+panic threshold is 50%. This is :ref:`configurable <config_cluster_manager_cluster_runtime>` via
+runtime as well as in the :ref:`cluster configuration
+<envoy_api_field_Cluster.CommonLbConfig.healthy_panic_threshold>`. The panic threshold
+is used to avoid a situation in which host failures cascade throughout the cluster as load
+increases.
+
+Panic thresholds work in conjunction with priorities. If the number of healthy hosts in a given
+priority goes down, Envoy will try to shift some traffic to lower priorities. If it succeeds in
+finding enough healthy hosts in lower priorities, Envoy will disregard panic thresholds. In
+mathematical terms, if the total health across all priority levels is 100%, Envoy disregards panic
+thresholds but continues to distribute traffic load across priorities according to the algorithm described :ref:`here <arch_overview_load_balancing_priority_levels>`.
+
+The following examples explain the relationship between total health and the panic threshold. It is
+assumed that the default value of 50% is used for the panic threshold.
+
+Assume a simple set-up with 2 priority levels, where P=1 is 100% healthy. In this scenario total
+health is always 100%, P=0 never enters panic mode, and Envoy is able to shift all traffic to P=1.
+
++----------------------------+--------------------+
+| P=0 healthy endpoints      | P=0 in panic       |
++============================+====================+
+| 100%                       | NO                 |
++----------------------------+--------------------+
+| 72%                        | NO                 |
++----------------------------+--------------------+
+| 71%                        | NO                 |
++----------------------------+--------------------+
+| 50%                        | NO                 |
++----------------------------+--------------------+
+| 25%                        | NO                 |
++----------------------------+--------------------+
+| 0%                         | NO                 |
++----------------------------+--------------------+
+
+If P=1 becomes unhealthy, the panic threshold continues to be disregarded until the sum of the health
+of P=0 and P=1 goes below 100%. At this point Envoy starts checking the panic threshold value for each
+priority.
+
++------------------------+-------------------------+-----------------+-----------------+-----------------+
+| P=0 healthy endpoints  | P=1 healthy endpoints   | Total health    |  P=0 in panic   | P=1 in panic    |
++========================+=========================+=================+=================+=================+
+| 100%                   |  100%                   | 100%            |   NO            | NO              | 
++------------------------+-------------------------+-----------------+-----------------+-----------------+
+| 72%                    |  72%                    | 100%            |   NO            | NO              |
++------------------------+-------------------------+-----------------+-----------------+-----------------+
+| 71%                    |  71%                    | 100%            |   NO            | NO              |
++------------------------+-------------------------+-----------------+-----------------+-----------------+
+| 50%                    |  50%                    | 100%            |   NO            | NO              |
++------------------------+-------------------------+-----------------+-----------------+-----------------+
+| 25%                    |  100%                   | 100%            |   NO            | NO              |
++------------------------+-------------------------+-----------------+-----------------+-----------------+
+| 25%                    |  25%                    | 70%             |   YES           | YES             |
++------------------------+-------------------------+-----------------+-----------------+-----------------+
+| 5%                     |  65%                    | 98%             |   YES           | NO              |
++------------------------+-------------------------+-----------------+-----------------+-----------------+
+
+Note that panic thresholds can be configured *per-priority*.
+
 .. _arch_overview_load_balancing_zone_aware_routing:
 
 Zone aware routing

diff --git a/source/common/upstream/load_balancer_impl.cc b/source/common/upstream/load_balancer_impl.cc
@@ -47,12 +47,29 @@ LoadBalancerBase::LoadBalancerBase(const PrioritySet& priority_set, ClusterStats
     recalculatePerPriorityState(host_set->priority(), priority_set_, per_priority_load_,
                                 per_priority_health_);
   }
+  // Recalculate panic mode for all levels.
+  recalculatePerPriorityPanic();
+
   priority_set_.addMemberUpdateCb([this](uint32_t priority, const HostVector&,
                                          const HostVector&) -> void {
     recalculatePerPriorityState(priority, priority_set_, per_priority_load_, per_priority_health_);
   });
+  priority_set_.addMemberUpdateCb(
+      [this](uint32_t priority, const HostVector&, const HostVector&) -> void {
+        UNREFERENCED_PARAMETER(priority);
+        recalculatePerPriorityPanic();
+      });
 }
 
+// The following cases are handled by the recalculatePerPriorityState and
+// recalculatePerPriorityPanic methods:
+// - Total health is 100. It means there are enough healthy hosts to handle the load.
+//   Do not enter panic mode, even if a specific priority has a low number of healthy hosts.
+// - Total health is < 100. There are not enough healthy hosts to handle the load. Continue
+//   distributing the load among priority sets, but turn on panic mode for a given priority
+//   if the # of healthy hosts in that priority set is low.
+// - Total health is 0. All hosts are down. Redirect 100% of traffic to P=0 and enable panic mode.
+
 void LoadBalancerBase::recalculatePerPriorityState(uint32_t priority,
                                                    const PrioritySet& priority_set,
                                                    PriorityLoad& per_priority_load,
@@ -71,17 +88,20 @@ void LoadBalancerBase::recalculatePerPriorityState(uint32_t priority,
                                  host_set.healthyHosts().size() / host_set.hosts().size()));
   }
 
-  // Now that we've updated health for the changed priority level, we need to caculate percentage
+  // Now that we've updated health for the changed priority level, we need to calculate percentage
   // load for all priority levels.
+
   //
   // First, determine if the load needs to be scaled relative to health. For example if there are
   // 3 host sets with 20% / 20% / 10% health they will get 40% / 40% / 20% load to ensure total load
   // adds up to 100.
-  const uint32_t total_health = std::min<uint32_t>(
+  // The sum of the priority levels' health values may exceed 100, so it is capped at 100 and
+  // referred to as normalized total health.
+  const uint32_t normalized_total_health = std::min<uint32_t>(
       std::accumulate(per_priority_health.begin(), per_priority_health.end(), 0), 100);
-  if (total_health == 0) {
+  if (normalized_total_health == 0) {
     // Everything is terrible. Send all load to P=0.
-    // In this one case sumEntries(per_priority_load_) != 100 since we sinkhole all traffic in P=0.
+    // In this one case sumEntries(per_priority_load) != 100 since we sinkhole all traffic in P=0.
     per_priority_load[0] = 100;
     return;
   }
@@ -95,7 +115,7 @@ void LoadBalancerBase::recalculatePerPriorityState(uint32_t priority,
     // Now assign as much load as possible to the high priority levels and cease assigning load
     // when total_load runs out.
     per_priority_load[i] =
-        std::min<uint32_t>(total_load, per_priority_health[i] * 100 / total_health);
+        std::min<uint32_t>(total_load, per_priority_health[i] * 100 / normalized_total_health);
     total_load -= per_priority_load[i];
   }
 
@@ -107,6 +127,29 @@ void LoadBalancerBase::recalculatePerPriorityState(uint32_t priority,
   }
 }
 
+// Recomputes per_priority_panic_ for every priority level from per_priority_health_.
+void LoadBalancerBase::recalculatePerPriorityPanic() {
+  per_priority_panic_.resize(priority_set_.hostSetsPerPriority().size());
+
+  const uint32_t normalized_total_health = std::min<uint32_t>(
+      std::accumulate(per_priority_health_.begin(), per_priority_health_.end(), 0), 100);
+
+  if (normalized_total_health == 0) {
+    // Everything is terrible. All load should be to P=0, so only P=0 is switched to panic mode.
+    ASSERT(per_priority_load_[0] == 100);
+    per_priority_panic_[0] = true;
+    return;
+  }
+
+  for (size_t i = 0; i < per_priority_health_.size(); ++i) {
+    // Decide for each level whether it runs in panic mode. Panic is never set when the normalized
+    // total health is 100, even if an individual priority level has very few healthy hosts.
+    const HostSet& priority_host_set = *priority_set_.hostSetsPerPriority()[i];
+    per_priority_panic_[i] =
+        (normalized_total_health == 100 ? false : isGlobalPanic(priority_host_set));
+  }
+}
+
 HostSet& LoadBalancerBase::chooseHostSet(LoadBalancerContext* context) {
   if (context) {
     const auto& per_priority_load =
@@ -389,7 +432,7 @@ ZoneAwareLoadBalancerBase::hostSourceToUse(LoadBalancerContext* context) {
   hosts_source.priority_ = host_set.priority();
 
   // If the selected host set has insufficient healthy hosts, return all hosts.
-  if (isGlobalPanic(host_set)) {
+  if (per_priority_panic_[hosts_source.priority_]) {
     stats_.lb_healthy_panic_.inc();
     hosts_source.source_type_ = HostsSource::SourceType::AllHosts;
     return hosts_source;

diff --git a/source/common/upstream/load_balancer_impl.h b/source/common/upstream/load_balancer_impl.h
@@ -62,6 +62,7 @@ class LoadBalancerBase : public LoadBalancer {
   HostSet& chooseHostSet(LoadBalancerContext* context);
 
   uint32_t percentageLoad(uint32_t priority) const { return per_priority_load_[priority]; }
+  bool isInPanic(uint32_t priority) const { return per_priority_panic_[priority]; }
 
   ClusterStats& stats_;
   Runtime::Loader& runtime_;
@@ -77,12 +78,15 @@ class LoadBalancerBase : public LoadBalancer {
   void static recalculatePerPriorityState(uint32_t priority, const PrioritySet& priority_set,
                                           PriorityLoad& priority_load,
                                           std::vector<uint32_t>& per_priority_health);
+  void recalculatePerPriorityPanic();
 
 protected:
   // The percentage load (0-100) for each priority level
   std::vector<uint32_t> per_priority_load_;
   // The health (0-100) for each priority level.
   std::vector<uint32_t> per_priority_health_;
+  // The panic mode flag (true when in panic) for each priority level.
+  std::vector<bool> per_priority_panic_;
 };
 
 class LoadBalancerContextBase : public LoadBalancerContext {

diff --git a/source/common/upstream/maglev_lb.h b/source/common/upstream/maglev_lb.h
@@ -58,13 +58,13 @@ class MaglevLoadBalancer : public ThreadAwareLoadBalancerBase {
 
 private:
   // ThreadAwareLoadBalancerBase
-  HashingLoadBalancerSharedPtr createLoadBalancer(const HostSet& host_set) override {
+  HashingLoadBalancerSharedPtr createLoadBalancer(const HostSet& host_set, bool in_panic) override {
     // Note that we only compute global panic on host set refresh. Given that the runtime setting
     // will rarely change, this is a reasonable compromise to avoid creating extra LBs when we only
     // need to create one per priority level.
     const bool has_locality =
         host_set.localityWeights() != nullptr && !host_set.localityWeights()->empty();
-    if (isGlobalPanic(host_set)) {
+    if (in_panic) {
       if (!has_locality) {
         return std::make_shared<MaglevTable>(HostsPerLocalityImpl(host_set.hosts(), false), nullptr,
                                              table_size_);

diff --git a/source/common/upstream/ring_hash_lb.h b/source/common/upstream/ring_hash_lb.h
@@ -45,11 +45,11 @@ class RingHashLoadBalancer : public ThreadAwareLoadBalancerBase,
   typedef std::shared_ptr<const Ring> RingConstSharedPtr;
 
   // ThreadAwareLoadBalancerBase
-  HashingLoadBalancerSharedPtr createLoadBalancer(const HostSet& host_set) override {
+  HashingLoadBalancerSharedPtr createLoadBalancer(const HostSet& host_set, bool in_panic) override {
     // Note that we only compute global panic on host set refresh. Given that the runtime setting
     // will rarely change, this is a reasonable compromise to avoid creating extra LBs when we only
     // need to create one per priority level.
-    if (isGlobalPanic(host_set)) {
+    if (in_panic) {
       return std::make_shared<Ring>(config_, host_set.hosts());
     } else {
       return std::make_shared<Ring>(config_, host_set.healthyHosts());

diff --git a/source/common/upstream/thread_aware_lb_impl.cc b/source/common/upstream/thread_aware_lb_impl.cc
@@ -25,8 +25,11 @@ void ThreadAwareLoadBalancerBase::refresh() {
     const uint32_t priority = host_set->priority();
     (*per_priority_state_vector)[priority].reset(new PerPriorityState);
     const auto& per_priority_state = (*per_priority_state_vector)[priority];
-    per_priority_state->current_lb_ = createLoadBalancer(*host_set);
-    per_priority_state->global_panic_ = isGlobalPanic(*host_set);
+    // Copy the panic flag from LoadBalancerBase. It is calculated when there is a change
+    // in the host set or the hosts' health.
+    per_priority_state->global_panic_ = per_priority_panic_[priority];
+    per_priority_state->current_lb_ =
+        createLoadBalancer(*host_set, per_priority_state->global_panic_);
   }
 
   {
@@ -42,6 +45,7 @@ ThreadAwareLoadBalancerBase::LoadBalancerImpl::chooseHost(LoadBalancerContext* c
   if (per_priority_state_ == nullptr) {
     return nullptr;
   }
+
   // If there is no hash in the context, just choose a random value (this effectively becomes
   // the random LB but it won't crash if someone configures it this way).
   // computeHashKey() may be computed on demand, so get it only once.

diff --git a/source/common/upstream/thread_aware_lb_impl.h b/source/common/upstream/thread_aware_lb_impl.h
@@ -75,7 +75,8 @@ class ThreadAwareLoadBalancerBase : public LoadBalancerBase, public ThreadAwareL
     std::shared_ptr<std::vector<uint32_t>> per_priority_load_ GUARDED_BY(mutex_);
   };
 
-  virtual HashingLoadBalancerSharedPtr createLoadBalancer(const HostSet& host_set) PURE;
+  virtual HashingLoadBalancerSharedPtr createLoadBalancer(const HostSet& host_set,
+                                                          bool in_panic) PURE;
   void refresh();
 
   std::shared_ptr<LoadBalancerFactoryImpl> factory_;