envoyproxy · htuch · Feb 8, 2019 · Jan 18, 2019 · Jan 18, 2019 · Jan 22, 2019
diff --git a/api/envoy/api/v2/core/health_check.proto b/api/envoy/api/v2/core/health_check.proto
@@ -252,4 +252,7 @@ enum HealthStatus {
   // Health check timed out. This is part of HDS and is interpreted by Envoy as
   // *UNHEALTHY*.
   TIMEOUT = 4;
+
+  // Degraded. Envoy marks hosts for endpoints reporting this status as degraded via EDS.
+  DEGRADED = 5;
 }
diff --git a/include/envoy/upstream/upstream.h b/include/envoy/upstream/upstream.h
@@ -48,7 +48,9 @@ class Host : virtual public HostDescription {
   /* The host is currently marked as unhealthy by EDS. */                        \
   m(FAILED_EDS_HEALTH, 0x04)                                                     \
   /* The host is currently marked as degraded through active health checking. */ \
-  m(DEGRADED_ACTIVE_HC, 0x08)
+  m(DEGRADED_ACTIVE_HC, 0x08)                                                    \
+  /* The host is currently marked as degraded by EDS. */                         \
+  m(DEGRADED_EDS_HEALTH, 0x10)
   // clang-format on
 
 #define DECLARE_ENUM(name, value) name = value,

diff --git a/source/common/upstream/eds.cc b/source/common/upstream/eds.cc
@@ -75,7 +75,7 @@ void EdsClusterImpl::onConfigUpdate(const ResourceVector& resources, const std::
     for (const auto& lb_endpoint : locality_lb_endpoint.lb_endpoints()) {
       priority_state_manager.registerHostForPriority(
           "", resolveProtoAddress(lb_endpoint.endpoint().address()), locality_lb_endpoint,
-          lb_endpoint, Host::HealthFlag::FAILED_EDS_HEALTH);
+          lb_endpoint);
     }
   }
 

diff --git a/source/common/upstream/logical_dns_cluster.cc b/source/common/upstream/logical_dns_cluster.cc
@@ -113,7 +113,7 @@ void LogicalDnsCluster::startResolve() {
             PriorityStateManager priority_state_manager(*this, local_info_);
             priority_state_manager.initializePriorityFor(locality_lb_endpoint);
             priority_state_manager.registerHostForPriority(logical_host_, locality_lb_endpoint,
-                                                           lbEndpoint(), absl::nullopt);
+                                                           lbEndpoint());
 
             const uint32_t priority = locality_lb_endpoint.priority();
             priority_state_manager.updateClusterPrioritySet(

diff --git a/source/common/upstream/upstream_impl.cc b/source/common/upstream/upstream_impl.cc
@@ -170,6 +170,46 @@ parseExtensionProtocolOptions(const envoy::api::v2::Cluster& config) {
   return options;
 }
 
+// Synchronizes one health flag on an existing host with the value carried by an updated host.
+// @param updated_host the new host whose flag value is read.
+// @param existing_host the host whose flag is set or cleared to match.
+// @param flag the health flag to synchronize.
+// @return bool true only if the flag differed and changing it altered existing_host.health().
+bool updateHealthFlags(const Host& updated_host, Host& existing_host, Host::HealthFlag flag) {
+  const bool desired = updated_host.healthFlagGet(flag);
+
+  // Nothing to do when both hosts already agree on this flag.
+  if (existing_host.healthFlagGet(flag) == desired) {
+    return false;
+  }
+
+  // Snapshot the overall health so we can tell whether flipping the flag changed it.
+  const auto health_before = existing_host.health();
+  if (desired) {
+    existing_host.healthFlagSet(flag);
+  } else {
+    existing_host.healthFlagClear(flag);
+  }
+
+  return health_before != existing_host.health();
+}
+
+// Sets the EDS health flag on the host that corresponds to health_status; never clears flags.
+void setEdsHealthFlag(Host& host, envoy::api::v2::core::HealthStatus health_status) {
+  switch (health_status) {
+  case envoy::api::v2::core::HealthStatus::UNHEALTHY:
+  case envoy::api::v2::core::HealthStatus::DRAINING:
+  case envoy::api::v2::core::HealthStatus::TIMEOUT:
+    host.healthFlagSet(Host::HealthFlag::FAILED_EDS_HEALTH);
+    break;
+  case envoy::api::v2::core::HealthStatus::DEGRADED:
+    host.healthFlagSet(Host::HealthFlag::DEGRADED_EDS_HEALTH);
+    break;
+  default:
+    break; // No health flags should be set.
+  }
+}
+
 } // namespace
 
 Host::CreateConnectionData HostImpl::createConnection(
@@ -861,32 +901,24 @@ void PriorityStateManager::initializePriorityFor(
 void PriorityStateManager::registerHostForPriority(
     const std::string& hostname, Network::Address::InstanceConstSharedPtr address,
     const envoy::api::v2::endpoint::LocalityLbEndpoints& locality_lb_endpoint,
-    const envoy::api::v2::endpoint::LbEndpoint& lb_endpoint,
-    const absl::optional<Upstream::Host::HealthFlag> health_checker_flag) {
+    const envoy::api::v2::endpoint::LbEndpoint& lb_endpoint) {
   const HostSharedPtr host(
       new HostImpl(parent_.info(), hostname, address, lb_endpoint.metadata(),
                    lb_endpoint.load_balancing_weight().value(), locality_lb_endpoint.locality(),
                    lb_endpoint.endpoint().health_check_config(), locality_lb_endpoint.priority()));
-  registerHostForPriority(host, locality_lb_endpoint, lb_endpoint, health_checker_flag);
+  registerHostForPriority(host, locality_lb_endpoint, lb_endpoint);
 }
 
 void PriorityStateManager::registerHostForPriority(
     const HostSharedPtr& host,
     const envoy::api::v2::endpoint::LocalityLbEndpoints& locality_lb_endpoint,
-    const envoy::api::v2::endpoint::LbEndpoint& lb_endpoint,
-    const absl::optional<Upstream::Host::HealthFlag> health_checker_flag) {
+    const envoy::api::v2::endpoint::LbEndpoint& lb_endpoint) {
   const uint32_t priority = locality_lb_endpoint.priority();
   // Should be called after initializePriorityFor.
   ASSERT(priority_state_[priority].first);
   priority_state_[priority].first->emplace_back(host);
-  if (health_checker_flag.has_value()) {
-    const auto& health_status = lb_endpoint.health_status();
-    if (health_status == envoy::api::v2::core::HealthStatus::UNHEALTHY ||
-        health_status == envoy::api::v2::core::HealthStatus::DRAINING ||
-        health_status == envoy::api::v2::core::HealthStatus::TIMEOUT) {
-      priority_state_[priority].first->back()->healthFlagSet(health_checker_flag.value());
-    }
-  }
+
+  setEdsHealthFlag(*priority_state_[priority].first->back(), lb_endpoint.health_status());
 }
 
 void PriorityStateManager::updateClusterPrioritySet(
@@ -980,7 +1012,7 @@ StaticClusterImpl::StaticClusterImpl(
     for (const auto& lb_endpoint : locality_lb_endpoint.lb_endpoints()) {
       priority_state_manager_->registerHostForPriority(
           "", resolveProtoAddress(lb_endpoint.endpoint().address()), locality_lb_endpoint,
-          lb_endpoint, absl::nullopt);
+          lb_endpoint);
     }
   }
 }
@@ -1067,24 +1099,10 @@ bool BaseDynamicClusterImpl::updateDynamicHostList(const HostVector& new_hosts,
         max_host_weight = host->weight();
       }
 
-      if (existing_host->second->healthFlagGet(Host::HealthFlag::FAILED_EDS_HEALTH) !=
-          host->healthFlagGet(Host::HealthFlag::FAILED_EDS_HEALTH)) {
-        // TODO(snowp): To accommodate degraded, this bit should be checking for any changes
-        // to the health flag, not just healthy vs not healthy.
-        const bool previously_healthy = existing_host->second->health() == Host::Health::Healthy;
-        if (host->healthFlagGet(Host::HealthFlag::FAILED_EDS_HEALTH)) {
-          existing_host->second->healthFlagSet(Host::HealthFlag::FAILED_EDS_HEALTH);
-          // If the host was previously healthy and we're now unhealthy, we need to
-          // rebuild.
-          hosts_changed |= previously_healthy;
-        } else {
-          existing_host->second->healthFlagClear(Host::HealthFlag::FAILED_EDS_HEALTH);
-          // If the host was previously unhealthy and now healthy, we need to
-          // rebuild.
-          hosts_changed |=
-              !previously_healthy && existing_host->second->health() == Host::Health::Healthy;
-        }
-      }
+      hosts_changed |=
+          updateHealthFlags(*host, *existing_host->second, Host::HealthFlag::FAILED_EDS_HEALTH);
+      hosts_changed |=
+          updateHealthFlags(*host, *existing_host->second, Host::HealthFlag::DEGRADED_EDS_HEALTH);
 
       // Did metadata change?
       const bool metadata_changed = !Protobuf::util::MessageDifferencer::Equivalent(
@@ -1258,7 +1276,7 @@ void StrictDnsClusterImpl::updateAllHosts(const HostVector& hosts_added,
     for (const HostSharedPtr& host : target->hosts_) {
       if (target->locality_lb_endpoint_.priority() == current_priority) {
         priority_state_manager.registerHostForPriority(host, target->locality_lb_endpoint_,
-                                                       target->lb_endpoint_, absl::nullopt);
+                                                       target->lb_endpoint_);
       }
     }
   }
@@ -1308,6 +1326,7 @@ void StrictDnsClusterImpl::ResolveTarget::startResolve() {
               lb_endpoint_.metadata(), lb_endpoint_.load_balancing_weight().value(),
               locality_lb_endpoint_.locality(), lb_endpoint_.endpoint().health_check_config(),
               locality_lb_endpoint_.priority()));
+          setEdsHealthFlag(*new_hosts.back(), lb_endpoint_.health_status());
         }
 
         HostVector hosts_added;

diff --git a/source/common/upstream/upstream_impl.h b/source/common/upstream/upstream_impl.h
@@ -206,7 +206,8 @@ class HostImpl : public HostDescriptionImpl,
     }
 
     // Only possible option at this point is that the host is degraded.
-    ASSERT(health_flags_ == static_cast<uint32_t>(HealthFlag::DEGRADED_ACTIVE_HC));
+    ASSERT(healthFlagGet(HealthFlag::DEGRADED_ACTIVE_HC) ||
+           healthFlagGet(HealthFlag::DEGRADED_EDS_HEALTH));
     return Host::Health::Degraded;
   }
 
@@ -665,14 +666,12 @@ class PriorityStateManager : protected Logger::Loggable<Logger::Id::upstream> {
   registerHostForPriority(const std::string& hostname,
                           Network::Address::InstanceConstSharedPtr address,
                           const envoy::api::v2::endpoint::LocalityLbEndpoints& locality_lb_endpoint,
-                          const envoy::api::v2::endpoint::LbEndpoint& lb_endpoint,
-                          const absl::optional<Upstream::Host::HealthFlag> health_checker_flag);
+                          const envoy::api::v2::endpoint::LbEndpoint& lb_endpoint);
 
   void
   registerHostForPriority(const HostSharedPtr& host,
                           const envoy::api::v2::endpoint::LocalityLbEndpoints& locality_lb_endpoint,
-                          const envoy::api::v2::endpoint::LbEndpoint& lb_endpoint,
-                          const absl::optional<Upstream::Host::HealthFlag> health_checker_flag);
+                          const envoy::api::v2::endpoint::LbEndpoint& lb_endpoint);
 
   void
   updateClusterPrioritySet(const uint32_t priority, HostVectorSharedPtr&& current_hosts,

diff --git a/source/server/http/admin.cc b/source/server/http/admin.cc
@@ -168,10 +168,14 @@ void setHealthFlag(Upstream::Host::HealthFlag flag, const Upstream::Host& host,
         host.healthFlagGet(Upstream::Host::HealthFlag::FAILED_OUTLIER_CHECK));
     break;
   case Upstream::Host::HealthFlag::FAILED_EDS_HEALTH:
-    health_status.set_eds_health_status(
-        host.healthFlagGet(Upstream::Host::HealthFlag::FAILED_EDS_HEALTH)
-            ? envoy::api::v2::core::HealthStatus::UNHEALTHY
-            : envoy::api::v2::core::HealthStatus::HEALTHY);
+  case Upstream::Host::HealthFlag::DEGRADED_EDS_HEALTH:
+    if (host.healthFlagGet(Upstream::Host::HealthFlag::FAILED_EDS_HEALTH)) {
+      health_status.set_eds_health_status(envoy::api::v2::core::HealthStatus::UNHEALTHY);
+    } else if (host.healthFlagGet(Upstream::Host::HealthFlag::DEGRADED_EDS_HEALTH)) {
+      health_status.set_eds_health_status(envoy::api::v2::core::HealthStatus::DEGRADED);
+    } else {
+      health_status.set_eds_health_status(envoy::api::v2::core::HealthStatus::HEALTHY);
+    }
     break;
   case Upstream::Host::HealthFlag::DEGRADED_ACTIVE_HC:
     health_status.set_failed_active_degraded_check(

diff --git a/test/common/upstream/eds_test.cc b/test/common/upstream/eds_test.cc
@@ -332,6 +332,7 @@ TEST_F(EdsTest, EndpointHealthStatus) {
           {envoy::api::v2::core::HealthStatus::UNHEALTHY, Host::Health::Unhealthy},
           {envoy::api::v2::core::HealthStatus::DRAINING, Host::Health::Unhealthy},
           {envoy::api::v2::core::HealthStatus::TIMEOUT, Host::Health::Unhealthy},
+          {envoy::api::v2::core::HealthStatus::DEGRADED, Host::Health::Degraded},
       };
 
   int port = 80;
@@ -416,6 +417,37 @@ TEST_F(EdsTest, EndpointHealthStatus) {
     hosts[0]->healthFlagClear(Host::HealthFlag::FAILED_ACTIVE_HC);
     EXPECT_EQ(Host::Health::Healthy, hosts[0]->health());
   }
+
+  // Snapshot the no-rebuild counter so the updates below can be compared against it.
+  const auto no_rebuild_count = stats_.counter("cluster.name.update_no_rebuild").value();
+  // Now mark host 0 degraded via EDS, it should be degraded.
+  endpoints->mutable_lb_endpoints(0)->set_health_status(
+      envoy::api::v2::core::HealthStatus::DEGRADED);
+  VERBOSE_EXPECT_NO_THROW(cluster_->onConfigUpdate(resources, ""));
+  {
+    auto& hosts = cluster_->prioritySet().hostSetsPerPriority()[0]->hosts();
+    EXPECT_EQ(Host::Health::Degraded, hosts[0]->health());
+  }
+
+  // We should rebuild the cluster since we went from healthy -> degraded, so the no-rebuild
+  // counter must not have moved.
+  EXPECT_EQ(no_rebuild_count, stats_.counter("cluster.name.update_no_rebuild").value());
+
+  // Now mark the host as having been degraded through active hc.
+  cluster_->prioritySet().hostSetsPerPriority()[0]->hosts()[0]->healthFlagSet(
+      Host::HealthFlag::DEGRADED_ACTIVE_HC);
+
+  // Now mark host 0 healthy via EDS, it should still be degraded.
+  endpoints->mutable_lb_endpoints(0)->set_health_status(
+      envoy::api::v2::core::HealthStatus::HEALTHY);
+  VERBOSE_EXPECT_NO_THROW(cluster_->onConfigUpdate(resources, ""));
+  {
+    auto& hosts = cluster_->prioritySet().hostSetsPerPriority()[0]->hosts();
+    EXPECT_EQ(Host::Health::Degraded, hosts[0]->health());
+  }
+
+  // Since the host health didn't change, expect no rebuild.
+  EXPECT_EQ(no_rebuild_count + 1, stats_.counter("cluster.name.update_no_rebuild").value());
 }
 
 // Validate that onConfigUpdate() removes endpoints that are marked as healthy

diff --git a/test/common/upstream/upstream_impl_test.cc b/test/common/upstream/upstream_impl_test.cc
@@ -463,6 +463,7 @@ TEST(StrictDnsClusterImplTest, LoadAssignmentBasic) {
                 port_value: 11001
             health_check_config:
               port_value: 8000
+          health_status: DEGRADED
         - endpoint:
             address:
               socket_address:
@@ -532,6 +533,10 @@ TEST(StrictDnsClusterImplTest, LoadAssignmentBasic) {
   EXPECT_EQ("localhost1", cluster.prioritySet().hostSetsPerPriority()[0]->hosts()[0]->hostname());
   EXPECT_EQ("localhost1", cluster.prioritySet().hostSetsPerPriority()[0]->hosts()[1]->hostname());
   EXPECT_EQ(100, cluster.prioritySet().hostSetsPerPriority()[0]->overprovisioningFactor());
+  EXPECT_EQ(Host::Health::Degraded,
+            cluster.prioritySet().hostSetsPerPriority()[0]->hosts()[0]->health());
+  EXPECT_EQ(Host::Health::Degraded,
+            cluster.prioritySet().hostSetsPerPriority()[0]->hosts()[1]->health());
 
   // This is the first time we receveived an update for localhost1, we expect to rebuild.
   EXPECT_EQ(0UL, stats.counter("cluster.name.update_no_rebuild").value());
@@ -590,7 +595,8 @@ TEST(StrictDnsClusterImplTest, LoadAssignmentBasic) {
       std::list<std::string>({"127.0.0.3:11001", "10.0.0.1:11002"}),
       ContainerEq(hostListToAddresses(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())));
 
-  EXPECT_EQ(2UL, cluster.prioritySet().hostSetsPerPriority()[0]->healthyHosts().size());
+  EXPECT_EQ(1UL, cluster.prioritySet().hostSetsPerPriority()[0]->healthyHosts().size());
+  EXPECT_EQ(1UL, cluster.prioritySet().hostSetsPerPriority()[0]->degradedHosts().size());
   EXPECT_EQ(1UL, cluster.prioritySet().hostSetsPerPriority()[0]->hostsPerLocality().get().size());
   EXPECT_EQ(1UL,
             cluster.prioritySet().hostSetsPerPriority()[0]->healthyHostsPerLocality().get().size());
@@ -604,7 +610,8 @@ TEST(StrictDnsClusterImplTest, LoadAssignmentBasic) {
   EXPECT_THAT(std::list<std::string>({"127.0.0.3:11001", "10.0.0.1:11002", "10.0.0.1:11002"}),
               ContainerEq(hostListToAddresses(hosts)));
 
-  EXPECT_EQ(3UL, cluster.prioritySet().hostSetsPerPriority()[0]->healthyHosts().size());
+  EXPECT_EQ(2UL, cluster.prioritySet().hostSetsPerPriority()[0]->healthyHosts().size());
+  EXPECT_EQ(1UL, cluster.prioritySet().hostSetsPerPriority()[0]->degradedHosts().size());
   EXPECT_EQ(1UL, cluster.prioritySet().hostSetsPerPriority()[0]->hostsPerLocality().get().size());
   EXPECT_EQ(1UL,
             cluster.prioritySet().hostSetsPerPriority()[0]->healthyHostsPerLocality().get().size());
@@ -860,6 +867,18 @@ TEST(HostImplTest, HealthFlags) {
   // If the degraded flag is the only thing set, host is degraded.
   host->healthFlagClear(Host::HealthFlag::FAILED_ACTIVE_HC);
   EXPECT_EQ(Host::Health::Degraded, host->health());
+
+  // If the EDS and active degraded flag is set, host is degraded.
+  host->healthFlagSet(Host::HealthFlag::DEGRADED_EDS_HEALTH);
+  EXPECT_EQ(Host::Health::Degraded, host->health());
+
+  // If only the EDS degraded is set, host is degraded.
+  host->healthFlagClear(Host::HealthFlag::DEGRADED_ACTIVE_HC);
+  EXPECT_EQ(Host::Health::Degraded, host->health());
+
+  // If EDS and failed active hc is set, host is unhealthy.
+  host->healthFlagSet(Host::HealthFlag::FAILED_ACTIVE_HC);
+  EXPECT_EQ(Host::Health::Unhealthy, host->health());
 }
 
 TEST(StaticClusterImplTest, InitialHosts) {
@@ -1088,6 +1107,48 @@ TEST(StaticClusterImplTest, LoadAssignmentLocality) {
   EXPECT_FALSE(cluster.info()->addedViaApi());
 }
 
+// Verifies that a STATIC cluster applies health_status: DEGRADED from its load assignment.
+TEST(StaticClusterImplTest, LoadAssignmentEdsHealth) {
+  Stats::IsolatedStoreImpl stats;
+  Ssl::MockContextManager ssl_context_manager;
+  NiceMock<Event::MockDispatcher> dispatcher;
+  NiceMock<Runtime::MockLoader> runtime;
+  NiceMock<LocalInfo::MockLocalInfo> local_info;
+  NiceMock<Runtime::MockRandomGenerator> random;
+  const std::string cluster_yaml = R"EOF(
+    name: staticcluster
+    connect_timeout: 0.25s
+    type: STATIC
+    lb_policy: ROUND_ROBIN
+    load_assignment:
+      policy:
+        overprovisioning_factor: 100
+      endpoints:
+      - lb_endpoints:
+        - endpoint:
+            address:
+              socket_address:
+                address: 10.0.0.1
+                port_value: 443
+            health_check_config:
+              port_value: 8000
+          health_status: DEGRADED
+  )EOF";
+
+  NiceMock<MockClusterManager> cm;
+  envoy::api::v2::Cluster config = parseClusterFromV2Yaml(cluster_yaml);
+  Envoy::Stats::ScopePtr scope = stats.createScope(fmt::format(
+      "cluster.{}.", config.alt_stat_name().empty() ? config.name() : config.alt_stat_name()));
+  Envoy::Server::Configuration::TransportSocketFactoryContextImpl factory_context(
+      ssl_context_manager, *scope, cm, local_info, dispatcher, random, stats);
+  StaticClusterImpl cluster(config, runtime, factory_context, std::move(scope), false);
+  cluster.initialize([] {});
+
+  const auto& host_set = *cluster.prioritySet().hostSetsPerPriority()[0];
+  EXPECT_EQ(1UL, host_set.degradedHosts().size());
+  EXPECT_EQ(Host::Health::Degraded, host_set.hosts()[0]->health());
+}
+
 TEST(StaticClusterImplTest, AltStatName) {
   Stats::IsolatedStoreImpl stats;
   Ssl::MockContextManager ssl_context_manager;

diff --git a/test/server/http/admin_test.cc b/test/server/http/admin_test.cc
@@ -1033,6 +1033,8 @@ TEST_P(AdminInstanceTest, ClustersJson) {
       .WillByDefault(Return(false));
   ON_CALL(*host, healthFlagGet(Upstream::Host::HealthFlag::DEGRADED_ACTIVE_HC))
       .WillByDefault(Return(true));
+  ON_CALL(*host, healthFlagGet(Upstream::Host::HealthFlag::DEGRADED_EDS_HEALTH))
+      .WillByDefault(Return(true));
 
   ON_CALL(host->outlier_detector_, successRate()).WillByDefault(Return(43.2));
   ON_CALL(*host, weight()).WillByDefault(Return(5));
@@ -1089,7 +1091,7 @@ TEST_P(AdminInstanceTest, ClustersJson) {
       },
      ],
      "health_status": {
-      "eds_health_status": "HEALTHY",
+      "eds_health_status": "DEGRADED",
       "failed_active_health_check": true,
       "failed_outlier_check": true,
       "failed_active_degraded_check": true