diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst index a33f3b0fb161c..0c86cbd4d1134 100644 --- a/docs/root/intro/version_history.rst +++ b/docs/root/intro/version_history.rst @@ -22,6 +22,11 @@ Version history in full by the router. This ensures that the per try timeout does not account for slow downstreams and that will not start before the global timeout. * upstream: added :ref:`upstream_cx_pool_overflow ` for the connection pool circuit breaker. +* upstream: an EDS management server can now force removal of a host that is still passing active + health checking by first marking the host as failed via EDS health check and subsequently removing + it in a future update. This is a mechanism to work around a race condition in which an EDS + implementation may remove a host before it has stopped passing active HC, thus causing the host + to become stranded until a future update. 1.10.0 (Apr 5, 2019) ==================== diff --git a/source/common/upstream/upstream_impl.cc b/source/common/upstream/upstream_impl.cc index fbc27861a3f4f..0e1cf05619b7f 100644 --- a/source/common/upstream/upstream_impl.cc +++ b/source/common/upstream/upstream_impl.cc @@ -1217,13 +1217,16 @@ bool BaseDynamicClusterImpl::updateDynamicHostList(const HostVector& new_hosts, // The remaining hosts are hosts that are not referenced in the config update. We remove them from // the priority if any of the following is true: // - Active health checking is not enabled. - // - The removed hosts are failing active health checking. + // - The removed hosts are failing active health checking OR have been explicitly marked as + // unhealthy by a previous EDS update. We do not count outlier detection, or any other future + // health condition that may be added, as a reason to remove a host, so we do not use the health() API. + // - We have explicitly configured the cluster to remove hosts regardless of active health status. 
const bool dont_remove_healthy_hosts = health_checker_ != nullptr && !info()->drainConnectionsOnHostRemoval(); if (!current_priority_hosts.empty() && dont_remove_healthy_hosts) { for (auto i = current_priority_hosts.begin(); i != current_priority_hosts.end();) { - if (!(*i)->healthFlagGet(Host::HealthFlag::FAILED_ACTIVE_HC)) { + if (!((*i)->healthFlagGet(Host::HealthFlag::FAILED_ACTIVE_HC) || + (*i)->healthFlagGet(Host::HealthFlag::FAILED_EDS_HEALTH))) { if ((*i)->weight() > max_host_weight) { max_host_weight = (*i)->weight(); } diff --git a/test/common/upstream/eds_test.cc b/test/common/upstream/eds_test.cc index 2ffdb3f991ca3..be66f4f013218 100644 --- a/test/common/upstream/eds_test.cc +++ b/test/common/upstream/eds_test.cc @@ -478,9 +478,69 @@ TEST_F(EdsTest, EndpointHealthStatus) { EXPECT_EQ(rebuild_container + 1, stats_.counter("cluster.name.update_no_rebuild").value()); } +// Verify that a host is removed when it is still passing active HC, but has been previously +// told by the EDS server to fail health check. 
+TEST_F(EdsTest, EndpointRemovalEdsFailButActiveHcSuccess) { + envoy::api::v2::ClusterLoadAssignment cluster_load_assignment; + cluster_load_assignment.set_cluster_name("fare"); + auto* endpoints = cluster_load_assignment.add_endpoints(); + + auto health_checker = std::make_shared(); + EXPECT_CALL(*health_checker, start()); + EXPECT_CALL(*health_checker, addHostCheckCompleteCb(_)).Times(2); + cluster_->setHealthChecker(health_checker); + + auto add_endpoint = [endpoints](int port) { + auto* socket_address = endpoints->add_lb_endpoints() + ->mutable_endpoint() + ->mutable_address() + ->mutable_socket_address(); + socket_address->set_address("1.2.3.4"); + socket_address->set_port_value(port); + }; + + add_endpoint(80); + add_endpoint(81); + doOnConfigUpdateVerifyNoThrow(cluster_load_assignment); + + { + auto& hosts = cluster_->prioritySet().hostSetsPerPriority()[0]->hosts(); + EXPECT_EQ(hosts.size(), 2); + + // Mark the hosts as healthy + hosts[0]->healthFlagClear(Host::HealthFlag::FAILED_ACTIVE_HC); + hosts[1]->healthFlagClear(Host::HealthFlag::FAILED_ACTIVE_HC); + } + + // Mark the first endpoint as unhealthy from EDS. + endpoints->mutable_lb_endpoints(0)->set_health_status( + envoy::api::v2::core::HealthStatus::UNHEALTHY); + doOnConfigUpdateVerifyNoThrow(cluster_load_assignment); + + { + auto& hosts = cluster_->prioritySet().hostSetsPerPriority()[0]->hosts(); + EXPECT_EQ(hosts.size(), 2); + + EXPECT_EQ(hosts[0]->health(), Host::Health::Unhealthy); + EXPECT_FALSE(hosts[0]->healthFlagGet(Host::HealthFlag::FAILED_ACTIVE_HC)); + EXPECT_TRUE(hosts[0]->healthFlagGet(Host::HealthFlag::FAILED_EDS_HEALTH)); + EXPECT_EQ(hosts[1]->health(), Host::Health::Healthy); + } + + // Now remove the first host. Even though it is still passing active HC, since EDS has + // previously explicitly failed it, we won't stabilize it anymore. 
+ endpoints->mutable_lb_endpoints()->erase(endpoints->mutable_lb_endpoints()->begin()); + doOnConfigUpdateVerifyNoThrow(cluster_load_assignment); + + { + auto& hosts = cluster_->prioritySet().hostSetsPerPriority()[0]->hosts(); + EXPECT_EQ(hosts.size(), 1); + } +} + // Validate that onConfigUpdate() removes endpoints that are marked as healthy -// when configured to do so. -TEST_F(EdsTest, EndpointRemoval) { +// when configured to drain on host removal. +TEST_F(EdsTest, EndpointRemovalClusterDrainOnHostRemoval) { envoy::api::v2::ClusterLoadAssignment cluster_load_assignment; cluster_load_assignment.set_cluster_name("fare"); resetClusterDrainOnHostRemoval();