Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/root/intro/version_history.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ Version history
in full by the router. This ensures that the per try timeout does not account for slow
downstreams and that it will not start before the global timeout.
* upstream: added :ref:`upstream_cx_pool_overflow <config_cluster_manager_cluster_stats>` for the connection pool circuit breaker.
* upstream: an EDS management server can now force removal of a host that is still passing active
health checking by first marking the host as failed via EDS health check and subsequently removing
it in a future update. This is a mechanism to work around a race condition in which an EDS
implementation may remove a host before it has stopped passing active HC, thus causing the host
to become stranded until a future update.

1.10.0 (Apr 5, 2019)
====================
Expand Down
7 changes: 5 additions & 2 deletions source/common/upstream/upstream_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1217,13 +1217,16 @@ bool BaseDynamicClusterImpl::updateDynamicHostList(const HostVector& new_hosts,
// The remaining hosts are hosts that are not referenced in the config update. We remove them from
// the priority if any of the following is true:
// - Active health checking is not enabled.
// - The removed hosts are failing active health checking.
// - The removed hosts are failing active health checking OR have been explicitly marked as
// unhealthy by a previous EDS update. We do not count outlier as a reason to remove a host
// or any other future health condition that may be added so we do not use the health() API.
// - We have explicitly configured the cluster to remove hosts regardless of active health status.
const bool dont_remove_healthy_hosts =
health_checker_ != nullptr && !info()->drainConnectionsOnHostRemoval();
if (!current_priority_hosts.empty() && dont_remove_healthy_hosts) {
for (auto i = current_priority_hosts.begin(); i != current_priority_hosts.end();) {
if (!(*i)->healthFlagGet(Host::HealthFlag::FAILED_ACTIVE_HC)) {
if (!((*i)->healthFlagGet(Host::HealthFlag::FAILED_ACTIVE_HC) ||
(*i)->healthFlagGet(Host::HealthFlag::FAILED_EDS_HEALTH))) {
if ((*i)->weight() > max_host_weight) {
max_host_weight = (*i)->weight();
}
Expand Down
64 changes: 62 additions & 2 deletions test/common/upstream/eds_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -478,9 +478,69 @@ TEST_F(EdsTest, EndpointHealthStatus) {
EXPECT_EQ(rebuild_container + 1, stats_.counter("cluster.name.update_no_rebuild").value());
}

// Verify that a host is removed when it is still passing active HC, but has been previously
// told by the EDS server to fail health check.
//
// Sequence: add two actively health-checked endpoints; have EDS mark the first one
// UNHEALTHY (which sets FAILED_EDS_HEALTH but leaves FAILED_ACTIVE_HC untouched); then
// send an update that omits the endpoint and verify it is really removed instead of being
// stabilized on the grounds that it still passes active HC.
TEST_F(EdsTest, EndpointRemovalEdsFailButActiveHcSuccess) {
  envoy::api::v2::ClusterLoadAssignment cluster_load_assignment;
  cluster_load_assignment.set_cluster_name("fare");
  auto* endpoints = cluster_load_assignment.add_endpoints();

  // Install an active health checker: with one present, the cluster normally refuses to
  // remove hosts that are still passing active HC.
  // NOTE(review): addHostCheckCompleteCb is expected exactly twice over the three config
  // updates below — confirm the exact trigger against the cluster implementation.
  auto health_checker = std::make_shared<MockHealthChecker>();
  EXPECT_CALL(*health_checker, start());
  EXPECT_CALL(*health_checker, addHostCheckCompleteCb(_)).Times(2);
  cluster_->setHealthChecker(health_checker);

  // Helper: append an endpoint at 1.2.3.4:<port> to the load assignment being built.
  auto add_endpoint = [endpoints](int port) {
    auto* socket_address = endpoints->add_lb_endpoints()
                               ->mutable_endpoint()
                               ->mutable_address()
                               ->mutable_socket_address();
    socket_address->set_address("1.2.3.4");
    socket_address->set_port_value(port);
  };

  add_endpoint(80);
  add_endpoint(81);
  doOnConfigUpdateVerifyNoThrow(cluster_load_assignment);

  {
    auto& hosts = cluster_->prioritySet().hostSetsPerPriority()[0]->hosts();
    EXPECT_EQ(hosts.size(), 2);

    // Mark the hosts as healthy: clear the active-HC failure flag that hosts start with,
    // simulating both endpoints passing active health checks.
    hosts[0]->healthFlagClear(Host::HealthFlag::FAILED_ACTIVE_HC);
    hosts[1]->healthFlagClear(Host::HealthFlag::FAILED_ACTIVE_HC);
  }

  // Mark the first endpoint as unhealthy from EDS.
  endpoints->mutable_lb_endpoints(0)->set_health_status(
      envoy::api::v2::core::HealthStatus::UNHEALTHY);
  doOnConfigUpdateVerifyNoThrow(cluster_load_assignment);

  {
    auto& hosts = cluster_->prioritySet().hostSetsPerPriority()[0]->hosts();
    EXPECT_EQ(hosts.size(), 2);

    // The EDS-reported status must surface only via FAILED_EDS_HEALTH; the active-HC flag
    // belongs to the health checker and must not be affected by the EDS update.
    EXPECT_EQ(hosts[0]->health(), Host::Health::Unhealthy);
    EXPECT_FALSE(hosts[0]->healthFlagGet(Host::HealthFlag::FAILED_ACTIVE_HC));
    EXPECT_TRUE(hosts[0]->healthFlagGet(Host::HealthFlag::FAILED_EDS_HEALTH));
    EXPECT_EQ(hosts[1]->health(), Host::Health::Healthy);
  }

  // Now remove the first host. Even though it is still passing active HC, since EDS has
  // previously explicitly failed it, we won't stabilize it anymore.
  endpoints->mutable_lb_endpoints()->erase(endpoints->mutable_lb_endpoints()->begin());
  doOnConfigUpdateVerifyNoThrow(cluster_load_assignment);

  {
    // Only the endpoint on port 81 should remain.
    auto& hosts = cluster_->prioritySet().hostSetsPerPriority()[0]->hosts();
    EXPECT_EQ(hosts.size(), 1);
  }
}

// Validate that onConfigUpdate() removes endpoints that are marked as healthy
// when configured to do so.
TEST_F(EdsTest, EndpointRemoval) {
// when configured to drain on host removal.
TEST_F(EdsTest, EndpointRemovalClusterDrainOnHostRemoval) {
envoy::api::v2::ClusterLoadAssignment cluster_load_assignment;
cluster_load_assignment.set_cluster_name("fare");
resetClusterDrainOnHostRemoval();
Expand Down