-
Notifications
You must be signed in to change notification settings - Fork 5.5k
upstream: add support for setting degraded through LoadAssignment #5649
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
d8d4aa3
5f6fe9a
c980745
8f413f5
0071ab8
aa99b0e
5cb9477
198eb6e
3626ea7
44015a1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -170,6 +170,46 @@ parseExtensionProtocolOptions(const envoy::api::v2::Cluster& config) { | |
| return options; | ||
| } | ||
|
|
||
| // Updates the health flags for an existing host to match the new host. | ||
| // @param updated_host the new host to read health flag values from. | ||
| // @param existing_host the host to update. | ||
| // @param flag the health flag to update. | ||
| // @return bool whether the flag update caused the host health to change. | ||
| bool updateHealthFlags(const Host& updated_host, Host& existing_host, Host::HealthFlag flag) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: s/updateHealthFlags/updateHealthFlag ? |
||
| // Check if the health flag has changed. | ||
| if (existing_host.healthFlagGet(flag) != updated_host.healthFlagGet(flag)) { | ||
| // Keep track of the previous health value of the host. | ||
| const auto previous_health = existing_host.health(); | ||
|
|
||
| if (updated_host.healthFlagGet(flag)) { | ||
| existing_host.healthFlagSet(flag); | ||
| } else { | ||
| existing_host.healthFlagClear(flag); | ||
| } | ||
|
|
||
| // Rebuild if changing the flag affected the host health. | ||
| return previous_health != existing_host.health(); | ||
| } | ||
|
|
||
| return false; | ||
| } | ||
|
|
||
| void setEdsHealthFlag(Host& host, envoy::api::v2::core::HealthStatus health_status) { | ||
| switch (health_status) { | ||
| case envoy::api::v2::core::HealthStatus::UNHEALTHY: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: Maybe add fall-thru annotations or comments here. |
||
| case envoy::api::v2::core::HealthStatus::DRAINING: | ||
| case envoy::api::v2::core::HealthStatus::TIMEOUT: | ||
| host.healthFlagSet(Host::HealthFlag::FAILED_EDS_HEALTH); | ||
| break; | ||
| case envoy::api::v2::core::HealthStatus::DEGRADED: | ||
| host.healthFlagSet(Host::HealthFlag::DEGRADED_EDS_HEALTH); | ||
| break; | ||
| default:; | ||
| break; | ||
| // No health flags should be set. | ||
| } | ||
| } | ||
|
|
||
| } // namespace | ||
|
|
||
| Host::CreateConnectionData HostImpl::createConnection( | ||
|
|
@@ -861,32 +901,24 @@ void PriorityStateManager::initializePriorityFor( | |
| void PriorityStateManager::registerHostForPriority( | ||
| const std::string& hostname, Network::Address::InstanceConstSharedPtr address, | ||
| const envoy::api::v2::endpoint::LocalityLbEndpoints& locality_lb_endpoint, | ||
| const envoy::api::v2::endpoint::LbEndpoint& lb_endpoint, | ||
| const absl::optional<Upstream::Host::HealthFlag> health_checker_flag) { | ||
| const envoy::api::v2::endpoint::LbEndpoint& lb_endpoint) { | ||
| const HostSharedPtr host( | ||
| new HostImpl(parent_.info(), hostname, address, lb_endpoint.metadata(), | ||
| lb_endpoint.load_balancing_weight().value(), locality_lb_endpoint.locality(), | ||
| lb_endpoint.endpoint().health_check_config(), locality_lb_endpoint.priority())); | ||
| registerHostForPriority(host, locality_lb_endpoint, lb_endpoint, health_checker_flag); | ||
| registerHostForPriority(host, locality_lb_endpoint, lb_endpoint); | ||
| } | ||
|
|
||
| void PriorityStateManager::registerHostForPriority( | ||
| const HostSharedPtr& host, | ||
| const envoy::api::v2::endpoint::LocalityLbEndpoints& locality_lb_endpoint, | ||
| const envoy::api::v2::endpoint::LbEndpoint& lb_endpoint, | ||
| const absl::optional<Upstream::Host::HealthFlag> health_checker_flag) { | ||
| const envoy::api::v2::endpoint::LbEndpoint& lb_endpoint) { | ||
| const uint32_t priority = locality_lb_endpoint.priority(); | ||
| // Should be called after initializePriorityFor. | ||
| ASSERT(priority_state_[priority].first); | ||
| priority_state_[priority].first->emplace_back(host); | ||
| if (health_checker_flag.has_value()) { | ||
| const auto& health_status = lb_endpoint.health_status(); | ||
| if (health_status == envoy::api::v2::core::HealthStatus::UNHEALTHY || | ||
| health_status == envoy::api::v2::core::HealthStatus::DRAINING || | ||
| health_status == envoy::api::v2::core::HealthStatus::TIMEOUT) { | ||
| priority_state_[priority].first->back()->healthFlagSet(health_checker_flag.value()); | ||
| } | ||
| } | ||
|
|
||
| setEdsHealthFlag(*priority_state_[priority].first->back(), lb_endpoint.health_status()); | ||
| } | ||
|
|
||
| void PriorityStateManager::updateClusterPrioritySet( | ||
|
|
@@ -980,7 +1012,7 @@ StaticClusterImpl::StaticClusterImpl( | |
| for (const auto& lb_endpoint : locality_lb_endpoint.lb_endpoints()) { | ||
| priority_state_manager_->registerHostForPriority( | ||
| "", resolveProtoAddress(lb_endpoint.endpoint().address()), locality_lb_endpoint, | ||
| lb_endpoint, absl::nullopt); | ||
| lb_endpoint); | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -1067,24 +1099,10 @@ bool BaseDynamicClusterImpl::updateDynamicHostList(const HostVector& new_hosts, | |
| max_host_weight = host->weight(); | ||
| } | ||
|
|
||
| if (existing_host->second->healthFlagGet(Host::HealthFlag::FAILED_EDS_HEALTH) != | ||
| host->healthFlagGet(Host::HealthFlag::FAILED_EDS_HEALTH)) { | ||
| // TODO(snowp): To accommodate degraded, this bit should be checking for any changes | ||
| // to the health flag, not just healthy vs not healthy. | ||
| const bool previously_healthy = existing_host->second->health() == Host::Health::Healthy; | ||
| if (host->healthFlagGet(Host::HealthFlag::FAILED_EDS_HEALTH)) { | ||
| existing_host->second->healthFlagSet(Host::HealthFlag::FAILED_EDS_HEALTH); | ||
| // If the host was previously healthy and we're now unhealthy, we need to | ||
| // rebuild. | ||
| hosts_changed |= previously_healthy; | ||
| } else { | ||
| existing_host->second->healthFlagClear(Host::HealthFlag::FAILED_EDS_HEALTH); | ||
| // If the host was previously unhealthy and now healthy, we need to | ||
| // rebuild. | ||
| hosts_changed |= | ||
| !previously_healthy && existing_host->second->health() == Host::Health::Healthy; | ||
| } | ||
| } | ||
| hosts_changed |= | ||
| updateHealthFlags(*host, *existing_host->second, Host::HealthFlag::FAILED_EDS_HEALTH); | ||
| hosts_changed |= | ||
| updateHealthFlags(*host, *existing_host->second, Host::HealthFlag::DEGRADED_EDS_HEALTH); | ||
|
|
||
| // Did metadata change? | ||
| const bool metadata_changed = !Protobuf::util::MessageDifferencer::Equivalent( | ||
|
|
@@ -1258,7 +1276,7 @@ void StrictDnsClusterImpl::updateAllHosts(const HostVector& hosts_added, | |
| for (const HostSharedPtr& host : target->hosts_) { | ||
| if (target->locality_lb_endpoint_.priority() == current_priority) { | ||
| priority_state_manager.registerHostForPriority(host, target->locality_lb_endpoint_, | ||
| target->lb_endpoint_, absl::nullopt); | ||
| target->lb_endpoint_); | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -1308,6 +1326,7 @@ void StrictDnsClusterImpl::ResolveTarget::startResolve() { | |
| lb_endpoint_.metadata(), lb_endpoint_.load_balancing_weight().value(), | ||
| locality_lb_endpoint_.locality(), lb_endpoint_.endpoint().health_check_config(), | ||
| locality_lb_endpoint_.priority())); | ||
| setEdsHealthFlag(*new_hosts.back(), lb_endpoint_.health_status()); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is necessary because we create a new
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I think this would make sense. |
||
| } | ||
|
|
||
| HostVector hosts_added; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -332,6 +332,7 @@ TEST_F(EdsTest, EndpointHealthStatus) { | |
| {envoy::api::v2::core::HealthStatus::UNHEALTHY, Host::Health::Unhealthy}, | ||
| {envoy::api::v2::core::HealthStatus::DRAINING, Host::Health::Unhealthy}, | ||
| {envoy::api::v2::core::HealthStatus::TIMEOUT, Host::Health::Unhealthy}, | ||
| {envoy::api::v2::core::HealthStatus::DEGRADED, Host::Health::Degraded}, | ||
| }; | ||
|
|
||
| int port = 80; | ||
|
|
@@ -416,6 +417,37 @@ TEST_F(EdsTest, EndpointHealthStatus) { | |
| hosts[0]->healthFlagClear(Host::HealthFlag::FAILED_ACTIVE_HC); | ||
| EXPECT_EQ(Host::Health::Healthy, hosts[0]->health()); | ||
| } | ||
|
|
||
| const auto rebuild_conter = stats_.counter("cluster.name.update_no_rebuild").value(); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. s/rebuild_conter/rebuild_counter |
||
| // Now mark host 0 degraded via EDS, it should be degraded. | ||
| endpoints->mutable_lb_endpoints(0)->set_health_status( | ||
| envoy::api::v2::core::HealthStatus::DEGRADED); | ||
| VERBOSE_EXPECT_NO_THROW(cluster_->onConfigUpdate(resources, "")); | ||
| { | ||
| auto& hosts = cluster_->prioritySet().hostSetsPerPriority()[0]->hosts(); | ||
| EXPECT_EQ(Host::Health::Degraded, hosts[0]->health()); | ||
| } | ||
|
|
||
| std::cerr << cluster_->prioritySet().hostSetsPerPriority()[0]->hosts().size() << std::endl; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use logger?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. left behind debug logging, ill just remove |
||
|
|
||
| // We should rebuild the cluster since we went from healthy -> degraded. | ||
| EXPECT_EQ(rebuild_conter, stats_.counter("cluster.name.update_no_rebuild").value()); | ||
|
|
||
| // Now mark the host as having been degraded through active hc. | ||
| cluster_->prioritySet().hostSetsPerPriority()[0]->hosts()[0]->healthFlagSet( | ||
| Host::HealthFlag::DEGRADED_ACTIVE_HC); | ||
|
|
||
| // Now mark host 0 healthy via EDS, it should still be degraded. | ||
| endpoints->mutable_lb_endpoints(0)->set_health_status( | ||
| envoy::api::v2::core::HealthStatus::HEALTHY); | ||
| VERBOSE_EXPECT_NO_THROW(cluster_->onConfigUpdate(resources, "")); | ||
| { | ||
| auto& hosts = cluster_->prioritySet().hostSetsPerPriority()[0]->hosts(); | ||
| EXPECT_EQ(Host::Health::Degraded, hosts[0]->health()); | ||
| } | ||
|
|
||
| // Since the host health didn't change, expect no rebuild. | ||
| EXPECT_EQ(rebuild_conter + 1, stats_.counter("cluster.name.update_no_rebuild").value()); | ||
| } | ||
|
|
||
| // Validate that onConfigUpdate() removes endpoints that are marked as healthy | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you remind me why we can't just use
FAILED_EDS_HEALTHfor this use case? AFAICT it fits the bill, and looking at the underlying issue being fixed, it seemed we just needed some plumbing around that.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Because this is for degrading an endpoint, not marking it as unhealthy. The new value is necessary to differentiate it here https://github.com/envoyproxy/envoy/pull/5649/files#diff-583237ddb4e16f38ccda2e9affdb0ad8R210 from the host being marked as unhealthy.
Degraded docs if you're not familiar: https://www.envoyproxy.io/docs/envoy/latest/intro/arch_overview/load_balancing/degraded
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe it wasn't super clear, but this PR adds support for marking endpoints as degraded, and while I was in here I also made sure to fix #5637
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, that clarifies.