Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
3293441
upstream: exclude hosts from lb calculations until first active hc re…
Apr 28, 2019
fd70113
fix upstream tests
May 1, 2019
83f454b
fix locality test
May 3, 2019
f7b142f
add test for zero warmed in locality
May 3, 2019
8d0b343
fix most of the tests
May 3, 2019
75c8a7c
actually adjust panic based on warmed hosts
May 3, 2019
d94e116
format + other fixes
May 3, 2019
821913e
fix proto field id + format
May 3, 2019
9de4c56
spelling + fix load_balancer_benchmark build
May 3, 2019
0bc3d03
pass whole list of hosts to faclitate subsetting
May 3, 2019
1cb40f5
fix more build failures
May 3, 2019
a4a4f10
fix filter bug
May 4, 2019
ed4eaa7
fix test failures
May 4, 2019
d881b27
track excluded hosts instead of warmed hosts to reduce memory usage
May 5, 2019
ce7d5b4
fix cm tests
May 5, 2019
f017af1
expose const shared ptr, use UpdateHostsParams
May 6, 2019
5e688e2
remove warmed() function, check flag directly
May 6, 2019
6b962fd
Merge remote-tracking branch 'envoy/master' into warm-new-hosts
May 6, 2019
8c33216
add integration test + stats
May 6, 2019
ab918c3
add UTs for panic
May 6, 2019
7950e0b
move params helpers to test + other nits
May 6, 2019
510a7a9
add UT for partitionHosts
May 6, 2019
c89bb7b
add version notes
May 6, 2019
8a9153a
fix more updateHostsParams usages
May 6, 2019
820cd35
fix format failures
May 7, 2019
b69d22e
remove redundant this
May 7, 2019
b79ef40
normalize handling of pending flag, add TODO
May 7, 2019
1a931fb
increase stats limit
May 7, 2019
0207652
Merge remote-tracking branch 'envoy/master' into warm-new-hosts
May 7, 2019
5510ac7
move clearPendingFlag to helper method
May 8, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions api/envoy/admin/v2alpha/clusters.proto
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ message HostHealthStatus {
// health checking.
bool pending_dynamic_removal = 5;

// The host has not yet been health checked.
bool pending_active_hc = 6;

// Health status as reported by EDS. Note: only HEALTHY and UNHEALTHY are currently supported
// here.
// TODO(mrice32): pipe through remaining EDS health status possibilities.
Expand Down
21 changes: 21 additions & 0 deletions api/envoy/api/v2/cds.proto
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,27 @@ message Cluster {
// because merging those updates isn't currently safe. See
// https://github.com/envoyproxy/envoy/pull/3941.
google.protobuf.Duration update_merge_window = 4;

// If set to true, Envoy will not consider new hosts when computing load balancing weights until
// they have been health checked for the first time. This will have no effect unless
// active health checking is also configured.
//
// Ignoring a host means that for any load balancing calculations that adjust weights based
// on the ratio of eligible hosts to total hosts (priority spillover, locality weighting and
// panic mode) Envoy will exclude these hosts from the denominator.
//
// For example, with hosts in two priorities P0 and P1, where P0 looks like
// {healthy, unhealthy (new), unhealthy (new)}
// and where P1 looks like
// {healthy, healthy}
// all traffic will still hit P0, as 1 / (3 - 2) = 1.
//
// Enabling this will allow scaling up the number of hosts for a given cluster without entering
// panic mode or triggering priority spillover, assuming the hosts pass the first health check.
//
// If panic mode is triggered, new hosts are still eligible for traffic; they simply do not
// contribute to the calculation when deciding whether panic mode is enabled or not.
bool ignore_new_hosts_until_first_hc = 5;
}

// Common configuration for all load balancer implementations.
Expand Down
3 changes: 3 additions & 0 deletions docs/root/intro/version_history.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ Version history
it in a future update. This is a mechanism to work around a race condition in which an EDS
implementation may remove a host before it has stopped passing active HC, thus causing the host
to become stranded until a future update.
* upstream: added :ref:`an option <envoy_api_field_Cluster.CommonLbConfig.ignore_new_hosts_until_first_hc>`
that allows ignoring new hosts for the purpose of load balancing calculations until they have
been health checked for the first time.
* upstream: added runtime error checking to prevent setting dns type to STRICT_DNS or LOGICAL_DNS when custom resolver name is specified.
* grpc-json: added support for :ref:`auto mapping
<envoy_api_field_config.filter.http.transcoder.v2.GrpcJsonTranscoder.auto_mapping>`.
Expand Down
2 changes: 2 additions & 0 deletions include/envoy/upstream/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ struct HealthyAvailability : PriorityAvailability {
struct Healthy {};
// Phantom type indicating that the type is related to degraded hosts.
struct Degraded {};
// Phantom type indicating that the type is related to excluded hosts.
struct Excluded {};

} // namespace Upstream
} // namespace Envoy
67 changes: 66 additions & 1 deletion include/envoy/upstream/upstream.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ class Host : virtual public HostDescription {
m(DEGRADED_EDS_HEALTH, 0x10) \
/* The host is pending removal from discovery but is stabilized due to */ \
/* active HC. */ \
m(PENDING_DYNAMIC_REMOVAL, 0x20)
m(PENDING_DYNAMIC_REMOVAL, 0x20) \
/* The host is pending its initial active health check. */ \
m(PENDING_ACTIVE_HC, 0x40)
// clang-format on

#define DECLARE_ENUM(name, value) name = value,
Expand Down Expand Up @@ -197,12 +199,14 @@ typedef std::shared_ptr<const Host> HostConstSharedPtr;
typedef std::vector<HostSharedPtr> HostVector;
typedef Phantom<HostVector, Healthy> HealthyHostVector;
typedef Phantom<HostVector, Degraded> DegradedHostVector;
typedef Phantom<HostVector, Excluded> ExcludedHostVector;
typedef std::unordered_map<std::string, Upstream::HostSharedPtr> HostMap;
typedef std::shared_ptr<HostVector> HostVectorSharedPtr;
typedef std::shared_ptr<const HostVector> HostVectorConstSharedPtr;

typedef std::shared_ptr<const HealthyHostVector> HealthyHostVectorConstSharedPtr;
typedef std::shared_ptr<const DegradedHostVector> DegradedHostVectorConstSharedPtr;
typedef std::shared_ptr<const ExcludedHostVector> ExcludedHostVectorConstSharedPtr;

typedef std::unique_ptr<HostVector> HostListPtr;
typedef std::unordered_map<envoy::api::v2::core::Locality, uint32_t, LocalityHash, LocalityEqualTo>
Expand Down Expand Up @@ -260,6 +264,7 @@ typedef std::shared_ptr<const LocalityWeights> LocalityWeightsConstSharedPtr;
* Base host set interface. This contains all of the endpoints for a given LocalityLbEndpoints
* priority level.
*/
// TODO(snowp): Remove the const ref accessors in favor of the shared_ptr ones.
class HostSet {
public:
virtual ~HostSet() {}
Expand All @@ -269,6 +274,11 @@ class HostSet {
*/
virtual const HostVector& hosts() const PURE;

/**
* @return a shared ptr to the vector returned by hosts().
*/
virtual HostVectorConstSharedPtr hostsPtr() const PURE;
Comment thread
mattklein123 marked this conversation as resolved.

/**
* @return all healthy hosts contained in the set at the current time. NOTE: This set is
* eventually consistent. There is a time window where a host in this set may become
Expand All @@ -277,6 +287,11 @@ class HostSet {
*/
virtual const HostVector& healthyHosts() const PURE;

/**
* @return a shared ptr to the vector returned by healthyHosts().
*/
virtual HealthyHostVectorConstSharedPtr healthyHostsPtr() const PURE;

/**
* @return all degraded hosts contained in the set at the current time. NOTE: This set is
* eventually consistent. There is a time window where a host in this set may become
Expand All @@ -285,21 +300,62 @@ class HostSet {
*/
virtual const HostVector& degradedHosts() const PURE;

/**
* @return a shared ptr to the vector returned by degradedHosts().
*/
virtual DegradedHostVectorConstSharedPtr degradedHostsPtr() const PURE;

/**
* @return all excluded hosts contained in the set at the current time. Excluded hosts should be
* ignored when computing load balancing weights, but may overlap with hosts in hosts().
*/
virtual const HostVector& excludedHosts() const PURE;

/**
* @return a shared ptr to the vector returned by excludedHosts().
*/
virtual ExcludedHostVectorConstSharedPtr excludedHostsPtr() const PURE;

/**
* @return hosts per locality.
*/
virtual const HostsPerLocality& hostsPerLocality() const PURE;

/**
* @return a shared ptr to the HostsPerLocality returned by hostsPerLocality().
*/
virtual HostsPerLocalityConstSharedPtr hostsPerLocalityPtr() const PURE;

/**
* @return same as hostsPerLocality but only contains healthy hosts.
*/
virtual const HostsPerLocality& healthyHostsPerLocality() const PURE;

/**
* @return a shared ptr to the HostsPerLocality returned by healthyHostsPerLocality().
*/
virtual HostsPerLocalityConstSharedPtr healthyHostsPerLocalityPtr() const PURE;

/**
* @return same as hostsPerLocality but only contains degraded hosts.
*/
virtual const HostsPerLocality& degradedHostsPerLocality() const PURE;

/**
* @return a shared ptr to the HostsPerLocality returned by degradedHostsPerLocality().
*/
virtual HostsPerLocalityConstSharedPtr degradedHostsPerLocalityPtr() const PURE;

/**
* @return same as hostsPerLocality but only contains excluded hosts.
*/
virtual const HostsPerLocality& excludedHostsPerLocality() const PURE;

/**
* @return a shared ptr to the HostsPerLocality returned by excludedHostsPerLocality().
*/
virtual HostsPerLocalityConstSharedPtr excludedHostsPerLocalityPtr() const PURE;

/**
* @return weights for each locality in the host set.
*/
Expand Down Expand Up @@ -378,9 +434,11 @@ class PrioritySet {
HostVectorConstSharedPtr hosts;
HealthyHostVectorConstSharedPtr healthy_hosts;
DegradedHostVectorConstSharedPtr degraded_hosts;
ExcludedHostVectorConstSharedPtr excluded_hosts;
HostsPerLocalityConstSharedPtr hosts_per_locality;
HostsPerLocalityConstSharedPtr healthy_hosts_per_locality;
HostsPerLocalityConstSharedPtr degraded_hosts_per_locality;
HostsPerLocalityConstSharedPtr excluded_hosts_per_locality;
};

/**
Expand Down Expand Up @@ -518,6 +576,7 @@ class PrioritySet {
COUNTER (membership_change) \
GAUGE (membership_healthy) \
GAUGE (membership_degraded) \
GAUGE (membership_excluded) \
GAUGE (membership_total) \
COUNTER (retry_or_shadow_abandoned) \
COUNTER (update_attempt) \
Expand Down Expand Up @@ -775,6 +834,12 @@ class ClusterInfo {
*/
virtual bool drainConnectionsOnHostRemoval() const PURE;

/**
* @return true if this cluster is configured to ignore hosts for the purpose of load balancing
* computations until they have been health checked for the first time.
*/
virtual bool warmHosts() const PURE;

/**
* @return eds cluster service_name of the cluster.
*/
Expand Down
27 changes: 5 additions & 22 deletions source/common/upstream/cluster_manager_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -676,29 +676,13 @@ void ClusterManagerImpl::postThreadLocalClusterUpdate(const Cluster& cluster, ui
const HostVector& hosts_removed) {
const auto& host_set = cluster.prioritySet().hostSetsPerPriority()[priority];

// TODO(htuch): Can we skip these copies by exporting out const shared_ptr from HostSet?
HostVectorConstSharedPtr hosts_copy(new HostVector(host_set->hosts()));
HealthyHostVectorConstSharedPtr healthy_hosts_copy(
new HealthyHostVector(host_set->healthyHosts()));
DegradedHostVectorConstSharedPtr degraded_hosts_copy(
new DegradedHostVector(host_set->degradedHosts()));
HostsPerLocalityConstSharedPtr hosts_per_locality_copy = host_set->hostsPerLocality().clone();
HostsPerLocalityConstSharedPtr healthy_hosts_per_locality_copy =
host_set->healthyHostsPerLocality().clone();
HostsPerLocalityConstSharedPtr degraded_hosts_per_locality_copy =
host_set->degradedHostsPerLocality().clone();

tls_->runOnAllThreads([this, name = cluster.info()->name(), priority, hosts_copy,
healthy_hosts_copy, degraded_hosts_copy, hosts_per_locality_copy,
healthy_hosts_per_locality_copy, degraded_hosts_per_locality_copy,
tls_->runOnAllThreads([this, name = cluster.info()->name(), priority,
update_params = HostSetImpl::updateHostsParams(*host_set),
locality_weights = host_set->localityWeights(), hosts_added, hosts_removed,
overprovisioning_factor = host_set->overprovisioningFactor()]() {
ThreadLocalClusterManagerImpl::updateClusterMembership(
name, priority,
HostSetImpl::updateHostsParams(hosts_copy, hosts_per_locality_copy, healthy_hosts_copy,
healthy_hosts_per_locality_copy, degraded_hosts_copy,
degraded_hosts_per_locality_copy),
locality_weights, hosts_added, hosts_removed, *tls_, overprovisioning_factor);
name, priority, update_params, locality_weights, hosts_added, hosts_removed, *tls_,
overprovisioning_factor);
});
}

Expand Down Expand Up @@ -967,8 +951,7 @@ void ClusterManagerImpl::ThreadLocalClusterManagerImpl::removeHosts(const std::s
}

void ClusterManagerImpl::ThreadLocalClusterManagerImpl::updateClusterMembership(
const std::string& name, uint32_t priority,
PrioritySet::UpdateHostsParams&& update_hosts_params,
const std::string& name, uint32_t priority, PrioritySet::UpdateHostsParams update_hosts_params,
LocalityWeightsConstSharedPtr locality_weights, const HostVector& hosts_added,
const HostVector& hosts_removed, ThreadLocal::Slot& tls, uint64_t overprovisioning_factor) {

Expand Down
2 changes: 1 addition & 1 deletion source/common/upstream/cluster_manager_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ class ClusterManagerImpl : public ClusterManager, Logger::Loggable<Logger::Id::u
static void removeHosts(const std::string& name, const HostVector& hosts_removed,
ThreadLocal::Slot& tls);
static void updateClusterMembership(const std::string& name, uint32_t priority,
PrioritySet::UpdateHostsParams&& update_hosts_params,
PrioritySet::UpdateHostsParams update_hosts_params,
LocalityWeightsConstSharedPtr locality_weights,
const HostVector& hosts_added,
const HostVector& hosts_removed, ThreadLocal::Slot& tls,
Expand Down
16 changes: 16 additions & 0 deletions source/common/upstream/health_checker_base_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,8 @@ void HealthCheckerImplBase::ActiveHealthCheckSession::handleSuccess(bool degrade
}
}

changed_state = clearPendingFlag(changed_state);

if (degraded != host_->healthFlagGet(Host::HealthFlag::DEGRADED_ACTIVE_HC)) {
if (degraded) {
host_->healthFlagSet(Host::HealthFlag::DEGRADED_ACTIVE_HC);
Expand Down Expand Up @@ -307,6 +309,8 @@ HealthTransition HealthCheckerImplBase::ActiveHealthCheckSession::setUnhealthy(
}
}

changed_state = clearPendingFlag(changed_state);

if ((first_check_ || parent_.always_log_health_check_failures_) && parent_.event_logger_) {
parent_.event_logger_->logUnhealthy(parent_.healthCheckerType(), host_, type, first_check_);
}
Expand Down Expand Up @@ -336,6 +340,18 @@ void HealthCheckerImplBase::ActiveHealthCheckSession::handleFailure(
}
}

// Clears PENDING_ACTIVE_HC on the host if it is set, and returns the health transition the caller
// should report afterwards: Changed when the flag was cleared, otherwise the value passed in.
HealthTransition
HealthCheckerImplBase::ActiveHealthCheckSession::clearPendingFlag(HealthTransition changed_state) {
  const bool was_pending = host_->healthFlagGet(Host::HealthFlag::PENDING_ACTIVE_HC);
  if (!was_pending) {
    // Nothing to clear; hand the caller's transition back untouched.
    return changed_state;
  }

  host_->healthFlagClear(Host::HealthFlag::PENDING_ACTIVE_HC);
  // Even though the health value of the host might not have changed, we report Changed so that
  // the cluster can update its list of excluded hosts.
  return HealthTransition::Changed;
}

void HealthCheckerImplBase::ActiveHealthCheckSession::onIntervalBase() {
onInterval();
timeout_timer_->enableTimer(parent_.timeout_);
Expand Down
4 changes: 4 additions & 0 deletions source/common/upstream/health_checker_base_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ class HealthCheckerImplBase : public HealthChecker,
HostSharedPtr host_;

private:
// Clears the pending flag if it is set. By clearing this flag we're marking the host as having
// been health checked.
// Returns the changed state to use following the flag update.
HealthTransition clearPendingFlag(HealthTransition changed_state);
virtual void onInterval() PURE;
void onIntervalBase();
virtual void onTimeout() PURE;
Expand Down
28 changes: 15 additions & 13 deletions source/common/upstream/load_balancer_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -136,20 +136,21 @@ void LoadBalancerBase::recalculatePerPriorityState(uint32_t priority,
// by the overprovisioning factor.
HostSet& host_set = *priority_set.hostSetsPerPriority()[priority];
per_priority_health.get()[priority] = 0;
if (!host_set.hosts().empty()) {
per_priority_degraded.get()[priority] = 0;
const auto host_count = host_set.hosts().size() - host_set.excludedHosts().size();

if (host_count > 0) {
// Each priority level's health is the ratio of healthy hosts to the number of non-excluded
// hosts in that priority, multiplied by the overprovisioning factor (1.4 in the examples below)
// and capped at 100%. It means that if all of those hosts are healthy that priority's health is
// 100%*1.4=140% and is capped at 100% which results in 100%. If 80% of those hosts are healthy,
// that priority's health is still 100% (80%*1.4=112% and capped at 100%).
per_priority_health.get()[priority] =
std::min<uint32_t>(100, (host_set.overprovisioningFactor() *
host_set.healthyHosts().size() / host_set.hosts().size()));
per_priority_health.get()[priority] = std::min<uint32_t>(
100, (host_set.overprovisioningFactor() * host_set.healthyHosts().size() / host_count));

// We perform the same computation for degraded hosts.
per_priority_degraded.get()[priority] =
std::min<uint32_t>(100, (host_set.overprovisioningFactor() *
host_set.degradedHosts().size() / host_set.hosts().size()));
per_priority_degraded.get()[priority] = std::min<uint32_t>(
100, (host_set.overprovisioningFactor() * host_set.degradedHosts().size() / host_count));
}

// Now that we've updated health for the changed priority level, we need to calculate percentage
Expand Down Expand Up @@ -442,13 +443,12 @@ HostConstSharedPtr LoadBalancerBase::chooseHost(LoadBalancerContext* context) {
bool LoadBalancerBase::isGlobalPanic(const HostSet& host_set) {
uint64_t global_panic_threshold = std::min<uint64_t>(
100, runtime_.snapshot().getInteger(RuntimePanicThreshold, default_healthy_panic_percent_));
double healthy_percent = host_set.hosts().empty()
? 0
: 100.0 * host_set.healthyHosts().size() / host_set.hosts().size();
const auto host_count = host_set.hosts().size() - host_set.excludedHosts().size();
double healthy_percent =
host_count == 0 ? 0.0 : 100.0 * host_set.healthyHosts().size() / host_count;

double degraded_percent = host_set.hosts().empty()
? 0
: 100.0 * host_set.degradedHosts().size() / host_set.hosts().size();
double degraded_percent =
host_count == 0 ? 0.0 : 100.0 * host_set.degradedHosts().size() / host_count;
// If the % of healthy hosts in the cluster is less than our panic threshold, we use all hosts.
if ((healthy_percent + degraded_percent) < global_panic_threshold) {
return true;
Expand All @@ -464,6 +464,8 @@ void ZoneAwareLoadBalancerBase::calculateLocalityPercentage(
total_hosts += locality_hosts.size();
}

// TODO(snowp): Should we ignore excluded hosts here too?

size_t i = 0;
for (const auto& locality_hosts : hosts_per_locality.get()) {
ret[i++] = total_hosts > 0 ? 10000ULL * locality_hosts.size() / total_hosts : 0;
Expand Down
Loading