Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion envoy/upstream/cluster_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,10 @@ class ClusterManager {
*
* @return the stat names.
*/
virtual const ClusterStatNames& clusterStatNames() const PURE;
virtual const ClusterTrafficStatNames& clusterStatNames() const PURE;
virtual const ClusterConfigUpdateStatNames& clusterConfigUpdateStatNames() const PURE;
virtual const ClusterLbStatNames& clusterLbStatNames() const PURE;
virtual const ClusterEndpointStatNames& clusterEndpointStatNames() const PURE;
virtual const ClusterLoadReportStatNames& clusterLoadReportStatNames() const PURE;
virtual const ClusterCircuitBreakersStatNames& clusterCircuitBreakersStatNames() const PURE;
virtual const ClusterRequestResponseSizeStatNames&
Expand Down
96 changes: 74 additions & 22 deletions envoy/upstream/upstream.h
Original file line number Diff line number Diff line change
Expand Up @@ -565,12 +565,36 @@ class PrioritySet {
};

/**
* All cluster stats. @see stats_macros.h
* All cluster config update related stats.
* See https://github.com/envoyproxy/envoy/issues/23575 for details. Stats from ClusterInfo::stats()
* will be split into subgroups "config-update", "lb", "endpoint" and "the rest" (which are mainly
* upstream related), roughly based on their semantics.
*/
#define ALL_CLUSTER_STATS(COUNTER, GAUGE, HISTOGRAM, TEXT_READOUT, STATNAME) \
#define ALL_CLUSTER_CONFIG_UPDATE_STATS(COUNTER, GAUGE, HISTOGRAM, TEXT_READOUT, STATNAME) \
COUNTER(assignment_stale) \
COUNTER(assignment_timeout_received) \
COUNTER(bind_errors) \
COUNTER(update_attempt) \
COUNTER(update_empty) \
COUNTER(update_failure) \
COUNTER(update_no_rebuild) \
COUNTER(update_success) \
GAUGE(version, NeverImport)

/**
* All cluster endpoints related stats.
*/
#define ALL_CLUSTER_ENDPOINT_STATS(COUNTER, GAUGE, HISTOGRAM, TEXT_READOUT, STATNAME) \
GAUGE(max_host_weight, NeverImport) \
COUNTER(membership_change) \
GAUGE(membership_degraded, NeverImport) \
GAUGE(membership_excluded, NeverImport) \
GAUGE(membership_healthy, NeverImport) \
GAUGE(membership_total, NeverImport)

/**
* All cluster loadbalancing related stats.
*/
#define ALL_CLUSTER_LB_STATS(COUNTER, GAUGE, HISTOGRAM, TEXT_READOUT, STATNAME) \

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think these should be further sub-divided by functionality. lb_subset, zone_aware, and then all the rest. Both of those features are unused in many configs so they could all be omitted.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC, the zone_aware stats are the "base" for subset_lb loadbalancers, due to the following:

class EdfLoadBalancerBase : public ZoneAwareLoadBalancerBase

so the lb-related stats (subset, and zone-aware) will be created/required at the same time.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But I think the zone-awareness isn't used in many cases, and those stats will remain zero'd (at least I think that's how it works). So if they are separate and lazy-created, I think they won't get created in many configs.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Out of the 5 LB implementations used in subset_lb, only RingHashLoadBalancer and MagLevLoadBalancer do NOT inherit from ZoneAwareLoadBalancerBase. But I see your point, though. It requires some refactoring work: we'd better instantiate those stats objects from within each LB object, which triggers an even larger delta.

Do you mind if I do it incrementally?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Incrementally sounds good. Do you want to land this as-is and do it in a subsequent PR?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, it'd be much cleaner.

COUNTER(lb_healthy_panic) \
COUNTER(lb_local_cluster_not_ok) \
COUNTER(lb_recalculate_zone_structures) \
Expand All @@ -585,14 +609,15 @@ class PrioritySet {
COUNTER(lb_zone_routing_all_directly) \
COUNTER(lb_zone_routing_cross_zone) \
COUNTER(lb_zone_routing_sampled) \
COUNTER(membership_change) \
GAUGE(lb_subsets_active, Accumulate)

/**
* All cluster traffic stats. @see stats_macros.h
*/
#define ALL_CLUSTER_TRAFFIC_STATS(COUNTER, GAUGE, HISTOGRAM, TEXT_READOUT, STATNAME) \
COUNTER(bind_errors) \
COUNTER(original_dst_host_invalid) \
COUNTER(retry_or_shadow_abandoned) \
COUNTER(update_attempt) \
COUNTER(update_empty) \
COUNTER(update_failure) \
COUNTER(update_no_rebuild) \
COUNTER(update_success) \
COUNTER(upstream_cx_close_notify) \
COUNTER(upstream_cx_connect_attempts_exceeded) \
COUNTER(upstream_cx_connect_fail) \
Expand Down Expand Up @@ -644,18 +669,11 @@ class PrioritySet {
COUNTER(upstream_rq_total) \
COUNTER(upstream_rq_tx_reset) \
COUNTER(upstream_http3_broken) \
GAUGE(lb_subsets_active, Accumulate) \
GAUGE(max_host_weight, NeverImport) \
GAUGE(membership_degraded, NeverImport) \
GAUGE(membership_excluded, NeverImport) \
GAUGE(membership_healthy, NeverImport) \
GAUGE(membership_total, NeverImport) \
GAUGE(upstream_cx_active, Accumulate) \
GAUGE(upstream_cx_rx_bytes_buffered, Accumulate) \
GAUGE(upstream_cx_tx_bytes_buffered, Accumulate) \
GAUGE(upstream_rq_active, Accumulate) \
GAUGE(upstream_rq_pending_active, Accumulate) \
GAUGE(version, NeverImport) \
HISTOGRAM(upstream_cx_connect_ms, Milliseconds) \
HISTOGRAM(upstream_cx_length_ms, Milliseconds)

Expand Down Expand Up @@ -708,10 +726,29 @@ class PrioritySet {
HISTOGRAM(upstream_rq_timeout_budget_per_try_percent_used, Unspecified)

/**
* Struct definition for all cluster stats. @see stats_macros.h
* Struct definition for cluster config update stats. @see stats_macros.h
*/
MAKE_STAT_NAMES_STRUCT(ClusterConfigUpdateStatNames, ALL_CLUSTER_CONFIG_UPDATE_STATS);
MAKE_STATS_STRUCT(ClusterConfigUpdateStats, ClusterConfigUpdateStatNames,
ALL_CLUSTER_CONFIG_UPDATE_STATS);

/**
* Struct definition for cluster endpoint related stats. @see stats_macros.h
*/
MAKE_STAT_NAMES_STRUCT(ClusterEndpointStatNames, ALL_CLUSTER_ENDPOINT_STATS);
MAKE_STATS_STRUCT(ClusterEndpointStats, ClusterEndpointStatNames, ALL_CLUSTER_ENDPOINT_STATS);

/**
* Struct definition for cluster load balancing stats. @see stats_macros.h
*/
MAKE_STAT_NAMES_STRUCT(ClusterStatNames, ALL_CLUSTER_STATS);
MAKE_STATS_STRUCT(ClusterStats, ClusterStatNames, ALL_CLUSTER_STATS);
MAKE_STAT_NAMES_STRUCT(ClusterLbStatNames, ALL_CLUSTER_LB_STATS);
MAKE_STATS_STRUCT(ClusterLbStats, ClusterLbStatNames, ALL_CLUSTER_LB_STATS);

/**
* Struct definition for all cluster traffic stats. @see stats_macros.h
*/
MAKE_STAT_NAMES_STRUCT(ClusterTrafficStatNames, ALL_CLUSTER_TRAFFIC_STATS);
MAKE_STATS_STRUCT(ClusterTrafficStats, ClusterTrafficStatNames, ALL_CLUSTER_TRAFFIC_STATS);

MAKE_STAT_NAMES_STRUCT(ClusterLoadReportStatNames, ALL_CLUSTER_LOAD_REPORT_STATS);
MAKE_STATS_STRUCT(ClusterLoadReportStats, ClusterLoadReportStatNames,
Expand Down Expand Up @@ -992,9 +1029,24 @@ class ClusterInfo : public Http::FilterChainFactory {
virtual TransportSocketMatcher& transportSocketMatcher() const PURE;

/**
* @return ClusterStats& strongly named stats for this cluster.
* @return ClusterConfigUpdateStats& config update stats for this cluster.
*/
virtual ClusterConfigUpdateStats& configUpdateStats() const PURE;

/**
* @return ClusterLbStats& load-balancer-related stats for this cluster.
*/
virtual ClusterLbStats& lbStats() const PURE;

/**
* @return ClusterEndpointStats& endpoint related stats for this cluster.
*/
virtual ClusterEndpointStats& endpointStats() const PURE;

/**
* @return ClusterTrafficStats& all traffic related stats for this cluster.
*/
virtual ClusterStats& stats() const PURE;
virtual ClusterTrafficStats& trafficStats() const PURE;

/**
* @return the stats scope that contains all cluster stats. This can be used to produce dynamic
Expand All @@ -1003,7 +1055,7 @@ class ClusterInfo : public Http::FilterChainFactory {
virtual Stats::Scope& statsScope() const PURE;

/**
* @return ClusterLoadReportStats& strongly named load report stats for this cluster.
* @return ClusterLoadReportStats& load report stats for this cluster.
*/
virtual ClusterLoadReportStats& loadReportStats() const PURE;

Expand Down
46 changes: 24 additions & 22 deletions source/common/conn_pool/conn_pool_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ ConnPoolImplBase::tryCreateNewConnection(float global_preconnect_ratio) {
const bool can_create_connection = host_->canCreateConnection(priority_);

if (!can_create_connection) {
host_->cluster().stats().upstream_cx_overflow_.inc();
host_->cluster().trafficStats().upstream_cx_overflow_.inc();
}
// If we are at the connection circuit-breaker limit due to other upstreams having
// too many open connections, and this upstream has no connections, always create one, to
Expand Down Expand Up @@ -168,14 +168,14 @@ void ConnPoolImplBase::attachStreamToClient(Envoy::ConnectionPool::ActiveClient&
ASSERT(client.readyForStream());

if (client.state() == Envoy::ConnectionPool::ActiveClient::State::ReadyForEarlyData) {
host_->cluster().stats().upstream_rq_0rtt_.inc();
host_->cluster().trafficStats().upstream_rq_0rtt_.inc();
}

if (enforceMaxRequests() && !host_->cluster().resourceManager(priority_).requests().canCreate()) {
ENVOY_LOG(debug, "max streams overflow");
onPoolFailure(client.real_host_description_, absl::string_view(),
ConnectionPool::PoolFailureReason::Overflow, context);
host_->cluster().stats().upstream_rq_pending_overflow_.inc();
host_->cluster().trafficStats().upstream_rq_pending_overflow_.inc();
return;
}
ENVOY_CONN_LOG(debug, "creating stream", client);
Expand All @@ -185,7 +185,7 @@ void ConnPoolImplBase::attachStreamToClient(Envoy::ConnectionPool::ActiveClient&
client.remaining_streams_--;
if (client.remaining_streams_ == 0) {
ENVOY_CONN_LOG(debug, "maximum streams per connection, start draining", client);
host_->cluster().stats().upstream_cx_max_requests_.inc();
host_->cluster().trafficStats().upstream_cx_max_requests_.inc();
transitionActiveClientState(client, Envoy::ConnectionPool::ActiveClient::State::Draining);
} else if (capacity == 1) {
// As soon as the new stream is created, the client will be maxed out.
Expand All @@ -202,8 +202,8 @@ void ConnPoolImplBase::attachStreamToClient(Envoy::ConnectionPool::ActiveClient&
num_active_streams_++;
host_->stats().rq_total_.inc();
host_->stats().rq_active_.inc();
host_->cluster().stats().upstream_rq_total_.inc();
host_->cluster().stats().upstream_rq_active_.inc();
host_->cluster().trafficStats().upstream_rq_total_.inc();
host_->cluster().trafficStats().upstream_rq_active_.inc();
host_->cluster().resourceManager(priority_).requests().inc();

onPoolReady(client, context);
Expand All @@ -216,7 +216,7 @@ void ConnPoolImplBase::onStreamClosed(Envoy::ConnectionPool::ActiveClient& clien
state_.decrActiveStreams(1);
num_active_streams_--;
host_->stats().rq_active_.dec();
host_->cluster().stats().upstream_rq_active_.dec();
host_->cluster().trafficStats().upstream_rq_active_.dec();
host_->cluster().resourceManager(priority_).requests().dec();
// We don't update the capacity for HTTP/3 as the stream count should only
// increase when a MAX_STREAMS frame is received.
Expand Down Expand Up @@ -282,7 +282,7 @@ ConnectionPool::Cancellable* ConnPoolImplBase::newStreamImpl(AttachContext& cont
ENVOY_LOG(debug, "max pending streams overflow");
onPoolFailure(nullptr, absl::string_view(), ConnectionPool::PoolFailureReason::Overflow,
context);
host_->cluster().stats().upstream_rq_pending_overflow_.inc();
host_->cluster().trafficStats().upstream_rq_pending_overflow_.inc();
return nullptr;
}

Expand Down Expand Up @@ -490,7 +490,7 @@ void ConnPoolImplBase::onConnectionEvent(ActiveClient& client, absl::string_view

if (!client.hasHandshakeCompleted()) {
client.has_handshake_completed_ = true;
host_->cluster().stats().upstream_cx_connect_fail_.inc();
host_->cluster().trafficStats().upstream_cx_connect_fail_.inc();
host_->stats().cx_connect_fail_.inc();

onConnectFailed(client);
Expand Down Expand Up @@ -595,7 +595,7 @@ void ConnPoolImplBase::onConnectionEvent(ActiveClient& client, absl::string_view
client.currentUnusedCapacity());
// No need to update connecting capacity and connect_timer_ as the client is still connecting.
ASSERT(client.state() == ActiveClient::State::Connecting);
host()->cluster().stats().upstream_cx_connect_with_0_rtt_.inc();
host()->cluster().trafficStats().upstream_cx_connect_with_0_rtt_.inc();
transitionActiveClientState(client, (client.currentUnusedCapacity() > 0
? ActiveClient::State::ReadyForEarlyData
: ActiveClient::State::Busy));
Expand All @@ -606,13 +606,13 @@ void ConnPoolImplBase::onConnectionEvent(ActiveClient& client, absl::string_view

PendingStream::PendingStream(ConnPoolImplBase& parent, bool can_send_early_data)
: parent_(parent), can_send_early_data_(can_send_early_data) {
parent_.host()->cluster().stats().upstream_rq_pending_total_.inc();
parent_.host()->cluster().stats().upstream_rq_pending_active_.inc();
parent_.host()->cluster().trafficStats().upstream_rq_pending_total_.inc();
parent_.host()->cluster().trafficStats().upstream_rq_pending_active_.inc();
parent_.host()->cluster().resourceManager(parent_.priority()).pendingRequests().inc();
}

PendingStream::~PendingStream() {
parent_.host()->cluster().stats().upstream_rq_pending_active_.dec();
parent_.host()->cluster().trafficStats().upstream_rq_pending_active_.dec();
parent_.host()->cluster().resourceManager(parent_.priority()).pendingRequests().dec();
}

Expand All @@ -630,7 +630,7 @@ void ConnPoolImplBase::purgePendingStreams(
while (!pending_streams_to_purge_.empty()) {
PendingStreamPtr stream =
pending_streams_to_purge_.front()->removeFromList(pending_streams_to_purge_);
host_->cluster().stats().upstream_rq_pending_failure_eject_.inc();
host_->cluster().trafficStats().upstream_rq_pending_failure_eject_.inc();
onPoolFailure(host_description, failure_reason, reason, stream->context());
}
}
Expand Down Expand Up @@ -683,7 +683,7 @@ void ConnPoolImplBase::onPendingStreamCancel(PendingStream& stream,
}
}

host_->cluster().stats().upstream_rq_cancelled_.inc();
host_->cluster().trafficStats().upstream_rq_cancelled_.inc();
checkForIdleAndCloseIdleConnsIfDraining();
}

Expand Down Expand Up @@ -757,14 +757,16 @@ ActiveClient::ActiveClient(ConnPoolImplBase& parent, uint32_t lifetime_stream_li
concurrent_stream_limit_(translateZeroToUnlimited(concurrent_stream_limit)),
connect_timer_(parent_.dispatcher().createTimer([this]() { onConnectTimeout(); })) {
conn_connect_ms_ = std::make_unique<Stats::HistogramCompletableTimespanImpl>(
parent_.host()->cluster().stats().upstream_cx_connect_ms_, parent_.dispatcher().timeSource());
parent_.host()->cluster().trafficStats().upstream_cx_connect_ms_,
parent_.dispatcher().timeSource());
conn_length_ = std::make_unique<Stats::HistogramCompletableTimespanImpl>(
parent_.host()->cluster().stats().upstream_cx_length_ms_, parent_.dispatcher().timeSource());
parent_.host()->cluster().trafficStats().upstream_cx_length_ms_,
parent_.dispatcher().timeSource());
connect_timer_->enableTimer(parent_.host()->cluster().connectTimeout());
parent_.host()->stats().cx_total_.inc();
parent_.host()->stats().cx_active_.inc();
parent_.host()->cluster().stats().upstream_cx_total_.inc();
parent_.host()->cluster().stats().upstream_cx_active_.inc();
parent_.host()->cluster().trafficStats().upstream_cx_total_.inc();
parent_.host()->cluster().trafficStats().upstream_cx_active_.inc();
parent_.host()->cluster().resourceManager(parent_.priority()).connections().inc();
}

Expand All @@ -776,15 +778,15 @@ void ActiveClient::releaseResourcesBase() {

conn_length_->complete();

parent_.host()->cluster().stats().upstream_cx_active_.dec();
parent_.host()->cluster().trafficStats().upstream_cx_active_.dec();
parent_.host()->stats().cx_active_.dec();
parent_.host()->cluster().resourceManager(parent_.priority()).connections().dec();
}
}

void ActiveClient::onConnectTimeout() {
ENVOY_CONN_LOG(debug, "connect timeout", *this);
parent_.host()->cluster().stats().upstream_cx_connect_timeout_.inc();
parent_.host()->cluster().trafficStats().upstream_cx_connect_timeout_.inc();
timed_out_ = true;
close();
}
Expand All @@ -809,7 +811,7 @@ void ActiveClient::onConnectionDurationTimeout() {
}

ENVOY_CONN_LOG(debug, "max connection duration reached, start draining", *this);
parent_.host()->cluster().stats().upstream_cx_max_duration_reached_.inc();
parent_.host()->cluster().trafficStats().upstream_cx_max_duration_reached_.inc();
parent_.transitionActiveClientState(*this, Envoy::ConnectionPool::ActiveClient::State::Draining);

// Close out the draining client if we no longer have active streams.
Expand Down
2 changes: 1 addition & 1 deletion source/common/http/codec_client.cc
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ void CodecClient::onData(Buffer::Instance& data) {
if (!isPrematureResponseError(status) ||
(!active_requests_.empty() ||
getPrematureResponseHttpCode(status) != Code::RequestTimeout)) {
host_->cluster().stats().upstream_cx_protocol_error_.inc();
host_->cluster().trafficStats().upstream_cx_protocol_error_.inc();
protocol_error_ = true;
}
close();
Expand Down
2 changes: 1 addition & 1 deletion source/common/http/codec_client.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ class CodecClient : protected Logger::Loggable<Logger::Id::client>,
}

void onIdleTimeout() {
host_->cluster().stats().upstream_cx_idle_timeout_.inc();
host_->cluster().trafficStats().upstream_cx_idle_timeout_.inc();
close();
}

Expand Down
Loading