envoyproxy · jmarantz · Nov 15, 2022 · Oct 17, 2022 · Oct 17, 2022 · Oct 19, 2022
diff --git a/envoy/upstream/cluster_manager.h b/envoy/upstream/cluster_manager.h
@@ -378,7 +378,10 @@ class ClusterManager {
    *
    * @return the stat names.
    */
-  virtual const ClusterStatNames& clusterStatNames() const PURE;
+  virtual const ClusterTrafficStatNames& clusterStatNames() const PURE;
+  virtual const ClusterConfigUpdateStatNames& clusterConfigUpdateStatNames() const PURE;
+  virtual const ClusterLbStatNames& clusterLbStatNames() const PURE;
+  virtual const ClusterEndpointStatNames& clusterEndpointStatNames() const PURE;
   virtual const ClusterLoadReportStatNames& clusterLoadReportStatNames() const PURE;
   virtual const ClusterCircuitBreakersStatNames& clusterCircuitBreakersStatNames() const PURE;
   virtual const ClusterRequestResponseSizeStatNames&

diff --git a/envoy/upstream/upstream.h b/envoy/upstream/upstream.h
@@ -565,12 +565,36 @@ class PrioritySet {
 };
 
 /**
- * All cluster stats. @see stats_macros.h
+ * All cluster config update related stats.
+ * See https://github.com/envoyproxy/envoy/issues/23575 for details. Stats formerly exposed via
+ * ClusterInfo::stats() are split into subgroups "config-update", "lb", "endpoint" and "the rest"
+ * (which are mainly upstream related), roughly based on their semantics.
  */
-#define ALL_CLUSTER_STATS(COUNTER, GAUGE, HISTOGRAM, TEXT_READOUT, STATNAME)                       \
+#define ALL_CLUSTER_CONFIG_UPDATE_STATS(COUNTER, GAUGE, HISTOGRAM, TEXT_READOUT, STATNAME)         \
   COUNTER(assignment_stale)                                                                        \
   COUNTER(assignment_timeout_received)                                                             \
-  COUNTER(bind_errors)                                                                             \
+  COUNTER(update_attempt)                                                                          \
+  COUNTER(update_empty)                                                                            \
+  COUNTER(update_failure)                                                                          \
+  COUNTER(update_no_rebuild)                                                                       \
+  COUNTER(update_success)                                                                          \
+  GAUGE(version, NeverImport)
+
+/**
+ * All cluster endpoint related stats.
+ */
+#define ALL_CLUSTER_ENDPOINT_STATS(COUNTER, GAUGE, HISTOGRAM, TEXT_READOUT, STATNAME)              \
+  GAUGE(max_host_weight, NeverImport)                                                              \
+  COUNTER(membership_change)                                                                       \
+  GAUGE(membership_degraded, NeverImport)                                                          \
+  GAUGE(membership_excluded, NeverImport)                                                          \
+  GAUGE(membership_healthy, NeverImport)                                                           \
+  GAUGE(membership_total, NeverImport)
+
+/**
+ * All cluster load balancing related stats.
+ */
+#define ALL_CLUSTER_LB_STATS(COUNTER, GAUGE, HISTOGRAM, TEXT_READOUT, STATNAME)                    \
   COUNTER(lb_healthy_panic)                                                                        \
   COUNTER(lb_local_cluster_not_ok)                                                                 \
   COUNTER(lb_recalculate_zone_structures)                                                          \
@@ -585,14 +609,15 @@ class PrioritySet {
   COUNTER(lb_zone_routing_all_directly)                                                            \
   COUNTER(lb_zone_routing_cross_zone)                                                              \
   COUNTER(lb_zone_routing_sampled)                                                                 \
-  COUNTER(membership_change)                                                                       \
+  GAUGE(lb_subsets_active, Accumulate)
+
+/**
+ * All cluster traffic related stats. @see stats_macros.h
+ */
+#define ALL_CLUSTER_TRAFFIC_STATS(COUNTER, GAUGE, HISTOGRAM, TEXT_READOUT, STATNAME)               \
+  COUNTER(bind_errors)                                                                             \
   COUNTER(original_dst_host_invalid)                                                               \
   COUNTER(retry_or_shadow_abandoned)                                                               \
-  COUNTER(update_attempt)                                                                          \
-  COUNTER(update_empty)                                                                            \
-  COUNTER(update_failure)                                                                          \
-  COUNTER(update_no_rebuild)                                                                       \
-  COUNTER(update_success)                                                                          \
   COUNTER(upstream_cx_close_notify)                                                                \
   COUNTER(upstream_cx_connect_attempts_exceeded)                                                   \
   COUNTER(upstream_cx_connect_fail)                                                                \
@@ -644,18 +669,11 @@ class PrioritySet {
   COUNTER(upstream_rq_total)                                                                       \
   COUNTER(upstream_rq_tx_reset)                                                                    \
   COUNTER(upstream_http3_broken)                                                                   \
-  GAUGE(lb_subsets_active, Accumulate)                                                             \
-  GAUGE(max_host_weight, NeverImport)                                                              \
-  GAUGE(membership_degraded, NeverImport)                                                          \
-  GAUGE(membership_excluded, NeverImport)                                                          \
-  GAUGE(membership_healthy, NeverImport)                                                           \
-  GAUGE(membership_total, NeverImport)                                                             \
   GAUGE(upstream_cx_active, Accumulate)                                                            \
   GAUGE(upstream_cx_rx_bytes_buffered, Accumulate)                                                 \
   GAUGE(upstream_cx_tx_bytes_buffered, Accumulate)                                                 \
   GAUGE(upstream_rq_active, Accumulate)                                                            \
   GAUGE(upstream_rq_pending_active, Accumulate)                                                    \
-  GAUGE(version, NeverImport)                                                                      \
   HISTOGRAM(upstream_cx_connect_ms, Milliseconds)                                                  \
   HISTOGRAM(upstream_cx_length_ms, Milliseconds)
 
@@ -708,10 +726,29 @@ class PrioritySet {
   HISTOGRAM(upstream_rq_timeout_budget_per_try_percent_used, Unspecified)
 
 /**
- * Struct definition for all cluster stats. @see stats_macros.h
+ * Struct definition for cluster config update stats. @see stats_macros.h
+ */
+MAKE_STAT_NAMES_STRUCT(ClusterConfigUpdateStatNames, ALL_CLUSTER_CONFIG_UPDATE_STATS);
+MAKE_STATS_STRUCT(ClusterConfigUpdateStats, ClusterConfigUpdateStatNames,
+                  ALL_CLUSTER_CONFIG_UPDATE_STATS);
+
+/**
+ * Struct definition for cluster endpoint related stats. @see stats_macros.h
+ */
+MAKE_STAT_NAMES_STRUCT(ClusterEndpointStatNames, ALL_CLUSTER_ENDPOINT_STATS);
+MAKE_STATS_STRUCT(ClusterEndpointStats, ClusterEndpointStatNames, ALL_CLUSTER_ENDPOINT_STATS);
+
+/**
+ * Struct definition for cluster load balancing stats. @see stats_macros.h
  */
-MAKE_STAT_NAMES_STRUCT(ClusterStatNames, ALL_CLUSTER_STATS);
-MAKE_STATS_STRUCT(ClusterStats, ClusterStatNames, ALL_CLUSTER_STATS);
+MAKE_STAT_NAMES_STRUCT(ClusterLbStatNames, ALL_CLUSTER_LB_STATS);
+MAKE_STATS_STRUCT(ClusterLbStats, ClusterLbStatNames, ALL_CLUSTER_LB_STATS);
+
+/**
+ * Struct definition for all cluster traffic stats. @see stats_macros.h
+ */
+MAKE_STAT_NAMES_STRUCT(ClusterTrafficStatNames, ALL_CLUSTER_TRAFFIC_STATS);
+MAKE_STATS_STRUCT(ClusterTrafficStats, ClusterTrafficStatNames, ALL_CLUSTER_TRAFFIC_STATS);
 
 MAKE_STAT_NAMES_STRUCT(ClusterLoadReportStatNames, ALL_CLUSTER_LOAD_REPORT_STATS);
 MAKE_STATS_STRUCT(ClusterLoadReportStats, ClusterLoadReportStatNames,
@@ -992,9 +1029,24 @@ class ClusterInfo : public Http::FilterChainFactory {
   virtual TransportSocketMatcher& transportSocketMatcher() const PURE;
 
   /**
-   * @return ClusterStats& strongly named stats for this cluster.
+   * @return ClusterConfigUpdateStats& config update stats for this cluster.
+   */
+  virtual ClusterConfigUpdateStats& configUpdateStats() const PURE;
+
+  /**
+   * @return ClusterLbStats& load-balancer-related stats for this cluster.
+   */
+  virtual ClusterLbStats& lbStats() const PURE;
+
+  /**
+   * @return ClusterEndpointStats& endpoint related stats for this cluster.
+   */
+  virtual ClusterEndpointStats& endpointStats() const PURE;
+
+  /**
+   * @return ClusterTrafficStats& all traffic related stats for this cluster.
    */
-  virtual ClusterStats& stats() const PURE;
+  virtual ClusterTrafficStats& trafficStats() const PURE;
 
   /**
    * @return the stats scope that contains all cluster stats. This can be used to produce dynamic
@@ -1003,7 +1055,7 @@ class ClusterInfo : public Http::FilterChainFactory {
   virtual Stats::Scope& statsScope() const PURE;
 
   /**
-   * @return ClusterLoadReportStats& strongly named load report stats for this cluster.
+   * @return ClusterLoadReportStats& load report stats for this cluster.
    */
   virtual ClusterLoadReportStats& loadReportStats() const PURE;
 

diff --git a/source/common/conn_pool/conn_pool_base.cc b/source/common/conn_pool/conn_pool_base.cc
@@ -135,7 +135,7 @@ ConnPoolImplBase::tryCreateNewConnection(float global_preconnect_ratio) {
   const bool can_create_connection = host_->canCreateConnection(priority_);
 
   if (!can_create_connection) {
-    host_->cluster().stats().upstream_cx_overflow_.inc();
+    host_->cluster().trafficStats().upstream_cx_overflow_.inc();
   }
   // If we are at the connection circuit-breaker limit due to other upstreams having
   // too many open connections, and this upstream has no connections, always create one, to
@@ -168,14 +168,14 @@ void ConnPoolImplBase::attachStreamToClient(Envoy::ConnectionPool::ActiveClient&
   ASSERT(client.readyForStream());
 
   if (client.state() == Envoy::ConnectionPool::ActiveClient::State::ReadyForEarlyData) {
-    host_->cluster().stats().upstream_rq_0rtt_.inc();
+    host_->cluster().trafficStats().upstream_rq_0rtt_.inc();
   }
 
   if (enforceMaxRequests() && !host_->cluster().resourceManager(priority_).requests().canCreate()) {
     ENVOY_LOG(debug, "max streams overflow");
     onPoolFailure(client.real_host_description_, absl::string_view(),
                   ConnectionPool::PoolFailureReason::Overflow, context);
-    host_->cluster().stats().upstream_rq_pending_overflow_.inc();
+    host_->cluster().trafficStats().upstream_rq_pending_overflow_.inc();
     return;
   }
   ENVOY_CONN_LOG(debug, "creating stream", client);
@@ -185,7 +185,7 @@ void ConnPoolImplBase::attachStreamToClient(Envoy::ConnectionPool::ActiveClient&
   client.remaining_streams_--;
   if (client.remaining_streams_ == 0) {
     ENVOY_CONN_LOG(debug, "maximum streams per connection, start draining", client);
-    host_->cluster().stats().upstream_cx_max_requests_.inc();
+    host_->cluster().trafficStats().upstream_cx_max_requests_.inc();
     transitionActiveClientState(client, Envoy::ConnectionPool::ActiveClient::State::Draining);
   } else if (capacity == 1) {
     // As soon as the new stream is created, the client will be maxed out.
@@ -202,8 +202,8 @@ void ConnPoolImplBase::attachStreamToClient(Envoy::ConnectionPool::ActiveClient&
   num_active_streams_++;
   host_->stats().rq_total_.inc();
   host_->stats().rq_active_.inc();
-  host_->cluster().stats().upstream_rq_total_.inc();
-  host_->cluster().stats().upstream_rq_active_.inc();
+  host_->cluster().trafficStats().upstream_rq_total_.inc();
+  host_->cluster().trafficStats().upstream_rq_active_.inc();
   host_->cluster().resourceManager(priority_).requests().inc();
 
   onPoolReady(client, context);
@@ -216,7 +216,7 @@ void ConnPoolImplBase::onStreamClosed(Envoy::ConnectionPool::ActiveClient& clien
   state_.decrActiveStreams(1);
   num_active_streams_--;
   host_->stats().rq_active_.dec();
-  host_->cluster().stats().upstream_rq_active_.dec();
+  host_->cluster().trafficStats().upstream_rq_active_.dec();
   host_->cluster().resourceManager(priority_).requests().dec();
   // We don't update the capacity for HTTP/3 as the stream count should only
   // increase when a MAX_STREAMS frame is received.
@@ -282,7 +282,7 @@ ConnectionPool::Cancellable* ConnPoolImplBase::newStreamImpl(AttachContext& cont
     ENVOY_LOG(debug, "max pending streams overflow");
     onPoolFailure(nullptr, absl::string_view(), ConnectionPool::PoolFailureReason::Overflow,
                   context);
-    host_->cluster().stats().upstream_rq_pending_overflow_.inc();
+    host_->cluster().trafficStats().upstream_rq_pending_overflow_.inc();
     return nullptr;
   }
 
@@ -490,7 +490,7 @@ void ConnPoolImplBase::onConnectionEvent(ActiveClient& client, absl::string_view
 
     if (!client.hasHandshakeCompleted()) {
       client.has_handshake_completed_ = true;
-      host_->cluster().stats().upstream_cx_connect_fail_.inc();
+      host_->cluster().trafficStats().upstream_cx_connect_fail_.inc();
       host_->stats().cx_connect_fail_.inc();
 
       onConnectFailed(client);
@@ -595,7 +595,7 @@ void ConnPoolImplBase::onConnectionEvent(ActiveClient& client, absl::string_view
                    client.currentUnusedCapacity());
     // No need to update connecting capacity and connect_timer_ as the client is still connecting.
     ASSERT(client.state() == ActiveClient::State::Connecting);
-    host()->cluster().stats().upstream_cx_connect_with_0_rtt_.inc();
+    host()->cluster().trafficStats().upstream_cx_connect_with_0_rtt_.inc();
     transitionActiveClientState(client, (client.currentUnusedCapacity() > 0
                                              ? ActiveClient::State::ReadyForEarlyData
                                              : ActiveClient::State::Busy));
@@ -606,13 +606,13 @@ void ConnPoolImplBase::onConnectionEvent(ActiveClient& client, absl::string_view
 
 PendingStream::PendingStream(ConnPoolImplBase& parent, bool can_send_early_data)
     : parent_(parent), can_send_early_data_(can_send_early_data) {
-  parent_.host()->cluster().stats().upstream_rq_pending_total_.inc();
-  parent_.host()->cluster().stats().upstream_rq_pending_active_.inc();
+  parent_.host()->cluster().trafficStats().upstream_rq_pending_total_.inc();
+  parent_.host()->cluster().trafficStats().upstream_rq_pending_active_.inc();
   parent_.host()->cluster().resourceManager(parent_.priority()).pendingRequests().inc();
 }
 
 PendingStream::~PendingStream() {
-  parent_.host()->cluster().stats().upstream_rq_pending_active_.dec();
+  parent_.host()->cluster().trafficStats().upstream_rq_pending_active_.dec();
   parent_.host()->cluster().resourceManager(parent_.priority()).pendingRequests().dec();
 }
 
@@ -630,7 +630,7 @@ void ConnPoolImplBase::purgePendingStreams(
   while (!pending_streams_to_purge_.empty()) {
     PendingStreamPtr stream =
         pending_streams_to_purge_.front()->removeFromList(pending_streams_to_purge_);
-    host_->cluster().stats().upstream_rq_pending_failure_eject_.inc();
+    host_->cluster().trafficStats().upstream_rq_pending_failure_eject_.inc();
     onPoolFailure(host_description, failure_reason, reason, stream->context());
   }
 }
@@ -683,7 +683,7 @@ void ConnPoolImplBase::onPendingStreamCancel(PendingStream& stream,
     }
   }
 
-  host_->cluster().stats().upstream_rq_cancelled_.inc();
+  host_->cluster().trafficStats().upstream_rq_cancelled_.inc();
   checkForIdleAndCloseIdleConnsIfDraining();
 }
 
@@ -757,14 +757,16 @@ ActiveClient::ActiveClient(ConnPoolImplBase& parent, uint32_t lifetime_stream_li
       concurrent_stream_limit_(translateZeroToUnlimited(concurrent_stream_limit)),
       connect_timer_(parent_.dispatcher().createTimer([this]() { onConnectTimeout(); })) {
   conn_connect_ms_ = std::make_unique<Stats::HistogramCompletableTimespanImpl>(
-      parent_.host()->cluster().stats().upstream_cx_connect_ms_, parent_.dispatcher().timeSource());
+      parent_.host()->cluster().trafficStats().upstream_cx_connect_ms_,
+      parent_.dispatcher().timeSource());
   conn_length_ = std::make_unique<Stats::HistogramCompletableTimespanImpl>(
-      parent_.host()->cluster().stats().upstream_cx_length_ms_, parent_.dispatcher().timeSource());
+      parent_.host()->cluster().trafficStats().upstream_cx_length_ms_,
+      parent_.dispatcher().timeSource());
   connect_timer_->enableTimer(parent_.host()->cluster().connectTimeout());
   parent_.host()->stats().cx_total_.inc();
   parent_.host()->stats().cx_active_.inc();
-  parent_.host()->cluster().stats().upstream_cx_total_.inc();
-  parent_.host()->cluster().stats().upstream_cx_active_.inc();
+  parent_.host()->cluster().trafficStats().upstream_cx_total_.inc();
+  parent_.host()->cluster().trafficStats().upstream_cx_active_.inc();
   parent_.host()->cluster().resourceManager(parent_.priority()).connections().inc();
 }
 
@@ -776,15 +778,15 @@ void ActiveClient::releaseResourcesBase() {
 
     conn_length_->complete();
 
-    parent_.host()->cluster().stats().upstream_cx_active_.dec();
+    parent_.host()->cluster().trafficStats().upstream_cx_active_.dec();
     parent_.host()->stats().cx_active_.dec();
     parent_.host()->cluster().resourceManager(parent_.priority()).connections().dec();
   }
 }
 
 void ActiveClient::onConnectTimeout() {
   ENVOY_CONN_LOG(debug, "connect timeout", *this);
-  parent_.host()->cluster().stats().upstream_cx_connect_timeout_.inc();
+  parent_.host()->cluster().trafficStats().upstream_cx_connect_timeout_.inc();
   timed_out_ = true;
   close();
 }
@@ -809,7 +811,7 @@ void ActiveClient::onConnectionDurationTimeout() {
   }
 
   ENVOY_CONN_LOG(debug, "max connection duration reached, start draining", *this);
-  parent_.host()->cluster().stats().upstream_cx_max_duration_reached_.inc();
+  parent_.host()->cluster().trafficStats().upstream_cx_max_duration_reached_.inc();
   parent_.transitionActiveClientState(*this, Envoy::ConnectionPool::ActiveClient::State::Draining);
 
   // Close out the draining client if we no longer have active streams.

diff --git a/source/common/http/codec_client.cc b/source/common/http/codec_client.cc
@@ -171,7 +171,7 @@ void CodecClient::onData(Buffer::Instance& data) {
     if (!isPrematureResponseError(status) ||
         (!active_requests_.empty() ||
          getPrematureResponseHttpCode(status) != Code::RequestTimeout)) {
-      host_->cluster().stats().upstream_cx_protocol_error_.inc();
+      host_->cluster().trafficStats().upstream_cx_protocol_error_.inc();
       protocol_error_ = true;
     }
     close();

diff --git a/source/common/http/codec_client.h b/source/common/http/codec_client.h
@@ -164,7 +164,7 @@ class CodecClient : protected Logger::Loggable<Logger::Id::client>,
   }
 
   void onIdleTimeout() {
-    host_->cluster().stats().upstream_cx_idle_timeout_.inc();
+    host_->cluster().trafficStats().upstream_cx_idle_timeout_.inc();
     close();
   }