Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions api/envoy/api/v2/cluster/circuit_breaker.proto
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,11 @@ message CircuitBreakers {
// The maximum number of parallel retries that Envoy will allow to the
// upstream cluster. If not specified, the default is 3.
google.protobuf.UInt32Value max_retries = 5;

// If track_remaining is true, then stats will be published that expose
// the number of resources remaining until the circuit breakers open. If
// not specified, the default is false.
bool track_remaining = 6;
}

// If multiple :ref:`Thresholds<envoy_api_msg_cluster.CircuitBreakers.Thresholds>`
Expand Down
4 changes: 4 additions & 0 deletions docs/root/configuration/cluster_manager/cluster_stats.rst
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,10 @@ Circuit breakers statistics will be rooted at *cluster.<name>.circuit_breakers.<
rq_pending_open, Gauge, Whether the pending requests circuit breaker is closed (0) or open (1)
rq_open, Gauge, Whether the requests circuit breaker is closed (0) or open (1)
rq_retry_open, Gauge, Whether the retry circuit breaker is closed (0) or open (1)
remaining_cx, Gauge, Number of remaining connections until the circuit breaker opens
remaining_pending, Gauge, Number of remaining pending requests until the circuit breaker opens
remaining_rq, Gauge, Number of remaining requests until the circuit breaker opens
remaining_retries, Gauge, Number of remaining retries until the circuit breaker opens

.. _config_cluster_manager_cluster_stats_dynamic_http:

Expand Down
3 changes: 2 additions & 1 deletion docs/root/intro/arch_overview/circuit_breaking.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ configure and code each application independently. Envoy supports various types
Each circuit breaking limit is :ref:`configurable <config_cluster_manager_cluster_circuit_breakers>`
and tracked on a per upstream cluster and per priority basis. This allows different components of
the distributed system to be tuned independently and have different limits. The live state of these
circuit breakers can be observed via :ref:`statistics <config_cluster_manager_cluster_stats_circuit_breakers>`.
circuit breakers, including the number of resources remaining until a circuit breaker opens, can
be observed via :ref:`statistics <config_cluster_manager_cluster_stats_circuit_breakers>`.

Note that circuit breaking will cause the :ref:`x-envoy-overloaded
<config_http_filters_router_x-envoy-overloaded_set>` header to be set by the router filter in the
Expand Down
1 change: 1 addition & 0 deletions docs/root/intro/version_history.rst
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ Version history
* stats: added support for histograms in prometheus
* stats: added usedonly flag to prometheus stats to only output metrics which have been
updated at least once.
* stats: added gauges tracking remaining resources before circuit breakers open.
* tap: added new alpha :ref:`HTTP tap filter <config_http_filters_tap>`.
* tls: enabled TLS 1.3 on the server-side (non-FIPS builds).
* upstream: added hash_function to specify the hash function for :ref:`ring hash<envoy_api_msg_Cluster.RingHashLbConfig>` as either xxHash or `murmurHash2 <https://sites.google.com/site/murmurhash>`_. MurmurHash2 is compatible with std::hash in GNU libstdc++ 3.4.20 or above. This is typically the case when compiled on Linux and not macOS.
Expand Down
6 changes: 6 additions & 0 deletions include/envoy/stats/scope.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class Gauge;
class Histogram;
class Scope;
class StatsOptions;
class NullGaugeImpl;

typedef std::unique_ptr<Scope> ScopePtr;
typedef std::shared_ptr<Scope> ScopeSharedPtr;
Expand Down Expand Up @@ -51,6 +52,11 @@ class Scope {
*/
virtual Gauge& gauge(const std::string& name) PURE;

/**
 * @param name the name the caller would otherwise use for a real gauge.
 * @return a null gauge within the scope's namespace.
 * NOTE(review): this interface returns the concrete NullGaugeImpl type rather than Gauge&;
 * confirm that exposing an implementation class from the interface is intended.
 */
virtual NullGaugeImpl& nullGauge(const std::string& name) PURE;

/**
* @return a histogram within the scope's namespace with a particular value type.
*/
Expand Down
4 changes: 4 additions & 0 deletions include/envoy/stats/stats_macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,8 @@ namespace Envoy {
#define POOL_COUNTER(POOL) POOL_COUNTER_PREFIX(POOL, "")
#define POOL_GAUGE(POOL) POOL_GAUGE_PREFIX(POOL, "")
#define POOL_HISTOGRAM(POOL) POOL_HISTOGRAM_PREFIX(POOL, "")

// Second half of NULL_POOL_GAUGE: stringifies the stat name X, closes the nullGauge(
// call that NULL_POOL_GAUGE left open, and emits the trailing comma that separates this
// entry from the next one in an initializer list.
#define NULL_STAT_DECL_(X) std::string(#X)),

// Deliberately unbalanced macro intended to be handed to a per-stat macro list:
// NULL_POOL_GAUGE(POOL) followed by (X) expands to `(POOL).nullGauge(std::string("X")),`.
#define NULL_POOL_GAUGE(POOL) (POOL).nullGauge(NULL_STAT_DECL_
} // namespace Envoy
19 changes: 12 additions & 7 deletions include/envoy/upstream/upstream.h
Original file line number Diff line number Diff line change
Expand Up @@ -521,14 +521,19 @@ class PrioritySet {
// clang-format on

/**
* Cluster circuit breakers stats.
* Cluster circuit breakers stats. Open circuit breaker stats and remaining resource stats
* can be handled differently by passing in different macros.
*/
// clang-format off
#define ALL_CLUSTER_CIRCUIT_BREAKERS_STATS(GAUGE) \
GAUGE (cx_open) \
GAUGE (rq_pending_open) \
GAUGE (rq_open) \
GAUGE (rq_retry_open)
#define ALL_CLUSTER_CIRCUIT_BREAKERS_STATS(OPEN_GAUGE, REMAINING_GAUGE) \
OPEN_GAUGE (cx_open) \
OPEN_GAUGE (rq_pending_open) \
OPEN_GAUGE (rq_open) \
OPEN_GAUGE (rq_retry_open) \
REMAINING_GAUGE (remaining_cx) \
REMAINING_GAUGE (remaining_pending) \
REMAINING_GAUGE (remaining_rq) \
REMAINING_GAUGE (remaining_retries)
// clang-format on

/**
Expand All @@ -549,7 +554,7 @@ struct ClusterLoadReportStats {
* Struct definition for cluster circuit breakers stats. @see stats_macros.h
*/
struct ClusterCircuitBreakersStats {
ALL_CLUSTER_CIRCUIT_BREAKERS_STATS(GENERATE_GAUGE_STRUCT)
ALL_CLUSTER_CIRCUIT_BREAKERS_STATS(GENERATE_GAUGE_STRUCT, GENERATE_GAUGE_STRUCT)
};

/**
Expand Down
2 changes: 2 additions & 0 deletions source/common/stats/isolated_store_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,14 @@ struct IsolatedScopeImpl : public Scope {
void deliverHistogramToSinks(const Histogram&, uint64_t) override {}
Counter& counter(const std::string& name) override { return parent_.counter(prefix_ + name); }
Gauge& gauge(const std::string& name) override { return parent_.gauge(prefix_ + name); }
NullGaugeImpl& nullGauge(const std::string&) override { return null_gauge_; }
Histogram& histogram(const std::string& name) override {
return parent_.histogram(prefix_ + name);
}
const Stats::StatsOptions& statsOptions() const override { return parent_.statsOptions(); }

IsolatedStoreImpl& parent_;
NullGaugeImpl null_gauge_;
const std::string prefix_;
};

Expand Down
2 changes: 2 additions & 0 deletions source/common/stats/isolated_store_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ class IsolatedStoreImpl : public Store {
ScopePtr createScope(const std::string& name) override;
void deliverHistogramToSinks(const Histogram&, uint64_t) override {}
Gauge& gauge(const std::string& name) override { return gauges_.get(name); }
NullGaugeImpl& nullGauge(const std::string&) override { return null_gauge_; }
Histogram& histogram(const std::string& name) override {
Histogram& histogram = histograms_.get(name);
return histogram;
Expand All @@ -80,6 +81,7 @@ class IsolatedStoreImpl : public Store {
IsolatedStatsCache<Gauge> gauges_;
IsolatedStatsCache<Histogram> histograms_;
const StatsOptionsImpl stats_options_;
NullGaugeImpl null_gauge_;
};

} // namespace Stats
Expand Down
3 changes: 3 additions & 0 deletions source/common/stats/thread_local_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ class ThreadLocalStoreImpl : Logger::Loggable<Logger::Id::stats>, public StoreRo
return default_scope_->deliverHistogramToSinks(histogram, value);
}
Gauge& gauge(const std::string& name) override { return default_scope_->gauge(name); }
NullGaugeImpl& nullGauge(const std::string&) override { return null_gauge_; }
Histogram& histogram(const std::string& name) override {
return default_scope_->histogram(name);
};
Expand Down Expand Up @@ -199,6 +200,7 @@ class ThreadLocalStoreImpl : Logger::Loggable<Logger::Id::stats>, public StoreRo
}
void deliverHistogramToSinks(const Histogram& histogram, uint64_t value) override;
Gauge& gauge(const std::string& name) override;
NullGaugeImpl& nullGauge(const std::string&) override { return null_gauge_; }
Histogram& histogram(const std::string& name) override;
Histogram& tlsHistogram(const std::string& name, ParentHistogramImpl& parent) override;
const Stats::StatsOptions& statsOptions() const override { return parent_.statsOptions(); }
Expand Down Expand Up @@ -272,6 +274,7 @@ class ThreadLocalStoreImpl : Logger::Loggable<Logger::Id::stats>, public StoreRo
Counter& num_last_resort_stats_;
HeapStatDataAllocator heap_allocator_;
SourceImpl source_;
NullGaugeImpl null_gauge_;

// Retain storage for deleted stats; these are no longer in maps because the
// matcher-pattern was established after they were created. Since the stats
Expand Down
39 changes: 33 additions & 6 deletions source/common/upstream/resource_manager_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,14 @@ class ResourceManagerImpl : public ResourceManager {
uint64_t max_connections, uint64_t max_pending_requests,
uint64_t max_requests, uint64_t max_retries,
ClusterCircuitBreakersStats cb_stats)
: connections_(max_connections, runtime, runtime_key + "max_connections", cb_stats.cx_open_),
: connections_(max_connections, runtime, runtime_key + "max_connections", cb_stats.cx_open_,
cb_stats.remaining_cx_),
pending_requests_(max_pending_requests, runtime, runtime_key + "max_pending_requests",
cb_stats.rq_pending_open_),
requests_(max_requests, runtime, runtime_key + "max_requests", cb_stats.rq_open_),
retries_(max_retries, runtime, runtime_key + "max_retries", cb_stats.rq_retry_open_) {}
cb_stats.rq_pending_open_, cb_stats.remaining_pending_),
requests_(max_requests, runtime, runtime_key + "max_requests", cb_stats.rq_open_,
cb_stats.remaining_rq_),
retries_(max_retries, runtime, runtime_key + "max_retries", cb_stats.rq_retry_open_,
cb_stats.remaining_retries_) {}

// Upstream::ResourceManager
Resource& connections() override { return connections_; }
Expand All @@ -44,23 +47,42 @@ class ResourceManagerImpl : public ResourceManager {
private:
struct ResourceImpl : public Resource {
ResourceImpl(uint64_t max, Runtime::Loader& runtime, const std::string& runtime_key,
Stats::Gauge& open_gauge)
: max_(max), runtime_(runtime), runtime_key_(runtime_key), open_gauge_(open_gauge) {}
Stats::Gauge& open_gauge, Stats::Gauge& remaining)
: max_(max), runtime_(runtime), runtime_key_(runtime_key), open_gauge_(open_gauge),
remaining_(remaining) {
remaining_.set(max);
}
// Asserts that the count is back at zero, i.e. every inc() was matched by a dec().
~ResourceImpl() { ASSERT(current_ == 0); }

// Upstream::Resource
// True while the current count is strictly below the limit reported by max().
bool canCreate() override { return current_ < max(); }
// Claims one unit of the resource, then refreshes the remaining gauge followed by the
// open gauge (1 when nothing more can be created, 0 otherwise).
void inc() override {
  ++current_;
  updateRemaining();
  const bool exhausted = !canCreate();
  open_gauge_.set(exhausted ? 1 : 0);
}
// Releases one unit of the resource, then refreshes the remaining gauge followed by the
// open gauge (1 when nothing more can be created, 0 otherwise). Releasing with a zero
// count is a programming error and trips the assertion.
void dec() override {
  ASSERT(current_ > 0);
  --current_;
  updateRemaining();
  const bool exhausted = !canCreate();
  open_gauge_.set(exhausted ? 1 : 0);
}
// Looks the limit up in the runtime snapshot on every call, passing the configured max_
// as the second argument (presumably the fallback when the key is unset -- confirm
// against the Runtime snapshot API).
uint64_t max() override { return runtime_.snapshot().getInteger(runtime_key_, max_); }

/**
* We set the gauge instead of incrementing and decrementing because,
* though atomics are used, it is possible for the current resource count
* to be greater than the supplied max.
*/
void updateRemaining() {
/**
* We cannot use std::max here because max() and current_ are
* unsigned and subtracting them may overflow.
*/
const uint64_t current_copy = current_;
remaining_.set(max() > current_copy ? max() - current_copy : 0);
}

const uint64_t max_;
std::atomic<uint64_t> current_{};
Runtime::Loader& runtime_;
Expand All @@ -72,6 +94,11 @@ class ResourceManagerImpl : public ResourceManager {
* is open.
*/
Stats::Gauge& open_gauge_;

/**
* The number of resources remaining before the circuit breaker opens.
*/
Stats::Gauge& remaining_;
};

ResourceImpl connections_;
Expand Down
16 changes: 13 additions & 3 deletions source/common/upstream/upstream_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -816,9 +816,16 @@ ClusterInfoImpl::ResourceManagers::ResourceManagers(const envoy::api::v2::Cluste
}

ClusterCircuitBreakersStats
ClusterInfoImpl::generateCircuitBreakersStats(Stats::Scope& scope, const std::string& stat_prefix) {
ClusterInfoImpl::generateCircuitBreakersStats(Stats::Scope& scope, const std::string& stat_prefix,
bool track_remaining) {
std::string prefix(fmt::format("circuit_breakers.{}.", stat_prefix));
return {ALL_CLUSTER_CIRCUIT_BREAKERS_STATS(POOL_GAUGE_PREFIX(scope, prefix))};
if (track_remaining) {
return {ALL_CLUSTER_CIRCUIT_BREAKERS_STATS(POOL_GAUGE_PREFIX(scope, prefix),
POOL_GAUGE_PREFIX(scope, prefix))};
} else {
return {ALL_CLUSTER_CIRCUIT_BREAKERS_STATS(POOL_GAUGE_PREFIX(scope, prefix),
NULL_POOL_GAUGE(scope))};
}
}

ResourceManagerImplPtr
Expand All @@ -831,6 +838,8 @@ ClusterInfoImpl::ResourceManagers::load(const envoy::api::v2::Cluster& config,
uint64_t max_requests = 1024;
uint64_t max_retries = 3;

bool track_remaining = false;

std::string priority_name;
switch (priority) {
case envoy::api::v2::core::RoutingPriority::DEFAULT:
Expand Down Expand Up @@ -858,10 +867,11 @@ ClusterInfoImpl::ResourceManagers::load(const envoy::api::v2::Cluster& config,
PROTOBUF_GET_WRAPPED_OR_DEFAULT(*it, max_pending_requests, max_pending_requests);
max_requests = PROTOBUF_GET_WRAPPED_OR_DEFAULT(*it, max_requests, max_requests);
max_retries = PROTOBUF_GET_WRAPPED_OR_DEFAULT(*it, max_retries, max_retries);
track_remaining = it->track_remaining();
}
return std::make_unique<ResourceManagerImpl>(
runtime, runtime_prefix, max_connections, max_pending_requests, max_requests, max_retries,
ClusterInfoImpl::generateCircuitBreakersStats(stats_scope, priority_name));
ClusterInfoImpl::generateCircuitBreakersStats(stats_scope, priority_name, track_remaining));
}

PriorityStateManager::PriorityStateManager(ClusterImplBase& cluster,
Expand Down
3 changes: 2 additions & 1 deletion source/common/upstream/upstream_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -490,7 +490,8 @@ class ClusterInfoImpl : public ClusterInfo {
static ClusterStats generateStats(Stats::Scope& scope);
static ClusterLoadReportStats generateLoadReportStats(Stats::Scope& scope);
static ClusterCircuitBreakersStats generateCircuitBreakersStats(Stats::Scope& scope,
const std::string& stat_prefix);
const std::string& stat_prefix,
bool track_remaining);

// Upstream::ClusterInfo
bool addedViaApi() const override { return added_via_api_; }
Expand Down
5 changes: 1 addition & 4 deletions test/common/http/http1/conn_pool_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -128,10 +128,7 @@ class Http1ConnPoolImplTest : public testing::Test {
conn_pool_(dispatcher_, cluster_, upstream_ready_timer_) {}

~Http1ConnPoolImplTest() {
// Make sure all gauges are 0.
for (const Stats::GaugeSharedPtr& gauge : cluster_->stats_store_.gauges()) {
EXPECT_EQ(0U, gauge->value());
}
EXPECT_TRUE(TestUtility::gaugesZeroed(cluster_->stats_store_.gauges()));
}

NiceMock<Event::MockDispatcher> dispatcher_;
Expand Down
5 changes: 1 addition & 4 deletions test/common/http/http2/conn_pool_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,7 @@ class Http2ConnPoolImplTest : public testing::Test {
pool_(dispatcher_, host_, Upstream::ResourcePriority::Default, nullptr) {}

~Http2ConnPoolImplTest() {
// Make sure all gauges are 0.
for (const Stats::GaugeSharedPtr& gauge : cluster_->stats_store_.gauges()) {
EXPECT_EQ(0U, gauge->value());
}
EXPECT_TRUE(TestUtility::gaugesZeroed(cluster_->stats_store_.gauges()));
}

// Creates a new test client, expecting a new connection to be created and associated
Expand Down
5 changes: 1 addition & 4 deletions test/common/tcp/conn_pool_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,7 @@ class TcpConnPoolImplTest : public testing::Test {
conn_pool_(dispatcher_, cluster_, upstream_ready_timer_) {}

~TcpConnPoolImplTest() {
// Make sure all gauges are 0.
for (const Stats::GaugeSharedPtr& gauge : cluster_->stats_store_.gauges()) {
EXPECT_EQ(0U, gauge->value());
}
EXPECT_TRUE(TestUtility::gaugesZeroed(cluster_->stats_store_.gauges()));
}

NiceMock<Event::MockDispatcher> dispatcher_;
Expand Down
49 changes: 48 additions & 1 deletion test/common/upstream/resource_manager_impl_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ TEST(ResourceManagerImplTest, RuntimeResourceManager) {

ResourceManagerImpl resource_manager(
runtime, "circuit_breakers.runtime_resource_manager_test.default.", 0, 0, 0, 1,
ClusterCircuitBreakersStats{ALL_CLUSTER_CIRCUIT_BREAKERS_STATS(POOL_GAUGE(store))});
ClusterCircuitBreakersStats{
ALL_CLUSTER_CIRCUIT_BREAKERS_STATS(POOL_GAUGE(store), POOL_GAUGE(store))});

EXPECT_CALL(
runtime.snapshot_,
Expand Down Expand Up @@ -59,6 +60,52 @@ TEST(ResourceManagerImplTest, RuntimeResourceManager) {
EXPECT_FALSE(resource_manager.retries().canCreate());
}

// Checks that each remaining_* gauge starts at its resource's limit, follows the limit
// minus the current count as the resource is claimed and released, and never drops
// below zero.
TEST(ResourceManagerImplTest, RemainingResourceGauges) {
  NiceMock<Runtime::MockLoader> runtime;
  Stats::IsolatedStoreImpl store;

  ClusterCircuitBreakersStats cb_stats{
      ALL_CLUSTER_CIRCUIT_BREAKERS_STATS(POOL_GAUGE(store), POOL_GAUGE(store))};
  ResourceManagerImpl resource_manager(
      runtime, "circuit_breakers.runtime_resource_manager_test.default.", 1, 2, 1, 0, cb_stats);

  // Connections: limit of 1.
  auto& connections = resource_manager.connections();
  EXPECT_EQ(1U, connections.max());
  EXPECT_EQ(1U, cb_stats.remaining_cx_.value());
  connections.inc();
  EXPECT_EQ(0U, cb_stats.remaining_cx_.value());
  connections.dec();
  EXPECT_EQ(1U, cb_stats.remaining_cx_.value());

  // Pending requests: limit of 2, walked all the way down and back up.
  auto& pending = resource_manager.pendingRequests();
  EXPECT_EQ(2U, pending.max());
  EXPECT_EQ(2U, cb_stats.remaining_pending_.value());
  pending.inc();
  EXPECT_EQ(1U, cb_stats.remaining_pending_.value());
  pending.inc();
  EXPECT_EQ(0U, cb_stats.remaining_pending_.value());
  pending.dec();
  EXPECT_EQ(1U, cb_stats.remaining_pending_.value());
  pending.dec();
  EXPECT_EQ(2U, cb_stats.remaining_pending_.value());

  // Requests: limit of 1.
  auto& requests = resource_manager.requests();
  EXPECT_EQ(1U, requests.max());
  EXPECT_EQ(1U, cb_stats.remaining_rq_.value());
  requests.inc();
  EXPECT_EQ(0U, cb_stats.remaining_rq_.value());
  requests.dec();
  EXPECT_EQ(1U, cb_stats.remaining_rq_.value());

  // Retries: limit of 0. Claiming a retry pushes the count above the limit; the gauge
  // must stay at zero instead of wrapping around.
  auto& retries = resource_manager.retries();
  EXPECT_EQ(0U, retries.max());
  EXPECT_EQ(0U, cb_stats.remaining_retries_.value());
  retries.inc();
  EXPECT_EQ(0U, cb_stats.remaining_retries_.value());
  retries.dec();
  EXPECT_EQ(0U, cb_stats.remaining_retries_.value());
}
} // namespace
} // namespace Upstream
} // namespace Envoy
Loading