Skip to content
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions envoy/upstream/cluster_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,7 @@ class ClusterManager {
* @return the stat names.
*/
virtual const ClusterStatNames& clusterStatNames() const PURE;
virtual const ClusterConfigUpdateStatNames& clusterConfigUpdateStatNames() const PURE;
virtual const ClusterLoadReportStatNames& clusterLoadReportStatNames() const PURE;
virtual const ClusterCircuitBreakersStatNames& clusterCircuitBreakersStatNames() const PURE;
virtual const ClusterRequestResponseSizeStatNames&
Expand Down
33 changes: 25 additions & 8 deletions envoy/upstream/upstream.h
Original file line number Diff line number Diff line change
Expand Up @@ -565,11 +565,22 @@ class PrioritySet {
};

/**
* All cluster stats. @see stats_macros.h
* All cluster config update related stats.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think some comments here about the rationale for the partitioning would help.

In particular, what I think you mean to have here is a block of stats you expect to be incremented for clusters that see zero traffic.

Whereas ALL_CLUSTER_STATS (which should be re-named as that's now a lie :) are stats that you expect to only be incremented when a cluster sees traffic. Is that right?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sounds good. I will more details in comment.

*/
#define ALL_CLUSTER_STATS(COUNTER, GAUGE, HISTOGRAM, TEXT_READOUT, STATNAME) \
#define ALL_CLUSTER_CONFIG_UPDATE_STATS(COUNTER, GAUGE, HISTOGRAM, TEXT_READOUT, STATNAME) \
COUNTER(assignment_stale) \
COUNTER(assignment_timeout_received) \
COUNTER(update_attempt) \
COUNTER(update_empty) \
COUNTER(update_failure) \
COUNTER(update_no_rebuild) \
COUNTER(update_success) \
GAUGE(version, NeverImport)

/**
* All cluster stats. @see stats_macros.h
*/
#define ALL_CLUSTER_STATS(COUNTER, GAUGE, HISTOGRAM, TEXT_READOUT, STATNAME) \
COUNTER(bind_errors) \
COUNTER(lb_healthy_panic) \
COUNTER(lb_local_cluster_not_ok) \
Expand All @@ -588,11 +599,6 @@ class PrioritySet {
COUNTER(membership_change) \
COUNTER(original_dst_host_invalid) \
COUNTER(retry_or_shadow_abandoned) \
COUNTER(update_attempt) \
COUNTER(update_empty) \
COUNTER(update_failure) \
COUNTER(update_no_rebuild) \
COUNTER(update_success) \
COUNTER(upstream_cx_close_notify) \
COUNTER(upstream_cx_connect_attempts_exceeded) \
COUNTER(upstream_cx_connect_fail) \
Expand Down Expand Up @@ -655,7 +661,6 @@ class PrioritySet {
GAUGE(upstream_cx_tx_bytes_buffered, Accumulate) \
GAUGE(upstream_rq_active, Accumulate) \
GAUGE(upstream_rq_pending_active, Accumulate) \
GAUGE(version, NeverImport) \
HISTOGRAM(upstream_cx_connect_ms, Milliseconds) \
HISTOGRAM(upstream_cx_length_ms, Milliseconds)

Expand Down Expand Up @@ -707,6 +712,13 @@ class PrioritySet {
HISTOGRAM(upstream_rq_timeout_budget_percent_used, Unspecified) \
HISTOGRAM(upstream_rq_timeout_budget_per_try_percent_used, Unspecified)

/**
* Struct definition for cluster config update stats. @see stats_macros.h
*/
MAKE_STAT_NAMES_STRUCT(ClusterConfigUpdateStatNames, ALL_CLUSTER_CONFIG_UPDATE_STATS);
MAKE_STATS_STRUCT(ClusterConfigUpdateStats, ClusterConfigUpdateStatNames,
ALL_CLUSTER_CONFIG_UPDATE_STATS);

/**
* Struct definition for all cluster stats. @see stats_macros.h
*/
Expand Down Expand Up @@ -991,6 +1003,11 @@ class ClusterInfo : public Http::FilterChainFactory {
*/
virtual TransportSocketMatcher& transportSocketMatcher() const PURE;

/**
* @return ClusterConfigUpdateStats& strongly named config update stats for this cluster.
*/
virtual ClusterConfigUpdateStats& configUpdateStats() const PURE;

/**
* @return ClusterStats& strongly named stats for this cluster.
*/
Expand Down
1 change: 1 addition & 0 deletions source/common/upstream/cluster_manager_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ ClusterManagerImpl::ClusterManagerImpl(
time_source_(main_thread_dispatcher.timeSource()), dispatcher_(main_thread_dispatcher),
http_context_(http_context), router_context_(router_context),
cluster_stat_names_(stats.symbolTable()),
cluster_config_update_stat_names_(stats.symbolTable()),
cluster_load_report_stat_names_(stats.symbolTable()),
cluster_circuit_breakers_stat_names_(stats.symbolTable()),
cluster_request_response_size_stat_names_(stats.symbolTable()),
Expand Down
4 changes: 4 additions & 0 deletions source/common/upstream/cluster_manager_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,9 @@ class ClusterManagerImpl : public ClusterManager,
initializeSecondaryClusters(const envoy::config::bootstrap::v3::Bootstrap& bootstrap) override;

const ClusterStatNames& clusterStatNames() const override { return cluster_stat_names_; }
const ClusterConfigUpdateStatNames& clusterConfigUpdateStatNames() const override {
return cluster_config_update_stat_names_;
}
const ClusterLoadReportStatNames& clusterLoadReportStatNames() const override {
return cluster_load_report_stat_names_;
}
Expand Down Expand Up @@ -765,6 +768,7 @@ class ClusterManagerImpl : public ClusterManager,
Http::Context& http_context_;
Router::Context& router_context_;
ClusterStatNames cluster_stat_names_;
ClusterConfigUpdateStatNames cluster_config_update_stat_names_;
ClusterLoadReportStatNames cluster_load_report_stat_names_;
ClusterCircuitBreakersStatNames cluster_circuit_breakers_stat_names_;
ClusterRequestResponseSizeStatNames cluster_request_response_size_stat_names_;
Expand Down
8 changes: 4 additions & 4 deletions source/common/upstream/eds.cc
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ void EdsClusterImpl::BatchUpdateHelper::batchUpdate(PrioritySet::HostUpdateCb& h
}

if (!cluster_rebuilt) {
parent_.info_->stats().update_no_rebuild_.inc();
parent_.info_->configUpdateStats().update_no_rebuild_.inc();
}

// If we didn't setup to initialize when our first round of health checking is complete, just
Expand Down Expand Up @@ -178,7 +178,7 @@ void EdsClusterImpl::onConfigUpdate(const std::vector<Config::DecodedResourceRef
PROTOBUF_GET_MS_OR_DEFAULT(cluster_load_assignment.policy(), endpoint_stale_after, 0);
if (stale_after_ms > 0) {
// Stat to track how often we receive valid assignment_timeout in response.
info_->stats().assignment_timeout_received_.inc();
info_->configUpdateStats().assignment_timeout_received_.inc();
assignment_timeout_->enableTimer(std::chrono::milliseconds(stale_after_ms));
}

Expand Down Expand Up @@ -260,7 +260,7 @@ void EdsClusterImpl::onConfigUpdate(const std::vector<Config::DecodedResourceRef
bool EdsClusterImpl::validateUpdateSize(int num_resources) {
if (num_resources == 0) {
ENVOY_LOG(debug, "Missing ClusterLoadAssignment for {} in onConfigUpdate()", cluster_name_);
info_->stats().update_empty_.inc();
info_->configUpdateStats().update_empty_.inc();
onPreInitComplete();
return false;
}
Expand All @@ -286,7 +286,7 @@ void EdsClusterImpl::onAssignmentTimeout() {
std::vector<Config::DecodedResourceRef> resource_refs = {*decoded_resource};
onConfigUpdate(resource_refs, "");
// Stat to track how often we end up with stale assignments.
info_->stats().assignment_stale_.inc();
info_->configUpdateStats().assignment_stale_.inc();
}

void EdsClusterImpl::reloadHealthyHostsHelper(const HostSharedPtr& host) {
Expand Down
6 changes: 3 additions & 3 deletions source/common/upstream/logical_dns_cluster.cc
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ LogicalDnsCluster::~LogicalDnsCluster() {

void LogicalDnsCluster::startResolve() {
ENVOY_LOG(debug, "starting async DNS resolution for {}", dns_address_);
info_->stats().update_attempt_.inc();
info_->configUpdateStats().update_attempt_.inc();

active_dns_query_ = dns_resolver_->resolve(
dns_address_, dns_lookup_family_,
Expand All @@ -121,7 +121,7 @@ void LogicalDnsCluster::startResolve() {
// cluster does not update. This ensures that a potentially previously resolved address does
// not stabilize back to 0 hosts.
if (status == Network::DnsResolver::ResolutionStatus::Success && !response.empty()) {
info_->stats().update_success_.inc();
info_->configUpdateStats().update_success_.inc();
const auto addrinfo = response.front().addrInfo();
// TODO(mattklein123): Move port handling into the DNS interface.
ASSERT(addrinfo.address_ != nullptr);
Expand Down Expand Up @@ -166,7 +166,7 @@ void LogicalDnsCluster::startResolve() {
ENVOY_LOG(debug, "DNS refresh rate reset for {}, refresh rate {} ms", dns_address_,
final_refresh_rate.count());
} else {
info_->stats().update_failure_.inc();
info_->configUpdateStats().update_failure_.inc();
final_refresh_rate =
std::chrono::milliseconds(failure_backoff_strategy_->nextBackOffMs());
ENVOY_LOG(debug, "DNS refresh rate reset for {}, (failure) refresh rate {} ms",
Expand Down
8 changes: 4 additions & 4 deletions source/common/upstream/strict_dns_cluster.cc
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ StrictDnsClusterImpl::ResolveTarget::~ResolveTarget() {

void StrictDnsClusterImpl::ResolveTarget::startResolve() {
ENVOY_LOG(trace, "starting async DNS resolution for {}", dns_address_);
parent_.info_->stats().update_attempt_.inc();
parent_.info_->configUpdateStats().update_attempt_.inc();

active_query_ = parent_.dns_resolver_->resolve(
dns_address_, parent_.dns_lookup_family_,
Expand All @@ -117,7 +117,7 @@ void StrictDnsClusterImpl::ResolveTarget::startResolve() {
std::chrono::milliseconds final_refresh_rate = parent_.dns_refresh_rate_ms_;

if (status == Network::DnsResolver::ResolutionStatus::Success) {
parent_.info_->stats().update_success_.inc();
parent_.info_->configUpdateStats().update_success_.inc();

HostVector new_hosts;
std::chrono::seconds ttl_refresh_rate = std::chrono::seconds::max();
Expand Down Expand Up @@ -164,7 +164,7 @@ void StrictDnsClusterImpl::ResolveTarget::startResolve() {

parent_.updateAllHosts(hosts_added, hosts_removed, locality_lb_endpoints_.priority());
} else {
parent_.info_->stats().update_no_rebuild_.inc();
parent_.info_->configUpdateStats().update_no_rebuild_.inc();
}

// reset failure backoff strategy because there was a success.
Expand All @@ -179,7 +179,7 @@ void StrictDnsClusterImpl::ResolveTarget::startResolve() {
ENVOY_LOG(debug, "DNS refresh rate reset for {}, refresh rate {} ms", dns_address_,
final_refresh_rate.count());
} else {
parent_.info_->stats().update_failure_.inc();
parent_.info_->configUpdateStats().update_failure_.inc();

final_refresh_rate =
std::chrono::milliseconds(parent_.failure_backoff_strategy_->nextBackOffMs());
Expand Down
2 changes: 2 additions & 0 deletions source/common/upstream/upstream_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -979,6 +979,8 @@ ClusterInfoImpl::ClusterInfoImpl(
PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, per_connection_buffer_limit_bytes, 1024 * 1024)),
socket_matcher_(std::move(socket_matcher)), stats_scope_(std::move(stats_scope)),
stats_(generateStats(*stats_scope_, factory_context.clusterManager().clusterStatNames())),
config_update_stats_(factory_context.clusterManager().clusterConfigUpdateStatNames(),
*stats_scope_),
load_report_stats_store_(stats_scope_->symbolTable()),
load_report_stats_(generateLoadReportStats(
load_report_stats_store_, factory_context.clusterManager().clusterLoadReportStatNames())),
Expand Down
2 changes: 2 additions & 0 deletions source/common/upstream/upstream_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -792,6 +792,7 @@ class ClusterInfoImpl : public ClusterInfo,
ResourceManager& resourceManager(ResourcePriority priority) const override;
TransportSocketMatcher& transportSocketMatcher() const override { return *socket_matcher_; }
ClusterStats& stats() const override { return stats_; }
ClusterConfigUpdateStats& configUpdateStats() const override { return config_update_stats_; }
Stats::Scope& statsScope() const override { return *stats_scope_; }

ClusterRequestResponseSizeStatsOptRef requestResponseSizeStats() const override {
Expand Down Expand Up @@ -906,6 +907,7 @@ class ClusterInfoImpl : public ClusterInfo,
TransportSocketMatcherPtr socket_matcher_;
Stats::ScopeSharedPtr stats_scope_;
mutable ClusterStats stats_;

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so IIRC the game-plan would be to convert this to Thread::AtomicPtr<ClusterStats> stats_ right? And then stats() above would be implemented as stats_.get() ?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

right. more PRs to come.
The idea is that we only increase when an individual field in the sub-group is accessed (ideally, only when it's inc-ed).

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would put the AtomicPtr in this PR, otherwise this PR actually increases memory usage, even in the case where some clusters get no traffic. It will definitely increase memory usage (by a small amount) when all clusters get traffic. But those situations probably don't have a huge number of clusters.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I plan to have another "refactoring" PR before actually introduce the "lazy-init" Struct, to hopefully make the PR easier to read.
Let me know if you would prefer a bigger PR. :P

mutable ClusterConfigUpdateStats config_update_stats_;
Stats::IsolatedStoreImpl load_report_stats_store_;
mutable ClusterLoadReportStats load_report_stats_;
const std::unique_ptr<OptionalClusterStats> optional_cluster_stats_;
Expand Down
16 changes: 8 additions & 8 deletions source/extensions/clusters/redis/redis_cluster.cc
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ void RedisCluster::onClusterSlotUpdate(ClusterSlotsSharedPtr&& slots) {
}));
updateAllHosts(hosts_added, hosts_removed, localityLbEndpoint().priority());
} else {
info_->stats().update_no_rebuild_.inc();
info_->configUpdateStats().update_no_rebuild_.inc();
}

// TODO(hyang): If there is an initialize callback, fire it now. Note that if the
Expand Down Expand Up @@ -190,9 +190,9 @@ void RedisCluster::DnsDiscoveryResolveTarget::startResolveDns() {
ENVOY_LOG(trace, "async DNS resolution complete for {}", dns_address_);
if (status == Network::DnsResolver::ResolutionStatus::Failure || response.empty()) {
if (status == Network::DnsResolver::ResolutionStatus::Failure) {
parent_.info_->stats().update_failure_.inc();
parent_.info_->configUpdateStats().update_failure_.inc();
} else {
parent_.info_->stats().update_empty_.inc();
parent_.info_->configUpdateStats().update_empty_.inc();
}

if (!resolve_timer_) {
Expand Down Expand Up @@ -273,7 +273,7 @@ void RedisCluster::RedisDiscoverySession::registerDiscoveryAddress(
}

void RedisCluster::RedisDiscoverySession::startResolveRedis() {
parent_.info_->stats().update_attempt_.inc();
parent_.info_->configUpdateStats().update_attempt_.inc();
// If a resolution is currently in progress, skip it.
if (current_request_) {
ENVOY_LOG(debug, "redis cluster slot request is already in progress for '{}'",
Expand Down Expand Up @@ -311,9 +311,9 @@ void RedisCluster::RedisDiscoverySession::startResolveRedis() {
void RedisCluster::RedisDiscoverySession::updateDnsStats(
Network::DnsResolver::ResolutionStatus status, bool empty_response) {
if (status == Network::DnsResolver::ResolutionStatus::Failure) {
parent_.info_->stats().update_failure_.inc();
parent_.info_->configUpdateStats().update_failure_.inc();
} else if (empty_response) {
parent_.info_->stats().update_empty_.inc();
parent_.info_->configUpdateStats().update_empty_.inc();
}
}

Expand Down Expand Up @@ -558,7 +558,7 @@ bool RedisCluster::RedisDiscoverySession::validateCluster(
void RedisCluster::RedisDiscoverySession::onUnexpectedResponse(
const NetworkFilters::Common::Redis::RespValuePtr& value) {
ENVOY_LOG(warn, "Unexpected response to cluster slot command: {}", value->toString());
this->parent_.info_->stats().update_failure_.inc();
this->parent_.info_->configUpdateStats().update_failure_.inc();
resolve_timer_->enableTimer(parent_.cluster_refresh_rate_);
}

Expand All @@ -569,7 +569,7 @@ void RedisCluster::RedisDiscoverySession::onFailure() {
auto client_to_delete = client_map_.find(current_host_address_);
client_to_delete->second->client_->close();
}
parent_.info()->stats().update_failure_.inc();
parent_.info()->configUpdateStats().update_failure_.inc();
resolve_timer_->enableTimer(parent_.cluster_refresh_rate_);
}

Expand Down
Loading