From 431a63492945a0a7e9755556a689b57fde2da381 Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Mon, 23 Jul 2018 14:33:01 -0700 Subject: [PATCH 01/38] Add option for coalescing cluster updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change introduces a new configuration parameter for clusters, `time_between_updates`. If this is set, all cluster updates — membership changes, metadata changes on endpoints, or healtcheck state changes — that happen within a `time_between_updates` duration will be merged and delivered at once when the duration ends. This is useful for big clusters (> 1k endpoints) using the subset LB. Partially addresses https://github.com/envoyproxy/envoy/issues/3929. Signed-off-by: Raul Gutierrez Segales --- api/envoy/api/v2/cds.proto | 6 ++ docs/root/intro/version_history.rst | 2 + .../common/upstream/cluster_manager_impl.cc | 59 ++++++++++++++++++- source/common/upstream/cluster_manager_impl.h | 15 +++++ .../upstream/cluster_manager_impl_test.cc | 44 ++++++++++++++ 5 files changed, 124 insertions(+), 2 deletions(-) diff --git a/api/envoy/api/v2/cds.proto b/api/envoy/api/v2/cds.proto index f649dafe0fb63..cdce38dae3269 100644 --- a/api/envoy/api/v2/cds.proto +++ b/api/envoy/api/v2/cds.proto @@ -432,6 +432,12 @@ message Cluster { ZoneAwareLbConfig zone_aware_lb_config = 2; LocalityWeightedLbConfig locality_weighted_lb_config = 3; } + // If set, membership and healthcheck updates that happen within this duration will be coalesced + // and delivered in one shot when the duration expires. The start of the duration is when the + // first update happens. This is useful for big clusters, with potentially noisy deploys that + // might trigger excessive CPU usage due to a constant stream of healthcheck state changes or + // membership updates. + google.protobuf.Duration time_between_updates = 4; } // Common configuration for all load balancer implementations. diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst index 8d1c1e6fba4ac..1a4dbb1a29a0c 100644 --- a/docs/root/intro/version_history.rst +++ b/docs/root/intro/version_history.rst @@ -48,6 +48,8 @@ Version history * upstream: added configuration option to the subset load balancer to take locality weights into account when selecting a host from a subset. * access log: added RESPONSE_DURATION and RESPONSE_TX_DURATION. +* cluster: added :ref:`option ` to coalesce updates + within the given duration. 1.7.0 =============== diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index dd1f042822968..8342c7f280cff 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -180,7 +180,7 @@ ClusterManagerImpl::ClusterManagerImpl(const envoy::config::bootstrap::v2::Boots init_helper_([this](Cluster& cluster) { onClusterInit(cluster); }), config_tracker_entry_( admin.getConfigTracker().add("clusters", [this] { return dumpClusterConfigs(); })), - system_time_source_(system_time_source) { + system_time_source_(system_time_source), dispatcher_(main_thread_dispatcher) { async_client_manager_ = std::make_unique(*this, tls); const auto& cm_config = bootstrap.cluster_manager(); if (cm_config.has_outlier_detection()) { @@ -330,7 +330,46 @@ void ClusterManagerImpl::onClusterInit(Cluster& cluster) { const HostVector& hosts_removed) { // This fires when a cluster is about to have an updated member set. We need to send this // out to all of the thread local configurations. - postThreadLocalClusterUpdate(cluster, priority, hosts_added, hosts_removed); + + // Should we coalesce updates? + if (cluster.info()->lbConfig().has_time_between_updates()) { + PendingUpdatesByPriorityMapPtr updates_by_prio; + PendingUpdatesPtr updates; + + // Find pending updates for this cluster. + auto updates_by_prio_it = updates_map_.find(cluster.info()->name()); + if (updates_by_prio_it != updates_map_.end()) { + updates_by_prio = updates_by_prio_it->second; + } else { + updates_by_prio = std::make_shared(); + updates_map_[cluster.info()->name()] = updates_by_prio; + } + + // Find pending updates for this priority. + auto updates_it = updates_by_prio->find(priority); + if (updates_it != updates_by_prio->end()) { + updates = updates_it->second; + } else { + updates = std::make_shared(); + (*updates_by_prio)[priority] = updates; + } + + // Record the updates that should be applied when the timer fires. + updates->added.insert(hosts_added.begin(), hosts_added.end()); + updates->removed.insert(hosts_removed.begin(), hosts_removed.end()); + + // If there's no timer, create one. + if (updates->timer == nullptr) { + updates->timer = dispatcher_.createTimer([this, &cluster, priority, &updates]() -> void { + applyUpdates(cluster, priority, updates); + }); + const auto& time_between_updates = cluster.info()->lbConfig().time_between_updates(); + const auto timeout = DurationUtil::durationToMilliseconds(time_between_updates); + updates->timer->enableTimer(std::chrono::milliseconds(timeout)); + } + } else { + postThreadLocalClusterUpdate(cluster, priority, hosts_added, hosts_removed); + } }); // Finally, if the cluster has any hosts, post updates cross-thread so the per-thread load @@ -343,6 +382,22 @@ void ClusterManagerImpl::onClusterInit(Cluster& cluster) { } } +void ClusterManagerImpl::applyUpdates(const Cluster& cluster, uint32_t priority, + PendingUpdatesPtr updates) { + // Merge pending updates & deliver. + const HostVector& hosts_added{updates->added.begin(), updates->added.end()}; + const HostVector& hosts_removed{updates->removed.begin(), updates->removed.end()}; + + postThreadLocalClusterUpdate(cluster, priority, hosts_added, hosts_removed); + + cm_stats_.coalesced_updates_.inc(); + + // Reset everything. + updates->timer = nullptr; + updates->added.clear(); + updates->removed.clear(); +} + bool ClusterManagerImpl::addOrUpdateCluster(const envoy::api::v2::Cluster& cluster, const std::string& version_info) { // First we need to see if this new config is new or an update to an existing dynamic cluster. diff --git a/source/common/upstream/cluster_manager_impl.h b/source/common/upstream/cluster_manager_impl.h index f65e27ccb2825..94b974b6b2800 100644 --- a/source/common/upstream/cluster_manager_impl.h +++ b/source/common/upstream/cluster_manager_impl.h @@ -138,6 +138,7 @@ class ClusterManagerInitHelper : Logger::Loggable { COUNTER(cluster_added) \ COUNTER(cluster_modified) \ COUNTER(cluster_removed) \ + COUNTER(coalesced_updates) \ GAUGE (active_clusters) \ GAUGE (warming_clusters) // clang-format on @@ -361,6 +362,18 @@ class ClusterManagerImpl : public ClusterManager, Logger::Loggable ClusterMap; + struct PendingUpdates { + PendingUpdates() {} + Event::TimerPtr timer; + std::unordered_set added; + std::unordered_set removed; + }; + typedef std::shared_ptr PendingUpdatesPtr; + typedef std::unordered_map PendingUpdatesByPriorityMap; + typedef std::shared_ptr PendingUpdatesByPriorityMapPtr; + typedef std::unordered_map ClusterUpdatesMap; + + void applyUpdates(const Cluster& cluster, uint32_t priority, PendingUpdatesPtr updates); void createOrUpdateThreadLocalCluster(ClusterData& cluster); ProtobufTypes::MessagePtr dumpClusterConfigs(); static ClusterManagerStats generateStats(Stats::Scope& scope); @@ -394,6 +407,8 @@ class ClusterManagerImpl : public ClusterManager, Logger::Loggableget("cluster_1")->info()->addedViaApi()); + + // Save the updates timer. + Event::MockTimer* timer = new NiceMock(&factory_.dispatcher_); + + // Remove each host, sequentially. + const Cluster& cluster = cluster_manager_->clusters().begin()->second; + + HostVectorSharedPtr hosts( + new HostVector(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())); + HostsPerLocalitySharedPtr hosts_per_locality = std::make_shared(); + HostVector hosts_added{}; + HostVector hosts_removed_0{(*hosts)[0]}; + HostVector hosts_removed_1{(*hosts)[1]}; + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed_0); + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed_1); + + // Ensure the coalesced updates were applied. + timer->callback_(); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.coalesced_updates").value()); +} + class ClusterManagerInitHelperTest : public testing::Test { public: MOCK_METHOD1(onClusterInit, void(Cluster& cluster)); From 8e202048e9cc2ba18129dae91f8eef8074bbfbad Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Tue, 24 Jul 2018 12:43:19 -0700 Subject: [PATCH 02/38] Address review * move coalesced update logic into its own method * extend the unit test to exercise a follow-up delivery of updates Signed-off-by: Raul Gutierrez Segales --- .../common/upstream/cluster_manager_impl.cc | 74 ++++++++++--------- source/common/upstream/cluster_manager_impl.h | 2 + .../upstream/cluster_manager_impl_test.cc | 13 ++++ 3 files changed, 55 insertions(+), 34 deletions(-) diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index 8342c7f280cff..1ef20f8cb3520 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -333,40 +333,7 @@ void ClusterManagerImpl::onClusterInit(Cluster& cluster) { // Should we coalesce updates? if (cluster.info()->lbConfig().has_time_between_updates()) { - PendingUpdatesByPriorityMapPtr updates_by_prio; - PendingUpdatesPtr updates; - - // Find pending updates for this cluster. - auto updates_by_prio_it = updates_map_.find(cluster.info()->name()); - if (updates_by_prio_it != updates_map_.end()) { - updates_by_prio = updates_by_prio_it->second; - } else { - updates_by_prio = std::make_shared(); - updates_map_[cluster.info()->name()] = updates_by_prio; - } - - // Find pending updates for this priority. - auto updates_it = updates_by_prio->find(priority); - if (updates_it != updates_by_prio->end()) { - updates = updates_it->second; - } else { - updates = std::make_shared(); - (*updates_by_prio)[priority] = updates; - } - - // Record the updates that should be applied when the timer fires. - updates->added.insert(hosts_added.begin(), hosts_added.end()); - updates->removed.insert(hosts_removed.begin(), hosts_removed.end()); - - // If there's no timer, create one. - if (updates->timer == nullptr) { - updates->timer = dispatcher_.createTimer([this, &cluster, priority, &updates]() -> void { - applyUpdates(cluster, priority, updates); - }); - const auto& time_between_updates = cluster.info()->lbConfig().time_between_updates(); - const auto timeout = DurationUtil::durationToMilliseconds(time_between_updates); - updates->timer->enableTimer(std::chrono::milliseconds(timeout)); - } + scheduleUpdate(cluster, priority, hosts_added, hosts_removed); } else { postThreadLocalClusterUpdate(cluster, priority, hosts_added, hosts_removed); } @@ -382,6 +349,45 @@ void ClusterManagerImpl::onClusterInit(Cluster& cluster) { } } +void ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priority, + const HostVector& hosts_added, + const HostVector& hosts_removed) { + PendingUpdatesByPriorityMapPtr updates_by_prio; + PendingUpdatesPtr updates; + + // Find pending updates for this cluster. + auto updates_by_prio_it = updates_map_.find(cluster.info()->name()); + if (updates_by_prio_it != updates_map_.end()) { + updates_by_prio = updates_by_prio_it->second; + } else { + updates_by_prio = std::make_shared(); + updates_map_[cluster.info()->name()] = updates_by_prio; + } + + // Find pending updates for this priority. + auto updates_it = updates_by_prio->find(priority); + if (updates_it != updates_by_prio->end()) { + updates = updates_it->second; + } else { + updates = std::make_shared(); + (*updates_by_prio)[priority] = updates; + } + + // Record the updates that should be applied when the timer fires. + updates->added.insert(hosts_added.begin(), hosts_added.end()); + updates->removed.insert(hosts_removed.begin(), hosts_removed.end()); + + // If there's no timer, create one. + if (updates->timer == nullptr) { + updates->timer = dispatcher_.createTimer([this, &cluster, priority, &updates]() -> void { + applyUpdates(cluster, priority, updates); + }); + const auto& time_between_updates = cluster.info()->lbConfig().time_between_updates(); + const auto timeout = DurationUtil::durationToMilliseconds(time_between_updates); + updates->timer->enableTimer(std::chrono::milliseconds(timeout)); + } +} + void ClusterManagerImpl::applyUpdates(const Cluster& cluster, uint32_t priority, PendingUpdatesPtr updates) { // Merge pending updates & deliver. diff --git a/source/common/upstream/cluster_manager_impl.h b/source/common/upstream/cluster_manager_impl.h index 94b974b6b2800..a0520c21e53c5 100644 --- a/source/common/upstream/cluster_manager_impl.h +++ b/source/common/upstream/cluster_manager_impl.h @@ -374,6 +374,8 @@ class ClusterManagerImpl : public ClusterManager, Logger::Loggable ClusterUpdatesMap; void applyUpdates(const Cluster& cluster, uint32_t priority, PendingUpdatesPtr updates); + void scheduleUpdate(const Cluster& cluster, uint32_t priority, const HostVector& hosts_added, + const HostVector& hosts_removed); void createOrUpdateThreadLocalCluster(ClusterData& cluster); ProtobufTypes::MessagePtr dumpClusterConfigs(); static ClusterManagerStats generateStats(Stats::Scope& scope); diff --git a/test/common/upstream/cluster_manager_impl_test.cc b/test/common/upstream/cluster_manager_impl_test.cc index 9a7ec8f3dbe0e..6d4479f9e0798 100644 --- a/test/common/upstream/cluster_manager_impl_test.cc +++ b/test/common/upstream/cluster_manager_impl_test.cc @@ -1662,6 +1662,19 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdates) { // Ensure the coalesced updates were applied. timer->callback_(); EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.coalesced_updates").value()); + + // Prepare a new timer. + timer = new NiceMock(&factory_.dispatcher_); + + // Add them back. + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_removed_0, hosts_added); + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_removed_1, hosts_added); + + // Ensure the coalesced updates were applied again. + timer->callback_(); + EXPECT_EQ(2, factory_.stats_.counter("cluster_manager.coalesced_updates").value()); } class ClusterManagerInitHelperTest : public testing::Test { From c6dbc7d586e1ecd96e53740fcec9dd1c0ceb5fde Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Tue, 24 Jul 2018 16:07:42 -0700 Subject: [PATCH 03/38] Address more review comments * fix documentation * rename config knob to `update_merge_window` * apply updates immediately when outside of window Signed-off-by: Raul Gutierrez Segales --- api/envoy/api/v2/cds.proto | 5 ++-- .../common/upstream/cluster_manager_impl.cc | 28 +++++++++++++++---- source/common/upstream/cluster_manager_impl.h | 3 +- .../upstream/cluster_manager_impl_test.cc | 14 ++++++---- 4 files changed, 36 insertions(+), 14 deletions(-) diff --git a/api/envoy/api/v2/cds.proto b/api/envoy/api/v2/cds.proto index cdce38dae3269..5aa4220ee08a4 100644 --- a/api/envoy/api/v2/cds.proto +++ b/api/envoy/api/v2/cds.proto @@ -436,8 +436,9 @@ message Cluster { // and delivered in one shot when the duration expires. The start of the duration is when the // first update happens. This is useful for big clusters, with potentially noisy deploys that // might trigger excessive CPU usage due to a constant stream of healthcheck state changes or - // membership updates. - google.protobuf.Duration time_between_updates = 4; + // membership updates. By default, this is not configured and updates apply immediately. + // Also, the first set of updates to be seen apply immediately as well (e.g.: a new cluster). + google.protobuf.Duration update_merge_window = 4; } // Common configuration for all load balancer implementations. diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index 1ef20f8cb3520..934da88722b97 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -332,9 +332,13 @@ void ClusterManagerImpl::onClusterInit(Cluster& cluster) { // out to all of the thread local configurations. // Should we coalesce updates? - if (cluster.info()->lbConfig().has_time_between_updates()) { - scheduleUpdate(cluster, priority, hosts_added, hosts_removed); - } else { + bool scheduled = false; + if (cluster.info()->lbConfig().has_update_merge_window()) { + scheduled = scheduleUpdate(cluster, priority, hosts_added, hosts_removed); + } + + // If an update was not scheduled, deliver it immediately. + if (!scheduled) { postThreadLocalClusterUpdate(cluster, priority, hosts_added, hosts_removed); } }); @@ -349,9 +353,11 @@ void ClusterManagerImpl::onClusterInit(Cluster& cluster) { } } -void ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priority, +bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priority, const HostVector& hosts_added, const HostVector& hosts_removed) { + const auto& update_merge_window = cluster.info()->lbConfig().update_merge_window(); + const auto timeout = DurationUtil::durationToMilliseconds(update_merge_window); PendingUpdatesByPriorityMapPtr updates_by_prio; PendingUpdatesPtr updates; @@ -373,6 +379,15 @@ void ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit (*updates_by_prio)[priority] = updates; } + // Has an update_merge_window gone by since the last update? If so, don't schedule + // the update so it can be applied immediately. + auto delta = std::chrono::steady_clock::now() - updates->last_updated; + uint64_t delta_ms = std::chrono::duration_cast(delta).count(); + if (delta_ms > timeout) { + updates->last_updated = std::chrono::steady_clock::now(); + return false; + } + // Record the updates that should be applied when the timer fires. updates->added.insert(hosts_added.begin(), hosts_added.end()); updates->removed.insert(hosts_removed.begin(), hosts_removed.end()); @@ -382,10 +397,10 @@ void ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit updates->timer = dispatcher_.createTimer([this, &cluster, priority, &updates]() -> void { applyUpdates(cluster, priority, updates); }); - const auto& time_between_updates = cluster.info()->lbConfig().time_between_updates(); - const auto timeout = DurationUtil::durationToMilliseconds(time_between_updates); updates->timer->enableTimer(std::chrono::milliseconds(timeout)); } + + return true; } void ClusterManagerImpl::applyUpdates(const Cluster& cluster, uint32_t priority, @@ -402,6 +417,7 @@ void ClusterManagerImpl::applyUpdates(const Cluster& cluster, uint32_t priority, updates->timer = nullptr; updates->added.clear(); updates->removed.clear(); + updates->last_updated = std::chrono::steady_clock::now(); } bool ClusterManagerImpl::addOrUpdateCluster(const envoy::api::v2::Cluster& cluster, diff --git a/source/common/upstream/cluster_manager_impl.h b/source/common/upstream/cluster_manager_impl.h index a0520c21e53c5..04e21fbd0954b 100644 --- a/source/common/upstream/cluster_manager_impl.h +++ b/source/common/upstream/cluster_manager_impl.h @@ -367,6 +367,7 @@ class ClusterManagerImpl : public ClusterManager, Logger::Loggable added; std::unordered_set removed; + MonotonicTime last_updated; }; typedef std::shared_ptr PendingUpdatesPtr; typedef std::unordered_map PendingUpdatesByPriorityMap; @@ -374,7 +375,7 @@ class ClusterManagerImpl : public ClusterManager, Logger::Loggable ClusterUpdatesMap; void applyUpdates(const Cluster& cluster, uint32_t priority, PendingUpdatesPtr updates); - void scheduleUpdate(const Cluster& cluster, uint32_t priority, const HostVector& hosts_added, + bool scheduleUpdate(const Cluster& cluster, uint32_t priority, const HostVector& hosts_added, const HostVector& hosts_removed); void createOrUpdateThreadLocalCluster(ClusterData& cluster); ProtobufTypes::MessagePtr dumpClusterConfigs(); diff --git a/test/common/upstream/cluster_manager_impl_test.cc b/test/common/upstream/cluster_manager_impl_test.cc index 6d4479f9e0798..4810b7ff422ab 100644 --- a/test/common/upstream/cluster_manager_impl_test.cc +++ b/test/common/upstream/cluster_manager_impl_test.cc @@ -1636,15 +1636,12 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdates) { address: "127.0.0.1" port_value: 11002 common_lb_config: - time_between_updates: 3s + update_merge_window: 3s )EOF"; create(parseBootstrapFromV2Yaml(yaml)); EXPECT_FALSE(cluster_manager_->get("cluster_1")->info()->addedViaApi()); - // Save the updates timer. - Event::MockTimer* timer = new NiceMock(&factory_.dispatcher_); - // Remove each host, sequentially. const Cluster& cluster = cluster_manager_->clusters().begin()->second; @@ -1654,8 +1651,15 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdates) { HostVector hosts_added{}; HostVector hosts_removed_0{(*hosts)[0]}; HostVector hosts_removed_1{(*hosts)[1]}; + + // No timer needs to be mocked at this point, since the first update should be applied + // immediately. cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed_0); + + // Now we need the timer to be ready, since createTimer() will be call given that this update + // _will_ be delayed. + Event::MockTimer* timer = new NiceMock(&factory_.dispatcher_); cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed_1); @@ -1663,7 +1667,7 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdates) { timer->callback_(); EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.coalesced_updates").value()); - // Prepare a new timer. + // Prepare a new timer for the next round of updates. timer = new NiceMock(&factory_.dispatcher_); // Add them back. From 9e3288e4470aafabb57fa5c4ea4d966368eb8f6b Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Tue, 24 Jul 2018 17:17:39 -0700 Subject: [PATCH 04/38] Fix docs Signed-off-by: Raul Gutierrez Segales --- docs/root/intro/version_history.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst index 1a4dbb1a29a0c..ab735c51b9826 100644 --- a/docs/root/intro/version_history.rst +++ b/docs/root/intro/version_history.rst @@ -48,7 +48,7 @@ Version history * upstream: added configuration option to the subset load balancer to take locality weights into account when selecting a host from a subset. * access log: added RESPONSE_DURATION and RESPONSE_TX_DURATION. -* cluster: added :ref:`option ` to coalesce updates +* cluster: added :ref:`option ` to coalesce updates within the given duration. 1.7.0 From add79b21bc0026f6bb2839a512ff229a037275d3 Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Tue, 24 Jul 2018 17:49:34 -0700 Subject: [PATCH 05/38] Preserve order when merging updates Signed-off-by: Raul Gutierrez Segales --- .../common/upstream/cluster_manager_impl.cc | 25 ++++++++++++------- source/common/upstream/cluster_manager_impl.h | 6 +++-- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index 934da88722b97..ac941fcd5e6fb 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -388,9 +388,18 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit return false; } - // Record the updates that should be applied when the timer fires. - updates->added.insert(hosts_added.begin(), hosts_added.end()); - updates->removed.insert(hosts_removed.begin(), hosts_removed.end()); + // Record the updates — preserving order — that should be applied when the timer fires. + for (auto host : hosts_added) { + if (updates->added_seen.count(host) != 1) { + updates->added.push_back(host); + } + } + + for (auto host : hosts_removed) { + if (updates->removed_seen.count(host) != 1) { + updates->removed.push_back(host); + } + } // If there's no timer, create one. if (updates->timer == nullptr) { @@ -405,18 +414,16 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit void ClusterManagerImpl::applyUpdates(const Cluster& cluster, uint32_t priority, PendingUpdatesPtr updates) { - // Merge pending updates & deliver. - const HostVector& hosts_added{updates->added.begin(), updates->added.end()}; - const HostVector& hosts_removed{updates->removed.begin(), updates->removed.end()}; - - postThreadLocalClusterUpdate(cluster, priority, hosts_added, hosts_removed); - + // Deliver pending updates. + postThreadLocalClusterUpdate(cluster, priority, updates->added, updates->removed); cm_stats_.coalesced_updates_.inc(); // Reset everything. updates->timer = nullptr; updates->added.clear(); updates->removed.clear(); + updates->added_seen.clear(); + updates->removed_seen.clear(); updates->last_updated = std::chrono::steady_clock::now(); } diff --git a/source/common/upstream/cluster_manager_impl.h b/source/common/upstream/cluster_manager_impl.h index 04e21fbd0954b..2ea1e9d1ed2ee 100644 --- a/source/common/upstream/cluster_manager_impl.h +++ b/source/common/upstream/cluster_manager_impl.h @@ -365,8 +365,10 @@ class ClusterManagerImpl : public ClusterManager, Logger::Loggable added; - std::unordered_set removed; + HostVector added; + HostVector removed; + std::unordered_set added_seen; + std::unordered_set removed_seen; MonotonicTime last_updated; }; typedef std::shared_ptr PendingUpdatesPtr; From 29f1d3d37f3493cc02e0e4b3107651346c0b06d6 Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Tue, 24 Jul 2018 23:13:34 -0700 Subject: [PATCH 06/38] Address @htuch's review * use const when possible * use using instead of typedef * drop ordering of add/remove insertions * handle cancelled pairs of (add, remove) * remove unneeded struct constructor Will follow-up in another commit tomorrow with: * reuse timer instead of destroying/reallocating * add test for host added/removed/added Signed-off-by: Raul Gutierrez Segales --- .../common/upstream/cluster_manager_impl.cc | 33 +++++++++++-------- source/common/upstream/cluster_manager_impl.h | 15 ++++----- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index ac941fcd5e6fb..a6c717126143d 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -362,7 +362,7 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit PendingUpdatesPtr updates; // Find pending updates for this cluster. - auto updates_by_prio_it = updates_map_.find(cluster.info()->name()); + const auto updates_by_prio_it = updates_map_.find(cluster.info()->name()); if (updates_by_prio_it != updates_map_.end()) { updates_by_prio = updates_by_prio_it->second; } else { @@ -371,7 +371,7 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit } // Find pending updates for this priority. - auto updates_it = updates_by_prio->find(priority); + const auto updates_it = updates_by_prio->find(priority); if (updates_it != updates_by_prio->end()) { updates = updates_it->second; } else { @@ -381,23 +381,28 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit // Has an update_merge_window gone by since the last update? If so, don't schedule // the update so it can be applied immediately. - auto delta = std::chrono::steady_clock::now() - updates->last_updated; - uint64_t delta_ms = std::chrono::duration_cast(delta).count(); + const auto delta = std::chrono::steady_clock::now() - updates->last_updated; + const uint64_t delta_ms = std::chrono::duration_cast(delta).count(); if (delta_ms > timeout) { updates->last_updated = std::chrono::steady_clock::now(); return false; } - // Record the updates — preserving order — that should be applied when the timer fires. - for (auto host : hosts_added) { - if (updates->added_seen.count(host) != 1) { - updates->added.push_back(host); + // Record the updates that should be applied when the timer fires. + // Handle the case when a pair of add/remove cancels out. + for (const auto& host : hosts_added) { + if (updates->removed.count(host) == 1) { + updates->removed.erase(host); + } else { + updates->added.insert(host); } } - for (auto host : hosts_removed) { - if (updates->removed_seen.count(host) != 1) { - updates->removed.push_back(host); + for (const auto& host : hosts_removed) { + if (updates->added.count(host) == 1) { + updates->added.erase(host); + } else { + updates->removed.insert(host); } } @@ -415,15 +420,15 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit void ClusterManagerImpl::applyUpdates(const Cluster& cluster, uint32_t priority, PendingUpdatesPtr updates) { // Deliver pending updates. - postThreadLocalClusterUpdate(cluster, priority, updates->added, updates->removed); + const HostVector& hosts_added{updates->added.begin(), updates->added.end()}; + const HostVector& hosts_removed{updates->removed.begin(), updates->removed.end()}; + postThreadLocalClusterUpdate(cluster, priority, hosts_added, hosts_removed); cm_stats_.coalesced_updates_.inc(); // Reset everything. updates->timer = nullptr; updates->added.clear(); updates->removed.clear(); - updates->added_seen.clear(); - updates->removed_seen.clear(); updates->last_updated = std::chrono::steady_clock::now(); } diff --git a/source/common/upstream/cluster_manager_impl.h b/source/common/upstream/cluster_manager_impl.h index 2ea1e9d1ed2ee..f2d09082bd6eb 100644 --- a/source/common/upstream/cluster_manager_impl.h +++ b/source/common/upstream/cluster_manager_impl.h @@ -363,18 +363,15 @@ class ClusterManagerImpl : public ClusterManager, Logger::Loggable ClusterMap; struct PendingUpdates { - PendingUpdates() {} Event::TimerPtr timer; - HostVector added; - HostVector removed; - std::unordered_set added_seen; - std::unordered_set removed_seen; + std::set added; + std::set removed; MonotonicTime last_updated; }; - typedef std::shared_ptr PendingUpdatesPtr; - typedef std::unordered_map PendingUpdatesByPriorityMap; - typedef std::shared_ptr PendingUpdatesByPriorityMapPtr; - typedef std::unordered_map ClusterUpdatesMap; + using PendingUpdatesPtr = std::shared_ptr; + using PendingUpdatesByPriorityMap = std::unordered_map; + using PendingUpdatesByPriorityMapPtr = std::shared_ptr; + using ClusterUpdatesMap = std::unordered_map; void applyUpdates(const Cluster& cluster, uint32_t priority, PendingUpdatesPtr updates); bool scheduleUpdate(const Cluster& cluster, uint32_t priority, const HostVector& hosts_added, From 0dd31c4cc2f37279de63b119764395668c630c54 Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Tue, 24 Jul 2018 23:38:10 -0700 Subject: [PATCH 07/38] Skip immediate apply if there's an active timer Signed-off-by: Raul Gutierrez Segales --- source/common/upstream/cluster_manager_impl.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index a6c717126143d..6120ff0f4fbea 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -380,10 +380,11 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit } // Has an update_merge_window gone by since the last update? If so, don't schedule - // the update so it can be applied immediately. + // the update so it can be applied immediately as long as there's no active timer + // (to avoid changing the order of updates). const auto delta = std::chrono::steady_clock::now() - updates->last_updated; const uint64_t delta_ms = std::chrono::duration_cast(delta).count(); - if (delta_ms > timeout) { + if (delta_ms > timeout && updates->timer == nullptr) { updates->last_updated = std::chrono::steady_clock::now(); return false; } From 0bd58e636ba8458df3e51f2e8ddcec89aae2df68 Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Wed, 25 Jul 2018 13:57:56 -0700 Subject: [PATCH 08/38] Fix a few more issues pointed out by @htuch * track host by address, not by its shared_ptr * ensure we always publish updates with the latest copy of the host Signed-off-by: Raul Gutierrez Segales --- .../common/upstream/cluster_manager_impl.cc | 53 ++++++++++++++++--- source/common/upstream/cluster_manager_impl.h | 7 ++- 2 files changed, 50 insertions(+), 10 deletions(-) diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index 6120ff0f4fbea..bf5ed9c098efd 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -391,19 +391,48 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit // Record the updates that should be applied when the timer fires. // Handle the case when a pair of add/remove cancels out. + // + // A few notes: + // * if a host's health changed, we won't see that as an add or remove + // so there's no need to check for health changes here. The fact that + // we got called with no added/removed hosts implies a health or metadata + // change, so we just forward the empty vectors along. + // * if while we are merging updates we see a host multiple times (e.g.: + // add -> remove -> add), we deal with the cancellations and we always + // propagate the latest version of that host. + // * we track hosts by their address since the Host ptr will most likely change + // as it gets materialized out of an EDS change, and an address is considered + // the unique constant identifier for a host. for (const auto& host : hosts_added) { - if (updates->removed.count(host) == 1) { - updates->removed.erase(host); + const auto& addr = host->address()->asString(); + // Is adding this host cancelling a previous remove? + if (updates->removed.count(addr) == 1) { + updates->removed.erase(addr); + + // If it was previously added, store the latest one. + if (updates->added.count(addr) == 1) { + updates->added[addr] = host; + } } else { - updates->added.insert(host); + // No previous remove, just store. + updates->added[addr] = host; } } + // Same dance as above. for (const auto& host : hosts_removed) { - if (updates->added.count(host) == 1) { - updates->added.erase(host); + const auto& addr = host->address()->asString(); + // Is removing this host cancelling a previous add? + if (updates->added.count(addr) == 1) { + updates->added.erase(addr); + + // If it was previously removed, store the latest one. + if (updates->removed.count(addr) == 1) { + updates->removed[addr] = host; + } } else { - updates->removed.insert(host); + // No previous add, just store. + updates->removed[addr] = host; } } @@ -421,8 +450,8 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit void ClusterManagerImpl::applyUpdates(const Cluster& cluster, uint32_t priority, PendingUpdatesPtr updates) { // Deliver pending updates. - const HostVector& hosts_added{updates->added.begin(), updates->added.end()}; - const HostVector& hosts_removed{updates->removed.begin(), updates->removed.end()}; + const HostVector& hosts_added = fromMap(updates->added); + const HostVector& hosts_removed = fromMap(updates->removed); postThreadLocalClusterUpdate(cluster, priority, hosts_added, hosts_removed); cm_stats_.coalesced_updates_.inc(); @@ -433,6 +462,14 @@ void ClusterManagerImpl::applyUpdates(const Cluster& cluster, uint32_t priority, updates->last_updated = std::chrono::steady_clock::now(); } +HostVector ClusterManagerImpl::fromMap(HostMap map) const { + HostVector hosts; + for (const auto& host_pair : map) { + hosts.push_back(host_pair.second); + } + return hosts; +} + bool ClusterManagerImpl::addOrUpdateCluster(const envoy::api::v2::Cluster& cluster, const std::string& version_info) { // First we need to see if this new config is new or an update to an existing dynamic cluster. diff --git a/source/common/upstream/cluster_manager_impl.h b/source/common/upstream/cluster_manager_impl.h index f2d09082bd6eb..1982cebbcd87b 100644 --- a/source/common/upstream/cluster_manager_impl.h +++ b/source/common/upstream/cluster_manager_impl.h @@ -362,10 +362,12 @@ class ClusterManagerImpl : public ClusterManager, Logger::Loggable ClusterMap; + // Track hosts by their address (as a string). + using HostMap = std::map; struct PendingUpdates { Event::TimerPtr timer; - std::set added; - std::set removed; + HostMap added; + HostMap removed; MonotonicTime last_updated; }; using PendingUpdatesPtr = std::shared_ptr; @@ -376,6 +378,7 @@ class ClusterManagerImpl : public ClusterManager, Logger::Loggable Date: Wed, 25 Jul 2018 14:01:29 -0700 Subject: [PATCH 09/38] Improve existing test Ensure we are calling `postThreadLocalClusterUpdate` at the right time with the right values. I'll follow-up with tests that exercise as many updates combinations as possible: * added/removed/added * unhealthy/healthy/added/removed/added * etc. Just wanted to get the basic helpers in place. Signed-off-by: Raul Gutierrez Segales --- source/common/upstream/cluster_manager_impl.h | 7 +- .../upstream/cluster_manager_impl_test.cc | 71 ++++++++++++++++++- 2 files changed, 75 insertions(+), 3 deletions(-) diff --git a/source/common/upstream/cluster_manager_impl.h b/source/common/upstream/cluster_manager_impl.h index 1982cebbcd87b..d921c359d8543 100644 --- a/source/common/upstream/cluster_manager_impl.h +++ b/source/common/upstream/cluster_manager_impl.h @@ -210,6 +210,11 @@ class ClusterManagerImpl : public ClusterManager, Logger::Loggable admin_; NiceMock system_time_source_; NiceMock monotonic_time_source_; + MockLocalClusterUpdate local_cluster_update_; }; envoy::config::bootstrap::v2::Bootstrap parseBootstrapFromJson(const std::string& json_string) { @@ -1639,7 +1681,7 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdates) { update_merge_window: 3s )EOF"; - create(parseBootstrapFromV2Yaml(yaml)); + createWithLocalClusterUpdate(parseBootstrapFromV2Yaml(yaml)); EXPECT_FALSE(cluster_manager_->get("cluster_1")->info()->addedViaApi()); // Remove each host, sequentially. @@ -1652,6 +1694,33 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdates) { HostVector hosts_removed_0{(*hosts)[0]}; HostVector hosts_removed_1{(*hosts)[1]}; + // Ensure we see the right set of added/removed hosts on every call. + EXPECT_CALL(local_cluster_update_, post(_, _, _)) + .WillRepeatedly(Invoke([](uint32_t priority, const HostVector& hosts_added, + const HostVector& hosts_removed) -> void { + static int call = 0; + + switch (call) { + case 0: + EXPECT_EQ(0, priority); + EXPECT_EQ(0, hosts_added.size()); + EXPECT_EQ(1, hosts_removed.size()); + break; + case 1: + EXPECT_EQ(0, priority); + EXPECT_EQ(0, hosts_added.size()); + EXPECT_EQ(1, hosts_removed.size()); + break; + case 2: + EXPECT_EQ(0, priority); + EXPECT_EQ(2, hosts_added.size()); + EXPECT_EQ(0, hosts_removed.size()); + break; + } + + ++call; + })); + // No timer needs to be mocked at this point, since the first update should be applied // immediately. cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( From ec643f1e18db17d88271454f9f93b006ed9e49ba Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Wed, 25 Jul 2018 15:19:10 -0700 Subject: [PATCH 10/38] Move EXPECT_CALL() to the top for clarity Signed-off-by: Raul Gutierrez Segales --- .../upstream/cluster_manager_impl_test.cc | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/test/common/upstream/cluster_manager_impl_test.cc b/test/common/upstream/cluster_manager_impl_test.cc index a78287c716e3c..68716afc0a49e 100644 --- a/test/common/upstream/cluster_manager_impl_test.cc +++ b/test/common/upstream/cluster_manager_impl_test.cc @@ -1684,16 +1684,6 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdates) { createWithLocalClusterUpdate(parseBootstrapFromV2Yaml(yaml)); EXPECT_FALSE(cluster_manager_->get("cluster_1")->info()->addedViaApi()); - // Remove each host, sequentially. - const Cluster& cluster = cluster_manager_->clusters().begin()->second; - - HostVectorSharedPtr hosts( - new HostVector(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())); - HostsPerLocalitySharedPtr hosts_per_locality = std::make_shared(); - HostVector hosts_added{}; - HostVector hosts_removed_0{(*hosts)[0]}; - HostVector hosts_removed_1{(*hosts)[1]}; - // Ensure we see the right set of added/removed hosts on every call. EXPECT_CALL(local_cluster_update_, post(_, _, _)) .WillRepeatedly(Invoke([](uint32_t priority, const HostVector& hosts_added, @@ -1721,6 +1711,15 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdates) { ++call; })); + // Remove each host, sequentially. + const Cluster& cluster = cluster_manager_->clusters().begin()->second; + HostVectorSharedPtr hosts( + new HostVector(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())); + HostsPerLocalitySharedPtr hosts_per_locality = std::make_shared(); + HostVector hosts_added{}; + HostVector hosts_removed_0{(*hosts)[0]}; + HostVector hosts_removed_1{(*hosts)[1]}; + // No timer needs to be mocked at this point, since the first update should be applied // immediately. cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( From d05ef9938956a3007ba493c3d3e4a168d76602c2 Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Wed, 25 Jul 2018 15:39:37 -0700 Subject: [PATCH 11/38] More tests * add/remove/add -> add * remove/add/remove -> remove * existing host changes multiple times (e.g.: failed/active/failed -> failed) * metadata/metadata/metadata -> latest metadata * weight/weight/weight -> latest weight Signed-off-by: Raul Gutierrez Segales --- .../upstream/cluster_manager_impl_test.cc | 462 ++++++++++++++++++ 1 file changed, 462 insertions(+) diff --git a/test/common/upstream/cluster_manager_impl_test.cc b/test/common/upstream/cluster_manager_impl_test.cc index 68716afc0a49e..87aca05d56fc5 100644 --- a/test/common/upstream/cluster_manager_impl_test.cc +++ b/test/common/upstream/cluster_manager_impl_test.cc @@ -195,6 +195,18 @@ class ClusterManagerImplTest : public testing::Test { EXPECT_EQ(expected_clusters_config_dump.DebugString(), clusters_config_dump.DebugString()); } + envoy::api::v2::core::Metadata buildMetadata(const std::string& version) const { + envoy::api::v2::core::Metadata metadata; + + if (version != "") { + Envoy::Config::Metadata::mutableMetadataValue( + metadata, Config::MetadataFilters::get().ENVOY_LB, "version") + .set_string_value(version); + } + + return metadata; + } + NiceMock factory_; std::unique_ptr cluster_manager_; AccessLog::MockAccessLogManager log_manager_; @@ -1749,6 +1761,456 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdates) { EXPECT_EQ(2, factory_.stats_.counter("cluster_manager.coalesced_updates").value()); } +TEST_F(ClusterManagerImplTest, CoalescedUpdatesAddRemoveAdd) { + const std::string yaml = R"EOF( + static_resources: + clusters: + - name: cluster_1 + connect_timeout: 0.250s + type: STATIC + lb_policy: ROUND_ROBIN + hosts: + - socket_address: + address: "127.0.0.1" + port_value: 11001 + - socket_address: + address: "127.0.0.1" + port_value: 11002 + common_lb_config: + update_merge_window: 3s + )EOF"; + + createWithLocalClusterUpdate(parseBootstrapFromV2Yaml(yaml)); + EXPECT_FALSE(cluster_manager_->get("cluster_1")->info()->addedViaApi()); + + // Ensure we see the right set of added/removed hosts on every call. + uint32_t calls = 0; + EXPECT_CALL(local_cluster_update_, post(_, _, _)) + .WillRepeatedly(Invoke([&calls](uint32_t priority, const HostVector& hosts_added, + const HostVector& hosts_removed) -> void { + switch (calls) { + case 0: + // First immediate call, nothing was added/removed. + EXPECT_EQ(0, priority); + EXPECT_EQ(0, hosts_added.size()); + EXPECT_EQ(0, hosts_removed.size()); + break; + case 1: + // Merged call, add/remove/add is coalesced as 1 add. + EXPECT_EQ(0, priority); + EXPECT_EQ(1, hosts_added.size()); + EXPECT_EQ(0, hosts_removed.size()); + break; + } + + ++calls; + })); + + // Remove each host, sequentially. + const Cluster& cluster = cluster_manager_->clusters().begin()->second; + HostVectorSharedPtr hosts( + new HostVector(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())); + HostsPerLocalitySharedPtr hosts_per_locality = std::make_shared(); + HostVector hosts_added{}; + HostVector hosts_removed{}; + + // No timer needs to be mocked at this point, since the first update should be applied + // immediately. + // No added/removed hosts (emulate metadata or health change). + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + + // Now we need the timer to be ready, since createTimer() will be call given that this update + // _will_ be delayed. + Event::MockTimer* timer = new NiceMock(&factory_.dispatcher_); + + // Add the host. + hosts_added.push_back(makeTestHost(cluster.info(), "tcp://127.0.0.1:11003")); + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + + // Remove it. + hosts_added.clear(); + hosts_removed.push_back(makeTestHost(cluster.info(), "tcp://127.0.0.1:11003")); + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + + // Add it again. + hosts_added.push_back(makeTestHost(cluster.info(), "tcp://127.0.0.1:11003")); + hosts_removed.clear(); + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + + // Ensure the coalesced updates were applied. + timer->callback_(); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.coalesced_updates").value()); + + // postThreadLocalClusterUpdate() calls. + EXPECT_EQ(2, calls); +} + +TEST_F(ClusterManagerImplTest, CoalescedUpdatesRemoveAddRemove) { + const std::string yaml = R"EOF( + static_resources: + clusters: + - name: cluster_1 + connect_timeout: 0.250s + type: STATIC + lb_policy: ROUND_ROBIN + hosts: + - socket_address: + address: "127.0.0.1" + port_value: 11001 + - socket_address: + address: "127.0.0.1" + port_value: 11002 + common_lb_config: + update_merge_window: 3s + )EOF"; + + createWithLocalClusterUpdate(parseBootstrapFromV2Yaml(yaml)); + EXPECT_FALSE(cluster_manager_->get("cluster_1")->info()->addedViaApi()); + + // Ensure we see the right set of added/removed hosts on every call. + uint32_t calls = 0; + EXPECT_CALL(local_cluster_update_, post(_, _, _)) + .WillRepeatedly(Invoke([&calls](uint32_t priority, const HostVector& hosts_added, + const HostVector& hosts_removed) -> void { + switch (calls) { + case 0: + // First immediate call, nothing was added/removed. + EXPECT_EQ(0, priority); + EXPECT_EQ(0, hosts_added.size()); + EXPECT_EQ(0, hosts_removed.size()); + break; + case 1: + // Merged call, remove/add/remove is coalesced as 1 remove. + EXPECT_EQ(0, priority); + EXPECT_EQ(0, hosts_added.size()); + EXPECT_EQ(1, hosts_removed.size()); + break; + } + + ++calls; + })); + + // Remove each host, sequentially. + const Cluster& cluster = cluster_manager_->clusters().begin()->second; + HostVectorSharedPtr hosts( + new HostVector(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())); + HostsPerLocalitySharedPtr hosts_per_locality = std::make_shared(); + HostVector hosts_added{}; + HostVector hosts_removed{}; + + // No timer needs to be mocked at this point, since the first update should be applied + // immediately. + // No added/removed hosts (emulate metadata or health change). + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + + // Now we need the timer to be ready, since createTimer() will be call given that this update + // _will_ be delayed. + Event::MockTimer* timer = new NiceMock(&factory_.dispatcher_); + + // Remove a host. + hosts_added.clear(); + hosts_removed.push_back(makeTestHost(cluster.info(), "tcp://127.0.0.1:11002")); + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + + // Add it back. + hosts_added.push_back(makeTestHost(cluster.info(), "tcp://127.0.0.1:11002")); + hosts_removed.clear(); + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + + // Remove it again. + hosts_added.clear(); + hosts_removed.push_back(makeTestHost(cluster.info(), "tcp://127.0.0.1:11002")); + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + + // Ensure the coalesced updates were applied. + timer->callback_(); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.coalesced_updates").value()); + + // postThreadLocalClusterUpdate() calls. + EXPECT_EQ(2, calls); +} + +TEST_F(ClusterManagerImplTest, CoalescedUpdatesHostChangesInPlace) { + const std::string yaml = R"EOF( + static_resources: + clusters: + - name: cluster_1 + connect_timeout: 0.250s + type: STATIC + lb_policy: ROUND_ROBIN + hosts: + - socket_address: + address: "127.0.0.1" + port_value: 11001 + - socket_address: + address: "127.0.0.1" + port_value: 11002 + common_lb_config: + update_merge_window: 3s + )EOF"; + + createWithLocalClusterUpdate(parseBootstrapFromV2Yaml(yaml)); + EXPECT_FALSE(cluster_manager_->get("cluster_1")->info()->addedViaApi()); + + // Ensure we see the right set of added/removed hosts on every call. + uint32_t calls = 0; + EXPECT_CALL(local_cluster_update_, post(_, _, _)) + .WillRepeatedly(Invoke([&calls](uint32_t priority, const HostVector& hosts_added, + const HostVector& hosts_removed) -> void { + switch (calls) { + case 0: + // First immediate call, nothing was added/removed. + EXPECT_EQ(0, priority); + EXPECT_EQ(0, hosts_added.size()); + EXPECT_EQ(0, hosts_removed.size()); + break; + case 1: + // Merged call, failed/active/failed is coalesced as 1 call. + // Note, note, note: the host gets updated in place in the original + // list of hosts in this case, so nothing expected here. + EXPECT_EQ(0, priority); + EXPECT_EQ(0, hosts_added.size()); + EXPECT_EQ(0, hosts_removed.size()); + break; + } + + ++calls; + })); + + // Remove each host, sequentially. + const Cluster& cluster = cluster_manager_->clusters().begin()->second; + HostVectorSharedPtr hosts( + new HostVector(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())); + HostsPerLocalitySharedPtr hosts_per_locality = std::make_shared(); + HostVector hosts_added{}; + HostVector hosts_removed{}; + + // No timer needs to be mocked at this point, since the first update should be applied + // immediately. + // No added/removed hosts (emulate metadata or health change). + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + + // Now we need the timer to be ready, since createTimer() will be call given that this update + // _will_ be delayed. + Event::MockTimer* timer = new NiceMock(&factory_.dispatcher_); + + // Host becomes failed (no need to actually change it here). + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + // Host becomes active. + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + // Host becomes failed again. + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + + // Ensure the coalesced updates were applied. + timer->callback_(); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.coalesced_updates").value()); + + // postThreadLocalClusterUpdate() calls. + EXPECT_EQ(2, calls); +} + +TEST_F(ClusterManagerImplTest, CoalescedUpdatesMetadataLatest) { + const std::string yaml = R"EOF( + static_resources: + clusters: + - name: cluster_1 + connect_timeout: 0.250s + type: STATIC + lb_policy: ROUND_ROBIN + hosts: + - socket_address: + address: "127.0.0.1" + port_value: 11001 + - socket_address: + address: "127.0.0.1" + port_value: 11002 + common_lb_config: + update_merge_window: 3s + )EOF"; + + createWithLocalClusterUpdate(parseBootstrapFromV2Yaml(yaml)); + EXPECT_FALSE(cluster_manager_->get("cluster_1")->info()->addedViaApi()); + + // Ensure we see the right set of added/removed hosts on every call. + uint32_t calls = 0; + EXPECT_CALL(local_cluster_update_, post(_, _, _)) + .WillRepeatedly(Invoke([&calls](uint32_t priority, const HostVector& hosts_added, + const HostVector& hosts_removed) -> void { + switch (calls) { + case 0: + // First immediate call, nothing was added/removed. + EXPECT_EQ(0, priority); + EXPECT_EQ(0, hosts_added.size()); + EXPECT_EQ(0, hosts_removed.size()); + break; + case 1: + // Merged call, add/remove/add is coalesced as 1 add. + // We should see the latest metadata. + EXPECT_EQ(0, priority); + EXPECT_EQ(1, hosts_added.size()); + EXPECT_EQ(0, hosts_removed.size()); + EXPECT_EQ(Config::Metadata::metadataValue(*hosts_added[0]->metadata(), + Config::MetadataFilters::get().ENVOY_LB, + "version") + .string_value(), + std::string("v1")); + break; + } + + ++calls; + })); + + // Remove each host, sequentially. + const Cluster& cluster = cluster_manager_->clusters().begin()->second; + HostVectorSharedPtr hosts( + new HostVector(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())); + HostsPerLocalitySharedPtr hosts_per_locality = std::make_shared(); + HostVector hosts_added{}; + HostVector hosts_removed{}; + + // No timer needs to be mocked at this point, since the first update should be applied + // immediately. + // No added/removed hosts (emulate metadata or health change). + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + + // Now we need the timer to be ready, since createTimer() will be call given that this update + // _will_ be delayed. + Event::MockTimer* timer = new NiceMock(&factory_.dispatcher_); + + // Add the host without metadata. + hosts_added.push_back(makeTestHost(cluster.info(), "tcp://127.0.0.1:11003")); + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + + // Remove it. + hosts_added.clear(); + hosts_removed.push_back(makeTestHost(cluster.info(), "tcp://127.0.0.1:11003")); + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + + // Add it again, with metadata. + auto host = makeTestHost(cluster.info(), "tcp://127.0.0.1:11003"); + host->metadata(buildMetadata("v1")); + hosts_added.push_back(host); + hosts_removed.clear(); + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + + // Ensure the coalesced updates were applied. + timer->callback_(); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.coalesced_updates").value()); + + // postThreadLocalClusterUpdate() calls. + EXPECT_EQ(2, calls); +} + +TEST_F(ClusterManagerImplTest, CoalescedUpdatesWeightLatest) { + const std::string yaml = R"EOF( + static_resources: + clusters: + - name: cluster_1 + connect_timeout: 0.250s + type: STATIC + lb_policy: ROUND_ROBIN + hosts: + - socket_address: + address: "127.0.0.1" + port_value: 11001 + - socket_address: + address: "127.0.0.1" + port_value: 11002 + common_lb_config: + update_merge_window: 3s + )EOF"; + + createWithLocalClusterUpdate(parseBootstrapFromV2Yaml(yaml)); + EXPECT_FALSE(cluster_manager_->get("cluster_1")->info()->addedViaApi()); + + // Ensure we see the right set of added/removed hosts on every call. + uint32_t calls = 0; + EXPECT_CALL(local_cluster_update_, post(_, _, _)) + .WillRepeatedly(Invoke([&calls](uint32_t priority, const HostVector& hosts_added, + const HostVector& hosts_removed) -> void { + switch (calls) { + case 0: + // First immediate call, nothing was added/removed. + EXPECT_EQ(0, priority); + EXPECT_EQ(0, hosts_added.size()); + EXPECT_EQ(0, hosts_removed.size()); + break; + case 1: + // Merged call, add/remove/add is coalesced as 1 add. + // We should the latest weight + EXPECT_EQ(0, priority); + EXPECT_EQ(1, hosts_added.size()); + EXPECT_EQ(0, hosts_removed.size()); + EXPECT_EQ(hosts_added[0]->weight(), 100); + break; + } + + ++calls; + })); + + // Remove each host, sequentially. + const Cluster& cluster = cluster_manager_->clusters().begin()->second; + HostVectorSharedPtr hosts( + new HostVector(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())); + HostsPerLocalitySharedPtr hosts_per_locality = std::make_shared(); + HostVector hosts_added{}; + HostVector hosts_removed{}; + + // No timer needs to be mocked at this point, since the first update should be applied + // immediately. + // No added/removed hosts (emulate metadata or health change). + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + + // Now we need the timer to be ready, since createTimer() will be call given that this update + // _will_ be delayed. + Event::MockTimer* timer = new NiceMock(&factory_.dispatcher_); + + // Add the host with some weight. + auto host = makeTestHost(cluster.info(), "tcp://127.0.0.1:11003"); + host->weight(50); + hosts_added.push_back(host); + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + + // Remove it. + hosts_added.clear(); + hosts_removed.push_back(makeTestHost(cluster.info(), "tcp://127.0.0.1:11003")); + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + + // Add it again, with a new weight. + auto updated_host = makeTestHost(cluster.info(), "tcp://127.0.0.1:11003"); + updated_host->weight(100); + hosts_added.push_back(updated_host); + hosts_removed.clear(); + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + + // Ensure the coalesced updates were applied. + timer->callback_(); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.coalesced_updates").value()); + + // postThreadLocalClusterUpdate() calls. + EXPECT_EQ(2, calls); +} + class ClusterManagerInitHelperTest : public testing::Test { public: MOCK_METHOD1(onClusterInit, void(Cluster& cluster)); From 95e5260b54a6e50a48eadaac75cc86d4a7790211 Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Wed, 25 Jul 2018 17:04:31 -0700 Subject: [PATCH 12/38] Cleanup tests Signed-off-by: Raul Gutierrez Segales --- .../upstream/cluster_manager_impl_test.cc | 183 +++++------------- 1 file changed, 44 insertions(+), 139 deletions(-) diff --git a/test/common/upstream/cluster_manager_impl_test.cc b/test/common/upstream/cluster_manager_impl_test.cc index 87aca05d56fc5..554bce9e2ac03 100644 --- a/test/common/upstream/cluster_manager_impl_test.cc +++ b/test/common/upstream/cluster_manager_impl_test.cc @@ -160,6 +160,12 @@ class TestClusterManagerImpl : public ClusterManagerImpl { MockLocalClusterUpdate& local_cluster_update_; }; +envoy::config::bootstrap::v2::Bootstrap parseBootstrapFromV2Yaml(const std::string& yaml) { + envoy::config::bootstrap::v2::Bootstrap bootstrap; + MessageUtil::loadFromYaml(yaml, bootstrap); + return bootstrap; +} + class ClusterManagerImplTest : public testing::Test { public: void create(const envoy::config::bootstrap::v2::Bootstrap& bootstrap) { @@ -169,7 +175,26 @@ class ClusterManagerImplTest : public testing::Test { monotonic_time_source_)); } - void createWithLocalClusterUpdate(const envoy::config::bootstrap::v2::Bootstrap& bootstrap) { + void createWithLocalClusterUpdate() { + const std::string yaml = R"EOF( + static_resources: + clusters: + - name: cluster_1 + connect_timeout: 0.250s + type: STATIC + lb_policy: ROUND_ROBIN + hosts: + - socket_address: + address: "127.0.0.1" + port_value: 11001 + - socket_address: + address: "127.0.0.1" + port_value: 11002 + common_lb_config: + update_merge_window: 3s + )EOF"; + const auto& bootstrap = parseBootstrapFromV2Yaml(yaml); + cluster_manager_.reset(new TestClusterManagerImpl( bootstrap, factory_, factory_.stats_, factory_.tls_, factory_.runtime_, factory_.random_, factory_.local_info_, log_manager_, factory_.dispatcher_, admin_, system_time_source_, @@ -225,12 +250,6 @@ envoy::config::bootstrap::v2::Bootstrap parseBootstrapFromJson(const std::string return bootstrap; } -envoy::config::bootstrap::v2::Bootstrap parseBootstrapFromV2Yaml(const std::string& yaml) { - envoy::config::bootstrap::v2::Bootstrap bootstrap; - MessageUtil::loadFromYaml(yaml, bootstrap); - return bootstrap; -} - TEST_F(ClusterManagerImplTest, MultipleProtocolClusterFail) { const std::string yaml = R"EOF( static_resources: @@ -1675,26 +1694,7 @@ TEST_F(ClusterManagerImplTest, OriginalDstInitialization) { } TEST_F(ClusterManagerImplTest, CoalescedUpdates) { - const std::string yaml = R"EOF( - static_resources: - clusters: - - name: cluster_1 - connect_timeout: 0.250s - type: STATIC - lb_policy: ROUND_ROBIN - hosts: - - socket_address: - address: "127.0.0.1" - port_value: 11001 - - socket_address: - address: "127.0.0.1" - port_value: 11002 - common_lb_config: - update_merge_window: 3s - )EOF"; - - createWithLocalClusterUpdate(parseBootstrapFromV2Yaml(yaml)); - EXPECT_FALSE(cluster_manager_->get("cluster_1")->info()->addedViaApi()); + createWithLocalClusterUpdate(); // Ensure we see the right set of added/removed hosts on every call. EXPECT_CALL(local_cluster_update_, post(_, _, _)) @@ -1723,25 +1723,26 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdates) { ++call; })); - // Remove each host, sequentially. const Cluster& cluster = cluster_manager_->clusters().begin()->second; HostVectorSharedPtr hosts( new HostVector(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())); HostsPerLocalitySharedPtr hosts_per_locality = std::make_shared(); HostVector hosts_added{}; - HostVector hosts_removed_0{(*hosts)[0]}; - HostVector hosts_removed_1{(*hosts)[1]}; + HostVector hosts_removed{}; // No timer needs to be mocked at this point, since the first update should be applied // immediately. + hosts_removed.push_back((*hosts)[0]); cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( - hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed_0); + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); // Now we need the timer to be ready, since createTimer() will be call given that this update // _will_ be delayed. Event::MockTimer* timer = new NiceMock(&factory_.dispatcher_); + hosts_removed.clear(); + hosts_removed.push_back((*hosts)[1]); cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( - hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed_1); + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); // Ensure the coalesced updates were applied. timer->callback_(); @@ -1751,10 +1752,14 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdates) { timer = new NiceMock(&factory_.dispatcher_); // Add them back. + hosts_removed.clear(); + hosts_added.push_back((*hosts)[0]); cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( - hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_removed_0, hosts_added); + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + hosts_added.clear(); + hosts_added.push_back((*hosts)[1]); cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( - hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_removed_1, hosts_added); + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); // Ensure the coalesced updates were applied again. timer->callback_(); @@ -1762,26 +1767,7 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdates) { } TEST_F(ClusterManagerImplTest, CoalescedUpdatesAddRemoveAdd) { - const std::string yaml = R"EOF( - static_resources: - clusters: - - name: cluster_1 - connect_timeout: 0.250s - type: STATIC - lb_policy: ROUND_ROBIN - hosts: - - socket_address: - address: "127.0.0.1" - port_value: 11001 - - socket_address: - address: "127.0.0.1" - port_value: 11002 - common_lb_config: - update_merge_window: 3s - )EOF"; - - createWithLocalClusterUpdate(parseBootstrapFromV2Yaml(yaml)); - EXPECT_FALSE(cluster_manager_->get("cluster_1")->info()->addedViaApi()); + createWithLocalClusterUpdate(); // Ensure we see the right set of added/removed hosts on every call. uint32_t calls = 0; @@ -1806,7 +1792,6 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdatesAddRemoveAdd) { ++calls; })); - // Remove each host, sequentially. const Cluster& cluster = cluster_manager_->clusters().begin()->second; HostVectorSharedPtr hosts( new HostVector(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())); @@ -1850,26 +1835,7 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdatesAddRemoveAdd) { } TEST_F(ClusterManagerImplTest, CoalescedUpdatesRemoveAddRemove) { - const std::string yaml = R"EOF( - static_resources: - clusters: - - name: cluster_1 - connect_timeout: 0.250s - type: STATIC - lb_policy: ROUND_ROBIN - hosts: - - socket_address: - address: "127.0.0.1" - port_value: 11001 - - socket_address: - address: "127.0.0.1" - port_value: 11002 - common_lb_config: - update_merge_window: 3s - )EOF"; - - createWithLocalClusterUpdate(parseBootstrapFromV2Yaml(yaml)); - EXPECT_FALSE(cluster_manager_->get("cluster_1")->info()->addedViaApi()); + createWithLocalClusterUpdate(); // Ensure we see the right set of added/removed hosts on every call. uint32_t calls = 0; @@ -1894,7 +1860,6 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdatesRemoveAddRemove) { ++calls; })); - // Remove each host, sequentially. const Cluster& cluster = cluster_manager_->clusters().begin()->second; HostVectorSharedPtr hosts( new HostVector(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())); @@ -1939,26 +1904,7 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdatesRemoveAddRemove) { } TEST_F(ClusterManagerImplTest, CoalescedUpdatesHostChangesInPlace) { - const std::string yaml = R"EOF( - static_resources: - clusters: - - name: cluster_1 - connect_timeout: 0.250s - type: STATIC - lb_policy: ROUND_ROBIN - hosts: - - socket_address: - address: "127.0.0.1" - port_value: 11001 - - socket_address: - address: "127.0.0.1" - port_value: 11002 - common_lb_config: - update_merge_window: 3s - )EOF"; - - createWithLocalClusterUpdate(parseBootstrapFromV2Yaml(yaml)); - EXPECT_FALSE(cluster_manager_->get("cluster_1")->info()->addedViaApi()); + createWithLocalClusterUpdate(); // Ensure we see the right set of added/removed hosts on every call. uint32_t calls = 0; @@ -1985,7 +1931,6 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdatesHostChangesInPlace) { ++calls; })); - // Remove each host, sequentially. const Cluster& cluster = cluster_manager_->clusters().begin()->second; HostVectorSharedPtr hosts( new HostVector(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())); @@ -2022,26 +1967,7 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdatesHostChangesInPlace) { } TEST_F(ClusterManagerImplTest, CoalescedUpdatesMetadataLatest) { - const std::string yaml = R"EOF( - static_resources: - clusters: - - name: cluster_1 - connect_timeout: 0.250s - type: STATIC - lb_policy: ROUND_ROBIN - hosts: - - socket_address: - address: "127.0.0.1" - port_value: 11001 - - socket_address: - address: "127.0.0.1" - port_value: 11002 - common_lb_config: - update_merge_window: 3s - )EOF"; - - createWithLocalClusterUpdate(parseBootstrapFromV2Yaml(yaml)); - EXPECT_FALSE(cluster_manager_->get("cluster_1")->info()->addedViaApi()); + createWithLocalClusterUpdate(); // Ensure we see the right set of added/removed hosts on every call. uint32_t calls = 0; @@ -2072,7 +1998,6 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdatesMetadataLatest) { ++calls; })); - // Remove each host, sequentially. const Cluster& cluster = cluster_manager_->clusters().begin()->second; HostVectorSharedPtr hosts( new HostVector(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())); @@ -2118,26 +2043,7 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdatesMetadataLatest) { } TEST_F(ClusterManagerImplTest, CoalescedUpdatesWeightLatest) { - const std::string yaml = R"EOF( - static_resources: - clusters: - - name: cluster_1 - connect_timeout: 0.250s - type: STATIC - lb_policy: ROUND_ROBIN - hosts: - - socket_address: - address: "127.0.0.1" - port_value: 11001 - - socket_address: - address: "127.0.0.1" - port_value: 11002 - common_lb_config: - update_merge_window: 3s - )EOF"; - - createWithLocalClusterUpdate(parseBootstrapFromV2Yaml(yaml)); - EXPECT_FALSE(cluster_manager_->get("cluster_1")->info()->addedViaApi()); + createWithLocalClusterUpdate(); // Ensure we see the right set of added/removed hosts on every call. uint32_t calls = 0; @@ -2164,7 +2070,6 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdatesWeightLatest) { ++calls; })); - // Remove each host, sequentially. const Cluster& cluster = cluster_manager_->clusters().begin()->second; HostVectorSharedPtr hosts( new HostVector(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())); From 0f70fb3028afde70de1e7756b50b1b5f13c5d955 Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Thu, 26 Jul 2018 13:42:25 -0700 Subject: [PATCH 13/38] Improve naming I mixed coalescing and merging between docs, comments and stats name; for consistency lets use merge everywhere. Signed-off-by: Raul Gutierrez Segales --- api/envoy/api/v2/cds.proto | 2 +- docs/root/intro/version_history.rst | 2 +- .../common/upstream/cluster_manager_impl.cc | 6 +-- source/common/upstream/cluster_manager_impl.h | 2 +- .../upstream/cluster_manager_impl_test.cc | 50 +++++++++---------- 5 files changed, 31 insertions(+), 31 deletions(-) diff --git a/api/envoy/api/v2/cds.proto b/api/envoy/api/v2/cds.proto index 5aa4220ee08a4..d0fb571956237 100644 --- a/api/envoy/api/v2/cds.proto +++ b/api/envoy/api/v2/cds.proto @@ -432,7 +432,7 @@ message Cluster { ZoneAwareLbConfig zone_aware_lb_config = 2; LocalityWeightedLbConfig locality_weighted_lb_config = 3; } - // If set, membership and healthcheck updates that happen within this duration will be coalesced + // If set, membership and healthcheck updates that happen within this duration will be merged // and delivered in one shot when the duration expires. The start of the duration is when the // first update happens. This is useful for big clusters, with potentially noisy deploys that // might trigger excessive CPU usage due to a constant stream of healthcheck state changes or diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst index ab735c51b9826..2c6208fa4c33d 100644 --- a/docs/root/intro/version_history.rst +++ b/docs/root/intro/version_history.rst @@ -48,7 +48,7 @@ Version history * upstream: added configuration option to the subset load balancer to take locality weights into account when selecting a host from a subset. * access log: added RESPONSE_DURATION and RESPONSE_TX_DURATION. -* cluster: added :ref:`option ` to coalesce updates +* cluster: added :ref:`option ` to merge updates within the given duration. 1.7.0 diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index bf5ed9c098efd..df9503484f72c 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -331,13 +331,13 @@ void ClusterManagerImpl::onClusterInit(Cluster& cluster) { // This fires when a cluster is about to have an updated member set. We need to send this // out to all of the thread local configurations. - // Should we coalesce updates? + // Should we save this update and merge it with other updates? bool scheduled = false; if (cluster.info()->lbConfig().has_update_merge_window()) { scheduled = scheduleUpdate(cluster, priority, hosts_added, hosts_removed); } - // If an update was not scheduled, deliver it immediately. + // If an update was not scheduled for later, deliver it immediately. if (!scheduled) { postThreadLocalClusterUpdate(cluster, priority, hosts_added, hosts_removed); } @@ -453,7 +453,7 @@ void ClusterManagerImpl::applyUpdates(const Cluster& cluster, uint32_t priority, const HostVector& hosts_added = fromMap(updates->added); const HostVector& hosts_removed = fromMap(updates->removed); postThreadLocalClusterUpdate(cluster, priority, hosts_added, hosts_removed); - cm_stats_.coalesced_updates_.inc(); + cm_stats_.merged_updates_.inc(); // Reset everything. updates->timer = nullptr; diff --git a/source/common/upstream/cluster_manager_impl.h b/source/common/upstream/cluster_manager_impl.h index d921c359d8543..5fb9227b43622 100644 --- a/source/common/upstream/cluster_manager_impl.h +++ b/source/common/upstream/cluster_manager_impl.h @@ -138,7 +138,7 @@ class ClusterManagerInitHelper : Logger::Loggable { COUNTER(cluster_added) \ COUNTER(cluster_modified) \ COUNTER(cluster_removed) \ - COUNTER(coalesced_updates) \ + COUNTER(merged_updates) \ GAUGE (active_clusters) \ GAUGE (warming_clusters) // clang-format on diff --git a/test/common/upstream/cluster_manager_impl_test.cc b/test/common/upstream/cluster_manager_impl_test.cc index 554bce9e2ac03..ab50a56aee518 100644 --- a/test/common/upstream/cluster_manager_impl_test.cc +++ b/test/common/upstream/cluster_manager_impl_test.cc @@ -1693,7 +1693,7 @@ TEST_F(ClusterManagerImplTest, OriginalDstInitialization) { factory_.tls_.shutdownThread(); } -TEST_F(ClusterManagerImplTest, CoalescedUpdates) { +TEST_F(ClusterManagerImplTest, MergedUpdates) { createWithLocalClusterUpdate(); // Ensure we see the right set of added/removed hosts on every call. @@ -1744,9 +1744,9 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdates) { cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - // Ensure the coalesced updates were applied. + // Ensure the merged updates were applied. timer->callback_(); - EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.coalesced_updates").value()); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates").value()); // Prepare a new timer for the next round of updates. timer = new NiceMock(&factory_.dispatcher_); @@ -1761,12 +1761,12 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdates) { cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - // Ensure the coalesced updates were applied again. + // Ensure the merged updates were applied again. timer->callback_(); - EXPECT_EQ(2, factory_.stats_.counter("cluster_manager.coalesced_updates").value()); + EXPECT_EQ(2, factory_.stats_.counter("cluster_manager.merged_updates").value()); } -TEST_F(ClusterManagerImplTest, CoalescedUpdatesAddRemoveAdd) { +TEST_F(ClusterManagerImplTest, MergedUpdatesAddRemoveAdd) { createWithLocalClusterUpdate(); // Ensure we see the right set of added/removed hosts on every call. @@ -1782,7 +1782,7 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdatesAddRemoveAdd) { EXPECT_EQ(0, hosts_removed.size()); break; case 1: - // Merged call, add/remove/add is coalesced as 1 add. + // Merged call, add/remove/add is merged as 1 add. EXPECT_EQ(0, priority); EXPECT_EQ(1, hosts_added.size()); EXPECT_EQ(0, hosts_removed.size()); @@ -1826,15 +1826,15 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdatesAddRemoveAdd) { cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - // Ensure the coalesced updates were applied. + // Ensure the merged updates were applied. timer->callback_(); - EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.coalesced_updates").value()); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates").value()); // postThreadLocalClusterUpdate() calls. EXPECT_EQ(2, calls); } -TEST_F(ClusterManagerImplTest, CoalescedUpdatesRemoveAddRemove) { +TEST_F(ClusterManagerImplTest, MergedUpdatesRemoveAddRemove) { createWithLocalClusterUpdate(); // Ensure we see the right set of added/removed hosts on every call. @@ -1850,7 +1850,7 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdatesRemoveAddRemove) { EXPECT_EQ(0, hosts_removed.size()); break; case 1: - // Merged call, remove/add/remove is coalesced as 1 remove. + // Merged call, remove/add/remove is merged as 1 remove. EXPECT_EQ(0, priority); EXPECT_EQ(0, hosts_added.size()); EXPECT_EQ(1, hosts_removed.size()); @@ -1895,15 +1895,15 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdatesRemoveAddRemove) { cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - // Ensure the coalesced updates were applied. + // Ensure the merged updates were applied. timer->callback_(); - EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.coalesced_updates").value()); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates").value()); // postThreadLocalClusterUpdate() calls. EXPECT_EQ(2, calls); } -TEST_F(ClusterManagerImplTest, CoalescedUpdatesHostChangesInPlace) { +TEST_F(ClusterManagerImplTest, MergedUpdatesHostChangesInPlace) { createWithLocalClusterUpdate(); // Ensure we see the right set of added/removed hosts on every call. @@ -1919,7 +1919,7 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdatesHostChangesInPlace) { EXPECT_EQ(0, hosts_removed.size()); break; case 1: - // Merged call, failed/active/failed is coalesced as 1 call. + // Merged call, failed/active/failed is merged as 1 call. // Note, note, note: the host gets updated in place in the original // list of hosts in this case, so nothing expected here. EXPECT_EQ(0, priority); @@ -1958,15 +1958,15 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdatesHostChangesInPlace) { cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - // Ensure the coalesced updates were applied. + // Ensure the merged updates were applied. timer->callback_(); - EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.coalesced_updates").value()); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates").value()); // postThreadLocalClusterUpdate() calls. EXPECT_EQ(2, calls); } -TEST_F(ClusterManagerImplTest, CoalescedUpdatesMetadataLatest) { +TEST_F(ClusterManagerImplTest, MergedUpdatesMetadataLatest) { createWithLocalClusterUpdate(); // Ensure we see the right set of added/removed hosts on every call. @@ -1982,7 +1982,7 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdatesMetadataLatest) { EXPECT_EQ(0, hosts_removed.size()); break; case 1: - // Merged call, add/remove/add is coalesced as 1 add. + // Merged call, add/remove/add is merged as 1 add. // We should see the latest metadata. EXPECT_EQ(0, priority); EXPECT_EQ(1, hosts_added.size()); @@ -2034,15 +2034,15 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdatesMetadataLatest) { cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - // Ensure the coalesced updates were applied. + // Ensure the merged updates were applied. timer->callback_(); - EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.coalesced_updates").value()); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates").value()); // postThreadLocalClusterUpdate() calls. EXPECT_EQ(2, calls); } -TEST_F(ClusterManagerImplTest, CoalescedUpdatesWeightLatest) { +TEST_F(ClusterManagerImplTest, MergedUpdatesWeightLatest) { createWithLocalClusterUpdate(); // Ensure we see the right set of added/removed hosts on every call. @@ -2058,7 +2058,7 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdatesWeightLatest) { EXPECT_EQ(0, hosts_removed.size()); break; case 1: - // Merged call, add/remove/add is coalesced as 1 add. + // Merged call, add/remove/add is merged as 1 add. // We should the latest weight EXPECT_EQ(0, priority); EXPECT_EQ(1, hosts_added.size()); @@ -2108,9 +2108,9 @@ TEST_F(ClusterManagerImplTest, CoalescedUpdatesWeightLatest) { cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - // Ensure the coalesced updates were applied. + // Ensure the merged updates were applied. timer->callback_(); - EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.coalesced_updates").value()); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates").value()); // postThreadLocalClusterUpdate() calls. EXPECT_EQ(2, calls); From b392c7b170d040b28be3598357d5982fa1fc4246 Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Thu, 26 Jul 2018 14:54:00 -0700 Subject: [PATCH 14/38] Erase cluster from updates_map_ from removeCluster() Without this, we'd fail to unschedule pending updates for a cluster that was removed and we'd leak the updates_map_ entry for the cluster. Signed-off-by: Raul Gutierrez Segales --- source/common/upstream/cluster_manager_impl.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index df9503484f72c..3fd1fb6e7ff69 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -595,6 +595,11 @@ bool ClusterManagerImpl::removeCluster(const std::string& cluster_name) { if (removed) { cm_stats_.cluster_removed_.inc(); updateGauges(); + // Did we ever deliver merged updates for this cluster? + if (updates_map_.count(cluster_name) == 1) { + // No need to manually disable timers, this should take care of it. + updates_map_.erase(cluster_name); + } } return removed; From a3999494fab0449ee95837b515becc3f4de57bbe Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Fri, 27 Jul 2018 14:41:23 -0700 Subject: [PATCH 15/38] Drop merging adds/removals So we are only merging updates with no hosts added/removed; those that signal HC/weight/metadata changes. This avoids leaking HostSharedPtrs and makes it easier to reason around this. It also addresses the main pain point we are experiencing: a flood of healthcheck changes during deploys of a big clusters. Merging updates related to adds/removes can be addressed later on when we can come up with a sound approach. Signed-off-by: Raul Gutierrez Segales --- api/envoy/api/v2/cds.proto | 16 +- docs/root/intro/version_history.rst | 4 +- .../common/upstream/cluster_manager_impl.cc | 122 ++--- source/common/upstream/cluster_manager_impl.h | 27 +- .../upstream/cluster_manager_impl_test.cc | 426 +++--------------- 5 files changed, 128 insertions(+), 467 deletions(-) diff --git a/api/envoy/api/v2/cds.proto b/api/envoy/api/v2/cds.proto index d0fb571956237..18f1cc47e982c 100644 --- a/api/envoy/api/v2/cds.proto +++ b/api/envoy/api/v2/cds.proto @@ -432,12 +432,16 @@ message Cluster { ZoneAwareLbConfig zone_aware_lb_config = 2; LocalityWeightedLbConfig locality_weighted_lb_config = 3; } - // If set, membership and healthcheck updates that happen within this duration will be merged - // and delivered in one shot when the duration expires. The start of the duration is when the - // first update happens. This is useful for big clusters, with potentially noisy deploys that - // might trigger excessive CPU usage due to a constant stream of healthcheck state changes or - // membership updates. By default, this is not configured and updates apply immediately. - // Also, the first set of updates to be seen apply immediately as well (e.g.: a new cluster). + // If set, all healthcheck/weight/metadata updates that happen within this duration will be + // merged and delivered in one shot when the duration expires. The start of the duration is when + // the first update happens. This is useful for big clusters, with potentially noisy deploys + // that might trigger excessive CPU usage due to a constant stream of healthcheck state changes + // or metadata updates. By default, this is not configured and updates apply immediately. Also, + // the first set of updates to be seen apply immediately as well (e.g.: a new cluster). + // + // Note: merging does not apply to cluser membership changes (e.g.: adds/removes); this is + // because merging those updates isn't currently safe. See + // https://github.com/envoyproxy/envoy/pull/3941. google.protobuf.Duration update_merge_window = 4; } diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst index 2c6208fa4c33d..f29daf0aa1ed7 100644 --- a/docs/root/intro/version_history.rst +++ b/docs/root/intro/version_history.rst @@ -48,8 +48,8 @@ Version history * upstream: added configuration option to the subset load balancer to take locality weights into account when selecting a host from a subset. * access log: added RESPONSE_DURATION and RESPONSE_TX_DURATION. -* cluster: added :ref:`option ` to merge updates - within the given duration. +* cluster: added :ref:`option ` to merge healthcheck/weight/metadata + updates within the given duration. 1.7.0 =============== diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index 3fd1fb6e7ff69..72d470f58ee29 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -332,9 +332,28 @@ void ClusterManagerImpl::onClusterInit(Cluster& cluster) { // out to all of the thread local configurations. // Should we save this update and merge it with other updates? + // + // Note that we can only _safely_ merge updates that have no added/removed hosts. That is, + // only those updates that signal a change in host healthcheck state, weight or metadata. + // + // We've discussed merging updates related to hosts being added/removed, but it's really + // tricky to merge those given that downstream consumers of these updates expect to see the + // full list of updates, not a condensed one. This is because they use the broadcasted + // HostSharedPtrs within internal maps to track hosts. If we fail to broadcast the entire list + // of removals, these maps will leak those HostSharedPtrs. + // + // See https://github.com/envoyproxy/envoy/pull/3941 for more context. + bool scheduled = false; - if (cluster.info()->lbConfig().has_update_merge_window()) { - scheduled = scheduleUpdate(cluster, priority, hosts_added, hosts_removed); + bool merging_enabled = cluster.info()->lbConfig().has_update_merge_window(); + + if (merging_enabled) { + // Remember: we only merge updates with no adds/removes — just hc/weight/metadata changes. + bool is_mergeable = !hosts_added.size() && !hosts_removed.size(); + + // If this is not mergeable, we should cancel any scheduled updates since + // we'll deliver it immediately. + scheduled = scheduleUpdate(cluster, priority, is_mergeable); } // If an update was not scheduled for later, deliver it immediately. @@ -353,9 +372,7 @@ void ClusterManagerImpl::onClusterInit(Cluster& cluster) { } } -bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priority, - const HostVector& hosts_added, - const HostVector& hosts_removed) { +bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priority, bool mergeable) { const auto& update_merge_window = cluster.info()->lbConfig().update_merge_window(); const auto timeout = DurationUtil::durationToMilliseconds(update_merge_window); PendingUpdatesByPriorityMapPtr updates_by_prio; @@ -380,68 +397,29 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit } // Has an update_merge_window gone by since the last update? If so, don't schedule - // the update so it can be applied immediately as long as there's no active timer - // (to avoid changing the order of updates). - const auto delta = std::chrono::steady_clock::now() - updates->last_updated; + // the update so it can be applied immediately. Ditto if this is not a mergeable update. + const auto delta = std::chrono::steady_clock::now() - updates->last_updated_; const uint64_t delta_ms = std::chrono::duration_cast(delta).count(); - if (delta_ms > timeout && updates->timer == nullptr) { - updates->last_updated = std::chrono::steady_clock::now(); - return false; - } - - // Record the updates that should be applied when the timer fires. - // Handle the case when a pair of add/remove cancels out. - // - // A few notes: - // * if a host's health changed, we won't see that as an add or remove - // so there's no need to check for health changes here. The fact that - // we got called with no added/removed hosts implies a health or metadata - // change, so we just forward the empty vectors along. - // * if while we are merging updates we see a host multiple times (e.g.: - // add -> remove -> add), we deal with the cancellations and we always - // propagate the latest version of that host. - // * we track hosts by their address since the Host ptr will most likely change - // as it gets materialized out of an EDS change, and an address is considered - // the unique constant identifier for a host. - for (const auto& host : hosts_added) { - const auto& addr = host->address()->asString(); - // Is adding this host cancelling a previous remove? - if (updates->removed.count(addr) == 1) { - updates->removed.erase(addr); - - // If it was previously added, store the latest one. - if (updates->added.count(addr) == 1) { - updates->added[addr] = host; - } - } else { - // No previous remove, just store. - updates->added[addr] = host; + if (delta_ms > timeout || !mergeable) { + // If there was a pending update, we count this update as merged update. + if (updates->disableTimer()) { + cm_stats_.merged_updates_.inc(); } - } - - // Same dance as above. - for (const auto& host : hosts_removed) { - const auto& addr = host->address()->asString(); - // Is removing this host cancelling a previous add? - if (updates->added.count(addr) == 1) { - updates->added.erase(addr); - // If it was previously removed, store the latest one. - if (updates->removed.count(addr) == 1) { - updates->removed[addr] = host; - } - } else { - // No previous add, just store. - updates->removed[addr] = host; - } + updates->last_updated_ = std::chrono::steady_clock::now(); + return false; } // If there's no timer, create one. - if (updates->timer == nullptr) { - updates->timer = dispatcher_.createTimer([this, &cluster, priority, &updates]() -> void { + if (updates->timer_ == nullptr) { + updates->timer_ = dispatcher_.createTimer([this, &cluster, priority, updates]() -> void { applyUpdates(cluster, priority, updates); }); - updates->timer->enableTimer(std::chrono::milliseconds(timeout)); + } + + // Ensure there's a timer set to deliver these updates. + if (!updates->timer_enabled_) { + updates->enableTimer(timeout); } return true; @@ -450,24 +428,18 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit void ClusterManagerImpl::applyUpdates(const Cluster& cluster, uint32_t priority, PendingUpdatesPtr updates) { // Deliver pending updates. - const HostVector& hosts_added = fromMap(updates->added); - const HostVector& hosts_removed = fromMap(updates->removed); - postThreadLocalClusterUpdate(cluster, priority, hosts_added, hosts_removed); - cm_stats_.merged_updates_.inc(); - // Reset everything. - updates->timer = nullptr; - updates->added.clear(); - updates->removed.clear(); - updates->last_updated = std::chrono::steady_clock::now(); -} + // Remember that these merged updates are _only_ for updates related to + // HC/weight/metadata changes. That's why added/removed are empty. All + // adds/removals were already immediately broadcasted. + const HostVector hosts_added; + const HostVector hosts_removed; -HostVector ClusterManagerImpl::fromMap(HostMap map) const { - HostVector hosts; - for (const auto& host_pair : map) { - hosts.push_back(host_pair.second); - } - return hosts; + postThreadLocalClusterUpdate(cluster, priority, hosts_added, hosts_removed); + + cm_stats_.merged_updates_.inc(); + updates->timer_enabled_ = false; + updates->last_updated_ = std::chrono::steady_clock::now(); } bool ClusterManagerImpl::addOrUpdateCluster(const envoy::api::v2::Cluster& cluster, diff --git a/source/common/upstream/cluster_manager_impl.h b/source/common/upstream/cluster_manager_impl.h index 5fb9227b43622..84521e785ea8d 100644 --- a/source/common/upstream/cluster_manager_impl.h +++ b/source/common/upstream/cluster_manager_impl.h @@ -367,13 +367,24 @@ class ClusterManagerImpl : public ClusterManager, Logger::Loggable ClusterMap; - // Track hosts by their address (as a string). - using HostMap = std::map; struct PendingUpdates { - Event::TimerPtr timer; - HostMap added; - HostMap removed; - MonotonicTime last_updated; + Event::TimerPtr timer_; + bool timer_enabled_; + MonotonicTime last_updated_; + void enableTimer(const uint64_t timeout) { + if (timer_ != nullptr) { + timer_->enableTimer(std::chrono::milliseconds(timeout)); + timer_enabled_ = true; + } + } + bool disableTimer() { + bool was_enabled = timer_enabled_; + if (timer_ != nullptr) { + timer_->disableTimer(); + timer_enabled_ = false; + } + return was_enabled; + } }; using PendingUpdatesPtr = std::shared_ptr; using PendingUpdatesByPriorityMap = std::unordered_map; @@ -381,9 +392,7 @@ class ClusterManagerImpl : public ClusterManager, Logger::Loggable; void applyUpdates(const Cluster& cluster, uint32_t priority, PendingUpdatesPtr updates); - bool scheduleUpdate(const Cluster& cluster, uint32_t priority, const HostVector& hosts_added, - const HostVector& hosts_removed); - HostVector fromMap(HostMap map) const; + bool scheduleUpdate(const Cluster& cluster, uint32_t priority, bool mergeable); void createOrUpdateThreadLocalCluster(ClusterData& cluster); ProtobufTypes::MessagePtr dumpClusterConfigs(); static ClusterManagerStats generateStats(Stats::Scope& scope); diff --git a/test/common/upstream/cluster_manager_impl_test.cc b/test/common/upstream/cluster_manager_impl_test.cc index ab50a56aee518..29287a44a206f 100644 --- a/test/common/upstream/cluster_manager_impl_test.cc +++ b/test/common/upstream/cluster_manager_impl_test.cc @@ -1693,427 +1693,103 @@ TEST_F(ClusterManagerImplTest, OriginalDstInitialization) { factory_.tls_.shutdownThread(); } +// Tests that all the HC/weight/metadata changes are delivered in one go, as long as +// there's no hosts changes in between. +// Also tests that if hosts are added/removed between mergeable updates, delivery will +// happen and the scheduled update will be canceled. TEST_F(ClusterManagerImplTest, MergedUpdates) { createWithLocalClusterUpdate(); // Ensure we see the right set of added/removed hosts on every call. EXPECT_CALL(local_cluster_update_, post(_, _, _)) - .WillRepeatedly(Invoke([](uint32_t priority, const HostVector& hosts_added, - const HostVector& hosts_removed) -> void { - static int call = 0; - - switch (call) { - case 0: - EXPECT_EQ(0, priority); - EXPECT_EQ(0, hosts_added.size()); - EXPECT_EQ(1, hosts_removed.size()); - break; - case 1: - EXPECT_EQ(0, priority); - EXPECT_EQ(0, hosts_added.size()); - EXPECT_EQ(1, hosts_removed.size()); - break; - case 2: - EXPECT_EQ(0, priority); - EXPECT_EQ(2, hosts_added.size()); - EXPECT_EQ(0, hosts_removed.size()); - break; - } - - ++call; + .WillOnce(Invoke([](uint32_t priority, const HostVector& hosts_added, + const HostVector& hosts_removed) -> void { + // 1st removal. + EXPECT_EQ(0, priority); + EXPECT_EQ(0, hosts_added.size()); + EXPECT_EQ(1, hosts_removed.size()); + })) + .WillOnce(Invoke([](uint32_t priority, const HostVector& hosts_added, + const HostVector& hosts_removed) -> void { + // Triggerd by the 2 HC updates, it's a merged update so no added/removed + // hosts. + EXPECT_EQ(0, priority); + EXPECT_EQ(0, hosts_added.size()); + EXPECT_EQ(0, hosts_removed.size()); + })) + .WillOnce(Invoke([](uint32_t priority, const HostVector& hosts_added, + const HostVector& hosts_removed) -> void { + // 1st removed host added back. + EXPECT_EQ(0, priority); + EXPECT_EQ(1, hosts_added.size()); + EXPECT_EQ(0, hosts_removed.size()); + })) + .WillOnce(Invoke([](uint32_t priority, const HostVector& hosts_added, + const HostVector& hosts_removed) -> void { + // 1st removed host removed again, plus the 3 HC/weight/metadata updates that were + // waiting for delivery. + EXPECT_EQ(0, priority); + EXPECT_EQ(0, hosts_added.size()); + EXPECT_EQ(1, hosts_removed.size()); })); + Event::MockTimer* timer = new NiceMock(&factory_.dispatcher_); const Cluster& cluster = cluster_manager_->clusters().begin()->second; HostVectorSharedPtr hosts( new HostVector(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())); HostsPerLocalitySharedPtr hosts_per_locality = std::make_shared(); - HostVector hosts_added{}; - HostVector hosts_removed{}; + HostVector hosts_added; + HostVector hosts_removed; - // No timer needs to be mocked at this point, since the first update should be applied - // immediately. + // The first update should be applied immediately, since it's not mergeable. hosts_removed.push_back((*hosts)[0]); cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.merged_updates").value()); - // Now we need the timer to be ready, since createTimer() will be call given that this update - // _will_ be delayed. - Event::MockTimer* timer = new NiceMock(&factory_.dispatcher_); - hosts_removed.clear(); - hosts_removed.push_back((*hosts)[1]); - cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( - hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - - // Ensure the merged updates were applied. - timer->callback_(); - EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates").value()); - - // Prepare a new timer for the next round of updates. - timer = new NiceMock(&factory_.dispatcher_); - - // Add them back. + // This calls should be merged, since there are not added/removed hosts. hosts_removed.clear(); - hosts_added.push_back((*hosts)[0]); - cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( - hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - hosts_added.clear(); - hosts_added.push_back((*hosts)[1]); - cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( - hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - - // Ensure the merged updates were applied again. - timer->callback_(); - EXPECT_EQ(2, factory_.stats_.counter("cluster_manager.merged_updates").value()); -} - -TEST_F(ClusterManagerImplTest, MergedUpdatesAddRemoveAdd) { - createWithLocalClusterUpdate(); - - // Ensure we see the right set of added/removed hosts on every call. - uint32_t calls = 0; - EXPECT_CALL(local_cluster_update_, post(_, _, _)) - .WillRepeatedly(Invoke([&calls](uint32_t priority, const HostVector& hosts_added, - const HostVector& hosts_removed) -> void { - switch (calls) { - case 0: - // First immediate call, nothing was added/removed. - EXPECT_EQ(0, priority); - EXPECT_EQ(0, hosts_added.size()); - EXPECT_EQ(0, hosts_removed.size()); - break; - case 1: - // Merged call, add/remove/add is merged as 1 add. - EXPECT_EQ(0, priority); - EXPECT_EQ(1, hosts_added.size()); - EXPECT_EQ(0, hosts_removed.size()); - break; - } - - ++calls; - })); - - const Cluster& cluster = cluster_manager_->clusters().begin()->second; - HostVectorSharedPtr hosts( - new HostVector(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())); - HostsPerLocalitySharedPtr hosts_per_locality = std::make_shared(); - HostVector hosts_added{}; - HostVector hosts_removed{}; - - // No timer needs to be mocked at this point, since the first update should be applied - // immediately. - // No added/removed hosts (emulate metadata or health change). - cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( - hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - - // Now we need the timer to be ready, since createTimer() will be call given that this update - // _will_ be delayed. - Event::MockTimer* timer = new NiceMock(&factory_.dispatcher_); - - // Add the host. - hosts_added.push_back(makeTestHost(cluster.info(), "tcp://127.0.0.1:11003")); cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - - // Remove it. - hosts_added.clear(); - hosts_removed.push_back(makeTestHost(cluster.info(), "tcp://127.0.0.1:11003")); - cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( - hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - - // Add it again. - hosts_added.push_back(makeTestHost(cluster.info(), "tcp://127.0.0.1:11003")); - hosts_removed.clear(); cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.merged_updates").value()); // Ensure the merged updates were applied. timer->callback_(); EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates").value()); - // postThreadLocalClusterUpdate() calls. - EXPECT_EQ(2, calls); -} - -TEST_F(ClusterManagerImplTest, MergedUpdatesRemoveAddRemove) { - createWithLocalClusterUpdate(); - - // Ensure we see the right set of added/removed hosts on every call. - uint32_t calls = 0; - EXPECT_CALL(local_cluster_update_, post(_, _, _)) - .WillRepeatedly(Invoke([&calls](uint32_t priority, const HostVector& hosts_added, - const HostVector& hosts_removed) -> void { - switch (calls) { - case 0: - // First immediate call, nothing was added/removed. - EXPECT_EQ(0, priority); - EXPECT_EQ(0, hosts_added.size()); - EXPECT_EQ(0, hosts_removed.size()); - break; - case 1: - // Merged call, remove/add/remove is merged as 1 remove. - EXPECT_EQ(0, priority); - EXPECT_EQ(0, hosts_added.size()); - EXPECT_EQ(1, hosts_removed.size()); - break; - } - - ++calls; - })); - - const Cluster& cluster = cluster_manager_->clusters().begin()->second; - HostVectorSharedPtr hosts( - new HostVector(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())); - HostsPerLocalitySharedPtr hosts_per_locality = std::make_shared(); - HostVector hosts_added{}; - HostVector hosts_removed{}; - - // No timer needs to be mocked at this point, since the first update should be applied - // immediately. - // No added/removed hosts (emulate metadata or health change). - cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( - hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - - // Now we need the timer to be ready, since createTimer() will be call given that this update - // _will_ be delayed. - Event::MockTimer* timer = new NiceMock(&factory_.dispatcher_); - - // Remove a host. - hosts_added.clear(); - hosts_removed.push_back(makeTestHost(cluster.info(), "tcp://127.0.0.1:11002")); - cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( - hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - - // Add it back. - hosts_added.push_back(makeTestHost(cluster.info(), "tcp://127.0.0.1:11002")); + // Add the host back, the update should be immediately applied. hosts_removed.clear(); + hosts_added.push_back((*hosts)[0]); cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - - // Remove it again. - hosts_added.clear(); - hosts_removed.push_back(makeTestHost(cluster.info(), "tcp://127.0.0.1:11002")); - cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( - hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - - // Ensure the merged updates were applied. - timer->callback_(); - EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates").value()); - - // postThreadLocalClusterUpdate() calls. - EXPECT_EQ(2, calls); -} - -TEST_F(ClusterManagerImplTest, MergedUpdatesHostChangesInPlace) { - createWithLocalClusterUpdate(); - - // Ensure we see the right set of added/removed hosts on every call. - uint32_t calls = 0; - EXPECT_CALL(local_cluster_update_, post(_, _, _)) - .WillRepeatedly(Invoke([&calls](uint32_t priority, const HostVector& hosts_added, - const HostVector& hosts_removed) -> void { - switch (calls) { - case 0: - // First immediate call, nothing was added/removed. - EXPECT_EQ(0, priority); - EXPECT_EQ(0, hosts_added.size()); - EXPECT_EQ(0, hosts_removed.size()); - break; - case 1: - // Merged call, failed/active/failed is merged as 1 call. - // Note, note, note: the host gets updated in place in the original - // list of hosts in this case, so nothing expected here. - EXPECT_EQ(0, priority); - EXPECT_EQ(0, hosts_added.size()); - EXPECT_EQ(0, hosts_removed.size()); - break; - } - - ++calls; - })); - - const Cluster& cluster = cluster_manager_->clusters().begin()->second; - HostVectorSharedPtr hosts( - new HostVector(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())); - HostsPerLocalitySharedPtr hosts_per_locality = std::make_shared(); - HostVector hosts_added{}; - HostVector hosts_removed{}; - - // No timer needs to be mocked at this point, since the first update should be applied - // immediately. - // No added/removed hosts (emulate metadata or health change). - cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( - hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - - // Now we need the timer to be ready, since createTimer() will be call given that this update - // _will_ be delayed. - Event::MockTimer* timer = new NiceMock(&factory_.dispatcher_); - - // Host becomes failed (no need to actually change it here). - cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( - hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - // Host becomes active. - cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( - hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - // Host becomes failed again. - cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( - hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - - // Ensure the merged updates were applied. - timer->callback_(); EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates").value()); - // postThreadLocalClusterUpdate() calls. - EXPECT_EQ(2, calls); -} - -TEST_F(ClusterManagerImplTest, MergedUpdatesMetadataLatest) { - createWithLocalClusterUpdate(); - - // Ensure we see the right set of added/removed hosts on every call. - uint32_t calls = 0; - EXPECT_CALL(local_cluster_update_, post(_, _, _)) - .WillRepeatedly(Invoke([&calls](uint32_t priority, const HostVector& hosts_added, - const HostVector& hosts_removed) -> void { - switch (calls) { - case 0: - // First immediate call, nothing was added/removed. - EXPECT_EQ(0, priority); - EXPECT_EQ(0, hosts_added.size()); - EXPECT_EQ(0, hosts_removed.size()); - break; - case 1: - // Merged call, add/remove/add is merged as 1 add. - // We should see the latest metadata. - EXPECT_EQ(0, priority); - EXPECT_EQ(1, hosts_added.size()); - EXPECT_EQ(0, hosts_removed.size()); - EXPECT_EQ(Config::Metadata::metadataValue(*hosts_added[0]->metadata(), - Config::MetadataFilters::get().ENVOY_LB, - "version") - .string_value(), - std::string("v1")); - break; - } - - ++calls; - })); - - const Cluster& cluster = cluster_manager_->clusters().begin()->second; - HostVectorSharedPtr hosts( - new HostVector(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())); - HostsPerLocalitySharedPtr hosts_per_locality = std::make_shared(); - HostVector hosts_added{}; - HostVector hosts_removed{}; - - // No timer needs to be mocked at this point, since the first update should be applied - // immediately. - // No added/removed hosts (emulate metadata or health change). - cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( - hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - - // Now we need the timer to be ready, since createTimer() will be call given that this update - // _will_ be delayed. - Event::MockTimer* timer = new NiceMock(&factory_.dispatcher_); - - // Add the host without metadata. - hosts_added.push_back(makeTestHost(cluster.info(), "tcp://127.0.0.1:11003")); - cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( - hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - - // Remove it. + // Now emit 3 updates that should be scheduled: metadata, HC, and weight. hosts_added.clear(); - hosts_removed.push_back(makeTestHost(cluster.info(), "tcp://127.0.0.1:11003")); - cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( - hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - // Add it again, with metadata. - auto host = makeTestHost(cluster.info(), "tcp://127.0.0.1:11003"); - host->metadata(buildMetadata("v1")); - hosts_added.push_back(host); - hosts_removed.clear(); + (*hosts)[0]->metadata(buildMetadata("v1")); cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - // Ensure the merged updates were applied. - timer->callback_(); - EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates").value()); - - // postThreadLocalClusterUpdate() calls. - EXPECT_EQ(2, calls); -} - -TEST_F(ClusterManagerImplTest, MergedUpdatesWeightLatest) { - createWithLocalClusterUpdate(); - - // Ensure we see the right set of added/removed hosts on every call. - uint32_t calls = 0; - EXPECT_CALL(local_cluster_update_, post(_, _, _)) - .WillRepeatedly(Invoke([&calls](uint32_t priority, const HostVector& hosts_added, - const HostVector& hosts_removed) -> void { - switch (calls) { - case 0: - // First immediate call, nothing was added/removed. - EXPECT_EQ(0, priority); - EXPECT_EQ(0, hosts_added.size()); - EXPECT_EQ(0, hosts_removed.size()); - break; - case 1: - // Merged call, add/remove/add is merged as 1 add. - // We should the latest weight - EXPECT_EQ(0, priority); - EXPECT_EQ(1, hosts_added.size()); - EXPECT_EQ(0, hosts_removed.size()); - EXPECT_EQ(hosts_added[0]->weight(), 100); - break; - } - - ++calls; - })); - - const Cluster& cluster = cluster_manager_->clusters().begin()->second; - HostVectorSharedPtr hosts( - new HostVector(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())); - HostsPerLocalitySharedPtr hosts_per_locality = std::make_shared(); - HostVector hosts_added{}; - HostVector hosts_removed{}; - - // No timer needs to be mocked at this point, since the first update should be applied - // immediately. - // No added/removed hosts (emulate metadata or health change). + (*hosts)[0]->healthFlagSet(Host::HealthFlag::FAILED_EDS_HEALTH); cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - // Now we need the timer to be ready, since createTimer() will be call given that this update - // _will_ be delayed. - Event::MockTimer* timer = new NiceMock(&factory_.dispatcher_); - - // Add the host with some weight. - auto host = makeTestHost(cluster.info(), "tcp://127.0.0.1:11003"); - host->weight(50); - hosts_added.push_back(host); + (*hosts)[0]->weight(100); cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - // Remove it. - hosts_added.clear(); - hosts_removed.push_back(makeTestHost(cluster.info(), "tcp://127.0.0.1:11003")); - cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( - hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + // Updates not delivered yet. + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates").value()); - // Add it again, with a new weight. - auto updated_host = makeTestHost(cluster.info(), "tcp://127.0.0.1:11003"); - updated_host->weight(100); - hosts_added.push_back(updated_host); - hosts_removed.clear(); + // Remove the host again, should cancel the scheduled update and be delivered immediately. + hosts_removed.push_back((*hosts)[0]); cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - // Ensure the merged updates were applied. - timer->callback_(); - EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates").value()); - - // postThreadLocalClusterUpdate() calls. - EXPECT_EQ(2, calls); + EXPECT_EQ(2, factory_.stats_.counter("cluster_manager.merged_updates").value()); } class ClusterManagerInitHelperTest : public testing::Test { From f88c2970595339f4ec08b89069149929e899162b Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Fri, 27 Jul 2018 17:20:28 -0700 Subject: [PATCH 16/38] Fix memory leak Use a weak_ptr to avoid the circular reference the lambda used for the timer generates (given it taked a copy of the shared_ptr that is used for its own storage). Signed-off-by: Raul Gutierrez Segales --- source/common/upstream/cluster_manager_impl.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index 72d470f58ee29..0e62845f761ff 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -412,8 +412,12 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit // If there's no timer, create one. if (updates->timer_ == nullptr) { - updates->timer_ = dispatcher_.createTimer([this, &cluster, priority, updates]() -> void { - applyUpdates(cluster, priority, updates); + std::weak_ptr weak_ptr(updates); + updates->timer_ = dispatcher_.createTimer([this, &cluster, priority, weak_ptr]() -> void { + auto updates = weak_ptr.lock(); + if (updates) { + applyUpdates(cluster, priority, updates); + } }); } From a1145dd8df0818ca3b57413e5e649d1b281e27ef Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Sun, 29 Jul 2018 22:43:09 -0700 Subject: [PATCH 17/38] Make 2 local vars const. Signed-off-by: Raul Gutierrez Segales --- source/common/upstream/cluster_manager_impl.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index 0e62845f761ff..8c7f7168ddda7 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -345,11 +345,11 @@ void ClusterManagerImpl::onClusterInit(Cluster& cluster) { // See https://github.com/envoyproxy/envoy/pull/3941 for more context. bool scheduled = false; - bool merging_enabled = cluster.info()->lbConfig().has_update_merge_window(); + const bool merging_enabled = cluster.info()->lbConfig().has_update_merge_window(); if (merging_enabled) { // Remember: we only merge updates with no adds/removes — just hc/weight/metadata changes. - bool is_mergeable = !hosts_added.size() && !hosts_removed.size(); + const bool is_mergeable = !hosts_added.size() && !hosts_removed.size(); // If this is not mergeable, we should cancel any scheduled updates since // we'll deliver it immediately. From b33b08095b505a07d4d8654fc40003abe2d8c9ba Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Mon, 30 Jul 2018 08:33:41 -0700 Subject: [PATCH 18/38] More constness, remove newline & spelling. Signed-off-by: Raul Gutierrez Segales --- docs/root/intro/version_history.rst | 2 +- source/common/upstream/cluster_manager_impl.cc | 1 - source/common/upstream/cluster_manager_impl.h | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst index f29daf0aa1ed7..e49d702fd315d 100644 --- a/docs/root/intro/version_history.rst +++ b/docs/root/intro/version_history.rst @@ -48,7 +48,7 @@ Version history * upstream: added configuration option to the subset load balancer to take locality weights into account when selecting a host from a subset. * access log: added RESPONSE_DURATION and RESPONSE_TX_DURATION. -* cluster: added :ref:`option ` to merge healthcheck/weight/metadata +* cluster: added :ref:`option ` to merge health check/weight/metadata updates within the given duration. 1.7.0 diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index 8c7f7168ddda7..ceab9d9e7aa04 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -343,7 +343,6 @@ void ClusterManagerImpl::onClusterInit(Cluster& cluster) { // of removals, these maps will leak those HostSharedPtrs. // // See https://github.com/envoyproxy/envoy/pull/3941 for more context. - bool scheduled = false; const bool merging_enabled = cluster.info()->lbConfig().has_update_merge_window(); diff --git a/source/common/upstream/cluster_manager_impl.h b/source/common/upstream/cluster_manager_impl.h index 84521e785ea8d..497ca2966c520 100644 --- a/source/common/upstream/cluster_manager_impl.h +++ b/source/common/upstream/cluster_manager_impl.h @@ -378,7 +378,7 @@ class ClusterManagerImpl : public ClusterManager, Logger::LoggabledisableTimer(); timer_enabled_ = false; From d70809c7b89f064f6671a128025dbd0c08b39823 Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Mon, 30 Jul 2018 09:10:47 -0700 Subject: [PATCH 19/38] Simplify use of shared_ptrs/refs/weak ptrs * we don't really need shared_ptrs, use unique_ptrs * for the lambda, use a ref instead of weak ptr now that we are not using a shared ptr anymore Things should be more readable. Signed-off-by: Raul Gutierrez Segales --- .../common/upstream/cluster_manager_impl.cc | 30 ++++++------------- source/common/upstream/cluster_manager_impl.h | 6 ++-- 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index ceab9d9e7aa04..f3cb0c13baa31 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -374,26 +374,18 @@ void ClusterManagerImpl::onClusterInit(Cluster& cluster) { bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priority, bool mergeable) { const auto& update_merge_window = cluster.info()->lbConfig().update_merge_window(); const auto timeout = DurationUtil::durationToMilliseconds(update_merge_window); - PendingUpdatesByPriorityMapPtr updates_by_prio; - PendingUpdatesPtr updates; // Find pending updates for this cluster. - const auto updates_by_prio_it = updates_map_.find(cluster.info()->name()); - if (updates_by_prio_it != updates_map_.end()) { - updates_by_prio = updates_by_prio_it->second; - } else { - updates_by_prio = std::make_shared(); - updates_map_[cluster.info()->name()] = updates_by_prio; + if (updates_map_.count(cluster.info()->name()) != 1) { + updates_map_[cluster.info()->name()] = std::make_unique(); } + PendingUpdatesByPriorityMapPtr& updates_by_prio = updates_map_[cluster.info()->name()]; // Find pending updates for this priority. - const auto updates_it = updates_by_prio->find(priority); - if (updates_it != updates_by_prio->end()) { - updates = updates_it->second; - } else { - updates = std::make_shared(); - (*updates_by_prio)[priority] = updates; + if (updates_by_prio->count(priority) != 1) { + (*updates_by_prio)[priority] = std::make_unique(); } + PendingUpdatesPtr& updates = (*updates_by_prio)[priority]; // Has an update_merge_window gone by since the last update? If so, don't schedule // the update so it can be applied immediately. Ditto if this is not a mergeable update. @@ -411,12 +403,8 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit // If there's no timer, create one. if (updates->timer_ == nullptr) { - std::weak_ptr weak_ptr(updates); - updates->timer_ = dispatcher_.createTimer([this, &cluster, priority, weak_ptr]() -> void { - auto updates = weak_ptr.lock(); - if (updates) { - applyUpdates(cluster, priority, updates); - } + updates->timer_ = dispatcher_.createTimer([this, &cluster, priority, &updates]() -> void { + applyUpdates(cluster, priority, updates); }); } @@ -429,7 +417,7 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit } void ClusterManagerImpl::applyUpdates(const Cluster& cluster, uint32_t priority, - PendingUpdatesPtr updates) { + PendingUpdatesPtr& updates) { // Deliver pending updates. // Remember that these merged updates are _only_ for updates related to diff --git a/source/common/upstream/cluster_manager_impl.h b/source/common/upstream/cluster_manager_impl.h index 497ca2966c520..61266a4944486 100644 --- a/source/common/upstream/cluster_manager_impl.h +++ b/source/common/upstream/cluster_manager_impl.h @@ -386,12 +386,12 @@ class ClusterManagerImpl : public ClusterManager, Logger::Loggable; + using PendingUpdatesPtr = std::unique_ptr; using PendingUpdatesByPriorityMap = std::unordered_map; - using PendingUpdatesByPriorityMapPtr = std::shared_ptr; + using PendingUpdatesByPriorityMapPtr = std::unique_ptr; using ClusterUpdatesMap = std::unordered_map; - void applyUpdates(const Cluster& cluster, uint32_t priority, PendingUpdatesPtr updates); + void applyUpdates(const Cluster& cluster, uint32_t priority, PendingUpdatesPtr& updates); bool scheduleUpdate(const Cluster& cluster, uint32_t priority, bool mergeable); void createOrUpdateThreadLocalCluster(ClusterData& cluster); ProtobufTypes::MessagePtr dumpClusterConfigs(); From 10d3670de9fca1c8c3e125f88ae2c551a53875ac Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Mon, 30 Jul 2018 10:22:09 -0700 Subject: [PATCH 20/38] More review comments * pass around an object ref instead of a unique_ptr * initialize bool with {} Signed-off-by: Raul Gutierrez Segales --- source/common/upstream/cluster_manager_impl.cc | 8 ++++---- source/common/upstream/cluster_manager_impl.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index f3cb0c13baa31..e263c8bc19ec9 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -404,7 +404,7 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit // If there's no timer, create one. if (updates->timer_ == nullptr) { updates->timer_ = dispatcher_.createTimer([this, &cluster, priority, &updates]() -> void { - applyUpdates(cluster, priority, updates); + applyUpdates(cluster, priority, *updates); }); } @@ -417,7 +417,7 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit } void ClusterManagerImpl::applyUpdates(const Cluster& cluster, uint32_t priority, - PendingUpdatesPtr& updates) { + PendingUpdates& updates) { // Deliver pending updates. // Remember that these merged updates are _only_ for updates related to @@ -429,8 +429,8 @@ void ClusterManagerImpl::applyUpdates(const Cluster& cluster, uint32_t priority, postThreadLocalClusterUpdate(cluster, priority, hosts_added, hosts_removed); cm_stats_.merged_updates_.inc(); - updates->timer_enabled_ = false; - updates->last_updated_ = std::chrono::steady_clock::now(); + updates.timer_enabled_ = false; + updates.last_updated_ = std::chrono::steady_clock::now(); } bool ClusterManagerImpl::addOrUpdateCluster(const envoy::api::v2::Cluster& cluster, diff --git a/source/common/upstream/cluster_manager_impl.h b/source/common/upstream/cluster_manager_impl.h index 61266a4944486..2e5af19d6938b 100644 --- a/source/common/upstream/cluster_manager_impl.h +++ b/source/common/upstream/cluster_manager_impl.h @@ -369,7 +369,7 @@ class ClusterManagerImpl : public ClusterManager, Logger::Loggable; using ClusterUpdatesMap = std::unordered_map; - void applyUpdates(const Cluster& cluster, uint32_t priority, PendingUpdatesPtr& updates); + void applyUpdates(const Cluster& cluster, uint32_t priority, PendingUpdates& updates); bool scheduleUpdate(const Cluster& cluster, uint32_t priority, bool mergeable); void createOrUpdateThreadLocalCluster(ClusterData& cluster); ProtobufTypes::MessagePtr dumpClusterConfigs(); From 80e03db4c86e94a224f65612f513dad9c69c429b Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Mon, 30 Jul 2018 11:33:04 -0700 Subject: [PATCH 21/38] version history: sort & reflow cols Signed-off-by: Raul Gutierrez Segales --- docs/root/intro/version_history.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst index e49d702fd315d..a5fd30aa08224 100644 --- a/docs/root/intro/version_history.rst +++ b/docs/root/intro/version_history.rst @@ -5,10 +5,13 @@ Version history =============== * access log: added :ref:`response flag filter ` to filter based on the presence of Envoy response flags. +* access log: added RESPONSE_DURATION and RESPONSE_TX_DURATION. * admin: added :http:get:`/hystrix_event_stream` as an endpoint for monitoring envoy's statistics through `Hystrix dashboard `_. * grpc-json: added support for building HTTP response from `google.api.HttpBody `_. +* cluster: added :ref:`option ` to merge + health check/weight/metadata updates within the given duration. * config: v1 disabled by default. v1 support remains available until October via flipping --v2-config-only=false. * config: v1 disabled by default. v1 support remains available until October via setting :option:`--allow-deprecated-v1-api`. * health check: added support for :ref:`custom health check `. @@ -47,9 +50,6 @@ Version history `. * upstream: added configuration option to the subset load balancer to take locality weights into account when selecting a host from a subset. -* access log: added RESPONSE_DURATION and RESPONSE_TX_DURATION. -* cluster: added :ref:`option ` to merge health check/weight/metadata - updates within the given duration. 1.7.0 =============== From dcd717467f10ec0d95f2421c1e3e1fbd7bd3127e Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Mon, 30 Jul 2018 11:42:38 -0700 Subject: [PATCH 22/38] Collapse the number of map lookups Signed-off-by: Raul Gutierrez Segales --- source/common/upstream/cluster_manager_impl.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index e263c8bc19ec9..df6a43e9626ba 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -376,16 +376,16 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit const auto timeout = DurationUtil::durationToMilliseconds(update_merge_window); // Find pending updates for this cluster. - if (updates_map_.count(cluster.info()->name()) != 1) { - updates_map_[cluster.info()->name()] = std::make_unique(); + auto& updates_by_prio = updates_map_[cluster.info()->name()]; + if (!updates_by_prio) { + updates_by_prio.reset(new PendingUpdatesByPriorityMap()); } - PendingUpdatesByPriorityMapPtr& updates_by_prio = updates_map_[cluster.info()->name()]; // Find pending updates for this priority. - if (updates_by_prio->count(priority) != 1) { - (*updates_by_prio)[priority] = std::make_unique(); + auto& updates = (*updates_by_prio)[priority]; + if (!updates) { + updates.reset(new PendingUpdates()); } - PendingUpdatesPtr& updates = (*updates_by_prio)[priority]; // Has an update_merge_window gone by since the last update? If so, don't schedule // the update so it can be applied immediately. Ditto if this is not a mergeable update. From b3745de6d2fdb08d3e78fd25d7c5295c9507d3af Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Mon, 30 Jul 2018 11:44:19 -0700 Subject: [PATCH 23/38] Unconditionally remove cluster from updates map Signed-off-by: Raul Gutierrez Segales --- source/common/upstream/cluster_manager_impl.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index df6a43e9626ba..c71a0c30922c4 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -559,10 +559,8 @@ bool ClusterManagerImpl::removeCluster(const std::string& cluster_name) { cm_stats_.cluster_removed_.inc(); updateGauges(); // Did we ever deliver merged updates for this cluster? - if (updates_map_.count(cluster_name) == 1) { - // No need to manually disable timers, this should take care of it. - updates_map_.erase(cluster_name); - } + // No need to manually disable timers, this should take care of it. + updates_map_.erase(cluster_name); } return removed; From 6fac84fc093bcf0322810794f156b2b04513365b Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Mon, 30 Jul 2018 11:54:26 -0700 Subject: [PATCH 24/38] Reorder members of PendingUpdates Also, add a comment about how the MonotonicTime time_point is default constructed to the clock's epoch and how we rely on this to immediately deliver the first seen update. Signed-off-by: Raul Gutierrez Segales --- source/common/upstream/cluster_manager_impl.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/source/common/upstream/cluster_manager_impl.h b/source/common/upstream/cluster_manager_impl.h index 2e5af19d6938b..8d0c3cf296ea3 100644 --- a/source/common/upstream/cluster_manager_impl.h +++ b/source/common/upstream/cluster_manager_impl.h @@ -368,9 +368,6 @@ class ClusterManagerImpl : public ClusterManager, Logger::Loggable ClusterMap; struct PendingUpdates { - Event::TimerPtr timer_; - bool timer_enabled_{}; - MonotonicTime last_updated_; void enableTimer(const uint64_t timeout) { if (timer_ != nullptr) { timer_->enableTimer(std::chrono::milliseconds(timeout)); @@ -385,6 +382,16 @@ class ClusterManagerImpl : public ClusterManager, Logger::Loggable; using PendingUpdatesByPriorityMap = std::unordered_map; From 9eae09adcc8f3f52ef9de57d4522369621e18f3e Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Mon, 30 Jul 2018 11:59:32 -0700 Subject: [PATCH 25/38] Add TODO for extending Event::Timer's interface Signed-off-by: Raul Gutierrez Segales --- source/common/upstream/cluster_manager_impl.h | 1 + 1 file changed, 1 insertion(+) diff --git a/source/common/upstream/cluster_manager_impl.h b/source/common/upstream/cluster_manager_impl.h index 8d0c3cf296ea3..9bd73d30d0c0f 100644 --- a/source/common/upstream/cluster_manager_impl.h +++ b/source/common/upstream/cluster_manager_impl.h @@ -384,6 +384,7 @@ class ClusterManagerImpl : public ClusterManager, Logger::Loggable Date: Mon, 30 Jul 2018 12:01:26 -0700 Subject: [PATCH 26/38] Make const empty vectors static No need to construct them on every call. Signed-off-by: Raul Gutierrez Segales --- source/common/upstream/cluster_manager_impl.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index c71a0c30922c4..b6707e14785e6 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -423,8 +423,8 @@ void ClusterManagerImpl::applyUpdates(const Cluster& cluster, uint32_t priority, // Remember that these merged updates are _only_ for updates related to // HC/weight/metadata changes. That's why added/removed are empty. All // adds/removals were already immediately broadcasted. - const HostVector hosts_added; - const HostVector hosts_removed; + static const HostVector hosts_added; + static const HostVector hosts_removed; postThreadLocalClusterUpdate(cluster, priority, hosts_added, hosts_removed); From 4d0c528c1a9ea8a2309ee46ecdeb550eb1ed05ab Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Mon, 30 Jul 2018 12:08:24 -0700 Subject: [PATCH 27/38] Assert we are not enabling an already enabled timer Signed-off-by: Raul Gutierrez Segales --- source/common/upstream/cluster_manager_impl.h | 1 + 1 file changed, 1 insertion(+) diff --git a/source/common/upstream/cluster_manager_impl.h b/source/common/upstream/cluster_manager_impl.h index 9bd73d30d0c0f..6ceb592b5be62 100644 --- a/source/common/upstream/cluster_manager_impl.h +++ b/source/common/upstream/cluster_manager_impl.h @@ -369,6 +369,7 @@ class ClusterManagerImpl : public ClusterManager, Logger::LoggableenableTimer(std::chrono::milliseconds(timeout)); timer_enabled_ = true; From 09d56cccce6660de7229b8b66d1dd88e097de9c9 Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Mon, 30 Jul 2018 12:56:39 -0700 Subject: [PATCH 28/38] More counters Lets track 3 more events: * regular updates * merged updates that were cancelled * mergeable updates that arrived outside of a merge window Updated tests to assert these behave correctly. Signed-off-by: Raul Gutierrez Segales --- .../common/upstream/cluster_manager_impl.cc | 19 ++++++++++++++----- source/common/upstream/cluster_manager_impl.h | 3 +++ .../upstream/cluster_manager_impl_test.cc | 8 +++++++- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index b6707e14785e6..a344f1be36641 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -345,11 +345,10 @@ void ClusterManagerImpl::onClusterInit(Cluster& cluster) { // See https://github.com/envoyproxy/envoy/pull/3941 for more context. bool scheduled = false; const bool merging_enabled = cluster.info()->lbConfig().has_update_merge_window(); + // Remember: we only merge updates with no adds/removes — just hc/weight/metadata changes. + const bool is_mergeable = !hosts_added.size() && !hosts_removed.size(); if (merging_enabled) { - // Remember: we only merge updates with no adds/removes — just hc/weight/metadata changes. - const bool is_mergeable = !hosts_added.size() && !hosts_removed.size(); - // If this is not mergeable, we should cancel any scheduled updates since // we'll deliver it immediately. scheduled = scheduleUpdate(cluster, priority, is_mergeable); @@ -357,6 +356,16 @@ void ClusterManagerImpl::onClusterInit(Cluster& cluster) { // If an update was not scheduled for later, deliver it immediately. if (!scheduled) { + // Is this a regular update? + if (!is_mergeable) { + cm_stats_.regular_updates_.inc(); + } + + // Or are we outside a merge window? + if (is_mergeable && !scheduled) { + cm_stats_.merged_updates_offwindow_.inc(); + } + postThreadLocalClusterUpdate(cluster, priority, hosts_added, hosts_removed); } }); @@ -392,9 +401,9 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit const auto delta = std::chrono::steady_clock::now() - updates->last_updated_; const uint64_t delta_ms = std::chrono::duration_cast(delta).count(); if (delta_ms > timeout || !mergeable) { - // If there was a pending update, we count this update as merged update. + // If there was a pending update, we cancel the pending merged update. if (updates->disableTimer()) { - cm_stats_.merged_updates_.inc(); + cm_stats_.merged_updates_cancelled_.inc(); } updates->last_updated_ = std::chrono::steady_clock::now(); diff --git a/source/common/upstream/cluster_manager_impl.h b/source/common/upstream/cluster_manager_impl.h index 6ceb592b5be62..f6eb49feb6394 100644 --- a/source/common/upstream/cluster_manager_impl.h +++ b/source/common/upstream/cluster_manager_impl.h @@ -138,7 +138,10 @@ class ClusterManagerInitHelper : Logger::Loggable { COUNTER(cluster_added) \ COUNTER(cluster_modified) \ COUNTER(cluster_removed) \ + COUNTER(regular_updates) \ COUNTER(merged_updates) \ + COUNTER(merged_updates_cancelled) \ + COUNTER(merged_updates_offwindow) \ GAUGE (active_clusters) \ GAUGE (warming_clusters) // clang-format on diff --git a/test/common/upstream/cluster_manager_impl_test.cc b/test/common/upstream/cluster_manager_impl_test.cc index 29287a44a206f..79cb68579dd08 100644 --- a/test/common/upstream/cluster_manager_impl_test.cc +++ b/test/common/upstream/cluster_manager_impl_test.cc @@ -1745,6 +1745,7 @@ TEST_F(ClusterManagerImplTest, MergedUpdates) { hosts_removed.push_back((*hosts)[0]); cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.regular_updates").value()); EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.merged_updates").value()); // This calls should be merged, since there are not added/removed hosts. @@ -1753,10 +1754,12 @@ TEST_F(ClusterManagerImplTest, MergedUpdates) { hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.regular_updates").value()); EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.merged_updates").value()); // Ensure the merged updates were applied. timer->callback_(); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.regular_updates").value()); EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates").value()); // Add the host back, the update should be immediately applied. @@ -1764,6 +1767,7 @@ TEST_F(ClusterManagerImplTest, MergedUpdates) { hosts_added.push_back((*hosts)[0]); cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + EXPECT_EQ(2, factory_.stats_.counter("cluster_manager.regular_updates").value()); EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates").value()); // Now emit 3 updates that should be scheduled: metadata, HC, and weight. @@ -1782,6 +1786,7 @@ TEST_F(ClusterManagerImplTest, MergedUpdates) { hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); // Updates not delivered yet. + EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.merged_updates_cancelled").value()); EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates").value()); // Remove the host again, should cancel the scheduled update and be delivered immediately. @@ -1789,7 +1794,8 @@ TEST_F(ClusterManagerImplTest, MergedUpdates) { cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - EXPECT_EQ(2, factory_.stats_.counter("cluster_manager.merged_updates").value()); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates_cancelled").value()); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates").value()); } class ClusterManagerInitHelperTest : public testing::Test { From 3d782348dc2e43afb311b51a1cc3c9f07a4cad9a Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Mon, 30 Jul 2018 13:01:10 -0700 Subject: [PATCH 29/38] Explain a possible race condition Explain what happens when a we are outside a merge window, but there's an enabled timer. Signed-off-by: Raul Gutierrez Segales --- source/common/upstream/cluster_manager_impl.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index a344f1be36641..b941f93340254 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -402,6 +402,10 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit const uint64_t delta_ms = std::chrono::duration_cast(delta).count(); if (delta_ms > timeout || !mergeable) { // If there was a pending update, we cancel the pending merged update. + // + // Note: it's possible that even though we are outside of a merge window (delta_ms > timeout), + // a timer is enabled. This race condition is fine, since we'll disable the timer here and + // deliver the update immediately. if (updates->disableTimer()) { cm_stats_.merged_updates_cancelled_.inc(); } From 6408528f124fbbee001129ab2376208f65234759 Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Mon, 30 Jul 2018 13:06:16 -0700 Subject: [PATCH 30/38] Simplify branching Signed-off-by: Raul Gutierrez Segales --- source/common/upstream/cluster_manager_impl.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index b941f93340254..d4ba249d48d3f 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -359,11 +359,11 @@ void ClusterManagerImpl::onClusterInit(Cluster& cluster) { // Is this a regular update? if (!is_mergeable) { cm_stats_.regular_updates_.inc(); - } - - // Or are we outside a merge window? - if (is_mergeable && !scheduled) { - cm_stats_.merged_updates_offwindow_.inc(); + } else { + // Or are we outside a merge window? + if (!scheduled) { + cm_stats_.merged_updates_offwindow_.inc(); + } } postThreadLocalClusterUpdate(cluster, priority, hosts_added, hosts_removed); From 31e4fdf8fbb6264e020060886252df0da2af641f Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Mon, 30 Jul 2018 14:07:49 -0700 Subject: [PATCH 31/38] Increment counter if merging is enabled Lets only bump `merged_updates_offwindow` if merging is on. Signed-off-by: Raul Gutierrez Segales --- source/common/upstream/cluster_manager_impl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index d4ba249d48d3f..178cd50f7126d 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -361,7 +361,7 @@ void ClusterManagerImpl::onClusterInit(Cluster& cluster) { cm_stats_.regular_updates_.inc(); } else { // Or are we outside a merge window? - if (!scheduled) { + if (merging_enabled && !scheduled) { cm_stats_.merged_updates_offwindow_.inc(); } } From a5a5391802f101b79f6ba02c8332b8e92c28cc92 Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Mon, 30 Jul 2018 15:40:40 -0700 Subject: [PATCH 32/38] Counter renames + more tests * renamed counters for consistency with existing counters * test for offwindow updates * test that merging related counters don't get updated when merging is disabled Signed-off-by: Raul Gutierrez Segales --- .../common/upstream/cluster_manager_impl.cc | 24 ++-- source/common/upstream/cluster_manager_impl.h | 8 +- .../upstream/cluster_manager_impl_test.cc | 105 +++++++++++++++--- 3 files changed, 106 insertions(+), 31 deletions(-) diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index 178cd50f7126d..e39d8d07c341c 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -356,16 +356,7 @@ void ClusterManagerImpl::onClusterInit(Cluster& cluster) { // If an update was not scheduled for later, deliver it immediately. if (!scheduled) { - // Is this a regular update? - if (!is_mergeable) { - cm_stats_.regular_updates_.inc(); - } else { - // Or are we outside a merge window? - if (merging_enabled && !scheduled) { - cm_stats_.merged_updates_offwindow_.inc(); - } - } - + cm_stats_.cluster_updated_.inc(); postThreadLocalClusterUpdate(cluster, priority, hosts_added, hosts_removed); } }); @@ -400,14 +391,21 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit // the update so it can be applied immediately. Ditto if this is not a mergeable update. const auto delta = std::chrono::steady_clock::now() - updates->last_updated_; const uint64_t delta_ms = std::chrono::duration_cast(delta).count(); - if (delta_ms > timeout || !mergeable) { + const bool offwindow = delta_ms > timeout; + if (offwindow || !mergeable) { // If there was a pending update, we cancel the pending merged update. // // Note: it's possible that even though we are outside of a merge window (delta_ms > timeout), // a timer is enabled. This race condition is fine, since we'll disable the timer here and // deliver the update immediately. + + // Why wasn't the update scheduled for later delivery? + if (mergeable && offwindow) { + cm_stats_.update_merge_offwindow_.inc(); + } + if (updates->disableTimer()) { - cm_stats_.merged_updates_cancelled_.inc(); + cm_stats_.update_merge_cancelled_.inc(); } updates->last_updated_ = std::chrono::steady_clock::now(); @@ -441,7 +439,7 @@ void ClusterManagerImpl::applyUpdates(const Cluster& cluster, uint32_t priority, postThreadLocalClusterUpdate(cluster, priority, hosts_added, hosts_removed); - cm_stats_.merged_updates_.inc(); + cm_stats_.cluster_updated_via_merge_.inc(); updates.timer_enabled_ = false; updates.last_updated_ = std::chrono::steady_clock::now(); } diff --git a/source/common/upstream/cluster_manager_impl.h b/source/common/upstream/cluster_manager_impl.h index f6eb49feb6394..ca9960e76d675 100644 --- a/source/common/upstream/cluster_manager_impl.h +++ b/source/common/upstream/cluster_manager_impl.h @@ -138,10 +138,10 @@ class ClusterManagerInitHelper : Logger::Loggable { COUNTER(cluster_added) \ COUNTER(cluster_modified) \ COUNTER(cluster_removed) \ - COUNTER(regular_updates) \ - COUNTER(merged_updates) \ - COUNTER(merged_updates_cancelled) \ - COUNTER(merged_updates_offwindow) \ + COUNTER(cluster_updated) \ + COUNTER(cluster_updated_via_merge) \ + COUNTER(update_merge_cancelled) \ + COUNTER(update_merge_offwindow) \ GAUGE (active_clusters) \ GAUGE (warming_clusters) // clang-format on diff --git a/test/common/upstream/cluster_manager_impl_test.cc b/test/common/upstream/cluster_manager_impl_test.cc index 79cb68579dd08..da5ca7b6863b3 100644 --- a/test/common/upstream/cluster_manager_impl_test.cc +++ b/test/common/upstream/cluster_manager_impl_test.cc @@ -175,8 +175,8 @@ class ClusterManagerImplTest : public testing::Test { monotonic_time_source_)); } - void createWithLocalClusterUpdate() { - const std::string yaml = R"EOF( + void createWithLocalClusterUpdate(const bool enable_merge_window=true) { + std::string yaml = R"EOF( static_resources: clusters: - name: cluster_1 @@ -190,9 +190,16 @@ class ClusterManagerImplTest : public testing::Test { - socket_address: address: "127.0.0.1" port_value: 11002 + )EOF"; + const std::string merge_window = R"EOF( common_lb_config: update_merge_window: 3s )EOF"; + + if (enable_merge_window) { + yaml += merge_window; + } + const auto& bootstrap = parseBootstrapFromV2Yaml(yaml); cluster_manager_.reset(new TestClusterManagerImpl( @@ -1745,8 +1752,9 @@ TEST_F(ClusterManagerImplTest, MergedUpdates) { hosts_removed.push_back((*hosts)[0]); cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.regular_updates").value()); - EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.merged_updates").value()); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.cluster_updated").value()); + EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.cluster_updated_via_merge").value()); + EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.update_merge_cancelled").value()); // This calls should be merged, since there are not added/removed hosts. hosts_removed.clear(); @@ -1754,21 +1762,24 @@ TEST_F(ClusterManagerImplTest, MergedUpdates) { hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.regular_updates").value()); - EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.merged_updates").value()); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.cluster_updated").value()); + EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.cluster_updated_via_merge").value()); + EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.update_merge_cancelled").value()); // Ensure the merged updates were applied. timer->callback_(); - EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.regular_updates").value()); - EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates").value()); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.cluster_updated").value()); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.cluster_updated_via_merge").value()); + EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.update_merge_cancelled").value()); // Add the host back, the update should be immediately applied. hosts_removed.clear(); hosts_added.push_back((*hosts)[0]); cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - EXPECT_EQ(2, factory_.stats_.counter("cluster_manager.regular_updates").value()); - EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates").value()); + EXPECT_EQ(2, factory_.stats_.counter("cluster_manager.cluster_updated").value()); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.cluster_updated_via_merge").value()); + EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.update_merge_cancelled").value()); // Now emit 3 updates that should be scheduled: metadata, HC, and weight. hosts_added.clear(); @@ -1786,16 +1797,82 @@ TEST_F(ClusterManagerImplTest, MergedUpdates) { hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); // Updates not delivered yet. - EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.merged_updates_cancelled").value()); - EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates").value()); + EXPECT_EQ(2, factory_.stats_.counter("cluster_manager.cluster_updated").value()); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.cluster_updated_via_merge").value()); + EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.update_merge_cancelled").value()); // Remove the host again, should cancel the scheduled update and be delivered immediately. hosts_removed.push_back((*hosts)[0]); cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); - EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates_cancelled").value()); - EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.merged_updates").value()); + EXPECT_EQ(3, factory_.stats_.counter("cluster_manager.cluster_updated").value()); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.cluster_updated_via_merge").value()); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.update_merge_cancelled").value()); + +} + +// Tests that mergeable updates outside of a window get applied immediately. +TEST_F(ClusterManagerImplTest, MergedUpdatesOffWindow) { + createWithLocalClusterUpdate(); + + // Ensure we see the right set of added/removed hosts on every call. + EXPECT_CALL(local_cluster_update_, post(_, _, _)) + .WillOnce(Invoke([](uint32_t priority, const HostVector& hosts_added, + const HostVector& hosts_removed) -> void { + // HC update, immediately delivered. + EXPECT_EQ(0, priority); + EXPECT_EQ(0, hosts_added.size()); + EXPECT_EQ(0, hosts_removed.size()); + })); + + const Cluster& cluster = cluster_manager_->clusters().begin()->second; + HostVectorSharedPtr hosts( + new HostVector(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())); + HostsPerLocalitySharedPtr hosts_per_locality = std::make_shared(); + HostVector hosts_added; + HostVector hosts_removed; + + // The first update should be applied immediately, because even though it's mergeable + // it's outside a merge window. + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.cluster_updated").value()); + EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.cluster_updated_via_merge").value()); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.update_merge_offwindow").value()); + EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.update_merge_cancelled").value()); +} + +// Tests that mergeable updates outside of a window get applied immediately when +// merging is disabled, and that the counters are correct. +TEST_F(ClusterManagerImplTest, MergedUpdatesOffWindowDisabled) { + createWithLocalClusterUpdate(false); + + // Ensure we see the right set of added/removed hosts on every call. + EXPECT_CALL(local_cluster_update_, post(_, _, _)) + .WillOnce(Invoke([](uint32_t priority, const HostVector& hosts_added, + const HostVector& hosts_removed) -> void { + // HC update, immediately delivered. + EXPECT_EQ(0, priority); + EXPECT_EQ(0, hosts_added.size()); + EXPECT_EQ(0, hosts_removed.size()); + })); + + const Cluster& cluster = cluster_manager_->clusters().begin()->second; + HostVectorSharedPtr hosts( + new HostVector(cluster.prioritySet().hostSetsPerPriority()[0]->hosts())); + HostsPerLocalitySharedPtr hosts_per_locality = std::make_shared(); + HostVector hosts_added; + HostVector hosts_removed; + + // The first update should be applied immediately, because even though it's mergeable + // and outside a merge window, merging is disabled. + cluster.prioritySet().hostSetsPerPriority()[0]->updateHosts( + hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.cluster_updated").value()); + EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.cluster_updated_via_merge").value()); + EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.update_merge_offwindow").value()); + EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.update_merge_cancelled").value()); } class ClusterManagerInitHelperTest : public testing::Test { From 2a264d194c99face988b65c6e29fee5bf842814a Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Mon, 30 Jul 2018 15:48:45 -0700 Subject: [PATCH 33/38] Remove trailing newline Signed-off-by: Raul Gutierrez Segales --- test/common/upstream/cluster_manager_impl_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/test/common/upstream/cluster_manager_impl_test.cc b/test/common/upstream/cluster_manager_impl_test.cc index da5ca7b6863b3..356b7ae96e094 100644 --- a/test/common/upstream/cluster_manager_impl_test.cc +++ b/test/common/upstream/cluster_manager_impl_test.cc @@ -1809,7 +1809,6 @@ TEST_F(ClusterManagerImplTest, MergedUpdates) { EXPECT_EQ(3, factory_.stats_.counter("cluster_manager.cluster_updated").value()); EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.cluster_updated_via_merge").value()); EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.update_merge_cancelled").value()); - } // Tests that mergeable updates outside of a window get applied immediately. From 25faa2b0980f9034ccb0230686ee01674005e32f Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Mon, 30 Jul 2018 15:53:11 -0700 Subject: [PATCH 34/38] Fix format Signed-off-by: Raul Gutierrez Segales --- test/common/upstream/cluster_manager_impl_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/common/upstream/cluster_manager_impl_test.cc b/test/common/upstream/cluster_manager_impl_test.cc index 356b7ae96e094..aa51225fa0de5 100644 --- a/test/common/upstream/cluster_manager_impl_test.cc +++ b/test/common/upstream/cluster_manager_impl_test.cc @@ -175,7 +175,7 @@ class ClusterManagerImplTest : public testing::Test { monotonic_time_source_)); } - void createWithLocalClusterUpdate(const bool enable_merge_window=true) { + void createWithLocalClusterUpdate(const bool enable_merge_window = true) { std::string yaml = R"EOF( static_resources: clusters: From 5affa4cf58fd7ed353636257754bc300b799c71c Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Mon, 30 Jul 2018 20:19:30 -0700 Subject: [PATCH 35/38] Fix spelling Signed-off-by: Raul Gutierrez Segales --- api/envoy/api/v2/cds.proto | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/api/envoy/api/v2/cds.proto b/api/envoy/api/v2/cds.proto index 18f1cc47e982c..ff936754aad04 100644 --- a/api/envoy/api/v2/cds.proto +++ b/api/envoy/api/v2/cds.proto @@ -432,14 +432,14 @@ message Cluster { ZoneAwareLbConfig zone_aware_lb_config = 2; LocalityWeightedLbConfig locality_weighted_lb_config = 3; } - // If set, all healthcheck/weight/metadata updates that happen within this duration will be + // If set, all health check/weight/metadata updates that happen within this duration will be // merged and delivered in one shot when the duration expires. The start of the duration is when // the first update happens. This is useful for big clusters, with potentially noisy deploys // that might trigger excessive CPU usage due to a constant stream of healthcheck state changes // or metadata updates. By default, this is not configured and updates apply immediately. Also, // the first set of updates to be seen apply immediately as well (e.g.: a new cluster). // - // Note: merging does not apply to cluser membership changes (e.g.: adds/removes); this is + // Note: merging does not apply to cluster membership changes (e.g.: adds/removes); this is // because merging those updates isn't currently safe. See // https://github.com/envoyproxy/envoy/pull/3941. google.protobuf.Duration update_merge_window = 4; From d3464d41d1600b4d8fb9807738770363683b9078 Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Mon, 30 Jul 2018 20:26:29 -0700 Subject: [PATCH 36/38] =?UTF-8?q?Rename=20offwindow=20=E2=86=92=20out=5Fof?= =?UTF-8?q?=5Fmerge=5Fwindow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raul Gutierrez Segales --- source/common/upstream/cluster_manager_impl.cc | 8 ++++---- source/common/upstream/cluster_manager_impl.h | 2 +- test/common/upstream/cluster_manager_impl_test.cc | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index e39d8d07c341c..aed6916cba884 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -391,8 +391,8 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit // the update so it can be applied immediately. Ditto if this is not a mergeable update. const auto delta = std::chrono::steady_clock::now() - updates->last_updated_; const uint64_t delta_ms = std::chrono::duration_cast(delta).count(); - const bool offwindow = delta_ms > timeout; - if (offwindow || !mergeable) { + const bool out_of_merge_window = delta_ms > timeout; + if (out_of_merge_window || !mergeable) { // If there was a pending update, we cancel the pending merged update. // // Note: it's possible that even though we are outside of a merge window (delta_ms > timeout), @@ -400,8 +400,8 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit // deliver the update immediately. // Why wasn't the update scheduled for later delivery? - if (mergeable && offwindow) { - cm_stats_.update_merge_offwindow_.inc(); + if (mergeable && out_of_merge_window) { + cm_stats_.update_out_of_merge_window_.inc(); } if (updates->disableTimer()) { diff --git a/source/common/upstream/cluster_manager_impl.h b/source/common/upstream/cluster_manager_impl.h index ca9960e76d675..9acf6927c3f0a 100644 --- a/source/common/upstream/cluster_manager_impl.h +++ b/source/common/upstream/cluster_manager_impl.h @@ -141,7 +141,7 @@ class ClusterManagerInitHelper : Logger::Loggable { COUNTER(cluster_updated) \ COUNTER(cluster_updated_via_merge) \ COUNTER(update_merge_cancelled) \ - COUNTER(update_merge_offwindow) \ + COUNTER(update_out_of_merge_window) \ GAUGE (active_clusters) \ GAUGE (warming_clusters) // clang-format on diff --git a/test/common/upstream/cluster_manager_impl_test.cc b/test/common/upstream/cluster_manager_impl_test.cc index aa51225fa0de5..79520b0f73c58 100644 --- a/test/common/upstream/cluster_manager_impl_test.cc +++ b/test/common/upstream/cluster_manager_impl_test.cc @@ -1812,7 +1812,7 @@ TEST_F(ClusterManagerImplTest, MergedUpdates) { } // Tests that mergeable updates outside of a window get applied immediately. -TEST_F(ClusterManagerImplTest, MergedUpdatesOffWindow) { +TEST_F(ClusterManagerImplTest, MergedUpdatesOutOfWindow) { createWithLocalClusterUpdate(); // Ensure we see the right set of added/removed hosts on every call. @@ -1838,13 +1838,13 @@ TEST_F(ClusterManagerImplTest, MergedUpdatesOffWindow) { hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.cluster_updated").value()); EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.cluster_updated_via_merge").value()); - EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.update_merge_offwindow").value()); + EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.update_out_of_merge_window").value()); EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.update_merge_cancelled").value()); } // Tests that mergeable updates outside of a window get applied immediately when // merging is disabled, and that the counters are correct. -TEST_F(ClusterManagerImplTest, MergedUpdatesOffWindowDisabled) { +TEST_F(ClusterManagerImplTest, MergedUpdatesOutOfWindowDisabled) { createWithLocalClusterUpdate(false); // Ensure we see the right set of added/removed hosts on every call. @@ -1870,7 +1870,7 @@ TEST_F(ClusterManagerImplTest, MergedUpdatesOffWindowDisabled) { hosts, hosts, hosts_per_locality, hosts_per_locality, {}, hosts_added, hosts_removed); EXPECT_EQ(1, factory_.stats_.counter("cluster_manager.cluster_updated").value()); EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.cluster_updated_via_merge").value()); - EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.update_merge_offwindow").value()); + EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.update_out_of_merge_window").value()); EXPECT_EQ(0, factory_.stats_.counter("cluster_manager.update_merge_cancelled").value()); } From bbbfef809945051a617b47bed3aa6fffb429dc01 Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Mon, 30 Jul 2018 20:43:19 -0700 Subject: [PATCH 37/38] Add comment to explain why we keeping some stats This clarifies the meaning of the stats we are tracking when we don't schedule an update for merging & later delivery. Signed-off-by: Raul Gutierrez Segales --- source/common/upstream/cluster_manager_impl.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/source/common/upstream/cluster_manager_impl.cc b/source/common/upstream/cluster_manager_impl.cc index aed6916cba884..0b53d5119bf7e 100644 --- a/source/common/upstream/cluster_manager_impl.cc +++ b/source/common/upstream/cluster_manager_impl.cc @@ -399,11 +399,15 @@ bool ClusterManagerImpl::scheduleUpdate(const Cluster& cluster, uint32_t priorit // a timer is enabled. This race condition is fine, since we'll disable the timer here and // deliver the update immediately. - // Why wasn't the update scheduled for later delivery? + // Why wasn't the update scheduled for later delivery? We keep some stats that are helpful + // to understand why merging did not happen. There's 2 things we are tracking here: + + // 1) Was this update out of a merge window? if (mergeable && out_of_merge_window) { cm_stats_.update_out_of_merge_window_.inc(); } + // 2) Were there previous updates that we are cancelling (and delivering immediately)? if (updates->disableTimer()) { cm_stats_.update_merge_cancelled_.inc(); } From d81fb7fb5c693bc3d38e19e35c94383a3053f6a9 Mon Sep 17 00:00:00 2001 From: Raul Gutierrez Segales Date: Tue, 31 Jul 2018 09:24:33 -0700 Subject: [PATCH 38/38] Document new cluster stats Signed-off-by: Raul Gutierrez Segales --- docs/root/configuration/cluster_manager/cluster_stats.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/root/configuration/cluster_manager/cluster_stats.rst b/docs/root/configuration/cluster_manager/cluster_stats.rst index 83fe8a10b0340..bfeb164c795a5 100644 --- a/docs/root/configuration/cluster_manager/cluster_stats.rst +++ b/docs/root/configuration/cluster_manager/cluster_stats.rst @@ -19,6 +19,10 @@ statistics. Any ``:`` character in the stats name is replaced with ``_``. cluster_added, Counter, Total clusters added (either via static config or CDS) cluster_modified, Counter, Total clusters modified (via CDS) cluster_removed, Counter, Total clusters removed (via CDS) + cluster_updated, Counter, Total cluster updates + cluster_updated_via_merge, Counter, Total cluster updates applied as merged updates + update_merge_cancelled, Counter, Total merged updates that got cancelled and delivered early + update_out_of_merge_window, Counter, Total updates which arrived out of a merge window active_clusters, Gauge, Number of currently active (warmed) clusters warming_clusters, Gauge, Number of currently warming (not active) clusters