Skip to content

Commit

Permalink
dns: adjust DNS refresh rate respecting to record TTL (#6975)
Browse files Browse the repository at this point in the history
Signed-off-by: Yan Xue <[email protected]>
  • Loading branch information
yxue authored and mattklein123 committed Jul 1, 2019
1 parent 3c8a1ef commit e0e7628
Show file tree
Hide file tree
Showing 20 changed files with 371 additions and 214 deletions.
7 changes: 6 additions & 1 deletion api/envoy/api/v2/cds.proto
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ service ClusterDiscoveryService {
// [#protodoc-title: Clusters]

// Configuration for a single upstream cluster.
// [#comment:next free field: 39]
// [#comment:next free field: 40]
message Cluster {
// Supplies the name of the cluster which must be unique across all clusters.
// The cluster name is used when emitting
Expand Down Expand Up @@ -274,6 +274,11 @@ message Cluster {
google.protobuf.Duration dns_refresh_rate = 16
[(validate.rules).duration.gt = {}, (gogoproto.stdduration) = true];

// Optional configuration for setting the cluster's DNS refresh rate. If the value is set to true,
// the cluster's DNS refresh rate will be set to the resource record's TTL returned by DNS
// resolution. If that TTL is 0, dns_refresh_rate is used instead.
bool respect_dns_ttl = 39;

// When V4_ONLY is selected, the DNS resolver will only perform a lookup for
// addresses in the IPv4 family. If V6_ONLY is selected, the DNS resolver will
// only perform a lookup for addresses in the IPv6 family. If AUTO is
Expand Down
12 changes: 12 additions & 0 deletions docs/root/intro/arch_overview/upstream/service_discovery.rst
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@ This means that care should be taken if active health checking is used with DNS
to the same IPs: if an IP is repeated many times between DNS names it might cause undue load on the
upstream host.

If :ref:`respect_dns_ttl <envoy_api_field_Cluster.respect_dns_ttl>` is enabled, DNS record TTLs and
:ref:`dns_refresh_rate <envoy_api_field_Cluster.dns_refresh_rate>` are used to control the DNS refresh rate.
For a strict DNS cluster, the minimum of all record TTLs is used as the refresh rate; if that minimum is 0, :ref:`dns_refresh_rate <envoy_api_field_Cluster.dns_refresh_rate>`
is used as the cluster's DNS refresh rate instead. :ref:`dns_refresh_rate <envoy_api_field_Cluster.dns_refresh_rate>`
defaults to 5000ms if not specified.

.. _arch_overview_service_discovery_types_logical_dns:

Logical DNS
Expand All @@ -58,6 +64,12 @@ When interacting with large scale web services, this is the best of all possible
asynchronous/eventually consistent DNS resolution, long lived connections, and zero blocking in the
forwarding path.

If :ref:`respect_dns_ttl <envoy_api_field_Cluster.respect_dns_ttl>` is enabled, DNS record TTLs and
:ref:`dns_refresh_rate <envoy_api_field_Cluster.dns_refresh_rate>` are used to control the DNS refresh rate.
For a logical DNS cluster, the TTL of the first record is used as the refresh rate; if that TTL is 0, :ref:`dns_refresh_rate <envoy_api_field_Cluster.dns_refresh_rate>`
is used as the cluster's DNS refresh rate instead. :ref:`dns_refresh_rate <envoy_api_field_Cluster.dns_refresh_rate>`
defaults to 5000ms if not specified.

.. _arch_overview_service_discovery_types_original_destination:

Original destination
Expand Down
1 change: 1 addition & 0 deletions docs/root/intro/version_history.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Version history
* build: releases are built with Clang and linked with LLD.
* control-plane: management servers can respond with HTTP 304 to indicate that config is up to date for Envoy proxies polling a :ref:`REST API Config Type <envoy_api_field_core.ApiConfigSource.api_type>`
* csrf: added support for whitelisting additional source origins.
* dns: added support for getting the DNS record TTL, which STRICT_DNS/LOGICAL_DNS clusters can use as their DNS refresh rate when :ref:`respect_dns_ttl <envoy_api_field_Cluster.respect_dns_ttl>` is enabled.
* dubbo_proxy: support the :ref:`Dubbo proxy filter <config_network_filters_dubbo_proxy>`.
* eds: added support to specify max time for which endpoints can be used :ref:`gRPC filter <envoy_api_msg_ClusterLoadAssignment.Policy>`.
* event: added :ref:`loop duration and poll delay statistics <operations_performance>`.
Expand Down
16 changes: 14 additions & 2 deletions include/envoy/network/dns.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once

#include <chrono>
#include <functional>
#include <list>
#include <memory>
Expand All @@ -24,6 +25,17 @@ class ActiveDnsQuery {
virtual void cancel() PURE;
};

/**
* DNS response.
*/
struct DnsResponse {
DnsResponse(const Address::InstanceConstSharedPtr& address, const std::chrono::seconds ttl)
: address_(address), ttl_(ttl) {}

const Address::InstanceConstSharedPtr address_;
const std::chrono::seconds ttl_;
};

enum class DnsLookupFamily { V4Only, V6Only, Auto };

/**
Expand All @@ -35,10 +47,10 @@ class DnsResolver {

/**
* Called when a resolution attempt is complete.
* @param address_list supplies the list of resolved IP addresses. The list will be empty if
* @param response supplies the list of resolved IP addresses and TTLs. The list will be empty if
* the resolution failed.
*/
using ResolveCb = std::function<void(std::list<Address::InstanceConstSharedPtr>&& address_list)>;
using ResolveCb = std::function<void(std::list<DnsResponse>&& response)>;

/**
* Initiate an async DNS resolution.
Expand Down
11 changes: 8 additions & 3 deletions source/common/network/dns_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ void DnsResolverImpl::PendingResolution::onAresGetAddrInfoCallback(int status, i
completed_ = true;
}

std::list<Address::InstanceConstSharedPtr> address_list;
std::list<DnsResponse> address_list;
if (status == ARES_SUCCESS) {
if (addrinfo != nullptr && addrinfo->nodes != nullptr) {
if (addrinfo->nodes->ai_family == AF_INET) {
Expand All @@ -89,7 +89,10 @@ void DnsResolverImpl::PendingResolution::onAresGetAddrInfoCallback(int status, i
address.sin_family = AF_INET;
address.sin_port = 0;
address.sin_addr = reinterpret_cast<sockaddr_in*>(ai->ai_addr)->sin_addr;
address_list.emplace_back(new Address::Ipv4Instance(&address));

address_list.emplace_back(
DnsResponse(std::make_shared<const Address::Ipv4Instance>(&address),
std::chrono::seconds(ai->ai_ttl)));
}
} else if (addrinfo->nodes->ai_family == AF_INET6) {
for (const ares_addrinfo_node* ai = addrinfo->nodes; ai != nullptr; ai = ai->ai_next) {
Expand All @@ -98,7 +101,9 @@ void DnsResolverImpl::PendingResolution::onAresGetAddrInfoCallback(int status, i
address.sin6_family = AF_INET6;
address.sin6_port = 0;
address.sin6_addr = reinterpret_cast<sockaddr_in6*>(ai->ai_addr)->sin6_addr;
address_list.emplace_back(new Address::Ipv6Instance(address));
address_list.emplace_back(
DnsResponse(std::make_shared<const Address::Ipv6Instance>(address),
std::chrono::seconds(ai->ai_ttl)));
}
}
}
Expand Down
18 changes: 12 additions & 6 deletions source/common/upstream/logical_dns_cluster.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ LogicalDnsCluster::LogicalDnsCluster(
dns_resolver_(dns_resolver),
dns_refresh_rate_ms_(
std::chrono::milliseconds(PROTOBUF_GET_MS_OR_DEFAULT(cluster, dns_refresh_rate, 5000))),
respect_dns_ttl_(cluster.respect_dns_ttl()),
resolve_timer_(
factory_context.dispatcher().createTimer([this]() -> void { startResolve(); })),
local_info_(factory_context.localInfo()),
Expand Down Expand Up @@ -70,18 +71,23 @@ void LogicalDnsCluster::startResolve() {

active_dns_query_ = dns_resolver_->resolve(
dns_address, dns_lookup_family_,
[this,
dns_address](std::list<Network::Address::InstanceConstSharedPtr>&& address_list) -> void {
[this, dns_address](std::list<Network::DnsResponse>&& response) -> void {
active_dns_query_ = nullptr;
ENVOY_LOG(debug, "async DNS resolution complete for {}", dns_address);
info_->stats().update_success_.inc();

if (!address_list.empty()) {
std::chrono::milliseconds refresh_rate = dns_refresh_rate_ms_;
if (!response.empty()) {
// TODO(mattklein123): Move port handling into the DNS interface.
ASSERT(address_list.front() != nullptr);
ASSERT(response.front().address_ != nullptr);
Network::Address::InstanceConstSharedPtr new_address =
Network::Utility::getAddressWithPort(*address_list.front(),
Network::Utility::getAddressWithPort(*(response.front().address_),
Network::Utility::portFromTcpUrl(dns_url_));

if (respect_dns_ttl_ && response.front().ttl_ != std::chrono::seconds(0)) {
refresh_rate = response.front().ttl_;
}

if (!logical_host_) {
logical_host_.reset(
new LogicalHost(info_, hostname_, new_address, localityLbEndpoint(), lbEndpoint()));
Expand All @@ -107,7 +113,7 @@ void LogicalDnsCluster::startResolve() {
}

onPreInitComplete();
resolve_timer_->enableTimer(dns_refresh_rate_ms_);
resolve_timer_->enableTimer(refresh_rate);
});
}

Expand Down
1 change: 1 addition & 0 deletions source/common/upstream/logical_dns_cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ class LogicalDnsCluster : public ClusterImplBase {

Network::DnsResolverSharedPtr dns_resolver_;
const std::chrono::milliseconds dns_refresh_rate_ms_;
const bool respect_dns_ttl_;
Network::DnsLookupFamily dns_lookup_family_;
Event::TimerPtr resolve_timer_;
std::string dns_url_;
Expand Down
26 changes: 20 additions & 6 deletions source/common/upstream/strict_dns_cluster.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ StrictDnsClusterImpl::StrictDnsClusterImpl(
added_via_api),
local_info_(factory_context.localInfo()), dns_resolver_(dns_resolver),
dns_refresh_rate_ms_(
std::chrono::milliseconds(PROTOBUF_GET_MS_OR_DEFAULT(cluster, dns_refresh_rate, 5000))) {
std::chrono::milliseconds(PROTOBUF_GET_MS_OR_DEFAULT(cluster, dns_refresh_rate, 5000))),
respect_dns_ttl_(cluster.respect_dns_ttl()) {
std::list<ResolveTargetPtr> resolve_targets;
const envoy::api::v2::ClusterLoadAssignment load_assignment(
cluster.has_load_assignment() ? cluster.load_assignment()
Expand Down Expand Up @@ -90,24 +91,28 @@ void StrictDnsClusterImpl::ResolveTarget::startResolve() {

active_query_ = parent_.dns_resolver_->resolve(
dns_address_, parent_.dns_lookup_family_,
[this](std::list<Network::Address::InstanceConstSharedPtr>&& address_list) -> void {
[this](std::list<Network::DnsResponse>&& response) -> void {
active_query_ = nullptr;
ENVOY_LOG(trace, "async DNS resolution complete for {}", dns_address_);
parent_.info_->stats().update_success_.inc();

std::unordered_map<std::string, HostSharedPtr> updated_hosts;
HostVector new_hosts;
for (const Network::Address::InstanceConstSharedPtr& address : address_list) {
std::chrono::seconds ttl_refresh_rate = std::chrono::seconds::max();
for (const auto& resp : response) {
// TODO(mattklein123): Currently the DNS interface does not consider port. We need to
// make a new address that has port in it. We need to both support IPv6 as well as
// potentially move port handling into the DNS interface itself, which would work better
// for SRV.
ASSERT(address != nullptr);
ASSERT(resp.address_ != nullptr);
new_hosts.emplace_back(new HostImpl(
parent_.info_, dns_address_, Network::Utility::getAddressWithPort(*address, port_),
parent_.info_, dns_address_,
Network::Utility::getAddressWithPort(*(resp.address_), port_),
lb_endpoint_.metadata(), lb_endpoint_.load_balancing_weight().value(),
locality_lb_endpoint_.locality(), lb_endpoint_.endpoint().health_check_config(),
locality_lb_endpoint_.priority(), lb_endpoint_.health_status()));

ttl_refresh_rate = min(ttl_refresh_rate, resp.ttl_);
}

HostVector hosts_added;
Expand All @@ -130,7 +135,16 @@ void StrictDnsClusterImpl::ResolveTarget::startResolve() {
// completes. This is not perfect but is easier to code and unclear if the extra
// complexity is needed so will start with this.
parent_.onPreInitComplete();
resolve_timer_->enableTimer(parent_.dns_refresh_rate_ms_);

std::chrono::milliseconds final_refresh_rate = parent_.dns_refresh_rate_ms_;

if (parent_.respect_dns_ttl_ && ttl_refresh_rate != std::chrono::seconds(0)) {
final_refresh_rate = ttl_refresh_rate;
ENVOY_LOG(debug, "DNS refresh rate reset for {}, refresh rate {} ms", dns_address_,
final_refresh_rate.count());
}

resolve_timer_->enableTimer(final_refresh_rate);
});
}

Expand Down
1 change: 1 addition & 0 deletions source/common/upstream/strict_dns_cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ class StrictDnsClusterImpl : public BaseDynamicClusterImpl {
Network::DnsResolverSharedPtr dns_resolver_;
std::list<ResolveTargetPtr> resolve_targets_;
const std::chrono::milliseconds dns_refresh_rate_ms_;
const bool respect_dns_ttl_;
Network::DnsLookupFamily dns_lookup_family_;
uint32_t overprovisioning_factor_;
};
Expand Down
13 changes: 6 additions & 7 deletions source/extensions/clusters/redis/redis_cluster.cc
Original file line number Diff line number Diff line change
Expand Up @@ -128,10 +128,10 @@ void RedisCluster::DnsDiscoveryResolveTarget::startResolve() {

active_query_ = parent_.dns_resolver_->resolve(
dns_address_, parent_.dns_lookup_family_,
[this](std::list<Network::Address::InstanceConstSharedPtr>&& address_list) -> void {
[this](std::list<Network::DnsResponse>&& response) -> void {
active_query_ = nullptr;
ENVOY_LOG(trace, "async DNS resolution complete for {}", dns_address_);
parent_.redis_discovery_session_.registerDiscoveryAddress(address_list, port_);
parent_.redis_discovery_session_.registerDiscoveryAddress(std::move(response), port_);
parent_.redis_discovery_session_.startResolve();
});
}
Expand Down Expand Up @@ -190,13 +190,12 @@ void RedisCluster::RedisDiscoveryClient::onEvent(Network::ConnectionEvent event)
}

void RedisCluster::RedisDiscoverySession::registerDiscoveryAddress(
const std::list<Envoy::Network::Address::InstanceConstSharedPtr>& address_list,
const uint32_t port) {
std::list<Envoy::Network::DnsResponse>&& response, const uint32_t port) {
// Since the address from DNS does not have port, we need to make a new address that has port in
// it.
for (const Network::Address::InstanceConstSharedPtr& address : address_list) {
ASSERT(address != nullptr);
discovery_address_list_.push_back(Network::Utility::getAddressWithPort(*address, port));
for (const Network::DnsResponse& res : response) {
ASSERT(res.address_ != nullptr);
discovery_address_list_.push_back(Network::Utility::getAddressWithPort(*(res.address_), port));
}
}

Expand Down
4 changes: 1 addition & 3 deletions source/extensions/clusters/redis/redis_cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,9 +195,7 @@ class RedisCluster : public Upstream::BaseDynamicClusterImpl {

~RedisDiscoverySession() override;

void registerDiscoveryAddress(
const std::list<Network::Address::InstanceConstSharedPtr>& address_list,
const uint32_t port);
void registerDiscoveryAddress(std::list<Network::DnsResponse>&& response, const uint32_t port);

// Start discovery against a random host from existing hosts
void startResolve();
Expand Down
29 changes: 14 additions & 15 deletions source/extensions/common/dynamic_forward_proxy/dns_cache_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -134,19 +134,18 @@ void DnsCacheImpl::startResolve(const std::string& host, PrimaryHostInfo& host_i
ENVOY_LOG(debug, "starting main thread resolve for host='{}' dns='{}' port='{}'", host,
host_info.host_to_resolve_, host_info.port_);
ASSERT(host_info.active_query_ == nullptr);

stats_.dns_query_attempt_.inc();
host_info.active_query_ = resolver_->resolve(
host_info.host_to_resolve_, dns_lookup_family_,
[this, host](const std::list<Network::Address::InstanceConstSharedPtr>&& address_list) {
finishResolve(host, address_list);
});
host_info.active_query_ =
resolver_->resolve(host_info.host_to_resolve_, dns_lookup_family_,
[this, host](std::list<Network::DnsResponse>&& response) {
finishResolve(host, std::move(response));
});
}

void DnsCacheImpl::finishResolve(
const std::string& host,
const std::list<Network::Address::InstanceConstSharedPtr>& address_list) {
ENVOY_LOG(debug, "main thread resolve complete for host '{}'. {} results", host,
address_list.size());
void DnsCacheImpl::finishResolve(const std::string& host,
std::list<Network::DnsResponse>&& response) {
ENVOY_LOG(debug, "main thread resolve complete for host '{}'. {} results", host, response.size());
const auto primary_host_it = primary_hosts_.find(host);
ASSERT(primary_host_it != primary_hosts_.end());

Expand All @@ -158,12 +157,12 @@ void DnsCacheImpl::finishResolve(
std::make_shared<DnsHostInfoImpl>(main_thread_dispatcher_.timeSource());
}

const auto new_address =
!address_list.empty()
? Network::Utility::getAddressWithPort(*address_list.front(), primary_host_info.port_)
: nullptr;
const auto new_address = !response.empty()
? Network::Utility::getAddressWithPort(*(response.front().address_),
primary_host_info.port_)
: nullptr;

if (address_list.empty()) {
if (response.empty()) {
stats_.dns_query_failure_.inc();
} else {
stats_.dns_query_success_.inc();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,7 @@ class DnsCacheImpl : public DnsCache, Logger::Loggable<Logger::Id::forward_proxy

void startCacheLoad(const std::string& host, uint16_t default_port);
void startResolve(const std::string& host, PrimaryHostInfo& host_info);
void finishResolve(const std::string& host,
const std::list<Network::Address::InstanceConstSharedPtr>& address_list);
void finishResolve(const std::string& host, std::list<Network::DnsResponse>&& response);
void runAddUpdateCallbacks(const std::string& host, const DnsHostInfoSharedPtr& host_info);
void runRemoveCallbacks(const std::string& host);
void updateTlsHostsMap();
Expand Down
Loading

0 comments on commit e0e7628

Please sign in to comment.