Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions api/envoy/config/cluster/v3/cluster.proto
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,8 @@ message Cluster {
// If V4_PREFERRED is specified, the DNS resolver will first perform a lookup for addresses in the
// IPv4 family and fallback to a lookup for addresses in the IPv6 family. i.e., the callback
// target will only get v6 addresses if there were NO v4 addresses to return.
// If ALL is specified, the DNS resolver will perform a lookup for both IPv4 and IPv6 families,
// and return all resolved addresses.
// For cluster types other than
// :ref:`STRICT_DNS<envoy_v3_api_enum_value_config.cluster.v3.Cluster.DiscoveryType.STRICT_DNS>` and
// :ref:`LOGICAL_DNS<envoy_v3_api_enum_value_config.cluster.v3.Cluster.DiscoveryType.LOGICAL_DNS>`,
Expand All @@ -140,6 +142,7 @@ message Cluster {
V4_ONLY = 1;
V6_ONLY = 2;
V4_PREFERRED = 3;
ALL = 4;
}

enum ClusterProtocolSelection {
Expand Down
1 change: 1 addition & 0 deletions docs/root/version_history/current.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ Removed Config or Runtime

New Features
------------
* dns: added :ref:`ALL <envoy_v3_api_enum_value_config.cluster.v3.Cluster.DnsLookupFamily.ALL>` option to return both IPv4 and IPv6 addresses.
* http: added support for :ref:`retriable health check status codes <envoy_v3_api_field_config.core.v3.HealthCheck.HttpHealthCheck.retriable_statuses>`.

Deprecated
Expand Down
2 changes: 1 addition & 1 deletion envoy/network/dns.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ struct DnsResponse {
const std::chrono::seconds ttl_;
};

enum class DnsLookupFamily { V4Only, V6Only, Auto, V4Preferred };
enum class DnsLookupFamily { V4Only, V6Only, Auto, V4Preferred, All };

/**
* An asynchronous DNS resolver.
Expand Down
46 changes: 29 additions & 17 deletions source/common/network/apple_dns_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,8 @@ AppleDnsResolverImpl::PendingResolution::PendingResolution(AppleDnsResolverImpl&
const std::string& dns_name,
DnsLookupFamily dns_lookup_family)
: parent_(parent), callback_(callback), dispatcher_(dispatcher), dns_name_(dns_name),
pending_cb_({ResolutionStatus::Success, {}, {}}), dns_lookup_family_(dns_lookup_family) {}
pending_response_({ResolutionStatus::Success, {}, {}, {}}),
dns_lookup_family_(dns_lookup_family) {}

AppleDnsResolverImpl::PendingResolution::~PendingResolution() {
ENVOY_LOG(debug, "Destroying PendingResolution for {}", dns_name_);
Expand Down Expand Up @@ -181,37 +182,47 @@ void AppleDnsResolverImpl::PendingResolution::onEventCallback(uint32_t events) {
// Similar to receiving an error in onDNSServiceGetAddrInfoReply, an error while processing fd
// events indicates that the sd_ref state is broken.
// Therefore, finish resolving with an error.
pending_cb_.status_ = ResolutionStatus::Failure;
pending_response_.status_ = ResolutionStatus::Failure;
finishResolve();
}
}

std::list<DnsResponse>& AppleDnsResolverImpl::PendingResolution::finalAddressList() {
switch (dns_lookup_family_) {
case DnsLookupFamily::V4Only:
return pending_cb_.v4_responses_;
return pending_response_.v4_responses_;
case DnsLookupFamily::V6Only:
return pending_cb_.v6_responses_;
return pending_response_.v6_responses_;
case DnsLookupFamily::Auto:
// Per API docs only give v4 if v6 is not available.
if (pending_cb_.v6_responses_.empty()) {
return pending_cb_.v4_responses_;
if (pending_response_.v6_responses_.empty()) {
return pending_response_.v4_responses_;
}
return pending_cb_.v6_responses_;
return pending_response_.v6_responses_;
case DnsLookupFamily::V4Preferred:
// Per API docs only give v6 if v4 is not available.
if (pending_cb_.v4_responses_.empty()) {
return pending_cb_.v6_responses_;
if (pending_response_.v4_responses_.empty()) {
return pending_response_.v6_responses_;
}
return pending_cb_.v4_responses_;
return pending_response_.v4_responses_;
case DnsLookupFamily::All:
ASSERT(pending_response_.all_responses_.empty());
pending_response_.all_responses_.insert(pending_response_.all_responses_.end(),

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I'm understanding this correctly, it's adding v4 addresses before v6 addresses. In general I'd suggest v6 first but you can get fancier as per the happy eyeballs spec

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep, that's the case. But the callback receiver can do what they want with the list, and is where I would expect the ordering complexity to happen; so I am not too opinionated with order here.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should follow RFC 8305 (the Happy Eyeballs spec) as David suggests, and I agree with Jose that we should probably do that in HappyEyeballsConnectionImpl. I'd be happy to take a whack at that.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 to do this outside.

pending_response_.v4_responses_.begin(),
pending_response_.v4_responses_.end());
pending_response_.all_responses_.insert(pending_response_.all_responses_.end(),
pending_response_.v6_responses_.begin(),
pending_response_.v6_responses_.end());
return pending_response_.all_responses_;
}
NOT_REACHED_GCOVR_EXCL_LINE;
}

void AppleDnsResolverImpl::PendingResolution::finishResolve() {
ENVOY_LOG_EVENT(debug, "apple_dns_resolution_complete",
"dns resolution for {} completed with status {}", dns_name_, pending_cb_.status_);
callback_(pending_cb_.status_, std::move(finalAddressList()));
"dns resolution for {} completed with status {}", dns_name_,
pending_response_.status_);
callback_(pending_response_.status_, std::move(finalAddressList()));

if (owned_) {
ENVOY_LOG(debug, "Resolution for {} completed (async)", dns_name_);
Expand All @@ -233,6 +244,7 @@ DNSServiceErrorType AppleDnsResolverImpl::PendingResolution::dnsServiceGetAddrIn
break;
case DnsLookupFamily::Auto:
case DnsLookupFamily::V4Preferred:
case DnsLookupFamily::All:
/* We want to make sure we don't get any address that is not routable. Passing 0
* to apple's `DNSServiceGetAddrInfo` will make a best attempt to filter out IPv6
* or IPv4 addresses depending on what's routable, per Apple's documentation:
Expand Down Expand Up @@ -289,9 +301,9 @@ void AppleDnsResolverImpl::PendingResolution::onDNSServiceGetAddrInfoReply(
if (error_code != kDNSServiceErr_NoError) {
parent_.chargeGetAddrInfoErrorStats(error_code);

pending_cb_.status_ = ResolutionStatus::Failure;
pending_cb_.v4_responses_.clear();
pending_cb_.v6_responses_.clear();
pending_response_.status_ = ResolutionStatus::Failure;
pending_response_.v4_responses_.clear();
pending_response_.v6_responses_.clear();

finishResolve();
// Note: Nothing can follow this call to flushPendingQueries due to deletion of this
Expand All @@ -308,10 +320,10 @@ void AppleDnsResolverImpl::PendingResolution::onDNSServiceGetAddrInfoReply(
ENVOY_LOG(debug, "Address to add address={}, ttl={}",
dns_response.address_->ip()->addressAsString(), ttl);
if (dns_response.address_->ip()->ipv4()) {
pending_cb_.v4_responses_.push_back(dns_response);
pending_response_.v4_responses_.push_back(dns_response);
} else {
ASSERT(dns_response.address_->ip()->ipv6());
pending_cb_.v6_responses_.push_back(dns_response);
pending_response_.v6_responses_.push_back(dns_response);
}
}

Expand Down
5 changes: 3 additions & 2 deletions source/common/network/apple_dns_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,11 @@ class AppleDnsResolverImpl : public DnsResolver, protected Logger::Loggable<Logg

// Small wrapping struct to accumulate addresses from firings of the
// onDNSServiceGetAddrInfoReply callback.
struct FinalResponse {
struct PendingResponse {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a much better name!

ResolutionStatus status_;
std::list<DnsResponse> v4_responses_;
std::list<DnsResponse> v6_responses_;
std::list<DnsResponse> all_responses_;
};

AppleDnsResolverImpl& parent_;
Expand All @@ -127,7 +128,7 @@ class AppleDnsResolverImpl : public DnsResolver, protected Logger::Loggable<Logg
// DNSServiceGetAddrInfo fires one callback DNSServiceGetAddrInfoReply callback per IP address,
// and informs via flags if more IP addresses are incoming. Therefore, these addresses need to
// be accumulated before firing callback_.
FinalResponse pending_cb_;
PendingResponse pending_response_;
DnsLookupFamily dns_lookup_family_;
};

Expand Down
143 changes: 91 additions & 52 deletions source/common/network/dns_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -104,16 +104,24 @@ void DnsResolverImpl::PendingResolution::onAresGetAddrInfoCallback(int status, i
// ARES_ECONNREFUSED. If the PendingResolution has not been cancelled that means that the

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The c-ares based resolver changed a bit more because it was not originally written to accumulate responses from multiple c-ares callbacks.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I refactored for ease of understanding but open to comments to simplify further!

// callback_ target _should_ still be around. In that case, raise the callback_ so the target
// can be done with this query and initiate a new one.
if (!cancelled_) {
ENVOY_LOG_EVENT(debug, "cares_dns_resolution_destroyed", "dns resolution for {} destroyed",
dns_name_);

callback_(ResolutionStatus::Failure, {});
ENVOY_LOG_EVENT(debug, "cares_dns_resolution_destroyed", "dns resolution for {} destroyed",
dns_name_);

if (!pending_response_.address_list_.empty()) {
ASSERT(dns_lookup_family_ == DnsLookupFamily::All && !dual_resolution_);

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here and below I considered a failure in the second resolution an overall success if the first resolution was successful, and hence fired the callback with the existing addresses and a success state. I'm open to discussing whether this should be the behavior.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah IMO a resolve success of either v4 or v6 should be considered success.
cc @DavidSchinazi @RyanTheOptimist for informed opinions :-)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1, success of v4 OR v6 should mean overall success since we have usable addresses

// The only way the resolver would have addresses here is if it is configured to resolve All
// families, and the first resolution was successful. In that case, we might as well return
// the addresses that resolved as a success.
} else {
pending_response_.status_ = ResolutionStatus::Failure;
}
delete this;
// Nothing can follow a call to finishResolve due to the deletion of this object upon
// finishResolve().
finishResolve();
return;
}
if (!fallback_if_failed_) {

if (!dual_resolution_) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hm, are we holding all forward progress on the dns resolver returning v4 and v6 addresses? That might work for a first pass but I believe that the chrome folks have told me it's a bad idea in general for latency / Quality of Experience reasons. If they confirm, I think we need a TODO to handle ipv4 and ipv6 asynchronously.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I was thinking about this as well from two different angles:

  1. Should both c-ares function calls happen concurrently?
  2. This applies both for c-ares and for apple: should we stream results as they come in to the callback target.

(1) is easy and I could do that in this PR. (2) is a bit harder, and I wouldn't want to do it in this PR.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Personally I would do (1) and not (2) for now.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alyssawilk @mattklein123 re:(1) do we have a preference for all three modes (All,Auto,V4Preferred) to do the calls concurrently, vs only All doing it concurrently and Auto/V4Preferred only do the second call if the first one fails/returns no addresses?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, less sure on that - I'll defer to Matt and David.
Totally fine with not doing 2 in this PR but I think we should either comment or TODO that it has latency implications and should probably be fixed at some point.

@junr03 junr03 Oct 7, 2021

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alyssawilk @mattklein123 I settled on doing both lookups concurrently for All and leaving Auto and V4Preferred with sequential lookups. I did not want to block this PR on making the necessary changes to enable concurrent lookups for Auto/V4Preferred given that the existing behavior is already sequential, and concurrent lookup for All did not need bigger changes.

I am already part of the way on the TODO, so will put that up as a separate PR.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree that waiting for both means users are waiting unnecessarily. But tackling in a separate PR with TODO here sounds good

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SG. Do we have a TODO on not waiting for both?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

decided to create an issue #18572

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This definitely seems like a good way to handle this for now. While suboptimal because it blocks progress until both return, Chrome does exactly this today. Solving #18572 (later) is a great chance to make envoy mobile better than Chrome!

completed_ = true;

// If c-ares returns ARES_ECONNREFUSED and there is no fallback we assume that the channel_ is
Expand All @@ -130,10 +138,8 @@ void DnsResolverImpl::PendingResolution::onAresGetAddrInfoCallback(int status, i
}
}

std::list<DnsResponse> address_list;
ResolutionStatus resolution_status;
if (status == ARES_SUCCESS) {
resolution_status = ResolutionStatus::Success;
pending_response_.status_ = ResolutionStatus::Success;
if (addrinfo != nullptr && addrinfo->nodes != nullptr) {
if (addrinfo->nodes->ai_family == AF_INET) {
for (const ares_addrinfo_node* ai = addrinfo->nodes; ai != nullptr; ai = ai->ai_next) {
Expand All @@ -143,7 +149,7 @@ void DnsResolverImpl::PendingResolution::onAresGetAddrInfoCallback(int status, i
address.sin_port = 0;
address.sin_addr = reinterpret_cast<sockaddr_in*>(ai->ai_addr)->sin_addr;

address_list.emplace_back(
pending_response_.address_list_.emplace_back(
DnsResponse(std::make_shared<const Address::Ipv4Instance>(&address),
std::chrono::seconds(ai->ai_ttl)));
}
Expand All @@ -154,66 +160,50 @@ void DnsResolverImpl::PendingResolution::onAresGetAddrInfoCallback(int status, i
address.sin6_family = AF_INET6;
address.sin6_port = 0;
address.sin6_addr = reinterpret_cast<sockaddr_in6*>(ai->ai_addr)->sin6_addr;
address_list.emplace_back(
pending_response_.address_list_.emplace_back(
DnsResponse(std::make_shared<const Address::Ipv6Instance>(address),
std::chrono::seconds(ai->ai_ttl)));
}
}
}

if (!address_list.empty()) {
if (!pending_response_.address_list_.empty() && dns_lookup_family_ != DnsLookupFamily::All) {
completed_ = true;
}

ASSERT(addrinfo != nullptr);
ares_freeaddrinfo(addrinfo);
} else {
resolution_status = ResolutionStatus::Failure;
if (!pending_response_.address_list_.empty()) {
ASSERT(dns_lookup_family_ == DnsLookupFamily::All && !dual_resolution_);
// The only way the resolver would have addresses here is if it is configured to resolve All
// families, and the first resolution was successful. In that case, we might as well return
// the addresses that resolved.
} else {
pending_response_.status_ = ResolutionStatus::Failure;
}
}

if (timeouts > 0) {
ENVOY_LOG(debug, "DNS request timed out {} times", timeouts);
}

if (completed_) {
if (!cancelled_) {
// Use a raw try here because it is used in both main thread and filter.
// Can not convert to use status code as there may be unexpected exceptions in server fuzz
// tests, which must be handled. Potential exception may come from getAddressWithPort() or
// portFromTcpUrl().
// TODO(chaoqin-li1123): remove try catch pattern here once we figure how to handle unexpected
// exception in fuzz tests.
ENVOY_LOG_EVENT(debug, "cares_dns_resolution_complete",
"dns resolution for {} completed with status {}", dns_name_,
resolution_status);

TRY_NEEDS_AUDIT { callback_(resolution_status, std::move(address_list)); }
catch (const EnvoyException& e) {
ENVOY_LOG(critical, "EnvoyException in c-ares callback: {}", e.what());
dispatcher_.post([s = std::string(e.what())] { throw EnvoyException(s); });
}
catch (const std::exception& e) {
ENVOY_LOG(critical, "std::exception in c-ares callback: {}", e.what());
dispatcher_.post([s = std::string(e.what())] { throw EnvoyException(s); });
}
catch (...) {
ENVOY_LOG(critical, "Unknown exception in c-ares callback");
dispatcher_.post([] { throw EnvoyException("unknown"); });
}
}
if (owned_) {
delete this;
return;
}
finishResolve();
// Nothing can follow a call to finishResolve due to the deletion of this object upon
// finishResolve().
return;
}

if (!completed_ && fallback_if_failed_) {
fallback_if_failed_ = false;
if (dual_resolution_) {
dual_resolution_ = false;

// Perform a second lookup for DnsLookupFamily::Auto and DnsLookupFamily::V4Preferred, given
// that the first lookup failed to return any addresses. Note that DnsLookupFamily::All issues
// both lookups concurrently so there is no need to fire a second lookup here.
if (dns_lookup_family_ == DnsLookupFamily::Auto) {
getAddrInfo(AF_INET);
} else {
ASSERT(dns_lookup_family_ == DnsLookupFamily::V4Preferred);
} else if (dns_lookup_family_ == DnsLookupFamily::V4Preferred) {
getAddrInfo(AF_INET6);
}

Expand All @@ -223,6 +213,40 @@ void DnsResolverImpl::PendingResolution::onAresGetAddrInfoCallback(int status, i
}
}

void DnsResolverImpl::PendingResolution::finishResolve() {
if (!cancelled_) {
// Use a raw try here because it is used in both main thread and filter.
// Can not convert to use status code as there may be unexpected exceptions in server fuzz
// tests, which must be handled. Potential exception may come from getAddressWithPort() or
// portFromTcpUrl().
// TODO(chaoqin-li1123): remove try catch pattern here once we figure how to handle unexpected
// exception in fuzz tests.
ENVOY_LOG_EVENT(debug, "cares_dns_resolution_complete",
"dns resolution for {} completed with status {}", dns_name_,
pending_response_.status_);

TRY_NEEDS_AUDIT {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does TRY_NEEDS_AUDIT imply that we're trying to avoid exceptions on the data plane but we're required to use them here? (Is there an actual audit happening?)

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah I think there is a push to avoid exceptions on the data plane, except sometimes on the main thread. But this code does not always run on the main thread so it was left as one of the remaining uses of try that needs to be addressed. Looks like @chaoqin-li1123 has a TODO to clean up.

callback_(pending_response_.status_, std::move(pending_response_.address_list_));
}
catch (const EnvoyException& e) {
ENVOY_LOG(critical, "EnvoyException in c-ares callback: {}", e.what());
dispatcher_.post([s = std::string(e.what())] { throw EnvoyException(s); });
}
catch (const std::exception& e) {
ENVOY_LOG(critical, "std::exception in c-ares callback: {}", e.what());
dispatcher_.post([s = std::string(e.what())] { throw EnvoyException(s); });
}
catch (...) {
ENVOY_LOG(critical, "Unknown exception in c-ares callback");
dispatcher_.post([] { throw EnvoyException("unknown"); });
}
}
if (owned_) {
delete this;
return;
}
}

void DnsResolverImpl::updateAresTimer() {
// Update the timeout for events.
timeval timeout;
Expand Down Expand Up @@ -283,15 +307,30 @@ ActiveDnsQuery* DnsResolverImpl::resolve(const std::string& dns_name,
auto pending_resolution = std::make_unique<PendingResolution>(
*this, callback, dispatcher_, channel_, dns_name, dns_lookup_family);
if (dns_lookup_family == DnsLookupFamily::Auto ||
dns_lookup_family == DnsLookupFamily::V4Preferred) {
pending_resolution->fallback_if_failed_ = true;
dns_lookup_family == DnsLookupFamily::V4Preferred ||
dns_lookup_family == DnsLookupFamily::All) {
pending_resolution->dual_resolution_ = true;
}

if (dns_lookup_family == DnsLookupFamily::V4Only ||
dns_lookup_family == DnsLookupFamily::V4Preferred) {
switch (dns_lookup_family) {
case DnsLookupFamily::V4Only:
case DnsLookupFamily::V4Preferred:
pending_resolution->getAddrInfo(AF_INET);
break;
case DnsLookupFamily::V6Only:
case DnsLookupFamily::Auto:
pending_resolution->getAddrInfo(AF_INET6);
break;
// NOTE: DnsLookupFamily::All performs both lookups concurrently as addresses from both families
// are being requested.
// TODO: DnsLookupFamily::Auto and DnsLookupFamily::V4Preferred could also do concurrent lookups.
Comment thread
junr03 marked this conversation as resolved.
Outdated
// This will require some refactoring and should be done in a subsequent PR.
case DnsLookupFamily::All:
pending_resolution->getAddrInfo(AF_INET);
} else {
pending_resolution->getAddrInfo(AF_INET6);
break;
default:
NOT_REACHED_GCOVR_EXCL_LINE;
}

if (pending_resolution->completed_) {
Expand Down
Loading