diff --git a/source/common/network/dns_impl.cc b/source/common/network/dns_impl.cc index c134872b71e74..505b1bea7dc14 100644 --- a/source/common/network/dns_impl.cc +++ b/source/common/network/dns_impl.cc @@ -24,16 +24,11 @@ DnsResolverImpl::DnsResolverImpl( const std::vector& resolvers, const bool use_tcp_for_dns_lookups) : dispatcher_(dispatcher), - timer_(dispatcher.createTimer([this] { onEventCallback(ARES_SOCKET_BAD, 0); })) { - ares_options options{}; - int optmask = 0; + timer_(dispatcher.createTimer([this] { onEventCallback(ARES_SOCKET_BAD, 0); })), + use_tcp_for_dns_lookups_(use_tcp_for_dns_lookups) { - if (use_tcp_for_dns_lookups) { - optmask |= ARES_OPT_FLAGS; - options.flags |= ARES_FLAG_USEVC; - } - - initializeChannel(&options, optmask); + AresOptions options = defaultAresOptions(); + initializeChannel(&options.options_, options.optmask_); if (!resolvers.empty()) { std::vector resolver_addrs; @@ -65,6 +60,17 @@ DnsResolverImpl::~DnsResolverImpl() { ares_destroy(channel_); } +DnsResolverImpl::AresOptions DnsResolverImpl::defaultAresOptions() { + AresOptions options{}; + + if (use_tcp_for_dns_lookups_) { + options.optmask_ |= ARES_OPT_FLAGS; + options.options_.flags |= ARES_FLAG_USEVC; + } + + return options; +} + void DnsResolverImpl::initializeChannel(ares_options* options, int optmask) { options->sock_state_cb = [](void* arg, int fd, int read, int write) { static_cast(arg)->onAresSocketStateChange(fd, read, write); @@ -83,6 +89,19 @@ void DnsResolverImpl::PendingResolution::onAresGetAddrInfoCallback(int status, i } if (!fallback_if_failed_) { completed_ = true; + + // If c-ares returns ARES_ECONNREFUSED and there is no fallback we assume that the channel_ is + // broken. Mark the channel dirty so that it is destroyed and reinitialized on a subsequent call + // to DnsResolver::resolve(). The optimal solution would be for c-ares to reinitialize the + // channel, and not have Envoy track side effects. + // context: https://github.com/envoyproxy/envoy/issues/4543 and + // https://github.com/c-ares/c-ares/issues/301. + // + // The channel cannot be destroyed and reinitialized here because that leads to a c-ares + // segfault. + if (status == ARES_ECONNREFUSED) { + parent_.dirty_channel_ = true; + } } std::list address_list; @@ -203,8 +222,17 @@ ActiveDnsQuery* DnsResolverImpl::resolve(const std::string& dns_name, // TODO(hennna): Add DNS caching which will allow testing the edge case of a // failed initial call to getHostByName followed by a synchronous IPv4 // resolution. + + // @see DnsResolverImpl::PendingResolution::onAresGetAddrInfoCallback for why this is done. + if (dirty_channel_) { + dirty_channel_ = false; + ares_destroy(channel_); + + AresOptions options = defaultAresOptions(); + initializeChannel(&options.options_, options.optmask_); + } std::unique_ptr pending_resolution( - new PendingResolution(callback, dispatcher_, channel_, dns_name)); + new PendingResolution(*this, callback, dispatcher_, channel_, dns_name)); if (dns_lookup_family == DnsLookupFamily::Auto) { pending_resolution->fallback_if_failed_ = true; } diff --git a/source/common/network/dns_impl.h b/source/common/network/dns_impl.h index cecad8a2e7280..79a140dea6cf9 100644 --- a/source/common/network/dns_impl.h +++ b/source/common/network/dns_impl.h @@ -39,9 +39,10 @@ class DnsResolverImpl : public DnsResolver, protected Logger::Loggable events_; }; diff --git a/test/common/network/dns_impl_test.cc b/test/common/network/dns_impl_test.cc index 84b14151446ed..65bc03c10dc97 100644 --- a/test/common/network/dns_impl_test.cc +++ b/test/common/network/dns_impl_test.cc @@ -55,9 +55,9 @@ enum class RecordType { A, AAAA }; class TestDnsServerQuery { public: TestDnsServerQuery(ConnectionPtr connection, const HostMap& hosts_a, const HostMap& hosts_aaaa, - const CNameMap& cnames, const std::chrono::seconds& record_ttl) + const CNameMap& cnames, const std::chrono::seconds& record_ttl, bool refused) : connection_(std::move(connection)), hosts_a_(hosts_a), hosts_aaaa_(hosts_aaaa), - cnames_(cnames), record_ttl_(record_ttl) { + cnames_(cnames), record_ttl_(record_ttl), refused_(refused) { connection_->addReadFilter(Network::ReadFilterSharedPtr{new ReadFilter(*this)}); } @@ -171,7 +171,11 @@ class TestDnsServerQuery { memcpy(response_base, request, response_base_len); DNS_HEADER_SET_QR(response_base, 1); DNS_HEADER_SET_AA(response_base, 0); - DNS_HEADER_SET_RCODE(response_base, answer_size > 0 ? NOERROR : NXDOMAIN); + if (parent_.refused_) { + DNS_HEADER_SET_RCODE(response_base, REFUSED); + } else { + DNS_HEADER_SET_RCODE(response_base, answer_size > 0 ? NOERROR : NXDOMAIN); + } DNS_HEADER_SET_ANCOUNT(response_base, answer_size); DNS_HEADER_SET_NSCOUNT(response_base, 0); DNS_HEADER_SET_ARCOUNT(response_base, 0); @@ -253,6 +257,7 @@ class TestDnsServerQuery { const HostMap& hosts_aaaa_; const CNameMap& cnames_; const std::chrono::seconds& record_ttl_; + bool refused_{}; }; class TestDnsServer : public ListenerCallbacks { @@ -263,7 +268,7 @@ class TestDnsServer : public ListenerCallbacks { Network::ConnectionPtr new_connection = dispatcher_.createServerConnection( std::move(socket), Network::Test::createRawBufferSocket()); TestDnsServerQuery* query = new TestDnsServerQuery(std::move(new_connection), hosts_a_, - hosts_aaaa_, cnames_, record_ttl_); + hosts_aaaa_, cnames_, record_ttl_, refused_); queries_.emplace_back(query); } @@ -280,6 +285,7 @@ class TestDnsServer : public ListenerCallbacks { } void setRecordTtl(const std::chrono::seconds& ttl) { record_ttl_ = ttl; } + void setRefused(bool refused) { refused_ = refused; } private: Event::Dispatcher& dispatcher_; @@ -288,6 +294,7 @@ class TestDnsServer : public ListenerCallbacks { HostMap hosts_aaaa_; CNameMap cnames_; std::chrono::seconds record_ttl_; + bool refused_{}; // All queries are tracked so we can do resource reclamation when the test is // over. std::vector> queries_; @@ -300,6 +307,7 @@ class DnsResolverImplPeer { DnsResolverImplPeer(DnsResolverImpl* resolver) : resolver_(resolver) {} ares_channel channel() const { return resolver_->channel_; } + bool isChannelDirty() const { return resolver_->dirty_channel_; } const std::unordered_map& events() { return resolver_->events_; } // Reset the channel state for a DnsResolverImpl such that it will only use // TCP and optionally has a zero timeout (for validating timeout behavior). @@ -582,6 +590,64 @@ TEST_P(DnsImplTest, CallbackException) { "unknown"); } +// Validate that the c-ares channel is destroyed and re-initialized when c-ares returns +// ARES_ECONNREFUSED as its callback status. +TEST_P(DnsImplTest, DestroyChannelOnRefused) { + ASSERT_FALSE(peer_->isChannelDirty()); + server_->addHosts("some.good.domain", {"201.134.56.7"}, RecordType::A); + server_->setRefused(true); + + std::list address_list; + EXPECT_NE(nullptr, resolver_->resolve("", DnsLookupFamily::V4Only, + [&](std::list&& results) -> void { + address_list = getAddressList(results); + dispatcher_->exit(); + })); + + dispatcher_->run(Event::Dispatcher::RunType::Block); + // The c-ares channel should be dirty because the TestDnsServer replied with return code REFUSED; + // This test, and the way the TestDnsServerQuery is setup, relies on the fact that Envoy's + // c-ares channel is configured **without** the ARES_FLAG_NOCHECKRESP flag. This causes c-ares to + // discard packets with REFUSED, and thus Envoy receives ARES_ECONNREFUSED due to the code here: + // https://github.com/c-ares/c-ares/blob/d7e070e7283f822b1d2787903cce3615536c5610/ares_process.c#L654 + // If that flag needs to be set, or c-ares changes its handling this test will need to be updated + // to create another condition where c-ares invokes onAresGetAddrInfoCallback with status == + // ARES_ECONNREFUSED. + EXPECT_TRUE(peer_->isChannelDirty()); + EXPECT_TRUE(address_list.empty()); + + server_->setRefused(false); + + // Resolve will destroy the original channel and create a new one. + EXPECT_NE(nullptr, resolver_->resolve("some.good.domain", DnsLookupFamily::V4Only, + [&](std::list&& results) -> void { + address_list = getAddressList(results); + dispatcher_->exit(); + })); + + dispatcher_->run(Event::Dispatcher::RunType::Block); + // However, the fresh channel initialized by production code does not point to the TestDnsServer. + // This means that resolution will return ARES_ENOTFOUND. This should not dirty the channel. + EXPECT_FALSE(peer_->isChannelDirty()); + EXPECT_TRUE(address_list.empty()); + + // Reset the channel to point to the TestDnsServer, and make sure resolution is healthy. + if (tcp_only()) { + peer_->resetChannelTcpOnly(zero_timeout()); + } + ares_set_servers_ports_csv(peer_->channel(), socket_->localAddress()->asString().c_str()); + + EXPECT_NE(nullptr, resolver_->resolve("some.good.domain", DnsLookupFamily::Auto, + [&](std::list&& results) -> void { + address_list = getAddressList(results); + dispatcher_->exit(); + })); + + dispatcher_->run(Event::Dispatcher::RunType::Block); + EXPECT_FALSE(peer_->isChannelDirty()); + EXPECT_TRUE(hasAddress(address_list, "201.134.56.7")); +} + TEST_P(DnsImplTest, DnsIpAddressVersion) { std::list address_list; server_->addHosts("some.good.domain", {"1.2.3.4"}, RecordType::A); diff --git a/tools/spelling_dictionary.txt b/tools/spelling_dictionary.txt index aafa29d7f61f3..27e007dda358d 100644 --- a/tools/spelling_dictionary.txt +++ b/tools/spelling_dictionary.txt @@ -76,6 +76,7 @@ EDESTRUCTION EDF EINVAL ELB +ENOTFOUND ENOTSUP ENV EOF @@ -181,6 +182,7 @@ NDEBUG NEXTHDR NGHTTP NOAUTH +NOCHECKRESP NODELAY NOLINT NOLINTNEXTLINE