diff --git a/source/extensions/network/dns_resolver/cares/dns_impl.cc b/source/extensions/network/dns_resolver/cares/dns_impl.cc index 47fdc4233da75..826c441509bac 100644 --- a/source/extensions/network/dns_resolver/cares/dns_impl.cc +++ b/source/extensions/network/dns_resolver/cares/dns_impl.cc @@ -125,6 +125,9 @@ void DnsResolverImpl::initializeChannel(ares_options* options, int optmask) { void DnsResolverImpl::AddrInfoPendingResolution::onAresGetAddrInfoCallback( int status, int timeouts, ares_addrinfo* addrinfo) { + ASSERT(pending_resolutions_ > 0); + pending_resolutions_--; + if (status != ARES_SUCCESS) { ENVOY_LOG_EVENT(debug, "cares_resolution_failure", "dns resolution for {} failed with c-ares status {}", dns_name_, status); @@ -132,6 +135,12 @@ void DnsResolverImpl::AddrInfoPendingResolution::onAresGetAddrInfoCallback( // We receive ARES_EDESTRUCTION when destructing with pending queries. if (status == ARES_EDESTRUCTION) { + // In the destruction path we must wait until there are no more pending queries. Resolution is + // not truly finished until the last parallel query has been destroyed. + if (pending_resolutions_ > 0) { + return; + } + ASSERT(owned_); // This destruction might have been triggered by a peer PendingResolution that received a // ARES_ECONNREFUSED. If the PendingResolution has not been cancelled that means that the @@ -372,6 +381,11 @@ DnsResolverImpl::AddrInfoPendingResolution::AddrInfoPendingResolution( } } +DnsResolverImpl::AddrInfoPendingResolution::~AddrInfoPendingResolution() { + // All pending resolutions should be cleaned up at this point. 
+ ASSERT(pending_resolutions_ == 0); +} + void DnsResolverImpl::AddrInfoPendingResolution::startResolution() { if (lookup_all_) { startResolutionImpl(AF_INET); @@ -382,6 +396,7 @@ void DnsResolverImpl::AddrInfoPendingResolution::startResolution() { } void DnsResolverImpl::AddrInfoPendingResolution::startResolutionImpl(int family) { + pending_resolutions_++; if (parent_.filter_unroutable_families_) { switch (family) { case AF_INET: diff --git a/source/extensions/network/dns_resolver/cares/dns_impl.h b/source/extensions/network/dns_resolver/cares/dns_impl.h index b3b2de267383e..3e906f7f77068 100644 --- a/source/extensions/network/dns_resolver/cares/dns_impl.h +++ b/source/extensions/network/dns_resolver/cares/dns_impl.h @@ -94,6 +94,7 @@ class DnsResolverImpl : public DnsResolver, protected Logger::Loggable { return resolver_->resolve(address, lookup_family, [expected_to_execute](DnsResolver::ResolutionStatus status, std::list<DnsResponse>&& results) -> void { - if (!expected_to_execute) { - FAIL(); - } + EXPECT_TRUE(expected_to_execute); UNREFERENCED_PARAMETER(status); UNREFERENCED_PARAMETER(results); }); @@ -881,6 +879,21 @@ TEST_P(DnsImplTest, DestructPending) { EXPECT_GT(peer_->events().size(), 0U); } +// Validate that All queries (2 concurrent queries) properly clean up when the channel is destroyed. +// TODO(mattklein123): This is a brute force way of testing this path, however we have seen +// evidence that this happens during normal operation via the "channel dirty" path. The sequence +// must look something like: +// 1) Issue All query with 2 parallel sub-queries. +// 2) Issue parallel query which fails with ARES_ECONNREFUSED, mark channel dirty. +// 3) Issue 3rd query which sees a dirty channel and destroys it, causing the parallel queries +// issued in step 1 to fail. It's not clear whether this is possible over a TCP channel or +// just via UDP. +// Either way, we have no tests today that cover parallel queries. We can do this in a follow up. 
+TEST_P(DnsImplTest, DestructPendingAllQuery) { + ActiveDnsQuery* query = resolveWithUnreferencedParameters("", DnsLookupFamily::All, true); + ASSERT_NE(nullptr, query); +} + TEST_P(DnsImplTest, DestructCallback) { server_->addHosts("some.good.domain", {"201.134.56.7"}, RecordType::A);