diff --git a/api/envoy/extensions/network/dns_resolver/cares/v3/cares_dns_resolver.proto b/api/envoy/extensions/network/dns_resolver/cares/v3/cares_dns_resolver.proto index b060bce969b42..d05d073da4e35 100644 --- a/api/envoy/extensions/network/dns_resolver/cares/v3/cares_dns_resolver.proto +++ b/api/envoy/extensions/network/dns_resolver/cares/v3/cares_dns_resolver.proto @@ -21,7 +21,7 @@ option (udpa.annotations.file_status).package_version_status = ACTIVE; // [#extension: envoy.network.dns_resolver.cares] // Configuration for c-ares DNS resolver. -// [#next-free-field: 11] +// [#next-free-field: 12] message CaresDnsResolverConfig { // A list of DNS resolver addresses. // :ref:`use_resolvers_as_fallback <envoy_v3_api_field_extensions.network.dns_resolver.cares.v3.CaresDnsResolverConfig.use_resolvers_as_fallback>` @@ -99,4 +99,18 @@ message CaresDnsResolverConfig { // // If not specified, no periodic refresh will be performed. google.protobuf.Duration max_udp_channel_duration = 10 [(validate.rules).duration = {gte {}}]; + + // If true, reinitialize the c-ares channel when a DNS query fails with ``ARES_ETIMEOUT``. + // + // This can help recover from rare cases where the UDP sockets held by the c-ares + // channel become unusable after timeouts, causing subsequent queries to fail or + // Envoy to keep serving stale DNS results. When enabled, a timeout-triggered + // reinitialization attempts to restore healthy state quickly. In environments + // where timeouts are caused by intermittent network issues, enabling this may + // increase channel churn; consider using + // :ref:`max_udp_channel_duration <envoy_v3_api_field_extensions.network.dns_resolver.cares.v3.CaresDnsResolverConfig.max_udp_channel_duration>` + // for periodic refresh instead. + // + // Default is false. + bool reinit_channel_on_timeout = 11; } diff --git a/changelogs/current.yaml b/changelogs/current.yaml index 2a33eb492c5d8..c655e0dac29c4 100644 --- a/changelogs/current.yaml +++ b/changelogs/current.yaml @@ -72,6 +72,10 @@ bug_fixes: Fixed per-route configuration for composite filter to support matching on response headers and trailers. 
Previously, per-route matchers would silently fail when attempting to match on ``HttpResponseHeaderMatchInput`` or ``HttpResponseTrailerMatchInput``, causing the delegated filter to be skipped without error. +- area: dns + change: | + c-ares resolver: add optional ``reinit_channel_on_timeout`` to reinitialize + the resolver after DNS timeouts. removed_config_or_runtime: # *Normally occurs at the end of the* :ref:`deprecation period <deprecated>` diff --git a/source/extensions/network/dns_resolver/cares/dns_impl.cc b/source/extensions/network/dns_resolver/cares/dns_impl.cc index 2840b1f522e03..9796a90ac7b70 100644 --- a/source/extensions/network/dns_resolver/cares/dns_impl.cc +++ b/source/extensions/network/dns_resolver/cares/dns_impl.cc @@ -61,7 +61,7 @@ DnsResolverImpl::DnsResolverImpl( ? std::chrono::milliseconds(Protobuf::util::TimeUtil::DurationToMilliseconds( config.max_udp_channel_duration())) : std::chrono::milliseconds::zero()), - resolvers_csv_(resolvers_csv), + reinit_channel_on_timeout_(config.reinit_channel_on_timeout()), resolvers_csv_(resolvers_csv), filter_unroutable_families_(config.filter_unroutable_families()), scope_(root_scope.createScope("dns.cares.")), stats_(generateCaresDnsResolverStats(*scope_)) { AresOptions options = defaultAresOptions(); @@ -252,7 +252,7 @@ void DnsResolverImpl::AddrInfoPendingResolution::onAresGetAddrInfoCallback( // If c-ares returns ARES_ECONNREFUSED and there is no fallback we assume that the channel_ is // broken and hence we reinitialize it here. 
if (status == ARES_ECONNREFUSED || status == ARES_EREFUSED || status == ARES_ESERVFAIL || - status == ARES_ENOTIMP) { + status == ARES_ENOTIMP || (status == ARES_ETIMEOUT && parent_.reinit_channel_on_timeout_)) { parent_.reinitializeChannel(); } } diff --git a/source/extensions/network/dns_resolver/cares/dns_impl.h b/source/extensions/network/dns_resolver/cares/dns_impl.h index 7552fd958cf7e..ad5a91cb2c6d3 100644 --- a/source/extensions/network/dns_resolver/cares/dns_impl.h +++ b/source/extensions/network/dns_resolver/cares/dns_impl.h @@ -206,6 +206,7 @@ class DnsResolverImpl : public DnsResolver, protected Logger::Loggable resolvers_csv_; const bool filter_unroutable_families_; Stats::ScopeSharedPtr scope_; diff --git a/test/extensions/network/dns_resolver/cares/dns_impl_test.cc b/test/extensions/network/dns_resolver/cares/dns_impl_test.cc index b59e32722b8a5..8b77e1e3176c3 100644 --- a/test/extensions/network/dns_resolver/cares/dns_impl_test.cc +++ b/test/extensions/network/dns_resolver/cares/dns_impl_test.cc @@ -733,6 +733,9 @@ class DnsImplTest : public testing::TestWithParam { cares.mutable_edns0_max_payload_size()->set_value(getEdns0MaxPayloadSize()); } + // Enable `reinit_channel_on_timeout` if requested by the test case. + cares.set_reinit_channel_on_timeout(reinitOnTimeout()); + // Copy over the dns_resolver_options_. cares.mutable_dns_resolver_options()->MergeFrom(dns_resolver_options); // setup the typed config @@ -741,6 +744,8 @@ class DnsImplTest : public testing::TestWithParam { return typed_dns_resolver_config; } + // Whether to enable `reinit_channel_on_timeout` in the resolver config for this test. + virtual bool reinitOnTimeout() const { return false; } void SetUp() override { // Instantiate TestDnsServer and listen on a random port on the loopback address. 
@@ -1958,6 +1963,7 @@ TEST_P(DnsImplFilterUnroutableFamiliesDontFilterTest, DontFilterAllV6) { class DnsImplZeroTimeoutTest : public DnsImplTest { protected: bool queryTimeout() const override { return true; } + bool reinitOnTimeout() const override { return true; } }; // Parameterize the DNS test server socket address. @@ -1965,7 +1971,7 @@ INSTANTIATE_TEST_SUITE_P(IpVersions, DnsImplZeroTimeoutTest, testing::ValuesIn(TestEnvironment::getIpVersionsForTest()), TestUtility::ipTestParamsToString); -// Validate that timeouts result in an empty callback. +// Validate that timeouts result in an empty callback and trigger channel reinitialization. TEST_P(DnsImplZeroTimeoutTest, Timeout) { server_->addHosts("some.good.domain", {"201.134.56.7"}, RecordType::A); @@ -1973,8 +1979,9 @@ TEST_P(DnsImplZeroTimeoutTest, Timeout) { resolveWithExpectations("some.good.domain", DnsLookupFamily::V4Only, DnsResolver::ResolutionStatus::Failure, {}, {}, absl::nullopt)); dispatcher_->run(Event::Dispatcher::RunType::Block); + // After `ARES_ETIMEOUT`, the channel should reinitialize. checkStats(1 /*resolve_total*/, 0 /*pending_resolutions*/, 0 /*not_found*/, - 0 /*get_addr_failure*/, 3 /*timeouts*/, 0 /*reinitializations*/); + 0 /*get_addr_failure*/, 3 /*timeouts*/, 1 /*reinitializations*/); } // Validate that c-ares query cache is disabled by default.