Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 61 additions & 26 deletions source/extensions/common/dynamic_forward_proxy/dns_cache_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -193,14 +193,20 @@ void DnsCacheImpl::startCacheLoad(const std::string& host, uint16_t default_port
return;
}

const auto host_attributes = Http::Utility::parseAuthority(host);

primary_host = createHost(host, default_port);
startResolve(host, *primary_host);
}


DnsCacheImpl::PrimaryHostInfo* DnsCacheImpl::createHost(const std::string& host, uint16_t default_port) {
const auto host_attributes = Http::Utility::parseAuthority(host);
// TODO(mattklein123): Right now, the same host with different ports will become two
// independent primary hosts with independent DNS resolutions. I'm not sure how much this will
// matter, but we could consider collapsing these down and sharing the underlying DNS resolution.
{
absl::WriterMutexLock writer_lock{&primary_hosts_lock_};
primary_host = primary_hosts_
return primary_hosts_
// try_emplace() is used here for direct argument forwarding.
.try_emplace(host, std::make_unique<PrimaryHostInfo>(
*this, std::string(host_attributes.host_),
Expand All @@ -210,8 +216,6 @@ void DnsCacheImpl::startCacheLoad(const std::string& host, uint16_t default_port
[this, host]() { onResolveTimeout(host); }))
.first->second.get();
}

startResolve(host, *primary_host);
}

DnsCacheImpl::PrimaryHostInfo& DnsCacheImpl::getPrimaryHost(const std::string& host) {
Expand Down Expand Up @@ -300,9 +304,19 @@ void DnsCacheImpl::finishResolve(const std::string& host,
return primary_host_it->second.get();
}();

const bool first_resolve = !primary_host_info->host_info_->firstResolveComplete();
primary_host_info->timeout_timer_->disableTimer();
primary_host_info->active_query_ = nullptr;
bool first_resolve = false;

if (!from_cache) {
first_resolve = !primary_host_info->host_info_->firstResolveComplete();
primary_host_info->timeout_timer_->disableTimer();
primary_host_info->active_query_ = nullptr;

if (status == Network::DnsResolver::ResolutionStatus::Failure) {
stats_.dns_query_failure_.inc();
} else {
stats_.dns_query_success_.inc();
}
}

// If the DNS resolver successfully resolved with an empty response list, the dns cache does not
// update. This ensures that a potentially previously resolved address does not stabilize back to
Expand All @@ -312,12 +326,6 @@ void DnsCacheImpl::finishResolve(const std::string& host,
primary_host_info->port_)
: nullptr;

if (status == Network::DnsResolver::ResolutionStatus::Failure) {
stats_.dns_query_failure_.inc();
} else {
stats_.dns_query_success_.inc();
}

// Only the change the address if:
// 1) The new address is valid &&
// 2a) The host doesn't yet have an address ||
Expand All @@ -329,25 +337,34 @@ void DnsCacheImpl::finishResolve(const std::string& host,
auto current_address = primary_host_info->host_info_->address();
if (new_address != nullptr && (current_address == nullptr || *current_address != *new_address)) {
if (!from_cache) {
addCacheEntry(host, new_address);
addCacheEntry(host, new_address, response.front().ttl_);
}
// TODO(alyssawilk) don't immediately push cached entries to threads.
// Only serve stale entries if a configured resolve timeout has fired.
ENVOY_LOG(debug, "host '{}' address has changed", host);
primary_host_info->host_info_->setAddress(new_address);
runAddUpdateCallbacks(host, primary_host_info->host_info_);
address_changed = true;
stats_.host_address_changed_.inc();
}

if (first_resolve || address_changed) {
SystemTime now = main_thread_dispatcher_.timeSource().systemTime();
uint64_t ms_since_epoch =
std::chrono::duration_cast<std::chrono::milliseconds>(now.time_since_epoch()).count();
uint64_t stale_at = ms_since_epoch + std::chrono::duration_cast<std::chrono::seconds>(response.front().ttl_).count();
primary_host_info->host_info_->updateStale(stale_at);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: It might make sense for this method to take the ttl directly and do the work in the method of coverting the ttl to whatever the internal representation is of "now + ttl"?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done.


if (first_resolve) {
primary_host_info->host_info_->setFirstResolveComplete();
}
if (first_resolve || address_changed) {
// TODO(alyssawilk) only notify threads of stale results after a resolution
// timeout.
notifyThreads(host, primary_host_info->host_info_);
}

// Kick off the refresh timer.
// TODO(mattklein123): Consider jitter here. It may not be necessary since the initial host
// is populated dynamically.
// TODO(alyssawilk) also consider TTL here.
if (status == Network::DnsResolver::ResolutionStatus::Success) {
failure_backoff_strategy_->reset();
primary_host_info->refresh_timer_->enableTimer(refresh_interval_);
Expand Down Expand Up @@ -424,12 +441,16 @@ DnsCacheImpl::PrimaryHostInfo::~PrimaryHostInfo() {
}

void DnsCacheImpl::addCacheEntry(const std::string& host,
const Network::Address::InstanceConstSharedPtr& address) {
const Network::Address::InstanceConstSharedPtr& address,
const std::chrono::seconds& ttl) {
if (!key_value_store_) {
return;
}
// TODO(alyssawilk) cache data should include TTL, or some other indicator.
const std::string value = absl::StrCat(address->asString());
SystemTime now = main_thread_dispatcher_.timeSource().systemTime();
uint64_t ms_since_epoch =
std::chrono::duration_cast<std::chrono::milliseconds>(now.time_since_epoch()).count();
const std::string value = absl::StrCat(address->asString(), "|", ttl.count(), "|",
ms_since_epoch);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not for this PR, but it seems like we could add a wrapper around KeyValueStore in which the Value is allowed to be a type which implements an interfaces that has methods like std::string serialize() and bool deserialise(const std::string&). Then this wrapper class could be something like: TypedKeyValueStore which would have a vanilla KeyValueStore as a member. When reading from / writing to the store, it would transparently transform the Value types to/from strings.

/shrug. Maybe something to think about down the road...

key_value_store_->addOrUpdate(host, value);
}

Expand All @@ -450,17 +471,31 @@ void DnsCacheImpl::loadCacheEntries(
key_value_store_ = factory.createStore(config.key_value_config(), validation_visitor_,
main_thread_dispatcher_, file_system_);
KeyValueStore::ConstIterateCb load = [this](const std::string& key, const std::string& value) {
auto address = Network::Utility::parseInternetAddressAndPortNoThrow(value);
if (address == nullptr) {
Network::Address::InstanceConstSharedPtr address;
const auto parts = StringUtil::splitToken(value, "|");
std::chrono::seconds ttl(0);
if (parts.size() == 3) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I might be misreading this code but it seems like up on line 452, we write 3 things into the cache value. Down here, we spilt on | and ensure that we have 2 parts. This all makes sense. But I only see code looking at parts[0] (line 478) and parts[1] (line 483) but I don't see a reference to parts[2]. Is this expected?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah I wasn't using part two yet. I was going to do in the PR where we start using stale, but I've worked it into this one with the time updates.
I'll say I'm not 100% on the conversion back and forth but it's hard to test until we have stale expirey which is coming in the next PR!

address = Network::Utility::parseInternetAddressAndPortNoThrow(std::string(parts[0]));
if (address == nullptr) {
ENVOY_LOG(warn, "{} is not a valid address", parts[0]);
}
uint64_t ttl_int;
if (absl::SimpleAtoi(parts[1], &ttl_int) && ttl_int != 0) {
ttl = std::chrono::seconds(ttl_int);
} else {
ENVOY_LOG(warn, "{} is not a valid ttl", parts[1]);
}
} else {
ENVOY_LOG(warn, "Incorrect number of tokens in the cache line");
}
if (address == nullptr || ttl == std::chrono::seconds(0)) {
ENVOY_LOG(warn, "Unable to parse cache line '{}'", value);
return KeyValueStore::Iterate::Break;
}
stats_.cache_load_.inc();
std::list<Network::DnsResponse> response;
// TODO(alyssawilk) change finishResolve to actually use the TTL rather than
// putting 0 here, return the remaining TTL or indicate the result is stale.
response.emplace_back(Network::DnsResponse(address, std::chrono::seconds(0) /* ttl */));
startCacheLoad(key, address->ip()->port());
createHost(key, address->ip()->port());
response.emplace_back(Network::DnsResponse(address, ttl));
finishResolve(key, Network::DnsResolver::ResolutionStatus::Success, std::move(response), true);
return KeyValueStore::Iterate::Continue;
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ class DnsCacheImpl : public DnsCache, Logger::Loggable<Logger::Id::forward_proxy
const std::string& resolvedHost() const override { return resolved_host_; }
bool isIpAddress() const override { return is_ip_address_; }
void touch() final { last_used_time_ = time_source_.monotonicTime().time_since_epoch(); }
void updateStale(uint64_t time) { stale_at_time_ = time; }

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there by any chance a data type we can use here which expresses the semantics here? For example is this a time_t (seconds since the UNIX epoch) or is it more of a TTL (a time delta)? Perhaps one of the std::chrono types? Oh, or maybe just SystemTime directly?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done


void setAddress(Network::Address::InstanceConstSharedPtr address) {
absl::WriterMutexLock lock{&resolve_lock_};
Expand Down Expand Up @@ -141,6 +142,7 @@ class DnsCacheImpl : public DnsCache, Logger::Loggable<Logger::Id::forward_proxy
// Using std::chrono::steady_clock::duration is required for compilation within an atomic vs.
// using MonotonicTime.
std::atomic<std::chrono::steady_clock::duration> last_used_time_;
std::atomic<uint64_t> stale_at_time_;
bool first_resolve_complete_ ABSL_GUARDED_BY(resolve_lock_){false};
};

Expand Down Expand Up @@ -186,10 +188,12 @@ class DnsCacheImpl : public DnsCache, Logger::Loggable<Logger::Id::forward_proxy
PrimaryHostInfo& getPrimaryHost(const std::string& host);

void addCacheEntry(const std::string& host,
const Network::Address::InstanceConstSharedPtr& address);
const Network::Address::InstanceConstSharedPtr& address,
const std::chrono::seconds& ttl);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think normally we'd pass this by value since it's just a wrapper around an integer

void removeCacheEntry(const std::string& host);
void loadCacheEntries(
const envoy::extensions::common::dynamic_forward_proxy::v3::DnsCacheConfig& config);
PrimaryHostInfo* createHost(const std::string& host, uint16_t default_port);

Event::Dispatcher& main_thread_dispatcher_;
const Network::DnsLookupFamily dns_lookup_family_;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "test/test_common/test_runtime.h"
#include "test/test_common/utility.h"

using testing::ContainsRegex;
using testing::DoAll;
using testing::InSequence;
using testing::Return;
Expand Down Expand Up @@ -1064,14 +1065,14 @@ TEST_F(DnsCacheImplTest, ResolveSuccessWithCaching) {

EXPECT_CALL(*timeout_timer, disableTimer());
// Make sure the store gets the first insert.
EXPECT_CALL(*store, addOrUpdate("foo.com", "10.0.0.1:80"));
EXPECT_CALL(*store, addOrUpdate("foo.com", ContainsRegex("10.0.0.1:80|30|\\d*)")));

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of using a Regex, could we use the actual value? Maybe that's hard cause we're using the "real" clock and not a mock clock? (If so, would it be possible to use a mock clock? Probably!)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can now!

EXPECT_CALL(update_callbacks_,
onDnsHostAddOrUpdate("foo.com", DnsHostInfoEquals("10.0.0.1:80", "foo.com", false)));
EXPECT_CALL(callbacks,
onLoadDnsCacheComplete(DnsHostInfoEquals("10.0.0.1:80", "foo.com", false)));
EXPECT_CALL(*resolve_timer, enableTimer(std::chrono::milliseconds(60000), _));
resolve_cb(Network::DnsResolver::ResolutionStatus::Success,
TestUtility::makeDnsResponse({"10.0.0.1"}));
TestUtility::makeDnsResponse({"10.0.0.1"}, std::chrono::seconds(30)));

checkStats(1 /* attempt */, 1 /* success */, 0 /* failure */, 1 /* address changed */,
1 /* added */, 0 /* removed */, 1 /* num hosts */);
Expand All @@ -1089,7 +1090,7 @@ TEST_F(DnsCacheImplTest, ResolveSuccessWithCaching) {
EXPECT_CALL(*timeout_timer, disableTimer());
EXPECT_CALL(*resolve_timer, enableTimer(std::chrono::milliseconds(60000), _));
resolve_cb(Network::DnsResolver::ResolutionStatus::Success,
TestUtility::makeDnsResponse({"10.0.0.1"}));
TestUtility::makeDnsResponse({"10.0.0.1"}, std::chrono::seconds(30)));

checkStats(2 /* attempt */, 2 /* success */, 0 /* failure */, 1 /* address changed */,
1 /* added */, 0 /* removed */, 1 /* num hosts */);
Expand All @@ -1105,12 +1106,12 @@ TEST_F(DnsCacheImplTest, ResolveSuccessWithCaching) {

EXPECT_CALL(*timeout_timer, disableTimer());
// Make sure the store gets the updated address.
EXPECT_CALL(*store, addOrUpdate("foo.com", "10.0.0.2:80"));
EXPECT_CALL(*store, addOrUpdate("foo.com", ContainsRegex("10.0.0.2:80|30|\\d*")));
EXPECT_CALL(update_callbacks_,
onDnsHostAddOrUpdate("foo.com", DnsHostInfoEquals("10.0.0.2:80", "foo.com", false)));
EXPECT_CALL(*resolve_timer, enableTimer(std::chrono::milliseconds(60000), _));
resolve_cb(Network::DnsResolver::ResolutionStatus::Success,
TestUtility::makeDnsResponse({"10.0.0.2"}));
TestUtility::makeDnsResponse({"10.0.0.2"}, std::chrono::seconds(30)));

checkStats(3 /* attempt */, 3 /* success */, 0 /* failure */, 2 /* address changed */,
1 /* added */, 0 /* removed */, 1 /* num hosts */);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,8 @@ name: envoy.clusters.dynamic_forward_proxy
if (write_cache_file_) {
std::string host =
fmt::format("localhost:{}", fake_upstreams_[0]->localAddress()->ip()->port());
std::string value = fake_upstreams_[0]->localAddress()->asString();
std::string value = absl::StrCat(
fake_upstreams_[0]->localAddress()->asString(), "|1000000|0");
TestEnvironment::writeStringToFileForTest(
"dns_cache.txt", absl::StrCat(host.length(), "\n", host, value.length(), "\n", value));
}
Expand Down Expand Up @@ -359,7 +360,6 @@ TEST_P(ProxyFilterIntegrationTest, UseCacheFile) {
sendRequestAndWaitForResponse(request_headers, 1024, default_response_headers_, 1024);
checkSimpleRequestSuccess(1024, 1024, response.get());
EXPECT_EQ(1, test_server_->counter("dns_cache.foo.cache_load")->value());
EXPECT_EQ(1, test_server_->counter("dns_cache.foo.dns_query_attempt")->value());

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be valid to check that this is 0 since the cached entry won't be re-queried?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it might race becasue it reregisters for refresh and I'm not sure how long it'll take

EXPECT_EQ(1, test_server_->counter("dns_cache.foo.host_added")->value());
}
#endif
Expand Down