Skip to content

Commit

Permalink
Merge pull request #4597 from tjamet/add-provider-cache
Browse files Browse the repository at this point in the history
Add provider cache
  • Loading branch information
k8s-ci-robot authored Aug 14, 2024
2 parents c875e65 + a6ab2ba commit c87fcc7
Show file tree
Hide file tree
Showing 21 changed files with 429 additions and 18 deletions.
4 changes: 2 additions & 2 deletions controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ type Controller struct {
// The interval between individual synchronizations
Interval time.Duration
// The DomainFilter defines which DNS records to keep or exclude
DomainFilter endpoint.DomainFilter
DomainFilter endpoint.DomainFilterInterface
// The nextRunAt used for throttling and batching reconciliation
nextRunAt time.Time
// The runAtMutex is for atomic updating of nextRunAt and lastRunAt
Expand Down Expand Up @@ -245,7 +245,7 @@ func (c *Controller) RunOnce(ctx context.Context) error {
Policies: []plan.Policy{c.Policy},
Current: records,
Desired: endpoints,
DomainFilter: endpoint.MatchAllDomainFilters{&c.DomainFilter, &registryFilter},
DomainFilter: endpoint.MatchAllDomainFilters{c.DomainFilter, registryFilter},
ManagedRecords: c.ManagedRecordTypes,
ExcludeRecords: c.ExcludeRecordTypes,
OwnerID: c.Registry.OwnerID(),
Expand Down
2 changes: 1 addition & 1 deletion controller/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ type errorMockProvider struct {
mockProvider
}

func (p *filteredMockProvider) GetDomainFilter() endpoint.DomainFilter {
func (p *filteredMockProvider) GetDomainFilter() endpoint.DomainFilterInterface {
return p.domainFilter
}

Expand Down
76 changes: 76 additions & 0 deletions docs/rate-limits.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
DNS provider API rate limits considerations
===========================================

## Introduction

By design, external-dns refreshes all the records of a zone using API calls.
This refresh may happen periodically and upon any changed object if the flag `--events` is enabled.

Depending on the size of the zone and the infrastructure deployment, this may lead to external-dns
hitting the DNS provider's rate-limits more easily.

In particular, it has been found that with 200k records in an AWS Route53 zone, each refresh triggers around
70 API calls to retrieve all the records, making it more likely to hit the AWS Route53 API rate limits.

To prevent this problem from happening, external-dns has implemented a cache to reduce the pressure on the DNS
provider APIs.

This cache is optional and systematically invalidated when DNS records have been changed in the cluster
(new or deleted domains or changed target).

## Trade-offs

The major trade-off of this setting lies in the ability to recover from a deleted record on the DNS provider side.
As the DNS records are cached in memory, external-dns will not be made aware of the missing records and will hence
take a longer time to restore the deleted or modified record on the provider side.

This option is enabled using the `--provider-cache-time=15m` command line argument, and is turned off with `--provider-cache-time=0m`.

## Monitoring

You can evaluate the behaviour of the cache thanks to the built-in metrics

* `external_dns_provider_cache_records_calls`
* The number of calls to the provider cache Records list.
* The label `from_cache=true` indicates that the records were retrieved from memory and the DNS provider was not reached
* The label `from_cache=false` indicates that the cache was not used and the records were retrieved from the provider
* `external_dns_provider_cache_apply_changes_calls`
* The number of calls to the provider cache ApplyChanges.
* Each ApplyChanges call systematically invalidates the cache, so the next Records list is retrieved from the provider instead of the cache.

## Related options

This global option is available for all providers and can be used together with other global
or provider-specific options to fine-tune the behaviour of external-dns
to match the specific needs of your deployments, with the goal to reduce the number of API calls to your DNS provider.

* Google
* `--google-batch-change-interval=1s` When using the Google provider, set the interval between batch changes. ($EXTERNAL_DNS_GOOGLE_BATCH_CHANGE_INTERVAL)
* `--google-batch-change-size=1000` When using the Google provider, set the maximum number of changes that will be applied in each batch.
* AWS
* `--aws-batch-change-interval=1s` When using the AWS provider, set the interval between batch changes.
* `--aws-batch-change-size=1000` When using the AWS provider, set the maximum number of changes that will be applied in each batch.
* `--aws-batch-change-size-bytes=32000` When using the AWS provider, set the maximum byte size that will be applied in each batch.
* `--aws-batch-change-size-values=1000` When using the AWS provider, set the maximum total record values that will be applied in each batch.
* `--aws-zones-cache-duration=0s` When using the AWS provider, set the zones list cache TTL (0s to disable).
* `--[no-]aws-zone-match-parent` Expand limit possible target by sub-domains
* Cloudflare
* `--cloudflare-dns-records-per-page=100` When using the Cloudflare provider, specify how many DNS records listed per page, max possible 5,000 (default: 100)
* OVH
* `--ovh-api-rate-limit=20` When using the OVH provider, specify the API request rate limit, X operations by seconds (default: 20)

* Global
* `--registry=txt` The registry implementation to use to keep track of DNS record ownership (default: txt, options: txt, noop, dynamodb, aws-sd)
* `--txt-cache-interval=0s` The interval between cache synchronizations in duration format (default: disabled)
* `--interval=1m0s` The interval between two consecutive synchronizations in duration format (default: 1m)
* `--min-event-sync-interval=5s` The minimum interval between two consecutive synchronizations triggered from kubernetes events in duration format (default: 5s)
* `--[no-]events` When enabled, in addition to running every interval, the reconciliation loop will get triggered when supported sources change (default: disabled)

A general recommendation is to enable `--events` and keep `--min-event-sync-interval` relatively low to have a better responsiveness when records are
created or updated inside the cluster.
This should represent an acceptable propagation time between the creation of your k8s resources and the time they become registered in your DNS server.

As a general rule, the higher the `--provider-cache-time`, the lower the impact on the rate limits, but also the slower the recovery in case of a deletion.
The `--provider-cache-time` value should hence be set to an acceptable time to automatically restore deleted records.

✍️ Note that caching is done within the external-dns controller memory. You can invalidate the cache at any point in time by restarting it (for example doing a rolling update).
2 changes: 2 additions & 0 deletions docs/tutorials/aws.md
Original file line number Diff line number Diff line change
Expand Up @@ -912,6 +912,8 @@ Route53 has a [5 API requests per second per account hard quota](https://docs.aw
Running several fast polling ExternalDNS instances in a given account can easily hit that limit. Some ways to reduce the request rate include:
* Increase the polling loop's synchronization interval at the possible cost of slower change propagation (but see `--events` below to reduce the impact).
* `--interval=5m` (default `1m`)
* Enable a Cache to store the zone records list. It comes with a cost: slower propagation when the zone gets modified from other sources such as the AWS console, terraform, cloudformation or anything similar.
* `--provider-cache-time=15m` (default `0m`)
* Trigger the polling loop on changes to K8s objects, rather than only at `interval` and ensure a minimum of time between events, to have responsive updates with long poll intervals
* `--events`
* `--min-event-sync-interval=5m` (default `5s`)
Expand Down
8 changes: 7 additions & 1 deletion endpoint/domain_filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import (
"strings"
)

type MatchAllDomainFilters []*DomainFilter
type MatchAllDomainFilters []DomainFilterInterface

func (f MatchAllDomainFilters) Match(domain string) bool {
for _, filter := range f {
Expand All @@ -39,6 +39,10 @@ func (f MatchAllDomainFilters) Match(domain string) bool {
return true
}

// DomainFilterInterface is the minimal contract a domain filter must satisfy:
// given a domain name, report whether the filter accepts it.
// DomainFilter implements it, and MatchAllDomainFilters is a slice of values
// of this interface type.
type DomainFilterInterface interface {
// Match reports whether the given domain is accepted by the filter.
Match(domain string) bool
}

// DomainFilter holds a lists of valid domain names
type DomainFilter struct {
// Filters define what domains to match
Expand All @@ -51,6 +55,8 @@ type DomainFilter struct {
regexExclusion *regexp.Regexp
}

var _ DomainFilterInterface = &DomainFilter{}

// domainFilterSerde is a helper type for serializing and deserializing DomainFilter.
type domainFilterSerde struct {
Include []string `json:"include,omitempty"`
Expand Down
9 changes: 8 additions & 1 deletion main.go
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,13 @@ func main() {
os.Exit(0)
}

if cfg.ProviderCacheTime > 0 {
p = provider.NewCachedProvider(
p,
cfg.ProviderCacheTime,
)
}

var r registry.Registry
switch cfg.Registry {
case "dynamodb":
Expand All @@ -414,7 +421,7 @@ func main() {
case "txt":
r, err = registry.NewTXTRegistry(p, cfg.TXTPrefix, cfg.TXTSuffix, cfg.TXTOwnerID, cfg.TXTCacheInterval, cfg.TXTWildcardReplacement, cfg.ManagedDNSRecordTypes, cfg.ExcludeDNSRecordTypes, cfg.TXTEncryptEnabled, []byte(cfg.TXTEncryptAESKey))
case "aws-sd":
r, err = registry.NewAWSSDRegistry(p.(*awssd.AWSSDProvider), cfg.TXTOwnerID)
r, err = registry.NewAWSSDRegistry(p, cfg.TXTOwnerID)
default:
log.Fatalf("unknown registry: %s", cfg.Registry)
}
Expand Down
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ nav:
- Initial Design: docs/initial-design.md
- TTL: docs/ttl.md
- MultiTarget: docs/proposal/multi-target.md
- Rate Limits: docs/rate-limits.md
- Contributing:
- Kubernetes Contributions: CONTRIBUTING.md
- Release: docs/release.md
Expand Down
3 changes: 3 additions & 0 deletions pkg/apis/externaldns/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ type Config struct {
AlwaysPublishNotReadyAddresses bool
ConnectorSourceServer string
Provider string
ProviderCacheTime time.Duration
GoogleProject string
GoogleBatchChangeSize int
GoogleBatchChangeInterval time.Duration
Expand Down Expand Up @@ -239,6 +240,7 @@ var defaultConfig = &Config{
PublishHostIP: false,
ConnectorSourceServer: "localhost:8080",
Provider: "",
ProviderCacheTime: 0,
GoogleProject: "",
GoogleBatchChangeSize: 1000,
GoogleBatchChangeInterval: time.Second,
Expand Down Expand Up @@ -456,6 +458,7 @@ func (cfg *Config) ParseFlags(args []string) error {
// Flags related to providers
providers := []string{"akamai", "alibabacloud", "aws", "aws-sd", "azure", "azure-dns", "azure-private-dns", "bluecat", "civo", "cloudflare", "coredns", "designate", "digitalocean", "dnsimple", "dyn", "exoscale", "gandi", "godaddy", "google", "ibmcloud", "inmemory", "linode", "ns1", "oci", "ovh", "pdns", "pihole", "plural", "rcodezero", "rdns", "rfc2136", "safedns", "scaleway", "skydns", "tencentcloud", "transip", "ultradns", "vinyldns", "vultr", "webhook"}
app.Flag("provider", "The DNS provider where the DNS records will be created (required, options: "+strings.Join(providers, ", ")+")").Required().PlaceHolder("provider").EnumVar(&cfg.Provider, providers...)
app.Flag("provider-cache-time", "The time to cache the DNS provider record list requests.").Default(defaultConfig.ProviderCacheTime.String()).DurationVar(&cfg.ProviderCacheTime)
app.Flag("domain-filter", "Limit possible target zones by a domain suffix; specify multiple times for multiple domains (optional)").Default("").StringsVar(&cfg.DomainFilter)
app.Flag("exclude-domains", "Exclude subdomains (optional)").Default("").StringsVar(&cfg.ExcludeDomains)
app.Flag("regex-domain-filter", "Limit possible domains and target zones by a Regex filter; Overrides domain-filter (optional)").Default(defaultConfig.RegexDomainFilter.String()).RegexpVar(&cfg.RegexDomainFilter)
Expand Down
2 changes: 1 addition & 1 deletion provider/aws/aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -567,7 +567,7 @@ func (p *AWSProvider) createUpdateChanges(newEndpoints, oldEndpoints []*endpoint
}

// GetDomainFilter generates a filter to exclude any domain that is not controlled by the provider
func (p *AWSProvider) GetDomainFilter() endpoint.DomainFilter {
func (p *AWSProvider) GetDomainFilter() endpoint.DomainFilterInterface {
zones, err := p.Zones(context.Background())
if err != nil {
log.Errorf("failed to list zones: %v", err)
Expand Down
4 changes: 2 additions & 2 deletions provider/aws/aws_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -319,10 +319,10 @@ func TestAWSZones(t *testing.T) {
func TestAWSRecordsFilter(t *testing.T) {
provider, _ := newAWSProvider(t, endpoint.DomainFilter{}, provider.ZoneIDFilter{}, provider.ZoneTypeFilter{}, false, false, nil)
domainFilter := provider.GetDomainFilter()
assert.NotNil(t, domainFilter)
require.NotNil(t, domainFilter)
require.IsType(t, endpoint.DomainFilter{}, domainFilter)
count := 0
filters := domainFilter.Filters
filters := domainFilter.(endpoint.DomainFilter).Filters
for _, tld := range []string{
"zone-4.ext-dns-test-3.teapot.zalan.do",
".zone-4.ext-dns-test-3.teapot.zalan.do",
Expand Down
110 changes: 110 additions & 0 deletions provider/cached_provider.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package provider

import (
"context"
"sync"
"time"

"github.com/prometheus/client_golang/prometheus"
log "github.com/sirupsen/logrus"

"sigs.k8s.io/external-dns/endpoint"
"sigs.k8s.io/external-dns/plan"
)

// Prometheus metrics describing how the provider cache is used, plus the
// guard that makes their registration a one-time operation.
var (
// cachedRecordsCallsTotal counts calls to CachedProvider.Records.
// The "from_cache" label is "true" when the records were served from the
// in-memory cache and "false" when they were fetched from the wrapped provider.
cachedRecordsCallsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "external_dns",
Subsystem: "provider",
Name: "cache_records_calls",
Help: "Number of calls to the provider cache Records list.",
},
[]string{
"from_cache",
},
)
// cachedApplyChangesCallsTotal counts the CachedProvider.ApplyChanges calls
// that carried changes and were forwarded to the wrapped provider.
cachedApplyChangesCallsTotal = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "external_dns",
Subsystem: "provider",
Name: "cache_apply_changes_calls",
Help: "Number of calls to the provider cache ApplyChanges.",
},
)

// registerCacheProviderMetrics ensures metric registration in
// NewCachedProvider runs at most once per process, even if several
// cached providers are created.
registerCacheProviderMetrics = sync.Once{}
)

// CachedProvider wraps a Provider and keeps the result of the last successful
// Records call in memory, so that subsequent Records calls made within
// RefreshDelay do not reach the underlying provider.
// The cache is dropped whenever ApplyChanges forwards changes or Reset is called.
// NOTE(review): the fields are accessed without any locking in this file —
// confirm callers never use a CachedProvider concurrently.
type CachedProvider struct {
// Provider is the wrapped provider; methods not overridden here are
// promoted from it unchanged.
Provider
// RefreshDelay is how long a cached records list stays valid after it was read.
RefreshDelay time.Duration
// lastRead is the time of the last successful read from the wrapped provider.
lastRead time.Time
// cache holds the records returned by the last successful read; nil means
// "no usable cache".
cache []*endpoint.Endpoint
}

// NewCachedProvider wraps provider so that the result of Records is kept in
// memory for refreshDelay before the underlying provider is queried again.
//
// The first call also registers the cache metrics with the default Prometheus
// registry. Both counters are registered: the records counter and the
// apply-changes counter. Without the latter, the
// external_dns_provider_cache_apply_changes_calls metric would be incremented
// but never exported.
func NewCachedProvider(provider Provider, refreshDelay time.Duration) *CachedProvider {
	registerCacheProviderMetrics.Do(func() {
		prometheus.MustRegister(
			cachedRecordsCallsTotal,
			cachedApplyChangesCallsTotal,
		)
	})
	return &CachedProvider{
		Provider:     provider,
		RefreshDelay: refreshDelay,
	}
}

// Records returns the provider's records, serving them from memory while the
// cache is still fresh and querying the wrapped provider otherwise.
//
// On a provider error the cache is cleared and the error is returned as-is.
// Every successful call increments the records counter, labelled with whether
// the result came from the cache.
func (c *CachedProvider) Records(ctx context.Context) ([]*endpoint.Endpoint, error) {
	// Fast path: the cached list is still valid.
	if !c.needRefresh() {
		log.Debug("Records cache provider: using records list from cache")
		cachedRecordsCallsTotal.WithLabelValues("true").Inc()
		return c.cache, nil
	}

	log.Info("Records cache provider: refreshing records list cache")
	fresh, err := c.Provider.Records(ctx)
	if err != nil {
		// Drop any stale data so the next call retries against the provider.
		c.cache = nil
		return nil, err
	}

	c.cache, c.lastRead = fresh, time.Now()
	cachedRecordsCallsTotal.WithLabelValues("false").Inc()
	return fresh, nil
}
// ApplyChanges forwards changes to the wrapped provider.
//
// An empty change set is logged and ignored, leaving the cache untouched.
// Otherwise the cache is invalidated before the provider is called, so the
// next Records call re-reads the provider whether or not the apply succeeded,
// and the apply-changes counter is incremented.
func (c *CachedProvider) ApplyChanges(ctx context.Context, changes *plan.Changes) error {
	if nothingToDo := !changes.HasChanges(); nothingToDo {
		log.Info("Records cache provider: no changes to be applied")
		return nil
	}

	// Invalidate first: after this point the cached list can no longer be trusted.
	c.Reset()
	cachedApplyChangesCallsTotal.Inc()

	return c.Provider.ApplyChanges(ctx, changes)
}

// Reset discards the cached records and the last-read timestamp, forcing the
// next Records call to query the wrapped provider.
func (c *CachedProvider) Reset() {
	c.cache, c.lastRead = nil, time.Time{}
}

// needRefresh reports whether the next Records call must query the wrapped
// provider: either nothing is cached, or the cached list is older than
// RefreshDelay.
//
// NOTE(review): a nil cache is treated as "not initialized", so a provider
// that returns a nil slice for an empty record set would be re-queried on
// every call — confirm whether that matters for the providers in use.
func (c *CachedProvider) needRefresh() bool {
	if c.cache == nil {
		log.Debug("Records cache provider is not initialized")
		return true
	}

	// Compute the expiry and the verdict once so the logged values are exactly
	// the ones the decision is based on.
	expiresAt := c.lastRead.Add(c.RefreshDelay)
	expired := time.Now().After(expiresAt)

	// Use a format string: Debug concatenates operands Sprint-style, which adds
	// no space next to string operands and ran the fields together.
	log.Debugf(
		"Records cache last read: %s, expiration: %s, provider expiration: %s, expired: %t",
		c.lastRead, c.RefreshDelay, expiresAt, expired,
	)
	return expired
}
Loading

0 comments on commit c87fcc7

Please sign in to comment.