From 089744c6ff136d80e1ddc07ac0a985b40fee9b5e Mon Sep 17 00:00:00 2001 From: Thibault Jamet Date: Mon, 8 Jul 2024 09:19:12 +0200 Subject: [PATCH 01/14] Add cache at provider level **Description** In the current implementation, DNS providers are called to list all records on every loop. This is expensive in terms of number of requests to the provider and may result in being rate limited, as reported in 1293 and 3397. In our case, we have approximately 20,000 records in our AWS Hosted Zone. The ListResourceRecordSets API call allows a maximum of 300 items per call. That requires 67 API calls per external-dns deployment during every sync period With this, we introduce an optional generic caching mechanism at the provider level, that re-uses the latest known list of records for a given time. This prevents from expensive Provider calls to list all records for each object modification that does not change the actual record (annotations, statuses, ingress routing, ...) This introduces 2 trade-offs: 1. Any changes or corruption directly on the provider side will be longer to detect and to resolve, up to the cache time 2. Any conflicting records in the DNS provider (such as a different external-dns instance) injected during the cache validity will cause the first iteration of the next reconcile loop to fail, and hence add a delay until the next retry **Checklist** - [X] Unit tests updated - [X] End user documentation updated Change-Id: I0bdcfa994ac1b76acedb05d458a97c080284c5aa --- docs/tutorials/aws.md | 2 + main.go | 7 ++ pkg/apis/externaldns/types.go | 3 + provider/cached_provider.go | 73 ++++++++++++++ provider/cached_provider_test.go | 164 +++++++++++++++++++++++++++++++ 5 files changed, 249 insertions(+) create mode 100644 provider/cached_provider.go create mode 100644 provider/cached_provider_test.go diff --git a/docs/tutorials/aws.md b/docs/tutorials/aws.md index 214fa4ca9b..532dd45e33 100644 --- a/docs/tutorials/aws.md +++ b/docs/tutorials/aws.md @@ -912,6 +912,8 @@ Route53 has a [5 API requests per second per account hard quota](https://docs.aw Running several fast polling ExternalDNS instances in a given account can easily hit that limit. Some ways to reduce the request rate include: * Reduce the polling loop's synchronization interval at the possible cost of slower change propagation (but see `--events` below to reduce the impact). * `--interval=5m` (default `1m`) +* Cache the results of the zone at the possible cost of slower propagation when the zone gets modified from other sources + * `--provider-cache-time=15m` (default `0m`) * Trigger the polling loop on changes to K8s objects, rather than only at `interval` and ensure a minimum of time between events, to have responsive updates with long poll intervals * `--events` * `--min-event-sync-interval=5m` (default `5s`) diff --git a/main.go b/main.go index f05c46906f..05765845ff 100644 --- a/main.go +++ b/main.go @@ -401,6 +401,13 @@ func main() { os.Exit(0) } + if cfg.ProviderCacheTime > 0 { + p = &provider.CachedProvider{ + Provider: p, + RefreshDelay: cfg.ProviderCacheTime, + } + } + var r registry.Registry switch cfg.Registry { case "dynamodb": diff --git a/pkg/apis/externaldns/types.go b/pkg/apis/externaldns/types.go index 822993d093..05ba1f2e1b 100644 --- a/pkg/apis/externaldns/types.go +++ b/pkg/apis/externaldns/types.go @@ -67,6 +67,7 @@ type Config struct { AlwaysPublishNotReadyAddresses bool ConnectorSourceServer string Provider string + ProviderCacheTime int GoogleProject string GoogleBatchChangeSize int GoogleBatchChangeInterval time.Duration @@ -239,6 +240,7 @@ var defaultConfig = &Config{ PublishHostIP: false, ConnectorSourceServer: "localhost:8080", Provider: "", + ProviderCacheTime: 0, GoogleProject: "", GoogleBatchChangeSize: 1000, GoogleBatchChangeInterval: time.Second, @@ -456,6 +458,7 @@ func (cfg *Config) ParseFlags(args []string) error { // Flags related to providers providers := []string{"akamai", "alibabacloud", "aws", "aws-sd", "azure", "azure-dns", "azure-private-dns", "bluecat", "civo", "cloudflare", "coredns", "designate", "digitalocean", "dnsimple", "dyn", "exoscale", "gandi", "godaddy", "google", "ibmcloud", "inmemory", "linode", "ns1", "oci", "ovh", "pdns", "pihole", "plural", "rcodezero", "rdns", "rfc2136", "safedns", "scaleway", "skydns", "tencentcloud", "transip", "ultradns", "vinyldns", "vultr", "webhook"} app.Flag("provider", "The DNS provider where the DNS records will be created (required, options: "+strings.Join(providers, ", ")+")").Required().PlaceHolder("provider").EnumVar(&cfg.Provider, providers...) + app.Flag("provider-cache-time", "The time to cache the DNS provider record list requests.").Default(defaultConfig.ProviderCacheTime.String()).DurationVar(&cfg.ProviderCacheTime) app.Flag("domain-filter", "Limit possible target zones by a domain suffix; specify multiple times for multiple domains (optional)").Default("").StringsVar(&cfg.DomainFilter) app.Flag("exclude-domains", "Exclude subdomains (optional)").Default("").StringsVar(&cfg.ExcludeDomains) app.Flag("regex-domain-filter", "Limit possible domains and target zones by a Regex filter; Overrides domain-filter (optional)").Default(defaultConfig.RegexDomainFilter.String()).RegexpVar(&cfg.RegexDomainFilter) diff --git a/provider/cached_provider.go b/provider/cached_provider.go new file mode 100644 index 0000000000..e4b3652662 --- /dev/null +++ b/provider/cached_provider.go @@ -0,0 +1,73 @@ +package provider + +import ( + "context" + "time" + + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/external-dns/endpoint" + "sigs.k8s.io/external-dns/plan" +) + +var ( + cachedRecordsCallsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "external_dns", + Subsystem: "provider", + Name: "cache_records_calls", + Help: "Number of calls to the provider cache Records list.", + }, + []string{ + "from_cache", + }, + ) + cachedApplyChangesCallsTotal = prometheus.NewCounter( + prometheus.CounterOpts{ + Namespace: "external_dns", + Subsystem: "provider", + Name: "cache_apply_changes_calls", + Help: "Number of calls to the provider cache ApplyChanges.", + }, + ) +) + +type CachedProvider struct { + Provider + RefreshDelay time.Duration + err error + lastRead time.Time + cache []*endpoint.Endpoint +} + +func (c *CachedProvider) Records(ctx context.Context) ([]*endpoint.Endpoint, error) { + if c.needRefresh() { + c.cache, c.err = c.Provider.Records(ctx) + c.lastRead = time.Now() + cachedRecordsCallsTotal.WithLabelValues("false").Inc() + } else { + cachedRecordsCallsTotal.WithLabelValues("true").Inc() + } + return c.cache, c.err +} +func (c *CachedProvider) ApplyChanges(ctx context.Context, changes *plan.Changes) error { + c.Reset() + cachedApplyChangesCallsTotal.Inc() + return c.Provider.ApplyChanges(ctx, changes) +} + +func (c *CachedProvider) Reset() { + c.err = nil + c.cache = nil + c.lastRead = time.Time{} +} + +func (c *CachedProvider) needRefresh() bool { + if c.cache == nil || c.err != nil { + return true + } + return time.Now().After(c.lastRead.Add(c.RefreshDelay)) +} + +func init() { + prometheus.MustRegister(cachedRecordsCallsTotal) +} diff --git a/provider/cached_provider_test.go b/provider/cached_provider_test.go new file mode 100644 index 0000000000..653e8f8b58 --- /dev/null +++ b/provider/cached_provider_test.go @@ -0,0 +1,164 @@ +package provider + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "sigs.k8s.io/external-dns/endpoint" + "sigs.k8s.io/external-dns/plan" +) + +type testProviderFunc struct { + records func(ctx context.Context) ([]*endpoint.Endpoint, error) + applyChanges func(ctx context.Context, changes *plan.Changes) error + propertyValuesEqual func(name string, previous string, current string) bool + adjustEndpoints func(endpoints []*endpoint.Endpoint) []*endpoint.Endpoint + getDomainFilter func() endpoint.DomainFilterInterface +} + +func (p *testProviderFunc) Records(ctx context.Context) ([]*endpoint.Endpoint, error) { + return p.records(ctx) +} + +func (p *testProviderFunc) ApplyChanges(ctx context.Context, changes *plan.Changes) error { + return p.applyChanges(ctx, changes) +} + +func (p *testProviderFunc) PropertyValuesEqual(name string, previous string, current string) bool { + return p.propertyValuesEqual(name, previous, current) +} + +func (p *testProviderFunc) AdjustEndpoints(endpoints []*endpoint.Endpoint) []*endpoint.Endpoint { + return p.adjustEndpoints(endpoints) +} + +func (p *testProviderFunc) GetDomainFilter() endpoint.DomainFilterInterface { + return p.getDomainFilter() +} + +func recordsNotCalled(t *testing.T) func(ctx context.Context) ([]*endpoint.Endpoint, error) { + return func(ctx context.Context) ([]*endpoint.Endpoint, error) { + t.Errorf("unexpected call to Records") + return nil, nil + } +} + +func applyChangesNotCalled(t *testing.T) func(ctx context.Context, changes *plan.Changes) error { + return func(ctx context.Context, changes *plan.Changes) error { + t.Errorf("unexpected call to ApplyChanges") + return nil + } +} + +func propertyValuesEqualNotCalled(t *testing.T) func(name string, previous string, current string) bool { + return func(name string, previous string, current string) bool { + t.Errorf("unexpected call to PropertyValuesEqual") + return false + } +} + +func adjustEndpointsNotCalled(t *testing.T) func(endpoints []*endpoint.Endpoint) []*endpoint.Endpoint { + return func(endpoints []*endpoint.Endpoint) []*endpoint.Endpoint { + t.Errorf("unexpected call to AdjustEndpoints") + return endpoints + } +} + +func newTestProviderFunc(t *testing.T) *testProviderFunc { + return &testProviderFunc{ + records: recordsNotCalled(t), + applyChanges: applyChangesNotCalled(t), + propertyValuesEqual: propertyValuesEqualNotCalled(t), + adjustEndpoints: adjustEndpointsNotCalled(t), + } +} + +func TestCachedProviderCallsProviderOnFirstCall(t *testing.T) { + testProvider := newTestProviderFunc(t) + testProvider.records = func(ctx context.Context) ([]*endpoint.Endpoint, error) { + return []*endpoint.Endpoint{{DNSName: "domain.fqdn"}}, nil + } + provider := CachedProvider{ + Provider: testProvider, + } + endpoints, err := provider.Records(context.Background()) + assert.NoError(t, err) + require.NotNil(t, endpoints) + require.Len(t, endpoints, 1) + require.NotNil(t, endpoints[0]) + assert.Equal(t, "domain.fqdn", endpoints[0].DNSName) +} + +func TestCachedProviderUsesCacheWhileValid(t *testing.T) { + testProvider := newTestProviderFunc(t) + testProvider.records = func(ctx context.Context) ([]*endpoint.Endpoint, error) { + return []*endpoint.Endpoint{{DNSName: "domain.fqdn"}}, nil + } + provider := CachedProvider{ + RefreshDelay: 30 * time.Second, + Provider: testProvider, + } + _, err := provider.Records(context.Background()) + require.NoError(t, err) + + t.Run("With consecutive calls within the caching time frame", func(t *testing.T) { + testProvider.records = recordsNotCalled(t) + endpoints, err := provider.Records(context.Background()) + assert.NoError(t, err) + require.NotNil(t, endpoints) + require.Len(t, endpoints, 1) + require.NotNil(t, endpoints[0]) + assert.Equal(t, "domain.fqdn", endpoints[0].DNSName) + }) + + t.Run("When the caching time frame is exceeded", func(t *testing.T) { + testProvider.records = func(ctx context.Context) ([]*endpoint.Endpoint, error) { + return []*endpoint.Endpoint{{DNSName: "new.domain.fqdn"}}, nil + } + provider.lastRead = time.Now().Add(-20 * time.Minute) + endpoints, err := provider.Records(context.Background()) + assert.NoError(t, err) + require.NotNil(t, endpoints) + require.Len(t, endpoints, 1) + require.NotNil(t, endpoints[0]) + assert.Equal(t, "new.domain.fqdn", endpoints[0].DNSName) + }) +} + +func TestCachedProviderForcesCacheRefreshOnUpdate(t *testing.T) { + testProvider := newTestProviderFunc(t) + testProvider.records = func(ctx context.Context) ([]*endpoint.Endpoint, error) { + return []*endpoint.Endpoint{{DNSName: "domain.fqdn"}}, nil + } + provider := CachedProvider{ + RefreshDelay: 30 * time.Second, + Provider: testProvider, + } + _, err := provider.Records(context.Background()) + require.NoError(t, err) + + t.Run("When the caching time frame is exceeded", func(t *testing.T) { + testProvider.records = recordsNotCalled(t) + testProvider.applyChanges = func(ctx context.Context, changes *plan.Changes) error { + return nil + } + err := provider.ApplyChanges(context.Background(), &plan.Changes{}) + assert.NoError(t, err) + t.Run("Next call to Records is not cached", func(t *testing.T) { + testProvider.applyChanges = applyChangesNotCalled(t) + testProvider.records = func(ctx context.Context) ([]*endpoint.Endpoint, error) { + return []*endpoint.Endpoint{{DNSName: "new.domain.fqdn"}}, nil + } + endpoints, err := provider.Records(context.Background()) + + assert.NoError(t, err) + require.NotNil(t, endpoints) + require.Len(t, endpoints, 1) + require.NotNil(t, endpoints[0]) + assert.Equal(t, "new.domain.fqdn", endpoints[0].DNSName) + }) + }) +} From 29191e23622568d1f4a4350aec192bf0b0e2539e Mon Sep 17 00:00:00 2001 From: Thibault Jamet Date: Mon, 13 Feb 2023 19:31:06 +0100 Subject: [PATCH 02/14] Skip apply empty changes in the cache provider Change-Id: Icaf1ffe34e75c320d4efbb428f831deb8784cd11 --- main.go | 2 +- provider/cached_provider.go | 11 +++++++++++ provider/cached_provider_test.go | 28 +++++++++++++++++++++++++++- 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/main.go b/main.go index 05765845ff..955bca8682 100644 --- a/main.go +++ b/main.go @@ -421,7 +421,7 @@ func main() { case "txt": r, err = registry.NewTXTRegistry(p, cfg.TXTPrefix, cfg.TXTSuffix, cfg.TXTOwnerID, cfg.TXTCacheInterval, cfg.TXTWildcardReplacement, cfg.ManagedDNSRecordTypes, cfg.ExcludeDNSRecordTypes, cfg.TXTEncryptEnabled, []byte(cfg.TXTEncryptAESKey)) case "aws-sd": - r, err = registry.NewAWSSDRegistry(p.(*awssd.AWSSDProvider), cfg.TXTOwnerID) + r, err = registry.NewAWSSDRegistry(p, cfg.TXTOwnerID) default: log.Fatalf("unknown registry: %s", cfg.Registry) } diff --git a/provider/cached_provider.go b/provider/cached_provider.go index e4b3652662..0b646306ff 100644 --- a/provider/cached_provider.go +++ b/provider/cached_provider.go @@ -5,6 +5,7 @@ import ( "time" "github.com/prometheus/client_golang/prometheus" + log "github.com/sirupsen/logrus" "sigs.k8s.io/external-dns/endpoint" "sigs.k8s.io/external-dns/plan" ) @@ -41,15 +42,23 @@ type CachedProvider struct { func (c *CachedProvider) Records(ctx context.Context) ([]*endpoint.Endpoint, error) { if c.needRefresh() { + log.Info("Records cache provider: refreshing records list cache") c.cache, c.err = c.Provider.Records(ctx) + if c.err != nil { + log.Errorf("Records cache provider: list records failed: %v", c.err) + } c.lastRead = time.Now() cachedRecordsCallsTotal.WithLabelValues("false").Inc() } else { + log.Info("Records cache provider: using records list from cache") cachedRecordsCallsTotal.WithLabelValues("true").Inc() } return c.cache, c.err } func (c *CachedProvider) ApplyChanges(ctx context.Context, changes *plan.Changes) error { + if !changes.HasChanges() { + return nil + } c.Reset() cachedApplyChangesCallsTotal.Inc() return c.Provider.ApplyChanges(ctx, changes) @@ -63,8 +72,10 @@ func (c *CachedProvider) Reset() { func (c *CachedProvider) needRefresh() bool { if c.cache == nil || c.err != nil { + log.Debug("Records cache provider is not initialized") return true } + log.Debug("Records cache last Read: ", c.lastRead, "expiration: ", c.RefreshDelay, " provider expiration:", c.lastRead.Add(c.RefreshDelay), "expired: ", time.Now().After(c.lastRead.Add(c.RefreshDelay))) return time.Now().After(c.lastRead.Add(c.RefreshDelay)) } diff --git a/provider/cached_provider_test.go b/provider/cached_provider_test.go index 653e8f8b58..201e9aa664 100644 --- a/provider/cached_provider_test.go +++ b/provider/cached_provider_test.go @@ -140,13 +140,39 @@ func TestCachedProviderForcesCacheRefreshOnUpdate(t *testing.T) { _, err := provider.Records(context.Background()) require.NoError(t, err) - t.Run("When the caching time frame is exceeded", func(t *testing.T) { + t.Run("When empty changes are applied", func(t *testing.T) { testProvider.records = recordsNotCalled(t) testProvider.applyChanges = func(ctx context.Context, changes *plan.Changes) error { return nil } err := provider.ApplyChanges(context.Background(), &plan.Changes{}) assert.NoError(t, err) + t.Run("Next call to Records is cached", func(t *testing.T) { + testProvider.applyChanges = applyChangesNotCalled(t) + testProvider.records = func(ctx context.Context) ([]*endpoint.Endpoint, error) { + return []*endpoint.Endpoint{{DNSName: "new.domain.fqdn"}}, nil + } + endpoints, err := provider.Records(context.Background()) + + assert.NoError(t, err) + require.NotNil(t, endpoints) + require.Len(t, endpoints, 1) + require.NotNil(t, endpoints[0]) + assert.Equal(t, "domain.fqdn", endpoints[0].DNSName) + }) + }) + + t.Run("When changes are applied", func(t *testing.T) { + testProvider.records = recordsNotCalled(t) + testProvider.applyChanges = func(ctx context.Context, changes *plan.Changes) error { + return nil + } + err := provider.ApplyChanges(context.Background(), &plan.Changes{ + Create: []*endpoint.Endpoint{ + {DNSName: "hello.world"}, + }, + }) + assert.NoError(t, err) t.Run("Next call to Records is not cached", func(t *testing.T) { testProvider.applyChanges = applyChangesNotCalled(t) testProvider.records = func(ctx context.Context) ([]*endpoint.Endpoint, error) { From eb07eb990587d992189433662e30046d9b8e0908 Mon Sep 17 00:00:00 2001 From: Thibault Jamet Date: Mon, 13 Feb 2023 20:16:09 +0100 Subject: [PATCH 03/14] Add licence headers Change-Id: I3f4646cabd66216fd028fbae3adf68129a8a2cbf --- provider/cached_provider.go | 15 +++++++++++++++ provider/cached_provider_test.go | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/provider/cached_provider.go b/provider/cached_provider.go index 0b646306ff..4c59087db6 100644 --- a/provider/cached_provider.go +++ b/provider/cached_provider.go @@ -1,3 +1,18 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ package provider import ( diff --git a/provider/cached_provider_test.go b/provider/cached_provider_test.go index 201e9aa664..2c7eec95aa 100644 --- a/provider/cached_provider_test.go +++ b/provider/cached_provider_test.go @@ -1,3 +1,18 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ package provider import ( From 82c6983fa3c9b932591c1e7f7419d0fb4c4188be Mon Sep 17 00:00:00 2001 From: Thibault Jamet Date: Mon, 13 Feb 2023 21:37:08 +0100 Subject: [PATCH 04/14] Add a log line when no changes on cache provider Change-Id: I13da2aa28eef3e2c8e81b502321c4dc137087b2d --- provider/cached_provider.go | 1 + 1 file changed, 1 insertion(+) diff --git a/provider/cached_provider.go b/provider/cached_provider.go index 4c59087db6..c8d2da9de6 100644 --- a/provider/cached_provider.go +++ b/provider/cached_provider.go @@ -72,6 +72,7 @@ func (c *CachedProvider) Records(ctx context.Context) ([]*endpoint.Endpoint, err } func (c *CachedProvider) ApplyChanges(ctx context.Context, changes *plan.Changes) error { if !changes.HasChanges() { + log.Info("Records cache provider: no changes to be applied") return nil } c.Reset() From b2ff1619f5dfb50187fc75cb64733978d3a0d880 Mon Sep 17 00:00:00 2001 From: Thibault Jamet Date: Fri, 8 Sep 2023 16:21:54 +0300 Subject: [PATCH 05/14] Add Domain filter interface --- controller/controller.go | 4 ++-- controller/controller_test.go | 2 +- endpoint/domain_filter.go | 8 +++++++- pkg/apis/externaldns/types.go | 2 +- provider/aws/aws.go | 2 +- provider/aws/aws_test.go | 4 ++-- provider/cached_provider.go | 16 ++++++++-------- provider/inmemory/inmemory.go | 2 +- provider/provider.go | 4 ++-- registry/aws_sd_registry.go | 2 +- registry/dynamodb.go | 2 +- registry/noop.go | 2 +- registry/registry.go | 2 +- registry/txt.go | 2 +- 14 files changed, 30 insertions(+), 24 deletions(-) diff --git a/controller/controller.go b/controller/controller.go index c8bb6ff185..2521b4436a 100644 --- a/controller/controller.go +++ b/controller/controller.go @@ -186,7 +186,7 @@ type Controller struct { // The interval between individual synchronizations Interval time.Duration // The DomainFilter defines which DNS records to keep or exclude - DomainFilter endpoint.DomainFilter + DomainFilter endpoint.DomainFilterInterface // The nextRunAt used for throttling and batching reconciliation nextRunAt time.Time // The runAtMutex is for atomic updating of nextRunAt and lastRunAt @@ -245,7 +245,7 @@ func (c *Controller) RunOnce(ctx context.Context) error { Policies: []plan.Policy{c.Policy}, Current: records, Desired: endpoints, - DomainFilter: endpoint.MatchAllDomainFilters{&c.DomainFilter, ®istryFilter}, + DomainFilter: endpoint.MatchAllDomainFilters{c.DomainFilter, registryFilter}, ManagedRecords: c.ManagedRecordTypes, ExcludeRecords: c.ExcludeRecordTypes, OwnerID: c.Registry.OwnerID(), diff --git a/controller/controller_test.go b/controller/controller_test.go index e95aa9802e..c6074f8331 100644 --- a/controller/controller_test.go +++ b/controller/controller_test.go @@ -57,7 +57,7 @@ type errorMockProvider struct { mockProvider } -func (p *filteredMockProvider) GetDomainFilter() endpoint.DomainFilter { +func (p *filteredMockProvider) GetDomainFilter() endpoint.DomainFilterInterface { return p.domainFilter } diff --git a/endpoint/domain_filter.go b/endpoint/domain_filter.go index 308599d80e..3acfbcd939 100644 --- a/endpoint/domain_filter.go +++ b/endpoint/domain_filter.go @@ -25,7 +25,7 @@ import ( "strings" ) -type MatchAllDomainFilters []*DomainFilter +type MatchAllDomainFilters []DomainFilterInterface func (f MatchAllDomainFilters) Match(domain string) bool { for _, filter := range f { @@ -39,6 +39,10 @@ func (f MatchAllDomainFilters) Match(domain string) bool { return true } +type DomainFilterInterface interface { + Match(domain string) bool +} + // DomainFilter holds a lists of valid domain names type DomainFilter struct { // Filters define what domains to match @@ -51,6 +55,8 @@ type DomainFilter struct { regexExclusion *regexp.Regexp } +var _ DomainFilterInterface = &DomainFilter{} + // domainFilterSerde is a helper type for serializing and deserializing DomainFilter. type domainFilterSerde struct { Include []string `json:"include,omitempty"` diff --git a/pkg/apis/externaldns/types.go b/pkg/apis/externaldns/types.go index 05ba1f2e1b..423229a8b8 100644 --- a/pkg/apis/externaldns/types.go +++ b/pkg/apis/externaldns/types.go @@ -67,7 +67,7 @@ type Config struct { AlwaysPublishNotReadyAddresses bool ConnectorSourceServer string Provider string - ProviderCacheTime int + ProviderCacheTime time.Duration GoogleProject string GoogleBatchChangeSize int GoogleBatchChangeInterval time.Duration diff --git a/provider/aws/aws.go b/provider/aws/aws.go index 8e13d290fe..1955efb16d 100644 --- a/provider/aws/aws.go +++ b/provider/aws/aws.go @@ -567,7 +567,7 @@ func (p *AWSProvider) createUpdateChanges(newEndpoints, oldEndpoints []*endpoint } // GetDomainFilter generates a filter to exclude any domain that is not controlled by the provider -func (p *AWSProvider) GetDomainFilter() endpoint.DomainFilter { +func (p *AWSProvider) GetDomainFilter() endpoint.DomainFilterInterface { zones, err := p.Zones(context.Background()) if err != nil { log.Errorf("failed to list zones: %v", err) diff --git a/provider/aws/aws_test.go b/provider/aws/aws_test.go index 418e05d09e..6968e7c76f 100644 --- a/provider/aws/aws_test.go +++ b/provider/aws/aws_test.go @@ -319,10 +319,10 @@ func TestAWSZones(t *testing.T) { func TestAWSRecordsFilter(t *testing.T) { provider, _ := newAWSProvider(t, endpoint.DomainFilter{}, provider.ZoneIDFilter{}, provider.ZoneTypeFilter{}, false, false, nil) domainFilter := provider.GetDomainFilter() - assert.NotNil(t, domainFilter) + require.NotNil(t, domainFilter) require.IsType(t, endpoint.DomainFilter{}, domainFilter) count := 0 - filters := domainFilter.Filters + filters := domainFilter.(endpoint.DomainFilter).Filters for _, tld := range []string{ "zone-4.ext-dns-test-3.teapot.zalan.do", ".zone-4.ext-dns-test-3.teapot.zalan.do", diff --git a/provider/cached_provider.go b/provider/cached_provider.go index c8d2da9de6..3ea1fd5166 100644 --- a/provider/cached_provider.go +++ b/provider/cached_provider.go @@ -50,7 +50,6 @@ var ( type CachedProvider struct { Provider RefreshDelay time.Duration - err error lastRead time.Time cache []*endpoint.Endpoint } @@ -58,17 +57,19 @@ type CachedProvider struct { func (c *CachedProvider) Records(ctx context.Context) ([]*endpoint.Endpoint, error) { if c.needRefresh() { log.Info("Records cache provider: refreshing records list cache") - c.cache, c.err = c.Provider.Records(ctx) - if c.err != nil { - log.Errorf("Records cache provider: list records failed: %v", c.err) + records, err := c.Provider.Records(ctx) + if err != nil { + c.cache = nil + return nil, err } + c.cache = records c.lastRead = time.Now() cachedRecordsCallsTotal.WithLabelValues("false").Inc() } else { - log.Info("Records cache provider: using records list from cache") + log.Debug("Records cache provider: using records list from cache") cachedRecordsCallsTotal.WithLabelValues("true").Inc() } - return c.cache, c.err + return c.cache, nil } func (c *CachedProvider) ApplyChanges(ctx context.Context, changes *plan.Changes) error { if !changes.HasChanges() { @@ -81,13 +82,12 @@ func (c *CachedProvider) ApplyChanges(ctx context.Context, changes *plan.Changes } func (c *CachedProvider) Reset() { - c.err = nil c.cache = nil c.lastRead = time.Time{} } func (c *CachedProvider) needRefresh() bool { - if c.cache == nil || c.err != nil { + if c.cache == nil { log.Debug("Records cache provider is not initialized") return true } diff --git a/provider/inmemory/inmemory.go b/provider/inmemory/inmemory.go index eab515a673..1f636dfdad 100644 --- a/provider/inmemory/inmemory.go +++ b/provider/inmemory/inmemory.go @@ -46,7 +46,7 @@ var ( // initialized as dns provider with no records type InMemoryProvider struct { provider.BaseProvider - domain endpoint.DomainFilter + domain endpoint.DomainFilterInterface client *inMemoryClient filter *filter OnApplyChanges func(ctx context.Context, changes *plan.Changes) diff --git a/provider/provider.go b/provider/provider.go index 6a9c591e10..2bbfac1619 100644 --- a/provider/provider.go +++ b/provider/provider.go @@ -48,7 +48,7 @@ type Provider interface { // unnecessary (potentially failing) changes. It may also modify other fields, add, or remove // Endpoints. It is permitted to modify the supplied endpoints. AdjustEndpoints(endpoints []*endpoint.Endpoint) ([]*endpoint.Endpoint, error) - GetDomainFilter() endpoint.DomainFilter + GetDomainFilter() endpoint.DomainFilterInterface } type BaseProvider struct{} @@ -57,7 +57,7 @@ func (b BaseProvider) AdjustEndpoints(endpoints []*endpoint.Endpoint) ([]*endpoi return endpoints, nil } -func (b BaseProvider) GetDomainFilter() endpoint.DomainFilter { +func (b BaseProvider) GetDomainFilter() endpoint.DomainFilterInterface { return endpoint.DomainFilter{} } diff --git a/registry/aws_sd_registry.go b/registry/aws_sd_registry.go index 8a2b023c4f..a1e2103f5d 100644 --- a/registry/aws_sd_registry.go +++ b/registry/aws_sd_registry.go @@ -42,7 +42,7 @@ func NewAWSSDRegistry(provider provider.Provider, ownerID string) (*AWSSDRegistr }, nil } -func (sdr *AWSSDRegistry) GetDomainFilter() endpoint.DomainFilter { +func (sdr *AWSSDRegistry) GetDomainFilter() endpoint.DomainFilterInterface { return sdr.provider.GetDomainFilter() } diff --git a/registry/dynamodb.go b/registry/dynamodb.go index aeb38d9a9a..b13d55ce94 100644 --- a/registry/dynamodb.go +++ b/registry/dynamodb.go @@ -105,7 +105,7 @@ func NewDynamoDBRegistry(provider provider.Provider, ownerID string, dynamodbAPI }, nil } -func (im *DynamoDBRegistry) GetDomainFilter() endpoint.DomainFilter { +func (im *DynamoDBRegistry) GetDomainFilter() endpoint.DomainFilterInterface { return im.provider.GetDomainFilter() } diff --git a/registry/noop.go b/registry/noop.go index 9e06bbb92d..daf29cb01f 100644 --- a/registry/noop.go +++ b/registry/noop.go @@ -36,7 +36,7 @@ func NewNoopRegistry(provider provider.Provider) (*NoopRegistry, error) { }, nil } -func (im *NoopRegistry) GetDomainFilter() endpoint.DomainFilter { +func (im *NoopRegistry) GetDomainFilter() endpoint.DomainFilterInterface { return im.provider.GetDomainFilter() } diff --git a/registry/registry.go b/registry/registry.go index d2955365d0..3117c42056 100644 --- a/registry/registry.go +++ b/registry/registry.go @@ -31,6 +31,6 @@ type Registry interface { Records(ctx context.Context) ([]*endpoint.Endpoint, error) ApplyChanges(ctx context.Context, changes *plan.Changes) error AdjustEndpoints(endpoints []*endpoint.Endpoint) ([]*endpoint.Endpoint, error) - GetDomainFilter() endpoint.DomainFilter + GetDomainFilter() endpoint.DomainFilterInterface OwnerID() string } diff --git a/registry/txt.go b/registry/txt.go index 8e7b6f0968..12445bcb1d 100644 --- a/registry/txt.go +++ b/registry/txt.go @@ -95,7 +95,7 @@ func getSupportedTypes() []string { return []string{endpoint.RecordTypeA, endpoint.RecordTypeAAAA, endpoint.RecordTypeCNAME, endpoint.RecordTypeNS} } -func (im *TXTRegistry) GetDomainFilter() endpoint.DomainFilter { +func (im *TXTRegistry) GetDomainFilter() endpoint.DomainFilterInterface { return im.provider.GetDomainFilter() } From b2d678f7d0f073f06280a69c2d2fed0ce9477623 Mon Sep 17 00:00:00 2001 From: Thibault Jamet Date: Fri, 8 Sep 2023 17:12:18 +0300 Subject: [PATCH 06/14] Run go imports -local sigs.k8s.io/external-dns --- provider/cached_provider.go | 1 + 1 file changed, 1 insertion(+) diff --git a/provider/cached_provider.go b/provider/cached_provider.go index 3ea1fd5166..42f327e07e 100644 --- a/provider/cached_provider.go +++ b/provider/cached_provider.go @@ -21,6 +21,7 @@ import ( "github.com/prometheus/client_golang/prometheus" log "github.com/sirupsen/logrus" + "sigs.k8s.io/external-dns/endpoint" "sigs.k8s.io/external-dns/plan" ) From 9b759f09339a840e051ebf020c0892f71fac7084 Mon Sep 17 00:00:00 2001 From: Thibault Jamet Date: Mon, 8 Jul 2024 09:02:20 +0200 Subject: [PATCH 07/14] Update changes to match latest state of external-dns code --- provider/cached_provider_test.go | 11 ++++++----- provider/webhook/api/httpapi_test.go | 2 +- provider/webhook/webhook.go | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/provider/cached_provider_test.go b/provider/cached_provider_test.go index 2c7eec95aa..4e358066fa 100644 --- a/provider/cached_provider_test.go +++ b/provider/cached_provider_test.go @@ -17,6 +17,7 @@ package provider import ( "context" + "errors" "testing" "time" @@ -30,7 +31,7 @@ type testProviderFunc struct { records func(ctx context.Context) ([]*endpoint.Endpoint, error) applyChanges func(ctx context.Context, changes *plan.Changes) error propertyValuesEqual func(name string, previous string, current string) bool - adjustEndpoints func(endpoints []*endpoint.Endpoint) []*endpoint.Endpoint + adjustEndpoints func(endpoints []*endpoint.Endpoint) ([]*endpoint.Endpoint, error) getDomainFilter func() endpoint.DomainFilterInterface } @@ -46,7 +47,7 @@ func (p *testProviderFunc) PropertyValuesEqual(name string, previous string, cur return p.propertyValuesEqual(name, previous, current) } -func (p *testProviderFunc) AdjustEndpoints(endpoints []*endpoint.Endpoint) []*endpoint.Endpoint { +func (p *testProviderFunc) AdjustEndpoints(endpoints []*endpoint.Endpoint) ([]*endpoint.Endpoint, error) { return p.adjustEndpoints(endpoints) } @@ -75,10 +76,10 @@ func propertyValuesEqualNotCalled(t *testing.T) func(name string, previous strin } } -func adjustEndpointsNotCalled(t *testing.T) func(endpoints []*endpoint.Endpoint) []*endpoint.Endpoint { - return func(endpoints []*endpoint.Endpoint) []*endpoint.Endpoint { +func adjustEndpointsNotCalled(t *testing.T) func(endpoints []*endpoint.Endpoint) ([]*endpoint.Endpoint, error) { + return func(endpoints []*endpoint.Endpoint) ([]*endpoint.Endpoint, error) { t.Errorf("unexpected call to AdjustEndpoints") - return endpoints + return endpoints, errors.New("unexpected call to AdjustEndpoints") } } diff --git a/provider/webhook/api/httpapi_test.go b/provider/webhook/api/httpapi_test.go index 48685dc9d9..a15efec471 100644 --- a/provider/webhook/api/httpapi_test.go +++ b/provider/webhook/api/httpapi_test.go @@ -62,7 +62,7 @@ func (p FakeWebhookProvider) AdjustEndpoints(endpoints []*endpoint.Endpoint) ([] return endpoints, nil } -func (p FakeWebhookProvider) GetDomainFilter() endpoint.DomainFilter { +func (p FakeWebhookProvider) GetDomainFilter() endpoint.DomainFilterInterface { return p.domainFilter } diff --git a/provider/webhook/webhook.go b/provider/webhook/webhook.go index d504842900..8cae0850e2 100644 --- a/provider/webhook/webhook.go +++ b/provider/webhook/webhook.go @@ -296,7 +296,7 @@ func (p WebhookProvider) AdjustEndpoints(e []*endpoint.Endpoint) ([]*endpoint.En } // GetDomainFilter make calls to get the serialized version of the domain filter -func (p WebhookProvider) GetDomainFilter() endpoint.DomainFilter { +func (p WebhookProvider) GetDomainFilter() endpoint.DomainFilterInterface { return p.DomainFilter } From 2955e5d4568b492db5cea2e8444655c9ee0526ee Mon Sep 17 00:00:00 2001 From: Thibault Jamet Date: Mon, 5 Aug 2024 16:59:28 +0200 Subject: [PATCH 08/14] Apply suggestions from code review Co-authored-by: Michel Loiseleur <97035654+mloiseleur@users.noreply.github.com> --- docs/tutorials/aws.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/aws.md b/docs/tutorials/aws.md index 532dd45e33..ad6d3921fa 100644 --- a/docs/tutorials/aws.md +++ b/docs/tutorials/aws.md @@ -912,7 +912,7 @@ Route53 has a [5 API requests per second per account hard quota](https://docs.aw Running several fast polling ExternalDNS instances in a given account can easily hit that limit. Some ways to reduce the request rate include: * Reduce the polling loop's synchronization interval at the possible cost of slower change propagation (but see `--events` below to reduce the impact). * `--interval=5m` (default `1m`) -* Cache the results of the zone at the possible cost of slower propagation when the zone gets modified from other sources +* Enable a Cache to store the zone records list. It comes with a cost: slower propagation when the zone gets modified from other sources. * `--provider-cache-time=15m` (default `0m`) * Trigger the polling loop on changes to K8s objects, rather than only at `interval` and ensure a minimum of time between events, to have responsive updates with long poll intervals * `--events` From 309f34f4f56c7d7cd4fad2bd4cab933b17c7d585 Mon Sep 17 00:00:00 2001 From: Thibault Jamet Date: Mon, 5 Aug 2024 17:47:46 +0200 Subject: [PATCH 09/14] Add an advanced topic about rate limits --- docs/rate-limits.md | 76 +++++++++++++++++++++++++++++++++++++++++++++ mkdocs.yml | 1 + 2 files changed, 77 insertions(+) create mode 100644 docs/rate-limits.md diff --git a/docs/rate-limits.md b/docs/rate-limits.md new file mode 100644 index 0000000000..4de0763011 --- /dev/null +++ b/docs/rate-limits.md @@ -0,0 +1,76 @@ +DNS provider API rate limits considerations +=========================================== + +## Introduction + +By design, external-dns refreshes all the records of a zone using API calls. +This refresh may happen peridically and upon any changed object if the flag `--events` is enabled. + +Depending on the size of the zone and the infrastructure deployment, this may lead to having external-dns +hitting the DNS provider's rate-limits more easily. + +In particular, it has been found that, with 200k records in an AWS Route53 zone, each refresh triggers around +70 API calls to retrieve all the records, making it more likely to hit the AWS Route53 API rate limits. + +To prevent this problem from happening, external-dns has implemented a cache to reduce the pressure on the DNS +provider APIs. + +This cache is optional and systematically invalidated when DNS records have been changed in the cluster +(new or deleted domains or changed target). + +## Trade-offs + +The major trade-off of this setting relies in the ability to recover from a deleted record on the DNS provider side. +As the DNS records are cached in memory, external-dns will not be made aware of the missing records and will hence +take a longer time to restore the deleted or modified record on the provider side. + +This option is enabled using the `--provider-cache-time=15m` command line argument, and disabled when `--provider-cache-time=0m` + +## Monitoring + +You can evaluate the behaviour of the cache thanks to the built-in metrics + +* `external_dns_provider_cache_records_calls` + The number of calls to the provider cache Records list. + The label `from_cache=true` indicates that the records were retrieved from memory and the DNS provider was not reached + The label `from_cache=false` indicates that the cache was not used and the records were retrieved from the provider +* `external_dns_provider_cache_apply_changes_calls` + The number of calls to the provider cache ApplyChanges. + Each ApplyChange systematically invalidates the cache and makes subsequent Records list to be retrieved from the provider without cache. + +## Related options + +This global option is available for all providers and can be used in pair with other global +or provider-specific options to fine-tune the behaviour of external-dns +to match the specific needs of your deployments, with the goal to reduce the number of API calls to your DNS provider. + +* Google + `--google-batch-change-interval=1s` When using the Google provider, set the interval between batch changes. ($EXTERNAL_DNS_GOOGLE_BATCH_CHANGE_INTERVAL) + `--google-batch-change-size=1000` When using the Google provider, set the maximum number of changes that will be applied in each batch. +* AWS + `--aws-batch-change-interval=1s` When using the AWS provider, set the interval between batch changes. + `--aws-batch-change-size=1000` When using the AWS provider, set the maximum number of changes that will be applied in each batch. + `--aws-batch-change-size-bytes=32000` When using the AWS provider, set the maximum byte size that will be applied in each batch. + `--aws-batch-change-size-values=1000` When using the AWS provider, set the maximum total record values that will be applied in each batch. + `--aws-zones-cache-duration=0s` When using the AWS provider, set the zones list cache TTL (0s to disable). + `--[no-]aws-zone-match-parent` Expand limit possible target by sub-domains +* Cloudflare + `--cloudflare-dns-records-per-page=100` When using the Cloudflare provider, specify how many DNS records listed per page, max possible 5,000 (default: 100) +* OVH + `--ovh-api-rate-limit=20` When using the OVH provider, specify the API request rate limit, X operations by seconds (default: 20) + +* Global + `--registry=txt` The registry implementation to use to keep track of DNS record ownership (default: txt, options: txt, noop, dynamodb, aws-sd) + `--txt-cache-interval=0s` The interval between cache synchronizations in duration format (default: disabled) + `--interval=1m0s` The interval between two consecutive synchronizations in duration format (default: 1m) + `--min-event-sync-interval=5s` The minimum interval between two consecutive synchronizations triggered from kubernetes events in duration format (default: 5s) + `--[no-]events` When enabled, in addition to running every interval, the reconciliation loop will get triggered when supported sources change (default: disabled) + +A general recommendation is to enable `--events` and keep `--min-event-sync-interval` relatively low to have a better responsiveness when records are +created or updated inside the cluster. +This should represent an acceptable propagation time between the creation of your k8s resources and the time they become registered in your DNS server. + +On a general manner, the higher the `--provider-cache-time`, the lower the impact on the rate limits, but also, the slower the recovery in case of a deletion. +The `--provider-cache-time` value should hence be set to an acceptable time to automatically recover restore deleted records. + +✍️ Note that caching is done within the external-dns controller memory. You can invalidate the cache at any point in time by restarting it (for example doing a rolling update). diff --git a/mkdocs.yml b/mkdocs.yml index 82d55858ca..c2d43fd0c7 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -32,6 +32,7 @@ nav: - Initial Design: docs/initial-design.md - TTL: docs/ttl.md - MultiTarget: docs/proposal/multi-target.md + - ProviderCache: docs/rate-limits.md - Contributing: - Kubernetes Contributions: CONTRIBUTING.md - Release: docs/release.md From 173604854d98a1effd05c4d9bad7e87e9277939d Mon Sep 17 00:00:00 2001 From: Thibault Jamet Date: Tue, 6 Aug 2024 08:44:08 +0200 Subject: [PATCH 10/14] Rename Advanced topics title to match the content --- mkdocs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs.yml b/mkdocs.yml index c2d43fd0c7..0b4bbe54ab 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -32,7 +32,7 @@ nav: - Initial Design: docs/initial-design.md - TTL: docs/ttl.md - MultiTarget: docs/proposal/multi-target.md - - ProviderCache: docs/rate-limits.md + - Rate Limits: docs/rate-limits.md - Contributing: - Kubernetes Contributions: CONTRIBUTING.md - Release: docs/release.md From ef0dd29cf51d9b44192b767ac015be9cd0d13bb2 Mon Sep 17 00:00:00 2001 From: Thibault Jamet Date: Tue, 13 Aug 2024 11:16:28 +0200 Subject: [PATCH 11/14] Apply suggestions from code review Co-authored-by: Michel Loiseleur <97035654+mloiseleur@users.noreply.github.com> --- docs/rate-limits.md | 46 ++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/docs/rate-limits.md b/docs/rate-limits.md index 4de0763011..f0c7698849 100644 --- a/docs/rate-limits.md +++ b/docs/rate-limits.md @@ -6,10 +6,10 @@ DNS provider API rate limits considerations By design, external-dns refreshes all the records of a zone using API calls. This refresh may happen peridically and upon any changed object if the flag `--events` is enabled. -Depending on the size of the zone and the infrastructure deployment, this may lead to having external-dns +Depending on the size of the zone and the infrastructure deployment, this may lead to external-dns hitting the DNS provider's rate-limits more easily. -In particular, it has been found that, with 200k records in an AWS Route53 zone, each refresh triggers around +In particular, it has been found that with 200k records in an AWS Route53 zone, each refresh triggers around 70 API calls to retrieve all the records, making it more likely to hit the AWS Route53 API rate limits. To prevent this problem from happening, external-dns has implemented a cache to reduce the pressure on the DNS @@ -24,19 +24,19 @@ The major trade-off of this setting relies in the ability to recover from a dele As the DNS records are cached in memory, external-dns will not be made aware of the missing records and will hence take a longer time to restore the deleted or modified record on the provider side. -This option is enabled using the `--provider-cache-time=15m` command line argument, and disabled when `--provider-cache-time=0m` +This option is enabled using the `--provider-cache-time=15m` command line argument, and turned off when `--provider-cache-time=0m` ## Monitoring You can evaluate the behaviour of the cache thanks to the built-in metrics * `external_dns_provider_cache_records_calls` - The number of calls to the provider cache Records list. - The label `from_cache=true` indicates that the records were retrieved from memory and the DNS provider was not reached - The label `from_cache=false` indicates that the cache was not used and the records were retrieved from the provider + * The number of calls to the provider cache Records list. + * The label `from_cache=true` indicates that the records were retrieved from memory and the DNS provider was not reached + * The label `from_cache=false` indicates that the cache was not used and the records were retrieved from the provider * `external_dns_provider_cache_apply_changes_calls` - The number of calls to the provider cache ApplyChanges. - Each ApplyChange systematically invalidates the cache and makes subsequent Records list to be retrieved from the provider without cache. + * The number of calls to the provider cache ApplyChanges. + * Each ApplyChange systematically invalidates the cache and makes subsequent Records list to be retrieved from the provider without cache. ## Related options @@ -45,26 +45,26 @@ or provider-specific options to fine-tune the behaviour of external-dns to match the specific needs of your deployments, with the goal to reduce the number of API calls to your DNS provider. * Google - `--google-batch-change-interval=1s` When using the Google provider, set the interval between batch changes. ($EXTERNAL_DNS_GOOGLE_BATCH_CHANGE_INTERVAL) - `--google-batch-change-size=1000` When using the Google provider, set the maximum number of changes that will be applied in each batch. + * `--google-batch-change-interval=1s` When using the Google provider, set the interval between batch changes. ($EXTERNAL_DNS_GOOGLE_BATCH_CHANGE_INTERVAL) + * `--google-batch-change-size=1000` When using the Google provider, set the maximum number of changes that will be applied in each batch. * AWS - `--aws-batch-change-interval=1s` When using the AWS provider, set the interval between batch changes. - `--aws-batch-change-size=1000` When using the AWS provider, set the maximum number of changes that will be applied in each batch. - `--aws-batch-change-size-bytes=32000` When using the AWS provider, set the maximum byte size that will be applied in each batch. - `--aws-batch-change-size-values=1000` When using the AWS provider, set the maximum total record values that will be applied in each batch. - `--aws-zones-cache-duration=0s` When using the AWS provider, set the zones list cache TTL (0s to disable). - `--[no-]aws-zone-match-parent` Expand limit possible target by sub-domains + * `--aws-batch-change-interval=1s` When using the AWS provider, set the interval between batch changes. + * `--aws-batch-change-size=1000` When using the AWS provider, set the maximum number of changes that will be applied in each batch. + * `--aws-batch-change-size-bytes=32000` When using the AWS provider, set the maximum byte size that will be applied in each batch. + * `--aws-batch-change-size-values=1000` When using the AWS provider, set the maximum total record values that will be applied in each batch. + * `--aws-zones-cache-duration=0s` When using the AWS provider, set the zones list cache TTL (0s to disable). + * `--[no-]aws-zone-match-parent` Expand limit possible target by sub-domains * Cloudflare - `--cloudflare-dns-records-per-page=100` When using the Cloudflare provider, specify how many DNS records listed per page, max possible 5,000 (default: 100) + * `--cloudflare-dns-records-per-page=100` When using the Cloudflare provider, specify how many DNS records listed per page, max possible 5,000 (default: 100) * OVH - `--ovh-api-rate-limit=20` When using the OVH provider, specify the API request rate limit, X operations by seconds (default: 20) + * `--ovh-api-rate-limit=20` When using the OVH provider, specify the API request rate limit, X operations by seconds (default: 20) * Global - `--registry=txt` The registry implementation to use to keep track of DNS record ownership (default: txt, options: txt, noop, dynamodb, aws-sd) - `--txt-cache-interval=0s` The interval between cache synchronizations in duration format (default: disabled) - `--interval=1m0s` The interval between two consecutive synchronizations in duration format (default: 1m) - `--min-event-sync-interval=5s` The minimum interval between two consecutive synchronizations triggered from kubernetes events in duration format (default: 5s) - `--[no-]events` When enabled, in addition to running every interval, the reconciliation loop will get triggered when supported sources change (default: disabled) + * `--registry=txt` The registry implementation to use to keep track of DNS record ownership (default: txt, options: txt, noop, dynamodb, aws-sd) + * `--txt-cache-interval=0s` The interval between cache synchronizations in duration format (default: disabled) + * `--interval=1m0s` The interval between two consecutive synchronizations in duration format (default: 1m) + * `--min-event-sync-interval=5s` The minimum interval between two consecutive synchronizations triggered from kubernetes events in duration format (default: 5s) + * `--[no-]events` When enabled, in addition to running every interval, the reconciliation loop will get triggered when supported sources change (default: disabled) A general recommendation is to enable `--events` and keep `--min-event-sync-interval` relatively low to have a better responsiveness when records are created or updated inside the cluster. From 6c5faafbfe2d5633d4c5adab334c1c136b2cbfd2 Mon Sep 17 00:00:00 2001 From: Thibault Jamet Date: Tue, 13 Aug 2024 11:24:34 +0200 Subject: [PATCH 12/14] Dynamically register cache provider metrics --- main.go | 8 ++++---- provider/cached_provider.go | 17 +++++++++++++---- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/main.go b/main.go index 955bca8682..7fabfd219e 100644 --- a/main.go +++ b/main.go @@ -402,10 +402,10 @@ func main() { } if cfg.ProviderCacheTime > 0 { - p = &provider.CachedProvider{ - Provider: p, - RefreshDelay: cfg.ProviderCacheTime, - } + p = provider.NewCachedProvider( + p, + cfg.ProviderCacheTime, + ) } var r registry.Registry diff --git a/provider/cached_provider.go b/provider/cached_provider.go index 42f327e07e..dcea510f0d 100644 --- a/provider/cached_provider.go +++ b/provider/cached_provider.go @@ -17,6 +17,7 @@ package provider import ( "context" + "sync" "time" "github.com/prometheus/client_golang/prometheus" @@ -46,6 +47,8 @@ var ( Help: "Number of calls to the provider cache ApplyChanges.", }, ) + + registerCacheProviderMetrics = sync.Once{} ) type CachedProvider struct { @@ -55,6 +58,16 @@ type CachedProvider struct { cache []*endpoint.Endpoint } +func NewCachedProvider(provider Provider, refreshDelay time.Duration) *CachedProvider { + registerMetrics.Do(func() { + prometheus.MustRegister(cachedRecordsCallsTotal) + }) + return &CachedProvider{ + Provider: provider, + RefreshDelay: refreshDelay, + } +} + func (c *CachedProvider) Records(ctx context.Context) ([]*endpoint.Endpoint, error) { if c.needRefresh() { log.Info("Records cache provider: refreshing records list cache") @@ -95,7 +108,3 @@ func (c *CachedProvider) needRefresh() bool { log.Debug("Records cache last Read: ", c.lastRead, "expiration: ", c.RefreshDelay, " provider expiration:", c.lastRead.Add(c.RefreshDelay), "expired: ", time.Now().After(c.lastRead.Add(c.RefreshDelay))) return time.Now().After(c.lastRead.Add(c.RefreshDelay)) } - -func init() { - prometheus.MustRegister(cachedRecordsCallsTotal) -} From 8d2f1c0aea39288988bae1aede96242993c541ba Mon Sep 17 00:00:00 2001 From: Thibault Jamet Date: Tue, 13 Aug 2024 12:58:05 +0200 Subject: [PATCH 13/14] Fix invalid private variable reference --- provider/cached_provider.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/provider/cached_provider.go b/provider/cached_provider.go index dcea510f0d..6009c77d74 100644 --- a/provider/cached_provider.go +++ b/provider/cached_provider.go @@ -59,7 +59,7 @@ type CachedProvider struct { } func NewCachedProvider(provider Provider, refreshDelay time.Duration) *CachedProvider { - registerMetrics.Do(func() { + registerCacheProviderMetrics.Do(func() { prometheus.MustRegister(cachedRecordsCallsTotal) }) return &CachedProvider{ From a6ab2badce411919ae9de301d75b39a0f27ab717 Mon Sep 17 00:00:00 2001 From: Thibault Jamet Date: Wed, 14 Aug 2024 11:36:47 +0200 Subject: [PATCH 14/14] Update docs/tutorials/aws.md --- docs/tutorials/aws.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/aws.md b/docs/tutorials/aws.md index ad6d3921fa..e836e6739a 100644 --- a/docs/tutorials/aws.md +++ b/docs/tutorials/aws.md @@ -912,7 +912,7 @@ Route53 has a [5 API requests per second per account hard quota](https://docs.aw Running several fast polling ExternalDNS instances in a given account can easily hit that limit. Some ways to reduce the request rate include: * Reduce the polling loop's synchronization interval at the possible cost of slower change propagation (but see `--events` below to reduce the impact). * `--interval=5m` (default `1m`) -* Enable a Cache to store the zone records list. It comes with a cost: slower propagation when the zone gets modified from other sources. +* Enable a Cache to store the zone records list. It comes with a cost: slower propagation when the zone gets modified from other sources such as the AWS console, terraform, cloudformation or anything similar. * `--provider-cache-time=15m` (default `0m`) * Trigger the polling loop on changes to K8s objects, rather than only at `interval` and ensure a minimum of time between events, to have responsive updates with long poll intervals * `--events`