Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update the OCI Provider to incorporate SoftError to avoid CrashLoopBackoff #4229

Merged
merged 1 commit into from
Feb 14, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 13 additions & 12 deletions provider/oci/oci.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package oci

import (
"context"
"fmt"
"os"
"strings"
"time"
Expand Down Expand Up @@ -81,12 +82,12 @@ type ociDNSClient interface {
func LoadOCIConfig(path string) (*OCIConfig, error) {
contents, err := os.ReadFile(path)
if err != nil {
return nil, errors.Wrapf(err, "reading OCI config file %q", path)
return nil, fmt.Errorf("reading OCI config file %q: %w", path, err)
}

cfg := OCIConfig{}
if err := yaml.Unmarshal(contents, &cfg); err != nil {
return nil, errors.Wrapf(err, "parsing OCI config file %q", path)
return nil, fmt.Errorf("parsing OCI config file %q: %w", path, err)
}
return &cfg, nil
}
Expand All @@ -102,19 +103,19 @@ func NewOCIProvider(cfg OCIConfig, domainFilter endpoint.DomainFilter, zoneIDFil
if cfg.Auth.UseWorkloadIdentity {
// OCI SDK requires specific, dynamic environment variables for workload identity.
if err := os.Setenv(auth.ResourcePrincipalVersionEnvVar, auth.ResourcePrincipalVersion2_2); err != nil {
return nil, errors.Wrapf(err, "unable to set OCI SDK environment variable: %s", auth.ResourcePrincipalVersionEnvVar)
return nil, fmt.Errorf("unable to set OCI SDK environment variable: %s: %w", auth.ResourcePrincipalVersionEnvVar, err)
}
if err := os.Setenv(auth.ResourcePrincipalRegionEnvVar, cfg.Auth.Region); err != nil {
return nil, errors.Wrapf(err, "unable to set OCI SDK environment variable: %s", auth.ResourcePrincipalRegionEnvVar)
return nil, fmt.Errorf("unable to set OCI SDK environment variable: %s: %w", auth.ResourcePrincipalRegionEnvVar, err)
}
configProvider, err = auth.OkeWorkloadIdentityConfigurationProvider()
if err != nil {
return nil, errors.Wrap(err, "error creating OCI workload identity config provider")
return nil, fmt.Errorf("error creating OCI workload identity config provider: %w", err)
}
} else if cfg.Auth.UseInstancePrincipal {
configProvider, err = auth.InstancePrincipalConfigurationProvider()
if err != nil {
return nil, errors.Wrap(err, "error creating OCI instance principal config provider")
return nil, fmt.Errorf("error creating OCI instance principal config provider: %w", err)
}
} else {
configProvider = common.NewRawConfigurationProvider(
Expand All @@ -129,7 +130,7 @@ func NewOCIProvider(cfg OCIConfig, domainFilter endpoint.DomainFilter, zoneIDFil

client, err = dns.NewDnsClientWithConfigurationProvider(configProvider)
if err != nil {
return nil, errors.Wrap(err, "initializing OCI DNS API client")
return nil, fmt.Errorf("initializing OCI DNS API client: %w", err)
}

return &OCIProvider{
Expand Down Expand Up @@ -180,7 +181,7 @@ func (p *OCIProvider) addPaginatedZones(ctx context.Context, zones map[string]dn
Page: page,
})
if err != nil {
return errors.Wrapf(err, "listing zones in %s", p.cfg.CompartmentID)
return provider.NewSoftError(fmt.Errorf("listing zones in %s: %w", p.cfg.CompartmentID, err))
}
for _, zone := range resp.Items {
if p.domainFilter.Match(*zone.Name) && p.zoneIDFilter.Match(*zone.Id) {
Expand Down Expand Up @@ -211,7 +212,7 @@ func (p *OCIProvider) newFilteredRecordOperations(endpoints []*endpoint.Endpoint
func (p *OCIProvider) Records(ctx context.Context) ([]*endpoint.Endpoint, error) {
zones, err := p.zones(ctx)
if err != nil {
return nil, errors.Wrap(err, "getting zones")
return nil, provider.NewSoftError(fmt.Errorf("getting zones: %w", err))
}

endpoints := []*endpoint.Endpoint{}
Expand All @@ -224,7 +225,7 @@ func (p *OCIProvider) Records(ctx context.Context) ([]*endpoint.Endpoint, error)
CompartmentId: &p.cfg.CompartmentID,
})
if err != nil {
return nil, errors.Wrapf(err, "getting records for zone %q", *zone.Id)
return nil, provider.NewSoftError(fmt.Errorf("getting records for zone %q: %w", *zone.Id, err))
}

for _, record := range resp.Items {
Expand Down Expand Up @@ -269,7 +270,7 @@ func (p *OCIProvider) ApplyChanges(ctx context.Context, changes *plan.Changes) e

zones, err := p.zones(ctx)
if err != nil {
return errors.Wrap(err, "fetching zones")
return provider.NewSoftError(fmt.Errorf("fetching zones: %w", err))
}

// Separate into per-zone change sets to be passed to OCI API.
Expand All @@ -291,7 +292,7 @@ func (p *OCIProvider) ApplyChanges(ctx context.Context, changes *plan.Changes) e
ZoneNameOrId: &zoneID,
PatchZoneRecordsDetails: dns.PatchZoneRecordsDetails{Items: ops},
}); err != nil {
return err
return provider.NewSoftError(err)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand the use of NewSoftError for Get or List Records / Zones
Here, it's when patching zone record, so I have doubts.
Can we really be sure it won't have any unexpected side effect ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case, the new behavior is calling this a Fatal error right away. By wrapping this in a SoftError the previous expected behavior is restored. The new unexpected behavior would be to Fatal here. Doesn't it make more sense to have the pod in this case retry in an hour rather than crash the pod and have to recreate the Update immediately. I don't recall seeing external-dns pods being killed when updates failed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Basically, this change to return a Fatal error has already caused a major unexpected failure of external-dns pods across multiple regions in our tenancy. This has never happened before. So this change is trying to restore what was previous behavior to avoid getting bit by this again in a slightly different way.

}
}

Expand Down
Loading