-
Notifications
You must be signed in to change notification settings - Fork 632
fix(target-allocator): add TLS certificate hot-reload for mTLS #4597
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 11 commits
981e9df
da2f8a2
577a970
dab147a
d2d4425
83296e2
6d77805
6575064
36815ac
98924f9
e2fc078
4a612f6
4296989
2d115e5
d507cbb
33a728e
81d511e
af68589
b86ce3e
6a9b4d5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,21 @@ | ||
| # One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' | ||
| change_type: bug_fix | ||
|
|
||
| # The name of the component, or a single word describing the area of concern, (e.g. collector, target allocator, auto-instrumentation, opamp, github action) | ||
| component: target allocator | ||
|
|
||
| # A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). | ||
| note: Fix TLS certificate hot-reload for mTLS connections | ||
|
|
||
| # One or more tracking issues related to the change | ||
| issues: [4368] | ||
|
|
||
| # (Optional) One or more lines of additional information to render under the primary note. | ||
| # These lines will be padded with 2 spaces and then inserted directly into the document. | ||
| # Use pipe (|) for multiline entries. | ||
| subtext: | | ||
| The Target Allocator now automatically reloads TLS certificates when they are renewed | ||
| by cert-manager. Previously, certificate renewals required a pod restart because | ||
| certificates were only loaded once at startup. The fix uses fsnotify to watch the | ||
| certificate directory and dynamically reloads certificates via the GetCertificate | ||
| callback, enabling seamless certificate rotation without downtime. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,221 @@ | ||
| // Copyright The OpenTelemetry Authors | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| package config | ||
|
|
||
| import ( | ||
| "context" | ||
| "crypto/tls" | ||
| "crypto/x509" | ||
| "os" | ||
| "path/filepath" | ||
| "sync" | ||
| "time" | ||
|
|
||
| "github.com/fsnotify/fsnotify" | ||
| "github.com/go-logr/logr" | ||
| ) | ||
|
|
||
| // CertificateReloader watches certificate files and reloads them on change. | ||
| // It provides dynamic certificate reloading for TLS servers without restart. | ||
| type CertificateReloader struct { | ||
| certPath string | ||
| keyPath string | ||
| caPath string | ||
| cert *tls.Certificate | ||
| clientCAs *x509.CertPool | ||
| mu sync.RWMutex | ||
| logger logr.Logger | ||
| debounceDelay time.Duration | ||
| maxDebounceWait time.Duration | ||
| reloadTimer *time.Timer | ||
| firstEventTime *time.Time | ||
| timerMu sync.Mutex | ||
| reloadNotify chan struct{} | ||
| } | ||
|
|
||
| const defaultDebounceDelay = 100 * time.Millisecond | ||
| const defaultMaxDebounceWait = 1 * time.Second | ||
|
|
||
| // NewCertificateReloader creates a new CertificateReloader and loads the initial certificates. | ||
| func NewCertificateReloader(certPath, keyPath, caPath string, logger logr.Logger) (*CertificateReloader, error) { | ||
| r := &CertificateReloader{ | ||
| certPath: certPath, | ||
| keyPath: keyPath, | ||
| caPath: caPath, | ||
| logger: logger.WithName("cert-reloader"), | ||
| debounceDelay: defaultDebounceDelay, | ||
| maxDebounceWait: defaultMaxDebounceWait, | ||
| reloadNotify: make(chan struct{}, 1), | ||
| firstEventTime: nil, | ||
| } | ||
|
|
||
| if err := r.Reload(); err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| return r, nil | ||
| } | ||
|
|
||
| // Reload reads the certificate files from disk and updates the cached certificates. | ||
| func (r *CertificateReloader) Reload() error { | ||
| cert, err := tls.LoadX509KeyPair(r.certPath, r.keyPath) | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| caCert, err := os.ReadFile(r.caPath) | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| caCertPool := x509.NewCertPool() | ||
| caCertPool.AppendCertsFromPEM(caCert) | ||
|
|
||
| r.mu.Lock() | ||
| r.cert = &cert | ||
| r.clientCAs = caCertPool | ||
| r.mu.Unlock() | ||
|
|
||
| r.logger.Info("Certificates reloaded successfully", | ||
| "certPath", r.certPath, | ||
| "keyPath", r.keyPath, | ||
| "caPath", r.caPath) | ||
|
|
||
| return nil | ||
| } | ||
|
|
||
| // GetCertificate returns the current server certificate for TLS handshakes. | ||
| // This is called by the TLS stack for each new connection. | ||
| func (r *CertificateReloader) GetCertificate(*tls.ClientHelloInfo) (*tls.Certificate, error) { | ||
| r.mu.RLock() | ||
| defer r.mu.RUnlock() | ||
| return r.cert, nil | ||
| } | ||
|
|
||
| // GetClientCAs returns the current CA certificate pool for client verification. | ||
| func (r *CertificateReloader) GetClientCAs() *x509.CertPool { | ||
| r.mu.RLock() | ||
| defer r.mu.RUnlock() | ||
| return r.clientCAs | ||
| } | ||
|
|
||
| // scheduleReload schedules a certificate reload after the debounce delay. | ||
| // If a reload is already scheduled, it resets the timer. | ||
| // | ||
| // To prevent timer starvation from continuous events, this implements a | ||
| // maximum debounce wait time. Even if events keep arriving, a reload is | ||
| // guaranteed to happen within maxDebounceWait from the first event. | ||
| func (r *CertificateReloader) scheduleReload() { | ||
|
swiatekm marked this conversation as resolved.
Outdated
|
||
| r.timerMu.Lock() | ||
| defer r.timerMu.Unlock() | ||
|
|
||
| now := time.Now() | ||
|
|
||
| // Track first event time if this is the start of a new debounce window | ||
| if r.firstEventTime == nil { | ||
| r.firstEventTime = &now | ||
| } | ||
|
|
||
| // Calculate how long until we must reload (max wait constraint) | ||
| timeSinceFirstEvent := now.Sub(*r.firstEventTime) | ||
| timeUntilMaxWait := r.maxDebounceWait - timeSinceFirstEvent | ||
|
|
||
| // Determine actual delay: use debounce delay, but cap at max wait time | ||
| var actualDelay time.Duration | ||
| if timeUntilMaxWait <= 0 { | ||
| // We've already waited the maximum time, reload immediately | ||
| actualDelay = 0 | ||
| } else if timeUntilMaxWait < r.debounceDelay { | ||
| // We're close to the max wait time, use remaining time | ||
| actualDelay = timeUntilMaxWait | ||
| } else { | ||
| // Normal case: use standard debounce delay | ||
| actualDelay = r.debounceDelay | ||
| } | ||
|
|
||
| // Stop existing timer if present | ||
| if r.reloadTimer != nil { | ||
| // Stop existing timer and drain channel if it already fired | ||
| if !r.reloadTimer.Stop() { | ||
|
swiatekm marked this conversation as resolved.
Outdated
|
||
| select { | ||
| case <-r.reloadTimer.C: | ||
| default: | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Schedule reload with calculated delay | ||
| r.reloadTimer = time.AfterFunc(actualDelay, func() { | ||
| // Send non-blocking notification | ||
| select { | ||
| case r.reloadNotify <- struct{}{}: | ||
| default: | ||
| // Channel already has a pending reload notification | ||
| } | ||
|
|
||
| // Reset first event time for next debounce window | ||
| r.timerMu.Lock() | ||
| r.firstEventTime = nil | ||
| r.timerMu.Unlock() | ||
| }) | ||
| } | ||
|
|
||
| // Watch starts watching the certificate files for changes and reloads them when modified. | ||
| // It blocks until the context is cancelled. | ||
| func (r *CertificateReloader) Watch(ctx context.Context) error { | ||
| watcher, err := fsnotify.NewWatcher() | ||
| if err != nil { | ||
| return err | ||
| } | ||
| defer watcher.Close() | ||
|
|
||
| // Collect all unique directories containing certificate files. | ||
| // In Kubernetes, secrets are mounted as symlinks that get updated atomically, | ||
| // so we need to watch the directories for changes. | ||
| // Certificate files may be in different directories. | ||
| dirs := make(map[string]struct{}) | ||
| dirs[filepath.Dir(r.certPath)] = struct{}{} | ||
| dirs[filepath.Dir(r.keyPath)] = struct{}{} | ||
| dirs[filepath.Dir(r.caPath)] = struct{}{} | ||
|
|
||
| // Add each unique directory to the watcher | ||
| for dir := range dirs { | ||
| if err := watcher.Add(dir); err != nil { | ||
| return err | ||
| } | ||
| r.logger.Info("Watching certificate directory for changes", "directory", dir) | ||
| } | ||
|
swiatekm marked this conversation as resolved.
Outdated
|
||
|
|
||
| for { | ||
| select { | ||
| case <-ctx.Done(): | ||
| r.logger.Info("Certificate watcher stopped") | ||
| return ctx.Err() | ||
|
|
||
| case event, ok := <-watcher.Events: | ||
| if !ok { | ||
| return nil | ||
| } | ||
|
|
||
| // In Kubernetes, secret updates create a new symlink target. | ||
| // We look for Create or Write events on any file in the directory. | ||
| if event.Op&(fsnotify.Create|fsnotify.Write) != 0 { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This implementation seems fine. In the contrib repo we had a race condition in one watcher impl https://github.com/open-telemetry/opentelemetry-collector-contrib/pull/44104/changes#diff-6e4e17a95db08bf6e5ca6307cea3be19b08d3f3efe62bd6808ec4ffec7871938L128
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the review and link to another implementation, @pavolloffay. This implementation looks at the cert directory instead of the actual files, so we should be good.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't we also need to check if the event involves the file we're interested in? We could have other files in this directory. Technically, this would just cause spurious reloads, so not a big deal, but if we can avoid them easily, we should do so.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added a debouncer since we're dealing with symlinks. |
||
| r.logger.Info("Certificate file change detected", "event", event) | ||
| r.scheduleReload() | ||
| } | ||
|
|
||
| case <-r.reloadNotify: | ||
| if err := r.Reload(); err != nil { | ||
| r.logger.Error(err, "Failed to reload certificates") | ||
| // Continue watching, don't exit on reload failure | ||
| } | ||
|
|
||
| case err, ok := <-watcher.Errors: | ||
| if !ok { | ||
| return nil | ||
| } | ||
| r.logger.Error(err, "Certificate watcher error") | ||
| } | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.