Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
981e9df
fix(target-allocator): add TLS certificate hot-reload for mTLS
CharlieTLe Jan 9, 2026
da2f8a2
Merge branch 'main' into ta-mtls-bug
CharlieTLe Jan 13, 2026
577a970
fix(target-allocator): debounce TLS certificate reloads
CharlieTLe Jan 14, 2026
dab147a
feat(target-allocator): support TLS cert files in different directories
CharlieTLe Jan 14, 2026
d2d4425
fix(tests): use filesystem events instead of manually calling schedul…
CharlieTLe Jan 15, 2026
83296e2
Merge branch 'main' into ta-mtls-bug
CharlieTLe Jan 15, 2026
6d77805
test(target-allocator): verify certificate changes instead of reload …
CharlieTLe Jan 15, 2026
6575064
fix(target-allocator): prevent timer starvation in TLS cert reload de…
CharlieTLe Jan 15, 2026
36815ac
Merge branch 'main' into ta-mtls-bug
CharlieTLe Jan 16, 2026
98924f9
fix linting
CharlieTLe Jan 16, 2026
e2fc078
Merge branch 'main' into ta-mtls-bug
CharlieTLe Jan 16, 2026
4a612f6
Use certwatcher from controller-runtime to handle cert rotation
CharlieTLe Jan 23, 2026
4296989
Merge branch 'main' into ta-mtls-bug
CharlieTLe Jan 23, 2026
2d115e5
Remove TestCAReloader_ConcurrentAccess
CharlieTLe Jan 23, 2026
d507cbb
Lock CAReloader.Reload() to prevent potential race condition
CharlieTLe Jan 23, 2026
33a728e
Merge branch 'main' into ta-mtls-bug
CharlieTLe Jan 23, 2026
81d511e
Fix mTLS certificate verification to only verify leaf certificate
CharlieTLe Jan 23, 2026
af68589
Update go.mod
CharlieTLe Jan 23, 2026
b86ce3e
Run make fmt vet lint
CharlieTLe Jan 23, 2026
6a9b4d5
Merge branch 'main' into ta-mtls-bug
CharlieTLe Jan 26, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .chloggen/fix-ta-tls-cert-hot-reload.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: bug_fix

# The name of the component, or a single word describing the area of concern, (e.g. collector, target allocator, auto-instrumentation, opamp, github action)
component: target allocator

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Fix TLS certificate hot-reload for mTLS connections

# One or more tracking issues related to the change
issues: [4368]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext: |
  The Target Allocator now automatically reloads TLS certificates when they are renewed
  by cert-manager. Previously, certificate renewals required a pod restart because
  certificates were only loaded once at startup. The fix uses fsnotify to watch the
  certificate directory and dynamically reloads certificates via the GetCertificate
  callback, enabling seamless certificate rotation without downtime.
25 changes: 8 additions & 17 deletions cmd/otel-allocator/internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ package config

import (
"crypto/tls"
"crypto/x509"
"errors"
"fmt"
"io/fs"
Expand Down Expand Up @@ -388,27 +387,19 @@ func ValidateConfig(config *Config) error {
return nil
}

func (c HTTPSServerConfig) NewTLSConfig() (*tls.Config, error) {
cert, err := tls.LoadX509KeyPair(c.TLSCertFilePath, c.TLSKeyFilePath)
func (c HTTPSServerConfig) NewTLSConfig(logger logr.Logger) (*tls.Config, *CertificateReloader, error) {
reloader, err := NewCertificateReloader(c.TLSCertFilePath, c.TLSKeyFilePath, c.CAFilePath, logger)
if err != nil {
return nil, err
}

caCert, err := os.ReadFile(c.CAFilePath)
if err != nil {
return nil, err
return nil, nil, err
}

caCertPool := x509.NewCertPool()
caCertPool.AppendCertsFromPEM(caCert)

tlsConfig := &tls.Config{
Certificates: []tls.Certificate{cert},
ClientAuth: tls.RequireAndVerifyClientCert,
ClientCAs: caCertPool,
MinVersion: tls.VersionTLS12,
GetCertificate: reloader.GetCertificate,
ClientAuth: tls.RequireAndVerifyClientCert,
ClientCAs: reloader.GetClientCAs(),
MinVersion: tls.VersionTLS12,
}
return tlsConfig, nil
return tlsConfig, reloader, nil
}

// GetAllowDenyLists returns the allow and deny lists as maps. If the allow list is empty, it defaults to all namespaces.
Expand Down
221 changes: 221 additions & 0 deletions cmd/otel-allocator/internal/config/tls_reloader.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0

package config

import (
"context"
"crypto/tls"
"crypto/x509"
"os"
"path/filepath"
"sync"
"time"

"github.com/fsnotify/fsnotify"
"github.com/go-logr/logr"
)

// CertificateReloader watches certificate files and reloads them on change.
// It provides dynamic certificate reloading for TLS servers without restart.
//
// The cached certificate and CA pool are populated by Reload and handed out by
// GetCertificate and GetClientCAs. File-system events observed by Watch are
// debounced through scheduleReload before a reload is triggered.
type CertificateReloader struct {
// certPath and keyPath are the files passed to tls.LoadX509KeyPair by Reload.
certPath string
keyPath string
// caPath is the PEM file Reload reads to build the client CA pool.
caPath string
// cert is the server certificate cached by the last successful Reload.
cert *tls.Certificate
// clientCAs is the CA pool cached by the last successful Reload.
clientCAs *x509.CertPool
// mu guards cert and clientCAs.
mu sync.RWMutex
// logger is the caller's logger with the name "cert-reloader" appended.
logger logr.Logger
// debounceDelay is how long scheduleReload waits after an event before
// signalling a reload.
debounceDelay time.Duration
// maxDebounceWait caps how long a reload can be postponed, measured from
// the first event of the current debounce window.
maxDebounceWait time.Duration
// reloadTimer is the most recent timer created by scheduleReload; nil until
// the first event is scheduled.
reloadTimer *time.Timer
// firstEventTime is when the current debounce window started; nil when no
// window is open.
firstEventTime *time.Time
// timerMu guards reloadTimer and firstEventTime.
timerMu sync.Mutex
// reloadNotify carries the "reload now" signal from the debounce timer to
// the Watch loop. It is created with capacity 1 and written to without
// blocking.
reloadNotify chan struct{}
}

// Debounce settings that NewCertificateReloader applies to every reloader.
const (
	// defaultDebounceDelay is the quiet period after a file-system event
	// before a reload is signalled.
	defaultDebounceDelay = 100 * time.Millisecond

	// defaultMaxDebounceWait is the longest a reload may be postponed by a
	// continuous stream of events, measured from the first event.
	defaultMaxDebounceWait = time.Second
)

// NewCertificateReloader creates a new CertificateReloader and loads the initial certificates.
func NewCertificateReloader(certPath, keyPath, caPath string, logger logr.Logger) (*CertificateReloader, error) {
Comment thread
swiatekm marked this conversation as resolved.
Outdated
r := &CertificateReloader{
certPath: certPath,
keyPath: keyPath,
caPath: caPath,
logger: logger.WithName("cert-reloader"),
debounceDelay: defaultDebounceDelay,
maxDebounceWait: defaultMaxDebounceWait,
reloadNotify: make(chan struct{}, 1),
firstEventTime: nil,
}

if err := r.Reload(); err != nil {
return nil, err
}

return r, nil
}

// Reload reads the certificate, key, and CA files from disk and replaces the
// cached server certificate and client CA pool with the freshly loaded ones.
//
// The cache is only updated when everything loads successfully; on any error
// the previously cached values stay in place and the error is returned:
//   - the error from tls.LoadX509KeyPair if the certificate/key pair is invalid,
//   - the error from os.ReadFile if the CA file cannot be read,
//   - an *os.PathError wrapping os.ErrInvalid if the CA file contains no
//     parsable PEM certificate.
func (r *CertificateReloader) Reload() error {
	cert, err := tls.LoadX509KeyPair(r.certPath, r.keyPath)
	if err != nil {
		return err
	}

	caPEM, err := os.ReadFile(r.caPath)
	if err != nil {
		return err
	}

	caCertPool := x509.NewCertPool()
	// AppendCertsFromPEM reports whether at least one certificate was parsed.
	// Ignoring a false result would install an empty pool, which makes every
	// client certificate fail verification without any hint as to why.
	if !caCertPool.AppendCertsFromPEM(caPEM) {
		return &os.PathError{Op: "parse CA certificates", Path: r.caPath, Err: os.ErrInvalid}
	}

	// Swap both values under one write lock so readers never observe a new
	// certificate paired with an old CA pool (or vice versa).
	r.mu.Lock()
	r.cert = &cert
	r.clientCAs = caCertPool
	r.mu.Unlock()

	r.logger.Info("Certificates reloaded successfully",
		"certPath", r.certPath,
		"keyPath", r.keyPath,
		"caPath", r.caPath)

	return nil
}

// GetCertificate hands back the server certificate cached by the most recent
// successful Reload. Its signature matches tls.Config.GetCertificate, so the
// TLS stack can call it during handshakes; the ClientHelloInfo argument is
// ignored and the returned error is always nil.
func (r *CertificateReloader) GetCertificate(_ *tls.ClientHelloInfo) (*tls.Certificate, error) {
	r.mu.RLock()
	current := r.cert
	r.mu.RUnlock()

	return current, nil
}

// GetClientCAs hands back the client CA pool cached by the most recent
// successful Reload. The returned pointer is the pool as of this call; a later
// Reload stores a new pool rather than modifying the one returned here.
func (r *CertificateReloader) GetClientCAs() *x509.CertPool {
	r.mu.RLock()
	pool := r.clientCAs
	r.mu.RUnlock()

	return pool
}

// scheduleReload schedules a certificate reload after the debounce delay.
// If a reload is already scheduled, it resets the timer.
//
// To prevent timer starvation from continuous events, this implements a
// maximum debounce wait time. Even if events keep arriving, a reload is
// guaranteed to happen within maxDebounceWait from the first event.
func (r *CertificateReloader) scheduleReload() {
Comment thread
swiatekm marked this conversation as resolved.
Outdated
r.timerMu.Lock()
defer r.timerMu.Unlock()

now := time.Now()

// Track first event time if this is the start of a new debounce window
if r.firstEventTime == nil {
r.firstEventTime = &now
}

// Calculate how long until we must reload (max wait constraint)
timeSinceFirstEvent := now.Sub(*r.firstEventTime)
timeUntilMaxWait := r.maxDebounceWait - timeSinceFirstEvent

// Determine actual delay: use debounce delay, but cap at max wait time
var actualDelay time.Duration
if timeUntilMaxWait <= 0 {
// We've already waited the maximum time, reload immediately
actualDelay = 0
} else if timeUntilMaxWait < r.debounceDelay {
// We're close to the max wait time, use remaining time
actualDelay = timeUntilMaxWait
} else {
// Normal case: use standard debounce delay
actualDelay = r.debounceDelay
}

// Stop existing timer if present
if r.reloadTimer != nil {
// Stop existing timer and drain channel if it already fired
if !r.reloadTimer.Stop() {
Comment thread
swiatekm marked this conversation as resolved.
Outdated
select {
case <-r.reloadTimer.C:
default:
}
}
}

// Schedule reload with calculated delay
r.reloadTimer = time.AfterFunc(actualDelay, func() {
// Send non-blocking notification
select {
case r.reloadNotify <- struct{}{}:
default:
// Channel already has a pending reload notification
}

// Reset first event time for next debounce window
r.timerMu.Lock()
r.firstEventTime = nil
r.timerMu.Unlock()
})
}

// Watch watches the directories that contain the certificate, key, and CA
// files and triggers a debounced Reload when files in them are created or
// written.
//
// It blocks until one of the following happens:
//   - ctx is done, in which case ctx.Err() is returned;
//   - the watcher's Events or Errors channel is closed, in which case nil is
//     returned.
//
// An error from creating the watcher or from adding a directory to it is
// returned immediately. Reload failures and watcher errors are logged and do
// not stop the loop.
func (r *CertificateReloader) Watch(ctx context.Context) error {
watcher, err := fsnotify.NewWatcher()
if err != nil {
return err
}
defer watcher.Close()

// Collect all unique directories containing certificate files.
// In Kubernetes, secrets are mounted as symlinks that get updated atomically,
// so we need to watch the directories for changes.
// Certificate files may be in different directories.
// Using a map as a set de-duplicates directories shared by several files.
dirs := make(map[string]struct{})
dirs[filepath.Dir(r.certPath)] = struct{}{}
dirs[filepath.Dir(r.keyPath)] = struct{}{}
dirs[filepath.Dir(r.caPath)] = struct{}{}

// Add each unique directory to the watcher
for dir := range dirs {
if err := watcher.Add(dir); err != nil {
return err
}
r.logger.Info("Watching certificate directory for changes", "directory", dir)
}
Comment thread
swiatekm marked this conversation as resolved.
Outdated

// Event loop: runs until ctx is done or a watcher channel is closed.
for {
select {
case <-ctx.Done():
r.logger.Info("Certificate watcher stopped")
return ctx.Err()

case event, ok := <-watcher.Events:
// ok is false once the Events channel has been closed.
if !ok {
return nil
}

// In Kubernetes, secret updates create a new symlink target.
// We look for Create or Write events on any file in the directory.
// The event's file name is not inspected, so unrelated files in a watched
// directory also schedule a reload; scheduleReload debounces these.
if event.Op&(fsnotify.Create|fsnotify.Write) != 0 {

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the review and link to another implementation, @pavolloffay. This implementation looks at the cert directory instead of the actual files, so we should be good.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we also need to check if the event involves the file we're interested in? We could have other files in this directory. Technically, this would just cause spurious reloads, so not a big deal, but if we can avoid them easily, we should do so.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a debouncer since we're dealing with symlinks.

r.logger.Info("Certificate file change detected", "event", event)
r.scheduleReload()
}

// reloadNotify is signalled by the timer armed in scheduleReload.
case <-r.reloadNotify:
if err := r.Reload(); err != nil {
r.logger.Error(err, "Failed to reload certificates")
// Continue watching, don't exit on reload failure
}

case err, ok := <-watcher.Errors:
// ok is false once the Errors channel has been closed.
if !ok {
return nil
}
r.logger.Error(err, "Certificate watcher error")
}
}
}
Loading
Loading