Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
981e9df
fix(target-allocator): add TLS certificate hot-reload for mTLS
CharlieTLe Jan 9, 2026
da2f8a2
Merge branch 'main' into ta-mtls-bug
CharlieTLe Jan 13, 2026
577a970
fix(target-allocator): debounce TLS certificate reloads
CharlieTLe Jan 14, 2026
dab147a
feat(target-allocator): support TLS cert files in different directories
CharlieTLe Jan 14, 2026
d2d4425
fix(tests): use filesystem events instead of manually calling schedul…
CharlieTLe Jan 15, 2026
83296e2
Merge branch 'main' into ta-mtls-bug
CharlieTLe Jan 15, 2026
6d77805
test(target-allocator): verify certificate changes instead of reload …
CharlieTLe Jan 15, 2026
6575064
fix(target-allocator): prevent timer starvation in TLS cert reload de…
CharlieTLe Jan 15, 2026
36815ac
Merge branch 'main' into ta-mtls-bug
CharlieTLe Jan 16, 2026
98924f9
fix linting
CharlieTLe Jan 16, 2026
e2fc078
Merge branch 'main' into ta-mtls-bug
CharlieTLe Jan 16, 2026
4a612f6
Use certwatcher from controller-runtime to handle cert rotation
CharlieTLe Jan 23, 2026
4296989
Merge branch 'main' into ta-mtls-bug
CharlieTLe Jan 23, 2026
2d115e5
Remove TestCAReloader_ConcurrentAccess
CharlieTLe Jan 23, 2026
d507cbb
Lock CAReloader.Reload() to prevent potential race condition
CharlieTLe Jan 23, 2026
33a728e
Merge branch 'main' into ta-mtls-bug
CharlieTLe Jan 23, 2026
81d511e
Fix mTLS certificate verification to only verify leaf certificate
CharlieTLe Jan 23, 2026
af68589
Update go.mod
CharlieTLe Jan 23, 2026
b86ce3e
Run make fmt vet lint
CharlieTLe Jan 23, 2026
6a9b4d5
Merge branch 'main' into ta-mtls-bug
CharlieTLe Jan 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .chloggen/fix-ta-tls-cert-hot-reload.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: bug_fix

# The name of the component, or a single word describing the area of concern, (e.g. collector, target allocator, auto-instrumentation, opamp, github action)
component: target allocator

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Fix TLS certificate hot-reload for mTLS connections

# One or more tracking issues related to the change
issues: [4368]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext: |
  The Target Allocator now automatically reloads TLS certificates when they are renewed
  by cert-manager. Previously, certificate renewals required a pod restart because
  certificates were only loaded once at startup. The fix uses the fsnotify-based
  certwatcher from controller-runtime to watch the server certificate and key files,
  serves the current certificate via the GetCertificate callback, and reloads the
  client CA pool alongside it, enabling seamless certificate rotation without downtime.
67 changes: 67 additions & 0 deletions cmd/otel-allocator/internal/config/ca_reloader.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0

package config

import (
"crypto/x509"
"fmt"
"os"
"sync"

"github.com/go-logr/logr"
)

// CAReloader keeps an in-memory pool of client CA certificates loaded from a
// PEM file and refreshes it on demand through Reload, which is typically
// invoked from a cert watcher callback. Access to the pool is guarded by an
// RWMutex so it can be read while reloads happen elsewhere.
type CAReloader struct {
	logger logr.Logger
	caPath string // location of the PEM-encoded CA file on disk

	mu        sync.RWMutex   // guards clientCAs
	clientCAs *x509.CertPool // pool produced by the last successful Reload
}

// NewCAReloader builds a CAReloader for the CA file at caPath and performs the
// first load immediately, so a successfully returned reloader already holds a
// pool. If that initial load fails, its error is returned and no reloader is
// produced. The supplied logger is scoped with the name "ca-reloader".
func NewCAReloader(caPath string, logger logr.Logger) (*CAReloader, error) {
	reloader := new(CAReloader)
	reloader.caPath = caPath
	reloader.logger = logger.WithName("ca-reloader")

	err := reloader.Reload()
	if err != nil {
		return nil, err
	}
	return reloader, nil
}

// Reload re-reads the CA file from disk, parses it as PEM, and swaps the
// result in as the current client CA pool.
//
// The write lock is held for the whole operation, so concurrent Reload calls
// are serialized. The cached pool is only replaced after the file has been
// read and parsed successfully; on any error the previous pool stays in place
// and the error is returned.
func (r *CAReloader) Reload() error {
	r.mu.Lock()
	defer r.mu.Unlock()

	pemBytes, readErr := os.ReadFile(r.caPath)
	if readErr != nil {
		return fmt.Errorf("failed to read CA certificate: %w", readErr)
	}

	pool := x509.NewCertPool()
	// AppendCertsFromPEM reports false when no certificate could be parsed.
	if ok := pool.AppendCertsFromPEM(pemBytes); !ok {
		return fmt.Errorf("failed to parse CA certificate at %s", r.caPath)
	}

	r.clientCAs = pool
	r.logger.Info("CA certificate reloaded successfully", "caPath", r.caPath)
	return nil
}

// GetClientCAs returns the CA pool stored by the most recent successful
// Reload. The pointer is read under the read lock, so callers such as TLS
// handshake verification may invoke it concurrently with Reload.
func (r *CAReloader) GetClientCAs() *x509.CertPool {
	r.mu.RLock()
	pool := r.clientCAs
	r.mu.RUnlock()
	return pool
}
69 changes: 69 additions & 0 deletions cmd/otel-allocator/internal/config/ca_reloader_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0

package config

import (
"os"
"path/filepath"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
ctrl "sigs.k8s.io/controller-runtime"
)

// TestCAReloader_Reload checks that Reload replaces the cached CA pool after
// the CA file on disk is overwritten with a different certificate.
func TestCAReloader_Reload(t *testing.T) {
	tmpDir := t.TempDir()
	caPath := filepath.Join(tmpDir, "ca.crt")

	// Generate initial CA certificate
	caPEM1, _ := generateTestCertificate(t)
	require.NoError(t, os.WriteFile(caPath, caPEM1, 0600))

	logger := ctrl.Log.WithName("test")
	reloader, err := NewCAReloader(caPath, logger)
	require.NoError(t, err)

	initialCA := reloader.GetClientCAs()
	require.NotNil(t, initialCA)

	// Generate new CA certificate. The comparison below is only meaningful if
	// the helper yields a different certificate on each call, so check that
	// precondition explicitly rather than assuming it.
	caPEM2, _ := generateTestCertificate(t)
	require.NotEqual(t, caPEM1, caPEM2, "test helper must produce distinct certificates")
	require.NoError(t, os.WriteFile(caPath, caPEM2, 0600))

	// Reload CA
	err = reloader.Reload()
	require.NoError(t, err)

	// Verify CA pool was updated: a non-nil pool alone would also be returned
	// if Reload had silently kept the old one, so compare pool contents.
	newCA := reloader.GetClientCAs()
	require.NotNil(t, newCA)
	assert.False(t, initialCA.Equal(newCA), "CA pool should differ after reloading a new CA certificate")
}

// TestCAReloader_InvalidCA checks that a failed Reload (unparseable CA file)
// returns an error and leaves the previously loaded pool in place.
func TestCAReloader_InvalidCA(t *testing.T) {
	caPath := filepath.Join(t.TempDir(), "ca.crt")

	// Start from a CA file that parses correctly.
	goodPEM, _ := generateTestCertificate(t)
	require.NoError(t, os.WriteFile(caPath, goodPEM, 0600))

	reloader, err := NewCAReloader(caPath, ctrl.Log.WithName("test"))
	require.NoError(t, err)

	poolBefore := reloader.GetClientCAs()
	require.NotNil(t, poolBefore)

	// Overwrite the file with bytes that are not PEM; the reload must fail.
	require.NoError(t, os.WriteFile(caPath, []byte("invalid"), 0600))
	require.Error(t, reloader.Reload())

	// The pool served after the failed reload is still the earlier one.
	assert.Equal(t, poolBefore, reloader.GetClientCAs())
}
60 changes: 48 additions & 12 deletions cmd/otel-allocator/internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
"k8s.io/client-go/util/homedir"
"k8s.io/klog/v2"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/certwatcher"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
)

Expand Down Expand Up @@ -388,27 +389,62 @@ func ValidateConfig(config *Config) error {
return nil
}

func (c HTTPSServerConfig) NewTLSConfig() (*tls.Config, error) {
cert, err := tls.LoadX509KeyPair(c.TLSCertFilePath, c.TLSKeyFilePath)
func (c HTTPSServerConfig) NewTLSConfig(logger logr.Logger) (*tls.Config, *certwatcher.CertWatcher, error) {
// Create certwatcher for server certificate/key reloading
certWatcher, err := certwatcher.New(c.TLSCertFilePath, c.TLSKeyFilePath)
if err != nil {
return nil, err
return nil, nil, fmt.Errorf("failed to create cert watcher: %w", err)
}

caCert, err := os.ReadFile(c.CAFilePath)
// Create CA reloader for client CA certificate reloading
caReloader, err := NewCAReloader(c.CAFilePath, logger)
if err != nil {
return nil, err
return nil, nil, fmt.Errorf("failed to create CA reloader: %w", err)
}

caCertPool := x509.NewCertPool()
caCertPool.AppendCertsFromPEM(caCert)
// Register callback to reload CA when server cert changes
// Since Kubernetes updates secrets atomically, the CA will be updated at the same time
Comment thread
swiatekm marked this conversation as resolved.
certWatcher.RegisterCallback(func(cert tls.Certificate) {
if reloadErr := caReloader.Reload(); reloadErr != nil {
logger.Error(reloadErr, "Failed to reload CA via callback")
}
})

tlsConfig := &tls.Config{
Certificates: []tls.Certificate{cert},
ClientAuth: tls.RequireAndVerifyClientCert,
ClientCAs: caCertPool,
MinVersion: tls.VersionTLS12,
GetCertificate: certWatcher.GetCertificate,
// Request client certificate but don't verify automatically
// We'll do custom verification in VerifyConnection with the dynamic CA pool
ClientAuth: tls.RequestClientCert,
MinVersion: tls.VersionTLS12,
// Use VerifyConnection for dynamic CA pool access
// This allows the CA pool to be reloaded at runtime
VerifyConnection: func(cs tls.ConnectionState) error {
// Require client certificate
if len(cs.PeerCertificates) == 0 {
return fmt.Errorf("no client certificate provided")
}

// Verify using current CA pool (which can be reloaded)
opts := x509.VerifyOptions{
Roots: caReloader.GetClientCAs(),
Intermediates: x509.NewCertPool(),
KeyUsages: []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth},
}

// Add intermediate certificates to the pool
for _, cert := range cs.PeerCertificates[1:] {
opts.Intermediates.AddCert(cert)
}

// Verify only the leaf certificate
if _, err := cs.PeerCertificates[0].Verify(opts); err != nil {
return fmt.Errorf("client certificate verification failed: %w", err)
}
return nil
},
}
return tlsConfig, nil

return tlsConfig, certWatcher, nil
}

// GetAllowDenyLists returns the allow and deny lists as maps. If the allow list is empty, it defaults to all namespaces.
Expand Down
Loading
Loading