Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
981e9df
fix(target-allocator): add TLS certificate hot-reload for mTLS
CharlieTLe Jan 9, 2026
da2f8a2
Merge branch 'main' into ta-mtls-bug
CharlieTLe Jan 13, 2026
577a970
fix(target-allocator): debounce TLS certificate reloads
CharlieTLe Jan 14, 2026
dab147a
feat(target-allocator): support TLS cert files in different directories
CharlieTLe Jan 14, 2026
d2d4425
fix(tests): use filesystem events instead of manually calling schedul…
CharlieTLe Jan 15, 2026
83296e2
Merge branch 'main' into ta-mtls-bug
CharlieTLe Jan 15, 2026
6d77805
test(target-allocator): verify certificate changes instead of reload …
CharlieTLe Jan 15, 2026
6575064
fix(target-allocator): prevent timer starvation in TLS cert reload de…
CharlieTLe Jan 15, 2026
36815ac
Merge branch 'main' into ta-mtls-bug
CharlieTLe Jan 16, 2026
98924f9
fix linting
CharlieTLe Jan 16, 2026
e2fc078
Merge branch 'main' into ta-mtls-bug
CharlieTLe Jan 16, 2026
4a612f6
Use certwatcher from controller-runtime to handle cert rotation
CharlieTLe Jan 23, 2026
4296989
Merge branch 'main' into ta-mtls-bug
CharlieTLe Jan 23, 2026
2d115e5
Remove TestCAReloader_ConcurrentAccess
CharlieTLe Jan 23, 2026
d507cbb
Lock CAReloader.Reload() to prevent potential race condition
CharlieTLe Jan 23, 2026
33a728e
Merge branch 'main' into ta-mtls-bug
CharlieTLe Jan 23, 2026
81d511e
Fix mTLS certificate verification to only verify leaf certificate
CharlieTLe Jan 23, 2026
af68589
Update go.mod
CharlieTLe Jan 23, 2026
b86ce3e
Run make fmt vet lint
CharlieTLe Jan 23, 2026
6a9b4d5
Merge branch 'main' into ta-mtls-bug
CharlieTLe Jan 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .chloggen/fix-ta-tls-cert-hot-reload.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: bug_fix

# The name of the component, or a single word describing the area of concern, (e.g. collector, target allocator, auto-instrumentation, opamp, github action)
component: target allocator

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Fix TLS certificate hot-reload for mTLS connections

# One or more tracking issues related to the change
issues: [4368]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext: |
The Target Allocator now automatically reloads TLS certificates when they are renewed
by cert-manager. Previously, certificate renewals required a pod restart because
certificates were only loaded once at startup. The fix uses fsnotify to watch the
certificate directory and dynamically reloads certificates via the GetCertificate
callback, enabling seamless certificate rotation without downtime.
25 changes: 8 additions & 17 deletions cmd/otel-allocator/internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ package config

import (
"crypto/tls"
"crypto/x509"
"errors"
"fmt"
"io/fs"
Expand Down Expand Up @@ -388,27 +387,19 @@ func ValidateConfig(config *Config) error {
return nil
}

func (c HTTPSServerConfig) NewTLSConfig() (*tls.Config, error) {
cert, err := tls.LoadX509KeyPair(c.TLSCertFilePath, c.TLSKeyFilePath)
func (c HTTPSServerConfig) NewTLSConfig(logger logr.Logger) (*tls.Config, *CertificateReloader, error) {
reloader, err := NewCertificateReloader(c.TLSCertFilePath, c.TLSKeyFilePath, c.CAFilePath, logger)
if err != nil {
return nil, err
}

caCert, err := os.ReadFile(c.CAFilePath)
if err != nil {
return nil, err
return nil, nil, err
}

caCertPool := x509.NewCertPool()
caCertPool.AppendCertsFromPEM(caCert)

tlsConfig := &tls.Config{
Certificates: []tls.Certificate{cert},
ClientAuth: tls.RequireAndVerifyClientCert,
ClientCAs: caCertPool,
MinVersion: tls.VersionTLS12,
GetCertificate: reloader.GetCertificate,
ClientAuth: tls.RequireAndVerifyClientCert,
ClientCAs: reloader.GetClientCAs(),
MinVersion: tls.VersionTLS12,
}
return tlsConfig, nil
return tlsConfig, reloader, nil
}

// GetAllowDenyLists returns the allow and deny lists as maps. If the allow list is empty, it defaults to all namespaces.
Expand Down
135 changes: 135 additions & 0 deletions cmd/otel-allocator/internal/config/tls_reloader.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0

package config

import (
	"context"
	"crypto/tls"
	"crypto/x509"
	"fmt"
	"os"
	"path/filepath"
	"sync"

	"github.com/fsnotify/fsnotify"
	"github.com/go-logr/logr"
)

// CertificateReloader watches certificate files and reloads them on change.
// It provides dynamic certificate reloading for TLS servers without restart.
//
// The server key pair and the client CA pool are cached in memory; Reload
// replaces both, and GetCertificate / GetClientCAs hand out the cached values.
type CertificateReloader struct {
// certPath is the server certificate file passed to tls.LoadX509KeyPair.
certPath string
// keyPath is the private key file passed to tls.LoadX509KeyPair.
keyPath string
// caPath is the PEM file whose certificates populate clientCAs.
caPath string
// cert is the most recently loaded server key pair; guarded by mu.
cert *tls.Certificate
// clientCAs is the most recently loaded client CA pool; guarded by mu.
clientCAs *x509.CertPool
// mu guards cert and clientCAs: Reload takes the write lock, the getters the read lock.
mu sync.RWMutex
// logger is the caller's logger, named "cert-reloader" by NewCertificateReloader.
logger logr.Logger
}

// NewCertificateReloader creates a new CertificateReloader and loads the initial certificates.
func NewCertificateReloader(certPath, keyPath, caPath string, logger logr.Logger) (*CertificateReloader, error) {
Comment thread
swiatekm marked this conversation as resolved.
Outdated
r := &CertificateReloader{
certPath: certPath,
keyPath: keyPath,
caPath: caPath,
logger: logger.WithName("cert-reloader"),
}

if err := r.Reload(); err != nil {
return nil, err
}

return r, nil
}

// Reload reads the certificate, key, and CA files from disk and, only when all
// of them load successfully, swaps them in as the cached server certificate
// and client CA pool.
//
// On any error the previously cached values are left untouched, so a failed
// reload never degrades a working configuration. Returned errors wrap the
// underlying cause and can still be inspected with errors.Is / errors.As.
func (r *CertificateReloader) Reload() error {
	cert, err := tls.LoadX509KeyPair(r.certPath, r.keyPath)
	if err != nil {
		return fmt.Errorf("loading key pair (cert %q, key %q): %w", r.certPath, r.keyPath, err)
	}

	caPEM, err := os.ReadFile(r.caPath)
	if err != nil {
		return fmt.Errorf("reading CA file %q: %w", r.caPath, err)
	}

	caCertPool := x509.NewCertPool()
	// AppendCertsFromPEM reports false when no certificate could be parsed.
	// Installing an empty pool would make every client handshake fail, so
	// treat that as a reload error and keep whatever pool is already cached.
	if !caCertPool.AppendCertsFromPEM(caPEM) {
		return fmt.Errorf("no valid PEM certificates found in CA file %q", r.caPath)
	}

	// Publish both values under a single write lock so readers never observe
	// a new certificate paired with an old CA pool (or vice versa).
	r.mu.Lock()
	r.cert = &cert
	r.clientCAs = caCertPool
	r.mu.Unlock()

	r.logger.Info("Certificates reloaded successfully",
		"certPath", r.certPath,
		"keyPath", r.keyPath,
		"caPath", r.caPath)

	return nil
}

// GetCertificate returns the server certificate most recently stored by
// Reload. Its signature matches the tls.Config.GetCertificate callback; the
// ClientHelloInfo argument is ignored and the returned error is always nil.
func (r *CertificateReloader) GetCertificate(*tls.ClientHelloInfo) (*tls.Certificate, error) {
	r.mu.RLock()
	current := r.cert
	r.mu.RUnlock()

	return current, nil
}

// GetClientCAs returns the client CA pool most recently stored by Reload.
//
// The result is a snapshot: Reload installs a freshly built pool rather than
// mutating the existing one, so a caller that keeps the returned pointer will
// not observe later reloads and must call GetClientCAs again to see them.
func (r *CertificateReloader) GetClientCAs() *x509.CertPool {
	r.mu.RLock()
	pool := r.clientCAs
	r.mu.RUnlock()

	return pool
}

// Watch starts watching the certificate files for changes and reloads them when modified.
// It blocks until the context is cancelled.
func (r *CertificateReloader) Watch(ctx context.Context) error {
watcher, err := fsnotify.NewWatcher()
if err != nil {
return err
}
defer watcher.Close()

// Watch the directory containing the certificates.
// In Kubernetes, secrets are mounted as symlinks that get updated atomically,
// so we need to watch the directory for changes.
certDir := filepath.Dir(r.certPath)
if err := watcher.Add(certDir); err != nil {
return err
}
Comment thread
swiatekm marked this conversation as resolved.
Outdated
r.logger.Info("Watching certificate directory for changes", "directory", certDir)

for {
select {
case <-ctx.Done():
r.logger.Info("Certificate watcher stopped")
return ctx.Err()

case event, ok := <-watcher.Events:
if !ok {
return nil
}

// In Kubernetes, secret updates create a new symlink target.
// We look for Create or Write events on any file in the directory.
if event.Op&(fsnotify.Create|fsnotify.Write) != 0 {

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the review and link to another implementation, @pavolloffay. This implementation looks at the cert directory instead of the actual files, so we should be good.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we also need to check if the event involves the file we're interested in? We could have other files in this directory. Technically, this would just cause spurious reloads, so not a big deal, but if we can avoid them easily, we should do so.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a debouncer since we're dealing with symlinks.

r.logger.Info("Certificate file change detected", "event", event)
if err := r.Reload(); err != nil {
r.logger.Error(err, "Failed to reload certificates")
// Continue watching, don't exit on reload failure
}
}

case err, ok := <-watcher.Errors:
if !ok {
return nil
}
r.logger.Error(err, "Certificate watcher error")
}
}
}
2 changes: 1 addition & 1 deletion cmd/otel-allocator/internal/server/server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ func TestServer_TargetsHandler(t *testing.T) {

func TestServer_ScrapeConfigsHandler(t *testing.T) {
svrConfig := allocatorconfig.HTTPSServerConfig{}
tlsConfig, _ := svrConfig.NewTLSConfig()
tlsConfig, _, _ := svrConfig.NewTLSConfig(logger)
tests := []struct {
description string
scrapeConfigs map[string]*promconfig.ScrapeConfig
Expand Down
18 changes: 17 additions & 1 deletion cmd/otel-allocator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package main

import (
"context"
"crypto/tls"
"fmt"
"os"
"os/signal"
Expand Down Expand Up @@ -45,6 +46,7 @@ func main() {
discoveryManager *discovery.Manager
collectorWatcher *collector.Watcher
targetDiscoverer *target.Discoverer
certReloader *config.CertificateReloader

discoveryCancel context.CancelFunc
runGroup run.Group
Expand Down Expand Up @@ -96,7 +98,9 @@ func main() {

httpOptions := []server.Option{}
if cfg.HTTPS.Enabled {
tlsConfig, confErr := cfg.HTTPS.NewTLSConfig()
var tlsConfig *tls.Config
var confErr error
tlsConfig, certReloader, confErr = cfg.HTTPS.NewTLSConfig(log)
if confErr != nil {
setupLog.Error(confErr, "Unable to initialize TLS configuration")
os.Exit(1)
Expand Down Expand Up @@ -226,6 +230,18 @@ func main() {
setupLog.Error(shutdownErr, "Error on HTTPS server shutdown")
}
})
// Start certificate watcher for hot-reload
certWatcherCtx, certWatcherCancel := context.WithCancel(ctx)
runGroup.Add(
func() error {
watchErr := certReloader.Watch(certWatcherCtx)
setupLog.Info("Certificate watcher exited")
return watchErr
},
func(_ error) {
setupLog.Info("Closing certificate watcher")
certWatcherCancel()
})
}
meter := otel.GetMeterProvider().Meter("targetallocator")
eventsMetric, err := meter.Int64Counter("opentelemetry_allocator_events", metric.WithDescription("Number of events in the channel."))
Expand Down
11 changes: 11 additions & 0 deletions tests/e2e-ta-collector-mtls/ta-collector-mtls/02b-assert.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
apiVersion: batch/v1
kind: Job
metadata:
name: capture-initial-cert-date
status:
succeeded: 1
---
apiVersion: v1
kind: ConfigMap
metadata:
name: cert-date-tracker
110 changes: 110 additions & 0 deletions tests/e2e-ta-collector-mtls/ta-collector-mtls/02b-install.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# RBAC for cert date tracking Jobs
apiVersion: v1
kind: ServiceAccount
metadata:
name: cert-tracker-sa
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: cert-tracker-role
rules:
- apiGroups: [""]
resources: ["configmaps"]
verbs: ["create", "get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: cert-tracker-rolebinding
subjects:
- kind: ServiceAccount
name: cert-tracker-sa
roleRef:
kind: Role
name: cert-tracker-role
apiGroup: rbac.authorization.k8s.io
---
# Capture initial certificate date before renewal
# This will be compared after cert renewal to verify rotation occurred
apiVersion: batch/v1
kind: Job
metadata:
name: capture-initial-cert-date
spec:
template:
spec:
serviceAccountName: cert-tracker-sa
restartPolicy: OnFailure
containers:
- name: capture-cert
image: docker.io/nicolaka/netshoot:latest
command:
- /bin/sh
- -c
- |
set -e

echo "Attempting to connect to prometheus-cr-targetallocator:443..."

# Retry logic - target allocator might take a moment to be ready
MAX_RETRIES=30
RETRY_INTERVAL=2
CERT_DATE=""

for i in $(seq 1 $MAX_RETRIES); do
echo "Attempt $i/$MAX_RETRIES: Getting certificate from target allocator..."

# Capture both stdout and stderr for debugging
RAW_OUTPUT=$(echo | openssl s_client -connect prometheus-cr-targetallocator:443 -servername prometheus-cr-targetallocator 2>&1) || true

# Try to extract the certificate date
CERT_DATE=$(echo "$RAW_OUTPUT" | openssl x509 -noout -startdate 2>/dev/null | cut -d= -f2) || true

if [ -n "$CERT_DATE" ]; then
echo "Successfully retrieved certificate date"
break
fi

echo "Failed to get certificate. Raw output:"
echo "$RAW_OUTPUT" | head -20
echo "---"

if [ $i -lt $MAX_RETRIES ]; then
echo "Retrying in ${RETRY_INTERVAL}s..."
sleep $RETRY_INTERVAL
fi
done

if [ -z "$CERT_DATE" ]; then
echo "ERROR: Failed to get certificate date after $MAX_RETRIES attempts"
echo "Please check that the target allocator is running and serving TLS"
exit 1
fi

echo "Initial certificate notBefore date: $CERT_DATE"

# Store the date in a ConfigMap using the Kubernetes API
TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
NAMESPACE=$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace)

echo "Creating ConfigMap cert-date-tracker in namespace $NAMESPACE..."

# Create ConfigMap and capture response
RESPONSE=$(curl -s -k -X POST \
-H "Authorization: Bearer $TOKEN" \
-H "Content-Type: application/json" \
-w "\n%{http_code}" \
"https://kubernetes.default.svc/api/v1/namespaces/${NAMESPACE}/configmaps" \
-d "{\"apiVersion\":\"v1\",\"kind\":\"ConfigMap\",\"metadata\":{\"name\":\"cert-date-tracker\"},\"data\":{\"initial-date\":\"${CERT_DATE}\"}}")

HTTP_CODE=$(echo "$RESPONSE" | tail -1)
BODY=$(echo "$RESPONSE" | sed '$d')

if [ "$HTTP_CODE" -ge 200 ] && [ "$HTTP_CODE" -lt 300 ]; then
echo "Successfully stored initial cert date in ConfigMap"
else
echo "ERROR: Failed to create ConfigMap. HTTP code: $HTTP_CODE"
echo "Response: $BODY"
exit 1
fi
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@ metadata:
status:
succeeded: 1
---
apiVersion: batch/v1
kind: Job
metadata:
name: verify-cert-rotation
status:
succeeded: 1
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
Expand Down
Loading
Loading