-
Notifications
You must be signed in to change notification settings - Fork 631
fix(target-allocator): add TLS certificate hot-reload for mTLS #4597
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
981e9df
da2f8a2
577a970
dab147a
d2d4425
83296e2
6d77805
6575064
36815ac
98924f9
e2fc078
4a612f6
4296989
2d115e5
d507cbb
33a728e
81d511e
af68589
b86ce3e
6a9b4d5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,21 @@ | ||
| # One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' | ||
| change_type: bug_fix | ||
|
|
||
| # The name of the component, or a single word describing the area of concern, (e.g. collector, target allocator, auto-instrumentation, opamp, github action) | ||
| component: target allocator | ||
|
|
||
| # A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). | ||
| note: Fix TLS certificate hot-reload for mTLS connections | ||
|
|
||
| # One or more tracking issues related to the change | ||
| issues: [4368] | ||
|
|
||
| # (Optional) One or more lines of additional information to render under the primary note. | ||
| # These lines will be padded with 2 spaces and then inserted directly into the document. | ||
| # Use pipe (|) for multiline entries. | ||
| subtext: | | ||
| The Target Allocator now automatically reloads TLS certificates when they are renewed | ||
| by cert-manager. Previously, certificate renewals required a pod restart because | ||
| certificates were only loaded once at startup. The fix uses fsnotify to watch the | ||
| certificate directory and dynamically reloads certificates via the GetCertificate | ||
| callback, enabling seamless certificate rotation without downtime. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,135 @@ | ||
| // Copyright The OpenTelemetry Authors | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| package config | ||
|
|
||
| import ( | ||
| "context" | ||
| "crypto/tls" | ||
| "crypto/x509" | ||
| "os" | ||
| "path/filepath" | ||
| "sync" | ||
|
|
||
| "github.com/fsnotify/fsnotify" | ||
| "github.com/go-logr/logr" | ||
| ) | ||
|
|
||
| // CertificateReloader watches certificate files and reloads them on change. | ||
| // It provides dynamic certificate reloading for TLS servers without restart. | ||
| type CertificateReloader struct { | ||
| certPath string | ||
| keyPath string | ||
| caPath string | ||
| cert *tls.Certificate | ||
| clientCAs *x509.CertPool | ||
| mu sync.RWMutex | ||
| logger logr.Logger | ||
| } | ||
|
|
||
| // NewCertificateReloader creates a new CertificateReloader and loads the initial certificates. | ||
| func NewCertificateReloader(certPath, keyPath, caPath string, logger logr.Logger) (*CertificateReloader, error) { | ||
| r := &CertificateReloader{ | ||
| certPath: certPath, | ||
| keyPath: keyPath, | ||
| caPath: caPath, | ||
| logger: logger.WithName("cert-reloader"), | ||
| } | ||
|
|
||
| if err := r.Reload(); err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| return r, nil | ||
| } | ||
|
|
||
| // Reload reads the certificate files from disk and updates the cached certificates. | ||
| func (r *CertificateReloader) Reload() error { | ||
| cert, err := tls.LoadX509KeyPair(r.certPath, r.keyPath) | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| caCert, err := os.ReadFile(r.caPath) | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| caCertPool := x509.NewCertPool() | ||
| caCertPool.AppendCertsFromPEM(caCert) | ||
|
|
||
| r.mu.Lock() | ||
| r.cert = &cert | ||
| r.clientCAs = caCertPool | ||
| r.mu.Unlock() | ||
|
|
||
| r.logger.Info("Certificates reloaded successfully", | ||
| "certPath", r.certPath, | ||
| "keyPath", r.keyPath, | ||
| "caPath", r.caPath) | ||
|
|
||
| return nil | ||
| } | ||
|
|
||
| // GetCertificate returns the current server certificate for TLS handshakes. | ||
| // This is called by the TLS stack for each new connection. | ||
| func (r *CertificateReloader) GetCertificate(*tls.ClientHelloInfo) (*tls.Certificate, error) { | ||
| r.mu.RLock() | ||
| defer r.mu.RUnlock() | ||
| return r.cert, nil | ||
| } | ||
|
|
||
| // GetClientCAs returns the current CA certificate pool for client verification. | ||
| func (r *CertificateReloader) GetClientCAs() *x509.CertPool { | ||
| r.mu.RLock() | ||
| defer r.mu.RUnlock() | ||
| return r.clientCAs | ||
| } | ||
|
|
||
| // Watch starts watching the certificate files for changes and reloads them when modified. | ||
| // It blocks until the context is cancelled. | ||
| func (r *CertificateReloader) Watch(ctx context.Context) error { | ||
| watcher, err := fsnotify.NewWatcher() | ||
| if err != nil { | ||
| return err | ||
| } | ||
| defer watcher.Close() | ||
|
|
||
| // Watch the directory containing the certificates. | ||
| // In Kubernetes, secrets are mounted as symlinks that get updated atomically, | ||
| // so we need to watch the directory for changes. | ||
| certDir := filepath.Dir(r.certPath) | ||
| if err := watcher.Add(certDir); err != nil { | ||
| return err | ||
| } | ||
|
swiatekm marked this conversation as resolved.
Outdated
|
||
| r.logger.Info("Watching certificate directory for changes", "directory", certDir) | ||
|
|
||
| for { | ||
| select { | ||
| case <-ctx.Done(): | ||
| r.logger.Info("Certificate watcher stopped") | ||
| return ctx.Err() | ||
|
|
||
| case event, ok := <-watcher.Events: | ||
| if !ok { | ||
| return nil | ||
| } | ||
|
|
||
| // In Kubernetes, secret updates create a new symlink target. | ||
| // We look for Create or Write events on any file in the directory. | ||
| if event.Op&(fsnotify.Create|fsnotify.Write) != 0 { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This implementation seems fine. In the contrib repo we had a race condition in one watcher impl https://github.com/open-telemetry/opentelemetry-collector-contrib/pull/44104/changes#diff-6e4e17a95db08bf6e5ca6307cea3be19b08d3f3efe62bd6808ec4ffec7871938L128
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the review and link to another implementation, @pavolloffay. This implementation looks at the cert directory instead of the actual files, so we should be good.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't we also need to check if the event involves the file we're interested in? We could have other files in this directory. Technically, this would just cause spurious reloads, so not a big deal, but if we can avoid them easily, we should do so.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added a debouncer since we're dealing with symlinks. |
||
| r.logger.Info("Certificate file change detected", "event", event) | ||
| if err := r.Reload(); err != nil { | ||
| r.logger.Error(err, "Failed to reload certificates") | ||
| // Continue watching, don't exit on reload failure | ||
| } | ||
| } | ||
|
|
||
| case err, ok := <-watcher.Errors: | ||
| if !ok { | ||
| return nil | ||
| } | ||
| r.logger.Error(err, "Certificate watcher error") | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| apiVersion: batch/v1 | ||
| kind: Job | ||
| metadata: | ||
| name: capture-initial-cert-date | ||
| status: | ||
| succeeded: 1 | ||
| --- | ||
| apiVersion: v1 | ||
| kind: ConfigMap | ||
| metadata: | ||
| name: cert-date-tracker |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,110 @@ | ||
| # RBAC for cert date tracking Jobs | ||
| apiVersion: v1 | ||
| kind: ServiceAccount | ||
| metadata: | ||
| name: cert-tracker-sa | ||
| --- | ||
| apiVersion: rbac.authorization.k8s.io/v1 | ||
| kind: Role | ||
| metadata: | ||
| name: cert-tracker-role | ||
| rules: | ||
| - apiGroups: [""] | ||
| resources: ["configmaps"] | ||
| verbs: ["create", "get"] | ||
| --- | ||
| apiVersion: rbac.authorization.k8s.io/v1 | ||
| kind: RoleBinding | ||
| metadata: | ||
| name: cert-tracker-rolebinding | ||
| subjects: | ||
| - kind: ServiceAccount | ||
| name: cert-tracker-sa | ||
| roleRef: | ||
| kind: Role | ||
| name: cert-tracker-role | ||
| apiGroup: rbac.authorization.k8s.io | ||
| --- | ||
| # Capture initial certificate date before renewal | ||
| # This will be compared after cert renewal to verify rotation occurred | ||
| apiVersion: batch/v1 | ||
| kind: Job | ||
| metadata: | ||
| name: capture-initial-cert-date | ||
| spec: | ||
| template: | ||
| spec: | ||
| serviceAccountName: cert-tracker-sa | ||
| restartPolicy: OnFailure | ||
| containers: | ||
| - name: capture-cert | ||
| image: docker.io/nicolaka/netshoot:latest | ||
| command: | ||
| - /bin/sh | ||
| - -c | ||
| - | | ||
| set -e | ||
|
|
||
| echo "Attempting to connect to prometheus-cr-targetallocator:443..." | ||
|
|
||
| # Retry logic - target allocator might take a moment to be ready | ||
| MAX_RETRIES=30 | ||
| RETRY_INTERVAL=2 | ||
| CERT_DATE="" | ||
|
|
||
| for i in $(seq 1 $MAX_RETRIES); do | ||
| echo "Attempt $i/$MAX_RETRIES: Getting certificate from target allocator..." | ||
|
|
||
| # Capture both stdout and stderr for debugging | ||
| RAW_OUTPUT=$(echo | openssl s_client -connect prometheus-cr-targetallocator:443 -servername prometheus-cr-targetallocator 2>&1) || true | ||
|
|
||
| # Try to extract the certificate date | ||
| CERT_DATE=$(echo "$RAW_OUTPUT" | openssl x509 -noout -startdate 2>/dev/null | cut -d= -f2) || true | ||
|
|
||
| if [ -n "$CERT_DATE" ]; then | ||
| echo "Successfully retrieved certificate date" | ||
| break | ||
| fi | ||
|
|
||
| echo "Failed to get certificate. Raw output:" | ||
| echo "$RAW_OUTPUT" | head -20 | ||
| echo "---" | ||
|
|
||
| if [ $i -lt $MAX_RETRIES ]; then | ||
| echo "Retrying in ${RETRY_INTERVAL}s..." | ||
| sleep $RETRY_INTERVAL | ||
| fi | ||
| done | ||
|
|
||
| if [ -z "$CERT_DATE" ]; then | ||
| echo "ERROR: Failed to get certificate date after $MAX_RETRIES attempts" | ||
| echo "Please check that the target allocator is running and serving TLS" | ||
| exit 1 | ||
| fi | ||
|
|
||
| echo "Initial certificate notBefore date: $CERT_DATE" | ||
|
|
||
| # Store the date in a ConfigMap using the Kubernetes API | ||
| TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token) | ||
| NAMESPACE=$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace) | ||
|
|
||
| echo "Creating ConfigMap cert-date-tracker in namespace $NAMESPACE..." | ||
|
|
||
| # Create ConfigMap and capture response | ||
| RESPONSE=$(curl -s -k -X POST \ | ||
| -H "Authorization: Bearer $TOKEN" \ | ||
| -H "Content-Type: application/json" \ | ||
| -w "\n%{http_code}" \ | ||
| "https://kubernetes.default.svc/api/v1/namespaces/${NAMESPACE}/configmaps" \ | ||
| -d "{\"apiVersion\":\"v1\",\"kind\":\"ConfigMap\",\"metadata\":{\"name\":\"cert-date-tracker\"},\"data\":{\"initial-date\":\"${CERT_DATE}\"}}") | ||
|
|
||
| HTTP_CODE=$(echo "$RESPONSE" | tail -1) | ||
| BODY=$(echo "$RESPONSE" | sed '$d') | ||
|
|
||
| if [ "$HTTP_CODE" -ge 200 ] && [ "$HTTP_CODE" -lt 300 ]; then | ||
| echo "Successfully stored initial cert date in ConfigMap" | ||
| else | ||
| echo "ERROR: Failed to create ConfigMap. HTTP code: $HTTP_CODE" | ||
| echo "Response: $BODY" | ||
| exit 1 | ||
| fi |
Uh oh!
There was an error while loading. Please reload this page.