-
Notifications
You must be signed in to change notification settings - Fork 220
Bug 2117524: Update CRLs when they expire #828
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,10 +38,10 @@ var authorityKeyIdentifierOID = asn1.ObjectIdentifier{2, 5, 29, 35} | |
| // specifies a client CA certificate bundle in which any certificates specify | ||
| // any CRL distribution points. Returns a Boolean indicating whether the | ||
| // configmap exists, the configmap if it does exist, and an error value. | ||
| func (r *reconciler) ensureCRLConfigmap(ctx context.Context, ic *operatorv1.IngressController, namespace string, ownerRef metav1.OwnerReference, haveClientCA bool, clientCAConfigmap *corev1.ConfigMap) (bool, *corev1.ConfigMap, error) { | ||
| func (r *reconciler) ensureCRLConfigmap(ctx context.Context, ic *operatorv1.IngressController, namespace string, ownerRef metav1.OwnerReference, haveClientCA bool, clientCAConfigmap *corev1.ConfigMap) (bool, *corev1.ConfigMap, context.Context, error) { | ||
| haveCM, current, err := r.currentCRLConfigMap(ctx, ic) | ||
| if err != nil { | ||
| return false, nil, err | ||
| return false, nil, ctx, err | ||
| } | ||
|
|
||
| var oldCRLs map[string]*pkix.CertificateList | ||
|
|
@@ -60,45 +60,47 @@ func (r *reconciler) ensureCRLConfigmap(ctx context.Context, ic *operatorv1.Ingr | |
| if haveClientCA { | ||
| clientCABundleFilename := "ca-bundle.pem" | ||
| if data, ok := clientCAConfigmap.Data[clientCABundleFilename]; !ok { | ||
| return haveCM, current, fmt.Errorf("client CA configmap %s/%s is missing %q", clientCAConfigmap.Namespace, clientCAConfigmap.Name, clientCABundleFilename) | ||
| return haveCM, current, ctx, fmt.Errorf("client CA configmap %s/%s is missing %q", clientCAConfigmap.Namespace, clientCAConfigmap.Name, clientCABundleFilename) | ||
| } else { | ||
| clientCAData = []byte(data) | ||
| } | ||
| } | ||
|
|
||
| wantCM, desired, err := desiredCRLConfigMap(ic, ownerRef, clientCAData, oldCRLs) | ||
| wantCM, desired, ctx, err := desiredCRLConfigMap(ctx, ic, ownerRef, clientCAData, oldCRLs) | ||
| if err != nil { | ||
| return false, nil, fmt.Errorf("failed to build configmap: %w", err) | ||
| return false, nil, ctx, fmt.Errorf("failed to build configmap: %w", err) | ||
| } | ||
|
|
||
| switch { | ||
| case !wantCM && !haveCM: | ||
| return false, nil, nil | ||
| return false, nil, ctx, nil | ||
| case !wantCM && haveCM: | ||
| if err := r.client.Delete(ctx, current); err != nil { | ||
| if !errors.IsNotFound(err) { | ||
| return true, current, fmt.Errorf("failed to delete configmap: %w", err) | ||
| return true, current, ctx, fmt.Errorf("failed to delete configmap: %w", err) | ||
| } | ||
| } else { | ||
| log.Info("deleted configmap", "namespace", current.Namespace, "name", current.Name) | ||
| } | ||
| return false, nil, nil | ||
| return false, nil, ctx, nil | ||
| case wantCM && !haveCM: | ||
| if err := r.client.Create(ctx, desired); err != nil { | ||
| return false, nil, fmt.Errorf("failed to create configmap: %w", err) | ||
| return false, nil, ctx, fmt.Errorf("failed to create configmap: %w", err) | ||
| } | ||
| log.Info("created configmap", "namespace", desired.Namespace, "name", desired.Name) | ||
| return r.currentCRLConfigMap(ctx, ic) | ||
| exists, current, err := r.currentCRLConfigMap(ctx, ic) | ||
| return exists, current, ctx, err | ||
| case wantCM && haveCM: | ||
| if updated, err := r.updateCRLConfigMap(ctx, current, desired); err != nil { | ||
| return true, current, fmt.Errorf("failed to update configmap: %w", err) | ||
| return true, current, ctx, fmt.Errorf("failed to update configmap: %w", err) | ||
| } else if updated { | ||
| log.Info("updated configmap", "namespace", desired.Namespace, "name", desired.Name) | ||
| return r.currentCRLConfigMap(ctx, ic) | ||
| exists, current, err := r.currentCRLConfigMap(ctx, ic) | ||
| return exists, current, ctx, err | ||
| } | ||
| } | ||
|
|
||
| return true, current, nil | ||
| return true, current, ctx, nil | ||
| } | ||
|
|
||
| // buildCRLMap builds a map of key identifier to certificate list using the | ||
|
|
@@ -130,18 +132,20 @@ func buildCRLMap(crlData []byte) (map[string]*pkix.CertificateList, error) { | |
| } | ||
|
|
||
| // desiredCRLConfigMap returns the desired CRL configmap. Returns a Boolean | ||
| // indicating whether a configmap is desired, as well as the configmap if one is | ||
| // desired. | ||
| func desiredCRLConfigMap(ic *operatorv1.IngressController, ownerRef metav1.OwnerReference, clientCAData []byte, crls map[string]*pkix.CertificateList) (bool, *corev1.ConfigMap, error) { | ||
| // indicating whether a configmap is desired, the configmap if one is desired, | ||
| // the context (containing the next CRL update time as "nextCRLUpdate"), and an | ||
| // error if one occurred. | ||
| func desiredCRLConfigMap(ctx context.Context, ic *operatorv1.IngressController, ownerRef metav1.OwnerReference, clientCAData []byte, crls map[string]*pkix.CertificateList) (bool, *corev1.ConfigMap, context.Context, error) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I noticed we don't have unit testing for this function. Should we have a unit test or E2E test to verify this CRL update logic? Can we have static CAs in our test code that can trigger some of the logical paths? As far as expiration, I saw in CoreDNS they pass in
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I haven't included tests because I haven't had a good idea on how to automate testing it, since the certificates are time sensitive. My plan is to follow up later with an e2e test that generates certificates and CRLs at test run time, but this is a great idea for unit testing. |
||
| if len(ic.Spec.ClientTLS.ClientCertificatePolicy) == 0 || len(ic.Spec.ClientTLS.ClientCA.Name) == 0 { | ||
| return false, nil, nil | ||
| return false, nil, ctx, nil | ||
| } | ||
|
|
||
| if crls == nil { | ||
| crls = make(map[string]*pkix.CertificateList) | ||
| } | ||
|
|
||
| var subjectKeyIds []string | ||
| var nextCRLUpdate time.Time | ||
| now := time.Now() | ||
| for len(clientCAData) > 0 { | ||
| block, data := pem.Decode(clientCAData) | ||
|
|
@@ -151,7 +155,7 @@ func desiredCRLConfigMap(ic *operatorv1.IngressController, ownerRef metav1.Owner | |
| clientCAData = data | ||
| cert, err := x509.ParseCertificate(block.Bytes) | ||
| if err != nil { | ||
| return false, nil, fmt.Errorf("client CA configmap has an invalid certificate: %w", err) | ||
| return false, nil, ctx, fmt.Errorf("client CA configmap has an invalid certificate: %w", err) | ||
| } | ||
| subjectKeyId := hex.EncodeToString(cert.SubjectKeyId) | ||
| if len(cert.CRLDistributionPoints) == 0 { | ||
|
|
@@ -162,6 +166,9 @@ func desiredCRLConfigMap(ic *operatorv1.IngressController, ownerRef metav1.Owner | |
| log.Info("certificate revocation list has expired", "subject key identifier", subjectKeyId) | ||
| } else { | ||
| subjectKeyIds = append(subjectKeyIds, subjectKeyId) | ||
| if (nextCRLUpdate.IsZero() || crl.TBSCertList.NextUpdate.Before(nextCRLUpdate)) && crl.TBSCertList.NextUpdate.After(now) { | ||
| nextCRLUpdate = crl.TBSCertList.NextUpdate | ||
| } | ||
| continue | ||
| } | ||
| } | ||
|
|
@@ -170,29 +177,33 @@ func desiredCRLConfigMap(ic *operatorv1.IngressController, ownerRef metav1.Owner | |
| // Creating or updating the configmap with incomplete | ||
| // data would compromise security by potentially | ||
| // permitting revoked certificates. | ||
| return false, nil, fmt.Errorf("failed to get certificate revocation list for certificate key %s: %w", subjectKeyId, err) | ||
| return false, nil, ctx, fmt.Errorf("failed to get certificate revocation list for certificate key %s: %w", subjectKeyId, err) | ||
| } else { | ||
| crls[subjectKeyId] = crl | ||
| subjectKeyIds = append(subjectKeyIds, subjectKeyId) | ||
| log.Info("new certificate revocation list", "subject key identifier", subjectKeyId, "next update", crl.TBSCertList.NextUpdate.String()) | ||
| if (nextCRLUpdate.IsZero() || crl.TBSCertList.NextUpdate.Before(nextCRLUpdate)) && crl.TBSCertList.NextUpdate.After(now) { | ||
| nextCRLUpdate = crl.TBSCertList.NextUpdate | ||
| } | ||
| } | ||
| } | ||
|
|
||
| if len(subjectKeyIds) == 0 { | ||
| return false, nil, nil | ||
| return false, nil, ctx, nil | ||
| } | ||
|
|
||
| buf := &bytes.Buffer{} | ||
| for _, subjectKeyId := range subjectKeyIds { | ||
| asn1Data, err := asn1.Marshal(*crls[subjectKeyId]) | ||
| if err != nil { | ||
| return false, nil, fmt.Errorf("failed to encode ASN.1 for CRL for certificate key %s: %w", subjectKeyId, err) | ||
| return false, nil, ctx, fmt.Errorf("failed to encode ASN.1 for CRL for certificate key %s: %w", subjectKeyId, err) | ||
| } | ||
| block := &pem.Block{ | ||
| Type: "X509 CRL", | ||
| Bytes: asn1Data, | ||
| } | ||
| if err := pem.Encode(buf, block); err != nil { | ||
| return false, nil, fmt.Errorf("failed to encode PEM for CRL for certificate key %s: %w", subjectKeyId, err) | ||
| return false, nil, ctx, fmt.Errorf("failed to encode PEM for CRL for certificate key %s: %w", subjectKeyId, err) | ||
| } | ||
| } | ||
| crlData := buf.String() | ||
|
|
@@ -209,7 +220,7 @@ func desiredCRLConfigMap(ic *operatorv1.IngressController, ownerRef metav1.Owner | |
| } | ||
| crlConfigmap.SetOwnerReferences([]metav1.OwnerReference{ownerRef}) | ||
|
|
||
| return true, &crlConfigmap, nil | ||
| return true, &crlConfigmap, context.WithValue(ctx, "nextCRLUpdate", nextCRLUpdate), nil | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I've never seen context used like this, but that could just be because we've never needed to. What's the reason behind using context vs. returning a `nextCRLUpdate` variable? Is it because it's time/deadline related? Just curious, I don't have an opinion.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I went back and forth for a while on how to return Ideally, I think this is a reasonable compromise, where
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, I can buy that, sounds like you just want to decouple |
||
| } | ||
|
|
||
| // getCRL gets a certificate revocation list using the provided distribution | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a risk of this requeueing multiple times for the same next CRL update? I.e. on every reconcile, do we queue up this request? Is that okay?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thoughts?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thoughts @rfredette?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry it took me a while to get back to you on this one. I think that's a valid concern. I'm not sure at first glance what's the best way to solve it, but let me look into this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@gcs278 I've pushed a change that seems to fix this from my manual testing; please take a look when you get a chance and let me know what you think.
I added a global variable to track when the next CRL update is expected to be, and now the reconcile is only triggered when the next update as computed in `desiredCRLConfigMap` will be sooner than any already pending requeues.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are you worried that the queue may never be empty? I think it is all right. Is there some other potential problem with the logic as it was when you asked about it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think that's what I meant. I just noticed we queue up a request on every reconcile for the same next CRL update. That's not a big problem, just seemed a bit extraneous. We might have 10 or 20 requests that will all trigger at the same time for the same (or immediately after each other) nextCRLUpdate, depending on how many times the CRL was reconciled.
I don't know if there is an easy solution to say "this request is already in the queue, don't queue again". Just pointing it out more than anything.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was able to do a bit more testing on this, and I got the operator to requeue a reconcile for the same time 10 times, but when the time came to do the reconcile, it seems to execute the reconcile twice then stop retrying (at least for a bit; when the new CRL expired 5 minutes later it still re-reconciled). I'm not sure why it's twice; maybe there's a rate limit implemented somewhere, but it doesn't look like this will flood the operator.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I think we're all right here. I did a little more investigation:
`reconcileHandler` calls our `Reconcile` method (technically, it calls the controller's `Reconcile` method, but that ultimately results in ours being called) and requeues the request using `AddAfter` if the result has nil error and non-zero `RequeueAfter`: cluster-ingress-operator/vendor/sigs.k8s.io/controller-runtime/pkg/internal/controller/controller.go
Lines 320 to 333 in 362a3ec
`AddAfter` adds the item to the queue using `Add` or sends a `waitFor` to a channel: cluster-ingress-operator/vendor/k8s.io/client-go/util/workqueue/delaying_queue.go
Lines 161 to 179 in 362a3ec
The delaying queue's waiting loop takes the `waitFor` off the channel and adds it to its internal `waitingForQueue` queue using `insert` or adds it to the queue using `Add`: cluster-ingress-operator/vendor/k8s.io/client-go/util/workqueue/delaying_queue.go
Lines 253 to 257 in 362a3ec
`insert` checks whether there is an existing entry for the reconcile request; if there is one, it updates the existing entry in the queue instead of pushing a duplicate entry onto the queue: cluster-ingress-operator/vendor/k8s.io/client-go/util/workqueue/delaying_queue.go
Lines 267 to 277 in 362a3ec
The queue likewise has logic in `Add` to prevent adding duplicates: cluster-ingress-operator/vendor/k8s.io/client-go/util/workqueue/queue.go
Lines 119 to 137 in 362a3ec
Between Ryan's testing and my understanding of the queue and delaying_queue logic, I believe that there should be no issue with duplicate items in the queue or the delaying_queue's internal channel or queue.