Skip to content

Commit fe56e19

Browse files
authored
Improve cluster deletion (#1864)
* Improve cluster deletion There were 3 issues: * the STS could get delete before deletion label propagated, leading to some pods terminating for a long time, because they didn't skip quorum checks * there was an unnecessary explicit STS deletion * addRabbitmqDeletionLabel was not filtering by namespace, potentially performing operations on wrong/too many pods (if there were multiple clusters with the same name in different namespaces) * Refactor to use CreateOrUpdate
1 parent 08d7993 commit fe56e19

File tree

4 files changed

+73
-59
lines changed

4 files changed

+73
-59
lines changed

controllers/rabbitmqcluster_controller_test.go

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ var _ = Describe("RabbitmqClusterController", func() {
4343

4444
var (
4545
cluster *rabbitmqv1beta1.RabbitmqCluster
46+
suffix string
4647
defaultNamespace = "default"
4748
)
4849

@@ -781,9 +782,11 @@ var _ = Describe("RabbitmqClusterController", func() {
781782
storageClassName = "my-storage-class"
782783
myStorage = k8sresource.MustParse("100Gi")
783784
q, _ = k8sresource.ParseQuantity("10Gi")
785+
suffix = fmt.Sprintf("-%d", time.Now().UnixNano())
786+
clusterName := "rabbitmq-sts-override" + suffix
784787
cluster = &rabbitmqv1beta1.RabbitmqCluster{
785788
ObjectMeta: metav1.ObjectMeta{
786-
Name: "rabbitmq-sts-override",
789+
Name: clusterName,
787790
Namespace: defaultNamespace,
788791
},
789792
Spec: rabbitmqv1beta1.RabbitmqClusterSpec{
@@ -797,7 +800,7 @@ var _ = Describe("RabbitmqClusterController", func() {
797800
Name: "persistence",
798801
Namespace: defaultNamespace,
799802
Labels: map[string]string{
800-
"app.kubernetes.io/name": "rabbitmq-sts-override",
803+
"app.kubernetes.io/name": clusterName,
801804
},
802805
Annotations: map[string]string{},
803806
},
@@ -815,7 +818,7 @@ var _ = Describe("RabbitmqClusterController", func() {
815818
Name: "disk-2",
816819
Namespace: defaultNamespace,
817820
Labels: map[string]string{
818-
"app.kubernetes.io/name": "rabbitmq-sts-override",
821+
"app.kubernetes.io/name": clusterName,
819822
},
820823
},
821824
Spec: corev1.PersistentVolumeClaimSpec{
@@ -872,14 +875,14 @@ var _ = Describe("RabbitmqClusterController", func() {
872875
defaultMode := int32(420)
873876

874877
Expect(sts.ObjectMeta.Labels).To(Equal(map[string]string{
875-
"app.kubernetes.io/name": "rabbitmq-sts-override",
878+
"app.kubernetes.io/name": "rabbitmq-sts-override" + suffix,
876879
"app.kubernetes.io/component": "rabbitmq",
877880
"app.kubernetes.io/part-of": "rabbitmq",
878881
}))
879882

880-
Expect(sts.Spec.ServiceName).To(Equal("rabbitmq-sts-override-nodes"))
883+
Expect(sts.Spec.ServiceName).To(Equal("rabbitmq-sts-override" + suffix + "-nodes"))
881884
Expect(sts.Spec.Selector.MatchLabels).To(Equal(map[string]string{
882-
"app.kubernetes.io/name": "rabbitmq-sts-override",
885+
"app.kubernetes.io/name": "rabbitmq-sts-override" + suffix,
883886
}))
884887

885888
Expect(len(sts.Spec.VolumeClaimTemplates)).To(Equal(2))
@@ -888,9 +891,9 @@ var _ = Describe("RabbitmqClusterController", func() {
888891
Expect(sts.Spec.VolumeClaimTemplates[0].ObjectMeta.Namespace).To(Equal("default"))
889892
Expect(sts.Spec.VolumeClaimTemplates[0].ObjectMeta.Labels).To(Equal(
890893
map[string]string{
891-
"app.kubernetes.io/name": "rabbitmq-sts-override",
894+
"app.kubernetes.io/name": "rabbitmq-sts-override" + suffix,
892895
}))
893-
Expect(sts.Spec.VolumeClaimTemplates[0].OwnerReferences[0].Name).To(Equal("rabbitmq-sts-override"))
896+
Expect(sts.Spec.VolumeClaimTemplates[0].OwnerReferences[0].Name).To(Equal("rabbitmq-sts-override" + suffix))
894897
Expect(sts.Spec.VolumeClaimTemplates[0].Spec).To(Equal(
895898
corev1.PersistentVolumeClaimSpec{
896899
AccessModes: []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce},
@@ -906,9 +909,9 @@ var _ = Describe("RabbitmqClusterController", func() {
906909
Expect(sts.Spec.VolumeClaimTemplates[1].ObjectMeta.Namespace).To(Equal("default"))
907910
Expect(sts.Spec.VolumeClaimTemplates[1].ObjectMeta.Labels).To(Equal(
908911
map[string]string{
909-
"app.kubernetes.io/name": "rabbitmq-sts-override",
912+
"app.kubernetes.io/name": "rabbitmq-sts-override" + suffix,
910913
}))
911-
Expect(sts.Spec.VolumeClaimTemplates[1].OwnerReferences[0].Name).To(Equal("rabbitmq-sts-override"))
914+
Expect(sts.Spec.VolumeClaimTemplates[1].OwnerReferences[0].Name).To(Equal("rabbitmq-sts-override" + suffix))
912915
Expect(sts.Spec.VolumeClaimTemplates[1].Spec).To(Equal(
913916
corev1.PersistentVolumeClaimSpec{
914917
VolumeMode: &volumeMode,
@@ -941,7 +944,7 @@ var _ = Describe("RabbitmqClusterController", func() {
941944
{
942945
ConfigMap: &corev1.ConfigMapProjection{
943946
LocalObjectReference: corev1.LocalObjectReference{
944-
Name: "rabbitmq-sts-override-server-conf",
947+
Name: "rabbitmq-sts-override" + suffix + "-server-conf",
945948
},
946949
Items: []corev1.KeyToPath{
947950
{
@@ -958,7 +961,7 @@ var _ = Describe("RabbitmqClusterController", func() {
958961
{
959962
Secret: &corev1.SecretProjection{
960963
LocalObjectReference: corev1.LocalObjectReference{
961-
Name: "rabbitmq-sts-override-default-user",
964+
Name: "rabbitmq-sts-override" + suffix + "-default-user",
962965
},
963966
Items: []corev1.KeyToPath{
964967
{
@@ -979,7 +982,7 @@ var _ = Describe("RabbitmqClusterController", func() {
979982
ConfigMap: &corev1.ConfigMapVolumeSource{
980983
DefaultMode: &defaultMode,
981984
LocalObjectReference: corev1.LocalObjectReference{
982-
Name: "rabbitmq-sts-override-plugins-conf",
985+
Name: "rabbitmq-sts-override" + suffix + "-plugins-conf",
983986
},
984987
},
985988
},
@@ -1002,7 +1005,7 @@ var _ = Describe("RabbitmqClusterController", func() {
10021005
VolumeSource: corev1.VolumeSource{
10031006
Secret: &corev1.SecretVolumeSource{
10041007
DefaultMode: &defaultMode,
1005-
SecretName: "rabbitmq-sts-override-erlang-cookie",
1008+
SecretName: "rabbitmq-sts-override" + suffix + "-erlang-cookie",
10061009
},
10071010
},
10081011
},

controllers/reconcile_finalizer.go

Lines changed: 42 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@ package controllers
33
import (
44
"context"
55
"fmt"
6+
"strings"
7+
"time"
68

79
rabbitmqv1beta1 "github.com/rabbitmq/cluster-operator/v2/api/v1beta1"
810
"github.com/rabbitmq/cluster-operator/v2/internal/resource"
9-
appsv1 "k8s.io/api/apps/v1"
1011
corev1 "k8s.io/api/core/v1"
11-
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1212
"k8s.io/apimachinery/pkg/labels"
1313
clientretry "k8s.io/client-go/util/retry"
1414
ctrl "sigs.k8s.io/controller-runtime"
@@ -21,48 +21,42 @@ const deletionFinalizer = "deletion.finalizers.rabbitmqclusters.rabbitmq.com"
2121
// addFinalizerIfNeeded adds a deletion finalizer if the RabbitmqCluster does not have one yet and is not marked for deletion
2222
func (r *RabbitmqClusterReconciler) addFinalizerIfNeeded(ctx context.Context, rabbitmqCluster *rabbitmqv1beta1.RabbitmqCluster) error {
2323
if rabbitmqCluster.ObjectMeta.DeletionTimestamp.IsZero() && !controllerutil.ContainsFinalizer(rabbitmqCluster, deletionFinalizer) {
24-
controllerutil.AddFinalizer(rabbitmqCluster, deletionFinalizer)
25-
if err := r.Client.Update(ctx, rabbitmqCluster); err != nil {
26-
return err
27-
}
24+
return clientretry.RetryOnConflict(clientretry.DefaultRetry, func() error {
25+
controllerutil.AddFinalizer(rabbitmqCluster, deletionFinalizer)
26+
return r.Client.Update(ctx, rabbitmqCluster)
27+
})
2828
}
2929
return nil
3030
}
3131

3232
func (r *RabbitmqClusterReconciler) removeFinalizer(ctx context.Context, rabbitmqCluster *rabbitmqv1beta1.RabbitmqCluster) error {
33-
controllerutil.RemoveFinalizer(rabbitmqCluster, deletionFinalizer)
34-
return r.Client.Update(ctx, rabbitmqCluster)
33+
currentRabbitmqCluster := &rabbitmqv1beta1.RabbitmqCluster{}
34+
currentRabbitmqCluster.Name = rabbitmqCluster.Name
35+
currentRabbitmqCluster.Namespace = rabbitmqCluster.Namespace
36+
37+
_, err := controllerutil.CreateOrUpdate(ctx, r.Client, currentRabbitmqCluster, func() error {
38+
controllerutil.RemoveFinalizer(currentRabbitmqCluster, deletionFinalizer)
39+
return nil
40+
})
41+
42+
if err != nil {
43+
ctrl.LoggerFrom(ctx).Error(err, "Failed to remove finalizer for deletion")
44+
return client.IgnoreNotFound(err)
45+
}
46+
47+
return nil
3548
}
3649

3750
func (r *RabbitmqClusterReconciler) prepareForDeletion(ctx context.Context, rabbitmqCluster *rabbitmqv1beta1.RabbitmqCluster) error {
3851
if controllerutil.ContainsFinalizer(rabbitmqCluster, deletionFinalizer) {
39-
if err := clientretry.RetryOnConflict(clientretry.DefaultRetry, func() error {
40-
uid, err := r.statefulSetUID(ctx, rabbitmqCluster)
41-
if err != nil {
42-
return err
43-
}
44-
sts := &appsv1.StatefulSet{
45-
ObjectMeta: metav1.ObjectMeta{
46-
Name: rabbitmqCluster.ChildResourceName("server"),
47-
Namespace: rabbitmqCluster.Namespace,
48-
},
49-
}
50-
// Add label on all Pods to be picked up in pre-stop hook via Downward API
51-
if err := r.addRabbitmqDeletionLabel(ctx, rabbitmqCluster); err != nil {
52-
return fmt.Errorf("failed to add deletion markers to RabbitmqCluster Pods: %w", err)
53-
}
54-
// Delete StatefulSet immediately after changing pod labels to minimize risk of them respawning.
55-
// There is a window where the StatefulSet could respawn Pods without the deletion label in this order.
56-
// But we can't delete it before because the DownwardAPI doesn't update once a Pod enters Terminating.
57-
// Addressing #648: if both rabbitmqCluster and the statefulSet returned by r.Get() are stale (and match each other),
58-
// setting the stale statefulSet's uid in the precondition can avoid mis-deleting any currently running statefulSet sharing the same name.
59-
if err := r.Client.Delete(ctx, sts, &client.DeleteOptions{Preconditions: &metav1.Preconditions{UID: &uid}}); client.IgnoreNotFound(err) != nil {
60-
return fmt.Errorf("cannot delete StatefulSet: %w", err)
61-
}
62-
63-
return nil
64-
}); err != nil {
65-
ctrl.LoggerFrom(ctx).Error(err, "RabbitmqCluster deletion")
52+
clientretry.RetryOnConflict(clientretry.DefaultRetry, func() error {
53+
return r.addRabbitmqDeletionLabel(ctx, rabbitmqCluster)
54+
})
55+
56+
// wait for up to 3 seconds for the labels to propagate
57+
timeout := time.Now().Add(3 * time.Second)
58+
for time.Now().Before(timeout) && !r.checkIfLabelPropagated(ctx, rabbitmqCluster) {
59+
time.Sleep(200 * time.Millisecond)
6660
}
6761

6862
if err := r.removeFinalizer(ctx, rabbitmqCluster); err != nil {
@@ -81,6 +75,7 @@ func (r *RabbitmqClusterReconciler) addRabbitmqDeletionLabel(ctx context.Context
8175
}
8276
listOptions := client.ListOptions{
8377
LabelSelector: selector,
78+
Namespace: rabbitmqCluster.Namespace,
8479
}
8580

8681
if err := r.Client.List(ctx, pods, &listOptions); err != nil {
@@ -97,3 +92,15 @@ func (r *RabbitmqClusterReconciler) addRabbitmqDeletionLabel(ctx context.Context
9792

9893
return nil
9994
}
95+
96+
func (r *RabbitmqClusterReconciler) checkIfLabelPropagated(ctx context.Context, rabbitmqCluster *rabbitmqv1beta1.RabbitmqCluster) bool {
97+
logger := ctrl.LoggerFrom(ctx)
98+
podName := fmt.Sprintf("%s-0", rabbitmqCluster.ChildResourceName("server"))
99+
cmd := "cat /etc/pod-info/skipPreStopChecks"
100+
stdout, _, err := r.exec(rabbitmqCluster.Namespace, podName, "rabbitmq", "sh", "-c", cmd)
101+
if err != nil {
102+
logger.Info("Failed to check for deletion label propagation, deleting anyway", "pod", podName, "command", cmd, "stdout", stdout)
103+
return true
104+
}
105+
return strings.HasPrefix(stdout, "true")
106+
}

controllers/reconcile_no_persistence_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ var _ = Describe("Persistence", func() {
2828
zeroGi := k8sresource.MustParse("0Gi")
2929
cluster = &rabbitmqv1beta1.RabbitmqCluster{
3030
ObjectMeta: metav1.ObjectMeta{
31-
Name: "rabbitmq-shrink",
31+
Name: "rabbitmq-no-persistence",
3232
Namespace: defaultNamespace,
3333
},
3434
Spec: rabbitmqv1beta1.RabbitmqClusterSpec{

controllers/reconcile_tls_test.go

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ package controllers_test
33
import (
44
"context"
55
"fmt"
6+
"time"
7+
68
"github.com/rabbitmq/cluster-operator/v2/internal/status"
79
"k8s.io/utils/ptr"
810
runtimeClient "sigs.k8s.io/controller-runtime/pkg/client"
@@ -31,7 +33,7 @@ var _ = Describe("Reconcile TLS", func() {
3133
rmq := &rabbitmqv1beta1.RabbitmqCluster{}
3234
err := client.Get(ctx, types.NamespacedName{Name: cluster.Name, Namespace: cluster.Namespace}, rmq)
3335
return apierrors.IsNotFound(err)
34-
}, 5).Should(BeTrue())
36+
}, 10).Should(BeTrue())
3537
})
3638

3739
Context("Mutual TLS with single secret", func() {
@@ -112,13 +114,13 @@ var _ = Describe("Reconcile TLS", func() {
112114

113115
Context("Mutual TLS with a separate CA certificate secret", func() {
114116
It("Does not deploy the RabbitmqCluster, and retries every 10 seconds", func() {
115-
tlsSecretWithoutCACert(ctx, "rabbitmq-tls-secret-does-not-exist", defaultNamespace)
117+
tlsSecretWithoutCACert(ctx, "rabbitmq-tls-secret-initially-missing", defaultNamespace)
116118

117119
tlsSpec := rabbitmqv1beta1.TLSSpec{
118-
SecretName: "rabbitmq-tls-secret-does-not-exist",
119-
CaSecretName: "ca-cert-secret",
120+
SecretName: "rabbitmq-tls-secret-initially-missing",
121+
CaSecretName: "ca-cert-secret-initially-missing",
120122
}
121-
cluster = rabbitmqClusterWithTLS(ctx, "rabbitmq-tls-secret-does-not-exist", defaultNamespace, tlsSpec)
123+
cluster = rabbitmqClusterWithTLS(ctx, "rabbitmq-tls-secret-initially-missing", defaultNamespace, tlsSpec)
122124
verifyTLSErrorEvents(ctx, cluster, "Failed to get CA certificate secret")
123125
verifyReconcileSuccessFalse(cluster.Name, cluster.Namespace)
124126

@@ -129,7 +131,7 @@ var _ = Describe("Reconcile TLS", func() {
129131
caData := map[string]string{
130132
"ca.crt": "this is a ca cert",
131133
}
132-
_, err = createSecret(ctx, "ca-cert-secret", defaultNamespace, caData)
134+
_, err = createSecret(ctx, "ca-cert-secret-initially-missing", defaultNamespace, caData)
133135
Expect(err).NotTo(HaveOccurred())
134136

135137
waitForClusterCreation(ctx, cluster, client)
@@ -173,14 +175,15 @@ var _ = Describe("Reconcile TLS", func() {
173175
})
174176

175177
It("Deploys successfully", func() {
178+
suffix := fmt.Sprintf("-%d", time.Now().UnixNano())
176179
cluster = &rabbitmqv1beta1.RabbitmqCluster{
177180
ObjectMeta: metav1.ObjectMeta{
178-
Name: "rabbitmq-tls",
181+
Name: "rabbitmq-tls" + suffix,
179182
Namespace: defaultNamespace,
180183
},
181184
Spec: rabbitmqv1beta1.RabbitmqClusterSpec{
182185
TLS: rabbitmqv1beta1.TLSSpec{
183-
SecretName: "tls-secret",
186+
SecretName: "tls-secret" + suffix,
184187
},
185188
},
186189
}
@@ -218,7 +221,8 @@ var _ = Describe("Reconcile TLS", func() {
218221
tlsSpec := rabbitmqv1beta1.TLSSpec{
219222
SecretName: "tls-secret-does-not-exist",
220223
}
221-
cluster = rabbitmqClusterWithTLS(ctx, "rabbitmq-tls-secret-does-not-exist", defaultNamespace, tlsSpec)
224+
suffix := fmt.Sprintf("-%d", time.Now().UnixNano())
225+
cluster = rabbitmqClusterWithTLS(ctx, "rabbitmq-tls-secret-does-not-exist"+suffix, defaultNamespace, tlsSpec)
222226

223227
verifyTLSErrorEvents(ctx, cluster, "Failed to get TLS secret")
224228
verifyReconcileSuccessFalse(cluster.Name, cluster.Namespace)

0 commit comments

Comments
 (0)