From ff5373f269e6cadcd85058d67d99107346153242 Mon Sep 17 00:00:00 2001
From: Garry Dmello
Date: Thu, 20 Feb 2025 16:19:27 -0800
Subject: [PATCH] Add pod liveness check before starting node.

---
NOTE(review): the image tag bumps and the tests/kustomize namespace change
look unrelated to the liveness check - confirm they are intended.

 config/manager/image_config.yaml      |  2 +-
 config/manager/kustomization.yaml     |  2 +-
 pkg/httphelper/client.go              | 29 ++++++++++++++++++
 pkg/reconciliation/reconcile_racks.go | 43 +++++++++++++++++++++++++++
 tests/kustomize/kustomization.yaml    |  2 +-
 5 files changed, 75 insertions(+), 3 deletions(-)

diff --git a/config/manager/image_config.yaml b/config/manager/image_config.yaml
index ed7a3647..28f7c6a1 100644
--- a/config/manager/image_config.yaml
+++ b/config/manager/image_config.yaml
@@ -3,7 +3,7 @@ kind: ImageConfig
 metadata:
   name: image-config
 images:
-  system-logger: "k8ssandra/system-logger:v1.24.0-dev.a8fa96c-20241219"
+  system-logger: "k8ssandra/system-logger:v1.24.0-dev.2736e9a-20250220"
   config-builder: "datastax/cass-config-builder:1.0-ubi8"
   k8ssandra-client: "k8ssandra/k8ssandra-client:v0.6.0"
   # cassandra:
diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml
index 20bb3bb5..f5d678c7 100644
--- a/config/manager/kustomization.yaml
+++ b/config/manager/kustomization.yaml
@@ -14,4 +14,4 @@ kind: Kustomization
 images:
 - name: controller
   newName: k8ssandra/cass-operator
-  newTag: v1.24.0-dev.a8fa96c-20241219
+  newTag: v1.24.0-dev.2736e9a-20250220
diff --git a/pkg/httphelper/client.go b/pkg/httphelper/client.go
index 4457e4a6..b1fd6d1f 100644
--- a/pkg/httphelper/client.go
+++ b/pkg/httphelper/client.go
@@ -328,6 +328,35 @@ func (client *NodeMgmtClient) CallCreateRoleEndpoint(pod *corev1.Pod, username s
 	return nil
 }
 
+// CallLivenessEndpoint sends a GET request to the liveness probe endpoint of
+// the Node Management API in the given pod. It returns the error reported by
+// BuildPodHostFromPod or callNodeMgmtEndpoint, and nil otherwise.
+func (client *NodeMgmtClient) CallLivenessEndpoint(pod *corev1.Pod) error {
+	client.Log.Info("requesting Cassandra liveness from Node Management API", "pod", pod.Name)
+
+	podHost, podPort, err := BuildPodHostFromPod(pod)
+	if err != nil {
+		return err
+	}
+
+	request := nodeMgmtRequest{
+		endpoint: "/api/v0/probes/liveness",
+		host:     podHost,
+		port:     podPort,
+		method:   http.MethodGet,
+		timeout:  60 * time.Second,
+	}
+
+	res, err := callNodeMgmtEndpoint(client, request, "")
+	if err != nil {
+		return err
+	}
+
+	// Only report success once the request is known to have succeeded.
+	client.Log.Info("requesting Cassandra liveness from Node Management API succeeded ", "pod", pod.Name, "response", res)
+	return nil
+}
+
 // CallDropRoleEndpoint drops an existing role from the cluster
 func (client *NodeMgmtClient) CallDropRoleEndpoint(pod *corev1.Pod, username string) error {
 	client.Log.Info(
diff --git a/pkg/reconciliation/reconcile_racks.go b/pkg/reconciliation/reconcile_racks.go
index fda3868a..b4a5571b 100644
--- a/pkg/reconciliation/reconcile_racks.go
+++ b/pkg/reconciliation/reconcile_racks.go
@@ -2062,6 +2062,18 @@ func (rc *ReconciliationContext) startNode(pod *corev1.Pod, labelSeedBeforeStart
 					"Labeled pod a seed node %s", pod.Name)
 			}
 
+			// Check that the pod is live before starting the server. startCassandra
+			// is invoked exactly once, by the existing call right below.
+			isAlive, err := rc.waitForPodLiveness(pod)
+			if err != nil {
+				return true, err
+			}
+			if !isAlive {
+				// The pod never passed its liveness check: skip the start and
+				// return the same first result as the error paths.
+				return true, nil
+			}
+
 			if err := rc.startCassandra(endpointData, pod); err != nil {
 				return true, err
 			}
@@ -2071,6 +2083,37 @@ func (rc *ReconciliationContext) startNode(pod *corev1.Pod, labelSeedBeforeStart
 	return false, nil
 }
 
+// waitForPodLiveness polls the liveness endpoint of the given pod every 5
+// seconds until a call succeeds or 5 minutes have passed. It returns true
+// once a call succeeds and false when the wait times out; the error result
+// is always nil.
+// NOTE(review): this blocks the caller for up to 5 minutes - confirm that is
+// acceptable for the reconcile loop.
+func (rc *ReconciliationContext) waitForPodLiveness(pod *corev1.Pod) (bool, error) {
+	timeout := time.NewTimer(5 * time.Minute) // Give up after 5 minutes
+	defer timeout.Stop()
+	ticker := time.NewTicker(5 * time.Second) // Retry every 5 seconds
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-timeout.C:
+			// Timeout reached
+			rc.ReqLogger.Info("Timed out after 5 minutes of retries")
+			return false, nil
+		case <-ticker.C:
+			// Call the Liveness endpoint
+			if err := rc.NodeMgmtClient.CallLivenessEndpoint(pod); err != nil {
+				rc.ReqLogger.Info("Liveness probe failing before starting pod " + pod.Name + " with error " + err.Error())
+				continue
+			}
+			// If no error, return success
+			rc.ReqLogger.Info("Liveness probe succeeded for pod " + pod.Name)
+			return true, nil
+		}
+	}
+}
+
 func (rc *ReconciliationContext) countReadyAndStarted() (int, int) {
 	ready := 0
 	started := 0
diff --git a/tests/kustomize/kustomization.yaml b/tests/kustomize/kustomization.yaml
index 4f33a832..9f13ef81 100644
--- a/tests/kustomize/kustomization.yaml
+++ b/tests/kustomize/kustomization.yaml
@@ -1,5 +1,5 @@
 # This is the default kustomize template for tests.
-namespace: kustomize
+namespace: test-webhook-validation
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 