Skip to content

Commit

Permalink
Add pod liveness check before starting node.
Browse files Browse the repository at this point in the history
  • Loading branch information
Garry Dmello committed Feb 21, 2025
1 parent 990d1a1 commit 2736e9a
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 0 deletions.
25 changes: 25 additions & 0 deletions pkg/httphelper/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,31 @@ func (client *NodeMgmtClient) CallCreateRoleEndpoint(pod *corev1.Pod, username s
return nil
}

// CallLivenessEndpoint issues a GET against the Node Management API liveness
// probe (/api/v0/probes/liveness) of the given pod.
//
// It returns an error if the pod's host/port cannot be determined or if the
// request to the management API fails; it returns nil when the call succeeds.
func (client *NodeMgmtClient) CallLivenessEndpoint(pod *corev1.Pod) error {
	client.Log.Info("requesting Cassandra liveness from Node Management API", "pod", pod.Name)

	podHost, podPort, err := BuildPodHostFromPod(pod)
	if err != nil {
		return err
	}

	request := nodeMgmtRequest{
		endpoint: "/api/v0/probes/liveness",
		host:     podHost,
		port:     podPort,
		method:   http.MethodGet,
		timeout:  60 * time.Second,
	}

	res, err := callNodeMgmtEndpoint(client, request, "")
	if err != nil {
		return err
	}

	// Only report success once the error has been ruled out; logging before the
	// check would claim success for failed probes.
	client.Log.Info("requesting Cassandra liveness from Node Management API succeeded ", "pod", pod.Name, "response", res)
	return nil
}

// CallDropRoleEndpoint drops an existing role from the cluster
func (client *NodeMgmtClient) CallDropRoleEndpoint(pod *corev1.Pod, username string) error {
client.Log.Info(
Expand Down
34 changes: 34 additions & 0 deletions pkg/reconciliation/reconcile_racks.go
Original file line number Diff line number Diff line change
Expand Up @@ -2062,6 +2062,17 @@ func (rc *ReconciliationContext) startNode(pod *corev1.Pod, labelSeedBeforeStart
"Labeled pod a seed node %s", pod.Name)
}

// Check if pod is live before starting server
isAlive, err := rc.waitForPodLiveness(pod)
if err != nil {
return true, err
}
if isAlive {
if err := rc.startCassandra(endpointData, pod); err != nil {
return true, err
}
}

if err := rc.startCassandra(endpointData, pod); err != nil {
return true, err
}
Expand All @@ -2071,6 +2082,29 @@ func (rc *ReconciliationContext) startNode(pod *corev1.Pod, labelSeedBeforeStart
return false, nil
}

// waitForPodLiveness polls the Node Management API liveness endpoint of pod
// until it answers successfully or the overall timeout elapses.
//
// It returns true as soon as a probe succeeds and false when the timeout is
// reached without a successful probe. The returned error is always nil in the
// current implementation; probe failures are logged and retried.
//
// The first probe is sent only after one retry interval has passed, and the
// call blocks the caller for up to the full timeout.
func (rc *ReconciliationContext) waitForPodLiveness(pod *corev1.Pod) (bool, error) {
	const (
		livenessTimeout = 1 * time.Minute  // overall budget for the liveness wait
		retryInterval   = 30 * time.Second // delay between consecutive probes
	)

	timeout := time.After(livenessTimeout)
	ticker := time.NewTicker(retryInterval)
	// Release the ticker's resources on every return path.
	defer ticker.Stop()

	for {
		select {
		case <-timeout:
			// Timeout reached; report the duration that is actually configured.
			rc.ReqLogger.Info("Timed out after " + livenessTimeout.String() + " of retries")
			return false, nil
		case <-ticker.C:
			// Call the liveness endpoint; on failure, wait for the next tick.
			if err := rc.NodeMgmtClient.CallLivenessEndpoint(pod); err != nil {
				rc.ReqLogger.Info("Liveness probe failing before starting pod " + pod.Name + " with error " + err.Error())
				continue
			}
			// If no error, return success
			rc.ReqLogger.Info("Liveness probe succeeded for pod " + pod.Name)
			return true, nil
		}
	}
}

func (rc *ReconciliationContext) countReadyAndStarted() (int, int) {
ready := 0
started := 0
Expand Down

0 comments on commit 2736e9a

Please sign in to comment.