Skip to content

Commit 0030f53

Browse files
authored
fix(hatchery:k8s): add delay before pending workers cleanup (#6107)
1 parent f0e0d92 commit 0030f53

File tree

2 files changed

+27
-12
lines changed

2 files changed

+27
-12
lines changed

engine/hatchery/kubernetes/kill_workers.go

+16 -12
Original file line number | Diff line number | Diff line change
@@ -40,26 +40,29 @@ func (h *HatcheryKubernetes) killAwolWorkers(ctx context.Context) error {
4040
continue
4141
}
4242

43-
var toDelete, found bool
44-
for _, w := range workers {
45-
if workerName, ok := labels[LABEL_WORKER_NAME]; ok && workerName == w.Name {
46-
found = true
43+
var toDelete bool
44+
for _, container := range pod.Status.ContainerStatuses {
45+
terminated := (container.State.Terminated != nil && (container.State.Terminated.Reason == "Completed" || container.State.Terminated.Reason == "Error"))
46+
errImagePull := (container.State.Waiting != nil && container.State.Waiting.Reason == "ErrImagePull")
47+
if terminated || errImagePull {
48+
toDelete = true
49+
log.Debug(ctx, "pod %s/%s is terminated or in error", pod.Namespace, pod.Name)
4750
break
4851
}
4952
}
50-
if !found {
51-
toDelete = true
52-
}
5353

5454
if !toDelete {
55-
for _, container := range pod.Status.ContainerStatuses {
56-
terminated := (container.State.Terminated != nil && (container.State.Terminated.Reason == "Completed" || container.State.Terminated.Reason == "Error"))
57-
errImagePull := (container.State.Waiting != nil && container.State.Waiting.Reason == "ErrImagePull")
58-
if terminated || errImagePull {
59-
toDelete = true
55+
var found bool
56+
for _, w := range workers {
57+
if workerName, ok := labels[LABEL_WORKER_NAME]; ok && workerName == w.Name {
58+
found = true
6059
break
6160
}
6261
}
62+
if !found && time.Since(pod.CreationTimestamp.Time) > 3*time.Minute {
63+
toDelete = true
64+
log.Debug(ctx, "pod %s/%s didn't match a registered worker and was started since %v", pod.Namespace, pod.Name, pod.CreationTimestamp.Time)
65+
}
6366
}
6467

6568
if toDelete {
@@ -130,6 +133,7 @@ func (h *HatcheryKubernetes) killAwolWorkers(ctx context.Context) error {
130133
globalErr = err
131134
log.Error(ctx, "hatchery:kubernetes> killAwolWorkers> Cannot delete pod %s (%s)", pod.Name, err)
132135
}
136+
log.Debug(ctx, "pod %s/%s killed", pod.Namespace, pod.Name)
133137
}
134138
}
135139
return globalErr

engine/hatchery/kubernetes/kill_workers_test.go

+11
Original file line number | Diff line number | Diff line change
@@ -103,6 +103,17 @@ func TestHatcheryKubernetes_KillAwolWorkers(t *testing.T) {
103103
},
104104
},
105105
},
106+
{
107+
ObjectMeta: metav1.ObjectMeta{
108+
Name: "worker-6",
109+
Namespace: "cds-workers",
110+
Labels: map[string]string{
111+
LABEL_HATCHERY_NAME: "my-hatchery",
112+
LABEL_WORKER_NAME: "worker-6",
113+
},
114+
CreationTimestamp: metav1.Now(),
115+
},
116+
},
106117
},
107118
}
108119
gock.New("http://lolcat.kube").Get("/api/v1/namespaces/cds-workers/pods").Reply(http.StatusOK).JSON(podsList)

0 commit comments

Comments
 (0)