@@ -40,26 +40,29 @@ func (h *HatcheryKubernetes) killAwolWorkers(ctx context.Context) error {
40
40
continue
41
41
}
42
42
43
- var toDelete , found bool
44
- for _ , w := range workers {
45
- if workerName , ok := labels [LABEL_WORKER_NAME ]; ok && workerName == w .Name {
46
- found = true
43
+ var toDelete bool
44
+ for _ , container := range pod .Status .ContainerStatuses {
45
+ terminated := (container .State .Terminated != nil && (container .State .Terminated .Reason == "Completed" || container .State .Terminated .Reason == "Error" ))
46
+ errImagePull := (container .State .Waiting != nil && container .State .Waiting .Reason == "ErrImagePull" )
47
+ if terminated || errImagePull {
48
+ toDelete = true
49
+ log .Debug (ctx , "pod %s/%s is terminated or in error" , pod .Namespace , pod .Name )
47
50
break
48
51
}
49
52
}
50
- if ! found {
51
- toDelete = true
52
- }
53
53
54
54
if ! toDelete {
55
- for _ , container := range pod .Status .ContainerStatuses {
56
- terminated := (container .State .Terminated != nil && (container .State .Terminated .Reason == "Completed" || container .State .Terminated .Reason == "Error" ))
57
- errImagePull := (container .State .Waiting != nil && container .State .Waiting .Reason == "ErrImagePull" )
58
- if terminated || errImagePull {
59
- toDelete = true
55
+ var found bool
56
+ for _ , w := range workers {
57
+ if workerName , ok := labels [LABEL_WORKER_NAME ]; ok && workerName == w .Name {
58
+ found = true
60
59
break
61
60
}
62
61
}
62
+ if ! found && time .Since (pod .CreationTimestamp .Time ) > 3 * time .Minute {
63
+ toDelete = true
64
+ log .Debug (ctx , "pod %s/%s didn't match a registered worker and was started since %v" , pod .Namespace , pod .Name , pod .CreationTimestamp .Time )
65
+ }
63
66
}
64
67
65
68
if toDelete {
@@ -130,6 +133,7 @@ func (h *HatcheryKubernetes) killAwolWorkers(ctx context.Context) error {
130
133
globalErr = err
131
134
log .Error (ctx , "hatchery:kubernetes> killAwolWorkers> Cannot delete pod %s (%s)" , pod .Name , err )
132
135
}
136
+ log .Debug (ctx , "pod %s/%s killed" , pod .Namespace , pod .Name )
133
137
}
134
138
}
135
139
return globalErr
0 commit comments