Skip to content
This repository was archived by the owner on Jan 8, 2024. It is now read-only.

Commit 39471df

Browse files
author
Evan Phoenix
authored
Merge pull request #3143 from hashicorp/fix/k8s-stop
Delete job and pods in k8s StopTask if stuck in pending
2 parents 5325c1f + 4272930 commit 39471df

File tree

3 files changed

+60
-10
lines changed

3 files changed

+60
-10
lines changed

.changelog/3143.txt

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
```release-note:bug
2+
plugin/k8s: clean up pending pods from cancelled jobs
3+
```

builtin/k8s/platform.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -480,7 +480,7 @@ func configureContainer(
480480
}
481481
resourceRequests[resourceName] = q
482482
} else {
483-
log.Warn("ignoring unrecognized k8s resources key: %q", k)
483+
log.Warn("ignoring unrecognized k8s resources key", "key", k)
484484
}
485485
}
486486

@@ -622,7 +622,7 @@ func (p *Platform) resourceDeploymentCreate(
622622

623623
// App container must have some kind of port
624624
if len(appContainerSpec.Ports) == 0 {
625-
log.Warn("No ports defined in waypoint.hcl - defaulting to http on port %d", DefaultServicePort)
625+
log.Warn("No ports defined in waypoint.hcl - defaulting to http on port", "port", DefaultServicePort)
626626
appContainerSpec.Ports = append(appContainerSpec.Ports, &Port{Port: DefaultServicePort, Name: "http"})
627627
}
628628

builtin/k8s/task.go

+55-8
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"google.golang.org/grpc/status"
1414
batchv1 "k8s.io/api/batch/v1"
1515
corev1 "k8s.io/api/core/v1"
16+
"k8s.io/apimachinery/pkg/api/errors"
1617
k8sresource "k8s.io/apimachinery/pkg/api/resource"
1718
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1819
"k8s.io/utils/pointer"
@@ -143,13 +144,59 @@ func (p *TaskLauncher) StopTask(
143144
log hclog.Logger,
144145
ti *TaskInfo,
145146
) error {
146-
// Purposely do nothing. We leverage the job TTL feature in Kube 1.19+
147-
// so that Kubernetes automatically deletes old jobs after they complete
148-
// running.
147+
// If a job completes and the corresponding pod exits with a "completed"
148+
// status, we purposely do nothing here. We leverage the job TTL feature in
149+
// Kube 1.19+ so that Kubernetes automatically deletes old jobs and pods
150+
// after they complete running.
149151
//
150-
// In the future, we may want to get more clever about this and explicitly
151-
// delete jobs under certain conditions, but for now we leave them around
152-
// and let K8S clean it up
152+
// If a Waypoint job is cancelled or otherwise times out, we check for
153+
// existing Kubernetes jobs and delete them, and clean up any Pending
154+
// pods.
155+
clientSet, ns, _, err := Clientset(p.config.KubeconfigPath, p.config.Context)
156+
if err != nil {
157+
return err
158+
}
159+
if p.config.Namespace != "" {
160+
ns = p.config.Namespace
161+
}
162+
163+
// Delete the job. This does *not* delete any running pods that the job
164+
// created.
165+
jobsClient := clientSet.BatchV1().Jobs(ns)
166+
if err := jobsClient.Delete(ctx, ti.Id, metav1.DeleteOptions{}); err != nil {
167+
if !errors.IsNotFound(err) {
168+
return err
169+
}
170+
}
171+
172+
// List pods with this job label
173+
podsClient := clientSet.CoreV1().Pods(ns)
174+
pods, err := podsClient.List(ctx, metav1.ListOptions{
175+
LabelSelector: fmt.Sprintf("job-name=%s", ti.Id),
176+
})
177+
// It's not clear from the documentation if an error is returned from the
178+
// List API call if no pods are found, so we guard here just in case
179+
if err != nil && !errors.IsNotFound(err) {
180+
return err
181+
}
182+
183+
if pods == nil {
184+
log.Info("no pods found for job, returning", "job_id", ti.Id)
185+
return nil
186+
}
187+
188+
// Delete any pods stuck in pending
189+
for _, p := range pods.Items {
190+
if p.Status.Phase == corev1.PodPending {
191+
log.Warn("job pod is in pending phase in StopTask operation, cancelling", "job_id", ti.Id)
192+
if err := podsClient.Delete(ctx, p.Name, metav1.DeleteOptions{}); err != nil {
193+
if !errors.IsNotFound(err) {
194+
return err
195+
}
196+
}
197+
}
198+
}
199+
153200
return nil
154201
}
155202

@@ -205,8 +252,8 @@ func (p *TaskLauncher) StartTask(
205252
}
206253

207254
// Get container resource limits and requests
208-
var resourceLimits = make(map[corev1.ResourceName]k8sresource.Quantity)
209-
var resourceRequests = make(map[corev1.ResourceName]k8sresource.Quantity)
255+
resourceLimits := make(map[corev1.ResourceName]k8sresource.Quantity)
256+
resourceRequests := make(map[corev1.ResourceName]k8sresource.Quantity)
210257
resourceRequirements := corev1.ResourceRequirements{
211258
Limits: resourceLimits,
212259
Requests: resourceRequests,

0 commit comments

Comments
 (0)