Skip to content

Commit

Permalink
fix: Terminate, rather than delete, deadlined pods. Fixes #8545 (#8620)
Browse files Browse the repository at this point in the history
Signed-off-by: Alex Collins <[email protected]>
  • Loading branch information
alexec authored May 6, 2022
1 parent dd56520 commit 859ebe9
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 40 deletions.
18 changes: 0 additions & 18 deletions workflow/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -468,24 +468,6 @@ func (wfc *WorkflowController) processNextPodCleanupItem(ctx context.Context) bo
err := func() error {
pods := wfc.kubeclientset.CoreV1().Pods(namespace)
switch action {
case shutdownPod:
// to shutdown a pod, we signal the wait container to terminate, the wait container in turn will
// kill the main container (using whatever mechanism the executor uses), and will then exit itself
// once the main container exited
pod, err := wfc.getPod(namespace, podName)
if pod == nil || err != nil {
return err
}
for _, c := range pod.Spec.Containers {
if c.Name == common.WaitContainerName {
if err := signal.SignalContainer(wfc.restConfig, pod, common.WaitContainerName, syscall.SIGTERM); err != nil {
return err
}
return nil // done
}
}
// no wait container found
fallthrough
case terminateContainers:
if terminationGracePeriod, err := wfc.signalContainers(namespace, podName, syscall.SIGTERM); err != nil {
return err
Expand Down
38 changes: 17 additions & 21 deletions workflow/controller/exec_control.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import (
"time"

apiv1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

wfv1 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1"
"github.com/argoproj/argo-workflows/v3/workflow/common"
Expand Down Expand Up @@ -37,15 +36,13 @@ func (woc *wfOperationCtx) applyExecutionControl(ctx context.Context, pod *apiv1
_, onExitPod := pod.Labels[common.LabelKeyOnExit]

if !woc.GetShutdownStrategy().ShouldExecute(onExitPod) {
woc.log.Infof("Deleting Pending pod %s/%s as part of workflow shutdown with strategy: %s", pod.Namespace, pod.Name, woc.GetShutdownStrategy())
err := woc.controller.kubeclientset.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{})
if err == nil {
msg := fmt.Sprintf("workflow shutdown with strategy: %s", woc.GetShutdownStrategy())
woc.handleExecutionControlError(nodeID, wfNodesLock, msg)
return
}
// If we fail to delete the pod, fall back to setting the annotation
woc.log.Warnf("Failed to delete %s/%s: %v", pod.Namespace, pod.Name, err)
woc.log.WithField("podName", pod.Name).
WithField("shutdownStrategy", woc.GetShutdownStrategy()).
Info("Terminating pod as part of workflow shutdown")
woc.controller.queuePodForCleanup(pod.Namespace, pod.Name, terminateContainers)
msg := fmt.Sprintf("workflow shutdown with strategy: %s", woc.GetShutdownStrategy())
woc.handleExecutionControlError(nodeID, wfNodesLock, msg)
return
}
}
// Check if we are past the workflow deadline. If we are, and the pod is still pending
Expand All @@ -54,21 +51,20 @@ func (woc *wfOperationCtx) applyExecutionControl(ctx context.Context, pod *apiv1
// pods that are part of an onExit handler aren't subject to the deadline
_, onExitPod := pod.Labels[common.LabelKeyOnExit]
if !onExitPod {
woc.log.Infof("Deleting Pending pod %s/%s which has exceeded workflow deadline %s", pod.Namespace, pod.Name, woc.workflowDeadline)
err := woc.controller.kubeclientset.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{})
if err == nil {
woc.handleExecutionControlError(nodeID, wfNodesLock, "Step exceeded its deadline")
return
}
// If we fail to delete the pod, fall back to setting the annotation
woc.log.Warnf("Failed to delete %s/%s: %v", pod.Namespace, pod.Name, err)
woc.log.WithField("podName", pod.Name).
WithField(" workflowDeadline", woc.workflowDeadline).
Info("Terminating pod which has exceeded workflow deadline")
woc.controller.queuePodForCleanup(pod.Namespace, pod.Name, terminateContainers)
woc.handleExecutionControlError(nodeID, wfNodesLock, "Step exceeded its deadline")
return
}
}
}
if woc.GetShutdownStrategy().Enabled() {
if _, onExitPod := pod.Labels[common.LabelKeyOnExit]; !woc.GetShutdownStrategy().ShouldExecute(onExitPod) {
woc.log.Infof("Shutting down pod %s", pod.Name)
woc.controller.queuePodForCleanup(woc.wf.Namespace, pod.Name, shutdownPod)
woc.log.WithField("podName", pod.Name).
Info("Terminating on-exit pod")
woc.controller.queuePodForCleanup(woc.wf.Namespace, pod.Name, terminateContainers)
}
}
}
Expand Down Expand Up @@ -101,7 +97,7 @@ func (woc *wfOperationCtx) killDaemonedChildren(nodeID string) {
if !childNode.IsDaemoned() {
continue
}
woc.controller.queuePodForCleanup(woc.wf.Namespace, childNode.ID, shutdownPod)
woc.controller.queuePodForCleanup(woc.wf.Namespace, childNode.ID, terminateContainers)
childNode.Phase = wfv1.NodeSucceeded
childNode.Daemoned = nil
woc.wf.Status.Nodes[childNode.ID] = childNode
Expand Down
1 change: 0 additions & 1 deletion workflow/controller/pod_cleanup_key.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ type (

const (
deletePod podCleanupAction = "deletePod"
shutdownPod podCleanupAction = "shutdownPod"
labelPodCompleted podCleanupAction = "labelPodCompleted"
terminateContainers podCleanupAction = "terminateContainers"
killContainers podCleanupAction = "killContainers"
Expand Down

0 comments on commit 859ebe9

Please sign in to comment.