Skip to content

Commit

Permalink
improved drain timeout calculation
Browse files Browse the repository at this point in the history
  • Loading branch information
sssash18 committed Jun 24, 2024
1 parent fc34188 commit 198c755
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 12 deletions.
19 changes: 10 additions & 9 deletions pkg/util/provider/drain/drain.go
Original file line number Diff line number Diff line change
Expand Up @@ -653,16 +653,12 @@ func (o *Options) evictPodsWithPVInternal(
returnCh chan error,
) (remainingPods []*corev1.Pod, fastTrack bool) {
var (
mainContext context.Context
cancelMainContext context.CancelFunc
retryPods []*corev1.Pod
retryPods []*corev1.Pod
)
mainContext, cancelMainContext = context.WithDeadline(ctx, o.drainStartedOn.Add(o.Timeout))
defer cancelMainContext()

for i, pod := range pods {
select {
case <-mainContext.Done():
case <-ctx.Done():
// Timeout occurred. Abort and report the remaining pods.
returnCh <- nil
return append(retryPods, pods[i+1:]...), true
Expand Down Expand Up @@ -739,7 +735,7 @@ func (o *Options) evictPodsWithPVInternal(
)

podVolumeInfo := podVolumeInfoMap[getPodKey(pod)]
ctx, cancelFn := context.WithTimeout(mainContext, o.getTerminationGracePeriod(pod)+o.PvDetachTimeout)
ctx, cancelFn := context.WithTimeout(ctx, o.getTerminationGracePeriod(pod)+o.PvDetachTimeout)
err = o.waitForDetach(ctx, podVolumeInfo, o.nodeName)
cancelFn()

Expand All @@ -762,7 +758,7 @@ func (o *Options) evictPodsWithPVInternal(
time.Since(podEvictionStartTime),
)

ctx, cancelFn = context.WithTimeout(mainContext, o.PvReattachTimeout)
ctx, cancelFn = context.WithTimeout(ctx, o.PvReattachTimeout)
err = o.waitForReattach(ctx, podVolumeInfo, o.nodeName, volumeAttachmentEventCh)
cancelFn()

Expand Down Expand Up @@ -1000,7 +996,12 @@ func (o *Options) evictPodWithoutPVInternal(ctx context.Context, attemptEvict bo
if i >= nretries {
attemptEvict = false
}

select {
case <-ctx.Done():
// Timeout occurred. Abort and report the remaining pods.
returnCh <- fmt.Errorf("timeout occured while waiting for pod %q terminating scheduled on node %v", pod.Name, pod.Spec.NodeName)
default:
}
if attemptEvict {
err = o.evictPod(ctx, pod, policyGroupVersion)
} else {
Expand Down
9 changes: 6 additions & 3 deletions pkg/util/provider/machinecontroller/machine_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -1044,6 +1044,8 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver
ReadonlyFilesystem v1.NodeConditionType = "ReadonlyFilesystem"
)

drainContext, cancelFn := context.WithDeadline(ctx, deleteMachineRequest.Machine.DeletionTimestamp.Add(timeOutDuration))
defer cancelFn()
if !isValidNodeName(nodeName) {
message := "Skipping drain as nodeName is not a valid one for machine."
printLogInitError(message, &err, &description, machine)
Expand Down Expand Up @@ -1112,7 +1114,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver
}

// update node with the machine's phase prior to termination
if err = c.UpdateNodeTerminationCondition(ctx, machine); err != nil {
if err = c.UpdateNodeTerminationCondition(drainContext, machine); err != nil {
if forceDeleteMachine {
klog.Warningf("Failed to update node conditions: %v. However, since it's a force deletion shall continue deletion of VM.", err)
} else {
Expand Down Expand Up @@ -1153,7 +1155,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver
c.volumeAttachmentHandler,
)
klog.V(3).Infof("(drainNode) Invoking RunDrain, forceDeleteMachine: %t, forceDeletePods: %t, timeOutDuration: %s", forceDeletePods, forceDeleteMachine, timeOutDuration)
err = drainOptions.RunDrain(ctx)
err = drainOptions.RunDrain(drainContext)
if err == nil {
// Drain successful
klog.V(2).Infof("Drain successful for machine %q ,providerID %q, backing node %q. \nBuf:%v \nErrBuf:%v", machine.Name, getProviderID(machine), getNodeName(machine), buf, errBuf)
Expand Down Expand Up @@ -1183,7 +1185,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver
}

updateRetryPeriod, updateErr := c.machineStatusUpdate(
ctx,
drainContext,
machine,
v1alpha1.LastOperation{
Description: description,
Expand Down Expand Up @@ -1408,6 +1410,7 @@ func (c *controller) getEffectiveDrainTimeout(machine *v1alpha1.Machine) *metav1
} else {
effectiveDrainTimeout = &c.safetyOptions.MachineDrainTimeout
}
effectiveDrainTimeout.Duration -= time.Now().Sub(machine.DeletionTimestamp.Time)
return effectiveDrainTimeout
}

Expand Down

0 comments on commit 198c755

Please sign in to comment.