improved drain timeout calculation

gardener · Jun 24, 2024 · 198c755 · 198c755
1 parent fc34188
commit 198c755
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 12 deletions.
diff --git a/pkg/util/provider/drain/drain.go b/pkg/util/provider/drain/drain.go
@@ -653,16 +653,12 @@ func (o *Options) evictPodsWithPVInternal(
 	returnCh chan error,
 ) (remainingPods []*corev1.Pod, fastTrack bool) {
 	var (
-		mainContext       context.Context
-		cancelMainContext context.CancelFunc
-		retryPods         []*corev1.Pod
+		retryPods []*corev1.Pod
 	)
-	mainContext, cancelMainContext = context.WithDeadline(ctx, o.drainStartedOn.Add(o.Timeout))
-	defer cancelMainContext()
 
 	for i, pod := range pods {
 		select {
-		case <-mainContext.Done():
+		case <-ctx.Done():
 			// Timeout occurred. Abort and report the remaining pods.
 			returnCh <- nil
 			return append(retryPods, pods[i+1:]...), true
@@ -739,7 +735,7 @@ func (o *Options) evictPodsWithPVInternal(
 		)
 
 		podVolumeInfo := podVolumeInfoMap[getPodKey(pod)]
-		ctx, cancelFn := context.WithTimeout(mainContext, o.getTerminationGracePeriod(pod)+o.PvDetachTimeout)
+		ctx, cancelFn := context.WithTimeout(ctx, o.getTerminationGracePeriod(pod)+o.PvDetachTimeout)
 		err = o.waitForDetach(ctx, podVolumeInfo, o.nodeName)
 		cancelFn()
 
@@ -762,7 +758,7 @@ func (o *Options) evictPodsWithPVInternal(
 			time.Since(podEvictionStartTime),
 		)
 
-		ctx, cancelFn = context.WithTimeout(mainContext, o.PvReattachTimeout)
+		ctx, cancelFn = context.WithTimeout(ctx, o.PvReattachTimeout)
 		err = o.waitForReattach(ctx, podVolumeInfo, o.nodeName, volumeAttachmentEventCh)
 		cancelFn()
 
@@ -1000,7 +996,12 @@ func (o *Options) evictPodWithoutPVInternal(ctx context.Context, attemptEvict bo
 		if i >= nretries {
 			attemptEvict = false
 		}
-
+		select {
+		case <-ctx.Done():
+			// Timeout occurred. Abort and report the remaining pods.
+			returnCh <- fmt.Errorf("timeout occured while waiting for pod %q terminating scheduled on node %v", pod.Name, pod.Spec.NodeName)
+		default:
+		}
 		if attemptEvict {
 			err = o.evictPod(ctx, pod, policyGroupVersion)
 		} else {

diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go
@@ -1044,6 +1044,8 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver
 		ReadonlyFilesystem      v1.NodeConditionType = "ReadonlyFilesystem"
 	)
 
+	drainContext, cancelFn := context.WithDeadline(ctx, deleteMachineRequest.Machine.DeletionTimestamp.Add(timeOutDuration))
+	defer cancelFn()
 	if !isValidNodeName(nodeName) {
 		message := "Skipping drain as nodeName is not a valid one for machine."
 		printLogInitError(message, &err, &description, machine)
@@ -1112,7 +1114,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver
 		}
 
 		// update node with the machine's phase prior to termination
-		if err = c.UpdateNodeTerminationCondition(ctx, machine); err != nil {
+		if err = c.UpdateNodeTerminationCondition(drainContext, machine); err != nil {
 			if forceDeleteMachine {
 				klog.Warningf("Failed to update node conditions: %v. However, since it's a force deletion shall continue deletion of VM.", err)
 			} else {
@@ -1153,7 +1155,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver
 				c.volumeAttachmentHandler,
 			)
 			klog.V(3).Infof("(drainNode) Invoking RunDrain, forceDeleteMachine: %t, forceDeletePods: %t, timeOutDuration: %s", forceDeletePods, forceDeleteMachine, timeOutDuration)
-			err = drainOptions.RunDrain(ctx)
+			err = drainOptions.RunDrain(drainContext)
 			if err == nil {
 				// Drain successful
 				klog.V(2).Infof("Drain successful for machine %q ,providerID %q, backing node %q. \nBuf:%v \nErrBuf:%v", machine.Name, getProviderID(machine), getNodeName(machine), buf, errBuf)
@@ -1183,7 +1185,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver
 	}
 
 	updateRetryPeriod, updateErr := c.machineStatusUpdate(
-		ctx,
+		drainContext,
 		machine,
 		v1alpha1.LastOperation{
 			Description:    description,
@@ -1408,6 +1410,7 @@ func (c *controller) getEffectiveDrainTimeout(machine *v1alpha1.Machine) *metav1
 	} else {
 		effectiveDrainTimeout = &c.safetyOptions.MachineDrainTimeout
 	}
+	effectiveDrainTimeout.Duration -= time.Now().Sub(machine.DeletionTimestamp.Time)
 	return effectiveDrainTimeout
 }