Skip to content

Commit

Permalink
Fix indefinite stuck Pending pod on a deleted node
Browse files Browse the repository at this point in the history
  • Loading branch information
sunnylovestiramisu committed Mar 20, 2023
1 parent f1f3e83 commit 40f0c77
Showing 1 changed file with 26 additions and 17 deletions.
43 changes: 26 additions & 17 deletions controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1404,6 +1404,10 @@ func (ctrl *ProvisionController) provisionClaimOperation(ctx context.Context, cl
if nodeName, ok := getString(claim.Annotations, annSelectedNode, annAlphaSelectedNode); ok {
if ctrl.nodeLister != nil {
selectedNode, err = ctrl.nodeLister.Get(nodeName)
// if node does not exist, remove volume.kubernetes.io/selected-node annotation
if apierrs.IsNotFound(err) {
return ctrl.provisionVolumeErrorHandling(ctx, err, claim, operation)
}
} else {
selectedNode, err = ctrl.client.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) // TODO (verult) cache Nodes
}
Expand All @@ -1430,26 +1434,11 @@ func (ctrl *ProvisionController) provisionClaimOperation(ctx context.Context, cl
klog.Info(logOperation(operation, "volume provision ignored: %v", ierr))
return ProvisioningFinished, errStopProvision
}

err = fmt.Errorf("failed to provision volume with StorageClass %q: %v", claimClass, err)
ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, "ProvisioningFailed", err.Error())
if _, ok := claim.Annotations[annSelectedNode]; ok && result == ProvisioningReschedule {
// For dynamic PV provisioning with delayed binding, the provisioner may fail
// because the node is wrong (permanent error) or currently unusable (not enough
// capacity). If the provisioner wants to give up scheduling with the currently
// selected node, then it can ask for that by returning ProvisioningReschedule
// as state.
//
// `selectedNode` must be removed to notify scheduler to schedule again.
if errLabel := ctrl.rescheduleProvisioning(ctx, claim); errLabel != nil {
klog.Info(logOperation(operation, "volume rescheduling failed: %v", errLabel))
// If unsetting that label fails in ctrl.rescheduleProvisioning, we
// keep the volume in the work queue as if the provisioner had
// returned ProvisioningFinished and simply try again later.
return ProvisioningFinished, err
}
// Label was removed, stop working on the volume.
klog.Info(logOperation(operation, "volume rescheduled because: %v", err))
return ProvisioningFinished, errStopProvision
return ctrl.provisionVolumeErrorHandling(ctx, err, claim, operation)
}

// ProvisioningReschedule shouldn't have been returned for volumes without selected node,
Expand Down Expand Up @@ -1485,6 +1474,26 @@ func (ctrl *ProvisionController) provisionClaimOperation(ctx context.Context, cl
return ProvisioningFinished, nil
}

// provisionVolumeErrorHandling gives up provisioning for claim on the currently
// selected node after a failure described by err.
//
// With delayed binding, provisioning can fail because the selected node is
// wrong (a permanent error) or temporarily unusable (e.g. not enough capacity).
// In that situation the selected node has to be removed from the claim so that
// the scheduler is notified to schedule again; this is delegated to
// ctrl.rescheduleProvisioning.
//
// The returned state is always ProvisioningFinished. The returned error is:
//   - errStopProvision when the reschedule call succeeded, so work on the
//     volume stops;
//   - the original err when the reschedule call failed, so the volume stays in
//     the work queue and is simply tried again later.
func (ctrl *ProvisionController) provisionVolumeErrorHandling(ctx context.Context, err error, claim *v1.PersistentVolumeClaim, operation string) (ProvisioningState, error) {
	rescheduleErr := ctrl.rescheduleProvisioning(ctx, claim)
	if rescheduleErr == nil {
		// The selected node was cleared: nothing more to do for this volume.
		klog.Info(logOperation(operation, "volume rescheduled because: %v", err))
		return ProvisioningFinished, errStopProvision
	}

	// Clearing the selected node failed. Report the original provisioning
	// error, exactly as if the provisioner itself had returned
	// ProvisioningFinished, so that the operation is retried.
	klog.Info(logOperation(operation, "volume rescheduling failed: %v", rescheduleErr))
	return ProvisioningFinished, err
}

// deleteVolumeOperation attempts to delete the volume backing the given
// volume. Returns error, which indicates whether deletion should be retried
// (requeue the volume) or not
Expand Down

0 comments on commit 40f0c77

Please sign in to comment.