Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 100 additions & 1 deletion controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,12 @@ import (
"sync"
"time"

"github.com/kubernetes-csi/csi-lib-utils/slowset"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"golang.org/x/time/rate"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
v1 "k8s.io/api/core/v1"
storage "k8s.io/api/storage/v1"
storagebeta "k8s.io/api/storage/v1beta1"
Expand Down Expand Up @@ -183,6 +186,10 @@ type ProvisionController struct {
volumeStore VolumeStore

volumeNameHook VolumeNameHook

slowSet *slowset.SlowSet

retryIntervalMax time.Duration
}

const (
Expand Down Expand Up @@ -216,6 +223,8 @@ const (
DefaultMetricsPath = "/metrics"
// DefaultAddFinalizer is used when option function AddFinalizer is omitted
DefaultAddFinalizer = false
// DefaultRetryIntervalMax is used when option function RetryIntervalMax is omitted
DefaultRetryIntervalMax = 5 * time.Minute
)

var errRuntime = fmt.Errorf("cannot call option functions after controller has Run")
Expand Down Expand Up @@ -451,6 +460,18 @@ func RetryPeriod(retryPeriod time.Duration) func(*ProvisionController) error {
}
}

// RetryIntervalMax is the maximum retry interval of failed provisioning or deletion.
// Defaults to 5 minutes.
func RetryIntervalMax(retryIntervalMax time.Duration) func(*ProvisionController) error {
return func(c *ProvisionController) error {
if c.HasRun() {
return errRuntime
}
c.retryIntervalMax = retryIntervalMax
return nil
}
}

// ClaimsInformer sets the informer to use for accessing PersistentVolumeClaims.
// Defaults to using a internal informer.
func ClaimsInformer(informer cache.SharedIndexInformer) func(*ProvisionController) error {
Expand Down Expand Up @@ -667,8 +688,11 @@ func NewProvisionController(
hasRun: false,
hasRunLock: &sync.Mutex{},
volumeNameHook: getProvisionedVolumeNameForClaim,
retryIntervalMax: DefaultRetryIntervalMax,
}

controller.slowSet = slowset.NewSlowSet(controller.retryIntervalMax)

for _, option := range options {
err := option(controller)
if err != nil {
Expand Down Expand Up @@ -840,6 +864,8 @@ func (ctrl *ProvisionController) Run(ctx context.Context) {
defer ctrl.claimQueue.ShutDown()
defer ctrl.volumeQueue.ShutDown()

go ctrl.slowSet.Run(ctx.Done())

ctrl.hasRunLock.Lock()
ctrl.hasRun = true
ctrl.hasRunLock.Unlock()
Expand Down Expand Up @@ -1085,6 +1111,10 @@ func (ctrl *ProvisionController) syncClaim(ctx context.Context, obj interface{})
return fmt.Errorf("expected claim but got %+v", obj)
}

if err := ctrl.delayProvisioningIfRecentlyInfeasible(claim); err != nil {
return err
}

should, err := ctrl.shouldProvision(ctx, claim)
if err != nil {
ctrl.updateProvisionStats(claim, err, time.Time{})
Expand Down Expand Up @@ -1494,7 +1524,20 @@ func (ctrl *ProvisionController) provisionClaimOperation(ctx context.Context, cl
}

ctx2 := klog.NewContext(ctx, logger)
err = fmt.Errorf("failed to provision volume with StorageClass %q: %v", claimClass, err)

if isInfeasibleError(err) {
logger.V(2).Info("Detected infeasible volume provisioning request",
"error", err,
"claim", klog.KObj(claim))

ctrl.markForSlowRetry(ctx, claim, err)

ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, "ProvisioningFailed",
fmt.Sprintf("Volume provisioning failed with infeasible error. Retries will be delayed. %v", err))

return ProvisioningFinished, err
}

return ctrl.provisionVolumeErrorHandling(ctx2, result, err, claim)
}

Expand All @@ -1519,6 +1562,62 @@ func (ctrl *ProvisionController) provisionClaimOperation(ctx context.Context, cl
return ProvisioningFinished, nil
}

func (ctrl *ProvisionController) delayProvisioningIfRecentlyInfeasible(claim *v1.PersistentVolumeClaim) error {
key := string(claim.UID)

claimClass := util.GetPersistentVolumeClaimClass(claim)
currentClass, err := ctrl.getStorageClass(claimClass)
if err != nil {
return nil
}

if info, exists := ctrl.slowSet.Get(key); exists {
if info.StorageClassUID != string(currentClass.UID) {
ctrl.slowSet.Remove(key)
return nil
}
}
Comment thread
jsafrane marked this conversation as resolved.
if delay := ctrl.slowSet.TimeRemaining(key); delay > 0 {
return util.NewDelayRetryError(fmt.Sprintf("skipping volume provisioning for pvc %s, because provisioning previously failed with infeasible error", key))
}
return nil
}

func (ctrl *ProvisionController) markForSlowRetry(ctx context.Context, claim *v1.PersistentVolumeClaim, err error) {
if isInfeasibleError(err) {
key := string(claim.UID)

claimClass := util.GetPersistentVolumeClaimClass(claim)
class, err := ctrl.getStorageClass(claimClass)
if err != nil {
logger := klog.FromContext(ctx)
logger.Error(err, "Failed to get StorageClass for delay tracking",
"PVC", klog.KObj(claim))
return
}

info := slowset.ObjectData{
Timestamp: time.Now(),
StorageClassUID: string(class.UID),
}
ctrl.slowSet.Add(key, info)
}
}

func isInfeasibleError(err error) bool {

st, ok := status.FromError(err)
if !ok {
return false
}

switch st.Code() {
case codes.InvalidArgument:
return true
}
return false
}

func (ctrl *ProvisionController) provisionVolumeErrorHandling(ctx context.Context, result ProvisioningState, err error, claim *v1.PersistentVolumeClaim) (ProvisioningState, error) {
logger := klog.FromContext(ctx)
ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, "ProvisioningFailed", err.Error())
Expand Down
114 changes: 113 additions & 1 deletion controller/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ import (

"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
v1 "k8s.io/api/core/v1"
storage "k8s.io/api/storage/v1"
"k8s.io/apimachinery/pkg/api/resource"
Expand All @@ -47,6 +49,7 @@ import (
"k8s.io/klog/v2/ktesting"
_ "k8s.io/klog/v2/ktesting/init"
"sigs.k8s.io/sig-storage-lib-external-provisioner/v11/controller/metrics"
"sigs.k8s.io/sig-storage-lib-external-provisioner/v11/util"
)

const (
Expand Down Expand Up @@ -1618,6 +1621,107 @@ func TestControllerSharedInformers(t *testing.T) {
}
}

// TestInfeasibleRetry tests that sidecar doesn't spam plugin upon infeasible error code (e.g. invalid VAC parameter)
func TestInfeasibleRetry(t *testing.T) {
basePVC := newClaim("test-claim", "uid-1-1", "class-1", "foo.bar/baz", "", nil)
storageClass := newStorageClass("class-1", "foo.bar/baz")

tests := []struct {
name string
pvc *v1.PersistentVolumeClaim
expectedProvisionCallCount int
csiProvisionError error
eventuallyRemoveFromSlowSet bool
}{
{
name: "Should retry non-infeasible error normally",
pvc: basePVC,
expectedProvisionCallCount: 2,
csiProvisionError: status.Errorf(codes.Internal, "fake non-infeasible error"),
eventuallyRemoveFromSlowSet: false,
},
{
name: "Should NOT retry infeasible error normally",
pvc: basePVC,
expectedProvisionCallCount: 1,
csiProvisionError: status.Errorf(codes.InvalidArgument, "fake infeasible error"),
eventuallyRemoveFromSlowSet: false,
},
{
name: "Should EVENTUALLY retry infeasible error",
pvc: basePVC,
expectedProvisionCallCount: 2,
csiProvisionError: status.Errorf(codes.InvalidArgument, "fake infeasible error"),
eventuallyRemoveFromSlowSet: true,
},
}

for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
// Setup
_, ctx := ktesting.NewTestContext(t)

client := fake.NewSimpleClientset(test.pvc, storageClass)

provisioner := newTestProvisioner()
provisioner.returnError = test.csiProvisionError

ctrl := newTestProvisionController(ctx, client, "foo.bar/baz", provisioner)

if err := ctrl.classes.Add(storageClass); err != nil {
t.Fatalf("failed to add StorageClass to cache: %v", err)
}

// First attempt at provision
err := ctrl.syncClaim(ctx, test.pvc)
if !errors.Is(err, test.csiProvisionError) {
t.Errorf("expected error %v but got %v", test.csiProvisionError, err)
}

// For infeasible errors, verify the PVC was added to SlowSet
if status.Code(test.csiProvisionError) == codes.InvalidArgument {
key := string(test.pvc.UID)
if !ctrl.slowSet.Contains(key) {
t.Error("PVC should have been added to SlowSet after infeasible error")
}
}

// Fake time passing by removing from SlowSet
if test.eventuallyRemoveFromSlowSet {
key := string(test.pvc.UID)
ctrl.slowSet.Remove(key)
}

// Second attempt at provision
err2 := ctrl.syncClaim(ctx, test.pvc)
switch test.expectedProvisionCallCount {
case 1:
if !util.IsDelayRetryError(err2) {
t.Errorf("expected delay retry error but got %v", err2)
}
case 2:
if !errors.Is(err2, test.csiProvisionError) {
t.Errorf("expected error %v but got %v", test.csiProvisionError, err2)
}
default:
t.Errorf("unexpected provision error in second attempt: %v", err)
}

// Count the number of provision calls from the channel
provisionCount := 0
for len(provisioner.provisionCalls) > 0 {
<-provisioner.provisionCalls
provisionCount++
}

if test.expectedProvisionCallCount != provisionCount {
t.Errorf("expected %d provision calls, but got %d",
test.expectedProvisionCallCount, provisionCount)
}
})
}
}

type testMetrics struct {
provisioned counts
deleted counts
Expand Down Expand Up @@ -1986,11 +2090,15 @@ type provisionParams struct {
}

func newTestProvisioner() *testProvisioner {
return &testProvisioner{make(chan provisionParams, 16)}
return &testProvisioner{
provisionCalls: make(chan provisionParams, 16),
returnError: nil,
}
}

type testProvisioner struct {
provisionCalls chan provisionParams
returnError error
}

var _ Provisioner = &testProvisioner{}
Expand Down Expand Up @@ -2033,6 +2141,10 @@ func (p *testProvisioner) Provision(ctx context.Context, options ProvisionOption
allowedTopologies: options.StorageClass.AllowedTopologies,
}

if p.returnError != nil {
return nil, ProvisioningFinished, p.returnError
}

// Sleep to simulate work done by Provision...for long enough that
// TestMultipleControllers will consistently fail with lock disabled. If
// Provision happens too fast, the first controller creates the PV too soon
Expand Down
13 changes: 4 additions & 9 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@ module sigs.k8s.io/sig-storage-lib-external-provisioner/v11
go 1.24.0

require (
github.com/kubernetes-csi/csi-lib-utils v0.22.0
github.com/miekg/dns v1.1.66
github.com/prometheus/client_golang v1.22.0
github.com/prometheus/client_model v0.6.2
golang.org/x/time v0.11.0
google.golang.org/grpc v1.69.0
k8s.io/api v0.33.1
k8s.io/apimachinery v0.33.1
k8s.io/client-go v0.33.1
Expand All @@ -16,33 +18,27 @@ require (
require (
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/emicklei/go-restful/v3 v3.12.2 // indirect
github.com/evanphx/json-patch v5.6.0+incompatible // indirect
github.com/fxamacker/cbor/v2 v2.8.0 // indirect
github.com/go-logr/logr v1.4.3 // indirect
github.com/go-openapi/jsonpointer v0.21.1 // indirect
github.com/go-openapi/jsonreference v0.21.0 // indirect
github.com/go-openapi/swag v0.23.1 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/gnostic-models v0.6.9 // indirect
github.com/google/go-cmp v0.7.0 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/mailru/easyjson v0.9.0 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/common v0.64.0 // indirect
github.com/prometheus/procfs v0.16.1 // indirect
github.com/x448/float16 v0.8.4 // indirect
golang.org/x/crypto v0.38.0 // indirect
golang.org/x/mod v0.24.0 // indirect
golang.org/x/net v0.40.0 // indirect
golang.org/x/oauth2 v0.30.0 // indirect
Expand All @@ -51,11 +47,10 @@ require (
golang.org/x/term v0.32.0 // indirect
golang.org/x/text v0.25.0 // indirect
golang.org/x/tools v0.33.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20241216192217-9240e9c98484 // indirect
google.golang.org/protobuf v1.36.6 // indirect
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect
k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 // indirect
Expand Down
Loading